ffmpeg / postproc / swscale_template.c @ 6e1c66bc

/*
    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)

#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif
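
/* YSCALEYUV2YV12X below implements the planar vertical scaler: it walks a
   NULL-terminated list of (source line pointer, 16-bit coefficient) pairs
   stored in the context, accumulates the pmulhw products on top of a
   rounding constant, shifts right by 3, clips to 0..255 with packuswb and
   stores 8 output pixels per iteration.  YSCALEYUV2YV121 is the unscaled
   1:1 case, which only shifts the intermediate samples right by 7 and
   clips.  Roughly, per output sample of the filtered path (reference
   sketch only, illustrative names):

       acc = rounder;                              // VROUNDER_OFFSET value
       for(j=0; j<filterSize; j++)
           acc += (src[j][i] * filter[j]) >> 16;   // pmulhw
       dst[i] = clip(acc >> 3, 0, 255);            // psraw $3 + packuswb
*/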

#define YSCALEYUV2YV12X(x, offset) \
                        "xor %%"REG_a", %%"REG_a"        \n\t"\
                        "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
                        "movq %%mm3, %%mm4                \n\t"\
                        "lea " offset "(%0), %%"REG_d"        \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                        ".balign 16                        \n\t" /* FIXME Unroll? */\
                        "1:                                \n\t"\
                        "movq 8(%%"REG_d"), %%mm0        \n\t" /* filterCoeff */\
                        "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
                        "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
                        "add $16, %%"REG_d"                \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                        "test %%"REG_S", %%"REG_S"        \n\t"\
                        "pmulhw %%mm0, %%mm2                \n\t"\
                        "pmulhw %%mm0, %%mm5                \n\t"\
                        "paddw %%mm2, %%mm3                \n\t"\
                        "paddw %%mm5, %%mm4                \n\t"\
                        " jnz 1b                        \n\t"\
                        "psraw $3, %%mm3                \n\t"\
                        "psraw $3, %%mm4                \n\t"\
                        "packuswb %%mm4, %%mm3                \n\t"\
                        MOVNTQ(%%mm3, (%1, %%REGa))\
                        "add $8, %%"REG_a"                \n\t"\
                        "cmp %2, %%"REG_a"                \n\t"\
                        "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
                        "movq %%mm3, %%mm4                \n\t"\
                        "lea " offset "(%0), %%"REG_d"        \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                        "jb 1b                                \n\t"

#define YSCALEYUV2YV121 \
                        "mov %2, %%"REG_a"                \n\t"\
                        ".balign 16                        \n\t" /* FIXME Unroll? */\
                        "1:                                \n\t"\
                        "movq (%0, %%"REG_a", 2), %%mm0        \n\t"\
                        "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
                        "psraw $7, %%mm0                \n\t"\
                        "psraw $7, %%mm1                \n\t"\
                        "packuswb %%mm1, %%mm0                \n\t"\
                        MOVNTQ(%%mm0, (%1, %%REGa))\
                        "add $8, %%"REG_a"                \n\t"\
                        "jnc 1b                                \n\t"

/*
                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                           "r" (dest), "m" (dstW),
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
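
/* YSCALEYUV2PACKEDX below runs the same coefficient loop as YSCALEYUV2YV12X,
   first over the chroma planes (U, with V stored 4096 bytes further) and
   then over the luma, two quadwords at a time, leaving the unpacked sums in
   mm3/mm4 (U/V) and mm1/mm7 (Y).  YSCALEYUV2RGBX then subtracts the U/V/Y
   offsets, applies the UG/VG/UB/VR and Y coefficients from the context and
   packs the result to bytes: B in mm2, G in mm4, R in mm5, mm7 cleared. */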
#define YSCALEYUV2PACKEDX \
                "xor %%"REG_a", %%"REG_a"        \n\t"\
                ".balign 16                        \n\t"\
                "nop                                \n\t"\
                "1:                                \n\t"\
                "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
                "movq %%mm3, %%mm4                \n\t"\
                ".balign 16                        \n\t"\
                "2:                                \n\t"\
                "movq 8(%%"REG_d"), %%mm0        \n\t" /* filterCoeff */\
                "movq (%%"REG_S", %%"REG_a"), %%mm2        \n\t" /* UsrcData */\
                "movq 4096(%%"REG_S", %%"REG_a"), %%mm5        \n\t" /* VsrcData */\
                "add $16, %%"REG_d"                \n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "pmulhw %%mm0, %%mm2                \n\t"\
                "pmulhw %%mm0, %%mm5                \n\t"\
                "paddw %%mm2, %%mm3                \n\t"\
                "paddw %%mm5, %%mm4                \n\t"\
                "test %%"REG_S", %%"REG_S"        \n\t"\
                " jnz 2b                        \n\t"\
\
                "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
                "movq %%mm1, %%mm7                \n\t"\
                ".balign 16                        \n\t"\
                "2:                                \n\t"\
                "movq 8(%%"REG_d"), %%mm0        \n\t" /* filterCoeff */\
                "movq (%%"REG_S", %%"REG_a", 2), %%mm2        \n\t" /* Y1srcData */\
                "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5        \n\t" /* Y2srcData */\
                "add $16, %%"REG_d"                \n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "pmulhw %%mm0, %%mm2                \n\t"\
                "pmulhw %%mm0, %%mm5                \n\t"\
                "paddw %%mm2, %%mm1                \n\t"\
                "paddw %%mm5, %%mm7                \n\t"\
                "test %%"REG_S", %%"REG_S"        \n\t"\
                " jnz 2b                        \n\t"\


#define YSCALEYUV2RGBX \
                YSCALEYUV2PACKEDX\
                "psubw "U_OFFSET"(%0), %%mm3        \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"(%0), %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"(%0), %%mm3        \n\t"\
                "pmulhw "VG_COEFF"(%0), %%mm4        \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "pmulhw "UB_COEFF"(%0), %%mm2        \n\t"\
                "pmulhw "VR_COEFF"(%0), %%mm5        \n\t"\
                "psubw "Y_OFFSET"(%0), %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"(%0), %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"(%0), %%mm1        \n\t"\
                "pmulhw "Y_COEFF"(%0), %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"
#if 0
#define FULL_YSCALEYUV2RGB \
                "pxor %%mm7, %%mm7                \n\t"\
                "movd %6, %%mm6                        \n\t" /*yalpha1*/\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "movd %7, %%mm5                        \n\t" /*uvalpha1*/\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "xor %%"REG_a", %%"REG_a"                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%0, %%"REG_a", 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, %%"REG_a", 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq (%2, %%"REG_a",2), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%"REG_a",2), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "pmulhw %%mm6, %%mm0                \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw %%mm5, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "movq 4096(%2, %%"REG_a",2), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "movq 4096(%3, %%"REG_a",2), %%mm0        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "psubw %%mm0, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "psubw "MANGLE(w80)", %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "MANGLE(w400)", %%mm3        \n\t" /* 8(U-128)*/\
                "pmulhw "MANGLE(yCoeff)", %%mm1        \n\t"\
\
\
                "pmulhw %%mm5, %%mm4                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
                "psraw $4, %%mm0                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
                "paddw %%mm4, %%mm0                \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw "MANGLE(w400)", %%mm0        \n\t" /* (V-128)8*/\
\
\
                "movq %%mm0, %%mm4                \n\t" /* (V-128)8*/\
                "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
                "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
                "paddw %%mm1, %%mm3                \n\t" /* B*/\
                "paddw %%mm1, %%mm0                \n\t" /* R*/\
                "packuswb %%mm3, %%mm3                \n\t"\
\
                "packuswb %%mm0, %%mm0                \n\t"\
                "paddw %%mm4, %%mm2                \n\t"\
                "paddw %%mm2, %%mm1                \n\t" /* G*/\
\
                "packuswb %%mm1, %%mm1                \n\t"
#endif

#define REAL_YSCALEYUV2PACKED(index, c) \
                "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
                "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
                "psraw $3, %%mm0                \n\t"\
                "psraw $3, %%mm1                \n\t"\
                "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
                "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
                "xor "#index", "#index"                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "psubw %%mm4, %%mm5                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
                "pmulhw %%mm0, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "pmulhw %%mm0, %%mm5                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "psraw $7, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $7, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "movq (%0, "#index", 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, "#index", 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq 8(%0, "#index", 2), %%mm6        \n\t" /*buf0[eax]*/\
                "movq 8(%1, "#index", 2), %%mm7        \n\t" /*buf1[eax]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm7, %%mm6                \n\t" /* buf0[eax] - buf1[eax]*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "psraw $7, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $7, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "paddw %%mm6, %%mm7                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
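
/* YSCALEYUV2PACKED/YSCALEYUV2RGB are the two-line (bilinear) vertical path
   for packed output: luma is blended between buf0 and buf1 and chroma
   between uvbuf0 and uvbuf1 using the weights the caller has placed in the
   LUM_/CHR_MMX_FILTER slots of the context (yalpha1/uvalpha1 in the
   comments); YSCALEYUV2RGB then applies the same YUV->RGB arithmetic as
   YSCALEYUV2RGBX above. */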

#define REAL_YSCALEYUV2RGB(index, c) \
                "xor "#index", "#index"        \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "psubw %%mm4, %%mm5                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
                "pmulhw %%mm0, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "pmulhw %%mm0, %%mm5                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw "U_OFFSET"("#c"), %%mm3        \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"("#c"), %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
                "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, "#index", 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, "#index", 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
                "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm7, %%mm6                \n\t" /* buf0[eax] - buf1[eax]*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "paddw %%mm6, %%mm7                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
                "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
                "psubw "Y_OFFSET"("#c"), %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"("#c"), %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"("#c"), %%mm1        \n\t"\
                "pmulhw "Y_COEFF"("#c"), %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"
#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)

#define REAL_YSCALEYUV2PACKED1(index, c) \
                "xor "#index", "#index"                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm3        \n\t" /* uvbuf0[eax]*/\
                "movq 4096(%2, "#index"), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $7, %%mm3                \n\t" \
                "psraw $7, %%mm4                \n\t" \
                "movq (%0, "#index", 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7        \n\t" /*buf0[eax]*/\
                "psraw $7, %%mm1                \n\t" \
                "psraw $7, %%mm7                \n\t" \

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2RGB1(index, c) \
                "xor "#index", "#index"        \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm3        \n\t" /* uvbuf0[eax]*/\
                "movq 4096(%2, "#index"), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "psubw "U_OFFSET"("#c"), %%mm3        \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"("#c"), %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
                "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, "#index", 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7        \n\t" /*buf0[eax]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
                "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
                "psubw "Y_OFFSET"("#c"), %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"("#c"), %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"("#c"), %%mm1        \n\t"\
                "pmulhw "Y_COEFF"("#c"), %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
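
/* The *1 variants above use a single source line (buf0/uvbuf0) and only
   shift the samples down, while the *1b variants below average uvbuf0 and
   uvbuf1, i.e. they handle the case where both chroma input lines carry
   (roughly) equal weight. */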

#define REAL_YSCALEYUV2PACKED1b(index, c) \
                "xor "#index", "#index"                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
                "psrlw $8, %%mm3                \n\t" \
                "psrlw $8, %%mm4                \n\t" \
                "movq (%0, "#index", 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7        \n\t" /*buf0[eax]*/\
                "psraw $7, %%mm1                \n\t" \
                "psraw $7, %%mm7                \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
                "xor "#index", "#index"                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
                "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
                "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
                "psubw "U_OFFSET"("#c"), %%mm3        \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"("#c"), %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
                "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, "#index", 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7        \n\t" /*buf0[eax]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
                "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
                "psubw "Y_OFFSET"("#c"), %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"("#c"), %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"("#c"), %%mm1        \n\t"\
                "pmulhw "Y_COEFF"("#c"), %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)

#define REAL_WRITEBGR32(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        MOVNTQ(%%mm0, (dst, index, 4))\
                        MOVNTQ(%%mm2, 8(dst, index, 4))\
                        MOVNTQ(%%mm1, 16(dst, index, 4))\
                        MOVNTQ(%%mm3, 24(dst, index, 4))\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
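
/* WRITEBGR32 stores each pixel as four bytes B,G,R,0 in memory (the "0RGB"
   dwords in the comments, little endian), i.e. 32 bytes per 8-pixel group,
   using non-temporal stores where MOVNTQ maps to movntq. */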

#define REAL_WRITEBGR16(dst, dstw, index) \
                        "pand "MANGLE(bF8)", %%mm2        \n\t" /* B */\
                        "pand "MANGLE(bFC)", %%mm4        \n\t" /* G */\
                        "pand "MANGLE(bF8)", %%mm5        \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
\
                        "movq %%mm2, %%mm1                \n\t"\
                        "movq %%mm4, %%mm3                \n\t"\
\
                        "punpcklbw %%mm7, %%mm3                \n\t"\
                        "punpcklbw %%mm5, %%mm2                \n\t"\
                        "punpckhbw %%mm7, %%mm4                \n\t"\
                        "punpckhbw %%mm5, %%mm1                \n\t"\
\
                        "psllq $3, %%mm3                \n\t"\
                        "psllq $3, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2                \n\t"\
                        "por %%mm4, %%mm1                \n\t"\
\
                        MOVNTQ(%%mm2, (dst, index, 2))\
                        MOVNTQ(%%mm1, 8(dst, index, 2))\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"
#define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)

#define REAL_WRITEBGR15(dst, dstw, index) \
                        "pand "MANGLE(bF8)", %%mm2        \n\t" /* B */\
                        "pand "MANGLE(bF8)", %%mm4        \n\t" /* G */\
                        "pand "MANGLE(bF8)", %%mm5        \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
                        "psrlq $1, %%mm5                \n\t"\
\
                        "movq %%mm2, %%mm1                \n\t"\
                        "movq %%mm4, %%mm3                \n\t"\
\
                        "punpcklbw %%mm7, %%mm3                \n\t"\
                        "punpcklbw %%mm5, %%mm2                \n\t"\
                        "punpckhbw %%mm7, %%mm4                \n\t"\
                        "punpckhbw %%mm5, %%mm1                \n\t"\
\
                        "psllq $2, %%mm3                \n\t"\
                        "psllq $2, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2                \n\t"\
                        "por %%mm4, %%mm1                \n\t"\
\
                        MOVNTQ(%%mm2, (dst, index, 2))\
                        MOVNTQ(%%mm1, 8(dst, index, 2))\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"
#define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)
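
/* WRITEBGR16 packs each pixel into an RGB565 word (rrrrrggg gggbbbbb) and
   WRITEBGR15 into an RGB555 word (0rrrrrgg gggbbbbb): the channels are
   masked to their top 5 (or 6) bits, shifted into place and OR-ed together,
   two quadwords (8 pixels) per iteration. */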

#define WRITEBGR24OLD(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4                \n\t" /* 0RGB0RGB 0 */\
                        "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
                        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
                        "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
                        "por %%mm4, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm4                \n\t" /* 0RGB0RGB 1 */\
                        "psllq $48, %%mm2                \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
\
                        "movq %%mm4, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "psrld $16, %%mm4                \n\t" /* 000R000R 1 */\
                        "psrlq $24, %%mm2                \n\t" /* 0000RGB0 1.5 */\
                        "por %%mm4, %%mm2                \n\t" /* 000RRGBR 1 */\
                        "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm4                \n\t" /* 0RGB0RGB 2 */\
                        "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
                        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
                        "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
                        "por %%mm4, %%mm1                \n\t" /* 00RGBRGB 2 */\
                        "movq %%mm1, %%mm4                \n\t" /* 00RGBRGB 2 */\
                        "psllq $32, %%mm1                \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm2                \n\t" /* BRGBRGBR 1 */\
\
                        "psrlq $32, %%mm4                \n\t" /* 000000RG 2.5 */\
                        "movq %%mm3, %%mm5                \n\t" /* 0RGB0RGB 3 */\
                        "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
                        "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
                        "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
                        "por %%mm5, %%mm3                \n\t" /* 00RGBRGB 3 */\
                        "psllq $16, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm4, %%mm3                \n\t" /* RGBRGBRG 2.5 */\
\
                        MOVNTQ(%%mm0, (dst))\
                        MOVNTQ(%%mm2, 8(dst))\
                        MOVNTQ(%%mm3, 16(dst))\
                        "add $24, "#dst"                \n\t"\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"

#define WRITEBGR24MMX(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4                \n\t" /* 0RGB0RGB 0 */\
                        "movq %%mm2, %%mm6                \n\t" /* 0RGB0RGB 1 */\
                        "movq %%mm1, %%mm5                \n\t" /* 0RGB0RGB 2 */\
                        "movq %%mm3, %%mm7                \n\t" /* 0RGB0RGB 3 */\
\
                        "psllq $40, %%mm0                \n\t" /* RGB00000 0 */\
                        "psllq $40, %%mm2                \n\t" /* RGB00000 1 */\
                        "psllq $40, %%mm1                \n\t" /* RGB00000 2 */\
                        "psllq $40, %%mm3                \n\t" /* RGB00000 3 */\
\
                        "punpckhdq %%mm4, %%mm0                \n\t" /* 0RGBRGB0 0 */\
                        "punpckhdq %%mm6, %%mm2                \n\t" /* 0RGBRGB0 1 */\
                        "punpckhdq %%mm5, %%mm1                \n\t" /* 0RGBRGB0 2 */\
                        "punpckhdq %%mm7, %%mm3                \n\t" /* 0RGBRGB0 3 */\
\
                        "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm6                \n\t" /* 0RGBRGB0 1 */\
                        "psllq $40, %%mm2                \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
                        MOVNTQ(%%mm0, (dst))\
\
                        "psrlq $24, %%mm6                \n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm5                \n\t" /* 0RGBRGB0 2 */\
                        "psllq $24, %%mm1                \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm6                \n\t" /* BRGBRGBR 1 */\
                        MOVNTQ(%%mm6, 8(dst))\
\
                        "psrlq $40, %%mm5                \n\t" /* 000000RG 2 */\
                        "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm3, %%mm5                \n\t" /* RGBRGBRG 2 */\
                        MOVNTQ(%%mm5, 16(dst))\
\
                        "add $24, "#dst"                \n\t"\
\
                        "add $8, "#index"                        \n\t"\
                        "cmp "#dstw", "#index"                        \n\t"\
                        " jb 1b                                \n\t"

#define WRITEBGR24MMX2(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq "MANGLE(M24A)", %%mm0        \n\t"\
                        "movq "MANGLE(M24C)", %%mm7        \n\t"\
                        "pshufw $0x50, %%mm2, %%mm1        \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
                        "pshufw $0x50, %%mm4, %%mm3        \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
                        "pshufw $0x00, %%mm5, %%mm6        \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
                        "pand %%mm0, %%mm1                \n\t" /*    B2        B1       B0 */\
                        "pand %%mm0, %%mm3                \n\t" /*    G2        G1       G0 */\
                        "pand %%mm7, %%mm6                \n\t" /*       R1        R0       */\
\
                        "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
                        "por %%mm1, %%mm6                \n\t"\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, (dst))\
\
                        "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
                        "pshufw $0xA5, %%mm2, %%mm1        \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
                        "pshufw $0x55, %%mm4, %%mm3        \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
                        "pshufw $0xA5, %%mm5, %%mm6        \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
                        "pand "MANGLE(M24B)", %%mm1        \n\t" /* B5       B4        B3    */\
                        "pand %%mm7, %%mm3                \n\t" /*       G4        G3       */\
                        "pand %%mm0, %%mm6                \n\t" /*    R4        R3       R2 */\
\
                        "por %%mm1, %%mm3                \n\t" /* B5    G4 B4     G3 B3    */\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, 8(dst))\
\
                        "pshufw $0xFF, %%mm2, %%mm1        \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
                        "pshufw $0xFA, %%mm4, %%mm3        \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
                        "pshufw $0xFA, %%mm5, %%mm6        \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
                        "pand %%mm7, %%mm1                \n\t" /*       B7        B6       */\
                        "pand %%mm0, %%mm3                \n\t" /*    G7        G6       G5 */\
                        "pand "MANGLE(M24B)", %%mm6        \n\t" /* R7       R6        R5    */\
\
                        "por %%mm1, %%mm3                \n\t"\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, 16(dst))\
\
                        "add $24, "#dst"                \n\t"\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"

#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif

#define REAL_WRITEYUY2(dst, dstw, index) \
                        "packuswb %%mm3, %%mm3                \n\t"\
                        "packuswb %%mm4, %%mm4                \n\t"\
                        "packuswb %%mm7, %%mm1                \n\t"\
                        "punpcklbw %%mm4, %%mm3                \n\t"\
                        "movq %%mm1, %%mm7                \n\t"\
                        "punpcklbw %%mm3, %%mm1                \n\t"\
                        "punpckhbw %%mm3, %%mm7                \n\t"\
\
                        MOVNTQ(%%mm1, (dst, index, 2))\
                        MOVNTQ(%%mm7, 8(dst, index, 2))\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
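
/* WRITEYUY2 interleaves the packed luma bytes in mm1/mm7 with the U bytes
   (mm3) and V bytes (mm4) into Y0 U0 Y1 V0 Y2 U1 Y3 V1 ... byte order,
   i.e. YUY2, writing 16 bytes per 8 luma samples. */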


static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
#ifdef HAVE_MMX
        if(uDest != NULL)
        {
                asm volatile(
                                YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
                                :: "r" (&c->redDither),
                                "r" (uDest), "m" ((long)chrDstW)
                                : "%"REG_a, "%"REG_d, "%"REG_S
                        );

                asm volatile(
                                YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
                                :: "r" (&c->redDither),
                                "r" (vDest), "m" ((long)chrDstW)
                                : "%"REG_a, "%"REG_d, "%"REG_S
                        );
        }

        asm volatile(
                        YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
                        :: "r" (&c->redDither),
                           "r" (dest), "m" ((long)dstW)
                        : "%"REG_a, "%"REG_d, "%"REG_S
                );
#else
#ifdef HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                      chrFilter, chrSrc, chrFilterSize,
                      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif
}
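
/* Note: operand %0 of the asm blocks above is &c->redDither; the VROUNDER,
   MMX filter and coefficient constants used inside the macros are byte
   offsets relative to that address within the SwsContext. */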

static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
#ifdef HAVE_MMX
        if(uDest != NULL)
        {
                asm volatile(
                                YSCALEYUV2YV121
                                :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
                                "g" ((long)-chrDstW)
                                : "%"REG_a
                        );

                asm volatile(
                                YSCALEYUV2YV121
                                :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
                                "g" ((long)-chrDstW)
                                : "%"REG_a
                        );
        }

        asm volatile(
                YSCALEYUV2YV121
                :: "r" (lumSrc + dstW), "r" (dest + dstW),
                "g" ((long)-dstW)
                : "%"REG_a
        );
#else
        int i;
        for(i=0; i<dstW; i++)
        {
                int val= lumSrc[i]>>7;

                if(val&256){
                        if(val<0) val=0;
                        else      val=255;
                }

                dest[i]= val;
        }

        if(uDest != NULL)
                for(i=0; i<chrDstW; i++)
                {
                        int u=chrSrc[i]>>7;
                        int v=chrSrc[i + 2048]>>7;

                        if((u|v)&256){
                                if(u<0)         u=0;
                                else if (u>255) u=255;
                                if(v<0)         v=0;
                                else if (v>255) v=255;
                        }

                        uDest[i]= u;
                        vDest[i]= v;
                }
#endif
}
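
/* In the C fallback above, testing (val & 256) after the >>7 is a cheap
   out-of-range check: for the value range the filters produce here, any
   sample below 0 or above 255 has bit 8 set, so the clipping branch is only
   taken when it is actually needed. */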


/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                            uint8_t *dest, int dstW, int dstY)
{
        int dummy=0;
        switch(c->dstFormat)
        {
#ifdef HAVE_MMX
        case IMGFMT_BGR32:
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                                WRITEBGR32(%4, %5, %%REGa)

                        :: "r" (&c->redDither),
                           "m" (dummy), "m" (dummy), "m" (dummy),
                           "r" (dest), "m" (dstW)
                        : "%"REG_a, "%"REG_d, "%"REG_S
                        );
                }
                break;
        case IMGFMT_BGR24:
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
                                "add %4, %%"REG_b"                        \n\t"
                                WRITEBGR24(%%REGb, %5, %%REGa)

                        :: "r" (&c->redDither),
                           "m" (dummy), "m" (dummy), "m" (dummy),
                           "r" (dest), "m" (dstW)
                        : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
                        );
                }
                break;
        case IMGFMT_BGR15:
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR15(%4, %5, %%REGa)

                        :: "r" (&c->redDither),
                           "m" (dummy), "m" (dummy), "m" (dummy),
                           "r" (dest), "m" (dstW)
                        : "%"REG_a, "%"REG_d, "%"REG_S
                        );
                }
                break;
        case IMGFMT_BGR16:
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR16(%4, %5, %%REGa)

                        :: "r" (&c->redDither),
                           "m" (dummy), "m" (dummy), "m" (dummy),
                           "r" (dest), "m" (dstW)
                        : "%"REG_a, "%"REG_d, "%"REG_S
                        );
                }
                break;
        case IMGFMT_YUY2:
                {
                        asm volatile(
                                YSCALEYUV2PACKEDX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                                "psraw $3, %%mm3                \n\t"
                                "psraw $3, %%mm4                \n\t"
                                "psraw $3, %%mm1                \n\t"
                                "psraw $3, %%mm7                \n\t"
                                WRITEYUY2(%4, %5, %%REGa)

                        :: "r" (&c->redDither),
                           "m" (dummy), "m" (dummy), "m" (dummy),
                           "r" (dest), "m" (dstW)
                        : "%"REG_a, "%"REG_d, "%"REG_S
                        );
                }
                break;
#endif
        default:
#ifdef HAVE_ALTIVEC
                altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
                            chrFilter, chrSrc, chrFilterSize,
                            dest, dstW, dstY);
#else
                yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                            chrFilter, chrSrc, chrFilterSize,
                            dest, dstW, dstY);
#endif
                break;
        }
}
972

    
973
/**
974
 * vertical bilinear scale YV12 to RGB
975
 */
976
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
977
                            uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
978
{
979
        int yalpha1=yalpha^4095;
980
        int uvalpha1=uvalpha^4095;
981
        int i;
982

    
983
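        /* Illustrative note: yalpha/uvalpha are 12bit blend weights, so yalpha1 = yalpha^4095
           equals 4095-yalpha.  The (disabled) C path below blends the two input lines per
           pixel roughly as
                Y = (buf0[i]*yalpha1 + buf1[i]*yalpha) >> 19;
           i.e. a fixed-point vertical linear interpolation of the 16bit intermediate buffers. */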
#if 0 //isn't used
984
        if(flags&SWS_FULL_CHR_H_INT)
985
        {
986
                switch(dstFormat)
987
                {
988
#ifdef HAVE_MMX
989
                case IMGFMT_BGR32:
990
                        asm volatile(
991

992

993
FULL_YSCALEYUV2RGB
994
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
995
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0
996

997
                        "movq %%mm3, %%mm1                \n\t"
998
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
999
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0
1000

1001
                        MOVNTQ(%%mm3, (%4, %%REGa, 4))
1002
                        MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1003

1004
                        "add $4, %%"REG_a"                \n\t"
1005
                        "cmp %5, %%"REG_a"                \n\t"
1006
                        " jb 1b                                \n\t"
1007

1008

1009
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1010
                        "m" (yalpha1), "m" (uvalpha1)
1011
                        : "%"REG_a
1012
                        );
1013
                        break;
1014
                case IMGFMT_BGR24:
1015
                        asm volatile(
1016

1017
FULL_YSCALEYUV2RGB
1018

1019
                                                                // lsb ... msb
1020
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
1021
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0
1022

1023
                        "movq %%mm3, %%mm1                \n\t"
1024
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
1025
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0
1026

1027
                        "movq %%mm3, %%mm2                \n\t" // BGR0BGR0
1028
                        "psrlq $8, %%mm3                \n\t" // GR0BGR00
1029
                        "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1030
                        "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1031
                        "por %%mm2, %%mm3                \n\t" // BGRBGR00
1032
                        "movq %%mm1, %%mm2                \n\t"
1033
                        "psllq $48, %%mm1                \n\t" // 000000BG
1034
                        "por %%mm1, %%mm3                \n\t" // BGRBGRBG
1035

1036
                        "movq %%mm2, %%mm1                \n\t" // BGR0BGR0
1037
                        "psrld $16, %%mm2                \n\t" // R000R000
1038
                        "psrlq $24, %%mm1                \n\t" // 0BGR0000
1039
                        "por %%mm2, %%mm1                \n\t" // RBGRR000
1040

1041
                        "mov %4, %%"REG_b"                \n\t"
1042
                        "add %%"REG_a", %%"REG_b"        \n\t"
1043

1044
#ifdef HAVE_MMX2
1045
                        //FIXME Alignment
1046
                        "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1047
                        "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1048
#else
1049
                        "movd %%mm3, (%%"REG_b", %%"REG_a", 2)        \n\t"
1050
                        "psrlq $32, %%mm3                \n\t"
1051
                        "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)        \n\t"
1052
                        "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)        \n\t"
1053
#endif
1054
                        "add $4, %%"REG_a"                \n\t"
1055
                        "cmp %5, %%"REG_a"                \n\t"
1056
                        " jb 1b                                \n\t"
1057

    
1058
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1059
                        "m" (yalpha1), "m" (uvalpha1)
1060
                        : "%"REG_a, "%"REG_b
1061
                        );
1062
                        break;
1063
                case IMGFMT_BGR15:
1064
                        asm volatile(
1065

    
1066
FULL_YSCALEYUV2RGB
1067
#ifdef DITHER1XBPP
1068
                        "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1069
                        "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1070
                        "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1071
#endif
1072
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
1073
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
1074
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R
1075

    
1076
                        "psrlw $3, %%mm3                \n\t"
1077
                        "psllw $2, %%mm1                \n\t"
1078
                        "psllw $7, %%mm0                \n\t"
1079
                        "pand "MANGLE(g15Mask)", %%mm1        \n\t"
1080
                        "pand "MANGLE(r15Mask)", %%mm0        \n\t"
1081

    
1082
                        "por %%mm3, %%mm1                \n\t"
1083
                        "por %%mm1, %%mm0                \n\t"
1084

    
1085
                        MOVNTQ(%%mm0, (%4, %%REGa, 2))
1086

    
1087
                        "add $4, %%"REG_a"                \n\t"
1088
                        "cmp %5, %%"REG_a"                \n\t"
1089
                        " jb 1b                                \n\t"
1090

    
1091
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1092
                        "m" (yalpha1), "m" (uvalpha1)
1093
                        : "%"REG_a
1094
                        );
1095
                        break;
1096
                case IMGFMT_BGR16:
1097
                        asm volatile(
1098

    
1099
FULL_YSCALEYUV2RGB
1100
#ifdef DITHER1XBPP
1101
                        "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1102
                        "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1103
                        "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1104
#endif
1105
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
1106
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
1107
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R
1108

    
1109
                        "psrlw $3, %%mm3                \n\t"
1110
                        "psllw $3, %%mm1                \n\t"
1111
                        "psllw $8, %%mm0                \n\t"
1112
                        "pand "MANGLE(g16Mask)", %%mm1        \n\t"
1113
                        "pand "MANGLE(r16Mask)", %%mm0        \n\t"
1114

    
1115
                        "por %%mm3, %%mm1                \n\t"
1116
                        "por %%mm1, %%mm0                \n\t"
1117

    
1118
                        MOVNTQ(%%mm0, (%4, %%REGa, 2))
1119

    
1120
                        "add $4, %%"REG_a"                \n\t"
1121
                        "cmp %5, %%"REG_a"                \n\t"
1122
                        " jb 1b                                \n\t"
1123

    
1124
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1125
                        "m" (yalpha1), "m" (uvalpha1)
1126
                        : "%"REG_a
1127
                        );
1128
                break;
1129
#endif
1130
                case IMGFMT_RGB32:
1131
#ifndef HAVE_MMX
1132
                case IMGFMT_BGR32:
1133
#endif
1134
                if(dstFormat==IMGFMT_BGR32)
1135
                {
1136
                        int i;
1137
#ifdef WORDS_BIGENDIAN
1138
                        dest++;
1139
#endif
1140
                        for(i=0;i<dstW;i++){
1141
                                // vertical linear interpolation && yuv2rgb in a single step:
1142
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1143
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1144
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1145
                                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1146
                                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1147
                                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1148
                                dest+= 4;
1149
                        }
1150
                }
1151
                else if(dstFormat==IMGFMT_BGR24)
1152
                {
1153
                        int i;
1154
                        for(i=0;i<dstW;i++){
1155
                                // vertical linear interpolation && yuv2rgb in a single step:
1156
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1157
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1158
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1159
                                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1160
                                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1161
                                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1162
                                dest+= 3;
1163
                        }
1164
                }
1165
                else if(dstFormat==IMGFMT_BGR16)
1166
                {
1167
                        int i;
1168
                        for(i=0;i<dstW;i++){
1169
                                // vertical linear interpolation && yuv2rgb in a single step:
1170
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1171
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1172
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1173

    
1174
                                ((uint16_t*)dest)[i] =
1175
                                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1176
                                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1177
                                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
1178
                        }
1179
                }
1180
                else if(dstFormat==IMGFMT_BGR15)
1181
                {
1182
                        int i;
1183
                        for(i=0;i<dstW;i++){
1184
                                // vertical linear interpolation && yuv2rgb in a single step:
1185
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1186
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1187
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1188

    
1189
                                ((uint16_t*)dest)[i] =
1190
                                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1191
                                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1192
                                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
1193
                        }
1194
                }
1195
        }//FULL_UV_IPOL
1196
        else
1197
        {
1198
#endif // if 0
1199
#ifdef HAVE_MMX
1200
        switch(c->dstFormat)
1201
        {
1202
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1203
        case IMGFMT_BGR32:
1204
                        asm volatile(
1205
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1206
                                "mov %4, %%"REG_SP"                        \n\t"
1207
                                YSCALEYUV2RGB(%%REGa, %5)
1208
                                WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1209
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1210

    
1211
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1212
                        "r" (&c->redDither)
1213
                        : "%"REG_a
1214
                        );
1215
                        return;
1216
        case IMGFMT_BGR24:
1217
                        asm volatile(
1218
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1219
                                "mov %4, %%"REG_SP"                        \n\t"
1220
                                YSCALEYUV2RGB(%%REGa, %5)
1221
                                WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1222
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1223
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1224
                        "r" (&c->redDither)
1225
                        : "%"REG_a
1226
                        );
1227
                        return;
1228
        case IMGFMT_BGR15:
1229
                        asm volatile(
1230
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1231
                                "mov %4, %%"REG_SP"                        \n\t"
1232
                                YSCALEYUV2RGB(%%REGa, %5)
1233
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1234
#ifdef DITHER1XBPP
1235
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1236
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1237
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1238
#endif
1239

    
1240
                                WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1241
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1242

    
1243
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1244
                        "r" (&c->redDither)
1245
                        : "%"REG_a
1246
                        );
1247
                        return;
1248
        case IMGFMT_BGR16:
1249
                        asm volatile(
1250
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1251
                                "mov %4, %%"REG_SP"                        \n\t"
1252
                                YSCALEYUV2RGB(%%REGa, %5)
1253
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1254
#ifdef DITHER1XBPP
1255
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1256
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1257
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1258
#endif
1259

    
1260
                                WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1261
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1262
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1263
                        "r" (&c->redDither)
1264
                        : "%"REG_a
1265
                        );
1266
                        return;
1267
        case IMGFMT_YUY2:
1268
                        asm volatile(
1269
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1270
                                "mov %4, %%"REG_SP"                        \n\t"
1271
                                YSCALEYUV2PACKED(%%REGa, %5)
1272
                                WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1273
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1274
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1275
                        "r" (&c->redDither)
1276
                        : "%"REG_a
1277
                        );
1278
                        return;
1279
        default: break;
1280
        }
1281
#endif //HAVE_MMX
1282
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1283
}

/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                            uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
        const int yalpha1=0;
        int i;

        uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
        const int yalpha= 4096; //FIXME ...

        if(flags&SWS_FULL_CHR_H_INT)
1298
        {
1299
                RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1300
                return;
1301
        }

#ifdef HAVE_MMX
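        /* Note (inferred from the code below): uvalpha is the blend weight of the second
           chroma buffer.  Weights below half (2048 of 4096) take uvbuf0 alone via the
           YSCALEYUV2*1 macro variants, anything else averages both chroma buffers via the
           *1b variants; the C fallback at the end of this function mirrors the same split. */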
        if( uvalpha < 2048 ) // note: this is not correct (it shifts chrominance by 0.5 pixels) but it's a bit faster
        {
1306
                switch(dstFormat)
1307
                {
1308
                case IMGFMT_BGR32:
1309
                        asm volatile(
1310
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1311
                                "mov %4, %%"REG_SP"                        \n\t"
1312
                                YSCALEYUV2RGB1(%%REGa, %5)
1313
                                WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1314
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1315

    
1316
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1317
                        "r" (&c->redDither)
1318
                        : "%"REG_a
1319
                        );
1320
                        return;
1321
                case IMGFMT_BGR24:
1322
                        asm volatile(
1323
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1324
                                "mov %4, %%"REG_SP"                        \n\t"
1325
                                YSCALEYUV2RGB1(%%REGa, %5)
1326
                                WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1327
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1328

    
1329
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1330
                        "r" (&c->redDither)
1331
                        : "%"REG_a
1332
                        );
1333
                        return;
1334
                case IMGFMT_BGR15:
1335
                        asm volatile(
1336
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1337
                                "mov %4, %%"REG_SP"                        \n\t"
1338
                                YSCALEYUV2RGB1(%%REGa, %5)
1339
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1340
#ifdef DITHER1XBPP
1341
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1342
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1343
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1344
#endif
1345
                                WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1346
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1347

    
1348
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1349
                        "r" (&c->redDither)
1350
                        : "%"REG_a
1351
                        );
1352
                        return;
1353
                case IMGFMT_BGR16:
1354
                        asm volatile(
1355
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1356
                                "mov %4, %%"REG_SP"                        \n\t"
1357
                                YSCALEYUV2RGB1(%%REGa, %5)
1358
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1359
#ifdef DITHER1XBPP
1360
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1361
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1362
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1363
#endif
1364

    
1365
                                WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1366
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1367

    
1368
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1369
                        "r" (&c->redDither)
1370
                        : "%"REG_a
1371
                        );
1372
                        return;
1373
                case IMGFMT_YUY2:
1374
                        asm volatile(
1375
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1376
                                "mov %4, %%"REG_SP"                        \n\t"
1377
                                YSCALEYUV2PACKED1(%%REGa, %5)
1378
                                WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1379
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1380

    
1381
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1382
                        "r" (&c->redDither)
1383
                        : "%"REG_a
1384
                        );
1385
                        return;
1386
                }
1387
        }
1388
        else
1389
        {
1390
                switch(dstFormat)
1391
                {
1392
                case IMGFMT_BGR32:
1393
                        asm volatile(
1394
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1395
                                "mov %4, %%"REG_SP"                        \n\t"
1396
                                YSCALEYUV2RGB1b(%%REGa, %5)
1397
                                WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1398
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1399

    
1400
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1401
                        "r" (&c->redDither)
1402
                        : "%"REG_a
1403
                        );
1404
                        return;
1405
                case IMGFMT_BGR24:
1406
                        asm volatile(
1407
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1408
                                "mov %4, %%"REG_SP"                        \n\t"
1409
                                YSCALEYUV2RGB1b(%%REGa, %5)
1410
                                WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1411
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1412

    
1413
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1414
                        "r" (&c->redDither)
1415
                        : "%"REG_a
1416
                        );
1417
                        return;
1418
                case IMGFMT_BGR15:
1419
                        asm volatile(
1420
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1421
                                "mov %4, %%"REG_SP"                        \n\t"
1422
                                YSCALEYUV2RGB1b(%%REGa, %5)
1423
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1424
#ifdef DITHER1XBPP
1425
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1426
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1427
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1428
#endif
1429
                                WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1430
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1431

    
1432
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1433
                        "r" (&c->redDither)
1434
                        : "%"REG_a
1435
                        );
1436
                        return;
1437
                case IMGFMT_BGR16:
1438
                        asm volatile(
1439
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1440
                                "mov %4, %%"REG_SP"                        \n\t"
1441
                                YSCALEYUV2RGB1b(%%REGa, %5)
1442
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1443
#ifdef DITHER1XBPP
1444
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1445
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1446
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1447
#endif
1448

    
1449
                                WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1450
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1451

    
1452
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1453
                        "r" (&c->redDither)
1454
                        : "%"REG_a
1455
                        );
1456
                        return;
1457
                case IMGFMT_YUY2:
1458
                        asm volatile(
1459
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1460
                                "mov %4, %%"REG_SP"                        \n\t"
1461
                                YSCALEYUV2PACKED1b(%%REGa, %5)
1462
                                WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1463
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1464

    
1465
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1466
                        "r" (&c->redDither)
1467
                        : "%"REG_a
1468
                        );
1469
                        return;
1470
                }
1471
        }
1472
#endif
1473
        if( uvalpha < 2048 )
1474
        {
1475
                YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1476
        }else{
1477
                YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1478
        }
1479
}

//FIXME yuy2* can read up to 7 samples too much

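/* For reference, the packed 4:2:2 byte order assumed by the C fallbacks below:
     YUY2:  Y0 U0 Y1 V0 ...  -> luma at src[2*i],   chroma at src[4*i+1] / src[4*i+3]
     UYVY:  U0 Y0 V0 Y1 ...  -> luma at src[2*i+1], chroma at src[4*i+0] / src[4*i+2]  */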
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
1486
        asm volatile(
1487
                "movq "MANGLE(bm01010101)", %%mm2\n\t"
1488
                "mov %0, %%"REG_a"                \n\t"
1489
                "1:                                \n\t"
1490
                "movq (%1, %%"REG_a",2), %%mm0        \n\t"
1491
                "movq 8(%1, %%"REG_a",2), %%mm1        \n\t"
1492
                "pand %%mm2, %%mm0                \n\t"
1493
                "pand %%mm2, %%mm1                \n\t"
1494
                "packuswb %%mm1, %%mm0                \n\t"
1495
                "movq %%mm0, (%2, %%"REG_a")        \n\t"
1496
                "add $8, %%"REG_a"                \n\t"
1497
                " js 1b                                \n\t"
1498
                : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
1499
                : "%"REG_a
1500
        );
1501
#else
1502
        int i;
1503
        for(i=0; i<width; i++)
1504
                dst[i]= src[2*i];
1505
#endif
1506
}
1507

    
1508
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1509
{
1510
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1511
        asm volatile(
1512
                "movq "MANGLE(bm01010101)", %%mm4\n\t"
1513
                "mov %0, %%"REG_a"                \n\t"
1514
                "1:                                \n\t"
1515
                "movq (%1, %%"REG_a",4), %%mm0        \n\t"
1516
                "movq 8(%1, %%"REG_a",4), %%mm1        \n\t"
1517
                "movq (%2, %%"REG_a",4), %%mm2        \n\t"
1518
                "movq 8(%2, %%"REG_a",4), %%mm3        \n\t"
1519
                PAVGB(%%mm2, %%mm0)
1520
                PAVGB(%%mm3, %%mm1)
1521
                "psrlw $8, %%mm0                \n\t"
1522
                "psrlw $8, %%mm1                \n\t"
1523
                "packuswb %%mm1, %%mm0                \n\t"
1524
                "movq %%mm0, %%mm1                \n\t"
1525
                "psrlw $8, %%mm0                \n\t"
1526
                "pand %%mm4, %%mm1                \n\t"
1527
                "packuswb %%mm0, %%mm0                \n\t"
1528
                "packuswb %%mm1, %%mm1                \n\t"
1529
                "movd %%mm0, (%4, %%"REG_a")        \n\t"
1530
                "movd %%mm1, (%3, %%"REG_a")        \n\t"
1531
                "add $4, %%"REG_a"                \n\t"
1532
                " js 1b                                \n\t"
1533
                : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1534
                : "%"REG_a
1535
        );
1536
#else
1537
        int i;
1538
        for(i=0; i<width; i++)
1539
        {
1540
                dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1541
                dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1542
        }
1543
#endif
1544
}

//this is almost identical to the previous one, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
1550
        asm volatile(
1551
                "mov %0, %%"REG_a"                \n\t"
1552
                "1:                                \n\t"
1553
                "movq (%1, %%"REG_a",2), %%mm0        \n\t"
1554
                "movq 8(%1, %%"REG_a",2), %%mm1        \n\t"
1555
                "psrlw $8, %%mm0                \n\t"
1556
                "psrlw $8, %%mm1                \n\t"
1557
                "packuswb %%mm1, %%mm0                \n\t"
1558
                "movq %%mm0, (%2, %%"REG_a")        \n\t"
1559
                "add $8, %%"REG_a"                \n\t"
1560
                " js 1b                                \n\t"
1561
                : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
1562
                : "%"REG_a
1563
        );
1564
#else
1565
        int i;
1566
        for(i=0; i<width; i++)
1567
                dst[i]= src[2*i+1];
1568
#endif
1569
}
1570

    
1571
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1572
{
1573
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1574
        asm volatile(
1575
                "movq "MANGLE(bm01010101)", %%mm4\n\t"
1576
                "mov %0, %%"REG_a"                \n\t"
1577
                "1:                                \n\t"
1578
                "movq (%1, %%"REG_a",4), %%mm0        \n\t"
1579
                "movq 8(%1, %%"REG_a",4), %%mm1        \n\t"
1580
                "movq (%2, %%"REG_a",4), %%mm2        \n\t"
1581
                "movq 8(%2, %%"REG_a",4), %%mm3        \n\t"
1582
                PAVGB(%%mm2, %%mm0)
1583
                PAVGB(%%mm3, %%mm1)
1584
                "pand %%mm4, %%mm0                \n\t"
1585
                "pand %%mm4, %%mm1                \n\t"
1586
                "packuswb %%mm1, %%mm0                \n\t"
1587
                "movq %%mm0, %%mm1                \n\t"
1588
                "psrlw $8, %%mm0                \n\t"
1589
                "pand %%mm4, %%mm1                \n\t"
1590
                "packuswb %%mm0, %%mm0                \n\t"
1591
                "packuswb %%mm1, %%mm1                \n\t"
1592
                "movd %%mm0, (%4, %%"REG_a")        \n\t"
1593
                "movd %%mm1, (%3, %%"REG_a")        \n\t"
1594
                "add $4, %%"REG_a"                \n\t"
1595
                " js 1b                                \n\t"
1596
                : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1597
                : "%"REG_a
1598
        );
1599
#else
1600
        int i;
1601
        for(i=0; i<width; i++)
1602
        {
1603
                dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
1604
                dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
1605
        }
1606
#endif
1607
}
1608

    
1609
static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1610
{
1611
#ifdef HAVE_MMXFIXME
1612
#else
1613
        int i;
1614
        for(i=0; i<width; i++)
1615
        {
1616
                int b=  ((uint32_t*)src)[i]&0xFF;
1617
                int g= (((uint32_t*)src)[i]>>8)&0xFF;
1618
                int r= (((uint32_t*)src)[i]>>16)&0xFF;
1619

    
1620
                dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1621
        }
1622
#endif
1623
}
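/* Note on the "+ (33<<(RGB2YUV_SHIFT-1))" term used by the *ToY conversions here: it is
   16.5 scaled to the fixed-point domain (16.5*(1<<RGB2YUV_SHIFT) == 33<<(RGB2YUV_SHIFT-1)),
   i.e. it presumably folds the +16 black-level offset of the output Y range together with
   +0.5 for rounding before the final shift. */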

static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#ifdef HAVE_MMXFIXME
#else
        int i;
        for(i=0; i<width; i++)
        {
                const int a= ((uint32_t*)src1)[2*i+0];
                const int e= ((uint32_t*)src1)[2*i+1];
                const int c= ((uint32_t*)src2)[2*i+0];
                const int d= ((uint32_t*)src2)[2*i+1];
                const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
                const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
                const int b=  l&0x3FF;
                const int g=  h>>8;
                const int r=  l>>16;

                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
        }
#endif
}
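/* Note on bgr32ToUV above: each chroma output is averaged from a 2x2 block of pixels.
   The 0xFF00FF mask lets B and R accumulate in one 32bit word (the unused 8bit gap between
   the two fields absorbs the carries of summing four bytes), 0x00FF00 accumulates G in
   another; the extra +2 in the shift divides the four-pixel sums back down. */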
1647

    
1648
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1649
{
1650
#ifdef HAVE_MMX
1651
        asm volatile(
1652
                "mov %2, %%"REG_a"                \n\t"
1653
                "movq "MANGLE(bgr2YCoeff)", %%mm6                \n\t"
1654
                "movq "MANGLE(w1111)", %%mm5                \n\t"
1655
                "pxor %%mm7, %%mm7                \n\t"
1656
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
1657
                ".balign 16                        \n\t"
1658
                "1:                                \n\t"
1659
                PREFETCH" 64(%0, %%"REG_b")        \n\t"
1660
                "movd (%0, %%"REG_b"), %%mm0        \n\t"
1661
                "movd 3(%0, %%"REG_b"), %%mm1        \n\t"
1662
                "punpcklbw %%mm7, %%mm0                \n\t"
1663
                "punpcklbw %%mm7, %%mm1                \n\t"
1664
                "movd 6(%0, %%"REG_b"), %%mm2        \n\t"
1665
                "movd 9(%0, %%"REG_b"), %%mm3        \n\t"
1666
                "punpcklbw %%mm7, %%mm2                \n\t"
1667
                "punpcklbw %%mm7, %%mm3                \n\t"
1668
                "pmaddwd %%mm6, %%mm0                \n\t"
1669
                "pmaddwd %%mm6, %%mm1                \n\t"
1670
                "pmaddwd %%mm6, %%mm2                \n\t"
1671
                "pmaddwd %%mm6, %%mm3                \n\t"
1672
#ifndef FAST_BGR2YV12
1673
                "psrad $8, %%mm0                \n\t"
1674
                "psrad $8, %%mm1                \n\t"
1675
                "psrad $8, %%mm2                \n\t"
1676
                "psrad $8, %%mm3                \n\t"
1677
#endif
1678
                "packssdw %%mm1, %%mm0                \n\t"
1679
                "packssdw %%mm3, %%mm2                \n\t"
1680
                "pmaddwd %%mm5, %%mm0                \n\t"
1681
                "pmaddwd %%mm5, %%mm2                \n\t"
1682
                "packssdw %%mm2, %%mm0                \n\t"
1683
                "psraw $7, %%mm0                \n\t"
1684

    
1685
                "movd 12(%0, %%"REG_b"), %%mm4        \n\t"
1686
                "movd 15(%0, %%"REG_b"), %%mm1        \n\t"
1687
                "punpcklbw %%mm7, %%mm4                \n\t"
1688
                "punpcklbw %%mm7, %%mm1                \n\t"
1689
                "movd 18(%0, %%"REG_b"), %%mm2        \n\t"
1690
                "movd 21(%0, %%"REG_b"), %%mm3        \n\t"
1691
                "punpcklbw %%mm7, %%mm2                \n\t"
1692
                "punpcklbw %%mm7, %%mm3                \n\t"
1693
                "pmaddwd %%mm6, %%mm4                \n\t"
1694
                "pmaddwd %%mm6, %%mm1                \n\t"
1695
                "pmaddwd %%mm6, %%mm2                \n\t"
1696
                "pmaddwd %%mm6, %%mm3                \n\t"
1697
#ifndef FAST_BGR2YV12
1698
                "psrad $8, %%mm4                \n\t"
1699
                "psrad $8, %%mm1                \n\t"
1700
                "psrad $8, %%mm2                \n\t"
1701
                "psrad $8, %%mm3                \n\t"
1702
#endif
1703
                "packssdw %%mm1, %%mm4                \n\t"
1704
                "packssdw %%mm3, %%mm2                \n\t"
1705
                "pmaddwd %%mm5, %%mm4                \n\t"
1706
                "pmaddwd %%mm5, %%mm2                \n\t"
1707
                "add $24, %%"REG_b"                \n\t"
1708
                "packssdw %%mm2, %%mm4                \n\t"
1709
                "psraw $7, %%mm4                \n\t"
1710

    
1711
                "packuswb %%mm4, %%mm0                \n\t"
1712
                "paddusb "MANGLE(bgr2YOffset)", %%mm0        \n\t"
1713

    
1714
                "movq %%mm0, (%1, %%"REG_a")        \n\t"
1715
                "add $8, %%"REG_a"                \n\t"
1716
                " js 1b                                \n\t"
1717
                : : "r" (src+width*3), "r" (dst+width), "g" ((long)-width)
1718
                : "%"REG_a, "%"REG_b
1719
        );
1720
#else
1721
        int i;
1722
        for(i=0; i<width; i++)
1723
        {
1724
                int b= src[i*3+0];
1725
                int g= src[i*3+1];
1726
                int r= src[i*3+2];
1727

    
1728
                dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1729
        }
1730
#endif
1731
}
1732

    
1733
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1734
{
1735
#ifdef HAVE_MMX
1736
        asm volatile(
1737
                "mov %4, %%"REG_a"                \n\t"
1738
                "movq "MANGLE(w1111)", %%mm5                \n\t"
1739
                "movq "MANGLE(bgr2UCoeff)", %%mm6                \n\t"
1740
                "pxor %%mm7, %%mm7                \n\t"
1741
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"        \n\t"
1742
                "add %%"REG_b", %%"REG_b"        \n\t"
1743
                ".balign 16                        \n\t"
1744
                "1:                                \n\t"
1745
                PREFETCH" 64(%0, %%"REG_b")        \n\t"
1746
                PREFETCH" 64(%1, %%"REG_b")        \n\t"
1747
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1748
                "movq (%0, %%"REG_b"), %%mm0        \n\t"
1749
                "movq (%1, %%"REG_b"), %%mm1        \n\t"
1750
                "movq 6(%0, %%"REG_b"), %%mm2        \n\t"
1751
                "movq 6(%1, %%"REG_b"), %%mm3        \n\t"
1752
                PAVGB(%%mm1, %%mm0)
1753
                PAVGB(%%mm3, %%mm2)
1754
                "movq %%mm0, %%mm1                \n\t"
1755
                "movq %%mm2, %%mm3                \n\t"
1756
                "psrlq $24, %%mm0                \n\t"
1757
                "psrlq $24, %%mm2                \n\t"
1758
                PAVGB(%%mm1, %%mm0)
1759
                PAVGB(%%mm3, %%mm2)
1760
                "punpcklbw %%mm7, %%mm0                \n\t"
1761
                "punpcklbw %%mm7, %%mm2                \n\t"
1762
#else
1763
                "movd (%0, %%"REG_b"), %%mm0        \n\t"
1764
                "movd (%1, %%"REG_b"), %%mm1        \n\t"
1765
                "movd 3(%0, %%"REG_b"), %%mm2        \n\t"
1766
                "movd 3(%1, %%"REG_b"), %%mm3        \n\t"
1767
                "punpcklbw %%mm7, %%mm0                \n\t"
1768
                "punpcklbw %%mm7, %%mm1                \n\t"
1769
                "punpcklbw %%mm7, %%mm2                \n\t"
1770
                "punpcklbw %%mm7, %%mm3                \n\t"
1771
                "paddw %%mm1, %%mm0                \n\t"
1772
                "paddw %%mm3, %%mm2                \n\t"
1773
                "paddw %%mm2, %%mm0                \n\t"
1774
                "movd 6(%0, %%"REG_b"), %%mm4        \n\t"
1775
                "movd 6(%1, %%"REG_b"), %%mm1        \n\t"
1776
                "movd 9(%0, %%"REG_b"), %%mm2        \n\t"
1777
                "movd 9(%1, %%"REG_b"), %%mm3        \n\t"
1778
                "punpcklbw %%mm7, %%mm4                \n\t"
1779
                "punpcklbw %%mm7, %%mm1                \n\t"
1780
                "punpcklbw %%mm7, %%mm2                \n\t"
1781
                "punpcklbw %%mm7, %%mm3                \n\t"
1782
                "paddw %%mm1, %%mm4                \n\t"
1783
                "paddw %%mm3, %%mm2                \n\t"
1784
                "paddw %%mm4, %%mm2                \n\t"
1785
                "psrlw $2, %%mm0                \n\t"
1786
                "psrlw $2, %%mm2                \n\t"
1787
#endif
1788
                "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
1789
                "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
1790
                
1791
                "pmaddwd %%mm0, %%mm1                \n\t"
1792
                "pmaddwd %%mm2, %%mm3                \n\t"
1793
                "pmaddwd %%mm6, %%mm0                \n\t"
1794
                "pmaddwd %%mm6, %%mm2                \n\t"
1795
#ifndef FAST_BGR2YV12
1796
                "psrad $8, %%mm0                \n\t"
1797
                "psrad $8, %%mm1                \n\t"
1798
                "psrad $8, %%mm2                \n\t"
1799
                "psrad $8, %%mm3                \n\t"
1800
#endif
1801
                "packssdw %%mm2, %%mm0                \n\t"
1802
                "packssdw %%mm3, %%mm1                \n\t"
1803
                "pmaddwd %%mm5, %%mm0                \n\t"
1804
                "pmaddwd %%mm5, %%mm1                \n\t"
1805
                "packssdw %%mm1, %%mm0                \n\t" // V1 V0 U1 U0
1806
                "psraw $7, %%mm0                \n\t"
1807

    
1808
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1809
                "movq 12(%0, %%"REG_b"), %%mm4        \n\t"
1810
                "movq 12(%1, %%"REG_b"), %%mm1        \n\t"
1811
                "movq 18(%0, %%"REG_b"), %%mm2        \n\t"
1812
                "movq 18(%1, %%"REG_b"), %%mm3        \n\t"
1813
                PAVGB(%%mm1, %%mm4)
1814
                PAVGB(%%mm3, %%mm2)
1815
                "movq %%mm4, %%mm1                \n\t"
1816
                "movq %%mm2, %%mm3                \n\t"
1817
                "psrlq $24, %%mm4                \n\t"
1818
                "psrlq $24, %%mm2                \n\t"
1819
                PAVGB(%%mm1, %%mm4)
1820
                PAVGB(%%mm3, %%mm2)
1821
                "punpcklbw %%mm7, %%mm4                \n\t"
1822
                "punpcklbw %%mm7, %%mm2                \n\t"
1823
#else
1824
                "movd 12(%0, %%"REG_b"), %%mm4        \n\t"
1825
                "movd 12(%1, %%"REG_b"), %%mm1        \n\t"
1826
                "movd 15(%0, %%"REG_b"), %%mm2        \n\t"
1827
                "movd 15(%1, %%"REG_b"), %%mm3        \n\t"
1828
                "punpcklbw %%mm7, %%mm4                \n\t"
1829
                "punpcklbw %%mm7, %%mm1                \n\t"
1830
                "punpcklbw %%mm7, %%mm2                \n\t"
1831
                "punpcklbw %%mm7, %%mm3                \n\t"
1832
                "paddw %%mm1, %%mm4                \n\t"
1833
                "paddw %%mm3, %%mm2                \n\t"
1834
                "paddw %%mm2, %%mm4                \n\t"
1835
                "movd 18(%0, %%"REG_b"), %%mm5        \n\t"
1836
                "movd 18(%1, %%"REG_b"), %%mm1        \n\t"
1837
                "movd 21(%0, %%"REG_b"), %%mm2        \n\t"
1838
                "movd 21(%1, %%"REG_b"), %%mm3        \n\t"
1839
                "punpcklbw %%mm7, %%mm5                \n\t"
1840
                "punpcklbw %%mm7, %%mm1                \n\t"
1841
                "punpcklbw %%mm7, %%mm2                \n\t"
1842
                "punpcklbw %%mm7, %%mm3                \n\t"
1843
                "paddw %%mm1, %%mm5                \n\t"
1844
                "paddw %%mm3, %%mm2                \n\t"
1845
                "paddw %%mm5, %%mm2                \n\t"
1846
                "movq "MANGLE(w1111)", %%mm5                \n\t"
1847
                "psrlw $2, %%mm4                \n\t"
1848
                "psrlw $2, %%mm2                \n\t"
1849
#endif
1850
                "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
1851
                "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
1852
                
1853
                "pmaddwd %%mm4, %%mm1                \n\t"
1854
                "pmaddwd %%mm2, %%mm3                \n\t"
1855
                "pmaddwd %%mm6, %%mm4                \n\t"
1856
                "pmaddwd %%mm6, %%mm2                \n\t"
1857
#ifndef FAST_BGR2YV12
1858
                "psrad $8, %%mm4                \n\t"
1859
                "psrad $8, %%mm1                \n\t"
1860
                "psrad $8, %%mm2                \n\t"
1861
                "psrad $8, %%mm3                \n\t"
1862
#endif
1863
                "packssdw %%mm2, %%mm4                \n\t"
1864
                "packssdw %%mm3, %%mm1                \n\t"
1865
                "pmaddwd %%mm5, %%mm4                \n\t"
1866
                "pmaddwd %%mm5, %%mm1                \n\t"
1867
                "add $24, %%"REG_b"                \n\t"
1868
                "packssdw %%mm1, %%mm4                \n\t" // V3 V2 U3 U2
1869
                "psraw $7, %%mm4                \n\t"
1870
                
1871
                "movq %%mm0, %%mm1                \n\t"
1872
                "punpckldq %%mm4, %%mm0                \n\t"
1873
                "punpckhdq %%mm4, %%mm1                \n\t"
1874
                "packsswb %%mm1, %%mm0                \n\t"
1875
                "paddb "MANGLE(bgr2UVOffset)", %%mm0        \n\t"
1876

    
1877
                "movd %%mm0, (%2, %%"REG_a")        \n\t"
1878
                "punpckhdq %%mm0, %%mm0                \n\t"
1879
                "movd %%mm0, (%3, %%"REG_a")        \n\t"
1880
                "add $4, %%"REG_a"                \n\t"
1881
                " js 1b                                \n\t"
1882
                : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" ((long)-width)
1883
                : "%"REG_a, "%"REG_b
1884
        );
1885
#else
1886
        int i;
1887
        for(i=0; i<width; i++)
1888
        {
1889
                int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1890
                int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1891
                int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1892

    
1893
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1894
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1895
        }
1896
#endif
1897
}

static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
{
        int i;
        for(i=0; i<width; i++)
        {
                int d= ((uint16_t*)src)[i];
                int b= d&0x1F;
                int g= (d>>5)&0x3F;
                int r= (d>>11)&0x1F;

                dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
        }
}
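/* Note on bgr16ToY above: the 5bit B/R and 6bit G fields are used as-is; doubling the B/R
   coefficients and reducing the shift by 2 is roughly the same as expanding the channels to
   8bit first, since 2*RY*r5>>(RGB2YUV_SHIFT-2) == 8*RY*r5>>RGB2YUV_SHIFT and r8 ~= r5<<3
   (likewise g8 ~= g6<<2). */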

static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
        int i;
        for(i=0; i<width; i++)
        {
                int d0= ((uint32_t*)src1)[i];
                int d1= ((uint32_t*)src2)[i];

                int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
                int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);

                int dh2= (dh>>11) + (dh<<21);
                int d= dh2 + dl;

                int b= d&0x7F;
                int r= (d>>11)&0x7F;
                int g= d>>21;
                dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
                dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
        }
}
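/* Note on bgr16ToUV above: each uint32_t load covers two horizontally adjacent RGB565
   pixels; dl/dh/dh2 shuffle their channels (plus the matching pair from the second line)
   into three disjoint bit fields, so the 2x2 sums of B, R and G drop out with one mask or
   shift each (d&0x7F, (d>>11)&0x7F, d>>21).  A hypothetical, unoptimized equivalent of the
   per-pixel body would be:

        int p00= ((uint16_t*)src1)[2*i  ], p01= ((uint16_t*)src1)[2*i+1];
        int p10= ((uint16_t*)src2)[2*i  ], p11= ((uint16_t*)src2)[2*i+1];
        int b= ( p00     &0x1F) + ( p01     &0x1F) + ( p10     &0x1F) + ( p11     &0x1F);
        int g= ((p00>> 5)&0x3F) + ((p01>> 5)&0x3F) + ((p10>> 5)&0x3F) + ((p11>> 5)&0x3F);
        int r= ((p00>>11)&0x1F) + ((p01>>11)&0x1F) + ((p10>>11)&0x1F) + ((p11>>11)&0x1F);

   followed by the same dstU/dstV lines. */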
1934

    
1935
static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1936
{
1937
        int i;
1938
        for(i=0; i<width; i++)
1939
        {
1940
                int d= ((uint16_t*)src)[i];
1941
                int b= d&0x1F;
1942
                int g= (d>>5)&0x1F;
1943
                int r= (d>>10)&0x1F;
1944

    
1945
                dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1946
        }
1947
}
1948

    
1949
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1950
{
1951
        int i;
1952
        for(i=0; i<width; i++)
1953
        {
1954
                int d0= ((uint32_t*)src1)[i];
1955
                int d1= ((uint32_t*)src2)[i];
1956
                
1957
                int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1958
                int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1959

    
1960
                int dh2= (dh>>11) + (dh<<21);
1961
                int d= dh2 + dl;
1962

    
1963
                int b= d&0x7F;
1964
                int r= (d>>10)&0x7F;
1965
                int g= d>>21;
1966
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1967
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1968
        }
1969
}
1970

    
1971

    
1972
static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1973
{
1974
        int i;
1975
        for(i=0; i<width; i++)
1976
        {
1977
                int r=  ((uint32_t*)src)[i]&0xFF;
1978
                int g= (((uint32_t*)src)[i]>>8)&0xFF;
1979
                int b= (((uint32_t*)src)[i]>>16)&0xFF;
1980

    
1981
                dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1982
        }
1983
}
1984

    
1985
static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1986
{
1987
        int i;
1988
        for(i=0; i<width; i++)
1989
        {
1990
                const int a= ((uint32_t*)src1)[2*i+0];
1991
                const int e= ((uint32_t*)src1)[2*i+1];
1992
                const int c= ((uint32_t*)src2)[2*i+0];
1993
                const int d= ((uint32_t*)src2)[2*i+1];
1994
                const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1995
                const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1996
                const int r=  l&0x3FF;
1997
                const int g=  h>>8;
1998
                const int b=  l>>16;
1999

    
2000
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2001
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2002
        }
2003
}
2004

    
2005
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2006
{
2007
        int i;
2008
        for(i=0; i<width; i++)
2009
        {
2010
                int r= src[i*3+0];
2011
                int g= src[i*3+1];
2012
                int b= src[i*3+2];
2013

    
2014
                dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2015
        }
2016
}
2017

    
2018
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2019
{
2020
        int i;
2021
        for(i=0; i<width; i++)
2022
        {
2023
                int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2024
                int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2025
                int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2026

    
2027
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2028
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2029
        }
2030
}
2031

    
2032

    
2033
// Bilinear / Bicubic scaling
2034
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2035
                                  int16_t *filter, int16_t *filterPos, int filterSize)
2036
{
2037
#ifdef HAVE_MMX
2038
        assert(filterSize % 4 == 0 && filterSize>0);
2039
        if(filterSize==4) // allways true for upscaling, sometimes for down too
2040
        {
2041
                long counter= -2*dstW;
2042
                filter-= counter*2;
2043
                filterPos-= counter/2;
2044
                dst-= counter/2;
2045
                asm volatile(
                        "pxor %%mm7, %%mm7                \n\t"
                        "movq "MANGLE(w02)", %%mm6        \n\t"
                        "push %%"REG_BP"                \n\t" // we use 7 regs here ...
                        "mov %%"REG_a", %%"REG_BP"        \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        "movzxw (%2, %%"REG_BP"), %%"REG_a"\n\t"
                        "movzxw 2(%2, %%"REG_BP"), %%"REG_b"\n\t"
                        "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
                        "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
                        "movd (%3, %%"REG_a"), %%mm0        \n\t"
                        "movd (%3, %%"REG_b"), %%mm2        \n\t"
                        "punpcklbw %%mm7, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "pmaddwd %%mm1, %%mm0                \n\t"
                        "pmaddwd %%mm2, %%mm3                \n\t"
                        "psrad $8, %%mm0                \n\t"
                        "psrad $8, %%mm3                \n\t"
                        "packssdw %%mm3, %%mm0                \n\t"
                        "pmaddwd %%mm6, %%mm0                \n\t"
                        "packssdw %%mm0, %%mm0                \n\t"
                        "movd %%mm0, (%4, %%"REG_BP")        \n\t"
                        "add $4, %%"REG_BP"                \n\t"
                        " jnc 1b                        \n\t"

                        "pop %%"REG_BP"                        \n\t"
                        : "+a" (counter)
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
                        : "%"REG_b
                );
        }
        else if(filterSize==8)
        {
                long counter= -2*dstW;
                filter-= counter*4;
                filterPos-= counter/2;
                dst-= counter/2;
                asm volatile(
                        "pxor %%mm7, %%mm7                \n\t"
                        "movq "MANGLE(w02)", %%mm6        \n\t"
                        "push %%"REG_BP"                \n\t" // we use 7 regs here ...
                        "mov %%"REG_a", %%"REG_BP"        \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        "movzxw (%2, %%"REG_BP"), %%"REG_a"\n\t"
                        "movzxw 2(%2, %%"REG_BP"), %%"REG_b"\n\t"
                        "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
                        "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
                        "movd (%3, %%"REG_a"), %%mm0        \n\t"
                        "movd (%3, %%"REG_b"), %%mm2        \n\t"
                        "punpcklbw %%mm7, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "pmaddwd %%mm1, %%mm0                \n\t"
                        "pmaddwd %%mm2, %%mm3                \n\t"

                        "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
                        "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
                        "movd 4(%3, %%"REG_a"), %%mm4        \n\t"
                        "movd 4(%3, %%"REG_b"), %%mm2        \n\t"
                        "punpcklbw %%mm7, %%mm4                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "pmaddwd %%mm1, %%mm4                \n\t"
                        "pmaddwd %%mm2, %%mm5                \n\t"
                        "paddd %%mm4, %%mm0                \n\t"
                        "paddd %%mm5, %%mm3                \n\t"

                        "psrad $8, %%mm0                \n\t"
                        "psrad $8, %%mm3                \n\t"
                        "packssdw %%mm3, %%mm0                \n\t"
                        "pmaddwd %%mm6, %%mm0                \n\t"
                        "packssdw %%mm0, %%mm0                \n\t"
                        "movd %%mm0, (%4, %%"REG_BP")        \n\t"
                        "add $4, %%"REG_BP"                \n\t"
                        " jnc 1b                        \n\t"

                        "pop %%"REG_BP"                        \n\t"
                        : "+a" (counter)
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
                        : "%"REG_b
                );
        }
        else
        {
                long counter= -2*dstW;
//                filter-= counter*filterSize/2;
                filterPos-= counter/2;
                dst-= counter/2;
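                /* generic filterSize: the outer loop (label 1) produces two output
                   pixels per iteration, the inner loop (label 2) accumulates their
                   taps four at a time in mm4/mm5; %4 = src+filterSize only serves
                   as the end marker for the tap counter kept in REG_c. */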
                asm volatile(
                        "pxor %%mm7, %%mm7                \n\t"
                        "movq "MANGLE(w02)", %%mm6        \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        "mov %2, %%"REG_c"                \n\t"
                        "movzxw (%%"REG_c", %0), %%"REG_a"\n\t"
                        "movzxw 2(%%"REG_c", %0), %%"REG_b"\n\t"
                        "mov %5, %%"REG_c"                \n\t"
                        "pxor %%mm4, %%mm4                \n\t"
                        "pxor %%mm5, %%mm5                \n\t"
                        "2:                                \n\t"
                        "movq (%1), %%mm1                \n\t"
                        "movq (%1, %6), %%mm3                \n\t"
                        "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
                        "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
                        "punpcklbw %%mm7, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "pmaddwd %%mm1, %%mm0                \n\t"
                        "pmaddwd %%mm2, %%mm3                \n\t"
                        "paddd %%mm3, %%mm5                \n\t"
                        "paddd %%mm0, %%mm4                \n\t"
                        "add $8, %1                        \n\t"
                        "add $4, %%"REG_c"                \n\t"
                        "cmp %4, %%"REG_c"                \n\t"
                        " jb 2b                                \n\t"
                        "add %6, %1                        \n\t"
                        "psrad $8, %%mm4                \n\t"
                        "psrad $8, %%mm5                \n\t"
                        "packssdw %%mm5, %%mm4                \n\t"
                        "pmaddwd %%mm6, %%mm4                \n\t"
                        "packssdw %%mm4, %%mm4                \n\t"
                        "mov %3, %%"REG_a"                \n\t"
                        "movd %%mm4, (%%"REG_a", %0)        \n\t"
                        "add $4, %0                        \n\t"
                        " jnc 1b                        \n\t"

                        : "+r" (counter), "+r" (filter)
                        : "m" (filterPos), "m" (dst), "m"(src+filterSize),
                          "m" (src), "r" ((long)filterSize*2)
                        : "%"REG_b, "%"REG_a, "%"REG_c
                );
        }
#else
#ifdef HAVE_ALTIVEC
        hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
        int i;
        for(i=0; i<dstW; i++)
        {
                int j;
                int srcPos= filterPos[i];
                int val=0;
//                printf("filterPos: %d\n", filterPos[i]);
                for(j=0; j<filterSize; j++)
                {
//                        printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
                        val += ((int)src[srcPos + j])*filter[filterSize*i + j];
                }
//                filter += hFilterSize;
                dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
//                dst[i] = val>>7;
        }
#endif
#endif
}
      // *** horizontal scale Y line to temp buffer
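/* Scale one luma line into the 15 bit temp buffer. Packed YUV and RGB/BGR
   sources are first converted to a plain 8 bit Y line in formatConvBuffer;
   then either the exact hScale() filter above is applied, or, for
   SWS_FAST_BILINEAR, a cheap bilinear path (plain x86 asm, or the MMX2 code
   block pointed to by funnyYCode when canMMX2BeUsed). */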
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos)
{
    if(srcFormat==IMGFMT_YUY2)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }

#ifdef HAVE_MMX
        // use the new MMX scaler if the mmx2 one can't be used (it's faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
            RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#if defined(ARCH_X86) || defined(ARCH_X86_64)
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
                asm volatile(
                        "pxor %%mm7, %%mm7                \n\t"
                        "mov %0, %%"REG_c"                \n\t"
                        "mov %1, %%"REG_D"                \n\t"
                        "mov %2, %%"REG_d"                \n\t"
                        "mov %3, %%"REG_b"                \n\t"
                        "xor %%"REG_a", %%"REG_a"        \n\t" // i
                        PREFETCH" (%%"REG_c")                \n\t"
                        PREFETCH" 32(%%"REG_c")                \n\t"
                        PREFETCH" 64(%%"REG_c")                \n\t"

#define FUNNY_Y_CODE \
                        "mov (%%"REG_b"), %%"REG_S"        \n\t"\
                        "call *%4                        \n\t"\
                        "addl (%%"REG_b", %%"REG_a"), %%ecx\n\t"\
                        "add %%"REG_a", %%"REG_d"        \n\t"\
                        "xor %%"REG_a", %%"REG_a"        \n\t"\

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

                        :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
                        "m" (funnyYCode)
                        : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
                );
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif
        //NO MMX just normal asm ...
        asm volatile(
                "xor %%"REG_a", %%"REG_a"        \n\t" // i
                "xor %%"REG_b", %%"REG_b"        \n\t" // xx
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
                ".balign 16                        \n\t"
                "1:                                \n\t"
                "movzbl  (%0, %%"REG_b"), %%edi        \n\t" //src[xx]
                "movzbl 1(%0, %%"REG_b"), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "mov %1, %%"REG_D"                \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
                "adc %3, %%"REG_b"                \n\t" //xx+= xInc>>16 + carry

                "movzbl (%0, %%"REG_b"), %%edi        \n\t" //src[xx]
                "movzbl 1(%0, %%"REG_b"), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "mov %1, %%"REG_D"                \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
                "adc %3, %%"REG_b"                \n\t" //xx+= xInc>>16 + carry


                "add $2, %%"REG_a"                \n\t"
                "cmp %2, %%"REG_a"                \n\t"
                " jb 1b                                \n\t"


                :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
                : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        int i;
        unsigned int xpos=0;
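        /* C reference for the fast bilinear path: xpos is 16.16 fixed point,
           xx the integer sample position and xalpha the 7 bit fraction, so
           dst[i] = src[xx]*(128-xalpha) + src[xx+1]*xalpha in 15 bit precision. */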
        for(i=0;i<dstWidth;i++)
        {
                register unsigned int xx=xpos>>16;
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
                dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
                xpos+=xInc;
        }
#endif
    }
}

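/* Chroma counterpart of hyscale(): scales one U line (src1) into dst and one
   V line (src2) into dst+2048, converting packed YUV / RGB / BGR sources
   first; grayscale sources have no chroma, so they return immediately. */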
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos)
{
    if(srcFormat==IMGFMT_YUY2)
    {
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
        RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
        RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
        RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
        RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(isGray(srcFormat))
    {
            return;
    }

#ifdef HAVE_MMX
        // use the new MMX scaler if the mmx2 one can't be used (it's faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
            RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
            RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#if defined(ARCH_X86) || defined(ARCH_X86_64)
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
                asm volatile(
                        "pxor %%mm7, %%mm7                \n\t"
                        "mov %0, %%"REG_c"                \n\t"
                        "mov %1, %%"REG_D"                \n\t"
                        "mov %2, %%"REG_d"                \n\t"
                        "mov %3, %%"REG_b"                \n\t"
                        "xor %%"REG_a", %%"REG_a"        \n\t" // i
                        PREFETCH" (%%"REG_c")                \n\t"
                        PREFETCH" 32(%%"REG_c")                \n\t"
                        PREFETCH" 64(%%"REG_c")                \n\t"

#define FUNNY_UV_CODE \
                        "movl (%%"REG_b"), %%esi        \n\t"\
                        "call *%4                        \n\t"\
                        "addl (%%"REG_b", %%"REG_a"), %%ecx\n\t"\
                        "add %%"REG_a", %%"REG_D"        \n\t"\
                        "xor %%"REG_a", %%"REG_a"        \n\t"\

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
                        "xor %%"REG_a", %%"REG_a"        \n\t" // i
                        "mov %5, %%"REG_c"                \n\t" // src
                        "mov %1, %%"REG_D"                \n\t" // buf1
                        "add $4096, %%"REG_D"                \n\t"
                        PREFETCH" (%%"REG_c")                \n\t"
                        PREFETCH" 32(%%"REG_c")                \n\t"
                        PREFETCH" 64(%%"REG_c")                \n\t"

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE

                        :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
                        "m" (funnyUVCode), "m" (src2)
                        : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%esi", "%"REG_D
                );
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
                {
//                        printf("%d %d %d\n", dstWidth, i, srcW);
                        dst[i] = src1[srcW-1]*128;
                        dst[i+2048] = src2[srcW-1]*128;
                }
        }
        else
        {
#endif
        asm volatile(
                "xor %%"REG_a", %%"REG_a"        \n\t" // i
                "xor %%"REG_b", %%"REG_b"                \n\t" // xx
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
                ".balign 16                        \n\t"
                "1:                                \n\t"
                "mov %0, %%"REG_S"                \n\t"
                "movzbl  (%%"REG_S", %%"REG_b"), %%edi        \n\t" //src[xx]
                "movzbl 1(%%"REG_S", %%"REG_b"), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "mov %1, %%"REG_D"                \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"

                "movzbl  (%5, %%"REG_b"), %%edi        \n\t" //src[xx]
                "movzbl 1(%5, %%"REG_b"), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "mov %1, %%"REG_D"                \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"

                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
                "adc %3, %%"REG_b"                \n\t" //xx+= xInc>>16 + carry
                "add $1, %%"REG_a"                \n\t"
                "cmp %2, %%"REG_a"                \n\t"
                " jb 1b                                \n\t"

                :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" ((long)(xInc>>16)), "m" ((xInc&0xFFFF)),
                "r" (src2)
                : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        int i;
        unsigned int xpos=0;
        for(i=0;i<dstWidth;i++)
        {
                register unsigned int xx=xpos>>16;
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
                dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
                dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* slower
          dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
          dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
                xpos+=xInc;
        }
#endif
   }
}

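/* Main scaling entry point: consumes a horizontal slice of srcSliceH source
   lines starting at srcSliceY. The source lines needed for each output line
   are horizontally scaled into the lumPixBuf/chrPixBuf ring buffers, then the
   completed output lines are vertically scaled and converted to the
   destination format. Returns the number of output lines written. */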
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){

        /* load a few things into local vars to make the code more readable? and faster */
        const int srcW= c->srcW;
        const int dstW= c->dstW;
        const int dstH= c->dstH;
        const int chrDstW= c->chrDstW;
        const int chrSrcW= c->chrSrcW;
        const int lumXInc= c->lumXInc;
        const int chrXInc= c->chrXInc;
        const int dstFormat= c->dstFormat;
        const int srcFormat= c->srcFormat;
        const int flags= c->flags;
        const int canMMX2BeUsed= c->canMMX2BeUsed;
        int16_t *vLumFilterPos= c->vLumFilterPos;
        int16_t *vChrFilterPos= c->vChrFilterPos;
        int16_t *hLumFilterPos= c->hLumFilterPos;
        int16_t *hChrFilterPos= c->hChrFilterPos;
        int16_t *vLumFilter= c->vLumFilter;
        int16_t *vChrFilter= c->vChrFilter;
        int16_t *hLumFilter= c->hLumFilter;
        int16_t *hChrFilter= c->hChrFilter;
        int32_t *lumMmxFilter= c->lumMmxFilter;
        int32_t *chrMmxFilter= c->chrMmxFilter;
        const int vLumFilterSize= c->vLumFilterSize;
        const int vChrFilterSize= c->vChrFilterSize;
        const int hLumFilterSize= c->hLumFilterSize;
        const int hChrFilterSize= c->hChrFilterSize;
        int16_t **lumPixBuf= c->lumPixBuf;
        int16_t **chrPixBuf= c->chrPixBuf;
        const int vLumBufSize= c->vLumBufSize;
        const int vChrBufSize= c->vChrBufSize;
        uint8_t *funnyYCode= c->funnyYCode;
        uint8_t *funnyUVCode= c->funnyUVCode;
        uint8_t *formatConvBuffer= c->formatConvBuffer;
        const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
        const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
        int lastDstY;

        /* vars which will change and which we need to store back in the context */
        int dstY= c->dstY;
        int lumBufIndex= c->lumBufIndex;
        int chrBufIndex= c->chrBufIndex;
        int lastInLumBuf= c->lastInLumBuf;
        int lastInChrBuf= c->lastInChrBuf;

        if(isPacked(c->srcFormat)){
                src[0]=
                src[1]=
                src[2]= src[0];
                srcStride[0]=
                srcStride[1]=
                srcStride[2]= srcStride[0];
        }
        srcStride[1]<<= c->vChrDrop;
        srcStride[2]<<= c->vChrDrop;

//        printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
//                (int)dst[0], (int)dst[1], (int)dst[2]);

#if 0 //self test FIXME move to a vfilter or something
{
static volatile int i=0;
i++;
if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
        selfTest(src, srcStride, c->srcW, c->srcH);
i--;
}
#endif

//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
//dstStride[0],dstStride[1],dstStride[2]);

        if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
        {
                static int firstTime=1; //FIXME move this into the context perhaps
                if(flags & SWS_PRINT_INFO && firstTime)
                {
                        MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
                                        "SwScaler:          ->cannot do aligned memory accesses anymore\n");
                        firstTime=0;
                }
        }

        /* Note: the user might start scaling the picture in the middle, so this will not get executed.
           This is not really intended but works currently, so people might do it. */
        if(srcSliceY ==0){
                lumBufIndex=0;
                chrBufIndex=0;
                dstY=0;
                lastInLumBuf= -1;
                lastInChrBuf= -1;
        }

        lastDstY= dstY;

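        /* one iteration per output line: find which source lines dstY needs
           (vLumFilterPos/vChrFilterPos), horizontally scale the ones not yet in
           the ring buffers, then run the vertical scaler; if the current slice
           does not contain enough lines, buffer what is available and return */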
        for(;dstY < dstH; dstY++){
                unsigned char *dest =dst[0]+dstStride[0]*dstY;
                const int chrDstY= dstY>>c->chrDstVSubSample;
                unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
                unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;

                const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
                const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
                const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
                const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

//printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
// dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
                //handle holes (FAST_BILINEAR & weird filters)
                if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
                if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
                ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
                ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)

                // Do we have enough lines in this slice to output the dstY line
                if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
                {
                        //Do horizontal scaling
                        while(lastInLumBuf < lastLumSrcY)
                        {
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                                lumBufIndex++;
//                                printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
                                ASSERT(lumBufIndex < 2*vLumBufSize)
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
//                                printf("%d %d\n", lumBufIndex, vLumBufSize);
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                                funnyYCode, c->srcFormat, formatConvBuffer,
                                                c->lumMmx2Filter, c->lumMmx2FilterPos);
                                lastInLumBuf++;
                        }
                        while(lastInChrBuf < lastChrSrcY)
                        {
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                                chrBufIndex++;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
                                //FIXME replace parameters through context struct (some at least)

                                if(!(isGray(srcFormat) || isGray(dstFormat)))
                                        RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                                funnyUVCode, c->srcFormat, formatConvBuffer,
                                                c->chrMmx2Filter, c->chrMmx2FilterPos);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
                }
                else // not enough lines left in this slice -> load the rest in the buffer
                {
/*                printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
                        firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
                        lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
                        vChrBufSize, vLumBufSize);*/

                        //Do horizontal scaling
                        while(lastInLumBuf+1 < srcSliceY + srcSliceH)
                        {
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                                lumBufIndex++;
                                ASSERT(lumBufIndex < 2*vLumBufSize)
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                                funnyYCode, c->srcFormat, formatConvBuffer,
                                                c->lumMmx2Filter, c->lumMmx2FilterPos);
                                lastInLumBuf++;
                        }
                        while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
                        {
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                                chrBufIndex++;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)

                                if(!(isGray(srcFormat) || isGray(dstFormat)))
                                        RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                                funnyUVCode, c->srcFormat, formatConvBuffer,
                                                c->chrMmx2Filter, c->chrMmx2FilterPos);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
                        break; //we can't output a dstY line so let's try with the next slice
                }

#ifdef HAVE_MMX
                b5Dither= dither8[dstY&1];
                g6Dither= dither4[dstY&1];
                g5Dither= dither8[dstY&1];
                r5Dither= dither8[(dstY+1)&1];
#endif
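            /* All but the last two output lines may use the (possibly MMX) vertical
               scalers; the remaining lines take the C fallbacks, which, as noted on
               the else branch below, avoids overwriting the tail of an array. */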
            if(dstY < dstH-2)
            {
                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#ifdef HAVE_MMX
                int i;
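                /* pack the per-tap data for the MMX vertical scaler: slot 4*i+0 holds
                   the source-line pointer, slots 4*i+2 and 4*i+3 the 16 bit coefficient
                   replicated into both halves of a 32 bit word (hence the *0x10001) */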
                for(i=0; i<vLumFilterSize; i++)
                {
                        lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                        lumMmxFilter[4*i+2]=
                        lumMmxFilter[4*i+3]=
                                ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                }
                for(i=0; i<vChrFilterSize; i++)
                {
                        chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                        chrMmxFilter[4*i+2]=
                        chrMmxFilter[4*i+3]=
                                ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
#endif
                if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
                {
                        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                        if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                        if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
                        {
                                int16_t *lumBuf = lumPixBuf[0];
                                int16_t *chrBuf= chrPixBuf[0];
                                RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
                        }
                        else //General YV12
                        {
                                RENAME(yuv2yuvX)(c,
                                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                        dest, uDest, vDest, dstW, chrDstW);
                        }
                }
                else
                {
                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                        if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
                        {
                                int chrAlpha= vChrFilter[2*dstY+1];
                                RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                                 dest, dstW, chrAlpha, dstFormat, flags, dstY);
                        }
                        else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
                        {
                                int lumAlpha= vLumFilter[2*dstY+1];
                                int chrAlpha= vChrFilter[2*dstY+1];
                                RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                                 dest, dstW, lumAlpha, chrAlpha, dstY);
                        }
                        else //General RGB
                        {
                                RENAME(yuv2packedX)(c,
                                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                        dest, dstW, dstY);
                        }
                }
            }
            else // hmm looks like we can't use MMX here without overwriting this array's tail
            {
                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
                if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
                {
                        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                        if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                        yuv2yuvXinC(
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                dest, uDest, vDest, dstW, chrDstW);
                }
                else
                {
                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                        yuv2packedXinC(c,
                                vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                dest, dstW, dstY);
                }
            }
        }

#ifdef HAVE_MMX
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* store changed local vars back in the context */
        c->dstY= dstY;
        c->lumBufIndex= lumBufIndex;
        c->chrBufIndex= chrBufIndex;
        c->lastInLumBuf= lastInLumBuf;
        c->lastInChrBuf= lastInChrBuf;

        return dstY - lastDstY;
}