/*
 * ffmpeg / postproc / swscale_template.c @ revision 6d606c4f
 *
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/* This template is #included multiple times with different HAVE_* CPU-feature
 * settings; undefine everything it redefines so each inclusion starts clean. */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

/* Instruction used to leave MMX state. */
#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Cache-prefetch instructions per CPU family; "/nop" emits an assembler
 * no-op line when no prefetch instruction is available. */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

/* Store fence after non-temporal stores (movntq); no-op without MMX2. */
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif

/* Packed byte average. Deliberately left undefined when neither MMX2 nor
 * 3DNow! is available — code paths using PAVGB are only compiled for those. */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* Quadword store: non-temporal (cache-bypassing) movntq on MMX2, plain movq
 * otherwise. The MOVNTQ wrapper adds an expansion level so arguments that are
 * themselves macros get expanded before stringization. */
#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)

/* Pull in the AltiVec implementations when building the PPC variant. */
#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif

/* Vertical multi-tap scale to planar 8-bit: walks a NULL-terminated list of
 * (srcLine, coeff) pairs at `offset`(%0), accumulates pmulhw products onto the
 * rounder from VROUNDER_OFFSET(%0), then >>3 and packs to unsigned bytes.
 * `x` is a byte offset into each source line. %0 = context, %1 = dest,
 * %2 = end index; 8 output pixels per outer iteration. */
#define YSCALEYUV2YV12X(x, offset) \
                        "xor %%"REG_a", %%"REG_a"        \n\t"\
                        "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
                        "movq %%mm3, %%mm4                \n\t"\
                        "lea " offset "(%0), %%"REG_d"        \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                        ".balign 16                        \n\t" /* FIXME Unroll? */\
                        "1:                                \n\t"\
                        "movq 8(%%"REG_d"), %%mm0        \n\t" /* filterCoeff */\
                        "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
                        "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
                        "add $16, %%"REG_d"                \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                        "test %%"REG_S", %%"REG_S"        \n\t"\
                        "pmulhw %%mm0, %%mm2                \n\t"\
                        "pmulhw %%mm0, %%mm5                \n\t"\
                        "paddw %%mm2, %%mm3                \n\t"\
                        "paddw %%mm5, %%mm4                \n\t"\
                        " jnz 1b                        \n\t"\
                        "psraw $3, %%mm3                \n\t"\
                        "psraw $3, %%mm4                \n\t"\
                        "packuswb %%mm4, %%mm3                \n\t"\
                        MOVNTQ(%%mm3, (%1, %%REGa))\
                        "add $8, %%"REG_a"                \n\t"\
                        "cmp %2, %%"REG_a"                \n\t"\
                        "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
                        "movq %%mm3, %%mm4                \n\t"\
                        "lea " offset "(%0), %%"REG_d"        \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                        "jb 1b                                \n\t"

/* 1-tap vertical "scale": just >>7 each 16-bit sample and pack to bytes.
 * %0 = src (16-bit), %1 = dst (8-bit), %2 loaded as start index — the loop
 * runs until the add carries (jnc), so %2 is presumably the negated width;
 * TODO confirm against callers. */
#define YSCALEYUV2YV121 \
                        "mov %2, %%"REG_a"                \n\t"\
                        ".balign 16                        \n\t" /* FIXME Unroll? */\
                        "1:                                \n\t"\
                        "movq (%0, %%"REG_a", 2), %%mm0        \n\t"\
                        "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
                        "psraw $7, %%mm0                \n\t"\
                        "psraw $7, %%mm1                \n\t"\
                        "packuswb %%mm1, %%mm0                \n\t"\
                        MOVNTQ(%%mm0, (%1, %%REGa))\
                        "add $8, %%"REG_a"                \n\t"\
                        "jnc 1b                                \n\t"

/*
                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                           "r" (dest), "m" (dstW),
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
/* Vertical multi-tap scale for packed output: first loop accumulates the
 * chroma filter taps (U in mm3, V in mm4; V lines sit 4096 bytes after U),
 * second loop accumulates the luma taps (Y1 in mm1, Y2 in mm7). Both inner
 * loops reuse local label "2:" — "2b" always binds to the nearest one.
 * %0 = context holding filter lists and the rounder. */
#define YSCALEYUV2PACKEDX \
                "xor %%"REG_a", %%"REG_a"        \n\t"\
                ".balign 16                        \n\t"\
                "nop                                \n\t"\
                "1:                                \n\t"\
                "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
                "movq %%mm3, %%mm4                \n\t"\
                ".balign 16                        \n\t"\
                "2:                                \n\t"\
                "movq 8(%%"REG_d"), %%mm0        \n\t" /* filterCoeff */\
                "movq (%%"REG_S", %%"REG_a"), %%mm2        \n\t" /* UsrcData */\
                "movq 4096(%%"REG_S", %%"REG_a"), %%mm5        \n\t" /* VsrcData */\
                "add $16, %%"REG_d"                \n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "pmulhw %%mm0, %%mm2                \n\t"\
                "pmulhw %%mm0, %%mm5                \n\t"\
                "paddw %%mm2, %%mm3                \n\t"\
                "paddw %%mm5, %%mm4                \n\t"\
                "test %%"REG_S", %%"REG_S"        \n\t"\
                " jnz 2b                        \n\t"\
\
                "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
                "movq %%mm1, %%mm7                \n\t"\
                ".balign 16                        \n\t"\
                "2:                                \n\t"\
                "movq 8(%%"REG_d"), %%mm0        \n\t" /* filterCoeff */\
                "movq (%%"REG_S", %%"REG_a", 2), %%mm2        \n\t" /* Y1srcData */\
                "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5        \n\t" /* Y2srcData */\
                "add $16, %%"REG_d"                \n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "pmulhw %%mm0, %%mm2                \n\t"\
                "pmulhw %%mm0, %%mm5                \n\t"\
                "paddw %%mm2, %%mm1                \n\t"\
                "paddw %%mm5, %%mm7                \n\t"\
                "test %%"REG_S", %%"REG_S"        \n\t"\
                " jnz 2b                        \n\t"

/* Multi-tap vertical scale + YUV->RGB conversion: runs YSCALEYUV2PACKEDX,
 * then applies the per-context offsets/coefficients from (%0) and interleaves.
 * Leaves packed bytes in mm2=B, mm4=G, mm5=R and clears mm7 for the
 * WRITE* macros. */
#define YSCALEYUV2RGBX \
                YSCALEYUV2PACKEDX\
                "psubw "U_OFFSET"(%0), %%mm3        \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"(%0), %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"(%0), %%mm3        \n\t"\
                "pmulhw "VG_COEFF"(%0), %%mm4        \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "pmulhw "UB_COEFF"(%0), %%mm2        \n\t"\
                "pmulhw "VR_COEFF"(%0), %%mm5        \n\t"\
                "psubw "Y_OFFSET"(%0), %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"(%0), %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"(%0), %%mm1        \n\t"\
                "pmulhw "Y_COEFF"(%0), %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"
#if 0
/* Disabled legacy full-chroma-resolution YUV->RGB path; kept for reference.
 * Uses global MANGLE()d constant tables instead of the per-context offsets. */
#define FULL_YSCALEYUV2RGB \
                "pxor %%mm7, %%mm7                \n\t"\
                "movd %6, %%mm6                        \n\t" /*yalpha1*/\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "movd %7, %%mm5                        \n\t" /*uvalpha1*/\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "xor %%"REG_a", %%"REG_a"                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%0, %%"REG_a", 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, %%"REG_a", 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq (%2, %%"REG_a",2), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%"REG_a",2), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "pmulhw %%mm6, %%mm0                \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw %%mm5, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "movq 4096(%2, %%"REG_a",2), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "movq 4096(%3, %%"REG_a",2), %%mm0        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "psubw %%mm0, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "psubw "MANGLE(w80)", %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "MANGLE(w400)", %%mm3        \n\t" /* 8(U-128)*/\
                "pmulhw "MANGLE(yCoeff)", %%mm1        \n\t"\
\
\
                "pmulhw %%mm5, %%mm4                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
                "psraw $4, %%mm0                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
                "paddw %%mm4, %%mm0                \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw "MANGLE(w400)", %%mm0        \n\t" /* (V-128)8*/\
\
\
                "movq %%mm0, %%mm4                \n\t" /* (V-128)8*/\
                "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
                "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
                "paddw %%mm1, %%mm3                \n\t" /* B*/\
                "paddw %%mm1, %%mm0                \n\t" /* R*/\
                "packuswb %%mm3, %%mm3                \n\t"\
\
                "packuswb %%mm0, %%mm0                \n\t"\
                "paddw %%mm4, %%mm2                \n\t"\
                "paddw %%mm2, %%mm1                \n\t" /* G*/\
\
                "packuswb %%mm1, %%mm1                \n\t"
#endif

/* 2-tap vertical interpolation for packed YUV output. First pre-shifts the
 * stored chroma/luma coefficients by >>3 (written back into the context), then
 * per 8 pixels: blend uvbuf0/uvbuf1 into mm3/mm4 (U/V) and buf0/buf1 into
 * mm1/mm7 (Y1/Y2). index = loop register, c = context pointer. */
#define REAL_YSCALEYUV2PACKED(index, c) \
                "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
                "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
                "psraw $3, %%mm0                \n\t"\
                "psraw $3, %%mm1                \n\t"\
                "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
                "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
                "xor "#index", "#index"                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "psubw %%mm4, %%mm5                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
                "pmulhw %%mm0, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "pmulhw %%mm0, %%mm5                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "psraw $7, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $7, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "movq (%0, "#index", 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, "#index", 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq 8(%0, "#index", 2), %%mm6        \n\t" /*buf0[eax]*/\
                "movq 8(%1, "#index", 2), %%mm7        \n\t" /*buf1[eax]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm7, %%mm6                \n\t" /* buf0[eax] - buf1[eax]*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "psraw $7, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $7, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "paddw %%mm6, %%mm7                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/

/* Extra expansion level so macro arguments get expanded before stringization. */
#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)

/* 2-tap vertical interpolation + YUV->RGB: blends two chroma and two luma
 * lines with the coefficients stored in context c, applies c's offset/
 * coefficient tables, and leaves packed bytes in mm2=B, mm4=G, mm5=R with
 * mm7 cleared, ready for the WRITE* macros. */
#define REAL_YSCALEYUV2RGB(index, c) \
                "xor "#index", "#index"        \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "psubw %%mm4, %%mm5                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
                "pmulhw %%mm0, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "pmulhw %%mm0, %%mm5                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw "U_OFFSET"("#c"), %%mm3        \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"("#c"), %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
                "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, "#index", 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, "#index", 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
                "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm7, %%mm6                \n\t" /* buf0[eax] - buf1[eax]*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "paddw %%mm6, %%mm7                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
                "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
                "psubw "Y_OFFSET"("#c"), %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"("#c"), %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"("#c"), %%mm1        \n\t"\
                "pmulhw "Y_COEFF"("#c"), %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"
/* Extra expansion level so macro arguments get expanded before stringization. */
#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)

/* 1-tap variant for packed YUV output: only uvbuf0/buf0 are read, each sample
 * just >>7. c is unused here but kept so all YSCALEYUV2* variants share one
 * call shape. */
#define REAL_YSCALEYUV2PACKED1(index, c) \
                "xor "#index", "#index"                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm3        \n\t" /* uvbuf0[eax]*/\
                "movq 4096(%2, "#index"), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $7, %%mm3                \n\t" \
                "psraw $7, %%mm4                \n\t" \
                "movq (%0, "#index", 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7        \n\t" /*buf0[eax]*/\
                "psraw $7, %%mm1                \n\t" \
                "psraw $7, %%mm7                \n\t"

/* Extra expansion level so macro arguments get expanded before stringization. */
#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)

/* 1-tap vertical + YUV->RGB: reads only uvbuf0/buf0 (no blend with the second
 * line), then the same color conversion as REAL_YSCALEYUV2RGB. Output:
 * mm2=B, mm4=G, mm5=R packed bytes, mm7 cleared. */
#define REAL_YSCALEYUV2RGB1(index, c) \
                "xor "#index", "#index"        \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm3        \n\t" /* uvbuf0[eax]*/\
                "movq 4096(%2, "#index"), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "psubw "U_OFFSET"("#c"), %%mm3        \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"("#c"), %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
                "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, "#index", 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7        \n\t" /*buf0[eax]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
                "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
                "psubw "Y_OFFSET"("#c"), %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"("#c"), %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"("#c"), %%mm1        \n\t"\
                "pmulhw "Y_COEFF"("#c"), %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"
/* Extra expansion level so macro arguments get expanded before stringization. */
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)

/* 1-tap luma, averaged chroma for packed output: chroma is the mean of
 * uvbuf0 and uvbuf1 ((a+b)>>8 after the add), luma from buf0 only (>>7).
 * c is unused; kept for a uniform call shape. */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
                "xor "#index", "#index"                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
                "psrlw $8, %%mm3                \n\t" \
                "psrlw $8, %%mm4                \n\t" \
                "movq (%0, "#index", 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7        \n\t" /*buf0[eax]*/\
                "psraw $7, %%mm1                \n\t" \
                "psraw $7, %%mm7                \n\t"
/* Extra expansion level so macro arguments get expanded before stringization. */
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
/* 1-tap luma + averaged-chroma YUV->RGB: chroma is (uvbuf0+uvbuf1)>>5
 * (sum then shift — see the overflow FIXMEs), luma from buf0 only; then the
 * same color conversion as the other RGB variants. Output: mm2=B, mm4=G,
 * mm5=R packed bytes, mm7 cleared. */
#define REAL_YSCALEYUV2RGB1b(index, c) \
                "xor "#index", "#index"                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
                "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
                "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
                "psubw "U_OFFSET"("#c"), %%mm3        \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"("#c"), %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
                "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, "#index", 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7        \n\t" /*buf0[eax]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
                "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
                "psubw "Y_OFFSET"("#c"), %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"("#c"), %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"("#c"), %%mm1        \n\t"\
                "pmulhw "Y_COEFF"("#c"), %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"
/* Extra expansion level so macro arguments get expanded before stringization. */
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)

493
/* Pack 8 pixels into 32bpp BGR0 and store them.
 * Input: %%mm2 = 8 B bytes, %%mm4 = 8 G bytes, %%mm5 = 8 R bytes, %%mm7 = 0.
 * Stores 32 bytes at (dst + index*4), then advances "index" by 8 and jumps
 * back to the asm-local label "1:" (defined by the caller) while index < dstw. */
#define REAL_WRITEBGR32(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        MOVNTQ(%%mm0, (dst, index, 4))\
                        MOVNTQ(%%mm2, 8(dst, index, 4))\
                        MOVNTQ(%%mm1, 16(dst, index, 4))\
                        MOVNTQ(%%mm3, 24(dst, index, 4))\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
/* Pack 8 pixels into 16bpp RGB565 and store them.
 * Input: %%mm2 = B, %%mm4 = G, %%mm5 = R bytes, %%mm7 = 0.
 * Masks each channel to its top bits (5/6/5), merges two 4-pixel groups and
 * stores 16 bytes at (dst + index*2); loops back to "1:" while index < dstw. */
#define REAL_WRITEBGR16(dst, dstw, index) \
                        "pand "MANGLE(bF8)", %%mm2        \n\t" /* B */\
                        "pand "MANGLE(bFC)", %%mm4        \n\t" /* G */\
                        "pand "MANGLE(bF8)", %%mm5        \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
\
                        "movq %%mm2, %%mm1                \n\t"\
                        "movq %%mm4, %%mm3                \n\t"\
\
                        "punpcklbw %%mm7, %%mm3                \n\t"\
                        "punpcklbw %%mm5, %%mm2                \n\t"\
                        "punpckhbw %%mm7, %%mm4                \n\t"\
                        "punpckhbw %%mm5, %%mm1                \n\t"\
\
                        "psllq $3, %%mm3                \n\t"\
                        "psllq $3, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2                \n\t"\
                        "por %%mm4, %%mm1                \n\t"\
\
                        MOVNTQ(%%mm2, (dst, index, 2))\
                        MOVNTQ(%%mm1, 8(dst, index, 2))\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"
#define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)
/* Pack 8 pixels into 15bpp RGB555 and store them.
 * Same flow as WRITEBGR16, but all channels keep 5 bits (note the extra
 * "psrlq $1, %%mm5" to drop R to 5 bits and the $2 shifts for G/R placement). */
#define REAL_WRITEBGR15(dst, dstw, index) \
                        "pand "MANGLE(bF8)", %%mm2        \n\t" /* B */\
                        "pand "MANGLE(bF8)", %%mm4        \n\t" /* G */\
                        "pand "MANGLE(bF8)", %%mm5        \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
                        "psrlq $1, %%mm5                \n\t"\
\
                        "movq %%mm2, %%mm1                \n\t"\
                        "movq %%mm4, %%mm3                \n\t"\
\
                        "punpcklbw %%mm7, %%mm3                \n\t"\
                        "punpcklbw %%mm5, %%mm2                \n\t"\
                        "punpckhbw %%mm7, %%mm4                \n\t"\
                        "punpckhbw %%mm5, %%mm1                \n\t"\
\
                        "psllq $2, %%mm3                \n\t"\
                        "psllq $2, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2                \n\t"\
                        "por %%mm4, %%mm1                \n\t"\
\
                        MOVNTQ(%%mm2, (dst, index, 2))\
                        MOVNTQ(%%mm1, 8(dst, index, 2))\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"
#define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)
/* Pack 8 pixels into 24bpp BGR and store 24 bytes (legacy variant, kept for
 * reference; WRITEBGR24 below selects the MMX/MMX2 replacements).
 * Input: %%mm2 = B, %%mm4 = G, %%mm5 = R bytes, %%mm7 = 0.
 * First expands to four 0RGB quads, then shifts/masks the padding byte out to
 * produce three packed quadwords. Advances "dst" by 24 and "index" by 8,
 * looping back to "1:" while index < dstw. */
#define WRITEBGR24OLD(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4                \n\t" /* 0RGB0RGB 0 */\
                        "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
                        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
                        "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
                        "por %%mm4, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm4                \n\t" /* 0RGB0RGB 1 */\
                        "psllq $48, %%mm2                \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
\
                        "movq %%mm4, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "psrld $16, %%mm4                \n\t" /* 000R000R 1 */\
                        "psrlq $24, %%mm2                \n\t" /* 0000RGB0 1.5 */\
                        "por %%mm4, %%mm2                \n\t" /* 000RRGBR 1 */\
                        "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm4                \n\t" /* 0RGB0RGB 2 */\
                        "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
                        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
                        "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
                        "por %%mm4, %%mm1                \n\t" /* 00RGBRGB 2 */\
                        "movq %%mm1, %%mm4                \n\t" /* 00RGBRGB 2 */\
                        "psllq $32, %%mm1                \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm2                \n\t" /* BRGBRGBR 1 */\
\
                        "psrlq $32, %%mm4                \n\t" /* 000000RG 2.5 */\
                        "movq %%mm3, %%mm5                \n\t" /* 0RGB0RGB 3 */\
                        "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
                        "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
                        "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
                        "por %%mm5, %%mm3                \n\t" /* 00RGBRGB 3 */\
                        "psllq $16, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm4, %%mm3                \n\t" /* RGBRGBRG 2.5 */\
\
                        MOVNTQ(%%mm0, (dst))\
                        MOVNTQ(%%mm2, 8(dst))\
                        MOVNTQ(%%mm3, 16(dst))\
                        "add $24, "#dst"                \n\t"\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"
/* Pack 8 pixels into 24bpp BGR and store 24 bytes (plain-MMX variant).
 * Input: %%mm2 = B, %%mm4 = G, %%mm5 = R bytes, %%mm7 = 0.
 * Expands to four 0RGB quads, shifts each into 0RGBRGB0 form via punpckhdq,
 * then splices the three output quadwords together. Advances "dst" by 24 and
 * "index" by 8, looping back to "1:" while index < dstw. */
#define WRITEBGR24MMX(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4                \n\t" /* 0RGB0RGB 0 */\
                        "movq %%mm2, %%mm6                \n\t" /* 0RGB0RGB 1 */\
                        "movq %%mm1, %%mm5                \n\t" /* 0RGB0RGB 2 */\
                        "movq %%mm3, %%mm7                \n\t" /* 0RGB0RGB 3 */\
\
                        "psllq $40, %%mm0                \n\t" /* RGB00000 0 */\
                        "psllq $40, %%mm2                \n\t" /* RGB00000 1 */\
                        "psllq $40, %%mm1                \n\t" /* RGB00000 2 */\
                        "psllq $40, %%mm3                \n\t" /* RGB00000 3 */\
\
                        "punpckhdq %%mm4, %%mm0                \n\t" /* 0RGBRGB0 0 */\
                        "punpckhdq %%mm6, %%mm2                \n\t" /* 0RGBRGB0 1 */\
                        "punpckhdq %%mm5, %%mm1                \n\t" /* 0RGBRGB0 2 */\
                        "punpckhdq %%mm7, %%mm3                \n\t" /* 0RGBRGB0 3 */\
\
                        "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm6                \n\t" /* 0RGBRGB0 1 */\
                        "psllq $40, %%mm2                \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
                        MOVNTQ(%%mm0, (dst))\
\
                        "psrlq $24, %%mm6                \n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm5                \n\t" /* 0RGBRGB0 2 */\
                        "psllq $24, %%mm1                \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm6                \n\t" /* BRGBRGBR 1 */\
                        MOVNTQ(%%mm6, 8(dst))\
\
                        "psrlq $40, %%mm5                \n\t" /* 000000RG 2 */\
                        "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm3, %%mm5                \n\t" /* RGBRGBRG 2 */\
                        MOVNTQ(%%mm5, 16(dst))\
\
                        "add $24, "#dst"                \n\t"\
\
                        "add $8, "#index"                        \n\t"\
                        "cmp "#dstw", "#index"                        \n\t"\
                        " jb 1b                                \n\t"
/* Pack 8 pixels into 24bpp BGR and store 24 bytes (MMX2 variant).
 * Input: %%mm2 = B, %%mm4 = G, %%mm5 = R bytes, %%mm7 = 0.
 * Uses pshufw to replicate channel bytes into place and the M24A/M24B/M24C
 * byte masks (defined elsewhere in this file) to select them, building the
 * three output quadwords directly. Advances "dst" by 24 and "index" by 8,
 * looping back to "1:" while index < dstw. Note: clobbers %%mm7 (no longer 0). */
#define WRITEBGR24MMX2(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq "MANGLE(M24A)", %%mm0        \n\t"\
                        "movq "MANGLE(M24C)", %%mm7        \n\t"\
                        "pshufw $0x50, %%mm2, %%mm1        \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
                        "pshufw $0x50, %%mm4, %%mm3        \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
                        "pshufw $0x00, %%mm5, %%mm6        \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
                        "pand %%mm0, %%mm1                \n\t" /*    B2        B1       B0 */\
                        "pand %%mm0, %%mm3                \n\t" /*    G2        G1       G0 */\
                        "pand %%mm7, %%mm6                \n\t" /*       R1        R0       */\
\
                        "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
                        "por %%mm1, %%mm6                \n\t"\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, (dst))\
\
                        "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
                        "pshufw $0xA5, %%mm2, %%mm1        \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
                        "pshufw $0x55, %%mm4, %%mm3        \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
                        "pshufw $0xA5, %%mm5, %%mm6        \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
                        "pand "MANGLE(M24B)", %%mm1        \n\t" /* B5       B4        B3    */\
                        "pand %%mm7, %%mm3                \n\t" /*       G4        G3       */\
                        "pand %%mm0, %%mm6                \n\t" /*    R4        R3       R2 */\
\
                        "por %%mm1, %%mm3                \n\t" /* B5    G4 B4     G3 B3    */\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, 8(dst))\
\
                        "pshufw $0xFF, %%mm2, %%mm1        \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
                        "pshufw $0xFA, %%mm4, %%mm3        \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
                        "pshufw $0xFA, %%mm5, %%mm6        \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
                        "pand %%mm7, %%mm1                \n\t" /*       B7        B6       */\
                        "pand %%mm0, %%mm3                \n\t" /*    G7        G6       G5 */\
                        "pand "MANGLE(M24B)", %%mm6        \n\t" /* R7       R6        R5    */\
\
                        "por %%mm1, %%mm3                \n\t"\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, 16(dst))\
\
                        "add $24, "#dst"                \n\t"\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"
/* Select the best 24bpp writer for the target CPU: the pshufw-based MMX2
 * variant when available, otherwise the plain-MMX variant. */
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
/* Interleave 8 luma and 4+4 chroma samples into packed YUY2 (Y U Y V) and
 * store 16 bytes at (dst + index*2).
 * Input: %%mm1/%%mm7 = Y words (low/high), %%mm3 = U words, %%mm4 = V words.
 * Loops back to "1:" while index < dstw. */
#define REAL_WRITEYUY2(dst, dstw, index) \
                        "packuswb %%mm3, %%mm3                \n\t"\
                        "packuswb %%mm4, %%mm4                \n\t"\
                        "packuswb %%mm7, %%mm1                \n\t"\
                        "punpcklbw %%mm4, %%mm3                \n\t"\
                        "movq %%mm1, %%mm7                \n\t"\
                        "punpcklbw %%mm3, %%mm1                \n\t"\
                        "punpckhbw %%mm3, %%mm7                \n\t"\
\
                        MOVNTQ(%%mm1, (dst, index, 2))\
                        MOVNTQ(%%mm7, 8(dst, index, 2))\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
/**
 * Vertical multi-tap scale to planar YV12.
 * Applies lumFilter over lumFilterSize luma source lines into dest, and
 * chrFilter over chrFilterSize chroma source lines into uDest/vDest
 * (chroma is skipped when uDest is NULL).
 * MMX path: the YSCALEYUV2YV12X asm macro reads the filter data relative to
 * &c->redDither (CHR_/LUM_MMX_FILTER_OFFSET); the 4096 offset selects the
 * V plane's source within the chroma buffer. Non-MMX falls back to AltiVec
 * or plain C.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
#ifdef HAVE_MMX
        if(uDest != NULL)
        {
                /* U plane */
                asm volatile(
                                YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
                                :: "r" (&c->redDither),
                                "r" (uDest), "m" ((long)chrDstW)
                                : "%"REG_a, "%"REG_d, "%"REG_S
                        );

                /* V plane: chroma source for V lives 4096 bytes into chrSrc lines */
                asm volatile(
                                YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
                                :: "r" (&c->redDither),
                                "r" (vDest), "m" ((long)chrDstW)
                                : "%"REG_a, "%"REG_d, "%"REG_S
                        );
        }

        /* Y plane */
        asm volatile(
                        YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
                        :: "r" (&c->redDither),
                           "r" (dest), "m" ((long)dstW)
                        : "%"REG_a, "%"REG_d, "%"REG_S
                );
#else
#ifdef HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                      chrFilter, chrSrc, chrFilterSize,
                      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif
}
/**
 * 1:1 vertical scale (single source line) to planar YV12.
 * Converts 15-bit intermediate samples (values << 7) back to 8-bit with
 * saturation. chrSrc holds U at offset 0 and V at offset 2048 (int16 units).
 * Chroma is skipped when uDest is NULL. The MMX path (YSCALEYUV2YV121)
 * processes from the buffer ends using a negative counter.
 */
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
#ifdef HAVE_MMX
        if(uDest != NULL)
        {
                asm volatile(
                                YSCALEYUV2YV121
                                :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
                                "g" ((long)-chrDstW)
                                : "%"REG_a
                        );

                asm volatile(
                                YSCALEYUV2YV121
                                :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
                                "g" ((long)-chrDstW)
                                : "%"REG_a
                        );
        }

        asm volatile(
                YSCALEYUV2YV121
                :: "r" (lumSrc + dstW), "r" (dest + dstW),
                "g" ((long)-dstW)
                : "%"REG_a
        );
#else
        int i;
        for(i=0; i<dstW; i++)
        {
                int val= lumSrc[i]>>7;

                /* val&256 is a cheap "outside 0..255" test; the sign then
                   picks the clamp direction */
                if(val&256){
                        if(val<0) val=0;
                        else      val=255;
                }

                dest[i]= val;
        }

        if(uDest != NULL)
                for(i=0; i<chrDstW; i++)
                {
                        int u=chrSrc[i]>>7;
                        int v=chrSrc[i + 2048]>>7;   /* V plane is 2048 samples in */

                        /* clamp both chroma samples only when one is out of range */
                        if((u|v)&256){
                                if(u<0)         u=0;
                                else if (u>255) u=255;
                                if(v<0)         v=0;
                                else if (v>255) v=255;
                        }

                        uDest[i]= u;
                        vDest[i]= v;
                }
#endif
}
/**
861
 * vertical scale YV12 to RGB
862
 */
863
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
864
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
865
                            uint8_t *dest, int dstW, int dstY)
866
{
867
        int dummy=0;
868
        switch(c->dstFormat)
869
        {
870
#ifdef HAVE_MMX
871
        case IMGFMT_BGR32:
872
                {
873
                        asm volatile(
874
                                YSCALEYUV2RGBX
875
                                WRITEBGR32(%4, %5, %%REGa)
876

    
877
                        :: "r" (&c->redDither), 
878
                           "m" (dummy), "m" (dummy), "m" (dummy),
879
                           "r" (dest), "m" (dstW)
880
                        : "%"REG_a, "%"REG_d, "%"REG_S
881
                        );
882
                }
883
                break;
884
        case IMGFMT_BGR24:
885
                {
886
                        asm volatile(
887
                                YSCALEYUV2RGBX
888
                                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
889
                                "add %4, %%"REG_b"                        \n\t"
890
                                WRITEBGR24(%%REGb, %5, %%REGa)
891

    
892
                        :: "r" (&c->redDither), 
893
                           "m" (dummy), "m" (dummy), "m" (dummy),
894
                           "r" (dest), "m" (dstW)
895
                        : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
896
                        );
897
                }
898
                break;
899
        case IMGFMT_BGR15:
900
                {
901
                        asm volatile(
902
                                YSCALEYUV2RGBX
903
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
904
#ifdef DITHER1XBPP
905
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
906
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
907
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
908
#endif
909

    
910
                                WRITEBGR15(%4, %5, %%REGa)
911

    
912
                        :: "r" (&c->redDither), 
913
                           "m" (dummy), "m" (dummy), "m" (dummy),
914
                           "r" (dest), "m" (dstW)
915
                        : "%"REG_a, "%"REG_d, "%"REG_S
916
                        );
917
                }
918
                break;
919
        case IMGFMT_BGR16:
920
                {
921
                        asm volatile(
922
                                YSCALEYUV2RGBX
923
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
924
#ifdef DITHER1XBPP
925
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
926
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
927
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
928
#endif
929

    
930
                                WRITEBGR16(%4, %5, %%REGa)
931

    
932
                        :: "r" (&c->redDither), 
933
                           "m" (dummy), "m" (dummy), "m" (dummy),
934
                           "r" (dest), "m" (dstW)
935
                        : "%"REG_a, "%"REG_d, "%"REG_S
936
                        );
937
                }
938
                break;
939
        case IMGFMT_YUY2:
940
                {
941
                        asm volatile(
942
                                YSCALEYUV2PACKEDX
943
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
944

    
945
                                "psraw $3, %%mm3                \n\t"
946
                                "psraw $3, %%mm4                \n\t"
947
                                "psraw $3, %%mm1                \n\t"
948
                                "psraw $3, %%mm7                \n\t"
949
                                WRITEYUY2(%4, %5, %%REGa)
950

    
951
                        :: "r" (&c->redDither), 
952
                           "m" (dummy), "m" (dummy), "m" (dummy),
953
                           "r" (dest), "m" (dstW)
954
                        : "%"REG_a, "%"REG_d, "%"REG_S
955
                        );
956
                }
957
                break;
958
#endif
959
        default:
960
#ifdef HAVE_ALTIVEC
961
                altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
962
                            chrFilter, chrSrc, chrFilterSize,
963
                            dest, dstW, dstY);
964
#else
965
                yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
966
                            chrFilter, chrSrc, chrFilterSize,
967
                            dest, dstW, dstY);
968
#endif
969
                break;
970
        }
971
}
972

    
973
/**
974
 * vertical bilinear scale YV12 to RGB
975
 */
976
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
977
                            uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
978
{
979
        int yalpha1=yalpha^4095;
980
        int uvalpha1=uvalpha^4095;
981
        int i;
982

    
983
#if 0 //isn't used
984
        if(flags&SWS_FULL_CHR_H_INT)
985
        {
986
                switch(dstFormat)
987
                {
988
#ifdef HAVE_MMX
989
                case IMGFMT_BGR32:
990
                        asm volatile(
991

992

993
FULL_YSCALEYUV2RGB
994
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
995
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0
996

997
                        "movq %%mm3, %%mm1                \n\t"
998
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
999
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0
1000

1001
                        MOVNTQ(%%mm3, (%4, %%REGa, 4))
1002
                        MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1003

1004
                        "add $4, %%"REG_a"                \n\t"
1005
                        "cmp %5, %%"REG_a"                \n\t"
1006
                        " jb 1b                                \n\t"
1007

1008

1009
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1010
                        "m" (yalpha1), "m" (uvalpha1)
1011
                        : "%"REG_a
1012
                        );
1013
                        break;
1014
                case IMGFMT_BGR24:
1015
                        asm volatile(
1016

1017
FULL_YSCALEYUV2RGB
1018

1019
                                                                // lsb ... msb
1020
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
1021
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0
1022

1023
                        "movq %%mm3, %%mm1                \n\t"
1024
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
1025
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0
1026

1027
                        "movq %%mm3, %%mm2                \n\t" // BGR0BGR0
1028
                        "psrlq $8, %%mm3                \n\t" // GR0BGR00
1029
                        "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1030
                        "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1031
                        "por %%mm2, %%mm3                \n\t" // BGRBGR00
1032
                        "movq %%mm1, %%mm2                \n\t"
1033
                        "psllq $48, %%mm1                \n\t" // 000000BG
1034
                        "por %%mm1, %%mm3                \n\t" // BGRBGRBG
1035

1036
                        "movq %%mm2, %%mm1                \n\t" // BGR0BGR0
1037
                        "psrld $16, %%mm2                \n\t" // R000R000
1038
                        "psrlq $24, %%mm1                \n\t" // 0BGR0000
1039
                        "por %%mm2, %%mm1                \n\t" // RBGRR000
1040

1041
                        "mov %4, %%"REG_b"                \n\t"
1042
                        "add %%"REG_a", %%"REG_b"        \n\t"
1043

1044
#ifdef HAVE_MMX2
1045
                        //FIXME Alignment
1046
                        "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1047
                        "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1048
#else
1049
                        "movd %%mm3, (%%"REG_b", %%"REG_a", 2)        \n\t"
1050
                        "psrlq $32, %%mm3                \n\t"
1051
                        "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)        \n\t"
1052
                        "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)        \n\t"
1053
#endif
1054
                        "add $4, %%"REG_a"                \n\t"
1055
                        "cmp %5, %%"REG_a"                \n\t"
1056
                        " jb 1b                                \n\t"
1057

    
1058
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1059
                        "m" (yalpha1), "m" (uvalpha1)
1060
                        : "%"REG_a, "%"REG_b
1061
                        );
1062
                        break;
1063
                case IMGFMT_BGR15:
1064
                        asm volatile(
1065

    
1066
FULL_YSCALEYUV2RGB
1067
#ifdef DITHER1XBPP
1068
                        "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1069
                        "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1070
                        "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1071
#endif
1072
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
1073
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
1074
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R
1075

    
1076
                        "psrlw $3, %%mm3                \n\t"
1077
                        "psllw $2, %%mm1                \n\t"
1078
                        "psllw $7, %%mm0                \n\t"
1079
                        "pand "MANGLE(g15Mask)", %%mm1        \n\t"
1080
                        "pand "MANGLE(r15Mask)", %%mm0        \n\t"
1081

    
1082
                        "por %%mm3, %%mm1                \n\t"
1083
                        "por %%mm1, %%mm0                \n\t"
1084

    
1085
                        MOVNTQ(%%mm0, (%4, %%REGa, 2))
1086

    
1087
                        "add $4, %%"REG_a"                \n\t"
1088
                        "cmp %5, %%"REG_a"                \n\t"
1089
                        " jb 1b                                \n\t"
1090

    
1091
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1092
                        "m" (yalpha1), "m" (uvalpha1)
1093
                        : "%"REG_a
1094
                        );
1095
                        break;
1096
                case IMGFMT_BGR16:
1097
                        asm volatile(
1098

    
1099
FULL_YSCALEYUV2RGB
1100
#ifdef DITHER1XBPP
1101
                        "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1102
                        "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1103
                        "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1104
#endif
1105
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
1106
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
1107
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R
1108

    
1109
                        "psrlw $3, %%mm3                \n\t"
1110
                        "psllw $3, %%mm1                \n\t"
1111
                        "psllw $8, %%mm0                \n\t"
1112
                        "pand "MANGLE(g16Mask)", %%mm1        \n\t"
1113
                        "pand "MANGLE(r16Mask)", %%mm0        \n\t"
1114

    
1115
                        "por %%mm3, %%mm1                \n\t"
1116
                        "por %%mm1, %%mm0                \n\t"
1117

    
1118
                        MOVNTQ(%%mm0, (%4, %%REGa, 2))
1119

    
1120
                        "add $4, %%"REG_a"                \n\t"
1121
                        "cmp %5, %%"REG_a"                \n\t"
1122
                        " jb 1b                                \n\t"
1123

    
1124
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1125
                        "m" (yalpha1), "m" (uvalpha1)
1126
                        : "%"REG_a
1127
                        );
1128
                break;
1129
#endif
1130
                case IMGFMT_RGB32:
1131
#ifndef HAVE_MMX
1132
                case IMGFMT_BGR32:
1133
#endif
1134
                if(dstFormat==IMGFMT_BGR32)
1135
                {
1136
                        int i;
1137
#ifdef WORDS_BIGENDIAN
1138
                        dest++;
1139
#endif
1140
                        for(i=0;i<dstW;i++){
1141
                                // vertical linear interpolation && yuv2rgb in a single step:
1142
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1143
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1144
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1145
                                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1146
                                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1147
                                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1148
                                dest+= 4;
1149
                        }
1150
                }
1151
                else if(dstFormat==IMGFMT_BGR24)
1152
                {
1153
                        int i;
1154
                        for(i=0;i<dstW;i++){
1155
                                // vertical linear interpolation && yuv2rgb in a single step:
1156
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1157
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1158
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1159
                                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1160
                                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1161
                                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1162
                                dest+= 3;
1163
                        }
1164
                }
1165
                else if(dstFormat==IMGFMT_BGR16)
1166
                {
1167
                        int i;
1168
                        for(i=0;i<dstW;i++){
1169
                                // vertical linear interpolation && yuv2rgb in a single step:
1170
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1171
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1172
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1173

    
1174
                                ((uint16_t*)dest)[i] =
1175
                                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1176
                                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1177
                                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
1178
                        }
1179
                }
1180
                else if(dstFormat==IMGFMT_BGR15)
1181
                {
1182
                        int i;
1183
                        for(i=0;i<dstW;i++){
1184
                                // vertical linear interpolation && yuv2rgb in a single step:
1185
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1186
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1187
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1188

    
1189
                                ((uint16_t*)dest)[i] =
1190
                                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1191
                                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1192
                                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
1193
                        }
1194
                }
1195
        }//FULL_UV_IPOL
1196
        else
1197
        {
1198
#endif // if 0
1199
#ifdef HAVE_MMX
1200
        switch(c->dstFormat)
1201
        {
1202
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1203
        case IMGFMT_BGR32:
1204
                        asm volatile(
1205
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1206
                                "mov %4, %%"REG_SP"                        \n\t"
1207
                                YSCALEYUV2RGB(%%REGa, %5)
1208
                                WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1209
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1210

    
1211
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1212
                        "r" (&c->redDither)
1213
                        : "%"REG_a
1214
                        );
1215
                        return;
1216
        case IMGFMT_BGR24:
1217
                        asm volatile(
1218
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1219
                                "mov %4, %%"REG_SP"                        \n\t"
1220
                                YSCALEYUV2RGB(%%REGa, %5)
1221
                                WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1222
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1223
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1224
                        "r" (&c->redDither)
1225
                        : "%"REG_a
1226
                        );
1227
                        return;
1228
        case IMGFMT_BGR15:
1229
                        asm volatile(
1230
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1231
                                "mov %4, %%"REG_SP"                        \n\t"
1232
                                YSCALEYUV2RGB(%%REGa, %5)
1233
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1234
#ifdef DITHER1XBPP
1235
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1236
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1237
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1238
#endif
1239

    
1240
                                WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1241
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1242

    
1243
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1244
                        "r" (&c->redDither)
1245
                        : "%"REG_a
1246
                        );
1247
                        return;
1248
        case IMGFMT_BGR16:
1249
                        asm volatile(
1250
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1251
                                "mov %4, %%"REG_SP"                        \n\t"
1252
                                YSCALEYUV2RGB(%%REGa, %5)
1253
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1254
#ifdef DITHER1XBPP
1255
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1256
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1257
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1258
#endif
1259

    
1260
                                WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1261
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1262
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1263
                        "r" (&c->redDither)
1264
                        : "%"REG_a
1265
                        );
1266
                        return;
1267
        case IMGFMT_YUY2:
1268
                        asm volatile(
1269
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
1270
                                "mov %4, %%"REG_SP"                        \n\t"
1271
                                YSCALEYUV2PACKED(%%REGa, %5)
1272
                                WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1273
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"
1274
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1275
                        "r" (&c->redDither)
1276
                        : "%"REG_a
1277
                        );
1278
                        return;
1279
        default: break;
1280
        }
1281
#endif //HAVE_MMX
1282
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1283
}
1284

    
1285
/**
 * YV12 to RGB without scaling or interpolating.
 *
 * Emits one output line from a single luma line (buf0) and a chroma line
 * pair (uvbuf0/uvbuf1).  uvalpha selects the chroma handling: below 2048
 * only uvbuf0 is used (faster, half-pixel chroma shift), otherwise the two
 * chroma lines are blended by the *1b macro variants.
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                            uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
        const int yalpha1=0;    // luma blend factor 0 -> buf0 only (used by the C macros)
        int i;                  // loop index consumed by the C fallback macros

        uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
        const int yalpha= 4096; //FIXME ...

        if(flags&SWS_FULL_CHR_H_INT)
        {
                // full-chroma interpolation requested: reuse the two-line blend
                // path with both luma inputs pointing at buf0
                RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
                return;
        }

#ifdef HAVE_MMX
        if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
        {
                switch(dstFormat)
                {
                case IMGFMT_BGR32:
                        // Every case below saves the stack pointer into the context
                        // (ESP_OFFSET) and repoints REG_SP at the dest pointer so the
                        // WRITE* macros can address it; it is restored before the asm ends.
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_SP"                        \n\t"
                                YSCALEYUV2RGB1(%%REGa, %5)
                                WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                case IMGFMT_BGR24:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_SP"                        \n\t"
                                YSCALEYUV2RGB1(%%REGa, %5)
                                WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                case IMGFMT_BGR15:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_SP"                        \n\t"
                                YSCALEYUV2RGB1(%%REGa, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
                                WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                case IMGFMT_BGR16:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_SP"                        \n\t"
                                YSCALEYUV2RGB1(%%REGa, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                // BGR16 has 6 green bits, hence g6Dither here vs g5Dither above
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                case IMGFMT_YUY2:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_SP"                        \n\t"
                                YSCALEYUV2PACKED1(%%REGa, %5)
                                WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                }
        }
        else
        {
                // uvalpha >= 2048: use the ...1b macro variants, which blend
                // uvbuf0 and uvbuf1 instead of reading uvbuf0 alone
                switch(dstFormat)
                {
                case IMGFMT_BGR32:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_SP"                        \n\t"
                                YSCALEYUV2RGB1b(%%REGa, %5)
                                WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                case IMGFMT_BGR24:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_SP"                        \n\t"
                                YSCALEYUV2RGB1b(%%REGa, %5)
                                WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                case IMGFMT_BGR15:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_SP"                        \n\t"
                                YSCALEYUV2RGB1b(%%REGa, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
                                WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                case IMGFMT_BGR16:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_SP"                        \n\t"
                                YSCALEYUV2RGB1b(%%REGa, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                // 6-bit green plane for BGR16 -> g6Dither
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                case IMGFMT_YUY2:
                        asm volatile(
                                "mov %%"REG_SP", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_SP"                        \n\t"
                                YSCALEYUV2PACKED1b(%%REGa, %5)
                                WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
                                "mov "ESP_OFFSET"(%5), %%"REG_SP"        \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
                        "r" (&c->redDither)
                        : "%"REG_a
                        );
                        return;
                }
        }
#endif
        // C fallback (also reached for formats the MMX switches don't handle)
        if( uvalpha < 2048 )
        {
                YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
        }else{
                YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
        }
}
1480

    
1481
//FIXME the yuy2* readers below may read up to 7 samples too many
1482

    
1483
/* Extract the luma plane from packed YUY2 (Y U Y V ...) input:
 * dst[i] = src[2*i], i.e. every even byte.  The MMX path masks the chroma
 * bytes away with bm01010101 and packs two quadwords per iteration. */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
        // loop counter runs from -width up to 0, so src/dst are pre-advanced
        // to their end and indexed negatively
        asm volatile(
                "movq "MANGLE(bm01010101)", %%mm2\n\t"
                "mov %0, %%"REG_a"                \n\t"
                "1:                                \n\t"
                "movq (%1, %%"REG_a",2), %%mm0        \n\t"
                "movq 8(%1, %%"REG_a",2), %%mm1        \n\t"
                "pand %%mm2, %%mm0                \n\t"
                "pand %%mm2, %%mm1                \n\t"
                "packuswb %%mm1, %%mm0                \n\t"
                "movq %%mm0, (%2, %%"REG_a")        \n\t"
                "add $8, %%"REG_a"                \n\t"
                " js 1b                                \n\t"
                : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
                : "%"REG_a
        );
#else
        int i;
        for(i=0; i<width; i++)
                dst[i]= src[2*i];
#endif
}
1507

    
1508
/* Extract and vertically average the chroma planes from two packed YUY2
 * lines: dstU[i] = (src1[4i+1]+src2[4i+1])/2, dstV[i] = (src1[4i+3]+src2[4i+3])/2.
 * Guarded by MMX2/3DNow (not plain MMX) because PAVGB needs one of them. */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        asm volatile(
                "movq "MANGLE(bm01010101)", %%mm4\n\t"
                "mov %0, %%"REG_a"                \n\t"
                "1:                                \n\t"
                "movq (%1, %%"REG_a",4), %%mm0        \n\t"
                "movq 8(%1, %%"REG_a",4), %%mm1        \n\t"
                "movq (%2, %%"REG_a",4), %%mm2        \n\t"
                "movq 8(%2, %%"REG_a",4), %%mm3        \n\t"
                PAVGB(%%mm2, %%mm0)
                PAVGB(%%mm3, %%mm1)
                // drop the Y bytes, keeping the averaged U/V pairs
                "psrlw $8, %%mm0                \n\t"
                "psrlw $8, %%mm1                \n\t"
                "packuswb %%mm1, %%mm0                \n\t"
                // de-interleave: high bytes -> V (%4), masked low bytes -> U (%3)
                "movq %%mm0, %%mm1                \n\t"
                "psrlw $8, %%mm0                \n\t"
                "pand %%mm4, %%mm1                \n\t"
                "packuswb %%mm0, %%mm0                \n\t"
                "packuswb %%mm1, %%mm1                \n\t"
                "movd %%mm0, (%4, %%"REG_a")        \n\t"
                "movd %%mm1, (%3, %%"REG_a")        \n\t"
                "add $4, %%"REG_a"                \n\t"
                " js 1b                                \n\t"
                : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
                : "%"REG_a
        );
#else
        int i;
        for(i=0; i<width; i++)
        {
                // NOTE(review): the C path truncates ((a+b)>>1) while PAVGB
                // rounds up, so MMX2/3DNow output may differ by 1 LSB
                dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
                dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
        }
#endif
}
1545

    
1546
//this is almost identical to the previous, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
1547
/* Extract the luma plane from packed UYVY (U Y V Y ...) input:
 * dst[i] = src[2*i+1], i.e. every odd byte.  The MMX path shifts the chroma
 * byte out of each word instead of masking (luma is the high byte here). */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
        // negative-index loop: counter goes from -width to 0 against
        // end-pointers, same scheme as yuy2ToY
        asm volatile(
                "mov %0, %%"REG_a"                \n\t"
                "1:                                \n\t"
                "movq (%1, %%"REG_a",2), %%mm0        \n\t"
                "movq 8(%1, %%"REG_a",2), %%mm1        \n\t"
                "psrlw $8, %%mm0                \n\t"
                "psrlw $8, %%mm1                \n\t"
                "packuswb %%mm1, %%mm0                \n\t"
                "movq %%mm0, (%2, %%"REG_a")        \n\t"
                "add $8, %%"REG_a"                \n\t"
                " js 1b                                \n\t"
                : : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
                : "%"REG_a
        );
#else
        int i;
        for(i=0; i<width; i++)
                dst[i]= src[2*i+1];
#endif
}
1570

    
1571
/* Extract and vertically average the chroma planes from two packed UYVY
 * lines: dstU[i] = (src1[4i+0]+src2[4i+0])/2, dstV[i] = (src1[4i+2]+src2[4i+2])/2.
 * Mirrors yuy2ToUV but keeps the even (chroma) bytes via pand instead of
 * shifting, since U/V are the low bytes in UYVY. */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        asm volatile(
                "movq "MANGLE(bm01010101)", %%mm4\n\t"
                "mov %0, %%"REG_a"                \n\t"
                "1:                                \n\t"
                "movq (%1, %%"REG_a",4), %%mm0        \n\t"
                "movq 8(%1, %%"REG_a",4), %%mm1        \n\t"
                "movq (%2, %%"REG_a",4), %%mm2        \n\t"
                "movq 8(%2, %%"REG_a",4), %%mm3        \n\t"
                PAVGB(%%mm2, %%mm0)
                PAVGB(%%mm3, %%mm1)
                // keep the chroma (low) bytes, dropping the luma bytes
                "pand %%mm4, %%mm0                \n\t"
                "pand %%mm4, %%mm1                \n\t"
                "packuswb %%mm1, %%mm0                \n\t"
                // de-interleave: high bytes -> V (%4), masked low bytes -> U (%3)
                "movq %%mm0, %%mm1                \n\t"
                "psrlw $8, %%mm0                \n\t"
                "pand %%mm4, %%mm1                \n\t"
                "packuswb %%mm0, %%mm0                \n\t"
                "packuswb %%mm1, %%mm1                \n\t"
                "movd %%mm0, (%4, %%"REG_a")        \n\t"
                "movd %%mm1, (%3, %%"REG_a")        \n\t"
                "add $4, %%"REG_a"                \n\t"
                " js 1b                                \n\t"
                : : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
                : "%"REG_a
        );
#else
        int i;
        for(i=0; i<width; i++)
        {
                // NOTE(review): truncating average here vs PAVGB's rounding
                // average above — results can differ by 1 LSB
                dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
                dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
        }
#endif
}
1608

    
1609
static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1610
{
1611
#ifdef HAVE_MMXFIXME
1612
#else
1613
        int i;
1614
        for(i=0; i<width; i++)
1615
        {
1616
                int b=  ((uint32_t*)src)[i]&0xFF;
1617
                int g= (((uint32_t*)src)[i]>>8)&0xFF;
1618
                int r= (((uint32_t*)src)[i]>>16)&0xFF;
1619

    
1620
                dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1621
        }
1622
#endif
1623
}
1624

    
1625
static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1626
{
1627
#ifdef HAVE_MMXFIXME
1628
#else
1629
        int i;
1630
        for(i=0; i<width; i++)
1631
        {
1632
                const int a= ((uint32_t*)src1)[2*i+0];
1633
                const int e= ((uint32_t*)src1)[2*i+1];
1634
                const int c= ((uint32_t*)src2)[2*i+0];
1635
                const int d= ((uint32_t*)src2)[2*i+1];
1636
                const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1637
                const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1638
                 const int b=  l&0x3FF;
1639
                const int g=  h>>8;
1640
                const int r=  l>>16;
1641

    
1642
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1643
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1644
        }
1645
#endif
1646
}
1647

    
1648
/* Convert packed 24-bit BGR pixels to luma.
 * C fallback: dst[i] = (RY*R + GY*G + BY*B + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT.
 * The MMX path computes the same weighted sum with pmaddwd against
 * bgr2YCoeff, processing 8 output pixels (24 input bytes) per iteration,
 * then adds bgr2YOffset with a saturating byte add. */
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
        asm volatile(
                "mov %2, %%"REG_a"                \n\t"
                "movq "MANGLE(bgr2YCoeff)", %%mm6                \n\t"
                "movq "MANGLE(w1111)", %%mm5                \n\t"
                "pxor %%mm7, %%mm7                \n\t"
                // REG_b = 3*index: byte offset into the 3-bytes-per-pixel source
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
                ".balign 16                        \n\t"
                "1:                                \n\t"
                PREFETCH" 64(%0, %%"REG_b")        \n\t"
                // first 4 pixels: load at 3-byte strides, widen bytes to words
                "movd (%0, %%"REG_b"), %%mm0        \n\t"
                "movd 3(%0, %%"REG_b"), %%mm1        \n\t"
                "punpcklbw %%mm7, %%mm0                \n\t"
                "punpcklbw %%mm7, %%mm1                \n\t"
                "movd 6(%0, %%"REG_b"), %%mm2        \n\t"
                "movd 9(%0, %%"REG_b"), %%mm3        \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
                "punpcklbw %%mm7, %%mm3                \n\t"
                "pmaddwd %%mm6, %%mm0                \n\t"
                "pmaddwd %%mm6, %%mm1                \n\t"
                "pmaddwd %%mm6, %%mm2                \n\t"
                "pmaddwd %%mm6, %%mm3                \n\t"
#ifndef FAST_BGR2YV12
                // extra precision step skipped in the FAST variant
                "psrad $8, %%mm0                \n\t"
                "psrad $8, %%mm1                \n\t"
                "psrad $8, %%mm2                \n\t"
                "psrad $8, %%mm3                \n\t"
#endif
                // horizontal add of the partial dwords via pmaddwd with w1111
                "packssdw %%mm1, %%mm0                \n\t"
                "packssdw %%mm3, %%mm2                \n\t"
                "pmaddwd %%mm5, %%mm0                \n\t"
                "pmaddwd %%mm5, %%mm2                \n\t"
                "packssdw %%mm2, %%mm0                \n\t"
                "psraw $7, %%mm0                \n\t"

                // second 4 pixels, same sequence
                "movd 12(%0, %%"REG_b"), %%mm4        \n\t"
                "movd 15(%0, %%"REG_b"), %%mm1        \n\t"
                "punpcklbw %%mm7, %%mm4                \n\t"
                "punpcklbw %%mm7, %%mm1                \n\t"
                "movd 18(%0, %%"REG_b"), %%mm2        \n\t"
                "movd 21(%0, %%"REG_b"), %%mm3        \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
                "punpcklbw %%mm7, %%mm3                \n\t"
                "pmaddwd %%mm6, %%mm4                \n\t"
                "pmaddwd %%mm6, %%mm1                \n\t"
                "pmaddwd %%mm6, %%mm2                \n\t"
                "pmaddwd %%mm6, %%mm3                \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm4                \n\t"
                "psrad $8, %%mm1                \n\t"
                "psrad $8, %%mm2                \n\t"
                "psrad $8, %%mm3                \n\t"
#endif
                "packssdw %%mm1, %%mm4                \n\t"
                "packssdw %%mm3, %%mm2                \n\t"
                "pmaddwd %%mm5, %%mm4                \n\t"
                "pmaddwd %%mm5, %%mm2                \n\t"
                "add $24, %%"REG_b"                \n\t"
                "packssdw %%mm2, %%mm4                \n\t"
                "psraw $7, %%mm4                \n\t"

                // combine both halves into 8 bytes and apply the luma offset
                "packuswb %%mm4, %%mm0                \n\t"
                "paddusb "MANGLE(bgr2YOffset)", %%mm0        \n\t"

                "movq %%mm0, (%1, %%"REG_a")        \n\t"
                "add $8, %%"REG_a"                \n\t"
                " js 1b                                \n\t"
                : : "r" (src+width*3), "r" (dst+width), "g" ((long)-width)
                : "%"REG_a, "%"REG_b
        );
#else
        int i;
        for(i=0; i<width; i++)
        {
                int b= src[i*3+0];
                int g= src[i*3+1];
                int r= src[i*3+2];

                dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
        }
#endif
}
1732

    
1733
/* Convert two vertically adjacent BGR24 rows into one row of horizontally
 * and vertically subsampled chroma (U into dstU, V into dstV), i.e. one
 * U/V pair per 2x2 input pixel block.
 *
 * dstU/dstV: width chroma samples each.
 * src1/src2: the two input rows, 3 bytes per pixel, 2*width pixels each.
 *
 * The MMX path walks both rows backwards-relative (negative index REG_a,
 * byte offset REG_b = 6*REG_a) and produces 4 U and 4 V samples per loop.
 * NOTE(review): asm kept byte-identical; comments only. */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#ifdef HAVE_MMX
        asm volatile(
                "mov %4, %%"REG_a"                \n\t"
                "movq "MANGLE(w1111)", %%mm5                \n\t"
                "movq "MANGLE(bgr2UCoeff)", %%mm6                \n\t"
                "pxor %%mm7, %%mm7                \n\t"
                /* REG_b = 3*REG_a*2 = byte offset of the 2x-wide pixel pair */
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"        \n\t"
                "add %%"REG_b", %%"REG_b"        \n\t"
                ".balign 16                        \n\t"
                "1:                                \n\t"
                PREFETCH" 64(%0, %%"REG_b")        \n\t"
                PREFETCH" 64(%1, %%"REG_b")        \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                /* average the two rows, then average horizontally adjacent
                   pixels (3-byte stride handled via the 24-bit shift) */
                "movq (%0, %%"REG_b"), %%mm0        \n\t"
                "movq (%1, %%"REG_b"), %%mm1        \n\t"
                "movq 6(%0, %%"REG_b"), %%mm2        \n\t"
                "movq 6(%1, %%"REG_b"), %%mm3        \n\t"
                PAVGB(%%mm1, %%mm0)
                PAVGB(%%mm3, %%mm2)
                "movq %%mm0, %%mm1                \n\t"
                "movq %%mm2, %%mm3                \n\t"
                "psrlq $24, %%mm0                \n\t"
                "psrlq $24, %%mm2                \n\t"
                PAVGB(%%mm1, %%mm0)
                PAVGB(%%mm3, %%mm2)
                "punpcklbw %%mm7, %%mm0                \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
#else
                /* no pavgb available: widen to words and sum the 4 pixels */
                "movd (%0, %%"REG_b"), %%mm0        \n\t"
                "movd (%1, %%"REG_b"), %%mm1        \n\t"
                "movd 3(%0, %%"REG_b"), %%mm2        \n\t"
                "movd 3(%1, %%"REG_b"), %%mm3        \n\t"
                "punpcklbw %%mm7, %%mm0                \n\t"
                "punpcklbw %%mm7, %%mm1                \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
                "punpcklbw %%mm7, %%mm3                \n\t"
                "paddw %%mm1, %%mm0                \n\t"
                "paddw %%mm3, %%mm2                \n\t"
                "paddw %%mm2, %%mm0                \n\t"
                "movd 6(%0, %%"REG_b"), %%mm4        \n\t"
                "movd 6(%1, %%"REG_b"), %%mm1        \n\t"
                "movd 9(%0, %%"REG_b"), %%mm2        \n\t"
                "movd 9(%1, %%"REG_b"), %%mm3        \n\t"
                "punpcklbw %%mm7, %%mm4                \n\t"
                "punpcklbw %%mm7, %%mm1                \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
                "punpcklbw %%mm7, %%mm3                \n\t"
                "paddw %%mm1, %%mm4                \n\t"
                "paddw %%mm3, %%mm2                \n\t"
                "paddw %%mm4, %%mm2                \n\t"
                "psrlw $2, %%mm0                \n\t"
                "psrlw $2, %%mm2                \n\t"
#endif
                /* mm6 holds the U coefficients, mm1/mm3 the V coefficients */
                "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
                "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"

                "pmaddwd %%mm0, %%mm1                \n\t"
                "pmaddwd %%mm2, %%mm3                \n\t"
                "pmaddwd %%mm6, %%mm0                \n\t"
                "pmaddwd %%mm6, %%mm2                \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm0                \n\t"
                "psrad $8, %%mm1                \n\t"
                "psrad $8, %%mm2                \n\t"
                "psrad $8, %%mm3                \n\t"
#endif
                "packssdw %%mm2, %%mm0                \n\t"
                "packssdw %%mm3, %%mm1                \n\t"
                "pmaddwd %%mm5, %%mm0                \n\t"
                "pmaddwd %%mm5, %%mm1                \n\t"
                "packssdw %%mm1, %%mm0                \n\t" // V1 V0 U1 U0
                "psraw $7, %%mm0                \n\t"

                /* second pair of 2x2 blocks (bytes 12..23 of each row) */
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                "movq 12(%0, %%"REG_b"), %%mm4        \n\t"
                "movq 12(%1, %%"REG_b"), %%mm1        \n\t"
                "movq 18(%0, %%"REG_b"), %%mm2        \n\t"
                "movq 18(%1, %%"REG_b"), %%mm3        \n\t"
                PAVGB(%%mm1, %%mm4)
                PAVGB(%%mm3, %%mm2)
                "movq %%mm4, %%mm1                \n\t"
                "movq %%mm2, %%mm3                \n\t"
                "psrlq $24, %%mm4                \n\t"
                "psrlq $24, %%mm2                \n\t"
                PAVGB(%%mm1, %%mm4)
                PAVGB(%%mm3, %%mm2)
                "punpcklbw %%mm7, %%mm4                \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
#else
                "movd 12(%0, %%"REG_b"), %%mm4        \n\t"
                "movd 12(%1, %%"REG_b"), %%mm1        \n\t"
                "movd 15(%0, %%"REG_b"), %%mm2        \n\t"
                "movd 15(%1, %%"REG_b"), %%mm3        \n\t"
                "punpcklbw %%mm7, %%mm4                \n\t"
                "punpcklbw %%mm7, %%mm1                \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
                "punpcklbw %%mm7, %%mm3                \n\t"
                "paddw %%mm1, %%mm4                \n\t"
                "paddw %%mm3, %%mm2                \n\t"
                "paddw %%mm2, %%mm4                \n\t"
                "movd 18(%0, %%"REG_b"), %%mm5        \n\t"
                "movd 18(%1, %%"REG_b"), %%mm1        \n\t"
                "movd 21(%0, %%"REG_b"), %%mm2        \n\t"
                "movd 21(%1, %%"REG_b"), %%mm3        \n\t"
                "punpcklbw %%mm7, %%mm5                \n\t"
                "punpcklbw %%mm7, %%mm1                \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
                "punpcklbw %%mm7, %%mm3                \n\t"
                "paddw %%mm1, %%mm5                \n\t"
                "paddw %%mm3, %%mm2                \n\t"
                "paddw %%mm5, %%mm2                \n\t"
                /* mm5 was clobbered above; reload the w1111 constant */
                "movq "MANGLE(w1111)", %%mm5                \n\t"
                "psrlw $2, %%mm4                \n\t"
                "psrlw $2, %%mm2                \n\t"
#endif
                "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
                "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"

                "pmaddwd %%mm4, %%mm1                \n\t"
                "pmaddwd %%mm2, %%mm3                \n\t"
                "pmaddwd %%mm6, %%mm4                \n\t"
                "pmaddwd %%mm6, %%mm2                \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm4                \n\t"
                "psrad $8, %%mm1                \n\t"
                "psrad $8, %%mm2                \n\t"
                "psrad $8, %%mm3                \n\t"
#endif
                "packssdw %%mm2, %%mm4                \n\t"
                "packssdw %%mm3, %%mm1                \n\t"
                "pmaddwd %%mm5, %%mm4                \n\t"
                "pmaddwd %%mm5, %%mm1                \n\t"
                "add $24, %%"REG_b"                \n\t"
                "packssdw %%mm1, %%mm4                \n\t" // V3 V2 U3 U2
                "psraw $7, %%mm4                \n\t"

                /* interleave the two 4-sample halves, pack to bytes,
                   bias to unsigned and split into the U and V planes */
                "movq %%mm0, %%mm1                \n\t"
                "punpckldq %%mm4, %%mm0                \n\t"
                "punpckhdq %%mm4, %%mm1                \n\t"
                "packsswb %%mm1, %%mm0                \n\t"
                "paddb "MANGLE(bgr2UVOffset)", %%mm0        \n\t"

                "movd %%mm0, (%2, %%"REG_a")        \n\t"
                "punpckhdq %%mm0, %%mm0                \n\t"
                "movd %%mm0, (%3, %%"REG_a")        \n\t"
                "add $4, %%"REG_a"                \n\t"
                " js 1b                                \n\t"
                : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" ((long)-width)
                : "%"REG_a, "%"REG_b
        );
#else
        /* portable fallback: average each 2x2 block and apply the BT.601
           RGB->U/V weights (RU/GU/BU, RV/GV/BV are project constants) */
        int i;
        for(i=0; i<width; i++)
        {
                int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
                int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
                int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];

                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
        }
#endif
}
1898

    
1899
static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1900
{
1901
        int i;
1902
        for(i=0; i<width; i++)
1903
        {
1904
                int d= ((uint16_t*)src)[i];
1905
                int b= d&0x1F;
1906
                int g= (d>>5)&0x3F;
1907
                int r= (d>>11)&0x1F;
1908

    
1909
                dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1910
        }
1911
}
1912

    
1913
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1914
{
1915
        int i;
1916
        for(i=0; i<width; i++)
1917
        {
1918
                int d0= ((uint32_t*)src1)[i];
1919
                int d1= ((uint32_t*)src2)[i];
1920
                
1921
                int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1922
                int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1923

    
1924
                int dh2= (dh>>11) + (dh<<21);
1925
                int d= dh2 + dl;
1926

    
1927
                int b= d&0x7F;
1928
                int r= (d>>11)&0x7F;
1929
                int g= d>>21;
1930
                dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1931
                dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1932
        }
1933
}
1934

    
1935
static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1936
{
1937
        int i;
1938
        for(i=0; i<width; i++)
1939
        {
1940
                int d= ((uint16_t*)src)[i];
1941
                int b= d&0x1F;
1942
                int g= (d>>5)&0x1F;
1943
                int r= (d>>10)&0x1F;
1944

    
1945
                dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1946
        }
1947
}
1948

    
1949
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1950
{
1951
        int i;
1952
        for(i=0; i<width; i++)
1953
        {
1954
                int d0= ((uint32_t*)src1)[i];
1955
                int d1= ((uint32_t*)src2)[i];
1956
                
1957
                int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1958
                int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1959

    
1960
                int dh2= (dh>>11) + (dh<<21);
1961
                int d= dh2 + dl;
1962

    
1963
                int b= d&0x7F;
1964
                int r= (d>>10)&0x7F;
1965
                int g= d>>21;
1966
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1967
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1968
        }
1969
}
1970

    
1971

    
1972
static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1973
{
1974
        int i;
1975
        for(i=0; i<width; i++)
1976
        {
1977
                int r=  ((uint32_t*)src)[i]&0xFF;
1978
                int g= (((uint32_t*)src)[i]>>8)&0xFF;
1979
                int b= (((uint32_t*)src)[i]>>16)&0xFF;
1980

    
1981
                dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1982
        }
1983
}
1984

    
1985
static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1986
{
1987
        int i;
1988
        for(i=0; i<width; i++)
1989
        {
1990
                const int a= ((uint32_t*)src1)[2*i+0];
1991
                const int e= ((uint32_t*)src1)[2*i+1];
1992
                const int c= ((uint32_t*)src2)[2*i+0];
1993
                const int d= ((uint32_t*)src2)[2*i+1];
1994
                const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1995
                const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1996
                 const int r=  l&0x3FF;
1997
                const int g=  h>>8;
1998
                const int b=  l>>16;
1999

    
2000
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2001
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2002
        }
2003
}
2004

    
2005
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2006
{
2007
        int i;
2008
        for(i=0; i<width; i++)
2009
        {
2010
                int r= src[i*3+0];
2011
                int g= src[i*3+1];
2012
                int b= src[i*3+2];
2013

    
2014
                dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2015
        }
2016
}
2017

    
2018
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2019
{
2020
        int i;
2021
        for(i=0; i<width; i++)
2022
        {
2023
                int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2024
                int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2025
                int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2026

    
2027
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2028
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2029
        }
2030
}
2031

    
2032

    
2033
// Bilinear / Bicubic scaling
2034
/* Generic horizontal scaler (bilinear/bicubic, depending on the filter):
 * for each of the dstW output samples, convolve filterSize input bytes
 * starting at src[filterPos[i]] with 16-bit coefficients from `filter`
 * and store a clipped result in the 15-bit temp buffer `dst`.
 *
 * xInc and srcW are forwarded only to the AltiVec implementation.
 * The MMX paths special-case filterSize 4 and 8; the loop counters are
 * negative and counted up to 0 so the termination test is a flag check.
 * NOTE(review): asm kept byte-identical; comments only. */
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, int filterSize)
{
#ifdef HAVE_MMX
        assert(filterSize % 4 == 0 && filterSize>0);
        if(filterSize==4) // allways true for upscaling, sometimes for down too
        {
                /* counter is in bytes of int16 output (2 per sample); the
                   pointers are pre-biased so indexing by counter works */
                long counter= -2*dstW;
                filter-= counter*2;
                filterPos-= counter/2;
                dst-= counter/2;
                asm volatile(
                        "pxor %%mm7, %%mm7                \n\t"
                        "movq "MANGLE(w02)", %%mm6        \n\t"
                        "push %%"REG_BP"                \n\t" // we use 7 regs here ...
                        "mov %%"REG_a", %%"REG_BP"        \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        /* two output samples per iteration: fetch their
                           source positions, 4 coeffs each, 4 src bytes each */
                        "movzwl (%2, %%"REG_BP"), %%eax        \n\t"
                        "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
                        "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
                        "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
                        "movd (%3, %%"REG_a"), %%mm0        \n\t"
                        "movd (%3, %%"REG_b"), %%mm2        \n\t"
                        "punpcklbw %%mm7, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "pmaddwd %%mm1, %%mm0                \n\t"
                        "pmaddwd %%mm2, %%mm3                \n\t"
                        "psrad $8, %%mm0                \n\t"
                        "psrad $8, %%mm3                \n\t"
                        "packssdw %%mm3, %%mm0                \n\t"
                        "pmaddwd %%mm6, %%mm0                \n\t"
                        "packssdw %%mm0, %%mm0                \n\t"
                        "movd %%mm0, (%4, %%"REG_BP")        \n\t"
                        "add $4, %%"REG_BP"                \n\t"
                        " jnc 1b                        \n\t"

                        "pop %%"REG_BP"                        \n\t"
                        : "+a" (counter)
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
                        : "%"REG_b
                );
        }
        else if(filterSize==8)
        {
                long counter= -2*dstW;
                filter-= counter*4;
                filterPos-= counter/2;
                dst-= counter/2;
                asm volatile(
                        "pxor %%mm7, %%mm7                \n\t"
                        "movq "MANGLE(w02)", %%mm6        \n\t"
                        "push %%"REG_BP"                \n\t" // we use 7 regs here ...
                        "mov %%"REG_a", %%"REG_BP"        \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        "movzwl (%2, %%"REG_BP"), %%eax        \n\t"
                        "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
                        "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
                        "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
                        "movd (%3, %%"REG_a"), %%mm0        \n\t"
                        "movd (%3, %%"REG_b"), %%mm2        \n\t"
                        "punpcklbw %%mm7, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "pmaddwd %%mm1, %%mm0                \n\t"
                        "pmaddwd %%mm2, %%mm3                \n\t"

                        /* second half of the 8-tap filter */
                        "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
                        "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
                        "movd 4(%3, %%"REG_a"), %%mm4        \n\t"
                        "movd 4(%3, %%"REG_b"), %%mm2        \n\t"
                        "punpcklbw %%mm7, %%mm4                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "pmaddwd %%mm1, %%mm4                \n\t"
                        "pmaddwd %%mm2, %%mm5                \n\t"
                        "paddd %%mm4, %%mm0                \n\t"
                        "paddd %%mm5, %%mm3                \n\t"

                        "psrad $8, %%mm0                \n\t"
                        "psrad $8, %%mm3                \n\t"
                        "packssdw %%mm3, %%mm0                \n\t"
                        "pmaddwd %%mm6, %%mm0                \n\t"
                        "packssdw %%mm0, %%mm0                \n\t"
                        "movd %%mm0, (%4, %%"REG_BP")        \n\t"
                        "add $4, %%"REG_BP"                \n\t"
                        " jnc 1b                        \n\t"

                        "pop %%"REG_BP"                        \n\t"
                        : "+a" (counter)
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
                        : "%"REG_b
                );
        }
        else
        {
                /* arbitrary filterSize: inner loop (label 2) accumulates
                   over the taps, outer loop (label 1) over output samples */
                long counter= -2*dstW;
//                filter-= counter*filterSize/2;
                filterPos-= counter/2;
                dst-= counter/2;
                asm volatile(
                        "pxor %%mm7, %%mm7                \n\t"
                        "movq "MANGLE(w02)", %%mm6        \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        "mov %2, %%"REG_c"                \n\t"
                        "movzwl (%%"REG_c", %0), %%eax        \n\t"
                        "movzwl 2(%%"REG_c", %0), %%ebx        \n\t"
                        "mov %5, %%"REG_c"                \n\t"
                        "pxor %%mm4, %%mm4                \n\t"
                        "pxor %%mm5, %%mm5                \n\t"
                        "2:                                \n\t"
                        "movq (%1), %%mm1                \n\t"
                        "movq (%1, %6), %%mm3                \n\t"
                        "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
                        "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
                        "punpcklbw %%mm7, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "pmaddwd %%mm1, %%mm0                \n\t"
                        "pmaddwd %%mm2, %%mm3                \n\t"
                        "paddd %%mm3, %%mm5                \n\t"
                        "paddd %%mm0, %%mm4                \n\t"
                        "add $8, %1                        \n\t"
                        "add $4, %%"REG_c"                \n\t"
                        "cmp %4, %%"REG_c"                \n\t"
                        " jb 2b                                \n\t"
                        "add %6, %1                        \n\t"
                        "psrad $8, %%mm4                \n\t"
                        "psrad $8, %%mm5                \n\t"
                        "packssdw %%mm5, %%mm4                \n\t"
                        "pmaddwd %%mm6, %%mm4                \n\t"
                        "packssdw %%mm4, %%mm4                \n\t"
                        "mov %3, %%"REG_a"                \n\t"
                        "movd %%mm4, (%%"REG_a", %0)        \n\t"
                        "add $4, %0                        \n\t"
                        " jnc 1b                        \n\t"

                        : "+r" (counter), "+r" (filter)
                        : "m" (filterPos), "m" (dst), "m"(src+filterSize),
                          "m" (src), "r" ((long)filterSize*2)
                        : "%"REG_b, "%"REG_a, "%"REG_c
                );
        }
#else
#ifdef HAVE_ALTIVEC
        hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
        /* reference C implementation: plain dot product per output sample,
           clipped to the unsigned 15-bit range used by the vertical pass */
        int i;
        for(i=0; i<dstW; i++)
        {
                int j;
                int srcPos= filterPos[i];
                int val=0;
//                printf("filterPos: %d\n", filterPos[i]);
                for(j=0; j<filterSize; j++)
                {
//                        printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
                        val += ((int)src[srcPos + j])*filter[filterSize*i + j];
                }
//                filter += hFilterSize;
                dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
//                dst[i] = val>>7;
        }
#endif
#endif
}
2199
      // *** horizontal scale Y line to temp buffer
2200
/* Horizontally scale one luma line into the 16-bit temp buffer `dst`
 * (dstWidth samples, 7 extra fractional bits).
 *
 * Packed / RGB input formats are first reduced to an 8-bit grey line in
 * formatConvBuffer by the matching *ToY converter.  The scaling itself is
 * done by one of three paths: the filter-based hScale() (high quality or
 * when MMX2 is unavailable), the run-time-generated MMX2 "funny" code
 * pointed to by funnyYCode, or a plain x86 fixed-point bilinear loop;
 * the final #else is the portable C bilinear fallback.
 * NOTE(review): asm kept byte-identical; comments only. */
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos)
{
    /* input format conversion: after any branch below, `src` points at an
       8-bit luma line of srcW samples */
    if(srcFormat==IMGFMT_YUY2)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }

#ifdef HAVE_MMX
        // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
            RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#if defined(ARCH_X86) || defined(ARCH_X86_64)
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
                /* dispatch into the run-time-generated scaler: each
                   FUNNY_Y_CODE block calls one generated chunk and then
                   advances the src/dst cursors from mmx2FilterPos */
                asm volatile(
                        "pxor %%mm7, %%mm7                \n\t"
                        "mov %0, %%"REG_c"                \n\t"
                        "mov %1, %%"REG_D"                \n\t"
                        "mov %2, %%"REG_d"                \n\t"
                        "mov %3, %%"REG_b"                \n\t"
                        "xor %%"REG_a", %%"REG_a"        \n\t" // i
                        PREFETCH" (%%"REG_c")                \n\t"
                        PREFETCH" 32(%%"REG_c")                \n\t"
                        PREFETCH" 64(%%"REG_c")                \n\t"

#ifdef ARCH_X86_64

#define FUNNY_Y_CODE \
                        "movl (%%"REG_b"), %%esi        \n\t"\
                        "call *%4                        \n\t"\
                        "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
                        "add %%"REG_S", %%"REG_c"        \n\t"\
                        "add %%"REG_a", %%"REG_D"        \n\t"\
                        "xor %%"REG_a", %%"REG_a"        \n\t"\

#else

#define FUNNY_Y_CODE \
                        "movl (%%"REG_b"), %%esi        \n\t"\
                        "call *%4                        \n\t"\
                        "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
                        "add %%"REG_a", %%"REG_D"        \n\t"\
                        "xor %%"REG_a", %%"REG_a"        \n\t"\

#endif

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

                        :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
                        "m" (funnyYCode)
                        : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
                );
                /* patch up the tail samples that would read past srcW-1 */
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif
        //NO MMX just normal asm ...
        /* fixed-point bilinear: ecx carries the 16-bit fraction, the adc
           folds the fraction overflow into the integer position xx;
           two output samples per iteration */
        asm volatile(
                "xor %%"REG_a", %%"REG_a"        \n\t" // i
                "xor %%"REG_b", %%"REG_b"        \n\t" // xx
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
                ".balign 16                        \n\t"
                "1:                                \n\t"
                "movzbl  (%0, %%"REG_b"), %%edi        \n\t" //src[xx]
                "movzbl 1(%0, %%"REG_b"), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "mov %1, %%"REG_D"                \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFF
                "adc %3, %%"REG_b"                \n\t" //xx+= xInc>>8 + carry

                "movzbl (%0, %%"REG_b"), %%edi        \n\t" //src[xx]
                "movzbl 1(%0, %%"REG_b"), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "mov %1, %%"REG_D"                \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFF
                "adc %3, %%"REG_b"                \n\t" //xx+= xInc>>8 + carry

                "add $2, %%"REG_a"                \n\t"
                "cmp %2, %%"REG_a"                \n\t"
                " jb 1b                                \n\t"

                :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
                : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        /* portable C bilinear: 16.16 fixed-point position, result keeps
           7 fractional bits */
        int i;
        unsigned int xpos=0;
        for(i=0;i<dstWidth;i++)
        {
                register unsigned int xx=xpos>>16;
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
                dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
                xpos+=xInc;
        }
#endif
    }
}
2369

    
2370
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2371
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2372
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2373
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2374
                                   int32_t *mmx2FilterPos)
2375
{
2376
    if(srcFormat==IMGFMT_YUY2)
2377
    {
2378
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2379
        src1= formatConvBuffer;
2380
        src2= formatConvBuffer+2048;
2381
    }
2382
    else if(srcFormat==IMGFMT_UYVY)
2383
    {
2384
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2385
        src1= formatConvBuffer;
2386
        src2= formatConvBuffer+2048;
2387
    }
2388
    else if(srcFormat==IMGFMT_BGR32)
2389
    {
2390
        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2391
        src1= formatConvBuffer;
2392
        src2= formatConvBuffer+2048;
2393
    }
2394
    else if(srcFormat==IMGFMT_BGR24)
2395
    {
2396
        RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2397
        src1= formatConvBuffer;
2398
        src2= formatConvBuffer+2048;
2399
    }
2400
    else if(srcFormat==IMGFMT_BGR16)
2401
    {
2402
        RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2403
        src1= formatConvBuffer;
2404
        src2= formatConvBuffer+2048;
2405
    }
2406
    else if(srcFormat==IMGFMT_BGR15)
2407
    {
2408
        RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2409
        src1= formatConvBuffer;
2410
        src2= formatConvBuffer+2048;
2411
    }
2412
    else if(srcFormat==IMGFMT_RGB32)
2413
    {
2414
        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2415
        src1= formatConvBuffer;
2416
        src2= formatConvBuffer+2048;
2417
    }
2418
    else if(srcFormat==IMGFMT_RGB24)
2419
    {
2420
        RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2421
        src1= formatConvBuffer;
2422
        src2= formatConvBuffer+2048;
2423
    }
2424
    else if(isGray(srcFormat))
2425
    {
2426
            return;
2427
    }
2428

    
2429
#ifdef HAVE_MMX
2430
        // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
2431
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2432
#else
2433
    if(!(flags&SWS_FAST_BILINEAR))
2434
#endif
2435
    {
2436
            RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2437
            RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2438
    }
2439
    else // Fast Bilinear upscale / crap downscale
2440
    {
2441
#if defined(ARCH_X86) || defined(ARCH_X86_64)
2442
#ifdef HAVE_MMX2
2443
        int i;
2444
        if(canMMX2BeUsed)
2445
        {
2446
                asm volatile(
2447
                        "pxor %%mm7, %%mm7                \n\t"
2448
                        "mov %0, %%"REG_c"                \n\t"
2449
                        "mov %1, %%"REG_D"                \n\t"
2450
                        "mov %2, %%"REG_d"                \n\t"
2451
                        "mov %3, %%"REG_b"                \n\t"
2452
                        "xor %%"REG_a", %%"REG_a"        \n\t" // i
2453
                        PREFETCH" (%%"REG_c")                \n\t"
2454
                        PREFETCH" 32(%%"REG_c")                \n\t"
2455
                        PREFETCH" 64(%%"REG_c")                \n\t"
2456

    
2457
#ifdef ARCH_X86_64
2458

    
2459
#define FUNNY_UV_CODE \
2460
                        "movl (%%"REG_b"), %%esi        \n\t"\
2461
                        "call *%4                        \n\t"\
2462
                        "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2463
                        "add %%"REG_S", %%"REG_c"        \n\t"\
2464
                        "add %%"REG_a", %%"REG_D"        \n\t"\
2465
                        "xor %%"REG_a", %%"REG_a"        \n\t"\
2466

    
2467
#else
2468

    
2469
#define FUNNY_UV_CODE \
2470
                        "movl (%%"REG_b"), %%esi        \n\t"\
2471
                        "call *%4                        \n\t"\
2472
                        "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2473
                        "add %%"REG_a", %%"REG_D"        \n\t"\
2474
                        "xor %%"REG_a", %%"REG_a"        \n\t"\
2475

    
2476
#endif
2477

    
2478
FUNNY_UV_CODE
2479
FUNNY_UV_CODE
2480
FUNNY_UV_CODE
2481
FUNNY_UV_CODE
2482
                        "xor %%"REG_a", %%"REG_a"        \n\t" // i
2483
                        "mov %5, %%"REG_c"                \n\t" // src
2484
                        "mov %1, %%"REG_D"                \n\t" // buf1
2485
                        "add $4096, %%"REG_D"                \n\t"
2486
                        PREFETCH" (%%"REG_c")                \n\t"
2487
                        PREFETCH" 32(%%"REG_c")                \n\t"
2488
                        PREFETCH" 64(%%"REG_c")                \n\t"
2489

    
2490
FUNNY_UV_CODE
2491
FUNNY_UV_CODE
2492
FUNNY_UV_CODE
2493
FUNNY_UV_CODE
2494

    
2495
                        :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2496
                        "m" (funnyUVCode), "m" (src2)
2497
                        : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2498
                );
2499
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2500
                {
2501
//                        printf("%d %d %d\n", dstWidth, i, srcW);
2502
                        dst[i] = src1[srcW-1]*128;
2503
                        dst[i+2048] = src2[srcW-1]*128;
2504
                }
2505
        }
2506
        else
2507
        {
2508
#endif
2509
        asm volatile(
2510
                "xor %%"REG_a", %%"REG_a"        \n\t" // i
2511
                "xor %%"REG_b", %%"REG_b"                \n\t" // xx
2512
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
2513
                ".balign 16                        \n\t"
2514
                "1:                                \n\t"
2515
                "mov %0, %%"REG_S"                \n\t"
2516
                "movzbl  (%%"REG_S", %%"REG_b"), %%edi        \n\t" //src[xx]
2517
                "movzbl 1(%%"REG_S", %%"REG_b"), %%esi        \n\t" //src[xx+1]
2518
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
2519
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
2520
                "shll $16, %%edi                \n\t"
2521
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2522
                "mov %1, %%"REG_D"                \n\t"
2523
                "shrl $9, %%esi                        \n\t"
2524
                "movw %%si, (%%"REG_d", %%"REG_a", 2)\n\t"
2525

    
2526
                "movzbl  (%5, %%"REG_b"), %%edi        \n\t" //src[xx]
2527
                "movzbl 1(%5, %%"REG_b"), %%esi        \n\t" //src[xx+1]
2528
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
2529
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
2530
                "shll $16, %%edi                \n\t"
2531
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2532
                "mov %1, %%"REG_D"                \n\t"
2533
                "shrl $9, %%esi                        \n\t"
2534
                "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2535

    
2536
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFF
2537
                "adc %3, %%"REG_b"                \n\t" //xx+= xInc>>8 + carry
2538
                "add $1, %%"REG_a"                \n\t"
2539
                "cmp %2, %%"REG_a"                \n\t"
2540
                " jb 1b                                \n\t"
2541

    
2542
                :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" ((long)(xInc>>16)), "m" ((xInc&0xFFFF)),
2543
                "r" (src2)
2544
                : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2545
                );
2546
#ifdef HAVE_MMX2
2547
        } //if MMX2 can't be used
2548
#endif
2549
#else
2550
        int i;
2551
        unsigned int xpos=0;
2552
        for(i=0;i<dstWidth;i++)
2553
        {
2554
                register unsigned int xx=xpos>>16;
2555
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
2556
                dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2557
                dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2558
/* slower
2559
          dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2560
          dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2561
*/
2562
                xpos+=xInc;
2563
        }
2564
#endif
2565
   }
2566
}
2567

    
2568
/**
 * Scale one horizontal slice of the source picture.
 *
 * The caller may feed the picture in several slices; horizontal scaling
 * results are kept in the lumPixBuf/chrPixBuf ring buffers between calls
 * so vertical filtering can span slice boundaries. State that survives
 * between calls (dstY, buffer indices, last buffered source lines) is
 * loaded from and stored back into the context.
 *
 * Returns the number of destination lines written by this call
 * (dstY - lastDstY).
 */
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){

        /* load a few things into local vars to make the code more readable? and faster */
        const int srcW= c->srcW;
        const int dstW= c->dstW;
        const int dstH= c->dstH;
        const int chrDstW= c->chrDstW;
        const int chrSrcW= c->chrSrcW;
        const int lumXInc= c->lumXInc;
        const int chrXInc= c->chrXInc;
        const int dstFormat= c->dstFormat;
        const int srcFormat= c->srcFormat;
        const int flags= c->flags;
        const int canMMX2BeUsed= c->canMMX2BeUsed;
        int16_t *vLumFilterPos= c->vLumFilterPos;
        int16_t *vChrFilterPos= c->vChrFilterPos;
        int16_t *hLumFilterPos= c->hLumFilterPos;
        int16_t *hChrFilterPos= c->hChrFilterPos;
        int16_t *vLumFilter= c->vLumFilter;
        int16_t *vChrFilter= c->vChrFilter;
        int16_t *hLumFilter= c->hLumFilter;
        int16_t *hChrFilter= c->hChrFilter;
        int32_t *lumMmxFilter= c->lumMmxFilter;
        int32_t *chrMmxFilter= c->chrMmxFilter;
        const int vLumFilterSize= c->vLumFilterSize;
        const int vChrFilterSize= c->vChrFilterSize;
        const int hLumFilterSize= c->hLumFilterSize;
        const int hChrFilterSize= c->hChrFilterSize;
        int16_t **lumPixBuf= c->lumPixBuf;
        int16_t **chrPixBuf= c->chrPixBuf;
        const int vLumBufSize= c->vLumBufSize;
        const int vChrBufSize= c->vChrBufSize;
        uint8_t *funnyYCode= c->funnyYCode;
        uint8_t *funnyUVCode= c->funnyUVCode;
        uint8_t *formatConvBuffer= c->formatConvBuffer;
        const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
        const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); // ceiling division by the chroma subsampling factor
        int lastDstY;

        /* vars which will change and which we need to store back in the context */
        int dstY= c->dstY;
        int lumBufIndex= c->lumBufIndex;
        int chrBufIndex= c->chrBufIndex;
        int lastInLumBuf= c->lastInLumBuf;
        int lastInChrBuf= c->lastInChrBuf;
        
        if(isPacked(c->srcFormat)){
                /* packed formats carry everything in plane 0: alias the
                   other plane pointers/strides to it */
                src[0]=
                src[1]=
                src[2]= src[0];
                srcStride[0]=
                srcStride[1]=
                srcStride[2]= srcStride[0];
        }
        /* vChrDrop: deliberately skip chroma source lines by widening the stride */
        srcStride[1]<<= c->vChrDrop;
        srcStride[2]<<= c->vChrDrop;

//        printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
//                (int)dst[0], (int)dst[1], (int)dst[2]);

#if 0 //self test FIXME move to a vfilter or something
{
static volatile int i=0;
i++;
if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
        selfTest(src, srcStride, c->srcW, c->srcH);
i--;
}
#endif

//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
//dstStride[0],dstStride[1],dstStride[2]);

        /* warn once if the destination strides defeat aligned stores */
        if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
        {
                static int firstTime=1; //FIXME move this into the context perhaps
                if(flags & SWS_PRINT_INFO && firstTime)
                {
                        MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
                                        "SwScaler:          ->cannot do aligned memory acesses anymore\n");
                        firstTime=0;
                }
        }

        /* Note the user might start scaling the picture in the middle so this will not get executed
           this is not really intended but works currently, so ppl might do it */
        if(srcSliceY ==0){
                lumBufIndex=0;
                chrBufIndex=0;
                dstY=0;        
                lastInLumBuf= -1;
                lastInChrBuf= -1;
        }

        lastDstY= dstY;

        for(;dstY < dstH; dstY++){
                unsigned char *dest =dst[0]+dstStride[0]*dstY;
                const int chrDstY= dstY>>c->chrDstVSubSample;
                unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
                unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;

                const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
                const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
                const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
                const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

//printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
// dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
                //handle holes (FAST_BILINEAR & weird filters)
                if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
                if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
                ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
                ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)

                // Do we have enough lines in this slice to output the dstY line
                if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
                {
                        //Do horizontal scaling
                        while(lastInLumBuf < lastLumSrcY)
                        {
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                                lumBufIndex++;
//                                printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
                                ASSERT(lumBufIndex < 2*vLumBufSize)
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
//                                printf("%d %d\n", lumBufIndex, vLumBufSize);
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                                funnyYCode, c->srcFormat, formatConvBuffer, 
                                                c->lumMmx2Filter, c->lumMmx2FilterPos);
                                lastInLumBuf++;
                        }
                        while(lastInChrBuf < lastChrSrcY)
                        {
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                                chrBufIndex++;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
                                //FIXME replace parameters through context struct (some at least)

                                if(!(isGray(srcFormat) || isGray(dstFormat)))
                                        RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                                funnyUVCode, c->srcFormat, formatConvBuffer, 
                                                c->chrMmx2Filter, c->chrMmx2FilterPos);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
                }
                else // not enough lines left in this slice -> load the rest in the buffer
                {
/*                printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
                        firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
                        lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
                        vChrBufSize, vLumBufSize);*/

                        //Do horizontal scaling
                        while(lastInLumBuf+1 < srcSliceY + srcSliceH)
                        {
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                                lumBufIndex++;
                                ASSERT(lumBufIndex < 2*vLumBufSize)
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                                funnyYCode, c->srcFormat, formatConvBuffer, 
                                                c->lumMmx2Filter, c->lumMmx2FilterPos);
                                lastInLumBuf++;
                        }
                        while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
                        {
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                                chrBufIndex++;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)

                                if(!(isGray(srcFormat) || isGray(dstFormat)))
                                        RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                                funnyUVCode, c->srcFormat, formatConvBuffer, 
                                                c->chrMmx2Filter, c->chrMmx2FilterPos);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
                        break; //we can't output a dstY line so let's try with the next slice
                }

#ifdef HAVE_MMX
                /* per-line dither tables for 15/16 bit RGB output */
                b5Dither= dither8[dstY&1];
                g6Dither= dither4[dstY&1];
                g5Dither= dither8[dstY&1];
                r5Dither= dither8[(dstY+1)&1];
#endif
            if(dstY < dstH-2)
            {
                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#ifdef HAVE_MMX
                /* Pack (line pointer, replicated 16bit coefficient) pairs for
                   the MMX vertical filter.
                   NOTE(review): casting a pointer to int32_t truncates on
                   64-bit targets - confirm this path is 32-bit only. */
                int i;
                for(i=0; i<vLumFilterSize; i++)
                {
                        lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                        lumMmxFilter[4*i+2]= 
                        lumMmxFilter[4*i+3]= 
                                ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                }
                for(i=0; i<vChrFilterSize; i++)
                {
                        chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                        chrMmxFilter[4*i+2]= 
                        chrMmxFilter[4*i+3]= 
                                ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
#endif
                if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
                {
                        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                        if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                        if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
                        {
                                int16_t *lumBuf = lumPixBuf[0];
                                int16_t *chrBuf= chrPixBuf[0];
                                RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
                        }
                        else //General YV12
                        {
                                RENAME(yuv2yuvX)(c,
                                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                        dest, uDest, vDest, dstW, chrDstW);
                        }
                }
                else
                {
                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                        if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
                        {
                                int chrAlpha= vChrFilter[2*dstY+1];
                                RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                                 dest, dstW, chrAlpha, dstFormat, flags, dstY);
                        }
                        else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
                        {
                                int lumAlpha= vLumFilter[2*dstY+1];
                                int chrAlpha= vChrFilter[2*dstY+1];
                                RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                                 dest, dstW, lumAlpha, chrAlpha, dstY);
                        }
                        else //General RGB
                        {
                                RENAME(yuv2packedX)(c,
                                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                        dest, dstW, dstY);
                        }
                }
            }
            else // hmm looks like we can't use MMX here without overwriting this array's tail
            {
                /* last two output lines: use the C vertical scalers */
                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
                if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
                {
                        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                        if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                        yuv2yuvXinC(
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                dest, uDest, vDest, dstW, chrDstW);
                }
                else
                {
                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                        yuv2packedXinC(c, 
                                vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                dest, dstW, dstY);
                }
            }
        }

#ifdef HAVE_MMX
        /* flush write-combining stores and leave MMX state */
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* store changed local vars back in the context */
        c->dstY= dstY;
        c->lumBufIndex= lumBufIndex;
        c->chrBufIndex= chrBufIndex;
        c->lastInLumBuf= lastInLumBuf;
        c->lastInChrBuf= lastInChrBuf;

        return dstY - lastDstY;
}