Revision 86330b4c libswscale/swscale_template.c

View differences:

libswscale/swscale_template.c
18 18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 19
 */
20 20

  
21
/* Drop any earlier definitions of the helper macros so they can be defined
 * afresh below (PAVGB is only redefined for MMX2 / 3DNow! builds). */
#undef REAL_MOVNTQ
22
#undef MOVNTQ
23
#undef PAVGB
24
#undef PREFETCH
25

  
26
/* Mnemonic string spliced into asm text for prefetch hints: "prefetch" for
 * 3DNow! builds, "prefetchnta" for MMX2 builds, and otherwise " # nop"
 * (presumably read by the assembler as a comment, so nothing is emitted). */
#if COMPILE_TEMPLATE_AMD3DNOW
27
#define PREFETCH  "prefetch"
28
#elif COMPILE_TEMPLATE_MMX2
29
#define PREFETCH "prefetchnta"
30
#else
31
#define PREFETCH  " # nop"
32
#endif
33

  
34
/* PAVGB(a,b): one line of asm text, "pavgb a, b" for MMX2 builds or
 * "pavgusb a, b" for 3DNow! builds.  The arguments are stringified as
 * written.  The macro is left undefined when neither variant is selected. */
#if COMPILE_TEMPLATE_MMX2
35
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
36
#elif COMPILE_TEMPLATE_AMD3DNOW
37
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
38
#endif
39

  
40
/* MOVNTQ(a,b): one line of asm text storing quadword a to b, using "movntq"
 * for MMX2 builds and plain "movq" otherwise.
 * REAL_MOVNTQ stringifies its arguments with #; MOVNTQ exists only to add one
 * level of macro expansion, so any macro appearing in the arguments is
 * expanded before it is turned into a string. */
#if COMPILE_TEMPLATE_MMX2
41
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
42
#else
43
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
44
#endif
45
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
46

  
47
/* Textually include the AltiVec template only when COMPILE_TEMPLATE_ALTIVEC
 * is set for this compilation. */
#if COMPILE_TEMPLATE_ALTIVEC
48
#include "ppc/swscale_altivec_template.c"
49
#endif
50

  
51
/*
 * Complete __asm__ statement: multi-tap filter over a list of source lines,
 * producing 8 output bytes per pass of the outer loop.
 *
 *   x      - string literal spliced in as a displacement on every source load
 *   offset - string literal: displacement from %0 of the filter list
 *   dest   - output pointer (%1); bytes are stored at dest + REG_a
 *   width  - loop bound (%2); the outer loop repeats while REG_a < width
 *            (unsigned compare, "jb")
 *
 * %0 is &c->redDither, so a variable named c must be in scope where the
 * macro is expanded.  The filter list is walked in 16-byte steps: a source
 * pointer at +0 and a quadword of coefficients at +8; a null source pointer
 * ends the list.  Source samples are read at pointer + 2*REG_a.  The two
 * accumulators (mm3, mm4) start from the quadword at VROUNDER_OFFSET(%0),
 * gather the pmulhw products, are shifted right by 3 and packed to bytes with
 * unsigned saturation.  Both loops are bottom-tested, so one pass always
 * runs.  REG_a, REG_d and REG_S are declared clobbered; mm0 and mm2-mm5 are
 * overwritten without being listed.
 */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
52
    __asm__ volatile(\
53
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
54
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
55
        "movq                             %%mm3, %%mm4      \n\t"\
56
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
57
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
58
        ".p2align                             4             \n\t" /* FIXME Unroll? */\
59
        "1:                                                 \n\t"\
60
        "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
61
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
62
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
63
        "add                                $16, %%"REG_d"  \n\t"\
64
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
65
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
66
        "pmulhw                           %%mm0, %%mm2      \n\t"\
67
        "pmulhw                           %%mm0, %%mm5      \n\t"\
68
        "paddw                            %%mm2, %%mm3      \n\t"\
69
        "paddw                            %%mm5, %%mm4      \n\t"\
70
        " jnz                                1b             \n\t"\
71
        "psraw                               $3, %%mm3      \n\t"\
72
        "psraw                               $3, %%mm4      \n\t"\
73
        "packuswb                         %%mm4, %%mm3      \n\t"\
74
        MOVNTQ(%%mm3, (%1, %%REGa))\
75
        "add                                 $8, %%"REG_a"  \n\t"\
76
        "cmp                                 %2, %%"REG_a"  \n\t"\
77
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
78
        "movq                             %%mm3, %%mm4      \n\t"\
79
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
80
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
81
        "jb                                  1b             \n\t"\
82
        :: "r" (&c->redDither),\
83
        "r" (dest), "g" ((x86_reg)width)\
84
        : "%"REG_a, "%"REG_d, "%"REG_S\
85
    );
86

  
87
/*
 * Complete __asm__ statement: multi-tap filter over a list of source lines
 * that accumulates in 32 bits, producing 8 output bytes per pass of the
 * outer loop.
 *
 *   x      - string literal spliced in as a displacement on every source load
 *   offset - string literal: displacement from %0 of the filter list
 *   dest   - output pointer (%1); bytes are stored at dest + REG_a
 *   width  - loop bound (%2); the outer loop repeats while REG_a < width
 *            (unsigned compare, "jb")
 *
 * %0 is &c->redDither, so a variable named c must be in scope where the
 * macro is expanded.  Each filter-list entry supplies two source pointers
 * (at +0 and at +APCK_PTR2) and one coefficient quadword (at +APCK_COEF);
 * entries are APCK_SIZE bytes apart and a null first pointer in the next
 * entry ends the list.  Words from the two sources are interleaved with
 * punpck{l,h}wd and multiplied-and-added with pmaddwd into the dword
 * accumulators mm4-mm7.  After the list is exhausted the sums are shifted
 * right by 16, packed to words with signed saturation, offset by the
 * quadword at VROUNDER_OFFSET(%0), shifted right by 3 and packed to bytes
 * with unsigned saturation.  REG_a, REG_d and REG_S are declared clobbered;
 * mm0-mm7 are overwritten without being listed.
 */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
88
    __asm__ volatile(\
89
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
90
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
91
        "pxor                             %%mm4, %%mm4      \n\t"\
92
        "pxor                             %%mm5, %%mm5      \n\t"\
93
        "pxor                             %%mm6, %%mm6      \n\t"\
94
        "pxor                             %%mm7, %%mm7      \n\t"\
95
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
96
        ".p2align                             4             \n\t"\
97
        "1:                                                 \n\t"\
98
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
99
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
100
        "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
101
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
102
        "movq                             %%mm0, %%mm3      \n\t"\
103
        "punpcklwd                        %%mm1, %%mm0      \n\t"\
104
        "punpckhwd                        %%mm1, %%mm3      \n\t"\
105
        "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
106
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
107
        "pmaddwd                          %%mm1, %%mm3      \n\t"\
108
        "paddd                            %%mm0, %%mm4      \n\t"\
109
        "paddd                            %%mm3, %%mm5      \n\t"\
110
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
111
        "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
112
        "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
113
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
114
        "movq                             %%mm2, %%mm0      \n\t"\
115
        "punpcklwd                        %%mm3, %%mm2      \n\t"\
116
        "punpckhwd                        %%mm3, %%mm0      \n\t"\
117
        "pmaddwd                          %%mm1, %%mm2      \n\t"\
118
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
119
        "paddd                            %%mm2, %%mm6      \n\t"\
120
        "paddd                            %%mm0, %%mm7      \n\t"\
121
        " jnz                                1b             \n\t"\
122
        "psrad                              $16, %%mm4      \n\t"\
123
        "psrad                              $16, %%mm5      \n\t"\
124
        "psrad                              $16, %%mm6      \n\t"\
125
        "psrad                              $16, %%mm7      \n\t"\
126
        "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
127
        "packssdw                         %%mm5, %%mm4      \n\t"\
128
        "packssdw                         %%mm7, %%mm6      \n\t"\
129
        "paddw                            %%mm0, %%mm4      \n\t"\
130
        "paddw                            %%mm0, %%mm6      \n\t"\
131
        "psraw                               $3, %%mm4      \n\t"\
132
        "psraw                               $3, %%mm6      \n\t"\
133
        "packuswb                         %%mm6, %%mm4      \n\t"\
134
        MOVNTQ(%%mm4, (%1, %%REGa))\
135
        "add                                 $8, %%"REG_a"  \n\t"\
136
        "cmp                                 %2, %%"REG_a"  \n\t"\
137
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
138
        "pxor                             %%mm4, %%mm4      \n\t"\
139
        "pxor                             %%mm5, %%mm5      \n\t"\
140
        "pxor                             %%mm6, %%mm6      \n\t"\
141
        "pxor                             %%mm7, %%mm7      \n\t"\
142
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
143
        "jb                                  1b             \n\t"\
144
        :: "r" (&c->redDither),\
145
        "r" (dest), "g" ((x86_reg)width)\
146
        : "%"REG_a, "%"REG_d, "%"REG_S\
147
    );
148

  
149
/*
 * Asm text only (no __asm__ wrapper, operand list or clobbers): shift 16-bit
 * samples right by 7 and store them as bytes with unsigned saturation,
 * 8 samples per pass.
 * Reads words at %0 + 2*REG_a and writes bytes at %1 + REG_a, with REG_a
 * loaded from %2.  The loop repeats until "add $8" carries out of REG_a, so
 * %2 is presumably a negative count with %0 and %1 pointing at the end of
 * their buffers -- confirm at the call site.
 */
#define YSCALEYUV2YV121 \
150
    "mov %2, %%"REG_a"                    \n\t"\
151
    ".p2align               4             \n\t" /* FIXME Unroll? */\
152
    "1:                                   \n\t"\
153
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
154
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
155
    "psraw                 $7, %%mm0      \n\t"\
156
    "psraw                 $7, %%mm1      \n\t"\
157
    "packuswb           %%mm1, %%mm0      \n\t"\
158
    MOVNTQ(%%mm0, (%1, %%REGa))\
159
    "add                   $8, %%"REG_a"  \n\t"\
160
    "jnc                   1b             \n\t"
161

  
162
/*
 * Asm text only (no __asm__ wrapper, operand list or clobbers): rounding
 * version of the 16-bit -> 8-bit conversion, 8 samples per pass.
 * mm7 is built as a per-word constant of 64 (all-ones from pcmpeqw, logical
 * shift right by 15 leaves 1, shift left by 6 gives 64), i.e. half of the
 * 1<<7 divisor; it is added with signed saturation before the shift right
 * by 7.  Results are packed to bytes with unsigned saturation.
 * Reads words at %0 + 2*REG_a and writes bytes at %1 + REG_a, with REG_a
 * loaded from %2.  The loop repeats until "add $8" carries out of REG_a, so
 * %2 is presumably a negative count with %0 and %1 pointing at the end of
 * their buffers -- confirm at the call site.
 */
#define YSCALEYUV2YV121_ACCURATE \
163
    "mov %2, %%"REG_a"                    \n\t"\
164
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
165
    "psrlw                 $15, %%mm7     \n\t"\
166
    "psllw                  $6, %%mm7     \n\t"\
167
    ".p2align                4            \n\t" /* FIXME Unroll? */\
168
    "1:                                   \n\t"\
169
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
170
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
171
    "paddsw             %%mm7, %%mm0      \n\t"\
172
    "paddsw             %%mm7, %%mm1      \n\t"\
173
    "psraw                 $7, %%mm0      \n\t"\
174
    "psraw                 $7, %%mm1      \n\t"\
175
    "packuswb           %%mm1, %%mm0      \n\t"\
176
    MOVNTQ(%%mm0, (%1, %%REGa))\
177
    "add                   $8, %%"REG_a"  \n\t"\
178
    "jnc                   1b             \n\t"
179

  
180
/*
181
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
182
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
183
       "r" (dest), "m" (dstW_reg),
184
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
185
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
186
*/
187
/*
 * Opens an "__asm__ volatile(" statement and emits the chroma part of the
 * multi-tap filter for packed output.  The statement is left open: more asm
 * text may follow, and it has to be closed by an operand list such as
 * YSCALEYUV2PACKEDX_END.
 *
 * Zeroes REG_a, then from label 1 walks the filter list at
 * CHR_MMX_FILTER_OFFSET(%0) in 16-byte steps (source pointer at +0,
 * coefficient quadword at +8, null pointer ends the list; inner loop is
 * label 2).  mm3 accumulates the samples at pointer + REG_a and mm4 those
 * AV_STRINGIFY(VOF) bytes further on (U and V per the inline comments); both
 * start from the quadword at VROUNDER_OFFSET(%0).  No branch back to label 1
 * is emitted here.
 */
#define YSCALEYUV2PACKEDX_UV \
188
    __asm__ volatile(\
189
        "xor                   %%"REG_a", %%"REG_a"     \n\t"\
190
        ".p2align                      4                \n\t"\
191
        "nop                                            \n\t"\
192
        "1:                                             \n\t"\
193
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
194
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
195
        "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
196
        "movq                      %%mm3, %%mm4         \n\t"\
197
        ".p2align                      4                \n\t"\
198
        "2:                                             \n\t"\
199
        "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
200
        "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
201
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
202
        "add                         $16, %%"REG_d"     \n\t"\
203
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
204
        "pmulhw                    %%mm0, %%mm2         \n\t"\
205
        "pmulhw                    %%mm0, %%mm5         \n\t"\
206
        "paddw                     %%mm2, %%mm3         \n\t"\
207
        "paddw                     %%mm5, %%mm4         \n\t"\
208
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
209
        " jnz                         2b                \n\t"\
210

  
211
/*
 * Asm text only: multi-tap filter over one filter list, with the MMX
 * registers chosen by the caller.
 *
 *   offset     - string literal: displacement from %0 of the filter list
 *   coeff      - register that receives each coefficient quadword
 *   src1, src2 - scratch registers for the two source quadwords
 *   dst1, dst2 - accumulators; both start from the quadword at
 *                VROUNDER_OFFSET(%0)
 *
 * The register arguments are stringified, so they are passed as written in
 * asm (e.g. %%mm0).  The list is walked in 16-byte steps (source pointer at
 * +0, coefficients at +8, null pointer ends the list).  Sources are read at
 * pointer + 2*REG_a and 8 bytes further on.  REG_a is used as the index but
 * is not initialised here; REG_d and REG_S are overwritten.
 */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
212
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
213
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
214
    "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
215
    "movq                    "#dst1", "#dst2"       \n\t"\
216
    ".p2align                      4                \n\t"\
217
    "2:                                             \n\t"\
218
    "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
219
    "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
220
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
221
    "add                         $16, %%"REG_d"            \n\t"\
222
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
223
    "pmulhw                 "#coeff", "#src1"       \n\t"\
224
    "pmulhw                 "#coeff", "#src2"       \n\t"\
225
    "paddw                   "#src1", "#dst1"       \n\t"\
226
    "paddw                   "#src2", "#dst2"       \n\t"\
227
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
228
    " jnz                         2b                \n\t"\
229

  
230
/*
 * Chroma pass followed by a luma pass over the list at
 * LUM_MMX_FILTER_OFFSET.  Afterwards mm3/mm4 hold the chroma sums and
 * mm1/mm7 the two luma sums; mm0, mm2 and mm5 are scratch.  The asm
 * statement opened by YSCALEYUV2PACKEDX_UV is still open.
 */
#define YSCALEYUV2PACKEDX \
231
    YSCALEYUV2PACKEDX_UV \
232
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
233

  
234
/*
 * Input-operand list, clobber list and closing ");" for an asm statement
 * left open by an earlier macro.  No outputs are declared.
 *   %0      = &c->redDither   (register)
 *   %1..%3  = dummy           (memory)
 *   %4      = dest            (register)
 *   %5      = dstW_reg        (memory)
 * c, dummy, dest and dstW_reg must be in scope where this is expanded.
 * Declares REG_a, REG_d and REG_S as clobbered.
 */
#define YSCALEYUV2PACKEDX_END                     \
235
        :: "r" (&c->redDither),                   \
236
            "m" (dummy), "m" (dummy), "m" (dummy),\
237
            "r" (dest), "m" (dstW_reg)            \
238
        : "%"REG_a, "%"REG_d, "%"REG_S            \
239
    );
240

  
241
/*
 * Opens an "__asm__ volatile(" statement and emits the chroma part of the
 * 32-bit-accumulating multi-tap filter for packed output.  The statement is
 * left open and has to be closed by an operand list such as
 * YSCALEYUV2PACKEDX_END.
 *
 * Zeroes REG_a, then from label 1 walks the filter list at
 * CHR_MMX_FILTER_OFFSET(%0).  Each entry supplies two source pointers (at +0
 * and +APCK_PTR2) and a coefficient quadword (at +APCK_COEF); entries are
 * APCK_SIZE bytes apart and a null first pointer in the next entry ends the
 * list (inner loop is label 2).  Words from both sources are interleaved and
 * multiplied-and-added with pmaddwd into the dword accumulators mm4/mm5 (U
 * per the inline comments) and mm6/mm7 (V).  The sums are then shifted right
 * by 16, packed to words with signed saturation, offset by the quadword at
 * VROUNDER_OFFSET(%0) and written to U_TEMP(%0) and V_TEMP(%0), so this
 * macro stores into the structure addressed by %0.  No branch back to
 * label 1 is emitted here.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
242
    __asm__ volatile(\
243
        "xor %%"REG_a", %%"REG_a"                       \n\t"\
244
        ".p2align                      4                \n\t"\
245
        "nop                                            \n\t"\
246
        "1:                                             \n\t"\
247
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
248
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
249
        "pxor                      %%mm4, %%mm4         \n\t"\
250
        "pxor                      %%mm5, %%mm5         \n\t"\
251
        "pxor                      %%mm6, %%mm6         \n\t"\
252
        "pxor                      %%mm7, %%mm7         \n\t"\
253
        ".p2align                      4                \n\t"\
254
        "2:                                             \n\t"\
255
        "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
256
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
257
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
258
        "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
259
        "movq                      %%mm0, %%mm3         \n\t"\
260
        "punpcklwd                 %%mm1, %%mm0         \n\t"\
261
        "punpckhwd                 %%mm1, %%mm3         \n\t"\
262
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
263
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
264
        "pmaddwd                   %%mm1, %%mm3         \n\t"\
265
        "paddd                     %%mm0, %%mm4         \n\t"\
266
        "paddd                     %%mm3, %%mm5         \n\t"\
267
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
268
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
269
        "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
270
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
271
        "movq                      %%mm2, %%mm0         \n\t"\
272
        "punpcklwd                 %%mm3, %%mm2         \n\t"\
273
        "punpckhwd                 %%mm3, %%mm0         \n\t"\
274
        "pmaddwd                   %%mm1, %%mm2         \n\t"\
275
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
276
        "paddd                     %%mm2, %%mm6         \n\t"\
277
        "paddd                     %%mm0, %%mm7         \n\t"\
278
        " jnz                         2b                \n\t"\
279
        "psrad                       $16, %%mm4         \n\t"\
280
        "psrad                       $16, %%mm5         \n\t"\
281
        "psrad                       $16, %%mm6         \n\t"\
282
        "psrad                       $16, %%mm7         \n\t"\
283
        "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
284
        "packssdw                  %%mm5, %%mm4         \n\t"\
285
        "packssdw                  %%mm7, %%mm6         \n\t"\
286
        "paddw                     %%mm0, %%mm4         \n\t"\
287
        "paddw                     %%mm0, %%mm6         \n\t"\
288
        "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
289
        "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
290

  
291
/*
 * Asm text only: 32-bit-accumulating multi-tap filter over the list at
 * offset(%0) (offset is a string literal), followed by a reload of the
 * chroma values.
 *
 * Each filter entry supplies two source pointers (at +0 and +APCK_PTR2) and
 * a coefficient quadword (at +APCK_COEF); entries are APCK_SIZE bytes apart
 * and a null first pointer in the next entry ends the list.  Sources are
 * read at pointer + 2*REG_a and 8 bytes further on.  The dword accumulators
 * are mm1/mm5 for the first quadword and mm7/mm6 for the second.  After the
 * loop the sums are shifted right by 16, packed to words with signed
 * saturation (results in mm1 and mm7) and offset by the quadword at
 * VROUNDER_OFFSET(%0).  Finally mm3 and mm4 are loaded from U_TEMP(%0) and
 * V_TEMP(%0).  REG_a is used as the index but is not initialised here;
 * REG_d, REG_S and mm0-mm7 are overwritten.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
292
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
293
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
294
    "pxor                      %%mm1, %%mm1         \n\t"\
295
    "pxor                      %%mm5, %%mm5         \n\t"\
296
    "pxor                      %%mm7, %%mm7         \n\t"\
297
    "pxor                      %%mm6, %%mm6         \n\t"\
298
    ".p2align                      4                \n\t"\
299
    "2:                                             \n\t"\
300
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
301
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
302
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
303
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
304
    "movq                      %%mm0, %%mm3         \n\t"\
305
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
306
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
307
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
308
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
309
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
310
    "paddd                     %%mm0, %%mm1         \n\t"\
311
    "paddd                     %%mm3, %%mm5         \n\t"\
312
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
313
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
314
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
315
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
316
    "movq                      %%mm2, %%mm0         \n\t"\
317
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
318
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
319
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
320
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
321
    "paddd                     %%mm2, %%mm7         \n\t"\
322
    "paddd                     %%mm0, %%mm6         \n\t"\
323
    " jnz                         2b                \n\t"\
324
    "psrad                       $16, %%mm1         \n\t"\
325
    "psrad                       $16, %%mm5         \n\t"\
326
    "psrad                       $16, %%mm7         \n\t"\
327
    "psrad                       $16, %%mm6         \n\t"\
328
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
329
    "packssdw                  %%mm5, %%mm1         \n\t"\
330
    "packssdw                  %%mm6, %%mm7         \n\t"\
331
    "paddw                     %%mm0, %%mm1         \n\t"\
332
    "paddw                     %%mm0, %%mm7         \n\t"\
333
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
334
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\
335

  
336
/*
 * Chroma pass (results parked in U_TEMP/V_TEMP) followed by the luma pass
 * over the list at LUM_MMX_FILTER_OFFSET.  Afterwards mm1/mm7 hold the two
 * luma results and mm3/mm4 the reloaded chroma results.  The asm statement
 * opened by YSCALEYUV2PACKEDX_ACCURATE_UV is still open.
 */
#define YSCALEYUV2PACKEDX_ACCURATE \
337
    YSCALEYUV2PACKEDX_ACCURATE_UV \
338
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
339

  
340
/*
 * Asm text only: YUV -> RGB arithmetic on values already in MMX registers.
 * On entry mm1 and mm7 hold two quadwords of luma and mm3 / mm4 hold U / V
 * (16-bit lanes).  All constants are read relative to %0.
 *
 * Steps: subtract U_OFFSET / V_OFFSET / Y_OFFSET, scale with pmulhw by
 * UG/VG/UB/VR/Y_COEFF, sum the two green contributions, duplicate every
 * chroma term into two adjacent lanes with punpck{l,h}wd, add luma, and pack
 * to bytes with unsigned saturation.
 *
 * On exit the packed bytes are in mm2 (blue), mm4 (green) and mm5 (red),
 * following the register naming of the inline comments.  mm0-mm7 are all
 * overwritten.
 */
#define YSCALEYUV2RGBX \
341
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
342
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
343
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
344
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
345
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
346
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
347
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
348
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
349
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
350
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
351
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
352
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
353
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
354
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
355
    "paddw           %%mm3, %%mm4       \n\t"\
356
    "movq            %%mm2, %%mm0       \n\t"\
357
    "movq            %%mm5, %%mm6       \n\t"\
358
    "movq            %%mm4, %%mm3       \n\t"\
359
    "punpcklwd       %%mm2, %%mm2       \n\t"\
360
    "punpcklwd       %%mm5, %%mm5       \n\t"\
361
    "punpcklwd       %%mm4, %%mm4       \n\t"\
362
    "paddw           %%mm1, %%mm2       \n\t"\
363
    "paddw           %%mm1, %%mm5       \n\t"\
364
    "paddw           %%mm1, %%mm4       \n\t"\
365
    "punpckhwd       %%mm0, %%mm0       \n\t"\
366
    "punpckhwd       %%mm6, %%mm6       \n\t"\
367
    "punpckhwd       %%mm3, %%mm3       \n\t"\
368
    "paddw           %%mm7, %%mm0       \n\t"\
369
    "paddw           %%mm7, %%mm6       \n\t"\
370
    "paddw           %%mm7, %%mm3       \n\t"\
371
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
372
    "packuswb        %%mm0, %%mm2       \n\t"\
373
    "packuswb        %%mm6, %%mm5       \n\t"\
374
    "packuswb        %%mm3, %%mm4       \n\t"\
375

  
376
/*
 * Asm text only: blend two input lines (luma and chroma) and leave 16-bit
 * results in registers for a packed-output store that follows.
 *
 *   index - register used as the loop index; zeroed here
 *   c     - register holding the base address for the *_MMX_FILTER_OFFSET
 *           fields
 * Both arguments are stringified, so they are passed as written in asm.
 * Operands used: %0 / %1 = luma lines, %2 / %3 = chroma lines (buf0/buf1 and
 * uvbuf0/uvbuf1 in the inline comments).
 *
 * NOTE: before the loop, the quadwords at CHR_MMX_FILTER_OFFSET+8 and
 * LUM_MMX_FILTER_OFFSET+8 are shifted right by 3 and written back to memory,
 * so every execution of this code modifies those stored coefficients.
 *
 * Per lane the loop body computes
 *     ((line0 - line1) * coeff >> 16) + (line1 >> 7)
 * leaving U in mm3, V in mm4 and the two luma quadwords in mm1 and mm7.
 * Label 1 marks the loop head; no branch back to it is emitted here.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
377
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
378
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
379
    "psraw                $3, %%mm0                           \n\t"\
380
    "psraw                $3, %%mm1                           \n\t"\
381
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
382
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
383
    "xor            "#index", "#index"                        \n\t"\
384
    ".p2align              4            \n\t"\
385
    "1:                                 \n\t"\
386
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
387
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
388
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
389
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
390
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
391
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
392
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
393
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
394
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
395
    "psraw                $7, %%mm3     \n\t" /* uvbuf1[eax] >>7*/\
396
    "psraw                $7, %%mm4     \n\t" /* uvbuf1[eax+2048] >>7*/\
397
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
398
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
399
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
400
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
401
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
402
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
403
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
404
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
405
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
406
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
407
    "psraw                $7, %%mm1     \n\t" /* buf1[eax] >>7*/\
408
    "psraw                $7, %%mm7     \n\t" /* buf1[eax] >>7*/\
409
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
410
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
411

  
412
/* Extra expansion level: macro arguments are expanded before
 * REAL_YSCALEYUV2PACKED stringifies them with #. */
#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
413

  
414
/*
 * Asm text only: chroma half of the two-line blend used ahead of RGB
 * conversion.
 *
 *   index - register used as the loop index; zeroed here
 *   c     - register holding the base address for the coefficient fields
 * Both arguments are stringified, so they are passed as written in asm.
 * Operands used: %2 / %3 = the two chroma lines (uvbuf0 / uvbuf1 in the
 * inline comments); the second plane is read AV_STRINGIFY(VOF) bytes further
 * on.
 *
 * Per lane: ((line0 - line1) * coeff >> 16) + (line1 >> 4), with coeff read
 * from CHR_MMX_FILTER_OFFSET+8(c).  U_OFFSET / V_OFFSET are then subtracted,
 * the centred values are copied to mm2 (U) and mm5 (V), and mm3 / mm4 are
 * multiplied by UG_COEFF / VG_COEFF.  Label 1 marks the loop head; no branch
 * back to it is emitted here.
 */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
415
    "xor            "#index", "#index"  \n\t"\
416
    ".p2align              4            \n\t"\
417
    "1:                                 \n\t"\
418
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
419
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
420
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
421
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
422
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
423
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
424
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
425
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
426
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
427
    "psraw                $4, %%mm3     \n\t" /* uvbuf1[eax] >>4*/\
428
    "psraw                $4, %%mm4     \n\t" /* uvbuf1[eax+2048] >>4*/\
429
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
430
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
431
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
432
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
433
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
434
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
435
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
436
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
437
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
437
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
438

  
439
/*
 * Asm text only: blend two lines of 16-bit samples.
 *
 *   index  - register used as the element index (scaled by 2 in addresses)
 *   c      - register holding the base address for LUM_MMX_FILTER_OFFSET
 *   b1, b2 - base addresses of the two lines
 * All arguments are stringified, so they are passed as written in asm.
 *
 * Per lane: ((b1 - b2) * coeff >> 16) + (b2 >> 4), with coeff read from
 * LUM_MMX_FILTER_OFFSET+8(c).  Results are left in mm1 (first quadword) and
 * mm7 (second quadword); mm0 and mm6 are scratch.
 */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
440
    "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
441
    "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
442
    "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
443
    "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
444
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
445
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
446
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
447
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
448
    "psraw                $4, %%mm1     \n\t" /* buf1[eax] >>4*/\
449
    "psraw                $4, %%mm7     \n\t" /* buf1[eax] >>4*/\
450
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
451
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
452

  
453
/*
 * Asm text only: final YUV -> RGB arithmetic, with constants read relative
 * to register c (stringified, so passed as written in asm).
 *
 * On entry mm2 / mm5 hold the centred U / V values, mm3 / mm4 the two green
 * contributions and mm1 / mm7 two quadwords of luma (see the inline
 * comments).  Scales mm2 by UB_COEFF and mm5 by VR_COEFF, subtracts Y_OFFSET
 * from luma and scales it by Y_COEFF, sums the green terms, duplicates every
 * chroma term into two adjacent lanes with punpck{l,h}wd, adds luma, and
 * packs to bytes with unsigned saturation.
 *
 * On exit the packed bytes are in mm2 (blue), mm4 (green) and mm5 (red),
 * following the register naming of the inline comments.
 */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
454
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
455
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
456
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
457
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
458
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
459
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
460
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
461
    "paddw             %%mm3, %%mm4     \n\t"\
462
    "movq              %%mm2, %%mm0     \n\t"\
463
    "movq              %%mm5, %%mm6     \n\t"\
464
    "movq              %%mm4, %%mm3     \n\t"\
465
    "punpcklwd         %%mm2, %%mm2     \n\t"\
466
    "punpcklwd         %%mm5, %%mm5     \n\t"\
467
    "punpcklwd         %%mm4, %%mm4     \n\t"\
468
    "paddw             %%mm1, %%mm2     \n\t"\
469
    "paddw             %%mm1, %%mm5     \n\t"\
470
    "paddw             %%mm1, %%mm4     \n\t"\
471
    "punpckhwd         %%mm0, %%mm0     \n\t"\
472
    "punpckhwd         %%mm6, %%mm6     \n\t"\
473
    "punpckhwd         %%mm3, %%mm3     \n\t"\
474
    "paddw             %%mm7, %%mm0     \n\t"\
475
    "paddw             %%mm7, %%mm6     \n\t"\
476
    "paddw             %%mm7, %%mm3     \n\t"\
477
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
478
    "packuswb          %%mm0, %%mm2     \n\t"\
479
    "packuswb          %%mm6, %%mm5     \n\t"\
480
    "packuswb          %%mm3, %%mm4     \n\t"\
481

  
482
/* Extra expansion level: macro arguments are expanded before
 * REAL_YSCALEYUV2RGB_YA stringifies them with #. */
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
483

  
484
/*
 * Asm text for a two-line blend followed by RGB conversion: chroma blend,
 * luma blend of the lines addressed by %0 and %1, then the coefficient /
 * packing stage.  index and c are forwarded to the REAL_* macros, which
 * stringify them; because this macro's replacement text does not apply # to
 * them itself, macro arguments are expanded before they get there.
 */
#define YSCALEYUV2RGB(index, c) \
485
    REAL_YSCALEYUV2RGB_UV(index, c) \
486
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
487
    REAL_YSCALEYUV2RGB_COEFF(c)
488

  
489
/*
 * Asm text only: single-line (no blending) load for packed output.
 *
 *   index - register used as the loop index; zeroed here (stringified)
 *   c     - accepted but not referenced in this macro's body
 * Operands used: %0 = luma line, %2 = chroma line (buf0 / uvbuf0 in the
 * inline comments); the second chroma plane is read AV_STRINGIFY(VOF) bytes
 * further on.
 *
 * Every value is arithmetically shifted right by 7.  Results: mm3 = U,
 * mm4 = V, mm1 / mm7 = two quadwords of luma.  Label 1 marks the loop head;
 * no branch back to it is emitted here.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
490
    "xor            "#index", "#index"  \n\t"\
491
    ".p2align              4            \n\t"\
492
    "1:                                 \n\t"\
493
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
494
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
495
    "psraw                $7, %%mm3     \n\t" \
496
    "psraw                $7, %%mm4     \n\t" \
497
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
498
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
499
    "psraw                $7, %%mm1     \n\t" \
500
    "psraw                $7, %%mm7     \n\t" \
501

  
502
/* Extra expansion level: macro arguments are expanded before
 * REAL_YSCALEYUV2PACKED1 stringifies them with #. */
#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
503

  
504
/*
 * Asm text only: single-line (no blending) YUV -> RGB conversion.
 *
 *   index - register used as the loop index; zeroed here
 *   c     - register holding the base address for the offset / coefficient
 *           fields
 * Both arguments are stringified, so they are passed as written in asm.
 * Operands used: %0 = luma line, %2 = chroma line (buf0 / uvbuf0 in the
 * inline comments); the second chroma plane is read AV_STRINGIFY(VOF) bytes
 * further on.
 *
 * Chroma and luma are arithmetically shifted right by 4, centred with
 * U_OFFSET / V_OFFSET / Y_OFFSET, scaled with pmulhw by UG/VG/UB/VR/Y_COEFF,
 * combined, and packed to bytes with unsigned saturation.  On exit the
 * packed bytes are in mm2 (blue), mm4 (green) and mm5 (red), following the
 * register naming of the inline comments.  Label 1 marks the loop head; no
 * branch back to it is emitted here.
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
505
    "xor            "#index", "#index"  \n\t"\
506
    ".p2align              4            \n\t"\
507
    "1:                                 \n\t"\
508
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
509
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
510
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] >>4*/\
511
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] >>4*/\
512
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
513
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
514
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
515
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
516
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
517
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
518
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
519
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
520
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
521
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
522
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] >>4*/\
523
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
524
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
525
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
526
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
527
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
528
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
529
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
530
    "paddw             %%mm3, %%mm4     \n\t"\
531
    "movq              %%mm2, %%mm0     \n\t"\
532
    "movq              %%mm5, %%mm6     \n\t"\
533
    "movq              %%mm4, %%mm3     \n\t"\
534
    "punpcklwd         %%mm2, %%mm2     \n\t"\
535
    "punpcklwd         %%mm5, %%mm5     \n\t"\
536
    "punpcklwd         %%mm4, %%mm4     \n\t"\
537
    "paddw             %%mm1, %%mm2     \n\t"\
538
    "paddw             %%mm1, %%mm5     \n\t"\
539
    "paddw             %%mm1, %%mm4     \n\t"\
540
    "punpckhwd         %%mm0, %%mm0     \n\t"\
541
    "punpckhwd         %%mm6, %%mm6     \n\t"\
542
    "punpckhwd         %%mm3, %%mm3     \n\t"\
543
    "paddw             %%mm7, %%mm0     \n\t"\
544
    "paddw             %%mm7, %%mm6     \n\t"\
545
    "paddw             %%mm7, %%mm3     \n\t"\
546
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
547
    "packuswb          %%mm0, %%mm2     \n\t"\
548
    "packuswb          %%mm6, %%mm5     \n\t"\
549
    "packuswb          %%mm3, %%mm4     \n\t"\
550

  
551
/* Extra expansion level: macro arguments are expanded before
 * REAL_YSCALEYUV2RGB1 stringifies them with #. */
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
552

  
553
/*
 * Asm text for the head of a packed-output loop that reads one luma line and
 * two chroma lines.
 *
 * Zeroes `index`, emits local label "1:", then for 8 pixels:
 *   - loads chroma words through operands %2 and %3 (uvbuf0 / uvbuf1 per the
 *     inline comments), adds the two lines and shifts the sums right by 8:
 *     mm3 comes from offset 0, mm4 from byte offset VOF;
 *   - loads 8 luma words through operand %0 into mm1 / mm7 and shifts them
 *     right (arithmetic) by 7.
 *
 * index: register used as the loop counter / offset (stringified into the asm)
 * c:     not used by this macro
 *
 * The macro itself does not close the loop; the WRITE* macros in this file
 * end with "jb 1b" and are expected to follow it.
 * Clobbers mm1-mm5 and mm7.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
554
    "xor "#index", "#index"             \n\t"\
555
    ".p2align              4            \n\t"\
556
    "1:                                 \n\t"\
557
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
558
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
559
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+VOF]*/\
560
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+VOF]*/\
561
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
562
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+VOF] + uvbuf1[eax+VOF]*/\
563
    "psrlw                $8, %%mm3     \n\t" \
564
    "psrlw                $8, %%mm4     \n\t" \
565
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
566
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax+4]*/\
567
    "psraw                $7, %%mm1     \n\t" \
568
    "psraw                $7, %%mm7     \n\t"
569
/* Indirection layer so the arguments are macro-expanded before being stringified. */
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
570

  
571
/*
 * do vertical chrominance interpolation
 *
 * Asm text for the head of a YUV->RGB loop that reads one luma line (operand
 * %0) and two chroma lines (operands %2 and %3). Zeroes `index`, emits local
 * label "1:", then for 8 pixels:
 *   - adds the two chroma lines and shifts the sums right by 5;
 *   - subtracts U_OFFSET / V_OFFSET and multiplies by the UG/VG/UB/VR
 *     coefficients, all read at offsets from `c`;
 *   - loads 8 luma words, shifts them right by 4, subtracts Y_OFFSET and
 *     multiplies by Y_COEFF;
 *   - adds luma and chroma terms and packs with unsigned saturation.
 *
 * index: register used as the loop counter / offset (stringified into the asm)
 * c:     base register for the coefficient/offset table
 *
 * On exit: mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes.
 * Clobbers mm0-mm7. The loop is not closed here; the WRITE* macros in this
 * file end with "jb 1b".
 */
572
#define REAL_YSCALEYUV2RGB1b(index, c) \
573
    "xor            "#index", "#index"  \n\t"\
574
    ".p2align              4            \n\t"\
575
    "1:                                 \n\t"\
576
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
577
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
578
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+VOF]*/\
579
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+VOF]*/\
580
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
581
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+VOF] + uvbuf1[eax+VOF]*/\
582
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
583
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
584
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
585
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
586
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
587
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
588
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
589
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
590
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
591
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
592
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax+4]*/\
593
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
594
    "psraw                $4, %%mm7     \n\t" /* buf0[eax+4] >>4*/\
595
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
596
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
597
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
598
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
599
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
600
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
601
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
602
    "paddw             %%mm3, %%mm4     \n\t"\
603
    "movq              %%mm2, %%mm0     \n\t"\
604
    "movq              %%mm5, %%mm6     \n\t"\
605
    "movq              %%mm4, %%mm3     \n\t"\
606
    "punpcklwd         %%mm2, %%mm2     \n\t"\
607
    "punpcklwd         %%mm5, %%mm5     \n\t"\
608
    "punpcklwd         %%mm4, %%mm4     \n\t"\
609
    "paddw             %%mm1, %%mm2     \n\t"\
610
    "paddw             %%mm1, %%mm5     \n\t"\
611
    "paddw             %%mm1, %%mm4     \n\t"\
612
    "punpckhwd         %%mm0, %%mm0     \n\t"\
613
    "punpckhwd         %%mm6, %%mm6     \n\t"\
614
    "punpckhwd         %%mm3, %%mm3     \n\t"\
615
    "paddw             %%mm7, %%mm0     \n\t"\
616
    "paddw             %%mm7, %%mm6     \n\t"\
617
    "paddw             %%mm7, %%mm3     \n\t"\
618
    /* mm0=B2, mm2=B1, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
619
    "packuswb          %%mm0, %%mm2     \n\t"\
620
    "packuswb          %%mm6, %%mm5     \n\t"\
621
    "packuswb          %%mm3, %%mm4     \n\t"\
622

  
623
/* Indirection layer: the arguments are macro-expanded here before
 * REAL_YSCALEYUV2RGB1b pastes them into the asm text with '#'. */
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
624

  
625
/*
 * Asm text that loads 8 alpha words through operand %1 at `index`, shifts
 * each right (arithmetic) by 7 and packs them with unsigned saturation into
 * the 8 bytes of mm7 (first four samples in the low half).
 *
 * index: register holding the current sample index (stringified into the asm)
 *
 * Clobbers mm1 and mm7.
 */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
626
    "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
627
    "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
628
    "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
629
    "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
630
    "packuswb          %%mm1, %%mm7     \n\t"
631
/* Indirection layer so the argument is macro-expanded before being stringified. */
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
632

  
633
/*
 * Asm text that interleaves four registers of 8 bytes each (b, g, r, a) into
 * eight 32-bit pixels, stores them and closes the enclosing loop.
 *
 * dst:            base register of the destination
 * dstw:           operand compared against index to end the loop
 * index:          pixel index register; advanced by 8 per iteration
 * b, g, r, a:     input registers; b and r are overwritten
 * q0, q2, q3, t:  scratch registers, overwritten
 *
 * Four quadwords are written at dst + index*4 (+0, +8, +16, +24) through
 * MOVNTQ. Afterwards index += 8 and control jumps back to local label 1
 * while index is below dstw (unsigned compare, "jb").
 */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
634
    "movq       "#b", "#q2"     \n\t" /* B */\
635
    "movq       "#r", "#t"      \n\t" /* R */\
636
    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
637
    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
638
    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
639
    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
640
    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
641
    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
642
    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
643
    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
644
    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
645
    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
646
\
647
    MOVNTQ(   q0,   (dst, index, 4))\
648
    MOVNTQ(    b,  8(dst, index, 4))\
649
    MOVNTQ(   q2, 16(dst, index, 4))\
650
    MOVNTQ(   q3, 24(dst, index, 4))\
651
\
652
    "add      $8, "#index"      \n\t"\
653
    "cmp "#dstw", "#index"      \n\t"\
654
    " jb      1b                \n\t"
655
/* Indirection layer so the arguments are macro-expanded before being stringified. */
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
656

  
657
/*
 * Asm text that packs 8 pixels into 16-bit words, stores them and closes the
 * enclosing loop.
 *
 * Entry: mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes. mm7 supplies the
 * high byte of the widened G words; the callers visible in this file clear
 * it with "pxor %%mm7, %%mm7" first.
 * NOTE(review): bF8 / bFC are defined elsewhere; presumably 0xF8 / 0xFC in
 * every byte, which would give a 5-6-5 layout - confirm.
 *
 * dst:   base register of the destination
 * dstw:  operand compared against index to end the loop
 * index: pixel index register; advanced by 8 per iteration
 *
 * Two quadwords are written at dst + index*2 (+0, +8) through MOVNTQ, then
 * index += 8 and control jumps back to local label 1 while index is below
 * dstw (unsigned compare). Clobbers mm1-mm5.
 */
#define REAL_WRITERGB16(dst, dstw, index) \
658
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
659
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
660
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
661
    "psrlq           $3, %%mm2  \n\t" /* B >> 3 */\
662
\
663
    "movq         %%mm2, %%mm1  \n\t" /* copy of B for pixels 4-7 */\
664
    "movq         %%mm4, %%mm3  \n\t" /* copy of G for pixels 0-3 */\
665
\
666
    "punpcklbw    %%mm7, %%mm3  \n\t" /* words: mm7 byte high, G low (pixels 0-3) */\
667
    "punpcklbw    %%mm5, %%mm2  \n\t" /* words: R high, B low (pixels 0-3) */\
668
    "punpckhbw    %%mm7, %%mm4  \n\t" /* words: mm7 byte high, G low (pixels 4-7) */\
669
    "punpckhbw    %%mm5, %%mm1  \n\t" /* words: R high, B low (pixels 4-7) */\
670
\
671
    "psllq           $3, %%mm3  \n\t" /* move G up by 3 bits */\
672
    "psllq           $3, %%mm4  \n\t"\
673
\
674
    "por          %%mm3, %%mm2  \n\t" /* merge G into pixels 0-3 */\
675
    "por          %%mm4, %%mm1  \n\t" /* merge G into pixels 4-7 */\
676
\
677
    MOVNTQ(%%mm2,  (dst, index, 2))\
678
    MOVNTQ(%%mm1, 8(dst, index, 2))\
679
\
680
    "add             $8, "#index"   \n\t"\
681
    "cmp        "#dstw", "#index"   \n\t"\
682
    " jb             1b             \n\t"
683
/* Indirection layer so the arguments are macro-expanded before being stringified. */
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
684

  
685
/*
 * Asm text that packs 8 pixels into 16-bit words, stores them and closes the
 * enclosing loop. Same structure as REAL_WRITERGB16, except that G is masked
 * with bF8 instead of bFC, R is additionally shifted right by 1 and G is
 * shifted left by 2 instead of 3.
 *
 * Entry: mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes. mm7 supplies the
 * high byte of the widened G words; the callers visible in this file clear
 * it with "pxor %%mm7, %%mm7" first.
 * NOTE(review): bF8 is defined elsewhere; presumably 0xF8 in every byte,
 * which would give a 5-5-5 layout - confirm.
 *
 * dst:   base register of the destination
 * dstw:  operand compared against index to end the loop
 * index: pixel index register; advanced by 8 per iteration
 *
 * Two quadwords are written at dst + index*2 (+0, +8) through MOVNTQ, then
 * index += 8 and control jumps back to local label 1 while index is below
 * dstw (unsigned compare). Clobbers mm1-mm5.
 */
#define REAL_WRITERGB15(dst, dstw, index) \
686
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
687
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
688
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
689
    "psrlq           $3, %%mm2  \n\t" /* B >> 3 */\
690
    "psrlq           $1, %%mm5  \n\t" /* R >> 1 */\
691
\
692
    "movq         %%mm2, %%mm1  \n\t" /* copy of B for pixels 4-7 */\
693
    "movq         %%mm4, %%mm3  \n\t" /* copy of G for pixels 0-3 */\
694
\
695
    "punpcklbw    %%mm7, %%mm3  \n\t" /* words: mm7 byte high, G low (pixels 0-3) */\
696
    "punpcklbw    %%mm5, %%mm2  \n\t" /* words: R high, B low (pixels 0-3) */\
697
    "punpckhbw    %%mm7, %%mm4  \n\t" /* words: mm7 byte high, G low (pixels 4-7) */\
698
    "punpckhbw    %%mm5, %%mm1  \n\t" /* words: R high, B low (pixels 4-7) */\
699
\
700
    "psllq           $2, %%mm3  \n\t" /* move G up by 2 bits */\
701
    "psllq           $2, %%mm4  \n\t"\
702
\
703
    "por          %%mm3, %%mm2  \n\t" /* merge G into pixels 0-3 */\
704
    "por          %%mm4, %%mm1  \n\t" /* merge G into pixels 4-7 */\
705
\
706
    MOVNTQ(%%mm2,  (dst, index, 2))\
707
    MOVNTQ(%%mm1, 8(dst, index, 2))\
708
\
709
    "add             $8, "#index"   \n\t"\
710
    "cmp        "#dstw", "#index"   \n\t"\
711
    " jb             1b             \n\t"
712
/* Indirection layer so the arguments are macro-expanded before being stringified. */
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
713

  
714
/*
 * Asm text that packs 8 pixels into 24 bytes (3 bytes per pixel), stores them
 * and closes the enclosing loop. This variant builds the output with
 * shift/mask/or sequences using the bm00000111 / bm11111000 / bm00001111
 * constants (defined elsewhere).
 *
 * Entry: mm2 = B, mm4 = G, mm5 = R, mm7 = 0.
 *
 * dst:   register pointing at the output position; advanced by 24
 * dstw:  operand compared against index to end the loop
 * index: pixel index register; advanced by 8 per iteration
 *
 * Three quadwords are written at dst (+0, +8, +16) through MOVNTQ. Control
 * jumps back to local label 1 while index is below dstw (unsigned compare).
 * Clobbers mm0-mm6.
 */
#define WRITEBGR24OLD(dst, dstw, index) \
715
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
716
    "movq      %%mm2, %%mm1             \n\t" /* B */\
717
    "movq      %%mm5, %%mm6             \n\t" /* R */\
718
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
719
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
720
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
721
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
722
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
723
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
724
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
725
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
726
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
727
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
728
\
729
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
730
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
731
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
732
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
733
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
734
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
735
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
736
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
737
\
738
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
739
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
740
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
741
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
742
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
743
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
744
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
745
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
746
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
747
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
748
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
749
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
750
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
751
\
752
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
753
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
754
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
755
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
756
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
757
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
758
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
759
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
760
\
761
    MOVNTQ(%%mm0,   (dst))\
762
    MOVNTQ(%%mm2,  8(dst))\
763
    MOVNTQ(%%mm3, 16(dst))\
764
    "add         $24, "#dst"            \n\t"\
765
\
766
    "add          $8, "#index"          \n\t"\
767
    "cmp     "#dstw", "#index"          \n\t"\
768
    " jb          1b                    \n\t"
769

  
770
/*
 * Asm text that packs 8 pixels into 24 bytes (3 bytes per pixel), stores them
 * and closes the enclosing loop. This variant uses only register shifts and
 * punpckhdq; it reads no mask constants from memory.
 *
 * Entry: mm2 = B, mm4 = G, mm5 = R, mm7 = 0.
 *
 * dst:   register pointing at the output position; advanced by 24
 * dstw:  operand compared against index to end the loop
 * index: pixel index register; advanced by 8 per iteration
 *
 * Three quadwords are written at dst (+0, +8, +16) through MOVNTQ. Control
 * jumps back to local label 1 while index is below dstw (unsigned compare).
 * Clobbers mm0-mm7 (mm7 is reused as a temporary, so it is no longer 0
 * afterwards).
 */
#define WRITEBGR24MMX(dst, dstw, index) \
771
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
772
    "movq      %%mm2, %%mm1     \n\t" /* B */\
773
    "movq      %%mm5, %%mm6     \n\t" /* R */\
774
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
775
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
776
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
777
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
778
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
779
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
780
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
781
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
782
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
783
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
784
\
785
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
786
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
787
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
788
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
789
\
790
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
791
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
792
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
793
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
794
\
795
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
796
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
797
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
798
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
799
\
800
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
801
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
802
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
803
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
804
    MOVNTQ(%%mm0, (dst))\
805
\
806
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
807
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
808
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
809
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
810
    MOVNTQ(%%mm6, 8(dst))\
811
\
812
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
813
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
814
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
815
    MOVNTQ(%%mm5, 16(dst))\
816
\
817
    "add         $24, "#dst"    \n\t"\
818
\
819
    "add          $8, "#index"  \n\t"\
820
    "cmp     "#dstw", "#index"  \n\t"\
821
    " jb          1b            \n\t"
822

  
823
/*
 * Asm text that packs 8 pixels into 24 bytes (3 bytes per pixel), stores them
 * and closes the enclosing loop. This variant uses pshufw to replicate words
 * and the ff_M24A / ff_M24B / ff_M24C byte masks (defined elsewhere) to pick
 * the wanted byte lanes for each of the three output quadwords.
 *
 * Entry: mm2 = B, mm4 = G, mm5 = R. mm7 is loaded with ff_M24C right away,
 * so its incoming value is not used by this variant.
 *
 * dst:   register pointing at the output position; advanced by 24
 * dstw:  operand compared against index to end the loop
 * index: pixel index register; advanced by 8 per iteration
 *
 * Three quadwords are written at dst (+0, +8, +16) through MOVNTQ. Control
 * jumps back to local label 1 while index is below dstw (unsigned compare).
 * Clobbers mm0, mm1, mm3, mm4 (G is shifted right by 8), mm6 and mm7.
 */
#define WRITEBGR24MMX2(dst, dstw, index) \
824
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
825
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
826
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
827
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
828
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
829
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
830
\
831
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
832
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
833
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
834
\
835
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
836
    "por    %%mm1, %%mm6        \n\t"\
837
    "por    %%mm3, %%mm6        \n\t"\
838
    MOVNTQ(%%mm6, (dst))\
839
\
840
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
841
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
842
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
843
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
844
\
845
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
846
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
847
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
848
\
849
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
850
    "por    %%mm3, %%mm6        \n\t"\
851
    MOVNTQ(%%mm6, 8(dst))\
852
\
853
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
854
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
855
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
856
\
857
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
858
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
859
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
860
\
861
    "por    %%mm1, %%mm3        \n\t"\
862
    "por    %%mm3, %%mm6        \n\t"\
863
    MOVNTQ(%%mm6, 16(dst))\
864
\
865
    "add      $24, "#dst"       \n\t"\
866
\
867
    "add       $8, "#index"     \n\t"\
868
    "cmp  "#dstw", "#index"     \n\t"\
869
    " jb       1b               \n\t"
870

  
871
/*
 * Select the 24-bit writer for this build of the template: the pshufw-based
 * WRITEBGR24MMX2 when COMPILE_TEMPLATE_MMX2 is set, the plain-MMX
 * WRITEBGR24MMX otherwise. The #undef drops any earlier definition first
 * (presumably because this template is included more than once with
 * different COMPILE_TEMPLATE_* settings).
 */
#if COMPILE_TEMPLATE_MMX2
872
#undef WRITEBGR24
873
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
874
#else
875
#undef WRITEBGR24
876
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
877
#endif
878

  
879
/*
 * Asm text that interleaves luma and chroma into 16 output bytes, stores them
 * and closes the enclosing loop.
 *
 * Entry: mm1 / mm7 = two groups of 4 luma words, mm3 / mm4 = 4 words each of
 * the two chroma components (the callers visible in this file shift all four
 * registers right by 3 before invoking it).
 *
 * dst:   base register of the destination
 * dstw:  operand compared against index to end the loop
 * index: pixel index register; advanced by 8 per iteration
 *
 * Two quadwords are written at dst + index*2 (+0, +8) through MOVNTQ, then
 * index += 8 and control jumps back to local label 1 while index is below
 * dstw (unsigned compare). Clobbers mm1, mm3, mm4 and mm7.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
880
    "packuswb  %%mm3, %%mm3     \n\t" /* first chroma component -> bytes */\
881
    "packuswb  %%mm4, %%mm4     \n\t" /* second chroma component -> bytes */\
882
    "packuswb  %%mm7, %%mm1     \n\t" /* 8 luma bytes in mm1 */\
883
    "punpcklbw %%mm4, %%mm3     \n\t" /* interleave the two chroma components */\
884
    "movq      %%mm1, %%mm7     \n\t"\
885
    "punpcklbw %%mm3, %%mm1     \n\t" /* luma/chroma interleaved, low half */\
886
    "punpckhbw %%mm3, %%mm7     \n\t" /* luma/chroma interleaved, high half */\
887
\
888
    MOVNTQ(%%mm1, (dst, index, 2))\
889
    MOVNTQ(%%mm7, 8(dst, index, 2))\
890
\
891
    "add          $8, "#index"  \n\t"\
892
    "cmp     "#dstw", "#index"  \n\t"\
893
    " jb          1b            \n\t"
894
/* Indirection layer so the arguments are macro-expanded before being stringified. */
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
895

  
896

  
897
/**
 * Vertical scaling to planar output: dispatches to the MMX asm, the AltiVec
 * routine or the C routine yuv2yuvXinC.
 *
 * NOTE(review): this span comes from a diff view and carries two signatures,
 * RENAME(yuv2yuvX) and yuv2yuvX_c, in front of a single body; the stray
 * numbers are diff line numbers.
 *
 * @param c                  context; only c->flags is read directly here
 * @param lumFilter, lumSrc, lumFilterSize   forwarded to the AltiVec / C routine
 * @param chrFilter, chrSrc, chrFilterSize   forwarded to the AltiVec / C routine
 * @param alpSrc             forwarded to the C routine only
 * @param dest               luma output; always written by the MMX path
 * @param uDest, vDest       chroma outputs; the MMX path writes both only if uDest is non-NULL
 * @param aDest              alpha output; the MMX path writes it only if CONFIG_SWSCALE_ALPHA is set and aDest is non-NULL
 * @param dstW, chrDstW      widths passed along for the luma/alpha and chroma planes
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
898
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
899
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
21
static inline void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
22
                              const int16_t **lumSrc, int lumFilterSize,
23
                              const int16_t *chrFilter, const int16_t **chrSrc,
24
                              int chrFilterSize, const int16_t **alpSrc,
25
                              uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
26
                              uint8_t *aDest, long dstW, long chrDstW)
900 27
{
901
#if COMPILE_TEMPLATE_MMX
902
    /* The asm path is taken only when bit-exact output was not requested. */
    if(!(c->flags & SWS_BITEXACT)) {
903
        if (c->flags & SWS_ACCURATE_RND) {
904
            if (uDest) {
905
                /* U is read at offset 0, V at byte offset VOF. */
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
906
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
907
            }
908
            if (CONFIG_SWSCALE_ALPHA && aDest) {
909
                YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
910
            }
911

  
912
            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
913
        } else {
914
            if (uDest) {
915
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
916
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
917
            }
918
            if (CONFIG_SWSCALE_ALPHA && aDest) {
919
                YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
920
            }
921

  
922
            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
923
        }
924
        return;
925
    }
926
#endif
927
#if COMPILE_TEMPLATE_ALTIVEC
928
    /* Note: the AltiVec routine is not given alpSrc / aDest. */
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
929
                          chrFilter, chrSrc, chrFilterSize,
930
                          dest, uDest, vDest, dstW, chrDstW);
931
#else //COMPILE_TEMPLATE_ALTIVEC
932 28
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
933 29
                chrFilter, chrSrc, chrFilterSize,
934 30
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
935
#endif //!COMPILE_TEMPLATE_ALTIVEC
936 31
}
937 32

  
938
/**
 * Thin wrapper: forwards every argument except c to yuv2nv12XinC.
 *
 * NOTE(review): this span comes from a diff view and carries two signatures,
 * RENAME(yuv2nv12X) and yuv2nv12X_c, in front of a single body; the stray
 * numbers are diff line numbers.
 *
 * @param c  context; not used in this body
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
939
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
940
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
33
static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
34
                               const int16_t **lumSrc, int lumFilterSize,
35
                               const int16_t *chrFilter, const int16_t **chrSrc,
36
                               int chrFilterSize, uint8_t *dest, uint8_t *uDest,
37
                               int dstW, int chrDstW, enum PixelFormat dstFormat)
941 38
{
942 39
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
943 40
                 chrFilter, chrSrc, chrFilterSize,
944 41
                 dest, uDest, dstW, chrDstW, dstFormat);
945 42
}
946 43

  
947
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
948
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
44
static inline void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
45
                              const int16_t *chrSrc, const int16_t *alpSrc,
46
                              uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
47
                              uint8_t *aDest, long dstW, long chrDstW)
949 48
{
950 49
    int i;
951
#if COMPILE_TEMPLATE_MMX
952
    if(!(c->flags & SWS_BITEXACT)) {
953
        long p= 4;
954
        const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
955
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
956
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
957

  
958
        if (c->flags & SWS_ACCURATE_RND) {
959
            while(p--) {
960
                if (dst[p]) {
961
                    __asm__ volatile(
962
                        YSCALEYUV2YV121_ACCURATE
963
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
964
                        "g" (-counter[p])
965
                        : "%"REG_a
966
                    );
967
                }
968
            }
969
        } else {
970
            while(p--) {
971
                if (dst[p]) {
972
                    __asm__ volatile(
973
                        YSCALEYUV2YV121
974
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
975
                        "g" (-counter[p])
976
                        : "%"REG_a
977
                    );
978
                }
979
            }
980
        }
981
        return;
982
    }
983
#endif
984 50
    for (i=0; i<dstW; i++) {
985 51
        int val= (lumSrc[i]+64)>>7;
986 52

  
......
1019 85
/**
1020 86
 * vertical scale YV12 to RGB
1021 87
 */
1022
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1023
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1024
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
88
static inline void yuv2packedX_c(SwsContext *c, const int16_t *lumFilter,
89
                                 const int16_t **lumSrc, int lumFilterSize,
90
                                 const int16_t *chrFilter, const int16_t **chrSrc,
91
                                 int chrFilterSize, const int16_t **alpSrc,
92
                                 uint8_t *dest, long dstW, long dstY)
1025 93
{
1026
#if COMPILE_TEMPLATE_MMX
1027
    x86_reg dummy=0;
1028
    x86_reg dstW_reg = dstW;
1029
    if(!(c->flags & SWS_BITEXACT)) {
1030
        if (c->flags & SWS_ACCURATE_RND) {
1031
            switch(c->dstFormat) {
1032
            case PIX_FMT_RGB32:
1033
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1034
                    YSCALEYUV2PACKEDX_ACCURATE
1035
                    YSCALEYUV2RGBX
1036
                    "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
1037
                    "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
1038
                    "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
1039
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1040
                    "movq               "Y_TEMP"(%0), %%mm5         \n\t"
1041
                    "psraw                        $3, %%mm1         \n\t"
1042
                    "psraw                        $3, %%mm7         \n\t"
1043
                    "packuswb                  %%mm7, %%mm1         \n\t"
1044
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1045

  
1046
                    YSCALEYUV2PACKEDX_END
1047
                } else {
1048
                    YSCALEYUV2PACKEDX_ACCURATE
1049
                    YSCALEYUV2RGBX
1050
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1051
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1052

  
1053
                    YSCALEYUV2PACKEDX_END
1054
                }
1055
                return;
1056
            case PIX_FMT_BGR24:
1057
                YSCALEYUV2PACKEDX_ACCURATE
1058
                YSCALEYUV2RGBX
1059
                "pxor %%mm7, %%mm7 \n\t"
1060
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1061
                "add %4, %%"REG_c"                        \n\t"
1062
                WRITEBGR24(%%REGc, %5, %%REGa)
1063

  
1064

  
1065
                :: "r" (&c->redDither),
1066
                "m" (dummy), "m" (dummy), "m" (dummy),
1067
                "r" (dest), "m" (dstW_reg)
1068
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1069
                );
1070
                return;
1071
            case PIX_FMT_RGB555:
1072
                YSCALEYUV2PACKEDX_ACCURATE
1073
                YSCALEYUV2RGBX
1074
                "pxor %%mm7, %%mm7 \n\t"
1075
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1076
#ifdef DITHER1XBPP
1077
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1078
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1079
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1080
#endif
1081

  
1082
                WRITERGB15(%4, %5, %%REGa)
1083
                YSCALEYUV2PACKEDX_END
1084
                return;
1085
            case PIX_FMT_RGB565:
1086
                YSCALEYUV2PACKEDX_ACCURATE
1087
                YSCALEYUV2RGBX
1088
                "pxor %%mm7, %%mm7 \n\t"
1089
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1090
#ifdef DITHER1XBPP
1091
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1092
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1093
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1094
#endif
1095

  
1096
                WRITERGB16(%4, %5, %%REGa)
1097
                YSCALEYUV2PACKEDX_END
1098
                return;
1099
            case PIX_FMT_YUYV422:
1100
                YSCALEYUV2PACKEDX_ACCURATE
1101
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1102

  
1103
                "psraw $3, %%mm3    \n\t"
1104
                "psraw $3, %%mm4    \n\t"
1105
                "psraw $3, %%mm1    \n\t"
1106
                "psraw $3, %%mm7    \n\t"
1107
                WRITEYUY2(%4, %5, %%REGa)
1108
                YSCALEYUV2PACKEDX_END
1109
                return;
1110
            }
1111
        } else {
1112
            switch(c->dstFormat) {
1113
            case PIX_FMT_RGB32:
1114
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1115
                    YSCALEYUV2PACKEDX
1116
                    YSCALEYUV2RGBX
1117
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1118
                    "psraw                        $3, %%mm1         \n\t"
1119
                    "psraw                        $3, %%mm7         \n\t"
1120
                    "packuswb                  %%mm7, %%mm1         \n\t"
1121
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1122
                    YSCALEYUV2PACKEDX_END
1123
                } else {
1124
                    YSCALEYUV2PACKEDX
1125
                    YSCALEYUV2RGBX
1126
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1127
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1128
                    YSCALEYUV2PACKEDX_END
1129
                }
1130
                return;
1131
            case PIX_FMT_BGR24:
1132
                YSCALEYUV2PACKEDX
1133
                YSCALEYUV2RGBX
1134
                "pxor                    %%mm7, %%mm7       \n\t"
1135
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1136
                "add                        %4, %%"REG_c"   \n\t"
1137
                WRITEBGR24(%%REGc, %5, %%REGa)
1138

  
1139
                :: "r" (&c->redDither),
1140
                "m" (dummy), "m" (dummy), "m" (dummy),
1141
                "r" (dest),  "m" (dstW_reg)
1142
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1143
                );
1144
                return;
1145
            case PIX_FMT_RGB555:
1146
                YSCALEYUV2PACKEDX
1147
                YSCALEYUV2RGBX
1148
                "pxor %%mm7, %%mm7 \n\t"
1149
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1150
#ifdef DITHER1XBPP
1151
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1152
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1153
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1154
#endif
1155

  
1156
                WRITERGB15(%4, %5, %%REGa)
1157
                YSCALEYUV2PACKEDX_END
1158
                return;
1159
            case PIX_FMT_RGB565:
1160
                YSCALEYUV2PACKEDX
1161
                YSCALEYUV2RGBX
1162
                "pxor %%mm7, %%mm7 \n\t"
1163
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1164
#ifdef DITHER1XBPP
1165
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1166
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1167
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1168
#endif
1169

  
1170
                WRITERGB16(%4, %5, %%REGa)
1171
                YSCALEYUV2PACKEDX_END
1172
                return;
1173
            case PIX_FMT_YUYV422:
1174
                YSCALEYUV2PACKEDX
1175
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1176

  
1177
                "psraw $3, %%mm3    \n\t"
1178
                "psraw $3, %%mm4    \n\t"
1179
                "psraw $3, %%mm1    \n\t"
1180
                "psraw $3, %%mm7    \n\t"
1181
                WRITEYUY2(%4, %5, %%REGa)
1182
                YSCALEYUV2PACKEDX_END
1183
                return;
1184
            }
1185
        }
1186
    }
1187
#endif /* COMPILE_TEMPLATE_MMX */
1188
#if COMPILE_TEMPLATE_ALTIVEC
1189
    /* The following list of supported dstFormat values should
1190
       match what's found in the body of ff_yuv2packedX_altivec() */
1191
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1192
         (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1193
          c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1194
          c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
1195
            ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1196
                                   chrFilter, chrSrc, chrFilterSize,
1197
                                   dest, dstW, dstY);
1198
    else
1199
#endif
1200 94
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1201 95
                       chrFilter, chrSrc, chrFilterSize,
1202 96
                       alpSrc, dest, dstW, dstY);
......
1205 99
/**
1206 100
 * vertical bilinear scale YV12 to RGB
 *
 * Inputs are two luma lines (buf0, buf1), two chroma lines (uvbuf0, uvbuf1)
 * and two alpha lines (abuf0, abuf1); the alpha lines are only read when
 * CONFIG_SWSCALE_ALPHA is set and c->alpPixBuf is non-NULL. One packed
 * output line is written to dest, in the format selected by c->dstFormat.
 * yalpha / uvalpha are presumably the blend weights between the two lines
 * (their complements to 4095 are computed at the top of the body) -- TODO
 * confirm the exact weighting against the YSCALE* macros.
 *
 * NOTE(review): this text is a rendered diff. Lines following a single
 * old line number appear to be removed by the revision (the RENAME()
 * signature and the whole COMPILE_TEMPLATE_MMX section), lines following a
 * single new line number appear to be added (the yuv2packed2_c signature).
 * Verify against the actual revision before relying on this.
1207 101
 */
1208
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1209
                          const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
102
static inline void yuv2packed2_c(SwsContext *c, const uint16_t *buf0,
103
                                 const uint16_t *buf1, const uint16_t *uvbuf0,
104
                                 const uint16_t *uvbuf1, const uint16_t *abuf0,
105
                                 const uint16_t *abuf1, uint8_t *dest, int dstW,
106
                                 int yalpha, int uvalpha, int y)
1210 107
{
1211 108
    /* Complementary weights: yalpha + yalpha1 == 4095, same for chroma. */
    int  yalpha1=4095- yalpha;
1212 109
    int uvalpha1=4095-uvalpha;
1213 110
    int i;
1214 111

  
1215
#if COMPILE_TEMPLATE_MMX
1216
    /* The inline-asm paths are skipped when SWS_BITEXACT is requested; in
       that case control reaches the C macro at the end of the function. */
    if(!(c->flags & SWS_BITEXACT)) {
1217
        switch(c->dstFormat) {
1218
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1219
        case PIX_FMT_RGB32:
1220
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1221
#if ARCH_X86_64
1222
                /* x86-64: the alpha line pointers are passed as two extra
                   register operands (%6, %7) and r8 is declared clobbered;
                   r8 is presumably the loop index used by the macros. */
                __asm__ volatile(
1223
                    YSCALEYUV2RGB(%%r8, %5)
1224
                    YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
1225
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1226
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1227
                    /* pack the 16-bit words of mm1/mm7 into unsigned bytes in mm1 */
                    "packuswb            %%mm7, %%mm1       \n\t"
1228
                    WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1229

  
1230
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1231
                    "a" (&c->redDither)
1232
                    ,"r" (abuf0), "r" (abuf1)
1233
                    : "%r8"
1234
                );
1235
#else
1236
                /* Non-x86-64: stash the alpha line pointers in the context.
                   The asm below saves operands %0/%1 on the stack, reloads
                   them from U_TEMP/V_TEMP for the alpha computation and then
                   pops the original values back. */
                *(const uint16_t **)(&c->u_temp)=abuf0;
1237
                *(const uint16_t **)(&c->v_temp)=abuf1;
1238
                __asm__ volatile(
1239
                    /* save REG_b into the context (restored by the last
                       instruction), then use it to hold dest (%4) */
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1240
                    "mov        %4, %%"REG_b"               \n\t"
1241
                    "push %%"REG_BP"                        \n\t"
1242
                    YSCALEYUV2RGB(%%REGBP, %5)
1243
                    "push                   %0              \n\t"
1244
                    "push                   %1              \n\t"
1245
                    "mov          "U_TEMP"(%5), %0          \n\t"
1246
                    "mov          "V_TEMP"(%5), %1          \n\t"
1247
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1248
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1249
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1250
                    "packuswb            %%mm7, %%mm1       \n\t"
1251
                    "pop                    %1              \n\t"
1252
                    "pop                    %0              \n\t"
1253
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1254
                    "pop %%"REG_BP"                         \n\t"
1255
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1256

  
1257
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1258
                    "a" (&c->redDither)
1259
                );
1260
#endif
1261
            } else {
1262
                __asm__ volatile(
1263
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1264
                    "mov        %4, %%"REG_b"               \n\t"
1265
                    "push %%"REG_BP"                        \n\t"
1266
                    YSCALEYUV2RGB(%%REGBP, %5)
1267
                    /* No alpha plane: set every bit of mm7 and pass it to
                       WRITEBGR32 in the slot where the alpha path passes
                       mm1 -- presumably yielding fully opaque alpha. */
                    "pcmpeqd %%mm7, %%mm7                   \n\t"
1268
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1269
                    "pop %%"REG_BP"                         \n\t"
1270
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1271

  
1272
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1273
                    "a" (&c->redDither)
1274
                );
1275
            }
1276
            return;
1277
        case PIX_FMT_BGR24:
1278
            /* Same register save/restore framing as above; mm7 is zeroed
               before the 24-bit writer macro runs. */
            __asm__ volatile(
1279
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1280
                "mov        %4, %%"REG_b"               \n\t"
1281
                "push %%"REG_BP"                        \n\t"
1282
                YSCALEYUV2RGB(%%REGBP, %5)
1283
                "pxor    %%mm7, %%mm7                   \n\t"
1284
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1285
                "pop %%"REG_BP"                         \n\t"
1286
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1287
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1288
                "a" (&c->redDither)
1289
            );
1290
            return;
1291
        case PIX_FMT_RGB555:
1292
            __asm__ volatile(
1293
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1294
                "mov        %4, %%"REG_b"               \n\t"
1295
                "push %%"REG_BP"                        \n\t"
1296
                YSCALEYUV2RGB(%%REGBP, %5)
1297
                "pxor    %%mm7, %%mm7                   \n\t"
1298
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1299
                /* With DITHER1XBPP defined, the per-channel dither values
                   stored in the context are added to B, G and R with
                   unsigned byte saturation before packing. */
#ifdef DITHER1XBPP
1300
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1301
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1302
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1303
#endif
1304

  
1305
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1306
                "pop %%"REG_BP"                         \n\t"
1307
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1308

  
1309
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1310
                "a" (&c->redDither)
1311
            );
1312
            return;
1313
        case PIX_FMT_RGB565:
1314
            /* Identical to the RGB555 case except for the writer macro. */
            __asm__ volatile(
1315
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1316
                "mov        %4, %%"REG_b"               \n\t"
1317
                "push %%"REG_BP"                        \n\t"
1318
                YSCALEYUV2RGB(%%REGBP, %5)
1319
                "pxor    %%mm7, %%mm7                   \n\t"
1320
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1321
#ifdef DITHER1XBPP
1322
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1323
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1324
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1325
#endif
1326

  
1327
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1328
                "pop %%"REG_BP"                         \n\t"
1329
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1330
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1331
                "a" (&c->redDither)
1332
            );
1333
            return;
1334
        case PIX_FMT_YUYV422:
1335
            /* Packed YUV output: uses YSCALEYUV2PACKED instead of the RGB
               conversion macro, followed by the YUY2 writer. */
            __asm__ volatile(
1336
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1337
                "mov %4, %%"REG_b"                        \n\t"
1338
                "push %%"REG_BP"                        \n\t"
1339
                YSCALEYUV2PACKED(%%REGBP, %5)
1340
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1341
                "pop %%"REG_BP"                         \n\t"
1342
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1343
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1344
                "a" (&c->redDither)
1345
            );
1346
            return;
1347
        /* any other destination format is handled by the C macro below */
        default: break;
1348
        }
1349
    }
1350
#endif //COMPILE_TEMPLATE_MMX
1351 112
    /* Generic C path: reached when SWS_BITEXACT is set, when dstFormat has
       no asm case above, or when COMPILE_TEMPLATE_MMX is disabled. */
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1352 113
}
1353 114

  
1354 115
/**
1355 116
 * YV12 to RGB without scaling or interpolating
1356 117
 */
1357
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1358
                          const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
118
static inline void yuv2packed1_c(SwsContext *c, const uint16_t *buf0,
119
                                 const uint16_t *uvbuf0, const uint16_t *uvbuf1,
120
                                 const uint16_t *abuf0, uint8_t *dest, int dstW,
121
                                 int uvalpha, enum PixelFormat dstFormat,
122
                                 int flags, int y)
1359 123
{
1360 124
    const int yalpha1=0;
1361 125
    int i;
......
1368 132
        return;
1369 133
    }
1370 134

  
1371
#if COMPILE_TEMPLATE_MMX
1372
    if(!(flags & SWS_BITEXACT)) {
1373
        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1374
            switch(dstFormat) {
1375
            case PIX_FMT_RGB32:
1376
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1377
                    __asm__ volatile(
1378
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1379
                        "mov        %4, %%"REG_b"               \n\t"
1380
                        "push %%"REG_BP"                        \n\t"
1381
                        YSCALEYUV2RGB1(%%REGBP, %5)
1382
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
1383
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1384
                        "pop %%"REG_BP"                         \n\t"
1385
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1386

  
1387
                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1388
                        "a" (&c->redDither)
1389
                    );
1390
                } else {
1391
                    __asm__ volatile(
1392
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1393
                        "mov        %4, %%"REG_b"               \n\t"
1394
                        "push %%"REG_BP"                        \n\t"
1395
                        YSCALEYUV2RGB1(%%REGBP, %5)
1396
                        "pcmpeqd %%mm7, %%mm7                   \n\t"
1397
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1398
                        "pop %%"REG_BP"                         \n\t"
1399
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1400

  
1401
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1402
                        "a" (&c->redDither)
1403
                    );
1404
                }
1405
                return;
1406
            case PIX_FMT_BGR24:
1407
                __asm__ volatile(
1408
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1409
                    "mov        %4, %%"REG_b"               \n\t"
1410
                    "push %%"REG_BP"                        \n\t"
1411
                    YSCALEYUV2RGB1(%%REGBP, %5)
1412
                    "pxor    %%mm7, %%mm7                   \n\t"
1413
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1414
                    "pop %%"REG_BP"                         \n\t"
1415
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1416

  
1417
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1418
                    "a" (&c->redDither)
1419
                );
1420
                return;
1421
            case PIX_FMT_RGB555:
1422
                __asm__ volatile(
1423
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1424
                    "mov        %4, %%"REG_b"               \n\t"
1425
                    "push %%"REG_BP"                        \n\t"
1426
                    YSCALEYUV2RGB1(%%REGBP, %5)
1427
                    "pxor    %%mm7, %%mm7                   \n\t"
1428
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1429
#ifdef DITHER1XBPP
1430
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1431
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1432
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1433
#endif
1434
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1435
                    "pop %%"REG_BP"                         \n\t"
1436
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1437

  
1438
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1439
                    "a" (&c->redDither)
1440
                );
1441
                return;
1442
            case PIX_FMT_RGB565:
1443
                __asm__ volatile(
1444
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1445
                    "mov        %4, %%"REG_b"               \n\t"
1446
                    "push %%"REG_BP"                        \n\t"
1447
                    YSCALEYUV2RGB1(%%REGBP, %5)
1448
                    "pxor    %%mm7, %%mm7                   \n\t"
1449
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1450
#ifdef DITHER1XBPP
1451
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1452
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1453
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1454
#endif
1455

  
1456
                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1457
                    "pop %%"REG_BP"                         \n\t"
1458
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1459

  
1460
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1461
                    "a" (&c->redDither)
1462
                );
1463
                return;
1464
            case PIX_FMT_YUYV422:
1465
                __asm__ volatile(
1466
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1467
                    "mov        %4, %%"REG_b"               \n\t"
1468
                    "push %%"REG_BP"                        \n\t"
1469
                    YSCALEYUV2PACKED1(%%REGBP, %5)
1470
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1471
                    "pop %%"REG_BP"                         \n\t"
1472
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1473

  
1474
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1475
                    "a" (&c->redDither)
1476
                );
1477
                return;
1478
            }
1479
        } else {
1480
            switch(dstFormat) {
1481
            case PIX_FMT_RGB32:
1482
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1483
                    __asm__ volatile(
1484
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1485
                        "mov        %4, %%"REG_b"               \n\t"
1486
                        "push %%"REG_BP"                        \n\t"
1487
                        YSCALEYUV2RGB1b(%%REGBP, %5)
1488
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
1489
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1490
                        "pop %%"REG_BP"                         \n\t"
1491
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1492

  
1493
                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1494
                        "a" (&c->redDither)
1495
                    );
1496
                } else {
1497
                    __asm__ volatile(
1498
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1499
                        "mov        %4, %%"REG_b"               \n\t"
1500
                        "push %%"REG_BP"                        \n\t"
1501
                        YSCALEYUV2RGB1b(%%REGBP, %5)
1502
                        "pcmpeqd %%mm7, %%mm7                   \n\t"
1503
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1504
                        "pop %%"REG_BP"                         \n\t"
1505
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1506

  
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff