Statistics
| Branch: | Revision:

ffmpeg / libswscale / swscale_template.c @ 8d884020

History | View | Annotate | Download (138 KB)

1 fe8054c0 Michael Niedermayer
/*
2 d026b45e Diego Biurrun
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6 819ee683 Diego Biurrun
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10 d026b45e Diego Biurrun
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 819ee683 Diego Biurrun
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15 d026b45e Diego Biurrun
 *
16 819ee683 Diego Biurrun
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18 b19bcbaa Diego Biurrun
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 d026b45e Diego Biurrun
 */
20 783e9cc9 Michael Niedermayer
21 6e1c66bc Aurelien Jacobs
/*
 * Drop any earlier definitions of the instruction-selection macros so that
 * they can be redefined below according to the COMPILE_TEMPLATE_* settings.
 * NOTE(review): presumably this template is included more than once with
 * different COMPILE_TEMPLATE_* values -- confirm against the including file.
 */
#undef REAL_MOVNTQ
22 541c4eb9 Michael Niedermayer
#undef MOVNTQ
23 7d7f78b5 Michael Niedermayer
#undef PAVGB
24 48a05cec Michael Niedermayer
#undef PREFETCH
25
26 94daf2e9 Ramiro Polla
/*
 * PREFETCH: mnemonic string used for prefetching.
 * "prefetch" when building the 3DNow! variant, "prefetchnta" for MMX2, and
 * otherwise the string " # nop" (NOTE(review): presumably so that the
 * operands following it end up inside an assembler comment -- confirm).
 */
#if COMPILE_TEMPLATE_AMD3DNOW
27 48a05cec Michael Niedermayer
#define PREFETCH  "prefetch"
28 94daf2e9 Ramiro Polla
#elif COMPILE_TEMPLATE_MMX2
29 48a05cec Michael Niedermayer
#define PREFETCH "prefetchnta"
30
#else
31 d904b5fc Nigel Pearson
#define PREFETCH  " # nop"
32 48a05cec Michael Niedermayer
#endif
33
34 94daf2e9 Ramiro Polla
/*
 * PAVGB(a,b): asm text for a packed byte average of a into b.
 * Expands to "pavgb" for MMX2 and "pavgusb" for 3DNow!; both arguments are
 * stringified as written. There is no #else branch, so PAVGB stays undefined
 * when neither variant is being compiled.
 */
#if COMPILE_TEMPLATE_MMX2
35 d604bab9 Michael Niedermayer
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
36 94daf2e9 Ramiro Polla
#elif COMPILE_TEMPLATE_AMD3DNOW
37 d604bab9 Michael Niedermayer
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
38
#endif
39 d3f41512 Michael Niedermayer
40 94daf2e9 Ramiro Polla
/*
 * MOVNTQ(a,b): asm text for an 8-byte store of a to b.
 * The MMX2 variant emits "movntq"; every other variant emits a plain "movq".
 * MOVNTQ forwards to REAL_MOVNTQ so that its arguments are macro-expanded
 * before REAL_MOVNTQ stringifies them with '#'.
 */
#if COMPILE_TEMPLATE_MMX2
41 6e1c66bc Aurelien Jacobs
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
42 d604bab9 Michael Niedermayer
#else
43 6e1c66bc Aurelien Jacobs
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
44 d604bab9 Michael Niedermayer
#endif
45 6e1c66bc Aurelien Jacobs
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
46 d604bab9 Michael Niedermayer
47 94daf2e9 Ramiro Polla
/* Pull in the AltiVec implementation only when the AltiVec variant of this template is compiled. */
#if COMPILE_TEMPLATE_ALTIVEC
48 009d2d74 Diego Biurrun
#include "ppc/swscale_altivec_template.c"
49 a2faa401 Romain Dolbeau
#endif
50
51 bca11e75 Michael Niedermayer
/*
 * YSCALEYUV2YV12X(x, offset, dest, width)
 *
 * Expands to one complete __asm__ statement (including the trailing ';')
 * that applies a list of filter taps and stores 8 output bytes per pass.
 *
 *   x      - string literal, displacement added to every source address
 *   offset - string literal, displacement from %0 to the first filter entry
 *   dest   - output pointer (asm operand %1)
 *   width  - limit for the output index (asm operand %2, unsigned compare)
 *
 * Operand %0 is &c->redDither, so a variable named c must be in scope at the
 * point of use. Each filter entry walked through REG_d is 16 bytes: a source
 * pointer at +0 and the coefficient words at +8; a zero pointer ends the
 * list. Both accumulators start from the value at VROUNDER_OFFSET(%0); after
 * the last tap they are shifted right by 3 and packed to bytes with unsigned
 * saturation.
 *
 * The loop body runs before the first comparison against width and always
 * stores 8 bytes, so dest is written in whole groups of 8.
 *
 * NOTE(review): the asm stores through %1 but the clobber list has no
 * "memory" entry -- confirm that this is intentional.
 */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
52 7ad6469e Diego Pettenò
    __asm__ volatile(\
53 c255994b Ramiro Polla
        "xor                          %%"REG_a", %%"REG_a"  \n\t" /* output index = 0 */\
54
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t" /* accumulators start at the rounder */\
55
        "movq                             %%mm3, %%mm4      \n\t"\
56
        "lea                     " offset "(%0), %%"REG_d"  \n\t" /* first filter entry */\
57
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t" /* its source pointer */\
58
        ASMALIGN(4) /* FIXME Unroll? */\
59
        "1:                                                 \n\t"\
60
        "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
61
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
62
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
63
        "add                                $16, %%"REG_d"  \n\t" /* advance to the next entry */\
64
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
65
        "test                         %%"REG_S", %%"REG_S"  \n\t" /* zero pointer ends the list; MMX ops below keep the flags */\
66
        "pmulhw                           %%mm0, %%mm2      \n\t"\
67
        "pmulhw                           %%mm0, %%mm5      \n\t"\
68
        "paddw                            %%mm2, %%mm3      \n\t"\
69
        "paddw                            %%mm5, %%mm4      \n\t"\
70
        " jnz                                1b             \n\t"\
71
        "psraw                               $3, %%mm3      \n\t"\
72
        "psraw                               $3, %%mm4      \n\t"\
73
        "packuswb                         %%mm4, %%mm3      \n\t"\
74
        MOVNTQ(%%mm3, (%1, %%REGa))\
75
        "add                                 $8, %%"REG_a"  \n\t"\
76
        "cmp                                 %2, %%"REG_a"  \n\t" /* flags consumed by the jb below */\
77
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t" /* re-arm state for the next 8 outputs */\
78
        "movq                             %%mm3, %%mm4      \n\t"\
79
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
80
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
81
        "jb                                  1b             \n\t"\
82
        :: "r" (&c->redDither),\
83 c85007d5 Anton Mitrofanov
        "r" (dest), "g" ((x86_reg)width)\
84 c255994b Ramiro Polla
        : "%"REG_a, "%"REG_d, "%"REG_S\
85 2da0d70d Diego Biurrun
    );
86 bca11e75 Michael Niedermayer
87
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 *
 * Same interface as YSCALEYUV2YV12X (operand %0 = &c->redDither, %1 = dest,
 * %2 = width; expands to a complete __asm__ statement ending in ';'), but
 * the taps are accumulated as 32-bit sums.
 *
 * Each filter entry read through REG_d is APCK_SIZE bytes and supplies two
 * source pointers (at +0 and at APCK_PTR2) plus coefficients at APCK_COEF.
 * Samples from the two sources are interleaved with punpck[lh]wd and fed to
 * pmaddwd, so one entry applies two taps. The dword sums in mm4..mm7 are
 * shifted right by 16, packed to words, the value at VROUNDER_OFFSET(%0) is
 * added, then the result is shifted right by 3 and packed to 8 bytes.
 * A zero first pointer in the next entry ends the list.
 *
 * The loop body runs before the first comparison against width and always
 * stores 8 bytes.
 *
 * NOTE(review): the asm stores through %1 but the clobber list has no
 * "memory" entry -- confirm that this is intentional.
 */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
88 7ad6469e Diego Pettenò
    __asm__ volatile(\
89 c255994b Ramiro Polla
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
90
        "xor                          %%"REG_a", %%"REG_a"  \n\t" /* output index = 0 */\
91
        "pxor                             %%mm4, %%mm4      \n\t" /* clear the four dword accumulators */\
92
        "pxor                             %%mm5, %%mm5      \n\t"\
93
        "pxor                             %%mm6, %%mm6      \n\t"\
94
        "pxor                             %%mm7, %%mm7      \n\t"\
95
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
96
        ASMALIGN(4) \
97
        "1:                                                 \n\t"\
98
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
99
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
100
        "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t" /* second source pointer of this entry */\
101
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
102
        "movq                             %%mm0, %%mm3      \n\t"\
103
        "punpcklwd                        %%mm1, %%mm0      \n\t"\
104
        "punpckhwd                        %%mm1, %%mm3      \n\t"\
105
        "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
106
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
107
        "pmaddwd                          %%mm1, %%mm3      \n\t"\
108
        "paddd                            %%mm0, %%mm4      \n\t"\
109
        "paddd                            %%mm3, %%mm5      \n\t"\
110
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
111
        "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t" /* first pointer of the next entry */\
112
        "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
113
        "test                         %%"REG_S", %%"REG_S"  \n\t" /* zero pointer ends the list; MMX ops below keep the flags */\
114
        "movq                             %%mm2, %%mm0      \n\t"\
115
        "punpcklwd                        %%mm3, %%mm2      \n\t"\
116
        "punpckhwd                        %%mm3, %%mm0      \n\t"\
117
        "pmaddwd                          %%mm1, %%mm2      \n\t"\
118
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
119
        "paddd                            %%mm2, %%mm6      \n\t"\
120
        "paddd                            %%mm0, %%mm7      \n\t"\
121
        " jnz                                1b             \n\t"\
122
        "psrad                              $16, %%mm4      \n\t"\
123
        "psrad                              $16, %%mm5      \n\t"\
124
        "psrad                              $16, %%mm6      \n\t"\
125
        "psrad                              $16, %%mm7      \n\t"\
126
        "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
127
        "packssdw                         %%mm5, %%mm4      \n\t"\
128
        "packssdw                         %%mm7, %%mm6      \n\t"\
129
        "paddw                            %%mm0, %%mm4      \n\t"\
130
        "paddw                            %%mm0, %%mm6      \n\t"\
131
        "psraw                               $3, %%mm4      \n\t"\
132
        "psraw                               $3, %%mm6      \n\t"\
133
        "packuswb                         %%mm6, %%mm4      \n\t"\
134
        MOVNTQ(%%mm4, (%1, %%REGa))\
135
        "add                                 $8, %%"REG_a"  \n\t"\
136
        "cmp                                 %2, %%"REG_a"  \n\t" /* flags consumed by the jb below */\
137
        "lea                     " offset "(%0), %%"REG_d"  \n\t" /* re-arm state for the next 8 outputs */\
138
        "pxor                             %%mm4, %%mm4      \n\t"\
139
        "pxor                             %%mm5, %%mm5      \n\t"\
140
        "pxor                             %%mm6, %%mm6      \n\t"\
141
        "pxor                             %%mm7, %%mm7      \n\t"\
142
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
143
        "jb                                  1b             \n\t"\
144
        :: "r" (&c->redDither),\
145 c85007d5 Anton Mitrofanov
        "r" (dest), "g" ((x86_reg)width)\
146 c255994b Ramiro Polla
        : "%"REG_a, "%"REG_d, "%"REG_S\
147 2da0d70d Diego Biurrun
    );
148 c1b0bfb4 Michael Niedermayer
149
/*
 * YSCALEYUV2YV121: asm text only (no __asm__ wrapper and no operand list;
 * the user of the macro supplies both).
 *
 * Loads 16 bytes of 16-bit samples from (%0 + 2*index), shifts each word
 * right by 7, packs to 8 bytes with unsigned saturation and stores them at
 * (%1 + index). The index register starts from operand %2 and is advanced
 * by 8; the loop repeats until that addition produces a carry.
 * NOTE(review): this implies %2 is a negative count and %0/%1 are biased
 * accordingly by the caller -- confirm at the call site.
 */
#define YSCALEYUV2YV121 \
150 2da0d70d Diego Biurrun
    "mov %2, %%"REG_a"                    \n\t"\
151
    ASMALIGN(4) /* FIXME Unroll? */\
152
    "1:                                   \n\t"\
153
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
154
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
155
    "psraw                 $7, %%mm0      \n\t"\
156
    "psraw                 $7, %%mm1      \n\t"\
157
    "packuswb           %%mm1, %%mm0      \n\t"\
158
    MOVNTQ(%%mm0, (%1, %%REGa))\
159
    "add                   $8, %%"REG_a"  \n\t"\
160
    "jnc                   1b             \n\t"
161 c1b0bfb4 Michael Niedermayer
162 bf2bdde6 Michael Niedermayer
/*
 * YSCALEYUV2YV121_ACCURATE: like YSCALEYUV2YV121 (asm text only, same
 * operands %0/%1/%2 and same carry-terminated loop), but adds 64 to every
 * word with signed saturation before the shift right by 7, i.e. it rounds
 * instead of truncating. mm7 holds the constant 64 in each word.
 */
#define YSCALEYUV2YV121_ACCURATE \
163
    "mov %2, %%"REG_a"                    \n\t"\
164
    "pcmpeqw %%mm7, %%mm7                 \n\t" /* all bits set */\
165
    "psrlw                 $15, %%mm7     \n\t" /* each word = 1 */\
166
    "psllw                  $6, %%mm7     \n\t" /* each word = 64 */\
167
    ASMALIGN(4) /* FIXME Unroll? */\
168
    "1:                                   \n\t"\
169
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
170
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
171 33a67bd6 Michael Niedermayer
    "paddsw             %%mm7, %%mm0      \n\t"\
172
    "paddsw             %%mm7, %%mm1      \n\t"\
173 bf2bdde6 Michael Niedermayer
    "psraw                 $7, %%mm0      \n\t"\
174
    "psraw                 $7, %%mm1      \n\t"\
175
    "packuswb           %%mm1, %%mm0      \n\t"\
176
    MOVNTQ(%%mm0, (%1, %%REGa))\
177
    "add                   $8, %%"REG_a"  \n\t"\
178
    "jnc                   1b             \n\t"
179
180 c1b0bfb4 Michael Niedermayer
/*
181 2da0d70d Diego Biurrun
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
182
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
183 c85007d5 Anton Mitrofanov
       "r" (dest), "m" (dstW_reg),
184 2da0d70d Diego Biurrun
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
185
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
186 c1b0bfb4 Michael Niedermayer
*/
187 df57ab14 Cédric Schieli
/*
 * YSCALEYUV2PACKEDX_UV: opens an __asm__ volatile( statement and emits the
 * chroma part of the vertical filter. The statement is left open; it is
 * closed by YSCALEYUV2PACKEDX_END.
 *
 * Walks the filter list at CHR_MMX_FILTER_OFFSET(%0) (16-byte entries:
 * source pointer at +0, coefficient words at +8, zero pointer terminates).
 * For each entry it multiplies 4 U words at (src + index) and 4 V words at
 * (src + index + VOF) by the coefficient (pmulhw) and accumulates them.
 * On exit mm3 holds the U sums and mm4 the V sums, both started from the
 * value at VROUNDER_OFFSET(%0). REG_a is zeroed before label 1 and is the
 * (unscaled) index into the chroma data.
 *
 * Label "1:" is defined here but not jumped to in this macro; presumably
 * the code that follows loops back to it per group of outputs -- confirm.
 */
#define YSCALEYUV2PACKEDX_UV \
188 7ad6469e Diego Pettenò
    __asm__ volatile(\
189 c255994b Ramiro Polla
        "xor                   %%"REG_a", %%"REG_a"     \n\t"\
190
        ASMALIGN(4)\
191
        "nop                                            \n\t"\
192
        "1:                                             \n\t"\
193
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
194
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
195
        "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
196
        "movq                      %%mm3, %%mm4         \n\t"\
197
        ASMALIGN(4)\
198
        "2:                                             \n\t"\
199
        "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
200
        "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
201
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
202
        "add                         $16, %%"REG_d"     \n\t"\
203
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
204
        "pmulhw                    %%mm0, %%mm2         \n\t"\
205
        "pmulhw                    %%mm0, %%mm5         \n\t"\
206
        "paddw                     %%mm2, %%mm3         \n\t"\
207
        "paddw                     %%mm5, %%mm4         \n\t"\
208
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
209
        " jnz                         2b                \n\t"\
210 df57ab14 Cédric Schieli
211 fe91924d Cédric Schieli
/*
 * YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2): asm text for one
 * vertical filter pass over 8 consecutive 16-bit samples.
 *
 *   offset     - string literal, displacement from %0 to the filter list
 *   coeff      - MMX register used for the coefficient of the current entry
 *   src1, src2 - MMX scratch registers for the two groups of 4 source words
 *   dst1, dst2 - MMX accumulators; both start from VROUNDER_OFFSET(%0)
 *
 * The filter list has 16-byte entries (source pointer at +0, coefficient
 * words at +8) and ends at a zero pointer. Source data is read at
 * (src + 2*REG_a) and 8 bytes beyond it. REG_a must already hold the index;
 * REG_d and REG_S are overwritten. The register arguments are stringified,
 * so they are passed in the form %%mmN.
 */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
212 df57ab14 Cédric Schieli
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
213 2da0d70d Diego Biurrun
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
214 fe91924d Cédric Schieli
    "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
215
    "movq                    "#dst1", "#dst2"       \n\t"\
216 2da0d70d Diego Biurrun
    ASMALIGN(4)\
217
    "2:                                             \n\t"\
218 fe91924d Cédric Schieli
    "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
219
    "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
220
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
221 2da0d70d Diego Biurrun
    "add                         $16, %%"REG_d"            \n\t"\
222
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
223 fe91924d Cédric Schieli
    "pmulhw                 "#coeff", "#src1"       \n\t"\
224
    "pmulhw                 "#coeff", "#src2"       \n\t"\
225
    "paddw                   "#src1", "#dst1"       \n\t"\
226
    "paddw                   "#src2", "#dst2"       \n\t"\
227 2da0d70d Diego Biurrun
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
228
    " jnz                         2b                \n\t"\
229
230 df57ab14 Cédric Schieli
/*
 * YSCALEYUV2PACKEDX: chroma pass (YSCALEYUV2PACKEDX_UV, results in mm3/mm4)
 * followed by the luma pass over the list at LUM_MMX_FILTER_OFFSET, which
 * uses mm0 for the coefficient, mm2/mm5 as scratch and accumulates into
 * mm1 and mm7. The __asm__ statement opened by the UV part is still open
 * after this macro.
 */
#define YSCALEYUV2PACKEDX \
231
    YSCALEYUV2PACKEDX_UV \
232 fe91924d Cédric Schieli
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
233 df57ab14 Cédric Schieli
234 c255994b Ramiro Polla
/*
 * YSCALEYUV2PACKEDX_END: operand and clobber lists plus the closing ");"
 * for an __asm__ statement that was opened by one of the
 * YSCALEYUV2PACKEDX* macros.
 *
 * Inputs: %0 = &c->redDither (register), %1..%3 = dummy (memory),
 *         %4 = dest (register), %5 = dstW_reg (memory).
 * Clobbers: REG_a, REG_d, REG_S.
 * The names c, dummy, dest and dstW_reg must be in scope where it is used.
 */
#define YSCALEYUV2PACKEDX_END                     \
235
        :: "r" (&c->redDither),                   \
236
            "m" (dummy), "m" (dummy), "m" (dummy),\
237 c85007d5 Anton Mitrofanov
            "r" (dest), "m" (dstW_reg)            \
238 c255994b Ramiro Polla
        : "%"REG_a, "%"REG_d, "%"REG_S            \
239 2da0d70d Diego Biurrun
    );
240 8422aa88 Michael Niedermayer
241 df57ab14 Cédric Schieli
/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV: opens an __asm__ volatile( statement and
 * emits the chroma part of the vertical filter using 32-bit accumulation.
 * The statement is left open for the macros that follow.
 *
 * Filter entries at CHR_MMX_FILTER_OFFSET(%0) are APCK_SIZE bytes each and
 * hold two source pointers (+0 and APCK_PTR2) and coefficients at
 * APCK_COEF; a zero first pointer in the next entry ends the list. U words
 * are read at (src + index), V words at (src + index + VOF). Samples from
 * the two sources are interleaved and combined with pmaddwd; the dword sums
 * (mm4/mm5 for U, mm6/mm7 for V) are shifted right by 16, packed to words
 * and the value at VROUNDER_OFFSET(%0) is added.
 *
 * The results are not left in registers: U is stored to U_TEMP(%0) and V to
 * V_TEMP(%0).
 */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
242 7ad6469e Diego Pettenò
    __asm__ volatile(\
243 c255994b Ramiro Polla
        "xor %%"REG_a", %%"REG_a"                       \n\t"\
244
        ASMALIGN(4)\
245
        "nop                                            \n\t"\
246
        "1:                                             \n\t"\
247
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
248
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
249
        "pxor                      %%mm4, %%mm4         \n\t"\
250
        "pxor                      %%mm5, %%mm5         \n\t"\
251
        "pxor                      %%mm6, %%mm6         \n\t"\
252
        "pxor                      %%mm7, %%mm7         \n\t"\
253
        ASMALIGN(4)\
254
        "2:                                             \n\t"\
255
        "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
256
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
257
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t" /* second source pointer of this entry */\
258
        "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
259
        "movq                      %%mm0, %%mm3         \n\t"\
260
        "punpcklwd                 %%mm1, %%mm0         \n\t"\
261
        "punpckhwd                 %%mm1, %%mm3         \n\t"\
262
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
263
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
264
        "pmaddwd                   %%mm1, %%mm3         \n\t"\
265
        "paddd                     %%mm0, %%mm4         \n\t"\
266
        "paddd                     %%mm3, %%mm5         \n\t"\
267
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
268
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t" /* first pointer of the next entry */\
269
        "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
270
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
271
        "movq                      %%mm2, %%mm0         \n\t"\
272
        "punpcklwd                 %%mm3, %%mm2         \n\t"\
273
        "punpckhwd                 %%mm3, %%mm0         \n\t"\
274
        "pmaddwd                   %%mm1, %%mm2         \n\t"\
275
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
276
        "paddd                     %%mm2, %%mm6         \n\t"\
277
        "paddd                     %%mm0, %%mm7         \n\t"\
278
        " jnz                         2b                \n\t"\
279
        "psrad                       $16, %%mm4         \n\t"\
280
        "psrad                       $16, %%mm5         \n\t"\
281
        "psrad                       $16, %%mm6         \n\t"\
282
        "psrad                       $16, %%mm7         \n\t"\
283
        "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
284
        "packssdw                  %%mm5, %%mm4         \n\t"\
285
        "packssdw                  %%mm7, %%mm6         \n\t"\
286
        "paddw                     %%mm0, %%mm4         \n\t"\
287
        "paddw                     %%mm0, %%mm6         \n\t"\
288
        "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
289
        "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
290 df57ab14 Cédric Schieli
291
/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset): asm text for the 32-bit-accumulating
 * vertical filter over 8 consecutive 16-bit samples.
 *
 *   offset - string literal, displacement from %0 to the filter list
 *
 * Filter entries are APCK_SIZE bytes with source pointers at +0 and
 * APCK_PTR2 and coefficients at APCK_COEF; a zero first pointer in the next
 * entry ends the list. Source data is read at (src + 2*REG_a) and 8 bytes
 * beyond. mm4 serves as the coefficient register and mm1/mm5/mm7/mm6 as
 * dword accumulators.
 *
 * On exit: mm1 and mm7 hold the two groups of 4 filtered words (sum >> 16,
 * packed, plus the value at VROUNDER_OFFSET(%0)), and mm3/mm4 have been
 * loaded from U_TEMP(%0)/V_TEMP(%0).
 */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
292
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
293 2da0d70d Diego Biurrun
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
294
    "pxor                      %%mm1, %%mm1         \n\t"\
295
    "pxor                      %%mm5, %%mm5         \n\t"\
296
    "pxor                      %%mm7, %%mm7         \n\t"\
297
    "pxor                      %%mm6, %%mm6         \n\t"\
298
    ASMALIGN(4)\
299
    "2:                                             \n\t"\
300
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
301
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
302 1625216e Michael Niedermayer
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t" /* second source pointer of this entry */\
303 2da0d70d Diego Biurrun
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
304
    "movq                      %%mm0, %%mm3         \n\t"\
305
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
306
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
307 1625216e Michael Niedermayer
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
308 2da0d70d Diego Biurrun
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
309
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
310
    "paddd                     %%mm0, %%mm1         \n\t"\
311
    "paddd                     %%mm3, %%mm5         \n\t"\
312
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
313 1625216e Michael Niedermayer
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t" /* first pointer of the next entry */\
314
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
315 2da0d70d Diego Biurrun
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
316
    "movq                      %%mm2, %%mm0         \n\t"\
317
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
318
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
319
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
320
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
321
    "paddd                     %%mm2, %%mm7         \n\t"\
322
    "paddd                     %%mm0, %%mm6         \n\t"\
323
    " jnz                         2b                \n\t"\
324
    "psrad                       $16, %%mm1         \n\t"\
325
    "psrad                       $16, %%mm5         \n\t"\
326
    "psrad                       $16, %%mm7         \n\t"\
327
    "psrad                       $16, %%mm6         \n\t"\
328
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
329
    "packssdw                  %%mm5, %%mm1         \n\t"\
330
    "packssdw                  %%mm6, %%mm7         \n\t"\
331
    "paddw                     %%mm0, %%mm1         \n\t"\
332
    "paddw                     %%mm0, %%mm7         \n\t"\
333
    "movq               "U_TEMP"(%0), %%mm3         \n\t" /* reload U saved by the chroma pass */\
334
    "movq               "V_TEMP"(%0), %%mm4         \n\t" /* reload V saved by the chroma pass */\
335 bca11e75 Michael Niedermayer
336 df57ab14 Cédric Schieli
/*
 * YSCALEYUV2PACKEDX_ACCURATE: 32-bit-accumulating chroma pass followed by
 * the matching luma pass over the list at LUM_MMX_FILTER_OFFSET. Afterwards
 * mm1/mm7 hold luma and mm3/mm4 hold U/V (reloaded from U_TEMP/V_TEMP).
 * The __asm__ statement opened by the UV part is still open.
 */
#define YSCALEYUV2PACKEDX_ACCURATE \
337
    YSCALEYUV2PACKEDX_ACCURATE_UV \
338
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
339
340 8422aa88 Michael Niedermayer
/*
 * YSCALEYUV2RGBX: asm text converting 8 pixels from YUV words to packed
 * B, G and R bytes. All constants are read relative to operand %0
 * (U_OFFSET, V_OFFSET, Y_OFFSET and the UG/VG/UB/VR/Y coefficients).
 *
 * In:  mm1, mm7 = two groups of 4 luma words; mm3 = U words; mm4 = V words.
 * Out: mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes
 *      (packed with unsigned saturation).
 * Also overwrites mm0, mm3 and mm6.
 *
 * punpck[lh]wd of a register with itself duplicates each chroma word, so
 * one chroma sample is applied to two neighbouring luma samples.
 */
#define YSCALEYUV2RGBX \
341 2da0d70d Diego Biurrun
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
342
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
343
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
344
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
345
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
346
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
347 c255994b Ramiro Polla
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
348 2da0d70d Diego Biurrun
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
349
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
350
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
351
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
352
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
353
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
354 c255994b Ramiro Polla
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
355 2da0d70d Diego Biurrun
    "paddw           %%mm3, %%mm4       \n\t" /* mm4 = ug + vg */\
356
    "movq            %%mm2, %%mm0       \n\t"\
357
    "movq            %%mm5, %%mm6       \n\t"\
358
    "movq            %%mm4, %%mm3       \n\t"\
359
    "punpcklwd       %%mm2, %%mm2       \n\t"\
360
    "punpcklwd       %%mm5, %%mm5       \n\t"\
361
    "punpcklwd       %%mm4, %%mm4       \n\t"\
362
    "paddw           %%mm1, %%mm2       \n\t"\
363
    "paddw           %%mm1, %%mm5       \n\t"\
364
    "paddw           %%mm1, %%mm4       \n\t"\
365
    "punpckhwd       %%mm0, %%mm0       \n\t"\
366
    "punpckhwd       %%mm6, %%mm6       \n\t"\
367
    "punpckhwd       %%mm3, %%mm3       \n\t"\
368
    "paddw           %%mm7, %%mm0       \n\t"\
369
    "paddw           %%mm7, %%mm6       \n\t"\
370
    "paddw           %%mm7, %%mm3       \n\t"\
371
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
372
    "packuswb        %%mm0, %%mm2       \n\t"\
373
    "packuswb        %%mm6, %%mm5       \n\t"\
374
    "packuswb        %%mm3, %%mm4       \n\t"\
375 d604bab9 Michael Niedermayer
376 6e1c66bc Aurelien Jacobs
/*
 * REAL_YSCALEYUV2PACKED(index, c): asm text that blends two input lines
 * into 8 luma and 4+4 chroma words.
 *
 *   index - register used as the loop index (stringified; zeroed here)
 *   c     - base register for the context constants (stringified)
 *
 * Operands: %0 = buf0, %1 = buf1 (luma), %2 = uvbuf0, %3 = uvbuf1 (chroma).
 *
 * Before the loop, the coefficient words stored at
 * CHR_MMX_FILTER_OFFSET+8(c) and LUM_MMX_FILTER_OFFSET+8(c) are shifted
 * right by 3 and written back to memory.
 * NOTE(review): this modifies the context in place each time the fragment
 * runs -- confirm that callers store fresh coefficients before every use.
 *
 * Per iteration, for both chroma and luma:
 *     result = ((line0 - line1) * coeff >> 16) + (line1 >> 7)
 * Out: mm3 = U, mm4 = V, mm1 and mm7 = two groups of 4 luma words.
 * Label "1:" is opened here; the loop tail is supplied by the user.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
377 2da0d70d Diego Biurrun
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
378
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
379
    "psraw                $3, %%mm0                           \n\t"\
380
    "psraw                $3, %%mm1                           \n\t"\
381
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
382
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
383
    "xor            "#index", "#index"                        \n\t"\
384
    ASMALIGN(4)\
385
    "1:                                 \n\t"\
386
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
387
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
388 8b2fce0d Michael Niedermayer
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+VOF]*/\
389
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+VOF]*/\
390 2da0d70d Diego Biurrun
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
391
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+VOF] - uvbuf1[eax+VOF]*/\
392
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
393
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
394
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+VOF] - uvbuf1[eax+VOF])uvalpha1>>16*/\
395
    "psraw                $7, %%mm3     \n\t" /* uvbuf1[eax] >>7*/\
396
    "psraw                $7, %%mm4     \n\t" /* uvbuf1[eax+VOF] >>7*/\
397
    "paddw             %%mm2, %%mm3     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16 + (uvbuf1[eax]>>7)*/\
398
    "paddw             %%mm5, %%mm4     \n\t" /* same blend for the words at +VOF*/\
399
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
400
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
401
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax], next 8 bytes*/\
402
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax], next 8 bytes*/\
403
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
404
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
405
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
406
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
407
    "psraw                $7, %%mm1     \n\t" /* buf1[eax] >>7*/\
408
    "psraw                $7, %%mm7     \n\t" /* buf1[eax] >>7*/\
409
    "paddw             %%mm0, %%mm1     \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16 + (buf1[eax]>>7)*/\
410
    "paddw             %%mm6, %%mm7     \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16 + (buf1[eax]>>7)*/\
411 6a4970ab Diego Biurrun
412 6e1c66bc Aurelien Jacobs
/* Forwarding wrapper: lets index and c be macro-expanded before REAL_YSCALEYUV2PACKED stringifies them. */
#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
413 6a4970ab Diego Biurrun
414 df57ab14 Cédric Schieli
/*
 * REAL_YSCALEYUV2RGB_UV(index, c): asm text for the chroma half of the
 * two-line blend used ahead of RGB conversion.
 *
 *   index - register used as the loop index (stringified; zeroed here)
 *   c     - base register for the context constants (stringified)
 *
 * Operands: %2 = uvbuf0, %3 = uvbuf1. Opens loop label "1:".
 * Blend per word: ((uvbuf0 - uvbuf1) * coeff >> 16) + (uvbuf1 >> 4), with
 * coeff read from CHR_MMX_FILTER_OFFSET+8(c). U_OFFSET/V_OFFSET are then
 * subtracted and the green contributions are computed.
 *
 * Out: mm2 = U - offset, mm5 = V - offset, mm3 = ug, mm4 = vg.
 */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
415 2da0d70d Diego Biurrun
    "xor            "#index", "#index"  \n\t"\
416
    ASMALIGN(4)\
417
    "1:                                 \n\t"\
418
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
419
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
420 8b2fce0d Michael Niedermayer
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+VOF]*/\
421
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+VOF]*/\
422 2da0d70d Diego Biurrun
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
423
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+VOF] - uvbuf1[eax+VOF]*/\
424
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
425
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
426
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+VOF] - uvbuf1[eax+VOF])uvalpha1>>16*/\
427
    "psraw                $4, %%mm3     \n\t" /* uvbuf1[eax] >>4*/\
428
    "psraw                $4, %%mm4     \n\t" /* uvbuf1[eax+VOF] >>4*/\
429
    "paddw             %%mm2, %%mm3     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16 + (uvbuf1[eax]>>4)*/\
430
    "paddw             %%mm5, %%mm4     \n\t" /* same blend for the words at +VOF*/\
431
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
432
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
433
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
434
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
435
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
436
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
437
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
438 df57ab14 Cédric Schieli
439 786dcfef Cédric Schieli
/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2): asm text blending 8 words from
 * two lines.
 *
 *   index  - index register (stringified; used scaled by 2)
 *   c      - base register for the context constants (stringified)
 *   b1, b2 - base addresses of the two input lines (stringified)
 *
 * Blend per word: ((b1 - b2) * coeff >> 16) + (b2 >> 4), with coeff read
 * from LUM_MMX_FILTER_OFFSET+8(c).
 * Out: mm1 = first 4 words, mm7 = next 4 words. Overwrites mm0 and mm6.
 */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
440
    "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
441
    "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
442
    "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax], next 8 bytes*/\
443
    "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax], next 8 bytes*/\
444 2da0d70d Diego Biurrun
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
445
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
446
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
447
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
448
    "psraw                $4, %%mm1     \n\t" /* buf1[eax] >>4*/\
449
    "psraw                $4, %%mm7     \n\t" /* buf1[eax] >>4*/\
450
    "paddw             %%mm0, %%mm1     \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16 + (buf1[eax]>>4)*/\
451
    "paddw             %%mm6, %%mm7     \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16 + (buf1[eax]>>4)*/\
452 df57ab14 Cédric Schieli
453
/*
 * REAL_YSCALEYUV2RGB_COEFF(c): asm text for the final YUV -> RGB step on
 * 8 pixels; constants are read relative to register c (stringified).
 *
 * In:  mm1, mm7 = two groups of 4 luma words; mm2 = U - offset,
 *      mm5 = V - offset; mm3 = ug, mm4 = vg.
 * Out: mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes
 *      (packed with unsigned saturation).
 * Also overwrites mm0, mm3 and mm6.
 *
 * punpck[lh]wd of a register with itself duplicates each chroma word, so
 * one chroma sample is applied to two neighbouring luma samples.
 */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
454 2da0d70d Diego Biurrun
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
455
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
456
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
457
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
458
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
459
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
460
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
461
    "paddw             %%mm3, %%mm4     \n\t" /* mm4 = ug + vg */\
462
    "movq              %%mm2, %%mm0     \n\t"\
463
    "movq              %%mm5, %%mm6     \n\t"\
464
    "movq              %%mm4, %%mm3     \n\t"\
465
    "punpcklwd         %%mm2, %%mm2     \n\t"\
466
    "punpcklwd         %%mm5, %%mm5     \n\t"\
467
    "punpcklwd         %%mm4, %%mm4     \n\t"\
468
    "paddw             %%mm1, %%mm2     \n\t"\
469
    "paddw             %%mm1, %%mm5     \n\t"\
470
    "paddw             %%mm1, %%mm4     \n\t"\
471
    "punpckhwd         %%mm0, %%mm0     \n\t"\
472
    "punpckhwd         %%mm6, %%mm6     \n\t"\
473
    "punpckhwd         %%mm3, %%mm3     \n\t"\
474
    "paddw             %%mm7, %%mm0     \n\t"\
475
    "paddw             %%mm7, %%mm6     \n\t"\
476
    "paddw             %%mm7, %%mm3     \n\t"\
477
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
478
    "packuswb          %%mm0, %%mm2     \n\t"\
479
    "packuswb          %%mm6, %%mm5     \n\t"\
480
    "packuswb          %%mm3, %%mm4     \n\t"\
481 40494418 Cédric Schieli
482 786dcfef Cédric Schieli
/* Forwarding wrapper: lets the arguments be macro-expanded before REAL_YSCALEYUV2RGB_YA stringifies them. */
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
483 df57ab14 Cédric Schieli
484
/*
 * YSCALEYUV2RGB(index, c): two-line blend plus RGB conversion for 8 pixels.
 * Chains the chroma blend, the luma blend (lines taken from operands %0 and
 * %1) and the coefficient step; the packed results end up in mm2 (blue),
 * mm4 (green) and mm5 (red). Opens loop label "1:" via the UV part.
 */
#define YSCALEYUV2RGB(index, c) \
485
    REAL_YSCALEYUV2RGB_UV(index, c) \
486 786dcfef Cédric Schieli
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
487 df57ab14 Cédric Schieli
    REAL_YSCALEYUV2RGB_COEFF(c)
488 6a4970ab Diego Biurrun
489 6e1c66bc Aurelien Jacobs
/*
 * REAL_YSCALEYUV2PACKED1(index, c): single-line variant (no blending).
 *
 *   index - register used as the loop index (stringified; zeroed here)
 *   c     - accepted for symmetry with the other macros; not used here
 *
 * Operands: %0 = buf0 (luma), %2 = uvbuf0 (chroma). Opens loop label "1:".
 * Loads 4 chroma words at (%2 + index) and at VOF bytes beyond, and 8 luma
 * words at (%0 + 2*index); every word is shifted right by 7.
 * Out: mm3, mm4 = the two chroma groups; mm1, mm7 = the two luma groups.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
490 2da0d70d Diego Biurrun
    "xor            "#index", "#index"  \n\t"\
491
    ASMALIGN(4)\
492
    "1:                                 \n\t"\
493
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
494 8b2fce0d Michael Niedermayer
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+VOF]*/\
495 2da0d70d Diego Biurrun
    "psraw                $7, %%mm3     \n\t" \
496
    "psraw                $7, %%mm4     \n\t" \
497
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
498
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax], next 8 bytes*/\
499
    "psraw                $7, %%mm1     \n\t" \
500
    "psraw                $7, %%mm7     \n\t" \
501 6a4970ab Diego Biurrun
502 6e1c66bc Aurelien Jacobs
#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
503 6a4970ab Diego Biurrun
504 6e1c66bc Aurelien Jacobs
/*
 * Loop head for YUV->RGB conversion from a single source line (no vertical
 * interpolation). Zeroes the index register, opens local label "1:", loads
 * chroma from operand %2 (and %2 + VOF bytes) and luma from operand %0,
 * shifts each right by 4, removes the U/V/Y offsets and applies the
 * coefficients stored at the *_OFFSET / *_COEFF displacements from c.
 * On exit mm2, mm4 and mm5 hold the packed unsigned B, G and R bytes for
 * eight pixels. A WRITE* macro is expected to follow and close the loop.
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax] at byte offset VOF*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax] at byte offset VOF >>4*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax+4]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"

#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)

/*
 * Variant of YSCALEYUV2PACKED1 that combines two chroma lines: the words from
 * operands %2 and %3 (and the same at byte displacement VOF) are added and the
 * sum is logically shifted right by 8, i.e. one bit more than the >>7 applied
 * to luma. Luma still comes from the single line at operand %0.
 * On exit: mm3/mm4 = combined chroma, mm1/mm7 = luma words [index..index+7].
 * The parameter c is not used here.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax] at byte offset VOF*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax] at byte offset VOF*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* same sum for the plane at offset VOF*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax+4]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
/*
 * Same pipeline as YSCALEYUV2RGB1, except that chroma is the sum of the two
 * lines at operands %2 and %3, logically shifted right by 5 (one bit more than
 * the >>4 used for a single line). Luma still comes only from operand %0.
 * On exit mm2, mm4 and mm5 hold the packed unsigned B, G and R bytes for
 * eight pixels. A WRITE* macro is expected to follow and close the loop.
 */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax] at byte offset VOF*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax] at byte offset VOF*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* same sum for the plane at offset VOF*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax+4]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"

#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)

/*
 * Load eight 16-bit alpha samples from asm operand %1 at element offset
 * index, shift each right by 7 (arithmetic) and pack them with unsigned
 * saturation into the eight bytes of mm7. Clobbers mm1.
 */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
    "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
    "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
    "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
    "packuswb          %%mm1, %%mm7     \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)

/*
 * Interleave eight pixels of packed B, G, R and A bytes into 32-bit pixels
 * and store them, then close the enclosing "1:" loop.
 *   dst, dstw, index : destination base, width operand and loop counter
 *   b, g, r, a       : MMX registers holding the eight bytes of each channel
 *   q0, q2, q3, t    : scratch MMX registers (overwritten)
 * b and r are also overwritten. Each iteration stores 32 bytes at
 * dst + index*4 via MOVNTQ, advances index by 8 and jumps back to label 1
 * while index is below dstw (unsigned compare).
 */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq       "#b", "#q2"     \n\t" /* B */\
    "movq       "#r", "#t"      \n\t" /* R */\
    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)

/*
 * Pack eight pixels into 16-bit words and store them, then close the
 * enclosing "1:" loop.
 * Expects mm2=B, mm4=G, mm5=R (eight bytes each) and mm7=0 on entry.
 * B and R are masked with bF8 and G with bFC (mask constants defined
 * elsewhere; presumably 0xF8 / 0xFC per byte), giving a 5/6/5 bit layout.
 * Each iteration stores 16 bytes at dst + index*2 via MOVNTQ, advances index
 * by 8 and jumps back to label 1 while index is below dstw.
 * Clobbers mm1..mm5.
 */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)

/*
 * Pack eight pixels into 15-bit-colour 16-bit words and store them, then
 * close the enclosing "1:" loop.
 * Expects mm2=B, mm4=G, mm5=R (eight bytes each) and mm7=0 on entry.
 * All three channels are masked with bF8 (mask constant defined elsewhere;
 * presumably 0xF8 per byte), giving a 5/5/5 bit layout; compared with
 * WRITERGB16, R is additionally shifted right by 1 and G is shifted left by
 * 2 instead of 3.
 * Each iteration stores 16 bytes at dst + index*2 via MOVNTQ, advances index
 * by 8 and jumps back to label 1 while index is below dstw.
 * Clobbers mm1..mm5.
 */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)

/*
 * Older mask-and-shift 24-bit writer. Expands eight pixels to four
 * 0RGB0RGB quadwords, squeezes out the padding bytes with the
 * bm00000111 / bm11111000 / bm00001111 masks and stores the resulting
 * 24 bytes with three MOVNTQ at dst, dst+8 and dst+16.
 * dst itself is advanced by 24 and index by 8 per iteration; the loop jumps
 * back to label 1 while index is below dstw.
 * Not selected by the WRITEBGR24 definition in this part of the file.
 */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"

/*
 * Plain-MMX 24-bit writer (selected as WRITEBGR24 when
 * COMPILE_TEMPLATE_MMX2 is 0). Expands eight pixels to four 0RGB0RGB
 * quadwords, then removes the padding bytes with shifts and punpckhdq only
 * (no mask constants) and stores 24 bytes with three MOVNTQ at dst, dst+8
 * and dst+16.
 * dst itself is advanced by 24 and index by 8 per iteration; the loop jumps
 * back to label 1 while index is below dstw. Overwrites mm0..mm7.
 */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"

/*
 * MMX2 24-bit writer (selected as WRITEBGR24 when COMPILE_TEMPLATE_MMX2 is
 * non-zero). Uses pshufw to replicate the needed byte pairs of B, G and R
 * and the ff_M24A / ff_M24B / ff_M24C masks to keep one byte per lane, then
 * ORs the three channels together. Produces 24 bytes per iteration in three
 * MOVNTQ stores at dst, dst+8 and dst+16.
 * dst itself is advanced by 24 and index by 8 per iteration; the loop jumps
 * back to label 1 while index is below dstw.
 * Overwrites mm0, mm1, mm3, mm4, mm6 and mm7 (mm7 is reloaded with ff_M24C,
 * so its value on entry is not used).
 */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"

/* Select the 24-bit writer for this template instantiation. Any definition
 * left over from an earlier inclusion of the template is dropped first. */
#undef WRITEBGR24
#if COMPILE_TEMPLATE_MMX2
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif

/*
 * Pack eight pixels as interleaved Y U Y V bytes and store them, then close
 * the enclosing "1:" loop.
 * Expects 16-bit words in mm1 (first four luma), mm7 (next four luma),
 * mm3 (chroma from the first plane) and mm4 (chroma from the plane at
 * offset VOF). All are narrowed to bytes with unsigned saturation.
 * Each iteration stores 16 bytes at dst + index*2 via MOVNTQ, advances index
 * by 8 and jumps back to label 1 while index is below dstw.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)

/**
 * Vertically filter luma, chroma and (optionally) alpha into planar 8-bit
 * output.
 *
 * When compiled with COMPILE_TEMPLATE_MMX and SWS_BITEXACT is not set in
 * c->flags, the work is done by the YSCALEYUV2YV12X[_ACCURATE] asm macros
 * (the _ACCURATE form when SWS_ACCURATE_RND is set) and the function returns.
 * Chroma is only written when uDest is non-NULL, alpha only when
 * CONFIG_SWSCALE_ALPHA is enabled and aDest is non-NULL.
 *
 * Otherwise the AltiVec or plain C implementation is called. Note that the
 * AltiVec call receives neither alpSrc nor aDest.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (uDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        } else {
            if (uDest) {
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!COMPILE_TEMPLATE_ALTIVEC
}

/**
 * Vertically filter into NV12-style output. This template has no
 * platform-specific version: every argument except the context c is forwarded
 * unchanged to yuv2nv12XinC().
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}

/**
 * Convert one line of 16-bit intermediate samples to planar 8-bit output
 * without vertical filtering: each sample becomes (x + 64) >> 7, clipped to
 * 0..255.
 *
 * @param lumSrc  luma samples, dstW entries
 * @param chrSrc  chroma samples; U at [i], V at [i + VOFW], chrDstW entries each
 * @param alpSrc  alpha samples, read only when aDest is non-NULL
 * @param dest    luma output
 * @param uDest   U output, or NULL to skip chroma entirely (vDest is then unused)
 * @param vDest   V output
 * @param aDest   alpha output, or NULL
 *
 * With COMPILE_TEMPLATE_MMX and SWS_BITEXACT not set, each non-NULL plane is
 * handled by the YSCALEYUV2YV121[_ACCURATE] asm loop; otherwise the C loops
 * below are used.
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        /* Per-plane tables, indexed alpha, luma, U, V. Source and destination
         * both point one past the end of the plane; the asm is handed the
         * negated element count as its third operand. The sources are int16_t
         * samples, so the table is typed accordingly (it was uint8_t*, an
         * incompatible pointer type for these initialisers). */
        const int16_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        /* NOTE(review): the asm presumably stores through dst[p] but declares
         * no "memory" clobber or output operand -- confirm against the
         * YSCALEYUV2YV121 definitions. */
        if (c->flags & SWS_ACCURATE_RND) {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        } else {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++) {
        int val= (lumSrc[i]+64)>>7;

        /* Bit 8 is set exactly when val left 0..255 (val spans about
         * -256..256 for int16_t input), so one test covers both directions. */
        if (val&256) {
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++) {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            /* Same bit-8 trick, tested once for both components. */
            if ((u|v)&256) {
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}

/**
1020
 * vertical scale YV12 to RGB
1021
 */
1022 7ac40327 Ramiro Polla
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1023
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1024 c3ab0004 Ramiro Polla
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1025 c1b0bfb4 Michael Niedermayer
{
1026 94daf2e9 Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1027 d0ce212a Ramiro Polla
    x86_reg dummy=0;
1028 c85007d5 Anton Mitrofanov
    x86_reg dstW_reg = dstW;
1029 dd68318c Ramiro Polla
    if(!(c->flags & SWS_BITEXACT)) {
1030
        if (c->flags & SWS_ACCURATE_RND) {
1031
            switch(c->dstFormat) {
1032 14014d47 Michael Niedermayer
            case PIX_FMT_RGB32:
1033 dd68318c Ramiro Polla
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1034 6858492e Cédric Schieli
                    YSCALEYUV2PACKEDX_ACCURATE
1035
                    YSCALEYUV2RGBX
1036
                    "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
1037
                    "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
1038
                    "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
1039
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1040
                    "movq               "Y_TEMP"(%0), %%mm5         \n\t"
1041
                    "psraw                        $3, %%mm1         \n\t"
1042
                    "psraw                        $3, %%mm7         \n\t"
1043
                    "packuswb                  %%mm7, %%mm1         \n\t"
1044
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1045
1046
                    YSCALEYUV2PACKEDX_END
1047 dd68318c Ramiro Polla
                } else {
1048 3164d25e Cédric Schieli
                    YSCALEYUV2PACKEDX_ACCURATE
1049
                    YSCALEYUV2RGBX
1050
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1051
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1052 2da0d70d Diego Biurrun
1053 3164d25e Cédric Schieli
                    YSCALEYUV2PACKEDX_END
1054 6858492e Cédric Schieli
                }
1055 14014d47 Michael Niedermayer
                return;
1056
            case PIX_FMT_BGR24:
1057
                YSCALEYUV2PACKEDX_ACCURATE
1058
                YSCALEYUV2RGBX
1059 40494418 Cédric Schieli
                "pxor %%mm7, %%mm7 \n\t"
1060 14014d47 Michael Niedermayer
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1061
                "add %4, %%"REG_c"                        \n\t"
1062
                WRITEBGR24(%%REGc, %5, %%REGa)
1063 2da0d70d Diego Biurrun
1064
1065 14014d47 Michael Niedermayer
                :: "r" (&c->redDither),
1066
                "m" (dummy), "m" (dummy), "m" (dummy),
1067 c85007d5 Anton Mitrofanov
                "r" (dest), "m" (dstW_reg)
1068 14014d47 Michael Niedermayer
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1069
                );
1070
                return;
1071
            case PIX_FMT_RGB555:
1072
                YSCALEYUV2PACKEDX_ACCURATE
1073
                YSCALEYUV2RGBX
1074 40494418 Cédric Schieli
                "pxor %%mm7, %%mm7 \n\t"
1075 14014d47 Michael Niedermayer
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1076 bca11e75 Michael Niedermayer
#ifdef DITHER1XBPP
1077 88e2a9ae Carl Eugen Hoyos
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1078
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1079
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1080 2da0d70d Diego Biurrun
#endif
1081
1082 14014d47 Michael Niedermayer
                WRITERGB15(%4, %5, %%REGa)
1083
                YSCALEYUV2PACKEDX_END
1084
                return;
1085
            case PIX_FMT_RGB565:
1086
                YSCALEYUV2PACKEDX_ACCURATE
1087
                YSCALEYUV2RGBX
1088 40494418 Cédric Schieli
                "pxor %%mm7, %%mm7 \n\t"
1089 14014d47 Michael Niedermayer
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1090 bca11e75 Michael Niedermayer
#ifdef DITHER1XBPP
1091 88e2a9ae Carl Eugen Hoyos
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1092
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1093
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1094 2da0d70d Diego Biurrun
#endif
1095
1096 14014d47 Michael Niedermayer
                WRITERGB16(%4, %5, %%REGa)
1097
                YSCALEYUV2PACKEDX_END
1098
                return;
1099
            case PIX_FMT_YUYV422:
1100
                YSCALEYUV2PACKEDX_ACCURATE
1101
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1102
1103
                "psraw $3, %%mm3    \n\t"
1104
                "psraw $3, %%mm4    \n\t"
1105
                "psraw $3, %%mm1    \n\t"
1106
                "psraw $3, %%mm7    \n\t"
1107
                WRITEYUY2(%4, %5, %%REGa)
1108
                YSCALEYUV2PACKEDX_END
1109
                return;
1110
            }
1111 dd68318c Ramiro Polla
        } else {
1112
            switch(c->dstFormat) {
1113 14014d47 Michael Niedermayer
            case PIX_FMT_RGB32:
1114 dd68318c Ramiro Polla
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1115 6858492e Cédric Schieli
                    YSCALEYUV2PACKEDX
1116
                    YSCALEYUV2RGBX
1117
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1118
                    "psraw                        $3, %%mm1         \n\t"
1119
                    "psraw                        $3, %%mm7         \n\t"
1120
                    "packuswb                  %%mm7, %%mm1         \n\t"
1121
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1122
                    YSCALEYUV2PACKEDX_END
1123 dd68318c Ramiro Polla
                } else {
1124 3164d25e Cédric Schieli
                    YSCALEYUV2PACKEDX
1125
                    YSCALEYUV2RGBX
1126
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1127
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1128
                    YSCALEYUV2PACKEDX_END
1129 6858492e Cédric Schieli
                }
1130 14014d47 Michael Niedermayer
                return;
1131
            case PIX_FMT_BGR24:
1132
                YSCALEYUV2PACKEDX
1133
                YSCALEYUV2RGBX
1134 40494418 Cédric Schieli
                "pxor                    %%mm7, %%mm7       \n\t"
1135 14014d47 Michael Niedermayer
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1136
                "add                        %4, %%"REG_c"   \n\t"
1137
                WRITEBGR24(%%REGc, %5, %%REGa)
1138
1139
                :: "r" (&c->redDither),
1140
                "m" (dummy), "m" (dummy), "m" (dummy),
1141 c85007d5 Anton Mitrofanov
                "r" (dest),  "m" (dstW_reg)
1142 14014d47 Michael Niedermayer
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1143
                );
1144
                return;
1145
            case PIX_FMT_RGB555:
1146
                YSCALEYUV2PACKEDX
1147
                YSCALEYUV2RGBX
1148 40494418 Cédric Schieli
                "pxor %%mm7, %%mm7 \n\t"
1149 14014d47 Michael Niedermayer
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1150 c1b0bfb4 Michael Niedermayer
#ifdef DITHER1XBPP
1151 88e2a9ae Carl Eugen Hoyos
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1152
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1153
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1154 2da0d70d Diego Biurrun
#endif
1155
1156 14014d47 Michael Niedermayer
                WRITERGB15(%4, %5, %%REGa)
1157
                YSCALEYUV2PACKEDX_END
1158
                return;
1159
            case PIX_FMT_RGB565:
1160
                YSCALEYUV2PACKEDX
1161
                YSCALEYUV2RGBX
1162 40494418 Cédric Schieli
                "pxor %%mm7, %%mm7 \n\t"
1163 14014d47 Michael Niedermayer
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1164 c1b0bfb4 Michael Niedermayer
#ifdef DITHER1XBPP
1165 88e2a9ae Carl Eugen Hoyos
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1166
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1167
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1168 2da0d70d Diego Biurrun
#endif
1169
1170 14014d47 Michael Niedermayer
                WRITERGB16(%4, %5, %%REGa)
1171
                YSCALEYUV2PACKEDX_END
1172
                return;
1173
            case PIX_FMT_YUYV422:
1174
                YSCALEYUV2PACKEDX
1175
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1176
1177
                "psraw $3, %%mm3    \n\t"
1178
                "psraw $3, %%mm4    \n\t"
1179
                "psraw $3, %%mm1    \n\t"
1180
                "psraw $3, %%mm7    \n\t"
1181
                WRITEYUY2(%4, %5, %%REGa)
1182
                YSCALEYUV2PACKEDX_END
1183
                return;
1184
            }
1185 bca11e75 Michael Niedermayer
        }
1186
    }
1187 94daf2e9 Ramiro Polla
#endif /* COMPILE_TEMPLATE_MMX */
1188
#if COMPILE_TEMPLATE_ALTIVEC
1189 2da0d70d Diego Biurrun
    /* The following list of supported dstFormat values should
1190 780daf2b Diego Biurrun
       match what's found in the body of ff_yuv2packedX_altivec() */
1191 d55ef636 Reimar Döffinger
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1192 9b734d44 Ramiro Polla
         (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1193
          c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1194
          c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
1195 780daf2b Diego Biurrun
            ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1196
                                   chrFilter, chrSrc, chrFilterSize,
1197
                                   dest, dstW, dstY);
1198 2da0d70d Diego Biurrun
    else
1199
#endif
1200
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1201
                       chrFilter, chrSrc, chrFilterSize,
1202 6858492e Cédric Schieli
                       alpSrc, dest, dstW, dstY);
1203 c1b0bfb4 Michael Niedermayer
}
1204
1205
/**
 * vertical bilinear scale YV12 to RGB
 *
 * Produces packed output at dest in the format given by c->dstFormat.
 * Unless SWS_BITEXACT is set in c->flags, the MMX build handles RGB32, BGR24,
 * RGB555, RGB565 and YUYV422 with inline assembly and returns from inside the
 * switch; every other case (and every non-MMX build) ends in the
 * YSCALE_YUV_2_ANYRGB_C fallback at the bottom of the function.
 *
 * @param c       scaler context; dstFormat, flags and alpPixBuf are read here
 *                and &c->redDither is the base address used by the assembly
 * @param buf0    luma source line (presumably the upper of the pair -- confirm)
 * @param buf1    luma source line (presumably the lower of the pair -- confirm)
 * @param uvbuf0  chroma source line paired with buf0
 * @param uvbuf1  chroma source line paired with buf1
 * @param abuf0   alpha source line; only the RGB32 alpha path reads it here
 * @param abuf1   alpha source line; only the RGB32 alpha path reads it here
 * @param dest    destination buffer for the packed output
 * @param dstW    output width; NOTE(review): not referenced directly in this
 *                function, the assembly reads a width at 8280(&c->redDither)
 * @param yalpha  luma blend weight; its complement is formed as 4095 - yalpha
 * @param uvalpha chroma blend weight; its complement is formed as 4095 - uvalpha
 * @param y       NOTE(review): not referenced directly in this function,
 *                presumably consumed by the C fallback macros -- confirm
 */
1208 7ac40327 Ramiro Polla
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1209
                          const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1210 d604bab9 Michael Niedermayer
{
    /* Complementary weights and loop index: none of these is referenced by the
       code written out below, presumably they are picked up by name inside the
       YSCALE_* C macros -- confirm against the macro definitions. */
1211 ac0ad729 Michael Niedermayer
    int  yalpha1=4095- yalpha;
1212
    int uvalpha1=4095-uvalpha;
1213 2da0d70d Diego Biurrun
    int i;
1214 d604bab9 Michael Niedermayer
1215 94daf2e9 Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1216 dd68318c Ramiro Polla
    if(!(c->flags & SWS_BITEXACT)) {
1217
        switch(c->dstFormat) {
1218 c255994b Ramiro Polla
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1219
        case PIX_FMT_RGB32:
1220
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1221 6858492e Cédric Schieli
#if ARCH_X86_64
                /* 64-bit build: r8 serves as the index register and the two
                   alpha lines are passed as ordinary register operands %6/%7. */
1222 c255994b Ramiro Polla
                __asm__ volatile(
1223 f514b4f9 Reimar Döffinger
                    YSCALEYUV2RGB(%%r8, %5)
1224
                    YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
1225 6858492e Cédric Schieli
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1226
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1227
                    "packuswb            %%mm7, %%mm1       \n\t"
1228 f514b4f9 Reimar Döffinger
                    WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1229 6858492e Cédric Schieli
1230 04ef1d3f Reimar Döffinger
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1231 6858492e Cédric Schieli
                    "a" (&c->redDither)
1232
                    ,"r" (abuf0), "r" (abuf1)
1233 f514b4f9 Reimar Döffinger
                    : "%r8"
1234 c255994b Ramiro Polla
                );
1235 6858492e Cédric Schieli
#else
                /* Other builds: the alpha line pointers are parked in the
                   context (u_temp/v_temp) and reloaded into %0/%1 inside the
                   asm after those operands have been pushed; they are popped
                   back before WRITEBGR32. REG_b is saved at ESP_OFFSET(%5) and
                   REG_BP on the stack, and both are restored at the end. */
1236 a959e247 Zuxy Meng
                *(const uint16_t **)(&c->u_temp)=abuf0;
1237
                *(const uint16_t **)(&c->v_temp)=abuf1;
1238 c255994b Ramiro Polla
                __asm__ volatile(
1239 6858492e Cédric Schieli
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1240
                    "mov        %4, %%"REG_b"               \n\t"
1241
                    "push %%"REG_BP"                        \n\t"
1242
                    YSCALEYUV2RGB(%%REGBP, %5)
1243
                    "push                   %0              \n\t"
1244
                    "push                   %1              \n\t"
1245
                    "mov          "U_TEMP"(%5), %0          \n\t"
1246
                    "mov          "V_TEMP"(%5), %1          \n\t"
1247
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1248
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1249
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1250
                    "packuswb            %%mm7, %%mm1       \n\t"
1251
                    "pop                    %1              \n\t"
1252
                    "pop                    %0              \n\t"
1253
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1254
                    "pop %%"REG_BP"                         \n\t"
1255
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1256
1257
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1258
                    "a" (&c->redDither)
1259 c255994b Ramiro Polla
                );
1260 6858492e Cédric Schieli
#endif
1261 c255994b Ramiro Polla
            } else {
                /* No alpha plane: pcmpeqd fills mm7 with all ones and mm7 is
                   handed to WRITEBGR32 in the position where the alpha
                   variants above pass their packed alpha register. */
1262
                __asm__ volatile(
1263 3164d25e Cédric Schieli
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1264
                    "mov        %4, %%"REG_b"               \n\t"
1265
                    "push %%"REG_BP"                        \n\t"
1266
                    YSCALEYUV2RGB(%%REGBP, %5)
1267
                    "pcmpeqd %%mm7, %%mm7                   \n\t"
1268
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1269
                    "pop %%"REG_BP"                         \n\t"
1270
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1271 2da0d70d Diego Biurrun
1272 3164d25e Cédric Schieli
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1273
                    "a" (&c->redDither)
1274 c255994b Ramiro Polla
                );
1275
            }
1276
            return;
1277
        case PIX_FMT_BGR24:
1278
            __asm__ volatile(
1279 2da0d70d Diego Biurrun
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1280
                "mov        %4, %%"REG_b"               \n\t"
1281
                "push %%"REG_BP"                        \n\t"
1282
                YSCALEYUV2RGB(%%REGBP, %5)
1283 40494418 Cédric Schieli
                "pxor    %%mm7, %%mm7                   \n\t"
1284 2da0d70d Diego Biurrun
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1285
                "pop %%"REG_BP"                         \n\t"
1286
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1287
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1288
                "a" (&c->redDither)
1289 c255994b Ramiro Polla
            );
1290
            return;
1291
        case PIX_FMT_RGB555:
1292
            __asm__ volatile(
1293 2da0d70d Diego Biurrun
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1294
                "mov        %4, %%"REG_b"               \n\t"
1295
                "push %%"REG_BP"                        \n\t"
1296
                YSCALEYUV2RGB(%%REGBP, %5)
1297 40494418 Cédric Schieli
                "pxor    %%mm7, %%mm7                   \n\t"
1298 2da0d70d Diego Biurrun
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1299 d604bab9 Michael Niedermayer
#ifdef DITHER1XBPP
1300 88e2a9ae Carl Eugen Hoyos
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1301
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1302
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1303 2da0d70d Diego Biurrun
#endif
1304
1305 27a90b04 Michael Niedermayer
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1306 2da0d70d Diego Biurrun
                "pop %%"REG_BP"                         \n\t"
1307
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1308
1309
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1310
                "a" (&c->redDither)
1311 c255994b Ramiro Polla
            );
1312
            return;
1313
        case PIX_FMT_RGB565:
1314
            __asm__ volatile(
1315 2da0d70d Diego Biurrun
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1316
                "mov        %4, %%"REG_b"               \n\t"
1317
                "push %%"REG_BP"                        \n\t"
1318
                YSCALEYUV2RGB(%%REGBP, %5)
1319 40494418 Cédric Schieli
                "pxor    %%mm7, %%mm7                   \n\t"
1320 2da0d70d Diego Biurrun
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1321 d604bab9 Michael Niedermayer
#ifdef DITHER1XBPP
1322 88e2a9ae Carl Eugen Hoyos
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1323
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1324
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1325 2da0d70d Diego Biurrun
#endif
1326
1327 27a90b04 Michael Niedermayer
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1328 2da0d70d Diego Biurrun
                "pop %%"REG_BP"                         \n\t"
1329
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1330
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1331
                "a" (&c->redDither)
1332 c255994b Ramiro Polla
            );
1333
            return;
1334
        case PIX_FMT_YUYV422:
            /* Packed YUV output: uses YSCALEYUV2PACKED instead of the RGB
               conversion macro and writes through WRITEYUY2. */
1335
            __asm__ volatile(
1336 2da0d70d Diego Biurrun
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1337
                "mov %4, %%"REG_b"                        \n\t"
1338
                "push %%"REG_BP"                        \n\t"
1339
                YSCALEYUV2PACKED(%%REGBP, %5)
1340
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1341
                "pop %%"REG_BP"                         \n\t"
1342
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1343
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1344
                "a" (&c->redDither)
1345 c255994b Ramiro Polla
            );
1346
            return;
1347
        default: break;
1348 2da0d70d Diego Biurrun
        }
1349 f433c8ab Michael Niedermayer
    }
1350 94daf2e9 Ramiro Polla
#endif //COMPILE_TEMPLATE_MMX
    /* Reached for SWS_BITEXACT, for formats without an assembly case above,
       and for builds without MMX. */
1351 9b734d44 Ramiro Polla
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1352 d604bab9 Michael Niedermayer
}
1353
1354
/**
 * YV12 to RGB without scaling or interpolating
 *
 * Variant that receives a single luma line (buf0); the local buf1 is just an
 * alias of buf0. Control flow, in order:
 *  - if SWS_FULL_CHR_H_INT is set in flags, the call is forwarded to
 *    c->yuv2packed2() with buf0/abuf0 filling both line slots and yalpha 0;
 *  - otherwise, in MMX builds and unless SWS_BITEXACT is set, RGB32, BGR24,
 *    RGB555, RGB565 and YUYV422 are handled in assembly and return directly;
 *  - everything else ends in the YSCALE_YUV_2_ANYRGB_C fallback.
 * Both the assembly and the C fallback choose between two chroma variants by
 * comparing uvalpha against 2048.
 *
 * @param c         scaler context; alpPixBuf and yuv2packed2 are read here and
 *                  &c->redDither is the base address used by the assembly
 * @param buf0      luma source line
 * @param uvbuf0    chroma source line
 * @param uvbuf1    chroma source line
 * @param abuf0     alpha source line; only the RGB32 alpha path reads it here
 * @param dest      destination buffer for the packed output
 * @param dstW      output width; only forwarded to yuv2packed2 in the visible
 *                  code, the assembly reads a width at 8280(&c->redDither)
 * @param uvalpha   chroma weight; values below 2048 select the "1" macros,
 *                  other values the "1b"/"1B" macros
 * @param dstFormat output pixel format selecting the assembly case
 * @param flags     scaler flags; SWS_FULL_CHR_H_INT and SWS_BITEXACT are tested
 * @param y         only forwarded to yuv2packed2 in the visible code;
 *                  presumably also used by the C fallback macros -- confirm
 */
1357 7ac40327 Ramiro Polla
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1358 b411dfff Carl Eugen Hoyos
                          const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
1359 d604bab9 Michael Niedermayer
{
    /* yalpha1, i and yalpha are not referenced by the code written out below;
       presumably they exist so that the shared YSCALE_* C macros find the
       names they expect -- confirm against the macro definitions. */
1360 2da0d70d Diego Biurrun
    const int yalpha1=0;
1361
    int i;
1362 6a4970ab Diego Biurrun
1363 7ac40327 Ramiro Polla
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1364 2da0d70d Diego Biurrun
    const int yalpha= 4096; //FIXME ...
1365 96034638 Michael Niedermayer
1366 dd68318c Ramiro Polla
    if (flags&SWS_FULL_CHR_H_INT) {
        /* Delegate to the two-line routine, passing the single luma/alpha
           line for both slots and a yalpha of 0. */
1367 40fa5140 Ramiro Polla
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1368 2da0d70d Diego Biurrun
        return;
1369
    }
1370 397c035e Michael Niedermayer
1371 94daf2e9 Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1372 dd68318c Ramiro Polla
    if(!(flags & SWS_BITEXACT)) {
1373
        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1374
            switch(dstFormat) {
1375 14014d47 Michael Niedermayer
            case PIX_FMT_RGB32:
1376 dd68318c Ramiro Polla
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    /* Alpha path: operand %1 ("d") carries abuf0 here rather
                       than buf1 as in the other asm statements. */
1377 6858492e Cédric Schieli
                    __asm__ volatile(
1378 c255994b Ramiro Polla
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1379
                        "mov        %4, %%"REG_b"               \n\t"
1380
                        "push %%"REG_BP"                        \n\t"
1381
                        YSCALEYUV2RGB1(%%REGBP, %5)
1382
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
1383
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1384
                        "pop %%"REG_BP"                         \n\t"
1385
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1386
1387
                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1388
                        "a" (&c->redDither)
1389 6858492e Cédric Schieli
                    );
1390 dd68318c Ramiro Polla
                } else {
                    /* No alpha plane: pcmpeqd fills mm7 with all ones before
                       it is handed to WRITEBGR32. */
1391 3164d25e Cédric Schieli
                    __asm__ volatile(
1392 c255994b Ramiro Polla
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1393
                        "mov        %4, %%"REG_b"               \n\t"
1394
                        "push %%"REG_BP"                        \n\t"
1395
                        YSCALEYUV2RGB1(%%REGBP, %5)
1396
                        "pcmpeqd %%mm7, %%mm7                   \n\t"
1397
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1398
                        "pop %%"REG_BP"                         \n\t"
1399
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1400
1401
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1402
                        "a" (&c->redDither)
1403
                    );
1404
                }
1405
                return;
1406
            case PIX_FMT_BGR24:
1407
                __asm__ volatile(
1408 3164d25e Cédric Schieli
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1409
                    "mov        %4, %%"REG_b"               \n\t"
1410
                    "push %%"REG_BP"                        \n\t"
1411
                    YSCALEYUV2RGB1(%%REGBP, %5)
1412 c255994b Ramiro Polla
                    "pxor    %%mm7, %%mm7                   \n\t"
1413
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1414 3164d25e Cédric Schieli
                    "pop %%"REG_BP"                         \n\t"
1415
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1416 14014d47 Michael Niedermayer
1417 3164d25e Cédric Schieli
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1418
                    "a" (&c->redDither)
1419 14014d47 Michael Niedermayer
                );
1420
                return;
1421
            case PIX_FMT_RGB555:
1422 7ad6469e Diego Pettenò
                __asm__ volatile(
1423 c255994b Ramiro Polla
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1424
                    "mov        %4, %%"REG_b"               \n\t"
1425
                    "push %%"REG_BP"                        \n\t"
1426
                    YSCALEYUV2RGB1(%%REGBP, %5)
1427
                    "pxor    %%mm7, %%mm7                   \n\t"
1428
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1429 d604bab9 Michael Niedermayer
#ifdef DITHER1XBPP
1430 c255994b Ramiro Polla
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1431
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1432
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1433 2da0d70d Diego Biurrun
#endif
1434 c255994b Ramiro Polla
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1435
                    "pop %%"REG_BP"                         \n\t"
1436
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1437 2da0d70d Diego Biurrun
1438 c255994b Ramiro Polla
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1439
                    "a" (&c->redDither)
1440 14014d47 Michael Niedermayer
                );
1441
                return;
1442
            case PIX_FMT_RGB565:
1443 7ad6469e Diego Pettenò
                __asm__ volatile(
1444 c255994b Ramiro Polla
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1445
                    "mov        %4, %%"REG_b"               \n\t"
1446
                    "push %%"REG_BP"                        \n\t"
1447
                    YSCALEYUV2RGB1(%%REGBP, %5)
1448
                    "pxor    %%mm7, %%mm7                   \n\t"
1449
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1450 d604bab9 Michael Niedermayer
#ifdef DITHER1XBPP
1451 c255994b Ramiro Polla
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1452
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1453
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1454 2da0d70d Diego Biurrun
#endif
1455
1456 c255994b Ramiro Polla
                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1457
                    "pop %%"REG_BP"                         \n\t"
1458
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1459 2da0d70d Diego Biurrun
1460 c255994b Ramiro Polla
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1461
                    "a" (&c->redDither)
1462 14014d47 Michael Niedermayer
                );
1463
                return;
1464
            case PIX_FMT_YUYV422:
1465 7ad6469e Diego Pettenò
                __asm__ volatile(
1466 c255994b Ramiro Polla
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1467
                    "mov        %4, %%"REG_b"               \n\t"
1468
                    "push %%"REG_BP"                        \n\t"
1469
                    YSCALEYUV2PACKED1(%%REGBP, %5)
1470
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1471
                    "pop %%"REG_BP"                         \n\t"
1472
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1473 14014d47 Michael Niedermayer
1474 c255994b Ramiro Polla
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1475
                    "a" (&c->redDither)
1476 14014d47 Michael Niedermayer
                );
1477
                return;
1478
            }
1479 dd68318c Ramiro Polla
        } else {
            /* uvalpha >= 2048: same set of formats, built from the "1b"
               variants of the conversion macros. */
1480
            switch(dstFormat) {
1481 14014d47 Michael Niedermayer
            case PIX_FMT_RGB32:
1482 dd68318c Ramiro Polla
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1483 6858492e Cédric Schieli
                    __asm__ volatile(
1484 c255994b Ramiro Polla
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1485
                        "mov        %4, %%"REG_b"               \n\t"
1486
                        "push %%"REG_BP"                        \n\t"
1487
                        YSCALEYUV2RGB1b(%%REGBP, %5)
1488
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
1489
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1490
                        "pop %%"REG_BP"                         \n\t"
1491
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1492
1493
                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1494
                        "a" (&c->redDither)
1495 6858492e Cédric Schieli
                    );
1496 dd68318c Ramiro Polla
                } else {
1497 3164d25e Cédric Schieli
                    __asm__ volatile(
1498 c255994b Ramiro Polla
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1499
                        "mov        %4, %%"REG_b"               \n\t"
1500
                        "push %%"REG_BP"                        \n\t"
1501
                        YSCALEYUV2RGB1b(%%REGBP, %5)
1502
                        "pcmpeqd %%mm7, %%mm7                   \n\t"
1503
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1504
                        "pop %%"REG_BP"                         \n\t"
1505
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1506
1507
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1508
                        "a" (&c->redDither)
1509
                    );
1510
                }
1511
                return;
1512
            case PIX_FMT_BGR24:
1513
                __asm__ volatile(
1514 3164d25e Cédric Schieli
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1515
                    "mov        %4, %%"REG_b"               \n\t"
1516
                    "push %%"REG_BP"                        \n\t"
1517
                    YSCALEYUV2RGB1b(%%REGBP, %5)
1518 c255994b Ramiro Polla
                    "pxor    %%mm7, %%mm7                   \n\t"
1519
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1520 3164d25e Cédric Schieli
                    "pop %%"REG_BP"                         \n\t"
1521
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1522 14014d47 Michael Niedermayer
1523 3164d25e Cédric Schieli
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1524
                    "a" (&c->redDither)
1525 14014d47 Michael Niedermayer
                );
1526
                return;
1527
            case PIX_FMT_RGB555:
1528 7ad6469e Diego Pettenò
                __asm__ volatile(
1529 c255994b Ramiro Polla
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1530
                    "mov        %4, %%"REG_b"               \n\t"
1531
                    "push %%"REG_BP"                        \n\t"
1532
                    YSCALEYUV2RGB1b(%%REGBP, %5)
1533
                    "pxor    %%mm7, %%mm7                   \n\t"
1534
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1535 497d4f99 Michael Niedermayer
#ifdef DITHER1XBPP
1536 c255994b Ramiro Polla
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1537
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1538
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1539 2da0d70d Diego Biurrun
#endif
1540 c255994b Ramiro Polla
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1541
                    "pop %%"REG_BP"                         \n\t"
1542
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1543 2da0d70d Diego Biurrun
1544 c255994b Ramiro Polla
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1545
                    "a" (&c->redDither)
1546 14014d47 Michael Niedermayer
                );
1547
                return;
1548
            case PIX_FMT_RGB565:
1549 7ad6469e Diego Pettenò
                __asm__ volatile(
1550 c255994b Ramiro Polla
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1551
                    "mov        %4, %%"REG_b"               \n\t"
1552
                    "push %%"REG_BP"                        \n\t"
1553
                    YSCALEYUV2RGB1b(%%REGBP, %5)
1554
                    "pxor    %%mm7, %%mm7                   \n\t"
1555
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1556 497d4f99 Michael Niedermayer
#ifdef DITHER1XBPP
1557 c255994b Ramiro Polla
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1558
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1559
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1560 2da0d70d Diego Biurrun
#endif
1561
1562 c255994b Ramiro Polla
                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1563
                    "pop %%"REG_BP"                         \n\t"
1564
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1565 2da0d70d Diego Biurrun
1566 c255994b Ramiro Polla
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1567
                    "a" (&c->redDither)
1568 14014d47 Michael Niedermayer
                );
1569
                return;
1570
            case PIX_FMT_YUYV422:
1571 7ad6469e Diego Pettenò
                __asm__ volatile(
1572 c255994b Ramiro Polla
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1573
                    "mov        %4, %%"REG_b"               \n\t"
1574
                    "push %%"REG_BP"                        \n\t"
1575
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
1576
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1577
                    "pop %%"REG_BP"                         \n\t"
1578
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1579 14014d47 Michael Niedermayer
1580 c255994b Ramiro Polla
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1581
                    "a" (&c->redDither)
1582 14014d47 Michael Niedermayer
                );
1583
                return;
1584
            }
1585 2da0d70d Diego Biurrun
        }
1586
    }
1587 94daf2e9 Ramiro Polla
#endif /* COMPILE_TEMPLATE_MMX */
    /* C fallback, split on the same uvalpha threshold as the assembly: the
       two branches differ only in the RGB1/RGB1B and PACKED1/PACKED1B macros. */
1588 dd68318c Ramiro Polla
    if (uvalpha < 2048) {
1589 6858492e Cédric Schieli
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1590 dd68318c Ramiro Polla
    } else {
1591 6858492e Cédric Schieli
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1592 2da0d70d Diego Biurrun
    }
1593 d604bab9 Michael Niedermayer
}
1594
1595 8a322796 Diego Biurrun
//FIXME yuy2* can read up to 7 samples too much
1596 6ff0ad6b Michael Niedermayer
1597 c3ab0004 Ramiro Polla
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1598 1e621b18 Michael Niedermayer
{
1599 94daf2e9 Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1600 7ad6469e Diego Pettenò
    __asm__ volatile(
1601 c255994b Ramiro Polla
        "movq "MANGLE(bm01010101)", %%mm2           \n\t"
1602
        "mov                    %0, %%"REG_a"       \n\t"
1603
        "1:                                         \n\t"
1604
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1605
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1606
        "pand                %%mm2, %%mm0           \n\t"
1607
        "pand                %%mm2, %%mm1           \n\t"
1608
        "packuswb            %%mm1, %%mm0           \n\t"
1609
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
1610
        "add                    $8, %%"REG_a"       \n\t"
1611
        " js                    1b                  \n\t"
1612
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1613
        : "%"REG_a
1614 2da0d70d Diego Biurrun
    );
1615 1e621b18 Michael Niedermayer
#else
1616 2da0d70d Diego Biurrun
    int i;
1617
    for (i=0; i<width; i++)
1618
        dst[i]= src[2*i];
1619 1e621b18 Michael Niedermayer
#endif
1620
}
1621
1622 c3ab0004 Ramiro Polla
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1623 1e621b18 Michael Niedermayer
{
1624 94daf2e9 Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1625 7ad6469e Diego Pettenò
    __asm__ volatile(
1626 c255994b Ramiro Polla
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1627
        "mov                    %0, %%"REG_a"       \n\t"
1628
        "1:                                         \n\t"
1629
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1630
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1631
        "psrlw                  $8, %%mm0           \n\t"
1632
        "psrlw                  $8, %%mm1           \n\t"
1633
        "packuswb            %%mm1, %%mm0           \n\t"
1634
        "movq                %%mm0, %%mm1           \n\t"
1635
        "psrlw                  $8, %%mm0           \n\t"
1636
        "pand                %%mm4, %%mm1           \n\t"
1637
        "packuswb            %%mm0, %%mm0           \n\t"
1638
        "packuswb            %%mm1, %%mm1           \n\t"
1639
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
1640
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
1641
        "add                    $4, %%"REG_a"       \n\t"
1642
        " js                    1b                  \n\t"
1643
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1644
        : "%"REG_a
1645 2da0d70d Diego Biurrun
    );
1646 1e621b18 Michael Niedermayer
#else
1647 2da0d70d Diego Biurrun
    int i;
1648 dd68318c Ramiro Polla
    for (i=0; i<width; i++) {
1649 2da0d70d Diego Biurrun
        dstU[i]= src1[4*i + 1];
1650
        dstV[i]= src1[4*i + 3];
1651
    }
1652
#endif
1653
    assert(src1 == src2);
1654 1e621b18 Michael Niedermayer
}
1655
1656 c3ab0004 Ramiro Polla
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1657 de1275d5 Michael Niedermayer
{
1658 94daf2e9 Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1659 de1275d5 Michael Niedermayer
    __asm__ volatile(
1660 c255994b Ramiro Polla
        "mov                    %0, %%"REG_a"       \n\t"
1661
        "1:                                         \n\t"
1662
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1663
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1664
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
1665
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
1666
        "psrlw                  $8, %%mm0           \n\t"
1667
        "psrlw                  $8, %%mm1           \n\t"
1668
        "psrlw                  $8, %%mm2           \n\t"
1669
        "psrlw                  $8, %%mm3           \n\t"
1670
        "packuswb            %%mm1, %%mm0           \n\t"
1671
        "packuswb            %%mm3, %%mm2           \n\t"
1672
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
1673
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
1674
        "add                    $8, %%"REG_a"       \n\t"
1675
        " js                    1b                  \n\t"
1676
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1677
        : "%"REG_a
1678 de1275d5 Michael Niedermayer
    );
1679
#else
1680
    int i;
1681 dd68318c Ramiro Polla
    for (i=0; i<width; i++) {
1682 de1275d5 Michael Niedermayer
        dstU[i]= src1[2*i + 1];
1683
        dstV[i]= src2[2*i + 1];
1684
    }
1685
#endif
1686
}
1687
1688 4cf16bbe Diego Biurrun
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
1690 c3ab0004 Ramiro Polla
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1691 7322a67c Michael Niedermayer
{
1692 94daf2e9 Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1693 7ad6469e Diego Pettenò
    __asm__ volatile(
1694 c255994b Ramiro Polla
        "mov                  %0, %%"REG_a"         \n\t"
1695
        "1:                                         \n\t"
1696
        "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
1697
        "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
1698
        "psrlw                $8, %%mm0             \n\t"
1699
        "psrlw                $8, %%mm1             \n\t"
1700
        "packuswb          %%mm1, %%mm0             \n\t"
1701
        "movq              %%mm0, (%2, %%"REG_a")   \n\t"
1702
        "add                  $8, %%"REG_a"         \n\t"
1703
        " js                  1b                    \n\t"
1704
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1705
        : "%"REG_a
1706 2da0d70d Diego Biurrun
    );
1707 7322a67c Michael Niedermayer
#else
1708 2da0d70d Diego Biurrun
    int i;
1709
    for (i=0; i<width; i++)
1710
        dst[i]= src[2*i+1];
1711 7322a67c Michael Niedermayer
#endif
1712
}
1713
1714 c3ab0004 Ramiro Polla
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1715 7322a67c Michael Niedermayer
{
1716 94daf2e9 Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1717 7ad6469e Diego Pettenò
    __asm__ volatile(
1718 c255994b Ramiro Polla
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1719
        "mov                    %0, %%"REG_a"       \n\t"
1720
        "1:                                         \n\t"
1721
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1722
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1723
        "pand                %%mm4, %%mm0           \n\t"
1724
        "pand                %%mm4, %%mm1           \n\t"
1725
        "packuswb            %%mm1, %%mm0           \n\t"
1726
        "movq                %%mm0, %%mm1           \n\t"
1727
        "psrlw                  $8, %%mm0           \n\t"
1728
        "pand                %%mm4, %%mm1           \n\t"
1729
        "packuswb            %%mm0, %%mm0           \n\t"
1730
        "packuswb            %%mm1, %%mm1           \n\t"
1731
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
1732
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
1733
        "add                    $4, %%"REG_a"       \n\t"
1734
        " js                    1b                  \n\t"
1735
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1736
        : "%"REG_a
1737 2da0d70d Diego Biurrun
    );
1738 7322a67c Michael Niedermayer
#else
1739 2da0d70d Diego Biurrun
    int i;
1740 dd68318c Ramiro Polla
    for (i=0; i<width; i++) {
1741 2da0d70d Diego Biurrun
        dstU[i]= src1[4*i + 0];
1742
        dstV[i]= src1[4*i + 2];
1743
    }
1744
#endif
1745
    assert(src1 == src2);
1746 7322a67c Michael Niedermayer
}
1747
1748 c3ab0004 Ramiro Polla
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1749 de1275d5 Michael Niedermayer
{
1750 94daf2e9 Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1751 de1275d5 Michael Niedermayer
    __asm__ volatile(
1752 c255994b Ramiro Polla
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1753
        "mov                    %0, %%"REG_a"       \n\t"
1754
        "1:                                         \n\t"
1755
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1756
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1757
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
1758
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
1759
        "pand                %%mm4, %%mm0           \n\t"
1760
        "pand                %%mm4, %%mm1           \n\t"
1761
        "pand                %%mm4, %%mm2           \n\t"
1762
        "pand                %%mm4, %%mm3           \n\t"
1763
        "packuswb            %%mm1, %%mm0           \n\t"
1764
        "packuswb            %%mm3, %%mm2           \n\t"
1765
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
1766
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
1767
        "add                    $8, %%"REG_a"       \n\t"
1768
        " js                    1b                  \n\t"
1769
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1770
        : "%"REG_a
1771 de1275d5 Michael Niedermayer
    );
1772
#else
1773
    int i;
1774 dd68318c Ramiro Polla
    for (i=0; i<width; i++) {
1775 de1275d5 Michael Niedermayer
        dstU[i]= src1[2*i];
1776
        dstV[i]= src2[2*i];
1777
    }
1778
#endif
1779
}
1780
1781 f415be68 Ramiro Polla
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1782 c3ab0004 Ramiro Polla
                                    const uint8_t *src, long width)
1783 f415be68 Ramiro Polla
{
1784
#if COMPILE_TEMPLATE_MMX
1785
    __asm__ volatile(
1786
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1787
        "mov                    %0, %%"REG_a"       \n\t"
1788
        "1:                                         \n\t"
1789
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1790
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1791
        "movq                %%mm0, %%mm2           \n\t"
1792
        "movq                %%mm1, %%mm3           \n\t"
1793
        "pand                %%mm4, %%mm0           \n\t"
1794
        "pand                %%mm4, %%mm1           \n\t"
1795
        "psrlw                  $8, %%mm2           \n\t"
1796
        "psrlw                  $8, %%mm3           \n\t"
1797
        "packuswb            %%mm1, %%mm0           \n\t"
1798
        "packuswb            %%mm3, %%mm2           \n\t"
1799
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
1800
        "movq                %%mm2, (%3, %%"REG_a") \n\t"
1801
        "add                    $8, %%"REG_a"       \n\t"
1802
        " js                    1b                  \n\t"
1803
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
1804
        : "%"REG_a
1805
    );
1806
#else
1807
    int i;
1808
    for (i = 0; i < width; i++) {
1809
        dst1[i] = src[2*i+0];
1810
        dst2[i] = src[2*i+1];
1811
    }
1812
#endif
1813
}
1814
1815 e470691a Ramiro Polla
static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1816
                                    const uint8_t *src1, const uint8_t *src2,
1817 c3ab0004 Ramiro Polla
                                    long width, uint32_t *unused)
1818 f415be68 Ramiro Polla
{
1819
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
1820
}
1821
1822 e470691a Ramiro Polla
static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1823
                                    const uint8_t *src1, const uint8_t *src2,
1824 c3ab0004 Ramiro Polla
                                    long width, uint32_t *unused)
1825 f415be68 Ramiro Polla
{
1826
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
1827
}
1828
1829 94daf2e9 Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1830 c3ab0004 Ramiro Polla
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
1831 dfb09bd1 Michael Niedermayer
{
1832
1833 dd68318c Ramiro Polla
    if(srcFormat == PIX_FMT_BGR24) {
1834 7ad6469e Diego Pettenò
        __asm__ volatile(
1835 ff9a056d Michael Niedermayer
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
1836
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
1837
            :
1838 dfb09bd1 Michael Niedermayer
        );
1839 dd68318c Ramiro Polla
    } else {
1840 7ad6469e Diego Pettenò
        __asm__ volatile(
1841 ff9a056d Michael Niedermayer
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
1842
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
1843
            :
1844 dfb09bd1 Michael Niedermayer
        );
1845
    }
1846
1847 7ad6469e Diego Pettenò
    __asm__ volatile(
1848 dfb09bd1 Michael Niedermayer
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
1849
        "mov                        %2, %%"REG_a"   \n\t"
1850
        "pxor                    %%mm7, %%mm7       \n\t"
1851
        "1:                                         \n\t"
1852
        PREFETCH"               64(%0)              \n\t"
1853
        "movd                     (%0), %%mm0       \n\t"
1854
        "movd                    2(%0), %%mm1       \n\t"
1855
        "movd                    6(%0), %%mm2       \n\t"
1856
        "movd                    8(%0), %%mm3       \n\t"
1857
        "add                       $12, %0          \n\t"
1858
        "punpcklbw               %%mm7, %%mm0       \n\t"
1859
        "punpcklbw               %%mm7, %%mm1       \n\t"
1860
        "punpcklbw               %%mm7, %%mm2       \n\t"
1861
        "punpcklbw               %%mm7, %%mm3       \n\t"
1862
        "pmaddwd                 %%mm5, %%mm0       \n\t"
1863
        "pmaddwd                 %%mm6, %%mm1       \n\t"
1864
        "pmaddwd                 %%mm5, %%mm2       \n\t"
1865
        "pmaddwd                 %%mm6, %%mm3       \n\t"
1866
        "paddd                   %%mm1, %%mm0       \n\t"
1867
        "paddd                   %%mm3, %%mm2       \n\t"
1868
        "paddd                   %%mm4, %%mm0       \n\t"
1869
        "paddd                   %%mm4, %%mm2       \n\t"
1870
        "psrad                     $15, %%mm0       \n\t"
1871
        "psrad                     $15, %%mm2       \n\t"
1872
        "packssdw                %%mm2, %%mm0       \n\t"
1873
        "packuswb                %%mm0, %%mm0       \n\t"
1874
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
1875
        "add                        $4, %%"REG_a"   \n\t"
1876
        " js                        1b              \n\t"
1877
    : "+r" (src)
1878 d0ce212a Ramiro Polla
    : "r" (dst+width), "g" ((x86_reg)-width)
1879 dfb09bd1 Michael Niedermayer
    : "%"REG_a
1880 2da0d70d Diego Biurrun
    );
1881 dfb09bd1 Michael Niedermayer
}
1882
1883 c3ab0004 Ramiro Polla
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
1884 dfb09bd1 Michael Niedermayer
{
1885 7ad6469e Diego Pettenò
    __asm__ volatile(
1886 dfb09bd1 Michael Niedermayer
        "movq                    24+%4, %%mm6       \n\t"
1887
        "mov                        %3, %%"REG_a"   \n\t"
1888
        "pxor                    %%mm7, %%mm7       \n\t"
1889
        "1:                                         \n\t"
1890
        PREFETCH"               64(%0)              \n\t"
1891
        "movd                     (%0), %%mm0       \n\t"
1892
        "movd                    2(%0), %%mm1       \n\t"
1893
        "punpcklbw               %%mm7, %%mm0       \n\t"
1894
        "punpcklbw               %%mm7, %%mm1       \n\t"
1895
        "movq                    %%mm0, %%mm2       \n\t"
1896
        "movq                    %%mm1, %%mm3       \n\t"
1897
        "pmaddwd                    %4, %%mm0       \n\t"
1898
        "pmaddwd                  8+%4, %%mm1       \n\t"
1899
        "pmaddwd                 16+%4, %%mm2       \n\t"
1900
        "pmaddwd                 %%mm6, %%mm3       \n\t"
1901
        "paddd                   %%mm1, %%mm0       \n\t"
1902
        "paddd                   %%mm3, %%mm2       \n\t"
1903
1904
        "movd                    6(%0), %%mm1       \n\t"
1905
        "movd                    8(%0), %%mm3       \n\t"
1906
        "add                       $12, %0          \n\t"
1907
        "punpcklbw               %%mm7, %%mm1       \n\t"
1908
        "punpcklbw               %%mm7, %%mm3       \n\t"
1909
        "movq                    %%mm1, %%mm4       \n\t"
1910
        "movq                    %%mm3, %%mm5       \n\t"
1911
        "pmaddwd                    %4, %%mm1       \n\t"
1912
        "pmaddwd                  8+%4, %%mm3       \n\t"
1913
        "pmaddwd                 16+%4, %%mm4       \n\t"
1914
        "pmaddwd                 %%mm6, %%mm5       \n\t"
1915
        "paddd                   %%mm3, %%mm1       \n\t"
1916
        "paddd                   %%mm5, %%mm4       \n\t"
1917
1918
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
1919
        "paddd                   %%mm3, %%mm0       \n\t"
1920
        "paddd                   %%mm3, %%mm2       \n\t"
1921
        "paddd                   %%mm3, %%mm1       \n\t"
1922
        "paddd                   %%mm3, %%mm4       \n\t"
1923
        "psrad                     $15, %%mm0       \n\t"
1924
        "psrad                     $15, %%mm2       \n\t"
1925
        "psrad                     $15, %%mm1       \n\t"
1926
        "psrad                     $15, %%mm4       \n\t"
1927
        "packssdw                %%mm1, %%mm0       \n\t"
1928
        "packssdw                %%mm4, %%mm2       \n\t"
1929
        "packuswb                %%mm0, %%mm0       \n\t"
1930
        "packuswb                %%mm2, %%mm2       \n\t"
1931
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
1932
        "movd                %%mm2, (%2, %%"REG_a") \n\t"
1933
        "add                        $4, %%"REG_a"   \n\t"
1934
        " js                        1b              \n\t"
1935
    : "+r" (src)
1936 d0ce212a Ramiro Polla
    : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
1937 dfb09bd1 Michael Niedermayer
    : "%"REG_a
1938
    );
1939
}
1940
#endif
1941
1942 c3ab0004 Ramiro Polla
static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1943 dfb09bd1 Michael Niedermayer
{
1944 94daf2e9 Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1945 a35acd7f Benjamin Zores
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1946 1e621b18 Michael Niedermayer
#else
1947 2da0d70d Diego Biurrun
    int i;
1948 dd68318c Ramiro Polla
    for (i=0; i<width; i++) {
1949 2da0d70d Diego Biurrun
        int b= src[i*3+0];
1950
        int g= src[i*3+1];
1951
        int r= src[i*3+2];
1952 1e621b18 Michael Niedermayer
1953 e5091488 Benoit Fouet
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1954 2da0d70d Diego Biurrun
    }
1955 94daf2e9 Ramiro Polla
#endif /* COMPILE_TEMPLATE_MMX */
1956 1e621b18 Michael Niedermayer
}
1957
1958 c3ab0004 Ramiro Polla
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1959 1e621b18 Michael Niedermayer
{
1960 94daf2e9 Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1961 a35acd7f Benjamin Zores
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1962 1e621b18 Michael Niedermayer
#else
1963 2da0d70d Diego Biurrun
    int i;
1964 dd68318c Ramiro Polla
    for (i=0; i<width; i++) {
1965 dfb09bd1 Michael Niedermayer
        int b= src1[3*i + 0];
1966
        int g= src1[3*i + 1];
1967
        int r= src1[3*i + 2];
1968 2da0d70d Diego Biurrun
1969 dfb09bd1 Michael Niedermayer
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1970
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1971 2da0d70d Diego Biurrun
    }
1972 94daf2e9 Ramiro Polla
#endif /* COMPILE_TEMPLATE_MMX */
1973 2da0d70d Diego Biurrun
    assert(src1 == src2);
1974 1e621b18 Michael Niedermayer
}
1975
1976 c3ab0004 Ramiro Polla
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1977 2f60f629 Michael Niedermayer
{
1978
    int i;
1979 dd68318c Ramiro Polla
    for (i=0; i<width; i++) {
1980 2f60f629 Michael Niedermayer
        int b= src1[6*i + 0] + src1[6*i + 3];
1981
        int g= src1[6*i + 1] + src1[6*i + 4];
1982
        int r= src1[6*i + 2] + src1[6*i + 5];
1983
1984
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1985
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1986
    }
1987
    assert(src1 == src2);
1988
}
1989
1990 c3ab0004 Ramiro Polla
static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1991 a861d4d7 Michael Niedermayer
{
1992 94daf2e9 Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1993 a35acd7f Benjamin Zores
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1994 dfb09bd1 Michael Niedermayer
#else
1995 2da0d70d Diego Biurrun
    int i;
1996 dd68318c Ramiro Polla
    for (i=0; i<width; i++) {
1997 2da0d70d Diego Biurrun
        int r= src[i*3+0];
1998
        int g= src[i*3+1];
1999
        int b= src[i*3+2];
2000
2001 e5091488 Benoit Fouet
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2002 2da0d70d Diego Biurrun
    }
2003 dfb09bd1 Michael Niedermayer
#endif
2004 a861d4d7 Michael Niedermayer
}
2005
2006 c3ab0004 Ramiro Polla
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2007 a861d4d7 Michael Niedermayer
{
2008 94daf2e9 Ramiro Polla
#if COMPILE_TEMPLATE_MMX
2009 5155b839 Diego Biurrun
    assert(src1==src2);
2010 a35acd7f Benjamin Zores
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2011 dfb09bd1 Michael Niedermayer
#else
2012 5155b839 Diego Biurrun
    int i;
2013
    assert(src1==src2);
2014 dd68318c Ramiro Polla
    for (i=0; i<width; i++) {
2015 dfb09bd1 Michael Niedermayer
        int r= src1[3*i + 0];
2016
        int g= src1[3*i + 1];
2017
        int b= src1[3*i + 2];
2018 2da0d70d Diego Biurrun
2019 dfb09bd1 Michael Niedermayer
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2020
        d