/*
 * software RGB to RGB converter
 * pluralize by software PAL8 to RGB converter
 *              software YUV to YUV converter
 *              software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * lot of big-endian byte order fixes by Alex Beregszaszi
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>

#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PAVGB

#if COMPILE_TEMPLATE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PAVGB     "pavgusb"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#define PAVGB     "pavgb"
#else
#define PREFETCH  " # nop"
#endif

#if COMPILE_TEMPLATE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#if COMPILE_TEMPLATE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
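
/*
 * Note: this file is a template.  It is meant to be #included (typically from
 * rgb2rgb.c) once per CPU variant, with the COMPILE_TEMPLATE_* macros set
 * accordingly; RENAME() then gives each function below a variant-specific name.
 */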

static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    mm_end = end - 23;
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "punpckldq    3%1, %%mm0    \n\t"
            "movd         6%1, %%mm1    \n\t"
            "punpckldq    9%1, %%mm1    \n\t"
            "movd        12%1, %%mm2    \n\t"
            "punpckldq   15%1, %%mm2    \n\t"
            "movd        18%1, %%mm3    \n\t"
            "punpckldq   21%1, %%mm3    \n\t"
            "por        %%mm7, %%mm0    \n\t"
            "por        %%mm7, %%mm1    \n\t"
            "por        %%mm7, %%mm2    \n\t"
            "por        %%mm7, %%mm3    \n\t"
            MOVNTQ"     %%mm0,   %0     \n\t"
            MOVNTQ"     %%mm1,  8%0     \n\t"
            MOVNTQ"     %%mm2, 16%0     \n\t"
            MOVNTQ"     %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
#if HAVE_BIGENDIAN
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
        *dest++ = 255;
        *dest++ = s[2];
        *dest++ = s[1];
        *dest++ = s[0];
        s+=3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
#endif
    }
}

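/*
 * Pack the eight 32-bit pixels held in %%mm0/%%mm1/%%mm4/%%mm5 (with copies in
 * %%mm2/%%mm3/%%mm6/%%mm7) into 24 bytes of 24-bit pixels, discarding every
 * fourth input byte, and store them at %0, 8%0 and 16%0.
 */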
#define STORE_BGR24_MMX \
            "psrlq         $8, %%mm2    \n\t" \
            "psrlq         $8, %%mm3    \n\t" \
            "psrlq         $8, %%mm6    \n\t" \
            "psrlq         $8, %%mm7    \n\t" \
            "pand "MANGLE(mask24l)", %%mm0\n\t" \
            "pand "MANGLE(mask24l)", %%mm1\n\t" \
            "pand "MANGLE(mask24l)", %%mm4\n\t" \
            "pand "MANGLE(mask24l)", %%mm5\n\t" \
            "pand "MANGLE(mask24h)", %%mm2\n\t" \
            "pand "MANGLE(mask24h)", %%mm3\n\t" \
            "pand "MANGLE(mask24h)", %%mm6\n\t" \
            "pand "MANGLE(mask24h)", %%mm7\n\t" \
            "por        %%mm2, %%mm0    \n\t" \
            "por        %%mm3, %%mm1    \n\t" \
            "por        %%mm6, %%mm4    \n\t" \
            "por        %%mm7, %%mm5    \n\t" \
 \
            "movq       %%mm1, %%mm2    \n\t" \
            "movq       %%mm4, %%mm3    \n\t" \
            "psllq        $48, %%mm2    \n\t" \
            "psllq        $32, %%mm3    \n\t" \
            "pand "MANGLE(mask24hh)", %%mm2\n\t" \
            "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
            "por        %%mm2, %%mm0    \n\t" \
            "psrlq        $16, %%mm1    \n\t" \
            "psrlq        $32, %%mm4    \n\t" \
            "psllq        $16, %%mm5    \n\t" \
            "por        %%mm3, %%mm1    \n\t" \
            "pand  "MANGLE(mask24hhhh)", %%mm5\n\t" \
            "por        %%mm5, %%mm4    \n\t" \
 \
            MOVNTQ"     %%mm0,   %0     \n\t" \
            MOVNTQ"     %%mm1,  8%0     \n\t" \
            MOVNTQ"     %%mm4, 16%0"


static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq        16%1, %%mm4    \n\t"
            "movq        24%1, %%mm5    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm1, %%mm3    \n\t"
            "movq       %%mm4, %%mm6    \n\t"
            "movq       %%mm5, %%mm7    \n\t"
            STORE_BGR24_MMX
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
#if HAVE_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
        s++;
        dest[2] = *s++;
        dest[1] = *s++;
        dest[0] = *s++;
        dest += 3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
#endif
    }
}

/*
 original by Strepto/Astral
 ported to gcc & bugfixed: A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32-bit C version, and and&add trick by Michael Niedermayer
*/
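/*
 * The and&add trick: adding the masked R/G fields (x & 0x7FE0) back onto the
 * 15-bit value doubles them, i.e. shifts R and G up by one bit while leaving B
 * in place (RGB555 -> RGB565).  The sum cannot carry from one 16-bit pixel
 * into the next, so two pixels are converted per 32-bit word.
 */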
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1         \n\t"
            "movq        %1, %%mm0  \n\t"
            "movq       8%1, %%mm2  \n\t"
            "movq     %%mm0, %%mm1  \n\t"
            "movq     %%mm2, %%mm3  \n\t"
            "pand     %%mm4, %%mm0  \n\t"
            "pand     %%mm4, %%mm2  \n\t"
            "paddw    %%mm1, %%mm0  \n\t"
            "paddw    %%mm3, %%mm2  \n\t"
            MOVNTQ"   %%mm0,  %0    \n\t"
            MOVNTQ"   %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    if (s < end) {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}

static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1         \n\t"
            "movq        %1, %%mm0  \n\t"
            "movq       8%1, %%mm2  \n\t"
            "movq     %%mm0, %%mm1  \n\t"
            "movq     %%mm2, %%mm3  \n\t"
            "psrlq       $1, %%mm0  \n\t"
            "psrlq       $1, %%mm2  \n\t"
            "pand     %%mm7, %%mm0  \n\t"
            "pand     %%mm7, %%mm2  \n\t"
            "pand     %%mm6, %%mm1  \n\t"
            "pand     %%mm6, %%mm3  \n\t"
            "por      %%mm1, %%mm0  \n\t"
            "por      %%mm3, %%mm2  \n\t"
            MOVNTQ"   %%mm0,  %0    \n\t"
            MOVNTQ"   %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}

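/*
 * rgb32to16() and rgb32to15() each carry two MMX code paths: the enabled
 * "#if 1" variant uses pmaddwd to scale and merge the blue and red fields in
 * one step, while the disabled variant does the same with individual shifts
 * and masks.
 */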
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        "movq           %3, %%mm5   \n\t"
        "movq           %4, %%mm6   \n\t"
        "movq           %5, %%mm7   \n\t"
        "jmp 2f                     \n\t"
        ".p2align        4          \n\t"
        "1:                         \n\t"
        PREFETCH"   32(%1)          \n\t"
        "movd         (%1), %%mm0   \n\t"
        "movd        4(%1), %%mm3   \n\t"
        "punpckldq   8(%1), %%mm0   \n\t"
        "punpckldq  12(%1), %%mm3   \n\t"
        "movq        %%mm0, %%mm1   \n\t"
        "movq        %%mm3, %%mm4   \n\t"
        "pand        %%mm6, %%mm0   \n\t"
        "pand        %%mm6, %%mm3   \n\t"
        "pmaddwd     %%mm7, %%mm0   \n\t"
        "pmaddwd     %%mm7, %%mm3   \n\t"
        "pand        %%mm5, %%mm1   \n\t"
        "pand        %%mm5, %%mm4   \n\t"
        "por         %%mm1, %%mm0   \n\t"
        "por         %%mm4, %%mm3   \n\t"
        "psrld          $5, %%mm0   \n\t"
        "pslld         $11, %%mm3   \n\t"
        "por         %%mm3, %%mm0   \n\t"
        MOVNTQ"      %%mm0, (%0)    \n\t"
        "add           $16,  %1     \n\t"
        "add            $8,  %0     \n\t"
        "2:                         \n\t"
        "cmp            %2,  %1     \n\t"
        " jb            1b          \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq    %0, %%mm7    \n\t"
        "movq    %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}

static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psllq         $8, %%mm0    \n\t"
            "psllq         $8, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        "movq           %3, %%mm5   \n\t"
        "movq           %4, %%mm6   \n\t"
        "movq           %5, %%mm7   \n\t"
        "jmp            2f          \n\t"
        ".p2align        4          \n\t"
        "1:                         \n\t"
        PREFETCH"   32(%1)          \n\t"
        "movd         (%1), %%mm0   \n\t"
        "movd        4(%1), %%mm3   \n\t"
        "punpckldq   8(%1), %%mm0   \n\t"
        "punpckldq  12(%1), %%mm3   \n\t"
        "movq        %%mm0, %%mm1   \n\t"
        "movq        %%mm3, %%mm4   \n\t"
        "pand        %%mm6, %%mm0   \n\t"
        "pand        %%mm6, %%mm3   \n\t"
        "pmaddwd     %%mm7, %%mm0   \n\t"
        "pmaddwd     %%mm7, %%mm3   \n\t"
        "pand        %%mm5, %%mm1   \n\t"
        "pand        %%mm5, %%mm4   \n\t"
        "por         %%mm1, %%mm0   \n\t"
        "por         %%mm4, %%mm3   \n\t"
        "psrld          $6, %%mm0   \n\t"
        "pslld         $10, %%mm3   \n\t"
        "por         %%mm3, %%mm0   \n\t"
        MOVNTQ"      %%mm0, (%0)    \n\t"
        "add           $16,  %1     \n\t"
        "add            $8,  %0     \n\t"
        "2:                         \n\t"
        "cmp            %2,  %1     \n\t"
        " jb            1b          \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $9, %%mm2    \n\t"
            "psrlq         $9, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}

static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psllq         $7, %%mm0    \n\t"
            "psllq         $7, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psllq         $8, %%mm0    \n\t"
            "psllq         $8, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $9, %%mm2    \n\t"
            "psrlq         $9, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"   32%1            \n\t"
            "movd         %1, %%mm0     \n\t"
            "movd        3%1, %%mm3     \n\t"
            "punpckldq   6%1, %%mm0     \n\t"
            "punpckldq   9%1, %%mm3     \n\t"
            "movq      %%mm0, %%mm1     \n\t"
            "movq      %%mm0, %%mm2     \n\t"
            "movq      %%mm3, %%mm4     \n\t"
            "movq      %%mm3, %%mm5     \n\t"
            "psllq        $7, %%mm0     \n\t"
            "psllq        $7, %%mm3     \n\t"
            "pand      %%mm7, %%mm0     \n\t"
            "pand      %%mm7, %%mm3     \n\t"
            "psrlq        $6, %%mm1     \n\t"
            "psrlq        $6, %%mm4     \n\t"
            "pand      %%mm6, %%mm1     \n\t"
            "pand      %%mm6, %%mm4     \n\t"
            "psrlq       $19, %%mm2     \n\t"
            "psrlq       $19, %%mm5     \n\t"
            "pand         %2, %%mm2     \n\t"
            "pand         %2, %%mm5     \n\t"
            "por       %%mm1, %%mm0     \n\t"
            "por       %%mm4, %%mm3     \n\t"
            "por       %%mm2, %%mm0     \n\t"
            "por       %%mm5, %%mm3     \n\t"
            "psllq       $16, %%mm3     \n\t"
            "por       %%mm3, %%mm0     \n\t"
            MOVNTQ"    %%mm0, %0        \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

/*
  I use less accurate approximation here by simply left-shifting the input
  value and filling the low order bits with zeroes. This method improves PNG
  compression but this scheme cannot reproduce white exactly, since it does
  not generate an all-ones maximum value; the net effect is to darken the
  image slightly.

  The better method should be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      leftmost bits repeated to fill open bits
       |
   original bits
*/
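/* For reference, exact left bit replication of a 5-bit value v would be
 * (v << 3) | (v >> 2), and of a 6-bit value (v << 2) | (v >> 4). */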
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"

            "movq       %%mm0, %%mm6    \n\t"
            "movq       %%mm3, %%mm7    \n\t"

            "movq         8%1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq         8%1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"

            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq       %%mm0, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "movq       %%mm6, %%mm0    \n\t"
            "movq       %%mm7, %%mm1    \n\t"

            "movq       %%mm4, %%mm6    \n\t"
            "movq       %%mm5, %%mm7    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm1, %%mm3    \n\t"

            STORE_BGR24_MMX

            :"=m"(*d)
            :"m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}

static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm1    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"

            "movq       %%mm0, %%mm6    \n\t"
            "movq       %%mm3, %%mm7    \n\t"

            "movq         8%1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq         8%1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm1    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq       %%mm0, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "movq       %%mm6, %%mm0    \n\t"
            "movq       %%mm7, %%mm1    \n\t"

            "movq       %%mm4, %%mm6    \n\t"
            "movq       %%mm5, %%mm7    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm1, %%mm3    \n\t"

            STORE_BGR24_MMX

            :"=m"(*d)
            :"m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}

/*
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb   %%mm7, %%mm0    \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb   %%mm7, %%mm1    \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb   %%mm7, %%mm2    \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw  %%mm1, %%mm0    \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw  %%mm6, %%mm2    \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq       %%mm0, %%mm3    \n\t"                               \
    "punpcklwd  %%mm2, %%mm0    \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd  %%mm2, %%mm3    \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ"     %%mm0,  %0      \n\t"                               \
    MOVNTQ"     %%mm3, 8%0      \n\t"                               \

static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
#endif
    }
}

static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm1    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
#endif
    }
}

static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, long src_size)
1229 99969243 Michael Niedermayer
{
1230 9326d3f3 Michael Niedermayer
    x86_reg idx = 15 - src_size;
1231 994c1ef0 Baptiste Coudurier
    const uint8_t *s = src-idx;
1232
    uint8_t *d = dst-idx;
1233 c12f7b2d Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1234 7ad6469e Diego Pettenò
    __asm__ volatile(
1235 9b734d44 Ramiro Polla
        "test          %0, %0           \n\t"
1236
        "jns           2f               \n\t"
1237
        PREFETCH"       (%1, %0)        \n\t"
1238
        "movq          %3, %%mm7        \n\t"
1239
        "pxor          %4, %%mm7        \n\t"
1240
        "movq       %%mm7, %%mm6        \n\t"
1241
        "pxor          %5, %%mm7        \n\t"
1242 ef4a6514 Mans Rullgard
        ".p2align       4               \n\t"
1243 9b734d44 Ramiro Polla
        "1:                             \n\t"
1244
        PREFETCH"     32(%1, %0)        \n\t"
1245
        "movq           (%1, %0), %%mm0 \n\t"
1246
        "movq          8(%1, %0), %%mm1 \n\t"
1247 c12f7b2d Ramiro Polla
# if COMPILE_TEMPLATE_MMX2
1248 9b734d44 Ramiro Polla
        "pshufw      $177, %%mm0, %%mm3 \n\t"
1249
        "pshufw      $177, %%mm1, %%mm5 \n\t"
1250
        "pand       %%mm7, %%mm0        \n\t"
1251
        "pand       %%mm6, %%mm3        \n\t"
1252
        "pand       %%mm7, %%mm1        \n\t"
1253
        "pand       %%mm6, %%mm5        \n\t"
1254
        "por        %%mm3, %%mm0        \n\t"
1255
        "por        %%mm5, %%mm1        \n\t"
1256 b38d4874 Ivo van Poorten
# else
1257 9b734d44 Ramiro Polla
        "movq       %%mm0, %%mm2        \n\t"
1258
        "movq       %%mm1, %%mm4        \n\t"
1259
        "pand       %%mm7, %%mm0        \n\t"
1260
        "pand       %%mm6, %%mm2        \n\t"
1261
        "pand       %%mm7, %%mm1        \n\t"
1262
        "pand       %%mm6, %%mm4        \n\t"
1263
        "movq       %%mm2, %%mm3        \n\t"
1264
        "movq       %%mm4, %%mm5        \n\t"
1265
        "pslld        $16, %%mm2        \n\t"
1266
        "psrld        $16, %%mm3        \n\t"
1267
        "pslld        $16, %%mm4        \n\t"
1268
        "psrld        $16, %%mm5        \n\t"
1269
        "por        %%mm2, %%mm0        \n\t"
1270
        "por        %%mm4, %%mm1        \n\t"
1271
        "por        %%mm3, %%mm0        \n\t"
1272
        "por        %%mm5, %%mm1        \n\t"
1273 b38d4874 Ivo van Poorten
# endif
1274 9b734d44 Ramiro Polla
        MOVNTQ"     %%mm0,  (%2, %0)    \n\t"
1275
        MOVNTQ"     %%mm1, 8(%2, %0)    \n\t"
1276
        "add          $16, %0           \n\t"
1277
        "js            1b               \n\t"
1278
        SFENCE"                         \n\t"
1279
        EMMS"                           \n\t"
1280
        "2:                             \n\t"
1281
        : "+&r"(idx)
1282
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1283
        : "memory");
1284 6e42e6c4 Diego Biurrun
#endif
1285
    for (; idx<15; idx+=4) {
1286 994c1ef0 Baptiste Coudurier
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1287 6e42e6c4 Diego Biurrun
        v &= 0xff00ff;
1288
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1289
    }
1290 99969243 Michael Niedermayer
}
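/*
 * Illustrative sketch of what the scalar tail above does per 32-bit pixel:
 * masking with 0xff00ff00 keeps G and A in place while the remaining R/B
 * pair is swapped by the 16-bit rotate (v>>16) + (v<<16), so output bytes
 * are input bytes 2,1,0,3 - hence the name shuffle_bytes_2103. On little
 * endian, 0xAARRGGBB becomes 0xAABBGGRR. Written here with unsigned
 * arithmetic; not used by the converters.
 */
static inline uint32_t RENAME(shuffle_2103_pixel_sketch)(uint32_t v)
{
    uint32_t g = v & 0xff00ff00;        // keep byte 1 (G) and byte 3 (A)
    v &= 0x00ff00ff;                    // keep byte 0 (B) and byte 2 (R)
    return (v>>16) + (v<<16) + g;       // exchange B and R, merge G/A back
}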
1291
1292 7f526efd Reimar Döffinger
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1293 74d35835 Michael Niedermayer
{
1294 6e42e6c4 Diego Biurrun
    unsigned i;
1295 c12f7b2d Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1296 d0ce212a Ramiro Polla
    x86_reg mmx_size= 23 - src_size;
1297 7ad6469e Diego Pettenò
    __asm__ volatile (
1298 9b734d44 Ramiro Polla
        "test             %%"REG_a", %%"REG_a"          \n\t"
1299
        "jns                     2f                     \n\t"
1300
        "movq     "MANGLE(mask24r)", %%mm5              \n\t"
1301
        "movq     "MANGLE(mask24g)", %%mm6              \n\t"
1302
        "movq     "MANGLE(mask24b)", %%mm7              \n\t"
1303 ef4a6514 Mans Rullgard
        ".p2align                 4                     \n\t"
1304 9b734d44 Ramiro Polla
        "1:                                             \n\t"
1305
        PREFETCH" 32(%1, %%"REG_a")                     \n\t"
1306
        "movq       (%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
1307
        "movq       (%1, %%"REG_a"), %%mm1              \n\t" // BGR BGR BG
1308
        "movq      2(%1, %%"REG_a"), %%mm2              \n\t" // R BGR BGR B
1309
        "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
1310
        "pand                 %%mm5, %%mm0              \n\t"
1311
        "pand                 %%mm6, %%mm1              \n\t"
1312
        "pand                 %%mm7, %%mm2              \n\t"
1313
        "por                  %%mm0, %%mm1              \n\t"
1314
        "por                  %%mm2, %%mm1              \n\t"
1315
        "movq      6(%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
1316
        MOVNTQ"               %%mm1,   (%2, %%"REG_a")  \n\t" // RGB RGB RG
1317
        "movq      8(%1, %%"REG_a"), %%mm1              \n\t" // R BGR BGR B
1318
        "movq     10(%1, %%"REG_a"), %%mm2              \n\t" // GR BGR BGR
1319
        "pand                 %%mm7, %%mm0              \n\t"
1320
        "pand                 %%mm5, %%mm1              \n\t"
1321
        "pand                 %%mm6, %%mm2              \n\t"
1322
        "por                  %%mm0, %%mm1              \n\t"
1323
        "por                  %%mm2, %%mm1              \n\t"
1324
        "movq     14(%1, %%"REG_a"), %%mm0              \n\t" // R BGR BGR B
1325
        MOVNTQ"               %%mm1,  8(%2, %%"REG_a")  \n\t" // B RGB RGB R
1326
        "movq     16(%1, %%"REG_a"), %%mm1              \n\t" // GR BGR BGR
1327
        "movq     18(%1, %%"REG_a"), %%mm2              \n\t" // BGR BGR BG
1328
        "pand                 %%mm6, %%mm0              \n\t"
1329
        "pand                 %%mm7, %%mm1              \n\t"
1330
        "pand                 %%mm5, %%mm2              \n\t"
1331
        "por                  %%mm0, %%mm1              \n\t"
1332
        "por                  %%mm2, %%mm1              \n\t"
1333
        MOVNTQ"               %%mm1, 16(%2, %%"REG_a")  \n\t"
1334
        "add                    $24, %%"REG_a"          \n\t"
1335
        " js                     1b                     \n\t"
1336
        "2:                                             \n\t"
1337
        : "+a" (mmx_size)
1338
        : "r" (src-mmx_size), "r"(dst-mmx_size)
1339 6e42e6c4 Diego Biurrun
    );
1340
1341 7ad6469e Diego Pettenò
    __asm__ volatile(SFENCE:::"memory");
1342
    __asm__ volatile(EMMS:::"memory");
1343 6e42e6c4 Diego Biurrun
1344 8a322796 Diego Biurrun
    if (mmx_size==23) return; //finished, src_size was a multiple of 8 pixels (24 bytes)
1345 6e42e6c4 Diego Biurrun
1346
    src+= src_size;
1347
    dst+= src_size;
1348
    src_size= 23-mmx_size;
1349
    src-= src_size;
1350
    dst-= src_size;
1351
#endif
1352 dd68318c Ramiro Polla
    for (i=0; i<src_size; i+=3) {
1353 6e42e6c4 Diego Biurrun
        register uint8_t x;
1354
        x          = src[i + 2];
1355
        dst[i + 1] = src[i + 1];
1356
        dst[i + 2] = src[i + 0];
1357
        dst[i + 0] = x;
1358
    }
1359 74d35835 Michael Niedermayer
}
1360
1361 b1ec5875 Michael Niedermayer
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1362 6e42e6c4 Diego Biurrun
                                           long width, long height,
1363
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1364 d9d58d17 Michael Niedermayer
{
1365 6e42e6c4 Diego Biurrun
    long y;
1366 9326d3f3 Michael Niedermayer
    const x86_reg chromWidth= width>>1;
1367 dd68318c Ramiro Polla
    for (y=0; y<height; y++) {
1368 c12f7b2d Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1369 7d73d1c3 Ramiro Polla
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1370 7ad6469e Diego Pettenò
        __asm__ volatile(
1371 9b734d44 Ramiro Polla
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
1372 ef4a6514 Mans Rullgard
            ".p2align                    4              \n\t"
1373 9b734d44 Ramiro Polla
            "1:                                         \n\t"
1374
            PREFETCH"    32(%1, %%"REG_a", 2)           \n\t"
1375
            PREFETCH"    32(%2, %%"REG_a")              \n\t"
1376
            PREFETCH"    32(%3, %%"REG_a")              \n\t"
1377
            "movq          (%2, %%"REG_a"), %%mm0       \n\t" // U(0)
1378
            "movq                    %%mm0, %%mm2       \n\t" // U(0)
1379
            "movq          (%3, %%"REG_a"), %%mm1       \n\t" // V(0)
1380
            "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
1381
            "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)
1382
1383
            "movq        (%1, %%"REG_a",2), %%mm3       \n\t" // Y(0)
1384
            "movq       8(%1, %%"REG_a",2), %%mm5       \n\t" // Y(8)
1385
            "movq                    %%mm3, %%mm4       \n\t" // Y(0)
1386
            "movq                    %%mm5, %%mm6       \n\t" // Y(8)
1387
            "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
1388
            "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
1389
            "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
1390
            "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)
1391
1392
            MOVNTQ"                  %%mm3,   (%0, %%"REG_a", 4)    \n\t"
1393
            MOVNTQ"                  %%mm4,  8(%0, %%"REG_a", 4)    \n\t"
1394
            MOVNTQ"                  %%mm5, 16(%0, %%"REG_a", 4)    \n\t"
1395
            MOVNTQ"                  %%mm6, 24(%0, %%"REG_a", 4)    \n\t"
1396
1397
            "add                        $8, %%"REG_a"   \n\t"
1398
            "cmp                        %4, %%"REG_a"   \n\t"
1399
            " jb                        1b              \n\t"
1400
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1401
            : "%"REG_a
1402 6e42e6c4 Diego Biurrun
        );
1403 4060205b Michael Niedermayer
#else
1404 b3b8bf64 Michael Niedermayer
1405 b63f641e Aurelien Jacobs
#if ARCH_ALPHA && HAVE_MVI
1406 6e42e6c4 Diego Biurrun
#define pl2yuy2(n)                  \
1407
    y1 = yc[n];                     \
1408
    y2 = yc2[n];                    \
1409
    u = uc[n];                      \
1410
    v = vc[n];                      \
1411 7ad6469e Diego Pettenò
    __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1));  \
1412
    __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2));  \
1413
    __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u));    \
1414
    __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v));    \
1415 6e42e6c4 Diego Biurrun
    yuv1 = (u << 8) + (v << 24);                \
1416
    yuv2 = yuv1 + y2;               \
1417
    yuv1 += y1;                     \
1418
    qdst[n]  = yuv1;                \
1419
    qdst2[n] = yuv2;
1420
1421
        int i;
1422
        uint64_t *qdst = (uint64_t *) dst;
1423
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1424
        const uint32_t *yc = (uint32_t *) ysrc;
1425
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1426
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1427 dd68318c Ramiro Polla
        for (i = 0; i < chromWidth; i += 8) {
1428 6e42e6c4 Diego Biurrun
            uint64_t y1, y2, yuv1, yuv2;
1429
            uint64_t u, v;
1430
            /* Prefetch */
1431 7ad6469e Diego Pettenò
            __asm__("ldq $31,64(%0)" :: "r"(yc));
1432
            __asm__("ldq $31,64(%0)" :: "r"(yc2));
1433
            __asm__("ldq $31,64(%0)" :: "r"(uc));
1434
            __asm__("ldq $31,64(%0)" :: "r"(vc));
1435 6e42e6c4 Diego Biurrun
1436
            pl2yuy2(0);
1437
            pl2yuy2(1);
1438
            pl2yuy2(2);
1439
            pl2yuy2(3);
1440
1441
            yc    += 4;
1442
            yc2   += 4;
1443
            uc    += 4;
1444
            vc    += 4;
1445
            qdst  += 4;
1446
            qdst2 += 4;
1447
        }
1448
        y++;
1449
        ysrc += lumStride;
1450
        dst += dstStride;
1451 b3b8bf64 Michael Niedermayer
1452 02a6a6ee Diego Biurrun
#elif HAVE_FAST_64BIT
1453 6e42e6c4 Diego Biurrun
        int i;
1454
        uint64_t *ldst = (uint64_t *) dst;
1455
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1456 dd68318c Ramiro Polla
        for (i = 0; i < chromWidth; i += 2) {
1457 6e42e6c4 Diego Biurrun
            uint64_t k, l;
1458
            k = yc[0] + (uc[0] << 8) +
1459
                (yc[1] << 16) + (vc[0] << 24);
1460
            l = yc[2] + (uc[1] << 8) +
1461
                (yc[3] << 16) + (vc[1] << 24);
1462
            *ldst++ = k + (l << 32);
1463
            yc += 4;
1464
            uc += 2;
1465
            vc += 2;
1466
        }
1467 0d9f3d85 Arpi
1468
#else
1469 6e42e6c4 Diego Biurrun
        int i, *idst = (int32_t *) dst;
1470
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1471 dd68318c Ramiro Polla
        for (i = 0; i < chromWidth; i++) {
1472 a898cdc9 Måns Rullgård
#if HAVE_BIGENDIAN
1473 6e42e6c4 Diego Biurrun
            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1474
                (yc[1] << 8) + (vc[0] << 0);
1475 da7f8893 Michael Niedermayer
#else
1476 6e42e6c4 Diego Biurrun
            *idst++ = yc[0] + (uc[0] << 8) +
1477
                (yc[1] << 16) + (vc[0] << 24);
1478
#endif
1479
            yc += 2;
1480
            uc++;
1481
            vc++;
1482
        }
1483
#endif
1484
#endif
1485 dd68318c Ramiro Polla
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1486 6e42e6c4 Diego Biurrun
            usrc += chromStride;
1487
            vsrc += chromStride;
1488
        }
1489
        ysrc += lumStride;
1490
        dst  += dstStride;
1491
    }
1492 c12f7b2d Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1493 9b734d44 Ramiro Polla
    __asm__(EMMS"       \n\t"
1494
            SFENCE"     \n\t"
1495
            :::"memory");
1496 4060205b Michael Niedermayer
#endif
1497 d9d58d17 Michael Niedermayer
}
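/*
 * Illustrative scalar sketch of the packing performed above: two luma
 * samples share one U and one V sample, giving the byte sequence Y0 U Y1 V
 * (the generic fallback produces this order on either endianness). The
 * vertLumPerChroma argument only controls how many luma lines reuse the
 * same chroma line: 2 for 4:2:0 input, 1 for 4:2:2 input, which is all the
 * wrappers below change. Not used by the converters.
 */
static inline void RENAME(pack_yuy2_line_sketch)(const uint8_t *y, const uint8_t *u,
                                                 const uint8_t *v, uint8_t *dst,
                                                 long chromWidth)
{
    long i;
    for (i = 0; i < chromWidth; i++) {
        dst[4*i + 0] = y[2*i + 0];
        dst[4*i + 1] = u[i];
        dst[4*i + 2] = y[2*i + 1];
        dst[4*i + 3] = v[i];
    }
}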
1498
1499 dabcdbc4 Michael Niedermayer
/**
1500 8a322796 Diego Biurrun
 * Height should be a multiple of 2 and width should be a multiple of 16.
1501
 * (If this is a problem for anyone then tell me, and I will fix it.)
1502 dabcdbc4 Michael Niedermayer
 */
1503 b1ec5875 Michael Niedermayer
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1504 6e42e6c4 Diego Biurrun
                                      long width, long height,
1505
                                      long lumStride, long chromStride, long dstStride)
1506 b1ec5875 Michael Niedermayer
{
1507 6e42e6c4 Diego Biurrun
    //FIXME interpolate chroma
1508
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1509 caeaabe7 Alex Beregszaszi
}
1510
1511
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1512 6e42e6c4 Diego Biurrun
                                           long width, long height,
1513
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1514 caeaabe7 Alex Beregszaszi
{
1515 6e42e6c4 Diego Biurrun
    long y;
1516 9326d3f3 Michael Niedermayer
    const x86_reg chromWidth= width>>1;
1517 dd68318c Ramiro Polla
    for (y=0; y<height; y++) {
1518 c12f7b2d Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1519 7d73d1c3 Ramiro Polla
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1520 7ad6469e Diego Pettenò
        __asm__ volatile(
1521 9b734d44 Ramiro Polla
            "xor                %%"REG_a", %%"REG_a"    \n\t"
1522 ef4a6514 Mans Rullgard
            ".p2align                   4               \n\t"
1523 9b734d44 Ramiro Polla
            "1:                                         \n\t"
1524
            PREFETCH"   32(%1, %%"REG_a", 2)            \n\t"
1525
            PREFETCH"   32(%2, %%"REG_a")               \n\t"
1526
            PREFETCH"   32(%3, %%"REG_a")               \n\t"
1527
            "movq         (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
1528
            "movq                   %%mm0, %%mm2        \n\t" // U(0)
1529
            "movq         (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
1530
            "punpcklbw              %%mm1, %%mm0        \n\t" // UVUV UVUV(0)
1531
            "punpckhbw              %%mm1, %%mm2        \n\t" // UVUV UVUV(8)
1532
1533
            "movq       (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
1534
            "movq      8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
1535
            "movq                   %%mm0, %%mm4        \n\t" // Y(0)
1536
            "movq                   %%mm2, %%mm6        \n\t" // Y(8)
1537
            "punpcklbw              %%mm3, %%mm0        \n\t" // YUYV YUYV(0)
1538
            "punpckhbw              %%mm3, %%mm4        \n\t" // YUYV YUYV(4)
1539
            "punpcklbw              %%mm5, %%mm2        \n\t" // YUYV YUYV(8)
1540
            "punpckhbw              %%mm5, %%mm6        \n\t" // YUYV YUYV(12)
1541
1542
            MOVNTQ"                 %%mm0,   (%0, %%"REG_a", 4)     \n\t"
1543
            MOVNTQ"                 %%mm4,  8(%0, %%"REG_a", 4)     \n\t"
1544
            MOVNTQ"                 %%mm2, 16(%0, %%"REG_a", 4)     \n\t"
1545
            MOVNTQ"                 %%mm6, 24(%0, %%"REG_a", 4)     \n\t"
1546
1547
            "add                       $8, %%"REG_a"    \n\t"
1548
            "cmp                       %4, %%"REG_a"    \n\t"
1549
            " jb                       1b               \n\t"
1550
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1551
            : "%"REG_a
1552 6e42e6c4 Diego Biurrun
        );
1553 7ac25f2d Michael Niedermayer
#else
1554 594ff7cc Diego Biurrun
//FIXME adapt the Alpha ASM code from yv12->yuy2
1555 7ac25f2d Michael Niedermayer
1556 02a6a6ee Diego Biurrun
#if HAVE_FAST_64BIT
1557 6e42e6c4 Diego Biurrun
        int i;
1558
        uint64_t *ldst = (uint64_t *) dst;
1559
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1560 dd68318c Ramiro Polla
        for (i = 0; i < chromWidth; i += 2) {
1561 6e42e6c4 Diego Biurrun
            uint64_t k, l;
1562
            k = uc[0] + (yc[0] << 8) +
1563
                (vc[0] << 16) + (yc[1] << 24);
1564
            l = uc[1] + (yc[2] << 8) +
1565
                (vc[1] << 16) + (yc[3] << 24);
1566
            *ldst++ = k + (l << 32);
1567
            yc += 4;
1568
            uc += 2;
1569
            vc += 2;
1570
        }
1571 caeaabe7 Alex Beregszaszi
1572
#else
1573 6e42e6c4 Diego Biurrun
        int i, *idst = (int32_t *) dst;
1574
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1575 dd68318c Ramiro Polla
        for (i = 0; i < chromWidth; i++) {
1576 a898cdc9 Måns Rullgård
#if HAVE_BIGENDIAN
1577 6e42e6c4 Diego Biurrun
            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1578
                (vc[0] << 8) + (yc[1] << 0);
1579 da7f8893 Michael Niedermayer
#else
1580 6e42e6c4 Diego Biurrun
            *idst++ = uc[0] + (yc[0] << 8) +
1581 8a322796 Diego Biurrun
               (vc[0] << 16) + (yc[1] << 24);
1582 6e42e6c4 Diego Biurrun
#endif
1583
            yc += 2;
1584
            uc++;
1585
            vc++;
1586
        }
1587
#endif
1588
#endif
1589 dd68318c Ramiro Polla
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1590 6e42e6c4 Diego Biurrun
            usrc += chromStride;
1591
            vsrc += chromStride;
1592
        }
1593
        ysrc += lumStride;
1594
        dst += dstStride;
1595
    }
1596 c12f7b2d Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1597 9b734d44 Ramiro Polla
    __asm__(EMMS"       \n\t"
1598
            SFENCE"     \n\t"
1599
            :::"memory");
1600 7ac25f2d Michael Niedermayer
#endif
1601 caeaabe7 Alex Beregszaszi
}
1602
1603
/**
1604 8a322796 Diego Biurrun
 * Height should be a multiple of 2 and width should be a multiple of 16
1605
 * (If this is a problem for anyone then tell me, and I will fix it.)
1606 caeaabe7 Alex Beregszaszi
 */
1607
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1608 6e42e6c4 Diego Biurrun
                                      long width, long height,
1609
                                      long lumStride, long chromStride, long dstStride)
1610 caeaabe7 Alex Beregszaszi
{
1611 6e42e6c4 Diego Biurrun
    //FIXME interpolate chroma
1612
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1613 b1ec5875 Michael Niedermayer
}
1614
1615
/**
1616 594ff7cc Diego Biurrun
 * Width should be a multiple of 16.
1617 b1ec5875 Michael Niedermayer
 */
1618 a6100f39 Baptiste Coudurier
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1619
                                         long width, long height,
1620
                                         long lumStride, long chromStride, long dstStride)
1621
{
1622
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1623
}
1624
1625
/**
1626
 * Width should be a multiple of 16.
1627
 */
1628 b1ec5875 Michael Niedermayer
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1629 6e42e6c4 Diego Biurrun
                                         long width, long height,
1630
                                         long lumStride, long chromStride, long dstStride)
1631 b1ec5875 Michael Niedermayer
{
1632 6e42e6c4 Diego Biurrun
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1633 b1ec5875 Michael Niedermayer
}
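/*
 * Illustrative call of the packers above (hypothetical buffer names and
 * strides, not part of the library): a 4:2:2 planar frame with a luma
 * stride of width and a chroma stride of width/2 packs into YUY2 with a
 * stride of 2*width. For 4:2:0 input, RENAME(yv12toyuy2)() takes the same
 * arguments and merely advances the chroma planes every second luma line.
 * Width is assumed to be a multiple of 16, as the comments above require.
 */
static inline void RENAME(yuv422p_to_yuy2_usage_sketch)(const uint8_t *y, const uint8_t *u,
                                                        const uint8_t *v, uint8_t *yuy2,
                                                        long width, long height)
{
    RENAME(yuv422ptoyuy2)(y, u, v, yuy2, width, height, width, width/2, 2*width);
}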
1634
1635
/**
1636 8a322796 Diego Biurrun
 * Height should be a multiple of 2 and width should be a multiple of 16.
1637
 * (If this is a problem for anyone then tell me, and I will fix it.)
1638 b1ec5875 Michael Niedermayer
 */
1639 1de97d84 Michael Niedermayer
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1640 6e42e6c4 Diego Biurrun
                                      long width, long height,
1641
                                      long lumStride, long chromStride, long srcStride)
1642 d9d58d17 Michael Niedermayer
{
1643 6e42e6c4 Diego Biurrun
    long y;
1644 9326d3f3 Michael Niedermayer
    const x86_reg chromWidth= width>>1;
1645 dd68318c Ramiro Polla
    for (y=0; y<height; y+=2) {
1646 c12f7b2d Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1647 7ad6469e Diego Pettenò
        __asm__ volatile(
1648 9b734d44 Ramiro Polla
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
1649
            "pcmpeqw                 %%mm7, %%mm7       \n\t"
1650
            "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00...
1651 ef4a6514 Mans Rullgard
            ".p2align                    4              \n\t"
1652 9b734d44 Ramiro Polla
            "1:                \n\t"
1653
            PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
1654
            "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
1655
            "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
1656
            "movq                    %%mm0, %%mm2       \n\t" // YUYV YUYV(0)
1657
            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(4)
1658
            "psrlw                      $8, %%mm0       \n\t" // U0V0 U0V0(0)
1659
            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(4)
1660
            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
1661
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
1662
            "packuswb                %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
1663
            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(0)
1664
1665
            MOVNTQ"                  %%mm2, (%1, %%"REG_a", 2)  \n\t"
1666
1667
            "movq     16(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
1668
            "movq     24(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
1669
            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(8)
1670
            "movq                    %%mm2, %%mm4       \n\t" // YUYV YUYV(12)
1671
            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(8)
1672
            "psrlw                      $8, %%mm2       \n\t" // U0V0 U0V0(12)
1673
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(8)
1674
            "pand                    %%mm7, %%mm4       \n\t" // Y0Y0 Y0Y0(12)
1675
            "packuswb                %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
1676
            "packuswb                %%mm4, %%mm3       \n\t" // YYYY YYYY(8)
1677
1678
            MOVNTQ"                  %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1679
1680
            "movq                    %%mm0, %%mm2       \n\t" // UVUV UVUV(0)
1681
            "movq                    %%mm1, %%mm3       \n\t" // UVUV UVUV(8)
1682
            "psrlw                      $8, %%mm0       \n\t" // V0V0 V0V0(0)
1683
            "psrlw                      $8, %%mm1       \n\t" // V0V0 V0V0(8)
1684
            "pand                    %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
1685
            "pand                    %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
1686
            "packuswb                %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
1687
            "packuswb                %%mm3, %%mm2       \n\t" // UUUU UUUU(0)
1688
1689
            MOVNTQ"                  %%mm0, (%3, %%"REG_a")     \n\t"
1690
            MOVNTQ"                  %%mm2, (%2, %%"REG_a")     \n\t"
1691
1692
            "add                        $8, %%"REG_a"   \n\t"
1693
            "cmp                        %4, %%"REG_a"   \n\t"
1694
            " jb                        1b              \n\t"
1695
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1696
            : "memory", "%"REG_a
1697 6e42e6c4 Diego Biurrun
        );
1698
1699
        ydst += lumStride;
1700
        src  += srcStride;
1701
1702 7ad6469e Diego Pettenò
        __asm__ volatile(
1703 9b734d44 Ramiro Polla
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
1704 ef4a6514 Mans Rullgard
            ".p2align                    4              \n\t"
1705 9b734d44 Ramiro Polla
            "1:                                         \n\t"
1706
            PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
1707
            "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
1708
            "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
1709
            "movq     16(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
1710
            "movq     24(%0, %%"REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
1711
            "pand                    %%mm7, %%mm0       \n\t" // Y0Y0 Y0Y0(0)
1712
            "pand                    %%mm7, %%mm1       \n\t" // Y0Y0 Y0Y0(4)
1713
            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(8)
1714
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(12)
1715
            "packuswb                %%mm1, %%mm0       \n\t" // YYYY YYYY(0)
1716
            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(8)
1717
1718
            MOVNTQ"                  %%mm0,  (%1, %%"REG_a", 2) \n\t"
1719
            MOVNTQ"                  %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1720
1721
            "add                        $8, %%"REG_a"   \n\t"
1722
            "cmp                        %4, %%"REG_a"   \n\t"
1723
            " jb                        1b              \n\t"
1724
1725
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1726
            : "memory", "%"REG_a
1727 6e42e6c4 Diego Biurrun
        );
1728 bd09433f Michael Niedermayer
#else
1729 6e42e6c4 Diego Biurrun
        long i;
1730 dd68318c Ramiro Polla
        for (i=0; i<chromWidth; i++) {
1731 6e42e6c4 Diego Biurrun
            ydst[2*i+0] = src[4*i+0];
1732
            udst[i]     = src[4*i+1];
1733
            ydst[2*i+1] = src[4*i+2];
1734
            vdst[i]     = src[4*i+3];
1735
        }
1736
        ydst += lumStride;
1737
        src  += srcStride;
1738
1739 dd68318c Ramiro Polla
        for (i=0; i<chromWidth; i++) {
1740 6e42e6c4 Diego Biurrun
            ydst[2*i+0] = src[4*i+0];
1741
            ydst[2*i+1] = src[4*i+2];
1742
        }
1743
#endif
1744
        udst += chromStride;
1745
        vdst += chromStride;
1746
        ydst += lumStride;
1747
        src  += srcStride;
1748
    }
1749 c12f7b2d Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1750 9b734d44 Ramiro Polla
    __asm__ volatile(EMMS"       \n\t"
1751
                     SFENCE"     \n\t"
1752
                     :::"memory");
1753 bd09433f Michael Niedermayer
#endif
1754 42b5fcb8 Michael Niedermayer
}
1755 81c0590e Arpi
1756 7f526efd Reimar Döffinger
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1757 b241cbf2 Michael Niedermayer
{
1758 6e42e6c4 Diego Biurrun
    long x,y;
1759 6a4970ab Diego Biurrun
1760 6e42e6c4 Diego Biurrun
    dst[0]= src[0];
1761 6a4970ab Diego Biurrun
1762 6e42e6c4 Diego Biurrun
    // first line
1763 dd68318c Ramiro Polla
    for (x=0; x<srcWidth-1; x++) {
1764 6e42e6c4 Diego Biurrun
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1765
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1766
    }
1767
    dst[2*srcWidth-1]= src[srcWidth-1];
1768 6a4970ab Diego Biurrun
1769 9b734d44 Ramiro Polla
    dst+= dstStride;
1770 b241cbf2 Michael Niedermayer
1771 dd68318c Ramiro Polla
    for (y=1; y<srcHeight; y++) {
1772 c12f7b2d Ramiro Polla
#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
1773 d0ce212a Ramiro Polla
        const x86_reg mmxSize= srcWidth&~15;
1774 7ad6469e Diego Pettenò
        __asm__ volatile(
1775 9b734d44 Ramiro Polla
            "mov           %4, %%"REG_a"            \n\t"
1776 6527e561 Ramiro Polla
            "movq        "MANGLE(mmx_ff)", %%mm0    \n\t"
1777
            "movq         (%0, %%"REG_a"), %%mm4    \n\t"
1778
            "movq                   %%mm4, %%mm2    \n\t"
1779
            "psllq                     $8, %%mm4    \n\t"
1780
            "pand                   %%mm0, %%mm2    \n\t"
1781
            "por                    %%mm2, %%mm4    \n\t"
1782
            "movq         (%1, %%"REG_a"), %%mm5    \n\t"
1783
            "movq                   %%mm5, %%mm3    \n\t"
1784
            "psllq                     $8, %%mm5    \n\t"
1785
            "pand                   %%mm0, %%mm3    \n\t"
1786
            "por                    %%mm3, %%mm5    \n\t"
1787 9b734d44 Ramiro Polla
            "1:                                     \n\t"
1788
            "movq         (%0, %%"REG_a"), %%mm0    \n\t"
1789
            "movq         (%1, %%"REG_a"), %%mm1    \n\t"
1790
            "movq        1(%0, %%"REG_a"), %%mm2    \n\t"
1791
            "movq        1(%1, %%"REG_a"), %%mm3    \n\t"
1792
            PAVGB"                  %%mm0, %%mm5    \n\t"
1793
            PAVGB"                  %%mm0, %%mm3    \n\t"
1794
            PAVGB"                  %%mm0, %%mm5    \n\t"
1795
            PAVGB"                  %%mm0, %%mm3    \n\t"
1796
            PAVGB"                  %%mm1, %%mm4    \n\t"
1797
            PAVGB"                  %%mm1, %%mm2    \n\t"
1798
            PAVGB"                  %%mm1, %%mm4    \n\t"
1799
            PAVGB"                  %%mm1, %%mm2    \n\t"
1800
            "movq                   %%mm5, %%mm7    \n\t"
1801
            "movq                   %%mm4, %%mm6    \n\t"
1802
            "punpcklbw              %%mm3, %%mm5    \n\t"
1803
            "punpckhbw              %%mm3, %%mm7    \n\t"
1804
            "punpcklbw              %%mm2, %%mm4    \n\t"
1805
            "punpckhbw              %%mm2, %%mm6    \n\t"
1806 b241cbf2 Michael Niedermayer
#if 1
1807 9b734d44 Ramiro Polla
            MOVNTQ"                 %%mm5,  (%2, %%"REG_a", 2)  \n\t"
1808
            MOVNTQ"                 %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
1809
            MOVNTQ"                 %%mm4,  (%3, %%"REG_a", 2)  \n\t"
1810
            MOVNTQ"                 %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
1811 b241cbf2 Michael Niedermayer
#else
1812 9b734d44 Ramiro Polla
            "movq                   %%mm5,  (%2, %%"REG_a", 2)  \n\t"
1813
            "movq                   %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
1814
            "movq                   %%mm4,  (%3, %%"REG_a", 2)  \n\t"
1815
            "movq                   %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
1816
#endif
1817
            "add                       $8, %%"REG_a"            \n\t"
1818 6527e561 Ramiro Polla
            "movq       -1(%0, %%"REG_a"), %%mm4    \n\t"
1819
            "movq       -1(%1, %%"REG_a"), %%mm5    \n\t"
1820 9b734d44 Ramiro Polla
            " js                       1b                       \n\t"
1821
            :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1822 40b433b6 Ramiro Polla
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1823
               "g" (-mmxSize)
1824 9b734d44 Ramiro Polla
            : "%"REG_a
1825 6e42e6c4 Diego Biurrun
        );
1826 b241cbf2 Michael Niedermayer
#else
1827 9326d3f3 Michael Niedermayer
        const x86_reg mmxSize=1;
1828 40b433b6 Ramiro Polla
1829 6e42e6c4 Diego Biurrun
        dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1830
        dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1831 6527e561 Ramiro Polla
#endif
1832 6e42e6c4 Diego Biurrun
1833 dd68318c Ramiro Polla
        for (x=mmxSize-1; x<srcWidth-1; x++) {
1834 6e42e6c4 Diego Biurrun
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1835
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1836
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1837
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1838
        }
1839
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1840
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1841
1842
        dst+=dstStride*2;
1843
        src+=srcStride;
1844
    }
1845 6a4970ab Diego Biurrun
1846 6e42e6c4 Diego Biurrun
    // last line
1847 b2609d4c Michael Niedermayer
#if 1
1848 6e42e6c4 Diego Biurrun
    dst[0]= src[0];
1849 6a4970ab Diego Biurrun
1850 dd68318c Ramiro Polla
    for (x=0; x<srcWidth-1; x++) {
1851 6e42e6c4 Diego Biurrun
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1852
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1853
    }
1854
    dst[2*srcWidth-1]= src[srcWidth-1];
1855 b2609d4c Michael Niedermayer
#else
1856 dd68318c Ramiro Polla
    for (x=0; x<srcWidth; x++) {
1857 6e42e6c4 Diego Biurrun
        dst[2*x+0]=
1858
        dst[2*x+1]= src[x];
1859
    }
1860 b2609d4c Michael Niedermayer
#endif
1861
1862 c12f7b2d Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1863 9b734d44 Ramiro Polla
    __asm__ volatile(EMMS"       \n\t"
1864
                     SFENCE"     \n\t"
1865
                     :::"memory");
1866 b241cbf2 Michael Niedermayer
#endif
1867
}
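/*
 * Illustrative 1-D sketch of the 3:1 bilinear weighting used by planar2x()
 * (the same code as the first/last line handling above; the interior lines
 * additionally blend the next source line with the same weights).
 * Example: src[x]=100, src[x+1]=20 yields (3*100+20)>>2 = 80 and
 * (100+3*20)>>2 = 40 for the two samples written between them.
 * Not used by the converters.
 */
static inline void RENAME(upsample2x_line_sketch)(const uint8_t *src, uint8_t *dst,
                                                  long srcWidth)
{
    long x;
    dst[0] = src[0];
    for (x = 0; x < srcWidth - 1; x++) {
        dst[2*x + 1] = (3*src[x] +   src[x+1]) >> 2;
        dst[2*x + 2] = (  src[x] + 3*src[x+1]) >> 2;
    }
    dst[2*srcWidth - 1] = src[srcWidth - 1];
}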
1868
1869 81c0590e Arpi
/**
1870 8a322796 Diego Biurrun
 * Height should be a multiple of 2 and width should be a multiple of 16.
1871
 * (If this is a problem for anyone then tell me, and I will fix it.)
1872
 * Chrominance data is only taken from every second line, others are ignored.
1873 594ff7cc Diego Biurrun
 * FIXME: Write HQ version.
1874 81c0590e Arpi
 */
1875 1de97d84 Michael Niedermayer
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1876 6e42e6c4 Diego Biurrun
                                      long width, long height,
1877
                                      long lumStride, long chromStride, long srcStride)
1878 81c0590e Arpi
{
1879 6e42e6c4 Diego Biurrun
    long y;
1880 9326d3f3 Michael Niedermayer
    const x86_reg chromWidth= width>>1;
1881 dd68318c Ramiro Polla
    for (y=0; y<height; y+=2) {
1882 c12f7b2d Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1883 7ad6469e Diego Pettenò
        __asm__ volatile(
1884 9b734d44 Ramiro Polla
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
1885
            "pcmpeqw             %%mm7, %%mm7   \n\t"
1886
            "psrlw                  $8, %%mm7   \n\t" // FF,00,FF,00...
1887 ef4a6514 Mans Rullgard
            ".p2align                4          \n\t"
1888 9b734d44 Ramiro Polla
            "1:                                 \n\t"
1889
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
1890
            "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
1891
            "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
1892
            "movq                %%mm0, %%mm2   \n\t" // UYVY UYVY(0)
1893
            "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(4)
1894
            "pand                %%mm7, %%mm0   \n\t" // U0V0 U0V0(0)
1895
            "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(4)
1896
            "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(0)
1897
            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(4)
1898
            "packuswb            %%mm1, %%mm0   \n\t" // UVUV UVUV(0)
1899
            "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(0)
1900
1901
            MOVNTQ"              %%mm2,  (%1, %%"REG_a", 2) \n\t"
1902
1903
            "movq     16(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(8)
1904
            "movq     24(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(12)
1905
            "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(8)
1906
            "movq                %%mm2, %%mm4   \n\t" // UYVY UYVY(12)
1907
            "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(8)
1908
            "pand                %%mm7, %%mm2   \n\t" // U0V0 U0V0(12)
1909
            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(8)
1910
            "psrlw                  $8, %%mm4   \n\t" // Y0Y0 Y0Y0(12)
1911
            "packuswb            %%mm2, %%mm1   \n\t" // UVUV UVUV(8)
1912
            "packuswb            %%mm4, %%mm3   \n\t" // YYYY YYYY(8)
1913
1914
            MOVNTQ"              %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1915
1916
            "movq                %%mm0, %%mm2   \n\t" // UVUV UVUV(0)
1917
            "movq                %%mm1, %%mm3   \n\t" // UVUV UVUV(8)
1918
            "psrlw                  $8, %%mm0   \n\t" // V0V0 V0V0(0)
1919
            "psrlw                  $8, %%mm1   \n\t" // V0V0 V0V0(8)
1920
            "pand                %%mm7, %%mm2   \n\t" // U0U0 U0U0(0)
1921
            "pand                %%mm7, %%mm3   \n\t" // U0U0 U0U0(8)
1922
            "packuswb            %%mm1, %%mm0   \n\t" // VVVV VVVV(0)
1923
            "packuswb            %%mm3, %%mm2   \n\t" // UUUU UUUU(0)
1924
1925
            MOVNTQ"              %%mm0, (%3, %%"REG_a") \n\t"
1926
            MOVNTQ"              %%mm2, (%2, %%"REG_a") \n\t"
1927
1928
            "add                    $8, %%"REG_a"   \n\t"
1929
            "cmp                    %4, %%"REG_a"   \n\t"
1930
            " jb                    1b          \n\t"
1931
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1932
            : "memory", "%"REG_a
1933 6e42e6c4 Diego Biurrun
        );
1934
1935
        ydst += lumStride;
1936
        src  += srcStride;
1937
1938 7ad6469e Diego Pettenò
        __asm__ volatile(
1939 9b734d44 Ramiro Polla
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
1940 ef4a6514 Mans Rullgard
            ".p2align                    4          \n\t"
1941 9b734d44 Ramiro Polla
            "1:                                 \n\t"
1942
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
1943
            "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // YUYV YUYV(0)
1944
            "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // YUYV YUYV(4)
1945
            "movq     16(%0, %%"REG_a", 4), %%mm2   \n\t" // YUYV YUYV(8)
1946
            "movq     24(%0, %%"REG_a", 4), %%mm3   \n\t" // YUYV YUYV(12)
1947
            "psrlw                  $8, %%mm0   \n\t" // Y0Y0 Y0Y0(0)
1948
            "psrlw                  $8, %%mm1   \n\t" // Y0Y0 Y0Y0(4)
1949
            "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(8)
1950
            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(12)
1951
            "packuswb            %%mm1, %%mm0   \n\t" // YYYY YYYY(0)
1952
            "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(8)
1953
1954
            MOVNTQ"              %%mm0,  (%1, %%"REG_a", 2) \n\t"
1955
            MOVNTQ"              %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1956
1957
            "add                    $8, %%"REG_a"   \n\t"
1958
            "cmp                    %4, %%"REG_a"   \n\t"
1959
            " jb                    1b          \n\t"
1960
1961
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1962
            : "memory", "%"REG_a
1963 6e42e6c4 Diego Biurrun
        );
1964 ed8c0670 Michael Niedermayer
#else
1965 6e42e6c4 Diego Biurrun
        long i;
1966 dd68318c Ramiro Polla
        for (i=0; i<chromWidth; i++) {
1967 6e42e6c4 Diego Biurrun
            udst[i]     = src[4*i+0];
1968
            ydst[2*i+0] = src[4*i+1];
1969
            vdst[i]     = src[4*i+2];
1970
            ydst[2*i+1] = src[4*i+3];
1971
        }
1972
        ydst += lumStride;
1973
        src  += srcStride;
1974
1975 dd68318c Ramiro Polla
        for (i=0; i<chromWidth; i++) {
1976 6e42e6c4 Diego Biurrun
            ydst[2*i+0] = src[4*i+1];
1977
            ydst[2*i+1] = src[4*i+3];
1978
        }
1979
#endif
1980
        udst += chromStride;
1981
        vdst += chromStride;
1982
        ydst += lumStride;
1983
        src  += srcStride;
1984
    }
1985 c12f7b2d Ramiro Polla
#if COMPILE_TEMPLATE_MMX
1986 9b734d44 Ramiro Polla
    __asm__ volatile(EMMS"       \n\t"
1987
                     SFENCE"     \n\t"
1988
                     :::"memory");
1989 ed8c0670 Michael Niedermayer
#endif
1990 81c0590e Arpi
}
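/*
 * The FIXME above asks for an HQ variant: instead of discarding the chroma
 * of every second UYVY line, it could be averaged with the line that is
 * kept. A minimal scalar sketch of that idea follows (illustrative only,
 * with rounding and naming chosen here, not part of this file's converters).
 */
static inline void RENAME(uyvy_chroma_avg_sketch)(const uint8_t *line0, const uint8_t *line1,
                                                  uint8_t *udst, uint8_t *vdst,
                                                  long chromWidth)
{
    long i;
    for (i = 0; i < chromWidth; i++) {
        udst[i] = (line0[4*i + 0] + line1[4*i + 0] + 1) >> 1;   // U from both lines
        vdst[i] = (line0[4*i + 2] + line1[4*i + 2] + 1) >> 1;   // V from both lines
    }
}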
1991
1992 1de97d84 Michael Niedermayer
/**
1993 8a322796 Diego Biurrun
 * Height should be a multiple of 2 and width should be a multiple of 2.
1994
 * (If this is a problem for anyone then tell me, and I will fix it.)
1995
 * Chrominance data is only taken from every second line,
1996 594ff7cc Diego Biurrun
 * others are ignored in the C version.
1997
 * FIXME: Write HQ version.
1998 1de97d84 Michael Niedermayer
 */
1999
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2000 6e42e6c4 Diego Biurrun
                                       long width, long height,
2001
                                       long lumStride, long chromStride, long srcStride)
2002 1de97d84 Michael Niedermayer
{
2003 6e42e6c4 Diego Biurrun
    long y;
2004 9326d3f3 Michael Niedermayer
    const x86_reg chromWidth= width>>1;
2005 c12f7b2d Ramiro Polla
#if COMPILE_TEMPLATE_MMX
2006 dd68318c Ramiro Polla
    for (y=0; y<height-2; y+=2) {
2007 6e42e6c4 Diego Biurrun
        long i;
2008 dd68318c Ramiro Polla
        for (i=0; i<2; i++) {
2009 7ad6469e Diego Pettenò
            __asm__ volatile(
2010 9b734d44 Ramiro Polla
                "mov                        %2, %%"REG_a"   \n\t"
2011
                "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
2012
                "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2013
                "pxor                    %%mm7, %%mm7       \n\t"
2014
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2015 ef4a6514 Mans Rullgard
                ".p2align                    4              \n\t"
2016 9b734d44 Ramiro Polla
                "1:                                         \n\t"
2017
                PREFETCH"    64(%0, %%"REG_d")              \n\t"
2018
                "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2019
                "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
2020
                "punpcklbw               %%mm7, %%mm0       \n\t"
2021
                "punpcklbw               %%mm7, %%mm1       \n\t"
2022
                "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
2023
                "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
2024
                "punpcklbw               %%mm7, %%mm2       \n\t"
2025
                "punpcklbw               %%mm7, %%mm3       \n\t"
2026
                "pmaddwd                 %%mm6, %%mm0       \n\t"
2027
                "pmaddwd                 %%mm6, %%mm1       \n\t"
2028
                "pmaddwd                 %%mm6, %%mm2       \n\t"
2029
                "pmaddwd                 %%mm6, %%mm3       \n\t"
2030
#ifndef FAST_BGR2YV12
2031
                "psrad                      $8, %%mm0       \n\t"
2032
                "psrad                      $8, %%mm1       \n\t"
2033
                "psrad                      $8, %%mm2       \n\t"
2034
                "psrad                      $8, %%mm3       \n\t"
2035
#endif
2036
                "packssdw                %%mm1, %%mm0       \n\t"
2037
                "packssdw                %%mm3, %%mm2       \n\t"
2038
                "pmaddwd                 %%mm5, %%mm0       \n\t"
2039
                "pmaddwd                 %%mm5, %%mm2       \n\t"
2040
                "packssdw                %%mm2, %%mm0       \n\t"
2041
                "psraw                      $7, %%mm0       \n\t"
2042
2043
                "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2044
                "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
2045
                "punpcklbw               %%mm7, %%mm4       \n\t"
2046
                "punpcklbw               %%mm7, %%mm1       \n\t"
2047
                "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
2048
                "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
2049
                "punpcklbw               %%mm7, %%mm2       \n\t"
2050
                "punpcklbw               %%mm7, %%mm3       \n\t"
2051
                "pmaddwd                 %%mm6, %%mm4       \n\t"
2052
                "pmaddwd                 %%mm6, %%mm1       \n\t"
2053
                "pmaddwd                 %%mm6, %%mm2       \n\t"
2054
                "pmaddwd                 %%mm6, %%mm3       \n\t"
2055
#ifndef FAST_BGR2YV12
2056
                "psrad                      $8, %%mm4       \n\t"
2057
                "psrad                      $8, %%mm1       \n\t"
2058
                "psrad                      $8, %%mm2       \n\t"
2059
                "psrad                      $8, %%mm3       \n\t"
2060
#endif
2061
                "packssdw                %%mm1, %%mm4       \n\t"
2062
                "packssdw                %%mm3, %%mm2       \n\t"
2063
                "pmaddwd                 %%mm5, %%mm4       \n\t"
2064
                "pmaddwd                 %%mm5, %%mm2       \n\t"
2065
                "add                       $24, %%"REG_d"   \n\t"
2066
                "packssdw                %%mm2, %%mm4       \n\t"
2067
                "psraw                      $7, %%mm4       \n\t"
2068
2069
                "packuswb                %%mm4, %%mm0       \n\t"
2070
                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
2071
2072
                MOVNTQ"                  %%mm0, (%1, %%"REG_a") \n\t"
2073
                "add                        $8,      %%"REG_a"  \n\t"
2074
                " js                        1b                  \n\t"
2075
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
2076
                : "%"REG_a, "%"REG_d
2077
            );
2078
            ydst += lumStride;
2079
            src  += srcStride;
2080
        }
2081
        src -= srcStride*2;
2082
        __asm__ volatile(
2083
            "mov                        %4, %%"REG_a"   \n\t"
2084 5802683a Reimar Döffinger
            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2085 9b734d44 Ramiro Polla
            "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
2086 6e42e6c4 Diego Biurrun
            "pxor                    %%mm7, %%mm7       \n\t"
2087
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2088 9b734d44 Ramiro Polla
            "add                 %%"REG_d", %%"REG_d"   \n\t"
2089 ef4a6514 Mans Rullgard
            ".p2align                    4              \n\t"
2090 6e42e6c4 Diego Biurrun
            "1:                                         \n\t"
2091
            PREFETCH"    64(%0, %%"REG_d")              \n\t"
2092 9b734d44 Ramiro Polla
            PREFETCH"    64(%1, %%"REG_d")              \n\t"
2093 c12f7b2d Ramiro Polla
#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
2094 9b734d44 Ramiro Polla
            "movq          (%0, %%"REG_d"), %%mm0       \n\t"
2095
            "movq          (%1, %%"REG_d"), %%mm1       \n\t"
2096
            "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
2097
            "movq         6(%1, %%"REG_d"), %%mm3       \n\t"
2098
            PAVGB"                   %%mm1, %%mm0       \n\t"
2099
            PAVGB"                   %%mm3, %%mm2       \n\t"
2100
            "movq                    %%mm0, %%mm1       \n\t"
2101
            "movq                    %%mm2, %%mm3       \n\t"
2102
            "psrlq                     $24, %%mm0       \n\t"
2103
            "psrlq                     $24, %%mm2       \n\t"
2104
            PAVGB"                   %%mm1, %%mm0       \n\t"
2105
            PAVGB"                   %%mm3, %%mm2       \n\t"
2106
            "punpcklbw               %%mm7, %%mm0       \n\t"
2107
            "punpcklbw               %%mm7, %%mm2       \n\t"
2108
#else
2109 6e42e6c4 Diego Biurrun
            "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2110 9b734d44 Ramiro Polla
            "movd          (%1, %%"REG_d"), %%mm1       \n\t"
2111
            "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
2112
            "movd         3(%1, %%"REG_d"), %%mm3       \n\t"
2113 6e42e6c4 Diego Biurrun
            "punpcklbw               %%mm7, %%mm0       \n\t"
2114
            "punpcklbw               %%mm7, %%mm1       \n\t"
2115
            "punpcklbw               %%mm7, %%mm2       \n\t"
2116
            "punpcklbw               %%mm7, %%mm3       \n\t"
2117 9b734d44 Ramiro Polla
            "paddw                   %%mm1, %%mm0       \n\t"
2118
            "paddw                   %%mm3, %%mm2       \n\t"
2119
            "paddw                   %%mm2, %%mm0       \n\t"
2120
            "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
2121
            "movd         6(%1, %%"REG_d"), %%mm1       \n\t"
2122
            "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
2123
            "movd         9(%1, %%"REG_d"), %%mm3       \n\t"
2124
            "punpcklbw               %%mm7, %%mm4       \n\t"
2125
            "punpcklbw               %%mm7, %%mm1       \n\t"
2126
            "punpcklbw               %%mm7, %%mm2       \n\t"
2127
            "punpcklbw               %%mm7, %%mm3       \n\t"
2128
            "paddw                   %%mm1, %%mm4       \n\t"
2129
            "paddw                   %%mm3, %%mm2       \n\t"
2130
            "paddw                   %%mm4, %%mm2       \n\t"
2131
            "psrlw                      $2, %%mm0       \n\t"
2132
            "psrlw                      $2, %%mm2       \n\t"
2133
#endif
2134
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
2135
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
2136
2137
            "pmaddwd                 %%mm0, %%mm1       \n\t"
2138
            "pmaddwd                 %%mm2, %%mm3       \n\t"
2139 6e42e6c4 Diego Biurrun
            "pmaddwd                 %%mm6, %%mm0       \n\t"
2140
            "pmaddwd                 %%mm6, %%mm2       \n\t"
2141 21316f3c Michael Niedermayer
#ifndef FAST_BGR2YV12
2142 6e42e6c4 Diego Biurrun
            "psrad                      $8, %%mm0       \n\t"
2143
            "psrad                      $8, %%mm1       \n\t"
2144
            "psrad                      $8, %%mm2       \n\t"
2145
            "psrad                      $8, %%mm3       \n\t"
2146
#endif
2147
            "packssdw                %%mm2, %%mm0       \n\t"
2148 9b734d44 Ramiro Polla
            "packssdw                %%mm3, %%mm1       \n\t"
2149
            "pmaddwd                 %%mm5, %%mm0       \n\t"
2150
            "pmaddwd                 %%mm5, %%mm1       \n\t"
2151
            "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
2152 6e42e6c4 Diego Biurrun
            "psraw                      $7, %%mm0       \n\t"
2153
2154 c12f7b2d Ramiro Polla
#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
2155 9b734d44 Ramiro Polla
            "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
2156
            "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
2157
            "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
2158
            "movq        18(%1, %%"REG_d"), %%mm3       \n\t"
2159
            PAVGB"                   %%mm1, %%mm4       \n\t"
2160
            PAVGB"                   %%mm3, %%mm2       \n\t"
2161
            "movq                    %%mm4, %%mm1       \n\t"
2162
            "movq                    %%mm2, %%mm3       \n\t"
2163
            "psrlq                     $24, %%mm4       \n\t"
2164
            "psrlq                     $24, %%mm2       \n\t"
2165
            PAVGB"                   %%mm1, %%mm4       \n\t"
2166
            PAVGB"                   %%mm3, %%mm2       \n\t"
2167
            "punpcklbw               %%mm7, %%mm4       \n\t"
2168
            "punpcklbw               %%mm7, %%mm2       \n\t"
2169
#else
2170 6e42e6c4 Diego Biurrun
            "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2171 9b734d44 Ramiro Polla
            "movd        12(%1, %%"REG_d"), %%mm1       \n\t"
2172
            "movd        15(%0, %%"REG_d"), %%mm2       \n\t"
2173
            "movd        15(%1, %%"REG_d"), %%mm3       \n\t"
2174 6e42e6c4 Diego Biurrun
            "punpcklbw               %%mm7, %%mm4       \n\t"
2175
            "punpcklbw               %%mm7, %%mm1       \n\t"
2176
            "punpcklbw               %%mm7, %%mm2       \n\t"
2177
            "punpcklbw               %%mm7, %%mm3       \n\t"
2178 9b734d44 Ramiro Polla
            "paddw                   %%mm1, %%mm4       \n\t"
2179
            "paddw                   %%mm3, %%mm2       \n\t"
2180
            "paddw                   %%mm2, %%mm4       \n\t"
2181
            "movd        18(%0, %%"REG_d"), %%mm5       \n\t"
2182
            "movd        18(%1, %%"REG_d"), %%mm1       \n\t"
2183
            "movd        21(%0, %%"REG_d"), %%mm2       \n\t"
2184
            "movd        21(%1, %%"REG_d"), %%mm3       \n\t"
2185
            "punpcklbw               %%mm7, %%mm5       \n\t"
2186
            "punpcklbw               %%mm7, %%mm1       \n\t"
2187
            "punpcklbw               %%mm7, %%mm2       \n\t"
2188
            "punpcklbw               %%mm7, %%mm3       \n\t"
2189
            "paddw                   %%mm1, %%mm5       \n\t"
2190
            "paddw                   %%mm3, %%mm2       \n\t"
2191
            "paddw                   %%mm5, %%mm2       \n\t"
2192
            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2193
            "psrlw                      $2, %%mm4       \n\t"
2194
            "psrlw                      $2, %%mm2       \n\t"
2195
#endif
2196
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
2197
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
2198
2199
            "pmaddwd                 %%mm4, %%mm1       \n\t"
2200
            "pmaddwd                 %%mm2, %%mm3       \n\t"
2201 6e42e6c4 Diego Biurrun
            "pmaddwd                 %%mm6, %%mm4       \n\t"
2202
            "pmaddwd                 %%mm6, %%mm2       \n\t"