Statistics
| Branch: | Revision:

ffmpeg / libswscale / rgb2rgb_template.c @ 9326d3f3

History | View | Annotate | Download (106 KB)

1
/*
2
 * software RGB to RGB converter
3
 * pluralize by software PAL8 to RGB converter
4
 *              software YUV to YUV converter
5
 *              software YUV to RGB converter
6
 * Written by Nick Kurshev.
7
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8
 * lot of big-endian byte order fixes by Alex Beregszaszi
9
 *
10
 * This file is part of FFmpeg.
11
 *
12
 * FFmpeg is free software; you can redistribute it and/or modify
13
 * it under the terms of the GNU General Public License as published by
14
 * the Free Software Foundation; either version 2 of the License, or
15
 * (at your option) any later version.
16
 *
17
 * FFmpeg is distributed in the hope that it will be useful,
18
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
 * GNU General Public License for more details.
21
 *
22
 * You should have received a copy of the GNU General Public License
23
 * along with FFmpeg; if not, write to the Free Software
24
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25
 *
26
 * The C code (not assembly, MMX, ...) of this file can be used
27
 * under the LGPL license.
28
 */
29

    
30
#include <stddef.h>
31

    
32
/*
 * Per-instantiation CPU-capability macros.  This template is #included
 * several times with different HAVE_* settings, so every macro is #undef'd
 * first and then redefined for the current target (plain MMX, MMX2,
 * 3DNow!, SSE2).  The string macros are spliced into inline-asm templates
 * below; " # nop" turns an instruction slot into an assembler comment.
 */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

/* SIMD register width in bytes: 16 for SSE2 (XMM), 8 for MMX (MM). */
#if HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/* Prefetch and packed-average mnemonics for the selected ISA extension. */
#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Non-temporal stores (and the sfence needed after them) require MMX2;
 * otherwise fall back to ordinary movq stores. */
#if HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
73

    
74
/*
 * Expand packed 24-bit pixels to 32-bit pixels.
 * src_size is in bytes.  The MMX path gathers eight 3-byte pixels per
 * iteration and ORs in mask32a to fill the fourth byte; the scalar tail
 * writes 255 into the extra byte (placed first on big-endian).
 */
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    #if HAVE_MMX
        const uint8_t *mm_end;
    #endif
    end = s + src_size;
    #if HAVE_MMX
        __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
        /* each iteration reads 24 source bytes, so stop 23 short of end */
        mm_end = end - 23;
        __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
        while (s < mm_end)
        {
            __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            /* pack pairs of 3-byte pixels into the low/high dwords of mm0..mm3 */
            "movd          %1, %%mm0    \n\t"
            "punpckldq    3%1, %%mm0    \n\t"
            "movd         6%1, %%mm1    \n\t"
            "punpckldq    9%1, %%mm1    \n\t"
            "movd        12%1, %%mm2    \n\t"
            "punpckldq   15%1, %%mm2    \n\t"
            "movd        18%1, %%mm3    \n\t"
            "punpckldq   21%1, %%mm3    \n\t"
            /* force the alpha/filler byte of every dword via mask32a */
            "por        %%mm7, %%mm0    \n\t"
            "por        %%mm7, %%mm1    \n\t"
            "por        %%mm7, %%mm2    \n\t"
            "por        %%mm7, %%mm3    \n\t"
            MOVNTQ"     %%mm0,   %0     \n\t"
            MOVNTQ"     %%mm1,  8%0     \n\t"
            MOVNTQ"     %%mm2, 16%0     \n\t"
            MOVNTQ"     %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
            dest += 32;
            s += 24;
        }
        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");
    #endif
    /* scalar tail (and whole conversion when !HAVE_MMX) */
    while (s < end)
    {
    #ifdef WORDS_BIGENDIAN
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
        *dest++ = 255;
        *dest++ = s[2];
        *dest++ = s[1];
        *dest++ = s[0];
        s+=3;
    #else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
    #endif
    }
}
133

    
134
/*
 * Drop the fourth byte of each 32-bit pixel, producing packed 24-bit
 * pixels.  src_size is in bytes.  The MMX path compresses 32 source
 * bytes (8 pixels) into 24 destination bytes per iteration using the
 * mask24* constants; the scalar tail copies 3 bytes and skips one.
 */
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* each iteration reads 32 source bytes, so stop 31 short of end */
    mm_end = end - 31;
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq         8%1, %%mm1    \n\t"
        "movq        16%1, %%mm4    \n\t"
        "movq        24%1, %%mm5    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm1, %%mm3    \n\t"
        "movq       %%mm4, %%mm6    \n\t"
        "movq       %%mm5, %%mm7    \n\t"
        /* shift a copy right by 8 so the high pixel's bytes close the
         * gap left by its discarded 4th byte, then merge with mask24l/h */
        "psrlq         $8, %%mm2    \n\t"
        "psrlq         $8, %%mm3    \n\t"
        "psrlq         $8, %%mm6    \n\t"
        "psrlq         $8, %%mm7    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm1    \n\t"
        "pand          %2, %%mm4    \n\t"
        "pand          %2, %%mm5    \n\t"
        "pand          %3, %%mm2    \n\t"
        "pand          %3, %%mm3    \n\t"
        "pand          %3, %%mm6    \n\t"
        "pand          %3, %%mm7    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "por        %%mm6, %%mm4    \n\t"
        "por        %%mm7, %%mm5    \n\t"

        /* redistribute the four 6-byte pixel pairs across three qwords */
        "movq       %%mm1, %%mm2    \n\t"
        "movq       %%mm4, %%mm3    \n\t"
        "psllq        $48, %%mm2    \n\t"
        "psllq        $32, %%mm3    \n\t"
        "pand          %4, %%mm2    \n\t"
        "pand          %5, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psrlq        $16, %%mm1    \n\t"
        "psrlq        $32, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "pand          %6, %%mm5    \n\t"
        "por        %%mm5, %%mm4    \n\t"

        MOVNTQ"     %%mm0,   %0     \n\t"
        MOVNTQ"     %%mm1,  8%0     \n\t"
        MOVNTQ"     %%mm4, 16%0"
        :"=m"(*dest)
        :"m"(*s),"m"(mask24l),
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail (and whole conversion when !HAVE_MMX) */
    while (s < end)
    {
#ifdef WORDS_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
        s++;
        dest[2] = *s++;
        dest[1] = *s++;
        dest[0] = *s++;
        dest += 3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
#endif
    }
}
219

    
220
/*
221
 original by Strepto/Astral
222
 ported to gcc & bugfixed: A'rpi
223
 MMX2, 3DNOW optimization by Nick Kurshev
224
 32-bit C version, and and&add trick by Michael Niedermayer
225
*/
226
/*
 * Convert 15-bit (x1r5g5b5) pixels to 16-bit (r5g6b5) pixels in place
 * semantics: blue stays in bits 0-4, red and green move up one bit and
 * green gains a zero LSB.  Uses the and&add trick: x + (x & 0x7FE0)
 * doubles (i.e. left-shifts) the red/green field while leaving blue.
 * src_size is in bytes.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    /* mm4 = mask15s: selects the red/green bits to be added onto x */
    __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end)
    {
        __asm__ volatile(
        PREFETCH"  32%1         \n\t"
        "movq        %1, %%mm0  \n\t"
        "movq       8%1, %%mm2  \n\t"
        "movq     %%mm0, %%mm1  \n\t"
        "movq     %%mm2, %%mm3  \n\t"
        "pand     %%mm4, %%mm0  \n\t"
        "pand     %%mm4, %%mm2  \n\t"
        /* x + (x & mask15s): shifts R/G up one bit per 16-bit lane */
        "paddw    %%mm1, %%mm0  \n\t"
        "paddw    %%mm3, %%mm2  \n\t"
        MOVNTQ"   %%mm0,  %0    \n\t"
        MOVNTQ"   %%mm2, 8%0"
        :"=m"(*d)
        :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* 32-bit C version: two pixels at a time with the same and&add trick */
    mm_end = end - 3;
    while (s < mm_end)
    {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* at most one 16-bit pixel can remain */
    if (s < end)
    {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
274

    
275
/*
 * Convert 16-bit (r5g6b5) pixels to 15-bit (x1r5g5b5) pixels: red and
 * green are shifted down one bit (dropping green's LSB), blue is kept.
 * src_size is in bytes.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    /* mm7 = mask15rg (red/green after >>1), mm6 = mask15b (blue) */
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end)
    {
        __asm__ volatile(
        PREFETCH"  32%1         \n\t"
        "movq        %1, %%mm0  \n\t"
        "movq       8%1, %%mm2  \n\t"
        "movq     %%mm0, %%mm1  \n\t"
        "movq     %%mm2, %%mm3  \n\t"
        /* ((x>>1) & mask15rg) | (x & mask15b) per 16-bit lane */
        "psrlq       $1, %%mm0  \n\t"
        "psrlq       $1, %%mm2  \n\t"
        "pand     %%mm7, %%mm0  \n\t"
        "pand     %%mm7, %%mm2  \n\t"
        "pand     %%mm6, %%mm1  \n\t"
        "pand     %%mm6, %%mm3  \n\t"
        "por      %%mm1, %%mm0  \n\t"
        "por      %%mm3, %%mm2  \n\t"
        MOVNTQ"   %%mm0,  %0    \n\t"
        MOVNTQ"   %%mm2, 8%0"
        :"=m"(*d)
        :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* 32-bit C version: two pixels per iteration */
    mm_end = end - 3;
    while (s < mm_end)
    {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* at most one 16-bit pixel can remain */
    if (s < end)
    {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
        s+=2;
        d+=2;
    }
}
330

    
331
/*
 * Convert 32-bit pixels to 16-bit r5g6b5.  src_size is in bytes; the
 * destination is written as uint16_t.  Two MMX variants exist: the
 * enabled one folds red+green into place with one pmaddwd (multiply-add
 * against mul3216), the disabled one uses three shift+mask passes.
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
    /* mm5 = mask3216g, mm6 = mask3216br, mm7 = mul3216 */
    "movq           %3, %%mm5   \n\t"
    "movq           %4, %%mm6   \n\t"
    "movq           %5, %%mm7   \n\t"
    "jmp 2f                     \n\t"
    ASMALIGN(4)
    "1:                         \n\t"
    PREFETCH"   32(%1)          \n\t"
    /* four 32-bit pixels -> two qwords, pixels split across dwords */
    "movd         (%1), %%mm0   \n\t"
    "movd        4(%1), %%mm3   \n\t"
    "punpckldq   8(%1), %%mm0   \n\t"
    "punpckldq  12(%1), %%mm3   \n\t"
    "movq        %%mm0, %%mm1   \n\t"
    "movq        %%mm3, %%mm4   \n\t"
    /* blue+red via multiply-add, green via mask; recombine and shift
     * into 5:6:5 position */
    "pand        %%mm6, %%mm0   \n\t"
    "pand        %%mm6, %%mm3   \n\t"
    "pmaddwd     %%mm7, %%mm0   \n\t"
    "pmaddwd     %%mm7, %%mm3   \n\t"
    "pand        %%mm5, %%mm1   \n\t"
    "pand        %%mm5, %%mm4   \n\t"
    "por         %%mm1, %%mm0   \n\t"
    "por         %%mm4, %%mm3   \n\t"
    "psrld          $5, %%mm0   \n\t"
    "pslld         $11, %%mm3   \n\t"
    "por         %%mm3, %%mm0   \n\t"
    MOVNTQ"      %%mm0, (%0)    \n\t"
    "add           $16,  %1     \n\t"
    "add            $8,  %0     \n\t"
    "2:                         \n\t"
    "cmp            %2,  %1     \n\t"
    " jb            1b          \n\t"
    : "+r" (d), "+r"(s)
    : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq    %0, %%mm7    \n\t"
        "movq    %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movd          %1, %%mm0    \n\t"
        "movd         4%1, %%mm3    \n\t"
        "punpckldq    8%1, %%mm0    \n\t"
        "punpckldq   12%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        /* blue >>3, green >>5, red >>8, each masked into 5:6:5 slots */
        "psrlq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm3    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm3    \n\t"
        "psrlq         $5, %%mm1    \n\t"
        "psrlq         $5, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        "psrlq         $8, %%mm5    \n\t"
        "pand       %%mm7, %%mm2    \n\t"
        "pand       %%mm7, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: 8:8:8 -> 5:6:5 by truncating low bits */
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
428

    
429
/*
 * Convert 32-bit pixels to 16-bit 5:6:5 with the red/blue channels
 * swapped relative to rgb32to16 (compare the scalar tails: here the
 * low source byte is shifted up to bits 11-15).  src_size is in bytes.
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        /* four 32-bit pixels per iteration */
        "movd          %1, %%mm0    \n\t"
        "movd         4%1, %%mm3    \n\t"
        "punpckldq    8%1, %%mm0    \n\t"
        "punpckldq   12%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        /* channel 0 <<8 into the red slot, channel 2 >>19 into blue:
         * the r/b swap versus rgb32to16 */
        "psllq         $8, %%mm0    \n\t"
        "psllq         $8, %%mm3    \n\t"
        "pand       %%mm7, %%mm0    \n\t"
        "pand       %%mm7, %%mm3    \n\t"
        "psrlq         $5, %%mm1    \n\t"
        "psrlq         $5, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq        $19, %%mm2    \n\t"
        "psrlq        $19, %%mm5    \n\t"
        "pand          %2, %%mm2    \n\t"
        "pand          %2, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: note the swapped shift directions vs rgb32to16 */
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}
489

    
490
/*
 * Convert 32-bit pixels to 15-bit x1r5g5b5.  Same structure as
 * rgb32to16 but with 5-bit green (shifts 6/9 instead of 5/8, masks
 * mask3215g/mul3215 in the pmaddwd variant).  src_size is in bytes.
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
    /* mm5 = mask3215g, mm6 = mask3216br, mm7 = mul3215 */
    "movq           %3, %%mm5   \n\t"
    "movq           %4, %%mm6   \n\t"
    "movq           %5, %%mm7   \n\t"
    "jmp            2f          \n\t"
    ASMALIGN(4)
    "1:                         \n\t"
    PREFETCH"   32(%1)          \n\t"
    "movd         (%1), %%mm0   \n\t"
    "movd        4(%1), %%mm3   \n\t"
    "punpckldq   8(%1), %%mm0   \n\t"
    "punpckldq  12(%1), %%mm3   \n\t"
    "movq        %%mm0, %%mm1   \n\t"
    "movq        %%mm3, %%mm4   \n\t"
    "pand        %%mm6, %%mm0   \n\t"
    "pand        %%mm6, %%mm3   \n\t"
    "pmaddwd     %%mm7, %%mm0   \n\t"
    "pmaddwd     %%mm7, %%mm3   \n\t"
    "pand        %%mm5, %%mm1   \n\t"
    "pand        %%mm5, %%mm4   \n\t"
    "por         %%mm1, %%mm0   \n\t"
    "por         %%mm4, %%mm3   \n\t"
    /* final shifts differ from rgb32to16: 6/10 for the 5:5:5 layout */
    "psrld          $6, %%mm0   \n\t"
    "pslld         $10, %%mm3   \n\t"
    "por         %%mm3, %%mm0   \n\t"
    MOVNTQ"      %%mm0, (%0)    \n\t"
    "add           $16,  %1     \n\t"
    "add            $8,  %0     \n\t"
    "2:                         \n\t"
    "cmp            %2,  %1     \n\t"
    " jb            1b          \n\t"
    : "+r" (d), "+r"(s)
    : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movd          %1, %%mm0    \n\t"
        "movd         4%1, %%mm3    \n\t"
        "punpckldq    8%1, %%mm0    \n\t"
        "punpckldq   12%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        /* blue >>3, green >>6, red >>9, masked into 5:5:5 slots */
        "psrlq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm3    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm3    \n\t"
        "psrlq         $6, %%mm1    \n\t"
        "psrlq         $6, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq         $9, %%mm2    \n\t"
        "psrlq         $9, %%mm5    \n\t"
        "pand       %%mm7, %%mm2    \n\t"
        "pand       %%mm7, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: 8:8:8 -> 5:5:5 by truncating low bits */
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}
587

    
588
/*
 * Convert 32-bit pixels to 15-bit 5:5:5 with red/blue swapped relative
 * to rgb32to15 (scalar tail: low byte <<7 into bits 10-14, high channel
 * >>19 into bits 0-4).  src_size is in bytes.
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        /* four 32-bit pixels per iteration */
        "movd          %1, %%mm0    \n\t"
        "movd         4%1, %%mm3    \n\t"
        "punpckldq    8%1, %%mm0    \n\t"
        "punpckldq   12%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        /* channel 0 <<7 into the red slot, channel 2 >>19 into blue */
        "psllq         $7, %%mm0    \n\t"
        "psllq         $7, %%mm3    \n\t"
        "pand       %%mm7, %%mm0    \n\t"
        "pand       %%mm7, %%mm3    \n\t"
        "psrlq         $6, %%mm1    \n\t"
        "psrlq         $6, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq        $19, %%mm2    \n\t"
        "psrlq        $19, %%mm5    \n\t"
        "pand          %2, %%mm2    \n\t"
        "pand          %2, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: swapped shift directions vs rgb32to15 */
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}
648

    
649
/*
 * Convert packed 24-bit pixels to 16-bit 5:6:5, treating the first
 * source byte as blue (see the scalar tail).  src_size is in bytes.
 * The MMX loop gathers four 3-byte pixels (12 bytes) per iteration.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /* each iteration reads 12 source bytes, so stop 11 short of end */
    mm_end = end - 11;
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        /* unaligned 3-byte steps pack four pixels into mm0/mm3 */
        "movd          %1, %%mm0    \n\t"
        "movd         3%1, %%mm3    \n\t"
        "punpckldq    6%1, %%mm0    \n\t"
        "punpckldq    9%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        /* byte0 >>3, byte1 >>5, byte2 >>8, masked into 5:6:5 slots */
        "psrlq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm3    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm3    \n\t"
        "psrlq         $5, %%mm1    \n\t"
        "psrlq         $5, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        "psrlq         $8, %%mm5    \n\t"
        "pand       %%mm7, %%mm2    \n\t"
        "pand       %%mm7, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: source order b,g,r */
    while (s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
711

    
712
/*
 * Convert packed 24-bit pixels to 16-bit 5:6:5, treating the first
 * source byte as red (scalar tail reads r,g,b; the MMX path shifts
 * byte0 up to bits 11-15 via <<8 and byte2 down via >>19 — the
 * channel-swap counterpart of rgb24tobgr16).  src_size is in bytes.
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        /* unaligned 3-byte steps pack four pixels into mm0/mm3 */
        "movd          %1, %%mm0    \n\t"
        "movd         3%1, %%mm3    \n\t"
        "punpckldq    6%1, %%mm0    \n\t"
        "punpckldq    9%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        /* byte0 <<8 into red, byte1 >>5 into green, byte2 >>19 into blue */
        "psllq         $8, %%mm0    \n\t"
        "psllq         $8, %%mm3    \n\t"
        "pand       %%mm7, %%mm0    \n\t"
        "pand       %%mm7, %%mm3    \n\t"
        "psrlq         $5, %%mm1    \n\t"
        "psrlq         $5, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq        $19, %%mm2    \n\t"
        "psrlq        $19, %%mm5    \n\t"
        "pand          %2, %%mm2    \n\t"
        "pand          %2, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: source order r,g,b */
    while (s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
774

    
775
/*
 * Convert packed 24-bit pixels to 15-bit 5:5:5, treating the first
 * source byte as blue (scalar tail reads b,g,r).  Shifts 3/6/9 place
 * the channels in the x1r5g5b5 layout.  src_size is in bytes.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /* each iteration reads 12 source bytes, so stop 11 short of end */
    mm_end = end - 11;
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        /* unaligned 3-byte steps pack four pixels into mm0/mm3 */
        "movd          %1, %%mm0    \n\t"
        "movd         3%1, %%mm3    \n\t"
        "punpckldq    6%1, %%mm0    \n\t"
        "punpckldq    9%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        /* byte0 >>3, byte1 >>6, byte2 >>9, masked into 5:5:5 slots */
        "psrlq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm3    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm3    \n\t"
        "psrlq         $6, %%mm1    \n\t"
        "psrlq         $6, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq         $9, %%mm2    \n\t"
        "psrlq         $9, %%mm5    \n\t"
        "pand       %%mm7, %%mm2    \n\t"
        "pand       %%mm7, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: source order b,g,r */
    while (s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
837

    
838
/*
 * Convert packed 24-bit pixels to 15-bit 5:5:5, treating the first
 * source byte as red (scalar tail reads r,g,b; the MMX path places
 * byte0 via <<7 and byte2 via >>19 — the channel-swap counterpart of
 * rgb24tobgr15).  src_size is in bytes.
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"   32%1            \n\t"
        /* unaligned 3-byte steps pack four pixels into mm0/mm3 */
        "movd         %1, %%mm0     \n\t"
        "movd        3%1, %%mm3     \n\t"
        "punpckldq   6%1, %%mm0     \n\t"
        "punpckldq   9%1, %%mm3     \n\t"
        "movq      %%mm0, %%mm1     \n\t"
        "movq      %%mm0, %%mm2     \n\t"
        "movq      %%mm3, %%mm4     \n\t"
        "movq      %%mm3, %%mm5     \n\t"
        /* byte0 <<7 into red, byte1 >>6 into green, byte2 >>19 into blue */
        "psllq        $7, %%mm0     \n\t"
        "psllq        $7, %%mm3     \n\t"
        "pand      %%mm7, %%mm0     \n\t"
        "pand      %%mm7, %%mm3     \n\t"
        "psrlq        $6, %%mm1     \n\t"
        "psrlq        $6, %%mm4     \n\t"
        "pand      %%mm6, %%mm1     \n\t"
        "pand      %%mm6, %%mm4     \n\t"
        "psrlq       $19, %%mm2     \n\t"
        "psrlq       $19, %%mm5     \n\t"
        "pand         %2, %%mm2     \n\t"
        "pand         %2, %%mm5     \n\t"
        "por       %%mm1, %%mm0     \n\t"
        "por       %%mm4, %%mm3     \n\t"
        "por       %%mm2, %%mm0     \n\t"
        "por       %%mm5, %%mm3     \n\t"
        "psllq       $16, %%mm3     \n\t"
        "por       %%mm3, %%mm0     \n\t"
        MOVNTQ"    %%mm0, %0        \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: source order r,g,b */
    while (s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
900

    
901
/*
  I use less accurate approximation here by simply left-shifting the input
  value and filling the low order bits with zeroes. This method improves PNG
  compression but this scheme cannot reproduce white exactly, since it does
  not generate an all-ones maximum value; the net effect is to darken the
  image slightly.

  The better method should be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      leftmost bits repeated to fill open bits
       |
   original bits
*/
/**
 * Convert RGB15 (1:5:5:5, 16 bits per pixel) to BGR24 (3 bytes per pixel).
 * src_size is the source size in bytes; every 16-bit input pixel produces
 * 3 output bytes. The MMX path processes 8 pixels per iteration; the scalar
 * loop at the bottom handles the remainder (and is the whole implementation
 * when HAVE_MMX is 0).
 */
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* stop the MMX loop while at least 8 pixels (16 bytes) remain */
    mm_end = end - 7;
    while (s < mm_end)
    {
        /* First asm: expand 8 RGB15 pixels to byte-per-component form.
           Results are carried to the next asm statement in mm0/mm3
           (pixels 4-7) and mm6/mm7 (pixels 0-3) — the two asm blocks
           form one unit and must not be reordered. */
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq          %1, %%mm1    \n\t"
        "movq          %1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $2, %%mm1    \n\t"
        "psrlq         $7, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd     %5, %%mm0    \n\t"
        "punpcklwd     %5, %%mm1    \n\t"
        "punpcklwd     %5, %%mm2    \n\t"
        "punpckhwd     %5, %%mm3    \n\t"
        "punpckhwd     %5, %%mm4    \n\t"
        "punpckhwd     %5, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"

        "movq       %%mm0, %%mm6    \n\t"
        "movq       %%mm3, %%mm7    \n\t"

        "movq         8%1, %%mm0    \n\t"
        "movq         8%1, %%mm1    \n\t"
        "movq         8%1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $2, %%mm1    \n\t"
        "psrlq         $7, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd     %5, %%mm0    \n\t"
        "punpcklwd     %5, %%mm1    \n\t"
        "punpcklwd     %5, %%mm2    \n\t"
        "punpckhwd     %5, %%mm3    \n\t"
        "punpckhwd     %5, %%mm4    \n\t"
        "punpckhwd     %5, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"

        :"=m"(*d)
        :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
        :"memory");
        /* borrowed 32 to 24 */
        /* Second asm: pack the four BGR0 quadwords (mm0/mm3/mm6/mm7 from
           above) down to 24 bytes of packed BGR and stream them out. */
        __asm__ volatile(
        "movq       %%mm0, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "movq       %%mm6, %%mm0    \n\t"
        "movq       %%mm7, %%mm1    \n\t"

        "movq       %%mm4, %%mm6    \n\t"
        "movq       %%mm5, %%mm7    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm1, %%mm3    \n\t"

        "psrlq         $8, %%mm2    \n\t"
        "psrlq         $8, %%mm3    \n\t"
        "psrlq         $8, %%mm6    \n\t"
        "psrlq         $8, %%mm7    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm1    \n\t"
        "pand          %2, %%mm4    \n\t"
        "pand          %2, %%mm5    \n\t"
        "pand          %3, %%mm2    \n\t"
        "pand          %3, %%mm3    \n\t"
        "pand          %3, %%mm6    \n\t"
        "pand          %3, %%mm7    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "por        %%mm6, %%mm4    \n\t"
        "por        %%mm7, %%mm5    \n\t"

        "movq       %%mm1, %%mm2    \n\t"
        "movq       %%mm4, %%mm3    \n\t"
        "psllq        $48, %%mm2    \n\t"
        "psllq        $32, %%mm3    \n\t"
        "pand          %4, %%mm2    \n\t"
        "pand          %5, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psrlq        $16, %%mm1    \n\t"
        "psrlq        $32, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "pand          %6, %%mm5    \n\t"
        "por        %%mm5, %%mm4    \n\t"

        MOVNTQ"     %%mm0,   %0     \n\t"
        MOVNTQ"     %%mm1,  8%0     \n\t"
        MOVNTQ"     %%mm4, 16%0"

        :"=m"(*d)
        :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: decode 5-bit fields and left-shift into 8-bit bytes */
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
1063

    
1064
/**
 * Convert RGB16 (5:6:5, 16 bits per pixel) to BGR24 (3 bytes per pixel).
 * Identical structure to rgb15tobgr24 above, but with the 5:6:5 masks and
 * shifts ($3/$8 for green/red instead of $2/$7). src_size is in bytes.
 * The MMX path handles 8 pixels per iteration; the scalar loop finishes
 * the remainder (and is the whole implementation without MMX).
 */
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* stop the MMX loop while at least 8 pixels (16 bytes) remain */
    mm_end = end - 7;
    while (s < mm_end)
    {
        /* First asm: expand 8 RGB16 pixels; intermediate BGR0 quadwords are
           carried into the next asm statement in mm0/mm3/mm6/mm7, so the
           two asm blocks form one inseparable unit. */
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq          %1, %%mm1    \n\t"
        "movq          %1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm1    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd     %5, %%mm0    \n\t"
        "punpcklwd     %5, %%mm1    \n\t"
        "punpcklwd     %5, %%mm2    \n\t"
        "punpckhwd     %5, %%mm3    \n\t"
        "punpckhwd     %5, %%mm4    \n\t"
        "punpckhwd     %5, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"

        "movq       %%mm0, %%mm6    \n\t"
        "movq       %%mm3, %%mm7    \n\t"

        "movq         8%1, %%mm0    \n\t"
        "movq         8%1, %%mm1    \n\t"
        "movq         8%1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm1    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd     %5, %%mm0    \n\t"
        "punpcklwd     %5, %%mm1    \n\t"
        "punpcklwd     %5, %%mm2    \n\t"
        "punpckhwd     %5, %%mm3    \n\t"
        "punpckhwd     %5, %%mm4    \n\t"
        "punpckhwd     %5, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        :"=m"(*d)
        :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
        :"memory");
        /* borrowed 32 to 24 */
        /* Second asm: pack the four BGR0 quadwords down to 24 packed BGR
           bytes and stream them out. */
        __asm__ volatile(
        "movq       %%mm0, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "movq       %%mm6, %%mm0    \n\t"
        "movq       %%mm7, %%mm1    \n\t"

        "movq       %%mm4, %%mm6    \n\t"
        "movq       %%mm5, %%mm7    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm1, %%mm3    \n\t"

        "psrlq         $8, %%mm2    \n\t"
        "psrlq         $8, %%mm3    \n\t"
        "psrlq         $8, %%mm6    \n\t"
        "psrlq         $8, %%mm7    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm1    \n\t"
        "pand          %2, %%mm4    \n\t"
        "pand          %2, %%mm5    \n\t"
        "pand          %3, %%mm2    \n\t"
        "pand          %3, %%mm3    \n\t"
        "pand          %3, %%mm6    \n\t"
        "pand          %3, %%mm7    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "por        %%mm6, %%mm4    \n\t"
        "por        %%mm7, %%mm5    \n\t"

        "movq       %%mm1, %%mm2    \n\t"
        "movq       %%mm4, %%mm3    \n\t"
        "psllq        $48, %%mm2    \n\t"
        "psllq        $32, %%mm3    \n\t"
        "pand          %4, %%mm2    \n\t"
        "pand          %5, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psrlq        $16, %%mm1    \n\t"
        "psrlq        $32, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "pand          %6, %%mm5    \n\t"
        "por        %%mm5, %%mm4    \n\t"

        MOVNTQ"     %%mm0,   %0     \n\t"
        MOVNTQ"     %%mm1,  8%0     \n\t"
        MOVNTQ"     %%mm4, 16%0"

        :"=m"(*d)
        :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: decode 5/6/5-bit fields and left-shift into bytes */
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
1204

    
1205
/*
 * Pack four expanded pixels (one byte per component, held in word lanes)
 * into four 32-bit BGRA pixels with an opaque (0xFF) alpha byte, and
 * stream the 16 result bytes to %0 / 8%0.
 *
 * Expected register contents on entry:
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb   %%mm7, %%mm0    \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb   %%mm7, %%mm1    \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb   %%mm7, %%mm2    \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw  %%mm1, %%mm0    \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw  %%mm6, %%mm2    \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq       %%mm0, %%mm3    \n\t"                               \
    "punpcklwd  %%mm2, %%mm0    \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd  %%mm2, %%mm3    \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ"     %%mm0,  %0      \n\t"                               \
    MOVNTQ"     %%mm3, 8%0      \n\t"                               \
1223

    
1224
/**
 * Convert RGB15 (1:5:5:5) to 32-bit BGRA with opaque alpha (255).
 * src_size is the source size in bytes; each 16-bit input pixel produces
 * 4 output bytes. The MMX path converts 4 pixels per iteration via
 * PACK_RGB32; the scalar loop handles the remainder and honours byte
 * order through the WORDS_BIGENDIAN branch.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* mm7 = zero, mm6 = all-ones: fixed inputs PACK_RGB32 relies on */
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
    /* stop while at least 4 pixels (8 bytes) remain */
    mm_end = end - 3;
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq          %1, %%mm1    \n\t"
        "movq          %1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $2, %%mm1    \n\t"
        "psrlq         $7, %%mm2    \n\t"
        PACK_RGB32
        :"=m"(*d)
        :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
        :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
#if 0 //slightly slower on Athlon
        int bgr= *s++;
        *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
#else
        register uint16_t bgr;
        bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
#endif

#endif
    }
}
1284

    
1285
/**
 * Convert RGB16 (5:6:5) to 32-bit BGRA with opaque alpha (255).
 * Same structure as rgb15to32, but with 5:6:5 masks and shifts.
 * src_size is in bytes; 4 pixels per MMX iteration via PACK_RGB32,
 * scalar remainder loop below.
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* mm7 = zero, mm6 = all-ones: fixed inputs PACK_RGB32 relies on */
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
    /* stop while at least 4 pixels (8 bytes) remain */
    mm_end = end - 3;
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq          %1, %%mm1    \n\t"
        "movq          %1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm1    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        PACK_RGB32
        :"=m"(*d)
        :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
        :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
#endif
    }
}
1339

    
1340
/**
 * Swap R and B in 32-bit pixels (RGBA <-> BGRA); the byte between them and
 * the alpha byte are preserved. src_size is in bytes.
 *
 * idx starts at 15 - src_size (negative for src_size > 15) and is used as a
 * negative offset from the shifted base pointers, so the MMX loop can test
 * "js 1b" and the scalar tail simply continues idx up to 15 — together they
 * cover all pixels with 16-byte MMX chunks plus a <16-byte scalar remainder.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
#if HAVE_MMX
    __asm__ volatile(
    "test          %0, %0           \n\t"
    "jns           2f               \n\t"
    PREFETCH"       (%1, %0)        \n\t"
    "movq          %3, %%mm7        \n\t"
    "pxor          %4, %%mm7        \n\t"
    "movq       %%mm7, %%mm6        \n\t"
    "pxor          %5, %%mm7        \n\t"
    ASMALIGN(4)
    "1:                             \n\t"
    PREFETCH"     32(%1, %0)        \n\t"
    "movq           (%1, %0), %%mm0 \n\t"
    "movq          8(%1, %0), %%mm1 \n\t"
# if HAVE_MMX2
    /* pshufw $177 swaps the two bytes-of-interest word-wise in one op */
    "pshufw      $177, %%mm0, %%mm3 \n\t"
    "pshufw      $177, %%mm1, %%mm5 \n\t"
    "pand       %%mm7, %%mm0        \n\t"
    "pand       %%mm6, %%mm3        \n\t"
    "pand       %%mm7, %%mm1        \n\t"
    "pand       %%mm6, %%mm5        \n\t"
    "por        %%mm3, %%mm0        \n\t"
    "por        %%mm5, %%mm1        \n\t"
# else
    /* plain MMX: isolate R and B planes and exchange them via shifts */
    "movq       %%mm0, %%mm2        \n\t"
    "movq       %%mm1, %%mm4        \n\t"
    "pand       %%mm7, %%mm0        \n\t"
    "pand       %%mm6, %%mm2        \n\t"
    "pand       %%mm7, %%mm1        \n\t"
    "pand       %%mm6, %%mm4        \n\t"
    "movq       %%mm2, %%mm3        \n\t"
    "movq       %%mm4, %%mm5        \n\t"
    "pslld        $16, %%mm2        \n\t"
    "psrld        $16, %%mm3        \n\t"
    "pslld        $16, %%mm4        \n\t"
    "psrld        $16, %%mm5        \n\t"
    "por        %%mm2, %%mm0        \n\t"
    "por        %%mm4, %%mm1        \n\t"
    "por        %%mm3, %%mm0        \n\t"
    "por        %%mm5, %%mm1        \n\t"
# endif
    MOVNTQ"     %%mm0,  (%2, %0)    \n\t"
    MOVNTQ"     %%mm1, 8(%2, %0)    \n\t"
    "add          $16, %0           \n\t"
    "js            1b               \n\t"
    SFENCE"                         \n\t"
    EMMS"                           \n\t"
    "2:                             \n\t"
    : "+&r"(idx)
    : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
    : "memory");
#endif
    /* scalar remainder: swap R<->B via 16-bit rotate of the masked pair */
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}
1403

    
1404
/**
 * Swap R and B in packed 24-bit pixels (RGB24 <-> BGR24). src_size is in
 * bytes and is expected to be a multiple of 3.
 *
 * The MMX loop processes 24 bytes (8 pixels) per iteration, counting the
 * negative index mmx_size = 23 - src_size up towards zero; the scalar loop
 * afterwards handles the final partial chunk (or everything without MMX).
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    unsigned i;
#if HAVE_MMX
    x86_reg mmx_size= 23 - src_size;
    __asm__ volatile (
    "test             %%"REG_a", %%"REG_a"          \n\t"
    "jns                     2f                     \n\t"
    "movq     "MANGLE(mask24r)", %%mm5              \n\t"
    "movq     "MANGLE(mask24g)", %%mm6              \n\t"
    "movq     "MANGLE(mask24b)", %%mm7              \n\t"
    ASMALIGN(4)
    "1:                                             \n\t"
    PREFETCH" 32(%1, %%"REG_a")                     \n\t"
    "movq       (%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
    "movq       (%1, %%"REG_a"), %%mm1              \n\t" // BGR BGR BG
    "movq      2(%1, %%"REG_a"), %%mm2              \n\t" // R BGR BGR B
    "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
    "pand                 %%mm5, %%mm0              \n\t"
    "pand                 %%mm6, %%mm1              \n\t"
    "pand                 %%mm7, %%mm2              \n\t"
    "por                  %%mm0, %%mm1              \n\t"
    "por                  %%mm2, %%mm1              \n\t"
    "movq      6(%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
    MOVNTQ"               %%mm1,   (%2, %%"REG_a")  \n\t" // RGB RGB RG
    "movq      8(%1, %%"REG_a"), %%mm1              \n\t" // R BGR BGR B
    "movq     10(%1, %%"REG_a"), %%mm2              \n\t" // GR BGR BGR
    "pand                 %%mm7, %%mm0              \n\t"
    "pand                 %%mm5, %%mm1              \n\t"
    "pand                 %%mm6, %%mm2              \n\t"
    "por                  %%mm0, %%mm1              \n\t"
    "por                  %%mm2, %%mm1              \n\t"
    "movq     14(%1, %%"REG_a"), %%mm0              \n\t" // R BGR BGR B
    MOVNTQ"               %%mm1,  8(%2, %%"REG_a")  \n\t" // B RGB RGB R
    "movq     16(%1, %%"REG_a"), %%mm1              \n\t" // GR BGR BGR
    "movq     18(%1, %%"REG_a"), %%mm2              \n\t" // BGR BGR BG
    "pand                 %%mm6, %%mm0              \n\t"
    "pand                 %%mm7, %%mm1              \n\t"
    "pand                 %%mm5, %%mm2              \n\t"
    "por                  %%mm0, %%mm1              \n\t"
    "por                  %%mm2, %%mm1              \n\t"
    MOVNTQ"               %%mm1, 16(%2, %%"REG_a")  \n\t"
    "add                    $24, %%"REG_a"          \n\t"
    " js                     1b                     \n\t"
    "2:                                             \n\t"
    : "+a" (mmx_size)
    : "r" (src-mmx_size), "r"(dst-mmx_size)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; //finished, was multiple of 8

    /* rewind src/dst over the bytes the MMX loop did not cover */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    /* scalar R<->B swap, 3 bytes per pixel; x buffers src[i+2] so the
       loop is safe even when src == dst (in-place conversion) */
    for (i=0; i<src_size; i+=3)
    {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
1473

    
1474
/**
 * Interleave planar YUV into packed YUY2 (Y0 U Y1 V byte order).
 *
 * @param ysrc/usrc/vsrc      source luma and chroma planes
 * @param dst                 packed output, 2 bytes per luma sample
 * @param width/height        luma dimensions; width is halved for chroma
 * @param lumStride/chromStride/dstStride  per-plane line strides in bytes
 * @param vertLumPerChroma    luma lines per chroma line (must be a power
 *                            of two: the advance test uses a bitmask)
 *
 * Paths: MMX (16 luma pixels per inner iteration), Alpha MVI (processes
 * two lines per outer iteration, hence the extra y++/advance inside that
 * branch), generic 64-bit, and a portable 32-bit fallback that handles
 * endianness explicitly.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
#if HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
        "xor                 %%"REG_a", %%"REG_a"   \n\t"
        ASMALIGN(4)
        "1:                                         \n\t"
        PREFETCH"    32(%1, %%"REG_a", 2)           \n\t"
        PREFETCH"    32(%2, %%"REG_a")              \n\t"
        PREFETCH"    32(%3, %%"REG_a")              \n\t"
        "movq          (%2, %%"REG_a"), %%mm0       \n\t" // U(0)
        "movq                    %%mm0, %%mm2       \n\t" // U(0)
        "movq          (%3, %%"REG_a"), %%mm1       \n\t" // V(0)
        "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
        "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)

        "movq        (%1, %%"REG_a",2), %%mm3       \n\t" // Y(0)
        "movq       8(%1, %%"REG_a",2), %%mm5       \n\t" // Y(8)
        "movq                    %%mm3, %%mm4       \n\t" // Y(0)
        "movq                    %%mm5, %%mm6       \n\t" // Y(8)
        "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
        "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
        "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
        "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)

        MOVNTQ"                  %%mm3,   (%0, %%"REG_a", 4)    \n\t"
        MOVNTQ"                  %%mm4,  8(%0, %%"REG_a", 4)    \n\t"
        MOVNTQ"                  %%mm5, 16(%0, %%"REG_a", 4)    \n\t"
        MOVNTQ"                  %%mm6, 24(%0, %%"REG_a", 4)    \n\t"

        "add                        $8, %%"REG_a"   \n\t"
        "cmp                        %4, %%"REG_a"   \n\t"
        " jb                        1b              \n\t"
        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
        : "%"REG_a
        );
#else

#if ARCH_ALPHA && HAVE_MVI
#define pl2yuy2(n)                  \
    y1 = yc[n];                     \
    y2 = yc2[n];                    \
    u = uc[n];                      \
    v = vc[n];                      \
    __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1));  \
    __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2));  \
    __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u));    \
    __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v));    \
    yuv1 = (u << 8) + (v << 24);                \
    yuv2 = yuv1 + y2;               \
    yuv1 += y1;                     \
    qdst[n]  = yuv1;                \
    qdst2[n] = yuv2;

        int i;
        uint64_t *qdst = (uint64_t *) dst;
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
        const uint32_t *yc = (uint32_t *) ysrc;
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
        for (i = 0; i < chromWidth; i += 8){
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
            __asm__("ldq $31,64(%0)" :: "r"(yc));
            __asm__("ldq $31,64(%0)" :: "r"(yc2));
            __asm__("ldq $31,64(%0)" :: "r"(uc));
            __asm__("ldq $31,64(%0)" :: "r"(vc));

            pl2yuy2(0);
            pl2yuy2(1);
            pl2yuy2(2);
            pl2yuy2(3);

            yc    += 4;
            yc2   += 4;
            uc    += 4;
            vc    += 4;
            qdst  += 4;
            qdst2 += 4;
        }
        /* this branch emitted two lines at once, so advance an extra line */
        y++;
        ysrc += lumStride;
        dst += dstStride;

#elif HAVE_FAST_64BIT
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                (yc[1] << 8) + (vc[0] << 0);
#else
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* advance chroma pointers only every vertLumPerChroma luma lines */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
#if HAVE_MMX
__asm__(    EMMS"       \n\t"
        SFENCE"     \n\t"
        :::"memory");
#endif
}
1613

    
1614
/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 *
 * Thin wrapper: YV12 has 4:2:0 chroma, so delegate with
 * vertLumPerChroma = 2 (one chroma line per two luma lines).
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1625

    
1626
/**
 * Interleave planar YUV into packed UYVY (U Y0 V Y1 byte order).
 *
 * Same structure and parameter contract as yuvPlanartoyuy2 above, with the
 * chroma byte leading each pair instead of the luma byte (note the swapped
 * punpckl/hbw operand roles in the MMX path). vertLumPerChroma must be a
 * power of two. No Alpha MVI path exists here (see FIXME).
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
#if HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
        "xor                %%"REG_a", %%"REG_a"    \n\t"
        ASMALIGN(4)
        "1:                                         \n\t"
        PREFETCH"   32(%1, %%"REG_a", 2)            \n\t"
        PREFETCH"   32(%2, %%"REG_a")               \n\t"
        PREFETCH"   32(%3, %%"REG_a")               \n\t"
        "movq         (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
        "movq                   %%mm0, %%mm2        \n\t" // U(0)
        "movq         (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
        "punpcklbw              %%mm1, %%mm0        \n\t" // UVUV UVUV(0)
        "punpckhbw              %%mm1, %%mm2        \n\t" // UVUV UVUV(8)

        "movq       (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
        "movq      8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
        "movq                   %%mm0, %%mm4        \n\t" // Y(0)
        "movq                   %%mm2, %%mm6        \n\t" // Y(8)
        "punpcklbw              %%mm3, %%mm0        \n\t" // YUYV YUYV(0)
        "punpckhbw              %%mm3, %%mm4        \n\t" // YUYV YUYV(4)
        "punpcklbw              %%mm5, %%mm2        \n\t" // YUYV YUYV(8)
        "punpckhbw              %%mm5, %%mm6        \n\t" // YUYV YUYV(12)

        MOVNTQ"                 %%mm0,   (%0, %%"REG_a", 4)     \n\t"
        MOVNTQ"                 %%mm4,  8(%0, %%"REG_a", 4)     \n\t"
        MOVNTQ"                 %%mm2, 16(%0, %%"REG_a", 4)     \n\t"
        MOVNTQ"                 %%mm6, 24(%0, %%"REG_a", 4)     \n\t"

        "add                       $8, %%"REG_a"    \n\t"
        "cmp                       %4, %%"REG_a"    \n\t"
        " jb                       1b               \n\t"
        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
        : "%"REG_a
        );
#else
//FIXME adapt the Alpha ASM code from yv12->yuy2

#if HAVE_FAST_64BIT
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
            l = uc[1] + (yc[2] << 8) +
                (vc[1] << 16) + (yc[3] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
                (vc[0] << 8) + (yc[1] << 0);
#else
            *idst++ = uc[0] + (yc[0] << 8) +
               (vc[0] << 16) + (yc[1] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* advance chroma pointers only every vertLumPerChroma luma lines */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#if HAVE_MMX
__asm__(    EMMS"       \n\t"
        SFENCE"     \n\t"
        :::"memory");
#endif
}
1719

    
1720
/**
 * Convert planar YV12 (4:2:0) to packed UYVY.
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (If this is a problem for anyone then tell me, and I will fix it.)
 *
 * Thin wrapper: delegates to yuvPlanartouyvy with vertLumPerChroma = 2,
 * i.e. each chroma line is reused for two luma lines (4:2:0 subsampling).
 * Chroma is duplicated, not interpolated (see FIXME below).
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1731

    
1732
/**
 * Convert planar YUV 4:2:2 to packed UYVY.
 * Width should be a multiple of 16.
 *
 * Thin wrapper: delegates to yuvPlanartouyvy with vertLumPerChroma = 1,
 * i.e. every luma line has its own chroma line (4:2:2 subsampling).
 */
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
1741

    
1742
/**
 * Convert planar YUV 4:2:2 to packed YUY2 (YUYV).
 * Width should be a multiple of 16.
 *
 * Thin wrapper: delegates to yuvPlanartoyuy2 with vertLumPerChroma = 1,
 * i.e. every luma line has its own chroma line (4:2:2 subsampling).
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
1751

    
1752
/**
 * Convert packed YUY2 (YUYV) to planar YV12.
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 *
 * Source lines are processed in pairs: the first line of each pair supplies
 * luma AND chroma, the second line supplies luma only (its chroma bytes are
 * discarded), giving 4:2:0 output without vertical chroma averaging.
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1; // one U/V sample per 2 pixels
    for (y=0; y<height; y+=2)
    {
#if HAVE_MMX
        // First line of the pair: split 16 YUYV pixels per iteration into
        // 16 Y bytes, 8 U bytes and 8 V bytes.
        __asm__ volatile(
        "xor                 %%"REG_a", %%"REG_a"   \n\t"
        "pcmpeqw                 %%mm7, %%mm7       \n\t"
        "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00... (low-byte mask)
        ASMALIGN(4)
        "1:                \n\t"
        PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
        "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
        "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
        "movq                    %%mm0, %%mm2       \n\t" // YUYV YUYV(0)
        "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(4)
        "psrlw                      $8, %%mm0       \n\t" // U0V0 U0V0(0)
        "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(4)
        "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
        "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
        "packuswb                %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
        "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(0)

        MOVNTQ"                  %%mm2, (%1, %%"REG_a", 2)  \n\t"

        "movq     16(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
        "movq     24(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
        "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(8)
        "movq                    %%mm2, %%mm4       \n\t" // YUYV YUYV(12)
        "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(8)
        "psrlw                      $8, %%mm2       \n\t" // U0V0 U0V0(12)
        "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(8)
        "pand                    %%mm7, %%mm4       \n\t" // Y0Y0 Y0Y0(12)
        "packuswb                %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
        "packuswb                %%mm4, %%mm3       \n\t" // YYYY YYYY(8)

        MOVNTQ"                  %%mm3, 8(%1, %%"REG_a", 2) \n\t"

        // Separate the interleaved UVUV words into planar U and V.
        "movq                    %%mm0, %%mm2       \n\t" // UVUV UVUV(0)
        "movq                    %%mm1, %%mm3       \n\t" // UVUV UVUV(8)
        "psrlw                      $8, %%mm0       \n\t" // V0V0 V0V0(0)
        "psrlw                      $8, %%mm1       \n\t" // V0V0 V0V0(8)
        "pand                    %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
        "pand                    %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
        "packuswb                %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
        "packuswb                %%mm3, %%mm2       \n\t" // UUUU UUUU(0)

        MOVNTQ"                  %%mm0, (%3, %%"REG_a")     \n\t"
        MOVNTQ"                  %%mm2, (%2, %%"REG_a")     \n\t"

        "add                        $8, %%"REG_a"   \n\t"
        "cmp                        %4, %%"REG_a"   \n\t"
        " jb                        1b              \n\t"
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
        : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        // Second line of the pair: extract luma only, chroma is dropped.
        // NOTE(review): this asm block uses %%mm7 as the 0x00FF mask without
        // re-initializing it — it relies on the value set by the previous asm
        // statement surviving in the register. Fragile; verify the compiler
        // never clobbers MMX state between the two statements.
        __asm__ volatile(
        "xor                 %%"REG_a", %%"REG_a"   \n\t"
        ASMALIGN(4)
        "1:                                         \n\t"
        PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
        "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
        "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
        "movq     16(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
        "movq     24(%0, %%"REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
        "pand                    %%mm7, %%mm0       \n\t" // Y0Y0 Y0Y0(0)
        "pand                    %%mm7, %%mm1       \n\t" // Y0Y0 Y0Y0(4)
        "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(8)
        "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(12)
        "packuswb                %%mm1, %%mm0       \n\t" // YYYY YYYY(0)
        "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(8)

        MOVNTQ"                  %%mm0,  (%1, %%"REG_a", 2) \n\t"
        MOVNTQ"                  %%mm2, 8(%1, %%"REG_a", 2) \n\t"

        "add                        $8, %%"REG_a"   \n\t"
        "cmp                        %4, %%"REG_a"   \n\t"
        " jb                        1b              \n\t"

        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
        : "memory", "%"REG_a
        );
#else
        // C fallback: first line — demux Y, U, V.
        long i;
        for (i=0; i<chromWidth; i++)
        {
            ydst[2*i+0]     = src[4*i+0];
            udst[i]     = src[4*i+1];
            ydst[2*i+1]     = src[4*i+2];
            vdst[i]     = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        // Second line — luma only, chroma bytes are skipped.
        for (i=0; i<chromWidth; i++)
        {
            ydst[2*i+0]     = src[4*i+0];
            ydst[2*i+1]     = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#if HAVE_MMX
// Leave MMX state clean and flush the non-temporal (MOVNTQ) stores.
__asm__ volatile(   EMMS"       \n\t"
                SFENCE"     \n\t"
                :::"memory");
#endif
}
1875

    
1876
/**
 * Partial YVU9 -> YV12 conversion: only the luma plane is copied; the U and
 * V destination planes are left untouched (chroma upscaling is unimplemented,
 * see the XXX below). usrc, vsrc, udst, vdst and chromStride are ignored.
 *
 * NOTE(review): the memcpy copies width*height bytes contiguously, which
 * appears to assume lumStride == width for both source and destination —
 * confirm against callers before relying on strided input here.
 */
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
                                      uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height, long lumStride, long chromStride)
{
    /* Y Plane */
    memcpy(ydst, ysrc, width*height);

    /* XXX: implement upscaling for U,V */
}
1885

    
1886
/**
 * Upscale one 8-bit plane to 2x width and 2x height using linear
 * interpolation with 3:1 / 1:3 weights (each output sample is
 * (3*near + far) >> 2 of its two nearest source samples).
 * Edge rows/columns are handled separately in scalar code; the interior is
 * done with MMX PAVGB when HAVE_MMX2 / HAVE_AMD3DNOW is set.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
    long x,y;

    // top-left corner: copied verbatim
    dst[0]= src[0];

    // first line
    for (x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

        dst+= dstStride;

    for (y=1; y<srcHeight; y++){
#if HAVE_MMX2 || HAVE_AMD3DNOW
        // Interior: process srcWidth rounded down to a multiple of 16.
        // The asm iterates with a negative index that counts up to 0 (js 1b).
        const x86_reg mmxSize= srcWidth&~15;
        __asm__ volatile(
        "mov           %4, %%"REG_a"            \n\t"
        "1:                                     \n\t"
        "movq         (%0, %%"REG_a"), %%mm0    \n\t"
        "movq         (%1, %%"REG_a"), %%mm1    \n\t"
        "movq        1(%0, %%"REG_a"), %%mm2    \n\t"
        "movq        1(%1, %%"REG_a"), %%mm3    \n\t"
        "movq       -1(%0, %%"REG_a"), %%mm4    \n\t"
        "movq       -1(%1, %%"REG_a"), %%mm5    \n\t"
        // Repeated PAVGB against the same register realizes the 3:1 weight:
        // avg(avg(a,b), b) ~= (a + 3b) / 4 (with pavg rounding).
        PAVGB"                  %%mm0, %%mm5    \n\t"
        PAVGB"                  %%mm0, %%mm3    \n\t"
        PAVGB"                  %%mm0, %%mm5    \n\t"
        PAVGB"                  %%mm0, %%mm3    \n\t"
        PAVGB"                  %%mm1, %%mm4    \n\t"
        PAVGB"                  %%mm1, %%mm2    \n\t"
        PAVGB"                  %%mm1, %%mm4    \n\t"
        PAVGB"                  %%mm1, %%mm2    \n\t"
        "movq                   %%mm5, %%mm7    \n\t"
        "movq                   %%mm4, %%mm6    \n\t"
        "punpcklbw              %%mm3, %%mm5    \n\t"
        "punpckhbw              %%mm3, %%mm7    \n\t"
        "punpcklbw              %%mm2, %%mm4    \n\t"
        "punpckhbw              %%mm2, %%mm6    \n\t"
#if 1
        MOVNTQ"                 %%mm5,  (%2, %%"REG_a", 2)  \n\t"
        MOVNTQ"                 %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
        MOVNTQ"                 %%mm4,  (%3, %%"REG_a", 2)  \n\t"
        MOVNTQ"                 %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
#else
        "movq                   %%mm5,  (%2, %%"REG_a", 2)  \n\t"
        "movq                   %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
        "movq                   %%mm4,  (%3, %%"REG_a", 2)  \n\t"
        "movq                   %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
#endif
        "add                       $8, %%"REG_a"            \n\t"
        " js                       1b                       \n\t"
        :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
           "g" (-mmxSize)
        : "%"REG_a

        );
#else
        // No MMX: make the scalar tail loop below cover the whole row
        // (it starts at x = mmxSize-1 = 0).
        const x86_reg mmxSize=1;
#endif
        // left edge column of the two output rows
        dst[0        ]= (3*src[0] +   src[srcStride])>>2;
        dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;

        // scalar tail: whatever the asm (if any) did not cover
        for (x=mmxSize-1; x<srcWidth-1; x++){
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        // right edge column
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
#if 1
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
#else
    for (x=0; x<srcWidth; x++){
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#endif

#if HAVE_MMX
// Leave MMX state clean and flush the non-temporal (MOVNTQ) stores.
__asm__ volatile(   EMMS"       \n\t"
                SFENCE"     \n\t"
                :::"memory");
#endif
}
1987

    
1988
/**
 * Convert packed UYVY to planar YV12.
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1; // one U/V sample per 2 pixels
    for (y=0; y<height; y+=2)
    {
#if HAVE_MMX
        // First line of the pair: split 16 UYVY pixels per iteration into
        // 16 Y bytes, 8 U bytes and 8 V bytes. In UYVY, chroma sits in the
        // low byte of each word, luma in the high byte — hence pand extracts
        // chroma and psrlw $8 extracts luma (the reverse of yuy2toyv12).
        __asm__ volatile(
        "xor                 %%"REG_a", %%"REG_a"   \n\t"
        "pcmpeqw             %%mm7, %%mm7   \n\t"
        "psrlw                  $8, %%mm7   \n\t" // FF,00,FF,00... (low-byte mask)
        ASMALIGN(4)
        "1:                                 \n\t"
        PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
        "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
        "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
        "movq                %%mm0, %%mm2   \n\t" // UYVY UYVY(0)
        "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(4)
        "pand                %%mm7, %%mm0   \n\t" // U0V0 U0V0(0)
        "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(4)
        "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(0)
        "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(4)
        "packuswb            %%mm1, %%mm0   \n\t" // UVUV UVUV(0)
        "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(0)

        MOVNTQ"              %%mm2,  (%1, %%"REG_a", 2) \n\t"

        "movq     16(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(8)
        "movq     24(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(12)
        "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(8)
        "movq                %%mm2, %%mm4   \n\t" // UYVY UYVY(12)
        "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(8)
        "pand                %%mm7, %%mm2   \n\t" // U0V0 U0V0(12)
        "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(8)
        "psrlw                  $8, %%mm4   \n\t" // Y0Y0 Y0Y0(12)
        "packuswb            %%mm2, %%mm1   \n\t" // UVUV UVUV(8)
        "packuswb            %%mm4, %%mm3   \n\t" // YYYY YYYY(8)

        MOVNTQ"              %%mm3, 8(%1, %%"REG_a", 2) \n\t"

        // Separate the interleaved UVUV words into planar U and V.
        "movq                %%mm0, %%mm2   \n\t" // UVUV UVUV(0)
        "movq                %%mm1, %%mm3   \n\t" // UVUV UVUV(8)
        "psrlw                  $8, %%mm0   \n\t" // V0V0 V0V0(0)
        "psrlw                  $8, %%mm1   \n\t" // V0V0 V0V0(8)
        "pand                %%mm7, %%mm2   \n\t" // U0U0 U0U0(0)
        "pand                %%mm7, %%mm3   \n\t" // U0U0 U0U0(8)
        "packuswb            %%mm1, %%mm0   \n\t" // VVVV VVVV(0)
        "packuswb            %%mm3, %%mm2   \n\t" // UUUU UUUU(0)

        MOVNTQ"              %%mm0, (%3, %%"REG_a") \n\t"
        MOVNTQ"              %%mm2, (%2, %%"REG_a") \n\t"

        "add                    $8, %%"REG_a"   \n\t"
        "cmp                    %4, %%"REG_a"   \n\t"
        " jb                    1b          \n\t"
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
        : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        // Second line of the pair: extract luma only (high byte of each
        // UYVY word), chroma is dropped. No mask register is needed here.
        __asm__ volatile(
        "xor                 %%"REG_a", %%"REG_a"   \n\t"
        ASMALIGN(4)
        "1:                                 \n\t"
        PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
        "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
        "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
        "movq     16(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(8)
        "movq     24(%0, %%"REG_a", 4), %%mm3   \n\t" // UYVY UYVY(12)
        "psrlw                  $8, %%mm0   \n\t" // Y0Y0 Y0Y0(0)
        "psrlw                  $8, %%mm1   \n\t" // Y0Y0 Y0Y0(4)
        "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(8)
        "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(12)
        "packuswb            %%mm1, %%mm0   \n\t" // YYYY YYYY(0)
        "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(8)

        MOVNTQ"              %%mm0,  (%1, %%"REG_a", 2) \n\t"
        MOVNTQ"              %%mm2, 8(%1, %%"REG_a", 2) \n\t"

        "add                    $8, %%"REG_a"   \n\t"
        "cmp                    %4, %%"REG_a"   \n\t"
        " jb                    1b          \n\t"

        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
        : "memory", "%"REG_a
        );
#else
        // C fallback: first line — demux U, Y, V.
        long i;
        for (i=0; i<chromWidth; i++)
        {
            udst[i]     = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i]     = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        // Second line — luma only, chroma bytes are skipped.
        for (i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#if HAVE_MMX
// Leave MMX state clean and flush the non-temporal (MOVNTQ) stores.
__asm__ volatile(   EMMS"       \n\t"
                SFENCE"     \n\t"
                :::"memory");
#endif
}
2113

    
2114
/**
2115
 * Height should be a multiple of 2 and width should be a multiple of 2.
2116
 * (If this is a problem for anyone then tell me, and I will fix it.)
2117
 * Chrominance data is only taken from every second line,
2118
 * others are ignored in the C version.
2119
 * FIXME: Write HQ version.
2120
 */
2121
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2122
                                       long width, long height,
2123
                                       long lumStride, long chromStride, long srcStride)
2124
{
2125
    long y;
2126
    const x86_reg chromWidth= width>>1;
2127
#if HAVE_MMX
2128
    for (y=0; y<height-2; y+=2)
2129
    {
2130
        long i;
2131
        for (i=0; i<2; i++)
2132
        {
2133
            __asm__ volatile(
2134
            "mov                        %2, %%"REG_a"   \n\t"
2135
            "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
2136
            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2137
            "pxor                    %%mm7, %%mm7       \n\t"
2138
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2139
            ASMALIGN(4)
2140
            "1:                                         \n\t"
2141
            PREFETCH"    64(%0, %%"REG_d")              \n\t"
2142
            "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2143
            "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
2144
            "punpcklbw               %%mm7, %%mm0       \n\t"
2145
            "punpcklbw               %%mm7, %%mm1       \n\t"
2146
            "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
2147
            "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
2148
            "punpcklbw               %%mm7, %%mm2       \n\t"
2149
            "punpcklbw               %%mm7, %%mm3       \n\t"
2150
            "pmaddwd                 %%mm6, %%mm0       \n\t"
2151
            "pmaddwd                 %%mm6, %%mm1       \n\t"
2152
            "pmaddwd                 %%mm6, %%mm2       \n\t"
2153
            "pmaddwd                 %%mm6, %%mm3       \n\t"
2154
#ifndef FAST_BGR2YV12
2155
            "psrad                      $8, %%mm0       \n\t"
2156
            "psrad                      $8, %%mm1       \n\t"
2157
            "psrad                      $8, %%mm2       \n\t"
2158
            "psrad                      $8, %%mm3       \n\t"
2159
#endif
2160
            "packssdw                %%mm1, %%mm0       \n\t"
2161
            "packssdw                %%mm3, %%mm2       \n\t"
2162
            "pmaddwd                 %%mm5, %%mm0       \n\t"
2163
            "pmaddwd                 %%mm5, %%mm2       \n\t"
2164
            "packssdw                %%mm2, %%mm0       \n\t"
2165
            "psraw                      $7, %%mm0       \n\t"
2166

    
2167
            "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2168
            "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
2169
            "punpcklbw               %%mm7, %%mm4       \n\t"
2170
            "punpcklbw               %%mm7, %%mm1       \n\t"
2171
            "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
2172
            "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
2173
            "punpcklbw               %%mm7, %%mm2       \n\t"
2174
            "punpcklbw               %%mm7, %%mm3       \n\t"
2175
            "pmaddwd                 %%mm6, %%mm4       \n\t"
2176
            "pmaddwd                 %%mm6, %%mm1       \n\t"
2177
            "pmaddwd                 %%mm6, %%mm2       \n\t"
2178
            "pmaddwd                 %%mm6, %%mm3       \n\t"
2179
#ifndef FAST_BGR2YV12
2180
            "psrad                      $8, %%mm4       \n\t"
2181
            "psrad                      $8, %%mm1       \n\t"
2182
            "psrad                      $8, %%mm2       \n\t"
2183
            "psrad                      $8, %%mm3       \n\t"
2184
#endif
2185
            "packssdw                %%mm1, %%mm4       \n\t"
2186
            "packssdw                %%mm3, %%mm2       \n\t"
2187
            "pmaddwd                 %%mm5, %%mm4       \n\t"
2188
            "pmaddwd                 %%mm5, %%mm2       \n\t"
2189
            "add                       $24, %%"REG_d"   \n\t"
2190
            "packssdw                %%mm2, %%mm4       \n\t"
2191
            "psraw                      $7, %%mm4       \n\t"
2192

    
2193
            "packuswb                %%mm4, %%mm0       \n\t"
2194
            "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
2195

    
2196
            MOVNTQ"                  %%mm0, (%1, %%"REG_a") \n\t"
2197
            "add                        $8,      %%"REG_a"  \n\t"
2198
            " js                        1b                  \n\t"
2199
            : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
2200
            : "%"REG_a, "%"REG_d
2201
            );
2202
            ydst += lumStride;
2203
            src  += srcStride;
2204
        }
2205
        src -= srcStride*2;
2206
        __asm__ volatile(
2207
        "mov                        %4, %%"REG_a"   \n\t"
2208
        "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2209
        "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
2210
        "pxor                    %%mm7, %%mm7       \n\t"
2211
        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2212
        "add                 %%"REG_d", %%"REG_d"   \n\t"
2213
        ASMALIGN(4)
2214
        "1:                                         \n\t"
2215
        PREFETCH"    64(%0, %%"REG_d")              \n\t"
2216
        PREFETCH"    64(%1, %%"REG_d")              \n\t"
2217
#if HAVE_MMX2 || HAVE_AMD3DNOW
2218
        "movq          (%0, %%"REG_d"), %%mm0       \n\t"
2219
        "movq          (%1, %%"REG_d"), %%mm1       \n\t"
2220
        "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
2221
        "movq         6(%1, %%"REG_d"), %%mm3       \n\t"
2222
        PAVGB"                   %%mm1, %%mm0       \n\t"
2223
        PAVGB"                   %%mm3, %%mm2       \n\t"
2224
        "movq                    %%mm0, %%mm1       \n\t"
2225
        "movq                    %%mm2, %%mm3       \n\t"
2226
        "psrlq                     $24, %%mm0       \n\t"
2227
        "psrlq                     $24, %%mm2       \n\t"
2228
        PAVGB"                   %%mm1, %%mm0       \n\t"
2229
        PAVGB"                   %%mm3, %%mm2       \n\t"
2230
        "punpcklbw               %%mm7, %%mm0       \n\t"
2231
        "punpcklbw               %%mm7, %%mm2       \n\t"
2232
#else
2233
        "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2234
        "movd          (%1, %%"REG_d"), %%mm1       \n\t"
2235
        "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
2236
        "movd         3(%1, %%"REG_d"), %%mm3       \n\t"
2237
        "punpcklbw               %%mm7, %%mm0       \n\t"
2238
        "punpcklbw               %%mm7, %%mm1       \n\t"
2239
        "punpcklbw               %%mm7, %%mm2       \n\t"
2240
        "punpcklbw               %%mm7, %%mm3       \n\t"
2241
        "paddw                   %%mm1, %%mm0       \n\t"
2242
        "paddw                   %%mm3, %%mm2       \n\t"
2243
        "paddw                   %%mm2, %%mm0       \n\t"
2244
        "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
2245
        "movd         6(%1, %%"REG_d"), %%mm1       \n\t"
2246
        "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
2247
        "movd         9(%1, %%"REG_d"), %%mm3       \n\t"
2248
        "punpcklbw               %%mm7, %%mm4       \n\t"
2249
        "punpcklbw               %%mm7, %%mm1       \n\t"
2250
        "punpcklbw               %%mm7, %%mm2       \n\t"
2251
        "punpcklbw               %%mm7, %%mm3       \n\t"
2252
        "paddw                   %%mm1, %%mm4       \n\t"
2253
        "paddw                   %%mm3, %%mm2       \n\t"
2254
        "paddw                   %%mm4, %%mm2       \n\t"
2255
        "psrlw                      $2, %%mm0       \n\t"
2256
        "psrlw                      $2, %%mm2       \n\t"
2257
#endif
2258
        "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
2259
        "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
2260

    
2261
        "pmaddwd                 %%mm0, %%mm1       \n\t"
2262
        "pmaddwd                 %%mm2, %%mm3       \n\t"
2263
        "pmaddwd                 %%mm6, %%mm0       \n\t"
2264
        "pmaddwd                 %%mm6, %%mm2       \n\t"
2265
#ifndef FAST_BGR2YV12
2266
        "psrad                      $8, %%mm0       \n\t"
2267
        "psrad                      $8, %%mm1       \n\t"
2268
        "psrad                      $8, %%mm2       \n\t"
2269
        "psrad                      $8, %%mm3       \n\t"
2270
#endif
2271
        "packssdw                %%mm2, %%mm0       \n\t"
2272
        "packssdw                %%mm3, %%mm1       \n\t"
2273
        "pmaddwd                 %%mm5, %%mm0       \n\t"
2274
        "pmaddwd                 %%mm5, %%mm1       \n\t"
2275
        "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
2276
        "psraw                      $7, %%mm0       \n\t"
2277

    
2278
#if HAVE_MMX2 || HAVE_AMD3DNOW
2279
        "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
2280
        "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
2281
        "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
2282
        "movq        18(%1, %%"REG_d"), %%mm3       \n\t"
2283
        PAVGB"                   %%mm1, %%mm4       \n\t"
2284
        PAVGB"                   %%mm3, %%mm2       \n\t"
2285
        "movq                    %%mm4, %%mm1       \n\t"
2286
        "movq                    %%mm2, %%mm3       \n\t"
2287
        "psrlq                     $24, %%mm4       \n\t"
2288
        "psrlq                     $24, %%mm2       \n\t"
2289
        PAVGB"                   %%mm1, %%mm4       \n\t"
2290
        PAVGB"                   %%mm3, %%mm2       \n\t"
2291
        "punpcklbw               %%mm7, %%mm4       \n\t"
2292
        "punpcklbw               %%mm7, %%mm2       \n\t"
2293
#else
2294
        "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2295
        "movd        12(%1, %%"REG_d"), %%mm1       \n\t"
2296
        "movd        15(%0, %%"REG_d"), %%mm2       \n\t"
2297
        "movd        15(%1, %%"REG_d"), %%mm3       \n\t"
2298
        "punpcklbw               %%mm7, %%mm4       \n\t"
2299
        "punpcklbw               %%mm7, %%mm1       \n\t"
2300
        "punpcklbw               %%mm7, %%mm2       \n\t"
2301
        "punpcklbw               %%mm7, %%mm3       \n\t"
2302
        "paddw                   %%mm1, %%mm4       \n\t"
2303
        "paddw                   %%mm3, %%mm2       \n\t"
2304
        "paddw                   %%mm2, %%mm4       \n\t"
2305
        "movd        18(%0, %%"REG_d"), %%mm5       \n\t"
2306
        "movd        18(%1, %%"REG_d"), %%mm1       \n\t"
2307
        "movd        21(%0, %%"REG_d"), %%mm2       \n\t"
2308
        "movd        21(%1, %%"REG_d"), %%mm3       \n\t"
2309
        "punpcklbw               %%mm7, %%mm5       \n\t"
2310
        "punpcklbw               %%mm7, %%mm1       \n\t"
2311
        "punpcklbw               %%mm7, %%mm2       \n\t"
2312
        "punpcklbw               %%mm7, %%mm3       \n\t"
2313
        "paddw                   %%mm1, %%mm5       \n\t"
2314
        "paddw                   %%mm3, %%mm2       \n\t"
2315
        "paddw                   %%mm5, %%mm2       \n\t"
2316
        "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2317
        "psrlw                      $2, %%mm4       \n\t"
2318
        "psrlw                      $2, %%mm2       \n\t"
2319
#endif
2320
        "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
2321
        "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
2322

    
2323
        "pmaddwd                 %%mm4, %%mm1       \n\t"
2324
        "pmaddwd                 %%mm2, %%mm3       \n\t"
2325
        "pmaddwd                 %%mm6, %%mm4       \n\t"
2326
        "pmaddwd                 %%mm6, %%mm2       \n\t"
2327
#ifndef FAST_BGR2YV12
2328
        "psrad                      $8, %%mm4       \n\t"
2329
        "psrad                      $8, %%mm1       \n\t"
2330
        "psrad                      $8, %%mm2       \n\t"
2331
        "psrad                      $8, %%mm3       \n\t"
2332
#endif
2333
        "packssdw                %%mm2, %%mm4       \n\t"
2334
        "packssdw                %%mm3, %%mm1       \n\t"
2335
        "pmaddwd                 %%mm5, %%mm4       \n\t"
2336
        "pmaddwd                 %%mm5, %%mm1       \n\t"
2337
        "add                       $24, %%"REG_d"   \n\t"
2338
        "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
2339
        "psraw                      $7, %%mm4       \n\t"
2340

    
2341
        "movq                    %%mm0, %%mm1           \n\t"
2342
        "punpckldq               %%mm4, %%mm0           \n\t"
2343
        "punpckhdq               %%mm4, %%mm1           \n\t"
2344
        "packsswb                %%mm1, %%mm0           \n\t"
2345
        "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0         \n\t"
2346
        "movd                    %%mm0, (%2, %%"REG_a") \n\t"
2347
        "punpckhdq               %%mm0, %%mm0           \n\t"
2348
        "movd                    %%mm0, (%3, %%"REG_a") \n\t"
2349
        "add                        $4, %%"REG_a"       \n\t"
2350
        " js                        1b                  \n\t"
2351
        : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2352
        : "%"REG_a, "%"REG_d
2353
        );
2354

    
2355
        udst += chromStride;
2356
        vdst += chromStride;
2357
        src  += srcStride*2;
2358
    }
2359

    
2360
    __asm__ volatile(   EMMS"       \n\t"
2361
                    SFENCE"     \n\t"
2362
                    :::"memory");
2363
#else
2364
    y=0;
2365
#endif
2366
    for (; y<height; y+=2)
2367
    {
2368
        long i;
2369
        for (i=0; i<chromWidth; i++)
2370
        {
2371
            unsigned int b = src[6*i+0];
2372
            unsigned int g = src[6*i+1];
2373
            unsigned int r = src[6*i+2];
2374

    
2375
            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2376
            unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2377
            unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2378

    
2379
            udst[i]     = U;
2380
            vdst[i]     = V;
2381
            ydst[2*i]   = Y;
2382

    
2383
            b = src[6*i+3];
2384
            g = src[6*i+4];
2385
            r = src[6*i+5];
2386

    
2387
            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2388
            ydst[2*i+1]     = Y;
2389
        }
2390
        ydst += lumStride;
2391
        src  += srcStride;
2392

    
2393
        for (i=0; i<chromWidth; i++)
2394
        {
2395
            unsigned int b = src[6*i+0];
2396
            unsigned int g = src[6*i+1];
2397
            unsigned int r = src[6*i+2];
2398

    
2399
            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2400

    
2401
            ydst[2*i]     = Y;
2402

    
2403
            b = src[6*i+3];
2404
            g = src[6*i+4];
2405
            r = src[6*i+5];
2406

    
2407
            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2408
            ydst[2*i+1]     = Y;
2409
        }
2410
        udst += chromStride;
2411
        vdst += chromStride;
2412
        ydst += lumStride;
2413
        src  += srcStride;
2414
    }
2415
}
2416

    
2417
/*
 * Byte-interleave two planes into one:
 *   dest[2*w+0] = src1[w], dest[2*w+1] = src2[w]
 * for each of 'height' rows. Strides are in bytes and applied per row.
 */
static void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
                             long width, long height, long src1Stride,
                             long src2Stride, long dstStride){
    long h;

    for (h=0; h < height; h++)
    {
        long w;

#if HAVE_MMX
#if HAVE_SSE2
        // SSE2 path: interleave 16 bytes of each source per iteration,
        // writing 32 bytes with non-temporal (cache-bypassing) stores.
        // NOTE(review): movdqa/movntdq require 16-byte alignment of
        // src1, src2 and dest -- confirm callers guarantee this.
        // NOTE(review): %%xmm1 is loaded from %1 (src1) twice; presumably
        // intentional (both halves of src1 are needed) -- verify.
        __asm__(
        "xor              %%"REG_a", %%"REG_a"  \n\t"
        "1:                                     \n\t"
        PREFETCH" 64(%1, %%"REG_a")             \n\t"
        PREFETCH" 64(%2, %%"REG_a")             \n\t"
        "movdqa     (%1, %%"REG_a"), %%xmm0     \n\t"
        "movdqa     (%1, %%"REG_a"), %%xmm1     \n\t"
        "movdqa     (%2, %%"REG_a"), %%xmm2     \n\t"
        "punpcklbw           %%xmm2, %%xmm0     \n\t"
        "punpckhbw           %%xmm2, %%xmm1     \n\t"
        "movntdq             %%xmm0,   (%0, %%"REG_a", 2)   \n\t"
        "movntdq             %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
        "add                    $16, %%"REG_a"  \n\t"
        "cmp                     %3, %%"REG_a"  \n\t"
        " jb                     1b             \n\t"
        ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
        : "memory", "%"REG_a""
        );
#else
        // MMX path: interleave 16 bytes of each source per iteration
        // (punpcklbw/punpckhbw merge the low/high 8 bytes respectively).
        __asm__(
        "xor %%"REG_a", %%"REG_a"               \n\t"
        "1:                                     \n\t"
        PREFETCH" 64(%1, %%"REG_a")             \n\t"
        PREFETCH" 64(%2, %%"REG_a")             \n\t"
        "movq       (%1, %%"REG_a"), %%mm0      \n\t"
        "movq      8(%1, %%"REG_a"), %%mm2      \n\t"
        "movq                 %%mm0, %%mm1      \n\t"
        "movq                 %%mm2, %%mm3      \n\t"
        "movq       (%2, %%"REG_a"), %%mm4      \n\t"
        "movq      8(%2, %%"REG_a"), %%mm5      \n\t"
        "punpcklbw            %%mm4, %%mm0      \n\t"
        "punpckhbw            %%mm4, %%mm1      \n\t"
        "punpcklbw            %%mm5, %%mm2      \n\t"
        "punpckhbw            %%mm5, %%mm3      \n\t"
        MOVNTQ"               %%mm0,   (%0, %%"REG_a", 2)   \n\t"
        MOVNTQ"               %%mm1,  8(%0, %%"REG_a", 2)   \n\t"
        MOVNTQ"               %%mm2, 16(%0, %%"REG_a", 2)   \n\t"
        MOVNTQ"               %%mm3, 24(%0, %%"REG_a", 2)   \n\t"
        "add                    $16, %%"REG_a"  \n\t"
        "cmp                     %3, %%"REG_a"  \n\t"
        " jb                     1b             \n\t"
        ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
        : "memory", "%"REG_a
        );
#endif
        // Scalar tail: the remaining width % 16 bytes of the row.
        for (w= (width&(~15)); w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#else
        // Pure C fallback when MMX is unavailable.
        for (w=0; w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#endif
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
#if HAVE_MMX
    // Leave MMX state (EMMS) and drain the non-temporal store buffers (SFENCE).
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
        );
#endif
}
2497

    
2498
/*
 * Upsample two planes by 2x in both directions:
 * each source byte is doubled horizontally (d[2x] = d[2x+1] = s[x]) and
 * each source row feeds two destination rows (row index y>>1).
 * src1 -> dst1 and src2 -> dst2 are processed independently with the same
 * algorithm; w and h are half the supplied width/height (source-plane size).
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       long width, long height,
                                       long srcStride1, long srcStride2,
                                       long dstStride1, long dstStride2)
{
    x86_reg y;
    long x,w,h;
    w=width/2; h=height/2;
#if HAVE_MMX
    // Warm the cache with the start of each source plane's second row.
    __asm__ volatile(
    PREFETCH" %0    \n\t"
    PREFETCH" %1    \n\t"
    ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    for (y=0;y<h;y++){
    const uint8_t* s1=src1+srcStride1*(y>>1);   // each source row used twice
    uint8_t* d=dst1+dstStride1*y;
    x=0;
#if HAVE_MMX
    // MMX path: 32 source bytes -> 64 destination bytes per iteration;
    // punpcklbw/punpckhbw with the register itself duplicates every byte.
    for (;x<w-31;x+=32)
    {
        __asm__ volatile(
        PREFETCH"   32%1        \n\t"
        "movq         %1, %%mm0 \n\t"
        "movq        8%1, %%mm2 \n\t"
        "movq       16%1, %%mm4 \n\t"
        "movq       24%1, %%mm6 \n\t"
        "movq      %%mm0, %%mm1 \n\t"
        "movq      %%mm2, %%mm3 \n\t"
        "movq      %%mm4, %%mm5 \n\t"
        "movq      %%mm6, %%mm7 \n\t"
        "punpcklbw %%mm0, %%mm0 \n\t"
        "punpckhbw %%mm1, %%mm1 \n\t"
        "punpcklbw %%mm2, %%mm2 \n\t"
        "punpckhbw %%mm3, %%mm3 \n\t"
        "punpcklbw %%mm4, %%mm4 \n\t"
        "punpckhbw %%mm5, %%mm5 \n\t"
        "punpcklbw %%mm6, %%mm6 \n\t"
        "punpckhbw %%mm7, %%mm7 \n\t"
        MOVNTQ"    %%mm0,   %0  \n\t"
        MOVNTQ"    %%mm1,  8%0  \n\t"
        MOVNTQ"    %%mm2, 16%0  \n\t"
        MOVNTQ"    %%mm3, 24%0  \n\t"
        MOVNTQ"    %%mm4, 32%0  \n\t"
        MOVNTQ"    %%mm5, 40%0  \n\t"
        MOVNTQ"    %%mm6, 48%0  \n\t"
        MOVNTQ"    %%mm7, 56%0"
        :"=m"(d[2*x])
        :"m"(s1[x])
        :"memory");
    }
#endif
    // Scalar tail: duplicate each remaining byte horizontally.
    for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    // Second plane: identical to the loop above but for src2 -> dst2.
    for (y=0;y<h;y++){
    const uint8_t* s2=src2+srcStride2*(y>>1);
    uint8_t* d=dst2+dstStride2*y;
    x=0;
#if HAVE_MMX
    for (;x<w-31;x+=32)
    {
        __asm__ volatile(
        PREFETCH"   32%1        \n\t"
        "movq         %1, %%mm0 \n\t"
        "movq        8%1, %%mm2 \n\t"
        "movq       16%1, %%mm4 \n\t"
        "movq       24%1, %%mm6 \n\t"
        "movq      %%mm0, %%mm1 \n\t"
        "movq      %%mm2, %%mm3 \n\t"
        "movq      %%mm4, %%mm5 \n\t"
        "movq      %%mm6, %%mm7 \n\t"
        "punpcklbw %%mm0, %%mm0 \n\t"
        "punpckhbw %%mm1, %%mm1 \n\t"
        "punpcklbw %%mm2, %%mm2 \n\t"
        "punpckhbw %%mm3, %%mm3 \n\t"
        "punpcklbw %%mm4, %%mm4 \n\t"
        "punpckhbw %%mm5, %%mm5 \n\t"
        "punpcklbw %%mm6, %%mm6 \n\t"
        "punpckhbw %%mm7, %%mm7 \n\t"
        MOVNTQ"    %%mm0,   %0  \n\t"
        MOVNTQ"    %%mm1,  8%0  \n\t"
        MOVNTQ"    %%mm2, 16%0  \n\t"
        MOVNTQ"    %%mm3, 24%0  \n\t"
        MOVNTQ"    %%mm4, 32%0  \n\t"
        MOVNTQ"    %%mm5, 40%0  \n\t"
        MOVNTQ"    %%mm6, 48%0  \n\t"
        MOVNTQ"    %%mm7, 56%0"
        :"=m"(d[2*x])
        :"m"(s2[x])
        :"memory");
    }
#endif
    for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#if HAVE_MMX
    // Leave MMX state and drain non-temporal stores.
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
        );
#endif
}
2601

    
2602
/*
 * Convert planar input with 4:1 chroma subsampling in both directions
 * (YUV410-style: src1 = Y, src2 = U, src3 = V) to packed YUY2
 * (byte order Y0 U0 Y1 V0 ...).
 * Each chroma row is reused for four output rows (index y>>2) and each
 * chroma sample covers four luma samples (the scalar tail emits 8 packed
 * bytes -- 4 Y, 2 U, 2 V -- per loop step, repeating up[x]/vp[x]).
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        long width, long height,
                                        long srcStride1, long srcStride2,
                                        long srcStride3, long dstStride)
{
    x86_reg x;
    long y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++){
    const uint8_t* yp=src1+srcStride1*y;
    const uint8_t* up=src2+srcStride2*(y>>2);   // chroma row advances every 4th line
    const uint8_t* vp=src3+srcStride3*(y>>2);
    uint8_t* d=dst+dstStride*y;
    x=0;
#if HAVE_MMX
    // MMX path: 8 chroma samples (32 luma samples, 64 output bytes) per pass.
    for (;x<w-7;x+=8)
    {
        __asm__ volatile(
        PREFETCH"   32(%1, %0)          \n\t"
        PREFETCH"   32(%2, %0)          \n\t"
        PREFETCH"   32(%3, %0)          \n\t"
        "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
        "movq         (%2, %0), %%mm1   \n\t" /* U0U1U2U3U4U5U6U7 */
        "movq         (%3, %0), %%mm2   \n\t" /* V0V1V2V3V4V5V6V7 */
        "movq            %%mm0, %%mm3   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
        "movq            %%mm1, %%mm4   \n\t" /* U0U1U2U3U4U5U6U7 */
        "movq            %%mm2, %%mm5   \n\t" /* V0V1V2V3V4V5V6V7 */
        "punpcklbw       %%mm1, %%mm1   \n\t" /* U0U0 U1U1 U2U2 U3U3 */
        "punpcklbw       %%mm2, %%mm2   \n\t" /* V0V0 V1V1 V2V2 V3V3 */
        "punpckhbw       %%mm4, %%mm4   \n\t" /* U4U4 U5U5 U6U6 U7U7 */
        "punpckhbw       %%mm5, %%mm5   \n\t" /* V4V4 V5V5 V6V6 V7V7 */

        "movq            %%mm1, %%mm6   \n\t"
        "punpcklbw       %%mm2, %%mm1   \n\t" /* U0V0 U0V0 U1V1 U1V1*/
        "punpcklbw       %%mm1, %%mm0   \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
        "punpckhbw       %%mm1, %%mm3   \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
        MOVNTQ"          %%mm0,  (%4, %0, 8)    \n\t"
        MOVNTQ"          %%mm3, 8(%4, %0, 8)    \n\t"

        "punpckhbw       %%mm2, %%mm6   \n\t" /* U2V2 U2V2 U3V3 U3V3*/
        "movq     8(%1, %0, 4), %%mm0   \n\t"
        "movq            %%mm0, %%mm3   \n\t"
        "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U2 Y V2 Y U2 Y V2*/
        "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U3 Y V3 Y U3 Y V3*/
        MOVNTQ"          %%mm0, 16(%4, %0, 8)   \n\t"
        MOVNTQ"          %%mm3, 24(%4, %0, 8)   \n\t"

        "movq            %%mm4, %%mm6   \n\t"
        "movq    16(%1, %0, 4), %%mm0   \n\t"
        "movq            %%mm0, %%mm3   \n\t"
        "punpcklbw       %%mm5, %%mm4   \n\t"
        "punpcklbw       %%mm4, %%mm0   \n\t" /* Y U4 Y V4 Y U4 Y V4*/
        "punpckhbw       %%mm4, %%mm3   \n\t" /* Y U5 Y V5 Y U5 Y V5*/
        MOVNTQ"          %%mm0, 32(%4, %0, 8)   \n\t"
        MOVNTQ"          %%mm3, 40(%4, %0, 8)   \n\t"

        "punpckhbw       %%mm5, %%mm6   \n\t"
        "movq    24(%1, %0, 4), %%mm0   \n\t"
        "movq            %%mm0, %%mm3   \n\t"
        "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U6 Y V6 Y U6 Y V6*/
        "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U7 Y V7 Y U7 Y V7*/
        MOVNTQ"          %%mm0, 48(%4, %0, 8)   \n\t"
        MOVNTQ"          %%mm3, 56(%4, %0, 8)   \n\t"

        : "+r" (x)
        : "r"(yp), "r" (up), "r"(vp), "r"(d)
        :"memory");
    }
#endif
    // Scalar tail: 4 luma + 1 chroma pair per step, duplicated as in YUY2.
    for (; x<w; x++)
    {
        const long x2 = x<<2;
        d[8*x+0] = yp[x2];
        d[8*x+1] = up[x];
        d[8*x+2] = yp[x2+1];
        d[8*x+3] = vp[x];
        d[8*x+4] = yp[x2+2];
        d[8*x+5] = up[x];
        d[8*x+6] = yp[x2+3];
        d[8*x+7] = vp[x];
    }
    }
#if HAVE_MMX
    // Leave MMX state and drain non-temporal stores.
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
        );
#endif
}
2693

    
2694
/*
 * Copy every second byte of src to dst: dst[i] = src[2*i] for i = 0..count-1.
 * Used to pull one plane (e.g. luma) out of an interleaved 4:2:2 stream.
 * Both pointers are biased by count and the index runs from -count up to 0,
 * so the loop condition is a cheap sign test ("js").
 */
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst +=   count;
    src += 2*count;
    count= - count;

#if HAVE_MMX
    if(count <= -16){
        // MMX: 16 output bytes per iteration; scalar loop below handles the rest.
        count += 15;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"  // mm7 = 0x00FF per word: keeps even source bytes
            "1:                                \n\t"
            "movq -30(%1, %0, 2), %%mm0        \n\t"
            "movq -22(%1, %0, 2), %%mm1        \n\t"
            "movq -14(%1, %0, 2), %%mm2        \n\t"
            "movq  -6(%1, %0, 2), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"  // compact the masked words back to bytes
            "packuswb      %%mm3, %%mm2        \n\t"
            MOVNTQ"        %%mm0,-15(%2, %0)   \n\t"
            MOVNTQ"        %%mm2,- 7(%2, %0)   \n\t"
            "add             $16, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;
    }
#endif
    // Scalar tail (and full fallback when MMX is unavailable).
    while(count<0){
        dst[count]= src[2*count];
        count++;
    }
}
2732

    
2733
/*
 * Deinterleave the even-offset bytes of a 4-bytes-per-group stream:
 * dst0[i] = src[4*i], dst1[i] = src[4*i+2] for i = 0..count-1.
 * For UYVY input this extracts the U and V planes.
 * Pointers are biased by count so the loop index runs from -count up to 0.
 */
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
#if HAVE_MMX
    if(count <= -8){
        // MMX: 8 output bytes per plane per iteration; scalar loop mops up the rest.
        count += 7;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"  // mm7 = 0x00FF mask per word
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t"  // keep bytes at even offsets 0,2,...
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"  // bytes at offsets 0,2,4,... of src
            "packuswb      %%mm3, %%mm2        \n\t"
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"  // offsets 2,6,... -> dst1
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t"  // offsets 0,4,... -> dst0
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    // Scalar tail (and full fallback when MMX is unavailable).
    while(count<0){
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2780

    
2781
/*
 * Deinterleave the odd-offset bytes of a 4-bytes-per-group stream:
 * dst0[i] = src[4*i+1], dst1[i] = src[4*i+3] for i = 0..count-1.
 * For YUYV (YUY2) input this extracts the U and V planes.
 * Pointers are biased by count so the loop index runs from -count up to 0.
 */
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
#if HAVE_MMX
    if(count <= -8){
        // MMX: 8 output bytes per plane per iteration; scalar loop mops up the rest.
        count += 7;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"  // mm7 = 0x00FF mask per word
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"  // take the *odd* source bytes 1,3,...
            "psrlw            $8, %%mm1        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "psrlw            $8, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"  // bytes at offsets 1,3,5,... of src
            "packuswb      %%mm3, %%mm2        \n\t"
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"  // offsets 3,7,... -> dst1
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t"  // offsets 1,5,... -> dst0
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    // Fixed: the scalar tail must read the *odd* source bytes (+1/+3) to match
    // the MMX path above, which shifts each word right by 8 before packing.
    // It previously read +0/+2 (the even bytes), which for YUYV input would
    // have written luma samples into the chroma planes for the last pixels.
    while(count<0){
        dst0[count]= src[4*count+1];
        dst1[count]= src[4*count+3];
        count++;
    }
}
2828

    
2829
static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2830
                                      long width, long height,
2831
                                      long lumStride, long chromStride, long srcStride)
2832
{
2833
    long y;
2834
    const long chromWidth= -((-width)>>1);
2835

    
2836
    for (y=0; y<height; y++){
2837
        RENAME(extract_even)(src, ydst, width);
2838
        if(!(y&1)){
2839
            RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2840
            udst+= chromStride;
2841
            vdst+= chromStride;
2842
        }
2843

    
2844
        src += srcStride;
2845
        ydst+= lumStride;
2846
    }
2847
#if HAVE_MMX
2848
    __asm__(
2849
        EMMS"       \n\t"
2850
        SFENCE"     \n\t"
2851
        ::: "memory"
2852
        );
2853
#endif
2854
}
2855

    
2856
static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2857
                                      long width, long height,
2858
                                      long lumStride, long chromStride, long srcStride)
2859
{
2860
    long y;
2861
    const long chromWidth= -((-width)>>1);
2862

    
2863
    for (y=0; y<height; y++){
2864
        RENAME(extract_even)(src, ydst, width);
2865
        RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2866

    
2867
        src += srcStride;
2868
        ydst+= lumStride;
2869
        udst+= chromStride;
2870
        vdst+= chromStride;
2871
    }
2872
#if HAVE_MMX
2873
    __asm__(
2874
        EMMS"       \n\t"
2875
        SFENCE"     \n\t"
2876
        ::: "memory"
2877
        );
2878
#endif
2879
}
2880

    
2881
static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2882
                                      long width, long height,
2883
                                      long lumStride, long chromStride, long srcStride)
2884
{
2885
    long y;
2886
    const long chromWidth= -((-width)>>1);
2887

    
2888
    for (y=0; y<height; y++){
2889
        RENAME(extract_even)(src+1, ydst, width);
2890
        if(!(y&1)){
2891
            RENAME(extract_even2)(src, udst, vdst, chromWidth);
2892
            udst+= chromStride;
2893
            vdst+= chromStride;
2894
        }
2895

    
2896
        src += srcStride;
2897
        ydst+= lumStride;
2898
    }
2899
#if HAVE_MMX
2900
    __asm__(
2901
        EMMS"       \n\t"
2902
        SFENCE"     \n\t"
2903
        ::: "memory"
2904
        );
2905
#endif
2906
}
2907

    
2908
static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2909
                                      long width, long height,
2910
                                      long lumStride, long chromStride, long srcStride)
2911
{
2912
    long y;
2913
    const long chromWidth= -((-width)>>1);
2914

    
2915
    for (y=0; y<height; y++){
2916
        RENAME(extract_even)(src+1, ydst, width);
2917
        RENAME(extract_even2)(src, udst, vdst, chromWidth);
2918

    
2919
        src += srcStride;
2920
        ydst+= lumStride;
2921
        udst+= chromStride;
2922
        vdst+= chromStride;
2923
    }
2924
#if HAVE_MMX
2925
    __asm__(
2926
        EMMS"       \n\t"
2927
        SFENCE"     \n\t"
2928
        ::: "memory"
2929
        );
2930
#endif
2931
}
2932

    
2933
static inline void RENAME(rgb2rgb_init)(void){
2934
    rgb15to16       = RENAME(rgb15to16);
2935
    rgb15tobgr24    = RENAME(rgb15tobgr24);
2936
    rgb15to32       = RENAME(rgb15to32);
2937
    rgb16tobgr24    = RENAME(rgb16tobgr24);
2938
    rgb16to32       = RENAME(rgb16to32);
2939
    rgb16to15       = RENAME(rgb16to15);
2940
    rgb24tobgr16    = RENAME(rgb24tobgr16);
2941
    rgb24tobgr15    = RENAME(rgb24tobgr15);
2942
    rgb24tobgr32    = RENAME(rgb24tobgr32);
2943
    rgb32to16       = RENAME(rgb32to16);
2944
    rgb32to15       = RENAME(rgb32to15);
2945
    rgb32tobgr24    = RENAME(rgb32tobgr24);
2946
    rgb24to15       = RENAME(rgb24to15);
2947
    rgb24to16       = RENAME(rgb24to16);
2948
    rgb24tobgr24    = RENAME(rgb24tobgr24);
2949
    rgb32tobgr32    = RENAME(rgb32tobgr32);
2950
    rgb32tobgr16    = RENAME(rgb32tobgr16);
2951
    rgb32tobgr15    = RENAME(rgb32tobgr15);
2952
    yv12toyuy2      = RENAME(yv12toyuy2);
2953
    yv12touyvy      = RENAME(yv12touyvy);
2954
    yuv422ptoyuy2   = RENAME(yuv422ptoyuy2);
2955
    yuv422ptouyvy   = RENAME(yuv422ptouyvy);
2956
    yuy2toyv12      = RENAME(yuy2toyv12);
2957
//    yvu9toyv12      = RENAME(yvu9toyv12);
2958
    planar2x        = RENAME(planar2x);
2959
    rgb24toyv12     = RENAME(rgb24toyv12);
2960
    interleaveBytes = RENAME(interleaveBytes);
2961
    vu9_to_vu12     = RENAME(vu9_to_vu12);
2962
    yvu9_to_yuy2    = RENAME(yvu9_to_yuy2);
2963

    
2964
    uyvytoyuv420    = RENAME(uyvytoyuv420);
2965
    uyvytoyuv422    = RENAME(uyvytoyuv422);
2966
    yuyvtoyuv420    = RENAME(yuyvtoyuv420);
2967
    yuyvtoyuv422    = RENAME(yuyvtoyuv422);
2968
}