/* ffmpeg / libswscale / rgb2rgb_template.c @ a898cdc9 */

/*
 * software RGB to RGB converter
 * pluralize by software PAL8 to RGB converter
 *              software YUV to YUV converter
 *              software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * lot of big-endian byte order fixes by Alex Beregszaszi
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */
29

    
30
#include <stddef.h>

/*
 * CPU-dispatch macro setup.  This template is compiled several times with
 * different HAVE_* feature flags (the RENAME() macro gives each instance a
 * distinct suffix), so every instruction macro is #undef'd before being
 * redefined for the current feature level.
 */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

/* SIMD register width in bytes: 16 for SSE2 (XMM), 8 for MMX. */
#if HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/* Prefetch and byte-average instructions per CPU feature level;
   plain "# nop" comments inside the asm templates otherwise. */
#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Non-temporal store and the matching store fence need MMX2;
   otherwise fall back to an ordinary movq and a nop. */
#if HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
73

    
74
/*
 * Expand 24bpp packed pixels to 32bpp by inserting an opaque alpha byte
 * (255) for every 3-byte pixel.  On little-endian the three color bytes
 * are copied in order and alpha appended; on big-endian the triplet is
 * reversed and alpha is written first.
 *
 * src:      source buffer, 3 bytes per pixel
 * dst:      destination buffer, 4 bytes per pixel
 * src_size: number of bytes to read from src
 *
 * MMX fast path: 8 pixels (24 src bytes -> 32 dst bytes) per iteration;
 * mm7 holds mask32a, OR-ed in as the alpha byte, with non-temporal
 * MOVNTQ stores.  The scalar loop finishes the remainder.
 * NOTE(review): "end - 23" is computed unconditionally, so a src_size
 * below 23 would form an out-of-range pointer — presumably callers pass
 * image-sized buffers; confirm.
 */
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    #if HAVE_MMX
        const uint8_t *mm_end;
    #endif
    end = s + src_size;
    #if HAVE_MMX
        __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
        mm_end = end - 23;
        __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
        while (s < mm_end)
        {
            /* Load 8 packed 3-byte pixels at byte offsets 0..21, widen each
               to a dword, OR in the alpha mask, store 32 bytes. */
            __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "punpckldq    3%1, %%mm0    \n\t"
            "movd         6%1, %%mm1    \n\t"
            "punpckldq    9%1, %%mm1    \n\t"
            "movd        12%1, %%mm2    \n\t"
            "punpckldq   15%1, %%mm2    \n\t"
            "movd        18%1, %%mm3    \n\t"
            "punpckldq   21%1, %%mm3    \n\t"
            "por        %%mm7, %%mm0    \n\t"
            "por        %%mm7, %%mm1    \n\t"
            "por        %%mm7, %%mm2    \n\t"
            "por        %%mm7, %%mm3    \n\t"
            MOVNTQ"     %%mm0,   %0     \n\t"
            MOVNTQ"     %%mm1,  8%0     \n\t"
            MOVNTQ"     %%mm2, 16%0     \n\t"
            MOVNTQ"     %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
            dest += 32;
            s += 24;
        }
        __asm__ volatile(SFENCE:::"memory");
        __asm__ volatile(EMMS:::"memory");
    #endif
    /* Scalar tail: up to 7 remaining pixels (or all of them without MMX). */
    while (s < end)
    {
    #if HAVE_BIGENDIAN
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
        *dest++ = 255;
        *dest++ = s[2];
        *dest++ = s[1];
        *dest++ = s[0];
        s+=3;
    #else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
    #endif
    }
}
133

    
134
/*
 * Drop the alpha byte of 32bpp pixels, producing 24bpp packed pixels.
 * On little-endian the first three bytes of each dword are kept and the
 * fourth skipped; on big-endian the first byte is skipped and the
 * remaining three are reversed.
 *
 * src:      source buffer, 4 bytes per pixel
 * dst:      destination buffer, 3 bytes per pixel
 * src_size: number of bytes to read from src
 *
 * MMX fast path: 8 pixels (32 src bytes -> 24 dst bytes) per iteration.
 * Each pixel's 3 color bytes are compacted by masking (mask24l/mask24h)
 * and shifting, then the 3-byte units are merged across the quadword
 * boundaries with the mask24hh/hhh/hhhh constants before three MOVNTQ
 * stores.  The scalar loop handles the remainder.
 */
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq         8%1, %%mm1    \n\t"
        "movq        16%1, %%mm4    \n\t"
        "movq        24%1, %%mm5    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm1, %%mm3    \n\t"
        "movq       %%mm4, %%mm6    \n\t"
        "movq       %%mm5, %%mm7    \n\t"
        /* Within each quadword: keep the low pixel's 3 bytes in place and
           pull the high pixel's 3 bytes down by one byte position. */
        "psrlq         $8, %%mm2    \n\t"
        "psrlq         $8, %%mm3    \n\t"
        "psrlq         $8, %%mm6    \n\t"
        "psrlq         $8, %%mm7    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm1    \n\t"
        "pand          %2, %%mm4    \n\t"
        "pand          %2, %%mm5    \n\t"
        "pand          %3, %%mm2    \n\t"
        "pand          %3, %%mm3    \n\t"
        "pand          %3, %%mm6    \n\t"
        "pand          %3, %%mm7    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "por        %%mm6, %%mm4    \n\t"
        "por        %%mm7, %%mm5    \n\t"

        /* Stitch the four 6-byte groups into three contiguous quadwords. */
        "movq       %%mm1, %%mm2    \n\t"
        "movq       %%mm4, %%mm3    \n\t"
        "psllq        $48, %%mm2    \n\t"
        "psllq        $32, %%mm3    \n\t"
        "pand          %4, %%mm2    \n\t"
        "pand          %5, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psrlq        $16, %%mm1    \n\t"
        "psrlq        $32, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "pand          %6, %%mm5    \n\t"
        "por        %%mm5, %%mm4    \n\t"

        MOVNTQ"     %%mm0,   %0     \n\t"
        MOVNTQ"     %%mm1,  8%0     \n\t"
        MOVNTQ"     %%mm4, 16%0"
        :"=m"(*dest)
        :"m"(*s),"m"(mask24l),
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail. */
    while (s < end)
    {
#if HAVE_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
        s++;
        dest[2] = *s++;
        dest[1] = *s++;
        dest[0] = *s++;
        dest += 3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
#endif
    }
}
219

    
220
/*
 * original by Strepto/Astral
 * ported to gcc & bugfixed: A'rpi
 * MMX2, 3DNOW optimization by Nick Kurshev
 * 32-bit C version and the and&add trick by Michael Niedermayer
 */
226
/*
 * Convert RGB555 to RGB565 (both 2 bytes per pixel, in place format-wise).
 * Blue (bits 0-4) stays put; the green+red fields (bits 5-14) move up one
 * bit via the and&add trick: x + (x & 0x7FE0) doubles exactly those fields.
 * The new green LSB is 0 (5-bit green widened to 6 bits).
 *
 * MMX path: 8 pixels (16 bytes) per iteration with mask15s in mm4, then a
 * 32-bit C loop handling 2 pixels at a time, then at most one tail pixel.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end)
    {
        /* mm0/mm2 = x & mask15s; paddw adds that back: field shift by 1. */
        __asm__ volatile(
        PREFETCH"  32%1         \n\t"
        "movq        %1, %%mm0  \n\t"
        "movq       8%1, %%mm2  \n\t"
        "movq     %%mm0, %%mm1  \n\t"
        "movq     %%mm2, %%mm3  \n\t"
        "pand     %%mm4, %%mm0  \n\t"
        "pand     %%mm4, %%mm2  \n\t"
        "paddw    %%mm1, %%mm0  \n\t"
        "paddw    %%mm3, %%mm2  \n\t"
        MOVNTQ"   %%mm0,  %0    \n\t"
        MOVNTQ"   %%mm2, 8%0"
        :"=m"(*d)
        :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Two pixels at a time with the same and&add trick on 32 bits. */
    mm_end = end - 3;
    while (s < mm_end)
    {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* At most one odd 16-bit pixel left. */
    if (s < end)
    {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
274

    
275
/*
 * Convert RGB565 to RGB555: blue (bits 0-4) is kept, the green+red fields
 * are shifted down one bit — ((x>>1) & 0x7FE0) | (x & 0x001F) — dropping
 * the green LSB.
 *
 * MMX path: 8 pixels (16 bytes) per iteration using mask15rg (mm7) on the
 * shifted value and mask15b (mm6) on the original; then a 32-bit C loop
 * (2 pixels), then at most one tail pixel.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end)
    {
        __asm__ volatile(
        PREFETCH"  32%1         \n\t"
        "movq        %1, %%mm0  \n\t"
        "movq       8%1, %%mm2  \n\t"
        "movq     %%mm0, %%mm1  \n\t"
        "movq     %%mm2, %%mm3  \n\t"
        "psrlq       $1, %%mm0  \n\t"
        "psrlq       $1, %%mm2  \n\t"
        "pand     %%mm7, %%mm0  \n\t"
        "pand     %%mm7, %%mm2  \n\t"
        "pand     %%mm6, %%mm1  \n\t"
        "pand     %%mm6, %%mm3  \n\t"
        "por      %%mm1, %%mm0  \n\t"
        "por      %%mm3, %%mm2  \n\t"
        MOVNTQ"   %%mm0,  %0    \n\t"
        MOVNTQ"   %%mm2, 8%0"
        :"=m"(*d)
        :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Two pixels at a time in plain C. */
    mm_end = end - 3;
    while (s < mm_end)
    {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* At most one odd 16-bit pixel left. */
    if (s < end)
    {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}
328

    
329
/*
 * Pack 32bpp pixels into 16bpp 5-6-5 words.  Per the scalar loop:
 * low byte's top 5 bits -> bits 0-4, second byte's top 6 bits -> bits
 * 5-10, third byte's top 5 bits -> bits 11-15; the high byte is dropped.
 *
 * MMX path, 4 pixels (16 src bytes -> 8 dst bytes) per iteration.  The
 * enabled "#if 1" variant masks with mask3216br/mask3216g and uses
 * pmaddwd with mul3216 to merge two fields per multiply (kept because it
 * was measured slightly faster on Athlon, per the original FIXME); the
 * disabled variant is a straightforward shift/mask implementation using
 * red_16mask/green_16mask/blue_16mask.
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
    "movq           %3, %%mm5   \n\t"
    "movq           %4, %%mm6   \n\t"
    "movq           %5, %%mm7   \n\t"
    "jmp 2f                     \n\t"
    ASMALIGN(4)
    "1:                         \n\t"
    PREFETCH"   32(%1)          \n\t"
    "movd         (%1), %%mm0   \n\t"
    "movd        4(%1), %%mm3   \n\t"
    "punpckldq   8(%1), %%mm0   \n\t"
    "punpckldq  12(%1), %%mm3   \n\t"
    "movq        %%mm0, %%mm1   \n\t"
    "movq        %%mm3, %%mm4   \n\t"
    "pand        %%mm6, %%mm0   \n\t"
    "pand        %%mm6, %%mm3   \n\t"
    "pmaddwd     %%mm7, %%mm0   \n\t"
    "pmaddwd     %%mm7, %%mm3   \n\t"
    "pand        %%mm5, %%mm1   \n\t"
    "pand        %%mm5, %%mm4   \n\t"
    "por         %%mm1, %%mm0   \n\t"
    "por         %%mm4, %%mm3   \n\t"
    "psrld          $5, %%mm0   \n\t"
    "pslld         $11, %%mm3   \n\t"
    "por         %%mm3, %%mm0   \n\t"
    MOVNTQ"      %%mm0, (%0)    \n\t"
    "add           $16,  %1     \n\t"
    "add            $8,  %0     \n\t"
    "2:                         \n\t"
    "cmp            %2,  %1     \n\t"
    " jb            1b          \n\t"
    : "+r" (d), "+r"(s)
    : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq    %0, %%mm7    \n\t"
        "movq    %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movd          %1, %%mm0    \n\t"
        "movd         4%1, %%mm3    \n\t"
        "punpckldq    8%1, %%mm0    \n\t"
        "punpckldq   12%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "psrlq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm3    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm3    \n\t"
        "psrlq         $5, %%mm1    \n\t"
        "psrlq         $5, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        "psrlq         $8, %%mm5    \n\t"
        "pand       %%mm7, %%mm2    \n\t"
        "pand       %%mm7, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: pack one dword per 16-bit output word. */
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
426

    
427
/*
 * Pack 32bpp pixels into 16bpp 5-6-5 words with the outer channels
 * swapped relative to rgb32to16: per the scalar loop, the low byte's top
 * 5 bits go to bits 11-15, the second byte's top 6 bits to bits 5-10,
 * and the third byte's top 5 bits to bits 0-4.
 *
 * MMX path: 4 pixels (16 src bytes -> 8 dst bytes) per iteration using
 * shift/mask with red_16mask (mm7), green_16mask (mm6) and blue_16mask,
 * merging two 16-bit results per quadword before the MOVNTQ store.
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movd          %1, %%mm0    \n\t"
        "movd         4%1, %%mm3    \n\t"
        "punpckldq    8%1, %%mm0    \n\t"
        "punpckldq   12%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "psllq         $8, %%mm0    \n\t"
        "psllq         $8, %%mm3    \n\t"
        "pand       %%mm7, %%mm0    \n\t"
        "pand       %%mm7, %%mm3    \n\t"
        "psrlq         $5, %%mm1    \n\t"
        "psrlq         $5, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq        $19, %%mm2    \n\t"
        "psrlq        $19, %%mm5    \n\t"
        "pand          %2, %%mm2    \n\t"
        "pand          %2, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail. */
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}
487

    
488
/*
 * Pack 32bpp pixels into 15bpp 5-5-5 words.  Per the scalar loop: low
 * byte's top 5 bits -> bits 0-4, second byte's top 5 bits -> bits 5-9,
 * third byte's top 5 bits -> bits 10-14; the high byte is dropped.
 *
 * Structure mirrors rgb32to16: the enabled "#if 1" variant uses pmaddwd
 * with mul3215/mask3215g/mask3216br (shifts $6/$10 instead of $5/$11);
 * the disabled variant is the shift/mask version with the *_15mask
 * constants.  4 pixels (16 src bytes -> 8 dst bytes) per MMX iteration.
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
    "movq           %3, %%mm5   \n\t"
    "movq           %4, %%mm6   \n\t"
    "movq           %5, %%mm7   \n\t"
    "jmp            2f          \n\t"
    ASMALIGN(4)
    "1:                         \n\t"
    PREFETCH"   32(%1)          \n\t"
    "movd         (%1), %%mm0   \n\t"
    "movd        4(%1), %%mm3   \n\t"
    "punpckldq   8(%1), %%mm0   \n\t"
    "punpckldq  12(%1), %%mm3   \n\t"
    "movq        %%mm0, %%mm1   \n\t"
    "movq        %%mm3, %%mm4   \n\t"
    "pand        %%mm6, %%mm0   \n\t"
    "pand        %%mm6, %%mm3   \n\t"
    "pmaddwd     %%mm7, %%mm0   \n\t"
    "pmaddwd     %%mm7, %%mm3   \n\t"
    "pand        %%mm5, %%mm1   \n\t"
    "pand        %%mm5, %%mm4   \n\t"
    "por         %%mm1, %%mm0   \n\t"
    "por         %%mm4, %%mm3   \n\t"
    "psrld          $6, %%mm0   \n\t"
    "pslld         $10, %%mm3   \n\t"
    "por         %%mm3, %%mm0   \n\t"
    MOVNTQ"      %%mm0, (%0)    \n\t"
    "add           $16,  %1     \n\t"
    "add            $8,  %0     \n\t"
    "2:                         \n\t"
    "cmp            %2,  %1     \n\t"
    " jb            1b          \n\t"
    : "+r" (d), "+r"(s)
    : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movd          %1, %%mm0    \n\t"
        "movd         4%1, %%mm3    \n\t"
        "punpckldq    8%1, %%mm0    \n\t"
        "punpckldq   12%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "psrlq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm3    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm3    \n\t"
        "psrlq         $6, %%mm1    \n\t"
        "psrlq         $6, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq         $9, %%mm2    \n\t"
        "psrlq         $9, %%mm5    \n\t"
        "pand       %%mm7, %%mm2    \n\t"
        "pand       %%mm7, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail. */
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}
585

    
586
/*
 * Pack 32bpp pixels into 15bpp 5-5-5 words with the outer channels
 * swapped relative to rgb32to15: per the scalar loop, the low byte's top
 * 5 bits go to bits 10-14, the second byte's top 5 bits to bits 5-9, and
 * the third byte's top 5 bits to bits 0-4.
 *
 * MMX path: 4 pixels (16 src bytes -> 8 dst bytes) per iteration,
 * shift/mask with red_15mask (mm7), green_15mask (mm6) and blue_15mask.
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movd          %1, %%mm0    \n\t"
        "movd         4%1, %%mm3    \n\t"
        "punpckldq    8%1, %%mm0    \n\t"
        "punpckldq   12%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "psllq         $7, %%mm0    \n\t"
        "psllq         $7, %%mm3    \n\t"
        "pand       %%mm7, %%mm0    \n\t"
        "pand       %%mm7, %%mm3    \n\t"
        "psrlq         $6, %%mm1    \n\t"
        "psrlq         $6, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq        $19, %%mm2    \n\t"
        "psrlq        $19, %%mm5    \n\t"
        "pand          %2, %%mm2    \n\t"
        "pand          %2, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail. */
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}
646

    
647
/*
 * Pack 24bpp pixels into 16bpp 5-6-5 words.  Per the scalar loop: first
 * byte's top 5 bits -> bits 0-4, second byte's top 6 bits -> bits 5-10,
 * third byte's top 5 bits -> bits 11-15.
 *
 * MMX path: 4 pixels (12 src bytes -> 8 dst bytes) per iteration; pixels
 * are loaded at 3-byte offsets (movd/punpckldq at 0,3,6,9) so each dword
 * holds one pixel plus a stray byte that the masks discard.
 * NOTE(review): mm_end = end - 11 assumes src_size >= 12 on the MMX path;
 * confirm callers never pass tiny buffers.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movd          %1, %%mm0    \n\t"
        "movd         3%1, %%mm3    \n\t"
        "punpckldq    6%1, %%mm0    \n\t"
        "punpckldq    9%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "psrlq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm3    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm3    \n\t"
        "psrlq         $5, %%mm1    \n\t"
        "psrlq         $5, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        "psrlq         $8, %%mm5    \n\t"
        "pand       %%mm7, %%mm2    \n\t"
        "pand       %%mm7, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: consume 3 bytes per output word. */
    while (s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
709

    
710
/*
 * Pack 24bpp pixels into 16bpp 5-6-5 words with the outer channels
 * swapped relative to rgb24tobgr16: per the scalar loop, the first byte's
 * top 5 bits go to bits 11-15, the second byte's top 6 bits to bits 5-10,
 * and the third byte's top 5 bits to bits 0-4.
 *
 * MMX path: 4 pixels (12 src bytes -> 8 dst bytes) per iteration using
 * shift/mask with red_16mask (mm7), green_16mask (mm6) and blue_16mask.
 * NOTE(review): mm_end = end - 15 here, while the otherwise-parallel
 * rgb24tobgr16 uses end - 11 for the same 12-byte stride — the stricter
 * bound is safe (reads stay in range) but slightly pessimistic.
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movd          %1, %%mm0    \n\t"
        "movd         3%1, %%mm3    \n\t"
        "punpckldq    6%1, %%mm0    \n\t"
        "punpckldq    9%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "psllq         $8, %%mm0    \n\t"
        "psllq         $8, %%mm3    \n\t"
        "pand       %%mm7, %%mm0    \n\t"
        "pand       %%mm7, %%mm3    \n\t"
        "psrlq         $5, %%mm1    \n\t"
        "psrlq         $5, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq        $19, %%mm2    \n\t"
        "psrlq        $19, %%mm5    \n\t"
        "pand          %2, %%mm2    \n\t"
        "pand          %2, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: consume 3 bytes per output word. */
    while (s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
772

    
773
/*
 * Pack 24bpp pixels into 15bpp 5-5-5 words.  Per the scalar loop: first
 * byte's top 5 bits -> bits 0-4, second byte's top 5 bits -> bits 5-9,
 * third byte's top 5 bits -> bits 10-14.
 *
 * MMX path: 4 pixels (12 src bytes -> 8 dst bytes) per iteration; pixels
 * loaded at 3-byte offsets, then shift/mask with red_15mask (mm7),
 * green_15mask (mm6) and blue_15mask.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movd          %1, %%mm0    \n\t"
        "movd         3%1, %%mm3    \n\t"
        "punpckldq    6%1, %%mm0    \n\t"
        "punpckldq    9%1, %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "psrlq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm3    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm3    \n\t"
        "psrlq         $6, %%mm1    \n\t"
        "psrlq         $6, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq         $9, %%mm2    \n\t"
        "psrlq         $9, %%mm5    \n\t"
        "pand       %%mm7, %%mm2    \n\t"
        "pand       %%mm7, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: consume 3 bytes per output word. */
    while (s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
835

    
836
/*
 * Pack 24bpp pixels into 15bpp 5-5-5 words with the outer channels
 * swapped relative to rgb24tobgr15: per the scalar loop, the first byte's
 * top 5 bits go to bits 10-14, the second byte's top 5 bits to bits 5-9,
 * and the third byte's top 5 bits to bits 0-4.
 *
 * MMX path: 4 pixels (12 src bytes -> 8 dst bytes) per iteration using
 * shift/mask with red_15mask (mm7), green_15mask (mm6) and blue_15mask.
 * NOTE(review): mm_end = end - 15 here, while the otherwise-parallel
 * rgb24tobgr15 uses end - 11 for the same 12-byte stride — stricter but
 * safe.
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"   32%1            \n\t"
        "movd         %1, %%mm0     \n\t"
        "movd        3%1, %%mm3     \n\t"
        "punpckldq   6%1, %%mm0     \n\t"
        "punpckldq   9%1, %%mm3     \n\t"
        "movq      %%mm0, %%mm1     \n\t"
        "movq      %%mm0, %%mm2     \n\t"
        "movq      %%mm3, %%mm4     \n\t"
        "movq      %%mm3, %%mm5     \n\t"
        "psllq        $7, %%mm0     \n\t"
        "psllq        $7, %%mm3     \n\t"
        "pand      %%mm7, %%mm0     \n\t"
        "pand      %%mm7, %%mm3     \n\t"
        "psrlq        $6, %%mm1     \n\t"
        "psrlq        $6, %%mm4     \n\t"
        "pand      %%mm6, %%mm1     \n\t"
        "pand      %%mm6, %%mm4     \n\t"
        "psrlq       $19, %%mm2     \n\t"
        "psrlq       $19, %%mm5     \n\t"
        "pand         %2, %%mm2     \n\t"
        "pand         %2, %%mm5     \n\t"
        "por       %%mm1, %%mm0     \n\t"
        "por       %%mm4, %%mm3     \n\t"
        "por       %%mm2, %%mm0     \n\t"
        "por       %%mm5, %%mm3     \n\t"
        "psllq       $16, %%mm3     \n\t"
        "por       %%mm3, %%mm0     \n\t"
        MOVNTQ"    %%mm0, %0        \n\t"
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: consume 3 bytes per output word. */
    while (s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

/*
  I use a less accurate approximation here by simply left-shifting the input
  value and filling the low-order bits with zeroes. This method improves PNG
  compression but cannot reproduce white exactly, since it does not generate
  an all-ones maximum value; the net effect is to darken the image slightly.

  The better method would be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      leftmost bits repeated to fill open bits
       |
   original bits
*/
/*
 * Expand packed 15-bit pixels (5:5:5 in each uint16) to packed 24-bit
 * pixels, 3 bytes each: low 5 bits -> first byte, middle 5 -> second,
 * high 5 -> third, each left-shifted to 8-bit range (low bits zero-filled,
 * see the note above about darkening).  src_size is in bytes.
 */
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* MMX path: 8 pixels (16 source bytes -> 24 output bytes) per iteration;
       stop 7 words early so a full 8-word read never passes 'end'. */
    mm_end = end - 7;
    while (s < mm_end)
    {
        /* Stage 1: split each 5-bit field out of two groups of 4 words,
           widen to one byte per component.  The results are deliberately
           left in mm0/mm3/mm6/mm7 for the second asm statement below —
           the two statements communicate through the MMX register file. */
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq          %1, %%mm1    \n\t"
        "movq          %1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $2, %%mm1    \n\t"
        "psrlq         $7, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd     %5, %%mm0    \n\t"
        "punpcklwd     %5, %%mm1    \n\t"
        "punpcklwd     %5, %%mm2    \n\t"
        "punpckhwd     %5, %%mm3    \n\t"
        "punpckhwd     %5, %%mm4    \n\t"
        "punpckhwd     %5, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"

        "movq       %%mm0, %%mm6    \n\t"
        "movq       %%mm3, %%mm7    \n\t"

        "movq         8%1, %%mm0    \n\t"
        "movq         8%1, %%mm1    \n\t"
        "movq         8%1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $2, %%mm1    \n\t"
        "psrlq         $7, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd     %5, %%mm0    \n\t"
        "punpcklwd     %5, %%mm1    \n\t"
        "punpcklwd     %5, %%mm2    \n\t"
        "punpckhwd     %5, %%mm3    \n\t"
        "punpckhwd     %5, %%mm4    \n\t"
        "punpckhwd     %5, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"

        :"=m"(*d)
        :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
        :"memory");
        /* Stage 2 ("borrowed 32 to 24"): squeeze the four 32-bit groups held
           in mm0/mm3/mm6/mm7 into 24 contiguous output bytes. */
        __asm__ volatile(
        "movq       %%mm0, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "movq       %%mm6, %%mm0    \n\t"
        "movq       %%mm7, %%mm1    \n\t"

        "movq       %%mm4, %%mm6    \n\t"
        "movq       %%mm5, %%mm7    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm1, %%mm3    \n\t"

        "psrlq         $8, %%mm2    \n\t"
        "psrlq         $8, %%mm3    \n\t"
        "psrlq         $8, %%mm6    \n\t"
        "psrlq         $8, %%mm7    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm1    \n\t"
        "pand          %2, %%mm4    \n\t"
        "pand          %2, %%mm5    \n\t"
        "pand          %3, %%mm2    \n\t"
        "pand          %3, %%mm3    \n\t"
        "pand          %3, %%mm6    \n\t"
        "pand          %3, %%mm7    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "por        %%mm6, %%mm4    \n\t"
        "por        %%mm7, %%mm5    \n\t"

        "movq       %%mm1, %%mm2    \n\t"
        "movq       %%mm4, %%mm3    \n\t"
        "psllq        $48, %%mm2    \n\t"
        "psllq        $32, %%mm3    \n\t"
        "pand          %4, %%mm2    \n\t"
        "pand          %5, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psrlq        $16, %%mm1    \n\t"
        "psrlq        $32, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "pand          %6, %%mm5    \n\t"
        "por        %%mm5, %%mm4    \n\t"

        MOVNTQ"     %%mm0,   %0     \n\t"
        MOVNTQ"     %%mm1,  8%0     \n\t"
        MOVNTQ"     %%mm4, 16%0"

        :"=m"(*d)
        :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and full conversion when MMX is unavailable). */
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
/*
 * Expand packed 16-bit pixels (5:6:5 in each uint16) to packed 24-bit
 * pixels, 3 bytes each: low 5 bits -> first byte, middle 6 -> second,
 * high 5 -> third, each left-shifted to 8-bit range.  src_size is in bytes.
 * Same structure as rgb15tobgr24, only the masks/shifts differ for the
 * 6-bit green field.
 */
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* 8 pixels per iteration; stop early enough for a full 8-word read. */
    mm_end = end - 7;
    while (s < mm_end)
    {
        /* Stage 1: isolate the 5/6/5 fields of 8 pixels and widen them; the
           intermediate results stay in mm0/mm3/mm6/mm7 for the second asm
           statement (state is carried in the MMX register file). */
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq          %1, %%mm1    \n\t"
        "movq          %1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm1    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd     %5, %%mm0    \n\t"
        "punpcklwd     %5, %%mm1    \n\t"
        "punpcklwd     %5, %%mm2    \n\t"
        "punpckhwd     %5, %%mm3    \n\t"
        "punpckhwd     %5, %%mm4    \n\t"
        "punpckhwd     %5, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"

        "movq       %%mm0, %%mm6    \n\t"
        "movq       %%mm3, %%mm7    \n\t"

        "movq         8%1, %%mm0    \n\t"
        "movq         8%1, %%mm1    \n\t"
        "movq         8%1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm1    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd     %5, %%mm0    \n\t"
        "punpcklwd     %5, %%mm1    \n\t"
        "punpcklwd     %5, %%mm2    \n\t"
        "punpckhwd     %5, %%mm3    \n\t"
        "punpckhwd     %5, %%mm4    \n\t"
        "punpckhwd     %5, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        :"=m"(*d)
        :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
        :"memory");
        /* Stage 2 ("borrowed 32 to 24"): pack the widened pixels from
           mm0/mm3/mm6/mm7 into 24 contiguous output bytes. */
        __asm__ volatile(
        "movq       %%mm0, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "movq       %%mm6, %%mm0    \n\t"
        "movq       %%mm7, %%mm1    \n\t"

        "movq       %%mm4, %%mm6    \n\t"
        "movq       %%mm5, %%mm7    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm1, %%mm3    \n\t"

        "psrlq         $8, %%mm2    \n\t"
        "psrlq         $8, %%mm3    \n\t"
        "psrlq         $8, %%mm6    \n\t"
        "psrlq         $8, %%mm7    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm1    \n\t"
        "pand          %2, %%mm4    \n\t"
        "pand          %2, %%mm5    \n\t"
        "pand          %3, %%mm2    \n\t"
        "pand          %3, %%mm3    \n\t"
        "pand          %3, %%mm6    \n\t"
        "pand          %3, %%mm7    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "por        %%mm6, %%mm4    \n\t"
        "por        %%mm7, %%mm5    \n\t"

        "movq       %%mm1, %%mm2    \n\t"
        "movq       %%mm4, %%mm3    \n\t"
        "psllq        $48, %%mm2    \n\t"
        "psllq        $32, %%mm3    \n\t"
        "pand          %4, %%mm2    \n\t"
        "pand          %5, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psrlq        $16, %%mm1    \n\t"
        "psrlq        $32, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "pand          %6, %%mm5    \n\t"
        "por        %%mm5, %%mm4    \n\t"

        MOVNTQ"     %%mm0,   %0     \n\t"
        MOVNTQ"     %%mm1,  8%0     \n\t"
        MOVNTQ"     %%mm4, 16%0"

        :"=m"(*d)
        :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and full conversion when MMX is unavailable). */
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
/*
 * Pack four widened pixels into 16 output bytes with an all-ones alpha.
 * Register contract on entry:
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 * Stores 16 bytes at %0 via MOVNTQ; clobbers mm0-mm3.
 */
#define PACK_RGB32 \
    "packuswb   %%mm7, %%mm0    \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb   %%mm7, %%mm1    \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb   %%mm7, %%mm2    \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw  %%mm1, %%mm0    \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw  %%mm6, %%mm2    \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq       %%mm0, %%mm3    \n\t"                               \
    "punpcklwd  %%mm2, %%mm0    \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd  %%mm2, %%mm3    \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ"     %%mm0,  %0      \n\t"                               \
    MOVNTQ"     %%mm3, 8%0      \n\t"                               \

/*
 * Expand packed 15-bit pixels (5:5:5) to packed 32-bit pixels, 4 bytes
 * each, with the fourth byte forced to 255 (opaque alpha).
 * src_size is in bytes.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* PACK_RGB32 contract: mm7 = zero, mm6 = all-ones (alpha). */
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
    /* 4 pixels per iteration; stop 3 words early so the 8-byte read fits. */
    mm_end = end - 3;
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq          %1, %%mm1    \n\t"
        "movq          %1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $2, %%mm1    \n\t"
        "psrlq         $7, %%mm2    \n\t"
        PACK_RGB32
        :"=m"(*d)
        :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
        :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and full conversion when MMX is unavailable). */
    while (s < end)
    {
#if 0 //slightly slower on Athlon
        int bgr= *s++;
        *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
#else
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
#endif

#endif
    }
}
/*
 * Expand packed 16-bit pixels (5:6:5) to packed 32-bit pixels, 4 bytes
 * each, with the fourth byte forced to 255 (opaque alpha).
 * src_size is in bytes.  Same structure as rgb15to32, different masks/shifts.
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* PACK_RGB32 contract: mm7 = zero, mm6 = all-ones (alpha). */
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
    /* 4 pixels per iteration. */
    mm_end = end - 3;
    while (s < mm_end)
    {
        __asm__ volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq          %1, %%mm1    \n\t"
        "movq          %1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm1    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        PACK_RGB32
        :"=m"(*d)
        :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
        :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and full conversion when MMX is unavailable). */
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
#endif
    }
}
/*
 * Swap the first and third byte of every 32-bit pixel (byte 1 and the
 * fourth byte are preserved) — i.e. exchange the R and B channels of
 * RGB32/BGR32 data.  src_size is in bytes.
 *
 * The loop counter 'idx' starts negative (15 - src_size) and runs up to
 * the remainder; the asm handles 16 bytes per iteration while idx < 0,
 * the scalar loop afterwards finishes the last <16 bytes.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
#if HAVE_MMX
    __asm__ volatile(
    "test          %0, %0           \n\t"
    "jns           2f               \n\t"
    PREFETCH"       (%1, %0)        \n\t"
    "movq          %3, %%mm7        \n\t"
    "pxor          %4, %%mm7        \n\t"
    "movq       %%mm7, %%mm6        \n\t"
    "pxor          %5, %%mm7        \n\t"
    ASMALIGN(4)
    "1:                             \n\t"
    PREFETCH"     32(%1, %0)        \n\t"
    "movq           (%1, %0), %%mm0 \n\t"
    "movq          8(%1, %0), %%mm1 \n\t"
# if HAVE_MMX2
    /* pshufw $177 swaps the 16-bit halves of each dword; combined with the
       masks this exchanges bytes 0 and 2 of every pixel. */
    "pshufw      $177, %%mm0, %%mm3 \n\t"
    "pshufw      $177, %%mm1, %%mm5 \n\t"
    "pand       %%mm7, %%mm0        \n\t"
    "pand       %%mm6, %%mm3        \n\t"
    "pand       %%mm7, %%mm1        \n\t"
    "pand       %%mm6, %%mm5        \n\t"
    "por        %%mm3, %%mm0        \n\t"
    "por        %%mm5, %%mm1        \n\t"
# else
    /* Plain-MMX fallback: isolate the two channels and shift them past
       each other with pslld/psrld. */
    "movq       %%mm0, %%mm2        \n\t"
    "movq       %%mm1, %%mm4        \n\t"
    "pand       %%mm7, %%mm0        \n\t"
    "pand       %%mm6, %%mm2        \n\t"
    "pand       %%mm7, %%mm1        \n\t"
    "pand       %%mm6, %%mm4        \n\t"
    "movq       %%mm2, %%mm3        \n\t"
    "movq       %%mm4, %%mm5        \n\t"
    "pslld        $16, %%mm2        \n\t"
    "psrld        $16, %%mm3        \n\t"
    "pslld        $16, %%mm4        \n\t"
    "psrld        $16, %%mm5        \n\t"
    "por        %%mm2, %%mm0        \n\t"
    "por        %%mm4, %%mm1        \n\t"
    "por        %%mm3, %%mm0        \n\t"
    "por        %%mm5, %%mm1        \n\t"
# endif
    MOVNTQ"     %%mm0,  (%2, %0)    \n\t"
    MOVNTQ"     %%mm1, 8(%2, %0)    \n\t"
    "add          $16, %0           \n\t"
    "js            1b               \n\t"
    SFENCE"                         \n\t"
    EMMS"                           \n\t"
    "2:                             \n\t"
    : "+&r"(idx)
    : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
    : "memory");
#endif
    /* Scalar tail: swap the low and high byte of the masked 0x00ff00ff
       pair in each dword; the 0xff00ff00 bytes pass through untouched. */
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}
/*
 * Swap the first and third byte of every 24-bit pixel (R<->B exchange for
 * RGB24/BGR24 data).  src_size is in bytes.
 *
 * mmx_size starts negative (23 - src_size); the asm converts 24 bytes
 * (8 pixels) per iteration while it is negative, then the pointers are
 * rewound so the scalar loop handles the remaining <24 bytes.
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    unsigned i;
#if HAVE_MMX
    x86_reg mmx_size= 23 - src_size;
    __asm__ volatile (
    "test             %%"REG_a", %%"REG_a"          \n\t"
    "jns                     2f                     \n\t"
    "movq     "MANGLE(mask24r)", %%mm5              \n\t"
    "movq     "MANGLE(mask24g)", %%mm6              \n\t"
    "movq     "MANGLE(mask24b)", %%mm7              \n\t"
    ASMALIGN(4)
    "1:                                             \n\t"
    PREFETCH" 32(%1, %%"REG_a")                     \n\t"
    "movq       (%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
    "movq       (%1, %%"REG_a"), %%mm1              \n\t" // BGR BGR BG
    "movq      2(%1, %%"REG_a"), %%mm2              \n\t" // R BGR BGR B
    "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
    "pand                 %%mm5, %%mm0              \n\t"
    "pand                 %%mm6, %%mm1              \n\t"
    "pand                 %%mm7, %%mm2              \n\t"
    "por                  %%mm0, %%mm1              \n\t"
    "por                  %%mm2, %%mm1              \n\t"
    "movq      6(%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
    MOVNTQ"               %%mm1,   (%2, %%"REG_a")  \n\t" // RGB RGB RG
    "movq      8(%1, %%"REG_a"), %%mm1              \n\t" // R BGR BGR B
    "movq     10(%1, %%"REG_a"), %%mm2              \n\t" // GR BGR BGR
    "pand                 %%mm7, %%mm0              \n\t"
    "pand                 %%mm5, %%mm1              \n\t"
    "pand                 %%mm6, %%mm2              \n\t"
    "por                  %%mm0, %%mm1              \n\t"
    "por                  %%mm2, %%mm1              \n\t"
    "movq     14(%1, %%"REG_a"), %%mm0              \n\t" // R BGR BGR B
    MOVNTQ"               %%mm1,  8(%2, %%"REG_a")  \n\t" // B RGB RGB R
    "movq     16(%1, %%"REG_a"), %%mm1              \n\t" // GR BGR BGR
    "movq     18(%1, %%"REG_a"), %%mm2              \n\t" // BGR BGR BG
    "pand                 %%mm6, %%mm0              \n\t"
    "pand                 %%mm7, %%mm1              \n\t"
    "pand                 %%mm5, %%mm2              \n\t"
    "por                  %%mm0, %%mm1              \n\t"
    "por                  %%mm2, %%mm1              \n\t"
    MOVNTQ"               %%mm1, 16(%2, %%"REG_a")  \n\t"
    "add                    $24, %%"REG_a"          \n\t"
    " js                     1b                     \n\t"
    "2:                                             \n\t"
    : "+a" (mmx_size)
    : "r" (src-mmx_size), "r"(dst-mmx_size)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; //finished, was multiple of 8

    /* Rewind src/dst and shrink src_size to the leftover byte count. */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    /* Scalar pixel-wise swap of bytes 0 and 2. */
    for (i=0; i<src_size; i+=3)
    {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1473
                                           long width, long height,
1474
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1475
{
1476
    long y;
1477
    const x86_reg chromWidth= width>>1;
1478
    for (y=0; y<height; y++)
1479
    {
1480
#if HAVE_MMX
1481
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1482
        __asm__ volatile(
1483
        "xor                 %%"REG_a", %%"REG_a"   \n\t"
1484
        ASMALIGN(4)
1485
        "1:                                         \n\t"
1486
        PREFETCH"    32(%1, %%"REG_a", 2)           \n\t"
1487
        PREFETCH"    32(%2, %%"REG_a")              \n\t"
1488
        PREFETCH"    32(%3, %%"REG_a")              \n\t"
1489
        "movq          (%2, %%"REG_a"), %%mm0       \n\t" // U(0)
1490
        "movq                    %%mm0, %%mm2       \n\t" // U(0)
1491
        "movq          (%3, %%"REG_a"), %%mm1       \n\t" // V(0)
1492
        "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
1493
        "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)
1494

    
1495
        "movq        (%1, %%"REG_a",2), %%mm3       \n\t" // Y(0)
1496
        "movq       8(%1, %%"REG_a",2), %%mm5       \n\t" // Y(8)
1497
        "movq                    %%mm3, %%mm4       \n\t" // Y(0)
1498
        "movq                    %%mm5, %%mm6       \n\t" // Y(8)
1499
        "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
1500
        "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
1501
        "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
1502
        "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)
1503

    
1504
        MOVNTQ"                  %%mm3,   (%0, %%"REG_a", 4)    \n\t"
1505
        MOVNTQ"                  %%mm4,  8(%0, %%"REG_a", 4)    \n\t"
1506
        MOVNTQ"                  %%mm5, 16(%0, %%"REG_a", 4)    \n\t"
1507
        MOVNTQ"                  %%mm6, 24(%0, %%"REG_a", 4)    \n\t"
1508

    
1509
        "add                        $8, %%"REG_a"   \n\t"
1510
        "cmp                        %4, %%"REG_a"   \n\t"
1511
        " jb                        1b              \n\t"
1512
        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1513
        : "%"REG_a
1514
        );
1515
#else
1516

    
1517
#if ARCH_ALPHA && HAVE_MVI
1518
#define pl2yuy2(n)                  \
1519
    y1 = yc[n];                     \
1520
    y2 = yc2[n];                    \
1521
    u = uc[n];                      \
1522
    v = vc[n];                      \
1523
    __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1));  \
1524
    __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2));  \
1525
    __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u));    \
1526
    __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v));    \
1527
    yuv1 = (u << 8) + (v << 24);                \
1528
    yuv2 = yuv1 + y2;               \
1529
    yuv1 += y1;                     \
1530
    qdst[n]  = yuv1;                \
1531
    qdst2[n] = yuv2;
1532

    
1533
        int i;
1534
        uint64_t *qdst = (uint64_t *) dst;
1535
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1536
        const uint32_t *yc = (uint32_t *) ysrc;
1537
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1538
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1539
        for (i = 0; i < chromWidth; i += 8){
1540
            uint64_t y1, y2, yuv1, yuv2;
1541
            uint64_t u, v;
1542
            /* Prefetch */
1543
            __asm__("ldq $31,64(%0)" :: "r"(yc));
1544
            __asm__("ldq $31,64(%0)" :: "r"(yc2));
1545
            __asm__("ldq $31,64(%0)" :: "r"(uc));
1546
            __asm__("ldq $31,64(%0)" :: "r"(vc));
1547

    
1548
            pl2yuy2(0);
1549
            pl2yuy2(1);
1550
            pl2yuy2(2);
1551
            pl2yuy2(3);
1552

    
1553
            yc    += 4;
1554
            yc2   += 4;
1555
            uc    += 4;
1556
            vc    += 4;
1557
            qdst  += 4;
1558
            qdst2 += 4;
1559
        }
1560
        y++;
1561
        ysrc += lumStride;
1562
        dst += dstStride;
1563

    
1564
#elif HAVE_FAST_64BIT
1565
        int i;
1566
        uint64_t *ldst = (uint64_t *) dst;
1567
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1568
        for (i = 0; i < chromWidth; i += 2){
1569
            uint64_t k, l;
1570
            k = yc[0] + (uc[0] << 8) +
1571
                (yc[1] << 16) + (vc[0] << 24);
1572
            l = yc[2] + (uc[1] << 8) +
1573
                (yc[3] << 16) + (vc[1] << 24);
1574
            *ldst++ = k + (l << 32);
1575
            yc += 4;
1576
            uc += 2;
1577
            vc += 2;
1578
        }
1579

    
1580
#else
1581
        int i, *idst = (int32_t *) dst;
1582
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1583
        for (i = 0; i < chromWidth; i++){
1584
#if HAVE_BIGENDIAN
1585
            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1586
                (yc[1] << 8) + (vc[0] << 0);
1587
#else
1588
            *idst++ = yc[0] + (uc[0] << 8) +
1589
                (yc[1] << 16) + (vc[0] << 24);
1590
#endif
1591
            yc += 2;
1592
            uc++;
1593
            vc++;
1594
        }
1595
#endif
1596
#endif
1597
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
1598
        {
1599
            usrc += chromStride;
1600
            vsrc += chromStride;
1601
        }
1602
        ysrc += lumStride;
1603
        dst  += dstStride;
1604
    }
1605
#if HAVE_MMX
1606
__asm__(    EMMS"       \n\t"
1607
        SFENCE"     \n\t"
1608
        :::"memory");
1609
#endif
1610
}
1611

    
1612
/**
1613
 * Height should be a multiple of 2 and width should be a multiple of 16.
1614
 * (If this is a problem for anyone then tell me, and I will fix it.)
1615
 */
1616
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1617
                                      long width, long height,
1618
                                      long lumStride, long chromStride, long dstStride)
1619
{
1620
    //FIXME interpolate chroma
1621
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1622
}
1623

    
1624
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1625
                                           long width, long height,
1626
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1627
{
1628
    long y;
1629
    const x86_reg chromWidth= width>>1;
1630
    for (y=0; y<height; y++)
1631
    {
1632
#if HAVE_MMX
1633
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1634
        __asm__ volatile(
1635
        "xor                %%"REG_a", %%"REG_a"    \n\t"
1636
        ASMALIGN(4)
1637
        "1:                                         \n\t"
1638
        PREFETCH"   32(%1, %%"REG_a", 2)            \n\t"
1639
        PREFETCH"   32(%2, %%"REG_a")               \n\t"
1640
        PREFETCH"   32(%3, %%"REG_a")               \n\t"
1641
        "movq         (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
1642
        "movq                   %%mm0, %%mm2        \n\t" // U(0)
1643
        "movq         (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
1644
        "punpcklbw              %%mm1, %%mm0        \n\t" // UVUV UVUV(0)
1645
        "punpckhbw              %%mm1, %%mm2        \n\t" // UVUV UVUV(8)
1646

    
1647
        "movq       (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
1648
        "movq      8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
1649
        "movq                   %%mm0, %%mm4        \n\t" // Y(0)
1650
        "movq                   %%mm2, %%mm6        \n\t" // Y(8)
1651
        "punpcklbw              %%mm3, %%mm0        \n\t" // YUYV YUYV(0)
1652
        "punpckhbw              %%mm3, %%mm4        \n\t" // YUYV YUYV(4)
1653
        "punpcklbw              %%mm5, %%mm2        \n\t" // YUYV YUYV(8)
1654
        "punpckhbw              %%mm5, %%mm6        \n\t" // YUYV YUYV(12)
1655

    
1656
        MOVNTQ"                 %%mm0,   (%0, %%"REG_a", 4)     \n\t"
1657
        MOVNTQ"                 %%mm4,  8(%0, %%"REG_a", 4)     \n\t"
1658
        MOVNTQ"                 %%mm2, 16(%0, %%"REG_a", 4)     \n\t"
1659
        MOVNTQ"                 %%mm6, 24(%0, %%"REG_a", 4)     \n\t"
1660

    
1661
        "add                       $8, %%"REG_a"    \n\t"
1662
        "cmp                       %4, %%"REG_a"    \n\t"
1663
        " jb                       1b               \n\t"
1664
        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1665
        : "%"REG_a
1666
        );
1667
#else
1668
//FIXME adapt the Alpha ASM code from yv12->yuy2
1669

    
1670
#if HAVE_FAST_64BIT
1671
        int i;
1672
        uint64_t *ldst = (uint64_t *) dst;
1673
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1674
        for (i = 0; i < chromWidth; i += 2){
1675
            uint64_t k, l;
1676
            k = uc[0] + (yc[0] << 8) +
1677
                (vc[0] << 16) + (yc[1] << 24);
1678
            l = uc[1] + (yc[2] << 8) +
1679
                (vc[1] << 16) + (yc[3] << 24);
1680
            *ldst++ = k + (l << 32);
1681
            yc += 4;
1682
            uc += 2;
1683
            vc += 2;
1684
        }
1685

    
1686
#else
1687
        int i, *idst = (int32_t *) dst;
1688
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1689
        for (i = 0; i < chromWidth; i++){
1690
#if HAVE_BIGENDIAN
1691
            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1692
                (vc[0] << 8) + (yc[1] << 0);
1693
#else
1694
            *idst++ = uc[0] + (yc[0] << 8) +
1695
               (vc[0] << 16) + (yc[1] << 24);
1696
#endif
1697
            yc += 2;
1698
            uc++;
1699
            vc++;
1700
        }
1701
#endif
1702
#endif
1703
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
1704
        {
1705
            usrc += chromStride;
1706
            vsrc += chromStride;
1707
        }
1708
        ysrc += lumStride;
1709
        dst += dstStride;
1710
    }
1711
#if HAVE_MMX
1712
__asm__(    EMMS"       \n\t"
1713
        SFENCE"     \n\t"
1714
        :::"memory");
1715
#endif
1716
}
1717

    
1718
/**
 * Convert planar YV12 (4:2:0) to packed UYVY.
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (If this is a problem for anyone then tell me, and I will fix it.)
 *
 * The final argument (2) is vertLumPerChroma: each chroma line is reused
 * for two luma lines (4:2:0 vertical subsampling), with no interpolation.
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1729

    
1730
/**
 * Convert planar 4:2:2 YUV to packed UYVY.
 * Width should be a multiple of 16.
 *
 * vertLumPerChroma is 1: every luma line has its own chroma line
 * (4:2:2 has no vertical chroma subsampling).
 */
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
1739

    
1740
/**
 * Convert planar 4:2:2 YUV to packed YUY2 (YUYV).
 * Width should be a multiple of 16.
 *
 * vertLumPerChroma is 1: every luma line has its own chroma line
 * (4:2:2 has no vertical chroma subsampling).
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
1749

    
1750
/**
 * Convert packed YUY2 (Y0 U0 Y1 V0 ...) to planar YV12.
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 *
 * Lines are processed in pairs: the even line supplies Y, U and V;
 * the odd line supplies only Y (chroma of odd lines is discarded,
 * giving the 4:2:0 vertical subsampling).
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;    // number of U (and V) samples per line
    for (y=0; y<height; y+=2)
    {
#if HAVE_MMX
        /* Even line: split 16 YUYV bytes at a time into 8 Y, 4 U, 4 V. */
        __asm__ volatile(
        "xor                 %%"REG_a", %%"REG_a"   \n\t"
        "pcmpeqw                 %%mm7, %%mm7       \n\t"
        "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00...
        ASMALIGN(4)
        "1:                \n\t"
        PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
        "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
        "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
        "movq                    %%mm0, %%mm2       \n\t" // YUYV YUYV(0)
        "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(4)
        "psrlw                      $8, %%mm0       \n\t" // U0V0 U0V0(0)
        "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(4)
        "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
        "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
        "packuswb                %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
        "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(0)

        MOVNTQ"                  %%mm2, (%1, %%"REG_a", 2)  \n\t"

        "movq     16(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
        "movq     24(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
        "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(8)
        "movq                    %%mm2, %%mm4       \n\t" // YUYV YUYV(12)
        "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(8)
        "psrlw                      $8, %%mm2       \n\t" // U0V0 U0V0(12)
        "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(8)
        "pand                    %%mm7, %%mm4       \n\t" // Y0Y0 Y0Y0(12)
        "packuswb                %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
        "packuswb                %%mm4, %%mm3       \n\t" // YYYY YYYY(8)

        MOVNTQ"                  %%mm3, 8(%1, %%"REG_a", 2) \n\t"

        /* Separate the interleaved UVUV words into U and V planes. */
        "movq                    %%mm0, %%mm2       \n\t" // UVUV UVUV(0)
        "movq                    %%mm1, %%mm3       \n\t" // UVUV UVUV(8)
        "psrlw                      $8, %%mm0       \n\t" // V0V0 V0V0(0)
        "psrlw                      $8, %%mm1       \n\t" // V0V0 V0V0(8)
        "pand                    %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
        "pand                    %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
        "packuswb                %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
        "packuswb                %%mm3, %%mm2       \n\t" // UUUU UUUU(0)

        MOVNTQ"                  %%mm0, (%3, %%"REG_a")     \n\t"
        MOVNTQ"                  %%mm2, (%2, %%"REG_a")     \n\t"

        "add                        $8, %%"REG_a"   \n\t"
        "cmp                        %4, %%"REG_a"   \n\t"
        " jb                        1b              \n\t"
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
        : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        /* Odd line: extract luma only.
         * NOTE(review): this asm statement uses %%mm7 as the 0x00FF byte
         * mask but never loads it; it relies on %%mm7 surviving unchanged
         * from the previous asm statement. Fragile if the compiler ever
         * schedules MMX-clobbering code in between — confirm this is
         * guaranteed by the surrounding build setup. */
        __asm__ volatile(
        "xor                 %%"REG_a", %%"REG_a"   \n\t"
        ASMALIGN(4)
        "1:                                         \n\t"
        PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
        "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
        "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
        "movq     16(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
        "movq     24(%0, %%"REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
        "pand                    %%mm7, %%mm0       \n\t" // Y0Y0 Y0Y0(0)
        "pand                    %%mm7, %%mm1       \n\t" // Y0Y0 Y0Y0(4)
        "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(8)
        "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(12)
        "packuswb                %%mm1, %%mm0       \n\t" // YYYY YYYY(0)
        "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(8)

        MOVNTQ"                  %%mm0,  (%1, %%"REG_a", 2) \n\t"
        MOVNTQ"                  %%mm2, 8(%1, %%"REG_a", 2) \n\t"

        "add                        $8, %%"REG_a"   \n\t"
        "cmp                        %4, %%"REG_a"   \n\t"
        " jb                        1b              \n\t"

        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
        : "memory", "%"REG_a
        );
#else
        /* C fallback: even line gives Y+U+V, odd line gives Y only. */
        long i;
        for (i=0; i<chromWidth; i++)
        {
            ydst[2*i+0]     = src[4*i+0];
            udst[i]     = src[4*i+1];
            ydst[2*i+1]     = src[4*i+2];
            vdst[i]     = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        for (i=0; i<chromWidth; i++)
        {
            ydst[2*i+0]     = src[4*i+0];
            ydst[2*i+1]     = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#if HAVE_MMX
/* Leave MMX state and flush the non-temporal write buffers. */
__asm__ volatile(   EMMS"       \n\t"
                SFENCE"     \n\t"
                :::"memory");
#endif
}
1873

    
1874
/**
 * Convert YVU9 to YV12 — INCOMPLETE.
 *
 * NOTE(review): only the Y plane is copied; udst and vdst are never
 * written, so the destination chroma planes are left with whatever was
 * in that memory before (the XXX below admits this). usrc, vsrc and
 * chromStride are unused. Also, the memcpy assumes the Y planes are
 * contiguous, i.e. lumStride == width for both src and dst — TODO
 * confirm callers guarantee this, or implement the 2x chroma upscale.
 */
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
                                      uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height, long lumStride, long chromStride)
{
    /* Y Plane */
    memcpy(ydst, ysrc, width*height);

    /* XXX: implement upscaling for U,V */
}
1883

    
1884
/**
 * Upscale a single plane by 2x in both directions with a 3:1/1:3
 * weighted filter: each output pixel is (3*near + far)>>2 of the two
 * nearest source pixels. First and last rows/columns are handled
 * specially (edge replication for the corners).
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
    long x,y;

    dst[0]= src[0];

    // first line
    for (x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

        dst+= dstStride;

    for (y=1; y<srcHeight; y++){
#if HAVE_MMX2 || HAVE_AMD3DNOW
        /* Process srcWidth&~15 pixels with MMX; the C loop below mops up
         * the remainder starting at mmxSize-1 (one pixel of overlap to
         * redo the boundary between the two paths). */
        const x86_reg mmxSize= srcWidth&~15;
        __asm__ volatile(
        "mov           %4, %%"REG_a"            \n\t"
        "1:                                     \n\t"
        "movq         (%0, %%"REG_a"), %%mm0    \n\t"
        "movq         (%1, %%"REG_a"), %%mm1    \n\t"
        "movq        1(%0, %%"REG_a"), %%mm2    \n\t"
        "movq        1(%1, %%"REG_a"), %%mm3    \n\t"
        "movq       -1(%0, %%"REG_a"), %%mm4    \n\t"
        "movq       -1(%1, %%"REG_a"), %%mm5    \n\t"
        /* PAVGB applied twice against the same register approximates the
         * (3*a + b)>>2 weighting: avg(avg(b,a),a) ~= (3a+b)/4 (with
         * pavgb's round-to-nearest bias, so not bit-exact vs the C path). */
        PAVGB"                  %%mm0, %%mm5    \n\t"
        PAVGB"                  %%mm0, %%mm3    \n\t"
        PAVGB"                  %%mm0, %%mm5    \n\t"
        PAVGB"                  %%mm0, %%mm3    \n\t"
        PAVGB"                  %%mm1, %%mm4    \n\t"
        PAVGB"                  %%mm1, %%mm2    \n\t"
        PAVGB"                  %%mm1, %%mm4    \n\t"
        PAVGB"                  %%mm1, %%mm2    \n\t"
        "movq                   %%mm5, %%mm7    \n\t"
        "movq                   %%mm4, %%mm6    \n\t"
        "punpcklbw              %%mm3, %%mm5    \n\t"
        "punpckhbw              %%mm3, %%mm7    \n\t"
        "punpcklbw              %%mm2, %%mm4    \n\t"
        "punpckhbw              %%mm2, %%mm6    \n\t"
#if 1
        MOVNTQ"                 %%mm5,  (%2, %%"REG_a", 2)  \n\t"
        MOVNTQ"                 %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
        MOVNTQ"                 %%mm4,  (%3, %%"REG_a", 2)  \n\t"
        MOVNTQ"                 %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
#else
        "movq                   %%mm5,  (%2, %%"REG_a", 2)  \n\t"
        "movq                   %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
        "movq                   %%mm4,  (%3, %%"REG_a", 2)  \n\t"
        "movq                   %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
#endif
        "add                       $8, %%"REG_a"            \n\t"
        " js                       1b                       \n\t"
        :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
           "g" (-mmxSize)
        : "%"REG_a

        );
#else
        const x86_reg mmxSize=1;    // no MMX: C loop below does everything from x=0
#endif
        /* Left edge of this output line pair: vertical-only filter. */
        dst[0        ]= (3*src[0] +   src[srcStride])>>2;
        dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;

        for (x=mmxSize-1; x<srcWidth-1; x++){
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        /* Right edge: vertical-only filter. */
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
#if 1
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
#else
    for (x=0; x<srcWidth; x++){
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#endif

#if HAVE_MMX
/* Leave MMX state and flush the non-temporal write buffers. */
__asm__ volatile(   EMMS"       \n\t"
                SFENCE"     \n\t"
                :::"memory");
#endif
}
1985

    
1986
/**
 * Convert packed UYVY (U0 Y0 V0 Y1 ...) to planar YV12.
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;    // number of U (and V) samples per line
    for (y=0; y<height; y+=2)
    {
#if HAVE_MMX
        /* Even line: split 16 UYVY bytes at a time into 8 Y, 4 U, 4 V.
         * Compared with yuy2toyv12, pand/psrlw swap roles because the
         * byte order is reversed (chroma in low bytes, luma in high). */
        __asm__ volatile(
        "xor                 %%"REG_a", %%"REG_a"   \n\t"
        "pcmpeqw             %%mm7, %%mm7   \n\t"
        "psrlw                  $8, %%mm7   \n\t" // FF,00,FF,00...
        ASMALIGN(4)
        "1:                                 \n\t"
        PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
        "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
        "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
        "movq                %%mm0, %%mm2   \n\t" // UYVY UYVY(0)
        "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(4)
        "pand                %%mm7, %%mm0   \n\t" // U0V0 U0V0(0)
        "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(4)
        "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(0)
        "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(4)
        "packuswb            %%mm1, %%mm0   \n\t" // UVUV UVUV(0)
        "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(0)

        MOVNTQ"              %%mm2,  (%1, %%"REG_a", 2) \n\t"

        "movq     16(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(8)
        "movq     24(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(12)
        "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(8)
        "movq                %%mm2, %%mm4   \n\t" // UYVY UYVY(12)
        "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(8)
        "pand                %%mm7, %%mm2   \n\t" // U0V0 U0V0(12)
        "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(8)
        "psrlw                  $8, %%mm4   \n\t" // Y0Y0 Y0Y0(12)
        "packuswb            %%mm2, %%mm1   \n\t" // UVUV UVUV(8)
        "packuswb            %%mm4, %%mm3   \n\t" // YYYY YYYY(8)

        MOVNTQ"              %%mm3, 8(%1, %%"REG_a", 2) \n\t"

        /* Separate the interleaved UVUV words into U and V planes. */
        "movq                %%mm0, %%mm2   \n\t" // UVUV UVUV(0)
        "movq                %%mm1, %%mm3   \n\t" // UVUV UVUV(8)
        "psrlw                  $8, %%mm0   \n\t" // V0V0 V0V0(0)
        "psrlw                  $8, %%mm1   \n\t" // V0V0 V0V0(8)
        "pand                %%mm7, %%mm2   \n\t" // U0U0 U0U0(0)
        "pand                %%mm7, %%mm3   \n\t" // U0U0 U0U0(8)
        "packuswb            %%mm1, %%mm0   \n\t" // VVVV VVVV(0)
        "packuswb            %%mm3, %%mm2   \n\t" // UUUU UUUU(0)

        MOVNTQ"              %%mm0, (%3, %%"REG_a") \n\t"
        MOVNTQ"              %%mm2, (%2, %%"REG_a") \n\t"

        "add                    $8, %%"REG_a"   \n\t"
        "cmp                    %4, %%"REG_a"   \n\t"
        " jb                    1b          \n\t"
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
        : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        /* Odd line: extract luma only (Y is the high byte of each UYVY
         * word, hence the plain psrlw $8 with no masking needed). */
        __asm__ volatile(
        "xor                 %%"REG_a", %%"REG_a"   \n\t"
        ASMALIGN(4)
        "1:                                 \n\t"
        PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
        "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
        "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
        "movq     16(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(8)
        "movq     24(%0, %%"REG_a", 4), %%mm3   \n\t" // UYVY UYVY(12)
        "psrlw                  $8, %%mm0   \n\t" // Y0Y0 Y0Y0(0)
        "psrlw                  $8, %%mm1   \n\t" // Y0Y0 Y0Y0(4)
        "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(8)
        "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(12)
        "packuswb            %%mm1, %%mm0   \n\t" // YYYY YYYY(0)
        "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(8)

        MOVNTQ"              %%mm0,  (%1, %%"REG_a", 2) \n\t"
        MOVNTQ"              %%mm2, 8(%1, %%"REG_a", 2) \n\t"

        "add                    $8, %%"REG_a"   \n\t"
        "cmp                    %4, %%"REG_a"   \n\t"
        " jb                    1b          \n\t"

        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
        : "memory", "%"REG_a
        );
#else
        /* C fallback: even line gives Y+U+V, odd line gives Y only. */
        long i;
        for (i=0; i<chromWidth; i++)
        {
            udst[i]     = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i]     = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        for (i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#if HAVE_MMX
/* Leave MMX state and flush the non-temporal write buffers. */
__asm__ volatile(   EMMS"       \n\t"
                SFENCE"     \n\t"
                :::"memory");
#endif
}
2111

    
2112
/**
2113
 * Height should be a multiple of 2 and width should be a multiple of 2.
2114
 * (If this is a problem for anyone then tell me, and I will fix it.)
2115
 * Chrominance data is only taken from every second line,
2116
 * others are ignored in the C version.
2117
 * FIXME: Write HQ version.
2118
 */
2119
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2120
                                       long width, long height,
2121
                                       long lumStride, long chromStride, long srcStride)
2122
{
2123
    long y;
2124
    const x86_reg chromWidth= width>>1;
2125
#if HAVE_MMX
2126
    for (y=0; y<height-2; y+=2)
2127
    {
2128
        long i;
2129
        for (i=0; i<2; i++)
2130
        {
2131
            __asm__ volatile(
2132
            "mov                        %2, %%"REG_a"   \n\t"
2133
            "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
2134
            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2135
            "pxor                    %%mm7, %%mm7       \n\t"
2136
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2137
            ASMALIGN(4)
2138
            "1:                                         \n\t"
2139
            PREFETCH"    64(%0, %%"REG_d")              \n\t"
2140
            "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2141
            "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
2142
            "punpcklbw               %%mm7, %%mm0       \n\t"
2143
            "punpcklbw               %%mm7, %%mm1       \n\t"
2144
            "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
2145
            "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
2146
            "punpcklbw               %%mm7, %%mm2       \n\t"
2147
            "punpcklbw               %%mm7, %%mm3       \n\t"
2148
            "pmaddwd                 %%mm6, %%mm0       \n\t"
2149
            "pmaddwd                 %%mm6, %%mm1       \n\t"
2150
            "pmaddwd                 %%mm6, %%mm2       \n\t"
2151
            "pmaddwd                 %%mm6, %%mm3       \n\t"
2152
#ifndef FAST_BGR2YV12
2153
            "psrad                      $8, %%mm0       \n\t"
2154
            "psrad                      $8, %%mm1       \n\t"
2155
            "psrad                      $8, %%mm2       \n\t"
2156
            "psrad                      $8, %%mm3       \n\t"
2157
#endif
2158
            "packssdw                %%mm1, %%mm0       \n\t"
2159
            "packssdw                %%mm3, %%mm2       \n\t"
2160
            "pmaddwd                 %%mm5, %%mm0       \n\t"
2161
            "pmaddwd                 %%mm5, %%mm2       \n\t"
2162
            "packssdw                %%mm2, %%mm0       \n\t"
2163
            "psraw                      $7, %%mm0       \n\t"
2164

    
2165
            "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2166
            "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
2167
            "punpcklbw               %%mm7, %%mm4       \n\t"
2168
            "punpcklbw               %%mm7, %%mm1       \n\t"
2169
            "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
2170
            "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
2171
            "punpcklbw               %%mm7, %%mm2       \n\t"
2172
            "punpcklbw               %%mm7, %%mm3       \n\t"
2173
            "pmaddwd                 %%mm6, %%mm4       \n\t"
2174
            "pmaddwd                 %%mm6, %%mm1       \n\t"
2175
            "pmaddwd                 %%mm6, %%mm2       \n\t"
2176
            "pmaddwd                 %%mm6, %%mm3       \n\t"
2177
#ifndef FAST_BGR2YV12
2178
            "psrad                      $8, %%mm4       \n\t"
2179
            "psrad                      $8, %%mm1       \n\t"
2180
            "psrad                      $8, %%mm2       \n\t"
2181
            "psrad                      $8, %%mm3       \n\t"
2182
#endif
2183
            "packssdw                %%mm1, %%mm4       \n\t"
2184
            "packssdw                %%mm3, %%mm2       \n\t"
2185
            "pmaddwd                 %%mm5, %%mm4       \n\t"
2186
            "pmaddwd                 %%mm5, %%mm2       \n\t"
2187
            "add                       $24, %%"REG_d"   \n\t"
2188
            "packssdw                %%mm2, %%mm4       \n\t"
2189
            "psraw                      $7, %%mm4       \n\t"
2190

    
2191
            "packuswb                %%mm4, %%mm0       \n\t"
2192
            "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
2193

    
2194
            MOVNTQ"                  %%mm0, (%1, %%"REG_a") \n\t"
2195
            "add                        $8,      %%"REG_a"  \n\t"
2196
            " js                        1b                  \n\t"
2197
            : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
2198
            : "%"REG_a, "%"REG_d
2199
            );
2200
            ydst += lumStride;
2201
            src  += srcStride;
2202
        }
2203
        src -= srcStride*2;
2204
        __asm__ volatile(
2205
        "mov                        %4, %%"REG_a"   \n\t"
2206
        "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2207
        "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
2208
        "pxor                    %%mm7, %%mm7       \n\t"
2209
        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2210
        "add                 %%"REG_d", %%"REG_d"   \n\t"
2211
        ASMALIGN(4)
2212
        "1:                                         \n\t"
2213
        PREFETCH"    64(%0, %%"REG_d")              \n\t"
2214
        PREFETCH"    64(%1, %%"REG_d")              \n\t"
2215
#if HAVE_MMX2 || HAVE_AMD3DNOW
2216
        "movq          (%0, %%"REG_d"), %%mm0       \n\t"
2217
        "movq          (%1, %%"REG_d"), %%mm1       \n\t"
2218
        "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
2219
        "movq         6(%1, %%"REG_d"), %%mm3       \n\t"
2220
        PAVGB"                   %%mm1, %%mm0       \n\t"
2221
        PAVGB"                   %%mm3, %%mm2       \n\t"
2222
        "movq                    %%mm0, %%mm1       \n\t"
2223
        "movq                    %%mm2, %%mm3       \n\t"
2224
        "psrlq                     $24, %%mm0       \n\t"
2225
        "psrlq                     $24, %%mm2       \n\t"
2226
        PAVGB"                   %%mm1, %%mm0       \n\t"
2227
        PAVGB"                   %%mm3, %%mm2       \n\t"
2228
        "punpcklbw               %%mm7, %%mm0       \n\t"
2229
        "punpcklbw               %%mm7, %%mm2       \n\t"
2230
#else
2231
        "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2232
        "movd          (%1, %%"REG_d"), %%mm1       \n\t"
2233
        "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
2234
        "movd         3(%1, %%"REG_d"), %%mm3       \n\t"
2235
        "punpcklbw               %%mm7, %%mm0       \n\t"
2236
        "punpcklbw               %%mm7, %%mm1       \n\t"
2237
        "punpcklbw               %%mm7, %%mm2       \n\t"
2238
        "punpcklbw               %%mm7, %%mm3       \n\t"
2239
        "paddw                   %%mm1, %%mm0       \n\t"
2240
        "paddw                   %%mm3, %%mm2       \n\t"
2241
        "paddw                   %%mm2, %%mm0       \n\t"
2242
        "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
2243
        "movd         6(%1, %%"REG_d"), %%mm1       \n\t"
2244
        "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
2245
        "movd         9(%1, %%"REG_d"), %%mm3       \n\t"
2246
        "punpcklbw               %%mm7, %%mm4       \n\t"
2247
        "punpcklbw               %%mm7, %%mm1       \n\t"
2248
        "punpcklbw               %%mm7, %%mm2       \n\t"
2249
        "punpcklbw               %%mm7, %%mm3       \n\t"
2250
        "paddw                   %%mm1, %%mm4       \n\t"
2251
        "paddw                   %%mm3, %%mm2       \n\t"
2252
        "paddw                   %%mm4, %%mm2       \n\t"
2253
        "psrlw                      $2, %%mm0       \n\t"
2254
        "psrlw                      $2, %%mm2       \n\t"
2255
#endif
2256
        "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
2257
        "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
2258

    
2259
        "pmaddwd                 %%mm0, %%mm1       \n\t"
2260
        "pmaddwd                 %%mm2, %%mm3       \n\t"
2261
        "pmaddwd                 %%mm6, %%mm0       \n\t"
2262
        "pmaddwd                 %%mm6, %%mm2       \n\t"
2263
#ifndef FAST_BGR2YV12
2264
        "psrad                      $8, %%mm0       \n\t"
2265
        "psrad                      $8, %%mm1       \n\t"
2266
        "psrad                      $8, %%mm2       \n\t"
2267
        "psrad                      $8, %%mm3       \n\t"
2268
#endif
2269
        "packssdw                %%mm2, %%mm0       \n\t"
2270
        "packssdw                %%mm3, %%mm1       \n\t"
2271
        "pmaddwd                 %%mm5, %%mm0       \n\t"
2272
        "pmaddwd                 %%mm5, %%mm1       \n\t"
2273
        "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
2274
        "psraw                      $7, %%mm0       \n\t"
2275

    
2276
#if HAVE_MMX2 || HAVE_AMD3DNOW
2277
        "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
2278
        "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
2279
        "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
2280
        "movq        18(%1, %%"REG_d"), %%mm3       \n\t"
2281
        PAVGB"                   %%mm1, %%mm4       \n\t"
2282
        PAVGB"                   %%mm3, %%mm2       \n\t"
2283
        "movq                    %%mm4, %%mm1       \n\t"
2284
        "movq                    %%mm2, %%mm3       \n\t"
2285
        "psrlq                     $24, %%mm4       \n\t"
2286
        "psrlq                     $24, %%mm2       \n\t"
2287
        PAVGB"                   %%mm1, %%mm4       \n\t"
2288
        PAVGB"                   %%mm3, %%mm2       \n\t"
2289
        "punpcklbw               %%mm7, %%mm4       \n\t"
2290
        "punpcklbw               %%mm7, %%mm2       \n\t"
2291
#else
2292
        "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2293
        "movd        12(%1, %%"REG_d"), %%mm1       \n\t"
2294
        "movd        15(%0, %%"REG_d"), %%mm2       \n\t"
2295
        "movd        15(%1, %%"REG_d"), %%mm3       \n\t"
2296
        "punpcklbw               %%mm7, %%mm4       \n\t"
2297
        "punpcklbw               %%mm7, %%mm1       \n\t"
2298
        "punpcklbw               %%mm7, %%mm2       \n\t"
2299
        "punpcklbw               %%mm7, %%mm3       \n\t"
2300
        "paddw                   %%mm1, %%mm4       \n\t"
2301
        "paddw                   %%mm3, %%mm2       \n\t"
2302
        "paddw                   %%mm2, %%mm4       \n\t"
2303
        "movd        18(%0, %%"REG_d"), %%mm5       \n\t"
2304
        "movd        18(%1, %%"REG_d"), %%mm1       \n\t"
2305
        "movd        21(%0, %%"REG_d"), %%mm2       \n\t"
2306
        "movd        21(%1, %%"REG_d"), %%mm3       \n\t"
2307
        "punpcklbw               %%mm7, %%mm5       \n\t"
2308
        "punpcklbw               %%mm7, %%mm1       \n\t"
2309
        "punpcklbw               %%mm7, %%mm2       \n\t"
2310
        "punpcklbw               %%mm7, %%mm3       \n\t"
2311
        "paddw                   %%mm1, %%mm5       \n\t"
2312
        "paddw                   %%mm3, %%mm2       \n\t"
2313
        "paddw                   %%mm5, %%mm2       \n\t"
2314
        "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2315
        "psrlw                      $2, %%mm4       \n\t"
2316
        "psrlw                      $2, %%mm2       \n\t"
2317
#endif
2318
        "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
2319
        "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
2320

    
2321
        "pmaddwd                 %%mm4, %%mm1       \n\t"
2322
        "pmaddwd                 %%mm2, %%mm3       \n\t"
2323
        "pmaddwd                 %%mm6, %%mm4       \n\t"
2324
        "pmaddwd                 %%mm6, %%mm2       \n\t"
2325
#ifndef FAST_BGR2YV12
2326
        "psrad                      $8, %%mm4       \n\t"
2327
        "psrad                      $8, %%mm1       \n\t"
2328
        "psrad                      $8, %%mm2       \n\t"
2329
        "psrad                      $8, %%mm3       \n\t"
2330
#endif
2331
        "packssdw                %%mm2, %%mm4       \n\t"
2332
        "packssdw                %%mm3, %%mm1       \n\t"
2333
        "pmaddwd                 %%mm5, %%mm4       \n\t"
2334
        "pmaddwd                 %%mm5, %%mm1       \n\t"
2335
        "add                       $24, %%"REG_d"   \n\t"
2336
        "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
2337
        "psraw                      $7, %%mm4       \n\t"
2338

    
2339
        "movq                    %%mm0, %%mm1           \n\t"
2340
        "punpckldq               %%mm4, %%mm0           \n\t"
2341
        "punpckhdq               %%mm4, %%mm1           \n\t"
2342
        "packsswb                %%mm1, %%mm0           \n\t"
2343
        "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0         \n\t"
2344
        "movd                    %%mm0, (%2, %%"REG_a") \n\t"
2345
        "punpckhdq               %%mm0, %%mm0           \n\t"
2346
        "movd                    %%mm0, (%3, %%"REG_a") \n\t"
2347
        "add                        $4, %%"REG_a"       \n\t"
2348
        " js                        1b                  \n\t"
2349
        : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2350
        : "%"REG_a, "%"REG_d
2351
        );
2352

    
2353
        udst += chromStride;
2354
        vdst += chromStride;
2355
        src  += srcStride*2;
2356
    }
2357

    
2358
    __asm__ volatile(   EMMS"       \n\t"
2359
                    SFENCE"     \n\t"
2360
                    :::"memory");
2361
#else
2362
    y=0;
2363
#endif
2364
    for (; y<height; y+=2)
2365
    {
2366
        long i;
2367
        for (i=0; i<chromWidth; i++)
2368
        {
2369
            unsigned int b = src[6*i+0];
2370
            unsigned int g = src[6*i+1];
2371
            unsigned int r = src[6*i+2];
2372

    
2373
            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2374
            unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2375
            unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2376

    
2377
            udst[i]     = U;
2378
            vdst[i]     = V;
2379
            ydst[2*i]   = Y;
2380

    
2381
            b = src[6*i+3];
2382
            g = src[6*i+4];
2383
            r = src[6*i+5];
2384

    
2385
            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2386
            ydst[2*i+1]     = Y;
2387
        }
2388
        ydst += lumStride;
2389
        src  += srcStride;
2390

    
2391
        for (i=0; i<chromWidth; i++)
2392
        {
2393
            unsigned int b = src[6*i+0];
2394
            unsigned int g = src[6*i+1];
2395
            unsigned int r = src[6*i+2];
2396

    
2397
            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2398

    
2399
            ydst[2*i]     = Y;
2400

    
2401
            b = src[6*i+3];
2402
            g = src[6*i+4];
2403
            r = src[6*i+5];
2404

    
2405
            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2406
            ydst[2*i+1]     = Y;
2407
        }
2408
        udst += chromStride;
2409
        vdst += chromStride;
2410
        ydst += lumStride;
2411
        src  += srcStride;
2412
    }
2413
}
2414

    
2415
/*
 * Interleave two byte planes into one: dest[2*i] = src1[i], dest[2*i+1] = src2[i],
 * repeated for 'height' rows using the given per-row strides.
 * The SIMD paths consume 16 input bytes per iteration; a scalar loop
 * finishes the remaining width%16 bytes (or the whole row without MMX).
 */
static void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
                             long width, long height, long src1Stride,
                             long src2Stride, long dstStride){
    long h;

    for (h=0; h < height; h++)
    {
        long w;

#if HAVE_MMX
#if HAVE_SSE2
        /* NOTE(review): movdqa requires 16-byte-aligned rows of src1/src2,
         * and the loop body always runs at least once, so rows narrower than
         * 16 bytes are over-read/over-written -- presumably the callers
         * guarantee alignment and padding; confirm. */
        __asm__(
        "xor              %%"REG_a", %%"REG_a"  \n\t"
        "1:                                     \n\t"
        PREFETCH" 64(%1, %%"REG_a")             \n\t"
        PREFETCH" 64(%2, %%"REG_a")             \n\t"
        "movdqa     (%1, %%"REG_a"), %%xmm0     \n\t"
        "movdqa     (%1, %%"REG_a"), %%xmm1     \n\t" /* second copy of the same 16 src1 bytes */
        "movdqa     (%2, %%"REG_a"), %%xmm2     \n\t"
        "punpcklbw           %%xmm2, %%xmm0     \n\t" /* low  8 src1/src2 pairs */
        "punpckhbw           %%xmm2, %%xmm1     \n\t" /* high 8 src1/src2 pairs */
        "movntdq             %%xmm0,   (%0, %%"REG_a", 2)   \n\t"
        "movntdq             %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
        "add                    $16, %%"REG_a"  \n\t"
        "cmp                     %3, %%"REG_a"  \n\t"
        " jb                     1b             \n\t"
        ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
        : "memory", "%"REG_a""
        );
#else
        /* MMX variant: same interleave, 16 input bytes per pass via four
         * 8-byte punpck pairs and non-temporal stores. */
        __asm__(
        "xor %%"REG_a", %%"REG_a"               \n\t"
        "1:                                     \n\t"
        PREFETCH" 64(%1, %%"REG_a")             \n\t"
        PREFETCH" 64(%2, %%"REG_a")             \n\t"
        "movq       (%1, %%"REG_a"), %%mm0      \n\t"
        "movq      8(%1, %%"REG_a"), %%mm2      \n\t"
        "movq                 %%mm0, %%mm1      \n\t"
        "movq                 %%mm2, %%mm3      \n\t"
        "movq       (%2, %%"REG_a"), %%mm4      \n\t"
        "movq      8(%2, %%"REG_a"), %%mm5      \n\t"
        "punpcklbw            %%mm4, %%mm0      \n\t"
        "punpckhbw            %%mm4, %%mm1      \n\t"
        "punpcklbw            %%mm5, %%mm2      \n\t"
        "punpckhbw            %%mm5, %%mm3      \n\t"
        MOVNTQ"               %%mm0,   (%0, %%"REG_a", 2)   \n\t"
        MOVNTQ"               %%mm1,  8(%0, %%"REG_a", 2)   \n\t"
        MOVNTQ"               %%mm2, 16(%0, %%"REG_a", 2)   \n\t"
        MOVNTQ"               %%mm3, 24(%0, %%"REG_a", 2)   \n\t"
        "add                    $16, %%"REG_a"  \n\t"
        "cmp                     %3, %%"REG_a"  \n\t"
        " jb                     1b             \n\t"
        ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
        : "memory", "%"REG_a
        );
#endif
        /* scalar tail: bytes not covered by the 16-wide SIMD loop */
        for (w= (width&(~15)); w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#else
        /* pure C fallback: whole row */
        for (w=0; w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#endif
        dest += dstStride;
                src1 += src1Stride;
                src2 += src2Stride;
    }
#if HAVE_MMX
    /* leave MMX state and flush non-temporal stores */
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
        );
#endif
}
2495

    
2496
/*
 * 2x point-upscale two (chroma) planes: output line y is source line y>>1
 * with every byte doubled horizontally (d[2*x] = d[2*x+1] = s[x]).
 * Operates on w = width/2 input bytes per line and h = height/2 output lines
 * per plane; src1/dst1 and src2/dst2 are processed independently.
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       long width, long height,
                                       long srcStride1, long srcStride2,
                                       long dstStride1, long dstStride2)
{
    x86_reg y;
    long x,w,h;
    w=width/2; h=height/2;
#if HAVE_MMX
    /* warm the cache with the second input line of each plane */
    __asm__ volatile(
    PREFETCH" %0    \n\t"
    PREFETCH" %1    \n\t"
    ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    for (y=0;y<h;y++){
    const uint8_t* s1=src1+srcStride1*(y>>1);   /* each source line is used for two output lines */
    uint8_t* d=dst1+dstStride1*y;
    x=0;
#if HAVE_MMX
    /* 32 input bytes -> 64 output bytes per iteration; punpck*bw with the
     * register itself duplicates every byte */
    for (;x<w-31;x+=32)
    {
        __asm__ volatile(
        PREFETCH"   32%1        \n\t"
        "movq         %1, %%mm0 \n\t"
        "movq        8%1, %%mm2 \n\t"
        "movq       16%1, %%mm4 \n\t"
        "movq       24%1, %%mm6 \n\t"
        "movq      %%mm0, %%mm1 \n\t"
        "movq      %%mm2, %%mm3 \n\t"
        "movq      %%mm4, %%mm5 \n\t"
        "movq      %%mm6, %%mm7 \n\t"
        "punpcklbw %%mm0, %%mm0 \n\t"
        "punpckhbw %%mm1, %%mm1 \n\t"
        "punpcklbw %%mm2, %%mm2 \n\t"
        "punpckhbw %%mm3, %%mm3 \n\t"
        "punpcklbw %%mm4, %%mm4 \n\t"
        "punpckhbw %%mm5, %%mm5 \n\t"
        "punpcklbw %%mm6, %%mm6 \n\t"
        "punpckhbw %%mm7, %%mm7 \n\t"
        MOVNTQ"    %%mm0,   %0  \n\t"
        MOVNTQ"    %%mm1,  8%0  \n\t"
        MOVNTQ"    %%mm2, 16%0  \n\t"
        MOVNTQ"    %%mm3, 24%0  \n\t"
        MOVNTQ"    %%mm4, 32%0  \n\t"
        MOVNTQ"    %%mm5, 40%0  \n\t"
        MOVNTQ"    %%mm6, 48%0  \n\t"
        MOVNTQ"    %%mm7, 56%0"
        :"=m"(d[2*x])
        :"m"(s1[x])
        :"memory");
    }
#endif
    /* scalar tail (or whole line without MMX) */
    for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    /* second plane: identical processing of src2 -> dst2 */
    for (y=0;y<h;y++){
    const uint8_t* s2=src2+srcStride2*(y>>1);
    uint8_t* d=dst2+dstStride2*y;
    x=0;
#if HAVE_MMX
    for (;x<w-31;x+=32)
    {
        __asm__ volatile(
        PREFETCH"   32%1        \n\t"
        "movq         %1, %%mm0 \n\t"
        "movq        8%1, %%mm2 \n\t"
        "movq       16%1, %%mm4 \n\t"
        "movq       24%1, %%mm6 \n\t"
        "movq      %%mm0, %%mm1 \n\t"
        "movq      %%mm2, %%mm3 \n\t"
        "movq      %%mm4, %%mm5 \n\t"
        "movq      %%mm6, %%mm7 \n\t"
        "punpcklbw %%mm0, %%mm0 \n\t"
        "punpckhbw %%mm1, %%mm1 \n\t"
        "punpcklbw %%mm2, %%mm2 \n\t"
        "punpckhbw %%mm3, %%mm3 \n\t"
        "punpcklbw %%mm4, %%mm4 \n\t"
        "punpckhbw %%mm5, %%mm5 \n\t"
        "punpcklbw %%mm6, %%mm6 \n\t"
        "punpckhbw %%mm7, %%mm7 \n\t"
        MOVNTQ"    %%mm0,   %0  \n\t"
        MOVNTQ"    %%mm1,  8%0  \n\t"
        MOVNTQ"    %%mm2, 16%0  \n\t"
        MOVNTQ"    %%mm3, 24%0  \n\t"
        MOVNTQ"    %%mm4, 32%0  \n\t"
        MOVNTQ"    %%mm5, 40%0  \n\t"
        MOVNTQ"    %%mm6, 48%0  \n\t"
        MOVNTQ"    %%mm7, 56%0"
        :"=m"(d[2*x])
        :"m"(s2[x])
        :"memory");
    }
#endif
    for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#if HAVE_MMX
    /* leave MMX state and flush non-temporal stores */
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
        );
#endif
}
2599

    
2600
/*
 * Pack planar Y (src1) + U (src2) + V (src3) into interleaved YUYV (dst).
 * Each chroma line is reused for four output lines (indices use y>>2), and
 * each chroma sample is repeated for four horizontally adjacent lumas:
 * one x iteration emits Y U Y V Y U Y V from yp[4x..4x+3], up[x], vp[x].
 * w = width/2 iterations per line, 8 output bytes each.
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        long width, long height,
                                        long srcStride1, long srcStride2,
                                        long srcStride3, long dstStride)
{
    x86_reg x;
    long y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++){
    const uint8_t* yp=src1+srcStride1*y;
    const uint8_t* up=src2+srcStride2*(y>>2);   /* chroma subsampled 4x vertically */
    const uint8_t* vp=src3+srcStride3*(y>>2);
    uint8_t* d=dst+dstStride*y;
    x=0;
#if HAVE_MMX
    /* 8 x-steps per iteration: 32 luma + 8 U + 8 V bytes -> 64 output bytes */
    for (;x<w-7;x+=8)
    {
        __asm__ volatile(
        PREFETCH"   32(%1, %0)          \n\t"
        PREFETCH"   32(%2, %0)          \n\t"
        PREFETCH"   32(%3, %0)          \n\t"
        "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
        "movq         (%2, %0), %%mm1   \n\t" /* U0U1U2U3U4U5U6U7 */
        "movq         (%3, %0), %%mm2   \n\t" /* V0V1V2V3V4V5V6V7 */
        "movq            %%mm0, %%mm3   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
        "movq            %%mm1, %%mm4   \n\t" /* U0U1U2U3U4U5U6U7 */
        "movq            %%mm2, %%mm5   \n\t" /* V0V1V2V3V4V5V6V7 */
        "punpcklbw       %%mm1, %%mm1   \n\t" /* U0U0 U1U1 U2U2 U3U3 */
        "punpcklbw       %%mm2, %%mm2   \n\t" /* V0V0 V1V1 V2V2 V3V3 */
        "punpckhbw       %%mm4, %%mm4   \n\t" /* U4U4 U5U5 U6U6 U7U7 */
        "punpckhbw       %%mm5, %%mm5   \n\t" /* V4V4 V5V5 V6V6 V7V7 */

        "movq            %%mm1, %%mm6   \n\t"
        "punpcklbw       %%mm2, %%mm1   \n\t" /* U0V0 U0V0 U1V1 U1V1*/
        "punpcklbw       %%mm1, %%mm0   \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
        "punpckhbw       %%mm1, %%mm3   \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
        MOVNTQ"          %%mm0,  (%4, %0, 8)    \n\t"
        MOVNTQ"          %%mm3, 8(%4, %0, 8)    \n\t"

        "punpckhbw       %%mm2, %%mm6   \n\t" /* U2V2 U2V2 U3V3 U3V3*/
        "movq     8(%1, %0, 4), %%mm0   \n\t"
        "movq            %%mm0, %%mm3   \n\t"
        "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U2 Y V2 Y U2 Y V2*/
        "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U3 Y V3 Y U3 Y V3*/
        MOVNTQ"          %%mm0, 16(%4, %0, 8)   \n\t"
        MOVNTQ"          %%mm3, 24(%4, %0, 8)   \n\t"

        "movq            %%mm4, %%mm6   \n\t"
        "movq    16(%1, %0, 4), %%mm0   \n\t"
        "movq            %%mm0, %%mm3   \n\t"
        "punpcklbw       %%mm5, %%mm4   \n\t"
        "punpcklbw       %%mm4, %%mm0   \n\t" /* Y U4 Y V4 Y U4 Y V4*/
        "punpckhbw       %%mm4, %%mm3   \n\t" /* Y U5 Y V5 Y U5 Y V5*/
        MOVNTQ"          %%mm0, 32(%4, %0, 8)   \n\t"
        MOVNTQ"          %%mm3, 40(%4, %0, 8)   \n\t"

        "punpckhbw       %%mm5, %%mm6   \n\t"
        "movq    24(%1, %0, 4), %%mm0   \n\t"
        "movq            %%mm0, %%mm3   \n\t"
        "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U6 Y V6 Y U6 Y V6*/
        "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U7 Y V7 Y U7 Y V7*/
        MOVNTQ"          %%mm0, 48(%4, %0, 8)   \n\t"
        MOVNTQ"          %%mm3, 56(%4, %0, 8)   \n\t"

        : "+r" (x)
        : "r"(yp), "r" (up), "r"(vp), "r"(d)
        :"memory");
    }
#endif
    /* scalar tail: emit one Y U Y V Y U Y V group per x */
    for (; x<w; x++)
    {
        const long x2 = x<<2;
        d[8*x+0] = yp[x2];
        d[8*x+1] = up[x];
        d[8*x+2] = yp[x2+1];
        d[8*x+3] = vp[x];
        d[8*x+4] = yp[x2+2];
        d[8*x+5] = up[x];
        d[8*x+6] = yp[x2+3];
        d[8*x+7] = vp[x];
    }
    }
#if HAVE_MMX
    /* leave MMX state and flush non-temporal stores */
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
        );
#endif
}
2691

    
2692
/*
 * Copy every second byte: dst[i] = src[2*i] for i in [0, count).
 * The pointers are biased past the end and 'count' negated so both the
 * SIMD and the scalar loop count a negative index up toward zero.
 */
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst +=   count;
    src += 2*count;
    count= - count;

#if HAVE_MMX
    if(count <= -16){
        /* +15/-15: keep the last (up to 15) outputs for the scalar tail */
        count += 15;
        /* mm7 = 0x00FF... mask; pand keeps even source bytes, packuswb
         * squeezes them together, 16 output bytes per iteration.
         * NOTE(review): this asm writes to dst via MOVNTQ but declares no
         * "memory" clobber -- presumably tolerated by the surrounding code
         * generation; confirm. */
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq -30(%1, %0, 2), %%mm0        \n\t"
            "movq -22(%1, %0, 2), %%mm1        \n\t"
            "movq -14(%1, %0, 2), %%mm2        \n\t"
            "movq  -6(%1, %0, 2), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            MOVNTQ"        %%mm0,-15(%2, %0)   \n\t"
            MOVNTQ"        %%mm2,- 7(%2, %0)   \n\t"
            "add             $16, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;
    }
#endif
    /* scalar tail (or whole run without MMX) */
    while(count<0){
        dst[count]= src[2*count];
        count++;
    }
}
2730

    
2731
/*
 * De-interleave the even-position bytes of 4-byte groups:
 * dst0[i] = src[4*i+0], dst1[i] = src[4*i+2]
 * (e.g. U and V planes from packed UYVY). Pointers are biased and 'count'
 * negated so the loops count a negative index up toward zero.
 */
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
#if HAVE_MMX
    if(count <= -8){
        /* +7/-7: leave the last (up to 7) outputs for the scalar tail */
        count += 7;
        /* First pand/packuswb keeps bytes at even offsets (src[4i], src[4i+2]);
         * the second split separates those into the src[4i] stream (-> dst0)
         * and the src[4i+2] stream (-> dst1), 8 bytes per plane per pass. */
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    /* scalar tail (or whole run without MMX) */
    while(count<0){
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2778

    
2779
/*
 * Like extract_even2, but averages two source lines first:
 * dst0[i] = avg(src0[4*i+0], src1[4*i+0]), dst1[i] = avg(src0[4*i+2], src1[4*i+2])
 * (used to vertically average chroma when going 4:2:2 -> 4:2:0).
 * NOTE(review): PAVGB rounds up while the scalar tail's (a+b)>>1 truncates,
 * so SIMD-covered and tail samples can differ by 1 LSB -- presumably an
 * accepted trade-off; confirm.
 */
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8){
        /* +7/-7: leave the last (up to 7) outputs for the scalar tail */
        count += 7;
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7        \n\t"
            "psrlw             $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
            "pand           %%mm7, %%mm0        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm2        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm1, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm2        \n\t"
            "movq           %%mm0, %%mm1        \n\t"
            "movq           %%mm2, %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm2, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm1        \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
            "add               $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    /* scalar tail (or whole run without PAVGB) */
    while(count<0){
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2831

    
2832
/*
 * De-interleave the odd-position bytes of 4-byte groups:
 * dst0[i] = src[4*i+1], dst1[i] = src[4*i+3]
 * (e.g. U and V planes from packed YUYV). The SIMD path selects odd bytes
 * with psrlw $8; the scalar tail gets the same effect from the src++ below.
 */
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
#if HAVE_MMX
    if(count <= -8){
        /* +7/-7: leave the last (up to 7) outputs for the scalar tail */
        count += 7;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"
            "psrlw            $8, %%mm1        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "psrlw            $8, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    /* bias src so the even-offset indexing below reads the odd bytes */
    src++;
    while(count<0){
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2880

    
2881
/*
 * Like extract_odd2, but averages two source lines first:
 * dst0[i] = avg(src0[4*i+1], src1[4*i+1]), dst1[i] = avg(src0[4*i+3], src1[4*i+3])
 * (used to vertically average YUYV chroma when going to 4:2:0).
 * NOTE(review): PAVGB rounds up while the scalar tail's (a+b)>>1 truncates,
 * so SIMD-covered and tail samples can differ by 1 LSB -- presumably an
 * accepted trade-off; confirm.
 */
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8){
        /* +7/-7: leave the last (up to 7) outputs for the scalar tail */
        count += 7;
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7        \n\t"
            "psrlw             $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"
            "psrlw             $8, %%mm1        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "psrlw             $8, %%mm3        \n\t"
            "packuswb       %%mm1, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm2        \n\t"
            "movq           %%mm0, %%mm1        \n\t"
            "movq           %%mm2, %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm2, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm1        \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
            "add               $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    /* bias sources so the even-offset indexing below reads the odd bytes */
    src0++;
    src1++;
    while(count<0){
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2935

    
2936
static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2937
                                      long width, long height,
2938
                                      long lumStride, long chromStride, long srcStride)
2939
{
2940
    long y;
2941
    const long chromWidth= -((-width)>>1);
2942

    
2943
    for (y=0; y<height; y++){
2944
        RENAME(extract_even)(src, ydst, width);
2945
        if(y&1){
2946
            RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2947
            udst+= chromStride;
2948
            vdst+= chromStride;
2949
        }
2950

    
2951
        src += srcStride;
2952
        ydst+= lumStride;
2953
    }
2954
#if HAVE_MMX
2955
    __asm__(
2956
        EMMS"       \n\t"
2957
        SFENCE"     \n\t"
2958
        ::: "memory"
2959
        );
2960
#endif
2961
}
2962

    
2963
static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2964
                                      long width, long height,
2965
                                      long lumStride, long chromStride, long srcStride)
2966
{
2967
    long y;
2968
    const long chromWidth= -((-width)>>1);
2969

    
2970
    for (y=0; y<height; y++){
2971
        RENAME(extract_even)(src, ydst, width);
2972
        RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2973

    
2974
        src += srcStride;
2975
        ydst+= lumStride;
2976
        udst+= chromStride;
2977
        vdst+= chromStride;
2978
    }
2979
#if HAVE_MMX
2980
    __asm__(
2981
        EMMS"       \n\t"
2982
        SFENCE"     \n\t"
2983
        ::: "memory"
2984
        );
2985
#endif
2986
}
2987

    
2988
static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2989
                                      long width, long height,
2990
                                      long lumStride, long chromStride, long srcStride)
2991
{
2992
    long y;
2993
    const long chromWidth= -((-width)>>1);
2994

    
2995
    for (y=0; y<height; y++){
2996
        RENAME(extract_even)(src+1, ydst, width);
2997
        if(y&1){
2998
            RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2999
            udst+= chromStride;
3000
            vdst+= chromStride;
3001
        }
3002

    
3003
        src += srcStride;
3004
        ydst+= lumStride;
3005
    }
3006
#if HAVE_MMX
3007
    __asm__(
3008
        EMMS"       \n\t"
3009
        SFENCE"     \n\t"
3010
        ::: "memory"
3011
        );
3012
#endif
3013
}
3014

    
3015
static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
3016
                                      long width, long height,
3017
                                      long lumStride, long chromStride, long srcStride)
3018
{
3019
    long y;
3020
    const long chromWidth= -((-width)>>1);
3021

    
3022
    for (y=0; y<height; y++){
3023
        RENAME(extract_even)(src+1, ydst, width);
3024
        RENAME(extract_even2)(src, udst, vdst, chromWidth);
3025

    
3026
        src += srcStride;
3027
        ydst+= lumStride;
3028
        udst+= chromStride;
3029
        vdst+= chromStride;
3030
    }
3031
#if HAVE_MMX
3032
    __asm__(
3033
        EMMS"       \n\t"
3034
        SFENCE"     \n\t"
3035
        ::: "memory"
3036
        );
3037
#endif
3038
}
3039

    
3040
static inline void RENAME(rgb2rgb_init)(void){
3041
    rgb15to16       = RENAME(rgb15to16);
3042
    rgb15tobgr24    = RENAME(rgb15tobgr24);
3043
    rgb15to32       = RENAME(rgb15to32);
3044
    rgb16tobgr24    = RENAME(rgb16tobgr24);
3045
    rgb16to32       = RENAME(rgb16to32);
3046
    rgb16to15       = RENAME(rgb16to15);
3047
    rgb24tobgr16    = RENAME(rgb24tobgr16);
3048
    rgb24tobgr15    = RENAME(rgb24tobgr15);
3049
    rgb24tobgr32    = RENAME(rgb24tobgr32);
3050
    rgb32to16       = RENAME(rgb32to16);
3051
    rgb32to15       = RENAME(rgb32to15);
3052
    rgb32tobgr24    = RENAME(rgb32tobgr24);
3053
    rgb24to15       = RENAME(rgb24to15);
3054
    rgb24to16       = RENAME(rgb24to16);
3055
    rgb24tobgr24    = RENAME(rgb24tobgr24);
3056
    rgb32tobgr32    = RENAME(rgb32tobgr32);
3057
    rgb32tobgr16    = RENAME(rgb32tobgr16);
3058
    rgb32tobgr15    = RENAME(rgb32tobgr15);
3059
    yv12toyuy2      = RENAME(yv12toyuy2);
3060
    yv12touyvy      = RENAME(yv12touyvy);
3061
    yuv422ptoyuy2   = RENAME(yuv422ptoyuy2);
3062
    yuv422ptouyvy   = RENAME(yuv422ptouyvy);
3063
    yuy2toyv12      = RENAME(yuy2toyv12);
3064
//    yvu9toyv12      = RENAME(yvu9toyv12);
3065
    planar2x        = RENAME(planar2x);
3066
    rgb24toyv12     = RENAME(rgb24toyv12);
3067
    interleaveBytes = RENAME(interleaveBytes);
3068
    vu9_to_vu12     = RENAME(vu9_to_vu12);
3069
    yvu9_to_yuy2    = RENAME(yvu9_to_yuy2);
3070

    
3071
    uyvytoyuv420    = RENAME(uyvytoyuv420);
3072
    uyvytoyuv422    = RENAME(uyvytoyuv422);
3073
    yuyvtoyuv420    = RENAME(yuyvtoyuv420);
3074
    yuyvtoyuv422    = RENAME(yuyvtoyuv422);
3075
}