Statistics
| Branch: | Revision:

ffmpeg / libswscale / rgb2rgb_template.c @ 594ff7cc

History | View | Annotate | Download (98 KB)

1
/*
2
 *  rgb2rgb.c, Software RGB to RGB convertor
3
 *  pluralize by Software PAL8 to RGB convertor
4
 *               Software YUV to YUV convertor
5
 *               Software YUV to RGB convertor
6
 *  Written by Nick Kurshev.
7
 *  palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8
 *  lot of big-endian byteorder fixes by Alex Beregszaszi
9
 *
10
 * This file is part of FFmpeg.
11
 *
12
 * FFmpeg is free software; you can redistribute it and/or modify
13
 * it under the terms of the GNU General Public License as published by
14
 * the Free Software Foundation; either version 2 of the License, or
15
 * (at your option) any later version.
16
 *
17
 * FFmpeg is distributed in the hope that it will be useful,
18
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
 * GNU General Public License for more details.
21
 *
22
 * You should have received a copy of the GNU General Public License
23
 * along with FFmpeg; if not, write to the Free Software
24
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25
 *
26
 * The C code (not assembly, mmx, ...) of this file can be used
27
 * under the LGPL license.
28
 */
29

    
30
#include <stddef.h>
31
#include <inttypes.h> /* for __WORDSIZE */
32

    
33
/* Use the project-supplied MP_WORDSIZE when the C library headers did not
 * define __WORDSIZE. */
#ifndef __WORDSIZE
// #warning You have a misconfigured system and will probably lose performance!
#define __WORDSIZE MP_WORDSIZE
#endif

/* Discard any earlier definitions so the instruction-name macros below can be
 * (re)defined for the CPU feature set selected by the HAVE_* macros.
 * NOTE(review): presumably this template is included several times with
 * different HAVE_* settings -- confirm against the including file. */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

/* Width in bytes of the SIMD register assumed by the code using this macro. */
#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/* Prefetch and byte-average mnemonics. Without 3DNow! or MMX2 the prefetch
 * macros expand to an assembler comment so the instruction disappears;
 * PAVGB is intentionally left undefined in that case. */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#ifdef __APPLE__
#define PREFETCH "#"
#define PREFETCHW "#"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif
#endif

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Store mnemonic used by the MMX loops and the matching store fence. Without
 * MMX2 a plain movq is used and the fence becomes an assembler comment. */
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
84

    
85
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
86
{
87
    uint8_t *dest = dst;
88
    const uint8_t *s = src;
89
    const uint8_t *end;
90
    #ifdef HAVE_MMX
91
        const uint8_t *mm_end;
92
    #endif
93
    end = s + src_size;
94
    #ifdef HAVE_MMX
95
        __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
96
        mm_end = end - 23;
97
        __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
98
        while (s < mm_end)
99
        {
100
            __asm __volatile(
101
            PREFETCH"    32%1           \n\t"
102
            "movd          %1, %%mm0    \n\t"
103
            "punpckldq    3%1, %%mm0    \n\t"
104
            "movd         6%1, %%mm1    \n\t"
105
            "punpckldq    9%1, %%mm1    \n\t"
106
            "movd        12%1, %%mm2    \n\t"
107
            "punpckldq   15%1, %%mm2    \n\t"
108
            "movd        18%1, %%mm3    \n\t"
109
            "punpckldq   21%1, %%mm3    \n\t"
110
            "pand       %%mm7, %%mm0    \n\t"
111
            "pand       %%mm7, %%mm1    \n\t"
112
            "pand       %%mm7, %%mm2    \n\t"
113
            "pand       %%mm7, %%mm3    \n\t"
114
            MOVNTQ"     %%mm0,   %0     \n\t"
115
            MOVNTQ"     %%mm1,  8%0     \n\t"
116
            MOVNTQ"     %%mm2, 16%0     \n\t"
117
            MOVNTQ"     %%mm3, 24%0"
118
            :"=m"(*dest)
119
            :"m"(*s)
120
            :"memory");
121
            dest += 32;
122
            s += 24;
123
        }
124
        __asm __volatile(SFENCE:::"memory");
125
        __asm __volatile(EMMS:::"memory");
126
    #endif
127
    while (s < end)
128
    {
129
    #ifdef WORDS_BIGENDIAN
130
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
131
        *dest++ = 0;
132
        *dest++ = s[2];
133
        *dest++ = s[1];
134
        *dest++ = s[0];
135
        s+=3;
136
    #else
137
        *dest++ = *s++;
138
        *dest++ = *s++;
139
        *dest++ = *s++;
140
        *dest++ = 0;
141
    #endif
142
    }
143
}
144

    
145
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
146
{
147
    uint8_t *dest = dst;
148
    const uint8_t *s = src;
149
    const uint8_t *end;
150
#ifdef HAVE_MMX
151
    const uint8_t *mm_end;
152
#endif
153
    end = s + src_size;
154
#ifdef HAVE_MMX
155
    __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
156
    mm_end = end - 31;
157
    while (s < mm_end)
158
    {
159
        __asm __volatile(
160
        PREFETCH"    32%1           \n\t"
161
        "movq          %1, %%mm0    \n\t"
162
        "movq         8%1, %%mm1    \n\t"
163
        "movq        16%1, %%mm4    \n\t"
164
        "movq        24%1, %%mm5    \n\t"
165
        "movq       %%mm0, %%mm2    \n\t"
166
        "movq       %%mm1, %%mm3    \n\t"
167
        "movq       %%mm4, %%mm6    \n\t"
168
        "movq       %%mm5, %%mm7    \n\t"
169
        "psrlq         $8, %%mm2    \n\t"
170
        "psrlq         $8, %%mm3    \n\t"
171
        "psrlq         $8, %%mm6    \n\t"
172
        "psrlq         $8, %%mm7    \n\t"
173
        "pand          %2, %%mm0    \n\t"
174
        "pand          %2, %%mm1    \n\t"
175
        "pand          %2, %%mm4    \n\t"
176
        "pand          %2, %%mm5    \n\t"
177
        "pand          %3, %%mm2    \n\t"
178
        "pand          %3, %%mm3    \n\t"
179
        "pand          %3, %%mm6    \n\t"
180
        "pand          %3, %%mm7    \n\t"
181
        "por        %%mm2, %%mm0    \n\t"
182
        "por        %%mm3, %%mm1    \n\t"
183
        "por        %%mm6, %%mm4    \n\t"
184
        "por        %%mm7, %%mm5    \n\t"
185

    
186
        "movq       %%mm1, %%mm2    \n\t"
187
        "movq       %%mm4, %%mm3    \n\t"
188
        "psllq        $48, %%mm2    \n\t"
189
        "psllq        $32, %%mm3    \n\t"
190
        "pand          %4, %%mm2    \n\t"
191
        "pand          %5, %%mm3    \n\t"
192
        "por        %%mm2, %%mm0    \n\t"
193
        "psrlq        $16, %%mm1    \n\t"
194
        "psrlq        $32, %%mm4    \n\t"
195
        "psllq        $16, %%mm5    \n\t"
196
        "por        %%mm3, %%mm1    \n\t"
197
        "pand          %6, %%mm5    \n\t"
198
        "por        %%mm5, %%mm4    \n\t"
199

    
200
        MOVNTQ"     %%mm0,   %0     \n\t"
201
        MOVNTQ"     %%mm1,  8%0     \n\t"
202
        MOVNTQ"     %%mm4, 16%0"
203
        :"=m"(*dest)
204
        :"m"(*s),"m"(mask24l),
205
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
206
        :"memory");
207
        dest += 24;
208
        s += 32;
209
    }
210
    __asm __volatile(SFENCE:::"memory");
211
    __asm __volatile(EMMS:::"memory");
212
#endif
213
    while (s < end)
214
    {
215
#ifdef WORDS_BIGENDIAN
216
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
217
        s++;
218
        dest[2] = *s++;
219
        dest[1] = *s++;
220
        dest[0] = *s++;
221
        dest += 3;
222
#else
223
        *dest++ = *s++;
224
        *dest++ = *s++;
225
        *dest++ = *s++;
226
        s++;
227
#endif
228
    }
229
}
230

    
231
/*
232
 Original by Strepto/Astral
233
 ported to gcc & bugfixed : A'rpi
234
 MMX2, 3DNOW optimization by Nick Kurshev
235
 32 bit C version, and and&add trick by Michael Niedermayer
236
*/
237
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
238
{
239
    register const uint8_t* s=src;
240
    register uint8_t* d=dst;
241
    register const uint8_t *end;
242
    const uint8_t *mm_end;
243
    end = s + src_size;
244
#ifdef HAVE_MMX
245
    __asm __volatile(PREFETCH"    %0"::"m"(*s));
246
    __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
247
    mm_end = end - 15;
248
    while (s<mm_end)
249
    {
250
        __asm __volatile(
251
        PREFETCH"  32%1         \n\t"
252
        "movq        %1, %%mm0  \n\t"
253
        "movq       8%1, %%mm2  \n\t"
254
        "movq     %%mm0, %%mm1  \n\t"
255
        "movq     %%mm2, %%mm3  \n\t"
256
        "pand     %%mm4, %%mm0  \n\t"
257
        "pand     %%mm4, %%mm2  \n\t"
258
        "paddw    %%mm1, %%mm0  \n\t"
259
        "paddw    %%mm3, %%mm2  \n\t"
260
        MOVNTQ"   %%mm0,  %0    \n\t"
261
        MOVNTQ"   %%mm2, 8%0"
262
        :"=m"(*d)
263
        :"m"(*s)
264
        );
265
        d+=16;
266
        s+=16;
267
    }
268
    __asm __volatile(SFENCE:::"memory");
269
    __asm __volatile(EMMS:::"memory");
270
#endif
271
    mm_end = end - 3;
272
    while (s < mm_end)
273
    {
274
        register unsigned x= *((uint32_t *)s);
275
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
276
        d+=4;
277
        s+=4;
278
    }
279
    if (s < end)
280
    {
281
        register unsigned short x= *((uint16_t *)s);
282
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
283
    }
284
}
285

    
286
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
287
{
288
    register const uint8_t* s=src;
289
    register uint8_t* d=dst;
290
    register const uint8_t *end;
291
    const uint8_t *mm_end;
292
    end = s + src_size;
293
#ifdef HAVE_MMX
294
    __asm __volatile(PREFETCH"    %0"::"m"(*s));
295
    __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
296
    __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
297
    mm_end = end - 15;
298
    while (s<mm_end)
299
    {
300
        __asm __volatile(
301
        PREFETCH"  32%1         \n\t"
302
        "movq        %1, %%mm0  \n\t"
303
        "movq       8%1, %%mm2  \n\t"
304
        "movq     %%mm0, %%mm1  \n\t"
305
        "movq     %%mm2, %%mm3  \n\t"
306
        "psrlq       $1, %%mm0  \n\t"
307
        "psrlq       $1, %%mm2  \n\t"
308
        "pand     %%mm7, %%mm0  \n\t"
309
        "pand     %%mm7, %%mm2  \n\t"
310
        "pand     %%mm6, %%mm1  \n\t"
311
        "pand     %%mm6, %%mm3  \n\t"
312
        "por      %%mm1, %%mm0  \n\t"
313
        "por      %%mm3, %%mm2  \n\t"
314
        MOVNTQ"   %%mm0,  %0    \n\t"
315
        MOVNTQ"   %%mm2, 8%0"
316
        :"=m"(*d)
317
        :"m"(*s)
318
        );
319
        d+=16;
320
        s+=16;
321
    }
322
    __asm __volatile(SFENCE:::"memory");
323
    __asm __volatile(EMMS:::"memory");
324
#endif
325
    mm_end = end - 3;
326
    while (s < mm_end)
327
    {
328
        register uint32_t x= *((uint32_t *)s);
329
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
330
        s+=4;
331
        d+=4;
332
    }
333
    if (s < end)
334
    {
335
        register uint16_t x= *((uint16_t *)s);
336
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
337
        s+=2;
338
        d+=2;
339
    }
340
}
341

    
342
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
343
{
344
    const uint8_t *s = src;
345
    const uint8_t *end;
346
#ifdef HAVE_MMX
347
    const uint8_t *mm_end;
348
#endif
349
    uint16_t *d = (uint16_t *)dst;
350
    end = s + src_size;
351
#ifdef HAVE_MMX
352
    mm_end = end - 15;
353
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
354
    asm volatile(
355
    "movq           %3, %%mm5   \n\t"
356
    "movq           %4, %%mm6   \n\t"
357
    "movq           %5, %%mm7   \n\t"
358
    "jmp 2f                     \n\t"
359
    ASMALIGN(4)
360
    "1:                         \n\t"
361
    PREFETCH"   32(%1)          \n\t"
362
    "movd         (%1), %%mm0   \n\t"
363
    "movd        4(%1), %%mm3   \n\t"
364
    "punpckldq   8(%1), %%mm0   \n\t"
365
    "punpckldq  12(%1), %%mm3   \n\t"
366
    "movq        %%mm0, %%mm1   \n\t"
367
    "movq        %%mm3, %%mm4   \n\t"
368
    "pand        %%mm6, %%mm0   \n\t"
369
    "pand        %%mm6, %%mm3   \n\t"
370
    "pmaddwd     %%mm7, %%mm0   \n\t"
371
    "pmaddwd     %%mm7, %%mm3   \n\t"
372
    "pand        %%mm5, %%mm1   \n\t"
373
    "pand        %%mm5, %%mm4   \n\t"
374
    "por         %%mm1, %%mm0   \n\t"
375
    "por         %%mm4, %%mm3   \n\t"
376
    "psrld          $5, %%mm0   \n\t"
377
    "pslld         $11, %%mm3   \n\t"
378
    "por         %%mm3, %%mm0   \n\t"
379
    MOVNTQ"      %%mm0, (%0)    \n\t"
380
    "add           $16,  %1     \n\t"
381
    "add            $8,  %0     \n\t"
382
    "2:                         \n\t"
383
    "cmp            %2,  %1     \n\t"
384
    " jb            1b          \n\t"
385
    : "+r" (d), "+r"(s)
386
    : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
387
    );
388
#else
389
    __asm __volatile(PREFETCH"    %0"::"m"(*src):"memory");
390
    __asm __volatile(
391
        "movq    %0, %%mm7    \n\t"
392
        "movq    %1, %%mm6    \n\t"
393
        ::"m"(red_16mask),"m"(green_16mask));
394
    while (s < mm_end)
395
    {
396
        __asm __volatile(
397
        PREFETCH"    32%1           \n\t"
398
        "movd          %1, %%mm0    \n\t"
399
        "movd         4%1, %%mm3    \n\t"
400
        "punpckldq    8%1, %%mm0    \n\t"
401
        "punpckldq   12%1, %%mm3    \n\t"
402
        "movq       %%mm0, %%mm1    \n\t"
403
        "movq       %%mm0, %%mm2    \n\t"
404
        "movq       %%mm3, %%mm4    \n\t"
405
        "movq       %%mm3, %%mm5    \n\t"
406
        "psrlq         $3, %%mm0    \n\t"
407
        "psrlq         $3, %%mm3    \n\t"
408
        "pand          %2, %%mm0    \n\t"
409
        "pand          %2, %%mm3    \n\t"
410
        "psrlq         $5, %%mm1    \n\t"
411
        "psrlq         $5, %%mm4    \n\t"
412
        "pand       %%mm6, %%mm1    \n\t"
413
        "pand       %%mm6, %%mm4    \n\t"
414
        "psrlq         $8, %%mm2    \n\t"
415
        "psrlq         $8, %%mm5    \n\t"
416
        "pand       %%mm7, %%mm2    \n\t"
417
        "pand       %%mm7, %%mm5    \n\t"
418
        "por        %%mm1, %%mm0    \n\t"
419
        "por        %%mm4, %%mm3    \n\t"
420
        "por        %%mm2, %%mm0    \n\t"
421
        "por        %%mm5, %%mm3    \n\t"
422
        "psllq        $16, %%mm3    \n\t"
423
        "por        %%mm3, %%mm0    \n\t"
424
        MOVNTQ"     %%mm0, %0       \n\t"
425
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
426
        d += 4;
427
        s += 16;
428
    }
429
#endif
430
    __asm __volatile(SFENCE:::"memory");
431
    __asm __volatile(EMMS:::"memory");
432
#endif
433
    while (s < end)
434
    {
435
        register int rgb = *(uint32_t*)s; s += 4;
436
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
437
    }
438
}
439

    
440
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
441
{
442
    const uint8_t *s = src;
443
    const uint8_t *end;
444
#ifdef HAVE_MMX
445
    const uint8_t *mm_end;
446
#endif
447
    uint16_t *d = (uint16_t *)dst;
448
    end = s + src_size;
449
#ifdef HAVE_MMX
450
    __asm __volatile(PREFETCH"    %0"::"m"(*src):"memory");
451
    __asm __volatile(
452
        "movq          %0, %%mm7    \n\t"
453
        "movq          %1, %%mm6    \n\t"
454
        ::"m"(red_16mask),"m"(green_16mask));
455
    mm_end = end - 15;
456
    while (s < mm_end)
457
    {
458
        __asm __volatile(
459
        PREFETCH"    32%1           \n\t"
460
        "movd          %1, %%mm0    \n\t"
461
        "movd         4%1, %%mm3    \n\t"
462
        "punpckldq    8%1, %%mm0    \n\t"
463
        "punpckldq   12%1, %%mm3    \n\t"
464
        "movq       %%mm0, %%mm1    \n\t"
465
        "movq       %%mm0, %%mm2    \n\t"
466
        "movq       %%mm3, %%mm4    \n\t"
467
        "movq       %%mm3, %%mm5    \n\t"
468
        "psllq         $8, %%mm0    \n\t"
469
        "psllq         $8, %%mm3    \n\t"
470
        "pand       %%mm7, %%mm0    \n\t"
471
        "pand       %%mm7, %%mm3    \n\t"
472
        "psrlq         $5, %%mm1    \n\t"
473
        "psrlq         $5, %%mm4    \n\t"
474
        "pand       %%mm6, %%mm1    \n\t"
475
        "pand       %%mm6, %%mm4    \n\t"
476
        "psrlq        $19, %%mm2    \n\t"
477
        "psrlq        $19, %%mm5    \n\t"
478
        "pand          %2, %%mm2    \n\t"
479
        "pand          %2, %%mm5    \n\t"
480
        "por        %%mm1, %%mm0    \n\t"
481
        "por        %%mm4, %%mm3    \n\t"
482
        "por        %%mm2, %%mm0    \n\t"
483
        "por        %%mm5, %%mm3    \n\t"
484
        "psllq        $16, %%mm3    \n\t"
485
        "por        %%mm3, %%mm0    \n\t"
486
        MOVNTQ"     %%mm0, %0       \n\t"
487
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
488
        d += 4;
489
        s += 16;
490
    }
491
    __asm __volatile(SFENCE:::"memory");
492
    __asm __volatile(EMMS:::"memory");
493
#endif
494
    while (s < end)
495
    {
496
        register int rgb = *(uint32_t*)s; s += 4;
497
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
498
    }
499
}
500

    
501
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
502
{
503
    const uint8_t *s = src;
504
    const uint8_t *end;
505
#ifdef HAVE_MMX
506
    const uint8_t *mm_end;
507
#endif
508
    uint16_t *d = (uint16_t *)dst;
509
    end = s + src_size;
510
#ifdef HAVE_MMX
511
    mm_end = end - 15;
512
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
513
    asm volatile(
514
    "movq           %3, %%mm5   \n\t"
515
    "movq           %4, %%mm6   \n\t"
516
    "movq           %5, %%mm7   \n\t"
517
    "jmp            2f          \n\t"
518
    ASMALIGN(4)
519
    "1:                         \n\t"
520
    PREFETCH"   32(%1)          \n\t"
521
    "movd         (%1), %%mm0   \n\t"
522
    "movd        4(%1), %%mm3   \n\t"
523
    "punpckldq   8(%1), %%mm0   \n\t"
524
    "punpckldq  12(%1), %%mm3   \n\t"
525
    "movq        %%mm0, %%mm1   \n\t"
526
    "movq        %%mm3, %%mm4   \n\t"
527
    "pand        %%mm6, %%mm0   \n\t"
528
    "pand        %%mm6, %%mm3   \n\t"
529
    "pmaddwd     %%mm7, %%mm0   \n\t"
530
    "pmaddwd     %%mm7, %%mm3   \n\t"
531
    "pand        %%mm5, %%mm1   \n\t"
532
    "pand        %%mm5, %%mm4   \n\t"
533
    "por         %%mm1, %%mm0   \n\t"
534
    "por         %%mm4, %%mm3   \n\t"
535
    "psrld          $6, %%mm0   \n\t"
536
    "pslld         $10, %%mm3   \n\t"
537
    "por         %%mm3, %%mm0   \n\t"
538
    MOVNTQ"      %%mm0, (%0)    \n\t"
539
    "add           $16,  %1     \n\t"
540
    "add            $8,  %0     \n\t"
541
    "2:                         \n\t"
542
    "cmp            %2,  %1     \n\t"
543
    " jb            1b          \n\t"
544
    : "+r" (d), "+r"(s)
545
    : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
546
    );
547
#else
548
    __asm __volatile(PREFETCH"    %0"::"m"(*src):"memory");
549
    __asm __volatile(
550
        "movq          %0, %%mm7    \n\t"
551
        "movq          %1, %%mm6    \n\t"
552
        ::"m"(red_15mask),"m"(green_15mask));
553
    while (s < mm_end)
554
    {
555
        __asm __volatile(
556
        PREFETCH"    32%1           \n\t"
557
        "movd          %1, %%mm0    \n\t"
558
        "movd         4%1, %%mm3    \n\t"
559
        "punpckldq    8%1, %%mm0    \n\t"
560
        "punpckldq   12%1, %%mm3    \n\t"
561
        "movq       %%mm0, %%mm1    \n\t"
562
        "movq       %%mm0, %%mm2    \n\t"
563
        "movq       %%mm3, %%mm4    \n\t"
564
        "movq       %%mm3, %%mm5    \n\t"
565
        "psrlq         $3, %%mm0    \n\t"
566
        "psrlq         $3, %%mm3    \n\t"
567
        "pand          %2, %%mm0    \n\t"
568
        "pand          %2, %%mm3    \n\t"
569
        "psrlq         $6, %%mm1    \n\t"
570
        "psrlq         $6, %%mm4    \n\t"
571
        "pand       %%mm6, %%mm1    \n\t"
572
        "pand       %%mm6, %%mm4    \n\t"
573
        "psrlq         $9, %%mm2    \n\t"
574
        "psrlq         $9, %%mm5    \n\t"
575
        "pand       %%mm7, %%mm2    \n\t"
576
        "pand       %%mm7, %%mm5    \n\t"
577
        "por        %%mm1, %%mm0    \n\t"
578
        "por        %%mm4, %%mm3    \n\t"
579
        "por        %%mm2, %%mm0    \n\t"
580
        "por        %%mm5, %%mm3    \n\t"
581
        "psllq        $16, %%mm3    \n\t"
582
        "por        %%mm3, %%mm0    \n\t"
583
        MOVNTQ"     %%mm0, %0       \n\t"
584
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
585
        d += 4;
586
        s += 16;
587
    }
588
#endif
589
    __asm __volatile(SFENCE:::"memory");
590
    __asm __volatile(EMMS:::"memory");
591
#endif
592
    while (s < end)
593
    {
594
        register int rgb = *(uint32_t*)s; s += 4;
595
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
596
    }
597
}
598

    
599
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
600
{
601
    const uint8_t *s = src;
602
    const uint8_t *end;
603
#ifdef HAVE_MMX
604
    const uint8_t *mm_end;
605
#endif
606
    uint16_t *d = (uint16_t *)dst;
607
    end = s + src_size;
608
#ifdef HAVE_MMX
609
    __asm __volatile(PREFETCH"    %0"::"m"(*src):"memory");
610
    __asm __volatile(
611
        "movq          %0, %%mm7    \n\t"
612
        "movq          %1, %%mm6    \n\t"
613
        ::"m"(red_15mask),"m"(green_15mask));
614
    mm_end = end - 15;
615
    while (s < mm_end)
616
    {
617
        __asm __volatile(
618
        PREFETCH"    32%1           \n\t"
619
        "movd          %1, %%mm0    \n\t"
620
        "movd         4%1, %%mm3    \n\t"
621
        "punpckldq    8%1, %%mm0    \n\t"
622
        "punpckldq   12%1, %%mm3    \n\t"
623
        "movq       %%mm0, %%mm1    \n\t"
624
        "movq       %%mm0, %%mm2    \n\t"
625
        "movq       %%mm3, %%mm4    \n\t"
626
        "movq       %%mm3, %%mm5    \n\t"
627
        "psllq         $7, %%mm0    \n\t"
628
        "psllq         $7, %%mm3    \n\t"
629
        "pand       %%mm7, %%mm0    \n\t"
630
        "pand       %%mm7, %%mm3    \n\t"
631
        "psrlq         $6, %%mm1    \n\t"
632
        "psrlq         $6, %%mm4    \n\t"
633
        "pand       %%mm6, %%mm1    \n\t"
634
        "pand       %%mm6, %%mm4    \n\t"
635
        "psrlq        $19, %%mm2    \n\t"
636
        "psrlq        $19, %%mm5    \n\t"
637
        "pand          %2, %%mm2    \n\t"
638
        "pand          %2, %%mm5    \n\t"
639
        "por        %%mm1, %%mm0    \n\t"
640
        "por        %%mm4, %%mm3    \n\t"
641
        "por        %%mm2, %%mm0    \n\t"
642
        "por        %%mm5, %%mm3    \n\t"
643
        "psllq        $16, %%mm3    \n\t"
644
        "por        %%mm3, %%mm0    \n\t"
645
        MOVNTQ"     %%mm0, %0       \n\t"
646
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
647
        d += 4;
648
        s += 16;
649
    }
650
    __asm __volatile(SFENCE:::"memory");
651
    __asm __volatile(EMMS:::"memory");
652
#endif
653
    while (s < end)
654
    {
655
        register int rgb = *(uint32_t*)s; s += 4;
656
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
657
    }
658
}
659

    
660
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
661
{
662
    const uint8_t *s = src;
663
    const uint8_t *end;
664
#ifdef HAVE_MMX
665
    const uint8_t *mm_end;
666
#endif
667
    uint16_t *d = (uint16_t *)dst;
668
    end = s + src_size;
669
#ifdef HAVE_MMX
670
    __asm __volatile(PREFETCH"    %0"::"m"(*src):"memory");
671
    __asm __volatile(
672
        "movq         %0, %%mm7     \n\t"
673
        "movq         %1, %%mm6     \n\t"
674
        ::"m"(red_16mask),"m"(green_16mask));
675
    mm_end = end - 11;
676
    while (s < mm_end)
677
    {
678
        __asm __volatile(
679
        PREFETCH"    32%1           \n\t"
680
        "movd          %1, %%mm0    \n\t"
681
        "movd         3%1, %%mm3    \n\t"
682
        "punpckldq    6%1, %%mm0    \n\t"
683
        "punpckldq    9%1, %%mm3    \n\t"
684
        "movq       %%mm0, %%mm1    \n\t"
685
        "movq       %%mm0, %%mm2    \n\t"
686
        "movq       %%mm3, %%mm4    \n\t"
687
        "movq       %%mm3, %%mm5    \n\t"
688
        "psrlq         $3, %%mm0    \n\t"
689
        "psrlq         $3, %%mm3    \n\t"
690
        "pand          %2, %%mm0    \n\t"
691
        "pand          %2, %%mm3    \n\t"
692
        "psrlq         $5, %%mm1    \n\t"
693
        "psrlq         $5, %%mm4    \n\t"
694
        "pand       %%mm6, %%mm1    \n\t"
695
        "pand       %%mm6, %%mm4    \n\t"
696
        "psrlq         $8, %%mm2    \n\t"
697
        "psrlq         $8, %%mm5    \n\t"
698
        "pand       %%mm7, %%mm2    \n\t"
699
        "pand       %%mm7, %%mm5    \n\t"
700
        "por        %%mm1, %%mm0    \n\t"
701
        "por        %%mm4, %%mm3    \n\t"
702
        "por        %%mm2, %%mm0    \n\t"
703
        "por        %%mm5, %%mm3    \n\t"
704
        "psllq        $16, %%mm3    \n\t"
705
        "por        %%mm3, %%mm0    \n\t"
706
        MOVNTQ"     %%mm0, %0       \n\t"
707
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
708
        d += 4;
709
        s += 12;
710
    }
711
    __asm __volatile(SFENCE:::"memory");
712
    __asm __volatile(EMMS:::"memory");
713
#endif
714
    while (s < end)
715
    {
716
        const int b = *s++;
717
        const int g = *s++;
718
        const int r = *s++;
719
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
720
    }
721
}
722

    
723
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
724
{
725
    const uint8_t *s = src;
726
    const uint8_t *end;
727
#ifdef HAVE_MMX
728
    const uint8_t *mm_end;
729
#endif
730
    uint16_t *d = (uint16_t *)dst;
731
    end = s + src_size;
732
#ifdef HAVE_MMX
733
    __asm __volatile(PREFETCH"    %0"::"m"(*src):"memory");
734
    __asm __volatile(
735
        "movq         %0, %%mm7     \n\t"
736
        "movq         %1, %%mm6     \n\t"
737
        ::"m"(red_16mask),"m"(green_16mask));
738
    mm_end = end - 15;
739
    while (s < mm_end)
740
    {
741
        __asm __volatile(
742
        PREFETCH"    32%1           \n\t"
743
        "movd          %1, %%mm0    \n\t"
744
        "movd         3%1, %%mm3    \n\t"
745
        "punpckldq    6%1, %%mm0    \n\t"
746
        "punpckldq    9%1, %%mm3    \n\t"
747
        "movq       %%mm0, %%mm1    \n\t"
748
        "movq       %%mm0, %%mm2    \n\t"
749
        "movq       %%mm3, %%mm4    \n\t"
750
        "movq       %%mm3, %%mm5    \n\t"
751
        "psllq         $8, %%mm0    \n\t"
752
        "psllq         $8, %%mm3    \n\t"
753
        "pand       %%mm7, %%mm0    \n\t"
754
        "pand       %%mm7, %%mm3    \n\t"
755
        "psrlq         $5, %%mm1    \n\t"
756
        "psrlq         $5, %%mm4    \n\t"
757
        "pand       %%mm6, %%mm1    \n\t"
758
        "pand       %%mm6, %%mm4    \n\t"
759
        "psrlq        $19, %%mm2    \n\t"
760
        "psrlq        $19, %%mm5    \n\t"
761
        "pand          %2, %%mm2    \n\t"
762
        "pand          %2, %%mm5    \n\t"
763
        "por        %%mm1, %%mm0    \n\t"
764
        "por        %%mm4, %%mm3    \n\t"
765
        "por        %%mm2, %%mm0    \n\t"
766
        "por        %%mm5, %%mm3    \n\t"
767
        "psllq        $16, %%mm3    \n\t"
768
        "por        %%mm3, %%mm0    \n\t"
769
        MOVNTQ"     %%mm0, %0       \n\t"
770
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
771
        d += 4;
772
        s += 12;
773
    }
774
    __asm __volatile(SFENCE:::"memory");
775
    __asm __volatile(EMMS:::"memory");
776
#endif
777
    while (s < end)
778
    {
779
        const int r = *s++;
780
        const int g = *s++;
781
        const int b = *s++;
782
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
783
    }
784
}
785

    
786
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
787
{
788
    const uint8_t *s = src;
789
    const uint8_t *end;
790
#ifdef HAVE_MMX
791
    const uint8_t *mm_end;
792
#endif
793
    uint16_t *d = (uint16_t *)dst;
794
    end = s + src_size;
795
#ifdef HAVE_MMX
796
    __asm __volatile(PREFETCH"    %0"::"m"(*src):"memory");
797
    __asm __volatile(
798
        "movq          %0, %%mm7    \n\t"
799
        "movq          %1, %%mm6    \n\t"
800
        ::"m"(red_15mask),"m"(green_15mask));
801
    mm_end = end - 11;
802
    while (s < mm_end)
803
    {
804
        __asm __volatile(
805
        PREFETCH"    32%1           \n\t"
806
        "movd          %1, %%mm0    \n\t"
807
        "movd         3%1, %%mm3    \n\t"
808
        "punpckldq    6%1, %%mm0    \n\t"
809
        "punpckldq    9%1, %%mm3    \n\t"
810
        "movq       %%mm0, %%mm1    \n\t"
811
        "movq       %%mm0, %%mm2    \n\t"
812
        "movq       %%mm3, %%mm4    \n\t"
813
        "movq       %%mm3, %%mm5    \n\t"
814
        "psrlq         $3, %%mm0    \n\t"
815
        "psrlq         $3, %%mm3    \n\t"
816
        "pand          %2, %%mm0    \n\t"
817
        "pand          %2, %%mm3    \n\t"
818
        "psrlq         $6, %%mm1    \n\t"
819
        "psrlq         $6, %%mm4    \n\t"
820
        "pand       %%mm6, %%mm1    \n\t"
821
        "pand       %%mm6, %%mm4    \n\t"
822
        "psrlq         $9, %%mm2    \n\t"
823
        "psrlq         $9, %%mm5    \n\t"
824
        "pand       %%mm7, %%mm2    \n\t"
825
        "pand       %%mm7, %%mm5    \n\t"
826
        "por        %%mm1, %%mm0    \n\t"
827
        "por        %%mm4, %%mm3    \n\t"
828
        "por        %%mm2, %%mm0    \n\t"
829
        "por        %%mm5, %%mm3    \n\t"
830
        "psllq        $16, %%mm3    \n\t"
831
        "por        %%mm3, %%mm0    \n\t"
832
        MOVNTQ"     %%mm0, %0       \n\t"
833
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
834
        d += 4;
835
        s += 12;
836
    }
837
    __asm __volatile(SFENCE:::"memory");
838
    __asm __volatile(EMMS:::"memory");
839
#endif
840
    while (s < end)
841
    {
842
        const int b = *s++;
843
        const int g = *s++;
844
        const int r = *s++;
845
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
846
    }
847
}
848

    
849
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
850
{
851
    const uint8_t *s = src;
852
    const uint8_t *end;
853
#ifdef HAVE_MMX
854
    const uint8_t *mm_end;
855
#endif
856
    uint16_t *d = (uint16_t *)dst;
857
    end = s + src_size;
858
#ifdef HAVE_MMX
859
    __asm __volatile(PREFETCH"    %0"::"m"(*src):"memory");
860
    __asm __volatile(
861
        "movq         %0, %%mm7     \n\t"
862
        "movq         %1, %%mm6     \n\t"
863
        ::"m"(red_15mask),"m"(green_15mask));
864
    mm_end = end - 15;
865
    while (s < mm_end)
866
    {
867
        __asm __volatile(
868
        PREFETCH"   32%1            \n\t"
869
        "movd         %1, %%mm0     \n\t"
870
        "movd        3%1, %%mm3     \n\t"
871
        "punpckldq   6%1, %%mm0     \n\t"
872
        "punpckldq   9%1, %%mm3     \n\t"
873
        "movq      %%mm0, %%mm1     \n\t"
874
        "movq      %%mm0, %%mm2     \n\t"
875
        "movq      %%mm3, %%mm4     \n\t"
876
        "movq      %%mm3, %%mm5     \n\t"
877
        "psllq        $7, %%mm0     \n\t"
878
        "psllq        $7, %%mm3     \n\t"
879
        "pand      %%mm7, %%mm0     \n\t"
880
        "pand      %%mm7, %%mm3     \n\t"
881
        "psrlq        $6, %%mm1     \n\t"
882
        "psrlq        $6, %%mm4     \n\t"
883
        "pand      %%mm6, %%mm1     \n\t"
884
        "pand      %%mm6, %%mm4     \n\t"
885
        "psrlq       $19, %%mm2     \n\t"
886
        "psrlq       $19, %%mm5     \n\t"
887
        "pand         %2, %%mm2     \n\t"
888
        "pand         %2, %%mm5     \n\t"
889
        "por       %%mm1, %%mm0     \n\t"
890
        "por       %%mm4, %%mm3     \n\t"
891
        "por       %%mm2, %%mm0     \n\t"
892
        "por       %%mm5, %%mm3     \n\t"
893
        "psllq       $16, %%mm3     \n\t"
894
        "por       %%mm3, %%mm0     \n\t"
895
        MOVNTQ"    %%mm0, %0        \n\t"
896
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
897
        d += 4;
898
        s += 12;
899
    }
900
    __asm __volatile(SFENCE:::"memory");
901
    __asm __volatile(EMMS:::"memory");
902
#endif
903
    while (s < end)
904
    {
905
        const int r = *s++;
906
        const int g = *s++;
907
        const int b = *s++;
908
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
909
    }
910
}
911

    
912
/*
  A less accurate approximation is used here: the input value is simply
  left-shifted and the low-order bits are filled with zeroes. This method
  improves PNG compression, but it cannot reproduce white exactly, since it
  does not generate an all-ones maximum value; the net effect is to darken
  the image slightly.

  The better method would be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      Leftmost Bits Repeated to Fill Open Bits
       |
   Original Bits
*/
/**
 * Expand 16-bit pixels holding three 5-bit fields (masks 0x001F, 0x03E0,
 * 0x7C00) into three bytes per pixel. The bytes are written in the order
 * low field, middle field, high field; each field is moved to the top of
 * its byte and the low three bits are left zero.
 *
 * @param src      source pixels, read as native-endian uint16_t
 * @param dst      destination, receives 3 bytes per source pixel
 * @param src_size size of the source in bytes (src_size/2 pixels are converted)
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* The MMX loop consumes 8 pixels (16 bytes) and emits 24 bytes per pass. */
    mm_end = end - 7;
    while (s < mm_end)
    {
        /* Stage 1: unpack two groups of 4 pixels into 32-bit-per-pixel form.
           The first group ends up in mm6/mm7, the second in mm0/mm3. */
        __asm __volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq          %1, %%mm1    \n\t"
        "movq          %1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $2, %%mm1    \n\t"
        "psrlq         $7, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd     %5, %%mm0    \n\t"
        "punpcklwd     %5, %%mm1    \n\t"
        "punpcklwd     %5, %%mm2    \n\t"
        "punpckhwd     %5, %%mm3    \n\t"
        "punpckhwd     %5, %%mm4    \n\t"
        "punpckhwd     %5, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"

        "movq       %%mm0, %%mm6    \n\t"
        "movq       %%mm3, %%mm7    \n\t"

        "movq         8%1, %%mm0    \n\t"
        "movq         8%1, %%mm1    \n\t"
        "movq         8%1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $2, %%mm1    \n\t"
        "psrlq         $7, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd     %5, %%mm0    \n\t"
        "punpcklwd     %5, %%mm1    \n\t"
        "punpcklwd     %5, %%mm2    \n\t"
        "punpckhwd     %5, %%mm3    \n\t"
        "punpckhwd     %5, %%mm4    \n\t"
        "punpckhwd     %5, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"

        :"=m"(*d)
        :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
        :"memory");
        /* Stage 2 (borrowed from the 32 -> 24 conversion): pack the four
           registers left by stage 1 into 24 output bytes. This statement
           depends on mm0/mm3/mm6/mm7 surviving from the previous asm. */
        __asm __volatile(
        "movq       %%mm0, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "movq       %%mm6, %%mm0    \n\t"
        "movq       %%mm7, %%mm1    \n\t"

        "movq       %%mm4, %%mm6    \n\t"
        "movq       %%mm5, %%mm7    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm1, %%mm3    \n\t"

        "psrlq         $8, %%mm2    \n\t"
        "psrlq         $8, %%mm3    \n\t"
        "psrlq         $8, %%mm6    \n\t"
        "psrlq         $8, %%mm7    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm1    \n\t"
        "pand          %2, %%mm4    \n\t"
        "pand          %2, %%mm5    \n\t"
        "pand          %3, %%mm2    \n\t"
        "pand          %3, %%mm3    \n\t"
        "pand          %3, %%mm6    \n\t"
        "pand          %3, %%mm7    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "por        %%mm6, %%mm4    \n\t"
        "por        %%mm7, %%mm5    \n\t"

        "movq       %%mm1, %%mm2    \n\t"
        "movq       %%mm4, %%mm3    \n\t"
        "psllq        $48, %%mm2    \n\t"
        "psllq        $32, %%mm3    \n\t"
        "pand          %4, %%mm2    \n\t"
        "pand          %5, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psrlq        $16, %%mm1    \n\t"
        "psrlq        $32, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "pand          %6, %%mm5    \n\t"
        "por        %%mm5, %%mm4    \n\t"

        MOVNTQ"     %%mm0,   %0     \n\t"
        MOVNTQ"     %%mm1,  8%0     \n\t"
        MOVNTQ"     %%mm4, 16%0"

        :"=m"(*d)
        :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: handles everything without MMX, or the pixels the MMX
       loop left over. */
    while (s < end)
    {
        uint16_t bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
1074

    
1075
/**
 * Expand 16-bit pixels holding a 5-bit, a 6-bit and a 5-bit field (masks
 * 0x001F, 0x07E0, 0xF800) into three bytes per pixel. The bytes are written
 * in the order low field, middle field, high field; each field is moved to
 * the top of its byte and the remaining low bits are left zero.
 *
 * @param src      source pixels, read as native-endian uint16_t
 * @param dst      destination, receives 3 bytes per source pixel
 * @param src_size size of the source in bytes (src_size/2 pixels are converted)
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* The MMX loop consumes 8 pixels (16 bytes) and emits 24 bytes per pass. */
    mm_end = end - 7;
    while (s < mm_end)
    {
        /* Stage 1: unpack two groups of 4 pixels into 32-bit-per-pixel form.
           The first group ends up in mm6/mm7, the second in mm0/mm3. */
        __asm __volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq          %1, %%mm1    \n\t"
        "movq          %1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm1    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd     %5, %%mm0    \n\t"
        "punpcklwd     %5, %%mm1    \n\t"
        "punpcklwd     %5, %%mm2    \n\t"
        "punpckhwd     %5, %%mm3    \n\t"
        "punpckhwd     %5, %%mm4    \n\t"
        "punpckhwd     %5, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"

        "movq       %%mm0, %%mm6    \n\t"
        "movq       %%mm3, %%mm7    \n\t"

        "movq         8%1, %%mm0    \n\t"
        "movq         8%1, %%mm1    \n\t"
        "movq         8%1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm1    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd     %5, %%mm0    \n\t"
        "punpcklwd     %5, %%mm1    \n\t"
        "punpcklwd     %5, %%mm2    \n\t"
        "punpckhwd     %5, %%mm3    \n\t"
        "punpckhwd     %5, %%mm4    \n\t"
        "punpckhwd     %5, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        :"=m"(*d)
        :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
        :"memory");
        /* Stage 2 (borrowed from the 32 -> 24 conversion): pack the four
           registers left by stage 1 into 24 output bytes. This statement
           depends on mm0/mm3/mm6/mm7 surviving from the previous asm. */
        __asm __volatile(
        "movq       %%mm0, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "movq       %%mm6, %%mm0    \n\t"
        "movq       %%mm7, %%mm1    \n\t"

        "movq       %%mm4, %%mm6    \n\t"
        "movq       %%mm5, %%mm7    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm1, %%mm3    \n\t"

        "psrlq         $8, %%mm2    \n\t"
        "psrlq         $8, %%mm3    \n\t"
        "psrlq         $8, %%mm6    \n\t"
        "psrlq         $8, %%mm7    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %2, %%mm1    \n\t"
        "pand          %2, %%mm4    \n\t"
        "pand          %2, %%mm5    \n\t"
        "pand          %3, %%mm2    \n\t"
        "pand          %3, %%mm3    \n\t"
        "pand          %3, %%mm6    \n\t"
        "pand          %3, %%mm7    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "por        %%mm6, %%mm4    \n\t"
        "por        %%mm7, %%mm5    \n\t"

        "movq       %%mm1, %%mm2    \n\t"
        "movq       %%mm4, %%mm3    \n\t"
        "psllq        $48, %%mm2    \n\t"
        "psllq        $32, %%mm3    \n\t"
        "pand          %4, %%mm2    \n\t"
        "pand          %5, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psrlq        $16, %%mm1    \n\t"
        "psrlq        $32, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm3, %%mm1    \n\t"
        "pand          %6, %%mm5    \n\t"
        "por        %%mm5, %%mm4    \n\t"

        MOVNTQ"     %%mm0,   %0     \n\t"
        MOVNTQ"     %%mm1,  8%0     \n\t"
        MOVNTQ"     %%mm4, 16%0"

        :"=m"(*d)
        :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: handles everything without MMX, or the pixels the MMX
       loop left over. */
    while (s < end)
    {
        uint16_t bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
1215

    
1216
/**
 * Expand 16-bit pixels holding three 5-bit fields (masks 0x001F, 0x03E0,
 * 0x7C00) into four bytes per pixel.
 *
 * In the scalar path the byte order is low field, middle field, high field,
 * 0 — or the reverse (0 first) when WORDS_BIGENDIAN is defined. Each field
 * is moved to the top of its byte; the low three bits are left zero.
 *
 * @param src      source pixels, read as native-endian uint16_t
 * @param dst      destination, receives 4 bytes per source pixel
 * @param src_size size of the source in bytes (src_size/2 pixels are converted)
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* mm7 is zeroed once here and used as the zero operand of the unpack
       instructions inside the loop. */
    __asm __volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    /* The MMX loop consumes 4 pixels (8 bytes) and emits 16 bytes per pass. */
    mm_end = end - 3;
    while (s < mm_end)
    {
        __asm __volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq          %1, %%mm1    \n\t"
        "movq          %1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $2, %%mm1    \n\t"
        "psrlq         $7, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd  %%mm7, %%mm0    \n\t"
        "punpcklwd  %%mm7, %%mm1    \n\t"
        "punpcklwd  %%mm7, %%mm2    \n\t"
        "punpckhwd  %%mm7, %%mm3    \n\t"
        "punpckhwd  %%mm7, %%mm4    \n\t"
        "punpckhwd  %%mm7, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        MOVNTQ"     %%mm0,  %0      \n\t"
        MOVNTQ"     %%mm3, 8%0      \n\t"
        :"=m"(*d)
        :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
        :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: handles everything without MMX, or the pixels the MMX
       loop left over. */
    while (s < end)
    {
        uint16_t bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 0;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 0;
#endif
    }
}
1293

    
1294
/**
 * Expand 16-bit pixels holding a 5-bit, a 6-bit and a 5-bit field (masks
 * 0x001F, 0x07E0, 0xF800) into four bytes per pixel.
 *
 * In the scalar path the byte order is low field, middle field, high field,
 * 0 — or the reverse (0 first) when WORDS_BIGENDIAN is defined. Each field
 * is moved to the top of its byte; the remaining low bits are left zero.
 *
 * @param src      source pixels, read as native-endian uint16_t
 * @param dst      destination, receives 4 bytes per source pixel
 * @param src_size size of the source in bytes (src_size/2 pixels are converted)
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* mm7 is zeroed once here and used as the zero operand of the unpack
       instructions inside the loop. */
    __asm __volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    /* The MMX loop consumes 4 pixels (8 bytes) and emits 16 bytes per pass. */
    mm_end = end - 3;
    while (s < mm_end)
    {
        __asm __volatile(
        PREFETCH"    32%1           \n\t"
        "movq          %1, %%mm0    \n\t"
        "movq          %1, %%mm1    \n\t"
        "movq          %1, %%mm2    \n\t"
        "pand          %2, %%mm0    \n\t"
        "pand          %3, %%mm1    \n\t"
        "pand          %4, %%mm2    \n\t"
        "psllq         $3, %%mm0    \n\t"
        "psrlq         $3, %%mm1    \n\t"
        "psrlq         $8, %%mm2    \n\t"
        "movq       %%mm0, %%mm3    \n\t"
        "movq       %%mm1, %%mm4    \n\t"
        "movq       %%mm2, %%mm5    \n\t"
        "punpcklwd  %%mm7, %%mm0    \n\t"
        "punpcklwd  %%mm7, %%mm1    \n\t"
        "punpcklwd  %%mm7, %%mm2    \n\t"
        "punpckhwd  %%mm7, %%mm3    \n\t"
        "punpckhwd  %%mm7, %%mm4    \n\t"
        "punpckhwd  %%mm7, %%mm5    \n\t"
        "psllq         $8, %%mm1    \n\t"
        "psllq        $16, %%mm2    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "psllq         $8, %%mm4    \n\t"
        "psllq        $16, %%mm5    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        MOVNTQ"     %%mm0, %0       \n\t"
        MOVNTQ"     %%mm3, 8%0      \n\t"
        :"=m"(*d)
        :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
        :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: handles everything without MMX, or the pixels the MMX
       loop left over. */
    while (s < end)
    {
        uint16_t bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 0;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 0;
#endif
    }
}
1365

    
1366
/**
 * For every 32-bit pixel, exchange the bits selected by 0x000000ff with the
 * bits selected by 0x00ff0000 and keep the bits selected by 0xff00ff00.
 * Pixels are read and written as native-endian uint32_t.
 *
 * @param src      source pixels, 4 bytes each
 * @param dst      destination, same size as the source
 * @param src_size size of the source in bytes
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    /* idx runs from 15 - src_size up towards 15; s and d are biased by -idx
       so that s[idx] / d[idx] address the current position. */
    long idx = 15 - src_size;
    const uint8_t *s = src - idx;
    uint8_t *d = dst - idx;
#ifdef HAVE_MMX
    /* Processes 16 bytes per iteration while idx is negative; the scalar
       loop below finishes whatever is left. */
    __asm __volatile(
    "test          %0, %0           \n\t"
    "jns           2f               \n\t"
    PREFETCH"       (%1, %0)        \n\t"
    "movq          %3, %%mm7        \n\t"
    "pxor          %4, %%mm7        \n\t"
    "movq       %%mm7, %%mm6        \n\t"
    "pxor          %5, %%mm7        \n\t"
    ASMALIGN(4)
    "1:                             \n\t"
    PREFETCH"     32(%1, %0)        \n\t"
    "movq           (%1, %0), %%mm0 \n\t"
    "movq          8(%1, %0), %%mm1 \n\t"
# ifdef HAVE_MMX2
    "pshufw      $177, %%mm0, %%mm3 \n\t"
    "pshufw      $177, %%mm1, %%mm5 \n\t"
    "pand       %%mm7, %%mm0        \n\t"
    "pand       %%mm6, %%mm3        \n\t"
    "pand       %%mm7, %%mm1        \n\t"
    "pand       %%mm6, %%mm5        \n\t"
    "por        %%mm3, %%mm0        \n\t"
    "por        %%mm5, %%mm1        \n\t"
# else
    "movq       %%mm0, %%mm2        \n\t"
    "movq       %%mm1, %%mm4        \n\t"
    "pand       %%mm7, %%mm0        \n\t"
    "pand       %%mm6, %%mm2        \n\t"
    "pand       %%mm7, %%mm1        \n\t"
    "pand       %%mm6, %%mm4        \n\t"
    "movq       %%mm2, %%mm3        \n\t"
    "movq       %%mm4, %%mm5        \n\t"
    "pslld        $16, %%mm2        \n\t"
    "psrld        $16, %%mm3        \n\t"
    "pslld        $16, %%mm4        \n\t"
    "psrld        $16, %%mm5        \n\t"
    "por        %%mm2, %%mm0        \n\t"
    "por        %%mm4, %%mm1        \n\t"
    "por        %%mm3, %%mm0        \n\t"
    "por        %%mm5, %%mm1        \n\t"
# endif
    MOVNTQ"     %%mm0,  (%2, %0)    \n\t"
    MOVNTQ"     %%mm1, 8(%2, %0)    \n\t"
    "add          $16, %0           \n\t"
    "js            1b               \n\t"
    SFENCE"                         \n\t"
    EMMS"                           \n\t"
    "2:                             \n\t"
    : "+&r"(idx)
    : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
    : "memory");
#endif
    for (; idx<15; idx+=4) {
        /* Unsigned arithmetic on purpose: with a signed int, v<<16 and the
           sum overflow for pixels whose upper bits are set. */
        uint32_t v = *(const uint32_t *)&s[idx];
        const uint32_t g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}
1428

    
1429
/**
 * Swap the first and third byte of every 3-byte pixel; the middle byte is
 * copied unchanged.
 *
 * Only complete 3-byte pixels are converted by the scalar loop: if src_size
 * is not a multiple of 3, the trailing 1 or 2 bytes are neither read nor
 * written.
 *
 * @param src      source pixels, 3 bytes each
 * @param dst      destination, same size as the source
 * @param src_size size of the source in bytes
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    long i; /* same type as src_size, so large sizes cannot wrap the index */
#ifdef HAVE_MMX
    /* The MMX loop handles 24 bytes (8 pixels) per iteration and runs while
       mmx_size is negative. */
    long mmx_size= 23 - src_size;
    asm volatile (
    "test             %%"REG_a", %%"REG_a"          \n\t"
    "jns                     2f                     \n\t"
    "movq     "MANGLE(mask24r)", %%mm5              \n\t"
    "movq     "MANGLE(mask24g)", %%mm6              \n\t"
    "movq     "MANGLE(mask24b)", %%mm7              \n\t"
    ASMALIGN(4)
    "1:                                             \n\t"
    PREFETCH" 32(%1, %%"REG_a")                     \n\t"
    "movq       (%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
    "movq       (%1, %%"REG_a"), %%mm1              \n\t" // BGR BGR BG
    "movq      2(%1, %%"REG_a"), %%mm2              \n\t" // R BGR BGR B
    "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
    "pand                 %%mm5, %%mm0              \n\t"
    "pand                 %%mm6, %%mm1              \n\t"
    "pand                 %%mm7, %%mm2              \n\t"
    "por                  %%mm0, %%mm1              \n\t"
    "por                  %%mm2, %%mm1              \n\t"
    "movq      6(%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
    MOVNTQ"               %%mm1,   (%2, %%"REG_a")  \n\t" // RGB RGB RG
    "movq      8(%1, %%"REG_a"), %%mm1              \n\t" // R BGR BGR B
    "movq     10(%1, %%"REG_a"), %%mm2              \n\t" // GR BGR BGR
    "pand                 %%mm7, %%mm0              \n\t"
    "pand                 %%mm5, %%mm1              \n\t"
    "pand                 %%mm6, %%mm2              \n\t"
    "por                  %%mm0, %%mm1              \n\t"
    "por                  %%mm2, %%mm1              \n\t"
    "movq     14(%1, %%"REG_a"), %%mm0              \n\t" // R BGR BGR B
    MOVNTQ"               %%mm1,  8(%2, %%"REG_a")  \n\t" // B RGB RGB R
    "movq     16(%1, %%"REG_a"), %%mm1              \n\t" // GR BGR BGR
    "movq     18(%1, %%"REG_a"), %%mm2              \n\t" // BGR BGR BG
    "pand                 %%mm6, %%mm0              \n\t"
    "pand                 %%mm7, %%mm1              \n\t"
    "pand                 %%mm5, %%mm2              \n\t"
    "por                  %%mm0, %%mm1              \n\t"
    "por                  %%mm2, %%mm1              \n\t"
    MOVNTQ"               %%mm1, 16(%2, %%"REG_a")  \n\t"
    "add                    $24, %%"REG_a"          \n\t"
    " js                     1b                     \n\t"
    "2:                                             \n\t"
    : "+a" (mmx_size)
    : "r" (src-mmx_size), "r"(dst-mmx_size)
    : "memory" /* the asm loads from src and stores to dst */
    );

    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");

    if (mmx_size==23) return; // finished, size was a multiple of 24

    /* Re-point src/dst/src_size at the bytes the MMX loop did not handle. */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    /* i + 2 < src_size: never touch a pixel that is not fully inside the
       buffer. */
    for (i = 0; i + 2 < src_size; i += 3)
    {
        /* Read the third byte before writing anything so that an in-place
           conversion (src == dst) still works. */
        const uint8_t x = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
1498

    
1499
/**
 * Interleave separate Y, U and V planes into packed 4-byte groups laid out
 * as Y0 U Y1 V (two luma samples sharing one U and one V sample).
 *
 * The chroma pointers advance by chromStride after every line y for which
 * (y & (vertLumPerChroma-1)) == vertLumPerChroma-1, i.e. one chroma line is
 * reused for vertLumPerChroma consecutive luma lines.
 *
 * @param ysrc             luma plane
 * @param usrc             U plane
 * @param vsrc             V plane
 * @param dst              packed output
 * @param width            luma samples per line (width>>1 chroma samples are used)
 * @param height           number of luma lines
 * @param lumStride        bytes between luma lines
 * @param chromStride      bytes between chroma lines
 * @param dstStride        bytes between output lines
 * @param vertLumPerChroma luma lines per chroma line; the mask test above
 *                         presumably expects a power of two — confirm with callers
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const long chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
#ifdef HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        /* 8 chroma samples (16 luma samples, 32 output bytes) per iteration. */
        asm volatile(
        "xor                 %%"REG_a", %%"REG_a"   \n\t"
        ASMALIGN(4)
        "1:                                         \n\t"
        PREFETCH"    32(%1, %%"REG_a", 2)           \n\t"
        PREFETCH"    32(%2, %%"REG_a")              \n\t"
        PREFETCH"    32(%3, %%"REG_a")              \n\t"
        "movq          (%2, %%"REG_a"), %%mm0       \n\t" // U(0)
        "movq                    %%mm0, %%mm2       \n\t" // U(0)
        "movq          (%3, %%"REG_a"), %%mm1       \n\t" // V(0)
        "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
        "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)

        "movq        (%1, %%"REG_a",2), %%mm3       \n\t" // Y(0)
        "movq       8(%1, %%"REG_a",2), %%mm5       \n\t" // Y(8)
        "movq                    %%mm3, %%mm4       \n\t" // Y(0)
        "movq                    %%mm5, %%mm6       \n\t" // Y(8)
        "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
        "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
        "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
        "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)

        MOVNTQ"                  %%mm3,   (%0, %%"REG_a", 4)    \n\t"
        MOVNTQ"                  %%mm4,  8(%0, %%"REG_a", 4)    \n\t"
        MOVNTQ"                  %%mm5, 16(%0, %%"REG_a", 4)    \n\t"
        MOVNTQ"                  %%mm6, 24(%0, %%"REG_a", 4)    \n\t"

        "add                        $8, %%"REG_a"   \n\t"
        "cmp                        %4, %%"REG_a"   \n\t"
        " jb                        1b              \n\t"
        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
        : "%"REG_a, "memory" /* the asm loads from the planes and stores to dst */
        );
#else

#if defined ARCH_ALPHA && defined HAVE_MVI
#define pl2yuy2(n)                  \
    y1 = yc[n];                     \
    y2 = yc2[n];                    \
    u = uc[n];                      \
    v = vc[n];                      \
    asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));  \
    asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));  \
    asm("unpkbl %1, %0" : "=r"(u) : "r"(u));    \
    asm("unpkbl %1, %0" : "=r"(v) : "r"(v));    \
    yuv1 = (u << 8) + (v << 24);                \
    yuv2 = yuv1 + y2;               \
    yuv1 += y1;                     \
    qdst[n]  = yuv1;                \
    qdst2[n] = yuv2;

        /* This path converts two luma lines per outer iteration (the line at
           ysrc and the one at ysrc + lumStride), hence the extra advance of
           y, ysrc and dst after the inner loop. */
        int i;
        uint64_t *qdst = (uint64_t *) dst;
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
        const uint32_t *yc = (const uint32_t *) ysrc;
        const uint32_t *yc2 = (const uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (const uint16_t*) usrc, *vc = (const uint16_t*) vsrc;
        for (i = 0; i < chromWidth; i += 8){
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
            asm("ldq $31,64(%0)" :: "r"(yc));
            asm("ldq $31,64(%0)" :: "r"(yc2));
            asm("ldq $31,64(%0)" :: "r"(uc));
            asm("ldq $31,64(%0)" :: "r"(vc));

            pl2yuy2(0);
            pl2yuy2(1);
            pl2yuy2(2);
            pl2yuy2(3);

            yc    += 4;
            yc2   += 4;
            uc    += 4;
            vc    += 4;
            qdst  += 4;
            qdst2 += 4;
        }
        y++;
        ysrc += lumStride;
        dst += dstStride;

#elif __WORDSIZE >= 64
        /* Two chroma samples (four luma samples, 8 output bytes) per
           iteration, stored as a single 64-bit word. */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
            /* Pack in unsigned 32-bit halves: with plain int arithmetic a V
               sample >= 128 made the word negative, and its sign extension
               to 64 bits corrupted the upper half after the addition. */
            const uint32_t k = yc[0] | (uc[0] << 8) |
                (yc[1] << 16) | ((uint32_t)vc[0] << 24);
            const uint32_t l = yc[2] | (uc[1] << 8) |
                (yc[3] << 16) | ((uint32_t)vc[1] << 24);
            *ldst++ = k | ((uint64_t)l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        /* One chroma sample (two luma samples, 4 output bytes) per iteration. */
        int i;
        uint32_t *idst = (uint32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
            /* The byte placed at bit 24 is cast to unsigned first; shifting a
               value >= 128 there as a signed int overflows. */
#ifdef WORDS_BIGENDIAN
            *idst++ = ((uint32_t)yc[0] << 24) | (uc[0] << 16) |
                (yc[1] << 8) | (vc[0] << 0);
#else
            *idst++ = yc[0] | (uc[0] << 8) |
                (yc[1] << 16) | ((uint32_t)vc[0] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        if ((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
#ifdef HAVE_MMX
asm(    EMMS"       \n\t"
        SFENCE"     \n\t"
        :::"memory");
#endif
}
1638

    
1639
/**
1640
 * Height should be a multiple of 2 and width should be a multiple of 16 (if
1641
 * this is a problem for anyone then tell me, and I will fix it).
1642
 */
1643
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1644
                                      long width, long height,
1645
                                      long lumStride, long chromStride, long dstStride)
1646
{
1647
    //FIXME interpolate chroma
1648
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1649
}
1650

    
1651
/**
 * Interleave separate Y, U and V planes into packed 4-byte groups laid out
 * as U Y0 V Y1 (two luma samples sharing one U and one V sample).
 *
 * The chroma pointers advance by chromStride after every line y for which
 * (y & (vertLumPerChroma-1)) == vertLumPerChroma-1, i.e. one chroma line is
 * reused for vertLumPerChroma consecutive luma lines.
 *
 * @param ysrc             luma plane
 * @param usrc             U plane
 * @param vsrc             V plane
 * @param dst              packed output
 * @param width            luma samples per line (width>>1 chroma samples are used)
 * @param height           number of luma lines
 * @param lumStride        bytes between luma lines
 * @param chromStride      bytes between chroma lines
 * @param dstStride        bytes between output lines
 * @param vertLumPerChroma luma lines per chroma line; the mask test above
 *                         presumably expects a power of two — confirm with callers
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const long chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
#ifdef HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        /* 8 chroma samples (16 luma samples, 32 output bytes) per iteration. */
        asm volatile(
        "xor                %%"REG_a", %%"REG_a"    \n\t"
        ASMALIGN(4)
        "1:                                         \n\t"
        PREFETCH"   32(%1, %%"REG_a", 2)            \n\t"
        PREFETCH"   32(%2, %%"REG_a")               \n\t"
        PREFETCH"   32(%3, %%"REG_a")               \n\t"
        "movq         (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
        "movq                   %%mm0, %%mm2        \n\t" // U(0)
        "movq         (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
        "punpcklbw              %%mm1, %%mm0        \n\t" // UVUV UVUV(0)
        "punpckhbw              %%mm1, %%mm2        \n\t" // UVUV UVUV(8)

        "movq       (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
        "movq      8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
        "movq                   %%mm0, %%mm4        \n\t" // UVUV UVUV(0)
        "movq                   %%mm2, %%mm6        \n\t" // UVUV UVUV(8)
        "punpcklbw              %%mm3, %%mm0        \n\t" // UYVY UYVY(0)
        "punpckhbw              %%mm3, %%mm4        \n\t" // UYVY UYVY(4)
        "punpcklbw              %%mm5, %%mm2        \n\t" // UYVY UYVY(8)
        "punpckhbw              %%mm5, %%mm6        \n\t" // UYVY UYVY(12)

        MOVNTQ"                 %%mm0,   (%0, %%"REG_a", 4)     \n\t"
        MOVNTQ"                 %%mm4,  8(%0, %%"REG_a", 4)     \n\t"
        MOVNTQ"                 %%mm2, 16(%0, %%"REG_a", 4)     \n\t"
        MOVNTQ"                 %%mm6, 24(%0, %%"REG_a", 4)     \n\t"

        "add                       $8, %%"REG_a"    \n\t"
        "cmp                       %4, %%"REG_a"    \n\t"
        " jb                       1b               \n\t"
        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
        : "%"REG_a, "memory" /* the asm loads from the planes and stores to dst */
        );
#else
//FIXME adapt the Alpha ASM code from yv12->yuy2

#if __WORDSIZE >= 64
        /* Two chroma samples (four luma samples, 8 output bytes) per
           iteration, stored as a single 64-bit word. */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
            /* Pack in unsigned 32-bit halves: with plain int arithmetic a
               luma sample >= 128 at bit 24 made the word negative, and its
               sign extension to 64 bits corrupted the upper half. */
            const uint32_t k = uc[0] | (yc[0] << 8) |
                (vc[0] << 16) | ((uint32_t)yc[1] << 24);
            const uint32_t l = uc[1] | (yc[2] << 8) |
                (vc[1] << 16) | ((uint32_t)yc[3] << 24);
            *ldst++ = k | ((uint64_t)l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        /* One chroma sample (two luma samples, 4 output bytes) per iteration. */
        int i;
        uint32_t *idst = (uint32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
            /* The byte placed at bit 24 is cast to unsigned first; shifting a
               value >= 128 there as a signed int overflows. */
#ifdef WORDS_BIGENDIAN
            *idst++ = ((uint32_t)uc[0] << 24) | (yc[0] << 16) |
                (vc[0] << 8) | (yc[1] << 0);
#else
            *idst++ = uc[0] | (yc[0] << 8) |
                (vc[0] << 16) | ((uint32_t)yc[1] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        if ((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#ifdef HAVE_MMX
asm(    EMMS"       \n\t"
        SFENCE"     \n\t"
        :::"memory");
#endif
}
1744

    
1745
/**
1746
 * Height should be a multiple of 2 and width should be a multiple of 16 (if
1747
 * this is a problem for anyone then tell me, and I will fix it).
1748
 */
1749
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1750
                                      long width, long height,
1751
                                      long lumStride, long chromStride, long dstStride)
1752
{
1753
    //FIXME interpolate chroma
1754
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1755
}
1756

    
1757
/**
1758
 * Width should be a multiple of 16.
1759
 */
1760
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1761
                                         long width, long height,
1762
                                         long lumStride, long chromStride, long dstStride)
1763
{
1764
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1765
}
1766

    
1767
/**
1768
 * Height should be a multiple of 2 and width should be a multiple of 16 (if
1769
 * this is a problem for anyone then tell me, and I will fix it).
1770
 */
1771
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1772
                                      long width, long height,
1773
                                      long lumStride, long chromStride, long srcStride)
1774
{
1775
    long y;
1776
    const long chromWidth= width>>1;
1777
    for (y=0; y<height; y+=2)
1778
    {
1779
#ifdef HAVE_MMX
1780
        asm volatile(
1781
        "xor                 %%"REG_a", %%"REG_a"   \n\t"
1782
        "pcmpeqw                 %%mm7, %%mm7       \n\t"
1783
        "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00...
1784
        ASMALIGN(4)
1785
        "1:                \n\t"
1786
        PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
1787
        "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
1788
        "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
1789
        "movq                    %%mm0, %%mm2       \n\t" // YUYV YUYV(0)
1790
        "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(4)
1791
        "psrlw                      $8, %%mm0       \n\t" // U0V0 U0V0(0)
1792
        "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(4)
1793
        "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
1794
        "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
1795
        "packuswb                %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
1796
        "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(0)
1797

    
1798
        MOVNTQ"                  %%mm2, (%1, %%"REG_a", 2)  \n\t"
1799

    
1800
        "movq     16(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
1801
        "movq     24(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
1802
        "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(8)
1803
        "movq                    %%mm2, %%mm4       \n\t" // YUYV YUYV(12)
1804
        "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(8)
1805
        "psrlw                      $8, %%mm2       \n\t" // U0V0 U0V0(12)
1806
        "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(8)
1807
        "pand                    %%mm7, %%mm4       \n\t" // Y0Y0 Y0Y0(12)
1808
        "packuswb                %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
1809
        "packuswb                %%mm4, %%mm3       \n\t" // YYYY YYYY(8)
1810

    
1811
        MOVNTQ"                  %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1812

    
1813
        "movq                    %%mm0, %%mm2       \n\t" // UVUV UVUV(0)
1814
        "movq                    %%mm1, %%mm3       \n\t" // UVUV UVUV(8)
1815
        "psrlw                      $8, %%mm0       \n\t" // V0V0 V0V0(0)
1816
        "psrlw                      $8, %%mm1       \n\t" // V0V0 V0V0(8)
1817
        "pand                    %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
1818
        "pand                    %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
1819
        "packuswb                %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
1820
        "packuswb                %%mm3, %%mm2       \n\t" // UUUU UUUU(0)
1821

    
1822
        MOVNTQ"                  %%mm0, (%3, %%"REG_a")     \n\t"
1823
        MOVNTQ"                  %%mm2, (%2, %%"REG_a")     \n\t"
1824

    
1825
        "add                        $8, %%"REG_a"   \n\t"
1826
        "cmp                        %4, %%"REG_a"   \n\t"
1827
        " jb                        1b              \n\t"
1828
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1829
        : "memory", "%"REG_a
1830
        );
1831

    
1832
        ydst += lumStride;
1833
        src  += srcStride;
1834

    
1835
        asm volatile(
1836
        "xor                 %%"REG_a", %%"REG_a"   \n\t"
1837
        ASMALIGN(4)
1838
        "1:                                         \n\t"
1839
        PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
1840
        "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
1841
        "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
1842
        "movq     16(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
1843
        "movq     24(%0, %%"REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
1844
        "pand                    %%mm7, %%mm0       \n\t" // Y0Y0 Y0Y0(0)
1845
        "pand                    %%mm7, %%mm1       \n\t" // Y0Y0 Y0Y0(4)
1846
        "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(8)
1847
        "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(12)
1848
        "packuswb                %%mm1, %%mm0       \n\t" // YYYY YYYY(0)
1849
        "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(8)
1850

    
1851
        MOVNTQ"                  %%mm0,  (%1, %%"REG_a", 2) \n\t"
1852
        MOVNTQ"                  %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1853

    
1854
        "add                        $8, %%"REG_a"   \n\t"
1855
        "cmp                        %4, %%"REG_a"   \n\t"
1856
        " jb                        1b              \n\t"
1857

    
1858
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1859
        : "memory", "%"REG_a
1860
        );
1861
#else
1862
        long i;
1863
        for (i=0; i<chromWidth; i++)
1864
        {
1865
            ydst[2*i+0]     = src[4*i+0];
1866
            udst[i]     = src[4*i+1];
1867
            ydst[2*i+1]     = src[4*i+2];
1868
            vdst[i]     = src[4*i+3];
1869
        }
1870
        ydst += lumStride;
1871
        src  += srcStride;
1872

    
1873
        for (i=0; i<chromWidth; i++)
1874
        {
1875
            ydst[2*i+0]     = src[4*i+0];
1876
            ydst[2*i+1]     = src[4*i+2];
1877
        }
1878
#endif
1879
        udst += chromStride;
1880
        vdst += chromStride;
1881
        ydst += lumStride;
1882
        src  += srcStride;
1883
    }
1884
#ifdef HAVE_MMX
1885
asm volatile(   EMMS"       \n\t"
1886
                SFENCE"     \n\t"
1887
                :::"memory");
1888
#endif
1889
}
1890

    
1891
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1892
                                      uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1893
                                      long width, long height, long lumStride, long chromStride)
1894
{
1895
    /* Y Plane */
1896
    memcpy(ydst, ysrc, width*height);
1897

    
1898
    /* XXX: implement upscaling for U,V */
1899
}
1900

    
1901
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1902
{
1903
    long x,y;
1904

    
1905
    dst[0]= src[0];
1906

    
1907
    // first line
1908
    for (x=0; x<srcWidth-1; x++){
1909
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1910
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1911
    }
1912
    dst[2*srcWidth-1]= src[srcWidth-1];
1913

    
1914
        dst+= dstStride;
1915

    
1916
    for (y=1; y<srcHeight; y++){
1917
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1918
        const long mmxSize= srcWidth&~15;
1919
        asm volatile(
1920
        "mov           %4, %%"REG_a"            \n\t"
1921
        "1:                                     \n\t"
1922
        "movq         (%0, %%"REG_a"), %%mm0    \n\t"
1923
        "movq         (%1, %%"REG_a"), %%mm1    \n\t"
1924
        "movq        1(%0, %%"REG_a"), %%mm2    \n\t"
1925
        "movq        1(%1, %%"REG_a"), %%mm3    \n\t"
1926
        "movq       -1(%0, %%"REG_a"), %%mm4    \n\t"
1927
        "movq       -1(%1, %%"REG_a"), %%mm5    \n\t"
1928
        PAVGB"                  %%mm0, %%mm5    \n\t"
1929
        PAVGB"                  %%mm0, %%mm3    \n\t"
1930
        PAVGB"                  %%mm0, %%mm5    \n\t"
1931
        PAVGB"                  %%mm0, %%mm3    \n\t"
1932
        PAVGB"                  %%mm1, %%mm4    \n\t"
1933
        PAVGB"                  %%mm1, %%mm2    \n\t"
1934
        PAVGB"                  %%mm1, %%mm4    \n\t"
1935
        PAVGB"                  %%mm1, %%mm2    \n\t"
1936
        "movq                   %%mm5, %%mm7    \n\t"
1937
        "movq                   %%mm4, %%mm6    \n\t"
1938
        "punpcklbw              %%mm3, %%mm5    \n\t"
1939
        "punpckhbw              %%mm3, %%mm7    \n\t"
1940
        "punpcklbw              %%mm2, %%mm4    \n\t"
1941
        "punpckhbw              %%mm2, %%mm6    \n\t"
1942
#if 1
1943
        MOVNTQ"                 %%mm5,  (%2, %%"REG_a", 2)  \n\t"
1944
        MOVNTQ"                 %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
1945
        MOVNTQ"                 %%mm4,  (%3, %%"REG_a", 2)  \n\t"
1946
        MOVNTQ"                 %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
1947
#else
1948
        "movq                   %%mm5,  (%2, %%"REG_a", 2)  \n\t"
1949
        "movq                   %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
1950
        "movq                   %%mm4,  (%3, %%"REG_a", 2)  \n\t"
1951
        "movq                   %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
1952
#endif
1953
        "add                       $8, %%"REG_a"            \n\t"
1954
        " js                       1b                       \n\t"
1955
        :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1956
           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1957
           "g" (-mmxSize)
1958
        : "%"REG_a
1959

    
1960
        );
1961
#else
1962
        const long mmxSize=1;
1963
#endif
1964
        dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1965
        dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1966

    
1967
        for (x=mmxSize-1; x<srcWidth-1; x++){
1968
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1969
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1970
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1971
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1972
        }
1973
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1974
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1975

    
1976
        dst+=dstStride*2;
1977
        src+=srcStride;
1978
    }
1979

    
1980
    // last line
1981
#if 1
1982
    dst[0]= src[0];
1983

    
1984
    for (x=0; x<srcWidth-1; x++){
1985
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1986
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1987
    }
1988
    dst[2*srcWidth-1]= src[srcWidth-1];
1989
#else
1990
    for (x=0; x<srcWidth; x++){
1991
        dst[2*x+0]=
1992
        dst[2*x+1]= src[x];
1993
    }
1994
#endif
1995

    
1996
#ifdef HAVE_MMX
1997
asm volatile(   EMMS"       \n\t"
1998
                SFENCE"     \n\t"
1999
                :::"memory");
2000
#endif
2001
}
2002

    
2003
/**
2004
 * Height should be a multiple of 2 and width should be a multiple of 16 (if
2005
 * this is a problem for anyone then tell me, and I will fix it).
2006
 * Chrominance data is only taken from every secound line, others are ignored.
2007
 * FIXME: Write HQ version.
2008
 */
2009
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2010
                                      long width, long height,
2011
                                      long lumStride, long chromStride, long srcStride)
2012
{
2013
    long y;
2014
    const long chromWidth= width>>1;
2015
    for (y=0; y<height; y+=2)
2016
    {
2017
#ifdef HAVE_MMX
2018
        asm volatile(
2019
        "xorl                %%eax, %%eax   \n\t"
2020
        "pcmpeqw             %%mm7, %%mm7   \n\t"
2021
        "psrlw                  $8, %%mm7   \n\t" // FF,00,FF,00...
2022
        ASMALIGN(4)
2023
        "1:                                 \n\t"
2024
        PREFETCH" 64(%0, %%eax, 4)          \n\t"
2025
        "movq       (%0, %%eax, 4), %%mm0   \n\t" // UYVY UYVY(0)
2026
        "movq      8(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(4)
2027
        "movq                %%mm0, %%mm2   \n\t" // UYVY UYVY(0)
2028
        "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(4)
2029
        "pand                %%mm7, %%mm0   \n\t" // U0V0 U0V0(0)
2030
        "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(4)
2031
        "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(0)
2032
        "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(4)
2033
        "packuswb            %%mm1, %%mm0   \n\t" // UVUV UVUV(0)
2034
        "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(0)
2035

    
2036
        MOVNTQ"              %%mm2,  (%1, %%eax, 2) \n\t"
2037

    
2038
        "movq     16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
2039
        "movq     24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
2040
        "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(8)
2041
        "movq                %%mm2, %%mm4   \n\t" // UYVY UYVY(12)
2042
        "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(8)
2043
        "pand                %%mm7, %%mm2   \n\t" // U0V0 U0V0(12)
2044
        "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(8)
2045
        "psrlw                  $8, %%mm4   \n\t" // Y0Y0 Y0Y0(12)
2046
        "packuswb            %%mm2, %%mm1   \n\t" // UVUV UVUV(8)
2047
        "packuswb            %%mm4, %%mm3   \n\t" // YYYY YYYY(8)
2048

    
2049
        MOVNTQ"              %%mm3, 8(%1, %%eax, 2) \n\t"
2050

    
2051
        "movq                %%mm0, %%mm2   \n\t" // UVUV UVUV(0)
2052
        "movq                %%mm1, %%mm3   \n\t" // UVUV UVUV(8)
2053
        "psrlw                  $8, %%mm0   \n\t" // V0V0 V0V0(0)
2054
        "psrlw                  $8, %%mm1   \n\t" // V0V0 V0V0(8)
2055
        "pand                %%mm7, %%mm2   \n\t" // U0U0 U0U0(0)
2056
        "pand                %%mm7, %%mm3   \n\t" // U0U0 U0U0(8)
2057
        "packuswb            %%mm1, %%mm0   \n\t" // VVVV VVVV(0)
2058
        "packuswb            %%mm3, %%mm2   \n\t" // UUUU UUUU(0)
2059

    
2060
        MOVNTQ"              %%mm0, (%3, %%eax) \n\t"
2061
        MOVNTQ"              %%mm2, (%2, %%eax) \n\t"
2062

    
2063
        "addl                   $8, %%eax   \n\t"
2064
        "cmpl                   %4, %%eax   \n\t"
2065
        " jb                    1b          \n\t"
2066
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2067
        : "memory", "%eax"
2068
        );
2069

    
2070
        ydst += lumStride;
2071
        src  += srcStride;
2072

    
2073
        asm volatile(
2074
        "xorl                %%eax, %%eax   \n\t"
2075
        ASMALIGN(4)
2076
        "1:                                 \n\t"
2077
        PREFETCH" 64(%0, %%eax, 4)          \n\t"
2078
        "movq       (%0, %%eax, 4), %%mm0   \n\t" // YUYV YUYV(0)
2079
        "movq      8(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(4)
2080
        "movq     16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
2081
        "movq     24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
2082
        "psrlw                  $8, %%mm0   \n\t" // Y0Y0 Y0Y0(0)
2083
        "psrlw                  $8, %%mm1   \n\t" // Y0Y0 Y0Y0(4)
2084
        "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(8)
2085
        "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(12)
2086
        "packuswb            %%mm1, %%mm0   \n\t" // YYYY YYYY(0)
2087
        "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(8)
2088

    
2089
        MOVNTQ"              %%mm0,  (%1, %%eax, 2) \n\t"
2090
        MOVNTQ"              %%mm2, 8(%1, %%eax, 2) \n\t"
2091

    
2092
        "addl                   $8, %%eax   \n\t"
2093
        "cmpl                   %4, %%eax   \n\t"
2094
        " jb                    1b          \n\t"
2095

    
2096
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2097
        : "memory", "%eax"
2098
        );
2099
#else
2100
        long i;
2101
        for (i=0; i<chromWidth; i++)
2102
        {
2103
            udst[i]     = src[4*i+0];
2104
            ydst[2*i+0] = src[4*i+1];
2105
            vdst[i]     = src[4*i+2];
2106
            ydst[2*i+1] = src[4*i+3];
2107
        }
2108
        ydst += lumStride;
2109
        src  += srcStride;
2110

    
2111
        for (i=0; i<chromWidth; i++)
2112
        {
2113
            ydst[2*i+0] = src[4*i+1];
2114
            ydst[2*i+1] = src[4*i+3];
2115
        }
2116
#endif
2117
        udst += chromStride;
2118
        vdst += chromStride;
2119
        ydst += lumStride;
2120
        src  += srcStride;
2121
    }
2122
#ifdef HAVE_MMX
2123
asm volatile(   EMMS"       \n\t"
2124
                SFENCE"     \n\t"
2125
                :::"memory");
2126
#endif
2127
}
2128

    
2129
/**
2130
 * Height should be a multiple of 2 and width should be a multiple of 2 (if
2131
 * this is a problem for anyone then tell me, and I will fix it).
2132
 * Chrominance data is only taken from every second line,
2133
 * others are ignored in the C version.
2134
 * FIXME: Write HQ version.
2135
 */
2136
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2137
                                       long width, long height,
2138
                                       long lumStride, long chromStride, long srcStride)
2139
{
2140
    long y;
2141
    const long chromWidth= width>>1;
2142
#ifdef HAVE_MMX
2143
    for (y=0; y<height-2; y+=2)
2144
    {
2145
        long i;
2146
        for (i=0; i<2; i++)
2147
        {
2148
            asm volatile(
2149
            "mov                        %2, %%"REG_a"   \n\t"
2150
            "movq     "MANGLE(bgr2YCoeff)", %%mm6       \n\t"
2151
            "movq          "MANGLE(w1111)", %%mm5       \n\t"
2152
            "pxor                    %%mm7, %%mm7       \n\t"
2153
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2154
            ASMALIGN(4)
2155
            "1:                                         \n\t"
2156
            PREFETCH"    64(%0, %%"REG_d")              \n\t"
2157
            "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2158
            "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
2159
            "punpcklbw               %%mm7, %%mm0       \n\t"
2160
            "punpcklbw               %%mm7, %%mm1       \n\t"
2161
            "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
2162
            "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
2163
            "punpcklbw               %%mm7, %%mm2       \n\t"
2164
            "punpcklbw               %%mm7, %%mm3       \n\t"
2165
            "pmaddwd                 %%mm6, %%mm0       \n\t"
2166
            "pmaddwd                 %%mm6, %%mm1       \n\t"
2167
            "pmaddwd                 %%mm6, %%mm2       \n\t"
2168
            "pmaddwd                 %%mm6, %%mm3       \n\t"
2169
#ifndef FAST_BGR2YV12
2170
            "psrad                      $8, %%mm0       \n\t"
2171
            "psrad                      $8, %%mm1       \n\t"
2172
            "psrad                      $8, %%mm2       \n\t"
2173
            "psrad                      $8, %%mm3       \n\t"
2174
#endif
2175
            "packssdw                %%mm1, %%mm0       \n\t"
2176
            "packssdw                %%mm3, %%mm2       \n\t"
2177
            "pmaddwd                 %%mm5, %%mm0       \n\t"
2178
            "pmaddwd                 %%mm5, %%mm2       \n\t"
2179
            "packssdw                %%mm2, %%mm0       \n\t"
2180
            "psraw                      $7, %%mm0       \n\t"
2181

    
2182
            "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2183
            "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
2184
            "punpcklbw               %%mm7, %%mm4       \n\t"
2185
            "punpcklbw               %%mm7, %%mm1       \n\t"
2186
            "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
2187
            "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
2188
            "punpcklbw               %%mm7, %%mm2       \n\t"
2189
            "punpcklbw               %%mm7, %%mm3       \n\t"
2190
            "pmaddwd                 %%mm6, %%mm4       \n\t"
2191
            "pmaddwd                 %%mm6, %%mm1       \n\t"
2192
            "pmaddwd                 %%mm6, %%mm2       \n\t"
2193
            "pmaddwd                 %%mm6, %%mm3       \n\t"
2194
#ifndef FAST_BGR2YV12
2195
            "psrad                      $8, %%mm4       \n\t"
2196
            "psrad                      $8, %%mm1       \n\t"
2197
            "psrad                      $8, %%mm2       \n\t"
2198
            "psrad                      $8, %%mm3       \n\t"
2199
#endif
2200
            "packssdw                %%mm1, %%mm4       \n\t"
2201
            "packssdw                %%mm3, %%mm2       \n\t"
2202
            "pmaddwd                 %%mm5, %%mm4       \n\t"
2203
            "pmaddwd                 %%mm5, %%mm2       \n\t"
2204
            "add                       $24, %%"REG_d"   \n\t"
2205
            "packssdw                %%mm2, %%mm4       \n\t"
2206
            "psraw                      $7, %%mm4       \n\t"
2207

    
2208
            "packuswb                %%mm4, %%mm0       \n\t"
2209
            "paddusb "MANGLE(bgr2YOffset)", %%mm0       \n\t"
2210

    
2211
            MOVNTQ"                  %%mm0, (%1, %%"REG_a") \n\t"
2212
            "add                        $8,      %%"REG_a"  \n\t"
2213
            " js                        1b                  \n\t"
2214
            : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2215
            : "%"REG_a, "%"REG_d
2216
            );
2217
            ydst += lumStride;
2218
            src  += srcStride;
2219
        }
2220
        src -= srcStride*2;
2221
        asm volatile(
2222
        "mov                        %4, %%"REG_a"   \n\t"
2223
        "movq          "MANGLE(w1111)", %%mm5       \n\t"
2224
        "movq     "MANGLE(bgr2UCoeff)", %%mm6       \n\t"
2225
        "pxor                    %%mm7, %%mm7       \n\t"
2226
        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2227
        "add                 %%"REG_d", %%"REG_d"   \n\t"
2228
        ASMALIGN(4)
2229
        "1:                                         \n\t"
2230
        PREFETCH"    64(%0, %%"REG_d")              \n\t"
2231
        PREFETCH"    64(%1, %%"REG_d")              \n\t"
2232
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2233
        "movq          (%0, %%"REG_d"), %%mm0       \n\t"
2234
        "movq          (%1, %%"REG_d"), %%mm1       \n\t"
2235
        "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
2236
        "movq         6(%1, %%"REG_d"), %%mm3       \n\t"
2237
        PAVGB"                   %%mm1, %%mm0       \n\t"
2238
        PAVGB"                   %%mm3, %%mm2       \n\t"
2239
        "movq                    %%mm0, %%mm1       \n\t"
2240
        "movq                    %%mm2, %%mm3       \n\t"
2241
        "psrlq                     $24, %%mm0       \n\t"
2242
        "psrlq                     $24, %%mm2       \n\t"
2243
        PAVGB"                   %%mm1, %%mm0       \n\t"
2244
        PAVGB"                   %%mm3, %%mm2       \n\t"
2245
        "punpcklbw               %%mm7, %%mm0       \n\t"
2246
        "punpcklbw               %%mm7, %%mm2       \n\t"
2247
#else
2248
        "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2249
        "movd          (%1, %%"REG_d"), %%mm1       \n\t"
2250
        "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
2251
        "movd         3(%1, %%"REG_d"), %%mm3       \n\t"
2252
        "punpcklbw               %%mm7, %%mm0       \n\t"
2253
        "punpcklbw               %%mm7, %%mm1       \n\t"
2254
        "punpcklbw               %%mm7, %%mm2       \n\t"
2255
        "punpcklbw               %%mm7, %%mm3       \n\t"
2256
        "paddw                   %%mm1, %%mm0       \n\t"
2257
        "paddw                   %%mm3, %%mm2       \n\t"
2258
        "paddw                   %%mm2, %%mm0       \n\t"
2259
        "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
2260
        "movd         6(%1, %%"REG_d"), %%mm1       \n\t"
2261
        "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
2262
        "movd         9(%1, %%"REG_d"), %%mm3       \n\t"
2263
        "punpcklbw               %%mm7, %%mm4       \n\t"
2264
        "punpcklbw               %%mm7, %%mm1       \n\t"
2265
        "punpcklbw               %%mm7, %%mm2       \n\t"
2266
        "punpcklbw               %%mm7, %%mm3       \n\t"
2267
        "paddw                   %%mm1, %%mm4       \n\t"
2268
        "paddw                   %%mm3, %%mm2       \n\t"
2269
        "paddw                   %%mm4, %%mm2       \n\t"
2270
        "psrlw                      $2, %%mm0       \n\t"
2271
        "psrlw                      $2, %%mm2       \n\t"
2272
#endif
2273
        "movq     "MANGLE(bgr2VCoeff)", %%mm1       \n\t"
2274
        "movq     "MANGLE(bgr2VCoeff)", %%mm3       \n\t"
2275

    
2276
        "pmaddwd                 %%mm0, %%mm1       \n\t"
2277
        "pmaddwd                 %%mm2, %%mm3       \n\t"
2278
        "pmaddwd                 %%mm6, %%mm0       \n\t"
2279
        "pmaddwd                 %%mm6, %%mm2       \n\t"
2280
#ifndef FAST_BGR2YV12
2281
        "psrad                      $8, %%mm0       \n\t"
2282
        "psrad                      $8, %%mm1       \n\t"
2283
        "psrad                      $8, %%mm2       \n\t"
2284
        "psrad                      $8, %%mm3       \n\t"
2285
#endif
2286
        "packssdw                %%mm2, %%mm0       \n\t"
2287
        "packssdw                %%mm3, %%mm1       \n\t"
2288
        "pmaddwd                 %%mm5, %%mm0       \n\t"
2289
        "pmaddwd                 %%mm5, %%mm1       \n\t"
2290
        "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
2291
        "psraw                      $7, %%mm0       \n\t"
2292

    
2293
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2294
        "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
2295
        "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
2296
        "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
2297
        "movq        18(%1, %%"REG_d"), %%mm3       \n\t"
2298
        PAVGB"                   %%mm1, %%mm4       \n\t"
2299
        PAVGB"                   %%mm3, %%mm2       \n\t"
2300
        "movq                    %%mm4, %%mm1       \n\t"
2301
        "movq                    %%mm2, %%mm3       \n\t"
2302
        "psrlq                     $24, %%mm4       \n\t"
2303
        "psrlq                     $24, %%mm2       \n\t"
2304
        PAVGB"                   %%mm1, %%mm4       \n\t"
2305
        PAVGB"                   %%mm3, %%mm2       \n\t"
2306
        "punpcklbw               %%mm7, %%mm4       \n\t"
2307
        "punpcklbw               %%mm7, %%mm2       \n\t"
2308
#else
2309
        "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2310
        "movd        12(%1, %%"REG_d"), %%mm1       \n\t"
2311
        "movd        15(%0, %%"REG_d"), %%mm2       \n\t"
2312
        "movd        15(%1, %%"REG_d"), %%mm3       \n\t"
2313
        "punpcklbw               %%mm7, %%mm4       \n\t"
2314
        "punpcklbw               %%mm7, %%mm1       \n\t"
2315
        "punpcklbw               %%mm7, %%mm2       \n\t"
2316
        "punpcklbw               %%mm7, %%mm3       \n\t"
2317
        "paddw                   %%mm1, %%mm4       \n\t"
2318
        "paddw                   %%mm3, %%mm2       \n\t"
2319
        "paddw                   %%mm2, %%mm4       \n\t"
2320
        "movd        18(%0, %%"REG_d"), %%mm5       \n\t"
2321
        "movd        18(%1, %%"REG_d"), %%mm1       \n\t"
2322
        "movd        21(%0, %%"REG_d"), %%mm2       \n\t"
2323
        "movd        21(%1, %%"REG_d"), %%mm3       \n\t"
2324
        "punpcklbw               %%mm7, %%mm5       \n\t"
2325
        "punpcklbw               %%mm7, %%mm1       \n\t"
2326
        "punpcklbw               %%mm7, %%mm2       \n\t"
2327
        "punpcklbw               %%mm7, %%mm3       \n\t"
2328
        "paddw                   %%mm1, %%mm5       \n\t"
2329
        "paddw                   %%mm3, %%mm2       \n\t"
2330
        "paddw                   %%mm5, %%mm2       \n\t"
2331
        "movq          "MANGLE(w1111)", %%mm5       \n\t"
2332
        "psrlw                      $2, %%mm4       \n\t"
2333
        "psrlw                      $2, %%mm2       \n\t"
2334
#endif
2335
        "movq     "MANGLE(bgr2VCoeff)", %%mm1       \n\t"
2336
        "movq     "MANGLE(bgr2VCoeff)", %%mm3       \n\t"
2337

    
2338
        "pmaddwd                 %%mm4, %%mm1       \n\t"
2339
        "pmaddwd                 %%mm2, %%mm3       \n\t"
2340
        "pmaddwd                 %%mm6, %%mm4       \n\t"
2341
        "pmaddwd                 %%mm6, %%mm2       \n\t"
2342
#ifndef FAST_BGR2YV12
2343
        "psrad                      $8, %%mm4       \n\t"
2344
        "psrad                      $8, %%mm1       \n\t"
2345
        "psrad                      $8, %%mm2       \n\t"
2346
        "psrad                      $8, %%mm3       \n\t"
2347
#endif
2348
        "packssdw                %%mm2, %%mm4       \n\t"
2349
        "packssdw                %%mm3, %%mm1       \n\t"
2350
        "pmaddwd                 %%mm5, %%mm4       \n\t"
2351
        "pmaddwd                 %%mm5, %%mm1       \n\t"
2352
        "add                       $24, %%"REG_d"   \n\t"
2353
        "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
2354
        "psraw                      $7, %%mm4       \n\t"
2355

    
2356
        "movq                    %%mm0, %%mm1           \n\t"
2357
        "punpckldq               %%mm4, %%mm0           \n\t"
2358
        "punpckhdq               %%mm4, %%mm1           \n\t"
2359
        "packsswb                %%mm1, %%mm0           \n\t"
2360
        "paddb  "MANGLE(bgr2UVOffset)", %%mm0           \n\t"
2361
        "movd                    %%mm0, (%2, %%"REG_a") \n\t"
2362
        "punpckhdq               %%mm0, %%mm0           \n\t"
2363
        "movd                    %%mm0, (%3, %%"REG_a") \n\t"
2364
        "add                        $4, %%"REG_a"       \n\t"
2365
        " js                        1b                  \n\t"
2366
        : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2367
        : "%"REG_a, "%"REG_d
2368
        );
2369

    
2370
        udst += chromStride;
2371
        vdst += chromStride;
2372
        src  += srcStride*2;
2373
    }
2374

    
2375
    asm volatile(   EMMS"       \n\t"
2376
                    SFENCE"     \n\t"
2377
                    :::"memory");
2378
#else
2379
    y=0;
2380
#endif
2381
    for (; y<height; y+=2)
2382
    {
2383
        long i;
2384
        for (i=0; i<chromWidth; i++)
2385
        {
2386
            unsigned int b = src[6*i+0];
2387
            unsigned int g = src[6*i+1];
2388
            unsigned int r = src[6*i+2];
2389

    
2390
            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2391
            unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2392
            unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2393

    
2394
            udst[i]     = U;
2395
            vdst[i]     = V;
2396
            ydst[2*i]   = Y;
2397

    
2398
            b = src[6*i+3];
2399
            g = src[6*i+4];
2400
            r = src[6*i+5];
2401

    
2402
            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2403
            ydst[2*i+1]     = Y;
2404
        }
2405
        ydst += lumStride;
2406
        src  += srcStride;
2407

    
2408
        for (i=0; i<chromWidth; i++)
2409
        {
2410
            unsigned int b = src[6*i+0];
2411
            unsigned int g = src[6*i+1];
2412
            unsigned int r = src[6*i+2];
2413

    
2414
            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2415

    
2416
            ydst[2*i]     = Y;
2417

    
2418
            b = src[6*i+3];
2419
            g = src[6*i+4];
2420
            r = src[6*i+5];
2421

    
2422
            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2423
            ydst[2*i+1]     = Y;
2424
        }
2425
        udst += chromStride;
2426
        vdst += chromStride;
2427
        ydst += lumStride;
2428
        src  += srcStride;
2429
    }
2430
}
2431

    
2432
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2433
                             long width, long height, long src1Stride,
2434
                             long src2Stride, long dstStride){
2435
    long h;
2436

    
2437
    for (h=0; h < height; h++)
2438
    {
2439
        long w;
2440

    
2441
#ifdef HAVE_MMX
2442
#ifdef HAVE_SSE2
2443
        asm(
2444
        "xor              %%"REG_a", %%"REG_a"  \n\t"
2445
        "1:                                     \n\t"
2446
        PREFETCH" 64(%1, %%"REG_a")             \n\t"
2447
        PREFETCH" 64(%2, %%"REG_a")             \n\t"
2448
        "movdqa     (%1, %%"REG_a"), %%xmm0     \n\t"
2449
        "movdqa     (%1, %%"REG_a"), %%xmm1     \n\t"
2450
        "movdqa     (%2, %%"REG_a"), %%xmm2     \n\t"
2451
        "punpcklbw           %%xmm2, %%xmm0     \n\t"
2452
        "punpckhbw           %%xmm2, %%xmm1     \n\t"
2453
        "movntdq             %%xmm0,   (%0, %%"REG_a", 2)   \n\t"
2454
        "movntdq             %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
2455
        "add                    $16, %%"REG_a"  \n\t"
2456
        "cmp                     %3, %%"REG_a"  \n\t"
2457
        " jb                     1b             \n\t"
2458
        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2459
        : "memory", "%"REG_a""
2460
        );
2461
#else
2462
        asm(
2463
        "xor %%"REG_a", %%"REG_a"               \n\t"
2464
        "1:                                     \n\t"
2465
        PREFETCH" 64(%1, %%"REG_a")             \n\t"
2466
        PREFETCH" 64(%2, %%"REG_a")             \n\t"
2467
        "movq       (%1, %%"REG_a"), %%mm0      \n\t"
2468
        "movq      8(%1, %%"REG_a"), %%mm2      \n\t"
2469
        "movq                 %%mm0, %%mm1      \n\t"
2470
        "movq                 %%mm2, %%mm3      \n\t"
2471
        "movq       (%2, %%"REG_a"), %%mm4      \n\t"
2472
        "movq      8(%2, %%"REG_a"), %%mm5      \n\t"
2473
        "punpcklbw            %%mm4, %%mm0      \n\t"
2474
        "punpckhbw            %%mm4, %%mm1      \n\t"
2475
        "punpcklbw            %%mm5, %%mm2      \n\t"
2476
        "punpckhbw            %%mm5, %%mm3      \n\t"
2477
        MOVNTQ"               %%mm0,   (%0, %%"REG_a", 2)   \n\t"
2478
        MOVNTQ"               %%mm1,  8(%0, %%"REG_a", 2)   \n\t"
2479
        MOVNTQ"               %%mm2, 16(%0, %%"REG_a", 2)   \n\t"
2480
        MOVNTQ"               %%mm3, 24(%0, %%"REG_a", 2)   \n\t"
2481
        "add                    $16, %%"REG_a"  \n\t"
2482
        "cmp                     %3, %%"REG_a"  \n\t"
2483
        " jb                     1b             \n\t"
2484
        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2485
        : "memory", "%"REG_a
2486
        );
2487
#endif
2488
        for (w= (width&(~15)); w < width; w++)
2489
        {
2490
            dest[2*w+0] = src1[w];
2491
            dest[2*w+1] = src2[w];
2492
        }
2493
#else
2494
        for (w=0; w < width; w++)
2495
        {
2496
            dest[2*w+0] = src1[w];
2497
            dest[2*w+1] = src2[w];
2498
        }
2499
#endif
2500
        dest += dstStride;
2501
                src1 += src1Stride;
2502
                src2 += src2Stride;
2503
    }
2504
#ifdef HAVE_MMX
2505
    asm(
2506
        EMMS"       \n\t"
2507
        SFENCE"     \n\t"
2508
        ::: "memory"
2509
        );
2510
#endif
2511
}
2512

    
2513
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2514
                                       uint8_t *dst1, uint8_t *dst2,
2515
                                       long width, long height,
2516
                                       long srcStride1, long srcStride2,
2517
                                       long dstStride1, long dstStride2)
2518
{
2519
    long y,x,w,h;
2520
    w=width/2; h=height/2;
2521
#ifdef HAVE_MMX
2522
    asm volatile(
2523
    PREFETCH" %0    \n\t"
2524
    PREFETCH" %1    \n\t"
2525
    ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2526
#endif
2527
    for (y=0;y<h;y++){
2528
    const uint8_t* s1=src1+srcStride1*(y>>1);
2529
    uint8_t* d=dst1+dstStride1*y;
2530
    x=0;
2531
#ifdef HAVE_MMX
2532
    for (;x<w-31;x+=32)
2533
    {
2534
        asm volatile(
2535
        PREFETCH"   32%1        \n\t"
2536
        "movq         %1, %%mm0 \n\t"
2537
        "movq        8%1, %%mm2 \n\t"
2538
        "movq       16%1, %%mm4 \n\t"
2539
        "movq       24%1, %%mm6 \n\t"
2540
        "movq      %%mm0, %%mm1 \n\t"
2541
        "movq      %%mm2, %%mm3 \n\t"
2542
        "movq      %%mm4, %%mm5 \n\t"
2543
        "movq      %%mm6, %%mm7 \n\t"
2544
        "punpcklbw %%mm0, %%mm0 \n\t"
2545
        "punpckhbw %%mm1, %%mm1 \n\t"
2546
        "punpcklbw %%mm2, %%mm2 \n\t"
2547
        "punpckhbw %%mm3, %%mm3 \n\t"
2548
        "punpcklbw %%mm4, %%mm4 \n\t"
2549
        "punpckhbw %%mm5, %%mm5 \n\t"
2550
        "punpcklbw %%mm6, %%mm6 \n\t"
2551
        "punpckhbw %%mm7, %%mm7 \n\t"
2552
        MOVNTQ"    %%mm0,   %0  \n\t"
2553
        MOVNTQ"    %%mm1,  8%0  \n\t"
2554
        MOVNTQ"    %%mm2, 16%0  \n\t"
2555
        MOVNTQ"    %%mm3, 24%0  \n\t"
2556
        MOVNTQ"    %%mm4, 32%0  \n\t"
2557
        MOVNTQ"    %%mm5, 40%0  \n\t"
2558
        MOVNTQ"    %%mm6, 48%0  \n\t"
2559
        MOVNTQ"    %%mm7, 56%0"
2560
        :"=m"(d[2*x])
2561
        :"m"(s1[x])
2562
        :"memory");
2563
    }
2564
#endif
2565
    for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2566
    }
2567
    for (y=0;y<h;y++){
2568
    const uint8_t* s2=src2+srcStride2*(y>>1);
2569
    uint8_t* d=dst2+dstStride2*y;
2570
    x=0;
2571
#ifdef HAVE_MMX
2572
    for (;x<w-31;x+=32)
2573
    {
2574
        asm volatile(
2575
        PREFETCH"   32%1        \n\t"
2576
        "movq         %1, %%mm0 \n\t"
2577
        "movq        8%1, %%mm2 \n\t"
2578
        "movq       16%1, %%mm4 \n\t"
2579
        "movq       24%1, %%mm6 \n\t"
2580
        "movq      %%mm0, %%mm1 \n\t"
2581
        "movq      %%mm2, %%mm3 \n\t"
2582
        "movq      %%mm4, %%mm5 \n\t"
2583
        "movq      %%mm6, %%mm7 \n\t"
2584
        "punpcklbw %%mm0, %%mm0 \n\t"
2585
        "punpckhbw %%mm1, %%mm1 \n\t"
2586
        "punpcklbw %%mm2, %%mm2 \n\t"
2587
        "punpckhbw %%mm3, %%mm3 \n\t"
2588
        "punpcklbw %%mm4, %%mm4 \n\t"
2589
        "punpckhbw %%mm5, %%mm5 \n\t"
2590
        "punpcklbw %%mm6, %%mm6 \n\t"
2591
        "punpckhbw %%mm7, %%mm7 \n\t"
2592
        MOVNTQ"    %%mm0,   %0  \n\t"
2593
        MOVNTQ"    %%mm1,  8%0  \n\t"
2594
        MOVNTQ"    %%mm2, 16%0  \n\t"
2595
        MOVNTQ"    %%mm3, 24%0  \n\t"
2596
        MOVNTQ"    %%mm4, 32%0  \n\t"
2597
        MOVNTQ"    %%mm5, 40%0  \n\t"
2598
        MOVNTQ"    %%mm6, 48%0  \n\t"
2599
        MOVNTQ"    %%mm7, 56%0"
2600
        :"=m"(d[2*x])
2601
        :"m"(s2[x])
2602
        :"memory");
2603
    }
2604
#endif
2605
    for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2606
    }
2607
#ifdef HAVE_MMX
2608
    asm(
2609
        EMMS"       \n\t"
2610
        SFENCE"     \n\t"
2611
        ::: "memory"
2612
        );
2613
#endif
2614
}
2615

    
2616
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2617
                                        uint8_t *dst,
2618
                                        long width, long height,
2619
                                        long srcStride1, long srcStride2,
2620
                                        long srcStride3, long dstStride)
2621
{
2622
    long y,x,w,h;
2623
    w=width/2; h=height;
2624
    for (y=0;y<h;y++){
2625
    const uint8_t* yp=src1+srcStride1*y;
2626
    const uint8_t* up=src2+srcStride2*(y>>2);
2627
    const uint8_t* vp=src3+srcStride3*(y>>2);
2628
    uint8_t* d=dst+dstStride*y;
2629
    x=0;
2630
#ifdef HAVE_MMX
2631
    for (;x<w-7;x+=8)
2632
    {
2633
        asm volatile(
2634
        PREFETCH"   32(%1, %0)          \n\t"
2635
        PREFETCH"   32(%2, %0)          \n\t"
2636
        PREFETCH"   32(%3, %0)          \n\t"
2637
        "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2638
        "movq         (%2, %0), %%mm1   \n\t" /* U0U1U2U3U4U5U6U7 */
2639
        "movq         (%3, %0), %%mm2   \n\t" /* V0V1V2V3V4V5V6V7 */
2640
        "movq            %%mm0, %%mm3   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2641
        "movq            %%mm1, %%mm4   \n\t" /* U0U1U2U3U4U5U6U7 */
2642
        "movq            %%mm2, %%mm5   \n\t" /* V0V1V2V3V4V5V6V7 */
2643
        "punpcklbw       %%mm1, %%mm1   \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2644
        "punpcklbw       %%mm2, %%mm2   \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2645
        "punpckhbw       %%mm4, %%mm4   \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2646
        "punpckhbw       %%mm5, %%mm5   \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2647

    
2648
        "movq            %%mm1, %%mm6   \n\t"
2649
        "punpcklbw       %%mm2, %%mm1   \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2650
        "punpcklbw       %%mm1, %%mm0   \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2651
        "punpckhbw       %%mm1, %%mm3   \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2652
        MOVNTQ"          %%mm0,  (%4, %0, 8)    \n\t"
2653
        MOVNTQ"          %%mm3, 8(%4, %0, 8)    \n\t"
2654

    
2655
        "punpckhbw       %%mm2, %%mm6   \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2656
        "movq     8(%1, %0, 4), %%mm0   \n\t"
2657
        "movq            %%mm0, %%mm3   \n\t"
2658
        "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2659
        "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2660
        MOVNTQ"          %%mm0, 16(%4, %0, 8)   \n\t"
2661
        MOVNTQ"          %%mm3, 24(%4, %0, 8)   \n\t"
2662

    
2663
        "movq            %%mm4, %%mm6   \n\t"
2664
        "movq    16(%1, %0, 4), %%mm0   \n\t"
2665
        "movq            %%mm0, %%mm3   \n\t"
2666
        "punpcklbw       %%mm5, %%mm4   \n\t"
2667
        "punpcklbw       %%mm4, %%mm0   \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2668
        "punpckhbw       %%mm4, %%mm3   \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2669
        MOVNTQ"          %%mm0, 32(%4, %0, 8)   \n\t"
2670
        MOVNTQ"          %%mm3, 40(%4, %0, 8)   \n\t"
2671

    
2672
        "punpckhbw       %%mm5, %%mm6   \n\t"
2673
        "movq    24(%1, %0, 4), %%mm0   \n\t"
2674
        "movq            %%mm0, %%mm3   \n\t"
2675
        "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2676
        "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2677
        MOVNTQ"          %%mm0, 48(%4, %0, 8)   \n\t"
2678
        MOVNTQ"          %%mm3, 56(%4, %0, 8)   \n\t"
2679

    
2680
        : "+r" (x)
2681
        : "r"(yp), "r" (up), "r"(vp), "r"(d)
2682
        :"memory");
2683
    }
2684
#endif
2685
    for (; x<w; x++)
2686
    {
2687
        const long x2 = x<<2;
2688
        d[8*x+0] = yp[x2];
2689
        d[8*x+1] = up[x];
2690
        d[8*x+2] = yp[x2+1];
2691
        d[8*x+3] = vp[x];
2692
        d[8*x+4] = yp[x2+2];
2693
        d[8*x+5] = up[x];
2694
        d[8*x+6] = yp[x2+3];
2695
        d[8*x+7] = vp[x];
2696
    }
2697
    }
2698
#ifdef HAVE_MMX
2699
    asm(
2700
        EMMS"       \n\t"
2701
        SFENCE"     \n\t"
2702
        ::: "memory"
2703
        );
2704
#endif
2705
}
2706

    
2707
static inline void RENAME(rgb2rgb_init)(void){
2708
    rgb15to16       = RENAME(rgb15to16);
2709
    rgb15to24       = RENAME(rgb15to24);
2710
    rgb15to32       = RENAME(rgb15to32);
2711
    rgb16to24       = RENAME(rgb16to24);
2712
    rgb16to32       = RENAME(rgb16to32);
2713
    rgb16to15       = RENAME(rgb16to15);
2714
    rgb24to16       = RENAME(rgb24to16);
2715
    rgb24to15       = RENAME(rgb24to15);
2716
    rgb24to32       = RENAME(rgb24to32);
2717
    rgb32to16       = RENAME(rgb32to16);
2718
    rgb32to15       = RENAME(rgb32to15);
2719
    rgb32to24       = RENAME(rgb32to24);
2720
    rgb24tobgr15    = RENAME(rgb24tobgr15);
2721
    rgb24tobgr16    = RENAME(rgb24tobgr16);
2722
    rgb24tobgr24    = RENAME(rgb24tobgr24);
2723
    rgb32tobgr32    = RENAME(rgb32tobgr32);
2724
    rgb32tobgr16    = RENAME(rgb32tobgr16);
2725
    rgb32tobgr15    = RENAME(rgb32tobgr15);
2726
    yv12toyuy2      = RENAME(yv12toyuy2);
2727
    yv12touyvy      = RENAME(yv12touyvy);
2728
    yuv422ptoyuy2   = RENAME(yuv422ptoyuy2);
2729
    yuy2toyv12      = RENAME(yuy2toyv12);
2730
//    uyvytoyv12      = RENAME(uyvytoyv12);
2731
//    yvu9toyv12      = RENAME(yvu9toyv12);
2732
    planar2x        = RENAME(planar2x);
2733
    rgb24toyv12     = RENAME(rgb24toyv12);
2734
    interleaveBytes = RENAME(interleaveBytes);
2735
    vu9_to_vu12     = RENAME(vu9_to_vu12);
2736
    yvu9_to_yuy2    = RENAME(yvu9_to_yuy2);
2737
}