Statistics
| Branch: | Revision:

ffmpeg / libswscale / rgb2rgb_template.c @ 8a322796

History | View | Annotate | Download (97.7 KB)

1
/*
2
 * software RGB to RGB converter
3
 * pluralize by software PAL8 to RGB converter
4
 *              software YUV to YUV converter
5
 *              software YUV to RGB converter
6
 * Written by Nick Kurshev.
7
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8
 * lot of big-endian byte order fixes by Alex Beregszaszi
9
 *
10
 * This file is part of FFmpeg.
11
 *
12
 * FFmpeg is free software; you can redistribute it and/or modify
13
 * it under the terms of the GNU General Public License as published by
14
 * the Free Software Foundation; either version 2 of the License, or
15
 * (at your option) any later version.
16
 *
17
 * FFmpeg is distributed in the hope that it will be useful,
18
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
 * GNU General Public License for more details.
21
 *
22
 * You should have received a copy of the GNU General Public License
23
 * along with FFmpeg; if not, write to the Free Software
24
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25
 *
26
 * The C code (not assembly, MMX, ...) of this file can be used
27
 * under the LGPL license.
28
 */
29

    
30
#include <stddef.h>
31
#include <inttypes.h> /* for __WORDSIZE */
32

    
33
/* Fall back to the project-level MP_WORDSIZE when the system headers did not
 * define __WORDSIZE. */
#if !defined(__WORDSIZE)
// #warning You have a misconfigured system and will probably lose performance!
#define __WORDSIZE MP_WORDSIZE
#endif

/* Discard any earlier definitions of the helper macros that are (re)defined
 * below -- presumably because this template is included more than once with
 * different CPU feature sets; confirm against the including file. */
#undef MMREG_SIZE
#undef PREFETCH
#undef PREFETCHW
#undef PAVGB
#undef EMMS
#undef MOVNTQ
#undef SFENCE

/* MMREG_SIZE is 16 when HAVE_SSE2 is defined and 8 otherwise. */
#if defined(HAVE_SSE2)
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/* Prefetch and packed-average mnemonics, chosen by CPU feature macro.
 * Without HAVE_3DNOW and HAVE_MMX2 the prefetch macros expand to a string
 * starting with '#' (apparently so the assembler treats the rest of the line
 * as a comment) and PAVGB is left undefined. */
#if defined(HAVE_3DNOW)
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif defined(HAVE_MMX2)
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#elif defined(__APPLE__)
#define PREFETCH "#"
#define PREFETCHW "#"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#if defined(HAVE_3DNOW)
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* With HAVE_MMX2 the store macro is "movntq" and SFENCE is a real "sfence";
 * otherwise a plain "movq" is used and SFENCE expands to a '#'-prefixed
 * string. */
#if defined(HAVE_MMX2)
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
84

    
85
static inline void RENAME(rgb24to32)(const uint8_t *src, uint8_t *dst, long src_size)
86
{
87
    uint8_t *dest = dst;
88
    const uint8_t *s = src;
89
    const uint8_t *end;
90
    #ifdef HAVE_MMX
91
        const uint8_t *mm_end;
92
    #endif
93
    end = s + src_size;
94
    #ifdef HAVE_MMX
95
        asm volatile(PREFETCH"    %0"::"m"(*s):"memory");
96
        mm_end = end - 23;
97
        asm volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
98
        while (s < mm_end)
99
        {
100
            asm volatile(
101
            PREFETCH"    32%1           \n\t"
102
            "movd          %1, %%mm0    \n\t"
103
            "punpckldq    3%1, %%mm0    \n\t"
104
            "movd         6%1, %%mm1    \n\t"
105
            "punpckldq    9%1, %%mm1    \n\t"
106
            "movd        12%1, %%mm2    \n\t"
107
            "punpckldq   15%1, %%mm2    \n\t"
108
            "movd        18%1, %%mm3    \n\t"
109
            "punpckldq   21%1, %%mm3    \n\t"
110
            "pand       %%mm7, %%mm0    \n\t"
111
            "pand       %%mm7, %%mm1    \n\t"
112
            "pand       %%mm7, %%mm2    \n\t"
113
            "pand       %%mm7, %%mm3    \n\t"
114
            MOVNTQ"     %%mm0,   %0     \n\t"
115
            MOVNTQ"     %%mm1,  8%0     \n\t"
116
            MOVNTQ"     %%mm2, 16%0     \n\t"
117
            MOVNTQ"     %%mm3, 24%0"
118
            :"=m"(*dest)
119
            :"m"(*s)
120
            :"memory");
121
            dest += 32;
122
            s += 24;
123
        }
124
        asm volatile(SFENCE:::"memory");
125
        asm volatile(EMMS:::"memory");
126
    #endif
127
    while (s < end)
128
    {
129
    #ifdef WORDS_BIGENDIAN
130
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
131
        *dest++ = 0;
132
        *dest++ = s[2];
133
        *dest++ = s[1];
134
        *dest++ = s[0];
135
        s+=3;
136
    #else
137
        *dest++ = *s++;
138
        *dest++ = *s++;
139
        *dest++ = *s++;
140
        *dest++ = 0;
141
    #endif
142
    }
143
}
144

    
145
static inline void RENAME(rgb32to24)(const uint8_t *src, uint8_t *dst, long src_size)
146
{
147
    uint8_t *dest = dst;
148
    const uint8_t *s = src;
149
    const uint8_t *end;
150
#ifdef HAVE_MMX
151
    const uint8_t *mm_end;
152
#endif
153
    end = s + src_size;
154
#ifdef HAVE_MMX
155
    asm volatile(PREFETCH"    %0"::"m"(*s):"memory");
156
    mm_end = end - 31;
157
    while (s < mm_end)
158
    {
159
        asm volatile(
160
        PREFETCH"    32%1           \n\t"
161
        "movq          %1, %%mm0    \n\t"
162
        "movq         8%1, %%mm1    \n\t"
163
        "movq        16%1, %%mm4    \n\t"
164
        "movq        24%1, %%mm5    \n\t"
165
        "movq       %%mm0, %%mm2    \n\t"
166
        "movq       %%mm1, %%mm3    \n\t"
167
        "movq       %%mm4, %%mm6    \n\t"
168
        "movq       %%mm5, %%mm7    \n\t"
169
        "psrlq         $8, %%mm2    \n\t"
170
        "psrlq         $8, %%mm3    \n\t"
171
        "psrlq         $8, %%mm6    \n\t"
172
        "psrlq         $8, %%mm7    \n\t"
173
        "pand          %2, %%mm0    \n\t"
174
        "pand          %2, %%mm1    \n\t"
175
        "pand          %2, %%mm4    \n\t"
176
        "pand          %2, %%mm5    \n\t"
177
        "pand          %3, %%mm2    \n\t"
178
        "pand          %3, %%mm3    \n\t"
179
        "pand          %3, %%mm6    \n\t"
180
        "pand          %3, %%mm7    \n\t"
181
        "por        %%mm2, %%mm0    \n\t"
182
        "por        %%mm3, %%mm1    \n\t"
183
        "por        %%mm6, %%mm4    \n\t"
184
        "por        %%mm7, %%mm5    \n\t"
185

    
186
        "movq       %%mm1, %%mm2    \n\t"
187
        "movq       %%mm4, %%mm3    \n\t"
188
        "psllq        $48, %%mm2    \n\t"
189
        "psllq        $32, %%mm3    \n\t"
190
        "pand          %4, %%mm2    \n\t"
191
        "pand          %5, %%mm3    \n\t"
192
        "por        %%mm2, %%mm0    \n\t"
193
        "psrlq        $16, %%mm1    \n\t"
194
        "psrlq        $32, %%mm4    \n\t"
195
        "psllq        $16, %%mm5    \n\t"
196
        "por        %%mm3, %%mm1    \n\t"
197
        "pand          %6, %%mm5    \n\t"
198
        "por        %%mm5, %%mm4    \n\t"
199

    
200
        MOVNTQ"     %%mm0,   %0     \n\t"
201
        MOVNTQ"     %%mm1,  8%0     \n\t"
202
        MOVNTQ"     %%mm4, 16%0"
203
        :"=m"(*dest)
204
        :"m"(*s),"m"(mask24l),
205
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
206
        :"memory");
207
        dest += 24;
208
        s += 32;
209
    }
210
    asm volatile(SFENCE:::"memory");
211
    asm volatile(EMMS:::"memory");
212
#endif
213
    while (s < end)
214
    {
215
#ifdef WORDS_BIGENDIAN
216
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
217
        s++;
218
        dest[2] = *s++;
219
        dest[1] = *s++;
220
        dest[0] = *s++;
221
        dest += 3;
222
#else
223
        *dest++ = *s++;
224
        *dest++ = *s++;
225
        *dest++ = *s++;
226
        s++;
227
#endif
228
    }
229
}
230

    
231
/*
232
 original by Strepto/Astral
233
 ported to gcc & bugfixed: A'rpi
234
 MMX2, 3DNOW optimization by Nick Kurshev
235
 32-bit C version, and and&add trick by Michael Niedermayer
236
*/
237
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
238
{
239
    register const uint8_t* s=src;
240
    register uint8_t* d=dst;
241
    register const uint8_t *end;
242
    const uint8_t *mm_end;
243
    end = s + src_size;
244
#ifdef HAVE_MMX
245
    asm volatile(PREFETCH"    %0"::"m"(*s));
246
    asm volatile("movq        %0, %%mm4"::"m"(mask15s));
247
    mm_end = end - 15;
248
    while (s<mm_end)
249
    {
250
        asm volatile(
251
        PREFETCH"  32%1         \n\t"
252
        "movq        %1, %%mm0  \n\t"
253
        "movq       8%1, %%mm2  \n\t"
254
        "movq     %%mm0, %%mm1  \n\t"
255
        "movq     %%mm2, %%mm3  \n\t"
256
        "pand     %%mm4, %%mm0  \n\t"
257
        "pand     %%mm4, %%mm2  \n\t"
258
        "paddw    %%mm1, %%mm0  \n\t"
259
        "paddw    %%mm3, %%mm2  \n\t"
260
        MOVNTQ"   %%mm0,  %0    \n\t"
261
        MOVNTQ"   %%mm2, 8%0"
262
        :"=m"(*d)
263
        :"m"(*s)
264
        );
265
        d+=16;
266
        s+=16;
267
    }
268
    asm volatile(SFENCE:::"memory");
269
    asm volatile(EMMS:::"memory");
270
#endif
271
    mm_end = end - 3;
272
    while (s < mm_end)
273
    {
274
        register unsigned x= *((const uint32_t *)s);
275
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
276
        d+=4;
277
        s+=4;
278
    }
279
    if (s < end)
280
    {
281
        register unsigned short x= *((const uint16_t *)s);
282
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
283
    }
284
}
285

    
286
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
287
{
288
    register const uint8_t* s=src;
289
    register uint8_t* d=dst;
290
    register const uint8_t *end;
291
    const uint8_t *mm_end;
292
    end = s + src_size;
293
#ifdef HAVE_MMX
294
    asm volatile(PREFETCH"    %0"::"m"(*s));
295
    asm volatile("movq        %0, %%mm7"::"m"(mask15rg));
296
    asm volatile("movq        %0, %%mm6"::"m"(mask15b));
297
    mm_end = end - 15;
298
    while (s<mm_end)
299
    {
300
        asm volatile(
301
        PREFETCH"  32%1         \n\t"
302
        "movq        %1, %%mm0  \n\t"
303
        "movq       8%1, %%mm2  \n\t"
304
        "movq     %%mm0, %%mm1  \n\t"
305
        "movq     %%mm2, %%mm3  \n\t"
306
        "psrlq       $1, %%mm0  \n\t"
307
        "psrlq       $1, %%mm2  \n\t"
308
        "pand     %%mm7, %%mm0  \n\t"
309
        "pand     %%mm7, %%mm2  \n\t"
310
        "pand     %%mm6, %%mm1  \n\t"
311
        "pand     %%mm6, %%mm3  \n\t"
312
        "por      %%mm1, %%mm0  \n\t"
313
        "por      %%mm3, %%mm2  \n\t"
314
        MOVNTQ"   %%mm0,  %0    \n\t"
315
        MOVNTQ"   %%mm2, 8%0"
316
        :"=m"(*d)
317
        :"m"(*s)
318
        );
319
        d+=16;
320
        s+=16;
321
    }
322
    asm volatile(SFENCE:::"memory");
323
    asm volatile(EMMS:::"memory");
324
#endif
325
    mm_end = end - 3;
326
    while (s < mm_end)
327
    {
328
        register uint32_t x= *((const uint32_t*)s);
329
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
330
        s+=4;
331
        d+=4;
332
    }
333
    if (s < end)
334
    {
335
        register uint16_t x= *((const uint16_t*)s);
336
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
337
        s+=2;
338
        d+=2;
339
    }
340
}
341

    
342
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
343
{
344
    const uint8_t *s = src;
345
    const uint8_t *end;
346
#ifdef HAVE_MMX
347
    const uint8_t *mm_end;
348
#endif
349
    uint16_t *d = (uint16_t *)dst;
350
    end = s + src_size;
351
#ifdef HAVE_MMX
352
    mm_end = end - 15;
353
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
354
    asm volatile(
355
    "movq           %3, %%mm5   \n\t"
356
    "movq           %4, %%mm6   \n\t"
357
    "movq           %5, %%mm7   \n\t"
358
    "jmp 2f                     \n\t"
359
    ASMALIGN(4)
360
    "1:                         \n\t"
361
    PREFETCH"   32(%1)          \n\t"
362
    "movd         (%1), %%mm0   \n\t"
363
    "movd        4(%1), %%mm3   \n\t"
364
    "punpckldq   8(%1), %%mm0   \n\t"
365
    "punpckldq  12(%1), %%mm3   \n\t"
366
    "movq        %%mm0, %%mm1   \n\t"
367
    "movq        %%mm3, %%mm4   \n\t"
368
    "pand        %%mm6, %%mm0   \n\t"
369
    "pand        %%mm6, %%mm3   \n\t"
370
    "pmaddwd     %%mm7, %%mm0   \n\t"
371
    "pmaddwd     %%mm7, %%mm3   \n\t"
372
    "pand        %%mm5, %%mm1   \n\t"
373
    "pand        %%mm5, %%mm4   \n\t"
374
    "por         %%mm1, %%mm0   \n\t"
375
    "por         %%mm4, %%mm3   \n\t"
376
    "psrld          $5, %%mm0   \n\t"
377
    "pslld         $11, %%mm3   \n\t"
378
    "por         %%mm3, %%mm0   \n\t"
379
    MOVNTQ"      %%mm0, (%0)    \n\t"
380
    "add           $16,  %1     \n\t"
381
    "add            $8,  %0     \n\t"
382
    "2:                         \n\t"
383
    "cmp            %2,  %1     \n\t"
384
    " jb            1b          \n\t"
385
    : "+r" (d), "+r"(s)
386
    : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
387
    );
388
#else
389
    asm volatile(PREFETCH"    %0"::"m"(*src):"memory");
390
    asm volatile(
391
        "movq    %0, %%mm7    \n\t"
392
        "movq    %1, %%mm6    \n\t"
393
        ::"m"(red_16mask),"m"(green_16mask));
394
    while (s < mm_end)
395
    {
396
        asm volatile(
397
        PREFETCH"    32%1           \n\t"
398
        "movd          %1, %%mm0    \n\t"
399
        "movd         4%1, %%mm3    \n\t"
400
        "punpckldq    8%1, %%mm0    \n\t"
401
        "punpckldq   12%1, %%mm3    \n\t"
402
        "movq       %%mm0, %%mm1    \n\t"
403
        "movq       %%mm0, %%mm2    \n\t"
404
        "movq       %%mm3, %%mm4    \n\t"
405
        "movq       %%mm3, %%mm5    \n\t"
406
        "psrlq         $3, %%mm0    \n\t"
407
        "psrlq         $3, %%mm3    \n\t"
408
        "pand          %2, %%mm0    \n\t"
409
        "pand          %2, %%mm3    \n\t"
410
        "psrlq         $5, %%mm1    \n\t"
411
        "psrlq         $5, %%mm4    \n\t"
412
        "pand       %%mm6, %%mm1    \n\t"
413
        "pand       %%mm6, %%mm4    \n\t"
414
        "psrlq         $8, %%mm2    \n\t"
415
        "psrlq         $8, %%mm5    \n\t"
416
        "pand       %%mm7, %%mm2    \n\t"
417
        "pand       %%mm7, %%mm5    \n\t"
418
        "por        %%mm1, %%mm0    \n\t"
419
        "por        %%mm4, %%mm3    \n\t"
420
        "por        %%mm2, %%mm0    \n\t"
421
        "por        %%mm5, %%mm3    \n\t"
422
        "psllq        $16, %%mm3    \n\t"
423
        "por        %%mm3, %%mm0    \n\t"
424
        MOVNTQ"     %%mm0, %0       \n\t"
425
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
426
        d += 4;
427
        s += 16;
428
    }
429
#endif
430
    asm volatile(SFENCE:::"memory");
431
    asm volatile(EMMS:::"memory");
432
#endif
433
    while (s < end)
434
    {
435
        register int rgb = *(const uint32_t*)s; s += 4;
436
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
437
    }
438
}
439

    
440
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
441
{
442
    const uint8_t *s = src;
443
    const uint8_t *end;
444
#ifdef HAVE_MMX
445
    const uint8_t *mm_end;
446
#endif
447
    uint16_t *d = (uint16_t *)dst;
448
    end = s + src_size;
449
#ifdef HAVE_MMX
450
    asm volatile(PREFETCH"    %0"::"m"(*src):"memory");
451
    asm volatile(
452
        "movq          %0, %%mm7    \n\t"
453
        "movq          %1, %%mm6    \n\t"
454
        ::"m"(red_16mask),"m"(green_16mask));
455
    mm_end = end - 15;
456
    while (s < mm_end)
457
    {
458
        asm volatile(
459
        PREFETCH"    32%1           \n\t"
460
        "movd          %1, %%mm0    \n\t"
461
        "movd         4%1, %%mm3    \n\t"
462
        "punpckldq    8%1, %%mm0    \n\t"
463
        "punpckldq   12%1, %%mm3    \n\t"
464
        "movq       %%mm0, %%mm1    \n\t"
465
        "movq       %%mm0, %%mm2    \n\t"
466
        "movq       %%mm3, %%mm4    \n\t"
467
        "movq       %%mm3, %%mm5    \n\t"
468
        "psllq         $8, %%mm0    \n\t"
469
        "psllq         $8, %%mm3    \n\t"
470
        "pand       %%mm7, %%mm0    \n\t"
471
        "pand       %%mm7, %%mm3    \n\t"
472
        "psrlq         $5, %%mm1    \n\t"
473
        "psrlq         $5, %%mm4    \n\t"
474
        "pand       %%mm6, %%mm1    \n\t"
475
        "pand       %%mm6, %%mm4    \n\t"
476
        "psrlq        $19, %%mm2    \n\t"
477
        "psrlq        $19, %%mm5    \n\t"
478
        "pand          %2, %%mm2    \n\t"
479
        "pand          %2, %%mm5    \n\t"
480
        "por        %%mm1, %%mm0    \n\t"
481
        "por        %%mm4, %%mm3    \n\t"
482
        "por        %%mm2, %%mm0    \n\t"
483
        "por        %%mm5, %%mm3    \n\t"
484
        "psllq        $16, %%mm3    \n\t"
485
        "por        %%mm3, %%mm0    \n\t"
486
        MOVNTQ"     %%mm0, %0       \n\t"
487
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
488
        d += 4;
489
        s += 16;
490
    }
491
    asm volatile(SFENCE:::"memory");
492
    asm volatile(EMMS:::"memory");
493
#endif
494
    while (s < end)
495
    {
496
        register int rgb = *(const uint32_t*)s; s += 4;
497
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
498
    }
499
}
500

    
501
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
502
{
503
    const uint8_t *s = src;
504
    const uint8_t *end;
505
#ifdef HAVE_MMX
506
    const uint8_t *mm_end;
507
#endif
508
    uint16_t *d = (uint16_t *)dst;
509
    end = s + src_size;
510
#ifdef HAVE_MMX
511
    mm_end = end - 15;
512
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
513
    asm volatile(
514
    "movq           %3, %%mm5   \n\t"
515
    "movq           %4, %%mm6   \n\t"
516
    "movq           %5, %%mm7   \n\t"
517
    "jmp            2f          \n\t"
518
    ASMALIGN(4)
519
    "1:                         \n\t"
520
    PREFETCH"   32(%1)          \n\t"
521
    "movd         (%1), %%mm0   \n\t"
522
    "movd        4(%1), %%mm3   \n\t"
523
    "punpckldq   8(%1), %%mm0   \n\t"
524
    "punpckldq  12(%1), %%mm3   \n\t"
525
    "movq        %%mm0, %%mm1   \n\t"
526
    "movq        %%mm3, %%mm4   \n\t"
527
    "pand        %%mm6, %%mm0   \n\t"
528
    "pand        %%mm6, %%mm3   \n\t"
529
    "pmaddwd     %%mm7, %%mm0   \n\t"
530
    "pmaddwd     %%mm7, %%mm3   \n\t"
531
    "pand        %%mm5, %%mm1   \n\t"
532
    "pand        %%mm5, %%mm4   \n\t"
533
    "por         %%mm1, %%mm0   \n\t"
534
    "por         %%mm4, %%mm3   \n\t"
535
    "psrld          $6, %%mm0   \n\t"
536
    "pslld         $10, %%mm3   \n\t"
537
    "por         %%mm3, %%mm0   \n\t"
538
    MOVNTQ"      %%mm0, (%0)    \n\t"
539
    "add           $16,  %1     \n\t"
540
    "add            $8,  %0     \n\t"
541
    "2:                         \n\t"
542
    "cmp            %2,  %1     \n\t"
543
    " jb            1b          \n\t"
544
    : "+r" (d), "+r"(s)
545
    : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
546
    );
547
#else
548
    asm volatile(PREFETCH"    %0"::"m"(*src):"memory");
549
    asm volatile(
550
        "movq          %0, %%mm7    \n\t"
551
        "movq          %1, %%mm6    \n\t"
552
        ::"m"(red_15mask),"m"(green_15mask));
553
    while (s < mm_end)
554
    {
555
        asm volatile(
556
        PREFETCH"    32%1           \n\t"
557
        "movd          %1, %%mm0    \n\t"
558
        "movd         4%1, %%mm3    \n\t"
559
        "punpckldq    8%1, %%mm0    \n\t"
560
        "punpckldq   12%1, %%mm3    \n\t"
561
        "movq       %%mm0, %%mm1    \n\t"
562
        "movq       %%mm0, %%mm2    \n\t"
563
        "movq       %%mm3, %%mm4    \n\t"
564
        "movq       %%mm3, %%mm5    \n\t"
565
        "psrlq         $3, %%mm0    \n\t"
566
        "psrlq         $3, %%mm3    \n\t"
567
        "pand          %2, %%mm0    \n\t"
568
        "pand          %2, %%mm3    \n\t"
569
        "psrlq         $6, %%mm1    \n\t"
570
        "psrlq         $6, %%mm4    \n\t"
571
        "pand       %%mm6, %%mm1    \n\t"
572
        "pand       %%mm6, %%mm4    \n\t"
573
        "psrlq         $9, %%mm2    \n\t"
574
        "psrlq         $9, %%mm5    \n\t"
575
        "pand       %%mm7, %%mm2    \n\t"
576
        "pand       %%mm7, %%mm5    \n\t"
577
        "por        %%mm1, %%mm0    \n\t"
578
        "por        %%mm4, %%mm3    \n\t"
579
        "por        %%mm2, %%mm0    \n\t"
580
        "por        %%mm5, %%mm3    \n\t"
581
        "psllq        $16, %%mm3    \n\t"
582
        "por        %%mm3, %%mm0    \n\t"
583
        MOVNTQ"     %%mm0, %0       \n\t"
584
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
585
        d += 4;
586
        s += 16;
587
    }
588
#endif
589
    asm volatile(SFENCE:::"memory");
590
    asm volatile(EMMS:::"memory");
591
#endif
592
    while (s < end)
593
    {
594
        register int rgb = *(const uint32_t*)s; s += 4;
595
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
596
    }
597
}
598

    
599
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
600
{
601
    const uint8_t *s = src;
602
    const uint8_t *end;
603
#ifdef HAVE_MMX
604
    const uint8_t *mm_end;
605
#endif
606
    uint16_t *d = (uint16_t *)dst;
607
    end = s + src_size;
608
#ifdef HAVE_MMX
609
    asm volatile(PREFETCH"    %0"::"m"(*src):"memory");
610
    asm volatile(
611
        "movq          %0, %%mm7    \n\t"
612
        "movq          %1, %%mm6    \n\t"
613
        ::"m"(red_15mask),"m"(green_15mask));
614
    mm_end = end - 15;
615
    while (s < mm_end)
616
    {
617
        asm volatile(
618
        PREFETCH"    32%1           \n\t"
619
        "movd          %1, %%mm0    \n\t"
620
        "movd         4%1, %%mm3    \n\t"
621
        "punpckldq    8%1, %%mm0    \n\t"
622
        "punpckldq   12%1, %%mm3    \n\t"
623
        "movq       %%mm0, %%mm1    \n\t"
624
        "movq       %%mm0, %%mm2    \n\t"
625
        "movq       %%mm3, %%mm4    \n\t"
626
        "movq       %%mm3, %%mm5    \n\t"
627
        "psllq         $7, %%mm0    \n\t"
628
        "psllq         $7, %%mm3    \n\t"
629
        "pand       %%mm7, %%mm0    \n\t"
630
        "pand       %%mm7, %%mm3    \n\t"
631
        "psrlq         $6, %%mm1    \n\t"
632
        "psrlq         $6, %%mm4    \n\t"
633
        "pand       %%mm6, %%mm1    \n\t"
634
        "pand       %%mm6, %%mm4    \n\t"
635
        "psrlq        $19, %%mm2    \n\t"
636
        "psrlq        $19, %%mm5    \n\t"
637
        "pand          %2, %%mm2    \n\t"
638
        "pand          %2, %%mm5    \n\t"
639
        "por        %%mm1, %%mm0    \n\t"
640
        "por        %%mm4, %%mm3    \n\t"
641
        "por        %%mm2, %%mm0    \n\t"
642
        "por        %%mm5, %%mm3    \n\t"
643
        "psllq        $16, %%mm3    \n\t"
644
        "por        %%mm3, %%mm0    \n\t"
645
        MOVNTQ"     %%mm0, %0       \n\t"
646
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
647
        d += 4;
648
        s += 16;
649
    }
650
    asm volatile(SFENCE:::"memory");
651
    asm volatile(EMMS:::"memory");
652
#endif
653
    while (s < end)
654
    {
655
        register int rgb = *(const uint32_t*)s; s += 4;
656
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
657
    }
658
}
659

    
660
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
661
{
662
    const uint8_t *s = src;
663
    const uint8_t *end;
664
#ifdef HAVE_MMX
665
    const uint8_t *mm_end;
666
#endif
667
    uint16_t *d = (uint16_t *)dst;
668
    end = s + src_size;
669
#ifdef HAVE_MMX
670
    asm volatile(PREFETCH"    %0"::"m"(*src):"memory");
671
    asm volatile(
672
        "movq         %0, %%mm7     \n\t"
673
        "movq         %1, %%mm6     \n\t"
674
        ::"m"(red_16mask),"m"(green_16mask));
675
    mm_end = end - 11;
676
    while (s < mm_end)
677
    {
678
        asm volatile(
679
        PREFETCH"    32%1           \n\t"
680
        "movd          %1, %%mm0    \n\t"
681
        "movd         3%1, %%mm3    \n\t"
682
        "punpckldq    6%1, %%mm0    \n\t"
683
        "punpckldq    9%1, %%mm3    \n\t"
684
        "movq       %%mm0, %%mm1    \n\t"
685
        "movq       %%mm0, %%mm2    \n\t"
686
        "movq       %%mm3, %%mm4    \n\t"
687
        "movq       %%mm3, %%mm5    \n\t"
688
        "psrlq         $3, %%mm0    \n\t"
689
        "psrlq         $3, %%mm3    \n\t"
690
        "pand          %2, %%mm0    \n\t"
691
        "pand          %2, %%mm3    \n\t"
692
        "psrlq         $5, %%mm1    \n\t"
693
        "psrlq         $5, %%mm4    \n\t"
694
        "pand       %%mm6, %%mm1    \n\t"
695
        "pand       %%mm6, %%mm4    \n\t"
696
        "psrlq         $8, %%mm2    \n\t"
697
        "psrlq         $8, %%mm5    \n\t"
698
        "pand       %%mm7, %%mm2    \n\t"
699
        "pand       %%mm7, %%mm5    \n\t"
700
        "por        %%mm1, %%mm0    \n\t"
701
        "por        %%mm4, %%mm3    \n\t"
702
        "por        %%mm2, %%mm0    \n\t"
703
        "por        %%mm5, %%mm3    \n\t"
704
        "psllq        $16, %%mm3    \n\t"
705
        "por        %%mm3, %%mm0    \n\t"
706
        MOVNTQ"     %%mm0, %0       \n\t"
707
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
708
        d += 4;
709
        s += 12;
710
    }
711
    asm volatile(SFENCE:::"memory");
712
    asm volatile(EMMS:::"memory");
713
#endif
714
    while (s < end)
715
    {
716
        const int b = *s++;
717
        const int g = *s++;
718
        const int r = *s++;
719
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
720
    }
721
}
722

    
723
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
724
{
725
    const uint8_t *s = src;
726
    const uint8_t *end;
727
#ifdef HAVE_MMX
728
    const uint8_t *mm_end;
729
#endif
730
    uint16_t *d = (uint16_t *)dst;
731
    end = s + src_size;
732
#ifdef HAVE_MMX
733
    asm volatile(PREFETCH"    %0"::"m"(*src):"memory");
734
    asm volatile(
735
        "movq         %0, %%mm7     \n\t"
736
        "movq         %1, %%mm6     \n\t"
737
        ::"m"(red_16mask),"m"(green_16mask));
738
    mm_end = end - 15;
739
    while (s < mm_end)
740
    {
741
        asm volatile(
742
        PREFETCH"    32%1           \n\t"
743
        "movd          %1, %%mm0    \n\t"
744
        "movd         3%1, %%mm3    \n\t"
745
        "punpckldq    6%1, %%mm0    \n\t"
746
        "punpckldq    9%1, %%mm3    \n\t"
747
        "movq       %%mm0, %%mm1    \n\t"
748
        "movq       %%mm0, %%mm2    \n\t"
749
        "movq       %%mm3, %%mm4    \n\t"
750
        "movq       %%mm3, %%mm5    \n\t"
751
        "psllq         $8, %%mm0    \n\t"
752
        "psllq         $8, %%mm3    \n\t"
753
        "pand       %%mm7, %%mm0    \n\t"
754
        "pand       %%mm7, %%mm3    \n\t"
755
        "psrlq         $5, %%mm1    \n\t"
756
        "psrlq         $5, %%mm4    \n\t"
757
        "pand       %%mm6, %%mm1    \n\t"
758
        "pand       %%mm6, %%mm4    \n\t"
759
        "psrlq        $19, %%mm2    \n\t"
760
        "psrlq        $19, %%mm5    \n\t"
761
        "pand          %2, %%mm2    \n\t"
762
        "pand          %2, %%mm5    \n\t"
763
        "por        %%mm1, %%mm0    \n\t"
764
        "por        %%mm4, %%mm3    \n\t"
765
        "por        %%mm2, %%mm0    \n\t"
766
        "por        %%mm5, %%mm3    \n\t"
767
        "psllq        $16, %%mm3    \n\t"
768
        "por        %%mm3, %%mm0    \n\t"
769
        MOVNTQ"     %%mm0, %0       \n\t"
770
        :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
771
        d += 4;
772
        s += 12;
773
    }
774
    asm volatile(SFENCE:::"memory");
775
    asm volatile(EMMS:::"memory");
776
#endif
777
    while (s < end)
778
    {
779
        const int r = *s++;
780
        const int g = *s++;
781
        const int b = *s++;
782
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
783
    }
784
}
785

    
786
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
787
{
788
    const uint8_t *s = src;
789
    const uint8_t *end;
790
#ifdef HAVE_MMX
791
    const uint8_t *mm_end;
792
#endif
793
    uint16_t *d = (uint16_t *)dst;
794
    end = s + src_size;
795
#ifdef HAVE_MMX
796
    asm volatile(PREFETCH"    %0"::"m"(*src):"memory");
797
    asm volatile(
798
        "movq          %0, %%mm7    \n\t"
799
        "movq          %1, %%mm6    \n\t"
800
        ::"m"(red_15mask),"m"(green_15mask));
801
    mm_end = end - 11;
802
    while (s < mm_end)
803
    {
804
        asm volatile(
805
        PREFETCH"    32%1           \n\t"
806
        "movd          %1, %%mm0    \n\t"
807
        "movd         3%1, %%mm3    \n\t"
808
        "punpckldq    6%1, %%mm0    \n\t"
809
        "punpckldq    9%1, %%mm3    \n\t"
810
        "movq       %%mm0, %%mm1    \n\t"
811
        "movq       %%mm0, %%mm2    \n\t"
812
        "movq       %%mm3, %%mm4    \n\t"
813
        "movq       %%mm3, %%mm5    \n\t"
814
        "psrlq         $3, %%mm0    \n\t"
815
        "psrlq         $3, %%mm3    \n\t"
816
        "pand          %2, %%mm0    \n\t"
817
        "pand          %2, %%mm3    \n\t"
818
        "psrlq         $6, %%mm1    \n\t"
819
        "psrlq         $6, %%mm4    \n\t"
820
        "pand       %%mm6, %%mm1    \n\t"
821
        "pand       %%mm6, %%mm4    \n\t"
822
        "psrlq         $9, %%mm2    \n\t"
823
        "psrlq         $9, %%mm5    \n\t"
824
        "pand       %%mm7, %%mm2    \n\t"
825
        "pand       %%mm7, %%mm5    \n\t"
826
        "por        %%mm1, %%mm0    \n\t"
827
        "por        %%mm4, %%mm3    \n\t"
828
        "por        %%mm2, %%mm0    \n\t"
829
        "por        %%mm5, %%mm3    \n\t"
830
        "psllq        $16, %%mm3    \n\t"
831
        "por        %%mm3, %%mm0    \n\t"
832
        MOVNTQ"     %%mm0, %0       \n\t"
833
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
834
        d += 4;
835
        s += 12;
836
    }
837
    asm volatile(SFENCE:::"memory");
838
    asm volatile(EMMS:::"memory");
839
#endif
840
    while (s < end)
841
    {
842
        const int b = *s++;
843
        const int g = *s++;
844
        const int r = *s++;
845
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
846
    }
847
}
848

    
849
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
850
{
851
    const uint8_t *s = src;
852
    const uint8_t *end;
853
#ifdef HAVE_MMX
854
    const uint8_t *mm_end;
855
#endif
856
    uint16_t *d = (uint16_t *)dst;
857
    end = s + src_size;
858
#ifdef HAVE_MMX
859
    asm volatile(PREFETCH"    %0"::"m"(*src):"memory");
860
    asm volatile(
861
        "movq         %0, %%mm7     \n\t"
862
        "movq         %1, %%mm6     \n\t"
863
        ::"m"(red_15mask),"m"(green_15mask));
864
    mm_end = end - 15;
865
    while (s < mm_end)
866
    {
867
        asm volatile(
868
        PREFETCH"   32%1            \n\t"
869
        "movd         %1, %%mm0     \n\t"
870
        "movd        3%1, %%mm3     \n\t"
871
        "punpckldq   6%1, %%mm0     \n\t"
872
        "punpckldq   9%1, %%mm3     \n\t"
873
        "movq      %%mm0, %%mm1     \n\t"
874
        "movq      %%mm0, %%mm2     \n\t"
875
        "movq      %%mm3, %%mm4     \n\t"
876
        "movq      %%mm3, %%mm5     \n\t"
877
        "psllq        $7, %%mm0     \n\t"
878
        "psllq        $7, %%mm3     \n\t"
879
        "pand      %%mm7, %%mm0     \n\t"
880
        "pand      %%mm7, %%mm3     \n\t"
881
        "psrlq        $6, %%mm1     \n\t"
882
        "psrlq        $6, %%mm4     \n\t"
883
        "pand      %%mm6, %%mm1     \n\t"
884
        "pand      %%mm6, %%mm4     \n\t"
885
        "psrlq       $19, %%mm2     \n\t"
886
        "psrlq       $19, %%mm5     \n\t"
887
        "pand         %2, %%mm2     \n\t"
888
        "pand         %2, %%mm5     \n\t"
889
        "por       %%mm1, %%mm0     \n\t"
890
        "por       %%mm4, %%mm3     \n\t"
891
        "por       %%mm2, %%mm0     \n\t"
892
        "por       %%mm5, %%mm3     \n\t"
893
        "psllq       $16, %%mm3     \n\t"
894
        "por       %%mm3, %%mm0     \n\t"
895
        MOVNTQ"    %%mm0, %0        \n\t"
896
        :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
897
        d += 4;
898
        s += 12;
899
    }
900
    asm volatile(SFENCE:::"memory");
901
    asm volatile(EMMS:::"memory");
902
#endif
903
    while (s < end)
904
    {
905
        const int r = *s++;
906
        const int g = *s++;
907
        const int b = *s++;
908
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
909
    }
910
}
911

    
912
/*
913
  I use a less accurate approximation here by simply left-shifting the input
914
  value and filling the low order bits with zeroes. This method improves PNG
915
  compression but this scheme cannot reproduce white exactly, since it does
916
  not generate an all-ones maximum value; the net effect is to darken the
917
  image slightly.
918

919
  A better method would be "left bit replication":
920

921
   4 3 2 1 0
922
   ---------
923
   1 1 0 1 1
924

925
   7 6 5 4 3  2 1 0
926
   ----------------
927
   1 1 0 1 1  1 1 0
928
   |=======|  |===|
929
       |      leftmost bits repeated to fill open bits
930
       |
931
   original bits
932
*/
933
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
934
{
935
    const uint16_t *end;
936
#ifdef HAVE_MMX
937
    const uint16_t *mm_end;
938
#endif
939
    uint8_t *d = dst;
940
    const uint16_t *s = (const uint16_t*)src;
941
    end = s + src_size/2;
942
#ifdef HAVE_MMX
943
    asm volatile(PREFETCH"    %0"::"m"(*s):"memory");
944
    mm_end = end - 7;
945
    while (s < mm_end)
946
    {
947
        asm volatile(
948
        PREFETCH"    32%1           \n\t"
949
        "movq          %1, %%mm0    \n\t"
950
        "movq          %1, %%mm1    \n\t"
951
        "movq          %1, %%mm2    \n\t"
952
        "pand          %2, %%mm0    \n\t"
953
        "pand          %3, %%mm1    \n\t"
954
        "pand          %4, %%mm2    \n\t"
955
        "psllq         $3, %%mm0    \n\t"
956
        "psrlq         $2, %%mm1    \n\t"
957
        "psrlq         $7, %%mm2    \n\t"
958
        "movq       %%mm0, %%mm3    \n\t"
959
        "movq       %%mm1, %%mm4    \n\t"
960
        "movq       %%mm2, %%mm5    \n\t"
961
        "punpcklwd     %5, %%mm0    \n\t"
962
        "punpcklwd     %5, %%mm1    \n\t"
963
        "punpcklwd     %5, %%mm2    \n\t"
964
        "punpckhwd     %5, %%mm3    \n\t"
965
        "punpckhwd     %5, %%mm4    \n\t"
966
        "punpckhwd     %5, %%mm5    \n\t"
967
        "psllq         $8, %%mm1    \n\t"
968
        "psllq        $16, %%mm2    \n\t"
969
        "por        %%mm1, %%mm0    \n\t"
970
        "por        %%mm2, %%mm0    \n\t"
971
        "psllq         $8, %%mm4    \n\t"
972
        "psllq        $16, %%mm5    \n\t"
973
        "por        %%mm4, %%mm3    \n\t"
974
        "por        %%mm5, %%mm3    \n\t"
975

    
976
        "movq       %%mm0, %%mm6    \n\t"
977
        "movq       %%mm3, %%mm7    \n\t"
978

    
979
        "movq         8%1, %%mm0    \n\t"
980
        "movq         8%1, %%mm1    \n\t"
981
        "movq         8%1, %%mm2    \n\t"
982
        "pand          %2, %%mm0    \n\t"
983
        "pand          %3, %%mm1    \n\t"
984
        "pand          %4, %%mm2    \n\t"
985
        "psllq         $3, %%mm0    \n\t"
986
        "psrlq         $2, %%mm1    \n\t"
987
        "psrlq         $7, %%mm2    \n\t"
988
        "movq       %%mm0, %%mm3    \n\t"
989
        "movq       %%mm1, %%mm4    \n\t"
990
        "movq       %%mm2, %%mm5    \n\t"
991
        "punpcklwd     %5, %%mm0    \n\t"
992
        "punpcklwd     %5, %%mm1    \n\t"
993
        "punpcklwd     %5, %%mm2    \n\t"
994
        "punpckhwd     %5, %%mm3    \n\t"
995
        "punpckhwd     %5, %%mm4    \n\t"
996
        "punpckhwd     %5, %%mm5    \n\t"
997
        "psllq         $8, %%mm1    \n\t"
998
        "psllq        $16, %%mm2    \n\t"
999
        "por        %%mm1, %%mm0    \n\t"
1000
        "por        %%mm2, %%mm0    \n\t"
1001
        "psllq         $8, %%mm4    \n\t"
1002
        "psllq        $16, %%mm5    \n\t"
1003
        "por        %%mm4, %%mm3    \n\t"
1004
        "por        %%mm5, %%mm3    \n\t"
1005

    
1006
        :"=m"(*d)
1007
        :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
1008
        :"memory");
1009
        /* borrowed 32 to 24 */
1010
        asm volatile(
1011
        "movq       %%mm0, %%mm4    \n\t"
1012
        "movq       %%mm3, %%mm5    \n\t"
1013
        "movq       %%mm6, %%mm0    \n\t"
1014
        "movq       %%mm7, %%mm1    \n\t"
1015

    
1016
        "movq       %%mm4, %%mm6    \n\t"
1017
        "movq       %%mm5, %%mm7    \n\t"
1018
        "movq       %%mm0, %%mm2    \n\t"
1019
        "movq       %%mm1, %%mm3    \n\t"
1020

    
1021
        "psrlq         $8, %%mm2    \n\t"
1022
        "psrlq         $8, %%mm3    \n\t"
1023
        "psrlq         $8, %%mm6    \n\t"
1024
        "psrlq         $8, %%mm7    \n\t"
1025
        "pand          %2, %%mm0    \n\t"
1026
        "pand          %2, %%mm1    \n\t"
1027
        "pand          %2, %%mm4    \n\t"
1028
        "pand          %2, %%mm5    \n\t"
1029
        "pand          %3, %%mm2    \n\t"
1030
        "pand          %3, %%mm3    \n\t"
1031
        "pand          %3, %%mm6    \n\t"
1032
        "pand          %3, %%mm7    \n\t"
1033
        "por        %%mm2, %%mm0    \n\t"
1034
        "por        %%mm3, %%mm1    \n\t"
1035
        "por        %%mm6, %%mm4    \n\t"
1036
        "por        %%mm7, %%mm5    \n\t"
1037

    
1038
        "movq       %%mm1, %%mm2    \n\t"
1039
        "movq       %%mm4, %%mm3    \n\t"
1040
        "psllq        $48, %%mm2    \n\t"
1041
        "psllq        $32, %%mm3    \n\t"
1042
        "pand          %4, %%mm2    \n\t"
1043
        "pand          %5, %%mm3    \n\t"
1044
        "por        %%mm2, %%mm0    \n\t"
1045
        "psrlq        $16, %%mm1    \n\t"
1046
        "psrlq        $32, %%mm4    \n\t"
1047
        "psllq        $16, %%mm5    \n\t"
1048
        "por        %%mm3, %%mm1    \n\t"
1049
        "pand          %6, %%mm5    \n\t"
1050
        "por        %%mm5, %%mm4    \n\t"
1051

    
1052
        MOVNTQ"     %%mm0,   %0     \n\t"
1053
        MOVNTQ"     %%mm1,  8%0     \n\t"
1054
        MOVNTQ"     %%mm4, 16%0"
1055

    
1056
        :"=m"(*d)
1057
        :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1058
        :"memory");
1059
        d += 24;
1060
        s += 8;
1061
    }
1062
    asm volatile(SFENCE:::"memory");
1063
    asm volatile(EMMS:::"memory");
1064
#endif
1065
    while (s < end)
1066
    {
1067
        register uint16_t bgr;
1068
        bgr = *s++;
1069
        *d++ = (bgr&0x1F)<<3;
1070
        *d++ = (bgr&0x3E0)>>2;
1071
        *d++ = (bgr&0x7C00)>>7;
1072
    }
1073
}
1074

    
1075
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1076
{
1077
    const uint16_t *end;
1078
#ifdef HAVE_MMX
1079
    const uint16_t *mm_end;
1080
#endif
1081
    uint8_t *d = (uint8_t *)dst;
1082
    const uint16_t *s = (const uint16_t *)src;
1083
    end = s + src_size/2;
1084
#ifdef HAVE_MMX
1085
    asm volatile(PREFETCH"    %0"::"m"(*s):"memory");
1086
    mm_end = end - 7;
1087
    while (s < mm_end)
1088
    {
1089
        asm volatile(
1090
        PREFETCH"    32%1           \n\t"
1091
        "movq          %1, %%mm0    \n\t"
1092
        "movq          %1, %%mm1    \n\t"
1093
        "movq          %1, %%mm2    \n\t"
1094
        "pand          %2, %%mm0    \n\t"
1095
        "pand          %3, %%mm1    \n\t"
1096
        "pand          %4, %%mm2    \n\t"
1097
        "psllq         $3, %%mm0    \n\t"
1098
        "psrlq         $3, %%mm1    \n\t"
1099
        "psrlq         $8, %%mm2    \n\t"
1100
        "movq       %%mm0, %%mm3    \n\t"
1101
        "movq       %%mm1, %%mm4    \n\t"
1102
        "movq       %%mm2, %%mm5    \n\t"
1103
        "punpcklwd     %5, %%mm0    \n\t"
1104
        "punpcklwd     %5, %%mm1    \n\t"
1105
        "punpcklwd     %5, %%mm2    \n\t"
1106
        "punpckhwd     %5, %%mm3    \n\t"
1107
        "punpckhwd     %5, %%mm4    \n\t"
1108
        "punpckhwd     %5, %%mm5    \n\t"
1109
        "psllq         $8, %%mm1    \n\t"
1110
        "psllq        $16, %%mm2    \n\t"
1111
        "por        %%mm1, %%mm0    \n\t"
1112
        "por        %%mm2, %%mm0    \n\t"
1113
        "psllq         $8, %%mm4    \n\t"
1114
        "psllq        $16, %%mm5    \n\t"
1115
        "por        %%mm4, %%mm3    \n\t"
1116
        "por        %%mm5, %%mm3    \n\t"
1117

    
1118
        "movq       %%mm0, %%mm6    \n\t"
1119
        "movq       %%mm3, %%mm7    \n\t"
1120

    
1121
        "movq         8%1, %%mm0    \n\t"
1122
        "movq         8%1, %%mm1    \n\t"
1123
        "movq         8%1, %%mm2    \n\t"
1124
        "pand          %2, %%mm0    \n\t"
1125
        "pand          %3, %%mm1    \n\t"
1126
        "pand          %4, %%mm2    \n\t"
1127
        "psllq         $3, %%mm0    \n\t"
1128
        "psrlq         $3, %%mm1    \n\t"
1129
        "psrlq         $8, %%mm2    \n\t"
1130
        "movq       %%mm0, %%mm3    \n\t"
1131
        "movq       %%mm1, %%mm4    \n\t"
1132
        "movq       %%mm2, %%mm5    \n\t"
1133
        "punpcklwd     %5, %%mm0    \n\t"
1134
        "punpcklwd     %5, %%mm1    \n\t"
1135
        "punpcklwd     %5, %%mm2    \n\t"
1136
        "punpckhwd     %5, %%mm3    \n\t"
1137
        "punpckhwd     %5, %%mm4    \n\t"
1138
        "punpckhwd     %5, %%mm5    \n\t"
1139
        "psllq         $8, %%mm1    \n\t"
1140
        "psllq        $16, %%mm2    \n\t"
1141
        "por        %%mm1, %%mm0    \n\t"
1142
        "por        %%mm2, %%mm0    \n\t"
1143
        "psllq         $8, %%mm4    \n\t"
1144
        "psllq        $16, %%mm5    \n\t"
1145
        "por        %%mm4, %%mm3    \n\t"
1146
        "por        %%mm5, %%mm3    \n\t"
1147
        :"=m"(*d)
1148
        :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1149
        :"memory");
1150
        /* borrowed 32 to 24 */
1151
        asm volatile(
1152
        "movq       %%mm0, %%mm4    \n\t"
1153
        "movq       %%mm3, %%mm5    \n\t"
1154
        "movq       %%mm6, %%mm0    \n\t"
1155
        "movq       %%mm7, %%mm1    \n\t"
1156

    
1157
        "movq       %%mm4, %%mm6    \n\t"
1158
        "movq       %%mm5, %%mm7    \n\t"
1159
        "movq       %%mm0, %%mm2    \n\t"
1160
        "movq       %%mm1, %%mm3    \n\t"
1161

    
1162
        "psrlq         $8, %%mm2    \n\t"
1163
        "psrlq         $8, %%mm3    \n\t"
1164
        "psrlq         $8, %%mm6    \n\t"
1165
        "psrlq         $8, %%mm7    \n\t"
1166
        "pand          %2, %%mm0    \n\t"
1167
        "pand          %2, %%mm1    \n\t"
1168
        "pand          %2, %%mm4    \n\t"
1169
        "pand          %2, %%mm5    \n\t"
1170
        "pand          %3, %%mm2    \n\t"
1171
        "pand          %3, %%mm3    \n\t"
1172
        "pand          %3, %%mm6    \n\t"
1173
        "pand          %3, %%mm7    \n\t"
1174
        "por        %%mm2, %%mm0    \n\t"
1175
        "por        %%mm3, %%mm1    \n\t"
1176
        "por        %%mm6, %%mm4    \n\t"
1177
        "por        %%mm7, %%mm5    \n\t"
1178

    
1179
        "movq       %%mm1, %%mm2    \n\t"
1180
        "movq       %%mm4, %%mm3    \n\t"
1181
        "psllq        $48, %%mm2    \n\t"
1182
        "psllq        $32, %%mm3    \n\t"
1183
        "pand          %4, %%mm2    \n\t"
1184
        "pand          %5, %%mm3    \n\t"
1185
        "por        %%mm2, %%mm0    \n\t"
1186
        "psrlq        $16, %%mm1    \n\t"
1187
        "psrlq        $32, %%mm4    \n\t"
1188
        "psllq        $16, %%mm5    \n\t"
1189
        "por        %%mm3, %%mm1    \n\t"
1190
        "pand          %6, %%mm5    \n\t"
1191
        "por        %%mm5, %%mm4    \n\t"
1192

    
1193
        MOVNTQ"     %%mm0,   %0     \n\t"
1194
        MOVNTQ"     %%mm1,  8%0     \n\t"
1195
        MOVNTQ"     %%mm4, 16%0"
1196

    
1197
        :"=m"(*d)
1198
        :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1199
        :"memory");
1200
        d += 24;
1201
        s += 8;
1202
    }
1203
    asm volatile(SFENCE:::"memory");
1204
    asm volatile(EMMS:::"memory");
1205
#endif
1206
    while (s < end)
1207
    {
1208
        register uint16_t bgr;
1209
        bgr = *s++;
1210
        *d++ = (bgr&0x1F)<<3;
1211
        *d++ = (bgr&0x7E0)>>3;
1212
        *d++ = (bgr&0xF800)>>8;
1213
    }
1214
}
1215

    
1216
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1217
{
1218
    const uint16_t *end;
1219
#ifdef HAVE_MMX
1220
    const uint16_t *mm_end;
1221
#endif
1222
    uint8_t *d = dst;
1223
    const uint16_t *s = (const uint16_t *)src;
1224
    end = s + src_size/2;
1225
#ifdef HAVE_MMX
1226
    asm volatile(PREFETCH"    %0"::"m"(*s):"memory");
1227
    asm volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
1228
    mm_end = end - 3;
1229
    while (s < mm_end)
1230
    {
1231
        asm volatile(
1232
        PREFETCH"    32%1           \n\t"
1233
        "movq          %1, %%mm0    \n\t"
1234
        "movq          %1, %%mm1    \n\t"
1235
        "movq          %1, %%mm2    \n\t"
1236
        "pand          %2, %%mm0    \n\t"
1237
        "pand          %3, %%mm1    \n\t"
1238
        "pand          %4, %%mm2    \n\t"
1239
        "psllq         $3, %%mm0    \n\t"
1240
        "psrlq         $2, %%mm1    \n\t"
1241
        "psrlq         $7, %%mm2    \n\t"
1242
        "movq       %%mm0, %%mm3    \n\t"
1243
        "movq       %%mm1, %%mm4    \n\t"
1244
        "movq       %%mm2, %%mm5    \n\t"
1245
        "punpcklwd  %%mm7, %%mm0    \n\t"
1246
        "punpcklwd  %%mm7, %%mm1    \n\t"
1247
        "punpcklwd  %%mm7, %%mm2    \n\t"
1248
        "punpckhwd  %%mm7, %%mm3    \n\t"
1249
        "punpckhwd  %%mm7, %%mm4    \n\t"
1250
        "punpckhwd  %%mm7, %%mm5    \n\t"
1251
        "psllq         $8, %%mm1    \n\t"
1252
        "psllq        $16, %%mm2    \n\t"
1253
        "por        %%mm1, %%mm0    \n\t"
1254
        "por        %%mm2, %%mm0    \n\t"
1255
        "psllq         $8, %%mm4    \n\t"
1256
        "psllq        $16, %%mm5    \n\t"
1257
        "por        %%mm4, %%mm3    \n\t"
1258
        "por        %%mm5, %%mm3    \n\t"
1259
        MOVNTQ"     %%mm0,  %0      \n\t"
1260
        MOVNTQ"     %%mm3, 8%0      \n\t"
1261
        :"=m"(*d)
1262
        :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1263
        :"memory");
1264
        d += 16;
1265
        s += 4;
1266
    }
1267
    asm volatile(SFENCE:::"memory");
1268
    asm volatile(EMMS:::"memory");
1269
#endif
1270
    while (s < end)
1271
    {
1272
#if 0 //slightly slower on Athlon
1273
        int bgr= *s++;
1274
        *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1275
#else
1276
        register uint16_t bgr;
1277
        bgr = *s++;
1278
#ifdef WORDS_BIGENDIAN
1279
        *d++ = 0;
1280
        *d++ = (bgr&0x7C00)>>7;
1281
        *d++ = (bgr&0x3E0)>>2;
1282
        *d++ = (bgr&0x1F)<<3;
1283
#else
1284
        *d++ = (bgr&0x1F)<<3;
1285
        *d++ = (bgr&0x3E0)>>2;
1286
        *d++ = (bgr&0x7C00)>>7;
1287
        *d++ = 0;
1288
#endif
1289

    
1290
#endif
1291
    }
1292
}
1293

    
1294
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1295
{
1296
    const uint16_t *end;
1297
#ifdef HAVE_MMX
1298
    const uint16_t *mm_end;
1299
#endif
1300
    uint8_t *d = dst;
1301
    const uint16_t *s = (const uint16_t*)src;
1302
    end = s + src_size/2;
1303
#ifdef HAVE_MMX
1304
    asm volatile(PREFETCH"    %0"::"m"(*s):"memory");
1305
    asm volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
1306
    mm_end = end - 3;
1307
    while (s < mm_end)
1308
    {
1309
        asm volatile(
1310
        PREFETCH"    32%1           \n\t"
1311
        "movq          %1, %%mm0    \n\t"
1312
        "movq          %1, %%mm1    \n\t"
1313
        "movq          %1, %%mm2    \n\t"
1314
        "pand          %2, %%mm0    \n\t"
1315
        "pand          %3, %%mm1    \n\t"
1316
        "pand          %4, %%mm2    \n\t"
1317
        "psllq         $3, %%mm0    \n\t"
1318
        "psrlq         $3, %%mm1    \n\t"
1319
        "psrlq         $8, %%mm2    \n\t"
1320
        "movq       %%mm0, %%mm3    \n\t"
1321
        "movq       %%mm1, %%mm4    \n\t"
1322
        "movq       %%mm2, %%mm5    \n\t"
1323
        "punpcklwd  %%mm7, %%mm0    \n\t"
1324
        "punpcklwd  %%mm7, %%mm1    \n\t"
1325
        "punpcklwd  %%mm7, %%mm2    \n\t"
1326
        "punpckhwd  %%mm7, %%mm3    \n\t"
1327
        "punpckhwd  %%mm7, %%mm4    \n\t"
1328
        "punpckhwd  %%mm7, %%mm5    \n\t"
1329
        "psllq         $8, %%mm1    \n\t"
1330
        "psllq        $16, %%mm2    \n\t"
1331
        "por        %%mm1, %%mm0    \n\t"
1332
        "por        %%mm2, %%mm0    \n\t"
1333
        "psllq         $8, %%mm4    \n\t"
1334
        "psllq        $16, %%mm5    \n\t"
1335
        "por        %%mm4, %%mm3    \n\t"
1336
        "por        %%mm5, %%mm3    \n\t"
1337
        MOVNTQ"     %%mm0, %0       \n\t"
1338
        MOVNTQ"     %%mm3, 8%0      \n\t"
1339
        :"=m"(*d)
1340
        :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1341
        :"memory");
1342
        d += 16;
1343
        s += 4;
1344
    }
1345
    asm volatile(SFENCE:::"memory");
1346
    asm volatile(EMMS:::"memory");
1347
#endif
1348
    while (s < end)
1349
    {
1350
        register uint16_t bgr;
1351
        bgr = *s++;
1352
#ifdef WORDS_BIGENDIAN
1353
        *d++ = 0;
1354
        *d++ = (bgr&0xF800)>>8;
1355
        *d++ = (bgr&0x7E0)>>3;
1356
        *d++ = (bgr&0x1F)<<3;
1357
#else
1358
        *d++ = (bgr&0x1F)<<3;
1359
        *d++ = (bgr&0x7E0)>>3;
1360
        *d++ = (bgr&0xF800)>>8;
1361
        *d++ = 0;
1362
#endif
1363
    }
1364
}
1365

    
1366
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1367
{
1368
    long idx = 15 - src_size;
1369
    const uint8_t *s = src-idx;
1370
    uint8_t *d = dst-idx;
1371
#ifdef HAVE_MMX
1372
    asm volatile(
1373
    "test          %0, %0           \n\t"
1374
    "jns           2f               \n\t"
1375
    PREFETCH"       (%1, %0)        \n\t"
1376
    "movq          %3, %%mm7        \n\t"
1377
    "pxor          %4, %%mm7        \n\t"
1378
    "movq       %%mm7, %%mm6        \n\t"
1379
    "pxor          %5, %%mm7        \n\t"
1380
    ASMALIGN(4)
1381
    "1:                             \n\t"
1382
    PREFETCH"     32(%1, %0)        \n\t"
1383
    "movq           (%1, %0), %%mm0 \n\t"
1384
    "movq          8(%1, %0), %%mm1 \n\t"
1385
# ifdef HAVE_MMX2
1386
    "pshufw      $177, %%mm0, %%mm3 \n\t"
1387
    "pshufw      $177, %%mm1, %%mm5 \n\t"
1388
    "pand       %%mm7, %%mm0        \n\t"
1389
    "pand       %%mm6, %%mm3        \n\t"
1390
    "pand       %%mm7, %%mm1        \n\t"
1391
    "pand       %%mm6, %%mm5        \n\t"
1392
    "por        %%mm3, %%mm0        \n\t"
1393
    "por        %%mm5, %%mm1        \n\t"
1394
# else
1395
    "movq       %%mm0, %%mm2        \n\t"
1396
    "movq       %%mm1, %%mm4        \n\t"
1397
    "pand       %%mm7, %%mm0        \n\t"
1398
    "pand       %%mm6, %%mm2        \n\t"
1399
    "pand       %%mm7, %%mm1        \n\t"
1400
    "pand       %%mm6, %%mm4        \n\t"
1401
    "movq       %%mm2, %%mm3        \n\t"
1402
    "movq       %%mm4, %%mm5        \n\t"
1403
    "pslld        $16, %%mm2        \n\t"
1404
    "psrld        $16, %%mm3        \n\t"
1405
    "pslld        $16, %%mm4        \n\t"
1406
    "psrld        $16, %%mm5        \n\t"
1407
    "por        %%mm2, %%mm0        \n\t"
1408
    "por        %%mm4, %%mm1        \n\t"
1409
    "por        %%mm3, %%mm0        \n\t"
1410
    "por        %%mm5, %%mm1        \n\t"
1411
# endif
1412
    MOVNTQ"     %%mm0,  (%2, %0)    \n\t"
1413
    MOVNTQ"     %%mm1, 8(%2, %0)    \n\t"
1414
    "add          $16, %0           \n\t"
1415
    "js            1b               \n\t"
1416
    SFENCE"                         \n\t"
1417
    EMMS"                           \n\t"
1418
    "2:                             \n\t"
1419
    : "+&r"(idx)
1420
    : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1421
    : "memory");
1422
#endif
1423
    for (; idx<15; idx+=4) {
1424
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1425
        v &= 0xff00ff;
1426
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1427
    }
1428
}
1429

    
1430
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1431
{
1432
    unsigned i;
1433
#ifdef HAVE_MMX
1434
    long mmx_size= 23 - src_size;
1435
    asm volatile (
1436
    "test             %%"REG_a", %%"REG_a"          \n\t"
1437
    "jns                     2f                     \n\t"
1438
    "movq     "MANGLE(mask24r)", %%mm5              \n\t"
1439
    "movq     "MANGLE(mask24g)", %%mm6              \n\t"
1440
    "movq     "MANGLE(mask24b)", %%mm7              \n\t"
1441
    ASMALIGN(4)
1442
    "1:                                             \n\t"
1443
    PREFETCH" 32(%1, %%"REG_a")                     \n\t"
1444
    "movq       (%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
1445
    "movq       (%1, %%"REG_a"), %%mm1              \n\t" // BGR BGR BG
1446
    "movq      2(%1, %%"REG_a"), %%mm2              \n\t" // R BGR BGR B
1447
    "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
1448
    "pand                 %%mm5, %%mm0              \n\t"
1449
    "pand                 %%mm6, %%mm1              \n\t"
1450
    "pand                 %%mm7, %%mm2              \n\t"
1451
    "por                  %%mm0, %%mm1              \n\t"
1452
    "por                  %%mm2, %%mm1              \n\t"
1453
    "movq      6(%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
1454
    MOVNTQ"               %%mm1,   (%2, %%"REG_a")  \n\t" // RGB RGB RG
1455
    "movq      8(%1, %%"REG_a"), %%mm1              \n\t" // R BGR BGR B
1456
    "movq     10(%1, %%"REG_a"), %%mm2              \n\t" // GR BGR BGR
1457
    "pand                 %%mm7, %%mm0              \n\t"
1458
    "pand                 %%mm5, %%mm1              \n\t"
1459
    "pand                 %%mm6, %%mm2              \n\t"
1460
    "por                  %%mm0, %%mm1              \n\t"
1461
    "por                  %%mm2, %%mm1              \n\t"
1462
    "movq     14(%1, %%"REG_a"), %%mm0              \n\t" // R BGR BGR B
1463
    MOVNTQ"               %%mm1,  8(%2, %%"REG_a")  \n\t" // B RGB RGB R
1464
    "movq     16(%1, %%"REG_a"), %%mm1              \n\t" // GR BGR BGR
1465
    "movq     18(%1, %%"REG_a"), %%mm2              \n\t" // BGR BGR BG
1466
    "pand                 %%mm6, %%mm0              \n\t"
1467
    "pand                 %%mm7, %%mm1              \n\t"
1468
    "pand                 %%mm5, %%mm2              \n\t"
1469
    "por                  %%mm0, %%mm1              \n\t"
1470
    "por                  %%mm2, %%mm1              \n\t"
1471
    MOVNTQ"               %%mm1, 16(%2, %%"REG_a")  \n\t"
1472
    "add                    $24, %%"REG_a"          \n\t"
1473
    " js                     1b                     \n\t"
1474
    "2:                                             \n\t"
1475
    : "+a" (mmx_size)
1476
    : "r" (src-mmx_size), "r"(dst-mmx_size)
1477
    );
1478

    
1479
    asm volatile(SFENCE:::"memory");
1480
    asm volatile(EMMS:::"memory");
1481

    
1482
    if (mmx_size==23) return; //finished, was multiple of 8
1483

    
1484
    src+= src_size;
1485
    dst+= src_size;
1486
    src_size= 23-mmx_size;
1487
    src-= src_size;
1488
    dst-= src_size;
1489
#endif
1490
    for (i=0; i<src_size; i+=3)
1491
    {
1492
        register uint8_t x;
1493
        x          = src[i + 2];
1494
        dst[i + 1] = src[i + 1];
1495
        dst[i + 2] = src[i + 0];
1496
        dst[i + 0] = x;
1497
    }
1498
}
1499

    
1500
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1501
                                           long width, long height,
1502
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1503
{
1504
    long y;
1505
    const long chromWidth= width>>1;
1506
    for (y=0; y<height; y++)
1507
    {
1508
#ifdef HAVE_MMX
1509
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1510
        asm volatile(
1511
        "xor                 %%"REG_a", %%"REG_a"   \n\t"
1512
        ASMALIGN(4)
1513
        "1:                                         \n\t"
1514
        PREFETCH"    32(%1, %%"REG_a", 2)           \n\t"
1515
        PREFETCH"    32(%2, %%"REG_a")              \n\t"
1516
        PREFETCH"    32(%3, %%"REG_a")              \n\t"
1517
        "movq          (%2, %%"REG_a"), %%mm0       \n\t" // U(0)
1518
        "movq                    %%mm0, %%mm2       \n\t" // U(0)
1519
        "movq          (%3, %%"REG_a"), %%mm1       \n\t" // V(0)
1520
        "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
1521
        "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)
1522

    
1523
        "movq        (%1, %%"REG_a",2), %%mm3       \n\t" // Y(0)
1524
        "movq       8(%1, %%"REG_a",2), %%mm5       \n\t" // Y(8)
1525
        "movq                    %%mm3, %%mm4       \n\t" // Y(0)
1526
        "movq                    %%mm5, %%mm6       \n\t" // Y(8)
1527
        "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
1528
        "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
1529
        "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
1530
        "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)
1531

    
1532
        MOVNTQ"                  %%mm3,   (%0, %%"REG_a", 4)    \n\t"
1533
        MOVNTQ"                  %%mm4,  8(%0, %%"REG_a", 4)    \n\t"
1534
        MOVNTQ"                  %%mm5, 16(%0, %%"REG_a", 4)    \n\t"
1535
        MOVNTQ"                  %%mm6, 24(%0, %%"REG_a", 4)    \n\t"
1536

    
1537
        "add                        $8, %%"REG_a"   \n\t"
1538
        "cmp                        %4, %%"REG_a"   \n\t"
1539
        " jb                        1b              \n\t"
1540
        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1541
        : "%"REG_a
1542
        );
1543
#else
1544

    
1545
#if defined ARCH_ALPHA && defined HAVE_MVI
1546
#define pl2yuy2(n)                  \
1547
    y1 = yc[n];                     \
1548
    y2 = yc2[n];                    \
1549
    u = uc[n];                      \
1550
    v = vc[n];                      \
1551
    asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));  \
1552
    asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));  \
1553
    asm("unpkbl %1, %0" : "=r"(u) : "r"(u));    \
1554
    asm("unpkbl %1, %0" : "=r"(v) : "r"(v));    \
1555
    yuv1 = (u << 8) + (v << 24);                \
1556
    yuv2 = yuv1 + y2;               \
1557
    yuv1 += y1;                     \
1558
    qdst[n]  = yuv1;                \
1559
    qdst2[n] = yuv2;
1560

    
1561
        int i;
1562
        uint64_t *qdst = (uint64_t *) dst;
1563
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1564
        const uint32_t *yc = (uint32_t *) ysrc;
1565
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1566
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1567
        for (i = 0; i < chromWidth; i += 8){
1568
            uint64_t y1, y2, yuv1, yuv2;
1569
            uint64_t u, v;
1570
            /* Prefetch */
1571
            asm("ldq $31,64(%0)" :: "r"(yc));
1572
            asm("ldq $31,64(%0)" :: "r"(yc2));
1573
            asm("ldq $31,64(%0)" :: "r"(uc));
1574
            asm("ldq $31,64(%0)" :: "r"(vc));
1575

    
1576
            pl2yuy2(0);
1577
            pl2yuy2(1);
1578
            pl2yuy2(2);
1579
            pl2yuy2(3);
1580

    
1581
            yc    += 4;
1582
            yc2   += 4;
1583
            uc    += 4;
1584
            vc    += 4;
1585
            qdst  += 4;
1586
            qdst2 += 4;
1587
        }
1588
        y++;
1589
        ysrc += lumStride;
1590
        dst += dstStride;
1591

    
1592
#elif __WORDSIZE >= 64
1593
        int i;
1594
        uint64_t *ldst = (uint64_t *) dst;
1595
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1596
        for (i = 0; i < chromWidth; i += 2){
1597
            uint64_t k, l;
1598
            k = yc[0] + (uc[0] << 8) +
1599
                (yc[1] << 16) + (vc[0] << 24);
1600
            l = yc[2] + (uc[1] << 8) +
1601
                (yc[3] << 16) + (vc[1] << 24);
1602
            *ldst++ = k + (l << 32);
1603
            yc += 4;
1604
            uc += 2;
1605
            vc += 2;
1606
        }
1607

    
1608
#else
1609
        int i, *idst = (int32_t *) dst;
1610
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1611
        for (i = 0; i < chromWidth; i++){
1612
#ifdef WORDS_BIGENDIAN
1613
            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1614
                (yc[1] << 8) + (vc[0] << 0);
1615
#else
1616
            *idst++ = yc[0] + (uc[0] << 8) +
1617
                (yc[1] << 16) + (vc[0] << 24);
1618
#endif
1619
            yc += 2;
1620
            uc++;
1621
            vc++;
1622
        }
1623
#endif
1624
#endif
1625
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
1626
        {
1627
            usrc += chromStride;
1628
            vsrc += chromStride;
1629
        }
1630
        ysrc += lumStride;
1631
        dst  += dstStride;
1632
    }
1633
#ifdef HAVE_MMX
1634
asm(    EMMS"       \n\t"
1635
        SFENCE"     \n\t"
1636
        :::"memory");
1637
#endif
1638
}
1639

    
1640
/**
1641
 * Height should be a multiple of 2 and width should be a multiple of 16.
1642
 * (If this is a problem for anyone then tell me, and I will fix it.)
1643
 */
1644
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1645
                                      long width, long height,
1646
                                      long lumStride, long chromStride, long dstStride)
1647
{
1648
    //FIXME interpolate chroma
1649
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1650
}
1651

    
1652
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1653
                                           long width, long height,
1654
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1655
{
1656
    long y;
1657
    const long chromWidth= width>>1;
1658
    for (y=0; y<height; y++)
1659
    {
1660
#ifdef HAVE_MMX
1661
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1662
        asm volatile(
1663
        "xor                %%"REG_a", %%"REG_a"    \n\t"
1664
        ASMALIGN(4)
1665
        "1:                                         \n\t"
1666
        PREFETCH"   32(%1, %%"REG_a", 2)            \n\t"
1667
        PREFETCH"   32(%2, %%"REG_a")               \n\t"
1668
        PREFETCH"   32(%3, %%"REG_a")               \n\t"
1669
        "movq         (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
1670
        "movq                   %%mm0, %%mm2        \n\t" // U(0)
1671
        "movq         (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
1672
        "punpcklbw              %%mm1, %%mm0        \n\t" // UVUV UVUV(0)
1673
        "punpckhbw              %%mm1, %%mm2        \n\t" // UVUV UVUV(8)
1674

    
1675
        "movq       (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
1676
        "movq      8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
1677
        "movq                   %%mm0, %%mm4        \n\t" // Y(0)
1678
        "movq                   %%mm2, %%mm6        \n\t" // Y(8)
1679
        "punpcklbw              %%mm3, %%mm0        \n\t" // YUYV YUYV(0)
1680
        "punpckhbw              %%mm3, %%mm4        \n\t" // YUYV YUYV(4)
1681
        "punpcklbw              %%mm5, %%mm2        \n\t" // YUYV YUYV(8)
1682
        "punpckhbw              %%mm5, %%mm6        \n\t" // YUYV YUYV(12)
1683

    
1684
        MOVNTQ"                 %%mm0,   (%0, %%"REG_a", 4)     \n\t"
1685
        MOVNTQ"                 %%mm4,  8(%0, %%"REG_a", 4)     \n\t"
1686
        MOVNTQ"                 %%mm2, 16(%0, %%"REG_a", 4)     \n\t"
1687
        MOVNTQ"                 %%mm6, 24(%0, %%"REG_a", 4)     \n\t"
1688

    
1689
        "add                       $8, %%"REG_a"    \n\t"
1690
        "cmp                       %4, %%"REG_a"    \n\t"
1691
        " jb                       1b               \n\t"
1692
        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1693
        : "%"REG_a
1694
        );
1695
#else
1696
//FIXME adapt the Alpha ASM code from yv12->yuy2
1697

    
1698
#if __WORDSIZE >= 64
1699
        int i;
1700
        uint64_t *ldst = (uint64_t *) dst;
1701
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1702
        for (i = 0; i < chromWidth; i += 2){
1703
            uint64_t k, l;
1704
            k = uc[0] + (yc[0] << 8) +
1705
                (vc[0] << 16) + (yc[1] << 24);
1706
            l = uc[1] + (yc[2] << 8) +
1707
                (vc[1] << 16) + (yc[3] << 24);
1708
            *ldst++ = k + (l << 32);
1709
            yc += 4;
1710
            uc += 2;
1711
            vc += 2;
1712
        }
1713

    
1714
#else
1715
        int i, *idst = (int32_t *) dst;
1716
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1717
        for (i = 0; i < chromWidth; i++){
1718
#ifdef WORDS_BIGENDIAN
1719
            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1720
                (vc[0] << 8) + (yc[1] << 0);
1721
#else
1722
            *idst++ = uc[0] + (yc[0] << 8) +
1723
               (vc[0] << 16) + (yc[1] << 24);
1724
#endif
1725
            yc += 2;
1726
            uc++;
1727
            vc++;
1728
        }
1729
#endif
1730
#endif
1731
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
1732
        {
1733
            usrc += chromStride;
1734
            vsrc += chromStride;
1735
        }
1736
        ysrc += lumStride;
1737
        dst += dstStride;
1738
    }
1739
#ifdef HAVE_MMX
1740
asm(    EMMS"       \n\t"
1741
        SFENCE"     \n\t"
1742
        :::"memory");
1743
#endif
1744
}
1745

    
1746
/**
1747
 * Height should be a multiple of 2 and width should be a multiple of 16
1748
 * (If this is a problem for anyone then tell me, and I will fix it.)
1749
 */
1750
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1751
                                      long width, long height,
1752
                                      long lumStride, long chromStride, long dstStride)
1753
{
1754
    //FIXME interpolate chroma
1755
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1756
}
1757

    
1758
/**
1759
 * Width should be a multiple of 16.
1760
 */
1761
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1762
                                         long width, long height,
1763
                                         long lumStride, long chromStride, long dstStride)
1764
{
1765
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1766
}
1767

    
1768
/**
1769
 * Height should be a multiple of 2 and width should be a multiple of 16.
1770
 * (If this is a problem for anyone then tell me, and I will fix it.)
1771
 */
1772
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1773
                                      long width, long height,
1774
                                      long lumStride, long chromStride, long srcStride)
1775
{
1776
    long y;
1777
    const long chromWidth= width>>1;
1778
    for (y=0; y<height; y+=2)
1779
    {
1780
#ifdef HAVE_MMX
1781
        asm volatile(
1782
        "xor                 %%"REG_a", %%"REG_a"   \n\t"
1783
        "pcmpeqw                 %%mm7, %%mm7       \n\t"
1784
        "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00...
1785
        ASMALIGN(4)
1786
        "1:                \n\t"
1787
        PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
1788
        "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
1789
        "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
1790
        "movq                    %%mm0, %%mm2       \n\t" // YUYV YUYV(0)
1791
        "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(4)
1792
        "psrlw                      $8, %%mm0       \n\t" // U0V0 U0V0(0)
1793
        "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(4)
1794
        "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
1795
        "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
1796
        "packuswb                %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
1797
        "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(0)
1798

    
1799
        MOVNTQ"                  %%mm2, (%1, %%"REG_a", 2)  \n\t"
1800

    
1801
        "movq     16(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
1802
        "movq     24(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
1803
        "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(8)
1804
        "movq                    %%mm2, %%mm4       \n\t" // YUYV YUYV(12)
1805
        "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(8)
1806
        "psrlw                      $8, %%mm2       \n\t" // U0V0 U0V0(12)
1807
        "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(8)
1808
        "pand                    %%mm7, %%mm4       \n\t" // Y0Y0 Y0Y0(12)
1809
        "packuswb                %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
1810
        "packuswb                %%mm4, %%mm3       \n\t" // YYYY YYYY(8)
1811

    
1812
        MOVNTQ"                  %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1813

    
1814
        "movq                    %%mm0, %%mm2       \n\t" // UVUV UVUV(0)
1815
        "movq                    %%mm1, %%mm3       \n\t" // UVUV UVUV(8)
1816
        "psrlw                      $8, %%mm0       \n\t" // V0V0 V0V0(0)
1817
        "psrlw                      $8, %%mm1       \n\t" // V0V0 V0V0(8)
1818
        "pand                    %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
1819
        "pand                    %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
1820
        "packuswb                %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
1821
        "packuswb                %%mm3, %%mm2       \n\t" // UUUU UUUU(0)
1822

    
1823
        MOVNTQ"                  %%mm0, (%3, %%"REG_a")     \n\t"
1824
        MOVNTQ"                  %%mm2, (%2, %%"REG_a")     \n\t"
1825

    
1826
        "add                        $8, %%"REG_a"   \n\t"
1827
        "cmp                        %4, %%"REG_a"   \n\t"
1828
        " jb                        1b              \n\t"
1829
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1830
        : "memory", "%"REG_a
1831
        );
1832

    
1833
        ydst += lumStride;
1834
        src  += srcStride;
1835

    
1836
        asm volatile(
1837
        "xor                 %%"REG_a", %%"REG_a"   \n\t"
1838
        ASMALIGN(4)
1839
        "1:                                         \n\t"
1840
        PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
1841
        "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
1842
        "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
1843
        "movq     16(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
1844
        "movq     24(%0, %%"REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
1845
        "pand                    %%mm7, %%mm0       \n\t" // Y0Y0 Y0Y0(0)
1846
        "pand                    %%mm7, %%mm1       \n\t" // Y0Y0 Y0Y0(4)
1847
        "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(8)
1848
        "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(12)
1849
        "packuswb                %%mm1, %%mm0       \n\t" // YYYY YYYY(0)
1850
        "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(8)
1851

    
1852
        MOVNTQ"                  %%mm0,  (%1, %%"REG_a", 2) \n\t"
1853
        MOVNTQ"                  %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1854

    
1855
        "add                        $8, %%"REG_a"   \n\t"
1856
        "cmp                        %4, %%"REG_a"   \n\t"
1857
        " jb                        1b              \n\t"
1858

    
1859
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1860
        : "memory", "%"REG_a
1861
        );
1862
#else
1863
        long i;
1864
        for (i=0; i<chromWidth; i++)
1865
        {
1866
            ydst[2*i+0]     = src[4*i+0];
1867
            udst[i]     = src[4*i+1];
1868
            ydst[2*i+1]     = src[4*i+2];
1869
            vdst[i]     = src[4*i+3];
1870
        }
1871
        ydst += lumStride;
1872
        src  += srcStride;
1873

    
1874
        for (i=0; i<chromWidth; i++)
1875
        {
1876
            ydst[2*i+0]     = src[4*i+0];
1877
            ydst[2*i+1]     = src[4*i+2];
1878
        }
1879
#endif
1880
        udst += chromStride;
1881
        vdst += chromStride;
1882
        ydst += lumStride;
1883
        src  += srcStride;
1884
    }
1885
#ifdef HAVE_MMX
1886
asm volatile(   EMMS"       \n\t"
1887
                SFENCE"     \n\t"
1888
                :::"memory");
1889
#endif
1890
}
1891

    
1892
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1893
                                      uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1894
                                      long width, long height, long lumStride, long chromStride)
1895
{
1896
    /* Y Plane */
1897
    memcpy(ydst, ysrc, width*height);
1898

    
1899
    /* XXX: implement upscaling for U,V */
1900
}
1901

    
1902
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1903
{
1904
    long x,y;
1905

    
1906
    dst[0]= src[0];
1907

    
1908
    // first line
1909
    for (x=0; x<srcWidth-1; x++){
1910
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1911
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1912
    }
1913
    dst[2*srcWidth-1]= src[srcWidth-1];
1914

    
1915
        dst+= dstStride;
1916

    
1917
    for (y=1; y<srcHeight; y++){
1918
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1919
        const long mmxSize= srcWidth&~15;
1920
        asm volatile(
1921
        "mov           %4, %%"REG_a"            \n\t"
1922
        "1:                                     \n\t"
1923
        "movq         (%0, %%"REG_a"), %%mm0    \n\t"
1924
        "movq         (%1, %%"REG_a"), %%mm1    \n\t"
1925
        "movq        1(%0, %%"REG_a"), %%mm2    \n\t"
1926
        "movq        1(%1, %%"REG_a"), %%mm3    \n\t"
1927
        "movq       -1(%0, %%"REG_a"), %%mm4    \n\t"
1928
        "movq       -1(%1, %%"REG_a"), %%mm5    \n\t"
1929
        PAVGB"                  %%mm0, %%mm5    \n\t"
1930
        PAVGB"                  %%mm0, %%mm3    \n\t"
1931
        PAVGB"                  %%mm0, %%mm5    \n\t"
1932
        PAVGB"                  %%mm0, %%mm3    \n\t"
1933
        PAVGB"                  %%mm1, %%mm4    \n\t"
1934
        PAVGB"                  %%mm1, %%mm2    \n\t"
1935
        PAVGB"                  %%mm1, %%mm4    \n\t"
1936
        PAVGB"                  %%mm1, %%mm2    \n\t"
1937
        "movq                   %%mm5, %%mm7    \n\t"
1938
        "movq                   %%mm4, %%mm6    \n\t"
1939
        "punpcklbw              %%mm3, %%mm5    \n\t"
1940
        "punpckhbw              %%mm3, %%mm7    \n\t"
1941
        "punpcklbw              %%mm2, %%mm4    \n\t"
1942
        "punpckhbw              %%mm2, %%mm6    \n\t"
1943
#if 1
1944
        MOVNTQ"                 %%mm5,  (%2, %%"REG_a", 2)  \n\t"
1945
        MOVNTQ"                 %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
1946
        MOVNTQ"                 %%mm4,  (%3, %%"REG_a", 2)  \n\t"
1947
        MOVNTQ"                 %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
1948
#else
1949
        "movq                   %%mm5,  (%2, %%"REG_a", 2)  \n\t"
1950
        "movq                   %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
1951
        "movq                   %%mm4,  (%3, %%"REG_a", 2)  \n\t"
1952
        "movq                   %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
1953
#endif
1954
        "add                       $8, %%"REG_a"            \n\t"
1955
        " js                       1b                       \n\t"
1956
        :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1957
           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1958
           "g" (-mmxSize)
1959
        : "%"REG_a
1960

    
1961
        );
1962
#else
1963
        const long mmxSize=1;
1964
#endif
1965
        dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1966
        dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1967

    
1968
        for (x=mmxSize-1; x<srcWidth-1; x++){
1969
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1970
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1971
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1972
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1973
        }
1974
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1975
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1976

    
1977
        dst+=dstStride*2;
1978
        src+=srcStride;
1979
    }
1980

    
1981
    // last line
1982
#if 1
1983
    dst[0]= src[0];
1984

    
1985
    for (x=0; x<srcWidth-1; x++){
1986
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1987
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1988
    }
1989
    dst[2*srcWidth-1]= src[srcWidth-1];
1990
#else
1991
    for (x=0; x<srcWidth; x++){
1992
        dst[2*x+0]=
1993
        dst[2*x+1]= src[x];
1994
    }
1995
#endif
1996

    
1997
#ifdef HAVE_MMX
1998
asm volatile(   EMMS"       \n\t"
1999
                SFENCE"     \n\t"
2000
                :::"memory");
2001
#endif
2002
}
2003

    
2004
/**
2005
 * Height should be a multiple of 2 and width should be a multiple of 16.
2006
 * (If this is a problem for anyone then tell me, and I will fix it.)
2007
 * Chrominance data is only taken from every second line, others are ignored.
2008
 * FIXME: Write HQ version.
2009
 */
2010
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2011
                                      long width, long height,
2012
                                      long lumStride, long chromStride, long srcStride)
2013
{
2014
    long y;
2015
    const long chromWidth= width>>1;
2016
    for (y=0; y<height; y+=2)
2017
    {
2018
#ifdef HAVE_MMX
2019
        asm volatile(
2020
        "xorl                %%eax, %%eax   \n\t"
2021
        "pcmpeqw             %%mm7, %%mm7   \n\t"
2022
        "psrlw                  $8, %%mm7   \n\t" // FF,00,FF,00...
2023
        ASMALIGN(4)
2024
        "1:                                 \n\t"
2025
        PREFETCH" 64(%0, %%eax, 4)          \n\t"
2026
        "movq       (%0, %%eax, 4), %%mm0   \n\t" // UYVY UYVY(0)
2027
        "movq      8(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(4)
2028
        "movq                %%mm0, %%mm2   \n\t" // UYVY UYVY(0)
2029
        "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(4)
2030
        "pand                %%mm7, %%mm0   \n\t" // U0V0 U0V0(0)
2031
        "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(4)
2032
        "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(0)
2033
        "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(4)
2034
        "packuswb            %%mm1, %%mm0   \n\t" // UVUV UVUV(0)
2035
        "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(0)
2036

    
2037
        MOVNTQ"              %%mm2,  (%1, %%eax, 2) \n\t"
2038

    
2039
        "movq     16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
2040
        "movq     24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
2041
        "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(8)
2042
        "movq                %%mm2, %%mm4   \n\t" // UYVY UYVY(12)
2043
        "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(8)
2044
        "pand                %%mm7, %%mm2   \n\t" // U0V0 U0V0(12)
2045
        "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(8)
2046
        "psrlw                  $8, %%mm4   \n\t" // Y0Y0 Y0Y0(12)
2047
        "packuswb            %%mm2, %%mm1   \n\t" // UVUV UVUV(8)
2048
        "packuswb            %%mm4, %%mm3   \n\t" // YYYY YYYY(8)
2049

    
2050
        MOVNTQ"              %%mm3, 8(%1, %%eax, 2) \n\t"
2051

    
2052
        "movq                %%mm0, %%mm2   \n\t" // UVUV UVUV(0)
2053
        "movq                %%mm1, %%mm3   \n\t" // UVUV UVUV(8)
2054
        "psrlw                  $8, %%mm0   \n\t" // V0V0 V0V0(0)
2055
        "psrlw                  $8, %%mm1   \n\t" // V0V0 V0V0(8)
2056
        "pand                %%mm7, %%mm2   \n\t" // U0U0 U0U0(0)
2057
        "pand                %%mm7, %%mm3   \n\t" // U0U0 U0U0(8)
2058
        "packuswb            %%mm1, %%mm0   \n\t" // VVVV VVVV(0)
2059
        "packuswb            %%mm3, %%mm2   \n\t" // UUUU UUUU(0)
2060

    
2061
        MOVNTQ"              %%mm0, (%3, %%eax) \n\t"
2062
        MOVNTQ"              %%mm2, (%2, %%eax) \n\t"
2063

    
2064
        "addl                   $8, %%eax   \n\t"
2065
        "cmpl                   %4, %%eax   \n\t"
2066
        " jb                    1b          \n\t"
2067
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2068
        : "memory", "%eax"
2069
        );
2070

    
2071
        ydst += lumStride;
2072
        src  += srcStride;
2073

    
2074
        asm volatile(
2075
        "xorl                %%eax, %%eax   \n\t"
2076
        ASMALIGN(4)
2077
        "1:                                 \n\t"
2078
        PREFETCH" 64(%0, %%eax, 4)          \n\t"
2079
        "movq       (%0, %%eax, 4), %%mm0   \n\t" // YUYV YUYV(0)
2080
        "movq      8(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(4)
2081
        "movq     16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
2082
        "movq     24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
2083
        "psrlw                  $8, %%mm0   \n\t" // Y0Y0 Y0Y0(0)
2084
        "psrlw                  $8, %%mm1   \n\t" // Y0Y0 Y0Y0(4)
2085
        "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(8)
2086
        "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(12)
2087
        "packuswb            %%mm1, %%mm0   \n\t" // YYYY YYYY(0)
2088
        "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(8)
2089

    
2090
        MOVNTQ"              %%mm0,  (%1, %%eax, 2) \n\t"
2091
        MOVNTQ"              %%mm2, 8(%1, %%eax, 2) \n\t"
2092

    
2093
        "addl                   $8, %%eax   \n\t"
2094
        "cmpl                   %4, %%eax   \n\t"
2095
        " jb                    1b          \n\t"
2096

    
2097
        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2098
        : "memory", "%eax"
2099
        );
2100
#else
2101
        long i;
2102
        for (i=0; i<chromWidth; i++)
2103
        {
2104
            udst[i]     = src[4*i+0];
2105
            ydst[2*i+0] = src[4*i+1];
2106
            vdst[i]     = src[4*i+2];
2107
            ydst[2*i+1] = src[4*i+3];
2108
        }
2109
        ydst += lumStride;
2110
        src  += srcStride;
2111

    
2112
        for (i=0; i<chromWidth; i++)
2113
        {
2114
            ydst[2*i+0] = src[4*i+1];
2115
            ydst[2*i+1] = src[4*i+3];
2116
        }
2117
#endif
2118
        udst += chromStride;
2119
        vdst += chromStride;
2120
        ydst += lumStride;
2121
        src  += srcStride;
2122
    }
2123
#ifdef HAVE_MMX
2124
asm volatile(   EMMS"       \n\t"
2125
                SFENCE"     \n\t"
2126
                :::"memory");
2127
#endif
2128
}
2129

    
2130
/**
2131
 * Height should be a multiple of 2 and width should be a multiple of 2.
2132
 * (If this is a problem for anyone then tell me, and I will fix it.)
2133
 * Chrominance data is only taken from every second line,
2134
 * others are ignored in the C version.
2135
 * FIXME: Write HQ version.
2136
 */
2137
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2138
                                       long width, long height,
2139
                                       long lumStride, long chromStride, long srcStride)
2140
{
2141
    long y;
2142
    const long chromWidth= width>>1;
2143
#ifdef HAVE_MMX
2144
    for (y=0; y<height-2; y+=2)
2145
    {
2146
        long i;
2147
        for (i=0; i<2; i++)
2148
        {
2149
            asm volatile(
2150
            "mov                        %2, %%"REG_a"   \n\t"
2151
            "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
2152
            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2153
            "pxor                    %%mm7, %%mm7       \n\t"
2154
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2155
            ASMALIGN(4)
2156
            "1:                                         \n\t"
2157
            PREFETCH"    64(%0, %%"REG_d")              \n\t"
2158
            "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2159
            "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
2160
            "punpcklbw               %%mm7, %%mm0       \n\t"
2161
            "punpcklbw               %%mm7, %%mm1       \n\t"
2162
            "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
2163
            "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
2164
            "punpcklbw               %%mm7, %%mm2       \n\t"
2165
            "punpcklbw               %%mm7, %%mm3       \n\t"
2166
            "pmaddwd                 %%mm6, %%mm0       \n\t"
2167
            "pmaddwd                 %%mm6, %%mm1       \n\t"
2168
            "pmaddwd                 %%mm6, %%mm2       \n\t"
2169
            "pmaddwd                 %%mm6, %%mm3       \n\t"
2170
#ifndef FAST_BGR2YV12
2171
            "psrad                      $8, %%mm0       \n\t"
2172
            "psrad                      $8, %%mm1       \n\t"
2173
            "psrad                      $8, %%mm2       \n\t"
2174
            "psrad                      $8, %%mm3       \n\t"
2175
#endif
2176
            "packssdw                %%mm1, %%mm0       \n\t"
2177
            "packssdw                %%mm3, %%mm2       \n\t"
2178
            "pmaddwd                 %%mm5, %%mm0       \n\t"
2179
            "pmaddwd                 %%mm5, %%mm2       \n\t"
2180
            "packssdw                %%mm2, %%mm0       \n\t"
2181
            "psraw                      $7, %%mm0       \n\t"
2182

    
2183
            "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2184
            "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
2185
            "punpcklbw               %%mm7, %%mm4       \n\t"
2186
            "punpcklbw               %%mm7, %%mm1       \n\t"
2187
            "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
2188
            "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
2189
            "punpcklbw               %%mm7, %%mm2       \n\t"
2190
            "punpcklbw               %%mm7, %%mm3       \n\t"
2191
            "pmaddwd                 %%mm6, %%mm4       \n\t"
2192
            "pmaddwd                 %%mm6, %%mm1       \n\t"
2193
            "pmaddwd                 %%mm6, %%mm2       \n\t"
2194
            "pmaddwd                 %%mm6, %%mm3       \n\t"
2195
#ifndef FAST_BGR2YV12
2196
            "psrad                      $8, %%mm4       \n\t"
2197
            "psrad                      $8, %%mm1       \n\t"
2198
            "psrad                      $8, %%mm2       \n\t"
2199
            "psrad                      $8, %%mm3       \n\t"
2200
#endif
2201
            "packssdw                %%mm1, %%mm4       \n\t"
2202
            "packssdw                %%mm3, %%mm2       \n\t"
2203
            "pmaddwd                 %%mm5, %%mm4       \n\t"
2204
            "pmaddwd                 %%mm5, %%mm2       \n\t"
2205
            "add                       $24, %%"REG_d"   \n\t"
2206
            "packssdw                %%mm2, %%mm4       \n\t"
2207
            "psraw                      $7, %%mm4       \n\t"
2208

    
2209
            "packuswb                %%mm4, %%mm0       \n\t"
2210
            "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
2211

    
2212
            MOVNTQ"                  %%mm0, (%1, %%"REG_a") \n\t"
2213
            "add                        $8,      %%"REG_a"  \n\t"
2214
            " js                        1b                  \n\t"
2215
            : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2216
            : "%"REG_a, "%"REG_d
2217
            );
2218
            ydst += lumStride;
2219
            src  += srcStride;
2220
        }
2221
        src -= srcStride*2;
2222
        asm volatile(
2223
        "mov                        %4, %%"REG_a"   \n\t"
2224
        "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2225
        "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
2226
        "pxor                    %%mm7, %%mm7       \n\t"
2227
        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2228
        "add                 %%"REG_d", %%"REG_d"   \n\t"
2229
        ASMALIGN(4)
2230
        "1:                                         \n\t"
2231
        PREFETCH"    64(%0, %%"REG_d")              \n\t"
2232
        PREFETCH"    64(%1, %%"REG_d")              \n\t"
2233
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2234
        "movq          (%0, %%"REG_d"), %%mm0       \n\t"
2235
        "movq          (%1, %%"REG_d"), %%mm1       \n\t"
2236
        "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
2237
        "movq         6(%1, %%"REG_d"), %%mm3       \n\t"
2238
        PAVGB"                   %%mm1, %%mm0       \n\t"
2239
        PAVGB"                   %%mm3, %%mm2       \n\t"
2240
        "movq                    %%mm0, %%mm1       \n\t"
2241
        "movq                    %%mm2, %%mm3       \n\t"
2242
        "psrlq                     $24, %%mm0       \n\t"
2243
        "psrlq                     $24, %%mm2       \n\t"
2244
        PAVGB"                   %%mm1, %%mm0       \n\t"
2245
        PAVGB"                   %%mm3, %%mm2       \n\t"
2246
        "punpcklbw               %%mm7, %%mm0       \n\t"
2247
        "punpcklbw               %%mm7, %%mm2       \n\t"
2248
#else
2249
        "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2250
        "movd          (%1, %%"REG_d"), %%mm1       \n\t"
2251
        "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
2252
        "movd         3(%1, %%"REG_d"), %%mm3       \n\t"
2253
        "punpcklbw               %%mm7, %%mm0       \n\t"
2254
        "punpcklbw               %%mm7, %%mm1       \n\t"
2255
        "punpcklbw               %%mm7, %%mm2       \n\t"
2256
        "punpcklbw               %%mm7, %%mm3       \n\t"
2257
        "paddw                   %%mm1, %%mm0       \n\t"
2258
        "paddw                   %%mm3, %%mm2       \n\t"
2259
        "paddw                   %%mm2, %%mm0       \n\t"
2260
        "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
2261
        "movd         6(%1, %%"REG_d"), %%mm1       \n\t"
2262
        "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
2263
        "movd         9(%1, %%"REG_d"), %%mm3       \n\t"
2264
        "punpcklbw               %%mm7, %%mm4       \n\t"
2265
        "punpcklbw               %%mm7, %%mm1       \n\t"
2266
        "punpcklbw               %%mm7, %%mm2       \n\t"
2267
        "punpcklbw               %%mm7, %%mm3       \n\t"
2268
        "paddw                   %%mm1, %%mm4       \n\t"
2269
        "paddw                   %%mm3, %%mm2       \n\t"
2270
        "paddw                   %%mm4, %%mm2       \n\t"
2271
        "psrlw                      $2, %%mm0       \n\t"
2272
        "psrlw                      $2, %%mm2       \n\t"
2273
#endif
2274
        "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
2275
        "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
2276

    
2277
        "pmaddwd                 %%mm0, %%mm1       \n\t"
2278
        "pmaddwd                 %%mm2, %%mm3       \n\t"
2279
        "pmaddwd                 %%mm6, %%mm0       \n\t"
2280
        "pmaddwd                 %%mm6, %%mm2       \n\t"
2281
#ifndef FAST_BGR2YV12
2282
        "psrad                      $8, %%mm0       \n\t"
2283
        "psrad                      $8, %%mm1       \n\t"
2284
        "psrad                      $8, %%mm2       \n\t"
2285
        "psrad                      $8, %%mm3       \n\t"
2286
#endif
2287
        "packssdw                %%mm2, %%mm0       \n\t"
2288
        "packssdw                %%mm3, %%mm1       \n\t"
2289
        "pmaddwd                 %%mm5, %%mm0       \n\t"
2290
        "pmaddwd                 %%mm5, %%mm1       \n\t"
2291
        "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
2292
        "psraw                      $7, %%mm0       \n\t"
2293

    
2294
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2295
        "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
2296
        "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
2297
        "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
2298
        "movq        18(%1, %%"REG_d"), %%mm3       \n\t"
2299
        PAVGB"                   %%mm1, %%mm4       \n\t"
2300
        PAVGB"                   %%mm3, %%mm2       \n\t"
2301
        "movq                    %%mm4, %%mm1       \n\t"
2302
        "movq                    %%mm2, %%mm3       \n\t"
2303
        "psrlq                     $24, %%mm4       \n\t"
2304
        "psrlq                     $24, %%mm2       \n\t"
2305
        PAVGB"                   %%mm1, %%mm4       \n\t"
2306
        PAVGB"                   %%mm3, %%mm2       \n\t"
2307
        "punpcklbw               %%mm7, %%mm4       \n\t"
2308
        "punpcklbw               %%mm7, %%mm2       \n\t"
2309
#else
2310
        "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2311
        "movd        12(%1, %%"REG_d"), %%mm1       \n\t"
2312
        "movd        15(%0, %%"REG_d"), %%mm2       \n\t"
2313
        "movd        15(%1, %%"REG_d"), %%mm3       \n\t"
2314
        "punpcklbw               %%mm7, %%mm4       \n\t"
2315
        "punpcklbw               %%mm7, %%mm1       \n\t"
2316
        "punpcklbw               %%mm7, %%mm2       \n\t"
2317
        "punpcklbw               %%mm7, %%mm3       \n\t"
2318
        "paddw                   %%mm1, %%mm4       \n\t"
2319
        "paddw                   %%mm3, %%mm2       \n\t"
2320
        "paddw                   %%mm2, %%mm4       \n\t"
2321
        "movd        18(%0, %%"REG_d"), %%mm5       \n\t"
2322
        "movd        18(%1, %%"REG_d"), %%mm1       \n\t"
2323
        "movd        21(%0, %%"REG_d"), %%mm2       \n\t"
2324
        "movd        21(%1, %%"REG_d"), %%mm3       \n\t"
2325
        "punpcklbw               %%mm7, %%mm5       \n\t"
2326
        "punpcklbw               %%mm7, %%mm1       \n\t"
2327
        "punpcklbw               %%mm7, %%mm2       \n\t"
2328
        "punpcklbw               %%mm7, %%mm3       \n\t"
2329
        "paddw                   %%mm1, %%mm5       \n\t"
2330
        "paddw                   %%mm3, %%mm2       \n\t"
2331
        "paddw                   %%mm5, %%mm2       \n\t"
2332
        "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2333
        "psrlw                      $2, %%mm4       \n\t"
2334
        "psrlw                      $2, %%mm2       \n\t"
2335
#endif
2336
        "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
2337
        "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
2338

    
2339
        "pmaddwd                 %%mm4, %%mm1       \n\t"
2340
        "pmaddwd                 %%mm2, %%mm3       \n\t"
2341
        "pmaddwd                 %%mm6, %%mm4       \n\t"
2342
        "pmaddwd                 %%mm6, %%mm2       \n\t"
2343
#ifndef FAST_BGR2YV12
2344
        "psrad                      $8, %%mm4       \n\t"
2345
        "psrad                      $8, %%mm1       \n\t"
2346
        "psrad                      $8, %%mm2       \n\t"
2347
        "psrad                      $8, %%mm3       \n\t"
2348
#endif
2349
        "packssdw                %%mm2, %%mm4       \n\t"
2350
        "packssdw                %%mm3, %%mm1       \n\t"
2351
        "pmaddwd                 %%mm5, %%mm4       \n\t"
2352
        "pmaddwd                 %%mm5, %%mm1       \n\t"
2353
        "add                       $24, %%"REG_d"   \n\t"
2354
        "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
2355
        "psraw                      $7, %%mm4       \n\t"
2356

    
2357
        "movq                    %%mm0, %%mm1           \n\t"
2358
        "punpckldq               %%mm4, %%mm0           \n\t"
2359
        "punpckhdq               %%mm4, %%mm1           \n\t"
2360
        "packsswb                %%mm1, %%mm0           \n\t"
2361
        "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0         \n\t"
2362
        "movd                    %%mm0, (%2, %%"REG_a") \n\t"
2363
        "punpckhdq               %%mm0, %%mm0           \n\t"
2364
        "movd                    %%mm0, (%3, %%"REG_a") \n\t"
2365
        "add                        $4, %%"REG_a"       \n\t"
2366
        " js                        1b                  \n\t"
2367
        : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2368
        : "%"REG_a, "%"REG_d
2369
        );
2370

    
2371
        udst += chromStride;
2372
        vdst += chromStride;
2373
        src  += srcStride*2;
2374
    }
2375

    
2376
    asm volatile(   EMMS"       \n\t"
2377
                    SFENCE"     \n\t"
2378
                    :::"memory");
2379
#else
2380
    y=0;
2381
#endif
2382
    for (; y<height; y+=2)
2383
    {
2384
        long i;
2385
        for (i=0; i<chromWidth; i++)
2386
        {
2387
            unsigned int b = src[6*i+0];
2388
            unsigned int g = src[6*i+1];
2389
            unsigned int r = src[6*i+2];
2390

    
2391
            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2392
            unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2393
            unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2394

    
2395
            udst[i]     = U;
2396
            vdst[i]     = V;
2397
            ydst[2*i]   = Y;
2398

    
2399
            b = src[6*i+3];
2400
            g = src[6*i+4];
2401
            r = src[6*i+5];
2402

    
2403
            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2404
            ydst[2*i+1]     = Y;
2405
        }
2406
        ydst += lumStride;
2407
        src  += srcStride;
2408

    
2409
        for (i=0; i<chromWidth; i++)
2410
        {
2411
            unsigned int b = src[6*i+0];
2412
            unsigned int g = src[6*i+1];
2413
            unsigned int r = src[6*i+2];
2414

    
2415
            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2416

    
2417
            ydst[2*i]     = Y;
2418

    
2419
            b = src[6*i+3];
2420
            g = src[6*i+4];
2421
            r = src[6*i+5];
2422

    
2423
            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2424
            ydst[2*i+1]     = Y;
2425
        }
2426
        udst += chromStride;
2427
        vdst += chromStride;
2428
        ydst += lumStride;
2429
        src  += srcStride;
2430
    }
2431
}
2432

    
2433
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2434
                             long width, long height, long src1Stride,
2435
                             long src2Stride, long dstStride){
2436
    long h;
2437

    
2438
    for (h=0; h < height; h++)
2439
    {
2440
        long w;
2441

    
2442
#ifdef HAVE_MMX
2443
#ifdef HAVE_SSE2
2444
        asm(
2445
        "xor              %%"REG_a", %%"REG_a"  \n\t"
2446
        "1:                                     \n\t"
2447
        PREFETCH" 64(%1, %%"REG_a")             \n\t"
2448
        PREFETCH" 64(%2, %%"REG_a")             \n\t"
2449
        "movdqa     (%1, %%"REG_a"), %%xmm0     \n\t"
2450
        "movdqa     (%1, %%"REG_a"), %%xmm1     \n\t"
2451
        "movdqa     (%2, %%"REG_a"), %%xmm2     \n\t"
2452
        "punpcklbw           %%xmm2, %%xmm0     \n\t"
2453
        "punpckhbw           %%xmm2, %%xmm1     \n\t"
2454
        "movntdq             %%xmm0,   (%0, %%"REG_a", 2)   \n\t"
2455
        "movntdq             %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
2456
        "add                    $16, %%"REG_a"  \n\t"
2457
        "cmp                     %3, %%"REG_a"  \n\t"
2458
        " jb                     1b             \n\t"
2459
        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2460
        : "memory", "%"REG_a""
2461
        );
2462
#else
2463
        asm(
2464
        "xor %%"REG_a", %%"REG_a"               \n\t"
2465
        "1:                                     \n\t"
2466
        PREFETCH" 64(%1, %%"REG_a")             \n\t"
2467
        PREFETCH" 64(%2, %%"REG_a")             \n\t"
2468
        "movq       (%1, %%"REG_a"), %%mm0      \n\t"
2469
        "movq      8(%1, %%"REG_a"), %%mm2      \n\t"
2470
        "movq                 %%mm0, %%mm1      \n\t"
2471
        "movq                 %%mm2, %%mm3      \n\t"
2472
        "movq       (%2, %%"REG_a"), %%mm4      \n\t"
2473
        "movq      8(%2, %%"REG_a"), %%mm5      \n\t"
2474
        "punpcklbw            %%mm4, %%mm0      \n\t"
2475
        "punpckhbw            %%mm4, %%mm1      \n\t"
2476
        "punpcklbw            %%mm5, %%mm2      \n\t"
2477
        "punpckhbw            %%mm5, %%mm3      \n\t"
2478
        MOVNTQ"               %%mm0,   (%0, %%"REG_a", 2)   \n\t"
2479
        MOVNTQ"               %%mm1,  8(%0, %%"REG_a", 2)   \n\t"
2480
        MOVNTQ"               %%mm2, 16(%0, %%"REG_a", 2)   \n\t"
2481
        MOVNTQ"               %%mm3, 24(%0, %%"REG_a", 2)   \n\t"
2482
        "add                    $16, %%"REG_a"  \n\t"
2483
        "cmp                     %3, %%"REG_a"  \n\t"
2484
        " jb                     1b             \n\t"
2485
        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2486
        : "memory", "%"REG_a
2487
        );
2488
#endif
2489
        for (w= (width&(~15)); w < width; w++)
2490
        {
2491
            dest[2*w+0] = src1[w];
2492
            dest[2*w+1] = src2[w];
2493
        }
2494
#else
2495
        for (w=0; w < width; w++)
2496
        {
2497
            dest[2*w+0] = src1[w];
2498
            dest[2*w+1] = src2[w];
2499
        }
2500
#endif
2501
        dest += dstStride;
2502
                src1 += src1Stride;
2503
                src2 += src2Stride;
2504
    }
2505
#ifdef HAVE_MMX
2506
    asm(
2507
        EMMS"       \n\t"
2508
        SFENCE"     \n\t"
2509
        ::: "memory"
2510
        );
2511
#endif
2512
}
2513

    
2514
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2515
                                       uint8_t *dst1, uint8_t *dst2,
2516
                                       long width, long height,
2517
                                       long srcStride1, long srcStride2,
2518
                                       long dstStride1, long dstStride2)
2519
{
2520
    long y,x,w,h;
2521
    w=width/2; h=height/2;
2522
#ifdef HAVE_MMX
2523
    asm volatile(
2524
    PREFETCH" %0    \n\t"
2525
    PREFETCH" %1    \n\t"
2526
    ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2527
#endif
2528
    for (y=0;y<h;y++){
2529
    const uint8_t* s1=src1+srcStride1*(y>>1);
2530
    uint8_t* d=dst1+dstStride1*y;
2531
    x=0;
2532
#ifdef HAVE_MMX
2533
    for (;x<w-31;x+=32)
2534
    {
2535
        asm volatile(
2536
        PREFETCH"   32%1        \n\t"
2537
        "movq         %1, %%mm0 \n\t"
2538
        "movq        8%1, %%mm2 \n\t"
2539
        "movq       16%1, %%mm4 \n\t"
2540
        "movq       24%1, %%mm6 \n\t"
2541
        "movq      %%mm0, %%mm1 \n\t"
2542
        "movq      %%mm2, %%mm3 \n\t"
2543
        "movq      %%mm4, %%mm5 \n\t"
2544
        "movq      %%mm6, %%mm7 \n\t"
2545
        "punpcklbw %%mm0, %%mm0 \n\t"
2546
        "punpckhbw %%mm1, %%mm1 \n\t"
2547
        "punpcklbw %%mm2, %%mm2 \n\t"
2548
        "punpckhbw %%mm3, %%mm3 \n\t"
2549
        "punpcklbw %%mm4, %%mm4 \n\t"
2550
        "punpckhbw %%mm5, %%mm5 \n\t"
2551
        "punpcklbw %%mm6, %%mm6 \n\t"
2552
        "punpckhbw %%mm7, %%mm7 \n\t"
2553
        MOVNTQ"    %%mm0,   %0  \n\t"
2554
        MOVNTQ"    %%mm1,  8%0  \n\t"
2555
        MOVNTQ"    %%mm2, 16%0  \n\t"
2556
        MOVNTQ"    %%mm3, 24%0  \n\t"
2557
        MOVNTQ"    %%mm4, 32%0  \n\t"
2558
        MOVNTQ"    %%mm5, 40%0  \n\t"
2559
        MOVNTQ"    %%mm6, 48%0  \n\t"
2560
        MOVNTQ"    %%mm7, 56%0"
2561
        :"=m"(d[2*x])
2562
        :"m"(s1[x])
2563
        :"memory");
2564
    }
2565
#endif
2566
    for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2567
    }
2568
    for (y=0;y<h;y++){
2569
    const uint8_t* s2=src2+srcStride2*(y>>1);
2570
    uint8_t* d=dst2+dstStride2*y;
2571
    x=0;
2572
#ifdef HAVE_MMX
2573
    for (;x<w-31;x+=32)
2574
    {
2575
        asm volatile(
2576
        PREFETCH"   32%1        \n\t"
2577
        "movq         %1, %%mm0 \n\t"
2578
        "movq        8%1, %%mm2 \n\t"
2579
        "movq       16%1, %%mm4 \n\t"
2580
        "movq       24%1, %%mm6 \n\t"
2581
        "movq      %%mm0, %%mm1 \n\t"
2582
        "movq      %%mm2, %%mm3 \n\t"
2583
        "movq      %%mm4, %%mm5 \n\t"
2584
        "movq      %%mm6, %%mm7 \n\t"
2585
        "punpcklbw %%mm0, %%mm0 \n\t"
2586
        "punpckhbw %%mm1, %%mm1 \n\t"
2587
        "punpcklbw %%mm2, %%mm2 \n\t"
2588
        "punpckhbw %%mm3, %%mm3 \n\t"
2589
        "punpcklbw %%mm4, %%mm4 \n\t"
2590
        "punpckhbw %%mm5, %%mm5 \n\t"
2591
        "punpcklbw %%mm6, %%mm6 \n\t"
2592
        "punpckhbw %%mm7, %%mm7 \n\t"
2593
        MOVNTQ"    %%mm0,   %0  \n\t"
2594
        MOVNTQ"    %%mm1,  8%0  \n\t"
2595
        MOVNTQ"    %%mm2, 16%0  \n\t"
2596
        MOVNTQ"    %%mm3, 24%0  \n\t"
2597
        MOVNTQ"    %%mm4, 32%0  \n\t"
2598
        MOVNTQ"    %%mm5, 40%0  \n\t"
2599
        MOVNTQ"    %%mm6, 48%0  \n\t"
2600
        MOVNTQ"    %%mm7, 56%0"
2601
        :"=m"(d[2*x])
2602
        :"m"(s2[x])
2603
        :"memory");
2604
    }
2605
#endif
2606
    for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2607
    }
2608
#ifdef HAVE_MMX
2609
    asm(
2610
        EMMS"       \n\t"
2611
        SFENCE"     \n\t"
2612
        ::: "memory"
2613
        );
2614
#endif
2615
}
2616

    
2617
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2618
                                        uint8_t *dst,
2619
                                        long width, long height,
2620
                                        long srcStride1, long srcStride2,
2621
                                        long srcStride3, long dstStride)
2622
{
2623
    long y,x,w,h;
2624
    w=width/2; h=height;
2625
    for (y=0;y<h;y++){
2626
    const uint8_t* yp=src1+srcStride1*y;
2627
    const uint8_t* up=src2+srcStride2*(y>>2);
2628
    const uint8_t* vp=src3+srcStride3*(y>>2);
2629
    uint8_t* d=dst+dstStride*y;
2630
    x=0;
2631
#ifdef HAVE_MMX
2632
    for (;x<w-7;x+=8)
2633
    {
2634
        asm volatile(
2635
        PREFETCH"   32(%1, %0)          \n\t"
2636
        PREFETCH"   32(%2, %0)          \n\t"
2637
        PREFETCH"   32(%3, %0)          \n\t"
2638
        "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2639
        "movq         (%2, %0), %%mm1   \n\t" /* U0U1U2U3U4U5U6U7 */
2640
        "movq         (%3, %0), %%mm2   \n\t" /* V0V1V2V3V4V5V6V7 */
2641
        "movq            %%mm0, %%mm3   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2642
        "movq            %%mm1, %%mm4   \n\t" /* U0U1U2U3U4U5U6U7 */
2643
        "movq            %%mm2, %%mm5   \n\t" /* V0V1V2V3V4V5V6V7 */
2644
        "punpcklbw       %%mm1, %%mm1   \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2645
        "punpcklbw       %%mm2, %%mm2   \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2646
        "punpckhbw       %%mm4, %%mm4   \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2647
        "punpckhbw       %%mm5, %%mm5   \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2648

    
2649
        "movq            %%mm1, %%mm6   \n\t"
2650
        "punpcklbw       %%mm2, %%mm1   \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2651
        "punpcklbw       %%mm1, %%mm0   \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2652
        "punpckhbw       %%mm1, %%mm3   \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2653
        MOVNTQ"          %%mm0,  (%4, %0, 8)    \n\t"
2654
        MOVNTQ"          %%mm3, 8(%4, %0, 8)    \n\t"
2655

    
2656
        "punpckhbw       %%mm2, %%mm6   \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2657
        "movq     8(%1, %0, 4), %%mm0   \n\t"
2658
        "movq            %%mm0, %%mm3   \n\t"
2659
        "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2660
        "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2661
        MOVNTQ"          %%mm0, 16(%4, %0, 8)   \n\t"
2662
        MOVNTQ"          %%mm3, 24(%4, %0, 8)   \n\t"
2663

    
2664
        "movq            %%mm4, %%mm6   \n\t"
2665
        "movq    16(%1, %0, 4), %%mm0   \n\t"
2666
        "movq            %%mm0, %%mm3   \n\t"
2667
        "punpcklbw       %%mm5, %%mm4   \n\t"
2668
        "punpcklbw       %%mm4, %%mm0   \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2669
        "punpckhbw       %%mm4, %%mm3   \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2670
        MOVNTQ"          %%mm0, 32(%4, %0, 8)   \n\t"
2671
        MOVNTQ"          %%mm3, 40(%4, %0, 8)   \n\t"
2672

    
2673
        "punpckhbw       %%mm5, %%mm6   \n\t"
2674
        "movq    24(%1, %0, 4), %%mm0   \n\t"
2675
        "movq            %%mm0, %%mm3   \n\t"
2676
        "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2677
        "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2678
        MOVNTQ"          %%mm0, 48(%4, %0, 8)   \n\t"
2679
        MOVNTQ"          %%mm3, 56(%4, %0, 8)   \n\t"
2680

    
2681
        : "+r" (x)
2682
        : "r"(yp), "r" (up), "r"(vp), "r"(d)
2683
        :"memory");
2684
    }
2685
#endif
2686
    for (; x<w; x++)
2687
    {
2688
        const long x2 = x<<2;
2689
        d[8*x+0] = yp[x2];
2690
        d[8*x+1] = up[x];
2691
        d[8*x+2] = yp[x2+1];
2692
        d[8*x+3] = vp[x];
2693
        d[8*x+4] = yp[x2+2];
2694
        d[8*x+5] = up[x];
2695
        d[8*x+6] = yp[x2+3];
2696
        d[8*x+7] = vp[x];
2697
    }
2698
    }
2699
#ifdef HAVE_MMX
2700
    asm(
2701
        EMMS"       \n\t"
2702
        SFENCE"     \n\t"
2703
        ::: "memory"
2704
        );
2705
#endif
2706
}
2707

    
2708
static inline void RENAME(rgb2rgb_init)(void){
2709
    rgb15to16       = RENAME(rgb15to16);
2710
    rgb15to24       = RENAME(rgb15to24);
2711
    rgb15to32       = RENAME(rgb15to32);
2712
    rgb16to24       = RENAME(rgb16to24);
2713
    rgb16to32       = RENAME(rgb16to32);
2714
    rgb16to15       = RENAME(rgb16to15);
2715
    rgb24to16       = RENAME(rgb24to16);
2716
    rgb24to15       = RENAME(rgb24to15);
2717
    rgb24to32       = RENAME(rgb24to32);
2718
    rgb32to16       = RENAME(rgb32to16);
2719
    rgb32to15       = RENAME(rgb32to15);
2720
    rgb32to24       = RENAME(rgb32to24);
2721
    rgb24tobgr15    = RENAME(rgb24tobgr15);
2722
    rgb24tobgr16    = RENAME(rgb24tobgr16);
2723
    rgb24tobgr24    = RENAME(rgb24tobgr24);
2724
    rgb32tobgr32    = RENAME(rgb32tobgr32);
2725
    rgb32tobgr16    = RENAME(rgb32tobgr16);
2726
    rgb32tobgr15    = RENAME(rgb32tobgr15);
2727
    yv12toyuy2      = RENAME(yv12toyuy2);
2728
    yv12touyvy      = RENAME(yv12touyvy);
2729
    yuv422ptoyuy2   = RENAME(yuv422ptoyuy2);
2730
    yuy2toyv12      = RENAME(yuy2toyv12);
2731
//    uyvytoyv12      = RENAME(uyvytoyv12);
2732
//    yvu9toyv12      = RENAME(yvu9toyv12);
2733
    planar2x        = RENAME(planar2x);
2734
    rgb24toyv12     = RENAME(rgb24toyv12);
2735
    interleaveBytes = RENAME(interleaveBytes);
2736
    vu9_to_vu12     = RENAME(vu9_to_vu12);
2737
    yvu9_to_yuy2    = RENAME(yvu9_to_yuy2);
2738
}