/*
 * software RGB to RGB converter
 * pluralize by software PAL8 to RGB converter
 *              software YUV to YUV converter
 *              software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * lot of big-endian byte order fixes by Alex Beregszaszi
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <stddef.h>

/*
 * This template is #included several times with different HAVE_* CPU
 * flags (via the RENAME() mechanism), so undo any previous definitions
 * of the per-CPU instruction macros first.
 */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PAVGB

/* SIMD register width in bytes: 16 for SSE2 (xmm), 8 for MMX (mm). */
#if HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/*
 * Prefetch and packed byte-average mnemonics per CPU family.
 * NOTE(review): the plain-MMX fallback defines PREFETCH as a nop but
 * defines no PAVGB at all — presumably PAVGB is only used on paths
 * compiled with MMX2/3DNow!; confirm before using it elsewhere.
 */
#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PAVGB     "pavgusb"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PAVGB     "pavgb"
#else
#define PREFETCH  " # nop"
#endif

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/*
 * Non-temporal quadword store and store fence exist from MMX2 on;
 * otherwise fall back to a plain movq and a no-op fence.
 */
#if HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
/*
 * Expand 24-bit pixels (3 bytes each) into 32-bit pixels (4 bytes each),
 * setting the extra (alpha) byte of every output pixel to 255.
 * src_size is the size of the 24-bit source buffer in bytes.
 */
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* Each MMX iteration consumes 24 source bytes, so stop 23 short of end. */
    mm_end = end - 23;
    /* mm7 = mask32a, OR'ed into every pixel to set its alpha byte. */
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        /* Gather eight 3-byte pixels into four mm registers (two 32-bit
         * pixels per register via movd + punpckldq at 3-byte offsets),
         * OR in the alpha mask, and stream out 32 bytes. */
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "punpckldq    3%1, %%mm0    \n\t"
            "movd         6%1, %%mm1    \n\t"
            "punpckldq    9%1, %%mm1    \n\t"
            "movd        12%1, %%mm2    \n\t"
            "punpckldq   15%1, %%mm2    \n\t"
            "movd        18%1, %%mm3    \n\t"
            "punpckldq   21%1, %%mm3    \n\t"
            "por        %%mm7, %%mm0    \n\t"
            "por        %%mm7, %%mm1    \n\t"
            "por        %%mm7, %%mm2    \n\t"
            "por        %%mm7, %%mm3    \n\t"
            MOVNTQ"     %%mm0,   %0     \n\t"
            MOVNTQ"     %%mm1,  8%0     \n\t"
            MOVNTQ"     %%mm2, 16%0     \n\t"
            MOVNTQ"     %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail loop (also the whole conversion when MMX is disabled). */
    while (s < end) {
#if HAVE_BIGENDIAN
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
        *dest++ = 255;
        *dest++ = s[2];
        *dest++ = s[1];
        *dest++ = s[0];
        s+=3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
#endif
    }
}
/*
 * Repack eight 32-bit pixels held in mm0/mm1/mm4/mm5 (with duplicate
 * copies in mm2/mm3/mm6/mm7) into 24 contiguous bytes of 24-bit pixels
 * and stream them to %0, 8%0 and 16%0 with MOVNTQ.  The mask24l/mask24h
 * constants drop the unused top byte of each pixel; the mask24hh*
 * constants and shifts splice the 3-byte pixels across the three output
 * quadwords.  Clobbers mm0-mm7.  (Mask constants are declared elsewhere.)
 */
#define STORE_BGR24_MMX \
            "psrlq         $8, %%mm2    \n\t" \
            "psrlq         $8, %%mm3    \n\t" \
            "psrlq         $8, %%mm6    \n\t" \
            "psrlq         $8, %%mm7    \n\t" \
            "pand "MANGLE(mask24l)", %%mm0\n\t" \
            "pand "MANGLE(mask24l)", %%mm1\n\t" \
            "pand "MANGLE(mask24l)", %%mm4\n\t" \
            "pand "MANGLE(mask24l)", %%mm5\n\t" \
            "pand "MANGLE(mask24h)", %%mm2\n\t" \
            "pand "MANGLE(mask24h)", %%mm3\n\t" \
            "pand "MANGLE(mask24h)", %%mm6\n\t" \
            "pand "MANGLE(mask24h)", %%mm7\n\t" \
            "por        %%mm2, %%mm0    \n\t" \
            "por        %%mm3, %%mm1    \n\t" \
            "por        %%mm6, %%mm4    \n\t" \
            "por        %%mm7, %%mm5    \n\t" \
 \
            "movq       %%mm1, %%mm2    \n\t" \
            "movq       %%mm4, %%mm3    \n\t" \
            "psllq        $48, %%mm2    \n\t" \
            "psllq        $32, %%mm3    \n\t" \
            "pand "MANGLE(mask24hh)", %%mm2\n\t" \
            "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
            "por        %%mm2, %%mm0    \n\t" \
            "psrlq        $16, %%mm1    \n\t" \
            "psrlq        $32, %%mm4    \n\t" \
            "psllq        $16, %%mm5    \n\t" \
            "por        %%mm3, %%mm1    \n\t" \
            "pand  "MANGLE(mask24hhhh)", %%mm5\n\t" \
            "por        %%mm5, %%mm4    \n\t" \
 \
            MOVNTQ"     %%mm0,   %0     \n\t" \
            MOVNTQ"     %%mm1,  8%0     \n\t" \
            MOVNTQ"     %%mm4, 16%0"
/*
 * Convert 32-bit pixels (4 bytes each) to 24-bit pixels (3 bytes each),
 * dropping the unused (alpha) byte.  src_size is the size of the 32-bit
 * source buffer in bytes.
 */
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* Each MMX iteration consumes 32 source bytes, so stop 31 short. */
    mm_end = end - 31;
    while (s < mm_end) {
        /* Load eight 32-bit pixels, duplicate them into the scratch
         * registers STORE_BGR24_MMX expects, then let the macro pack
         * and stream out 24 bytes. */
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq        16%1, %%mm4    \n\t"
            "movq        24%1, %%mm5    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm1, %%mm3    \n\t"
            "movq       %%mm4, %%mm6    \n\t"
            "movq       %%mm5, %%mm7    \n\t"
            STORE_BGR24_MMX
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail loop (also the whole conversion when MMX is disabled). */
    while (s < end) {
#if HAVE_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
        s++;
        dest[2] = *s++;
        dest[1] = *s++;
        dest[0] = *s++;
        dest += 3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
#endif
    }
}
/*
 original by Strepto/Astral
 ported to gcc & bugfixed: A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32-bit C version, and and&add trick by Michael Niedermayer
*/
/*
 * Convert 15-bit (1-5-5-5) pixels to 16-bit (5-6-5) pixels in place of
 * layout: the two upper 5-bit fields are shifted up one bit, the low
 * 5-bit field stays.  src_size is in bytes.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    /* Declared unconditionally: the 32-bit C loop below reuses it. */
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    /* mm4 = mask15s: selects the bits to duplicate-add (the and&add
     * trick — x + (x & mask) shifts the masked fields left by one). */
    __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1         \n\t"
            "movq        %1, %%mm0  \n\t"
            "movq       8%1, %%mm2  \n\t"
            "movq     %%mm0, %%mm1  \n\t"
            "movq     %%mm2, %%mm3  \n\t"
            "pand     %%mm4, %%mm0  \n\t"
            "pand     %%mm4, %%mm2  \n\t"
            "paddw    %%mm1, %%mm0  \n\t"
            "paddw    %%mm3, %%mm2  \n\t"
            MOVNTQ"   %%mm0,  %0    \n\t"
            MOVNTQ"   %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* 32-bit C loop: two pixels at a time with the same and&add trick. */
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* At most one 16-bit pixel can remain. */
    if (s < end) {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
/*
 * Convert 16-bit (5-6-5) pixels to 15-bit (1-5-5-5) pixels: the two
 * upper fields are shifted down one bit, the low 5-bit field stays.
 * src_size is in bytes.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    /* Declared unconditionally: the 32-bit C loop below reuses it. */
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    /* mm7 = mask15rg (upper fields, applied after the >>1),
     * mm6 = mask15b (low 5-bit field, kept unshifted). */
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1         \n\t"
            "movq        %1, %%mm0  \n\t"
            "movq       8%1, %%mm2  \n\t"
            "movq     %%mm0, %%mm1  \n\t"
            "movq     %%mm2, %%mm3  \n\t"
            "psrlq       $1, %%mm0  \n\t"
            "psrlq       $1, %%mm2  \n\t"
            "pand     %%mm7, %%mm0  \n\t"
            "pand     %%mm7, %%mm2  \n\t"
            "pand     %%mm6, %%mm1  \n\t"
            "pand     %%mm6, %%mm3  \n\t"
            "por      %%mm1, %%mm0  \n\t"
            "por      %%mm3, %%mm2  \n\t"
            MOVNTQ"   %%mm0,  %0    \n\t"
            MOVNTQ"   %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* 32-bit C loop: two pixels at a time. */
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* At most one 16-bit pixel can remain. */
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}
/*
 * Convert 32-bit pixels to 16-bit 5-6-5 pixels.  src_size is the size
 * of the 32-bit source buffer in bytes.  Two MMX variants exist: the
 * active one (#if 1) uses pmaddwd with the mul3216 constant to combine
 * two fields per multiply; the disabled one uses shift+mask per field.
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    /* Each MMX iteration consumes 16 source bytes, so stop 15 short. */
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    /* Self-contained loop (jmp to the test at 2:, body at 1:) with the
     * pointers in registers; mask3216g/mask3216br/mul3216 are declared
     * elsewhere. */
    __asm__ volatile(
        "movq           %3, %%mm5   \n\t"
        "movq           %4, %%mm6   \n\t"
        "movq           %5, %%mm7   \n\t"
        "jmp 2f                     \n\t"
        ASMALIGN(4)
        "1:                         \n\t"
        PREFETCH"   32(%1)          \n\t"
        "movd         (%1), %%mm0   \n\t"
        "movd        4(%1), %%mm3   \n\t"
        "punpckldq   8(%1), %%mm0   \n\t"
        "punpckldq  12(%1), %%mm3   \n\t"
        "movq        %%mm0, %%mm1   \n\t"
        "movq        %%mm3, %%mm4   \n\t"
        "pand        %%mm6, %%mm0   \n\t"
        "pand        %%mm6, %%mm3   \n\t"
        "pmaddwd     %%mm7, %%mm0   \n\t"
        "pmaddwd     %%mm7, %%mm3   \n\t"
        "pand        %%mm5, %%mm1   \n\t"
        "pand        %%mm5, %%mm4   \n\t"
        "por         %%mm1, %%mm0   \n\t"
        "por         %%mm4, %%mm3   \n\t"
        "psrld          $5, %%mm0   \n\t"
        "pslld         $11, %%mm3   \n\t"
        "por         %%mm3, %%mm0   \n\t"
        MOVNTQ"      %%mm0, (%0)    \n\t"
        "add           $16,  %1     \n\t"
        "add            $8,  %0     \n\t"
        "2:                         \n\t"
        "cmp            %2,  %1     \n\t"
        " jb            1b          \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq    %0, %%mm7    \n\t"
        "movq    %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end) {
        /* Per-field shift+mask variant: four pixels per iteration. */
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: pack bits 7-3, 15-10 and 23-19 into 5-6-5. */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
/*
 * Convert 32-bit pixels to 16-bit 5-6-5 pixels with the outer two
 * channels swapped relative to rgb32to16 (note the scalar loop shifts
 * the low byte up and the high byte down).  src_size is in bytes.
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green field masks for the packed result. */
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /* Each iteration consumes 16 source bytes (four pixels). */
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psllq         $8, %%mm0    \n\t"
            "psllq         $8, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: low byte -> top 5 bits, high byte -> low 5 bits. */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}
/*
 * Convert 32-bit pixels to 15-bit 1-5-5-5 pixels.  src_size is the size
 * of the 32-bit source buffer in bytes.  Like rgb32to16, two MMX
 * variants exist; the active one uses pmaddwd with the mul3215 constant.
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    /* Each MMX iteration consumes 16 source bytes, so stop 15 short. */
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    /* Self-contained loop; only the shift counts differ from the 16-bit
     * variant ($6/$10 here vs $5/$11), matching the 5-5-5 layout.
     * mask3215g/mask3216br/mul3215 are declared elsewhere. */
    __asm__ volatile(
        "movq           %3, %%mm5   \n\t"
        "movq           %4, %%mm6   \n\t"
        "movq           %5, %%mm7   \n\t"
        "jmp            2f          \n\t"
        ASMALIGN(4)
        "1:                         \n\t"
        PREFETCH"   32(%1)          \n\t"
        "movd         (%1), %%mm0   \n\t"
        "movd        4(%1), %%mm3   \n\t"
        "punpckldq   8(%1), %%mm0   \n\t"
        "punpckldq  12(%1), %%mm3   \n\t"
        "movq        %%mm0, %%mm1   \n\t"
        "movq        %%mm3, %%mm4   \n\t"
        "pand        %%mm6, %%mm0   \n\t"
        "pand        %%mm6, %%mm3   \n\t"
        "pmaddwd     %%mm7, %%mm0   \n\t"
        "pmaddwd     %%mm7, %%mm3   \n\t"
        "pand        %%mm5, %%mm1   \n\t"
        "pand        %%mm5, %%mm4   \n\t"
        "por         %%mm1, %%mm0   \n\t"
        "por         %%mm4, %%mm3   \n\t"
        "psrld          $6, %%mm0   \n\t"
        "pslld         $10, %%mm3   \n\t"
        "por         %%mm3, %%mm0   \n\t"
        MOVNTQ"      %%mm0, (%0)    \n\t"
        "add           $16,  %1     \n\t"
        "add            $8,  %0     \n\t"
        "2:                         \n\t"
        "cmp            %2,  %1     \n\t"
        " jb            1b          \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end) {
        /* Per-field shift+mask variant: four pixels per iteration. */
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $9, %%mm2    \n\t"
            "psrlq         $9, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: pack bits 7-3, 15-11 and 23-19 into 5-5-5. */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}
/*
 * Convert 32-bit pixels to 15-bit 1-5-5-5 pixels with the outer two
 * channels swapped relative to rgb32to15 (the scalar loop shifts the
 * low byte up and the high byte down).  src_size is in bytes.
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green field masks for the packed result. */
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /* Each iteration consumes 16 source bytes (four pixels). */
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psllq         $7, %%mm0    \n\t"
            "psllq         $7, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: low byte -> top 5 bits, high byte -> low 5 bits. */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}
/*
 * Convert 24-bit pixels (3 bytes each) to 16-bit 5-6-5 pixels, treating
 * the first source byte as the low field and the third as the high
 * field (see the scalar loop).  src_size is in bytes.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green field masks for the packed result. */
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /* Each iteration consumes 12 source bytes (four 3-byte pixels). */
    mm_end = end - 11;
    while (s < mm_end) {
        /* Load four 3-byte pixels at offsets 0/3/6/9, then pack each
         * field by shift+mask and merge into one quadword of four
         * 16-bit pixels. */
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: byte order b,g,r -> 5-6-5. */
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
/*
 * Convert 24-bit pixels (3 bytes each) to 16-bit 5-6-5 pixels, treating
 * the first source byte as the high field and the third as the low
 * field (see the scalar loop; mirror image of rgb24tobgr16).
 * src_size is in bytes.
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green field masks for the packed result. */
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /* NOTE(review): loop consumes 12 source bytes per iteration but the
     * bound is end - 15, unlike the - 11 used by rgb24tobgr16/15 —
     * presumably deliberate headroom for the 16-byte loads; confirm. */
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psllq         $8, %%mm0    \n\t"
            "psllq         $8, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: byte order r,g,b -> 5-6-5. */
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
/*
 * Convert 24-bit pixels (3 bytes each) to 15-bit 1-5-5-5 pixels,
 * treating the first source byte as the low field and the third as the
 * high field (see the scalar loop).  src_size is in bytes.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green field masks for the packed result. */
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /* Each iteration consumes 12 source bytes (four 3-byte pixels). */
    mm_end = end - 11;
    while (s < mm_end) {
        /* Load four 3-byte pixels at offsets 0/3/6/9, pack each field
         * by shift+mask (5-5-5 shift counts: $3/$6/$9) and merge. */
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $9, %%mm2    \n\t"
            "psrlq         $9, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: byte order b,g,r -> 1-5-5-5. */
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
807

    
808
/*
 * Convert packed 24 bpp pixels to packed 15 bpp (1:5:5:5) words.
 * For each 3-byte source triplet the scalar loop below packs:
 *   byte 0 -> bits 10..14, byte 1 -> bits 5..9, byte 2 -> bits 0..4.
 * src_size is the source size in bytes (should be a multiple of 3).
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    /* Keep the red/green 15 bpp masks resident in mm7/mm6 for the loop. */
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /* Stop 15 bytes before the end: each iteration reads 12 input bytes. */
    mm_end = end - 15;
    while (s < mm_end) {
        /* Convert 4 pixels (12 source bytes -> 8 destination bytes). */
        __asm__ volatile(
            PREFETCH"   32%1            \n\t"
            "movd         %1, %%mm0     \n\t"
            "movd        3%1, %%mm3     \n\t"
            "punpckldq   6%1, %%mm0     \n\t"
            "punpckldq   9%1, %%mm3     \n\t"
            "movq      %%mm0, %%mm1     \n\t"
            "movq      %%mm0, %%mm2     \n\t"
            "movq      %%mm3, %%mm4     \n\t"
            "movq      %%mm3, %%mm5     \n\t"
            "psllq        $7, %%mm0     \n\t"
            "psllq        $7, %%mm3     \n\t"
            "pand      %%mm7, %%mm0     \n\t"
            "pand      %%mm7, %%mm3     \n\t"
            "psrlq        $6, %%mm1     \n\t"
            "psrlq        $6, %%mm4     \n\t"
            "pand      %%mm6, %%mm1     \n\t"
            "pand      %%mm6, %%mm4     \n\t"
            "psrlq       $19, %%mm2     \n\t"
            "psrlq       $19, %%mm5     \n\t"
            "pand         %2, %%mm2     \n\t"
            "pand         %2, %%mm5     \n\t"
            "por       %%mm1, %%mm0     \n\t"
            "por       %%mm4, %%mm3     \n\t"
            "por       %%mm2, %%mm0     \n\t"
            "por       %%mm5, %%mm3     \n\t"
            "psllq       $16, %%mm3     \n\t"
            "por       %%mm3, %%mm0     \n\t"
            MOVNTQ"    %%mm0, %0        \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and the whole conversion when MMX is unavailable). */
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
868

    
869
/*
870
  I use less accurate approximation here by simply left-shifting the input
871
  value and filling the low order bits with zeroes. This method improves PNG
872
  compression but this scheme cannot reproduce white exactly, since it does
873
  not generate an all-ones maximum value; the net effect is to darken the
874
  image slightly.
875

876
  The better method should be "left bit replication":
877

878
   4 3 2 1 0
879
   ---------
880
   1 1 0 1 1
881

882
   7 6 5 4 3  2 1 0
883
   ----------------
884
   1 1 0 1 1  1 1 0
885
   |=======|  |===|
886
       |      leftmost bits repeated to fill open bits
887
       |
888
   original bits
889
*/
890
/*
 * Convert packed 15 bpp (1:5:5:5) words to packed 24 bpp.
 * Each 16-bit source word expands to three destination bytes:
 *   bits 0..4 -> byte 0, bits 5..9 -> byte 1, bits 10..14 -> byte 2
 * (each 5-bit field left-shifted by 3; low bits are zero-filled, see
 * the accuracy note above this function).
 * src_size is the source size in bytes (should be a multiple of 2).
 */
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* Stop 7 words early: each iteration consumes 8 source words. */
    mm_end = end - 7;
    while (s < mm_end) {
        /* First asm block: expand 8 pixels into 32 bpp form, leaving the
           first 4 pixels in mm6/mm7 and the last 4 in mm0/mm3.  The MMX
           registers are carried over into the second asm block below. */
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"

            "movq       %%mm0, %%mm6    \n\t"
            "movq       %%mm3, %%mm7    \n\t"

            "movq         8%1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq         8%1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"

            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq       %%mm0, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "movq       %%mm6, %%mm0    \n\t"
            "movq       %%mm7, %%mm1    \n\t"

            "movq       %%mm4, %%mm6    \n\t"
            "movq       %%mm5, %%mm7    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm1, %%mm3    \n\t"

            STORE_BGR24_MMX

            :"=m"(*d)
            :"m"(*s)
            :"memory");
        d += 24;   /* 8 pixels x 3 bytes written */
        s += 8;    /* 8 source words consumed */
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and full conversion without MMX). */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
996

    
997
/*
 * Convert packed 16 bpp (5:6:5) words to packed 24 bpp.
 * Each 16-bit source word expands to three destination bytes:
 *   bits 0..4 -> byte 0 (<<3), bits 5..10 -> byte 1 (<<2 net),
 *   bits 11..15 -> byte 2 (<<3 net); low bits are zero-filled.
 * src_size is the source size in bytes (should be a multiple of 2).
 */
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* Stop 7 words early: each iteration consumes 8 source words. */
    mm_end = end - 7;
    while (s < mm_end) {
        /* Expand 8 pixels to 32 bpp in MMX registers; state is carried
           into the second asm block below (mm0..mm7 stay live). */
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm1    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"

            "movq       %%mm0, %%mm6    \n\t"
            "movq       %%mm3, %%mm7    \n\t"

            "movq         8%1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq         8%1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm1    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq       %%mm0, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "movq       %%mm6, %%mm0    \n\t"
            "movq       %%mm7, %%mm1    \n\t"

            "movq       %%mm4, %%mm6    \n\t"
            "movq       %%mm5, %%mm7    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm1, %%mm3    \n\t"

            STORE_BGR24_MMX

            :"=m"(*d)
            :"m"(*s)
            :"memory");
        d += 24;   /* 8 pixels x 3 bytes written */
        s += 8;    /* 8 source words consumed */
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and full conversion without MMX). */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
1102

    
1103
/*
1104
 * mm0 = 00 B3 00 B2 00 B1 00 B0
1105
 * mm1 = 00 G3 00 G2 00 G1 00 G0
1106
 * mm2 = 00 R3 00 R2 00 R1 00 R0
1107
 * mm6 = FF FF FF FF FF FF FF FF
1108
 * mm7 = 00 00 00 00 00 00 00 00
1109
 */
1110
/* Interleave the four 16-bit color components held in mm0/mm1/mm2 (see the
   register layout documented above) into four packed 32-bit pixels with an
   all-ones fourth byte taken from mm6, and store 16 bytes at %0 / 8%0. */
#define PACK_RGB32 \
    "packuswb   %%mm7, %%mm0    \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb   %%mm7, %%mm1    \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb   %%mm7, %%mm2    \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw  %%mm1, %%mm0    \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw  %%mm6, %%mm2    \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq       %%mm0, %%mm3    \n\t"                               \
    "punpcklwd  %%mm2, %%mm0    \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd  %%mm2, %%mm3    \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ"     %%mm0,  %0      \n\t"                               \
    MOVNTQ"     %%mm3, 8%0      \n\t"                               \

    
1122
/*
 * Expand packed 15 bpp (1:5:5:5) words to packed 32 bpp.
 * Each source word produces 4 destination bytes; the 5-bit fields are
 * left-shifted (low bits zero-filled) and the fourth byte is set to 255.
 * src_size is the source size in bytes (should be a multiple of 2).
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* mm7 = 0 (for packuswb), mm6 = all ones (alpha byte source),
       as required by PACK_RGB32. */
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
    /* Stop 3 words early: each iteration consumes 4 source words. */
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;   /* 4 pixels x 4 bytes written */
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and full conversion without MMX). */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
#endif
    }
}
1174

    
1175
/*
 * Expand packed 16 bpp (5:6:5) words to packed 32 bpp.
 * Each source word produces 4 destination bytes; the 5/6-bit fields are
 * left-shifted (low bits zero-filled) and the fourth byte is set to 255.
 * src_size is the source size in bytes (should be a multiple of 2).
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* mm7 = 0 (for packuswb), mm6 = all ones (alpha byte source),
       as required by PACK_RGB32. */
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
    /* Stop 3 words early: each iteration consumes 4 source words. */
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm1    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;   /* 4 pixels x 4 bytes written */
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail (and full conversion without MMX). */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
#endif
    }
}
1227

    
1228
/*
 * Swap the first and third byte of every 32-bit pixel (R<->B exchange),
 * leaving the second and fourth bytes in place.
 * The loops use a negative index that counts up to 15: idx starts at
 * 15 - src_size, and s/d are biased by -idx so s[idx] walks the buffers.
 * src_size is the size in bytes (should be a multiple of 4).
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
#if HAVE_MMX
    /* Processes 16 bytes (4 pixels) per iteration while idx < 0;
       the scalar loop below finishes the last <=16 bytes. */
    __asm__ volatile(
        "test          %0, %0           \n\t"
        "jns           2f               \n\t"
        PREFETCH"       (%1, %0)        \n\t"
        "movq          %3, %%mm7        \n\t"
        "pxor          %4, %%mm7        \n\t"
        "movq       %%mm7, %%mm6        \n\t"
        "pxor          %5, %%mm7        \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        PREFETCH"     32(%1, %0)        \n\t"
        "movq           (%1, %0), %%mm0 \n\t"
        "movq          8(%1, %0), %%mm1 \n\t"
# if HAVE_MMX2
        "pshufw      $177, %%mm0, %%mm3 \n\t"
        "pshufw      $177, %%mm1, %%mm5 \n\t"
        "pand       %%mm7, %%mm0        \n\t"
        "pand       %%mm6, %%mm3        \n\t"
        "pand       %%mm7, %%mm1        \n\t"
        "pand       %%mm6, %%mm5        \n\t"
        "por        %%mm3, %%mm0        \n\t"
        "por        %%mm5, %%mm1        \n\t"
# else
        "movq       %%mm0, %%mm2        \n\t"
        "movq       %%mm1, %%mm4        \n\t"
        "pand       %%mm7, %%mm0        \n\t"
        "pand       %%mm6, %%mm2        \n\t"
        "pand       %%mm7, %%mm1        \n\t"
        "pand       %%mm6, %%mm4        \n\t"
        "movq       %%mm2, %%mm3        \n\t"
        "movq       %%mm4, %%mm5        \n\t"
        "pslld        $16, %%mm2        \n\t"
        "psrld        $16, %%mm3        \n\t"
        "pslld        $16, %%mm4        \n\t"
        "psrld        $16, %%mm5        \n\t"
        "por        %%mm2, %%mm0        \n\t"
        "por        %%mm4, %%mm1        \n\t"
        "por        %%mm3, %%mm0        \n\t"
        "por        %%mm5, %%mm1        \n\t"
# endif
        MOVNTQ"     %%mm0,  (%2, %0)    \n\t"
        MOVNTQ"     %%mm1, 8(%2, %0)    \n\t"
        "add          $16, %0           \n\t"
        "js            1b               \n\t"
        SFENCE"                         \n\t"
        EMMS"                           \n\t"
        "2:                             \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
#endif
    /* Scalar tail: keep the 0xff00ff00 bytes, swap the 0x00ff00ff pair. */
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}
1291

    
1292
/*
 * Swap the first and third byte of every 3-byte pixel (R<->B exchange)
 * in packed 24 bpp data.  The MMX loop handles 24 bytes (8 pixels) per
 * iteration using a negative index counting up to 23; the scalar loop
 * converts the remainder (or everything when MMX is unavailable).
 * src_size is the size in bytes (should be a multiple of 3).
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    unsigned i;
#if HAVE_MMX
    x86_reg mmx_size= 23 - src_size;
    __asm__ volatile (
        "test             %%"REG_a", %%"REG_a"          \n\t"
        "jns                     2f                     \n\t"
        "movq     "MANGLE(mask24r)", %%mm5              \n\t"
        "movq     "MANGLE(mask24g)", %%mm6              \n\t"
        "movq     "MANGLE(mask24b)", %%mm7              \n\t"
        ASMALIGN(4)
        "1:                                             \n\t"
        PREFETCH" 32(%1, %%"REG_a")                     \n\t"
        "movq       (%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
        "movq       (%1, %%"REG_a"), %%mm1              \n\t" // BGR BGR BG
        "movq      2(%1, %%"REG_a"), %%mm2              \n\t" // R BGR BGR B
        "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
        "pand                 %%mm5, %%mm0              \n\t"
        "pand                 %%mm6, %%mm1              \n\t"
        "pand                 %%mm7, %%mm2              \n\t"
        "por                  %%mm0, %%mm1              \n\t"
        "por                  %%mm2, %%mm1              \n\t"
        "movq      6(%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
        MOVNTQ"               %%mm1,   (%2, %%"REG_a")  \n\t" // RGB RGB RG
        "movq      8(%1, %%"REG_a"), %%mm1              \n\t" // R BGR BGR B
        "movq     10(%1, %%"REG_a"), %%mm2              \n\t" // GR BGR BGR
        "pand                 %%mm7, %%mm0              \n\t"
        "pand                 %%mm5, %%mm1              \n\t"
        "pand                 %%mm6, %%mm2              \n\t"
        "por                  %%mm0, %%mm1              \n\t"
        "por                  %%mm2, %%mm1              \n\t"
        "movq     14(%1, %%"REG_a"), %%mm0              \n\t" // R BGR BGR B
        MOVNTQ"               %%mm1,  8(%2, %%"REG_a")  \n\t" // B RGB RGB R
        "movq     16(%1, %%"REG_a"), %%mm1              \n\t" // GR BGR BGR
        "movq     18(%1, %%"REG_a"), %%mm2              \n\t" // BGR BGR BG
        "pand                 %%mm6, %%mm0              \n\t"
        "pand                 %%mm7, %%mm1              \n\t"
        "pand                 %%mm5, %%mm2              \n\t"
        "por                  %%mm0, %%mm1              \n\t"
        "por                  %%mm2, %%mm1              \n\t"
        MOVNTQ"               %%mm1, 16(%2, %%"REG_a")  \n\t"
        "add                    $24, %%"REG_a"          \n\t"
        " js                     1b                     \n\t"
        "2:                                             \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; //finished, was multiple of 8

    /* Rewind src/dst to the first unconverted pixel for the scalar tail. */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    /* Scalar conversion: swap bytes 0 and 2 of each triplet. */
    for (i=0; i<src_size; i+=3) {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
1360

    
1361
/*
 * Interleave planar Y, U and V data into packed YUYV (YUY2) output.
 * Per output line, bytes are emitted in the order Y0 U0 Y1 V0 ...
 * (see the little-endian scalar fallback at the bottom).
 * vertLumPerChroma is the number of luma lines sharing one chroma line;
 * the (y & (vertLumPerChroma-1)) test below requires it to be a power
 * of two.  Strides are in bytes.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
#if HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        /* Packs 16 luma + 8 U + 8 V bytes into 32 output bytes per step. */
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            ASMALIGN(4)
            "1:                                         \n\t"
            PREFETCH"    32(%1, %%"REG_a", 2)           \n\t"
            PREFETCH"    32(%2, %%"REG_a")              \n\t"
            PREFETCH"    32(%3, %%"REG_a")              \n\t"
            "movq          (%2, %%"REG_a"), %%mm0       \n\t" // U(0)
            "movq                    %%mm0, %%mm2       \n\t" // U(0)
            "movq          (%3, %%"REG_a"), %%mm1       \n\t" // V(0)
            "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
            "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)

            "movq        (%1, %%"REG_a",2), %%mm3       \n\t" // Y(0)
            "movq       8(%1, %%"REG_a",2), %%mm5       \n\t" // Y(8)
            "movq                    %%mm3, %%mm4       \n\t" // Y(0)
            "movq                    %%mm5, %%mm6       \n\t" // Y(8)
            "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
            "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
            "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
            "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)

            MOVNTQ"                  %%mm3,   (%0, %%"REG_a", 4)    \n\t"
            MOVNTQ"                  %%mm4,  8(%0, %%"REG_a", 4)    \n\t"
            MOVNTQ"                  %%mm5, 16(%0, %%"REG_a", 4)    \n\t"
            MOVNTQ"                  %%mm6, 24(%0, %%"REG_a", 4)    \n\t"

            "add                        $8, %%"REG_a"   \n\t"
            "cmp                        %4, %%"REG_a"   \n\t"
            " jb                        1b              \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else

#if ARCH_ALPHA && HAVE_MVI
        /* Alpha/MVI variant: converts two luma lines per outer iteration
           (note the extra y++ / pointer bumps after the loop). */
#define pl2yuy2(n)                  \
    y1 = yc[n];                     \
    y2 = yc2[n];                    \
    u = uc[n];                      \
    v = vc[n];                      \
    __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1));  \
    __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2));  \
    __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u));    \
    __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v));    \
    yuv1 = (u << 8) + (v << 24);                \
    yuv2 = yuv1 + y2;               \
    yuv1 += y1;                     \
    qdst[n]  = yuv1;                \
    qdst2[n] = yuv2;

        int i;
        uint64_t *qdst = (uint64_t *) dst;
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
        const uint32_t *yc = (uint32_t *) ysrc;
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
        for (i = 0; i < chromWidth; i += 8) {
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
            __asm__("ldq $31,64(%0)" :: "r"(yc));
            __asm__("ldq $31,64(%0)" :: "r"(yc2));
            __asm__("ldq $31,64(%0)" :: "r"(uc));
            __asm__("ldq $31,64(%0)" :: "r"(vc));

            pl2yuy2(0);
            pl2yuy2(1);
            pl2yuy2(2);
            pl2yuy2(3);

            yc    += 4;
            yc2   += 4;
            uc    += 4;
            vc    += 4;
            qdst  += 4;
            qdst2 += 4;
        }
        y++;
        ysrc += lumStride;
        dst += dstStride;

#elif HAVE_FAST_64BIT
        /* Build two packed pixels at a time as one 64-bit store. */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2) {
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        /* Portable fallback: one packed pixel pair per 32-bit store. */
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                (yc[1] << 8) + (vc[0] << 0);
#else
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* Advance chroma only every vertLumPerChroma-th luma line. */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
#if HAVE_MMX
    __asm__(EMMS"       \n\t"
            SFENCE"     \n\t"
            :::"memory");
#endif
}
1498

    
1499
/**
1500
 * Height should be a multiple of 2 and width should be a multiple of 16.
1501
 * (If this is a problem for anyone then tell me, and I will fix it.)
1502
 */
1503
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    /* FIXME: no chroma interpolation yet; every chroma line is simply
       reused for two consecutive luma lines (vertLumPerChroma == 2). */
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height,
                            lumStride, chromStride, dstStride, 2);
}
1510

    
1511
/*
 * Interleave planar Y, U and V data into packed UYVY output.
 * Per output line, bytes are emitted in the order U0 Y0 V0 Y1 ...
 * (see the little-endian scalar fallback at the bottom).
 * vertLumPerChroma is the number of luma lines sharing one chroma line;
 * the (y & (vertLumPerChroma-1)) test below requires it to be a power
 * of two.  Strides are in bytes.
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
#if HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        /* Same structure as yuvPlanartoyuy2, but the punpck operands are
           swapped so chroma bytes land first (UYVY instead of YUYV). */
        __asm__ volatile(
            "xor                %%"REG_a", %%"REG_a"    \n\t"
            ASMALIGN(4)
            "1:                                         \n\t"
            PREFETCH"   32(%1, %%"REG_a", 2)            \n\t"
            PREFETCH"   32(%2, %%"REG_a")               \n\t"
            PREFETCH"   32(%3, %%"REG_a")               \n\t"
            "movq         (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
            "movq                   %%mm0, %%mm2        \n\t" // U(0)
            "movq         (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
            "punpcklbw              %%mm1, %%mm0        \n\t" // UVUV UVUV(0)
            "punpckhbw              %%mm1, %%mm2        \n\t" // UVUV UVUV(8)

            "movq       (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
            "movq      8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
            "movq                   %%mm0, %%mm4        \n\t" // Y(0)
            "movq                   %%mm2, %%mm6        \n\t" // Y(8)
            "punpcklbw              %%mm3, %%mm0        \n\t" // YUYV YUYV(0)
            "punpckhbw              %%mm3, %%mm4        \n\t" // YUYV YUYV(4)
            "punpcklbw              %%mm5, %%mm2        \n\t" // YUYV YUYV(8)
            "punpckhbw              %%mm5, %%mm6        \n\t" // YUYV YUYV(12)

            MOVNTQ"                 %%mm0,   (%0, %%"REG_a", 4)     \n\t"
            MOVNTQ"                 %%mm4,  8(%0, %%"REG_a", 4)     \n\t"
            MOVNTQ"                 %%mm2, 16(%0, %%"REG_a", 4)     \n\t"
            MOVNTQ"                 %%mm6, 24(%0, %%"REG_a", 4)     \n\t"

            "add                       $8, %%"REG_a"    \n\t"
            "cmp                       %4, %%"REG_a"    \n\t"
            " jb                       1b               \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else
//FIXME adapt the Alpha ASM code from yv12->yuy2

#if HAVE_FAST_64BIT
        /* Build two packed pixels at a time as one 64-bit store. */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2) {
            uint64_t k, l;
            k = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
            l = uc[1] + (yc[2] << 8) +
                (vc[1] << 16) + (yc[3] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        /* Portable fallback: one packed pixel pair per 32-bit store. */
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
                (vc[0] << 8) + (yc[1] << 0);
#else
            *idst++ = uc[0] + (yc[0] << 8) +
               (vc[0] << 16) + (yc[1] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* Advance chroma only every vertLumPerChroma-th luma line. */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#if HAVE_MMX
    __asm__(EMMS"       \n\t"
            SFENCE"     \n\t"
            :::"memory");
#endif
}
1602

    
1603
/**
 * Convert planar YV12 (YUV 4:2:0) to packed UYVY (4:2:2).
 *
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (If this is a problem for anyone then tell me, and I will fix it.)
 *
 * vertLumPerChroma = 2: each chroma line is reused for two consecutive
 * luma lines, i.e. chroma is duplicated vertically rather than
 * interpolated (hence the FIXME below).
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1614

    
1615
/**
 * Convert planar YUV 4:2:2 to packed UYVY.
 *
 * Width should be a multiple of 16.
 *
 * vertLumPerChroma = 1: the source already has full vertical chroma
 * resolution, so every chroma line is consumed exactly once.
 */
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
1624

    
1625
/**
 * Convert planar YUV 4:2:2 to packed YUY2 (YUYV).
 *
 * Width should be a multiple of 16.
 *
 * vertLumPerChroma = 1: the source already has full vertical chroma
 * resolution, so every chroma line is consumed exactly once.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
1634

    
1635
/**
 * Convert packed YUY2 (YUYV) to planar YV12 (YUV 4:2:0).
 *
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 *
 * Lines are processed in pairs: the even line of each pair feeds both the
 * luma plane and the (vertically subsampled) chroma planes; the odd line
 * contributes luma only — its chroma samples are simply discarded.
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;  // chroma samples (UV pairs) per line
    for (y=0; y<height; y+=2) {
#if HAVE_MMX
        // Even line: split 16 YUYV pixels per iteration into Y, U and V planes.
        // mm7 is a 0x00FF...00FF byte mask used to isolate the low (Y) bytes.
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            "pcmpeqw                 %%mm7, %%mm7       \n\t"
            "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00...
            ASMALIGN(4)
            "1:                \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
            "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
            "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
            "movq                    %%mm0, %%mm2       \n\t" // YUYV YUYV(0)
            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(4)
            "psrlw                      $8, %%mm0       \n\t" // U0V0 U0V0(0)
            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(4)
            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
            "packuswb                %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(0)

            MOVNTQ"                  %%mm2, (%1, %%"REG_a", 2)  \n\t"

            "movq     16(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
            "movq     24(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(8)
            "movq                    %%mm2, %%mm4       \n\t" // YUYV YUYV(12)
            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(8)
            "psrlw                      $8, %%mm2       \n\t" // U0V0 U0V0(12)
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(8)
            "pand                    %%mm7, %%mm4       \n\t" // Y0Y0 Y0Y0(12)
            "packuswb                %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
            "packuswb                %%mm4, %%mm3       \n\t" // YYYY YYYY(8)

            MOVNTQ"                  %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            // Deinterleave the packed UVUV words into separate U and V runs.
            "movq                    %%mm0, %%mm2       \n\t" // UVUV UVUV(0)
            "movq                    %%mm1, %%mm3       \n\t" // UVUV UVUV(8)
            "psrlw                      $8, %%mm0       \n\t" // V0V0 V0V0(0)
            "psrlw                      $8, %%mm1       \n\t" // V0V0 V0V0(8)
            "pand                    %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
            "pand                    %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
            "packuswb                %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
            "packuswb                %%mm3, %%mm2       \n\t" // UUUU UUUU(0)

            MOVNTQ"                  %%mm0, (%3, %%"REG_a")     \n\t"
            MOVNTQ"                  %%mm2, (%2, %%"REG_a")     \n\t"

            "add                        $8, %%"REG_a"   \n\t"
            "cmp                        %4, %%"REG_a"   \n\t"
            " jb                        1b              \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        // Odd line: extract luma only; chroma of this line is discarded.
        // NOTE(review): this asm statement reads %%mm7 as the 0x00FF mask set
        // up by the PREVIOUS asm statement — it relies on MMX register state
        // surviving between the two statements. Fragile; confirm nothing the
        // compiler emits in between can clobber MMX state.
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            ASMALIGN(4)
            "1:                                         \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
            "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
            "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
            "movq     16(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
            "movq     24(%0, %%"REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
            "pand                    %%mm7, %%mm0       \n\t" // Y0Y0 Y0Y0(0)
            "pand                    %%mm7, %%mm1       \n\t" // Y0Y0 Y0Y0(4)
            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(8)
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(12)
            "packuswb                %%mm1, %%mm0       \n\t" // YYYY YYYY(0)
            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(8)

            MOVNTQ"                  %%mm0,  (%1, %%"REG_a", 2) \n\t"
            MOVNTQ"                  %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add                        $8, %%"REG_a"   \n\t"
            "cmp                        %4, %%"REG_a"   \n\t"
            " jb                        1b              \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
#else
        // Plain C fallback, same line-pair scheme as the MMX path.
        long i;
        for (i=0; i<chromWidth; i++) {
            ydst[2*i+0]     = src[4*i+0];
            udst[i]     = src[4*i+1];
            ydst[2*i+1]     = src[4*i+2];
            vdst[i]     = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        // Odd line: luma only.
        for (i=0; i<chromWidth; i++) {
            ydst[2*i+0]     = src[4*i+0];
            ydst[2*i+1]     = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#if HAVE_MMX
    // Leave MMX state clean and drain the non-temporal (MOVNTQ) stores.
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#endif
}
1755

    
1756
/**
 * Upscale one plane to 2x width and 2x height.
 *
 * Interior samples are produced with asymmetric 3:1 / 1:3 weights,
 * (3*a + b) >> 2, between horizontally / vertically neighbouring source
 * samples; border rows and columns fall back to replicating the nearest
 * source sample.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
    long x,y;

    dst[0]= src[0];

    // first line
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (y=1; y<srcHeight; y++) {
#if HAVE_MMX2 || HAVE_AMD3DNOW
        // Process the aligned bulk (multiples of 16 source pixels) in asm;
        // the loop counter runs from -mmxSize up to 0 (hence the "js 1b").
        const x86_reg mmxSize= srcWidth&~15;
        __asm__ volatile(
            "mov           %4, %%"REG_a"            \n\t"
            "1:                                     \n\t"
            "movq         (%0, %%"REG_a"), %%mm0    \n\t"
            "movq         (%1, %%"REG_a"), %%mm1    \n\t"
            "movq        1(%0, %%"REG_a"), %%mm2    \n\t"
            "movq        1(%1, %%"REG_a"), %%mm3    \n\t"
            "movq       -1(%0, %%"REG_a"), %%mm4    \n\t"
            "movq       -1(%1, %%"REG_a"), %%mm5    \n\t"
            // Two chained PAVGBs against the same register approximate the
            // (3*a + b) >> 2 weighting (avg(avg(b,a),a)), with pavgb rounding.
            PAVGB"                  %%mm0, %%mm5    \n\t"
            PAVGB"                  %%mm0, %%mm3    \n\t"
            PAVGB"                  %%mm0, %%mm5    \n\t"
            PAVGB"                  %%mm0, %%mm3    \n\t"
            PAVGB"                  %%mm1, %%mm4    \n\t"
            PAVGB"                  %%mm1, %%mm2    \n\t"
            PAVGB"                  %%mm1, %%mm4    \n\t"
            PAVGB"                  %%mm1, %%mm2    \n\t"
            "movq                   %%mm5, %%mm7    \n\t"
            "movq                   %%mm4, %%mm6    \n\t"
            "punpcklbw              %%mm3, %%mm5    \n\t"
            "punpckhbw              %%mm3, %%mm7    \n\t"
            "punpcklbw              %%mm2, %%mm4    \n\t"
            "punpckhbw              %%mm2, %%mm6    \n\t"
#if 1
            MOVNTQ"                 %%mm5,  (%2, %%"REG_a", 2)  \n\t"
            MOVNTQ"                 %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
            MOVNTQ"                 %%mm4,  (%3, %%"REG_a", 2)  \n\t"
            MOVNTQ"                 %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
#else
            "movq                   %%mm5,  (%2, %%"REG_a", 2)  \n\t"
            "movq                   %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
            "movq                   %%mm4,  (%3, %%"REG_a", 2)  \n\t"
            "movq                   %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
#endif
            "add                       $8, %%"REG_a"            \n\t"
            " js                       1b                       \n\t"
            :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
            "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
            "g" (-mmxSize)
            : "%"REG_a

        );
#else
        const x86_reg mmxSize=1;  // C-only build: scalar loop below does everything
#endif
        // Left edge of both output rows for this source row pair.
        dst[0        ]= (3*src[0] +   src[srcStride])>>2;
        dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;

        // Scalar tail (or the whole row when no MMX2/3DNow).
        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        // Right edge of both output rows.
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
#if 1
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
#else
    for (x=0; x<srcWidth; x++) {
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#endif

#if HAVE_MMX
    // Leave MMX state clean and drain the non-temporal (MOVNTQ) stores.
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#endif
}
1857

    
1858
/**
 * Convert packed UYVY to planar YV12 (YUV 4:2:0).
 *
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;  // chroma samples (UV pairs) per line
    for (y=0; y<height; y+=2) {
#if HAVE_MMX
        // Even line: split 16 UYVY pixels per iteration into Y, U and V planes.
        // In UYVY chroma sits in the LOW bytes, so "pand" (mask) keeps chroma
        // and "psrlw $8" extracts luma — the opposite of the YUY2 kernel.
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            "pcmpeqw             %%mm7, %%mm7   \n\t"
            "psrlw                  $8, %%mm7   \n\t" // FF,00,FF,00...
            ASMALIGN(4)
            "1:                                 \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
            "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
            "movq                %%mm0, %%mm2   \n\t" // UYVY UYVY(0)
            "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(4)
            "pand                %%mm7, %%mm0   \n\t" // U0V0 U0V0(0)
            "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(4)
            "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(0)
            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(4)
            "packuswb            %%mm1, %%mm0   \n\t" // UVUV UVUV(0)
            "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(0)

            MOVNTQ"              %%mm2,  (%1, %%"REG_a", 2) \n\t"

            "movq     16(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(8)
            "movq     24(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(12)
            "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(8)
            "movq                %%mm2, %%mm4   \n\t" // UYVY UYVY(12)
            "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(8)
            "pand                %%mm7, %%mm2   \n\t" // U0V0 U0V0(12)
            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(8)
            "psrlw                  $8, %%mm4   \n\t" // Y0Y0 Y0Y0(12)
            "packuswb            %%mm2, %%mm1   \n\t" // UVUV UVUV(8)
            "packuswb            %%mm4, %%mm3   \n\t" // YYYY YYYY(8)

            MOVNTQ"              %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            // Deinterleave the packed UVUV words into separate U and V runs.
            "movq                %%mm0, %%mm2   \n\t" // UVUV UVUV(0)
            "movq                %%mm1, %%mm3   \n\t" // UVUV UVUV(8)
            "psrlw                  $8, %%mm0   \n\t" // V0V0 V0V0(0)
            "psrlw                  $8, %%mm1   \n\t" // V0V0 V0V0(8)
            "pand                %%mm7, %%mm2   \n\t" // U0U0 U0U0(0)
            "pand                %%mm7, %%mm3   \n\t" // U0U0 U0U0(8)
            "packuswb            %%mm1, %%mm0   \n\t" // VVVV VVVV(0)
            "packuswb            %%mm3, %%mm2   \n\t" // UUUU UUUU(0)

            MOVNTQ"              %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ"              %%mm2, (%2, %%"REG_a") \n\t"

            "add                    $8, %%"REG_a"   \n\t"
            "cmp                    %4, %%"REG_a"   \n\t"
            " jb                    1b          \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        // Odd line: extract luma only (high bytes); chroma is ignored.
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            ASMALIGN(4)
            "1:                                 \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
            "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
            "movq     16(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(8)
            "movq     24(%0, %%"REG_a", 4), %%mm3   \n\t" // UYVY UYVY(12)
            "psrlw                  $8, %%mm0   \n\t" // Y0Y0 Y0Y0(0)
            "psrlw                  $8, %%mm1   \n\t" // Y0Y0 Y0Y0(4)
            "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(8)
            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(12)
            "packuswb            %%mm1, %%mm0   \n\t" // YYYY YYYY(0)
            "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(8)

            MOVNTQ"              %%mm0,  (%1, %%"REG_a", 2) \n\t"
            MOVNTQ"              %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add                    $8, %%"REG_a"   \n\t"
            "cmp                    %4, %%"REG_a"   \n\t"
            " jb                    1b          \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
#else
        // Plain C fallback, same line-pair scheme as the MMX path.
        long i;
        for (i=0; i<chromWidth; i++) {
            udst[i]     = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i]     = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        // Odd line: luma only.
        for (i=0; i<chromWidth; i++) {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#if HAVE_MMX
    // Leave MMX state clean and drain the non-temporal (MOVNTQ) stores.
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#endif
}
1980

    
1981
/**
1982
 * Height should be a multiple of 2 and width should be a multiple of 2.
1983
 * (If this is a problem for anyone then tell me, and I will fix it.)
1984
 * Chrominance data is only taken from every second line,
1985
 * others are ignored in the C version.
1986
 * FIXME: Write HQ version.
1987
 */
1988
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1989
                                       long width, long height,
1990
                                       long lumStride, long chromStride, long srcStride)
1991
{
1992
    long y;
1993
    const x86_reg chromWidth= width>>1;
1994
#if HAVE_MMX
1995
    for (y=0; y<height-2; y+=2) {
1996
        long i;
1997
        for (i=0; i<2; i++) {
1998
            __asm__ volatile(
1999
                "mov                        %2, %%"REG_a"   \n\t"
2000
                "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
2001
                "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2002
                "pxor                    %%mm7, %%mm7       \n\t"
2003
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2004
                ASMALIGN(4)
2005
                "1:                                         \n\t"
2006
                PREFETCH"    64(%0, %%"REG_d")              \n\t"
2007
                "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2008
                "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
2009
                "punpcklbw               %%mm7, %%mm0       \n\t"
2010
                "punpcklbw               %%mm7, %%mm1       \n\t"
2011
                "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
2012
                "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
2013
                "punpcklbw               %%mm7, %%mm2       \n\t"
2014
                "punpcklbw               %%mm7, %%mm3       \n\t"
2015
                "pmaddwd                 %%mm6, %%mm0       \n\t"
2016
                "pmaddwd                 %%mm6, %%mm1       \n\t"
2017
                "pmaddwd                 %%mm6, %%mm2       \n\t"
2018
                "pmaddwd                 %%mm6, %%mm3       \n\t"
2019
#ifndef FAST_BGR2YV12
2020
                "psrad                      $8, %%mm0       \n\t"
2021
                "psrad                      $8, %%mm1       \n\t"
2022
                "psrad                      $8, %%mm2       \n\t"
2023
                "psrad                      $8, %%mm3       \n\t"
2024
#endif
2025
                "packssdw                %%mm1, %%mm0       \n\t"
2026
                "packssdw                %%mm3, %%mm2       \n\t"
2027
                "pmaddwd                 %%mm5, %%mm0       \n\t"
2028
                "pmaddwd                 %%mm5, %%mm2       \n\t"
2029
                "packssdw                %%mm2, %%mm0       \n\t"
2030
                "psraw                      $7, %%mm0       \n\t"
2031

    
2032
                "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2033
                "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
2034
                "punpcklbw               %%mm7, %%mm4       \n\t"
2035
                "punpcklbw               %%mm7, %%mm1       \n\t"
2036
                "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
2037
                "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
2038
                "punpcklbw               %%mm7, %%mm2       \n\t"
2039
                "punpcklbw               %%mm7, %%mm3       \n\t"
2040
                "pmaddwd                 %%mm6, %%mm4       \n\t"
2041
                "pmaddwd                 %%mm6, %%mm1       \n\t"
2042
                "pmaddwd                 %%mm6, %%mm2       \n\t"
2043
                "pmaddwd                 %%mm6, %%mm3       \n\t"
2044
#ifndef FAST_BGR2YV12
2045
                "psrad                      $8, %%mm4       \n\t"
2046
                "psrad                      $8, %%mm1       \n\t"
2047
                "psrad                      $8, %%mm2       \n\t"
2048
                "psrad                      $8, %%mm3       \n\t"
2049
#endif
2050
                "packssdw                %%mm1, %%mm4       \n\t"
2051
                "packssdw                %%mm3, %%mm2       \n\t"
2052
                "pmaddwd                 %%mm5, %%mm4       \n\t"
2053
                "pmaddwd                 %%mm5, %%mm2       \n\t"
2054
                "add                       $24, %%"REG_d"   \n\t"
2055
                "packssdw                %%mm2, %%mm4       \n\t"
2056
                "psraw                      $7, %%mm4       \n\t"
2057

    
2058
                "packuswb                %%mm4, %%mm0       \n\t"
2059
                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
2060

    
2061
                MOVNTQ"                  %%mm0, (%1, %%"REG_a") \n\t"
2062
                "add                        $8,      %%"REG_a"  \n\t"
2063
                " js                        1b                  \n\t"
2064
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
2065
                : "%"REG_a, "%"REG_d
2066
            );
2067
            ydst += lumStride;
2068
            src  += srcStride;
2069
        }
2070
        src -= srcStride*2;
2071
        __asm__ volatile(
2072
            "mov                        %4, %%"REG_a"   \n\t"
2073
            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2074
            "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
2075
            "pxor                    %%mm7, %%mm7       \n\t"
2076
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2077
            "add                 %%"REG_d", %%"REG_d"   \n\t"
2078
            ASMALIGN(4)
2079
            "1:                                         \n\t"
2080
            PREFETCH"    64(%0, %%"REG_d")              \n\t"
2081
            PREFETCH"    64(%1, %%"REG_d")              \n\t"
2082
#if HAVE_MMX2 || HAVE_AMD3DNOW
2083
            "movq          (%0, %%"REG_d"), %%mm0       \n\t"
2084
            "movq          (%1, %%"REG_d"), %%mm1       \n\t"
2085
            "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
2086
            "movq         6(%1, %%"REG_d"), %%mm3       \n\t"
2087
            PAVGB"                   %%mm1, %%mm0       \n\t"
2088
            PAVGB"                   %%mm3, %%mm2       \n\t"
2089
            "movq                    %%mm0, %%mm1       \n\t"
2090
            "movq                    %%mm2, %%mm3       \n\t"
2091
            "psrlq                     $24, %%mm0       \n\t"
2092
            "psrlq                     $24, %%mm2       \n\t"
2093
            PAVGB"                   %%mm1, %%mm0       \n\t"
2094
            PAVGB"                   %%mm3, %%mm2       \n\t"
2095
            "punpcklbw               %%mm7, %%mm0       \n\t"
2096
            "punpcklbw               %%mm7, %%mm2       \n\t"
2097
#else
2098
            "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2099
            "movd          (%1, %%"REG_d"), %%mm1       \n\t"
2100
            "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
2101
            "movd         3(%1, %%"REG_d"), %%mm3       \n\t"
2102
            "punpcklbw               %%mm7, %%mm0       \n\t"
2103
            "punpcklbw               %%mm7, %%mm1       \n\t"
2104
            "punpcklbw               %%mm7, %%mm2       \n\t"
2105
            "punpcklbw               %%mm7, %%mm3       \n\t"
2106
            "paddw                   %%mm1, %%mm0       \n\t"
2107
            "paddw                   %%mm3, %%mm2       \n\t"
2108
            "paddw                   %%mm2, %%mm0       \n\t"
2109
            "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
2110
            "movd         6(%1, %%"REG_d"), %%mm1       \n\t"
2111
            "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
2112
            "movd         9(%1, %%"REG_d"), %%mm3       \n\t"
2113
            "punpcklbw               %%mm7, %%mm4       \n\t"
2114
            "punpcklbw               %%mm7, %%mm1       \n\t"
2115
            "punpcklbw               %%mm7, %%mm2       \n\t"
2116
            "punpcklbw               %%mm7, %%mm3       \n\t"
2117
            "paddw                   %%mm1, %%mm4       \n\t"
2118
            "paddw                   %%mm3, %%mm2       \n\t"
2119
            "paddw                   %%mm4, %%mm2       \n\t"
2120
            "psrlw                      $2, %%mm0       \n\t"
2121
            "psrlw                      $2, %%mm2       \n\t"
2122
#endif
2123
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
2124
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
2125

    
2126
            "pmaddwd                 %%mm0, %%mm1       \n\t"
2127
            "pmaddwd                 %%mm2, %%mm3       \n\t"
2128
            "pmaddwd                 %%mm6, %%mm0       \n\t"
2129
            "pmaddwd                 %%mm6, %%mm2       \n\t"
2130
#ifndef FAST_BGR2YV12
2131
            "psrad                      $8, %%mm0       \n\t"
2132
            "psrad                      $8, %%mm1       \n\t"
2133
            "psrad                      $8, %%mm2       \n\t"
2134
            "psrad                      $8, %%mm3       \n\t"
2135
#endif
2136
            "packssdw                %%mm2, %%mm0       \n\t"
2137
            "packssdw                %%mm3, %%mm1       \n\t"
2138
            "pmaddwd                 %%mm5, %%mm0       \n\t"
2139
            "pmaddwd                 %%mm5, %%mm1       \n\t"
2140
            "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
2141
            "psraw                      $7, %%mm0       \n\t"
2142

    
2143
#if HAVE_MMX2 || HAVE_AMD3DNOW
2144
            "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
2145
            "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
2146
            "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
2147
            "movq        18(%1, %%"REG_d"), %%mm3       \n\t"
2148
            PAVGB"                   %%mm1, %%mm4       \n\t"
2149
            PAVGB"                   %%mm3, %%mm2       \n\t"
2150
            "movq                    %%mm4, %%mm1       \n\t"
2151
            "movq                    %%mm2, %%mm3       \n\t"
2152
            "psrlq                     $24, %%mm4       \n\t"
2153
            "psrlq                     $24, %%mm2       \n\t"
2154
            PAVGB"                   %%mm1, %%mm4       \n\t"
2155
            PAVGB"                   %%mm3, %%mm2       \n\t"
2156
            "punpcklbw               %%mm7, %%mm4       \n\t"
2157
            "punpcklbw               %%mm7, %%mm2       \n\t"
2158
#else
2159
            "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2160
            "movd        12(%1, %%"REG_d"), %%mm1       \n\t"
2161
            "movd        15(%0, %%"REG_d"), %%mm2       \n\t"
2162
            "movd        15(%1, %%"REG_d"), %%mm3       \n\t"
2163
            "punpcklbw               %%mm7, %%mm4       \n\t"
2164
            "punpcklbw               %%mm7, %%mm1       \n\t"
2165
            "punpcklbw               %%mm7, %%mm2       \n\t"
2166
            "punpcklbw               %%mm7, %%mm3       \n\t"
2167
            "paddw                   %%mm1, %%mm4       \n\t"
2168
            "paddw                   %%mm3, %%mm2       \n\t"
2169
            "paddw                   %%mm2, %%mm4       \n\t"
2170
            "movd        18(%0, %%"REG_d"), %%mm5       \n\t"
2171
            "movd        18(%1, %%"REG_d"), %%mm1       \n\t"
2172
            "movd        21(%0, %%"REG_d"), %%mm2       \n\t"
2173
            "movd        21(%1, %%"REG_d"), %%mm3       \n\t"
2174
            "punpcklbw               %%mm7, %%mm5       \n\t"
2175
            "punpcklbw               %%mm7, %%mm1       \n\t"
2176
            "punpcklbw               %%mm7, %%mm2       \n\t"
2177
            "punpcklbw               %%mm7, %%mm3       \n\t"
2178
            "paddw                   %%mm1, %%mm5       \n\t"
2179
            "paddw                   %%mm3, %%mm2       \n\t"
2180
            "paddw                   %%mm5, %%mm2       \n\t"
2181
            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2182
            "psrlw                      $2, %%mm4       \n\t"
2183
            "psrlw                      $2, %%mm2       \n\t"
2184
#endif
2185
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
2186
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
2187

    
2188
            "pmaddwd                 %%mm4, %%mm1       \n\t"
2189
            "pmaddwd                 %%mm2, %%mm3       \n\t"
2190
            "pmaddwd                 %%mm6, %%mm4       \n\t"
2191
            "pmaddwd                 %%mm6, %%mm2       \n\t"
2192
#ifndef FAST_BGR2YV12
2193
            "psrad                      $8, %%mm4       \n\t"
2194
            "psrad                      $8, %%mm1       \n\t"
2195
            "psrad                      $8, %%mm2       \n\t"
2196
            "psrad                      $8, %%mm3       \n\t"
2197
#endif
2198
            "packssdw                %%mm2, %%mm4       \n\t"
2199
            "packssdw                %%mm3, %%mm1       \n\t"
2200
            "pmaddwd                 %%mm5, %%mm4       \n\t"
2201
            "pmaddwd                 %%mm5, %%mm1       \n\t"
2202
            "add                       $24, %%"REG_d"   \n\t"
2203
            "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
2204
            "psraw                      $7, %%mm4       \n\t"
2205

    
2206
            "movq                    %%mm0, %%mm1           \n\t"
2207
            "punpckldq               %%mm4, %%mm0           \n\t"
2208
            "punpckhdq               %%mm4, %%mm1           \n\t"
2209
            "packsswb                %%mm1, %%mm0           \n\t"
2210
            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0         \n\t"
2211
            "movd                    %%mm0, (%2, %%"REG_a") \n\t"
2212
            "punpckhdq               %%mm0, %%mm0           \n\t"
2213
            "movd                    %%mm0, (%3, %%"REG_a") \n\t"
2214
            "add                        $4, %%"REG_a"       \n\t"
2215
            " js                        1b                  \n\t"
2216
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2217
            : "%"REG_a, "%"REG_d
2218
        );
2219

    
2220
        udst += chromStride;
2221
        vdst += chromStride;
2222
        src  += srcStride*2;
2223
    }
2224

    
2225
    __asm__ volatile(EMMS"       \n\t"
2226
                     SFENCE"     \n\t"
2227
                     :::"memory");
2228
#else
2229
    y=0;
2230
#endif
2231
    for (; y<height; y+=2) {
2232
        long i;
2233
        for (i=0; i<chromWidth; i++) {
2234
            unsigned int b = src[6*i+0];
2235
            unsigned int g = src[6*i+1];
2236
            unsigned int r = src[6*i+2];
2237

    
2238
            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2239
            unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2240
            unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2241

    
2242
            udst[i]     = U;
2243
            vdst[i]     = V;
2244
            ydst[2*i]   = Y;
2245

    
2246
            b = src[6*i+3];
2247
            g = src[6*i+4];
2248
            r = src[6*i+5];
2249

    
2250
            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2251
            ydst[2*i+1]     = Y;
2252
        }
2253
        ydst += lumStride;
2254
        src  += srcStride;
2255

    
2256
        for (i=0; i<chromWidth; i++) {
2257
            unsigned int b = src[6*i+0];
2258
            unsigned int g = src[6*i+1];
2259
            unsigned int r = src[6*i+2];
2260

    
2261
            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2262

    
2263
            ydst[2*i]     = Y;
2264

    
2265
            b = src[6*i+3];
2266
            g = src[6*i+4];
2267
            r = src[6*i+5];
2268

    
2269
            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2270
            ydst[2*i+1]     = Y;
2271
        }
2272
        udst += chromStride;
2273
        vdst += chromStride;
2274
        ydst += lumStride;
2275
        src  += srcStride;
2276
    }
2277
}
2278

    
2279
/* Interleave two 8-bit planes byte-by-byte into a single plane twice as wide:
 * for every row, dest[2*i] = src1[i] and dest[2*i+1] = src2[i].
 * Strides are in bytes and applied per row. */
static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
                             long width, long height, long src1Stride,
                             long src2Stride, long dstStride)
{
    long h;

    for (h=0; h < height; h++) {
        long w;

#if HAVE_MMX
#if HAVE_SSE2
        /* SSE2 path: 16 bytes from each source per iteration, interleaved with
         * punpckl/hbw and stored non-temporally.
         * NOTE(review): movdqa needs 16-byte alignment of src1/src2 (and their
         * strides) — presumably guaranteed by the callers; verify. */
        __asm__(
            "xor              %%"REG_a", %%"REG_a"  \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%1, %%"REG_a")             \n\t"
            PREFETCH" 64(%2, %%"REG_a")             \n\t"
            "movdqa     (%1, %%"REG_a"), %%xmm0     \n\t"
            "movdqa     (%1, %%"REG_a"), %%xmm1     \n\t"
            "movdqa     (%2, %%"REG_a"), %%xmm2     \n\t"
            "punpcklbw           %%xmm2, %%xmm0     \n\t"
            "punpckhbw           %%xmm2, %%xmm1     \n\t"
            "movntdq             %%xmm0,   (%0, %%"REG_a", 2)   \n\t"
            "movntdq             %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
            "add                    $16, %%"REG_a"  \n\t"
            "cmp                     %3, %%"REG_a"  \n\t"
            " jb                     1b             \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a""
        );
#else
        /* MMX path: 16 bytes from each source per iteration via two
         * punpckl/punpckhbw pairs. */
        __asm__(
            "xor %%"REG_a", %%"REG_a"               \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%1, %%"REG_a")             \n\t"
            PREFETCH" 64(%2, %%"REG_a")             \n\t"
            "movq       (%1, %%"REG_a"), %%mm0      \n\t"
            "movq      8(%1, %%"REG_a"), %%mm2      \n\t"
            "movq                 %%mm0, %%mm1      \n\t"
            "movq                 %%mm2, %%mm3      \n\t"
            "movq       (%2, %%"REG_a"), %%mm4      \n\t"
            "movq      8(%2, %%"REG_a"), %%mm5      \n\t"
            "punpcklbw            %%mm4, %%mm0      \n\t"
            "punpckhbw            %%mm4, %%mm1      \n\t"
            "punpcklbw            %%mm5, %%mm2      \n\t"
            "punpckhbw            %%mm5, %%mm3      \n\t"
            MOVNTQ"               %%mm0,   (%0, %%"REG_a", 2)   \n\t"
            MOVNTQ"               %%mm1,  8(%0, %%"REG_a", 2)   \n\t"
            MOVNTQ"               %%mm2, 16(%0, %%"REG_a", 2)   \n\t"
            MOVNTQ"               %%mm3, 24(%0, %%"REG_a", 2)   \n\t"
            "add                    $16, %%"REG_a"  \n\t"
            "cmp                     %3, %%"REG_a"  \n\t"
            " jb                     1b             \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a
        );
#endif
        /* Scalar tail: the last width % 16 bytes of the row. */
        for (w= (width&(~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#else
        /* Pure C fallback when MMX is unavailable. */
        for (w=0; w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#endif
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
#if HAVE_MMX
    /* Leave MMX state and drain the non-temporal store buffers. */
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
            );
#endif
}
2357

    
2358
/* Upscale two chroma planes by 2x in each direction: every source byte is
 * written twice horizontally, and every source row is used for two output
 * rows (row index y>>1). w/h below are the *output* dimensions divided by 2
 * relative to width/height, i.e. the source plane size.
 * Presumably converts YVU9-resolution chroma to YV12 resolution — verify
 * against the callers. */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       long width, long height,
                                       long srcStride1, long srcStride2,
                                       long dstStride1, long dstStride2)
{
    x86_reg y;
    long x,w,h;
    w=width/2; h=height/2;
#if HAVE_MMX
    /* Warm the cache with the second row of each source plane. */
    __asm__ volatile(
        PREFETCH" %0    \n\t"
        PREFETCH" %1    \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    /* First plane: src1 -> dst1. */
    for (y=0;y<h;y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1);  /* each source row used twice */
        uint8_t* d=dst1+dstStride1*y;
        x=0;
#if HAVE_MMX
        /* 32 source bytes -> 64 output bytes per iteration; punpcklbw/punpckhbw
         * with a register against itself duplicates each byte. */
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH"   32%1        \n\t"
                "movq         %1, %%mm0 \n\t"
                "movq        8%1, %%mm2 \n\t"
                "movq       16%1, %%mm4 \n\t"
                "movq       24%1, %%mm6 \n\t"
                "movq      %%mm0, %%mm1 \n\t"
                "movq      %%mm2, %%mm3 \n\t"
                "movq      %%mm4, %%mm5 \n\t"
                "movq      %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ"    %%mm0,   %0  \n\t"
                MOVNTQ"    %%mm1,  8%0  \n\t"
                MOVNTQ"    %%mm2, 16%0  \n\t"
                MOVNTQ"    %%mm3, 24%0  \n\t"
                MOVNTQ"    %%mm4, 32%0  \n\t"
                MOVNTQ"    %%mm5, 40%0  \n\t"
                MOVNTQ"    %%mm6, 48%0  \n\t"
                MOVNTQ"    %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
#endif
        /* Scalar tail: duplicate each remaining byte horizontally. */
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    /* Second plane: src2 -> dst2 (same algorithm). */
    for (y=0;y<h;y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
#if HAVE_MMX
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH"   32%1        \n\t"
                "movq         %1, %%mm0 \n\t"
                "movq        8%1, %%mm2 \n\t"
                "movq       16%1, %%mm4 \n\t"
                "movq       24%1, %%mm6 \n\t"
                "movq      %%mm0, %%mm1 \n\t"
                "movq      %%mm2, %%mm3 \n\t"
                "movq      %%mm4, %%mm5 \n\t"
                "movq      %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ"    %%mm0,   %0  \n\t"
                MOVNTQ"    %%mm1,  8%0  \n\t"
                MOVNTQ"    %%mm2, 16%0  \n\t"
                MOVNTQ"    %%mm3, 24%0  \n\t"
                MOVNTQ"    %%mm4, 32%0  \n\t"
                MOVNTQ"    %%mm5, 40%0  \n\t"
                MOVNTQ"    %%mm6, 48%0  \n\t"
                MOVNTQ"    %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
#endif
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#if HAVE_MMX
    /* Leave MMX state and drain the non-temporal store buffers. */
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
        );
#endif
}
2459

    
2460
/* Convert planar YUV with quarter-resolution chroma (src1 = luma; src2/src3 =
 * chroma planes read at y>>2 vertically and one sample per 4 luma pixels
 * horizontally — i.e. YUV410/YVU9 layout) to packed YUY2:
 * output bytes are Y, src2-sample, Y, src3-sample, with each chroma sample
 * replicated across 4 luma pixels. w is width/2 (number of output chroma
 * pairs is 2*w per row group). */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        long width, long height,
                                        long srcStride1, long srcStride2,
                                        long srcStride3, long dstStride)
{
    x86_reg x;
    long y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++) {
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);  /* one chroma row per 4 luma rows */
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
#if HAVE_MMX
        /* 8 chroma samples (32 luma pixels, 64 output bytes) per iteration. */
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH"   32(%1, %0)          \n\t"
                PREFETCH"   32(%2, %0)          \n\t"
                PREFETCH"   32(%3, %0)          \n\t"
                "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq         (%2, %0), %%mm1   \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq         (%3, %0), %%mm2   \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq            %%mm0, %%mm3   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq            %%mm1, %%mm4   \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq            %%mm2, %%mm5   \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw       %%mm1, %%mm1   \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw       %%mm2, %%mm2   \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw       %%mm4, %%mm4   \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw       %%mm5, %%mm5   \n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq            %%mm1, %%mm6   \n\t"
                "punpcklbw       %%mm2, %%mm1   \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw       %%mm1, %%mm0   \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw       %%mm1, %%mm3   \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ"          %%mm0,  (%4, %0, 8)    \n\t"
                MOVNTQ"          %%mm3, 8(%4, %0, 8)    \n\t"

                "punpckhbw       %%mm2, %%mm6   \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq     8(%1, %0, 4), %%mm0   \n\t"
                "movq            %%mm0, %%mm3   \n\t"
                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ"          %%mm0, 16(%4, %0, 8)   \n\t"
                MOVNTQ"          %%mm3, 24(%4, %0, 8)   \n\t"

                "movq            %%mm4, %%mm6   \n\t"
                "movq    16(%1, %0, 4), %%mm0   \n\t"
                "movq            %%mm0, %%mm3   \n\t"
                "punpcklbw       %%mm5, %%mm4   \n\t"
                "punpcklbw       %%mm4, %%mm0   \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw       %%mm4, %%mm3   \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ"          %%mm0, 32(%4, %0, 8)   \n\t"
                MOVNTQ"          %%mm3, 40(%4, %0, 8)   \n\t"

                "punpckhbw       %%mm5, %%mm6   \n\t"
                "movq    24(%1, %0, 4), %%mm0   \n\t"
                "movq            %%mm0, %%mm3   \n\t"
                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ"          %%mm0, 48(%4, %0, 8)   \n\t"
                MOVNTQ"          %%mm3, 56(%4, %0, 8)   \n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
#endif
        /* Scalar tail: one chroma sample covers 4 luma pixels (two YUYV pairs). */
        for (; x<w; x++) {
            const long x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
#if HAVE_MMX
    /* Leave MMX state and drain the non-temporal store buffers. */
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
        );
#endif
}
2549

    
2550
/* De-interleave: dst[i] = src[2*i] for i in [0, count).
 * Used to pull the luma (or one chroma) channel out of a packed 4:2:2 stream.
 * Pointers are advanced to the end and indexed with a negative counter so the
 * loop can test with a single 'js'. */
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst +=   count;
    src += 2*count;
    count= - count;

#if HAVE_MMX
    if(count <= -16) {
        /* Process 16 output bytes per iteration; the scalar loop below
         * redoes the last partial group (hence the +15/-15 adjustment). */
        count += 15;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t" /* mm7 = 0x00FF per word */
            "1:                                \n\t"
            "movq -30(%1, %0, 2), %%mm0        \n\t"
            "movq -22(%1, %0, 2), %%mm1        \n\t"
            "movq -14(%1, %0, 2), %%mm2        \n\t"
            "movq  -6(%1, %0, 2), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t" /* keep even-indexed bytes */
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            MOVNTQ"        %%mm0,-15(%2, %0)   \n\t"
            MOVNTQ"        %%mm2,- 7(%2, %0)   \n\t"
            "add             $16, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;
    }
#endif
    /* Scalar loop: remainder (or everything without MMX). */
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}
2588

    
2589
/* De-interleave two channels from a 4-byte-grouped stream:
 * dst0[i] = src[4*i+0], dst1[i] = src[4*i+2] for i in [0, count).
 * E.g. extracts U and V from packed UYVY. Negative-counter idiom as in
 * extract_even. */
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
#if HAVE_MMX
    if(count <= -8) {
        /* 8 output bytes per destination per iteration; the scalar loop
         * redoes the last partial group. */
        count += 7;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t" /* mm7 = 0x00FF per word */
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t" /* keep even bytes */
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t" /* split into +0 and +2 streams */
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    /* Scalar loop: remainder (or everything without MMX). */
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2636

    
2637
/* Like extract_even2, but averages two source rows first:
 * dst0[i] = (src0[4*i+0]+src1[4*i+0])>>1, dst1[i] = avg of bytes at 4*i+2.
 * Used for chroma when downsampling vertically (e.g. 4:2:2 -> 4:2:0).
 * NOTE: the MMX path uses PAVGB, which rounds up, while the C fallback
 * truncates — results may differ by 1 LSB between paths. */
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        /* 8 output bytes per destination per iteration. */
        count += 7;
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7        \n\t"
            "psrlw             $8, %%mm7        \n\t" /* mm7 = 0x00FF per word */
            "1:                                \n\t"
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t" /* average with second row */
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
            "pand           %%mm7, %%mm0        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm2        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm1, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm2        \n\t"
            "movq           %%mm0, %%mm1        \n\t"
            "movq           %%mm2, %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t" /* split into +0 and +2 streams */
            "psrlw             $8, %%mm2        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm2, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm1        \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
            "add               $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    /* Scalar loop: remainder (or everything without PAVGB). */
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2689

    
2690
/* De-interleave the odd channels of a 4-byte-grouped stream:
 * dst0[i] = src[4*i+1], dst1[i] = src[4*i+3] for i in [0, count)
 * (the scalar loop applies src++ first to hit the odd offsets).
 * E.g. extracts U and V from packed YUYV. */
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
#if HAVE_MMX
    if(count <= -8) {
        /* 8 output bytes per destination per iteration. */
        count += 7;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t" /* mm7 = 0x00FF per word */
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t" /* keep odd-indexed bytes */
            "psrlw            $8, %%mm1        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "psrlw            $8, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t" /* split into +1 and +3 streams */
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src++;  /* shift to the odd bytes for the scalar loop */
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2738

    
2739
/* Like extract_odd2, but averages two source rows first:
 * dst0[i] = avg of bytes at 4*i+1, dst1[i] = avg of bytes at 4*i+3.
 * Used for YUYV chroma when downsampling vertically (4:2:2 -> 4:2:0).
 * NOTE: the MMX path uses PAVGB, which rounds up, while the C fallback
 * truncates — results may differ by 1 LSB between paths. */
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        /* 8 output bytes per destination per iteration. */
        count += 7;
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7        \n\t"
            "psrlw             $8, %%mm7        \n\t" /* mm7 = 0x00FF per word */
            "1:                                \n\t"
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t" /* average with second row */
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t" /* keep odd-indexed bytes */
            "psrlw             $8, %%mm1        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "psrlw             $8, %%mm3        \n\t"
            "packuswb       %%mm1, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm2        \n\t"
            "movq           %%mm0, %%mm1        \n\t"
            "movq           %%mm2, %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t" /* split into +1 and +3 streams */
            "psrlw             $8, %%mm2        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm2, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm1        \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
            "add               $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src0++;  /* shift to the odd bytes for the scalar loop */
    src1++;
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2793

    
2794
/* Convert packed YUYV (4:2:2) to planar YUV 4:2:0.
 * Luma (even bytes) is extracted on every line; chroma (odd bytes) is taken
 * on odd lines only, averaged with the previous line for the vertical
 * subsampling. */
static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    const long chromWidth = -((-width)>>1);  /* ceil(width/2) chroma samples per line */
    long line;

    for (line = 0; line < height;
         line++, src += srcStride, ydst += lumStride) {
        RENAME(extract_even)(src, ydst, width);
        if (line & 1) {
            /* Average this line's chroma with the previous line's. */
            RENAME(extract_odd2avg)(src - srcStride, src, udst, vdst, chromWidth);
            udst += chromStride;
            vdst += chromStride;
        }
    }
#if HAVE_MMX
    /* Leave MMX state and drain the non-temporal store buffers used by the helpers. */
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
        );
#endif
}
2820

    
2821
/* Convert packed YUYV to planar YUV 4:2:2: even bytes to the luma plane,
 * odd bytes split into the U and V planes, line by line. */
static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    const long chromWidth = -((-width)>>1);  /* ceil(width/2) chroma samples per line */
    long line;

    for (line = 0; line < height;
         line++, src += srcStride, ydst += lumStride,
         udst += chromStride, vdst += chromStride) {
        RENAME(extract_even)(src, ydst, width);
        RENAME(extract_odd2)(src, udst, vdst, chromWidth);
    }
#if HAVE_MMX
    /* Leave MMX state and drain the non-temporal store buffers used by the helpers. */
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
        );
#endif
}
2845

    
2846
/* Convert packed UYVY (4:2:2, chroma first) to planar YUV 4:2:0.
 * Luma sits at the odd byte positions (hence src+1); chroma at the even
 * positions, taken on odd lines averaged with the previous line. */
static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    const long chromWidth = -((-width)>>1);  /* ceil(width/2) chroma samples per line */
    long line;

    for (line = 0; line < height;
         line++, src += srcStride, ydst += lumStride) {
        RENAME(extract_even)(src + 1, ydst, width);
        if (line & 1) {
            /* Average this line's chroma with the previous line's. */
            RENAME(extract_even2avg)(src - srcStride, src, udst, vdst, chromWidth);
            udst += chromStride;
            vdst += chromStride;
        }
    }
#if HAVE_MMX
    /* Leave MMX state and drain the non-temporal store buffers used by the helpers. */
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
        );
#endif
}
2872

    
2873
/* Convert packed UYVY to planar YUV 4:2:2: odd bytes to the luma plane
 * (src+1), even bytes split into the U and V planes, line by line. */
static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    const long chromWidth = -((-width)>>1);  /* ceil(width/2) chroma samples per line */
    long line;

    for (line = 0; line < height;
         line++, src += srcStride, ydst += lumStride,
         udst += chromStride, vdst += chromStride) {
        RENAME(extract_even)(src + 1, ydst, width);
        RENAME(extract_even2)(src, udst, vdst, chromWidth);
    }
#if HAVE_MMX
    /* Leave MMX state and drain the non-temporal store buffers used by the helpers. */
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
        );
#endif
}
2897

    
2898
static inline void RENAME(rgb2rgb_init)(void)
2899
{
2900
    rgb15to16       = RENAME(rgb15to16);
2901
    rgb15tobgr24    = RENAME(rgb15tobgr24);
2902
    rgb15to32       = RENAME(rgb15to32);
2903
    rgb16tobgr24    = RENAME(rgb16tobgr24);
2904
    rgb16to32       = RENAME(rgb16to32);
2905
    rgb16to15       = RENAME(rgb16to15);
2906
    rgb24tobgr16    = RENAME(rgb24tobgr16);
2907
    rgb24tobgr15    = RENAME(rgb24tobgr15);
2908
    rgb24tobgr32    = RENAME(rgb24tobgr32);
2909
    rgb32to16       = RENAME(rgb32to16);
2910
    rgb32to15       = RENAME(rgb32to15);
2911
    rgb32tobgr24    = RENAME(rgb32tobgr24);
2912
    rgb24to15       = RENAME(rgb24to15);
2913
    rgb24to16       = RENAME(rgb24to16);
2914
    rgb24tobgr24    = RENAME(rgb24tobgr24);
2915
    rgb32tobgr32    = RENAME(rgb32tobgr32);
2916
    rgb32tobgr16    = RENAME(rgb32tobgr16);
2917
    rgb32tobgr15    = RENAME(rgb32tobgr15);
2918
    yv12toyuy2      = RENAME(yv12toyuy2);
2919
    yv12touyvy      = RENAME(yv12touyvy);
2920
    yuv422ptoyuy2   = RENAME(yuv422ptoyuy2);
2921
    yuv422ptouyvy   = RENAME(yuv422ptouyvy);
2922
    yuy2toyv12      = RENAME(yuy2toyv12);
2923
    planar2x        = RENAME(planar2x);
2924
    rgb24toyv12     = RENAME(rgb24toyv12);
2925
    interleaveBytes = RENAME(interleaveBytes);
2926
    vu9_to_vu12     = RENAME(vu9_to_vu12);
2927
    yvu9_to_yuy2    = RENAME(yvu9_to_yuy2);
2928

    
2929
    uyvytoyuv420    = RENAME(uyvytoyuv420);
2930
    uyvytoyuv422    = RENAME(uyvytoyuv422);
2931
    yuyvtoyuv420    = RENAME(yuyvtoyuv420);
2932
    yuyvtoyuv422    = RENAME(yuyvtoyuv422);
2933
}