/* Source: ffmpeg / libswscale / rgb2rgb_template.c @ revision 5b03661f
 * (web-viewer navigation chrome removed from this scraped copy) */

1
/*
2
 * software RGB to RGB converter
3
 * pluralize by software PAL8 to RGB converter
4
 *              software YUV to YUV converter
5
 *              software YUV to RGB converter
6
 * Written by Nick Kurshev.
7
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8
 * lot of big-endian byte order fixes by Alex Beregszaszi
9
 *
10
 * This file is part of FFmpeg.
11
 *
12
 * FFmpeg is free software; you can redistribute it and/or
13
 * modify it under the terms of the GNU Lesser General Public
14
 * License as published by the Free Software Foundation; either
15
 * version 2.1 of the License, or (at your option) any later version.
16
 *
17
 * FFmpeg is distributed in the hope that it will be useful,
18
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
 * Lesser General Public License for more details.
21
 *
22
 * You should have received a copy of the GNU Lesser General Public
23
 * License along with FFmpeg; if not, write to the Free Software
24
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25
 */
26

    
27
#include <stddef.h>

/* This template is compiled several times with different HAVE_* feature
 * settings (note the RENAME() wrappers below), so undo any definitions
 * left over from a previous inclusion before redefining them. */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PAVGB

/* SIMD register width in bytes: 16 for SSE2 (xmm), 8 for MMX (mm). */
#if HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/* Cache prefetch and byte-average instructions per CPU feature level;
 * "# nop" expands to an assembler comment (no instruction) when the
 * feature is unavailable. */
#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PAVGB     "pavgusb"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PAVGB     "pavgb"
#else
#define PREFETCH  " # nop"
#endif

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* MMX2 provides non-temporal stores (bypass cache) plus the sfence needed
 * to order them; plain MMX falls back to ordinary movq stores. */
#if HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
66

    
67
/* Expand packed 24-bit RGB to 32-bit, setting the extra (alpha) byte of
 * every output pixel to 255.  src_size is in bytes of input; the output
 * is src_size/3*4 bytes.  The MMX path consumes 24 input bytes (8 pixels)
 * per iteration and ORs in mask32a to fill the alpha byte; the scalar
 * loop finishes whatever is left. */
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* stop 23 bytes before the end so each iteration may read a full 24 bytes */
    mm_end = end - 23;
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "punpckldq    3%1, %%mm0    \n\t"
            "movd         6%1, %%mm1    \n\t"
            "punpckldq    9%1, %%mm1    \n\t"
            "movd        12%1, %%mm2    \n\t"
            "punpckldq   15%1, %%mm2    \n\t"
            "movd        18%1, %%mm3    \n\t"
            "punpckldq   21%1, %%mm3    \n\t"
            "por        %%mm7, %%mm0    \n\t"
            "por        %%mm7, %%mm1    \n\t"
            "por        %%mm7, %%mm2    \n\t"
            "por        %%mm7, %%mm3    \n\t"
            MOVNTQ"     %%mm0,   %0     \n\t"
            MOVNTQ"     %%mm1,  8%0     \n\t"
            MOVNTQ"     %%mm2, 16%0     \n\t"
            MOVNTQ"     %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
#if HAVE_BIGENDIAN
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
        *dest++ = 255;
        *dest++ = s[2];
        *dest++ = s[1];
        *dest++ = s[0];
        s+=3;
#else
        /* little endian: copy the three colour bytes, then append alpha */
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
#endif
    }
}
124

    
125
/* Shared asm tail used by the 32->24 bit converters: on entry mm0/mm1/mm4/mm5
 * and their copies mm2/mm3/mm6/mm7 hold eight 32-bit pixels; the mask24*
 * constants strip the unused byte and the shift/por sequence repacks the
 * remaining 24 bytes contiguously, stored via three MOVNTQ writes to %0. */
#define STORE_BGR24_MMX \
            "psrlq         $8, %%mm2    \n\t" \
            "psrlq         $8, %%mm3    \n\t" \
            "psrlq         $8, %%mm6    \n\t" \
            "psrlq         $8, %%mm7    \n\t" \
            "pand "MANGLE(mask24l)", %%mm0\n\t" \
            "pand "MANGLE(mask24l)", %%mm1\n\t" \
            "pand "MANGLE(mask24l)", %%mm4\n\t" \
            "pand "MANGLE(mask24l)", %%mm5\n\t" \
            "pand "MANGLE(mask24h)", %%mm2\n\t" \
            "pand "MANGLE(mask24h)", %%mm3\n\t" \
            "pand "MANGLE(mask24h)", %%mm6\n\t" \
            "pand "MANGLE(mask24h)", %%mm7\n\t" \
            "por        %%mm2, %%mm0    \n\t" \
            "por        %%mm3, %%mm1    \n\t" \
            "por        %%mm6, %%mm4    \n\t" \
            "por        %%mm7, %%mm5    \n\t" \
 \
            "movq       %%mm1, %%mm2    \n\t" \
            "movq       %%mm4, %%mm3    \n\t" \
            "psllq        $48, %%mm2    \n\t" \
            "psllq        $32, %%mm3    \n\t" \
            "pand "MANGLE(mask24hh)", %%mm2\n\t" \
            "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
            "por        %%mm2, %%mm0    \n\t" \
            "psrlq        $16, %%mm1    \n\t" \
            "psrlq        $32, %%mm4    \n\t" \
            "psllq        $16, %%mm5    \n\t" \
            "por        %%mm3, %%mm1    \n\t" \
            "pand  "MANGLE(mask24hhhh)", %%mm5\n\t" \
            "por        %%mm5, %%mm4    \n\t" \
 \
            MOVNTQ"     %%mm0,   %0     \n\t" \
            MOVNTQ"     %%mm1,  8%0     \n\t" \
            MOVNTQ"     %%mm4, 16%0"
160

    
161

    
162
/* Pack 32-bit RGB down to 24-bit by dropping the alpha byte.  The MMX
 * path loads 32 input bytes (8 pixels) per iteration and repacks them to
 * 24 output bytes via STORE_BGR24_MMX; the scalar loop copies the three
 * colour bytes of each remaining pixel and skips the fourth. */
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* stop 31 bytes early: each iteration reads 32 source bytes */
    mm_end = end - 31;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq        16%1, %%mm4    \n\t"
            "movq        24%1, %%mm5    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm1, %%mm3    \n\t"
            "movq       %%mm4, %%mm6    \n\t"
            "movq       %%mm5, %%mm7    \n\t"
            STORE_BGR24_MMX
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
#if HAVE_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
        s++;
        dest[2] = *s++;
        dest[1] = *s++;
        dest[0] = *s++;
        dest += 3;
#else
        /* little endian: keep the first three bytes, skip the alpha byte */
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
#endif
    }
}
211

    
212
/*
213
 original by Strepto/Astral
214
 ported to gcc & bugfixed: A'rpi
215
 MMX2, 3DNOW optimization by Nick Kurshev
216
 32-bit C version, and and&add trick by Michael Niedermayer
217
*/
218
/* Convert 15-bit (1:5:5:5) pixels to 16-bit (5:6:5) in place of layout:
 * the blue 5 bits stay put and the red/green field (bits 5..14, masked by
 * 0x7FE0) is shifted up one bit by the and&add trick  x + (x & 0x7FE0):
 * adding the masked value to itself doubles exactly those bits.
 * MMX path does 16 bytes/iteration with mm4 = mask15s; then a 32-bit C
 * loop, then at most one 16-bit pixel. */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1         \n\t"
            "movq        %1, %%mm0  \n\t"
            "movq       8%1, %%mm2  \n\t"
            "movq     %%mm0, %%mm1  \n\t"
            "movq     %%mm2, %%mm3  \n\t"
            "pand     %%mm4, %%mm0  \n\t"
            "pand     %%mm4, %%mm2  \n\t"
            "paddw    %%mm1, %%mm0  \n\t"
            "paddw    %%mm3, %%mm2  \n\t"
            MOVNTQ"   %%mm0,  %0    \n\t"
            MOVNTQ"   %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* 32-bit C loop: two pixels at a time using the same and&add trick */
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* at most one trailing 16-bit pixel */
    if (s < end) {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
263

    
264
/* Convert 16-bit (5:6:5) pixels to 15-bit (1:5:5:5): keep the low 5-bit
 * field (mask 0x001F) and shift the red/green field down one bit
 * ((x>>1) & 0x7FE0), dropping the green LSB.  MMX path uses mm7 =
 * mask15rg and mm6 = mask15b, 16 bytes/iteration; then a 32-bit C loop
 * and at most one trailing 16-bit pixel. */
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1         \n\t"
            "movq        %1, %%mm0  \n\t"
            "movq       8%1, %%mm2  \n\t"
            "movq     %%mm0, %%mm1  \n\t"
            "movq     %%mm2, %%mm3  \n\t"
            "psrlq       $1, %%mm0  \n\t"
            "psrlq       $1, %%mm2  \n\t"
            "pand     %%mm7, %%mm0  \n\t"
            "pand     %%mm7, %%mm2  \n\t"
            "pand     %%mm6, %%mm1  \n\t"
            "pand     %%mm6, %%mm3  \n\t"
            "por      %%mm1, %%mm0  \n\t"
            "por      %%mm3, %%mm2  \n\t"
            MOVNTQ"   %%mm0,  %0    \n\t"
            MOVNTQ"   %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* 32-bit C loop: two pixels at a time */
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* at most one trailing 16-bit pixel */
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}
314

    
315
/* Convert 32-bit RGB to 16-bit 5:6:5, four pixels per MMX iteration.
 * Two alternative MMX implementations are kept: the pmaddwd-based one
 * (selected by #if 1) folds the shift+mask of two channels into one
 * multiply-add using mask3216g/mask3216br/mul3216; the disabled variant
 * uses explicit per-channel shift+mask.  Scalar tail packs via the
 * 0xFF/0xFC00/0xF80000 masks (see last loop for exact bit layout). */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        "movq           %3, %%mm5   \n\t"
        "movq           %4, %%mm6   \n\t"
        "movq           %5, %%mm7   \n\t"
        "jmp 2f                     \n\t"
        ASMALIGN(4)
        "1:                         \n\t"
        PREFETCH"   32(%1)          \n\t"
        "movd         (%1), %%mm0   \n\t"
        "movd        4(%1), %%mm3   \n\t"
        "punpckldq   8(%1), %%mm0   \n\t"
        "punpckldq  12(%1), %%mm3   \n\t"
        "movq        %%mm0, %%mm1   \n\t"
        "movq        %%mm3, %%mm4   \n\t"
        "pand        %%mm6, %%mm0   \n\t"
        "pand        %%mm6, %%mm3   \n\t"
        "pmaddwd     %%mm7, %%mm0   \n\t"
        "pmaddwd     %%mm7, %%mm3   \n\t"
        "pand        %%mm5, %%mm1   \n\t"
        "pand        %%mm5, %%mm4   \n\t"
        "por         %%mm1, %%mm0   \n\t"
        "por         %%mm4, %%mm3   \n\t"
        "psrld          $5, %%mm0   \n\t"
        "pslld         $11, %%mm3   \n\t"
        "por         %%mm3, %%mm0   \n\t"
        MOVNTQ"      %%mm0, (%0)    \n\t"
        "add           $16,  %1     \n\t"
        "add            $8,  %0     \n\t"
        "2:                         \n\t"
        "cmp            %2,  %1     \n\t"
        " jb            1b          \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq    %0, %%mm7    \n\t"
        "movq    %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: pack one 32-bit pixel per iteration to 5:6:5 */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
410

    
411
/* Convert 32-bit RGB to 16-bit 5:6:5 with the two 5-bit channels swapped
 * relative to rgb32to16 (scalar loop: first byte goes to the high bits,
 * third byte to the low bits).  MMX path handles 4 pixels/iteration using
 * red_16mask/green_16mask/blue_16mask; scalar tail finishes. */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psllq         $8, %%mm0    \n\t"
            "psllq         $8, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: one pixel per iteration, channel order swapped vs rgb32to16 */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}
469

    
470
/* Convert 32-bit RGB to 15-bit 1:5:5:5.  Mirrors rgb32to16 but with 5-bit
 * green (shift counts 6/9/10 instead of 5/8/11 and the mask3215g/mul3215
 * constants).  Two alternative MMX paths are kept under #if 1, same
 * trade-off as in rgb32to16; scalar tail handles the remainder. */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        "movq           %3, %%mm5   \n\t"
        "movq           %4, %%mm6   \n\t"
        "movq           %5, %%mm7   \n\t"
        "jmp            2f          \n\t"
        ASMALIGN(4)
        "1:                         \n\t"
        PREFETCH"   32(%1)          \n\t"
        "movd         (%1), %%mm0   \n\t"
        "movd        4(%1), %%mm3   \n\t"
        "punpckldq   8(%1), %%mm0   \n\t"
        "punpckldq  12(%1), %%mm3   \n\t"
        "movq        %%mm0, %%mm1   \n\t"
        "movq        %%mm3, %%mm4   \n\t"
        "pand        %%mm6, %%mm0   \n\t"
        "pand        %%mm6, %%mm3   \n\t"
        "pmaddwd     %%mm7, %%mm0   \n\t"
        "pmaddwd     %%mm7, %%mm3   \n\t"
        "pand        %%mm5, %%mm1   \n\t"
        "pand        %%mm5, %%mm4   \n\t"
        "por         %%mm1, %%mm0   \n\t"
        "por         %%mm4, %%mm3   \n\t"
        "psrld          $6, %%mm0   \n\t"
        "pslld         $10, %%mm3   \n\t"
        "por         %%mm3, %%mm0   \n\t"
        MOVNTQ"      %%mm0, (%0)    \n\t"
        "add           $16,  %1     \n\t"
        "add            $8,  %0     \n\t"
        "2:                         \n\t"
        "cmp            %2,  %1     \n\t"
        " jb            1b          \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $9, %%mm2    \n\t"
            "psrlq         $9, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: pack one 32-bit pixel per iteration to 1:5:5:5 */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}
565

    
566
/* Convert 32-bit RGB to 15-bit 1:5:5:5 with the two outer 5-bit channels
 * swapped relative to rgb32to15 (scalar loop: first byte to bits 10..14,
 * third byte to bits 0..4).  MMX path handles 4 pixels/iteration with
 * red_15mask/green_15mask/blue_15mask; scalar tail finishes. */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psllq         $7, %%mm0    \n\t"
            "psllq         $7, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: one pixel per iteration, channel order swapped vs rgb32to15 */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}
624

    
625
/* Convert packed 24-bit pixels to 16-bit 5:6:5 with the first byte going
 * to the low 5 bits (scalar loop reads b,g,r in that order).  MMX path
 * consumes 12 input bytes (4 pixels) per iteration; scalar tail finishes. */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    /* stop 11 bytes early: each iteration reads 12 source bytes */
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: pack one pixel per iteration */
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
685

    
686
/* Convert packed 24-bit pixels to 16-bit 5:6:5 with the first byte going
 * to the high 5 bits (scalar loop reads r,g,b — the mirror of
 * rgb24tobgr16).  MMX path consumes 12 input bytes (4 pixels) per
 * iteration; scalar tail finishes. */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psllq         $8, %%mm0    \n\t"
            "psllq         $8, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: pack one pixel per iteration (first byte -> high bits) */
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
746

    
747
/* Convert packed 24-bit pixels to 15-bit 1:5:5:5 with the first byte
 * going to the low 5 bits (scalar loop reads b,g,r).  MMX path consumes
 * 12 input bytes (4 pixels) per iteration using the 15-bit masks;
 * scalar tail finishes. */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /* stop 11 bytes early: each iteration reads 12 source bytes */
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $9, %%mm2    \n\t"
            "psrlq         $9, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: pack one pixel per iteration to 1:5:5:5 */
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
807

    
808
/*
 * Convert packed 24-bit RGB (bytes in R,G,B order) to 15-bit RGB
 * (little-endian uint16 words: R in bits 14-10, G in 9-5, B in 4-0).
 * Low 3 bits of each component are simply truncated.
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    /* mm7 = red_15mask, mm6 = green_15mask; kept live across loop iterations. */
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /* Stop 15 bytes early: each iteration reads up to 13 bytes (movd at offset 9). */
    mm_end = end - 15;
    while (s < mm_end) {
        /* Load 4 RGB24 pixels (12 bytes), isolate R/G/B fields by shift+mask,
         * OR them together and store 4 packed 15-bit words (8 bytes). */
        __asm__ volatile(
            PREFETCH"   32%1            \n\t"
            "movd         %1, %%mm0     \n\t"
            "movd        3%1, %%mm3     \n\t"
            "punpckldq   6%1, %%mm0     \n\t"
            "punpckldq   9%1, %%mm3     \n\t"
            "movq      %%mm0, %%mm1     \n\t"
            "movq      %%mm0, %%mm2     \n\t"
            "movq      %%mm3, %%mm4     \n\t"
            "movq      %%mm3, %%mm5     \n\t"
            "psllq        $7, %%mm0     \n\t"
            "psllq        $7, %%mm3     \n\t"
            "pand      %%mm7, %%mm0     \n\t"
            "pand      %%mm7, %%mm3     \n\t"
            "psrlq        $6, %%mm1     \n\t"
            "psrlq        $6, %%mm4     \n\t"
            "pand      %%mm6, %%mm1     \n\t"
            "pand      %%mm6, %%mm4     \n\t"
            "psrlq       $19, %%mm2     \n\t"
            "psrlq       $19, %%mm5     \n\t"
            "pand         %2, %%mm2     \n\t"
            "pand         %2, %%mm5     \n\t"
            "por       %%mm1, %%mm0     \n\t"
            "por       %%mm4, %%mm3     \n\t"
            "por       %%mm2, %%mm0     \n\t"
            "por       %%mm5, %%mm3     \n\t"
            "psllq       $16, %%mm3     \n\t"
            "por       %%mm3, %%mm0     \n\t"
            MOVNTQ"    %%mm0, %0        \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;   /* 4 output words */
        s += 12;  /* 4 input pixels x 3 bytes */
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: also the whole conversion when HAVE_MMX is 0. */
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
868

    
869
/*
  I use a less accurate approximation here by simply left-shifting the input
  value and filling the low order bits with zeroes. This method improves PNG
  compression but this scheme cannot reproduce white exactly, since it does
  not generate an all-ones maximum value; the net effect is to darken the
  image slightly.

  The better method would be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      leftmost bits repeated to fill open bits
       |
   original bits
*/
890
/*
 * Convert 15-bit pixels (uint16 words, fields at bits 4-0 / 9-5 / 14-10)
 * to packed 24-bit output, emitting the low field first. Components are
 * expanded by left shift with zero fill (see note above about darkening).
 */
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* Stop 7 words early: each iteration consumes 8 input words (16 bytes). */
    mm_end = end - 7;
    while (s < mm_end) {
        /* First pass: unpack 8 source words into 8 intermediate 32-bit
         * pixels (two groups of 4), left in mm6/mm7 and mm0/mm3. */
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"

            /* Save first 4 expanded pixels while the second group is built. */
            "movq       %%mm0, %%mm6    \n\t"
            "movq       %%mm3, %%mm7    \n\t"

            "movq         8%1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq         8%1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"

            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        /* Second pass: registers carry state from the previous asm block;
         * repack the 8 x 32-bit pixels down to 24 output bytes. */
        __asm__ volatile(
            "movq       %%mm0, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "movq       %%mm6, %%mm0    \n\t"
            "movq       %%mm7, %%mm1    \n\t"

            "movq       %%mm4, %%mm6    \n\t"
            "movq       %%mm5, %%mm7    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm1, %%mm3    \n\t"

            STORE_BGR24_MMX

            :"=m"(*d)
            :"m"(*s)
            :"memory");
        d += 24;  /* 8 pixels x 3 bytes */
        s += 8;   /* 8 input words */
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail / plain C fallback. */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
996

    
997
/*
 * Convert 16-bit pixels (uint16 words, fields at bits 4-0 / 10-5 / 15-11)
 * to packed 24-bit output, emitting the low field first. Same structure as
 * rgb15tobgr24, with 5-6-5 masks and shifts.
 */
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* Stop 7 words early: each iteration consumes 8 input words (16 bytes). */
    mm_end = end - 7;
    while (s < mm_end) {
        /* First pass: unpack 8 source words into 8 intermediate 32-bit
         * pixels (two groups of 4), left in mm6/mm7 and mm0/mm3. */
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm1    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"

            /* Save first 4 expanded pixels while the second group is built. */
            "movq       %%mm0, %%mm6    \n\t"
            "movq       %%mm3, %%mm7    \n\t"

            "movq         8%1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq         8%1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm1    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        /* Second pass: registers carry state from the previous asm block;
         * repack the 8 x 32-bit pixels down to 24 output bytes. */
        __asm__ volatile(
            "movq       %%mm0, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "movq       %%mm6, %%mm0    \n\t"
            "movq       %%mm7, %%mm1    \n\t"

            "movq       %%mm4, %%mm6    \n\t"
            "movq       %%mm5, %%mm7    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm1, %%mm3    \n\t"

            STORE_BGR24_MMX

            :"=m"(*d)
            :"m"(*s)
            :"memory");
        d += 24;  /* 8 pixels x 3 bytes */
        s += 8;   /* 8 input words */
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail / plain C fallback. */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
1102

    
1103
/*
 * Register contents expected on entry to PACK_RGB32:
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
1110
/* Interleave the word-spread B/G/R channels with an FF alpha byte and store
 * 4 x 32-bit pixels (16 bytes) at %0; memory byte order is B,G,R,FF. */
#define PACK_RGB32 \
    "packuswb   %%mm7, %%mm0    \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb   %%mm7, %%mm1    \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb   %%mm7, %%mm2    \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw  %%mm1, %%mm0    \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw  %%mm6, %%mm2    \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq       %%mm0, %%mm3    \n\t"                               \
    "punpcklwd  %%mm2, %%mm0    \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd  %%mm2, %%mm3    \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ"     %%mm0,  %0      \n\t"                               \
    MOVNTQ"     %%mm3, 8%0      \n\t"                               \

    
1122
/*
 * Expand 15-bit pixels (fields at bits 4-0 / 9-5 / 14-10) to 32-bit pixels
 * with an opaque 0xFF alpha byte; component low bits are zero-filled.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* mm7 = all-zero, mm6 = all-ones (alpha source) for PACK_RGB32. */
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
    /* Stop 3 words early: each iteration consumes 4 input words. */
    mm_end = end - 3;
    while (s < mm_end) {
        /* Split 4 words into B/G/R word lanes, then pack+store via PACK_RGB32. */
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;  /* 4 pixels x 4 bytes */
        s += 4;   /* 4 input words */
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail / plain C fallback; byte order flips with endianness. */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
#endif
    }
}
1174

    
1175
/*
 * Expand 16-bit pixels (fields at bits 4-0 / 10-5 / 15-11) to 32-bit pixels
 * with an opaque 0xFF alpha byte; component low bits are zero-filled.
 * Same structure as rgb15to32 with 5-6-5 masks and shifts.
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* mm7 = all-zero, mm6 = all-ones (alpha source) for PACK_RGB32. */
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
    /* Stop 3 words early: each iteration consumes 4 input words. */
    mm_end = end - 3;
    while (s < mm_end) {
        /* Split 4 words into B/G/R word lanes, then pack+store via PACK_RGB32. */
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm1    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;  /* 4 pixels x 4 bytes */
        s += 4;   /* 4 input words */
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail / plain C fallback; byte order flips with endianness. */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
#endif
    }
}
1227

    
1228
/*
 * Swap the 1st and 3rd byte of every 32-bit pixel (R<->B), leaving the
 * 2nd and 4th bytes in place. Uses a negative index that counts up to 0
 * so the loop condition is a simple sign test.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    /* idx starts negative; s/d are biased so s[idx] walks the buffers.
     * The 15 offset leaves up to 15 tail bytes for the scalar loop. */
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
#if HAVE_MMX
    __asm__ volatile(
        "test          %0, %0           \n\t"
        "jns           2f               \n\t" /* nothing for MMX to do */
        PREFETCH"       (%1, %0)        \n\t"
        /* Build the two complementary byte masks in mm6/mm7 from
         * mask32b ^ mask32r and mmx_one. */
        "movq          %3, %%mm7        \n\t"
        "pxor          %4, %%mm7        \n\t"
        "movq       %%mm7, %%mm6        \n\t"
        "pxor          %5, %%mm7        \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        PREFETCH"     32(%1, %0)        \n\t"
        "movq           (%1, %0), %%mm0 \n\t"
        "movq          8(%1, %0), %%mm1 \n\t"
# if HAVE_MMX2
        /* pshufw $177 swaps bytes 0<->2 per dword in one op. */
        "pshufw      $177, %%mm0, %%mm3 \n\t"
        "pshufw      $177, %%mm1, %%mm5 \n\t"
        "pand       %%mm7, %%mm0        \n\t"
        "pand       %%mm6, %%mm3        \n\t"
        "pand       %%mm7, %%mm1        \n\t"
        "pand       %%mm6, %%mm5        \n\t"
        "por        %%mm3, %%mm0        \n\t"
        "por        %%mm5, %%mm1        \n\t"
# else
        /* Plain MMX: shift the masked R/B bytes across by 16 bits. */
        "movq       %%mm0, %%mm2        \n\t"
        "movq       %%mm1, %%mm4        \n\t"
        "pand       %%mm7, %%mm0        \n\t"
        "pand       %%mm6, %%mm2        \n\t"
        "pand       %%mm7, %%mm1        \n\t"
        "pand       %%mm6, %%mm4        \n\t"
        "movq       %%mm2, %%mm3        \n\t"
        "movq       %%mm4, %%mm5        \n\t"
        "pslld        $16, %%mm2        \n\t"
        "psrld        $16, %%mm3        \n\t"
        "pslld        $16, %%mm4        \n\t"
        "psrld        $16, %%mm5        \n\t"
        "por        %%mm2, %%mm0        \n\t"
        "por        %%mm4, %%mm1        \n\t"
        "por        %%mm3, %%mm0        \n\t"
        "por        %%mm5, %%mm1        \n\t"
# endif
        MOVNTQ"     %%mm0,  (%2, %0)    \n\t"
        MOVNTQ"     %%mm1, 8(%2, %0)    \n\t"
        "add          $16, %0           \n\t" /* 4 pixels per iteration */
        "js            1b               \n\t"
        SFENCE"                         \n\t"
        EMMS"                           \n\t"
        "2:                             \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
#endif
    /* Scalar tail (and full conversion without MMX): swap R/B via
     * 32-bit mask-and-shift, one pixel per iteration. */
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}
1291

    
1292
/*
 * Swap the 1st and 3rd byte of every 24-bit pixel in place order
 * (B<->R), keeping the middle byte. The MMX path processes 8 pixels
 * (24 bytes) per iteration with three mask/merge rounds.
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    unsigned i;
#if HAVE_MMX
    /* Negative counter that runs up to 0; src/dst are biased below. */
    x86_reg mmx_size= 23 - src_size;
    __asm__ volatile (
        "test             %%"REG_a", %%"REG_a"          \n\t"
        "jns                     2f                     \n\t" /* too small for MMX */
        "movq     "MANGLE(mask24r)", %%mm5              \n\t"
        "movq     "MANGLE(mask24g)", %%mm6              \n\t"
        "movq     "MANGLE(mask24b)", %%mm7              \n\t"
        ASMALIGN(4)
        "1:                                             \n\t"
        PREFETCH" 32(%1, %%"REG_a")                     \n\t"
        "movq       (%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
        "movq       (%1, %%"REG_a"), %%mm1              \n\t" // BGR BGR BG
        "movq      2(%1, %%"REG_a"), %%mm2              \n\t" // R BGR BGR B
        "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
        "pand                 %%mm5, %%mm0              \n\t"
        "pand                 %%mm6, %%mm1              \n\t"
        "pand                 %%mm7, %%mm2              \n\t"
        "por                  %%mm0, %%mm1              \n\t"
        "por                  %%mm2, %%mm1              \n\t"
        "movq      6(%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
        MOVNTQ"               %%mm1,   (%2, %%"REG_a")  \n\t" // RGB RGB RG
        "movq      8(%1, %%"REG_a"), %%mm1              \n\t" // R BGR BGR B
        "movq     10(%1, %%"REG_a"), %%mm2              \n\t" // GR BGR BGR
        "pand                 %%mm7, %%mm0              \n\t"
        "pand                 %%mm5, %%mm1              \n\t"
        "pand                 %%mm6, %%mm2              \n\t"
        "por                  %%mm0, %%mm1              \n\t"
        "por                  %%mm2, %%mm1              \n\t"
        "movq     14(%1, %%"REG_a"), %%mm0              \n\t" // R BGR BGR B
        MOVNTQ"               %%mm1,  8(%2, %%"REG_a")  \n\t" // B RGB RGB R
        "movq     16(%1, %%"REG_a"), %%mm1              \n\t" // GR BGR BGR
        "movq     18(%1, %%"REG_a"), %%mm2              \n\t" // BGR BGR BG
        "pand                 %%mm6, %%mm0              \n\t"
        "pand                 %%mm7, %%mm1              \n\t"
        "pand                 %%mm5, %%mm2              \n\t"
        "por                  %%mm0, %%mm1              \n\t"
        "por                  %%mm2, %%mm1              \n\t"
        MOVNTQ"               %%mm1, 16(%2, %%"REG_a")  \n\t"
        "add                    $24, %%"REG_a"          \n\t" /* 8 pixels done */
        " js                     1b                     \n\t"
        "2:                                             \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; //finished, was multiple of 8

    /* Rewind src/dst/src_size to cover the <8-pixel remainder. */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    /* Scalar remainder (and full conversion without MMX). */
    for (i=0; i<src_size; i+=3) {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
1360

    
1361
/*
 * Interleave planar Y/U/V into packed YUYV (Y0 U Y1 V byte order on
 * little endian; see the HAVE_BIGENDIAN branch for the reverse).
 * vertLumPerChroma = number of luma lines sharing one chroma line
 * (must be a power of two: the advance test uses a bitmask).
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1; /* one U+V pair per 2 luma samples */
    for (y=0; y<height; y++) {
#if HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        /* 16 luma + 8 U + 8 V bytes -> 32 output bytes per iteration. */
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            ASMALIGN(4)
            "1:                                         \n\t"
            PREFETCH"    32(%1, %%"REG_a", 2)           \n\t"
            PREFETCH"    32(%2, %%"REG_a")              \n\t"
            PREFETCH"    32(%3, %%"REG_a")              \n\t"
            "movq          (%2, %%"REG_a"), %%mm0       \n\t" // U(0)
            "movq                    %%mm0, %%mm2       \n\t" // U(0)
            "movq          (%3, %%"REG_a"), %%mm1       \n\t" // V(0)
            "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
            "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)

            "movq        (%1, %%"REG_a",2), %%mm3       \n\t" // Y(0)
            "movq       8(%1, %%"REG_a",2), %%mm5       \n\t" // Y(8)
            "movq                    %%mm3, %%mm4       \n\t" // Y(0)
            "movq                    %%mm5, %%mm6       \n\t" // Y(8)
            "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
            "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
            "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
            "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)

            MOVNTQ"                  %%mm3,   (%0, %%"REG_a", 4)    \n\t"
            MOVNTQ"                  %%mm4,  8(%0, %%"REG_a", 4)    \n\t"
            MOVNTQ"                  %%mm5, 16(%0, %%"REG_a", 4)    \n\t"
            MOVNTQ"                  %%mm6, 24(%0, %%"REG_a", 4)    \n\t"

            "add                        $8, %%"REG_a"   \n\t"
            "cmp                        %4, %%"REG_a"   \n\t"
            " jb                        1b              \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else

#if ARCH_ALPHA && HAVE_MVI
/* Alpha/MVI path: builds two output lines at once with unpkbw/unpkbl. */
#define pl2yuy2(n)                  \
    y1 = yc[n];                     \
    y2 = yc2[n];                    \
    u = uc[n];                      \
    v = vc[n];                      \
    __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1));  \
    __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2));  \
    __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u));    \
    __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v));    \
    yuv1 = (u << 8) + (v << 24);                \
    yuv2 = yuv1 + y2;               \
    yuv1 += y1;                     \
    qdst[n]  = yuv1;                \
    qdst2[n] = yuv2;

        int i;
        uint64_t *qdst = (uint64_t *) dst;
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
        const uint32_t *yc = (uint32_t *) ysrc;
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
        for (i = 0; i < chromWidth; i += 8) {
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
            __asm__("ldq $31,64(%0)" :: "r"(yc));
            __asm__("ldq $31,64(%0)" :: "r"(yc2));
            __asm__("ldq $31,64(%0)" :: "r"(uc));
            __asm__("ldq $31,64(%0)" :: "r"(vc));

            pl2yuy2(0);
            pl2yuy2(1);
            pl2yuy2(2);
            pl2yuy2(3);

            yc    += 4;
            yc2   += 4;
            uc    += 4;
            vc    += 4;
            qdst  += 4;
            qdst2 += 4;
        }
        /* Second luma line was consumed above; skip it in the outer loop. */
        y++;
        ysrc += lumStride;
        dst += dstStride;

#elif HAVE_FAST_64BIT
        /* Portable 64-bit path: assemble two YUYV dwords per store. */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2) {
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        /* Portable 32-bit path: one YUYV dword per iteration. */
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                (yc[1] << 8) + (vc[0] << 0);
#else
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* Advance chroma only every vertLumPerChroma luma lines. */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
#if HAVE_MMX
    __asm__(EMMS"       \n\t"
            SFENCE"     \n\t"
            :::"memory");
#endif
}
1498

    
1499
/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    /* YV12: 2 luma lines per chroma line, hence vertLumPerChroma = 2. */
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1510

    
1511
/*
 * Interleave planar Y/U/V into packed UYVY (U Y0 V Y1 byte order on
 * little endian; see the HAVE_BIGENDIAN branch for the reverse).
 * Same structure as yuvPlanartoyuy2 with chroma leading each pair.
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1; /* one U+V pair per 2 luma samples */
    for (y=0; y<height; y++) {
#if HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        /* 16 luma + 8 U + 8 V bytes -> 32 output bytes per iteration. */
        __asm__ volatile(
            "xor                %%"REG_a", %%"REG_a"    \n\t"
            ASMALIGN(4)
            "1:                                         \n\t"
            PREFETCH"   32(%1, %%"REG_a", 2)            \n\t"
            PREFETCH"   32(%2, %%"REG_a")               \n\t"
            PREFETCH"   32(%3, %%"REG_a")               \n\t"
            "movq         (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
            "movq                   %%mm0, %%mm2        \n\t" // U(0)
            "movq         (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
            "punpcklbw              %%mm1, %%mm0        \n\t" // UVUV UVUV(0)
            "punpckhbw              %%mm1, %%mm2        \n\t" // UVUV UVUV(8)

            "movq       (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
            "movq      8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
            "movq                   %%mm0, %%mm4        \n\t" // Y(0)
            "movq                   %%mm2, %%mm6        \n\t" // Y(8)
            "punpcklbw              %%mm3, %%mm0        \n\t" // YUYV YUYV(0)
            "punpckhbw              %%mm3, %%mm4        \n\t" // YUYV YUYV(4)
            "punpcklbw              %%mm5, %%mm2        \n\t" // YUYV YUYV(8)
            "punpckhbw              %%mm5, %%mm6        \n\t" // YUYV YUYV(12)

            MOVNTQ"                 %%mm0,   (%0, %%"REG_a", 4)     \n\t"
            MOVNTQ"                 %%mm4,  8(%0, %%"REG_a", 4)     \n\t"
            MOVNTQ"                 %%mm2, 16(%0, %%"REG_a", 4)     \n\t"
            MOVNTQ"                 %%mm6, 24(%0, %%"REG_a", 4)     \n\t"

            "add                       $8, %%"REG_a"    \n\t"
            "cmp                       %4, %%"REG_a"    \n\t"
            " jb                       1b               \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else
//FIXME adapt the Alpha ASM code from yv12->yuy2

#if HAVE_FAST_64BIT
        /* Portable 64-bit path: assemble two UYVY dwords per store. */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2) {
            uint64_t k, l;
            k = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
            l = uc[1] + (yc[2] << 8) +
                (vc[1] << 16) + (yc[3] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        /* Portable 32-bit path: one UYVY dword per iteration. */
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
                (vc[0] << 8) + (yc[1] << 0);
#else
            *idst++ = uc[0] + (yc[0] << 8) +
               (vc[0] << 16) + (yc[1] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* Advance chroma only every vertLumPerChroma luma lines. */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#if HAVE_MMX
    __asm__(EMMS"       \n\t"
            SFENCE"     \n\t"
            :::"memory");
#endif
}
1602

    
1603
/**
1604
 * Height should be a multiple of 2 and width should be a multiple of 16
1605
 * (If this is a problem for anyone then tell me, and I will fix it.)
1606
 */
1607
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    // YV12 chroma is subsampled 2x vertically, so each chroma line is reused
    // for 2 luma lines (last argument: vertLumPerChroma = 2).
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1614

    
1615
/**
1616
 * Width should be a multiple of 16.
1617
 */
1618
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    // 4:2:2 planar input: one chroma line per luma line (vertLumPerChroma = 1).
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
1624

    
1625
/**
1626
 * Width should be a multiple of 16.
1627
 */
1628
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    // 4:2:2 planar input: one chroma line per luma line (vertLumPerChroma = 1).
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
1634

    
1635
/**
1636
 * Height should be a multiple of 2 and width should be a multiple of 16.
1637
 * (If this is a problem for anyone then tell me, and I will fix it.)
1638
 */
1639
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    // Deinterleave packed YUY2 (Y0 U0 Y1 V0 ...) into planar YV12.
    // Lines are processed in pairs: the even line yields Y + U + V,
    // the following odd line yields Y only (its chroma is dropped).
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
#if HAVE_MMX
        // Even line: split 16 packed pixels per iteration into Y, U and V.
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            "pcmpeqw                 %%mm7, %%mm7       \n\t"
            "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00... (low-byte mask)
            ASMALIGN(4)
            "1:                \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
            "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
            "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
            "movq                    %%mm0, %%mm2       \n\t" // YUYV YUYV(0)
            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(4)
            "psrlw                      $8, %%mm0       \n\t" // U0V0 U0V0(0)
            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(4)
            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
            "packuswb                %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(0)

            MOVNTQ"                  %%mm2, (%1, %%"REG_a", 2)  \n\t"

            "movq     16(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
            "movq     24(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(8)
            "movq                    %%mm2, %%mm4       \n\t" // YUYV YUYV(12)
            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(8)
            "psrlw                      $8, %%mm2       \n\t" // U0V0 U0V0(12)
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(8)
            "pand                    %%mm7, %%mm4       \n\t" // Y0Y0 Y0Y0(12)
            "packuswb                %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
            "packuswb                %%mm4, %%mm3       \n\t" // YYYY YYYY(8)

            MOVNTQ"                  %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            // Separate the interleaved UVUV words into U and V planes.
            "movq                    %%mm0, %%mm2       \n\t" // UVUV UVUV(0)
            "movq                    %%mm1, %%mm3       \n\t" // UVUV UVUV(8)
            "psrlw                      $8, %%mm0       \n\t" // V0V0 V0V0(0)
            "psrlw                      $8, %%mm1       \n\t" // V0V0 V0V0(8)
            "pand                    %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
            "pand                    %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
            "packuswb                %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
            "packuswb                %%mm3, %%mm2       \n\t" // UUUU UUUU(0)

            MOVNTQ"                  %%mm0, (%3, %%"REG_a")     \n\t"
            MOVNTQ"                  %%mm2, (%2, %%"REG_a")     \n\t"

            "add                        $8, %%"REG_a"   \n\t"
            "cmp                        %4, %%"REG_a"   \n\t"
            " jb                        1b              \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        // Odd line: extract luma only.
        // NOTE(review): this block uses %%mm7 as the FF,00 mask but never
        // re-initializes it -- it relies on %%mm7 surviving from the asm
        // block above with no intervening MMX-clobbering code. Fragile;
        // confirm before reordering anything between the two asm statements.
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            ASMALIGN(4)
            "1:                                         \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
            "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
            "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
            "movq     16(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
            "movq     24(%0, %%"REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
            "pand                    %%mm7, %%mm0       \n\t" // Y0Y0 Y0Y0(0)
            "pand                    %%mm7, %%mm1       \n\t" // Y0Y0 Y0Y0(4)
            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(8)
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(12)
            "packuswb                %%mm1, %%mm0       \n\t" // YYYY YYYY(0)
            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(8)

            MOVNTQ"                  %%mm0,  (%1, %%"REG_a", 2) \n\t"
            MOVNTQ"                  %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add                        $8, %%"REG_a"   \n\t"
            "cmp                        %4, %%"REG_a"   \n\t"
            " jb                        1b              \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
#else
        // C fallback: even line, full deinterleave.
        long i;
        for (i=0; i<chromWidth; i++) {
            ydst[2*i+0]     = src[4*i+0];
            udst[i]     = src[4*i+1];
            ydst[2*i+1]     = src[4*i+2];
            vdst[i]     = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        // C fallback: odd line, luma only (chroma of this line is ignored).
        for (i=0; i<chromWidth; i++) {
            ydst[2*i+0]     = src[4*i+0];
            ydst[2*i+1]     = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#if HAVE_MMX
    // Leave MMX state and flush the write-combining buffers used by MOVNTQ.
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#endif
}
1755

    
1756
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1757
                                      uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1758
                                      long width, long height, long lumStride, long chromStride)
1759
{
1760
    /* Y Plane */
1761
    memcpy(ydst, ysrc, width*height);
1762

    
1763
    /* XXX: implement upscaling for U,V */
1764
}
1765

    
1766
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
    // Upscale one plane by 2x in both directions with bilinear 3:1/1:3
    // weighting; first and last rows/columns are edge-replicated.
    long x,y;

    dst[0]= src[0];

    // first line
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (y=1; y<srcHeight; y++) {
#if HAVE_MMX2 || HAVE_AMD3DNOW
        // Process the bulk of the row 8 source pixels at a time; the scalar
        // loop below mops up the remainder starting at x = mmxSize-1.
        const x86_reg mmxSize= srcWidth&~15;
        __asm__ volatile(
            "mov           %4, %%"REG_a"            \n\t"
            "1:                                     \n\t"
            "movq         (%0, %%"REG_a"), %%mm0    \n\t"
            "movq         (%1, %%"REG_a"), %%mm1    \n\t"
            "movq        1(%0, %%"REG_a"), %%mm2    \n\t"
            "movq        1(%1, %%"REG_a"), %%mm3    \n\t"
            // NOTE(review): at the first iteration REG_a == -mmxSize, so the
            // two loads below read from src[-1] / src[srcStride-1] -- one byte
            // before the row. The scalar fixup after the asm overwrites the
            // affected output, but the read itself happens; confirm the
            // buffers have a byte of slack before src.
            "movq       -1(%0, %%"REG_a"), %%mm4    \n\t"
            "movq       -1(%1, %%"REG_a"), %%mm5    \n\t"
            // Double PAVGB approximates the (3*a + b)/4 weighting:
            // avg(a, avg(a, b)) (with pavgb/pavgusb rounding).
            PAVGB"                  %%mm0, %%mm5    \n\t"
            PAVGB"                  %%mm0, %%mm3    \n\t"
            PAVGB"                  %%mm0, %%mm5    \n\t"
            PAVGB"                  %%mm0, %%mm3    \n\t"
            PAVGB"                  %%mm1, %%mm4    \n\t"
            PAVGB"                  %%mm1, %%mm2    \n\t"
            PAVGB"                  %%mm1, %%mm4    \n\t"
            PAVGB"                  %%mm1, %%mm2    \n\t"
            "movq                   %%mm5, %%mm7    \n\t"
            "movq                   %%mm4, %%mm6    \n\t"
            // Interleave left/right-weighted results into the doubled row.
            "punpcklbw              %%mm3, %%mm5    \n\t"
            "punpckhbw              %%mm3, %%mm7    \n\t"
            "punpcklbw              %%mm2, %%mm4    \n\t"
            "punpckhbw              %%mm2, %%mm6    \n\t"
#if 1
            MOVNTQ"                 %%mm5,  (%2, %%"REG_a", 2)  \n\t"
            MOVNTQ"                 %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
            MOVNTQ"                 %%mm4,  (%3, %%"REG_a", 2)  \n\t"
            MOVNTQ"                 %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
#else
            "movq                   %%mm5,  (%2, %%"REG_a", 2)  \n\t"
            "movq                   %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
            "movq                   %%mm4,  (%3, %%"REG_a", 2)  \n\t"
            "movq                   %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
#endif
            "add                       $8, %%"REG_a"            \n\t"
            " js                       1b                       \n\t"
            :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
            "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
            "g" (-mmxSize)
            : "%"REG_a

        );
#else
        // Pure C: no vectorized prefix, scalar loop covers x = 0 onward.
        const x86_reg mmxSize=1;
#endif
        // Left-edge pixels of both output rows (vertical-only interpolation).
        dst[0        ]= (3*src[0] +   src[srcStride])>>2;
        dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;

        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        // Right-edge pixels of both output rows.
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
#if 1
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
#else
    // Disabled alternative: plain pixel doubling for the last line.
    for (x=0; x<srcWidth; x++) {
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#endif

#if HAVE_MMX
    // Leave MMX state and flush write-combining buffers (MOVNTQ stores).
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#endif
}
1867

    
1868
/**
1869
 * Height should be a multiple of 2 and width should be a multiple of 16.
1870
 * (If this is a problem for anyone then tell me, and I will fix it.)
1871
 * Chrominance data is only taken from every second line, others are ignored.
1872
 * FIXME: Write HQ version.
1873
 */
1874
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    // Deinterleave packed UYVY (U0 Y0 V0 Y1 ...) into planar YV12.
    // Lines are processed in pairs: the even line yields Y + U + V,
    // the following odd line yields Y only (its chroma is dropped).
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
#if HAVE_MMX
        // Even line: split 16 packed pixels per iteration into Y, U and V.
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            "pcmpeqw             %%mm7, %%mm7   \n\t"
            "psrlw                  $8, %%mm7   \n\t" // FF,00,FF,00... (low-byte mask)
            ASMALIGN(4)
            "1:                                 \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
            "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
            "movq                %%mm0, %%mm2   \n\t" // UYVY UYVY(0)
            "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(4)
            "pand                %%mm7, %%mm0   \n\t" // U0V0 U0V0(0)
            "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(4)
            "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(0)
            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(4)
            "packuswb            %%mm1, %%mm0   \n\t" // UVUV UVUV(0)
            "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(0)

            MOVNTQ"              %%mm2,  (%1, %%"REG_a", 2) \n\t"

            "movq     16(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(8)
            "movq     24(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(12)
            "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(8)
            "movq                %%mm2, %%mm4   \n\t" // UYVY UYVY(12)
            "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(8)
            "pand                %%mm7, %%mm2   \n\t" // U0V0 U0V0(12)
            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(8)
            "psrlw                  $8, %%mm4   \n\t" // Y0Y0 Y0Y0(12)
            "packuswb            %%mm2, %%mm1   \n\t" // UVUV UVUV(8)
            "packuswb            %%mm4, %%mm3   \n\t" // YYYY YYYY(8)

            MOVNTQ"              %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            // Separate the interleaved UVUV words into U and V planes.
            "movq                %%mm0, %%mm2   \n\t" // UVUV UVUV(0)
            "movq                %%mm1, %%mm3   \n\t" // UVUV UVUV(8)
            "psrlw                  $8, %%mm0   \n\t" // V0V0 V0V0(0)
            "psrlw                  $8, %%mm1   \n\t" // V0V0 V0V0(8)
            "pand                %%mm7, %%mm2   \n\t" // U0U0 U0U0(0)
            "pand                %%mm7, %%mm3   \n\t" // U0U0 U0U0(8)
            "packuswb            %%mm1, %%mm0   \n\t" // VVVV VVVV(0)
            "packuswb            %%mm3, %%mm2   \n\t" // UUUU UUUU(0)

            MOVNTQ"              %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ"              %%mm2, (%2, %%"REG_a") \n\t"

            "add                    $8, %%"REG_a"   \n\t"
            "cmp                    %4, %%"REG_a"   \n\t"
            " jb                    1b          \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        // Odd line: extract luma only (Y is the high byte of each UYVY word,
        // so a plain right shift suffices; no mask register needed here).
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            ASMALIGN(4)
            "1:                                 \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
            "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
            "movq     16(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(8)
            "movq     24(%0, %%"REG_a", 4), %%mm3   \n\t" // UYVY UYVY(12)
            "psrlw                  $8, %%mm0   \n\t" // Y0Y0 Y0Y0(0)
            "psrlw                  $8, %%mm1   \n\t" // Y0Y0 Y0Y0(4)
            "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(8)
            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(12)
            "packuswb            %%mm1, %%mm0   \n\t" // YYYY YYYY(0)
            "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(8)

            MOVNTQ"              %%mm0,  (%1, %%"REG_a", 2) \n\t"
            MOVNTQ"              %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add                    $8, %%"REG_a"   \n\t"
            "cmp                    %4, %%"REG_a"   \n\t"
            " jb                    1b          \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
#else
        // C fallback: even line, full deinterleave.
        long i;
        for (i=0; i<chromWidth; i++) {
            udst[i]     = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i]     = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        // C fallback: odd line, luma only (chroma of this line is ignored).
        for (i=0; i<chromWidth; i++) {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#if HAVE_MMX
    // Leave MMX state and flush the write-combining buffers used by MOVNTQ.
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#endif
}
1990

    
1991
/**
1992
 * Height should be a multiple of 2 and width should be a multiple of 2.
1993
 * (If this is a problem for anyone then tell me, and I will fix it.)
1994
 * Chrominance data is only taken from every second line,
1995
 * others are ignored in the C version.
1996
 * FIXME: Write HQ version.
1997
 */
1998
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1999
                                       long width, long height,
2000
                                       long lumStride, long chromStride, long srcStride)
2001
{
2002
    long y;
2003
    const x86_reg chromWidth= width>>1;
2004
#if HAVE_MMX
2005
    for (y=0; y<height-2; y+=2) {
2006
        long i;
2007
        for (i=0; i<2; i++) {
2008
            __asm__ volatile(
2009
                "mov                        %2, %%"REG_a"   \n\t"
2010
                "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
2011
                "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2012
                "pxor                    %%mm7, %%mm7       \n\t"
2013
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2014
                ASMALIGN(4)
2015
                "1:                                         \n\t"
2016
                PREFETCH"    64(%0, %%"REG_d")              \n\t"
2017
                "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2018
                "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
2019
                "punpcklbw               %%mm7, %%mm0       \n\t"
2020
                "punpcklbw               %%mm7, %%mm1       \n\t"
2021
                "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
2022
                "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
2023
                "punpcklbw               %%mm7, %%mm2       \n\t"
2024
                "punpcklbw               %%mm7, %%mm3       \n\t"
2025
                "pmaddwd                 %%mm6, %%mm0       \n\t"
2026
                "pmaddwd                 %%mm6, %%mm1       \n\t"
2027
                "pmaddwd                 %%mm6, %%mm2       \n\t"
2028
                "pmaddwd                 %%mm6, %%mm3       \n\t"
2029
#ifndef FAST_BGR2YV12
2030
                "psrad                      $8, %%mm0       \n\t"
2031
                "psrad                      $8, %%mm1       \n\t"
2032
                "psrad                      $8, %%mm2       \n\t"
2033
                "psrad                      $8, %%mm3       \n\t"
2034
#endif
2035
                "packssdw                %%mm1, %%mm0       \n\t"
2036
                "packssdw                %%mm3, %%mm2       \n\t"
2037
                "pmaddwd                 %%mm5, %%mm0       \n\t"
2038
                "pmaddwd                 %%mm5, %%mm2       \n\t"
2039
                "packssdw                %%mm2, %%mm0       \n\t"
2040
                "psraw                      $7, %%mm0       \n\t"
2041

    
2042
                "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2043
                "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
2044
                "punpcklbw               %%mm7, %%mm4       \n\t"
2045
                "punpcklbw               %%mm7, %%mm1       \n\t"
2046
                "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
2047
                "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
2048
                "punpcklbw               %%mm7, %%mm2       \n\t"
2049
                "punpcklbw               %%mm7, %%mm3       \n\t"
2050
                "pmaddwd                 %%mm6, %%mm4       \n\t"
2051
                "pmaddwd                 %%mm6, %%mm1       \n\t"
2052
                "pmaddwd                 %%mm6, %%mm2       \n\t"
2053
                "pmaddwd                 %%mm6, %%mm3       \n\t"
2054
#ifndef FAST_BGR2YV12
2055
                "psrad                      $8, %%mm4       \n\t"
2056
                "psrad                      $8, %%mm1       \n\t"
2057
                "psrad                      $8, %%mm2       \n\t"
2058
                "psrad                      $8, %%mm3       \n\t"
2059
#endif
2060
                "packssdw                %%mm1, %%mm4       \n\t"
2061
                "packssdw                %%mm3, %%mm2       \n\t"
2062
                "pmaddwd                 %%mm5, %%mm4       \n\t"
2063
                "pmaddwd                 %%mm5, %%mm2       \n\t"
2064
                "add                       $24, %%"REG_d"   \n\t"
2065
                "packssdw                %%mm2, %%mm4       \n\t"
2066
                "psraw                      $7, %%mm4       \n\t"
2067

    
2068
                "packuswb                %%mm4, %%mm0       \n\t"
2069
                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
2070

    
2071
                MOVNTQ"                  %%mm0, (%1, %%"REG_a") \n\t"
2072
                "add                        $8,      %%"REG_a"  \n\t"
2073
                " js                        1b                  \n\t"
2074
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
2075
                : "%"REG_a, "%"REG_d
2076
            );
2077
            ydst += lumStride;
2078
            src  += srcStride;
2079
        }
2080
        src -= srcStride*2;
2081
        __asm__ volatile(
2082
            "mov                        %4, %%"REG_a"   \n\t"
2083
            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2084
            "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
2085
            "pxor                    %%mm7, %%mm7       \n\t"
2086
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2087
            "add                 %%"REG_d", %%"REG_d"   \n\t"
2088
            ASMALIGN(4)
2089
            "1:                                         \n\t"
2090
            PREFETCH"    64(%0, %%"REG_d")              \n\t"
2091
            PREFETCH"    64(%1, %%"REG_d")              \n\t"
2092
#if HAVE_MMX2 || HAVE_AMD3DNOW
2093
            "movq          (%0, %%"REG_d"), %%mm0       \n\t"
2094
            "movq          (%1, %%"REG_d"), %%mm1       \n\t"
2095
            "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
2096
            "movq         6(%1, %%"REG_d"), %%mm3       \n\t"
2097
            PAVGB"                   %%mm1, %%mm0       \n\t"
2098
            PAVGB"                   %%mm3, %%mm2       \n\t"
2099
            "movq                    %%mm0, %%mm1       \n\t"
2100
            "movq                    %%mm2, %%mm3       \n\t"
2101
            "psrlq                     $24, %%mm0       \n\t"
2102
            "psrlq                     $24, %%mm2       \n\t"
2103
            PAVGB"                   %%mm1, %%mm0       \n\t"
2104
            PAVGB"                   %%mm3, %%mm2       \n\t"
2105
            "punpcklbw               %%mm7, %%mm0       \n\t"
2106
            "punpcklbw               %%mm7, %%mm2       \n\t"
2107
#else
2108
            "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2109
            "movd          (%1, %%"REG_d"), %%mm1       \n\t"
2110
            "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
2111
            "movd         3(%1, %%"REG_d"), %%mm3       \n\t"
2112
            "punpcklbw               %%mm7, %%mm0       \n\t"
2113
            "punpcklbw               %%mm7, %%mm1       \n\t"
2114
            "punpcklbw               %%mm7, %%mm2       \n\t"
2115
            "punpcklbw               %%mm7, %%mm3       \n\t"
2116
            "paddw                   %%mm1, %%mm0       \n\t"
2117
            "paddw                   %%mm3, %%mm2       \n\t"
2118
            "paddw                   %%mm2, %%mm0       \n\t"
2119
            "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
2120
            "movd         6(%1, %%"REG_d"), %%mm1       \n\t"
2121
            "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
2122
            "movd         9(%1, %%"REG_d"), %%mm3       \n\t"
2123
            "punpcklbw               %%mm7, %%mm4       \n\t"
2124
            "punpcklbw               %%mm7, %%mm1       \n\t"
2125
            "punpcklbw               %%mm7, %%mm2       \n\t"
2126
            "punpcklbw               %%mm7, %%mm3       \n\t"
2127
            "paddw                   %%mm1, %%mm4       \n\t"
2128
            "paddw                   %%mm3, %%mm2       \n\t"
2129
            "paddw                   %%mm4, %%mm2       \n\t"
2130
            "psrlw                      $2, %%mm0       \n\t"
2131
            "psrlw                      $2, %%mm2       \n\t"
2132
#endif
2133
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
2134
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
2135

    
2136
            "pmaddwd                 %%mm0, %%mm1       \n\t"
2137
            "pmaddwd                 %%mm2, %%mm3       \n\t"
2138
            "pmaddwd                 %%mm6, %%mm0       \n\t"
2139
            "pmaddwd                 %%mm6, %%mm2       \n\t"
2140
#ifndef FAST_BGR2YV12
2141
            "psrad                      $8, %%mm0       \n\t"
2142
            "psrad                      $8, %%mm1       \n\t"
2143
            "psrad                      $8, %%mm2       \n\t"
2144
            "psrad                      $8, %%mm3       \n\t"
2145
#endif
2146
            "packssdw                %%mm2, %%mm0       \n\t"
2147
            "packssdw                %%mm3, %%mm1       \n\t"
2148
            "pmaddwd                 %%mm5, %%mm0       \n\t"
2149
            "pmaddwd                 %%mm5, %%mm1       \n\t"
2150
            "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
2151
            "psraw                      $7, %%mm0       \n\t"
2152

    
2153
#if HAVE_MMX2 || HAVE_AMD3DNOW
2154
            "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
2155
            "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
2156
            "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
2157
            "movq        18(%1, %%"REG_d"), %%mm3       \n\t"
2158
            PAVGB"                   %%mm1, %%mm4       \n\t"
2159
            PAVGB"                   %%mm3, %%mm2       \n\t"
2160
            "movq                    %%mm4, %%mm1       \n\t"
2161
            "movq                    %%mm2, %%mm3       \n\t"
2162
            "psrlq                     $24, %%mm4       \n\t"
2163
            "psrlq                     $24, %%mm2       \n\t"
2164
            PAVGB"                   %%mm1, %%mm4       \n\t"
2165
            PAVGB"                   %%mm3, %%mm2       \n\t"
2166
            "punpcklbw               %%mm7, %%mm4       \n\t"
2167
            "punpcklbw               %%mm7, %%mm2       \n\t"
2168
#else
2169
            "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2170
            "movd        12(%1, %%"REG_d"), %%mm1       \n\t"
2171
            "movd        15(%0, %%"REG_d"), %%mm2       \n\t"
2172
            "movd        15(%1, %%"REG_d"), %%mm3       \n\t"
2173
            "punpcklbw               %%mm7, %%mm4       \n\t"
2174
            "punpcklbw               %%mm7, %%mm1       \n\t"
2175
            "punpcklbw               %%mm7, %%mm2       \n\t"
2176
            "punpcklbw               %%mm7, %%mm3       \n\t"
2177
            "paddw                   %%mm1, %%mm4       \n\t"
2178
            "paddw                   %%mm3, %%mm2       \n\t"
2179
            "paddw                   %%mm2, %%mm4       \n\t"
2180
            "movd        18(%0, %%"REG_d"), %%mm5       \n\t"
2181
            "movd        18(%1, %%"REG_d"), %%mm1       \n\t"
2182
            "movd        21(%0, %%"REG_d"), %%mm2       \n\t"
2183
            "movd        21(%1, %%"REG_d"), %%mm3       \n\t"
2184
            "punpcklbw               %%mm7, %%mm5       \n\t"
2185
            "punpcklbw               %%mm7, %%mm1       \n\t"
2186
            "punpcklbw               %%mm7, %%mm2       \n\t"
2187
            "punpcklbw               %%mm7, %%mm3       \n\t"
2188
            "paddw                   %%mm1, %%mm5       \n\t"
2189
            "paddw                   %%mm3, %%mm2       \n\t"
2190
            "paddw                   %%mm5, %%mm2       \n\t"
2191
            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2192
            "psrlw                      $2, %%mm4       \n\t"
2193
            "psrlw                      $2, %%mm2       \n\t"
2194
#endif
2195
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
2196
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
2197

    
2198
            "pmaddwd                 %%mm4, %%mm1       \n\t"
2199
            "pmaddwd                 %%mm2, %%mm3       \n\t"
2200
            "pmaddwd                 %%mm6, %%mm4       \n\t"
2201
            "pmaddwd                 %%mm6, %%mm2       \n\t"
2202
#ifndef FAST_BGR2YV12
2203
            "psrad                      $8, %%mm4       \n\t"
2204
            "psrad                      $8, %%mm1       \n\t"
2205
            "psrad                      $8, %%mm2       \n\t"
2206
            "psrad                      $8, %%mm3       \n\t"
2207
#endif
2208
            "packssdw                %%mm2, %%mm4       \n\t"
2209
            "packssdw                %%mm3, %%mm1       \n\t"
2210
            "pmaddwd                 %%mm5, %%mm4       \n\t"
2211
            "pmaddwd                 %%mm5, %%mm1       \n\t"
2212
            "add                       $24, %%"REG_d"   \n\t"
2213
            "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
2214
            "psraw                      $7, %%mm4       \n\t"
2215

    
2216
            "movq                    %%mm0, %%mm1           \n\t"
2217
            "punpckldq               %%mm4, %%mm0           \n\t"
2218
            "punpckhdq               %%mm4, %%mm1           \n\t"
2219
            "packsswb                %%mm1, %%mm0           \n\t"
2220
            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0         \n\t"
2221
            "movd                    %%mm0, (%2, %%"REG_a") \n\t"
2222
            "punpckhdq               %%mm0, %%mm0           \n\t"
2223
            "movd                    %%mm0, (%3, %%"REG_a") \n\t"
2224
            "add                        $4, %%"REG_a"       \n\t"
2225
            " js                        1b                  \n\t"
2226
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2227
            : "%"REG_a, "%"REG_d
2228
        );
2229

    
2230
        udst += chromStride;
2231
        vdst += chromStride;
2232
        src  += srcStride*2;
2233
    }
2234

    
2235
    __asm__ volatile(EMMS"       \n\t"
2236
                     SFENCE"     \n\t"
2237
                     :::"memory");
2238
#else
2239
    y=0;
2240
#endif
2241
    for (; y<height; y+=2) {
2242
        long i;
2243
        for (i=0; i<chromWidth; i++) {
2244
            unsigned int b = src[6*i+0];
2245
            unsigned int g = src[6*i+1];
2246
            unsigned int r = src[6*i+2];
2247

    
2248
            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2249
            unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2250
            unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2251

    
2252
            udst[i]     = U;
2253
            vdst[i]     = V;
2254
            ydst[2*i]   = Y;
2255

    
2256
            b = src[6*i+3];
2257
            g = src[6*i+4];
2258
            r = src[6*i+5];
2259

    
2260
            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2261
            ydst[2*i+1]     = Y;
2262
        }
2263
        ydst += lumStride;
2264
        src  += srcStride;
2265

    
2266
        for (i=0; i<chromWidth; i++) {
2267
            unsigned int b = src[6*i+0];
2268
            unsigned int g = src[6*i+1];
2269
            unsigned int r = src[6*i+2];
2270

    
2271
            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2272

    
2273
            ydst[2*i]     = Y;
2274

    
2275
            b = src[6*i+3];
2276
            g = src[6*i+4];
2277
            r = src[6*i+5];
2278

    
2279
            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2280
            ydst[2*i+1]     = Y;
2281
        }
2282
        udst += chromStride;
2283
        vdst += chromStride;
2284
        ydst += lumStride;
2285
        src  += srcStride;
2286
    }
2287
}
2288

    
2289
/*
 * Byte-interleave two planes: for every row,
 *   dest[2*w+0] = src1[w], dest[2*w+1] = src2[w]   for w in [0, width).
 * Each plane advances by its own stride after every row.
 */
static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
                             long width, long height, long src1Stride,
                             long src2Stride, long dstStride)
{
    long h;

    for (h=0; h < height; h++) {
        long w;

#if HAVE_MMX
#if HAVE_SSE2
        /* SSE2 path: 16 bytes of each source per iteration.  The same 16
         * src1 bytes are loaded twice (xmm0/xmm1) so punpcklbw/punpckhbw can
         * produce the low and high interleaved halves.
         * NOTE(review): movdqa/movntdq need 16-byte-aligned rows — presumably
         * guaranteed by the callers; confirm before reusing elsewhere. */
        __asm__(
            "xor              %%"REG_a", %%"REG_a"  \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%1, %%"REG_a")             \n\t"
            PREFETCH" 64(%2, %%"REG_a")             \n\t"
            "movdqa     (%1, %%"REG_a"), %%xmm0     \n\t"
            "movdqa     (%1, %%"REG_a"), %%xmm1     \n\t"
            "movdqa     (%2, %%"REG_a"), %%xmm2     \n\t"
            "punpcklbw           %%xmm2, %%xmm0     \n\t"
            "punpckhbw           %%xmm2, %%xmm1     \n\t"
            "movntdq             %%xmm0,   (%0, %%"REG_a", 2)   \n\t"
            "movntdq             %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
            "add                    $16, %%"REG_a"  \n\t"
            "cmp                     %3, %%"REG_a"  \n\t"
            " jb                     1b             \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a""
        );
#else
        /* MMX path: 16 bytes of each source per iteration, processed as two
         * 8-byte halves, written with non-temporal stores (MOVNTQ). */
        __asm__(
            "xor %%"REG_a", %%"REG_a"               \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%1, %%"REG_a")             \n\t"
            PREFETCH" 64(%2, %%"REG_a")             \n\t"
            "movq       (%1, %%"REG_a"), %%mm0      \n\t"
            "movq      8(%1, %%"REG_a"), %%mm2      \n\t"
            "movq                 %%mm0, %%mm1      \n\t"
            "movq                 %%mm2, %%mm3      \n\t"
            "movq       (%2, %%"REG_a"), %%mm4      \n\t"
            "movq      8(%2, %%"REG_a"), %%mm5      \n\t"
            "punpcklbw            %%mm4, %%mm0      \n\t"
            "punpckhbw            %%mm4, %%mm1      \n\t"
            "punpcklbw            %%mm5, %%mm2      \n\t"
            "punpckhbw            %%mm5, %%mm3      \n\t"
            MOVNTQ"               %%mm0,   (%0, %%"REG_a", 2)   \n\t"
            MOVNTQ"               %%mm1,  8(%0, %%"REG_a", 2)   \n\t"
            MOVNTQ"               %%mm2, 16(%0, %%"REG_a", 2)   \n\t"
            MOVNTQ"               %%mm3, 24(%0, %%"REG_a", 2)   \n\t"
            "add                    $16, %%"REG_a"  \n\t"
            "cmp                     %3, %%"REG_a"  \n\t"
            " jb                     1b             \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a
        );
#endif
        /* Scalar tail: the last width % 16 bytes not covered by the
         * SIMD loop above.
         * NOTE(review): the asm loop bound (width-15) assumes width >= 16;
         * smaller widths appear to be handled only by this tail via the
         * unsigned jb comparison wrapping — verify before relying on it. */
        for (w= (width&(~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#else
        /* Pure C fallback when MMX is unavailable. */
        for (w=0; w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#endif
        dest += dstStride;
                src1 += src1Stride;
                src2 += src2Stride;
    }
#if HAVE_MMX
    /* Restore FPU state and fence the non-temporal stores. */
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
            );
#endif
}
2367

    
2368
/*
 * Upscale two chroma planes by 2x in each direction:
 * horizontally each source byte is doubled (d[2x] = d[2x+1] = s[x]),
 * vertically each source row is used for two output rows (srcStride*(y>>1)).
 * Processes w = width/2 source bytes over h = height/2 output rows per plane.
 * NOTE(review): presumably converts YUV9-style chroma to a 4:2:2-style
 * ("vu12") layout — confirm against the callers.
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       long width, long height,
                                       long srcStride1, long srcStride2,
                                       long dstStride1, long dstStride2)
{
    x86_reg y;
    long x,w,h;
    w=width/2; h=height/2;
#if HAVE_MMX
    /* Warm the cache with the second row of each source plane. */
    __asm__ volatile(
        PREFETCH" %0    \n\t"
        PREFETCH" %1    \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    /* First plane (src1 -> dst1). */
    for (y=0;y<h;y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x=0;
#if HAVE_MMX
        /* 32 source bytes -> 64 output bytes per iteration; punpcklbw/
         * punpckhbw with the register itself duplicates every byte. */
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH"   32%1        \n\t"
                "movq         %1, %%mm0 \n\t"
                "movq        8%1, %%mm2 \n\t"
                "movq       16%1, %%mm4 \n\t"
                "movq       24%1, %%mm6 \n\t"
                "movq      %%mm0, %%mm1 \n\t"
                "movq      %%mm2, %%mm3 \n\t"
                "movq      %%mm4, %%mm5 \n\t"
                "movq      %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ"    %%mm0,   %0  \n\t"
                MOVNTQ"    %%mm1,  8%0  \n\t"
                MOVNTQ"    %%mm2, 16%0  \n\t"
                MOVNTQ"    %%mm3, 24%0  \n\t"
                MOVNTQ"    %%mm4, 32%0  \n\t"
                MOVNTQ"    %%mm5, 40%0  \n\t"
                MOVNTQ"    %%mm6, 48%0  \n\t"
                MOVNTQ"    %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
#endif
        /* Scalar tail / C fallback. */
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    /* Second plane (src2 -> dst2); same scheme as above. */
    for (y=0;y<h;y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
#if HAVE_MMX
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH"   32%1        \n\t"
                "movq         %1, %%mm0 \n\t"
                "movq        8%1, %%mm2 \n\t"
                "movq       16%1, %%mm4 \n\t"
                "movq       24%1, %%mm6 \n\t"
                "movq      %%mm0, %%mm1 \n\t"
                "movq      %%mm2, %%mm3 \n\t"
                "movq      %%mm4, %%mm5 \n\t"
                "movq      %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ"    %%mm0,   %0  \n\t"
                MOVNTQ"    %%mm1,  8%0  \n\t"
                MOVNTQ"    %%mm2, 16%0  \n\t"
                MOVNTQ"    %%mm3, 24%0  \n\t"
                MOVNTQ"    %%mm4, 32%0  \n\t"
                MOVNTQ"    %%mm5, 40%0  \n\t"
                MOVNTQ"    %%mm6, 48%0  \n\t"
                MOVNTQ"    %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
#endif
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#if HAVE_MMX
    /* Restore FPU state and fence the non-temporal stores. */
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
        );
#endif
}
2469

    
2470
/*
 * Pack planar 4:1:0 (YVU9-style: chroma subsampled 4x horizontally and
 * vertically — note srcStride*(y>>2) and one U/V sample per 4 output pixels)
 * into interleaved YUY2 (Y0 U Y1 V).
 * src1 = luma plane, src2/src3 = chroma planes, dst = packed output.
 * Each loop iteration over x consumes 4 luma bytes and 1 byte from each
 * chroma plane and emits 8 output bytes.
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        long width, long height,
                                        long srcStride1, long srcStride2,
                                        long srcStride3, long dstStride)
{
    x86_reg x;
    long y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++) {
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);    /* chroma row shared by 4 luma rows */
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
#if HAVE_MMX
        /* MMX path: 32 luma + 8 U + 8 V bytes -> 64 output bytes per pass. */
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH"   32(%1, %0)          \n\t"
                PREFETCH"   32(%2, %0)          \n\t"
                PREFETCH"   32(%3, %0)          \n\t"
                "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq         (%2, %0), %%mm1   \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq         (%3, %0), %%mm2   \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq            %%mm0, %%mm3   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq            %%mm1, %%mm4   \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq            %%mm2, %%mm5   \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw       %%mm1, %%mm1   \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw       %%mm2, %%mm2   \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw       %%mm4, %%mm4   \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw       %%mm5, %%mm5   \n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq            %%mm1, %%mm6   \n\t"
                "punpcklbw       %%mm2, %%mm1   \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw       %%mm1, %%mm0   \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw       %%mm1, %%mm3   \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ"          %%mm0,  (%4, %0, 8)    \n\t"
                MOVNTQ"          %%mm3, 8(%4, %0, 8)    \n\t"

                "punpckhbw       %%mm2, %%mm6   \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq     8(%1, %0, 4), %%mm0   \n\t"
                "movq            %%mm0, %%mm3   \n\t"
                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ"          %%mm0, 16(%4, %0, 8)   \n\t"
                MOVNTQ"          %%mm3, 24(%4, %0, 8)   \n\t"

                "movq            %%mm4, %%mm6   \n\t"
                "movq    16(%1, %0, 4), %%mm0   \n\t"
                "movq            %%mm0, %%mm3   \n\t"
                "punpcklbw       %%mm5, %%mm4   \n\t"
                "punpcklbw       %%mm4, %%mm0   \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw       %%mm4, %%mm3   \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ"          %%mm0, 32(%4, %0, 8)   \n\t"
                MOVNTQ"          %%mm3, 40(%4, %0, 8)   \n\t"

                "punpckhbw       %%mm5, %%mm6   \n\t"
                "movq    24(%1, %0, 4), %%mm0   \n\t"
                "movq            %%mm0, %%mm3   \n\t"
                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ"          %%mm0, 48(%4, %0, 8)   \n\t"
                MOVNTQ"          %%mm3, 56(%4, %0, 8)   \n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
#endif
        /* Scalar tail / C fallback: 4 luma pixels share one U/V pair twice. */
        for (; x<w; x++) {
            const long x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
#if HAVE_MMX
    /* Restore FPU state and fence the non-temporal stores. */
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
        );
#endif
}
2559

    
2560
/*
 * Copy the even-indexed bytes of src to dst: dst[i] = src[2*i]
 * for i in [0, count).  Used to pull the luma (or chroma) bytes out of a
 * packed 4:2:2 stream.  The pointers are biased to the end of the data and
 * a negative counter is walked up to 0, which lets the asm loop test with
 * a single "js".
 */
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst +=   count;
    src += 2*count;
    count= - count;

#if HAVE_MMX
    if(count <= -16) {
        /* Leave 15 elements for the scalar tail (asm handles 16 per pass). */
        count += 15;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"  /* mm7 = 0x00FF per word (even-byte mask) */
            "psrlw            $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq -30(%1, %0, 2), %%mm0        \n\t"
            "movq -22(%1, %0, 2), %%mm1        \n\t"
            "movq -14(%1, %0, 2), %%mm2        \n\t"
            "movq  -6(%1, %0, 2), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            MOVNTQ"        %%mm0,-15(%2, %0)   \n\t"
            MOVNTQ"        %%mm2,- 7(%2, %0)   \n\t"
            "add             $16, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;
    }
#endif
    /* Scalar tail / C fallback. */
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}
2598

    
2599
/*
 * De-interleave the even bytes of a 4-byte-grouped stream into two planes:
 *   dst0[i] = src[4*i+0], dst1[i] = src[4*i+2]   for i in [0, count).
 * With UYVY input this extracts the U and V planes.  Pointers are end-biased
 * and a negative counter walks up to 0 (single "js" loop test in asm).
 */
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
#if HAVE_MMX
    if(count <= -8) {
        /* Leave 7 elements for the scalar tail (asm handles 8 per pass). */
        count += 7;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"  /* mm7 = 0x00FF per word */
            "psrlw            $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t"  /* keep even bytes */
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"  /* odd of the packed = src[4i+2] */
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t"  /* even of the packed = src[4i+0] */
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"  /* -> dst1 */
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"  /* -> dst0 */
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    /* Scalar tail / C fallback. */
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2646

    
2647
/*
 * Like extract_even2, but averages two source rows first:
 *   dst0[i] = avg(src0[4*i+0], src1[4*i+0])
 *   dst1[i] = avg(src0[4*i+2], src1[4*i+2])
 * Used to produce vertically averaged chroma for 4:2:0 output.
 * NOTE(review): the PAVGB path rounds up ((a+b+1)>>1) while the scalar
 * fallback truncates ((a+b)>>1), so results can differ by 1 LSB between the
 * SIMD body and the tail — appears to be an accepted trade-off here.
 */
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        /* Leave 7 elements for the scalar tail (asm handles 8 per pass). */
        count += 7;
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7        \n\t"  /* mm7 = 0x00FF per word */
            "psrlw             $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"  /* average with second row */
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
            "pand           %%mm7, %%mm0        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm2        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm1, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm2        \n\t"
            "movq           %%mm0, %%mm1        \n\t"
            "movq           %%mm2, %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm2, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm1        \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"  /* -> dst1 */
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"  /* -> dst0 */
            "add               $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    /* Scalar tail / C fallback (truncating average). */
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2699

    
2700
/*
 * De-interleave the odd bytes of a 4-byte-grouped stream into two planes:
 *   dst0[i] = src[4*i+1], dst1[i] = src[4*i+3]   for i in [0, count).
 * With YUYV input this extracts the U and V planes.  The C tail handles the
 * +1 offset by incrementing src after the asm block.
 */
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
#if HAVE_MMX
    if(count <= -8) {
        /* Leave 7 elements for the scalar tail (asm handles 8 per pass). */
        count += 7;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"  /* mm7 = 0x00FF per word */
            "psrlw            $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"  /* keep odd bytes */
            "psrlw            $8, %%mm1        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "psrlw            $8, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"  /* src[4i+3] stream */
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t"  /* src[4i+1] stream */
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"  /* -> dst1 */
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"  /* -> dst0 */
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src++;  /* bias so the tail indexes the odd bytes */
    /* Scalar tail / C fallback. */
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2748

    
2749
/*
 * Like extract_odd2, but averages two source rows first:
 *   dst0[i] = avg(src0[4*i+1], src1[4*i+1])
 *   dst1[i] = avg(src0[4*i+3], src1[4*i+3])
 * Used to produce vertically averaged chroma from YUYV for 4:2:0 output.
 * NOTE(review): PAVGB rounds up while the scalar fallback truncates, so the
 * SIMD body and the tail can differ by 1 LSB — same trade-off as
 * extract_even2avg.
 */
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        /* Leave 7 elements for the scalar tail (asm handles 8 per pass). */
        count += 7;
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7        \n\t"  /* mm7 = 0x00FF per word */
            "psrlw             $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"  /* average with second row */
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"  /* keep odd bytes */
            "psrlw             $8, %%mm1        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "psrlw             $8, %%mm3        \n\t"
            "packuswb       %%mm1, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm2        \n\t"
            "movq           %%mm0, %%mm1        \n\t"
            "movq           %%mm2, %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"  /* src[4i+3] stream */
            "psrlw             $8, %%mm2        \n\t"
            "pand           %%mm7, %%mm1        \n\t"  /* src[4i+1] stream */
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm2, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm1        \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"  /* -> dst1 */
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"  /* -> dst0 */
            "add               $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src0++;  /* bias so the tail indexes the odd bytes */
    src1++;
    /* Scalar tail / C fallback (truncating average). */
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2803

    
2804
static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2805
                                      long width, long height,
2806
                                      long lumStride, long chromStride, long srcStride)
2807
{
2808
    long y;
2809
    const long chromWidth= -((-width)>>1);
2810

    
2811
    for (y=0; y<height; y++) {
2812
        RENAME(extract_even)(src, ydst, width);
2813
        if(y&1) {
2814
            RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2815
            udst+= chromStride;
2816
            vdst+= chromStride;
2817
        }
2818

    
2819
        src += srcStride;
2820
        ydst+= lumStride;
2821
    }
2822
#if HAVE_MMX
2823
    __asm__(
2824
            EMMS"       \n\t"
2825
            SFENCE"     \n\t"
2826
            ::: "memory"
2827
        );
2828
#endif
2829
}
2830

    
2831
static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2832
                                      long width, long height,
2833
                                      long lumStride, long chromStride, long srcStride)
2834
{
2835
    long y;
2836
    const long chromWidth= -((-width)>>1);
2837

    
2838
    for (y=0; y<height; y++) {
2839
        RENAME(extract_even)(src, ydst, width);
2840
        RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2841

    
2842
        src += srcStride;
2843
        ydst+= lumStride;
2844
        udst+= chromStride;
2845
        vdst+= chromStride;
2846
    }
2847
#if HAVE_MMX
2848
    __asm__(
2849
            EMMS"       \n\t"
2850
            SFENCE"     \n\t"
2851
            ::: "memory"
2852
        );
2853
#endif
2854
}
2855

    
2856
static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2857
                                      long width, long height,
2858
                                      long lumStride, long chromStride, long srcStride)
2859
{
2860
    long y;
2861
    const long chromWidth= -((-width)>>1);
2862

    
2863
    for (y=0; y<height; y++) {
2864
        RENAME(extract_even)(src+1, ydst, width);
2865
        if(y&1) {
2866
            RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2867
            udst+= chromStride;
2868
            vdst+= chromStride;
2869
        }
2870

    
2871
        src += srcStride;
2872
        ydst+= lumStride;
2873
    }
2874
#if HAVE_MMX
2875
    __asm__(
2876
            EMMS"       \n\t"
2877
            SFENCE"     \n\t"
2878
            ::: "memory"
2879
        );
2880
#endif
2881
}
2882

    
2883
static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2884
                                      long width, long height,
2885
                                      long lumStride, long chromStride, long srcStride)
2886
{
2887
    long y;
2888
    const long chromWidth= -((-width)>>1);
2889

    
2890
    for (y=0; y<height; y++) {
2891
        RENAME(extract_even)(src+1, ydst, width);
2892
        RENAME(extract_even2)(src, udst, vdst, chromWidth);
2893

    
2894
        src += srcStride;
2895
        ydst+= lumStride;
2896
        udst+= chromStride;
2897
        vdst+= chromStride;
2898
    }
2899
#if HAVE_MMX
2900
    __asm__(
2901
            EMMS"       \n\t"
2902
            SFENCE"     \n\t"
2903
            ::: "memory"
2904
        );
2905
#endif
2906
}
2907

    
2908
static inline void RENAME(rgb2rgb_init)(void)
2909
{
2910
    rgb15to16       = RENAME(rgb15to16);
2911
    rgb15tobgr24    = RENAME(rgb15tobgr24);
2912
    rgb15to32       = RENAME(rgb15to32);
2913
    rgb16tobgr24    = RENAME(rgb16tobgr24);
2914
    rgb16to32       = RENAME(rgb16to32);
2915
    rgb16to15       = RENAME(rgb16to15);
2916
    rgb24tobgr16    = RENAME(rgb24tobgr16);
2917
    rgb24tobgr15    = RENAME(rgb24tobgr15);
2918
    rgb24tobgr32    = RENAME(rgb24tobgr32);
2919
    rgb32to16       = RENAME(rgb32to16);
2920
    rgb32to15       = RENAME(rgb32to15);
2921
    rgb32tobgr24    = RENAME(rgb32tobgr24);
2922
    rgb24to15       = RENAME(rgb24to15);
2923
    rgb24to16       = RENAME(rgb24to16);
2924
    rgb24tobgr24    = RENAME(rgb24tobgr24);
2925
    rgb32tobgr32    = RENAME(rgb32tobgr32);
2926
    rgb32tobgr16    = RENAME(rgb32tobgr16);
2927
    rgb32tobgr15    = RENAME(rgb32tobgr15);
2928
    yv12toyuy2      = RENAME(yv12toyuy2);
2929
    yv12touyvy      = RENAME(yv12touyvy);
2930
    yuv422ptoyuy2   = RENAME(yuv422ptoyuy2);
2931
    yuv422ptouyvy   = RENAME(yuv422ptouyvy);
2932
    yuy2toyv12      = RENAME(yuy2toyv12);
2933
//    yvu9toyv12      = RENAME(yvu9toyv12);
2934
    planar2x        = RENAME(planar2x);
2935
    rgb24toyv12     = RENAME(rgb24toyv12);
2936
    interleaveBytes = RENAME(interleaveBytes);
2937
    vu9_to_vu12     = RENAME(vu9_to_vu12);
2938
    yvu9_to_yuy2    = RENAME(yvu9_to_yuy2);
2939

    
2940
    uyvytoyuv420    = RENAME(uyvytoyuv420);
2941
    uyvytoyuv422    = RENAME(uyvytoyuv422);
2942
    yuyvtoyuv420    = RENAME(yuyvtoyuv420);
2943
    yuyvtoyuv422    = RENAME(yuyvtoyuv422);
2944
}