Statistics
| Branch: | Revision:

ffmpeg / libswscale / rgb2rgb_template.c @ 4b190455

History | View | Annotate | Download (115 KB)

1
/*
2
 * software RGB to RGB converter
3
 * pluralize by software PAL8 to RGB converter
4
 *              software YUV to YUV converter
5
 *              software YUV to RGB converter
6
 * Written by Nick Kurshev.
7
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8
 * lot of big-endian byte order fixes by Alex Beregszaszi
9
 *
10
 * This file is part of FFmpeg.
11
 *
12
 * FFmpeg is free software; you can redistribute it and/or modify
13
 * it under the terms of the GNU General Public License as published by
14
 * the Free Software Foundation; either version 2 of the License, or
15
 * (at your option) any later version.
16
 *
17
 * FFmpeg is distributed in the hope that it will be useful,
18
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
 * GNU General Public License for more details.
21
 *
22
 * You should have received a copy of the GNU General Public License
23
 * along with FFmpeg; if not, write to the Free Software
24
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25
 *
26
 * The C code (not assembly, MMX, ...) of this file can be used
27
 * under the LGPL license.
28
 */
29

    
30
#include <stddef.h>

/* This template is compiled several times with different HAVE_* settings
 * (C, MMX, MMX2, 3DNow!) and a per-variant RENAME() suffix, so reset all
 * capability-dependent macros before redefining them below. */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PAVGB

/* SIMD register width in bytes: 16 for SSE2 (XMM), 8 for MMX (MM). */
#if HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/* Prefetch and packed byte-average mnemonics per instruction-set extension;
 * without either extension PREFETCH degrades to a no-op comment. */
#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PAVGB     "pavgusb"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PAVGB     "pavgb"
#else
#define PREFETCH  " # nop"
#endif

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Non-temporal quadword store plus a store fence when MMX2 is available;
 * otherwise a plain movq and a no-op in place of sfence. */
#if HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
69

    
70
/**
 * Convert packed 24-bit pixels to 32-bit pixels, padding the fourth byte
 * with 255 (fully opaque alpha).  src_size is in bytes; every 3 source
 * bytes produce 4 destination bytes, so dst must hold src_size/3*4 bytes.
 */
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* MMX loop consumes 24 source bytes per iteration; stop while a full
     * 24-byte group remains, the scalar tail handles the rest. */
    mm_end = end - 23;
    /* mm7 = mask32a: the alpha bytes to OR into each output dword. */
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            /* Gather eight 3-byte pixels into four MMX registers, two
             * pixels (as overlapping dwords) per register. */
            "movd          %1, %%mm0    \n\t"
            "punpckldq    3%1, %%mm0    \n\t"
            "movd         6%1, %%mm1    \n\t"
            "punpckldq    9%1, %%mm1    \n\t"
            "movd        12%1, %%mm2    \n\t"
            "punpckldq   15%1, %%mm2    \n\t"
            "movd        18%1, %%mm3    \n\t"
            "punpckldq   21%1, %%mm3    \n\t"
            /* Set the padding/alpha byte of each pixel. */
            "por        %%mm7, %%mm0    \n\t"
            "por        %%mm7, %%mm1    \n\t"
            "por        %%mm7, %%mm2    \n\t"
            "por        %%mm7, %%mm3    \n\t"
            MOVNTQ"     %%mm0,   %0     \n\t"
            MOVNTQ"     %%mm1,  8%0     \n\t"
            MOVNTQ"     %%mm2, 16%0     \n\t"
            MOVNTQ"     %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    /* Flush non-temporal stores and leave MMX state. */
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: remaining pixels (and the whole job without MMX). */
    while (s < end) {
#if HAVE_BIGENDIAN
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
        *dest++ = 255;
        *dest++ = s[2];
        *dest++ = s[1];
        *dest++ = s[0];
        s+=3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
#endif
    }
}
127

    
128
/**
 * Convert packed 32-bit pixels to 24-bit pixels by dropping the fourth
 * (alpha/padding) byte of each pixel.  src_size is in bytes; every 4
 * source bytes yield 3 destination bytes.
 */
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* MMX loop consumes 32 source bytes (8 pixels) per iteration. */
    mm_end = end - 31;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq        16%1, %%mm4    \n\t"
            "movq        24%1, %%mm5    \n\t"
            /* In each register, squeeze out the high byte of the upper
             * pixel: keep the low pixel (mask24l), shift a copy right by
             * 8 and keep the upper pixel's RGB (mask24h), then merge. */
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm1, %%mm3    \n\t"
            "movq       %%mm4, %%mm6    \n\t"
            "movq       %%mm5, %%mm7    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm3    \n\t"
            "psrlq         $8, %%mm6    \n\t"
            "psrlq         $8, %%mm7    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm1    \n\t"
            "pand          %2, %%mm4    \n\t"
            "pand          %2, %%mm5    \n\t"
            "pand          %3, %%mm2    \n\t"
            "pand          %3, %%mm3    \n\t"
            "pand          %3, %%mm6    \n\t"
            "pand          %3, %%mm7    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm3, %%mm1    \n\t"
            "por        %%mm6, %%mm4    \n\t"
            "por        %%mm7, %%mm5    \n\t"

            /* Repack the four 6-byte groups into three contiguous
             * quadwords (24 output bytes) using the mask24hh.. masks. */
            "movq       %%mm1, %%mm2    \n\t"
            "movq       %%mm4, %%mm3    \n\t"
            "psllq        $48, %%mm2    \n\t"
            "psllq        $32, %%mm3    \n\t"
            "pand          %4, %%mm2    \n\t"
            "pand          %5, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psrlq        $16, %%mm1    \n\t"
            "psrlq        $32, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm3, %%mm1    \n\t"
            "pand          %6, %%mm5    \n\t"
            "por        %%mm5, %%mm4    \n\t"

            MOVNTQ"     %%mm0,   %0     \n\t"
            MOVNTQ"     %%mm1,  8%0     \n\t"
            MOVNTQ"     %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
            "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    /* Flush non-temporal stores and leave MMX state. */
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: drop byte 0 (big-endian, with swap) or byte 3. */
    while (s < end) {
#if HAVE_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
        s++;
        dest[2] = *s++;
        dest[1] = *s++;
        dest[0] = *s++;
        dest += 3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
#endif
    }
}
211

    
212
/*
213
 original by Strepto/Astral
214
 ported to gcc & bugfixed: A'rpi
215
 MMX2, 3DNOW optimization by Nick Kurshev
216
 32-bit C version, and and&add trick by Michael Niedermayer
217
*/
218
/**
 * Convert RGB555 to RGB565 in place-compatible fashion: the red and green
 * fields move up one bit while blue stays put; the new green LSB is 0.
 * Uses the and+add trick (x & 0x7FFF) + (x & 0x7FE0): adding the R/G bits
 * to themselves shifts them left by one without disturbing blue.
 * src_size is in bytes.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    /* mm4 = mask15s: selects the R and G fields of each 16-bit pixel. */
    __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;                       /* 16 bytes (8 pixels) per pass */
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1         \n\t"
            "movq        %1, %%mm0  \n\t"
            "movq       8%1, %%mm2  \n\t"
            "movq     %%mm0, %%mm1  \n\t"
            "movq     %%mm2, %%mm3  \n\t"
            "pand     %%mm4, %%mm0  \n\t"
            "pand     %%mm4, %%mm2  \n\t"
            /* x + (x & mask15s): doubles (shifts left) the R/G fields. */
            "paddw    %%mm1, %%mm0  \n\t"
            "paddw    %%mm3, %%mm2  \n\t"
            MOVNTQ"   %%mm0,  %0    \n\t"
            MOVNTQ"   %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* C tail: two pixels at a time via 32-bit words... */
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* ...then at most one odd 16-bit pixel. */
    if (s < end) {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
263

    
264
/**
 * Convert RGB565 to RGB555: red and green move down one bit (dropping the
 * green LSB) while the 5-bit blue field is kept unchanged.
 * src_size is in bytes.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    /* mm7 = mask15rg: R+G fields after >>1; mm6 = mask15b: blue field. */
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;                       /* 16 bytes (8 pixels) per pass */
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1         \n\t"
            "movq        %1, %%mm0  \n\t"
            "movq       8%1, %%mm2  \n\t"
            "movq     %%mm0, %%mm1  \n\t"
            "movq     %%mm2, %%mm3  \n\t"
            /* (x >> 1) & mask15rg gives the shifted R/G fields... */
            "psrlq       $1, %%mm0  \n\t"
            "psrlq       $1, %%mm2  \n\t"
            "pand     %%mm7, %%mm0  \n\t"
            "pand     %%mm7, %%mm2  \n\t"
            /* ...x & mask15b keeps blue untouched; combine. */
            "pand     %%mm6, %%mm1  \n\t"
            "pand     %%mm6, %%mm3  \n\t"
            "por      %%mm1, %%mm0  \n\t"
            "por      %%mm3, %%mm2  \n\t"
            MOVNTQ"   %%mm0,  %0    \n\t"
            MOVNTQ"   %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* C tail: two pixels at a time via 32-bit words... */
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* ...then at most one odd 16-bit pixel. */
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}
314

    
315
/**
 * Convert 32-bit pixels to 16-bit RGB565.  src_size is in bytes; every 4
 * source bytes produce one 16-bit pixel (see the scalar tail for the exact
 * bit placement).
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;                       /* 16 src bytes (4 pixels) per pass */
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    /* Variant using pmaddwd to merge R and B in one multiply-add:
     * mm6 = mask3216br selects B/R, mm7 = mul3216 scales them into place,
     * mm5 = mask3216g keeps the green field.  The jmp 2f enters at the
     * loop condition so src_size < 16 performs no iterations. */
    __asm__ volatile(
        "movq           %3, %%mm5   \n\t"
        "movq           %4, %%mm6   \n\t"
        "movq           %5, %%mm7   \n\t"
        "jmp 2f                     \n\t"
        ASMALIGN(4)
        "1:                         \n\t"
        PREFETCH"   32(%1)          \n\t"
        "movd         (%1), %%mm0   \n\t"
        "movd        4(%1), %%mm3   \n\t"
        "punpckldq   8(%1), %%mm0   \n\t"
        "punpckldq  12(%1), %%mm3   \n\t"
        "movq        %%mm0, %%mm1   \n\t"
        "movq        %%mm3, %%mm4   \n\t"
        "pand        %%mm6, %%mm0   \n\t"
        "pand        %%mm6, %%mm3   \n\t"
        "pmaddwd     %%mm7, %%mm0   \n\t"
        "pmaddwd     %%mm7, %%mm3   \n\t"
        "pand        %%mm5, %%mm1   \n\t"
        "pand        %%mm5, %%mm4   \n\t"
        "por         %%mm1, %%mm0   \n\t"
        "por         %%mm4, %%mm3   \n\t"
        "psrld          $5, %%mm0   \n\t"
        "pslld         $11, %%mm3   \n\t"
        "por         %%mm3, %%mm0   \n\t"
        MOVNTQ"      %%mm0, (%0)    \n\t"
        "add           $16,  %1     \n\t"
        "add            $8,  %0     \n\t"
        "2:                         \n\t"
        "cmp            %2,  %1     \n\t"
        " jb            1b          \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    /* Shift-and-mask variant: build each 565 field separately. */
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq    %0, %%mm7    \n\t"
        "movq    %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: pack byte0>>3 | green 6 bits | byte2's top 5 bits. */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
410

    
411
/**
 * Convert 32-bit pixels to 16-bit 565 with the R/B channels swapped
 * relative to rgb32to16 (byte 0 goes to the top 5 bits, byte 2 to the
 * bottom 5 — see the scalar tail).  src_size is in bytes.
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green field masks for 565. */
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;                       /* 16 src bytes (4 pixels) per pass */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            /* <<8 places byte 0 into the red field (channel swap). */
            "psllq         $8, %%mm0    \n\t"
            "psllq         $8, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            /* >>19 places byte 2's top 5 bits into the blue field. */
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail, matching the swapped-channel packing above. */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}
469

    
470
/**
 * Convert 32-bit pixels to 16-bit RGB555 (1 unused bit, then 5/5/5).
 * src_size is in bytes; structure mirrors rgb32to16 with 15-bit masks,
 * multipliers, and shift counts ($6/$10 instead of $5/$11).
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;                       /* 16 src bytes (4 pixels) per pass */
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    /* pmaddwd variant; jmp 2f enters at the loop condition so small
     * src_size performs no iterations. */
    __asm__ volatile(
        "movq           %3, %%mm5   \n\t"
        "movq           %4, %%mm6   \n\t"
        "movq           %5, %%mm7   \n\t"
        "jmp            2f          \n\t"
        ASMALIGN(4)
        "1:                         \n\t"
        PREFETCH"   32(%1)          \n\t"
        "movd         (%1), %%mm0   \n\t"
        "movd        4(%1), %%mm3   \n\t"
        "punpckldq   8(%1), %%mm0   \n\t"
        "punpckldq  12(%1), %%mm3   \n\t"
        "movq        %%mm0, %%mm1   \n\t"
        "movq        %%mm3, %%mm4   \n\t"
        "pand        %%mm6, %%mm0   \n\t"
        "pand        %%mm6, %%mm3   \n\t"
        "pmaddwd     %%mm7, %%mm0   \n\t"
        "pmaddwd     %%mm7, %%mm3   \n\t"
        "pand        %%mm5, %%mm1   \n\t"
        "pand        %%mm5, %%mm4   \n\t"
        "por         %%mm1, %%mm0   \n\t"
        "por         %%mm4, %%mm3   \n\t"
        "psrld          $6, %%mm0   \n\t"
        "pslld         $10, %%mm3   \n\t"
        "por         %%mm3, %%mm0   \n\t"
        MOVNTQ"      %%mm0, (%0)    \n\t"
        "add           $16,  %1     \n\t"
        "add            $8,  %0     \n\t"
        "2:                         \n\t"
        "cmp            %2,  %1     \n\t"
        " jb            1b          \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    /* Shift-and-mask variant with 15-bit field masks. */
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $9, %%mm2    \n\t"
            "psrlq         $9, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: 5 bits each from bytes 0, 1, 2 into a 555 word. */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}
565

    
566
/**
 * Convert 32-bit pixels to 16-bit 555 with the R/B channels swapped
 * relative to rgb32to15 (byte 0 to the top field, byte 2 to the bottom —
 * see the scalar tail).  src_size is in bytes.
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green field masks for 555. */
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;                       /* 16 src bytes (4 pixels) per pass */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            /* <<7 places byte 0 into the top 555 field (channel swap). */
            "psllq         $7, %%mm0    \n\t"
            "psllq         $7, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail, matching the swapped-channel packing above. */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}
624

    
625
/**
 * Convert packed 24-bit pixels to 16-bit 565, treating the first source
 * byte as the low (blue) field — see the scalar tail (b, g, r read order).
 * src_size is in bytes; 3 source bytes per output pixel.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green field masks for 565. */
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;                       /* 12 src bytes (4 pixels) per pass */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            /* Load four 3-byte pixels at byte offsets 0/3/6/9. */
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: pack b/g/r into 5-6-5. */
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
685

    
686
/**
 * Convert packed 24-bit pixels to 16-bit 565, treating the first source
 * byte as the high (red) field — see the scalar tail (r, g, b read order;
 * the channel-swapped counterpart of rgb24tobgr16).
 * src_size is in bytes; 3 source bytes per output pixel.
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green field masks for 565. */
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            /* Load four 3-byte pixels at byte offsets 0/3/6/9. */
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            /* <<8 puts byte 0 in the red field (swap vs rgb24tobgr16). */
            "psllq         $8, %%mm0    \n\t"
            "psllq         $8, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: first byte is red here (reversed channel order). */
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
746

    
747
/**
 * Convert packed 24-bit pixels to 16-bit 555, treating the first source
 * byte as the low (blue) field — see the scalar tail (b, g, r read order).
 * src_size is in bytes; 3 source bytes per output pixel.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    /* mm7/mm6 hold the red/green field masks for 555. */
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;                       /* 12 src bytes (4 pixels) per pass */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            /* Load four 3-byte pixels at byte offsets 0/3/6/9. */
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $9, %%mm2    \n\t"
            "psrlq         $9, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: pack b/g/r into 5-5-5. */
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
807

    
808
/*
 * Convert packed 24 bpp (3 bytes/pixel) to 15 bpp RGB555 (one uint16 per
 * pixel).  The MMX path packs 4 pixels (12 source bytes) per iteration;
 * the scalar tail handles the remainder and any non-MMX build.
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    /* mm7 = red mask, mm6 = green mask; blue mask is passed per iteration. */
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15; /* stop early so each 12-byte load stays in bounds */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"   32%1            \n\t"
            "movd         %1, %%mm0     \n\t"
            "movd        3%1, %%mm3     \n\t"
            "punpckldq   6%1, %%mm0     \n\t"
            "punpckldq   9%1, %%mm3     \n\t"
            "movq      %%mm0, %%mm1     \n\t"
            "movq      %%mm0, %%mm2     \n\t"
            "movq      %%mm3, %%mm4     \n\t"
            "movq      %%mm3, %%mm5     \n\t"
            "psllq        $7, %%mm0     \n\t"
            "psllq        $7, %%mm3     \n\t"
            "pand      %%mm7, %%mm0     \n\t"
            "pand      %%mm7, %%mm3     \n\t"
            "psrlq        $6, %%mm1     \n\t"
            "psrlq        $6, %%mm4     \n\t"
            "pand      %%mm6, %%mm1     \n\t"
            "pand      %%mm6, %%mm4     \n\t"
            "psrlq       $19, %%mm2     \n\t"
            "psrlq       $19, %%mm5     \n\t"
            "pand         %2, %%mm2     \n\t"
            "pand         %2, %%mm5     \n\t"
            "por       %%mm1, %%mm0     \n\t"
            "por       %%mm4, %%mm3     \n\t"
            "por       %%mm2, %%mm0     \n\t"
            "por       %%mm5, %%mm3     \n\t"
            "psllq       $16, %%mm3     \n\t"
            "por       %%mm3, %%mm0     \n\t"
            MOVNTQ"    %%mm0, %0        \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: one pixel per iteration (also the pure-C fallback). */
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
/*
  I use less accurate approximation here by simply left-shifting the input
  value and filling the low order bits with zeroes. This method improves PNG
  compression but this scheme cannot reproduce white exactly, since it does
  not generate an all-ones maximum value; the net effect is to darken the
  image slightly.

  The better method should be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      leftmost bits repeated to fill open bits
       |
   original bits
*/
/*
 * Expand 15 bpp RGB555 to 24 bpp (3 bytes/pixel), swapping the channel
 * order (see the scalar tail for the exact byte layout).  The MMX path
 * first widens 8 pixels to 32 bpp in registers, then repacks those to
 * 24 bpp via the "borrowed 32 to 24" sequence.
 */
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    mm_end = end - 7; /* process 8 pixels (16 bytes) per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"

            /* stash first 4 expanded pixels in mm6/mm7 */
            "movq       %%mm0, %%mm6    \n\t"
            "movq       %%mm3, %%mm7    \n\t"

            /* same expansion for the next 4 pixels (offset 8 bytes) */
            "movq         8%1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq         8%1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"

            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq       %%mm0, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "movq       %%mm6, %%mm0    \n\t"
            "movq       %%mm7, %%mm1    \n\t"

            "movq       %%mm4, %%mm6    \n\t"
            "movq       %%mm5, %%mm7    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm1, %%mm3    \n\t"

            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm3    \n\t"
            "psrlq         $8, %%mm6    \n\t"
            "psrlq         $8, %%mm7    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm1    \n\t"
            "pand          %2, %%mm4    \n\t"
            "pand          %2, %%mm5    \n\t"
            "pand          %3, %%mm2    \n\t"
            "pand          %3, %%mm3    \n\t"
            "pand          %3, %%mm6    \n\t"
            "pand          %3, %%mm7    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm3, %%mm1    \n\t"
            "por        %%mm6, %%mm4    \n\t"
            "por        %%mm7, %%mm5    \n\t"

            "movq       %%mm1, %%mm2    \n\t"
            "movq       %%mm4, %%mm3    \n\t"
            "psllq        $48, %%mm2    \n\t"
            "psllq        $32, %%mm3    \n\t"
            "pand          %4, %%mm2    \n\t"
            "pand          %5, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psrlq        $16, %%mm1    \n\t"
            "psrlq        $32, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm3, %%mm1    \n\t"
            "pand          %6, %%mm5    \n\t"
            "por        %%mm5, %%mm4    \n\t"

            MOVNTQ"     %%mm0,   %0     \n\t"
            MOVNTQ"     %%mm1,  8%0     \n\t"
            MOVNTQ"     %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: expand one RGB555 word to 3 bytes per pixel. */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
/*
 * Expand 16 bpp RGB565 to 24 bpp (3 bytes/pixel), swapping the channel
 * order.  Same structure as rgb15tobgr24 but with 5-6-5 masks/shifts:
 * the MMX path widens 8 pixels to 32 bpp in registers, then repacks to
 * 24 bpp via the "borrowed 32 to 24" sequence.
 */
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    mm_end = end - 7; /* process 8 pixels (16 bytes) per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm1    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"

            /* stash first 4 expanded pixels in mm6/mm7 */
            "movq       %%mm0, %%mm6    \n\t"
            "movq       %%mm3, %%mm7    \n\t"

            /* same expansion for the next 4 pixels (offset 8 bytes) */
            "movq         8%1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq         8%1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm1    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq       %%mm0, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "movq       %%mm6, %%mm0    \n\t"
            "movq       %%mm7, %%mm1    \n\t"

            "movq       %%mm4, %%mm6    \n\t"
            "movq       %%mm5, %%mm7    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm1, %%mm3    \n\t"

            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm3    \n\t"
            "psrlq         $8, %%mm6    \n\t"
            "psrlq         $8, %%mm7    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm1    \n\t"
            "pand          %2, %%mm4    \n\t"
            "pand          %2, %%mm5    \n\t"
            "pand          %3, %%mm2    \n\t"
            "pand          %3, %%mm3    \n\t"
            "pand          %3, %%mm6    \n\t"
            "pand          %3, %%mm7    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm3, %%mm1    \n\t"
            "por        %%mm6, %%mm4    \n\t"
            "por        %%mm7, %%mm5    \n\t"

            "movq       %%mm1, %%mm2    \n\t"
            "movq       %%mm4, %%mm3    \n\t"
            "psllq        $48, %%mm2    \n\t"
            "psllq        $32, %%mm3    \n\t"
            "pand          %4, %%mm2    \n\t"
            "pand          %5, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psrlq        $16, %%mm1    \n\t"
            "psrlq        $32, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm3, %%mm1    \n\t"
            "pand          %6, %%mm5    \n\t"
            "por        %%mm5, %%mm4    \n\t"

            MOVNTQ"     %%mm0,   %0     \n\t"
            MOVNTQ"     %%mm1,  8%0     \n\t"
            MOVNTQ"     %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: expand one RGB565 word to 3 bytes per pixel. */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
/*
 * Interleave 4 widened pixels back into 32 bpp with an opaque (0xFF) alpha
 * byte and store 16 bytes via two MOVNTQs.  Register contract on entry:
 *
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb   %%mm7, %%mm0    \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb   %%mm7, %%mm1    \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb   %%mm7, %%mm2    \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw  %%mm1, %%mm0    \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw  %%mm6, %%mm2    \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq       %%mm0, %%mm3    \n\t"                               \
    "punpcklwd  %%mm2, %%mm0    \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd  %%mm2, %%mm3    \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ"     %%mm0,  %0      \n\t"                               \
    MOVNTQ"     %%mm3, 8%0      \n\t"                               \

/*
 * Expand 15 bpp RGB555 to 32 bpp with an opaque alpha byte (255).
 * The MMX path widens 4 pixels per iteration and stores them with
 * PACK_RGB32; the scalar tail handles the remainder and the C fallback,
 * honouring HAVE_BIGENDIAN for the output byte order.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* mm7 = zeros, mm6 = all-ones: the register contract PACK_RGB32 needs */
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
    mm_end = end - 3; /* 4 pixels per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: one RGB555 word -> 4 output bytes (alpha = 255). */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
#endif
    }
}
/*
 * Expand 16 bpp RGB565 to 32 bpp with an opaque alpha byte (255).
 * Same structure as rgb15to32, but with 5-6-5 masks and shifts.
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* mm7 = zeros, mm6 = all-ones: the register contract PACK_RGB32 needs */
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
    mm_end = end - 3; /* 4 pixels per iteration */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm1    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* Scalar tail: one RGB565 word -> 4 output bytes (alpha = 255). */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
#endif
    }
}
/*
 * Swap the R and B bytes of every 32 bpp pixel (byte 0 <-> byte 2, the
 * other two bytes untouched).  Uses a negative index counting up to 0 so
 * the MMX loop needs only one add+js per 16 bytes; whatever is left when
 * idx reaches -15..15 (or everything, without MMX) is handled by the
 * 32-bit scalar loop below.  NOTE(review): s/d are offset by idx so the
 * final partial iterations index within bounds only if src_size is a
 * multiple of 4 — presumably guaranteed by callers; confirm.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
#if HAVE_MMX
    __asm__ volatile(
        "test          %0, %0           \n\t"
        "jns           2f               \n\t"
        PREFETCH"       (%1, %0)        \n\t"
        /* mm7 = mask32b ^ mask32r (byte-swap mask), mm6 = its complement
           within the pixel, mm7 then flipped to select the kept bytes */
        "movq          %3, %%mm7        \n\t"
        "pxor          %4, %%mm7        \n\t"
        "movq       %%mm7, %%mm6        \n\t"
        "pxor          %5, %%mm7        \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        PREFETCH"     32(%1, %0)        \n\t"
        "movq           (%1, %0), %%mm0 \n\t"
        "movq          8(%1, %0), %%mm1 \n\t"
# if HAVE_MMX2
        /* pshufw $177 swaps the two 16-bit halves of each dword */
        "pshufw      $177, %%mm0, %%mm3 \n\t"
        "pshufw      $177, %%mm1, %%mm5 \n\t"
        "pand       %%mm7, %%mm0        \n\t"
        "pand       %%mm6, %%mm3        \n\t"
        "pand       %%mm7, %%mm1        \n\t"
        "pand       %%mm6, %%mm5        \n\t"
        "por        %%mm3, %%mm0        \n\t"
        "por        %%mm5, %%mm1        \n\t"
# else
        /* no pshufw on plain MMX: emulate the swap with shifts */
        "movq       %%mm0, %%mm2        \n\t"
        "movq       %%mm1, %%mm4        \n\t"
        "pand       %%mm7, %%mm0        \n\t"
        "pand       %%mm6, %%mm2        \n\t"
        "pand       %%mm7, %%mm1        \n\t"
        "pand       %%mm6, %%mm4        \n\t"
        "movq       %%mm2, %%mm3        \n\t"
        "movq       %%mm4, %%mm5        \n\t"
        "pslld        $16, %%mm2        \n\t"
        "psrld        $16, %%mm3        \n\t"
        "pslld        $16, %%mm4        \n\t"
        "psrld        $16, %%mm5        \n\t"
        "por        %%mm2, %%mm0        \n\t"
        "por        %%mm4, %%mm1        \n\t"
        "por        %%mm3, %%mm0        \n\t"
        "por        %%mm5, %%mm1        \n\t"
# endif
        MOVNTQ"     %%mm0,  (%2, %0)    \n\t"
        MOVNTQ"     %%mm1, 8(%2, %0)    \n\t"
        "add          $16, %0           \n\t"
        "js            1b               \n\t"
        SFENCE"                         \n\t"
        EMMS"                           \n\t"
        "2:                             \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
#endif
    /* Scalar tail: swap R<->B of one 32-bit pixel per iteration. */
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}
/*
 * Swap the first and third byte of every 24 bpp pixel (B<->R).  The MMX
 * path processes 8 pixels (24 bytes) per iteration using the precomputed
 * mask24r/g/b tables; the scalar loop handles the remainder (or all
 * pixels without MMX).
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    unsigned i;
#if HAVE_MMX
    x86_reg mmx_size= 23 - src_size; /* negative index counting up to 0 */
    __asm__ volatile (
        "test             %%"REG_a", %%"REG_a"          \n\t"
        "jns                     2f                     \n\t"
        "movq     "MANGLE(mask24r)", %%mm5              \n\t"
        "movq     "MANGLE(mask24g)", %%mm6              \n\t"
        "movq     "MANGLE(mask24b)", %%mm7              \n\t"
        ASMALIGN(4)
        "1:                                             \n\t"
        PREFETCH" 32(%1, %%"REG_a")                     \n\t"
        "movq       (%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
        "movq       (%1, %%"REG_a"), %%mm1              \n\t" // BGR BGR BG
        "movq      2(%1, %%"REG_a"), %%mm2              \n\t" // R BGR BGR B
        "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
        "pand                 %%mm5, %%mm0              \n\t"
        "pand                 %%mm6, %%mm1              \n\t"
        "pand                 %%mm7, %%mm2              \n\t"
        "por                  %%mm0, %%mm1              \n\t"
        "por                  %%mm2, %%mm1              \n\t"
        "movq      6(%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
        MOVNTQ"               %%mm1,   (%2, %%"REG_a")  \n\t" // RGB RGB RG
        "movq      8(%1, %%"REG_a"), %%mm1              \n\t" // R BGR BGR B
        "movq     10(%1, %%"REG_a"), %%mm2              \n\t" // GR BGR BGR
        "pand                 %%mm7, %%mm0              \n\t"
        "pand                 %%mm5, %%mm1              \n\t"
        "pand                 %%mm6, %%mm2              \n\t"
        "por                  %%mm0, %%mm1              \n\t"
        "por                  %%mm2, %%mm1              \n\t"
        "movq     14(%1, %%"REG_a"), %%mm0              \n\t" // R BGR BGR B
        MOVNTQ"               %%mm1,  8(%2, %%"REG_a")  \n\t" // B RGB RGB R
        "movq     16(%1, %%"REG_a"), %%mm1              \n\t" // GR BGR BGR
        "movq     18(%1, %%"REG_a"), %%mm2              \n\t" // BGR BGR BG
        "pand                 %%mm6, %%mm0              \n\t"
        "pand                 %%mm7, %%mm1              \n\t"
        "pand                 %%mm5, %%mm2              \n\t"
        "por                  %%mm0, %%mm1              \n\t"
        "por                  %%mm2, %%mm1              \n\t"
        MOVNTQ"               %%mm1, 16(%2, %%"REG_a")  \n\t"
        "add                    $24, %%"REG_a"          \n\t"
        " js                     1b                     \n\t"
        "2:                                             \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; //finished, was multiple of 8

    /* Rewind src/dst to the unprocessed tail and fall through to C. */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    /* Scalar loop: swap bytes 0 and 2 of each 3-byte pixel. */
    for (i=0; i<src_size; i+=3) {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
/*
 * Interleave planar Y/U/V into packed YUYV (YUY2).  Each output line takes
 * width luma samples and width/2 chroma samples; the same chroma line is
 * reused for vertLumPerChroma consecutive luma lines (must be a power of
 * two — the (y & (vertLumPerChroma-1)) test relies on it).  Paths:
 * x86 MMX asm, Alpha MVI asm (which writes two lines per pass), a 64-bit
 * scalar path, and a generic 32-bit scalar path.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
#if HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            ASMALIGN(4)
            "1:                                         \n\t"
            PREFETCH"    32(%1, %%"REG_a", 2)           \n\t"
            PREFETCH"    32(%2, %%"REG_a")              \n\t"
            PREFETCH"    32(%3, %%"REG_a")              \n\t"
            "movq          (%2, %%"REG_a"), %%mm0       \n\t" // U(0)
            "movq                    %%mm0, %%mm2       \n\t" // U(0)
            "movq          (%3, %%"REG_a"), %%mm1       \n\t" // V(0)
            "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
            "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)

            "movq        (%1, %%"REG_a",2), %%mm3       \n\t" // Y(0)
            "movq       8(%1, %%"REG_a",2), %%mm5       \n\t" // Y(8)
            "movq                    %%mm3, %%mm4       \n\t" // Y(0)
            "movq                    %%mm5, %%mm6       \n\t" // Y(8)
            "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
            "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
            "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
            "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)

            MOVNTQ"                  %%mm3,   (%0, %%"REG_a", 4)    \n\t"
            MOVNTQ"                  %%mm4,  8(%0, %%"REG_a", 4)    \n\t"
            MOVNTQ"                  %%mm5, 16(%0, %%"REG_a", 4)    \n\t"
            MOVNTQ"                  %%mm6, 24(%0, %%"REG_a", 4)    \n\t"

            "add                        $8, %%"REG_a"   \n\t"
            "cmp                        %4, %%"REG_a"   \n\t"
            " jb                        1b              \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else

#if ARCH_ALPHA && HAVE_MVI
/* Emit one output quadword (4 YUYV pixels) for this line and the next. */
#define pl2yuy2(n)                  \
    y1 = yc[n];                     \
    y2 = yc2[n];                    \
    u = uc[n];                      \
    v = vc[n];                      \
    __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1));  \
    __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2));  \
    __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u));    \
    __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v));    \
    yuv1 = (u << 8) + (v << 24);                \
    yuv2 = yuv1 + y2;               \
    yuv1 += y1;                     \
    qdst[n]  = yuv1;                \
    qdst2[n] = yuv2;

        int i;
        uint64_t *qdst = (uint64_t *) dst;
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
        const uint32_t *yc = (uint32_t *) ysrc;
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
        for (i = 0; i < chromWidth; i += 8) {
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
            __asm__("ldq $31,64(%0)" :: "r"(yc));
            __asm__("ldq $31,64(%0)" :: "r"(yc2));
            __asm__("ldq $31,64(%0)" :: "r"(uc));
            __asm__("ldq $31,64(%0)" :: "r"(vc));

            pl2yuy2(0);
            pl2yuy2(1);
            pl2yuy2(2);
            pl2yuy2(3);

            yc    += 4;
            yc2   += 4;
            uc    += 4;
            vc    += 4;
            qdst  += 4;
            qdst2 += 4;
        }
        /* This path wrote two lines; advance past the second one. */
        y++;
        ysrc += lumStride;
        dst += dstStride;

#elif HAVE_FAST_64BIT
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2) {
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                (yc[1] << 8) + (vc[0] << 0);
#else
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* Advance chroma only every vertLumPerChroma luma lines. */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
#if HAVE_MMX
    __asm__(EMMS"       \n\t"
            SFENCE"     \n\t"
            :::"memory");
#endif
}
/**
 * Convert planar YV12 (4:2:0) to packed YUY2.
 *
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 *
 * Thin wrapper: forwards to yuvPlanartoyuy2() with vertLumPerChroma = 2,
 * i.e. each chroma line is reused for 2 luma lines (no vertical chroma
 * interpolation, hence the FIXME below).
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1576

    
1577
/**
 * Convert planar YUV to packed UYVY.
 *
 * @param ysrc/usrc/vsrc     source luma and chroma planes
 * @param dst                destination packed UYVY buffer
 * @param lumStride/chromStride/dstStride  per-line byte strides
 * @param vertLumPerChroma   number of luma lines sharing one chroma line
 *                           (must be a power of two; 2 for 4:2:0, 1 for 4:2:2 —
 *                           see the bitmask test at the bottom of the loop)
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1; // chroma samples (= output pixel pairs) per line
    for (y=0; y<height; y++) {
#if HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        // Interleave 8 U + 8 V + 16 Y bytes per iteration into 32 UYVY bytes.
        __asm__ volatile(
            "xor                %%"REG_a", %%"REG_a"    \n\t"
            ASMALIGN(4)
            "1:                                         \n\t"
            PREFETCH"   32(%1, %%"REG_a", 2)            \n\t"
            PREFETCH"   32(%2, %%"REG_a")               \n\t"
            PREFETCH"   32(%3, %%"REG_a")               \n\t"
            "movq         (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
            "movq                   %%mm0, %%mm2        \n\t" // U(0)
            "movq         (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
            "punpcklbw              %%mm1, %%mm0        \n\t" // UVUV UVUV(0)
            "punpckhbw              %%mm1, %%mm2        \n\t" // UVUV UVUV(8)

            "movq       (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
            "movq      8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
            "movq                   %%mm0, %%mm4        \n\t" // Y(0)
            "movq                   %%mm2, %%mm6        \n\t" // Y(8)
            "punpcklbw              %%mm3, %%mm0        \n\t" // YUYV YUYV(0)
            "punpckhbw              %%mm3, %%mm4        \n\t" // YUYV YUYV(4)
            "punpcklbw              %%mm5, %%mm2        \n\t" // YUYV YUYV(8)
            "punpckhbw              %%mm5, %%mm6        \n\t" // YUYV YUYV(12)

            MOVNTQ"                 %%mm0,   (%0, %%"REG_a", 4)     \n\t"
            MOVNTQ"                 %%mm4,  8(%0, %%"REG_a", 4)     \n\t"
            MOVNTQ"                 %%mm2, 16(%0, %%"REG_a", 4)     \n\t"
            MOVNTQ"                 %%mm6, 24(%0, %%"REG_a", 4)     \n\t"

            "add                       $8, %%"REG_a"    \n\t"
            "cmp                       %4, %%"REG_a"    \n\t"
            " jb                       1b               \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else
//FIXME adapt the Alpha ASM code from yv12->yuy2

#if HAVE_FAST_64BIT
        // 64-bit scalar fallback: build two UYVY pixel pairs per store.
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2) {
            uint64_t k, l;
            k = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
            l = uc[1] + (yc[2] << 8) +
                (vc[1] << 16) + (yc[3] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        // 32-bit scalar fallback: one UYVY pixel pair (4 bytes) per store,
        // with the byte order adjusted for host endianness.
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
                (vc[0] << 8) + (yc[1] << 0);
#else
            *idst++ = uc[0] + (yc[0] << 8) +
               (vc[0] << 16) + (yc[1] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        // Advance chroma only every vertLumPerChroma-th luma line
        // (vertLumPerChroma must be a power of two for this mask to work).
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#if HAVE_MMX
    // Leave MMX state and flush the non-temporal write-combining buffers.
    __asm__(EMMS"       \n\t"
            SFENCE"     \n\t"
            :::"memory");
#endif
}
1668

    
1669
/**
 * Convert planar YV12 (4:2:0) to packed UYVY.
 *
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (If this is a problem for anyone then tell me, and I will fix it.)
 *
 * Thin wrapper: forwards to yuvPlanartouyvy() with vertLumPerChroma = 2
 * (one chroma line per 2 luma lines, no chroma interpolation).
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1680

    
1681
/**
 * Convert planar YUV 4:2:2 to packed UYVY.
 *
 * Width should be a multiple of 16.
 *
 * Thin wrapper: forwards to yuvPlanartouyvy() with vertLumPerChroma = 1
 * (every luma line has its own chroma line).
 */
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
1690

    
1691
/**
 * Convert planar YUV 4:2:2 to packed YUY2.
 *
 * Width should be a multiple of 16.
 *
 * Thin wrapper: forwards to yuvPlanartoyuy2() with vertLumPerChroma = 1
 * (every luma line has its own chroma line).
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
}
1700

    
1701
/**
 * Convert packed YUY2 to planar YV12 (4:2:0).
 *
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 *
 * Chroma is taken only from the even source lines; the odd lines contribute
 * luma only (chroma subsampling by dropping, not averaging).
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1; // chroma samples per output line
    for (y=0; y<height; y+=2) {         // two source lines per iteration
#if HAVE_MMX
        // Even line: extract Y plus the U/V samples for this line pair.
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            "pcmpeqw                 %%mm7, %%mm7       \n\t"
            "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00...
            ASMALIGN(4)
            "1:                \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
            "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
            "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
            "movq                    %%mm0, %%mm2       \n\t" // YUYV YUYV(0)
            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(4)
            "psrlw                      $8, %%mm0       \n\t" // U0V0 U0V0(0)
            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(4)
            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
            "packuswb                %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(0)

            MOVNTQ"                  %%mm2, (%1, %%"REG_a", 2)  \n\t"

            "movq     16(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
            "movq     24(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(8)
            "movq                    %%mm2, %%mm4       \n\t" // YUYV YUYV(12)
            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(8)
            "psrlw                      $8, %%mm2       \n\t" // U0V0 U0V0(12)
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(8)
            "pand                    %%mm7, %%mm4       \n\t" // Y0Y0 Y0Y0(12)
            "packuswb                %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
            "packuswb                %%mm4, %%mm3       \n\t" // YYYY YYYY(8)

            MOVNTQ"                  %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            "movq                    %%mm0, %%mm2       \n\t" // UVUV UVUV(0)
            "movq                    %%mm1, %%mm3       \n\t" // UVUV UVUV(8)
            "psrlw                      $8, %%mm0       \n\t" // V0V0 V0V0(0)
            "psrlw                      $8, %%mm1       \n\t" // V0V0 V0V0(8)
            "pand                    %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
            "pand                    %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
            "packuswb                %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
            "packuswb                %%mm3, %%mm2       \n\t" // UUUU UUUU(0)

            MOVNTQ"                  %%mm0, (%3, %%"REG_a")     \n\t"
            MOVNTQ"                  %%mm2, (%2, %%"REG_a")     \n\t"

            "add                        $8, %%"REG_a"   \n\t"
            "cmp                        %4, %%"REG_a"   \n\t"
            " jb                        1b              \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        // Odd line: luma only. NOTE: this block reuses the FF,00,FF,00 mask
        // left in %%mm7 by the asm block above — do not insert EMMS between.
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            ASMALIGN(4)
            "1:                                         \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
            "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
            "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
            "movq     16(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
            "movq     24(%0, %%"REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
            "pand                    %%mm7, %%mm0       \n\t" // Y0Y0 Y0Y0(0)
            "pand                    %%mm7, %%mm1       \n\t" // Y0Y0 Y0Y0(4)
            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(8)
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(12)
            "packuswb                %%mm1, %%mm0       \n\t" // YYYY YYYY(0)
            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(8)

            MOVNTQ"                  %%mm0,  (%1, %%"REG_a", 2) \n\t"
            MOVNTQ"                  %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add                        $8, %%"REG_a"   \n\t"
            "cmp                        %4, %%"REG_a"   \n\t"
            " jb                        1b              \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
#else
        // Scalar fallback: even line, Y + U + V.
        long i;
        for (i=0; i<chromWidth; i++) {
            ydst[2*i+0]     = src[4*i+0];
            udst[i]     = src[4*i+1];
            ydst[2*i+1]     = src[4*i+2];
            vdst[i]     = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        // Scalar fallback: odd line, Y only (its chroma is discarded).
        for (i=0; i<chromWidth; i++) {
            ydst[2*i+0]     = src[4*i+0];
            ydst[2*i+1]     = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#if HAVE_MMX
    // Leave MMX state and flush the non-temporal write-combining buffers.
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#endif
}
1821

    
1822
/**
 * Convert YVU9 to YV12 — currently only copies the luma plane.
 *
 * NOTE(review): the luma plane is copied as one contiguous width*height
 * block, i.e. lumStride is ignored — this only works when the source and
 * destination luma strides both equal width. The chroma parameters
 * (usrc, vsrc, udst, vdst, chromStride) are unused until the XXX below
 * is implemented.
 */
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
                                      uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height, long lumStride, long chromStride)
{
    /* Y Plane */
    memcpy(ydst, ysrc, width*height);

    /* XXX: implement upscaling for U,V */
}
1831

    
1832
/**
 * Upscale one plane by 2x in both directions using bilinear (3:1 / 1:3)
 * interpolation: each output sample is a 3/4-1/4 weighted mix of its two
 * nearest source samples.
 *
 * The first and last output lines are scaled horizontally only; interior
 * line pairs mix the two neighboring source lines.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
    long x,y;

    dst[0]= src[0];

    // first line
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (y=1; y<srcHeight; y++) {
#if HAVE_MMX2 || HAVE_AMD3DNOW
        // Process srcWidth&~15 samples with MMX; the scalar loop below
        // handles the remainder (and the right edge).
        const x86_reg mmxSize= srcWidth&~15;
        __asm__ volatile(
            "mov           %4, %%"REG_a"            \n\t"
            "1:                                     \n\t"
            "movq         (%0, %%"REG_a"), %%mm0    \n\t"
            "movq         (%1, %%"REG_a"), %%mm1    \n\t"
            "movq        1(%0, %%"REG_a"), %%mm2    \n\t"
            "movq        1(%1, %%"REG_a"), %%mm3    \n\t"
            "movq       -1(%0, %%"REG_a"), %%mm4    \n\t"
            "movq       -1(%1, %%"REG_a"), %%mm5    \n\t"
            // Two chained pavg steps approximate the 3:1 weighting.
            PAVGB"                  %%mm0, %%mm5    \n\t"
            PAVGB"                  %%mm0, %%mm3    \n\t"
            PAVGB"                  %%mm0, %%mm5    \n\t"
            PAVGB"                  %%mm0, %%mm3    \n\t"
            PAVGB"                  %%mm1, %%mm4    \n\t"
            PAVGB"                  %%mm1, %%mm2    \n\t"
            PAVGB"                  %%mm1, %%mm4    \n\t"
            PAVGB"                  %%mm1, %%mm2    \n\t"
            "movq                   %%mm5, %%mm7    \n\t"
            "movq                   %%mm4, %%mm6    \n\t"
            "punpcklbw              %%mm3, %%mm5    \n\t"
            "punpckhbw              %%mm3, %%mm7    \n\t"
            "punpcklbw              %%mm2, %%mm4    \n\t"
            "punpckhbw              %%mm2, %%mm6    \n\t"
#if 1
            MOVNTQ"                 %%mm5,  (%2, %%"REG_a", 2)  \n\t"
            MOVNTQ"                 %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
            MOVNTQ"                 %%mm4,  (%3, %%"REG_a", 2)  \n\t"
            MOVNTQ"                 %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
#else
            "movq                   %%mm5,  (%2, %%"REG_a", 2)  \n\t"
            "movq                   %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
            "movq                   %%mm4,  (%3, %%"REG_a", 2)  \n\t"
            "movq                   %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
#endif
            "add                       $8, %%"REG_a"            \n\t"
            " js                       1b                       \n\t"
            :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
            "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
            "g" (-mmxSize)
            : "%"REG_a

        );
#else
        // Scalar-only build: start the C loop at x=0 (mmxSize-1 below).
        const x86_reg mmxSize=1;
#endif
        // Left edge: vertical-only interpolation.
        dst[0        ]= (3*src[0] +   src[srcStride])>>2;
        dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;

        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        // Right edge: vertical-only interpolation.
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
#if 1
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
#else
    // Disabled alternative: simple pixel doubling without interpolation.
    for (x=0; x<srcWidth; x++) {
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#endif

#if HAVE_MMX
    // Leave MMX state and flush the non-temporal write-combining buffers.
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#endif
}
1933

    
1934
/**
 * Convert packed UYVY to planar YV12 (4:2:0).
 *
 * Height should be a multiple of 2 and width should be a multiple of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line,
 * others are ignored in the C version.
 * FIXME: Write HQ version.
 *
 * In UYVY the chroma bytes are the low bytes of each 16-bit word and luma
 * the high bytes, so the pand/psrlw roles are swapped relative to
 * yuy2toyv12().
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1; // chroma samples per output line
    for (y=0; y<height; y+=2) {         // two source lines per iteration
#if HAVE_MMX
        // Even line: extract Y plus the U/V samples for this line pair.
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            "pcmpeqw             %%mm7, %%mm7   \n\t"
            "psrlw                  $8, %%mm7   \n\t" // FF,00,FF,00...
            ASMALIGN(4)
            "1:                                 \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
            "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
            "movq                %%mm0, %%mm2   \n\t" // UYVY UYVY(0)
            "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(4)
            "pand                %%mm7, %%mm0   \n\t" // U0V0 U0V0(0)
            "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(4)
            "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(0)
            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(4)
            "packuswb            %%mm1, %%mm0   \n\t" // UVUV UVUV(0)
            "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(0)

            MOVNTQ"              %%mm2,  (%1, %%"REG_a", 2) \n\t"

            "movq     16(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(8)
            "movq     24(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(12)
            "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(8)
            "movq                %%mm2, %%mm4   \n\t" // UYVY UYVY(12)
            "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(8)
            "pand                %%mm7, %%mm2   \n\t" // U0V0 U0V0(12)
            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(8)
            "psrlw                  $8, %%mm4   \n\t" // Y0Y0 Y0Y0(12)
            "packuswb            %%mm2, %%mm1   \n\t" // UVUV UVUV(8)
            "packuswb            %%mm4, %%mm3   \n\t" // YYYY YYYY(8)

            MOVNTQ"              %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            "movq                %%mm0, %%mm2   \n\t" // UVUV UVUV(0)
            "movq                %%mm1, %%mm3   \n\t" // UVUV UVUV(8)
            "psrlw                  $8, %%mm0   \n\t" // V0V0 V0V0(0)
            "psrlw                  $8, %%mm1   \n\t" // V0V0 V0V0(8)
            "pand                %%mm7, %%mm2   \n\t" // U0U0 U0U0(0)
            "pand                %%mm7, %%mm3   \n\t" // U0U0 U0U0(8)
            "packuswb            %%mm1, %%mm0   \n\t" // VVVV VVVV(0)
            "packuswb            %%mm3, %%mm2   \n\t" // UUUU UUUU(0)

            MOVNTQ"              %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ"              %%mm2, (%2, %%"REG_a") \n\t"

            "add                    $8, %%"REG_a"   \n\t"
            "cmp                    %4, %%"REG_a"   \n\t"
            " jb                    1b          \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        // Odd line: luma only (Y is the high byte of each UYVY word,
        // so a plain right shift extracts it — no mask needed).
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            ASMALIGN(4)
            "1:                                 \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
            "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
            "movq     16(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(8)
            "movq     24(%0, %%"REG_a", 4), %%mm3   \n\t" // UYVY UYVY(12)
            "psrlw                  $8, %%mm0   \n\t" // Y0Y0 Y0Y0(0)
            "psrlw                  $8, %%mm1   \n\t" // Y0Y0 Y0Y0(4)
            "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(8)
            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(12)
            "packuswb            %%mm1, %%mm0   \n\t" // YYYY YYYY(0)
            "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(8)

            MOVNTQ"              %%mm0,  (%1, %%"REG_a", 2) \n\t"
            MOVNTQ"              %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add                    $8, %%"REG_a"   \n\t"
            "cmp                    %4, %%"REG_a"   \n\t"
            " jb                    1b          \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
#else
        // Scalar fallback: even line, U + Y + V + Y per pixel pair.
        long i;
        for (i=0; i<chromWidth; i++) {
            udst[i]     = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i]     = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        // Scalar fallback: odd line, Y only (its chroma is discarded).
        for (i=0; i<chromWidth; i++) {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#if HAVE_MMX
    // Leave MMX state and flush the non-temporal write-combining buffers.
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#endif
}
2056

    
2057
/**
2058
 * Height should be a multiple of 2 and width should be a multiple of 2.
2059
 * (If this is a problem for anyone then tell me, and I will fix it.)
2060
 * Chrominance data is only taken from every second line,
2061
 * others are ignored in the C version.
2062
 * FIXME: Write HQ version.
2063
 */
2064
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2065
                                       long width, long height,
2066
                                       long lumStride, long chromStride, long srcStride)
2067
{
2068
    long y;
2069
    const x86_reg chromWidth= width>>1;
2070
#if HAVE_MMX
2071
    for (y=0; y<height-2; y+=2) {
2072
        long i;
2073
        for (i=0; i<2; i++) {
2074
            __asm__ volatile(
2075
                "mov                        %2, %%"REG_a"   \n\t"
2076
                "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
2077
                "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2078
                "pxor                    %%mm7, %%mm7       \n\t"
2079
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2080
                ASMALIGN(4)
2081
                "1:                                         \n\t"
2082
                PREFETCH"    64(%0, %%"REG_d")              \n\t"
2083
                "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2084
                "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
2085
                "punpcklbw               %%mm7, %%mm0       \n\t"
2086
                "punpcklbw               %%mm7, %%mm1       \n\t"
2087
                "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
2088
                "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
2089
                "punpcklbw               %%mm7, %%mm2       \n\t"
2090
                "punpcklbw               %%mm7, %%mm3       \n\t"
2091
                "pmaddwd                 %%mm6, %%mm0       \n\t"
2092
                "pmaddwd                 %%mm6, %%mm1       \n\t"
2093
                "pmaddwd                 %%mm6, %%mm2       \n\t"
2094
                "pmaddwd                 %%mm6, %%mm3       \n\t"
2095
#ifndef FAST_BGR2YV12
2096
                "psrad                      $8, %%mm0       \n\t"
2097
                "psrad                      $8, %%mm1       \n\t"
2098
                "psrad                      $8, %%mm2       \n\t"
2099
                "psrad                      $8, %%mm3       \n\t"
2100
#endif
2101
                "packssdw                %%mm1, %%mm0       \n\t"
2102
                "packssdw                %%mm3, %%mm2       \n\t"
2103
                "pmaddwd                 %%mm5, %%mm0       \n\t"
2104
                "pmaddwd                 %%mm5, %%mm2       \n\t"
2105
                "packssdw                %%mm2, %%mm0       \n\t"
2106
                "psraw                      $7, %%mm0       \n\t"
2107

    
2108
                "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2109
                "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
2110
                "punpcklbw               %%mm7, %%mm4       \n\t"
2111
                "punpcklbw               %%mm7, %%mm1       \n\t"
2112
                "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
2113
                "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
2114
                "punpcklbw               %%mm7, %%mm2       \n\t"
2115
                "punpcklbw               %%mm7, %%mm3       \n\t"
2116
                "pmaddwd                 %%mm6, %%mm4       \n\t"
2117
                "pmaddwd                 %%mm6, %%mm1       \n\t"
2118
                "pmaddwd                 %%mm6, %%mm2       \n\t"
2119
                "pmaddwd                 %%mm6, %%mm3       \n\t"
2120
#ifndef FAST_BGR2YV12
2121
                "psrad                      $8, %%mm4       \n\t"
2122
                "psrad                      $8, %%mm1       \n\t"
2123
                "psrad                      $8, %%mm2       \n\t"
2124
                "psrad                      $8, %%mm3       \n\t"
2125
#endif
2126
                "packssdw                %%mm1, %%mm4       \n\t"
2127
                "packssdw                %%mm3, %%mm2       \n\t"
2128
                "pmaddwd                 %%mm5, %%mm4       \n\t"
2129
                "pmaddwd                 %%mm5, %%mm2       \n\t"
2130
                "add                       $24, %%"REG_d"   \n\t"
2131
                "packssdw                %%mm2, %%mm4       \n\t"
2132
                "psraw                      $7, %%mm4       \n\t"
2133

    
2134
                "packuswb                %%mm4, %%mm0       \n\t"
2135
                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
2136

    
2137
                MOVNTQ"                  %%mm0, (%1, %%"REG_a") \n\t"
2138
                "add                        $8,      %%"REG_a"  \n\t"
2139
                " js                        1b                  \n\t"
2140
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
2141
                : "%"REG_a, "%"REG_d
2142
            );
2143
            ydst += lumStride;
2144
            src  += srcStride;
2145
        }
2146
        src -= srcStride*2;
2147
        __asm__ volatile(
2148
            "mov                        %4, %%"REG_a"   \n\t"
2149
            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2150
            "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
2151
            "pxor                    %%mm7, %%mm7       \n\t"
2152
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2153
            "add                 %%"REG_d", %%"REG_d"   \n\t"
2154
            ASMALIGN(4)
2155
            "1:                                         \n\t"
2156
            PREFETCH"    64(%0, %%"REG_d")              \n\t"
2157
            PREFETCH"    64(%1, %%"REG_d")              \n\t"
2158
#if HAVE_MMX2 || HAVE_AMD3DNOW
2159
            "movq          (%0, %%"REG_d"), %%mm0       \n\t"
2160
            "movq          (%1, %%"REG_d"), %%mm1       \n\t"
2161
            "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
2162
            "movq         6(%1, %%"REG_d"), %%mm3       \n\t"
2163
            PAVGB"                   %%mm1, %%mm0       \n\t"
2164
            PAVGB"                   %%mm3, %%mm2       \n\t"
2165
            "movq                    %%mm0, %%mm1       \n\t"
2166
            "movq                    %%mm2, %%mm3       \n\t"
2167
            "psrlq                     $24, %%mm0       \n\t"
2168
            "psrlq                     $24, %%mm2       \n\t"
2169
            PAVGB"                   %%mm1, %%mm0       \n\t"
2170
            PAVGB"                   %%mm3, %%mm2       \n\t"
2171
            "punpcklbw               %%mm7, %%mm0       \n\t"
2172
            "punpcklbw               %%mm7, %%mm2       \n\t"
2173
#else
2174
            "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2175
            "movd          (%1, %%"REG_d"), %%mm1       \n\t"
2176
            "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
2177
            "movd         3(%1, %%"REG_d"), %%mm3       \n\t"
2178
            "punpcklbw               %%mm7, %%mm0       \n\t"
2179
            "punpcklbw               %%mm7, %%mm1       \n\t"
2180
            "punpcklbw               %%mm7, %%mm2       \n\t"
2181
            "punpcklbw               %%mm7, %%mm3       \n\t"
2182
            "paddw                   %%mm1, %%mm0       \n\t"
2183
            "paddw                   %%mm3, %%mm2       \n\t"
2184
            "paddw                   %%mm2, %%mm0       \n\t"
2185
            "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
2186
            "movd         6(%1, %%"REG_d"), %%mm1       \n\t"
2187
            "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
2188
            "movd         9(%1, %%"REG_d"), %%mm3       \n\t"
2189
            "punpcklbw               %%mm7, %%mm4       \n\t"
2190
            "punpcklbw               %%mm7, %%mm1       \n\t"
2191
            "punpcklbw               %%mm7, %%mm2       \n\t"
2192
            "punpcklbw               %%mm7, %%mm3       \n\t"
2193
            "paddw                   %%mm1, %%mm4       \n\t"
2194
            "paddw                   %%mm3, %%mm2       \n\t"
2195
            "paddw                   %%mm4, %%mm2       \n\t"
2196
            "psrlw                      $2, %%mm0       \n\t"
2197
            "psrlw                      $2, %%mm2       \n\t"
2198
#endif
2199
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
2200
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
2201

    
2202
            "pmaddwd                 %%mm0, %%mm1       \n\t"
2203
            "pmaddwd                 %%mm2, %%mm3       \n\t"
2204
            "pmaddwd                 %%mm6, %%mm0       \n\t"
2205
            "pmaddwd                 %%mm6, %%mm2       \n\t"
2206
#ifndef FAST_BGR2YV12
2207
            "psrad                      $8, %%mm0       \n\t"
2208
            "psrad                      $8, %%mm1       \n\t"
2209
            "psrad                      $8, %%mm2       \n\t"
2210
            "psrad                      $8, %%mm3       \n\t"
2211
#endif
2212
            "packssdw                %%mm2, %%mm0       \n\t"
2213
            "packssdw                %%mm3, %%mm1       \n\t"
2214
            "pmaddwd                 %%mm5, %%mm0       \n\t"
2215
            "pmaddwd                 %%mm5, %%mm1       \n\t"
2216
            "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
2217
            "psraw                      $7, %%mm0       \n\t"
2218

    
2219
#if HAVE_MMX2 || HAVE_AMD3DNOW
2220
            "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
2221
            "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
2222
            "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
2223
            "movq        18(%1, %%"REG_d"), %%mm3       \n\t"
2224
            PAVGB"                   %%mm1, %%mm4       \n\t"
2225
            PAVGB"                   %%mm3, %%mm2       \n\t"
2226
            "movq                    %%mm4, %%mm1       \n\t"
2227
            "movq                    %%mm2, %%mm3       \n\t"
2228
            "psrlq                     $24, %%mm4       \n\t"
2229
            "psrlq                     $24, %%mm2       \n\t"
2230
            PAVGB"                   %%mm1, %%mm4       \n\t"
2231
            PAVGB"                   %%mm3, %%mm2       \n\t"
2232
            "punpcklbw               %%mm7, %%mm4       \n\t"
2233
            "punpcklbw               %%mm7, %%mm2       \n\t"
2234
#else
2235
            "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2236
            "movd        12(%1, %%"REG_d"), %%mm1       \n\t"
2237
            "movd        15(%0, %%"REG_d"), %%mm2       \n\t"
2238
            "movd        15(%1, %%"REG_d"), %%mm3       \n\t"
2239
            "punpcklbw               %%mm7, %%mm4       \n\t"
2240
            "punpcklbw               %%mm7, %%mm1       \n\t"
2241
            "punpcklbw               %%mm7, %%mm2       \n\t"
2242
            "punpcklbw               %%mm7, %%mm3       \n\t"
2243
            "paddw                   %%mm1, %%mm4       \n\t"
2244
            "paddw                   %%mm3, %%mm2       \n\t"
2245
            "paddw                   %%mm2, %%mm4       \n\t"
2246
            "movd        18(%0, %%"REG_d"), %%mm5       \n\t"
2247
            "movd        18(%1, %%"REG_d"), %%mm1       \n\t"
2248
            "movd        21(%0, %%"REG_d"), %%mm2       \n\t"
2249
            "movd        21(%1, %%"REG_d"), %%mm3       \n\t"
2250
            "punpcklbw               %%mm7, %%mm5       \n\t"
2251
            "punpcklbw               %%mm7, %%mm1       \n\t"
2252
            "punpcklbw               %%mm7, %%mm2       \n\t"
2253
            "punpcklbw               %%mm7, %%mm3       \n\t"
2254
            "paddw                   %%mm1, %%mm5       \n\t"
2255
            "paddw                   %%mm3, %%mm2       \n\t"
2256
            "paddw                   %%mm5, %%mm2       \n\t"
2257
            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2258
            "psrlw                      $2, %%mm4       \n\t"
2259
            "psrlw                      $2, %%mm2       \n\t"
2260
#endif
2261
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
2262
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
2263

    
2264
            "pmaddwd                 %%mm4, %%mm1       \n\t"
2265
            "pmaddwd                 %%mm2, %%mm3       \n\t"
2266
            "pmaddwd                 %%mm6, %%mm4       \n\t"
2267
            "pmaddwd                 %%mm6, %%mm2       \n\t"
2268
#ifndef FAST_BGR2YV12
2269
            "psrad                      $8, %%mm4       \n\t"
2270
            "psrad                      $8, %%mm1       \n\t"
2271
            "psrad                      $8, %%mm2       \n\t"
2272
            "psrad                      $8, %%mm3       \n\t"
2273
#endif
2274
            "packssdw                %%mm2, %%mm4       \n\t"
2275
            "packssdw                %%mm3, %%mm1       \n\t"
2276
            "pmaddwd                 %%mm5, %%mm4       \n\t"
2277
            "pmaddwd                 %%mm5, %%mm1       \n\t"
2278
            "add                       $24, %%"REG_d"   \n\t"
2279
            "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
2280
            "psraw                      $7, %%mm4       \n\t"
2281

    
2282
            "movq                    %%mm0, %%mm1           \n\t"
2283
            "punpckldq               %%mm4, %%mm0           \n\t"
2284
            "punpckhdq               %%mm4, %%mm1           \n\t"
2285
            "packsswb                %%mm1, %%mm0           \n\t"
2286
            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0         \n\t"
2287
            "movd                    %%mm0, (%2, %%"REG_a") \n\t"
2288
            "punpckhdq               %%mm0, %%mm0           \n\t"
2289
            "movd                    %%mm0, (%3, %%"REG_a") \n\t"
2290
            "add                        $4, %%"REG_a"       \n\t"
2291
            " js                        1b                  \n\t"
2292
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2293
            : "%"REG_a, "%"REG_d
2294
        );
2295

    
2296
        udst += chromStride;
2297
        vdst += chromStride;
2298
        src  += srcStride*2;
2299
    }
2300

    
2301
    __asm__ volatile(EMMS"       \n\t"
2302
                     SFENCE"     \n\t"
2303
                     :::"memory");
2304
#else
2305
    y=0;
2306
#endif
2307
    for (; y<height; y+=2) {
2308
        long i;
2309
        for (i=0; i<chromWidth; i++) {
2310
            unsigned int b = src[6*i+0];
2311
            unsigned int g = src[6*i+1];
2312
            unsigned int r = src[6*i+2];
2313

    
2314
            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2315
            unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2316
            unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2317

    
2318
            udst[i]     = U;
2319
            vdst[i]     = V;
2320
            ydst[2*i]   = Y;
2321

    
2322
            b = src[6*i+3];
2323
            g = src[6*i+4];
2324
            r = src[6*i+5];
2325

    
2326
            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2327
            ydst[2*i+1]     = Y;
2328
        }
2329
        ydst += lumStride;
2330
        src  += srcStride;
2331

    
2332
        for (i=0; i<chromWidth; i++) {
2333
            unsigned int b = src[6*i+0];
2334
            unsigned int g = src[6*i+1];
2335
            unsigned int r = src[6*i+2];
2336

    
2337
            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2338

    
2339
            ydst[2*i]     = Y;
2340

    
2341
            b = src[6*i+3];
2342
            g = src[6*i+4];
2343
            r = src[6*i+5];
2344

    
2345
            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2346
            ydst[2*i+1]     = Y;
2347
        }
2348
        udst += chromStride;
2349
        vdst += chromStride;
2350
        ydst += lumStride;
2351
        src  += srcStride;
2352
    }
2353
}
2354

    
2355
static void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2356
                             long width, long height, long src1Stride,
2357
                             long src2Stride, long dstStride)
2358
{
2359
    long h;
2360

    
2361
    for (h=0; h < height; h++) {
2362
        long w;
2363

    
2364
#if HAVE_MMX
2365
#if HAVE_SSE2
2366
        __asm__(
2367
            "xor              %%"REG_a", %%"REG_a"  \n\t"
2368
            "1:                                     \n\t"
2369
            PREFETCH" 64(%1, %%"REG_a")             \n\t"
2370
            PREFETCH" 64(%2, %%"REG_a")             \n\t"
2371
            "movdqa     (%1, %%"REG_a"), %%xmm0     \n\t"
2372
            "movdqa     (%1, %%"REG_a"), %%xmm1     \n\t"
2373
            "movdqa     (%2, %%"REG_a"), %%xmm2     \n\t"
2374
            "punpcklbw           %%xmm2, %%xmm0     \n\t"
2375
            "punpckhbw           %%xmm2, %%xmm1     \n\t"
2376
            "movntdq             %%xmm0,   (%0, %%"REG_a", 2)   \n\t"
2377
            "movntdq             %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
2378
            "add                    $16, %%"REG_a"  \n\t"
2379
            "cmp                     %3, %%"REG_a"  \n\t"
2380
            " jb                     1b             \n\t"
2381
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2382
            : "memory", "%"REG_a""
2383
        );
2384
#else
2385
        __asm__(
2386
            "xor %%"REG_a", %%"REG_a"               \n\t"
2387
            "1:                                     \n\t"
2388
            PREFETCH" 64(%1, %%"REG_a")             \n\t"
2389
            PREFETCH" 64(%2, %%"REG_a")             \n\t"
2390
            "movq       (%1, %%"REG_a"), %%mm0      \n\t"
2391
            "movq      8(%1, %%"REG_a"), %%mm2      \n\t"
2392
            "movq                 %%mm0, %%mm1      \n\t"
2393
            "movq                 %%mm2, %%mm3      \n\t"
2394
            "movq       (%2, %%"REG_a"), %%mm4      \n\t"
2395
            "movq      8(%2, %%"REG_a"), %%mm5      \n\t"
2396
            "punpcklbw            %%mm4, %%mm0      \n\t"
2397
            "punpckhbw            %%mm4, %%mm1      \n\t"
2398
            "punpcklbw            %%mm5, %%mm2      \n\t"
2399
            "punpckhbw            %%mm5, %%mm3      \n\t"
2400
            MOVNTQ"               %%mm0,   (%0, %%"REG_a", 2)   \n\t"
2401
            MOVNTQ"               %%mm1,  8(%0, %%"REG_a", 2)   \n\t"
2402
            MOVNTQ"               %%mm2, 16(%0, %%"REG_a", 2)   \n\t"
2403
            MOVNTQ"               %%mm3, 24(%0, %%"REG_a", 2)   \n\t"
2404
            "add                    $16, %%"REG_a"  \n\t"
2405
            "cmp                     %3, %%"REG_a"  \n\t"
2406
            " jb                     1b             \n\t"
2407
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2408
            : "memory", "%"REG_a
2409
        );
2410
#endif
2411
        for (w= (width&(~15)); w < width; w++) {
2412
            dest[2*w+0] = src1[w];
2413
            dest[2*w+1] = src2[w];
2414
        }
2415
#else
2416
        for (w=0; w < width; w++) {
2417
            dest[2*w+0] = src1[w];
2418
            dest[2*w+1] = src2[w];
2419
        }
2420
#endif
2421
        dest += dstStride;
2422
                src1 += src1Stride;
2423
                src2 += src2Stride;
2424
    }
2425
#if HAVE_MMX
2426
    __asm__(
2427
            EMMS"       \n\t"
2428
            SFENCE"     \n\t"
2429
            ::: "memory"
2430
            );
2431
#endif
2432
}
2433

    
2434
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2435
                                       uint8_t *dst1, uint8_t *dst2,
2436
                                       long width, long height,
2437
                                       long srcStride1, long srcStride2,
2438
                                       long dstStride1, long dstStride2)
2439
{
2440
    x86_reg y;
2441
    long x,w,h;
2442
    w=width/2; h=height/2;
2443
#if HAVE_MMX
2444
    __asm__ volatile(
2445
        PREFETCH" %0    \n\t"
2446
        PREFETCH" %1    \n\t"
2447
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2448
#endif
2449
    for (y=0;y<h;y++) {
2450
        const uint8_t* s1=src1+srcStride1*(y>>1);
2451
        uint8_t* d=dst1+dstStride1*y;
2452
        x=0;
2453
#if HAVE_MMX
2454
        for (;x<w-31;x+=32) {
2455
            __asm__ volatile(
2456
                PREFETCH"   32%1        \n\t"
2457
                "movq         %1, %%mm0 \n\t"
2458
                "movq        8%1, %%mm2 \n\t"
2459
                "movq       16%1, %%mm4 \n\t"
2460
                "movq       24%1, %%mm6 \n\t"
2461
                "movq      %%mm0, %%mm1 \n\t"
2462
                "movq      %%mm2, %%mm3 \n\t"
2463
                "movq      %%mm4, %%mm5 \n\t"
2464
                "movq      %%mm6, %%mm7 \n\t"
2465
                "punpcklbw %%mm0, %%mm0 \n\t"
2466
                "punpckhbw %%mm1, %%mm1 \n\t"
2467
                "punpcklbw %%mm2, %%mm2 \n\t"
2468
                "punpckhbw %%mm3, %%mm3 \n\t"
2469
                "punpcklbw %%mm4, %%mm4 \n\t"
2470
                "punpckhbw %%mm5, %%mm5 \n\t"
2471
                "punpcklbw %%mm6, %%mm6 \n\t"
2472
                "punpckhbw %%mm7, %%mm7 \n\t"
2473
                MOVNTQ"    %%mm0,   %0  \n\t"
2474
                MOVNTQ"    %%mm1,  8%0  \n\t"
2475
                MOVNTQ"    %%mm2, 16%0  \n\t"
2476
                MOVNTQ"    %%mm3, 24%0  \n\t"
2477
                MOVNTQ"    %%mm4, 32%0  \n\t"
2478
                MOVNTQ"    %%mm5, 40%0  \n\t"
2479
                MOVNTQ"    %%mm6, 48%0  \n\t"
2480
                MOVNTQ"    %%mm7, 56%0"
2481
                :"=m"(d[2*x])
2482
                :"m"(s1[x])
2483
                :"memory");
2484
        }
2485
#endif
2486
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2487
    }
2488
    for (y=0;y<h;y++) {
2489
        const uint8_t* s2=src2+srcStride2*(y>>1);
2490
        uint8_t* d=dst2+dstStride2*y;
2491
        x=0;
2492
#if HAVE_MMX
2493
        for (;x<w-31;x+=32) {
2494
            __asm__ volatile(
2495
                PREFETCH"   32%1        \n\t"
2496
                "movq         %1, %%mm0 \n\t"
2497
                "movq        8%1, %%mm2 \n\t"
2498
                "movq       16%1, %%mm4 \n\t"
2499
                "movq       24%1, %%mm6 \n\t"
2500
                "movq      %%mm0, %%mm1 \n\t"
2501
                "movq      %%mm2, %%mm3 \n\t"
2502
                "movq      %%mm4, %%mm5 \n\t"
2503
                "movq      %%mm6, %%mm7 \n\t"
2504
                "punpcklbw %%mm0, %%mm0 \n\t"
2505
                "punpckhbw %%mm1, %%mm1 \n\t"
2506
                "punpcklbw %%mm2, %%mm2 \n\t"
2507
                "punpckhbw %%mm3, %%mm3 \n\t"
2508
                "punpcklbw %%mm4, %%mm4 \n\t"
2509
                "punpckhbw %%mm5, %%mm5 \n\t"
2510
                "punpcklbw %%mm6, %%mm6 \n\t"
2511
                "punpckhbw %%mm7, %%mm7 \n\t"
2512
                MOVNTQ"    %%mm0,   %0  \n\t"
2513
                MOVNTQ"    %%mm1,  8%0  \n\t"
2514
                MOVNTQ"    %%mm2, 16%0  \n\t"
2515
                MOVNTQ"    %%mm3, 24%0  \n\t"
2516
                MOVNTQ"    %%mm4, 32%0  \n\t"
2517
                MOVNTQ"    %%mm5, 40%0  \n\t"
2518
                MOVNTQ"    %%mm6, 48%0  \n\t"
2519
                MOVNTQ"    %%mm7, 56%0"
2520
                :"=m"(d[2*x])
2521
                :"m"(s2[x])
2522
                :"memory");
2523
        }
2524
#endif
2525
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2526
    }
2527
#if HAVE_MMX
2528
    __asm__(
2529
            EMMS"       \n\t"
2530
            SFENCE"     \n\t"
2531
            ::: "memory"
2532
        );
2533
#endif
2534
}
2535

    
2536
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2537
                                        uint8_t *dst,
2538
                                        long width, long height,
2539
                                        long srcStride1, long srcStride2,
2540
                                        long srcStride3, long dstStride)
2541
{
2542
    x86_reg x;
2543
    long y,w,h;
2544
    w=width/2; h=height;
2545
    for (y=0;y<h;y++) {
2546
        const uint8_t* yp=src1+srcStride1*y;
2547
        const uint8_t* up=src2+srcStride2*(y>>2);
2548
        const uint8_t* vp=src3+srcStride3*(y>>2);
2549
        uint8_t* d=dst+dstStride*y;
2550
        x=0;
2551
#if HAVE_MMX
2552
        for (;x<w-7;x+=8) {
2553
            __asm__ volatile(
2554
                PREFETCH"   32(%1, %0)          \n\t"
2555
                PREFETCH"   32(%2, %0)          \n\t"
2556
                PREFETCH"   32(%3, %0)          \n\t"
2557
                "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2558
                "movq         (%2, %0), %%mm1   \n\t" /* U0U1U2U3U4U5U6U7 */
2559
                "movq         (%3, %0), %%mm2   \n\t" /* V0V1V2V3V4V5V6V7 */
2560
                "movq            %%mm0, %%mm3   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2561
                "movq            %%mm1, %%mm4   \n\t" /* U0U1U2U3U4U5U6U7 */
2562
                "movq            %%mm2, %%mm5   \n\t" /* V0V1V2V3V4V5V6V7 */
2563
                "punpcklbw       %%mm1, %%mm1   \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2564
                "punpcklbw       %%mm2, %%mm2   \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2565
                "punpckhbw       %%mm4, %%mm4   \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2566
                "punpckhbw       %%mm5, %%mm5   \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2567

    
2568
                "movq            %%mm1, %%mm6   \n\t"
2569
                "punpcklbw       %%mm2, %%mm1   \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2570
                "punpcklbw       %%mm1, %%mm0   \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2571
                "punpckhbw       %%mm1, %%mm3   \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2572
                MOVNTQ"          %%mm0,  (%4, %0, 8)    \n\t"
2573
                MOVNTQ"          %%mm3, 8(%4, %0, 8)    \n\t"
2574

    
2575
                "punpckhbw       %%mm2, %%mm6   \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2576
                "movq     8(%1, %0, 4), %%mm0   \n\t"
2577
                "movq            %%mm0, %%mm3   \n\t"
2578
                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2579
                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2580
                MOVNTQ"          %%mm0, 16(%4, %0, 8)   \n\t"
2581
                MOVNTQ"          %%mm3, 24(%4, %0, 8)   \n\t"
2582

    
2583
                "movq            %%mm4, %%mm6   \n\t"
2584
                "movq    16(%1, %0, 4), %%mm0   \n\t"
2585
                "movq            %%mm0, %%mm3   \n\t"
2586
                "punpcklbw       %%mm5, %%mm4   \n\t"
2587
                "punpcklbw       %%mm4, %%mm0   \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2588
                "punpckhbw       %%mm4, %%mm3   \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2589
                MOVNTQ"          %%mm0, 32(%4, %0, 8)   \n\t"
2590
                MOVNTQ"          %%mm3, 40(%4, %0, 8)   \n\t"
2591

    
2592
                "punpckhbw       %%mm5, %%mm6   \n\t"
2593
                "movq    24(%1, %0, 4), %%mm0   \n\t"
2594
                "movq            %%mm0, %%mm3   \n\t"
2595
                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2596
                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2597
                MOVNTQ"          %%mm0, 48(%4, %0, 8)   \n\t"
2598
                MOVNTQ"          %%mm3, 56(%4, %0, 8)   \n\t"
2599

    
2600
                : "+r" (x)
2601
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
2602
                :"memory");
2603
        }
2604
#endif
2605
        for (; x<w; x++) {
2606
            const long x2 = x<<2;
2607
            d[8*x+0] = yp[x2];
2608
            d[8*x+1] = up[x];
2609
            d[8*x+2] = yp[x2+1];
2610
            d[8*x+3] = vp[x];
2611
            d[8*x+4] = yp[x2+2];
2612
            d[8*x+5] = up[x];
2613
            d[8*x+6] = yp[x2+3];
2614
            d[8*x+7] = vp[x];
2615
        }
2616
    }
2617
#if HAVE_MMX
2618
    __asm__(
2619
            EMMS"       \n\t"
2620
            SFENCE"     \n\t"
2621
            ::: "memory"
2622
        );
2623
#endif
2624
}
2625

    
2626
/*
 * Copy every second byte of src (even offsets: src[0], src[2], ...) into
 * dst[0..count-1]. Pointers are biased to the end of the buffers so the loop
 * counter can run from -count up to 0. MMX path masks out the odd bytes with
 * a 0x00FF word mask and packs 32 source bytes -> 16 dst bytes per iteration.
 */
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst +=   count;
    src += 2*count;
    count= - count;

#if HAVE_MMX
    if(count <= -16) {
        count += 15;    /* leave 15 elements for the scalar tail check */
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t" /* mm7 = 0x00FF per word */
            "1:                                \n\t"
            "movq -30(%1, %0, 2), %%mm0        \n\t"
            "movq -22(%1, %0, 2), %%mm1        \n\t"
            "movq -14(%1, %0, 2), %%mm2        \n\t"
            "movq  -6(%1, %0, 2), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            MOVNTQ"        %%mm0,-15(%2, %0)   \n\t"
            MOVNTQ"        %%mm2,- 7(%2, %0)   \n\t"
            "add             $16, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;    /* restore bias so the tail loop sees true remainder */
    }
#endif
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}

/*
 * From groups of 4 source bytes, split the two even-offset bytes into two
 * planes: dst0[i] = src[4*i], dst1[i] = src[4*i+2] (e.g. U/V extraction from
 * a packed 4:2:2 stream). Pointers are end-biased; the counter runs from
 * -count to 0. MMX path processes 8 output pairs per iteration.
 */
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
#if HAVE_MMX
    if(count <= -8) {
        count += 7;     /* reserve 7 elements for the scalar tail */
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t" /* 0x00FF word mask */
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            /* separate the interleaved pair streams: high bytes -> one plane,
               masked low bytes -> the other */
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}

/*
 * Like extract_even2, but averages two source rows first:
 * dst0[i] = avg(src0[4*i],   src1[4*i]),
 * dst1[i] = avg(src0[4*i+2], src1[4*i+2]).
 * Note the scalar tail uses (a+b)>>1 (truncating) while the SIMD path uses
 * PAVGB, which rounds up — the two may differ by 1 LSB, as in the original.
 * SIMD path is only built when PAVGB is available (MMX2/3DNow).
 */
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;     /* reserve 7 elements for the scalar tail */
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7        \n\t"
            "psrlw             $8, %%mm7        \n\t" /* 0x00FF word mask */
            "1:                                \n\t"
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
            "pand           %%mm7, %%mm0        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm2        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm1, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm2        \n\t"
            "movq           %%mm0, %%mm1        \n\t"
            "movq           %%mm2, %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm2, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm1        \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
            "add               $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}

/*
 * From groups of 4 source bytes, split the two odd-offset bytes into two
 * planes: dst0[i] = src[4*i+1], dst1[i] = src[4*i+3] (the scalar loop
 * expresses this via the src++ just before it). Counterpart of
 * extract_even2, selecting the odd bytes by shifting right 8 instead of
 * masking. MMX path processes 8 output pairs per iteration.
 */
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
#if HAVE_MMX
    if(count <= -8) {
        count += 7;     /* reserve 7 elements for the scalar tail */
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t" /* 0x00FF word mask */
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            /* shift right 8 to select the odd bytes of each word */
            "psrlw            $8, %%mm0        \n\t"
            "psrlw            $8, %%mm1        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "psrlw            $8, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src++;  /* bias so the tail indexes the odd bytes: src[4*i+1], src[4*i+3] */
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}

static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2816
{
2817
    dst0 +=   count;
2818
    dst1 +=   count;
2819
    src0 += 4*count;
2820
    src1 += 4*count;
2821
    count= - count;
2822
#ifdef PAVGB
2823
    if(count <= -8) {
2824
        count += 7;
2825
        __asm__ volatile(
2826
            "pcmpeqw        %%mm7, %%mm7        \n\t"
2827
            "psrlw             $8, %%mm7        \n\t"
2828
            "1:                                \n\t"
2829
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
2830
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
2831
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
2832
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
2833
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
2834
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
2835
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
2836
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
2837
            "psrlw             $8, %%mm0        \n\t"
2838
            "psrlw             $8, %%mm1        \n\t"
2839
            "psrlw             $8, %%mm2        \n\t"
2840
            "psrlw             $8, %%mm3        \n\t"
2841
            "packuswb       %%mm1, %%mm0        \n\t"
2842
            "packuswb       %%mm3, %%mm2        \n\t"
2843
            "movq           %%mm0, %%mm1        \n\t"
2844
            "movq           %%mm2, %%mm3        \n\t"
2845
            "psrlw             $8, %%mm0        \n\t"
2846
            "psrlw             $8, %%mm2        \n\t"
2847
            "pand           %%mm7, %%mm1        \n\t"
2848
            "pand           %%mm7, %%mm3        \n\t"
2849
            "packuswb       %%mm2, %%mm0        \n\t"
2850
            "packuswb       %%mm3, %%mm1        \n\t"
2851
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
2852
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
2853
            "add               $8, %0           \n\t"
2854
            " js 1b                            \n\t"
2855
            : "+r"(count)
2856
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2857
        );
2858
        count -= 7;
2859
    }
2860
#endif
2861
    src0++;
2862
    src1++;
2863
    while(count<0) {
2864
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2865
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2866
        count++;
2867
    }
2868
}
static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2871
                                      long width, long height,
2872
                                      long lumStride, long chromStride, long srcStride)
2873
{
2874
    long y;
2875
    const long chromWidth= -((-width)>>1);
2876

    
2877
    for (y=0; y<height; y++) {
2878
        RENAME(extract_even)(src, ydst, width);
2879
        if(y&1) {
2880
            RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2881
            udst+= chromStride;
2882
            vdst+= chromStride;
2883
        }
2884

    
2885
        src += srcStride;
2886
        ydst+= lumStride;
2887
    }
2888
#if HAVE_MMX
2889
    __asm__(
2890
            EMMS"       \n\t"
2891
            SFENCE"     \n\t"
2892
            ::: "memory"
2893
        );
2894
#endif
2895
}
static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2898
                                      long width, long height,
2899
                                      long lumStride, long chromStride, long srcStride)
2900
{
2901
    long y;
2902
    const long chromWidth= -((-width)>>1);
2903

    
2904
    for (y=0; y<height; y++) {
2905
        RENAME(extract_even)(src, ydst, width);
2906
        RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2907

    
2908
        src += srcStride;
2909
        ydst+= lumStride;
2910
        udst+= chromStride;
2911
        vdst+= chromStride;
2912
    }
2913
#if HAVE_MMX
2914
    __asm__(
2915
            EMMS"       \n\t"
2916
            SFENCE"     \n\t"
2917
            ::: "memory"
2918
        );
2919
#endif
2920
}
static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2923
                                      long width, long height,
2924
                                      long lumStride, long chromStride, long srcStride)
2925
{
2926
    long y;
2927
    const long chromWidth= -((-width)>>1);
2928

    
2929
    for (y=0; y<height; y++) {
2930
        RENAME(extract_even)(src+1, ydst, width);
2931
        if(y&1) {
2932
            RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2933
            udst+= chromStride;
2934
            vdst+= chromStride;
2935
        }
2936

    
2937
        src += srcStride;
2938
        ydst+= lumStride;
2939
    }
2940
#if HAVE_MMX
2941
    __asm__(
2942
            EMMS"       \n\t"
2943
            SFENCE"     \n\t"
2944
            ::: "memory"
2945
        );
2946
#endif
2947
}
static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2950
                                      long width, long height,
2951
                                      long lumStride, long chromStride, long srcStride)
2952
{
2953
    long y;
2954
    const long chromWidth= -((-width)>>1);
2955

    
2956
    for (y=0; y<height; y++) {
2957
        RENAME(extract_even)(src+1, ydst, width);
2958
        RENAME(extract_even2)(src, udst, vdst, chromWidth);
2959

    
2960
        src += srcStride;
2961
        ydst+= lumStride;
2962
        udst+= chromStride;
2963
        vdst+= chromStride;
2964
    }
2965
#if HAVE_MMX
2966
    __asm__(
2967
            EMMS"       \n\t"
2968
            SFENCE"     \n\t"
2969
            ::: "memory"
2970
        );
2971
#endif
2972
}
static inline void RENAME(rgb2rgb_init)(void)
2975
{
2976
    rgb15to16       = RENAME(rgb15to16);
2977
    rgb15tobgr24    = RENAME(rgb15tobgr24);
2978
    rgb15to32       = RENAME(rgb15to32);
2979
    rgb16tobgr24    = RENAME(rgb16tobgr24);
2980
    rgb16to32       = RENAME(rgb16to32);
2981
    rgb16to15       = RENAME(rgb16to15);
2982
    rgb24tobgr16    = RENAME(rgb24tobgr16);
2983
    rgb24tobgr15    = RENAME(rgb24tobgr15);
2984
    rgb24tobgr32    = RENAME(rgb24tobgr32);
2985
    rgb32to16       = RENAME(rgb32to16);
2986
    rgb32to15       = RENAME(rgb32to15);
2987
    rgb32tobgr24    = RENAME(rgb32tobgr24);
2988
    rgb24to15       = RENAME(rgb24to15);
2989
    rgb24to16       = RENAME(rgb24to16);
2990
    rgb24tobgr24    = RENAME(rgb24tobgr24);
2991
    rgb32tobgr32    = RENAME(rgb32tobgr32);
2992
    rgb32tobgr16    = RENAME(rgb32tobgr16);
2993
    rgb32tobgr15    = RENAME(rgb32tobgr15);
2994
    yv12toyuy2      = RENAME(yv12toyuy2);
2995
    yv12touyvy      = RENAME(yv12touyvy);
2996
    yuv422ptoyuy2   = RENAME(yuv422ptoyuy2);
2997
    yuv422ptouyvy   = RENAME(yuv422ptouyvy);
2998
    yuy2toyv12      = RENAME(yuy2toyv12);
2999
//    yvu9toyv12      = RENAME(yvu9toyv12);
3000
    planar2x        = RENAME(planar2x);
3001
    rgb24toyv12     = RENAME(rgb24toyv12);
3002
    interleaveBytes = RENAME(interleaveBytes);
3003
    vu9_to_vu12     = RENAME(vu9_to_vu12);
3004
    yvu9_to_yuy2    = RENAME(yvu9_to_yuy2);
3005

    
3006
    uyvytoyuv420    = RENAME(uyvytoyuv420);
3007
    uyvytoyuv422    = RENAME(uyvytoyuv422);
3008
    yuyvtoyuv420    = RENAME(yuyvtoyuv420);
3009
    yuyvtoyuv422    = RENAME(yuyvtoyuv422);
3010
}