Statistics
| Branch: | Revision:

ffmpeg / libswscale / x86 / rgb2rgb_template.c @ c0038328

History | View | Annotate | Download (113 KB)

1
/*
2
 * software RGB to RGB converter
3
 * pluralize by software PAL8 to RGB converter
4
 *              software YUV to YUV converter
5
 *              software YUV to RGB converter
6
 * Written by Nick Kurshev.
7
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8
 * lot of big-endian byte order fixes by Alex Beregszaszi
9
 *
10
 * This file is part of Libav.
11
 *
12
 * Libav is free software; you can redistribute it and/or
13
 * modify it under the terms of the GNU Lesser General Public
14
 * License as published by the Free Software Foundation; either
15
 * version 2.1 of the License, or (at your option) any later version.
16
 *
17
 * Libav is distributed in the hope that it will be useful,
18
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
 * Lesser General Public License for more details.
21
 *
22
 * You should have received a copy of the GNU Lesser General Public
23
 * License along with Libav; if not, write to the Free Software
24
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25
 */
26

    
27
#include <stddef.h>

/* This template is compiled multiple times with different
 * COMPILE_TEMPLATE_* settings (plain C, MMX, MMX2, 3DNow!, SSE2),
 * so first clear any macros left over from a previous instantiation. */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PAVGB

/* Width in bytes of the SIMD registers used by this instantiation. */
#if COMPILE_TEMPLATE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/* Prefetch and packed-byte-average mnemonics, per CPU flavor;
 * without a usable instruction PREFETCH expands to an asm comment. */
#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PAVGB     "pavgusb"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#define PAVGB     "pavgb"
#else
#define PREFETCH  " # nop"
#endif

#if COMPILE_TEMPLATE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Non-temporal quadword store and the store fence needed to order it
 * exist only with MMX2; otherwise fall back to a plain movq / no-op. */
#if COMPILE_TEMPLATE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
66

    
67
/* Convert packed 24-bit pixels to 32-bit pixels, forcing the added
 * fourth (alpha) byte to 255 (in the MMX path via OR with mask32a —
 * presumably an all-alpha mask; defined outside this template).
 * The MMX loop handles 8 pixels (24 src / 32 dst bytes) per iteration;
 * the scalar tail loop finishes the remainder. */
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    mm_end = end - 23;  /* loop needs a full 24 source bytes per pass */
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            /* gather four pairs of 3-byte pixels into mm0..mm3 */
            "movd          %1, %%mm0    \n\t"
            "punpckldq    3%1, %%mm0    \n\t"
            "movd         6%1, %%mm1    \n\t"
            "punpckldq    9%1, %%mm1    \n\t"
            "movd        12%1, %%mm2    \n\t"
            "punpckldq   15%1, %%mm2    \n\t"
            "movd        18%1, %%mm3    \n\t"
            "punpckldq   21%1, %%mm3    \n\t"
            /* set the alpha byte of every pixel */
            "por        %%mm7, %%mm0    \n\t"
            "por        %%mm7, %%mm1    \n\t"
            "por        %%mm7, %%mm2    \n\t"
            "por        %%mm7, %%mm3    \n\t"
            MOVNTQ"     %%mm0,   %0     \n\t"
            MOVNTQ"     %%mm1,  8%0     \n\t"
            MOVNTQ"     %%mm2, 16%0     \n\t"
            MOVNTQ"     %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail (and the whole conversion when MMX is not compiled in) */
    while (s < end) {
#if HAVE_BIGENDIAN
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
        *dest++ = 255;
        *dest++ = s[2];
        *dest++ = s[1];
        *dest++ = s[0];
        s+=3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
#endif
    }
}
124

    
125
/* Repack eight 32-bit pixels held in mm0/mm2, mm1/mm3, mm4/mm6, mm5/mm7
 * (each pair holds a copy of the same data) into 24 bytes of packed
 * 24-bit output and store them with three MOVNTQs at %0, 8%0 and 16%0.
 * The mask24* constants are defined outside this template.
 * NOTE: comments cannot be placed inside the continued macro body
 * without breaking the backslash continuations, so the body is left
 * untouched. */
#define STORE_BGR24_MMX \
            "psrlq         $8, %%mm2    \n\t" \
            "psrlq         $8, %%mm3    \n\t" \
            "psrlq         $8, %%mm6    \n\t" \
            "psrlq         $8, %%mm7    \n\t" \
            "pand "MANGLE(mask24l)", %%mm0\n\t" \
            "pand "MANGLE(mask24l)", %%mm1\n\t" \
            "pand "MANGLE(mask24l)", %%mm4\n\t" \
            "pand "MANGLE(mask24l)", %%mm5\n\t" \
            "pand "MANGLE(mask24h)", %%mm2\n\t" \
            "pand "MANGLE(mask24h)", %%mm3\n\t" \
            "pand "MANGLE(mask24h)", %%mm6\n\t" \
            "pand "MANGLE(mask24h)", %%mm7\n\t" \
            "por        %%mm2, %%mm0    \n\t" \
            "por        %%mm3, %%mm1    \n\t" \
            "por        %%mm6, %%mm4    \n\t" \
            "por        %%mm7, %%mm5    \n\t" \
 \
            "movq       %%mm1, %%mm2    \n\t" \
            "movq       %%mm4, %%mm3    \n\t" \
            "psllq        $48, %%mm2    \n\t" \
            "psllq        $32, %%mm3    \n\t" \
            "pand "MANGLE(mask24hh)", %%mm2\n\t" \
            "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
            "por        %%mm2, %%mm0    \n\t" \
            "psrlq        $16, %%mm1    \n\t" \
            "psrlq        $32, %%mm4    \n\t" \
            "psllq        $16, %%mm5    \n\t" \
            "por        %%mm3, %%mm1    \n\t" \
            "pand  "MANGLE(mask24hhhh)", %%mm5\n\t" \
            "por        %%mm5, %%mm4    \n\t" \
 \
            MOVNTQ"     %%mm0,   %0     \n\t" \
            MOVNTQ"     %%mm1,  8%0     \n\t" \
            MOVNTQ"     %%mm4, 16%0"
160

    
161

    
162
/* Convert packed 32-bit pixels to 24-bit pixels by dropping the fourth
 * (alpha) byte of each pixel. The MMX loop consumes 32 source bytes
 * (8 pixels) per iteration and uses STORE_BGR24_MMX to emit 24 bytes;
 * the scalar tail loop finishes the remainder. */
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    mm_end = end - 31;  /* loop needs a full 32 source bytes per pass */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            /* load 8 pixels and duplicate each quadword; STORE_BGR24_MMX
             * expects the copies in mm2/mm3/mm6/mm7 */
            "movq          %1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq        16%1, %%mm4    \n\t"
            "movq        24%1, %%mm5    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm1, %%mm3    \n\t"
            "movq       %%mm4, %%mm6    \n\t"
            "movq       %%mm5, %%mm7    \n\t"
            STORE_BGR24_MMX
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail (and the whole conversion when MMX is not compiled in) */
    while (s < end) {
#if HAVE_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
        s++;
        dest[2] = *s++;
        dest[1] = *s++;
        dest[0] = *s++;
        dest += 3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
#endif
    }
}
211

    
212
/*
213
 original by Strepto/Astral
214
 ported to gcc & bugfixed: A'rpi
215
 MMX2, 3DNOW optimization by Nick Kurshev
216
 32-bit C version, and and&add trick by Michael Niedermayer
217
*/
218
/* Convert RGB555 to RGB565 in place-compatible fashion: the red and
 * green fields (bits 5..14, selected by mask 0x7FE07FE0) are shifted
 * left by one via the and&add trick x + (x & mask), while blue stays
 * put. Processes 16 bytes per MMX pass (mask15s is presumably the
 * 0x7FE0 word mask — defined outside this template), then 4 bytes per
 * pass in C, then one final 16-bit pixel if src_size is odd in pixels. */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1         \n\t"
            "movq        %1, %%mm0  \n\t"
            "movq       8%1, %%mm2  \n\t"
            "movq     %%mm0, %%mm1  \n\t"
            "movq     %%mm2, %%mm3  \n\t"
            /* x + (x & mask): doubles (shifts left) the masked R/G bits */
            "pand     %%mm4, %%mm0  \n\t"
            "pand     %%mm4, %%mm2  \n\t"
            "paddw    %%mm1, %%mm0  \n\t"
            "paddw    %%mm3, %%mm2  \n\t"
            MOVNTQ"   %%mm0,  %0    \n\t"
            MOVNTQ"   %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* 32-bit C version: two pixels at a time with the same and&add trick */
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* at most one 16-bit pixel can remain */
    if (s < end) {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
263

    
264
/* Convert RGB565 to RGB555: red and green (selected after a 1-bit right
 * shift by mask 0x7FE07FE0) move down one bit, blue (0x001F001F) is kept
 * as-is; the lowest green bit is discarded. mask15rg/mask15b are defined
 * outside this template. 16 bytes per MMX pass, 4 bytes per C pass,
 * then one final 16-bit pixel. */
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH"  32%1         \n\t"
            "movq        %1, %%mm0  \n\t"
            "movq       8%1, %%mm2  \n\t"
            "movq     %%mm0, %%mm1  \n\t"
            "movq     %%mm2, %%mm3  \n\t"
            /* (x >> 1) & rg-mask  |  x & b-mask */
            "psrlq       $1, %%mm0  \n\t"
            "psrlq       $1, %%mm2  \n\t"
            "pand     %%mm7, %%mm0  \n\t"
            "pand     %%mm7, %%mm2  \n\t"
            "pand     %%mm6, %%mm1  \n\t"
            "pand     %%mm6, %%mm3  \n\t"
            "por      %%mm1, %%mm0  \n\t"
            "por      %%mm3, %%mm2  \n\t"
            MOVNTQ"   %%mm0,  %0    \n\t"
            MOVNTQ"   %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* 32-bit C version: two pixels at a time */
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* at most one 16-bit pixel can remain */
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}
314

    
315
/* Convert 32-bit pixels to 16-bit 5-6-5 pixels. Two alternative MMX
 * implementations are kept: the active one (#if 1) uses pmaddwd with
 * the mul3216/mask3216* constants (defined outside this template) to
 * combine the field shifts into one multiply-add; the disabled one
 * does explicit shift/mask/or per field. The C loop at the end handles
 * the tail and the non-MMX build. */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    mm_end = end - 15;  /* 16 source bytes (4 pixels) per pass */
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        "movq           %3, %%mm5   \n\t"
        "movq           %4, %%mm6   \n\t"
        "movq           %5, %%mm7   \n\t"
        "jmp 2f                     \n\t"
        ".p2align        4          \n\t"
        "1:                         \n\t"
        PREFETCH"   32(%1)          \n\t"
        "movd         (%1), %%mm0   \n\t"
        "movd        4(%1), %%mm3   \n\t"
        "punpckldq   8(%1), %%mm0   \n\t"
        "punpckldq  12(%1), %%mm3   \n\t"
        "movq        %%mm0, %%mm1   \n\t"
        "movq        %%mm3, %%mm4   \n\t"
        /* blue+red via pmaddwd, green via plain mask, then merge */
        "pand        %%mm6, %%mm0   \n\t"
        "pand        %%mm6, %%mm3   \n\t"
        "pmaddwd     %%mm7, %%mm0   \n\t"
        "pmaddwd     %%mm7, %%mm3   \n\t"
        "pand        %%mm5, %%mm1   \n\t"
        "pand        %%mm5, %%mm4   \n\t"
        "por         %%mm1, %%mm0   \n\t"
        "por         %%mm4, %%mm3   \n\t"
        "psrld          $5, %%mm0   \n\t"
        "pslld         $11, %%mm3   \n\t"
        "por         %%mm3, %%mm0   \n\t"
        MOVNTQ"      %%mm0, (%0)    \n\t"
        "add           $16,  %1     \n\t"
        "add            $8,  %0     \n\t"
        "2:                         \n\t"
        "cmp            %2,  %1     \n\t"
        " jb            1b          \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
#else
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq    %0, %%mm7    \n\t"
        "movq    %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: pack each 32-bit pixel into 5-6-5 */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
410

    
411
/* Convert 32-bit pixels to 16-bit 5-6-5 pixels with red and blue
 * swapped relative to rgb32to16 (scalar tail shifts the low byte UP
 * by 8 and the high color byte DOWN by 19). red_16mask/green_16mask/
 * blue_16mask are defined outside this template. 4 pixels per MMX
 * pass, scalar tail for the rest. */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;  /* 16 source bytes (4 pixels) per pass */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            /* three copies shifted/masked per field, then OR-merged */
            "psllq         $8, %%mm0    \n\t"
            "psllq         $8, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: pack each 32-bit pixel, swapping R and B */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}
469

    
470
/* Convert 32-bit pixels to 15-bit 5-5-5 pixels. Mirrors rgb32to16 but
 * with 5-bit green: the active MMX path (#if 1) uses pmaddwd with
 * mul3215/mask3215g/mask3216br (defined outside this template) and
 * shifts by 6/10 instead of 5/11; the disabled path does explicit
 * shift/mask/or. Scalar tail handles the remainder and non-MMX builds. */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    mm_end = end - 15;  /* 16 source bytes (4 pixels) per pass */
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        "movq           %3, %%mm5   \n\t"
        "movq           %4, %%mm6   \n\t"
        "movq           %5, %%mm7   \n\t"
        "jmp            2f          \n\t"
        ".p2align        4          \n\t"
        "1:                         \n\t"
        PREFETCH"   32(%1)          \n\t"
        "movd         (%1), %%mm0   \n\t"
        "movd        4(%1), %%mm3   \n\t"
        "punpckldq   8(%1), %%mm0   \n\t"
        "punpckldq  12(%1), %%mm3   \n\t"
        "movq        %%mm0, %%mm1   \n\t"
        "movq        %%mm3, %%mm4   \n\t"
        /* blue+red via pmaddwd, green via plain mask, then merge */
        "pand        %%mm6, %%mm0   \n\t"
        "pand        %%mm6, %%mm3   \n\t"
        "pmaddwd     %%mm7, %%mm0   \n\t"
        "pmaddwd     %%mm7, %%mm3   \n\t"
        "pand        %%mm5, %%mm1   \n\t"
        "pand        %%mm5, %%mm4   \n\t"
        "por         %%mm1, %%mm0   \n\t"
        "por         %%mm4, %%mm3   \n\t"
        "psrld          $6, %%mm0   \n\t"
        "pslld         $10, %%mm3   \n\t"
        "por         %%mm3, %%mm0   \n\t"
        MOVNTQ"      %%mm0, (%0)    \n\t"
        "add           $16,  %1     \n\t"
        "add            $8,  %0     \n\t"
        "2:                         \n\t"
        "cmp            %2,  %1     \n\t"
        " jb            1b          \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
#else
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $9, %%mm2    \n\t"
            "psrlq         $9, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: pack each 32-bit pixel into 5-5-5 */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}
565

    
566
/* Convert 32-bit pixels to 15-bit 5-5-5 pixels with red and blue
 * swapped relative to rgb32to15 (scalar tail shifts the low byte UP
 * by 7 and the high color byte DOWN by 19). red_15mask/green_15mask/
 * blue_15mask are defined outside this template. 4 pixels per MMX
 * pass, scalar tail for the rest. */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;  /* 16 source bytes (4 pixels) per pass */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         4%1, %%mm3    \n\t"
            "punpckldq    8%1, %%mm0    \n\t"
            "punpckldq   12%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            /* three copies shifted/masked per field, then OR-merged */
            "psllq         $7, %%mm0    \n\t"
            "psllq         $7, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: pack each 32-bit pixel, swapping R and B */
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}
624

    
625
/* Convert packed 24-bit pixels to 16-bit 5-6-5 pixels with the channel
 * order swapped: the scalar tail reads the bytes as b,g,r. The MMX loop
 * consumes 12 source bytes (4 pixels) per pass using unaligned
 * movd/punpckldq loads at 3-byte offsets. */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;  /* 12 source bytes (4 pixels) per pass */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            /* three copies shifted/masked per field, then OR-merged */
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "psrlq         $8, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: bytes read in b,g,r order, packed as 5-6-5 */
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
685

    
686
/* Convert packed 24-bit pixels to 16-bit 5-6-5 pixels; the scalar tail
 * reads the bytes as r,g,b (swapped relative to rgb24tobgr16, whose
 * shift pattern this otherwise mirrors). The MMX loop consumes 12
 * source bytes (4 pixels) per pass.
 * NOTE(review): mm_end here is end - 15 although only 12 bytes are
 * read per pass, unlike rgb24tobgr16's end - 11 — intentional extra
 * headroom or historical quirk; verify against upstream before changing. */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            /* three copies shifted/masked per field, then OR-merged */
            "psllq         $8, %%mm0    \n\t"
            "psllq         $8, %%mm3    \n\t"
            "pand       %%mm7, %%mm0    \n\t"
            "pand       %%mm7, %%mm3    \n\t"
            "psrlq         $5, %%mm1    \n\t"
            "psrlq         $5, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq        $19, %%mm2    \n\t"
            "psrlq        $19, %%mm5    \n\t"
            "pand          %2, %%mm2    \n\t"
            "pand          %2, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: bytes read in r,g,b order, packed as 5-6-5 */
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
746

    
747
/* Convert packed 24-bit pixels to 15-bit 5-5-5 pixels; the scalar tail
 * reads the bytes as b,g,r. The MMX loop consumes 12 source bytes
 * (4 pixels) per pass, using the 15-bit mask set (red_15mask/
 * green_15mask/blue_15mask, defined outside this template). */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;  /* 12 source bytes (4 pixels) per pass */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            "movd          %1, %%mm0    \n\t"
            "movd         3%1, %%mm3    \n\t"
            "punpckldq    6%1, %%mm0    \n\t"
            "punpckldq    9%1, %%mm3    \n\t"
            "movq       %%mm0, %%mm1    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm3, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            /* three copies shifted/masked per field, then OR-merged */
            "psrlq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm3    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %2, %%mm3    \n\t"
            "psrlq         $6, %%mm1    \n\t"
            "psrlq         $6, %%mm4    \n\t"
            "pand       %%mm6, %%mm1    \n\t"
            "pand       %%mm6, %%mm4    \n\t"
            "psrlq         $9, %%mm2    \n\t"
            "psrlq         $9, %%mm5    \n\t"
            "pand       %%mm7, %%mm2    \n\t"
            "pand       %%mm7, %%mm5    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            "psllq        $16, %%mm3    \n\t"
            "por        %%mm3, %%mm0    \n\t"
            MOVNTQ"     %%mm0, %0       \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: bytes read in b,g,r order, packed as 5-5-5 */
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
807

    
808
/**
 * Convert packed 24-bit RGB (R, G, B byte order) to 15-bit RGB555.
 * Each output pixel is (r>>3)<<10 | (g>>3)<<5 | (b>>3), stored as uint16_t.
 * The MMX path packs 4 pixels (12 source bytes) per iteration; the scalar
 * loop below it handles the remainder (and everything when MMX is off).
 * NOTE(review): red_15mask/green_15mask/blue_15mask are defined elsewhere
 * in this file's template context.
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
    /* mm7 = red mask, mm6 = green mask; blue mask is passed per iteration as %2 */
    __asm__ volatile(
        "movq         %0, %%mm7     \n\t"
        "movq         %1, %%mm6     \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /* stop 15 bytes early so the 12-byte-per-pass loads never read past end */
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"   32%1            \n\t"
            /* load 4 RGB24 pixels: two in mm0, two in mm3 */
            "movd         %1, %%mm0     \n\t"
            "movd        3%1, %%mm3     \n\t"
            "punpckldq   6%1, %%mm0     \n\t"
            "punpckldq   9%1, %%mm3     \n\t"
            "movq      %%mm0, %%mm1     \n\t"
            "movq      %%mm0, %%mm2     \n\t"
            "movq      %%mm3, %%mm4     \n\t"
            "movq      %%mm3, %%mm5     \n\t"
            /* shift each channel into its RGB555 position, then mask */
            "psllq        $7, %%mm0     \n\t"
            "psllq        $7, %%mm3     \n\t"
            "pand      %%mm7, %%mm0     \n\t"
            "pand      %%mm7, %%mm3     \n\t"
            "psrlq        $6, %%mm1     \n\t"
            "psrlq        $6, %%mm4     \n\t"
            "pand      %%mm6, %%mm1     \n\t"
            "pand      %%mm6, %%mm4     \n\t"
            "psrlq       $19, %%mm2     \n\t"
            "psrlq       $19, %%mm5     \n\t"
            "pand         %2, %%mm2     \n\t"
            "pand         %2, %%mm5     \n\t"
            /* merge R|G|B per pixel pair, then combine both pairs */
            "por       %%mm1, %%mm0     \n\t"
            "por       %%mm4, %%mm3     \n\t"
            "por       %%mm2, %%mm0     \n\t"
            "por       %%mm5, %%mm3     \n\t"
            "psllq       $16, %%mm3     \n\t"
            "por       %%mm3, %%mm0     \n\t"
            MOVNTQ"    %%mm0, %0        \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* scalar tail: first source byte is R here (cf. the bgr variant above) */
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
868

    
869
/*
  I use less accurate approximation here by simply left-shifting the input
  value and filling the low order bits with zeroes. This method improves PNG
  compression but this scheme cannot reproduce white exactly, since it does
  not generate an all-ones maximum value; the net effect is to darken the
  image slightly.

  The better method should be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      leftmost bits repeated to fill open bits
       |
   original bits
*/
890
/**
 * Convert 15-bit RGB555 to packed 24-bit BGR (B, G, R byte order).
 * Channels are expanded by plain left shift (low bits zero-filled; see the
 * approximation note above). The MMX path expands 8 pixels per iteration
 * into register state that a second asm statement packs down to 24 bytes
 * via STORE_BGR24_MMX; registers mm0/mm3/mm6/mm7 carry live data between
 * the two asm blocks, so their order must not change.
 */
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    mm_end = end - 7;   /* leave <8 pixels for the scalar tail */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            /* first 4 pixels: isolate B/G/R fields and shift to byte position */
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            /* widen 16-bit lanes to 32-bit with zeros (%5 = mmx_null) */
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            /* assemble 00RRGGBB dwords */
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"

            /* park first 4 expanded pixels in mm6/mm7 */
            "movq       %%mm0, %%mm6    \n\t"
            "movq       %%mm3, %%mm7    \n\t"

            /* second 4 pixels: identical expansion from offset 8 */
            "movq         8%1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq         8%1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"

            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq       %%mm0, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "movq       %%mm6, %%mm0    \n\t"
            "movq       %%mm7, %%mm1    \n\t"

            "movq       %%mm4, %%mm6    \n\t"
            "movq       %%mm5, %%mm7    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm1, %%mm3    \n\t"

            STORE_BGR24_MMX

            :"=m"(*d)
            :"m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;     /* B: bits 0-4  */
        *d++ = (bgr&0x3E0)>>2;    /* G: bits 5-9  */
        *d++ = (bgr&0x7C00)>>7;   /* R: bits 10-14 */
    }
}
996

    
997
/**
 * Convert 16-bit RGB565 to packed 24-bit BGR (B, G, R byte order).
 * Same two-stage structure as rgb15tobgr24 above, with 565 field masks and
 * shifts (G has 6 bits here). Registers mm0/mm3/mm6/mm7 carry live pixel
 * data from the expansion asm into the STORE_BGR24_MMX packing asm.
 */
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    mm_end = end - 7;   /* leave <8 pixels for the scalar tail */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            /* first 4 pixels: isolate 565 fields, shift into byte position */
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm1    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            /* widen 16-bit lanes to 32-bit with zeros (%5 = mmx_null) */
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            /* assemble 00RRGGBB dwords */
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"

            /* park first 4 expanded pixels in mm6/mm7 */
            "movq       %%mm0, %%mm6    \n\t"
            "movq       %%mm3, %%mm7    \n\t"

            /* second 4 pixels: identical expansion from offset 8 */
            "movq         8%1, %%mm0    \n\t"
            "movq         8%1, %%mm1    \n\t"
            "movq         8%1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm1    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            "movq       %%mm0, %%mm3    \n\t"
            "movq       %%mm1, %%mm4    \n\t"
            "movq       %%mm2, %%mm5    \n\t"
            "punpcklwd     %5, %%mm0    \n\t"
            "punpcklwd     %5, %%mm1    \n\t"
            "punpcklwd     %5, %%mm2    \n\t"
            "punpckhwd     %5, %%mm3    \n\t"
            "punpckhwd     %5, %%mm4    \n\t"
            "punpckhwd     %5, %%mm5    \n\t"
            "psllq         $8, %%mm1    \n\t"
            "psllq        $16, %%mm2    \n\t"
            "por        %%mm1, %%mm0    \n\t"
            "por        %%mm2, %%mm0    \n\t"
            "psllq         $8, %%mm4    \n\t"
            "psllq        $16, %%mm5    \n\t"
            "por        %%mm4, %%mm3    \n\t"
            "por        %%mm5, %%mm3    \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq       %%mm0, %%mm4    \n\t"
            "movq       %%mm3, %%mm5    \n\t"
            "movq       %%mm6, %%mm0    \n\t"
            "movq       %%mm7, %%mm1    \n\t"

            "movq       %%mm4, %%mm6    \n\t"
            "movq       %%mm5, %%mm7    \n\t"
            "movq       %%mm0, %%mm2    \n\t"
            "movq       %%mm1, %%mm3    \n\t"

            STORE_BGR24_MMX

            :"=m"(*d)
            :"m"(*s)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;     /* B: bits 0-4   */
        *d++ = (bgr&0x7E0)>>3;    /* G: bits 5-10  */
        *d++ = (bgr&0xF800)>>8;   /* R: bits 11-15 */
    }
}
1102

    
1103
/*
 * Interleave four channel vectors into four 32-bit pixels with an all-ones
 * (255) fourth byte and store 16 bytes at operand %0. Expected inputs:
 *
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb   %%mm7, %%mm0    \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb   %%mm7, %%mm1    \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb   %%mm7, %%mm2    \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw  %%mm1, %%mm0    \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw  %%mm6, %%mm2    \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq       %%mm0, %%mm3    \n\t"                               \
    "punpcklwd  %%mm2, %%mm0    \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd  %%mm2, %%mm3    \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ"     %%mm0,  %0      \n\t"                               \
    MOVNTQ"     %%mm3, 8%0      \n\t"                               \

    
1122
/**
 * Convert 15-bit RGB555 to 32-bit pixels with a 255 alpha/filler byte.
 * Output byte order depends on endianness (see the scalar tail). The MMX
 * path expands the 555 fields into per-channel word vectors and packs 4
 * pixels per iteration via PACK_RGB32.
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* mm7 = zeros, mm6 = all-ones, as PACK_RGB32 requires */
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
    mm_end = end - 3;   /* leave <4 pixels for the scalar tail */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            /* isolate B/G/R 555 fields and shift each into its low byte */
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $2, %%mm1    \n\t"
            "psrlq         $7, %%mm2    \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
#endif
    }
}
1174

    
1175
/**
 * Convert 16-bit RGB565 to 32-bit pixels with a 255 alpha/filler byte.
 * Identical structure to rgb15to32 above, with 565 masks/shifts
 * (G has 6 bits). 4 pixels per MMX iteration via PACK_RGB32.
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if COMPILE_TEMPLATE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* mm7 = zeros, mm6 = all-ones, as PACK_RGB32 requires */
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
    mm_end = end - 3;   /* leave <4 pixels for the scalar tail */
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"    32%1           \n\t"
            /* isolate B/G/R 565 fields and shift each into its low byte */
            "movq          %1, %%mm0    \n\t"
            "movq          %1, %%mm1    \n\t"
            "movq          %1, %%mm2    \n\t"
            "pand          %2, %%mm0    \n\t"
            "pand          %3, %%mm1    \n\t"
            "pand          %4, %%mm2    \n\t"
            "psllq         $3, %%mm0    \n\t"
            "psrlq         $3, %%mm1    \n\t"
            "psrlq         $8, %%mm2    \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
#if HAVE_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
#endif
    }
}
1227

    
1228
/**
 * Swap bytes 0 and 2 of every 32-bit pixel (keeping bytes 1 and 3), i.e.
 * byte order 2,1,0,3 — e.g. RGBA <-> BGRA. The loop counter runs from a
 * negative index up to 0 so the asm can use a single register as both
 * index and termination test; s/d are pre-biased by -idx so (base, idx)
 * addresses the real data. The scalar loop finishes any remainder
 * (src_size is handled 16 bytes per asm pass, 4 per scalar pass).
 */
static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, long src_size)
{
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "test          %0, %0           \n\t"
        "jns           2f               \n\t"   /* fewer than 16 bytes: skip asm loop */
        PREFETCH"       (%1, %0)        \n\t"
        /* build masks: mm7 selects the kept bytes, mm6 the swapped ones */
        "movq          %3, %%mm7        \n\t"
        "pxor          %4, %%mm7        \n\t"
        "movq       %%mm7, %%mm6        \n\t"
        "pxor          %5, %%mm7        \n\t"
        ".p2align       4               \n\t"
        "1:                             \n\t"
        PREFETCH"     32(%1, %0)        \n\t"
        "movq           (%1, %0), %%mm0 \n\t"
        "movq          8(%1, %0), %%mm1 \n\t"
# if COMPILE_TEMPLATE_MMX2
        /* pshufw $177 = 0b10110001: swap bytes 0<->2 within each dword */
        "pshufw      $177, %%mm0, %%mm3 \n\t"
        "pshufw      $177, %%mm1, %%mm5 \n\t"
        "pand       %%mm7, %%mm0        \n\t"
        "pand       %%mm6, %%mm3        \n\t"
        "pand       %%mm7, %%mm1        \n\t"
        "pand       %%mm6, %%mm5        \n\t"
        "por        %%mm3, %%mm0        \n\t"
        "por        %%mm5, %%mm1        \n\t"
# else
        /* plain-MMX fallback: shift the masked bytes both ways and merge */
        "movq       %%mm0, %%mm2        \n\t"
        "movq       %%mm1, %%mm4        \n\t"
        "pand       %%mm7, %%mm0        \n\t"
        "pand       %%mm6, %%mm2        \n\t"
        "pand       %%mm7, %%mm1        \n\t"
        "pand       %%mm6, %%mm4        \n\t"
        "movq       %%mm2, %%mm3        \n\t"
        "movq       %%mm4, %%mm5        \n\t"
        "pslld        $16, %%mm2        \n\t"
        "psrld        $16, %%mm3        \n\t"
        "pslld        $16, %%mm4        \n\t"
        "psrld        $16, %%mm5        \n\t"
        "por        %%mm2, %%mm0        \n\t"
        "por        %%mm4, %%mm1        \n\t"
        "por        %%mm3, %%mm0        \n\t"
        "por        %%mm5, %%mm1        \n\t"
# endif
        MOVNTQ"     %%mm0,  (%2, %0)    \n\t"
        MOVNTQ"     %%mm1, 8(%2, %0)    \n\t"
        "add          $16, %0           \n\t"
        "js            1b               \n\t"
        SFENCE"                         \n\t"
        EMMS"                           \n\t"
        "2:                             \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
#endif
    /* scalar tail: swap low/high bytes of each dword, keep the middle two */
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}
1291

    
1292
/**
 * Swap the first and third byte of every 24-bit pixel (RGB <-> BGR).
 * The MMX path processes 24 bytes (8 pixels) per iteration using the
 * mask24r/g/b constants and the same negative-index addressing trick as
 * shuffle_bytes_2103: REG_a runs from 23-src_size up to 0, with src/dst
 * pre-biased. The scalar loop handles any remainder.
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    unsigned i;
#if COMPILE_TEMPLATE_MMX
    x86_reg mmx_size= 23 - src_size;
    __asm__ volatile (
        "test             %%"REG_a", %%"REG_a"          \n\t"
        "jns                     2f                     \n\t"   /* fewer than 24 bytes: skip */
        "movq     "MANGLE(mask24r)", %%mm5              \n\t"
        "movq     "MANGLE(mask24g)", %%mm6              \n\t"
        "movq     "MANGLE(mask24b)", %%mm7              \n\t"
        ".p2align                 4                     \n\t"
        "1:                                             \n\t"
        PREFETCH" 32(%1, %%"REG_a")                     \n\t"
        /* each output qword merges R bytes (shifted), G bytes, B bytes */
        "movq       (%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
        "movq       (%1, %%"REG_a"), %%mm1              \n\t" // BGR BGR BG
        "movq      2(%1, %%"REG_a"), %%mm2              \n\t" // R BGR BGR B
        "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
        "pand                 %%mm5, %%mm0              \n\t"
        "pand                 %%mm6, %%mm1              \n\t"
        "pand                 %%mm7, %%mm2              \n\t"
        "por                  %%mm0, %%mm1              \n\t"
        "por                  %%mm2, %%mm1              \n\t"
        "movq      6(%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
        MOVNTQ"               %%mm1,   (%2, %%"REG_a")  \n\t" // RGB RGB RG
        "movq      8(%1, %%"REG_a"), %%mm1              \n\t" // R BGR BGR B
        "movq     10(%1, %%"REG_a"), %%mm2              \n\t" // GR BGR BGR
        "pand                 %%mm7, %%mm0              \n\t"
        "pand                 %%mm5, %%mm1              \n\t"
        "pand                 %%mm6, %%mm2              \n\t"
        "por                  %%mm0, %%mm1              \n\t"
        "por                  %%mm2, %%mm1              \n\t"
        "movq     14(%1, %%"REG_a"), %%mm0              \n\t" // R BGR BGR B
        MOVNTQ"               %%mm1,  8(%2, %%"REG_a")  \n\t" // B RGB RGB R
        "movq     16(%1, %%"REG_a"), %%mm1              \n\t" // GR BGR BGR
        "movq     18(%1, %%"REG_a"), %%mm2              \n\t" // BGR BGR BG
        "pand                 %%mm6, %%mm0              \n\t"
        "pand                 %%mm7, %%mm1              \n\t"
        "pand                 %%mm5, %%mm2              \n\t"
        "por                  %%mm0, %%mm1              \n\t"
        "por                  %%mm2, %%mm1              \n\t"
        MOVNTQ"               %%mm1, 16(%2, %%"REG_a")  \n\t"
        "add                    $24, %%"REG_a"          \n\t"
        " js                     1b                     \n\t"
        "2:                                             \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; //finished, was multiple of 8

    /* rewind src/dst to the start of the unconverted remainder */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    for (i=0; i<src_size; i+=3) {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
1360

    
1361
/**
 * Interleave planar YUV into packed YUY2 (Y0 U Y1 V per pair of pixels).
 * vertLumPerChroma is the number of luma lines sharing one chroma line
 * (2 for YV12, 1 for YUV422P) and must be a power of two — the chroma
 * advance test below uses it as a bitmask.
 * Paths: MMX asm (16 pixels/iteration), Alpha MVI asm (two lines at once),
 * generic 64-bit, and a portable 32-bit fallback.
 * NOTE(review): the asm loops assume width is a multiple of 16 (see the
 * comment on the yv12toyuy2 wrapper below).
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
#if COMPILE_TEMPLATE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            ".p2align                    4              \n\t"
            "1:                                         \n\t"
            PREFETCH"    32(%1, %%"REG_a", 2)           \n\t"
            PREFETCH"    32(%2, %%"REG_a")              \n\t"
            PREFETCH"    32(%3, %%"REG_a")              \n\t"
            "movq          (%2, %%"REG_a"), %%mm0       \n\t" // U(0)
            "movq                    %%mm0, %%mm2       \n\t" // U(0)
            "movq          (%3, %%"REG_a"), %%mm1       \n\t" // V(0)
            "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
            "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)

            "movq        (%1, %%"REG_a",2), %%mm3       \n\t" // Y(0)
            "movq       8(%1, %%"REG_a",2), %%mm5       \n\t" // Y(8)
            "movq                    %%mm3, %%mm4       \n\t" // Y(0)
            "movq                    %%mm5, %%mm6       \n\t" // Y(8)
            "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
            "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
            "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
            "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)

            MOVNTQ"                  %%mm3,   (%0, %%"REG_a", 4)    \n\t"
            MOVNTQ"                  %%mm4,  8(%0, %%"REG_a", 4)    \n\t"
            MOVNTQ"                  %%mm5, 16(%0, %%"REG_a", 4)    \n\t"
            MOVNTQ"                  %%mm6, 24(%0, %%"REG_a", 4)    \n\t"

            "add                        $8, %%"REG_a"   \n\t"
            "cmp                        %4, %%"REG_a"   \n\t"
            " jb                        1b              \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else

#if ARCH_ALPHA && HAVE_MVI
/* Alpha MVI: emit one output qword per line for two luma lines at once */
#define pl2yuy2(n)                  \
    y1 = yc[n];                     \
    y2 = yc2[n];                    \
    u = uc[n];                      \
    v = vc[n];                      \
    __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1));  \
    __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2));  \
    __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u));    \
    __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v));    \
    yuv1 = (u << 8) + (v << 24);                \
    yuv2 = yuv1 + y2;               \
    yuv1 += y1;                     \
    qdst[n]  = yuv1;                \
    qdst2[n] = yuv2;

        int i;
        uint64_t *qdst = (uint64_t *) dst;
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
        const uint32_t *yc = (uint32_t *) ysrc;
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
        for (i = 0; i < chromWidth; i += 8) {
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
            __asm__("ldq $31,64(%0)" :: "r"(yc));
            __asm__("ldq $31,64(%0)" :: "r"(yc2));
            __asm__("ldq $31,64(%0)" :: "r"(uc));
            __asm__("ldq $31,64(%0)" :: "r"(vc));

            pl2yuy2(0);
            pl2yuy2(1);
            pl2yuy2(2);
            pl2yuy2(3);

            yc    += 4;
            yc2   += 4;
            uc    += 4;
            vc    += 4;
            qdst  += 4;
            qdst2 += 4;
        }
        /* this path consumed two luma lines; advance past the second one */
        y++;
        ysrc += lumStride;
        dst += dstStride;

#elif HAVE_FAST_64BIT
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2) {
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                (yc[1] << 8) + (vc[0] << 0);
#else
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* advance chroma only every vertLumPerChroma-th luma line */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
#if COMPILE_TEMPLATE_MMX
    __asm__(EMMS"       \n\t"
            SFENCE"     \n\t"
            :::"memory");
#endif
}
1498

    
1499
/**
 * Convert planar YV12 (4:2:0) to packed YUY2.
 * Thin wrapper around yuvPlanartoyuy2 with vertLumPerChroma = 2
 * (two luma lines per chroma line).
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1510

    
1511
/**
 * Interleave planar YUV into packed UYVY (U Y0 V Y1 per pair of pixels).
 * Same structure as yuvPlanartoyuy2 above with the chroma bytes leading;
 * vertLumPerChroma must be a power of two (used as a bitmask below).
 * Paths: MMX asm (16 pixels/iteration), generic 64-bit, portable 32-bit.
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
#if COMPILE_TEMPLATE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor                %%"REG_a", %%"REG_a"    \n\t"
            ".p2align                   4               \n\t"
            "1:                                         \n\t"
            PREFETCH"   32(%1, %%"REG_a", 2)            \n\t"
            PREFETCH"   32(%2, %%"REG_a")               \n\t"
            PREFETCH"   32(%3, %%"REG_a")               \n\t"
            "movq         (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
            "movq                   %%mm0, %%mm2        \n\t" // U(0)
            "movq         (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
            "punpcklbw              %%mm1, %%mm0        \n\t" // UVUV UVUV(0)
            "punpckhbw              %%mm1, %%mm2        \n\t" // UVUV UVUV(8)

            "movq       (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
            "movq      8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
            "movq                   %%mm0, %%mm4        \n\t" // Y(0)
            "movq                   %%mm2, %%mm6        \n\t" // Y(8)
            "punpcklbw              %%mm3, %%mm0        \n\t" // YUYV YUYV(0)
            "punpckhbw              %%mm3, %%mm4        \n\t" // YUYV YUYV(4)
            "punpcklbw              %%mm5, %%mm2        \n\t" // YUYV YUYV(8)
            "punpckhbw              %%mm5, %%mm6        \n\t" // YUYV YUYV(12)

            MOVNTQ"                 %%mm0,   (%0, %%"REG_a", 4)     \n\t"
            MOVNTQ"                 %%mm4,  8(%0, %%"REG_a", 4)     \n\t"
            MOVNTQ"                 %%mm2, 16(%0, %%"REG_a", 4)     \n\t"
            MOVNTQ"                 %%mm6, 24(%0, %%"REG_a", 4)     \n\t"

            "add                       $8, %%"REG_a"    \n\t"
            "cmp                       %4, %%"REG_a"    \n\t"
            " jb                       1b               \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else
//FIXME adapt the Alpha ASM code from yv12->yuy2

#if HAVE_FAST_64BIT
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2) {
            uint64_t k, l;
            k = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
            l = uc[1] + (yc[2] << 8) +
                (vc[1] << 16) + (yc[3] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
                (vc[0] << 8) + (yc[1] << 0);
#else
            *idst++ = uc[0] + (yc[0] << 8) +
               (vc[0] << 16) + (yc[1] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* advance chroma only every vertLumPerChroma-th luma line */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#if COMPILE_TEMPLATE_MMX
    __asm__(EMMS"       \n\t"
            SFENCE"     \n\t"
            :::"memory");
#endif
}
1602

    
1603
/**
1604
 * Height should be a multiple of 2 and width should be a multiple of 16
1605
 * (If this is a problem for anyone then tell me, and I will fix it.)
1606
 */
1607
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1608
                                      long width, long height,
1609
                                      long lumStride, long chromStride, long dstStride)
1610
{
1611
    //FIXME interpolate chroma
1612
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1613
}
1614

    
1615
/**
1616
 * Width should be a multiple of 16.
1617
 */
1618
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1619
                                         long width, long height,
1620
                                         long lumStride, long chromStride, long dstStride)
1621
{
1622
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1623
}
1624

    
1625
/**
1626
 * Width should be a multiple of 16.
1627
 */
1628
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1629
                                         long width, long height,
1630
                                         long lumStride, long chromStride, long dstStride)
1631
{
1632
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1633
}
1634

    
1635
/**
 * Convert packed YUY2 (YUYV) input to planar YV12.
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chroma samples of the odd source lines are discarded: only even lines
 * contribute to udst/vdst (the second per-pair loop extracts luma only).
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;  // number of U/V samples (= YUYV pairs) per line
    // Two source lines per iteration: full Y/U/V split on the even line,
    // luma-only extraction on the odd line.
    for (y=0; y<height; y+=2) {
#if COMPILE_TEMPLATE_MMX
        // Even line: deinterleave 16 YUYV bytes per iteration into
        // 8 Y bytes, 4 U bytes and 4 V bytes.
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            "pcmpeqw                 %%mm7, %%mm7       \n\t"
            "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00...
            ".p2align                    4              \n\t"
            "1:                \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
            "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
            "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
            "movq                    %%mm0, %%mm2       \n\t" // YUYV YUYV(0)
            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(4)
            "psrlw                      $8, %%mm0       \n\t" // U0V0 U0V0(0)
            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(4)
            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
            "packuswb                %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(0)

            MOVNTQ"                  %%mm2, (%1, %%"REG_a", 2)  \n\t"

            "movq     16(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
            "movq     24(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(8)
            "movq                    %%mm2, %%mm4       \n\t" // YUYV YUYV(12)
            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(8)
            "psrlw                      $8, %%mm2       \n\t" // U0V0 U0V0(12)
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(8)
            "pand                    %%mm7, %%mm4       \n\t" // Y0Y0 Y0Y0(12)
            "packuswb                %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
            "packuswb                %%mm4, %%mm3       \n\t" // YYYY YYYY(8)

            MOVNTQ"                  %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            // Separate the interleaved UV words into the U and V planes.
            "movq                    %%mm0, %%mm2       \n\t" // UVUV UVUV(0)
            "movq                    %%mm1, %%mm3       \n\t" // UVUV UVUV(8)
            "psrlw                      $8, %%mm0       \n\t" // V0V0 V0V0(0)
            "psrlw                      $8, %%mm1       \n\t" // V0V0 V0V0(8)
            "pand                    %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
            "pand                    %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
            "packuswb                %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
            "packuswb                %%mm3, %%mm2       \n\t" // UUUU UUUU(0)

            MOVNTQ"                  %%mm0, (%3, %%"REG_a")     \n\t"
            MOVNTQ"                  %%mm2, (%2, %%"REG_a")     \n\t"

            "add                        $8, %%"REG_a"   \n\t"
            "cmp                        %4, %%"REG_a"   \n\t"
            " jb                        1b              \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        // Odd line: extract luma only; the chroma bytes are dropped.
        // NOTE(review): this block masks with %%mm7 but never sets it --
        // it relies on %%mm7 still holding the 0x00FF mask loaded by the
        // previous asm statement. Fragile across separate asm blocks; confirm.
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            ".p2align                    4              \n\t"
            "1:                                         \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
            "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
            "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
            "movq     16(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
            "movq     24(%0, %%"REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
            "pand                    %%mm7, %%mm0       \n\t" // Y0Y0 Y0Y0(0)
            "pand                    %%mm7, %%mm1       \n\t" // Y0Y0 Y0Y0(4)
            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(8)
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(12)
            "packuswb                %%mm1, %%mm0       \n\t" // YYYY YYYY(0)
            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(8)

            MOVNTQ"                  %%mm0,  (%1, %%"REG_a", 2) \n\t"
            MOVNTQ"                  %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add                        $8, %%"REG_a"   \n\t"
            "cmp                        %4, %%"REG_a"   \n\t"
            " jb                        1b              \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
#else
        // C fallback: even line, full split.
        long i;
        for (i=0; i<chromWidth; i++) {
            ydst[2*i+0]     = src[4*i+0];
            udst[i]     = src[4*i+1];
            ydst[2*i+1]     = src[4*i+2];
            vdst[i]     = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        // C fallback: odd line, luma only.
        for (i=0; i<chromWidth; i++) {
            ydst[2*i+0]     = src[4*i+0];
            ydst[2*i+1]     = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#if COMPILE_TEMPLATE_MMX
    // Leave MMX state and drain the non-temporal (MOVNTQ) stores.
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#endif
}
1755

    
1756
/**
 * Upscale one 8-bit plane to twice its width and height.
 * Interior output pixels are 3:1 / 1:3 weighted averages of the two
 * nearest source pixels ((3*a + b) >> 2); edge pixels are replicated.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
    long x,y;

    dst[0]= src[0];  // top-left corner is copied verbatim

    // first line
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];  // right edge replicated

    dst+= dstStride;

    // Each iteration consumes source lines y-1 and y and produces the
    // two interpolated destination lines between them.
    for (y=1; y<srcHeight; y++) {
#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
        // Process the leftmost srcWidth&~15 columns with PAVGB; iterating
        // from -mmxSize up to 0 lets "js 1b" terminate the loop.
        // Cascaded PAVGB pairs approximate the (3*a + b)>>2 weighting.
        // NOTE(review): this asm stores to dst via MOVNTQ but declares no
        // "memory" clobber -- confirm the compiler cannot reorder around it.
        const x86_reg mmxSize= srcWidth&~15;
        __asm__ volatile(
            "mov           %4, %%"REG_a"            \n\t"
            "movq        "MANGLE(mmx_ff)", %%mm0    \n\t"
            "movq         (%0, %%"REG_a"), %%mm4    \n\t"
            "movq                   %%mm4, %%mm2    \n\t"
            "psllq                     $8, %%mm4    \n\t"
            "pand                   %%mm0, %%mm2    \n\t"
            "por                    %%mm2, %%mm4    \n\t"
            "movq         (%1, %%"REG_a"), %%mm5    \n\t"
            "movq                   %%mm5, %%mm3    \n\t"
            "psllq                     $8, %%mm5    \n\t"
            "pand                   %%mm0, %%mm3    \n\t"
            "por                    %%mm3, %%mm5    \n\t"
            "1:                                     \n\t"
            "movq         (%0, %%"REG_a"), %%mm0    \n\t"
            "movq         (%1, %%"REG_a"), %%mm1    \n\t"
            "movq        1(%0, %%"REG_a"), %%mm2    \n\t"
            "movq        1(%1, %%"REG_a"), %%mm3    \n\t"
            PAVGB"                  %%mm0, %%mm5    \n\t"
            PAVGB"                  %%mm0, %%mm3    \n\t"
            PAVGB"                  %%mm0, %%mm5    \n\t"
            PAVGB"                  %%mm0, %%mm3    \n\t"
            PAVGB"                  %%mm1, %%mm4    \n\t"
            PAVGB"                  %%mm1, %%mm2    \n\t"
            PAVGB"                  %%mm1, %%mm4    \n\t"
            PAVGB"                  %%mm1, %%mm2    \n\t"
            "movq                   %%mm5, %%mm7    \n\t"
            "movq                   %%mm4, %%mm6    \n\t"
            "punpcklbw              %%mm3, %%mm5    \n\t"
            "punpckhbw              %%mm3, %%mm7    \n\t"
            "punpcklbw              %%mm2, %%mm4    \n\t"
            "punpckhbw              %%mm2, %%mm6    \n\t"
#if 1
            MOVNTQ"                 %%mm5,  (%2, %%"REG_a", 2)  \n\t"
            MOVNTQ"                 %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
            MOVNTQ"                 %%mm4,  (%3, %%"REG_a", 2)  \n\t"
            MOVNTQ"                 %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
#else
            "movq                   %%mm5,  (%2, %%"REG_a", 2)  \n\t"
            "movq                   %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
            "movq                   %%mm4,  (%3, %%"REG_a", 2)  \n\t"
            "movq                   %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
#endif
            "add                       $8, %%"REG_a"            \n\t"
            "movq       -1(%0, %%"REG_a"), %%mm4    \n\t"
            "movq       -1(%1, %%"REG_a"), %%mm5    \n\t"
            " js                       1b                       \n\t"
            :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%"REG_a
        );
#else
        // No SIMD: handle the first column here, let the C loop below
        // (starting at x = mmxSize-1 = 0) cover the rest of the line.
        const x86_reg mmxSize=1;

        dst[0        ]= (3*src[0] +   src[srcStride])>>2;
        dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
#endif

        // Scalar tail: columns not covered by the SIMD loop.
        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        // Right edge: vertical interpolation only.
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
#if 1
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
#else
    for (x=0; x<srcWidth; x++) {
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#endif

#if COMPILE_TEMPLATE_MMX
    // Leave MMX state and drain the non-temporal (MOVNTQ) stores.
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#endif
}
1868

    
1869
/**
 * Convert packed UYVY input to planar YV12.
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;  // number of U/V samples (= UYVY pairs) per line
    // Two source lines per iteration: full Y/U/V split on the even line,
    // luma-only extraction on the odd line.
    for (y=0; y<height; y+=2) {
#if COMPILE_TEMPLATE_MMX
        // Even line: deinterleave 16 UYVY bytes per iteration into
        // 8 Y bytes, 4 U bytes and 4 V bytes. Compared with the YUY2
        // variant, the roles of pand (low byte = chroma) and psrlw
        // (high byte = luma) are swapped.
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            "pcmpeqw             %%mm7, %%mm7   \n\t"
            "psrlw                  $8, %%mm7   \n\t" // FF,00,FF,00...
            ".p2align                4          \n\t"
            "1:                                 \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
            "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
            "movq                %%mm0, %%mm2   \n\t" // UYVY UYVY(0)
            "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(4)
            "pand                %%mm7, %%mm0   \n\t" // U0V0 U0V0(0)
            "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(4)
            "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(0)
            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(4)
            "packuswb            %%mm1, %%mm0   \n\t" // UVUV UVUV(0)
            "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(0)

            MOVNTQ"              %%mm2,  (%1, %%"REG_a", 2) \n\t"

            "movq     16(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(8)
            "movq     24(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(12)
            "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(8)
            "movq                %%mm2, %%mm4   \n\t" // UYVY UYVY(12)
            "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(8)
            "pand                %%mm7, %%mm2   \n\t" // U0V0 U0V0(12)
            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(8)
            "psrlw                  $8, %%mm4   \n\t" // Y0Y0 Y0Y0(12)
            "packuswb            %%mm2, %%mm1   \n\t" // UVUV UVUV(8)
            "packuswb            %%mm4, %%mm3   \n\t" // YYYY YYYY(8)

            MOVNTQ"              %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            // Separate the interleaved UV words into the U and V planes.
            "movq                %%mm0, %%mm2   \n\t" // UVUV UVUV(0)
            "movq                %%mm1, %%mm3   \n\t" // UVUV UVUV(8)
            "psrlw                  $8, %%mm0   \n\t" // V0V0 V0V0(0)
            "psrlw                  $8, %%mm1   \n\t" // V0V0 V0V0(8)
            "pand                %%mm7, %%mm2   \n\t" // U0U0 U0U0(0)
            "pand                %%mm7, %%mm3   \n\t" // U0U0 U0U0(8)
            "packuswb            %%mm1, %%mm0   \n\t" // VVVV VVVV(0)
            "packuswb            %%mm3, %%mm2   \n\t" // UUUU UUUU(0)

            MOVNTQ"              %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ"              %%mm2, (%2, %%"REG_a") \n\t"

            "add                    $8, %%"REG_a"   \n\t"
            "cmp                    %4, %%"REG_a"   \n\t"
            " jb                    1b          \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        // Odd line: extract luma only (high byte of each UY/VY word);
        // the chroma bytes are dropped.
        __asm__ volatile(
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
            ".p2align                    4              \n\t"
            "1:                                 \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
            "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
            "movq     16(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(8)
            "movq     24(%0, %%"REG_a", 4), %%mm3   \n\t" // UYVY UYVY(12)
            "psrlw                  $8, %%mm0   \n\t" // Y0Y0 Y0Y0(0)
            "psrlw                  $8, %%mm1   \n\t" // Y0Y0 Y0Y0(4)
            "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(8)
            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(12)
            "packuswb            %%mm1, %%mm0   \n\t" // YYYY YYYY(0)
            "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(8)

            MOVNTQ"              %%mm0,  (%1, %%"REG_a", 2) \n\t"
            MOVNTQ"              %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add                    $8, %%"REG_a"   \n\t"
            "cmp                    %4, %%"REG_a"   \n\t"
            " jb                    1b          \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
#else
        // C fallback: even line, full split.
        long i;
        for (i=0; i<chromWidth; i++) {
            udst[i]     = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i]     = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        // C fallback: odd line, luma only.
        for (i=0; i<chromWidth; i++) {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#if COMPILE_TEMPLATE_MMX
    // Leave MMX state and drain the non-temporal (MOVNTQ) stores.
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#endif
}
1991

    
1992
/**
1993
 * Height should be a multiple of 2 and width should be a multiple of 2.
1994
 * (If this is a problem for anyone then tell me, and I will fix it.)
1995
 * Chrominance data is only taken from every second line,
1996
 * others are ignored in the C version.
1997
 * FIXME: Write HQ version.
1998
 */
1999
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2000
                                       long width, long height,
2001
                                       long lumStride, long chromStride, long srcStride)
2002
{
2003
    long y;
2004
    const x86_reg chromWidth= width>>1;
2005
#if COMPILE_TEMPLATE_MMX
2006
    for (y=0; y<height-2; y+=2) {
2007
        long i;
2008
        for (i=0; i<2; i++) {
2009
            __asm__ volatile(
2010
                "mov                        %2, %%"REG_a"   \n\t"
2011
                "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
2012
                "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2013
                "pxor                    %%mm7, %%mm7       \n\t"
2014
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2015
                ".p2align                    4              \n\t"
2016
                "1:                                         \n\t"
2017
                PREFETCH"    64(%0, %%"REG_d")              \n\t"
2018
                "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2019
                "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
2020
                "punpcklbw               %%mm7, %%mm0       \n\t"
2021
                "punpcklbw               %%mm7, %%mm1       \n\t"
2022
                "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
2023
                "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
2024
                "punpcklbw               %%mm7, %%mm2       \n\t"
2025
                "punpcklbw               %%mm7, %%mm3       \n\t"
2026
                "pmaddwd                 %%mm6, %%mm0       \n\t"
2027
                "pmaddwd                 %%mm6, %%mm1       \n\t"
2028
                "pmaddwd                 %%mm6, %%mm2       \n\t"
2029
                "pmaddwd                 %%mm6, %%mm3       \n\t"
2030
#ifndef FAST_BGR2YV12
2031
                "psrad                      $8, %%mm0       \n\t"
2032
                "psrad                      $8, %%mm1       \n\t"
2033
                "psrad                      $8, %%mm2       \n\t"
2034
                "psrad                      $8, %%mm3       \n\t"
2035
#endif
2036
                "packssdw                %%mm1, %%mm0       \n\t"
2037
                "packssdw                %%mm3, %%mm2       \n\t"
2038
                "pmaddwd                 %%mm5, %%mm0       \n\t"
2039
                "pmaddwd                 %%mm5, %%mm2       \n\t"
2040
                "packssdw                %%mm2, %%mm0       \n\t"
2041
                "psraw                      $7, %%mm0       \n\t"
2042

    
2043
                "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2044
                "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
2045
                "punpcklbw               %%mm7, %%mm4       \n\t"
2046
                "punpcklbw               %%mm7, %%mm1       \n\t"
2047
                "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
2048
                "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
2049
                "punpcklbw               %%mm7, %%mm2       \n\t"
2050
                "punpcklbw               %%mm7, %%mm3       \n\t"
2051
                "pmaddwd                 %%mm6, %%mm4       \n\t"
2052
                "pmaddwd                 %%mm6, %%mm1       \n\t"
2053
                "pmaddwd                 %%mm6, %%mm2       \n\t"
2054
                "pmaddwd                 %%mm6, %%mm3       \n\t"
2055
#ifndef FAST_BGR2YV12
2056
                "psrad                      $8, %%mm4       \n\t"
2057
                "psrad                      $8, %%mm1       \n\t"
2058
                "psrad                      $8, %%mm2       \n\t"
2059
                "psrad                      $8, %%mm3       \n\t"
2060
#endif
2061
                "packssdw                %%mm1, %%mm4       \n\t"
2062
                "packssdw                %%mm3, %%mm2       \n\t"
2063
                "pmaddwd                 %%mm5, %%mm4       \n\t"
2064
                "pmaddwd                 %%mm5, %%mm2       \n\t"
2065
                "add                       $24, %%"REG_d"   \n\t"
2066
                "packssdw                %%mm2, %%mm4       \n\t"
2067
                "psraw                      $7, %%mm4       \n\t"
2068

    
2069
                "packuswb                %%mm4, %%mm0       \n\t"
2070
                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
2071

    
2072
                MOVNTQ"                  %%mm0, (%1, %%"REG_a") \n\t"
2073
                "add                        $8,      %%"REG_a"  \n\t"
2074
                " js                        1b                  \n\t"
2075
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
2076
                : "%"REG_a, "%"REG_d
2077
            );
2078
            ydst += lumStride;
2079
            src  += srcStride;
2080
        }
2081
        src -= srcStride*2;
2082
        __asm__ volatile(
2083
            "mov                        %4, %%"REG_a"   \n\t"
2084
            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2085
            "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
2086
            "pxor                    %%mm7, %%mm7       \n\t"
2087
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
2088
            "add                 %%"REG_d", %%"REG_d"   \n\t"
2089
            ".p2align                    4              \n\t"
2090
            "1:                                         \n\t"
2091
            PREFETCH"    64(%0, %%"REG_d")              \n\t"
2092
            PREFETCH"    64(%1, %%"REG_d")              \n\t"
2093
#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
2094
            "movq          (%0, %%"REG_d"), %%mm0       \n\t"
2095
            "movq          (%1, %%"REG_d"), %%mm1       \n\t"
2096
            "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
2097
            "movq         6(%1, %%"REG_d"), %%mm3       \n\t"
2098
            PAVGB"                   %%mm1, %%mm0       \n\t"
2099
            PAVGB"                   %%mm3, %%mm2       \n\t"
2100
            "movq                    %%mm0, %%mm1       \n\t"
2101
            "movq                    %%mm2, %%mm3       \n\t"
2102
            "psrlq                     $24, %%mm0       \n\t"
2103
            "psrlq                     $24, %%mm2       \n\t"
2104
            PAVGB"                   %%mm1, %%mm0       \n\t"
2105
            PAVGB"                   %%mm3, %%mm2       \n\t"
2106
            "punpcklbw               %%mm7, %%mm0       \n\t"
2107
            "punpcklbw               %%mm7, %%mm2       \n\t"
2108
#else
2109
            "movd          (%0, %%"REG_d"), %%mm0       \n\t"
2110
            "movd          (%1, %%"REG_d"), %%mm1       \n\t"
2111
            "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
2112
            "movd         3(%1, %%"REG_d"), %%mm3       \n\t"
2113
            "punpcklbw               %%mm7, %%mm0       \n\t"
2114
            "punpcklbw               %%mm7, %%mm1       \n\t"
2115
            "punpcklbw               %%mm7, %%mm2       \n\t"
2116
            "punpcklbw               %%mm7, %%mm3       \n\t"
2117
            "paddw                   %%mm1, %%mm0       \n\t"
2118
            "paddw                   %%mm3, %%mm2       \n\t"
2119
            "paddw                   %%mm2, %%mm0       \n\t"
2120
            "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
2121
            "movd         6(%1, %%"REG_d"), %%mm1       \n\t"
2122
            "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
2123
            "movd         9(%1, %%"REG_d"), %%mm3       \n\t"
2124
            "punpcklbw               %%mm7, %%mm4       \n\t"
2125
            "punpcklbw               %%mm7, %%mm1       \n\t"
2126
            "punpcklbw               %%mm7, %%mm2       \n\t"
2127
            "punpcklbw               %%mm7, %%mm3       \n\t"
2128
            "paddw                   %%mm1, %%mm4       \n\t"
2129
            "paddw                   %%mm3, %%mm2       \n\t"
2130
            "paddw                   %%mm4, %%mm2       \n\t"
2131
            "psrlw                      $2, %%mm0       \n\t"
2132
            "psrlw                      $2, %%mm2       \n\t"
2133
#endif
2134
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
2135
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
2136

    
2137
            "pmaddwd                 %%mm0, %%mm1       \n\t"
2138
            "pmaddwd                 %%mm2, %%mm3       \n\t"
2139
            "pmaddwd                 %%mm6, %%mm0       \n\t"
2140
            "pmaddwd                 %%mm6, %%mm2       \n\t"
2141
#ifndef FAST_BGR2YV12
2142
            "psrad                      $8, %%mm0       \n\t"
2143
            "psrad                      $8, %%mm1       \n\t"
2144
            "psrad                      $8, %%mm2       \n\t"
2145
            "psrad                      $8, %%mm3       \n\t"
2146
#endif
2147
            "packssdw                %%mm2, %%mm0       \n\t"
2148
            "packssdw                %%mm3, %%mm1       \n\t"
2149
            "pmaddwd                 %%mm5, %%mm0       \n\t"
2150
            "pmaddwd                 %%mm5, %%mm1       \n\t"
2151
            "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
2152
            "psraw                      $7, %%mm0       \n\t"
2153

    
2154
#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
2155
            "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
2156
            "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
2157
            "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
2158
            "movq        18(%1, %%"REG_d"), %%mm3       \n\t"
2159
            PAVGB"                   %%mm1, %%mm4       \n\t"
2160
            PAVGB"                   %%mm3, %%mm2       \n\t"
2161
            "movq                    %%mm4, %%mm1       \n\t"
2162
            "movq                    %%mm2, %%mm3       \n\t"
2163
            "psrlq                     $24, %%mm4       \n\t"
2164
            "psrlq                     $24, %%mm2       \n\t"
2165
            PAVGB"                   %%mm1, %%mm4       \n\t"
2166
            PAVGB"                   %%mm3, %%mm2       \n\t"
2167
            "punpcklbw               %%mm7, %%mm4       \n\t"
2168
            "punpcklbw               %%mm7, %%mm2       \n\t"
2169
#else
2170
            "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
2171
            "movd        12(%1, %%"REG_d"), %%mm1       \n\t"
2172
            "movd        15(%0, %%"REG_d"), %%mm2       \n\t"
2173
            "movd        15(%1, %%"REG_d"), %%mm3       \n\t"
2174
            "punpcklbw               %%mm7, %%mm4       \n\t"
2175
            "punpcklbw               %%mm7, %%mm1       \n\t"
2176
            "punpcklbw               %%mm7, %%mm2       \n\t"
2177
            "punpcklbw               %%mm7, %%mm3       \n\t"
2178
            "paddw                   %%mm1, %%mm4       \n\t"
2179
            "paddw                   %%mm3, %%mm2       \n\t"
2180
            "paddw                   %%mm2, %%mm4       \n\t"
2181
            "movd        18(%0, %%"REG_d"), %%mm5       \n\t"
2182
            "movd        18(%1, %%"REG_d"), %%mm1       \n\t"
2183
            "movd        21(%0, %%"REG_d"), %%mm2       \n\t"
2184
            "movd        21(%1, %%"REG_d"), %%mm3       \n\t"
2185
            "punpcklbw               %%mm7, %%mm5       \n\t"
2186
            "punpcklbw               %%mm7, %%mm1       \n\t"
2187
            "punpcklbw               %%mm7, %%mm2       \n\t"
2188
            "punpcklbw               %%mm7, %%mm3       \n\t"
2189
            "paddw                   %%mm1, %%mm5       \n\t"
2190
            "paddw                   %%mm3, %%mm2       \n\t"
2191
            "paddw                   %%mm5, %%mm2       \n\t"
2192
            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
2193
            "psrlw                      $2, %%mm4       \n\t"
2194
            "psrlw                      $2, %%mm2       \n\t"
2195
#endif
2196
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
2197
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
2198

    
2199
            "pmaddwd                 %%mm4, %%mm1       \n\t"
2200
            "pmaddwd                 %%mm2, %%mm3       \n\t"
2201
            "pmaddwd                 %%mm6, %%mm4       \n\t"
2202
            "pmaddwd                 %%mm6, %%mm2       \n\t"
2203
#ifndef FAST_BGR2YV12
2204
            "psrad                      $8, %%mm4       \n\t"
2205
            "psrad                      $8, %%mm1       \n\t"
2206
            "psrad                      $8, %%mm2       \n\t"
2207
            "psrad                      $8, %%mm3       \n\t"
2208
#endif
2209
            "packssdw                %%mm2, %%mm4       \n\t"
2210
            "packssdw                %%mm3, %%mm1       \n\t"
2211
            "pmaddwd                 %%mm5, %%mm4       \n\t"
2212
            "pmaddwd                 %%mm5, %%mm1       \n\t"
2213
            "add                       $24, %%"REG_d"   \n\t"
2214
            "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
2215
            "psraw                      $7, %%mm4       \n\t"
2216

    
2217
            "movq                    %%mm0, %%mm1           \n\t"
2218
            "punpckldq               %%mm4, %%mm0           \n\t"
2219
            "punpckhdq               %%mm4, %%mm1           \n\t"
2220
            "packsswb                %%mm1, %%mm0           \n\t"
2221
            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0         \n\t"
2222
            "movd                    %%mm0, (%2, %%"REG_a") \n\t"
2223
            "punpckhdq               %%mm0, %%mm0           \n\t"
2224
            "movd                    %%mm0, (%3, %%"REG_a") \n\t"
2225
            "add                        $4, %%"REG_a"       \n\t"
2226
            " js                        1b                  \n\t"
2227
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2228
            : "%"REG_a, "%"REG_d
2229
        );
2230

    
2231
        udst += chromStride;
2232
        vdst += chromStride;
2233
        src  += srcStride*2;
2234
    }
2235

    
2236
    __asm__ volatile(EMMS"       \n\t"
2237
                     SFENCE"     \n\t"
2238
                     :::"memory");
2239
#else
2240
    y=0;
2241
#endif
2242
    for (; y<height; y+=2) {
2243
        long i;
2244
        for (i=0; i<chromWidth; i++) {
2245
            unsigned int b = src[6*i+0];
2246
            unsigned int g = src[6*i+1];
2247
            unsigned int r = src[6*i+2];
2248

    
2249
            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2250
            unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2251
            unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2252

    
2253
            udst[i]     = U;
2254
            vdst[i]     = V;
2255
            ydst[2*i]   = Y;
2256

    
2257
            b = src[6*i+3];
2258
            g = src[6*i+4];
2259
            r = src[6*i+5];
2260

    
2261
            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2262
            ydst[2*i+1]     = Y;
2263
        }
2264
        ydst += lumStride;
2265
        src  += srcStride;
2266

    
2267
        for (i=0; i<chromWidth; i++) {
2268
            unsigned int b = src[6*i+0];
2269
            unsigned int g = src[6*i+1];
2270
            unsigned int r = src[6*i+2];
2271

    
2272
            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2273

    
2274
            ydst[2*i]     = Y;
2275

    
2276
            b = src[6*i+3];
2277
            g = src[6*i+4];
2278
            r = src[6*i+5];
2279

    
2280
            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2281
            ydst[2*i+1]     = Y;
2282
        }
2283
        udst += chromStride;
2284
        vdst += chromStride;
2285
        ydst += lumStride;
2286
        src  += srcStride;
2287
    }
2288
}
2289

    
2290
/**
 * Byte-interleave two planes: for every row,
 * dest[2*w+0] = src1[w] and dest[2*w+1] = src2[w].
 * SSE2/MMX paths process 16 bytes of each source per iteration; the scalar
 * loop handles the remaining (width & 15) bytes of each row.
 *
 * NOTE(review): the SSE2 path uses movdqa loads, which require src1/src2
 * rows to be 16-byte aligned — confirm callers guarantee this.
 * NOTE(review): the asm loop bound "(x86_reg)width-15" with an unsigned
 * "jb" compare looks like it assumes width >= 16 — confirm.
 */
static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
                                    long width, long height, long src1Stride,
                                    long src2Stride, long dstStride)
{
    long h;

    for (h=0; h < height; h++) {
        long w;

#if COMPILE_TEMPLATE_MMX
#if COMPILE_TEMPLATE_SSE2
        __asm__(
            "xor              %%"REG_a", %%"REG_a"  \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%1, %%"REG_a")             \n\t"
            PREFETCH" 64(%2, %%"REG_a")             \n\t"
            /* xmm0 and xmm1 both load the same 16 src1 bytes: one copy is
             * consumed by punpcklbw (low 8), the other by punpckhbw (high 8) */
            "movdqa     (%1, %%"REG_a"), %%xmm0     \n\t"
            "movdqa     (%1, %%"REG_a"), %%xmm1     \n\t"
            "movdqa     (%2, %%"REG_a"), %%xmm2     \n\t"
            "punpcklbw           %%xmm2, %%xmm0     \n\t"
            "punpckhbw           %%xmm2, %%xmm1     \n\t"
            /* non-temporal stores: 32 interleaved output bytes per iteration */
            "movntdq             %%xmm0,   (%0, %%"REG_a", 2)   \n\t"
            "movntdq             %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
            "add                    $16, %%"REG_a"  \n\t"
            "cmp                     %3, %%"REG_a"  \n\t"
            " jb                     1b             \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a""
        );
#else
        __asm__(
            "xor %%"REG_a", %%"REG_a"               \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%1, %%"REG_a")             \n\t"
            PREFETCH" 64(%2, %%"REG_a")             \n\t"
            "movq       (%1, %%"REG_a"), %%mm0      \n\t"
            "movq      8(%1, %%"REG_a"), %%mm2      \n\t"
            "movq                 %%mm0, %%mm1      \n\t"
            "movq                 %%mm2, %%mm3      \n\t"
            "movq       (%2, %%"REG_a"), %%mm4      \n\t"
            "movq      8(%2, %%"REG_a"), %%mm5      \n\t"
            /* interleave low/high halves of src1 bytes with src2 bytes */
            "punpcklbw            %%mm4, %%mm0      \n\t"
            "punpckhbw            %%mm4, %%mm1      \n\t"
            "punpcklbw            %%mm5, %%mm2      \n\t"
            "punpckhbw            %%mm5, %%mm3      \n\t"
            MOVNTQ"               %%mm0,   (%0, %%"REG_a", 2)   \n\t"
            MOVNTQ"               %%mm1,  8(%0, %%"REG_a", 2)   \n\t"
            MOVNTQ"               %%mm2, 16(%0, %%"REG_a", 2)   \n\t"
            MOVNTQ"               %%mm3, 24(%0, %%"REG_a", 2)   \n\t"
            "add                    $16, %%"REG_a"  \n\t"
            "cmp                     %3, %%"REG_a"  \n\t"
            " jb                     1b             \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a
        );
#endif
        /* scalar tail: bytes past the last full 16-byte group */
        for (w= (width&(~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#else
        /* pure C fallback */
        for (w=0; w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#endif
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
#if COMPILE_TEMPLATE_MMX
    /* leave MMX state and flush the non-temporal write buffers */
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
            );
#endif
}
2368

    
2369
/**
 * Upsample two chroma planes by 2x in both directions:
 * each output pixel pair d[2x] = d[2x+1] = s[x], and each source row is used
 * for two consecutive output rows (srcStride*(y>>1)).
 * The MMX path duplicates 32 source bytes into 64 output bytes per iteration
 * via punpcklbw/punpckhbw of a register with itself; a scalar loop finishes
 * the remainder of each row.
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       long width, long height,
                                       long srcStride1, long srcStride2,
                                       long dstStride1, long dstStride2)
{
    x86_reg y;
    long x,w,h;
    w=width/2; h=height/2;
#if COMPILE_TEMPLATE_MMX
    /* warm the cache with the second source row of each plane */
    __asm__ volatile(
        PREFETCH" %0    \n\t"
        PREFETCH" %1    \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    /* first plane: src1 -> dst1 */
    for (y=0;y<h;y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x=0;
#if COMPILE_TEMPLATE_MMX
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH"   32%1        \n\t"
                "movq         %1, %%mm0 \n\t"
                "movq        8%1, %%mm2 \n\t"
                "movq       16%1, %%mm4 \n\t"
                "movq       24%1, %%mm6 \n\t"
                "movq      %%mm0, %%mm1 \n\t"
                "movq      %%mm2, %%mm3 \n\t"
                "movq      %%mm4, %%mm5 \n\t"
                "movq      %%mm6, %%mm7 \n\t"
                /* unpacking a register with itself doubles each byte */
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ"    %%mm0,   %0  \n\t"
                MOVNTQ"    %%mm1,  8%0  \n\t"
                MOVNTQ"    %%mm2, 16%0  \n\t"
                MOVNTQ"    %%mm3, 24%0  \n\t"
                MOVNTQ"    %%mm4, 32%0  \n\t"
                MOVNTQ"    %%mm5, 40%0  \n\t"
                MOVNTQ"    %%mm6, 48%0  \n\t"
                MOVNTQ"    %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
#endif
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    /* second plane: src2 -> dst2 (same scheme) */
    for (y=0;y<h;y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
#if COMPILE_TEMPLATE_MMX
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH"   32%1        \n\t"
                "movq         %1, %%mm0 \n\t"
                "movq        8%1, %%mm2 \n\t"
                "movq       16%1, %%mm4 \n\t"
                "movq       24%1, %%mm6 \n\t"
                "movq      %%mm0, %%mm1 \n\t"
                "movq      %%mm2, %%mm3 \n\t"
                "movq      %%mm4, %%mm5 \n\t"
                "movq      %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ"    %%mm0,   %0  \n\t"
                MOVNTQ"    %%mm1,  8%0  \n\t"
                MOVNTQ"    %%mm2, 16%0  \n\t"
                MOVNTQ"    %%mm3, 24%0  \n\t"
                MOVNTQ"    %%mm4, 32%0  \n\t"
                MOVNTQ"    %%mm5, 40%0  \n\t"
                MOVNTQ"    %%mm6, 48%0  \n\t"
                MOVNTQ"    %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
#endif
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#if COMPILE_TEMPLATE_MMX
    /* leave MMX state and flush non-temporal stores */
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
        );
#endif
}
2470

    
2471
/**
 * Pack planar YUV with 4x-subsampled chroma (src1 = Y, src2 = U, src3 = V;
 * chroma rows advance once per 4 luma rows via y>>2) into interleaved
 * Y U Y V output (YUY2).  Each chroma sample covers 4 horizontal luma
 * samples, as shown by the scalar tail which reuses up[x]/vp[x] for
 * yp[x2]..yp[x2+3].  The MMX path emits 64 output bytes per iteration.
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        long width, long height,
                                        long srcStride1, long srcStride2,
                                        long srcStride3, long dstStride)
{
    x86_reg x;
    long y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++) {
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
#if COMPILE_TEMPLATE_MMX
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH"   32(%1, %0)          \n\t"
                PREFETCH"   32(%2, %0)          \n\t"
                PREFETCH"   32(%3, %0)          \n\t"
                "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq         (%2, %0), %%mm1   \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq         (%3, %0), %%mm2   \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq            %%mm0, %%mm3   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq            %%mm1, %%mm4   \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq            %%mm2, %%mm5   \n\t" /* V0V1V2V3V4V5V6V7 */
                /* duplicate each chroma byte (covers two output pixel pairs) */
                "punpcklbw       %%mm1, %%mm1   \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw       %%mm2, %%mm2   \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw       %%mm4, %%mm4   \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw       %%mm5, %%mm5   \n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq            %%mm1, %%mm6   \n\t"
                "punpcklbw       %%mm2, %%mm1   \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw       %%mm1, %%mm0   \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw       %%mm1, %%mm3   \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ"          %%mm0,  (%4, %0, 8)    \n\t"
                MOVNTQ"          %%mm3, 8(%4, %0, 8)    \n\t"

                "punpckhbw       %%mm2, %%mm6   \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq     8(%1, %0, 4), %%mm0   \n\t"
                "movq            %%mm0, %%mm3   \n\t"
                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ"          %%mm0, 16(%4, %0, 8)   \n\t"
                MOVNTQ"          %%mm3, 24(%4, %0, 8)   \n\t"

                "movq            %%mm4, %%mm6   \n\t"
                "movq    16(%1, %0, 4), %%mm0   \n\t"
                "movq            %%mm0, %%mm3   \n\t"
                "punpcklbw       %%mm5, %%mm4   \n\t"
                "punpcklbw       %%mm4, %%mm0   \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw       %%mm4, %%mm3   \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ"          %%mm0, 32(%4, %0, 8)   \n\t"
                MOVNTQ"          %%mm3, 40(%4, %0, 8)   \n\t"

                "punpckhbw       %%mm5, %%mm6   \n\t"
                "movq    24(%1, %0, 4), %%mm0   \n\t"
                "movq            %%mm0, %%mm3   \n\t"
                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ"          %%mm0, 48(%4, %0, 8)   \n\t"
                MOVNTQ"          %%mm3, 56(%4, %0, 8)   \n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
#endif
        /* scalar tail: one chroma sample serves four luma samples */
        for (; x<w; x++) {
            const long x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
#if COMPILE_TEMPLATE_MMX
    /* leave MMX state and flush non-temporal stores */
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
        );
#endif
}
2560

    
2561
/**
 * Copy every second byte of src into dst: dst[i] = src[2*i] for 0 <= i < count.
 * Used to pull the luma (or chroma) bytes out of packed YUYV/UYVY data.
 * The loop runs with a negative index counting up to 0 so the termination
 * test is a plain sign check ("js 1b" in the asm).
 */
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    /* point at the end of both buffers and index backwards from -count */
    dst +=   count;
    src += 2*count;
    count= - count;

#if COMPILE_TEMPLATE_MMX
    if(count <= -16) {
        count += 15;   /* process count&~15 bytes in the asm, rest in C */
        __asm__ volatile(
            /* mm7 = 0x00FF00FF...: mask that keeps the even (low) bytes */
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq -30(%1, %0, 2), %%mm0        \n\t"
            "movq -22(%1, %0, 2), %%mm1        \n\t"
            "movq -14(%1, %0, 2), %%mm2        \n\t"
            "movq  -6(%1, %0, 2), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            MOVNTQ"        %%mm0,-15(%2, %0)   \n\t"
            MOVNTQ"        %%mm2,- 7(%2, %0)   \n\t"
            "add             $16, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;
    }
#endif
    /* scalar path / tail */
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}
2599

    
2600
/**
 * De-interleave the even bytes of groups of four:
 * dst0[i] = src[4*i+0], dst1[i] = src[4*i+2] for 0 <= i < count.
 * For UYVY input this extracts the U and V planes.  Negative-index loop,
 * same scheme as extract_even.
 */
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
#if COMPILE_TEMPLATE_MMX
    if(count <= -8) {
        count += 7;   /* asm handles groups of 8, C loop the remainder */
        __asm__ volatile(
            /* mm7 = 0x00FF00FF...: even-byte mask */
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            /* keep even bytes, then pack down to 16-bit lanes */
            "pand          %%mm7, %%mm0        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            /* split the interleaved result into the two destinations */
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    /* scalar path / tail */
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2647

    
2648
/**
 * Like extract_even2, but averages two source rows first:
 * dst0[i] = (src0[4*i+0]+src1[4*i+0])>>1, dst1[i] = (src0[4*i+2]+src1[4*i+2])>>1.
 * Used for 4:2:0 vertical chroma downsampling of UYVY input.
 * The SIMD path is only built when PAVGB (byte average) is available.
 * NOTE(review): PAVGB rounds up while the scalar ">>1" truncates, so the two
 * paths can differ by one LSB — appears to be an accepted trade-off here.
 */
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            /* mm7 = 0x00FF00FF...: even-byte mask */
            "pcmpeqw        %%mm7, %%mm7        \n\t"
            "psrlw             $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
            /* average with the second row before extracting */
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
            "pand           %%mm7, %%mm0        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm2        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm1, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm2        \n\t"
            "movq           %%mm0, %%mm1        \n\t"
            "movq           %%mm2, %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm2, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm1        \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
            "add               $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    /* scalar path / tail */
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2700

    
2701
/**
 * De-interleave the odd bytes of groups of four:
 * dst0[i] = src[4*i+1], dst1[i] = src[4*i+3] for 0 <= i < count
 * (the C tail implements this by bumping src once and reusing the even
 * offsets).  For YUYV input this extracts the U and V planes.
 */
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
#if COMPILE_TEMPLATE_MMX
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            /* mm7 = 0x00FF00FF... mask */
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            /* shift right 8 to select the odd bytes instead of masking */
            "psrlw            $8, %%mm0        \n\t"
            "psrlw            $8, %%mm1        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "psrlw            $8, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src++;  /* odd offsets: now src[4*i+0]/src[4*i+2] hit the odd bytes */
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2749

    
2750
/**
 * Like extract_odd2, but averages two source rows first:
 * dst0[i] = (src0[4*i+1]+src1[4*i+1])>>1, dst1[i] = (src0[4*i+3]+src1[4*i+3])>>1.
 * Used for 4:2:0 vertical chroma downsampling of YUYV input.
 * SIMD path only when PAVGB is available.
 * NOTE(review): PAVGB rounds up while the scalar ">>1" truncates — one-LSB
 * path difference, same as extract_even2avg.
 */
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            /* mm7 = 0x00FF00FF... mask */
            "pcmpeqw        %%mm7, %%mm7        \n\t"
            "psrlw             $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
            /* average the two rows, then select the odd bytes by shifting */
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"
            "psrlw             $8, %%mm1        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "psrlw             $8, %%mm3        \n\t"
            "packuswb       %%mm1, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm2        \n\t"
            "movq           %%mm0, %%mm1        \n\t"
            "movq           %%mm2, %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm2, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm1        \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
            "add               $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src0++;  /* shift to the odd bytes for the scalar tail */
    src1++;
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2804

    
2805
static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2806
                                 long width, long height,
2807
                                 long lumStride, long chromStride, long srcStride)
2808
{
2809
    long y;
2810
    const long chromWidth= -((-width)>>1);
2811

    
2812
    for (y=0; y<height; y++) {
2813
        RENAME(extract_even)(src, ydst, width);
2814
        if(y&1) {
2815
            RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2816
            udst+= chromStride;
2817
            vdst+= chromStride;
2818
        }
2819

    
2820
        src += srcStride;
2821
        ydst+= lumStride;
2822
    }
2823
#if COMPILE_TEMPLATE_MMX
2824
    __asm__(
2825
            EMMS"       \n\t"
2826
            SFENCE"     \n\t"
2827
            ::: "memory"
2828
        );
2829
#endif
2830
}
2831

    
2832
static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2833
                                 long width, long height,
2834
                                 long lumStride, long chromStride, long srcStride)
2835
{
2836
    long y;
2837
    const long chromWidth= -((-width)>>1);
2838

    
2839
    for (y=0; y<height; y++) {
2840
        RENAME(extract_even)(src, ydst, width);
2841
        RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2842

    
2843
        src += srcStride;
2844
        ydst+= lumStride;
2845
        udst+= chromStride;
2846
        vdst+= chromStride;
2847
    }
2848
#if COMPILE_TEMPLATE_MMX
2849
    __asm__(
2850
            EMMS"       \n\t"
2851
            SFENCE"     \n\t"
2852
            ::: "memory"
2853
        );
2854
#endif
2855
}
2856

    
2857
static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2858
                                 long width, long height,
2859
                                 long lumStride, long chromStride, long srcStride)
2860
{
2861
    long y;
2862
    const long chromWidth= -((-width)>>1);
2863

    
2864
    for (y=0; y<height; y++) {
2865
        RENAME(extract_even)(src+1, ydst, width);
2866
        if(y&1) {
2867
            RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2868
            udst+= chromStride;
2869
            vdst+= chromStride;
2870
        }
2871

    
2872
        src += srcStride;
2873
        ydst+= lumStride;
2874
    }
2875
#if COMPILE_TEMPLATE_MMX
2876
    __asm__(
2877
            EMMS"       \n\t"
2878
            SFENCE"     \n\t"
2879
            ::: "memory"
2880
        );
2881
#endif
2882
}
2883

    
2884
static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2885
                                 long width, long height,
2886
                                 long lumStride, long chromStride, long srcStride)
2887
{
2888
    long y;
2889
    const long chromWidth= -((-width)>>1);
2890

    
2891
    for (y=0; y<height; y++) {
2892
        RENAME(extract_even)(src+1, ydst, width);
2893
        RENAME(extract_even2)(src, udst, vdst, chromWidth);
2894

    
2895
        src += srcStride;
2896
        ydst+= lumStride;
2897
        udst+= chromStride;
2898
        vdst+= chromStride;
2899
    }
2900
#if COMPILE_TEMPLATE_MMX
2901
    __asm__(
2902
            EMMS"       \n\t"
2903
            SFENCE"     \n\t"
2904
            ::: "memory"
2905
        );
2906
#endif
2907
}
2908

    
2909
static inline void RENAME(rgb2rgb_init)(void)
2910
{
2911
    rgb15to16          = RENAME(rgb15to16);
2912
    rgb15tobgr24       = RENAME(rgb15tobgr24);
2913
    rgb15to32          = RENAME(rgb15to32);
2914
    rgb16tobgr24       = RENAME(rgb16tobgr24);
2915
    rgb16to32          = RENAME(rgb16to32);
2916
    rgb16to15          = RENAME(rgb16to15);
2917
    rgb24tobgr16       = RENAME(rgb24tobgr16);
2918
    rgb24tobgr15       = RENAME(rgb24tobgr15);
2919
    rgb24tobgr32       = RENAME(rgb24tobgr32);
2920
    rgb32to16          = RENAME(rgb32to16);
2921
    rgb32to15          = RENAME(rgb32to15);
2922
    rgb32tobgr24       = RENAME(rgb32tobgr24);
2923
    rgb24to15          = RENAME(rgb24to15);
2924
    rgb24to16          = RENAME(rgb24to16);
2925
    rgb24tobgr24       = RENAME(rgb24tobgr24);
2926
    shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
2927
    rgb32tobgr16       = RENAME(rgb32tobgr16);
2928
    rgb32tobgr15       = RENAME(rgb32tobgr15);
2929
    yv12toyuy2         = RENAME(yv12toyuy2);
2930
    yv12touyvy         = RENAME(yv12touyvy);
2931
    yuv422ptoyuy2      = RENAME(yuv422ptoyuy2);
2932
    yuv422ptouyvy      = RENAME(yuv422ptouyvy);
2933
    yuy2toyv12         = RENAME(yuy2toyv12);
2934
    planar2x           = RENAME(planar2x);
2935
    rgb24toyv12        = RENAME(rgb24toyv12);
2936
    interleaveBytes    = RENAME(interleaveBytes);
2937
    vu9_to_vu12        = RENAME(vu9_to_vu12);
2938
    yvu9_to_yuy2       = RENAME(yvu9_to_yuy2);
2939

    
2940
    uyvytoyuv420       = RENAME(uyvytoyuv420);
2941
    uyvytoyuv422       = RENAME(uyvytoyuv422);
2942
    yuyvtoyuv420       = RENAME(yuyvtoyuv420);
2943
    yuyvtoyuv422       = RENAME(yuyvtoyuv422);
2944
}