Statistics
| Branch: | Revision:

ffmpeg / libswscale / x86 / rgb2rgb_template.c @ 6216fc70

History | View | Annotate | Download (107 KB)

1
/*
2
 * software RGB to RGB converter
3
 * pluralize by software PAL8 to RGB converter
4
 *              software YUV to YUV converter
5
 *              software YUV to RGB converter
6
 * Written by Nick Kurshev.
7
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8
 * lot of big-endian byte order fixes by Alex Beregszaszi
9
 *
10
 * This file is part of Libav.
11
 *
12
 * Libav is free software; you can redistribute it and/or
13
 * modify it under the terms of the GNU Lesser General Public
14
 * License as published by the Free Software Foundation; either
15
 * version 2.1 of the License, or (at your option) any later version.
16
 *
17
 * Libav is distributed in the hope that it will be useful,
18
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
 * Lesser General Public License for more details.
21
 *
22
 * You should have received a copy of the GNU Lesser General Public
23
 * License along with Libav; if not, write to the Free Software
24
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25
 */
26

    
27
#include <stddef.h>
28

    
29
/*
 * Instruction-selection macros for the current COMPILE_TEMPLATE_* settings.
 * Any earlier definitions are dropped first so the macros can be redefined.
 */
#undef MMREG_SIZE
#undef PREFETCH
#undef PAVGB
#undef EMMS
#undef MOVNTQ
#undef SFENCE

/* 16 when COMPILE_TEMPLATE_SSE2 is set, 8 otherwise. */
#if COMPILE_TEMPLATE_SSE2
#    define MMREG_SIZE 16
#else
#    define MMREG_SIZE 8
#endif

/*
 * Non-temporal store and the matching fence; without MMX2 the store falls
 * back to a plain movq and the fence expands to an assembler comment.
 */
#if COMPILE_TEMPLATE_MMX2
#    define MOVNTQ "movntq"
#    define SFENCE "sfence"
#else
#    define MOVNTQ "movq"
#    define SFENCE " # nop"
#endif

/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#if COMPILE_TEMPLATE_AMD3DNOW
#    define EMMS     "femms"
#else
#    define EMMS     "emms"
#endif

/*
 * Prefetch and packed byte average. 3DNow! takes precedence over MMX2 when
 * both are enabled; with neither, PREFETCH expands to an assembler comment
 * and PAVGB is left undefined.
 */
#if COMPILE_TEMPLATE_AMD3DNOW
#    define PREFETCH  "prefetch"
#    define PAVGB     "pavgusb"
#elif COMPILE_TEMPLATE_MMX2
#    define PREFETCH "prefetchnta"
#    define PAVGB     "pavgb"
#else
#    define PREFETCH  " # nop"
#endif
66

    
67
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
68
{
69
    uint8_t *dest = dst;
70
    const uint8_t *s = src;
71
    const uint8_t *end;
72
    const uint8_t *mm_end;
73
    end = s + src_size;
74
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
75
    mm_end = end - 23;
76
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
77
    while (s < mm_end) {
78
        __asm__ volatile(
79
            PREFETCH"    32%1           \n\t"
80
            "movd          %1, %%mm0    \n\t"
81
            "punpckldq    3%1, %%mm0    \n\t"
82
            "movd         6%1, %%mm1    \n\t"
83
            "punpckldq    9%1, %%mm1    \n\t"
84
            "movd        12%1, %%mm2    \n\t"
85
            "punpckldq   15%1, %%mm2    \n\t"
86
            "movd        18%1, %%mm3    \n\t"
87
            "punpckldq   21%1, %%mm3    \n\t"
88
            "por        %%mm7, %%mm0    \n\t"
89
            "por        %%mm7, %%mm1    \n\t"
90
            "por        %%mm7, %%mm2    \n\t"
91
            "por        %%mm7, %%mm3    \n\t"
92
            MOVNTQ"     %%mm0,   %0     \n\t"
93
            MOVNTQ"     %%mm1,  8%0     \n\t"
94
            MOVNTQ"     %%mm2, 16%0     \n\t"
95
            MOVNTQ"     %%mm3, 24%0"
96
            :"=m"(*dest)
97
            :"m"(*s)
98
            :"memory");
99
        dest += 32;
100
        s += 24;
101
    }
102
    __asm__ volatile(SFENCE:::"memory");
103
    __asm__ volatile(EMMS:::"memory");
104
    while (s < end) {
105
        *dest++ = *s++;
106
        *dest++ = *s++;
107
        *dest++ = *s++;
108
        *dest++ = 255;
109
    }
110
}
111

    
112
/*
 * Asm fragment shared by the 32-bit -> 24-bit converters.
 *
 * On entry mm0, mm1, mm4 and mm5 hold four input quadwords and mm2, mm3,
 * mm6 and mm7 hold copies of them (see rgb32tobgr24). The fragment
 * clobbers all eight registers and stores three quadwords (24 bytes) at
 * %0, 8%0 and 16%0. The mask24* constants are defined outside this macro.
 */
#define STORE_BGR24_MMX                                 \
    /* combine the masked original with the copy >> 8 */ \
    "psrlq         $8, %%mm2    \n\t"                   \
    "psrlq         $8, %%mm3    \n\t"                   \
    "psrlq         $8, %%mm6    \n\t"                   \
    "psrlq         $8, %%mm7    \n\t"                   \
    "pand "MANGLE(mask24l)", %%mm0\n\t"                 \
    "pand "MANGLE(mask24l)", %%mm1\n\t"                 \
    "pand "MANGLE(mask24l)", %%mm4\n\t"                 \
    "pand "MANGLE(mask24l)", %%mm5\n\t"                 \
    "pand "MANGLE(mask24h)", %%mm2\n\t"                 \
    "pand "MANGLE(mask24h)", %%mm3\n\t"                 \
    "pand "MANGLE(mask24h)", %%mm6\n\t"                 \
    "pand "MANGLE(mask24h)", %%mm7\n\t"                 \
    "por        %%mm2, %%mm0    \n\t"                   \
    "por        %%mm3, %%mm1    \n\t"                   \
    "por        %%mm6, %%mm4    \n\t"                   \
    "por        %%mm7, %%mm5    \n\t"                   \
                                                        \
    /* merge the four partial results into mm0/mm1/mm4 */ \
    "movq       %%mm1, %%mm2    \n\t"                   \
    "movq       %%mm4, %%mm3    \n\t"                   \
    "psllq        $48, %%mm2    \n\t"                   \
    "psllq        $32, %%mm3    \n\t"                   \
    "pand "MANGLE(mask24hh)", %%mm2\n\t"                \
    "pand "MANGLE(mask24hhh)", %%mm3\n\t"               \
    "por        %%mm2, %%mm0    \n\t"                   \
    "psrlq        $16, %%mm1    \n\t"                   \
    "psrlq        $32, %%mm4    \n\t"                   \
    "psllq        $16, %%mm5    \n\t"                   \
    "por        %%mm3, %%mm1    \n\t"                   \
    "pand  "MANGLE(mask24hhhh)", %%mm5\n\t"             \
    "por        %%mm5, %%mm4    \n\t"                   \
                                                        \
    /* write the 24 output bytes */                     \
    MOVNTQ"     %%mm0,   %0     \n\t"                   \
    MOVNTQ"     %%mm1,  8%0     \n\t"                   \
    MOVNTQ"     %%mm4, 16%0"
147

    
148

    
149
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
150
{
151
    uint8_t *dest = dst;
152
    const uint8_t *s = src;
153
    const uint8_t *end;
154
    const uint8_t *mm_end;
155
    end = s + src_size;
156
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
157
    mm_end = end - 31;
158
    while (s < mm_end) {
159
        __asm__ volatile(
160
            PREFETCH"    32%1           \n\t"
161
            "movq          %1, %%mm0    \n\t"
162
            "movq         8%1, %%mm1    \n\t"
163
            "movq        16%1, %%mm4    \n\t"
164
            "movq        24%1, %%mm5    \n\t"
165
            "movq       %%mm0, %%mm2    \n\t"
166
            "movq       %%mm1, %%mm3    \n\t"
167
            "movq       %%mm4, %%mm6    \n\t"
168
            "movq       %%mm5, %%mm7    \n\t"
169
            STORE_BGR24_MMX
170
            :"=m"(*dest)
171
            :"m"(*s)
172
            :"memory");
173
        dest += 24;
174
        s += 32;
175
    }
176
    __asm__ volatile(SFENCE:::"memory");
177
    __asm__ volatile(EMMS:::"memory");
178
    while (s < end) {
179
        *dest++ = *s++;
180
        *dest++ = *s++;
181
        *dest++ = *s++;
182
        s++;
183
    }
184
}
185

    
186
/*
187
 original by Strepto/Astral
188
 ported to gcc & bugfixed: A'rpi
189
 MMX2, 3DNOW optimization by Nick Kurshev
190
 32-bit C version, and and&add trick by Michael Niedermayer
191
*/
192
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
193
{
194
    register const uint8_t* s=src;
195
    register uint8_t* d=dst;
196
    register const uint8_t *end;
197
    const uint8_t *mm_end;
198
    end = s + src_size;
199
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
200
    __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
201
    mm_end = end - 15;
202
    while (s<mm_end) {
203
        __asm__ volatile(
204
            PREFETCH"  32%1         \n\t"
205
            "movq        %1, %%mm0  \n\t"
206
            "movq       8%1, %%mm2  \n\t"
207
            "movq     %%mm0, %%mm1  \n\t"
208
            "movq     %%mm2, %%mm3  \n\t"
209
            "pand     %%mm4, %%mm0  \n\t"
210
            "pand     %%mm4, %%mm2  \n\t"
211
            "paddw    %%mm1, %%mm0  \n\t"
212
            "paddw    %%mm3, %%mm2  \n\t"
213
            MOVNTQ"   %%mm0,  %0    \n\t"
214
            MOVNTQ"   %%mm2, 8%0"
215
            :"=m"(*d)
216
            :"m"(*s)
217
        );
218
        d+=16;
219
        s+=16;
220
    }
221
    __asm__ volatile(SFENCE:::"memory");
222
    __asm__ volatile(EMMS:::"memory");
223
    mm_end = end - 3;
224
    while (s < mm_end) {
225
        register unsigned x= *((const uint32_t *)s);
226
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
227
        d+=4;
228
        s+=4;
229
    }
230
    if (s < end) {
231
        register unsigned short x= *((const uint16_t *)s);
232
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
233
    }
234
}
235

    
236
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
237
{
238
    register const uint8_t* s=src;
239
    register uint8_t* d=dst;
240
    register const uint8_t *end;
241
    const uint8_t *mm_end;
242
    end = s + src_size;
243
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
244
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
245
    __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
246
    mm_end = end - 15;
247
    while (s<mm_end) {
248
        __asm__ volatile(
249
            PREFETCH"  32%1         \n\t"
250
            "movq        %1, %%mm0  \n\t"
251
            "movq       8%1, %%mm2  \n\t"
252
            "movq     %%mm0, %%mm1  \n\t"
253
            "movq     %%mm2, %%mm3  \n\t"
254
            "psrlq       $1, %%mm0  \n\t"
255
            "psrlq       $1, %%mm2  \n\t"
256
            "pand     %%mm7, %%mm0  \n\t"
257
            "pand     %%mm7, %%mm2  \n\t"
258
            "pand     %%mm6, %%mm1  \n\t"
259
            "pand     %%mm6, %%mm3  \n\t"
260
            "por      %%mm1, %%mm0  \n\t"
261
            "por      %%mm3, %%mm2  \n\t"
262
            MOVNTQ"   %%mm0,  %0    \n\t"
263
            MOVNTQ"   %%mm2, 8%0"
264
            :"=m"(*d)
265
            :"m"(*s)
266
        );
267
        d+=16;
268
        s+=16;
269
    }
270
    __asm__ volatile(SFENCE:::"memory");
271
    __asm__ volatile(EMMS:::"memory");
272
    mm_end = end - 3;
273
    while (s < mm_end) {
274
        register uint32_t x= *((const uint32_t*)s);
275
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
276
        s+=4;
277
        d+=4;
278
    }
279
    if (s < end) {
280
        register uint16_t x= *((const uint16_t*)s);
281
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
282
    }
283
}
284

    
285
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
286
{
287
    const uint8_t *s = src;
288
    const uint8_t *end;
289
    const uint8_t *mm_end;
290
    uint16_t *d = (uint16_t *)dst;
291
    end = s + src_size;
292
    mm_end = end - 15;
293
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
294
    __asm__ volatile(
295
        "movq           %3, %%mm5   \n\t"
296
        "movq           %4, %%mm6   \n\t"
297
        "movq           %5, %%mm7   \n\t"
298
        "jmp 2f                     \n\t"
299
        ".p2align        4          \n\t"
300
        "1:                         \n\t"
301
        PREFETCH"   32(%1)          \n\t"
302
        "movd         (%1), %%mm0   \n\t"
303
        "movd        4(%1), %%mm3   \n\t"
304
        "punpckldq   8(%1), %%mm0   \n\t"
305
        "punpckldq  12(%1), %%mm3   \n\t"
306
        "movq        %%mm0, %%mm1   \n\t"
307
        "movq        %%mm3, %%mm4   \n\t"
308
        "pand        %%mm6, %%mm0   \n\t"
309
        "pand        %%mm6, %%mm3   \n\t"
310
        "pmaddwd     %%mm7, %%mm0   \n\t"
311
        "pmaddwd     %%mm7, %%mm3   \n\t"
312
        "pand        %%mm5, %%mm1   \n\t"
313
        "pand        %%mm5, %%mm4   \n\t"
314
        "por         %%mm1, %%mm0   \n\t"
315
        "por         %%mm4, %%mm3   \n\t"
316
        "psrld          $5, %%mm0   \n\t"
317
        "pslld         $11, %%mm3   \n\t"
318
        "por         %%mm3, %%mm0   \n\t"
319
        MOVNTQ"      %%mm0, (%0)    \n\t"
320
        "add           $16,  %1     \n\t"
321
        "add            $8,  %0     \n\t"
322
        "2:                         \n\t"
323
        "cmp            %2,  %1     \n\t"
324
        " jb            1b          \n\t"
325
        : "+r" (d), "+r"(s)
326
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
327
    );
328
#else
329
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
330
    __asm__ volatile(
331
        "movq    %0, %%mm7    \n\t"
332
        "movq    %1, %%mm6    \n\t"
333
        ::"m"(red_16mask),"m"(green_16mask));
334
    while (s < mm_end) {
335
        __asm__ volatile(
336
            PREFETCH"    32%1           \n\t"
337
            "movd          %1, %%mm0    \n\t"
338
            "movd         4%1, %%mm3    \n\t"
339
            "punpckldq    8%1, %%mm0    \n\t"
340
            "punpckldq   12%1, %%mm3    \n\t"
341
            "movq       %%mm0, %%mm1    \n\t"
342
            "movq       %%mm0, %%mm2    \n\t"
343
            "movq       %%mm3, %%mm4    \n\t"
344
            "movq       %%mm3, %%mm5    \n\t"
345
            "psrlq         $3, %%mm0    \n\t"
346
            "psrlq         $3, %%mm3    \n\t"
347
            "pand          %2, %%mm0    \n\t"
348
            "pand          %2, %%mm3    \n\t"
349
            "psrlq         $5, %%mm1    \n\t"
350
            "psrlq         $5, %%mm4    \n\t"
351
            "pand       %%mm6, %%mm1    \n\t"
352
            "pand       %%mm6, %%mm4    \n\t"
353
            "psrlq         $8, %%mm2    \n\t"
354
            "psrlq         $8, %%mm5    \n\t"
355
            "pand       %%mm7, %%mm2    \n\t"
356
            "pand       %%mm7, %%mm5    \n\t"
357
            "por        %%mm1, %%mm0    \n\t"
358
            "por        %%mm4, %%mm3    \n\t"
359
            "por        %%mm2, %%mm0    \n\t"
360
            "por        %%mm5, %%mm3    \n\t"
361
            "psllq        $16, %%mm3    \n\t"
362
            "por        %%mm3, %%mm0    \n\t"
363
            MOVNTQ"     %%mm0, %0       \n\t"
364
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
365
        d += 4;
366
        s += 16;
367
    }
368
#endif
369
    __asm__ volatile(SFENCE:::"memory");
370
    __asm__ volatile(EMMS:::"memory");
371
    while (s < end) {
372
        register int rgb = *(const uint32_t*)s; s += 4;
373
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
374
    }
375
}
376

    
377
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
378
{
379
    const uint8_t *s = src;
380
    const uint8_t *end;
381
    const uint8_t *mm_end;
382
    uint16_t *d = (uint16_t *)dst;
383
    end = s + src_size;
384
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
385
    __asm__ volatile(
386
        "movq          %0, %%mm7    \n\t"
387
        "movq          %1, %%mm6    \n\t"
388
        ::"m"(red_16mask),"m"(green_16mask));
389
    mm_end = end - 15;
390
    while (s < mm_end) {
391
        __asm__ volatile(
392
            PREFETCH"    32%1           \n\t"
393
            "movd          %1, %%mm0    \n\t"
394
            "movd         4%1, %%mm3    \n\t"
395
            "punpckldq    8%1, %%mm0    \n\t"
396
            "punpckldq   12%1, %%mm3    \n\t"
397
            "movq       %%mm0, %%mm1    \n\t"
398
            "movq       %%mm0, %%mm2    \n\t"
399
            "movq       %%mm3, %%mm4    \n\t"
400
            "movq       %%mm3, %%mm5    \n\t"
401
            "psllq         $8, %%mm0    \n\t"
402
            "psllq         $8, %%mm3    \n\t"
403
            "pand       %%mm7, %%mm0    \n\t"
404
            "pand       %%mm7, %%mm3    \n\t"
405
            "psrlq         $5, %%mm1    \n\t"
406
            "psrlq         $5, %%mm4    \n\t"
407
            "pand       %%mm6, %%mm1    \n\t"
408
            "pand       %%mm6, %%mm4    \n\t"
409
            "psrlq        $19, %%mm2    \n\t"
410
            "psrlq        $19, %%mm5    \n\t"
411
            "pand          %2, %%mm2    \n\t"
412
            "pand          %2, %%mm5    \n\t"
413
            "por        %%mm1, %%mm0    \n\t"
414
            "por        %%mm4, %%mm3    \n\t"
415
            "por        %%mm2, %%mm0    \n\t"
416
            "por        %%mm5, %%mm3    \n\t"
417
            "psllq        $16, %%mm3    \n\t"
418
            "por        %%mm3, %%mm0    \n\t"
419
            MOVNTQ"     %%mm0, %0       \n\t"
420
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
421
        d += 4;
422
        s += 16;
423
    }
424
    __asm__ volatile(SFENCE:::"memory");
425
    __asm__ volatile(EMMS:::"memory");
426
    while (s < end) {
427
        register int rgb = *(const uint32_t*)s; s += 4;
428
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
429
    }
430
}
431

    
432
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
433
{
434
    const uint8_t *s = src;
435
    const uint8_t *end;
436
    const uint8_t *mm_end;
437
    uint16_t *d = (uint16_t *)dst;
438
    end = s + src_size;
439
    mm_end = end - 15;
440
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
441
    __asm__ volatile(
442
        "movq           %3, %%mm5   \n\t"
443
        "movq           %4, %%mm6   \n\t"
444
        "movq           %5, %%mm7   \n\t"
445
        "jmp            2f          \n\t"
446
        ".p2align        4          \n\t"
447
        "1:                         \n\t"
448
        PREFETCH"   32(%1)          \n\t"
449
        "movd         (%1), %%mm0   \n\t"
450
        "movd        4(%1), %%mm3   \n\t"
451
        "punpckldq   8(%1), %%mm0   \n\t"
452
        "punpckldq  12(%1), %%mm3   \n\t"
453
        "movq        %%mm0, %%mm1   \n\t"
454
        "movq        %%mm3, %%mm4   \n\t"
455
        "pand        %%mm6, %%mm0   \n\t"
456
        "pand        %%mm6, %%mm3   \n\t"
457
        "pmaddwd     %%mm7, %%mm0   \n\t"
458
        "pmaddwd     %%mm7, %%mm3   \n\t"
459
        "pand        %%mm5, %%mm1   \n\t"
460
        "pand        %%mm5, %%mm4   \n\t"
461
        "por         %%mm1, %%mm0   \n\t"
462
        "por         %%mm4, %%mm3   \n\t"
463
        "psrld          $6, %%mm0   \n\t"
464
        "pslld         $10, %%mm3   \n\t"
465
        "por         %%mm3, %%mm0   \n\t"
466
        MOVNTQ"      %%mm0, (%0)    \n\t"
467
        "add           $16,  %1     \n\t"
468
        "add            $8,  %0     \n\t"
469
        "2:                         \n\t"
470
        "cmp            %2,  %1     \n\t"
471
        " jb            1b          \n\t"
472
        : "+r" (d), "+r"(s)
473
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
474
    );
475
#else
476
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
477
    __asm__ volatile(
478
        "movq          %0, %%mm7    \n\t"
479
        "movq          %1, %%mm6    \n\t"
480
        ::"m"(red_15mask),"m"(green_15mask));
481
    while (s < mm_end) {
482
        __asm__ volatile(
483
            PREFETCH"    32%1           \n\t"
484
            "movd          %1, %%mm0    \n\t"
485
            "movd         4%1, %%mm3    \n\t"
486
            "punpckldq    8%1, %%mm0    \n\t"
487
            "punpckldq   12%1, %%mm3    \n\t"
488
            "movq       %%mm0, %%mm1    \n\t"
489
            "movq       %%mm0, %%mm2    \n\t"
490
            "movq       %%mm3, %%mm4    \n\t"
491
            "movq       %%mm3, %%mm5    \n\t"
492
            "psrlq         $3, %%mm0    \n\t"
493
            "psrlq         $3, %%mm3    \n\t"
494
            "pand          %2, %%mm0    \n\t"
495
            "pand          %2, %%mm3    \n\t"
496
            "psrlq         $6, %%mm1    \n\t"
497
            "psrlq         $6, %%mm4    \n\t"
498
            "pand       %%mm6, %%mm1    \n\t"
499
            "pand       %%mm6, %%mm4    \n\t"
500
            "psrlq         $9, %%mm2    \n\t"
501
            "psrlq         $9, %%mm5    \n\t"
502
            "pand       %%mm7, %%mm2    \n\t"
503
            "pand       %%mm7, %%mm5    \n\t"
504
            "por        %%mm1, %%mm0    \n\t"
505
            "por        %%mm4, %%mm3    \n\t"
506
            "por        %%mm2, %%mm0    \n\t"
507
            "por        %%mm5, %%mm3    \n\t"
508
            "psllq        $16, %%mm3    \n\t"
509
            "por        %%mm3, %%mm0    \n\t"
510
            MOVNTQ"     %%mm0, %0       \n\t"
511
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
512
        d += 4;
513
        s += 16;
514
    }
515
#endif
516
    __asm__ volatile(SFENCE:::"memory");
517
    __asm__ volatile(EMMS:::"memory");
518
    while (s < end) {
519
        register int rgb = *(const uint32_t*)s; s += 4;
520
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
521
    }
522
}
523

    
524
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
525
{
526
    const uint8_t *s = src;
527
    const uint8_t *end;
528
    const uint8_t *mm_end;
529
    uint16_t *d = (uint16_t *)dst;
530
    end = s + src_size;
531
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
532
    __asm__ volatile(
533
        "movq          %0, %%mm7    \n\t"
534
        "movq          %1, %%mm6    \n\t"
535
        ::"m"(red_15mask),"m"(green_15mask));
536
    mm_end = end - 15;
537
    while (s < mm_end) {
538
        __asm__ volatile(
539
            PREFETCH"    32%1           \n\t"
540
            "movd          %1, %%mm0    \n\t"
541
            "movd         4%1, %%mm3    \n\t"
542
            "punpckldq    8%1, %%mm0    \n\t"
543
            "punpckldq   12%1, %%mm3    \n\t"
544
            "movq       %%mm0, %%mm1    \n\t"
545
            "movq       %%mm0, %%mm2    \n\t"
546
            "movq       %%mm3, %%mm4    \n\t"
547
            "movq       %%mm3, %%mm5    \n\t"
548
            "psllq         $7, %%mm0    \n\t"
549
            "psllq         $7, %%mm3    \n\t"
550
            "pand       %%mm7, %%mm0    \n\t"
551
            "pand       %%mm7, %%mm3    \n\t"
552
            "psrlq         $6, %%mm1    \n\t"
553
            "psrlq         $6, %%mm4    \n\t"
554
            "pand       %%mm6, %%mm1    \n\t"
555
            "pand       %%mm6, %%mm4    \n\t"
556
            "psrlq        $19, %%mm2    \n\t"
557
            "psrlq        $19, %%mm5    \n\t"
558
            "pand          %2, %%mm2    \n\t"
559
            "pand          %2, %%mm5    \n\t"
560
            "por        %%mm1, %%mm0    \n\t"
561
            "por        %%mm4, %%mm3    \n\t"
562
            "por        %%mm2, %%mm0    \n\t"
563
            "por        %%mm5, %%mm3    \n\t"
564
            "psllq        $16, %%mm3    \n\t"
565
            "por        %%mm3, %%mm0    \n\t"
566
            MOVNTQ"     %%mm0, %0       \n\t"
567
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
568
        d += 4;
569
        s += 16;
570
    }
571
    __asm__ volatile(SFENCE:::"memory");
572
    __asm__ volatile(EMMS:::"memory");
573
    while (s < end) {
574
        register int rgb = *(const uint32_t*)s; s += 4;
575
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
576
    }
577
}
578

    
579
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
580
{
581
    const uint8_t *s = src;
582
    const uint8_t *end;
583
    const uint8_t *mm_end;
584
    uint16_t *d = (uint16_t *)dst;
585
    end = s + src_size;
586
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
587
    __asm__ volatile(
588
        "movq         %0, %%mm7     \n\t"
589
        "movq         %1, %%mm6     \n\t"
590
        ::"m"(red_16mask),"m"(green_16mask));
591
    mm_end = end - 11;
592
    while (s < mm_end) {
593
        __asm__ volatile(
594
            PREFETCH"    32%1           \n\t"
595
            "movd          %1, %%mm0    \n\t"
596
            "movd         3%1, %%mm3    \n\t"
597
            "punpckldq    6%1, %%mm0    \n\t"
598
            "punpckldq    9%1, %%mm3    \n\t"
599
            "movq       %%mm0, %%mm1    \n\t"
600
            "movq       %%mm0, %%mm2    \n\t"
601
            "movq       %%mm3, %%mm4    \n\t"
602
            "movq       %%mm3, %%mm5    \n\t"
603
            "psrlq         $3, %%mm0    \n\t"
604
            "psrlq         $3, %%mm3    \n\t"
605
            "pand          %2, %%mm0    \n\t"
606
            "pand          %2, %%mm3    \n\t"
607
            "psrlq         $5, %%mm1    \n\t"
608
            "psrlq         $5, %%mm4    \n\t"
609
            "pand       %%mm6, %%mm1    \n\t"
610
            "pand       %%mm6, %%mm4    \n\t"
611
            "psrlq         $8, %%mm2    \n\t"
612
            "psrlq         $8, %%mm5    \n\t"
613
            "pand       %%mm7, %%mm2    \n\t"
614
            "pand       %%mm7, %%mm5    \n\t"
615
            "por        %%mm1, %%mm0    \n\t"
616
            "por        %%mm4, %%mm3    \n\t"
617
            "por        %%mm2, %%mm0    \n\t"
618
            "por        %%mm5, %%mm3    \n\t"
619
            "psllq        $16, %%mm3    \n\t"
620
            "por        %%mm3, %%mm0    \n\t"
621
            MOVNTQ"     %%mm0, %0       \n\t"
622
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
623
        d += 4;
624
        s += 12;
625
    }
626
    __asm__ volatile(SFENCE:::"memory");
627
    __asm__ volatile(EMMS:::"memory");
628
    while (s < end) {
629
        const int b = *s++;
630
        const int g = *s++;
631
        const int r = *s++;
632
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
633
    }
634
}
635

    
636
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
637
{
638
    const uint8_t *s = src;
639
    const uint8_t *end;
640
    const uint8_t *mm_end;
641
    uint16_t *d = (uint16_t *)dst;
642
    end = s + src_size;
643
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
644
    __asm__ volatile(
645
        "movq         %0, %%mm7     \n\t"
646
        "movq         %1, %%mm6     \n\t"
647
        ::"m"(red_16mask),"m"(green_16mask));
648
    mm_end = end - 15;
649
    while (s < mm_end) {
650
        __asm__ volatile(
651
            PREFETCH"    32%1           \n\t"
652
            "movd          %1, %%mm0    \n\t"
653
            "movd         3%1, %%mm3    \n\t"
654
            "punpckldq    6%1, %%mm0    \n\t"
655
            "punpckldq    9%1, %%mm3    \n\t"
656
            "movq       %%mm0, %%mm1    \n\t"
657
            "movq       %%mm0, %%mm2    \n\t"
658
            "movq       %%mm3, %%mm4    \n\t"
659
            "movq       %%mm3, %%mm5    \n\t"
660
            "psllq         $8, %%mm0    \n\t"
661
            "psllq         $8, %%mm3    \n\t"
662
            "pand       %%mm7, %%mm0    \n\t"
663
            "pand       %%mm7, %%mm3    \n\t"
664
            "psrlq         $5, %%mm1    \n\t"
665
            "psrlq         $5, %%mm4    \n\t"
666
            "pand       %%mm6, %%mm1    \n\t"
667
            "pand       %%mm6, %%mm4    \n\t"
668
            "psrlq        $19, %%mm2    \n\t"
669
            "psrlq        $19, %%mm5    \n\t"
670
            "pand          %2, %%mm2    \n\t"
671
            "pand          %2, %%mm5    \n\t"
672
            "por        %%mm1, %%mm0    \n\t"
673
            "por        %%mm4, %%mm3    \n\t"
674
            "por        %%mm2, %%mm0    \n\t"
675
            "por        %%mm5, %%mm3    \n\t"
676
            "psllq        $16, %%mm3    \n\t"
677
            "por        %%mm3, %%mm0    \n\t"
678
            MOVNTQ"     %%mm0, %0       \n\t"
679
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
680
        d += 4;
681
        s += 12;
682
    }
683
    __asm__ volatile(SFENCE:::"memory");
684
    __asm__ volatile(EMMS:::"memory");
685
    while (s < end) {
686
        const int r = *s++;
687
        const int g = *s++;
688
        const int b = *s++;
689
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
690
    }
691
}
692

    
693
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
694
{
695
    const uint8_t *s = src;
696
    const uint8_t *end;
697
    const uint8_t *mm_end;
698
    uint16_t *d = (uint16_t *)dst;
699
    end = s + src_size;
700
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
701
    __asm__ volatile(
702
        "movq          %0, %%mm7    \n\t"
703
        "movq          %1, %%mm6    \n\t"
704
        ::"m"(red_15mask),"m"(green_15mask));
705
    mm_end = end - 11;
706
    while (s < mm_end) {
707
        __asm__ volatile(
708
            PREFETCH"    32%1           \n\t"
709
            "movd          %1, %%mm0    \n\t"
710
            "movd         3%1, %%mm3    \n\t"
711
            "punpckldq    6%1, %%mm0    \n\t"
712
            "punpckldq    9%1, %%mm3    \n\t"
713
            "movq       %%mm0, %%mm1    \n\t"
714
            "movq       %%mm0, %%mm2    \n\t"
715
            "movq       %%mm3, %%mm4    \n\t"
716
            "movq       %%mm3, %%mm5    \n\t"
717
            "psrlq         $3, %%mm0    \n\t"
718
            "psrlq         $3, %%mm3    \n\t"
719
            "pand          %2, %%mm0    \n\t"
720
            "pand          %2, %%mm3    \n\t"
721
            "psrlq         $6, %%mm1    \n\t"
722
            "psrlq         $6, %%mm4    \n\t"
723
            "pand       %%mm6, %%mm1    \n\t"
724
            "pand       %%mm6, %%mm4    \n\t"
725
            "psrlq         $9, %%mm2    \n\t"
726
            "psrlq         $9, %%mm5    \n\t"
727
            "pand       %%mm7, %%mm2    \n\t"
728
            "pand       %%mm7, %%mm5    \n\t"
729
            "por        %%mm1, %%mm0    \n\t"
730
            "por        %%mm4, %%mm3    \n\t"
731
            "por        %%mm2, %%mm0    \n\t"
732
            "por        %%mm5, %%mm3    \n\t"
733
            "psllq        $16, %%mm3    \n\t"
734
            "por        %%mm3, %%mm0    \n\t"
735
            MOVNTQ"     %%mm0, %0       \n\t"
736
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
737
        d += 4;
738
        s += 12;
739
    }
740
    __asm__ volatile(SFENCE:::"memory");
741
    __asm__ volatile(EMMS:::"memory");
742
    while (s < end) {
743
        const int b = *s++;
744
        const int g = *s++;
745
        const int r = *s++;
746
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
747
    }
748
}
749

    
750
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
751
{
752
    const uint8_t *s = src;
753
    const uint8_t *end;
754
    const uint8_t *mm_end;
755
    uint16_t *d = (uint16_t *)dst;
756
    end = s + src_size;
757
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
758
    __asm__ volatile(
759
        "movq         %0, %%mm7     \n\t"
760
        "movq         %1, %%mm6     \n\t"
761
        ::"m"(red_15mask),"m"(green_15mask));
762
    mm_end = end - 15;
763
    while (s < mm_end) {
764
        __asm__ volatile(
765
            PREFETCH"   32%1            \n\t"
766
            "movd         %1, %%mm0     \n\t"
767
            "movd        3%1, %%mm3     \n\t"
768
            "punpckldq   6%1, %%mm0     \n\t"
769
            "punpckldq   9%1, %%mm3     \n\t"
770
            "movq      %%mm0, %%mm1     \n\t"
771
            "movq      %%mm0, %%mm2     \n\t"
772
            "movq      %%mm3, %%mm4     \n\t"
773
            "movq      %%mm3, %%mm5     \n\t"
774
            "psllq        $7, %%mm0     \n\t"
775
            "psllq        $7, %%mm3     \n\t"
776
            "pand      %%mm7, %%mm0     \n\t"
777
            "pand      %%mm7, %%mm3     \n\t"
778
            "psrlq        $6, %%mm1     \n\t"
779
            "psrlq        $6, %%mm4     \n\t"
780
            "pand      %%mm6, %%mm1     \n\t"
781
            "pand      %%mm6, %%mm4     \n\t"
782
            "psrlq       $19, %%mm2     \n\t"
783
            "psrlq       $19, %%mm5     \n\t"
784
            "pand         %2, %%mm2     \n\t"
785
            "pand         %2, %%mm5     \n\t"
786
            "por       %%mm1, %%mm0     \n\t"
787
            "por       %%mm4, %%mm3     \n\t"
788
            "por       %%mm2, %%mm0     \n\t"
789
            "por       %%mm5, %%mm3     \n\t"
790
            "psllq       $16, %%mm3     \n\t"
791
            "por       %%mm3, %%mm0     \n\t"
792
            MOVNTQ"    %%mm0, %0        \n\t"
793
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
794
        d += 4;
795
        s += 12;
796
    }
797
    __asm__ volatile(SFENCE:::"memory");
798
    __asm__ volatile(EMMS:::"memory");
799
    while (s < end) {
800
        const int r = *s++;
801
        const int g = *s++;
802
        const int b = *s++;
803
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
804
    }
805
}
806

    
807
/*
  A less accurate approximation is used here: the input value is simply
  left-shifted and the low-order bits are filled with zeroes. This method
  improves PNG compression, but it cannot reproduce white exactly, since it
  does not generate an all-ones maximum value; the net effect is to darken
  the image slightly.

  The better method would be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      leftmost bits repeated to fill open bits
       |
   original bits
*/
828
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
829
{
830
    const uint16_t *end;
831
    const uint16_t *mm_end;
832
    uint8_t *d = dst;
833
    const uint16_t *s = (const uint16_t*)src;
834
    end = s + src_size/2;
835
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
836
    mm_end = end - 7;
837
    while (s < mm_end) {
838
        __asm__ volatile(
839
            PREFETCH"    32%1           \n\t"
840
            "movq          %1, %%mm0    \n\t"
841
            "movq          %1, %%mm1    \n\t"
842
            "movq          %1, %%mm2    \n\t"
843
            "pand          %2, %%mm0    \n\t"
844
            "pand          %3, %%mm1    \n\t"
845
            "pand          %4, %%mm2    \n\t"
846
            "psllq         $3, %%mm0    \n\t"
847
            "psrlq         $2, %%mm1    \n\t"
848
            "psrlq         $7, %%mm2    \n\t"
849
            "movq       %%mm0, %%mm3    \n\t"
850
            "movq       %%mm1, %%mm4    \n\t"
851
            "movq       %%mm2, %%mm5    \n\t"
852
            "punpcklwd     %5, %%mm0    \n\t"
853
            "punpcklwd     %5, %%mm1    \n\t"
854
            "punpcklwd     %5, %%mm2    \n\t"
855
            "punpckhwd     %5, %%mm3    \n\t"
856
            "punpckhwd     %5, %%mm4    \n\t"
857
            "punpckhwd     %5, %%mm5    \n\t"
858
            "psllq         $8, %%mm1    \n\t"
859
            "psllq        $16, %%mm2    \n\t"
860
            "por        %%mm1, %%mm0    \n\t"
861
            "por        %%mm2, %%mm0    \n\t"
862
            "psllq         $8, %%mm4    \n\t"
863
            "psllq        $16, %%mm5    \n\t"
864
            "por        %%mm4, %%mm3    \n\t"
865
            "por        %%mm5, %%mm3    \n\t"
866

    
867
            "movq       %%mm0, %%mm6    \n\t"
868
            "movq       %%mm3, %%mm7    \n\t"
869

    
870
            "movq         8%1, %%mm0    \n\t"
871
            "movq         8%1, %%mm1    \n\t"
872
            "movq         8%1, %%mm2    \n\t"
873
            "pand          %2, %%mm0    \n\t"
874
            "pand          %3, %%mm1    \n\t"
875
            "pand          %4, %%mm2    \n\t"
876
            "psllq         $3, %%mm0    \n\t"
877
            "psrlq         $2, %%mm1    \n\t"
878
            "psrlq         $7, %%mm2    \n\t"
879
            "movq       %%mm0, %%mm3    \n\t"
880
            "movq       %%mm1, %%mm4    \n\t"
881
            "movq       %%mm2, %%mm5    \n\t"
882
            "punpcklwd     %5, %%mm0    \n\t"
883
            "punpcklwd     %5, %%mm1    \n\t"
884
            "punpcklwd     %5, %%mm2    \n\t"
885
            "punpckhwd     %5, %%mm3    \n\t"
886
            "punpckhwd     %5, %%mm4    \n\t"
887
            "punpckhwd     %5, %%mm5    \n\t"
888
            "psllq         $8, %%mm1    \n\t"
889
            "psllq        $16, %%mm2    \n\t"
890
            "por        %%mm1, %%mm0    \n\t"
891
            "por        %%mm2, %%mm0    \n\t"
892
            "psllq         $8, %%mm4    \n\t"
893
            "psllq        $16, %%mm5    \n\t"
894
            "por        %%mm4, %%mm3    \n\t"
895
            "por        %%mm5, %%mm3    \n\t"
896

    
897
            :"=m"(*d)
898
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
899
            :"memory");
900
        /* borrowed 32 to 24 */
901
        __asm__ volatile(
902
            "movq       %%mm0, %%mm4    \n\t"
903
            "movq       %%mm3, %%mm5    \n\t"
904
            "movq       %%mm6, %%mm0    \n\t"
905
            "movq       %%mm7, %%mm1    \n\t"
906

    
907
            "movq       %%mm4, %%mm6    \n\t"
908
            "movq       %%mm5, %%mm7    \n\t"
909
            "movq       %%mm0, %%mm2    \n\t"
910
            "movq       %%mm1, %%mm3    \n\t"
911

    
912
            STORE_BGR24_MMX
913

    
914
            :"=m"(*d)
915
            :"m"(*s)
916
            :"memory");
917
        d += 24;
918
        s += 8;
919
    }
920
    __asm__ volatile(SFENCE:::"memory");
921
    __asm__ volatile(EMMS:::"memory");
922
    while (s < end) {
923
        register uint16_t bgr;
924
        bgr = *s++;
925
        *d++ = (bgr&0x1F)<<3;
926
        *d++ = (bgr&0x3E0)>>2;
927
        *d++ = (bgr&0x7C00)>>7;
928
    }
929
}
930

    
931
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
932
{
933
    const uint16_t *end;
934
    const uint16_t *mm_end;
935
    uint8_t *d = (uint8_t *)dst;
936
    const uint16_t *s = (const uint16_t *)src;
937
    end = s + src_size/2;
938
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
939
    mm_end = end - 7;
940
    while (s < mm_end) {
941
        __asm__ volatile(
942
            PREFETCH"    32%1           \n\t"
943
            "movq          %1, %%mm0    \n\t"
944
            "movq          %1, %%mm1    \n\t"
945
            "movq          %1, %%mm2    \n\t"
946
            "pand          %2, %%mm0    \n\t"
947
            "pand          %3, %%mm1    \n\t"
948
            "pand          %4, %%mm2    \n\t"
949
            "psllq         $3, %%mm0    \n\t"
950
            "psrlq         $3, %%mm1    \n\t"
951
            "psrlq         $8, %%mm2    \n\t"
952
            "movq       %%mm0, %%mm3    \n\t"
953
            "movq       %%mm1, %%mm4    \n\t"
954
            "movq       %%mm2, %%mm5    \n\t"
955
            "punpcklwd     %5, %%mm0    \n\t"
956
            "punpcklwd     %5, %%mm1    \n\t"
957
            "punpcklwd     %5, %%mm2    \n\t"
958
            "punpckhwd     %5, %%mm3    \n\t"
959
            "punpckhwd     %5, %%mm4    \n\t"
960
            "punpckhwd     %5, %%mm5    \n\t"
961
            "psllq         $8, %%mm1    \n\t"
962
            "psllq        $16, %%mm2    \n\t"
963
            "por        %%mm1, %%mm0    \n\t"
964
            "por        %%mm2, %%mm0    \n\t"
965
            "psllq         $8, %%mm4    \n\t"
966
            "psllq        $16, %%mm5    \n\t"
967
            "por        %%mm4, %%mm3    \n\t"
968
            "por        %%mm5, %%mm3    \n\t"
969

    
970
            "movq       %%mm0, %%mm6    \n\t"
971
            "movq       %%mm3, %%mm7    \n\t"
972

    
973
            "movq         8%1, %%mm0    \n\t"
974
            "movq         8%1, %%mm1    \n\t"
975
            "movq         8%1, %%mm2    \n\t"
976
            "pand          %2, %%mm0    \n\t"
977
            "pand          %3, %%mm1    \n\t"
978
            "pand          %4, %%mm2    \n\t"
979
            "psllq         $3, %%mm0    \n\t"
980
            "psrlq         $3, %%mm1    \n\t"
981
            "psrlq         $8, %%mm2    \n\t"
982
            "movq       %%mm0, %%mm3    \n\t"
983
            "movq       %%mm1, %%mm4    \n\t"
984
            "movq       %%mm2, %%mm5    \n\t"
985
            "punpcklwd     %5, %%mm0    \n\t"
986
            "punpcklwd     %5, %%mm1    \n\t"
987
            "punpcklwd     %5, %%mm2    \n\t"
988
            "punpckhwd     %5, %%mm3    \n\t"
989
            "punpckhwd     %5, %%mm4    \n\t"
990
            "punpckhwd     %5, %%mm5    \n\t"
991
            "psllq         $8, %%mm1    \n\t"
992
            "psllq        $16, %%mm2    \n\t"
993
            "por        %%mm1, %%mm0    \n\t"
994
            "por        %%mm2, %%mm0    \n\t"
995
            "psllq         $8, %%mm4    \n\t"
996
            "psllq        $16, %%mm5    \n\t"
997
            "por        %%mm4, %%mm3    \n\t"
998
            "por        %%mm5, %%mm3    \n\t"
999
            :"=m"(*d)
1000
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1001
            :"memory");
1002
        /* borrowed 32 to 24 */
1003
        __asm__ volatile(
1004
            "movq       %%mm0, %%mm4    \n\t"
1005
            "movq       %%mm3, %%mm5    \n\t"
1006
            "movq       %%mm6, %%mm0    \n\t"
1007
            "movq       %%mm7, %%mm1    \n\t"
1008

    
1009
            "movq       %%mm4, %%mm6    \n\t"
1010
            "movq       %%mm5, %%mm7    \n\t"
1011
            "movq       %%mm0, %%mm2    \n\t"
1012
            "movq       %%mm1, %%mm3    \n\t"
1013

    
1014
            STORE_BGR24_MMX
1015

    
1016
            :"=m"(*d)
1017
            :"m"(*s)
1018
            :"memory");
1019
        d += 24;
1020
        s += 8;
1021
    }
1022
    __asm__ volatile(SFENCE:::"memory");
1023
    __asm__ volatile(EMMS:::"memory");
1024
    while (s < end) {
1025
        register uint16_t bgr;
1026
        bgr = *s++;
1027
        *d++ = (bgr&0x1F)<<3;
1028
        *d++ = (bgr&0x7E0)>>3;
1029
        *d++ = (bgr&0xF800)>>8;
1030
    }
1031
}
1032

    
1033
/*
 * Asm fragment that interleaves four pixels' worth of separated channels
 * into four packed 4-byte pixels and stores them with two MOVNTQ writes
 * to %0 and 8%0. mm3 is overwritten as scratch.
 *
 * Register contents expected on entry:
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 *
 * The final line deliberately carries no line-continuation backslash, so
 * the macro does not depend on the line that follows it being blank.
 */
#define PACK_RGB32 \
    "packuswb   %%mm7, %%mm0    \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb   %%mm7, %%mm1    \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb   %%mm7, %%mm2    \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw  %%mm1, %%mm0    \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw  %%mm6, %%mm2    \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq       %%mm0, %%mm3    \n\t"                               \
    "punpcklwd  %%mm2, %%mm0    \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd  %%mm2, %%mm3    \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ"     %%mm0,  %0      \n\t"                               \
    MOVNTQ"     %%mm3, 8%0      \n\t"
1051

    
1052
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1053
{
1054
    const uint16_t *end;
1055
    const uint16_t *mm_end;
1056
    uint8_t *d = dst;
1057
    const uint16_t *s = (const uint16_t *)src;
1058
    end = s + src_size/2;
1059
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
1060
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
1061
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
1062
    mm_end = end - 3;
1063
    while (s < mm_end) {
1064
        __asm__ volatile(
1065
            PREFETCH"    32%1           \n\t"
1066
            "movq          %1, %%mm0    \n\t"
1067
            "movq          %1, %%mm1    \n\t"
1068
            "movq          %1, %%mm2    \n\t"
1069
            "pand          %2, %%mm0    \n\t"
1070
            "pand          %3, %%mm1    \n\t"
1071
            "pand          %4, %%mm2    \n\t"
1072
            "psllq         $3, %%mm0    \n\t"
1073
            "psrlq         $2, %%mm1    \n\t"
1074
            "psrlq         $7, %%mm2    \n\t"
1075
            PACK_RGB32
1076
            :"=m"(*d)
1077
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1078
            :"memory");
1079
        d += 16;
1080
        s += 4;
1081
    }
1082
    __asm__ volatile(SFENCE:::"memory");
1083
    __asm__ volatile(EMMS:::"memory");
1084
    while (s < end) {
1085
        register uint16_t bgr;
1086
        bgr = *s++;
1087
        *d++ = (bgr&0x1F)<<3;
1088
        *d++ = (bgr&0x3E0)>>2;
1089
        *d++ = (bgr&0x7C00)>>7;
1090
        *d++ = 255;
1091
    }
1092
}
1093

    
1094
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1095
{
1096
    const uint16_t *end;
1097
    const uint16_t *mm_end;
1098
    uint8_t *d = dst;
1099
    const uint16_t *s = (const uint16_t*)src;
1100
    end = s + src_size/2;
1101
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
1102
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
1103
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
1104
    mm_end = end - 3;
1105
    while (s < mm_end) {
1106
        __asm__ volatile(
1107
            PREFETCH"    32%1           \n\t"
1108
            "movq          %1, %%mm0    \n\t"
1109
            "movq          %1, %%mm1    \n\t"
1110
            "movq          %1, %%mm2    \n\t"
1111
            "pand          %2, %%mm0    \n\t"
1112
            "pand          %3, %%mm1    \n\t"
1113
            "pand          %4, %%mm2    \n\t"
1114
            "psllq         $3, %%mm0    \n\t"
1115
            "psrlq         $3, %%mm1    \n\t"
1116
            "psrlq         $8, %%mm2    \n\t"
1117
            PACK_RGB32
1118
            :"=m"(*d)
1119
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1120
            :"memory");
1121
        d += 16;
1122
        s += 4;
1123
    }
1124
    __asm__ volatile(SFENCE:::"memory");
1125
    __asm__ volatile(EMMS:::"memory");
1126
    while (s < end) {
1127
        register uint16_t bgr;
1128
        bgr = *s++;
1129
        *d++ = (bgr&0x1F)<<3;
1130
        *d++ = (bgr&0x7E0)>>3;
1131
        *d++ = (bgr&0xF800)>>8;
1132
        *d++ = 255;
1133
    }
1134
}
1135

    
1136
static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, long src_size)
1137
{
1138
    x86_reg idx = 15 - src_size;
1139
    const uint8_t *s = src-idx;
1140
    uint8_t *d = dst-idx;
1141
    __asm__ volatile(
1142
        "test          %0, %0           \n\t"
1143
        "jns           2f               \n\t"
1144
        PREFETCH"       (%1, %0)        \n\t"
1145
        "movq          %3, %%mm7        \n\t"
1146
        "pxor          %4, %%mm7        \n\t"
1147
        "movq       %%mm7, %%mm6        \n\t"
1148
        "pxor          %5, %%mm7        \n\t"
1149
        ".p2align       4               \n\t"
1150
        "1:                             \n\t"
1151
        PREFETCH"     32(%1, %0)        \n\t"
1152
        "movq           (%1, %0), %%mm0 \n\t"
1153
        "movq          8(%1, %0), %%mm1 \n\t"
1154
# if COMPILE_TEMPLATE_MMX2
1155
        "pshufw      $177, %%mm0, %%mm3 \n\t"
1156
        "pshufw      $177, %%mm1, %%mm5 \n\t"
1157
        "pand       %%mm7, %%mm0        \n\t"
1158
        "pand       %%mm6, %%mm3        \n\t"
1159
        "pand       %%mm7, %%mm1        \n\t"
1160
        "pand       %%mm6, %%mm5        \n\t"
1161
        "por        %%mm3, %%mm0        \n\t"
1162
        "por        %%mm5, %%mm1        \n\t"
1163
# else
1164
        "movq       %%mm0, %%mm2        \n\t"
1165
        "movq       %%mm1, %%mm4        \n\t"
1166
        "pand       %%mm7, %%mm0        \n\t"
1167
        "pand       %%mm6, %%mm2        \n\t"
1168
        "pand       %%mm7, %%mm1        \n\t"
1169
        "pand       %%mm6, %%mm4        \n\t"
1170
        "movq       %%mm2, %%mm3        \n\t"
1171
        "movq       %%mm4, %%mm5        \n\t"
1172
        "pslld        $16, %%mm2        \n\t"
1173
        "psrld        $16, %%mm3        \n\t"
1174
        "pslld        $16, %%mm4        \n\t"
1175
        "psrld        $16, %%mm5        \n\t"
1176
        "por        %%mm2, %%mm0        \n\t"
1177
        "por        %%mm4, %%mm1        \n\t"
1178
        "por        %%mm3, %%mm0        \n\t"
1179
        "por        %%mm5, %%mm1        \n\t"
1180
# endif
1181
        MOVNTQ"     %%mm0,  (%2, %0)    \n\t"
1182
        MOVNTQ"     %%mm1, 8(%2, %0)    \n\t"
1183
        "add          $16, %0           \n\t"
1184
        "js            1b               \n\t"
1185
        SFENCE"                         \n\t"
1186
        EMMS"                           \n\t"
1187
        "2:                             \n\t"
1188
        : "+&r"(idx)
1189
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1190
        : "memory");
1191
    for (; idx<15; idx+=4) {
1192
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1193
        v &= 0xff00ff;
1194
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1195
    }
1196
}
1197

    
1198
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1199
{
1200
    unsigned i;
1201
    x86_reg mmx_size= 23 - src_size;
1202
    __asm__ volatile (
1203
        "test             %%"REG_a", %%"REG_a"          \n\t"
1204
        "jns                     2f                     \n\t"
1205
        "movq     "MANGLE(mask24r)", %%mm5              \n\t"
1206
        "movq     "MANGLE(mask24g)", %%mm6              \n\t"
1207
        "movq     "MANGLE(mask24b)", %%mm7              \n\t"
1208
        ".p2align                 4                     \n\t"
1209
        "1:                                             \n\t"
1210
        PREFETCH" 32(%1, %%"REG_a")                     \n\t"
1211
        "movq       (%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
1212
        "movq       (%1, %%"REG_a"), %%mm1              \n\t" // BGR BGR BG
1213
        "movq      2(%1, %%"REG_a"), %%mm2              \n\t" // R BGR BGR B
1214
        "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
1215
        "pand                 %%mm5, %%mm0              \n\t"
1216
        "pand                 %%mm6, %%mm1              \n\t"
1217
        "pand                 %%mm7, %%mm2              \n\t"
1218
        "por                  %%mm0, %%mm1              \n\t"
1219
        "por                  %%mm2, %%mm1              \n\t"
1220
        "movq      6(%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
1221
        MOVNTQ"               %%mm1,   (%2, %%"REG_a")  \n\t" // RGB RGB RG
1222
        "movq      8(%1, %%"REG_a"), %%mm1              \n\t" // R BGR BGR B
1223
        "movq     10(%1, %%"REG_a"), %%mm2              \n\t" // GR BGR BGR
1224
        "pand                 %%mm7, %%mm0              \n\t"
1225
        "pand                 %%mm5, %%mm1              \n\t"
1226
        "pand                 %%mm6, %%mm2              \n\t"
1227
        "por                  %%mm0, %%mm1              \n\t"
1228
        "por                  %%mm2, %%mm1              \n\t"
1229
        "movq     14(%1, %%"REG_a"), %%mm0              \n\t" // R BGR BGR B
1230
        MOVNTQ"               %%mm1,  8(%2, %%"REG_a")  \n\t" // B RGB RGB R
1231
        "movq     16(%1, %%"REG_a"), %%mm1              \n\t" // GR BGR BGR
1232
        "movq     18(%1, %%"REG_a"), %%mm2              \n\t" // BGR BGR BG
1233
        "pand                 %%mm6, %%mm0              \n\t"
1234
        "pand                 %%mm7, %%mm1              \n\t"
1235
        "pand                 %%mm5, %%mm2              \n\t"
1236
        "por                  %%mm0, %%mm1              \n\t"
1237
        "por                  %%mm2, %%mm1              \n\t"
1238
        MOVNTQ"               %%mm1, 16(%2, %%"REG_a")  \n\t"
1239
        "add                    $24, %%"REG_a"          \n\t"
1240
        " js                     1b                     \n\t"
1241
        "2:                                             \n\t"
1242
        : "+a" (mmx_size)
1243
        : "r" (src-mmx_size), "r"(dst-mmx_size)
1244
    );
1245

    
1246
    __asm__ volatile(SFENCE:::"memory");
1247
    __asm__ volatile(EMMS:::"memory");
1248

    
1249
    if (mmx_size==23) return; //finished, was multiple of 8
1250

    
1251
    src+= src_size;
1252
    dst+= src_size;
1253
    src_size= 23-mmx_size;
1254
    src-= src_size;
1255
    dst-= src_size;
1256
    for (i=0; i<src_size; i+=3) {
1257
        register uint8_t x;
1258
        x          = src[i + 2];
1259
        dst[i + 1] = src[i + 1];
1260
        dst[i + 2] = src[i + 0];
1261
        dst[i + 0] = x;
1262
    }
1263
}
1264

    
1265
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1266
                                           long width, long height,
1267
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1268
{
1269
    long y;
1270
    const x86_reg chromWidth= width>>1;
1271
    for (y=0; y<height; y++) {
1272
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1273
        __asm__ volatile(
1274
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
1275
            ".p2align                    4              \n\t"
1276
            "1:                                         \n\t"
1277
            PREFETCH"    32(%1, %%"REG_a", 2)           \n\t"
1278
            PREFETCH"    32(%2, %%"REG_a")              \n\t"
1279
            PREFETCH"    32(%3, %%"REG_a")              \n\t"
1280
            "movq          (%2, %%"REG_a"), %%mm0       \n\t" // U(0)
1281
            "movq                    %%mm0, %%mm2       \n\t" // U(0)
1282
            "movq          (%3, %%"REG_a"), %%mm1       \n\t" // V(0)
1283
            "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
1284
            "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)
1285

    
1286
            "movq        (%1, %%"REG_a",2), %%mm3       \n\t" // Y(0)
1287
            "movq       8(%1, %%"REG_a",2), %%mm5       \n\t" // Y(8)
1288
            "movq                    %%mm3, %%mm4       \n\t" // Y(0)
1289
            "movq                    %%mm5, %%mm6       \n\t" // Y(8)
1290
            "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
1291
            "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
1292
            "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
1293
            "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)
1294

    
1295
            MOVNTQ"                  %%mm3,   (%0, %%"REG_a", 4)    \n\t"
1296
            MOVNTQ"                  %%mm4,  8(%0, %%"REG_a", 4)    \n\t"
1297
            MOVNTQ"                  %%mm5, 16(%0, %%"REG_a", 4)    \n\t"
1298
            MOVNTQ"                  %%mm6, 24(%0, %%"REG_a", 4)    \n\t"
1299

    
1300
            "add                        $8, %%"REG_a"   \n\t"
1301
            "cmp                        %4, %%"REG_a"   \n\t"
1302
            " jb                        1b              \n\t"
1303
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1304
            : "%"REG_a
1305
        );
1306
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1307
            usrc += chromStride;
1308
            vsrc += chromStride;
1309
        }
1310
        ysrc += lumStride;
1311
        dst  += dstStride;
1312
    }
1313
    __asm__(EMMS"       \n\t"
1314
            SFENCE"     \n\t"
1315
            :::"memory");
1316
}
1317

    
1318
/**
1319
 * Height should be a multiple of 2 and width should be a multiple of 16.
1320
 * (If this is a problem for anyone then tell me, and I will fix it.)
1321
 */
1322
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1323
                                      long width, long height,
1324
                                      long lumStride, long chromStride, long dstStride)
1325
{
1326
    //FIXME interpolate chroma
1327
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1328
}
1329

    
1330
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1331
                                           long width, long height,
1332
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1333
{
1334
    long y;
1335
    const x86_reg chromWidth= width>>1;
1336
    for (y=0; y<height; y++) {
1337
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1338
        __asm__ volatile(
1339
            "xor                %%"REG_a", %%"REG_a"    \n\t"
1340
            ".p2align                   4               \n\t"
1341
            "1:                                         \n\t"
1342
            PREFETCH"   32(%1, %%"REG_a", 2)            \n\t"
1343
            PREFETCH"   32(%2, %%"REG_a")               \n\t"
1344
            PREFETCH"   32(%3, %%"REG_a")               \n\t"
1345
            "movq         (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
1346
            "movq                   %%mm0, %%mm2        \n\t" // U(0)
1347
            "movq         (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
1348
            "punpcklbw              %%mm1, %%mm0        \n\t" // UVUV UVUV(0)
1349
            "punpckhbw              %%mm1, %%mm2        \n\t" // UVUV UVUV(8)
1350

    
1351
            "movq       (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
1352
            "movq      8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
1353
            "movq                   %%mm0, %%mm4        \n\t" // Y(0)
1354
            "movq                   %%mm2, %%mm6        \n\t" // Y(8)
1355
            "punpcklbw              %%mm3, %%mm0        \n\t" // YUYV YUYV(0)
1356
            "punpckhbw              %%mm3, %%mm4        \n\t" // YUYV YUYV(4)
1357
            "punpcklbw              %%mm5, %%mm2        \n\t" // YUYV YUYV(8)
1358
            "punpckhbw              %%mm5, %%mm6        \n\t" // YUYV YUYV(12)
1359

    
1360
            MOVNTQ"                 %%mm0,   (%0, %%"REG_a", 4)     \n\t"
1361
            MOVNTQ"                 %%mm4,  8(%0, %%"REG_a", 4)     \n\t"
1362
            MOVNTQ"                 %%mm2, 16(%0, %%"REG_a", 4)     \n\t"
1363
            MOVNTQ"                 %%mm6, 24(%0, %%"REG_a", 4)     \n\t"
1364

    
1365
            "add                       $8, %%"REG_a"    \n\t"
1366
            "cmp                       %4, %%"REG_a"    \n\t"
1367
            " jb                       1b               \n\t"
1368
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1369
            : "%"REG_a
1370
        );
1371
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1372
            usrc += chromStride;
1373
            vsrc += chromStride;
1374
        }
1375
        ysrc += lumStride;
1376
        dst += dstStride;
1377
    }
1378
    __asm__(EMMS"       \n\t"
1379
            SFENCE"     \n\t"
1380
            :::"memory");
1381
}
1382

    
1383
/**
1384
 * Height should be a multiple of 2 and width should be a multiple of 16
1385
 * (If this is a problem for anyone then tell me, and I will fix it.)
1386
 */
1387
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1388
                                      long width, long height,
1389
                                      long lumStride, long chromStride, long dstStride)
1390
{
1391
    //FIXME interpolate chroma
1392
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1393
}
1394

    
1395
/**
1396
 * Width should be a multiple of 16.
1397
 */
1398
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1399
                                         long width, long height,
1400
                                         long lumStride, long chromStride, long dstStride)
1401
{
1402
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1403
}
1404

    
1405
/**
1406
 * Width should be a multiple of 16.
1407
 */
1408
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1409
                                         long width, long height,
1410
                                         long lumStride, long chromStride, long dstStride)
1411
{
1412
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1413
}
1414

    
1415
/**
1416
 * Height should be a multiple of 2 and width should be a multiple of 16.
1417
 * (If this is a problem for anyone then tell me, and I will fix it.)
1418
 */
1419
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1420
                                      long width, long height,
1421
                                      long lumStride, long chromStride, long srcStride)
1422
{
1423
    long y;
1424
    const x86_reg chromWidth= width>>1;
1425
    for (y=0; y<height; y+=2) {
1426
        __asm__ volatile(
1427
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
1428
            "pcmpeqw                 %%mm7, %%mm7       \n\t"
1429
            "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00...
1430
            ".p2align                    4              \n\t"
1431
            "1:                \n\t"
1432
            PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
1433
            "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
1434
            "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
1435
            "movq                    %%mm0, %%mm2       \n\t" // YUYV YUYV(0)
1436
            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(4)
1437
            "psrlw                      $8, %%mm0       \n\t" // U0V0 U0V0(0)
1438
            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(4)
1439
            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
1440
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
1441
            "packuswb                %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
1442
            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(0)
1443

    
1444
            MOVNTQ"                  %%mm2, (%1, %%"REG_a", 2)  \n\t"
1445

    
1446
            "movq     16(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
1447
            "movq     24(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
1448
            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(8)
1449
            "movq                    %%mm2, %%mm4       \n\t" // YUYV YUYV(12)
1450
            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(8)
1451
            "psrlw                      $8, %%mm2       \n\t" // U0V0 U0V0(12)
1452
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(8)
1453
            "pand                    %%mm7, %%mm4       \n\t" // Y0Y0 Y0Y0(12)
1454
            "packuswb                %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
1455
            "packuswb                %%mm4, %%mm3       \n\t" // YYYY YYYY(8)
1456

    
1457
            MOVNTQ"                  %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1458

    
1459
            "movq                    %%mm0, %%mm2       \n\t" // UVUV UVUV(0)
1460
            "movq                    %%mm1, %%mm3       \n\t" // UVUV UVUV(8)
1461
            "psrlw                      $8, %%mm0       \n\t" // V0V0 V0V0(0)
1462
            "psrlw                      $8, %%mm1       \n\t" // V0V0 V0V0(8)
1463
            "pand                    %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
1464
            "pand                    %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
1465
            "packuswb                %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
1466
            "packuswb                %%mm3, %%mm2       \n\t" // UUUU UUUU(0)
1467

    
1468
            MOVNTQ"                  %%mm0, (%3, %%"REG_a")     \n\t"
1469
            MOVNTQ"                  %%mm2, (%2, %%"REG_a")     \n\t"
1470

    
1471
            "add                        $8, %%"REG_a"   \n\t"
1472
            "cmp                        %4, %%"REG_a"   \n\t"
1473
            " jb                        1b              \n\t"
1474
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1475
            : "memory", "%"REG_a
1476
        );
1477

    
1478
        ydst += lumStride;
1479
        src  += srcStride;
1480

    
1481
        __asm__ volatile(
1482
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
1483
            ".p2align                    4              \n\t"
1484
            "1:                                         \n\t"
1485
            PREFETCH" 64(%0, %%"REG_a", 4)              \n\t"
1486
            "movq       (%0, %%"REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
1487
            "movq      8(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
1488
            "movq     16(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
1489
            "movq     24(%0, %%"REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
1490
            "pand                    %%mm7, %%mm0       \n\t" // Y0Y0 Y0Y0(0)
1491
            "pand                    %%mm7, %%mm1       \n\t" // Y0Y0 Y0Y0(4)
1492
            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(8)
1493
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(12)
1494
            "packuswb                %%mm1, %%mm0       \n\t" // YYYY YYYY(0)
1495
            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(8)
1496

    
1497
            MOVNTQ"                  %%mm0,  (%1, %%"REG_a", 2) \n\t"
1498
            MOVNTQ"                  %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1499

    
1500
            "add                        $8, %%"REG_a"   \n\t"
1501
            "cmp                        %4, %%"REG_a"   \n\t"
1502
            " jb                        1b              \n\t"
1503

    
1504
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1505
            : "memory", "%"REG_a
1506
        );
1507
        udst += chromStride;
1508
        vdst += chromStride;
1509
        ydst += lumStride;
1510
        src  += srcStride;
1511
    }
1512
    __asm__ volatile(EMMS"       \n\t"
1513
                     SFENCE"     \n\t"
1514
                     :::"memory");
1515
}
1516

    
1517
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1518
{
1519
    long x,y;
1520

    
1521
    dst[0]= src[0];
1522

    
1523
    // first line
1524
    for (x=0; x<srcWidth-1; x++) {
1525
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1526
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1527
    }
1528
    dst[2*srcWidth-1]= src[srcWidth-1];
1529

    
1530
    dst+= dstStride;
1531

    
1532
    for (y=1; y<srcHeight; y++) {
1533
#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
1534
        const x86_reg mmxSize= srcWidth&~15;
1535
        __asm__ volatile(
1536
            "mov           %4, %%"REG_a"            \n\t"
1537
            "movq        "MANGLE(mmx_ff)", %%mm0    \n\t"
1538
            "movq         (%0, %%"REG_a"), %%mm4    \n\t"
1539
            "movq                   %%mm4, %%mm2    \n\t"
1540
            "psllq                     $8, %%mm4    \n\t"
1541
            "pand                   %%mm0, %%mm2    \n\t"
1542
            "por                    %%mm2, %%mm4    \n\t"
1543
            "movq         (%1, %%"REG_a"), %%mm5    \n\t"
1544
            "movq                   %%mm5, %%mm3    \n\t"
1545
            "psllq                     $8, %%mm5    \n\t"
1546
            "pand                   %%mm0, %%mm3    \n\t"
1547
            "por                    %%mm3, %%mm5    \n\t"
1548
            "1:                                     \n\t"
1549
            "movq         (%0, %%"REG_a"), %%mm0    \n\t"
1550
            "movq         (%1, %%"REG_a"), %%mm1    \n\t"
1551
            "movq        1(%0, %%"REG_a"), %%mm2    \n\t"
1552
            "movq        1(%1, %%"REG_a"), %%mm3    \n\t"
1553
            PAVGB"                  %%mm0, %%mm5    \n\t"
1554
            PAVGB"                  %%mm0, %%mm3    \n\t"
1555
            PAVGB"                  %%mm0, %%mm5    \n\t"
1556
            PAVGB"                  %%mm0, %%mm3    \n\t"
1557
            PAVGB"                  %%mm1, %%mm4    \n\t"
1558
            PAVGB"                  %%mm1, %%mm2    \n\t"
1559
            PAVGB"                  %%mm1, %%mm4    \n\t"
1560
            PAVGB"                  %%mm1, %%mm2    \n\t"
1561
            "movq                   %%mm5, %%mm7    \n\t"
1562
            "movq                   %%mm4, %%mm6    \n\t"
1563
            "punpcklbw              %%mm3, %%mm5    \n\t"
1564
            "punpckhbw              %%mm3, %%mm7    \n\t"
1565
            "punpcklbw              %%mm2, %%mm4    \n\t"
1566
            "punpckhbw              %%mm2, %%mm6    \n\t"
1567
#if 1
1568
            MOVNTQ"                 %%mm5,  (%2, %%"REG_a", 2)  \n\t"
1569
            MOVNTQ"                 %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
1570
            MOVNTQ"                 %%mm4,  (%3, %%"REG_a", 2)  \n\t"
1571
            MOVNTQ"                 %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
1572
#else
1573
            "movq                   %%mm5,  (%2, %%"REG_a", 2)  \n\t"
1574
            "movq                   %%mm7, 8(%2, %%"REG_a", 2)  \n\t"
1575
            "movq                   %%mm4,  (%3, %%"REG_a", 2)  \n\t"
1576
            "movq                   %%mm6, 8(%3, %%"REG_a", 2)  \n\t"
1577
#endif
1578
            "add                       $8, %%"REG_a"            \n\t"
1579
            "movq       -1(%0, %%"REG_a"), %%mm4    \n\t"
1580
            "movq       -1(%1, %%"REG_a"), %%mm5    \n\t"
1581
            " js                       1b                       \n\t"
1582
            :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1583
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1584
               "g" (-mmxSize)
1585
            : "%"REG_a
1586
        );
1587
#else
1588
        const x86_reg mmxSize=1;
1589

    
1590
        dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1591
        dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1592
#endif
1593

    
1594
        for (x=mmxSize-1; x<srcWidth-1; x++) {
1595
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1596
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1597
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1598
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1599
        }
1600
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1601
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1602

    
1603
        dst+=dstStride*2;
1604
        src+=srcStride;
1605
    }
1606

    
1607
    // last line
1608
#if 1
1609
    dst[0]= src[0];
1610

    
1611
    for (x=0; x<srcWidth-1; x++) {
1612
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1613
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1614
    }
1615
    dst[2*srcWidth-1]= src[srcWidth-1];
1616
#else
1617
    for (x=0; x<srcWidth; x++) {
1618
        dst[2*x+0]=
1619
        dst[2*x+1]= src[x];
1620
    }
1621
#endif
1622

    
1623
    __asm__ volatile(EMMS"       \n\t"
1624
                     SFENCE"     \n\t"
1625
                     :::"memory");
1626
}
1627

    
1628
/**
1629
 * Height should be a multiple of 2 and width should be a multiple of 16.
1630
 * (If this is a problem for anyone then tell me, and I will fix it.)
1631
 * Chrominance data is only taken from every second line, others are ignored.
1632
 * FIXME: Write HQ version.
1633
 */
1634
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1635
                                      long width, long height,
1636
                                      long lumStride, long chromStride, long srcStride)
1637
{
1638
    long y;
1639
    const x86_reg chromWidth= width>>1;
1640
    for (y=0; y<height; y+=2) {
1641
        __asm__ volatile(
1642
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
1643
            "pcmpeqw             %%mm7, %%mm7   \n\t"
1644
            "psrlw                  $8, %%mm7   \n\t" // FF,00,FF,00...
1645
            ".p2align                4          \n\t"
1646
            "1:                                 \n\t"
1647
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
1648
            "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
1649
            "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
1650
            "movq                %%mm0, %%mm2   \n\t" // UYVY UYVY(0)
1651
            "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(4)
1652
            "pand                %%mm7, %%mm0   \n\t" // U0V0 U0V0(0)
1653
            "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(4)
1654
            "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(0)
1655
            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(4)
1656
            "packuswb            %%mm1, %%mm0   \n\t" // UVUV UVUV(0)
1657
            "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(0)
1658

    
1659
            MOVNTQ"              %%mm2,  (%1, %%"REG_a", 2) \n\t"
1660

    
1661
            "movq     16(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(8)
1662
            "movq     24(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(12)
1663
            "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(8)
1664
            "movq                %%mm2, %%mm4   \n\t" // UYVY UYVY(12)
1665
            "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(8)
1666
            "pand                %%mm7, %%mm2   \n\t" // U0V0 U0V0(12)
1667
            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(8)
1668
            "psrlw                  $8, %%mm4   \n\t" // Y0Y0 Y0Y0(12)
1669
            "packuswb            %%mm2, %%mm1   \n\t" // UVUV UVUV(8)
1670
            "packuswb            %%mm4, %%mm3   \n\t" // YYYY YYYY(8)
1671

    
1672
            MOVNTQ"              %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1673

    
1674
            "movq                %%mm0, %%mm2   \n\t" // UVUV UVUV(0)
1675
            "movq                %%mm1, %%mm3   \n\t" // UVUV UVUV(8)
1676
            "psrlw                  $8, %%mm0   \n\t" // V0V0 V0V0(0)
1677
            "psrlw                  $8, %%mm1   \n\t" // V0V0 V0V0(8)
1678
            "pand                %%mm7, %%mm2   \n\t" // U0U0 U0U0(0)
1679
            "pand                %%mm7, %%mm3   \n\t" // U0U0 U0U0(8)
1680
            "packuswb            %%mm1, %%mm0   \n\t" // VVVV VVVV(0)
1681
            "packuswb            %%mm3, %%mm2   \n\t" // UUUU UUUU(0)
1682

    
1683
            MOVNTQ"              %%mm0, (%3, %%"REG_a") \n\t"
1684
            MOVNTQ"              %%mm2, (%2, %%"REG_a") \n\t"
1685

    
1686
            "add                    $8, %%"REG_a"   \n\t"
1687
            "cmp                    %4, %%"REG_a"   \n\t"
1688
            " jb                    1b          \n\t"
1689
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1690
            : "memory", "%"REG_a
1691
        );
1692

    
1693
        ydst += lumStride;
1694
        src  += srcStride;
1695

    
1696
        __asm__ volatile(
1697
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
1698
            ".p2align                    4              \n\t"
1699
            "1:                                 \n\t"
1700
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
1701
            "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // YUYV YUYV(0)
1702
            "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // YUYV YUYV(4)
1703
            "movq     16(%0, %%"REG_a", 4), %%mm2   \n\t" // YUYV YUYV(8)
1704
            "movq     24(%0, %%"REG_a", 4), %%mm3   \n\t" // YUYV YUYV(12)
1705
            "psrlw                  $8, %%mm0   \n\t" // Y0Y0 Y0Y0(0)
1706
            "psrlw                  $8, %%mm1   \n\t" // Y0Y0 Y0Y0(4)
1707
            "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(8)
1708
            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(12)
1709
            "packuswb            %%mm1, %%mm0   \n\t" // YYYY YYYY(0)
1710
            "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(8)
1711

    
1712
            MOVNTQ"              %%mm0,  (%1, %%"REG_a", 2) \n\t"
1713
            MOVNTQ"              %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1714

    
1715
            "add                    $8, %%"REG_a"   \n\t"
1716
            "cmp                    %4, %%"REG_a"   \n\t"
1717
            " jb                    1b          \n\t"
1718

    
1719
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1720
            : "memory", "%"REG_a
1721
        );
1722
        udst += chromStride;
1723
        vdst += chromStride;
1724
        ydst += lumStride;
1725
        src  += srcStride;
1726
    }
1727
    __asm__ volatile(EMMS"       \n\t"
1728
                     SFENCE"     \n\t"
1729
                     :::"memory");
1730
}
1731

    
1732
/**
1733
 * Height should be a multiple of 2 and width should be a multiple of 2.
1734
 * (If this is a problem for anyone then tell me, and I will fix it.)
1735
 * Chrominance data is only taken from every second line,
1736
 * others are ignored in the C version.
1737
 * FIXME: Write HQ version.
1738
 */
1739
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1740
                                       long width, long height,
1741
                                       long lumStride, long chromStride, long srcStride)
1742
{
1743
    long y;
1744
    const x86_reg chromWidth= width>>1;
1745
    for (y=0; y<height-2; y+=2) {
1746
        long i;
1747
        for (i=0; i<2; i++) {
1748
            __asm__ volatile(
1749
                "mov                        %2, %%"REG_a"   \n\t"
1750
                "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
1751
                "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
1752
                "pxor                    %%mm7, %%mm7       \n\t"
1753
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
1754
                ".p2align                    4              \n\t"
1755
                "1:                                         \n\t"
1756
                PREFETCH"    64(%0, %%"REG_d")              \n\t"
1757
                "movd          (%0, %%"REG_d"), %%mm0       \n\t"
1758
                "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
1759
                "punpcklbw               %%mm7, %%mm0       \n\t"
1760
                "punpcklbw               %%mm7, %%mm1       \n\t"
1761
                "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
1762
                "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
1763
                "punpcklbw               %%mm7, %%mm2       \n\t"
1764
                "punpcklbw               %%mm7, %%mm3       \n\t"
1765
                "pmaddwd                 %%mm6, %%mm0       \n\t"
1766
                "pmaddwd                 %%mm6, %%mm1       \n\t"
1767
                "pmaddwd                 %%mm6, %%mm2       \n\t"
1768
                "pmaddwd                 %%mm6, %%mm3       \n\t"
1769
#ifndef FAST_BGR2YV12
1770
                "psrad                      $8, %%mm0       \n\t"
1771
                "psrad                      $8, %%mm1       \n\t"
1772
                "psrad                      $8, %%mm2       \n\t"
1773
                "psrad                      $8, %%mm3       \n\t"
1774
#endif
1775
                "packssdw                %%mm1, %%mm0       \n\t"
1776
                "packssdw                %%mm3, %%mm2       \n\t"
1777
                "pmaddwd                 %%mm5, %%mm0       \n\t"
1778
                "pmaddwd                 %%mm5, %%mm2       \n\t"
1779
                "packssdw                %%mm2, %%mm0       \n\t"
1780
                "psraw                      $7, %%mm0       \n\t"
1781

    
1782
                "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
1783
                "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
1784
                "punpcklbw               %%mm7, %%mm4       \n\t"
1785
                "punpcklbw               %%mm7, %%mm1       \n\t"
1786
                "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
1787
                "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
1788
                "punpcklbw               %%mm7, %%mm2       \n\t"
1789
                "punpcklbw               %%mm7, %%mm3       \n\t"
1790
                "pmaddwd                 %%mm6, %%mm4       \n\t"
1791
                "pmaddwd                 %%mm6, %%mm1       \n\t"
1792
                "pmaddwd                 %%mm6, %%mm2       \n\t"
1793
                "pmaddwd                 %%mm6, %%mm3       \n\t"
1794
#ifndef FAST_BGR2YV12
1795
                "psrad                      $8, %%mm4       \n\t"
1796
                "psrad                      $8, %%mm1       \n\t"
1797
                "psrad                      $8, %%mm2       \n\t"
1798
                "psrad                      $8, %%mm3       \n\t"
1799
#endif
1800
                "packssdw                %%mm1, %%mm4       \n\t"
1801
                "packssdw                %%mm3, %%mm2       \n\t"
1802
                "pmaddwd                 %%mm5, %%mm4       \n\t"
1803
                "pmaddwd                 %%mm5, %%mm2       \n\t"
1804
                "add                       $24, %%"REG_d"   \n\t"
1805
                "packssdw                %%mm2, %%mm4       \n\t"
1806
                "psraw                      $7, %%mm4       \n\t"
1807

    
1808
                "packuswb                %%mm4, %%mm0       \n\t"
1809
                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
1810

    
1811
                MOVNTQ"                  %%mm0, (%1, %%"REG_a") \n\t"
1812
                "add                        $8,      %%"REG_a"  \n\t"
1813
                " js                        1b                  \n\t"
1814
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
1815
                : "%"REG_a, "%"REG_d
1816
            );
1817
            ydst += lumStride;
1818
            src  += srcStride;
1819
        }
1820
        src -= srcStride*2;
1821
        __asm__ volatile(
1822
            "mov                        %4, %%"REG_a"   \n\t"
1823
            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
1824
            "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
1825
            "pxor                    %%mm7, %%mm7       \n\t"
1826
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
1827
            "add                 %%"REG_d", %%"REG_d"   \n\t"
1828
            ".p2align                    4              \n\t"
1829
            "1:                                         \n\t"
1830
            PREFETCH"    64(%0, %%"REG_d")              \n\t"
1831
            PREFETCH"    64(%1, %%"REG_d")              \n\t"
1832
#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
1833
            "movq          (%0, %%"REG_d"), %%mm0       \n\t"
1834
            "movq          (%1, %%"REG_d"), %%mm1       \n\t"
1835
            "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
1836
            "movq         6(%1, %%"REG_d"), %%mm3       \n\t"
1837
            PAVGB"                   %%mm1, %%mm0       \n\t"
1838
            PAVGB"                   %%mm3, %%mm2       \n\t"
1839
            "movq                    %%mm0, %%mm1       \n\t"
1840
            "movq                    %%mm2, %%mm3       \n\t"
1841
            "psrlq                     $24, %%mm0       \n\t"
1842
            "psrlq                     $24, %%mm2       \n\t"
1843
            PAVGB"                   %%mm1, %%mm0       \n\t"
1844
            PAVGB"                   %%mm3, %%mm2       \n\t"
1845
            "punpcklbw               %%mm7, %%mm0       \n\t"
1846
            "punpcklbw               %%mm7, %%mm2       \n\t"
1847
#else
1848
            "movd          (%0, %%"REG_d"), %%mm0       \n\t"
1849
            "movd          (%1, %%"REG_d"), %%mm1       \n\t"
1850
            "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
1851
            "movd         3(%1, %%"REG_d"), %%mm3       \n\t"
1852
            "punpcklbw               %%mm7, %%mm0       \n\t"
1853
            "punpcklbw               %%mm7, %%mm1       \n\t"
1854
            "punpcklbw               %%mm7, %%mm2       \n\t"
1855
            "punpcklbw               %%mm7, %%mm3       \n\t"
1856
            "paddw                   %%mm1, %%mm0       \n\t"
1857
            "paddw                   %%mm3, %%mm2       \n\t"
1858
            "paddw                   %%mm2, %%mm0       \n\t"
1859
            "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
1860
            "movd         6(%1, %%"REG_d"), %%mm1       \n\t"
1861
            "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
1862
            "movd         9(%1, %%"REG_d"), %%mm3       \n\t"
1863
            "punpcklbw               %%mm7, %%mm4       \n\t"
1864
            "punpcklbw               %%mm7, %%mm1       \n\t"
1865
            "punpcklbw               %%mm7, %%mm2       \n\t"
1866
            "punpcklbw               %%mm7, %%mm3       \n\t"
1867
            "paddw                   %%mm1, %%mm4       \n\t"
1868
            "paddw                   %%mm3, %%mm2       \n\t"
1869
            "paddw                   %%mm4, %%mm2       \n\t"
1870
            "psrlw                      $2, %%mm0       \n\t"
1871
            "psrlw                      $2, %%mm2       \n\t"
1872
#endif
1873
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
1874
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
1875

    
1876
            "pmaddwd                 %%mm0, %%mm1       \n\t"
1877
            "pmaddwd                 %%mm2, %%mm3       \n\t"
1878
            "pmaddwd                 %%mm6, %%mm0       \n\t"
1879
            "pmaddwd                 %%mm6, %%mm2       \n\t"
1880
#ifndef FAST_BGR2YV12
1881
            "psrad                      $8, %%mm0       \n\t"
1882
            "psrad                      $8, %%mm1       \n\t"
1883
            "psrad                      $8, %%mm2       \n\t"
1884
            "psrad                      $8, %%mm3       \n\t"
1885
#endif
1886
            "packssdw                %%mm2, %%mm0       \n\t"
1887
            "packssdw                %%mm3, %%mm1       \n\t"
1888
            "pmaddwd                 %%mm5, %%mm0       \n\t"
1889
            "pmaddwd                 %%mm5, %%mm1       \n\t"
1890
            "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
1891
            "psraw                      $7, %%mm0       \n\t"
1892

    
1893
#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
1894
            "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
1895
            "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
1896
            "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
1897
            "movq        18(%1, %%"REG_d"), %%mm3       \n\t"
1898
            PAVGB"                   %%mm1, %%mm4       \n\t"
1899
            PAVGB"                   %%mm3, %%mm2       \n\t"
1900
            "movq                    %%mm4, %%mm1       \n\t"
1901
            "movq                    %%mm2, %%mm3       \n\t"
1902
            "psrlq                     $24, %%mm4       \n\t"
1903
            "psrlq                     $24, %%mm2       \n\t"
1904
            PAVGB"                   %%mm1, %%mm4       \n\t"
1905
            PAVGB"                   %%mm3, %%mm2       \n\t"
1906
            "punpcklbw               %%mm7, %%mm4       \n\t"
1907
            "punpcklbw               %%mm7, %%mm2       \n\t"
1908
#else
1909
            "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
1910
            "movd        12(%1, %%"REG_d"), %%mm1       \n\t"
1911
            "movd        15(%0, %%"REG_d"), %%mm2       \n\t"
1912
            "movd        15(%1, %%"REG_d"), %%mm3       \n\t"
1913
            "punpcklbw               %%mm7, %%mm4       \n\t"
1914
            "punpcklbw               %%mm7, %%mm1       \n\t"
1915
            "punpcklbw               %%mm7, %%mm2       \n\t"
1916
            "punpcklbw               %%mm7, %%mm3       \n\t"
1917
            "paddw                   %%mm1, %%mm4       \n\t"
1918
            "paddw                   %%mm3, %%mm2       \n\t"
1919
            "paddw                   %%mm2, %%mm4       \n\t"
1920
            "movd        18(%0, %%"REG_d"), %%mm5       \n\t"
1921
            "movd        18(%1, %%"REG_d"), %%mm1       \n\t"
1922
            "movd        21(%0, %%"REG_d"), %%mm2       \n\t"
1923
            "movd        21(%1, %%"REG_d"), %%mm3       \n\t"
1924
            "punpcklbw               %%mm7, %%mm5       \n\t"
1925
            "punpcklbw               %%mm7, %%mm1       \n\t"
1926
            "punpcklbw               %%mm7, %%mm2       \n\t"
1927
            "punpcklbw               %%mm7, %%mm3       \n\t"
1928
            "paddw                   %%mm1, %%mm5       \n\t"
1929
            "paddw                   %%mm3, %%mm2       \n\t"
1930
            "paddw                   %%mm5, %%mm2       \n\t"
1931
            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
1932
            "psrlw                      $2, %%mm4       \n\t"
1933
            "psrlw                      $2, %%mm2       \n\t"
1934
#endif
1935
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
1936
            "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
1937

    
1938
            "pmaddwd                 %%mm4, %%mm1       \n\t"
1939
            "pmaddwd                 %%mm2, %%mm3       \n\t"
1940
            "pmaddwd                 %%mm6, %%mm4       \n\t"
1941
            "pmaddwd                 %%mm6, %%mm2       \n\t"
1942
#ifndef FAST_BGR2YV12
1943
            "psrad                      $8, %%mm4       \n\t"
1944
            "psrad                      $8, %%mm1       \n\t"
1945
            "psrad                      $8, %%mm2       \n\t"
1946
            "psrad                      $8, %%mm3       \n\t"
1947
#endif
1948
            "packssdw                %%mm2, %%mm4       \n\t"
1949
            "packssdw                %%mm3, %%mm1       \n\t"
1950
            "pmaddwd                 %%mm5, %%mm4       \n\t"
1951
            "pmaddwd                 %%mm5, %%mm1       \n\t"
1952
            "add                       $24, %%"REG_d"   \n\t"
1953
            "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
1954
            "psraw                      $7, %%mm4       \n\t"
1955

    
1956
            "movq                    %%mm0, %%mm1           \n\t"
1957
            "punpckldq               %%mm4, %%mm0           \n\t"
1958
            "punpckhdq               %%mm4, %%mm1           \n\t"
1959
            "packsswb                %%mm1, %%mm0           \n\t"
1960
            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0         \n\t"
1961
            "movd                    %%mm0, (%2, %%"REG_a") \n\t"
1962
            "punpckhdq               %%mm0, %%mm0           \n\t"
1963
            "movd                    %%mm0, (%3, %%"REG_a") \n\t"
1964
            "add                        $4, %%"REG_a"       \n\t"
1965
            " js                        1b                  \n\t"
1966
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
1967
            : "%"REG_a, "%"REG_d
1968
        );
1969

    
1970
        udst += chromStride;
1971
        vdst += chromStride;
1972
        src  += srcStride*2;
1973
    }
1974

    
1975
    __asm__ volatile(EMMS"       \n\t"
1976
                     SFENCE"     \n\t"
1977
                     :::"memory");
1978

    
1979
    for (; y<height; y+=2) {
1980
        long i;
1981
        for (i=0; i<chromWidth; i++) {
1982
            unsigned int b = src[6*i+0];
1983
            unsigned int g = src[6*i+1];
1984
            unsigned int r = src[6*i+2];
1985

    
1986
            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1987
            unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
1988
            unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
1989

    
1990
            udst[i]     = U;
1991
            vdst[i]     = V;
1992
            ydst[2*i]   = Y;
1993

    
1994
            b = src[6*i+3];
1995
            g = src[6*i+4];
1996
            r = src[6*i+5];
1997

    
1998
            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1999
            ydst[2*i+1]     = Y;
2000
        }
2001
        ydst += lumStride;
2002
        src  += srcStride;
2003

    
2004
        for (i=0; i<chromWidth; i++) {
2005
            unsigned int b = src[6*i+0];
2006
            unsigned int g = src[6*i+1];
2007
            unsigned int r = src[6*i+2];
2008

    
2009
            unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2010

    
2011
            ydst[2*i]     = Y;
2012

    
2013
            b = src[6*i+3];
2014
            g = src[6*i+4];
2015
            r = src[6*i+5];
2016

    
2017
            Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2018
            ydst[2*i+1]     = Y;
2019
        }
2020
        udst += chromStride;
2021
        vdst += chromStride;
2022
        ydst += lumStride;
2023
        src  += srcStride;
2024
    }
2025
}
2026

    
2027
static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
2028
                                    long width, long height, long src1Stride,
2029
                                    long src2Stride, long dstStride)
2030
{
2031
    long h;
2032

    
2033
    for (h=0; h < height; h++) {
2034
        long w;
2035

    
2036
#if COMPILE_TEMPLATE_SSE2
2037
        __asm__(
2038
            "xor              %%"REG_a", %%"REG_a"  \n\t"
2039
            "1:                                     \n\t"
2040
            PREFETCH" 64(%1, %%"REG_a")             \n\t"
2041
            PREFETCH" 64(%2, %%"REG_a")             \n\t"
2042
            "movdqa     (%1, %%"REG_a"), %%xmm0     \n\t"
2043
            "movdqa     (%1, %%"REG_a"), %%xmm1     \n\t"
2044
            "movdqa     (%2, %%"REG_a"), %%xmm2     \n\t"
2045
            "punpcklbw           %%xmm2, %%xmm0     \n\t"
2046
            "punpckhbw           %%xmm2, %%xmm1     \n\t"
2047
            "movntdq             %%xmm0,   (%0, %%"REG_a", 2)   \n\t"
2048
            "movntdq             %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
2049
            "add                    $16, %%"REG_a"  \n\t"
2050
            "cmp                     %3, %%"REG_a"  \n\t"
2051
            " jb                     1b             \n\t"
2052
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2053
            : "memory", "%"REG_a""
2054
        );
2055
#else
2056
        __asm__(
2057
            "xor %%"REG_a", %%"REG_a"               \n\t"
2058
            "1:                                     \n\t"
2059
            PREFETCH" 64(%1, %%"REG_a")             \n\t"
2060
            PREFETCH" 64(%2, %%"REG_a")             \n\t"
2061
            "movq       (%1, %%"REG_a"), %%mm0      \n\t"
2062
            "movq      8(%1, %%"REG_a"), %%mm2      \n\t"
2063
            "movq                 %%mm0, %%mm1      \n\t"
2064
            "movq                 %%mm2, %%mm3      \n\t"
2065
            "movq       (%2, %%"REG_a"), %%mm4      \n\t"
2066
            "movq      8(%2, %%"REG_a"), %%mm5      \n\t"
2067
            "punpcklbw            %%mm4, %%mm0      \n\t"
2068
            "punpckhbw            %%mm4, %%mm1      \n\t"
2069
            "punpcklbw            %%mm5, %%mm2      \n\t"
2070
            "punpckhbw            %%mm5, %%mm3      \n\t"
2071
            MOVNTQ"               %%mm0,   (%0, %%"REG_a", 2)   \n\t"
2072
            MOVNTQ"               %%mm1,  8(%0, %%"REG_a", 2)   \n\t"
2073
            MOVNTQ"               %%mm2, 16(%0, %%"REG_a", 2)   \n\t"
2074
            MOVNTQ"               %%mm3, 24(%0, %%"REG_a", 2)   \n\t"
2075
            "add                    $16, %%"REG_a"  \n\t"
2076
            "cmp                     %3, %%"REG_a"  \n\t"
2077
            " jb                     1b             \n\t"
2078
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2079
            : "memory", "%"REG_a
2080
        );
2081
#endif
2082
        for (w= (width&(~15)); w < width; w++) {
2083
            dest[2*w+0] = src1[w];
2084
            dest[2*w+1] = src2[w];
2085
        }
2086
        dest += dstStride;
2087
        src1 += src1Stride;
2088
        src2 += src2Stride;
2089
    }
2090
    __asm__(
2091
            EMMS"       \n\t"
2092
            SFENCE"     \n\t"
2093
            ::: "memory"
2094
            );
2095
}
2096

    
2097
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2098
                                       uint8_t *dst1, uint8_t *dst2,
2099
                                       long width, long height,
2100
                                       long srcStride1, long srcStride2,
2101
                                       long dstStride1, long dstStride2)
2102
{
2103
    x86_reg y;
2104
    long x,w,h;
2105
    w=width/2; h=height/2;
2106
    __asm__ volatile(
2107
        PREFETCH" %0    \n\t"
2108
        PREFETCH" %1    \n\t"
2109
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2110
    for (y=0;y<h;y++) {
2111
        const uint8_t* s1=src1+srcStride1*(y>>1);
2112
        uint8_t* d=dst1+dstStride1*y;
2113
        x=0;
2114
        for (;x<w-31;x+=32) {
2115
            __asm__ volatile(
2116
                PREFETCH"   32%1        \n\t"
2117
                "movq         %1, %%mm0 \n\t"
2118
                "movq        8%1, %%mm2 \n\t"
2119
                "movq       16%1, %%mm4 \n\t"
2120
                "movq       24%1, %%mm6 \n\t"
2121
                "movq      %%mm0, %%mm1 \n\t"
2122
                "movq      %%mm2, %%mm3 \n\t"
2123
                "movq      %%mm4, %%mm5 \n\t"
2124
                "movq      %%mm6, %%mm7 \n\t"
2125
                "punpcklbw %%mm0, %%mm0 \n\t"
2126
                "punpckhbw %%mm1, %%mm1 \n\t"
2127
                "punpcklbw %%mm2, %%mm2 \n\t"
2128
                "punpckhbw %%mm3, %%mm3 \n\t"
2129
                "punpcklbw %%mm4, %%mm4 \n\t"
2130
                "punpckhbw %%mm5, %%mm5 \n\t"
2131
                "punpcklbw %%mm6, %%mm6 \n\t"
2132
                "punpckhbw %%mm7, %%mm7 \n\t"
2133
                MOVNTQ"    %%mm0,   %0  \n\t"
2134
                MOVNTQ"    %%mm1,  8%0  \n\t"
2135
                MOVNTQ"    %%mm2, 16%0  \n\t"
2136
                MOVNTQ"    %%mm3, 24%0  \n\t"
2137
                MOVNTQ"    %%mm4, 32%0  \n\t"
2138
                MOVNTQ"    %%mm5, 40%0  \n\t"
2139
                MOVNTQ"    %%mm6, 48%0  \n\t"
2140
                MOVNTQ"    %%mm7, 56%0"
2141
                :"=m"(d[2*x])
2142
                :"m"(s1[x])
2143
                :"memory");
2144
        }
2145
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2146
    }
2147
    for (y=0;y<h;y++) {
2148
        const uint8_t* s2=src2+srcStride2*(y>>1);
2149
        uint8_t* d=dst2+dstStride2*y;
2150
        x=0;
2151
        for (;x<w-31;x+=32) {
2152
            __asm__ volatile(
2153
                PREFETCH"   32%1        \n\t"
2154
                "movq         %1, %%mm0 \n\t"
2155
                "movq        8%1, %%mm2 \n\t"
2156
                "movq       16%1, %%mm4 \n\t"
2157
                "movq       24%1, %%mm6 \n\t"
2158
                "movq      %%mm0, %%mm1 \n\t"
2159
                "movq      %%mm2, %%mm3 \n\t"
2160
                "movq      %%mm4, %%mm5 \n\t"
2161
                "movq      %%mm6, %%mm7 \n\t"
2162
                "punpcklbw %%mm0, %%mm0 \n\t"
2163
                "punpckhbw %%mm1, %%mm1 \n\t"
2164
                "punpcklbw %%mm2, %%mm2 \n\t"
2165
                "punpckhbw %%mm3, %%mm3 \n\t"
2166
                "punpcklbw %%mm4, %%mm4 \n\t"
2167
                "punpckhbw %%mm5, %%mm5 \n\t"
2168
                "punpcklbw %%mm6, %%mm6 \n\t"
2169
                "punpckhbw %%mm7, %%mm7 \n\t"
2170
                MOVNTQ"    %%mm0,   %0  \n\t"
2171
                MOVNTQ"    %%mm1,  8%0  \n\t"
2172
                MOVNTQ"    %%mm2, 16%0  \n\t"
2173
                MOVNTQ"    %%mm3, 24%0  \n\t"
2174
                MOVNTQ"    %%mm4, 32%0  \n\t"
2175
                MOVNTQ"    %%mm5, 40%0  \n\t"
2176
                MOVNTQ"    %%mm6, 48%0  \n\t"
2177
                MOVNTQ"    %%mm7, 56%0"
2178
                :"=m"(d[2*x])
2179
                :"m"(s2[x])
2180
                :"memory");
2181
        }
2182
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2183
    }
2184
    __asm__(
2185
            EMMS"       \n\t"
2186
            SFENCE"     \n\t"
2187
            ::: "memory"
2188
        );
2189
}
2190

    
2191
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2192
                                        uint8_t *dst,
2193
                                        long width, long height,
2194
                                        long srcStride1, long srcStride2,
2195
                                        long srcStride3, long dstStride)
2196
{
2197
    x86_reg x;
2198
    long y,w,h;
2199
    w=width/2; h=height;
2200
    for (y=0;y<h;y++) {
2201
        const uint8_t* yp=src1+srcStride1*y;
2202
        const uint8_t* up=src2+srcStride2*(y>>2);
2203
        const uint8_t* vp=src3+srcStride3*(y>>2);
2204
        uint8_t* d=dst+dstStride*y;
2205
        x=0;
2206
        for (;x<w-7;x+=8) {
2207
            __asm__ volatile(
2208
                PREFETCH"   32(%1, %0)          \n\t"
2209
                PREFETCH"   32(%2, %0)          \n\t"
2210
                PREFETCH"   32(%3, %0)          \n\t"
2211
                "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2212
                "movq         (%2, %0), %%mm1   \n\t" /* U0U1U2U3U4U5U6U7 */
2213
                "movq         (%3, %0), %%mm2   \n\t" /* V0V1V2V3V4V5V6V7 */
2214
                "movq            %%mm0, %%mm3   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2215
                "movq            %%mm1, %%mm4   \n\t" /* U0U1U2U3U4U5U6U7 */
2216
                "movq            %%mm2, %%mm5   \n\t" /* V0V1V2V3V4V5V6V7 */
2217
                "punpcklbw       %%mm1, %%mm1   \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2218
                "punpcklbw       %%mm2, %%mm2   \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2219
                "punpckhbw       %%mm4, %%mm4   \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2220
                "punpckhbw       %%mm5, %%mm5   \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2221

    
2222
                "movq            %%mm1, %%mm6   \n\t"
2223
                "punpcklbw       %%mm2, %%mm1   \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2224
                "punpcklbw       %%mm1, %%mm0   \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2225
                "punpckhbw       %%mm1, %%mm3   \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2226
                MOVNTQ"          %%mm0,  (%4, %0, 8)    \n\t"
2227
                MOVNTQ"          %%mm3, 8(%4, %0, 8)    \n\t"
2228

    
2229
                "punpckhbw       %%mm2, %%mm6   \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2230
                "movq     8(%1, %0, 4), %%mm0   \n\t"
2231
                "movq            %%mm0, %%mm3   \n\t"
2232
                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2233
                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2234
                MOVNTQ"          %%mm0, 16(%4, %0, 8)   \n\t"
2235
                MOVNTQ"          %%mm3, 24(%4, %0, 8)   \n\t"
2236

    
2237
                "movq            %%mm4, %%mm6   \n\t"
2238
                "movq    16(%1, %0, 4), %%mm0   \n\t"
2239
                "movq            %%mm0, %%mm3   \n\t"
2240
                "punpcklbw       %%mm5, %%mm4   \n\t"
2241
                "punpcklbw       %%mm4, %%mm0   \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2242
                "punpckhbw       %%mm4, %%mm3   \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2243
                MOVNTQ"          %%mm0, 32(%4, %0, 8)   \n\t"
2244
                MOVNTQ"          %%mm3, 40(%4, %0, 8)   \n\t"
2245

    
2246
                "punpckhbw       %%mm5, %%mm6   \n\t"
2247
                "movq    24(%1, %0, 4), %%mm0   \n\t"
2248
                "movq            %%mm0, %%mm3   \n\t"
2249
                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2250
                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2251
                MOVNTQ"          %%mm0, 48(%4, %0, 8)   \n\t"
2252
                MOVNTQ"          %%mm3, 56(%4, %0, 8)   \n\t"
2253

    
2254
                : "+r" (x)
2255
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
2256
                :"memory");
2257
        }
2258
        for (; x<w; x++) {
2259
            const long x2 = x<<2;
2260
            d[8*x+0] = yp[x2];
2261
            d[8*x+1] = up[x];
2262
            d[8*x+2] = yp[x2+1];
2263
            d[8*x+3] = vp[x];
2264
            d[8*x+4] = yp[x2+2];
2265
            d[8*x+5] = up[x];
2266
            d[8*x+6] = yp[x2+3];
2267
            d[8*x+7] = vp[x];
2268
        }
2269
    }
2270
    __asm__(
2271
            EMMS"       \n\t"
2272
            SFENCE"     \n\t"
2273
            ::: "memory"
2274
        );
2275
}
2276

    
2277
/**
 * Copy every second byte of src into dst: dst[i] = src[2*i] for i in [0, count).
 *
 * Uses MMX registers without executing EMMS; the callers in this file issue
 * EMMS once after all rows have been processed.
 *
 * @param src   input, 2*count bytes are addressed
 * @param dst   output, count bytes are written
 * @param count number of bytes to produce
 */
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    /* Point both buffers at their ends and walk a negative index up to 0. */
    dst +=   count;
    src += 2*count;
    count= - count;

    if(count <= -16) {
        /* Bias by 15 so the sign flag ("js") ends the loop once fewer than 16
         * outputs remain; the displacements in the asm undo the bias. */
        count += 15;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq -30(%1, %0, 2), %%mm0        \n\t"
            "movq -22(%1, %0, 2), %%mm1        \n\t"
            "movq -14(%1, %0, 2), %%mm2        \n\t"
            "movq  -6(%1, %0, 2), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            MOVNTQ"        %%mm0,-15(%2, %0)   \n\t"
            MOVNTQ"        %%mm2,- 7(%2, %0)   \n\t"
            "add             $16, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
            /* The asm loads from src and stores to dst through pointers that
             * are passed as plain register inputs, so the compiler must be
             * told that memory is accessed. */
            : "memory"
        );
        count -= 15;
    }
    /* Scalar tail for the last (< 16) outputs. */
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}
2313

    
2314
/**
 * Split bytes 0 and 2 of every 4-byte group of src into two planes:
 * dst0[i] = src[4*i] and dst1[i] = src[4*i + 2] for i in [0, count).
 *
 * Uses MMX registers without executing EMMS; the callers in this file issue
 * EMMS once after all rows have been processed.
 *
 * @param src   input, 4*count bytes are addressed
 * @param dst0  receives byte 0 of each group
 * @param dst1  receives byte 2 of each group
 * @param count number of 4-byte groups
 */
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    /* Point all buffers at their ends and walk a negative index up to 0. */
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        /* Bias by 7 so "js" ends the loop once fewer than 8 groups remain. */
        count += 7;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
            /* Memory is read via src and written via dst0/dst1, none of which
             * are memory operands, so declare the access explicitly. */
            : "memory"
        );
        count -= 7;
    }
    /* Scalar tail for the last (< 8) groups. */
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2359

    
2360
/**
 * Average two rows and split bytes 0 and 2 of every 4-byte group:
 * dst0[i] = avg(src0[4*i],     src1[4*i])
 * dst1[i] = avg(src0[4*i + 2], src1[4*i + 2])   for i in [0, count).
 *
 * The SIMD path is only compiled when PAVGB is defined; otherwise every
 * element goes through the scalar loop, which computes (a + b) >> 1.
 * NOTE(review): the PAVGB instructions are believed to round up, so SIMD
 * and scalar results may differ by one for odd sums - confirm.
 *
 * Uses MMX registers without executing EMMS; the callers in this file issue
 * EMMS once after all rows have been processed.
 *
 * @param src0  first input row, 4*count bytes are addressed
 * @param src1  second input row, 4*count bytes are addressed
 * @param dst0  receives the averaged byte 0 of each group
 * @param dst1  receives the averaged byte 2 of each group
 * @param count number of 4-byte groups
 */
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    /* Point all buffers at their ends and walk a negative index up to 0. */
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        /* Bias by 7 so "js" ends the loop once fewer than 8 groups remain. */
        count += 7;
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7        \n\t"
            "psrlw             $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
            "pand           %%mm7, %%mm0        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm2        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm1, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm2        \n\t"
            "movq           %%mm0, %%mm1        \n\t"
            "movq           %%mm2, %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm2, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm1        \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
            "add               $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
            /* Memory is read via src0/src1 and written via dst0/dst1, none of
             * which are memory operands, so declare the access explicitly. */
            : "memory"
        );
        count -= 7;
    }
#endif
    /* Scalar tail (or the whole row when PAVGB is not available). */
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2412

    
2413
/**
 * Split bytes 1 and 3 of every 4-byte group of src into two planes:
 * dst0[i] = src[4*i + 1] and dst1[i] = src[4*i + 3] for i in [0, count).
 *
 * Uses MMX registers without executing EMMS; the callers in this file issue
 * EMMS once after all rows have been processed.
 *
 * @param src   input, 4*count bytes are addressed
 * @param dst0  receives byte 1 of each group
 * @param dst1  receives byte 3 of each group
 * @param count number of 4-byte groups
 */
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    /* Point all buffers at their ends and walk a negative index up to 0. */
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        /* Bias by 7 so "js" ends the loop once fewer than 8 groups remain. */
        count += 7;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"
            "psrlw            $8, %%mm1        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "psrlw            $8, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
            /* Memory is read via src and written via dst0/dst1, none of which
             * are memory operands, so declare the access explicitly. */
            : "memory"
        );
        count -= 7;
    }
    /* Shift by one byte so the scalar tail picks the odd bytes (1 and 3). */
    src++;
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
2459

    
2460
/**
 * Average two rows and split bytes 1 and 3 of every 4-byte group:
 * dst0[i] = avg(src0[4*i + 1], src1[4*i + 1])
 * dst1[i] = avg(src0[4*i + 3], src1[4*i + 3])   for i in [0, count).
 *
 * The SIMD path is only compiled when PAVGB is defined; otherwise every
 * element goes through the scalar loop, which computes (a + b) >> 1.
 * NOTE(review): the PAVGB instructions are believed to round up, so SIMD
 * and scalar results may differ by one for odd sums - confirm.
 *
 * Uses MMX registers without executing EMMS; the callers in this file issue
 * EMMS once after all rows have been processed.
 *
 * @param src0  first input row, 4*count bytes are addressed
 * @param src1  second input row, 4*count bytes are addressed
 * @param dst0  receives the averaged byte 1 of each group
 * @param dst1  receives the averaged byte 3 of each group
 * @param count number of 4-byte groups
 */
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    /* Point all buffers at their ends and walk a negative index up to 0. */
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        /* Bias by 7 so "js" ends the loop once fewer than 8 groups remain. */
        count += 7;
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7        \n\t"
            "psrlw             $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"
            "psrlw             $8, %%mm1        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "psrlw             $8, %%mm3        \n\t"
            "packuswb       %%mm1, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm2        \n\t"
            "movq           %%mm0, %%mm1        \n\t"
            "movq           %%mm2, %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm2, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm1        \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
            "add               $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
            /* Memory is read via src0/src1 and written via dst0/dst1, none of
             * which are memory operands, so declare the access explicitly. */
            : "memory"
        );
        count -= 7;
    }
#endif
    /* Shift by one byte so the scalar loop picks the odd bytes (1 and 3). */
    src0++;
    src1++;
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
2514

    
2515
static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2516
                                 long width, long height,
2517
                                 long lumStride, long chromStride, long srcStride)
2518
{
2519
    long y;
2520
    const long chromWidth= -((-width)>>1);
2521

    
2522
    for (y=0; y<height; y++) {
2523
        RENAME(extract_even)(src, ydst, width);
2524
        if(y&1) {
2525
            RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2526
            udst+= chromStride;
2527
            vdst+= chromStride;
2528
        }
2529

    
2530
        src += srcStride;
2531
        ydst+= lumStride;
2532
    }
2533
    __asm__(
2534
            EMMS"       \n\t"
2535
            SFENCE"     \n\t"
2536
            ::: "memory"
2537
        );
2538
}
2539

    
2540
static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2541
                                 long width, long height,
2542
                                 long lumStride, long chromStride, long srcStride)
2543
{
2544
    long y;
2545
    const long chromWidth= -((-width)>>1);
2546

    
2547
    for (y=0; y<height; y++) {
2548
        RENAME(extract_even)(src, ydst, width);
2549
        RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2550

    
2551
        src += srcStride;
2552
        ydst+= lumStride;
2553
        udst+= chromStride;
2554
        vdst+= chromStride;
2555
    }
2556
    __asm__(
2557
            EMMS"       \n\t"
2558
            SFENCE"     \n\t"
2559
            ::: "memory"
2560
        );
2561
}
2562

    
2563
static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2564
                                 long width, long height,
2565
                                 long lumStride, long chromStride, long srcStride)
2566
{
2567
    long y;
2568
    const long chromWidth= -((-width)>>1);
2569

    
2570
    for (y=0; y<height; y++) {
2571
        RENAME(extract_even)(src+1, ydst, width);
2572
        if(y&1) {
2573
            RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2574
            udst+= chromStride;
2575
            vdst+= chromStride;
2576
        }
2577

    
2578
        src += srcStride;
2579
        ydst+= lumStride;
2580
    }
2581
    __asm__(
2582
            EMMS"       \n\t"
2583
            SFENCE"     \n\t"
2584
            ::: "memory"
2585
        );
2586
}
2587

    
2588
static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2589
                                 long width, long height,
2590
                                 long lumStride, long chromStride, long srcStride)
2591
{
2592
    long y;
2593
    const long chromWidth= -((-width)>>1);
2594

    
2595
    for (y=0; y<height; y++) {
2596
        RENAME(extract_even)(src+1, ydst, width);
2597
        RENAME(extract_even2)(src, udst, vdst, chromWidth);
2598

    
2599
        src += srcStride;
2600
        ydst+= lumStride;
2601
        udst+= chromStride;
2602
        vdst+= chromStride;
2603
    }
2604
    __asm__(
2605
            EMMS"       \n\t"
2606
            SFENCE"     \n\t"
2607
            ::: "memory"
2608
        );
2609
}
2610

    
2611
static inline void RENAME(rgb2rgb_init)(void)
2612
{
2613
    rgb15to16          = RENAME(rgb15to16);
2614
    rgb15tobgr24       = RENAME(rgb15tobgr24);
2615
    rgb15to32          = RENAME(rgb15to32);
2616
    rgb16tobgr24       = RENAME(rgb16tobgr24);
2617
    rgb16to32          = RENAME(rgb16to32);
2618
    rgb16to15          = RENAME(rgb16to15);
2619
    rgb24tobgr16       = RENAME(rgb24tobgr16);
2620
    rgb24tobgr15       = RENAME(rgb24tobgr15);
2621
    rgb24tobgr32       = RENAME(rgb24tobgr32);
2622
    rgb32to16          = RENAME(rgb32to16);
2623
    rgb32to15          = RENAME(rgb32to15);
2624
    rgb32tobgr24       = RENAME(rgb32tobgr24);
2625
    rgb24to15          = RENAME(rgb24to15);
2626
    rgb24to16          = RENAME(rgb24to16);
2627
    rgb24tobgr24       = RENAME(rgb24tobgr24);
2628
    shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
2629
    rgb32tobgr16       = RENAME(rgb32tobgr16);
2630
    rgb32tobgr15       = RENAME(rgb32tobgr15);
2631
    yv12toyuy2         = RENAME(yv12toyuy2);
2632
    yv12touyvy         = RENAME(yv12touyvy);
2633
    yuv422ptoyuy2      = RENAME(yuv422ptoyuy2);
2634
    yuv422ptouyvy      = RENAME(yuv422ptouyvy);
2635
    yuy2toyv12         = RENAME(yuy2toyv12);
2636
    planar2x           = RENAME(planar2x);
2637
    rgb24toyv12        = RENAME(rgb24toyv12);
2638
    interleaveBytes    = RENAME(interleaveBytes);
2639
    vu9_to_vu12        = RENAME(vu9_to_vu12);
2640
    yvu9_to_yuy2       = RENAME(yvu9_to_yuy2);
2641

    
2642
    uyvytoyuv420       = RENAME(uyvytoyuv420);
2643
    uyvytoyuv422       = RENAME(uyvytoyuv422);
2644
    yuyvtoyuv420       = RENAME(yuyvtoyuv420);
2645
    yuyvtoyuv422       = RENAME(yuyvtoyuv422);
2646
}