Statistics
| Branch: | Revision:

ffmpeg / libswscale / rgb2rgb_template.c @ 935f50c8

History | View | Annotate | Download (69.1 KB)

1
/*
2
 *
3
 *  rgb2rgb.c, Software RGB to RGB convertor
4
 *  extended by Software PAL8 to RGB convertor
5
 *               Software YUV to YUV convertor
6
 *               Software YUV to RGB convertor
7
 *  Written by Nick Kurshev.
8
 *  palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
9
 *  lot of big-endian byteorder fixes by Alex Beregszaszi
10
 *
11
 * This file is part of FFmpeg.
12
 *
13
 * FFmpeg is free software; you can redistribute it and/or modify
14
 * it under the terms of the GNU General Public License as published by
15
 * the Free Software Foundation; either version 2 of the License, or
16
 * (at your option) any later version.
17
 *
18
 * FFmpeg is distributed in the hope that it will be useful,
19
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
 * GNU General Public License for more details.
22
 *
23
 * You should have received a copy of the GNU General Public License
24
 * along with FFmpeg; if not, write to the Free Software
25
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26
 * 
27
 * the C code (not assembly, mmx, ...) of this file can be used
28
 * under the LGPL license too
29
 */
30

    
31
#include <stddef.h>
32
#include <inttypes.h> /* for __WORDSIZE */
33

    
34
/* Fall back to the project-provided word size when the C library headers
   did not define __WORDSIZE. */
#ifndef __WORDSIZE
// #warning You have misconfigured system and probably will lose performance!
#define __WORDSIZE MP_WORDSIZE
#endif

/* This template may be included several times with different HAVE_* settings,
   so drop any definitions left over from a previous inclusion. */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

/* Width in bytes of the widest SIMD register assumed to be available. */
#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/* Prefetch and byte-average mnemonics for the selected instruction set.
   Without 3DNow!/MMX2 the prefetch macros expand to an assembler comment,
   which turns the whole prefetch line into a no-op. */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#ifdef __APPLE__
#define PREFETCH  "#"
#define PREFETCHW "#"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif
#endif

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Non-temporal store plus store fence when MMX2 is available; otherwise a
   plain movq and an assembler comment. */
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
85

    
86
/**
 * Expand packed 3-byte pixels to 4-byte pixels.
 *
 * Every 3 source bytes produce 4 destination bytes; the extra byte is
 * written as 0 (last byte of the group, or first byte with reversed channel
 * order when WORDS_BIGENDIAN is defined).
 *
 * @param src      source pixels
 * @param dst      destination buffer, 4 bytes per source pixel
 * @param src_size source length in bytes
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end;
#ifdef HAVE_MMX
  const uint8_t *mm_end;
#endif
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
  /* Each pass consumes 24 bytes, but the last load ("punpckldq 21%1")
     touches bytes 21..24, so 25 readable bytes are required per pass. */
  mm_end = end - 24;
  __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
  while(s < mm_end)
  {
    __asm __volatile(
        PREFETCH"        32%1\n\t"
        "movd        %1, %%mm0\n\t"
        "punpckldq 3%1, %%mm0\n\t"
        "movd        6%1, %%mm1\n\t"
        "punpckldq 9%1, %%mm1\n\t"
        "movd        12%1, %%mm2\n\t"
        "punpckldq 15%1, %%mm2\n\t"
        "movd        18%1, %%mm3\n\t"
        "punpckldq 21%1, %%mm3\n\t"
        "pand        %%mm7, %%mm0\n\t"
        "pand        %%mm7, %%mm1\n\t"
        "pand        %%mm7, %%mm2\n\t"
        "pand        %%mm7, %%mm3\n\t"
        MOVNTQ"        %%mm0, %0\n\t"
        MOVNTQ"        %%mm1, 8%0\n\t"
        MOVNTQ"        %%mm2, 16%0\n\t"
        MOVNTQ"        %%mm3, 24%0"
        :"=m"(*dest)
        :"m"(*s)
        :"memory");
    dest += 32;
    s += 24;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Portable path; also finishes whatever the MMX loop left over. */
  while(s < end)
  {
#ifdef WORDS_BIGENDIAN
    /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
    *dest++ = 0;
    *dest++ = s[2];
    *dest++ = s[1];
    *dest++ = s[0];
    s+=3;
#else
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = 0;
#endif
  }
}
145

    
146
/**
 * Pack 4-byte pixels down to 3-byte pixels.
 *
 * Every 4 source bytes produce 3 destination bytes. Without WORDS_BIGENDIAN
 * the first three bytes are copied and the fourth is skipped; with it the
 * first byte is skipped and the remaining three are written in reverse.
 *
 * @param src      source pixels
 * @param dst      destination buffer, 3 bytes per source pixel
 * @param src_size source length in bytes
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end;
#ifdef HAVE_MMX
  const uint8_t *mm_end;
#endif
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
  /* Each pass reads 32 source bytes and stores 24 destination bytes. */
  mm_end = end - 31;
  while(s < mm_end)
  {
    __asm __volatile(
        PREFETCH"        32%1\n\t"
        "movq        %1, %%mm0\n\t"
        "movq        8%1, %%mm1\n\t"
        "movq        16%1, %%mm4\n\t"
        "movq        24%1, %%mm5\n\t"
        "movq        %%mm0, %%mm2\n\t"
        "movq        %%mm1, %%mm3\n\t"
        "movq        %%mm4, %%mm6\n\t"
        "movq        %%mm5, %%mm7\n\t"
        "psrlq        $8, %%mm2\n\t"
        "psrlq        $8, %%mm3\n\t"
        "psrlq        $8, %%mm6\n\t"
        "psrlq        $8, %%mm7\n\t"
        "pand        %2, %%mm0\n\t"
        "pand        %2, %%mm1\n\t"
        "pand        %2, %%mm4\n\t"
        "pand        %2, %%mm5\n\t"
        "pand        %3, %%mm2\n\t"
        "pand        %3, %%mm3\n\t"
        "pand        %3, %%mm6\n\t"
        "pand        %3, %%mm7\n\t"
        "por        %%mm2, %%mm0\n\t"
        "por        %%mm3, %%mm1\n\t"
        "por        %%mm6, %%mm4\n\t"
        "por        %%mm7, %%mm5\n\t"

        "movq        %%mm1, %%mm2\n\t"
        "movq        %%mm4, %%mm3\n\t"
        "psllq        $48, %%mm2\n\t"
        "psllq        $32, %%mm3\n\t"
        "pand        %4, %%mm2\n\t"
        "pand        %5, %%mm3\n\t"
        "por        %%mm2, %%mm0\n\t"
        "psrlq        $16, %%mm1\n\t"
        "psrlq        $32, %%mm4\n\t"
        "psllq        $16, %%mm5\n\t"
        "por        %%mm3, %%mm1\n\t"
        "pand        %6, %%mm5\n\t"
        "por        %%mm5, %%mm4\n\t"

        MOVNTQ"        %%mm0, %0\n\t"
        MOVNTQ"        %%mm1, 8%0\n\t"
        MOVNTQ"        %%mm4, 16%0"
        :"=m"(*dest)
        :"m"(*s),"m"(mask24l),
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
    dest += 24;
    s += 32;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Portable path; also finishes whatever the MMX loop left over. */
  while(s < end)
  {
#ifdef WORDS_BIGENDIAN
    /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
    s++;
    dest[2] = *s++;
    dest[1] = *s++;
    dest[0] = *s++;
    dest += 3;
#else
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = *s++;
    s++;
#endif
  }
}
231

    
232
/*
233
 Original by Strepto/Astral
234
 ported to gcc & bugfixed : A'rpi
235
 MMX2, 3DNOW optimization by Nick Kurshev
236
 32bit c version, and and&add trick by Michael Niedermayer
237
*/
238
/**
 * Convert 15-bit pixels (stored in 16-bit words) to 16-bit pixels.
 *
 * For every 16-bit word x the result is (x & 0x7FFF) + (x & 0x7FE0): the low
 * five bits stay in place and bits 5..14 are doubled, i.e. moved up by one.
 *
 * @param src      source pixels
 * @param dst      destination buffer, same size as the source
 * @param src_size source length in bytes
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
{
  register const uint8_t* s=src;
  register uint8_t* d=dst;
  register const uint8_t *end;
  const uint8_t *mm_end;
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
  __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
  mm_end = end - 15;
  while(s<mm_end)
  {
        /* The asm reads and writes 16 bytes while only *s / *d are named as
           operands, hence the "memory" clobber. */
        __asm __volatile(
                PREFETCH"        32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        8%1, %%mm2\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm2, %%mm3\n\t"
                "pand        %%mm4, %%mm0\n\t"
                "pand        %%mm4, %%mm2\n\t"
                "paddw        %%mm1, %%mm0\n\t"
                "paddw        %%mm3, %%mm2\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm2, 8%0"
                :"=m"(*d)
                :"m"(*s)
                :"memory"
                );
        d+=16;
        s+=16;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
    /* Two pixels at a time through a 32-bit word. */
    mm_end = end - 3;
    while(s < mm_end)
    {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* At most one pixel is left. */
    if(s < end)
    {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
286

    
287
/**
 * Convert 16-bit pixels to 15-bit pixels (stored in 16-bit words).
 *
 * For every 16-bit word x the result is ((x >> 1) & 0x7FE0) | (x & 0x001F):
 * the low five bits stay in place and the upper bits move down by one, which
 * drops bit 5 of the source.
 *
 * @param src      source pixels
 * @param dst      destination buffer, same size as the source
 * @param src_size source length in bytes
 */
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
{
  register const uint8_t* s=src;
  register uint8_t* d=dst;
  register const uint8_t *end;
  const uint8_t *mm_end;
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
  __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
  __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
  mm_end = end - 15;
  while(s<mm_end)
  {
        /* The asm reads and writes 16 bytes while only *s / *d are named as
           operands, hence the "memory" clobber. */
        __asm __volatile(
                PREFETCH"        32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        8%1, %%mm2\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm2, %%mm3\n\t"
                "psrlq        $1, %%mm0\n\t"
                "psrlq        $1, %%mm2\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm3\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm3, %%mm2\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm2, 8%0"
                :"=m"(*d)
                :"m"(*s)
                :"memory"
                );
        d+=16;
        s+=16;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
    /* Two pixels at a time through a 32-bit word. */
    mm_end = end - 3;
    while(s < mm_end)
    {
        register uint32_t x= *((const uint32_t *)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* At most one pixel is left. */
    if(s < end)
    {
        register uint16_t x= *((const uint16_t *)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
        s+=2;
        d+=2;
    }
}
342

    
343
/**
 * Convert 32-bit pixels to 16-bit (5-6-5) pixels.
 *
 * In the portable path each pixel is read as one 32-bit word: bits 3..7 go
 * to output bits 0..4, bits 10..15 to bits 5..10 and bits 19..23 to bits
 * 11..15. The top byte of the source word is ignored.
 *
 * @param src      source pixels, 4 bytes each
 * @param dst      destination buffer, 2 bytes per source pixel
 * @param src_size source length in bytes
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
        /* The asm loop checks its exit condition only after a full pass, so
           it must not be entered unless at least 16 source bytes exist. The
           buffers are reached through register pointers, hence "memory". */
        if (s < mm_end)
        {
        asm volatile(
                "movq %3, %%mm5                        \n\t"
                "movq %4, %%mm6                        \n\t"
                "movq %5, %%mm7                        \n\t"
                ASMALIGN(4)
                "1:                                \n\t"
                PREFETCH" 32(%1)                \n\t"
                "movd        (%1), %%mm0                \n\t"
                "movd        4(%1), %%mm3                \n\t"
                "punpckldq 8(%1), %%mm0                \n\t"
                "punpckldq 12(%1), %%mm3        \n\t"
                "movq %%mm0, %%mm1                \n\t"
                "movq %%mm3, %%mm4                \n\t"
                "pand %%mm6, %%mm0                \n\t"
                "pand %%mm6, %%mm3                \n\t"
                "pmaddwd %%mm7, %%mm0                \n\t"
                "pmaddwd %%mm7, %%mm3                \n\t"
                "pand %%mm5, %%mm1                \n\t"
                "pand %%mm5, %%mm4                \n\t"
                "por %%mm1, %%mm0                \n\t"
                "por %%mm4, %%mm3                \n\t"
                "psrld $5, %%mm0                \n\t"
                "pslld $11, %%mm3                \n\t"
                "por %%mm3, %%mm0                \n\t"
                MOVNTQ"        %%mm0, (%0)                \n\t"
                "add $16, %1                        \n\t"
                "add $8, %0                        \n\t"
                "cmp %2, %1                        \n\t"
                " jb 1b                                \n\t"
                : "+r" (d), "+r"(s)
                : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
                : "memory"
        );
        }
#else
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $5, %%mm1\n\t"
                "psrlq        $5, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 16;
        }
#endif
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Portable path; also finishes whatever the MMX loop left over. */
        while(s < end)
        {
                register int rgb = *(const uint32_t*)s; s += 4;
                *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
        }
}
438

    
439
/**
 * Convert 32-bit pixels to 16-bit (5-6-5) pixels with the outer channels
 * swapped.
 *
 * In the portable path each pixel is read as one 32-bit word: bits 3..7 go
 * to output bits 11..15, bits 10..15 to bits 5..10 and bits 19..23 to bits
 * 0..4. The top byte of the source word is ignored.
 *
 * @param src      source pixels, 4 bytes each
 * @param dst      destination buffer, 2 bytes per source pixel
 * @param src_size source length in bytes
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        /* Each pass converts 16 source bytes into 8 destination bytes. */
        mm_end = end - 15;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psllq        $8, %%mm0\n\t"
                "psllq        $8, %%mm3\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm3\n\t"
                "psrlq        $5, %%mm1\n\t"
                "psrlq        $5, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $19, %%mm2\n\t"
                "psrlq        $19, %%mm5\n\t"
                "pand        %2, %%mm2\n\t"
                "pand        %2, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 16;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Portable path; also finishes whatever the MMX loop left over. */
        while(s < end)
        {
                register int rgb = *(const uint32_t*)s; s += 4;
                *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
        }
}
499

    
500
/**
 * Convert 32-bit pixels to 15-bit (5-5-5) pixels stored in 16-bit words.
 *
 * In the portable path each pixel is read as one 32-bit word: bits 3..7 go
 * to output bits 0..4, bits 11..15 to bits 5..9 and bits 19..23 to bits
 * 10..14. The top byte of the source word is ignored.
 *
 * @param src      source pixels, 4 bytes each
 * @param dst      destination buffer, 2 bytes per source pixel
 * @param src_size source length in bytes
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
        /* The asm loop checks its exit condition only after a full pass, so
           it must not be entered unless at least 16 source bytes exist. The
           buffers are reached through register pointers, hence "memory". */
        if (s < mm_end)
        {
        asm volatile(
                "movq %3, %%mm5                        \n\t"
                "movq %4, %%mm6                        \n\t"
                "movq %5, %%mm7                        \n\t"
                ASMALIGN(4)
                "1:                                \n\t"
                PREFETCH" 32(%1)                \n\t"
                "movd        (%1), %%mm0                \n\t"
                "movd        4(%1), %%mm3                \n\t"
                "punpckldq 8(%1), %%mm0                \n\t"
                "punpckldq 12(%1), %%mm3        \n\t"
                "movq %%mm0, %%mm1                \n\t"
                "movq %%mm3, %%mm4                \n\t"
                "pand %%mm6, %%mm0                \n\t"
                "pand %%mm6, %%mm3                \n\t"
                "pmaddwd %%mm7, %%mm0                \n\t"
                "pmaddwd %%mm7, %%mm3                \n\t"
                "pand %%mm5, %%mm1                \n\t"
                "pand %%mm5, %%mm4                \n\t"
                "por %%mm1, %%mm0                \n\t"
                "por %%mm4, %%mm3                \n\t"
                "psrld $6, %%mm0                \n\t"
                "pslld $10, %%mm3                \n\t"
                "por %%mm3, %%mm0                \n\t"
                MOVNTQ"        %%mm0, (%0)                \n\t"
                "add $16, %1                        \n\t"
                "add $8, %0                        \n\t"
                "cmp %2, %1                        \n\t"
                " jb 1b                                \n\t"
                : "+r" (d), "+r"(s)
                : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
                : "memory"
        );
        }
#else
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $9, %%mm2\n\t"
                "psrlq        $9, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 16;
        }
#endif
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Portable path; also finishes whatever the MMX loop left over. */
        while(s < end)
        {
                register int rgb = *(const uint32_t*)s; s += 4;
                *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
        }
}
595

    
596
/**
 * Convert 32-bit pixels to 15-bit (5-5-5) pixels with the outer channels
 * swapped, stored in 16-bit words.
 *
 * In the portable path each pixel is read as one 32-bit word: bits 3..7 go
 * to output bits 10..14, bits 11..15 to bits 5..9 and bits 19..23 to bits
 * 0..4. The top byte of the source word is ignored.
 *
 * @param src      source pixels, 4 bytes each
 * @param dst      destination buffer, 2 bytes per source pixel
 * @param src_size source length in bytes
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /* Each pass converts 16 source bytes into 8 destination bytes. */
        mm_end = end - 15;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psllq        $7, %%mm0\n\t"
                "psllq        $7, %%mm3\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $19, %%mm2\n\t"
                "psrlq        $19, %%mm5\n\t"
                "pand        %2, %%mm2\n\t"
                "pand        %2, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 16;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Portable path; also finishes whatever the MMX loop left over. */
        while(s < end)
        {
                register int rgb = *(const uint32_t*)s; s += 4;
                *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
        }
}
656

    
657
/**
 * Convert packed 3-byte pixels to 16-bit (5-6-5) pixels.
 *
 * In the portable path the first byte of each pixel supplies output bits
 * 0..4, the second byte bits 5..10 and the third byte bits 11..15.
 *
 * @param src      source pixels, 3 bytes each
 * @param dst      destination buffer, 2 bytes per source pixel
 * @param src_size source length in bytes
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        /* Each pass consumes 12 bytes, but the last load ("punpckldq 9%1")
           touches bytes 9..12, so 13 readable bytes are required per pass. */
        mm_end = end - 12;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $5, %%mm1\n\t"
                "psrlq        $5, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Portable path; also finishes whatever the MMX loop left over. */
        while(s < end)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
}
719

    
720
/**
 * Convert packed 3-byte pixels to 16-bit (5-6-5) pixels with the outer
 * channels swapped.
 *
 * In the portable path the first byte of each pixel supplies output bits
 * 11..15, the second byte bits 5..10 and the third byte bits 0..4.
 *
 * @param src      source pixels, 3 bytes each
 * @param dst      destination buffer, 2 bytes per source pixel
 * @param src_size source length in bytes
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        /* Each pass consumes 12 bytes and reads up to byte 12; requiring 16
           available bytes keeps every load inside the buffer. */
        mm_end = end - 15;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psllq        $8, %%mm0\n\t"
                "psllq        $8, %%mm3\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm3\n\t"
                "psrlq        $5, %%mm1\n\t"
                "psrlq        $5, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $19, %%mm2\n\t"
                "psrlq        $19, %%mm5\n\t"
                "pand        %2, %%mm2\n\t"
                "pand        %2, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Portable path; also finishes whatever the MMX loop left over. */
        while(s < end)
        {
                const int r= *s++;
                const int g= *s++;
                const int b= *s++;
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
}
782

    
783
/**
 * Convert packed 3-byte pixels to 15-bit (5-5-5) pixels stored in 16-bit
 * words.
 *
 * In the portable path the first byte of each pixel supplies output bits
 * 0..4, the second byte bits 5..9 and the third byte bits 10..14.
 *
 * @param src      source pixels, 3 bytes each
 * @param dst      destination buffer, 2 bytes per source pixel
 * @param src_size source length in bytes
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /* Each pass consumes 12 bytes, but the last load ("punpckldq 9%1")
           touches bytes 9..12, so 13 readable bytes are required per pass. */
        mm_end = end - 12;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $9, %%mm2\n\t"
                "psrlq        $9, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Portable path; also finishes whatever the MMX loop left over. */
        while(s < end)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
}
845

    
846
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
847
{
848
        const uint8_t *s = src;
849
        const uint8_t *end;
850
#ifdef HAVE_MMX
851
        const uint8_t *mm_end;
852
#endif
853
        uint16_t *d = (uint16_t *)dst;
854
        end = s + src_size;
855
#ifdef HAVE_MMX
856
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
857
        __asm __volatile(
858
            "movq        %0, %%mm7\n\t"
859
            "movq        %1, %%mm6\n\t"
860
            ::"m"(red_15mask),"m"(green_15mask));
861
        mm_end = end - 15;
862
        while(s < mm_end)
863
        {
864
            __asm __volatile(
865
                PREFETCH" 32%1\n\t"
866
                "movd        %1, %%mm0\n\t"
867
                "movd        3%1, %%mm3\n\t"
868
                "punpckldq 6%1, %%mm0\n\t"
869
                "punpckldq 9%1, %%mm3\n\t"
870
                "movq        %%mm0, %%mm1\n\t"
871
                "movq        %%mm0, %%mm2\n\t"
872
                "movq        %%mm3, %%mm4\n\t"
873
                "movq        %%mm3, %%mm5\n\t"
874
                "psllq        $7, %%mm0\n\t"
875
                "psllq        $7, %%mm3\n\t"
876
                "pand        %%mm7, %%mm0\n\t"
877
                "pand        %%mm7, %%mm3\n\t"
878
                "psrlq        $6, %%mm1\n\t"
879
                "psrlq        $6, %%mm4\n\t"
880
                "pand        %%mm6, %%mm1\n\t"
881
                "pand        %%mm6, %%mm4\n\t"
882
                "psrlq        $19, %%mm2\n\t"
883
                "psrlq        $19, %%mm5\n\t"
884
                "pand        %2, %%mm2\n\t"
885
                "pand        %2, %%mm5\n\t"
886
                "por        %%mm1, %%mm0\n\t"
887
                "por        %%mm4, %%mm3\n\t"
888
                "por        %%mm2, %%mm0\n\t"
889
                "por        %%mm5, %%mm3\n\t"
890
                "psllq        $16, %%mm3\n\t"
891
                "por        %%mm3, %%mm0\n\t"
892
                MOVNTQ"        %%mm0, %0\n\t"
893
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
894
                d += 4;
895
                s += 12;
896
        }
897
        __asm __volatile(SFENCE:::"memory");
898
        __asm __volatile(EMMS:::"memory");
899
#endif
900
        while(s < end)
901
        {
902
                const int r= *s++;
903
                const int g= *s++;
904
                const int b= *s++;
905
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
906
        }
907
}
908

    
909
/*
  The conversions below use a less accurate approximation: the input value is
  simply left-shifted and the low-order bits are filled with zeroes. This
  method improves PNG compression, but it cannot reproduce white exactly,
  since it does not generate an all-ones maximum value; the net effect is to
  darken the image slightly.

  The better method would be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      Leftmost Bits Repeated to Fill Open Bits
       |
   Original Bits
*/
932
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
933
{
934
        const uint16_t *end;
935
#ifdef HAVE_MMX
936
        const uint16_t *mm_end;
937
#endif
938
        uint8_t *d = (uint8_t *)dst;
939
        const uint16_t *s = (uint16_t *)src;
940
        end = s + src_size/2;
941
#ifdef HAVE_MMX
942
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
943
        mm_end = end - 7;
944
        while(s < mm_end)
945
        {
946
            __asm __volatile(
947
                PREFETCH" 32%1\n\t"
948
                "movq        %1, %%mm0\n\t"
949
                "movq        %1, %%mm1\n\t"
950
                "movq        %1, %%mm2\n\t"
951
                "pand        %2, %%mm0\n\t"
952
                "pand        %3, %%mm1\n\t"
953
                "pand        %4, %%mm2\n\t"
954
                "psllq        $3, %%mm0\n\t"
955
                "psrlq        $2, %%mm1\n\t"
956
                "psrlq        $7, %%mm2\n\t"
957
                "movq        %%mm0, %%mm3\n\t"
958
                "movq        %%mm1, %%mm4\n\t"
959
                "movq        %%mm2, %%mm5\n\t"
960
                "punpcklwd %5, %%mm0\n\t"
961
                "punpcklwd %5, %%mm1\n\t"
962
                "punpcklwd %5, %%mm2\n\t"
963
                "punpckhwd %5, %%mm3\n\t"
964
                "punpckhwd %5, %%mm4\n\t"
965
                "punpckhwd %5, %%mm5\n\t"
966
                "psllq        $8, %%mm1\n\t"
967
                "psllq        $16, %%mm2\n\t"
968
                "por        %%mm1, %%mm0\n\t"
969
                "por        %%mm2, %%mm0\n\t"
970
                "psllq        $8, %%mm4\n\t"
971
                "psllq        $16, %%mm5\n\t"
972
                "por        %%mm4, %%mm3\n\t"
973
                "por        %%mm5, %%mm3\n\t"
974

    
975
                "movq        %%mm0, %%mm6\n\t"
976
                "movq        %%mm3, %%mm7\n\t"
977
                
978
                "movq        8%1, %%mm0\n\t"
979
                "movq        8%1, %%mm1\n\t"
980
                "movq        8%1, %%mm2\n\t"
981
                "pand        %2, %%mm0\n\t"
982
                "pand        %3, %%mm1\n\t"
983
                "pand        %4, %%mm2\n\t"
984
                "psllq        $3, %%mm0\n\t"
985
                "psrlq        $2, %%mm1\n\t"
986
                "psrlq        $7, %%mm2\n\t"
987
                "movq        %%mm0, %%mm3\n\t"
988
                "movq        %%mm1, %%mm4\n\t"
989
                "movq        %%mm2, %%mm5\n\t"
990
                "punpcklwd %5, %%mm0\n\t"
991
                "punpcklwd %5, %%mm1\n\t"
992
                "punpcklwd %5, %%mm2\n\t"
993
                "punpckhwd %5, %%mm3\n\t"
994
                "punpckhwd %5, %%mm4\n\t"
995
                "punpckhwd %5, %%mm5\n\t"
996
                "psllq        $8, %%mm1\n\t"
997
                "psllq        $16, %%mm2\n\t"
998
                "por        %%mm1, %%mm0\n\t"
999
                "por        %%mm2, %%mm0\n\t"
1000
                "psllq        $8, %%mm4\n\t"
1001
                "psllq        $16, %%mm5\n\t"
1002
                "por        %%mm4, %%mm3\n\t"
1003
                "por        %%mm5, %%mm3\n\t"
1004

    
1005
                :"=m"(*d)
1006
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
1007
                :"memory");
1008
            /* Borrowed 32 to 24 */
1009
            __asm __volatile(
1010
                "movq        %%mm0, %%mm4\n\t"
1011
                "movq        %%mm3, %%mm5\n\t"
1012
                "movq        %%mm6, %%mm0\n\t"
1013
                "movq        %%mm7, %%mm1\n\t"
1014
                
1015
                "movq        %%mm4, %%mm6\n\t"
1016
                "movq        %%mm5, %%mm7\n\t"
1017
                "movq        %%mm0, %%mm2\n\t"
1018
                "movq        %%mm1, %%mm3\n\t"
1019

    
1020
                "psrlq        $8, %%mm2\n\t"
1021
                "psrlq        $8, %%mm3\n\t"
1022
                "psrlq        $8, %%mm6\n\t"
1023
                "psrlq        $8, %%mm7\n\t"
1024
                "pand        %2, %%mm0\n\t"
1025
                "pand        %2, %%mm1\n\t"
1026
                "pand        %2, %%mm4\n\t"
1027
                "pand        %2, %%mm5\n\t"
1028
                "pand        %3, %%mm2\n\t"
1029
                "pand        %3, %%mm3\n\t"
1030
                "pand        %3, %%mm6\n\t"
1031
                "pand        %3, %%mm7\n\t"
1032
                "por        %%mm2, %%mm0\n\t"
1033
                "por        %%mm3, %%mm1\n\t"
1034
                "por        %%mm6, %%mm4\n\t"
1035
                "por        %%mm7, %%mm5\n\t"
1036

    
1037
                "movq        %%mm1, %%mm2\n\t"
1038
                "movq        %%mm4, %%mm3\n\t"
1039
                "psllq        $48, %%mm2\n\t"
1040
                "psllq        $32, %%mm3\n\t"
1041
                "pand        %4, %%mm2\n\t"
1042
                "pand        %5, %%mm3\n\t"
1043
                "por        %%mm2, %%mm0\n\t"
1044
                "psrlq        $16, %%mm1\n\t"
1045
                "psrlq        $32, %%mm4\n\t"
1046
                "psllq        $16, %%mm5\n\t"
1047
                "por        %%mm3, %%mm1\n\t"
1048
                "pand        %6, %%mm5\n\t"
1049
                "por        %%mm5, %%mm4\n\t"
1050

    
1051
                MOVNTQ"        %%mm0, %0\n\t"
1052
                MOVNTQ"        %%mm1, 8%0\n\t"
1053
                MOVNTQ"        %%mm4, 16%0"
1054

    
1055
                :"=m"(*d)
1056
                :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1057
                :"memory");
1058
                d += 24;
1059
                s += 8;
1060
        }
1061
        __asm __volatile(SFENCE:::"memory");
1062
        __asm __volatile(EMMS:::"memory");
1063
#endif
1064
        while(s < end)
1065
        {
1066
                register uint16_t bgr;
1067
                bgr = *s++;
1068
                *d++ = (bgr&0x1F)<<3;
1069
                *d++ = (bgr&0x3E0)>>2;
1070
                *d++ = (bgr&0x7C00)>>7;
1071
        }
1072
}
1073

    
1074
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1075
{
1076
        const uint16_t *end;
1077
#ifdef HAVE_MMX
1078
        const uint16_t *mm_end;
1079
#endif
1080
        uint8_t *d = (uint8_t *)dst;
1081
        const uint16_t *s = (const uint16_t *)src;
1082
        end = s + src_size/2;
1083
#ifdef HAVE_MMX
1084
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
1085
        mm_end = end - 7;
1086
        while(s < mm_end)
1087
        {
1088
            __asm __volatile(
1089
                PREFETCH" 32%1\n\t"
1090
                "movq        %1, %%mm0\n\t"
1091
                "movq        %1, %%mm1\n\t"
1092
                "movq        %1, %%mm2\n\t"
1093
                "pand        %2, %%mm0\n\t"
1094
                "pand        %3, %%mm1\n\t"
1095
                "pand        %4, %%mm2\n\t"
1096
                "psllq        $3, %%mm0\n\t"
1097
                "psrlq        $3, %%mm1\n\t"
1098
                "psrlq        $8, %%mm2\n\t"
1099
                "movq        %%mm0, %%mm3\n\t"
1100
                "movq        %%mm1, %%mm4\n\t"
1101
                "movq        %%mm2, %%mm5\n\t"
1102
                "punpcklwd %5, %%mm0\n\t"
1103
                "punpcklwd %5, %%mm1\n\t"
1104
                "punpcklwd %5, %%mm2\n\t"
1105
                "punpckhwd %5, %%mm3\n\t"
1106
                "punpckhwd %5, %%mm4\n\t"
1107
                "punpckhwd %5, %%mm5\n\t"
1108
                "psllq        $8, %%mm1\n\t"
1109
                "psllq        $16, %%mm2\n\t"
1110
                "por        %%mm1, %%mm0\n\t"
1111
                "por        %%mm2, %%mm0\n\t"
1112
                "psllq        $8, %%mm4\n\t"
1113
                "psllq        $16, %%mm5\n\t"
1114
                "por        %%mm4, %%mm3\n\t"
1115
                "por        %%mm5, %%mm3\n\t"
1116
                
1117
                "movq        %%mm0, %%mm6\n\t"
1118
                "movq        %%mm3, %%mm7\n\t"
1119

    
1120
                "movq        8%1, %%mm0\n\t"
1121
                "movq        8%1, %%mm1\n\t"
1122
                "movq        8%1, %%mm2\n\t"
1123
                "pand        %2, %%mm0\n\t"
1124
                "pand        %3, %%mm1\n\t"
1125
                "pand        %4, %%mm2\n\t"
1126
                "psllq        $3, %%mm0\n\t"
1127
                "psrlq        $3, %%mm1\n\t"
1128
                "psrlq        $8, %%mm2\n\t"
1129
                "movq        %%mm0, %%mm3\n\t"
1130
                "movq        %%mm1, %%mm4\n\t"
1131
                "movq        %%mm2, %%mm5\n\t"
1132
                "punpcklwd %5, %%mm0\n\t"
1133
                "punpcklwd %5, %%mm1\n\t"
1134
                "punpcklwd %5, %%mm2\n\t"
1135
                "punpckhwd %5, %%mm3\n\t"
1136
                "punpckhwd %5, %%mm4\n\t"
1137
                "punpckhwd %5, %%mm5\n\t"
1138
                "psllq        $8, %%mm1\n\t"
1139
                "psllq        $16, %%mm2\n\t"
1140
                "por        %%mm1, %%mm0\n\t"
1141
                "por        %%mm2, %%mm0\n\t"
1142
                "psllq        $8, %%mm4\n\t"
1143
                "psllq        $16, %%mm5\n\t"
1144
                "por        %%mm4, %%mm3\n\t"
1145
                "por        %%mm5, %%mm3\n\t"
1146
                :"=m"(*d)
1147
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)                
1148
                :"memory");
1149
            /* Borrowed 32 to 24 */
1150
            __asm __volatile(
1151
                "movq        %%mm0, %%mm4\n\t"
1152
                "movq        %%mm3, %%mm5\n\t"
1153
                "movq        %%mm6, %%mm0\n\t"
1154
                "movq        %%mm7, %%mm1\n\t"
1155
                
1156
                "movq        %%mm4, %%mm6\n\t"
1157
                "movq        %%mm5, %%mm7\n\t"
1158
                "movq        %%mm0, %%mm2\n\t"
1159
                "movq        %%mm1, %%mm3\n\t"
1160

    
1161
                "psrlq        $8, %%mm2\n\t"
1162
                "psrlq        $8, %%mm3\n\t"
1163
                "psrlq        $8, %%mm6\n\t"
1164
                "psrlq        $8, %%mm7\n\t"
1165
                "pand        %2, %%mm0\n\t"
1166
                "pand        %2, %%mm1\n\t"
1167
                "pand        %2, %%mm4\n\t"
1168
                "pand        %2, %%mm5\n\t"
1169
                "pand        %3, %%mm2\n\t"
1170
                "pand        %3, %%mm3\n\t"
1171
                "pand        %3, %%mm6\n\t"
1172
                "pand        %3, %%mm7\n\t"
1173
                "por        %%mm2, %%mm0\n\t"
1174
                "por        %%mm3, %%mm1\n\t"
1175
                "por        %%mm6, %%mm4\n\t"
1176
                "por        %%mm7, %%mm5\n\t"
1177

    
1178
                "movq        %%mm1, %%mm2\n\t"
1179
                "movq        %%mm4, %%mm3\n\t"
1180
                "psllq        $48, %%mm2\n\t"
1181
                "psllq        $32, %%mm3\n\t"
1182
                "pand        %4, %%mm2\n\t"
1183
                "pand        %5, %%mm3\n\t"
1184
                "por        %%mm2, %%mm0\n\t"
1185
                "psrlq        $16, %%mm1\n\t"
1186
                "psrlq        $32, %%mm4\n\t"
1187
                "psllq        $16, %%mm5\n\t"
1188
                "por        %%mm3, %%mm1\n\t"
1189
                "pand        %6, %%mm5\n\t"
1190
                "por        %%mm5, %%mm4\n\t"
1191

    
1192
                MOVNTQ"        %%mm0, %0\n\t"
1193
                MOVNTQ"        %%mm1, 8%0\n\t"
1194
                MOVNTQ"        %%mm4, 16%0"
1195

    
1196
                :"=m"(*d)
1197
                :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1198
                :"memory");
1199
                d += 24;
1200
                s += 8;
1201
        }
1202
        __asm __volatile(SFENCE:::"memory");
1203
        __asm __volatile(EMMS:::"memory");
1204
#endif
1205
        while(s < end)
1206
        {
1207
                register uint16_t bgr;
1208
                bgr = *s++;
1209
                *d++ = (bgr&0x1F)<<3;
1210
                *d++ = (bgr&0x7E0)>>3;
1211
                *d++ = (bgr&0xF800)>>8;
1212
        }
1213
}
1214

    
1215
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1216
{
1217
        const uint16_t *end;
1218
#ifdef HAVE_MMX
1219
        const uint16_t *mm_end;
1220
#endif
1221
        uint8_t *d = (uint8_t *)dst;
1222
        const uint16_t *s = (const uint16_t *)src;
1223
        end = s + src_size/2;
1224
#ifdef HAVE_MMX
1225
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
1226
        __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
1227
        mm_end = end - 3;
1228
        while(s < mm_end)
1229
        {
1230
            __asm __volatile(
1231
                PREFETCH" 32%1\n\t"
1232
                "movq        %1, %%mm0\n\t"
1233
                "movq        %1, %%mm1\n\t"
1234
                "movq        %1, %%mm2\n\t"
1235
                "pand        %2, %%mm0\n\t"
1236
                "pand        %3, %%mm1\n\t"
1237
                "pand        %4, %%mm2\n\t"
1238
                "psllq        $3, %%mm0\n\t"
1239
                "psrlq        $2, %%mm1\n\t"
1240
                "psrlq        $7, %%mm2\n\t"
1241
                "movq        %%mm0, %%mm3\n\t"
1242
                "movq        %%mm1, %%mm4\n\t"
1243
                "movq        %%mm2, %%mm5\n\t"
1244
                "punpcklwd %%mm7, %%mm0\n\t"
1245
                "punpcklwd %%mm7, %%mm1\n\t"
1246
                "punpcklwd %%mm7, %%mm2\n\t"
1247
                "punpckhwd %%mm7, %%mm3\n\t"
1248
                "punpckhwd %%mm7, %%mm4\n\t"
1249
                "punpckhwd %%mm7, %%mm5\n\t"
1250
                "psllq        $8, %%mm1\n\t"
1251
                "psllq        $16, %%mm2\n\t"
1252
                "por        %%mm1, %%mm0\n\t"
1253
                "por        %%mm2, %%mm0\n\t"
1254
                "psllq        $8, %%mm4\n\t"
1255
                "psllq        $16, %%mm5\n\t"
1256
                "por        %%mm4, %%mm3\n\t"
1257
                "por        %%mm5, %%mm3\n\t"
1258
                MOVNTQ"        %%mm0, %0\n\t"
1259
                MOVNTQ"        %%mm3, 8%0\n\t"
1260
                :"=m"(*d)
1261
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1262
                :"memory");
1263
                d += 16;
1264
                s += 4;
1265
        }
1266
        __asm __volatile(SFENCE:::"memory");
1267
        __asm __volatile(EMMS:::"memory");
1268
#endif
1269
        while(s < end)
1270
        {
1271
#if 0 //slightly slower on athlon
1272
                int bgr= *s++;
1273
                *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1274
#else
1275
                register uint16_t bgr;
1276
                bgr = *s++;
1277
#ifdef WORDS_BIGENDIAN
1278
                *d++ = 0;
1279
                *d++ = (bgr&0x7C00)>>7;
1280
                *d++ = (bgr&0x3E0)>>2;
1281
                *d++ = (bgr&0x1F)<<3;
1282
#else
1283
                *d++ = (bgr&0x1F)<<3;
1284
                *d++ = (bgr&0x3E0)>>2;
1285
                *d++ = (bgr&0x7C00)>>7;
1286
                *d++ = 0;
1287
#endif
1288

    
1289
#endif
1290
        }
1291
}
1292

    
1293
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1294
{
1295
        const uint16_t *end;
1296
#ifdef HAVE_MMX
1297
        const uint16_t *mm_end;
1298
#endif
1299
        uint8_t *d = (uint8_t *)dst;
1300
        const uint16_t *s = (uint16_t *)src;
1301
        end = s + src_size/2;
1302
#ifdef HAVE_MMX
1303
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
1304
        __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
1305
        mm_end = end - 3;
1306
        while(s < mm_end)
1307
        {
1308
            __asm __volatile(
1309
                PREFETCH" 32%1\n\t"
1310
                "movq        %1, %%mm0\n\t"
1311
                "movq        %1, %%mm1\n\t"
1312
                "movq        %1, %%mm2\n\t"
1313
                "pand        %2, %%mm0\n\t"
1314
                "pand        %3, %%mm1\n\t"
1315
                "pand        %4, %%mm2\n\t"
1316
                "psllq        $3, %%mm0\n\t"
1317
                "psrlq        $3, %%mm1\n\t"
1318
                "psrlq        $8, %%mm2\n\t"
1319
                "movq        %%mm0, %%mm3\n\t"
1320
                "movq        %%mm1, %%mm4\n\t"
1321
                "movq        %%mm2, %%mm5\n\t"
1322
                "punpcklwd %%mm7, %%mm0\n\t"
1323
                "punpcklwd %%mm7, %%mm1\n\t"
1324
                "punpcklwd %%mm7, %%mm2\n\t"
1325
                "punpckhwd %%mm7, %%mm3\n\t"
1326
                "punpckhwd %%mm7, %%mm4\n\t"
1327
                "punpckhwd %%mm7, %%mm5\n\t"
1328
                "psllq        $8, %%mm1\n\t"
1329
                "psllq        $16, %%mm2\n\t"
1330
                "por        %%mm1, %%mm0\n\t"
1331
                "por        %%mm2, %%mm0\n\t"
1332
                "psllq        $8, %%mm4\n\t"
1333
                "psllq        $16, %%mm5\n\t"
1334
                "por        %%mm4, %%mm3\n\t"
1335
                "por        %%mm5, %%mm3\n\t"
1336
                MOVNTQ"        %%mm0, %0\n\t"
1337
                MOVNTQ"        %%mm3, 8%0\n\t"
1338
                :"=m"(*d)
1339
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1340
                :"memory");
1341
                d += 16;
1342
                s += 4;
1343
        }
1344
        __asm __volatile(SFENCE:::"memory");
1345
        __asm __volatile(EMMS:::"memory");
1346
#endif
1347
        while(s < end)
1348
        {
1349
                register uint16_t bgr;
1350
                bgr = *s++;
1351
#ifdef WORDS_BIGENDIAN
1352
                *d++ = 0;
1353
                *d++ = (bgr&0xF800)>>8;
1354
                *d++ = (bgr&0x7E0)>>3;
1355
                *d++ = (bgr&0x1F)<<3;
1356
#else
1357
                *d++ = (bgr&0x1F)<<3;
1358
                *d++ = (bgr&0x7E0)>>3;
1359
                *d++ = (bgr&0xF800)>>8;
1360
                *d++ = 0;
1361
#endif
1362
        }
1363
}
1364

    
1365
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1366
{
1367
        long idx = 15 - src_size;
1368
        uint8_t *s = (uint8_t *) src-idx, *d = dst-idx;
1369
#ifdef HAVE_MMX
1370
        __asm __volatile(
1371
                "        test %0, %0                        \n"
1372
                "        jns 2f                                \n"
1373
                "        "PREFETCH" (%1, %0)                \n"
1374
                "        movq %3, %%mm7                        \n"
1375
                "        pxor %4, %%mm7                        \n"
1376
                "        movq %%mm7, %%mm6                \n"
1377
                "        pxor %5, %%mm7                        \n"
1378
                        ASMALIGN(4)
1379
                "1:                                        \n"
1380
                "        "PREFETCH" 32(%1, %0)                \n"
1381
                "        movq (%1, %0), %%mm0                \n"
1382
                "        movq 8(%1, %0), %%mm1                \n"
1383
# ifdef HAVE_MMX2
1384
                "        pshufw $177, %%mm0, %%mm3        \n"
1385
                "        pshufw $177, %%mm1, %%mm5        \n"
1386
                "        pand %%mm7, %%mm0                \n"
1387
                "        pand %%mm6, %%mm3                \n"
1388
                "        pand %%mm7, %%mm1                \n"
1389
                "        pand %%mm6, %%mm5                \n"
1390
                "        por %%mm3, %%mm0                \n"
1391
                "        por %%mm5, %%mm1                \n"
1392
# else
1393
                "        movq %%mm0, %%mm2                \n"
1394
                "        movq %%mm1, %%mm4                \n"
1395
                "        pand %%mm7, %%mm0                \n"
1396
                "        pand %%mm6, %%mm2                \n"
1397
                "        pand %%mm7, %%mm1                \n"
1398
                "        pand %%mm6, %%mm4                \n"
1399
                "        movq %%mm2, %%mm3                \n"
1400
                "        movq %%mm4, %%mm5                \n"
1401
                "        pslld $16, %%mm2                \n"
1402
                "        psrld $16, %%mm3                \n"
1403
                "        pslld $16, %%mm4                \n"
1404
                "        psrld $16, %%mm5                \n"
1405
                "        por %%mm2, %%mm0                \n"
1406
                "        por %%mm4, %%mm1                \n"
1407
                "        por %%mm3, %%mm0                \n"
1408
                "        por %%mm5, %%mm1                \n"
1409
# endif
1410
                "        "MOVNTQ" %%mm0, (%2, %0)        \n"
1411
                "        "MOVNTQ" %%mm1, 8(%2, %0)        \n"
1412
                "        add $16, %0                        \n"
1413
                "        js 1b                                \n"
1414
                "        "SFENCE"                        \n"
1415
                "        "EMMS"                                \n"
1416
                "2:                                        \n"
1417
                : "+&r"(idx)
1418
                : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1419
                : "memory");
1420
#endif
1421
        for (; idx<15; idx+=4) {
1422
                register int v = *(uint32_t *)&s[idx], g = v & 0xff00;
1423
                v &= 0xff00ff;
1424
                *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1425
        }
1426
}
1427

    
1428
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1429
{
1430
        unsigned i;
1431
#ifdef HAVE_MMX
1432
        long mmx_size= 23 - src_size;
1433
        asm volatile (
1434
                "movq "MANGLE(mask24r)", %%mm5        \n\t"
1435
                "movq "MANGLE(mask24g)", %%mm6        \n\t"
1436
                "movq "MANGLE(mask24b)", %%mm7        \n\t"
1437
                ASMALIGN(4)
1438
                "1:                                \n\t"
1439
                PREFETCH" 32(%1, %%"REG_a")        \n\t"
1440
                "movq   (%1, %%"REG_a"), %%mm0        \n\t" // BGR BGR BG
1441
                "movq   (%1, %%"REG_a"), %%mm1        \n\t" // BGR BGR BG
1442
                "movq  2(%1, %%"REG_a"), %%mm2        \n\t" // R BGR BGR B
1443
                "psllq $16, %%mm0                \n\t" // 00 BGR BGR
1444
                "pand %%mm5, %%mm0                \n\t"
1445
                "pand %%mm6, %%mm1                \n\t"
1446
                "pand %%mm7, %%mm2                \n\t"
1447
                "por %%mm0, %%mm1                \n\t"
1448
                "por %%mm2, %%mm1                \n\t"                
1449
                "movq  6(%1, %%"REG_a"), %%mm0        \n\t" // BGR BGR BG
1450
                MOVNTQ" %%mm1,   (%2, %%"REG_a")\n\t" // RGB RGB RG
1451
                "movq  8(%1, %%"REG_a"), %%mm1        \n\t" // R BGR BGR B
1452
                "movq 10(%1, %%"REG_a"), %%mm2        \n\t" // GR BGR BGR
1453
                "pand %%mm7, %%mm0                \n\t"
1454
                "pand %%mm5, %%mm1                \n\t"
1455
                "pand %%mm6, %%mm2                \n\t"
1456
                "por %%mm0, %%mm1                \n\t"
1457
                "por %%mm2, %%mm1                \n\t"                
1458
                "movq 14(%1, %%"REG_a"), %%mm0        \n\t" // R BGR BGR B
1459
                MOVNTQ" %%mm1,  8(%2, %%"REG_a")\n\t" // B RGB RGB R
1460
                "movq 16(%1, %%"REG_a"), %%mm1        \n\t" // GR BGR BGR
1461
                "movq 18(%1, %%"REG_a"), %%mm2        \n\t" // BGR BGR BG
1462
                "pand %%mm6, %%mm0                \n\t"
1463
                "pand %%mm7, %%mm1                \n\t"
1464
                "pand %%mm5, %%mm2                \n\t"
1465
                "por %%mm0, %%mm1                \n\t"
1466
                "por %%mm2, %%mm1                \n\t"                
1467
                MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1468
                "add $24, %%"REG_a"                \n\t"
1469
                " js 1b                                \n\t"
1470
                : "+a" (mmx_size)
1471
                : "r" (src-mmx_size), "r"(dst-mmx_size)
1472
        );
1473

    
1474
        __asm __volatile(SFENCE:::"memory");
1475
        __asm __volatile(EMMS:::"memory");
1476

    
1477
        if(mmx_size==23) return; //finihsed, was multiple of 8
1478

    
1479
        src+= src_size;
1480
        dst+= src_size;
1481
        src_size= 23-mmx_size;
1482
        src-= src_size;
1483
        dst-= src_size;
1484
#endif
1485
        for(i=0; i<src_size; i+=3)
1486
        {
1487
                register uint8_t x;
1488
                x          = src[i + 2];
1489
                dst[i + 1] = src[i + 1];
1490
                dst[i + 2] = src[i + 0];
1491
                dst[i + 0] = x;
1492
        }
1493
}
1494

    
1495
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1496
        long width, long height,
1497
        long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1498
{
1499
        long y;
1500
        const long chromWidth= width>>1;
1501
        for(y=0; y<height; y++)
1502
        {
1503
#ifdef HAVE_MMX
1504
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1505
                asm volatile(
1506
                        "xor %%"REG_a", %%"REG_a"        \n\t"
1507
                        ASMALIGN(4)
1508
                        "1:                                \n\t"
1509
                        PREFETCH" 32(%1, %%"REG_a", 2)        \n\t"
1510
                        PREFETCH" 32(%2, %%"REG_a")        \n\t"
1511
                        PREFETCH" 32(%3, %%"REG_a")        \n\t"
1512
                        "movq (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
1513
                        "movq %%mm0, %%mm2                \n\t" // U(0)
1514
                        "movq (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
1515
                        "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
1516
                        "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)
1517

    
1518
                        "movq (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
1519
                        "movq 8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
1520
                        "movq %%mm3, %%mm4                \n\t" // Y(0)
1521
                        "movq %%mm5, %%mm6                \n\t" // Y(8)
1522
                        "punpcklbw %%mm0, %%mm3                \n\t" // YUYV YUYV(0)
1523
                        "punpckhbw %%mm0, %%mm4                \n\t" // YUYV YUYV(4)
1524
                        "punpcklbw %%mm2, %%mm5                \n\t" // YUYV YUYV(8)
1525
                        "punpckhbw %%mm2, %%mm6                \n\t" // YUYV YUYV(12)
1526

    
1527
                        MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1528
                        MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1529
                        MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1530
                        MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1531

    
1532
                        "add $8, %%"REG_a"                \n\t"
1533
                        "cmp %4, %%"REG_a"                \n\t"
1534
                        " jb 1b                                \n\t"
1535
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1536
                        : "%"REG_a
1537
                );
1538
#else
1539

    
1540
#if defined ARCH_ALPHA && defined HAVE_MVI
1541
#define pl2yuy2(n)                                        \
1542
        y1 = yc[n];                                        \
1543
        y2 = yc2[n];                                        \
1544
        u = uc[n];                                        \
1545
        v = vc[n];                                        \
1546
        asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));        \
1547
        asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));        \
1548
        asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
1549
        asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
1550
        yuv1 = (u << 8) + (v << 24);                        \
1551
        yuv2 = yuv1 + y2;                                \
1552
        yuv1 += y1;                                        \
1553
        qdst[n] = yuv1;                                        \
1554
        qdst2[n] = yuv2;
1555

    
1556
                int i;
1557
                uint64_t *qdst = (uint64_t *) dst;
1558
                uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1559
                const uint32_t *yc = (uint32_t *) ysrc;
1560
                const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1561
                const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1562
                for(i = 0; i < chromWidth; i += 8){
1563
                        uint64_t y1, y2, yuv1, yuv2;
1564
                        uint64_t u, v;
1565
                        /* Prefetch */
1566
                        asm("ldq $31,64(%0)" :: "r"(yc));
1567
                        asm("ldq $31,64(%0)" :: "r"(yc2));
1568
                        asm("ldq $31,64(%0)" :: "r"(uc));
1569
                        asm("ldq $31,64(%0)" :: "r"(vc));
1570

    
1571
                        pl2yuy2(0);
1572
                        pl2yuy2(1);
1573
                        pl2yuy2(2);
1574
                        pl2yuy2(3);
1575

    
1576
                        yc += 4;
1577
                        yc2 += 4;
1578
                        uc += 4;
1579
                        vc += 4;
1580
                        qdst += 4;
1581
                        qdst2 += 4;
1582
                }
1583
                y++;
1584
                ysrc += lumStride;
1585
                dst += dstStride;
1586

    
1587
#elif __WORDSIZE >= 64
1588
                int i;
1589
                uint64_t *ldst = (uint64_t *) dst;
1590
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1591
                for(i = 0; i < chromWidth; i += 2){
1592
                        uint64_t k, l;
1593
                        k = yc[0] + (uc[0] << 8) +
1594
                            (yc[1] << 16) + (vc[0] << 24);
1595
                        l = yc[2] + (uc[1] << 8) +
1596
                            (yc[3] << 16) + (vc[1] << 24);
1597
                        *ldst++ = k + (l << 32);
1598
                        yc += 4;
1599
                        uc += 2;
1600
                        vc += 2;
1601
                }
1602

    
1603
#else
1604
                int i, *idst = (int32_t *) dst;
1605
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1606
                for(i = 0; i < chromWidth; i++){
1607
#ifdef WORDS_BIGENDIAN
1608
                        *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1609
                            (yc[1] << 8) + (vc[0] << 0);
1610
#else
1611
                        *idst++ = yc[0] + (uc[0] << 8) +
1612
                            (yc[1] << 16) + (vc[0] << 24);
1613
#endif
1614
                        yc += 2;
1615
                        uc++;
1616
                        vc++;
1617
                }
1618
#endif
1619
#endif
1620
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1621
                {
1622
                        usrc += chromStride;
1623
                        vsrc += chromStride;
1624
                }
1625
                ysrc += lumStride;
1626
                dst += dstStride;
1627
        }
1628
#ifdef HAVE_MMX
1629
asm(    EMMS" \n\t"
1630
        SFENCE" \n\t"
1631
        :::"memory");
1632
#endif
1633
}
1634

    
1635
/**
1636
 *
1637
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1638
 * problem for anyone then tell me, and ill fix it)
1639
 */
1640
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1641
        long width, long height,
1642
        long lumStride, long chromStride, long dstStride)
1643
{
1644
        //FIXME interpolate chroma
1645
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1646
}
1647

    
1648
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1649
        long width, long height,
1650
        long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1651
{
1652
        long y;
1653
        const long chromWidth= width>>1;
1654
        for(y=0; y<height; y++)
1655
        {
1656
#ifdef HAVE_MMX
1657
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1658
                asm volatile(
1659
                        "xor %%"REG_a", %%"REG_a"        \n\t"
1660
                        ASMALIGN(4)
1661
                        "1:                                \n\t"
1662
                        PREFETCH" 32(%1, %%"REG_a", 2)        \n\t"
1663
                        PREFETCH" 32(%2, %%"REG_a")        \n\t"
1664
                        PREFETCH" 32(%3, %%"REG_a")        \n\t"
1665
                        "movq (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
1666
                        "movq %%mm0, %%mm2                \n\t" // U(0)
1667
                        "movq (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
1668
                        "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
1669
                        "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)
1670

    
1671
                        "movq (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
1672
                        "movq 8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
1673
                        "movq %%mm0, %%mm4                \n\t" // Y(0)
1674
                        "movq %%mm2, %%mm6                \n\t" // Y(8)
1675
                        "punpcklbw %%mm3, %%mm0                \n\t" // YUYV YUYV(0)
1676
                        "punpckhbw %%mm3, %%mm4                \n\t" // YUYV YUYV(4)
1677
                        "punpcklbw %%mm5, %%mm2                \n\t" // YUYV YUYV(8)
1678
                        "punpckhbw %%mm5, %%mm6                \n\t" // YUYV YUYV(12)
1679

    
1680
                        MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1681
                        MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1682
                        MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1683
                        MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1684

    
1685
                        "add $8, %%"REG_a"                \n\t"
1686
                        "cmp %4, %%"REG_a"                \n\t"
1687
                        " jb 1b                                \n\t"
1688
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1689
                        : "%"REG_a
1690
                );
1691
#else
1692
//FIXME adapt the alpha asm code from yv12->yuy2
1693

    
1694
#if __WORDSIZE >= 64
1695
                int i;
1696
                uint64_t *ldst = (uint64_t *) dst;
1697
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1698
                for(i = 0; i < chromWidth; i += 2){
1699
                        uint64_t k, l;
1700
                        k = uc[0] + (yc[0] << 8) +
1701
                            (vc[0] << 16) + (yc[1] << 24);
1702
                        l = uc[1] + (yc[2] << 8) +
1703
                            (vc[1] << 16) + (yc[3] << 24);
1704
                        *ldst++ = k + (l << 32);
1705
                        yc += 4;
1706
                        uc += 2;
1707
                        vc += 2;
1708
                }
1709

    
1710
#else
1711
                int i, *idst = (int32_t *) dst;
1712
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1713
                for(i = 0; i < chromWidth; i++){
1714
#ifdef WORDS_BIGENDIAN
1715
                        *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1716
                            (vc[0] << 8) + (yc[1] << 0);
1717
#else
1718
                        *idst++ = uc[0] + (yc[0] << 8) +
1719
                            (vc[0] << 16) + (yc[1] << 24);
1720
#endif
1721
                        yc += 2;
1722
                        uc++;
1723
                        vc++;
1724
                }
1725
#endif
1726
#endif
1727
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1728
                {
1729
                        usrc += chromStride;
1730
                        vsrc += chromStride;
1731
                }
1732
                ysrc += lumStride;
1733
                dst += dstStride;
1734
        }
1735
#ifdef HAVE_MMX
1736
asm(    EMMS" \n\t"
1737
        SFENCE" \n\t"
1738
        :::"memory");
1739
#endif
1740
}
1741

    
1742
/**
1743
 *
1744
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1745
 * problem for anyone then tell me, and ill fix it)
1746
 */
1747
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1748
        long width, long height,
1749
        long lumStride, long chromStride, long dstStride)
1750
{
1751
        //FIXME interpolate chroma
1752
        RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1753
}
1754

    
1755
/**
1756
 *
1757
 * width should be a multiple of 16
1758
 */
1759
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1760
        long width, long height,
1761
        long lumStride, long chromStride, long dstStride)
1762
{
1763
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1764
}
1765

    
1766
/**
1767
 *
1768
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1769
 * problem for anyone then tell me, and ill fix it)
1770
 */
1771
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1772
        long width, long height,
1773
        long lumStride, long chromStride, long srcStride)
1774
{
1775
        long y;
1776
        const long chromWidth= width>>1;
1777
        for(y=0; y<height; y+=2)
1778
        {
1779
#ifdef HAVE_MMX
1780
                asm volatile(
1781
                        "xor %%"REG_a", %%"REG_a"        \n\t"
1782
                        "pcmpeqw %%mm7, %%mm7                \n\t"
1783
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1784
                        ASMALIGN(4)
1785
                        "1:                                \n\t"
1786
                        PREFETCH" 64(%0, %%"REG_a", 4)        \n\t"
1787
                        "movq (%0, %%"REG_a", 4), %%mm0        \n\t" // YUYV YUYV(0)
1788
                        "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1789
                        "movq %%mm0, %%mm2                \n\t" // YUYV YUYV(0)
1790
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(4)
1791
                        "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
1792
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
1793
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
1794
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
1795
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
1796
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)
1797

    
1798
                        MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1799

    
1800
                        "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1801
                        "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1802
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(8)
1803
                        "movq %%mm2, %%mm4                \n\t" // YUYV YUYV(12)
1804
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
1805
                        "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
1806
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
1807
                        "pand %%mm7, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
1808
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
1809
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)
1810

    
1811
                        MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
1812

    
1813
                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
1814
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
1815
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1816
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1817
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
1818
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
1819
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
1820
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)
1821

    
1822
                        MOVNTQ" %%mm0, (%3, %%"REG_a")        \n\t"
1823
                        MOVNTQ" %%mm2, (%2, %%"REG_a")        \n\t"
1824

    
1825
                        "add $8, %%"REG_a"                \n\t"
1826
                        "cmp %4, %%"REG_a"                \n\t"
1827
                        " jb 1b                                \n\t"
1828
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1829
                        : "memory", "%"REG_a
1830
                );
1831

    
1832
                ydst += lumStride;
1833
                src  += srcStride;
1834

    
1835
                asm volatile(
1836
                        "xor %%"REG_a", %%"REG_a"        \n\t"
1837
                        ASMALIGN(4)
1838
                        "1:                                \n\t"
1839
                        PREFETCH" 64(%0, %%"REG_a", 4)        \n\t"
1840
                        "movq (%0, %%"REG_a", 4), %%mm0        \n\t" // YUYV YUYV(0)
1841
                        "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1842
                        "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1843
                        "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1844
                        "pand %%mm7, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
1845
                        "pand %%mm7, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
1846
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
1847
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
1848
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
1849
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)
1850

    
1851
                        MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1852
                        MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1853

    
1854
                        "add $8, %%"REG_a"                \n\t"
1855
                        "cmp %4, %%"REG_a"                \n\t"
1856
                        " jb 1b                                \n\t"
1857

    
1858
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1859
                        : "memory", "%"REG_a
1860
                );
1861
#else
1862
                long i;
1863
                for(i=0; i<chromWidth; i++)
1864
                {
1865
                        ydst[2*i+0]         = src[4*i+0];
1866
                        udst[i]         = src[4*i+1];
1867
                        ydst[2*i+1]         = src[4*i+2];
1868
                        vdst[i]         = src[4*i+3];
1869
                }
1870
                ydst += lumStride;
1871
                src  += srcStride;
1872

    
1873
                for(i=0; i<chromWidth; i++)
1874
                {
1875
                        ydst[2*i+0]         = src[4*i+0];
1876
                        ydst[2*i+1]         = src[4*i+2];
1877
                }
1878
#endif
1879
                udst += chromStride;
1880
                vdst += chromStride;
1881
                ydst += lumStride;
1882
                src  += srcStride;
1883
        }
1884
#ifdef HAVE_MMX
1885
asm volatile(   EMMS" \n\t"
1886
                SFENCE" \n\t"
1887
                :::"memory");
1888
#endif
1889
}
1890

    
1891
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1892
        uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1893
        long width, long height, long lumStride, long chromStride)
1894
{
1895
        /* Y Plane */
1896
        memcpy(ydst, ysrc, width*height);
1897

    
1898
        /* XXX: implement upscaling for U,V */
1899
}
1900

    
1901
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1902
{
1903
        long x,y;
1904
        
1905
        dst[0]= src[0];
1906
        
1907
        // first line
1908
        for(x=0; x<srcWidth-1; x++){
1909
                dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1910
                dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1911
        }
1912
        dst[2*srcWidth-1]= src[srcWidth-1];
1913
        
1914
        dst+= dstStride;
1915

    
1916
        for(y=1; y<srcHeight; y++){
1917
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1918
                const long mmxSize= srcWidth&~15;
1919
                asm volatile(
1920
                        "mov %4, %%"REG_a"                \n\t"
1921
                        "1:                                \n\t"
1922
                        "movq (%0, %%"REG_a"), %%mm0        \n\t"
1923
                        "movq (%1, %%"REG_a"), %%mm1        \n\t"
1924
                        "movq 1(%0, %%"REG_a"), %%mm2        \n\t"
1925
                        "movq 1(%1, %%"REG_a"), %%mm3        \n\t"
1926
                        "movq -1(%0, %%"REG_a"), %%mm4        \n\t"
1927
                        "movq -1(%1, %%"REG_a"), %%mm5        \n\t"
1928
                        PAVGB" %%mm0, %%mm5                \n\t"
1929
                        PAVGB" %%mm0, %%mm3                \n\t"
1930
                        PAVGB" %%mm0, %%mm5                \n\t"
1931
                        PAVGB" %%mm0, %%mm3                \n\t"
1932
                        PAVGB" %%mm1, %%mm4                \n\t"
1933
                        PAVGB" %%mm1, %%mm2                \n\t"
1934
                        PAVGB" %%mm1, %%mm4                \n\t"
1935
                        PAVGB" %%mm1, %%mm2                \n\t"
1936
                        "movq %%mm5, %%mm7                \n\t"
1937
                        "movq %%mm4, %%mm6                \n\t"
1938
                        "punpcklbw %%mm3, %%mm5                \n\t"
1939
                        "punpckhbw %%mm3, %%mm7                \n\t"
1940
                        "punpcklbw %%mm2, %%mm4                \n\t"
1941
                        "punpckhbw %%mm2, %%mm6                \n\t"
1942
#if 1
1943
                        MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1944
                        MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1945
                        MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1946
                        MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1947
#else
1948
                        "movq %%mm5, (%2, %%"REG_a", 2)        \n\t"
1949
                        "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1950
                        "movq %%mm4, (%3, %%"REG_a", 2)        \n\t"
1951
                        "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1952
#endif
1953
                        "add $8, %%"REG_a"                \n\t"
1954
                        " js 1b                                \n\t"
1955
                        :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1956
                           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1957
                           "g" (-mmxSize)
1958
                        : "%"REG_a
1959

    
1960
                );
1961
#else
1962
                const long mmxSize=1;
1963
#endif
1964
                dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1965
                dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1966

    
1967
                for(x=mmxSize-1; x<srcWidth-1; x++){
1968
                        dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1969
                        dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1970
                        dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1971
                        dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1972
                }
1973
                dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1974
                dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1975

    
1976
                dst+=dstStride*2;
1977
                src+=srcStride;
1978
        }
1979
        
1980
        // last line
1981
#if 1
1982
        dst[0]= src[0];
1983
        
1984
        for(x=0; x<srcWidth-1; x++){
1985
                dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1986
                dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1987
        }
1988
        dst[2*srcWidth-1]= src[srcWidth-1];
1989
#else
1990
        for(x=0; x<srcWidth; x++){
1991
                dst[2*x+0]=
1992
                dst[2*x+1]= src[x];
1993
        }
1994
#endif
1995

    
1996
#ifdef HAVE_MMX
1997
asm volatile(   EMMS" \n\t"
1998
                SFENCE" \n\t"
1999
                :::"memory");
2000
#endif
2001
}
2002

    
2003
/**
2004
 *
2005
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
2006
 * problem for anyone then tell me, and ill fix it)
2007
 * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
2008
 */
2009
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2010
        long width, long height,
2011
        long lumStride, long chromStride, long srcStride)
2012
{
2013
        long y;
2014
        const long chromWidth= width>>1;
2015
        for(y=0; y<height; y+=2)
2016
        {
2017
#ifdef HAVE_MMX
2018
                asm volatile(
2019
                        "xorl %%eax, %%eax                \n\t"
2020
                        "pcmpeqw %%mm7, %%mm7                \n\t"
2021
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
2022
                        ASMALIGN(4)
2023
                        "1:                                \n\t"
2024
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
2025
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // UYVY UYVY(0)
2026
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(4)
2027
                        "movq %%mm0, %%mm2                \n\t" // UYVY UYVY(0)
2028
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(4)
2029
                        "pand %%mm7, %%mm0                \n\t" // U0V0 U0V0(0)
2030
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(4)
2031
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
2032
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
2033
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
2034
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)
2035

    
2036
                        MOVNTQ" %%mm2, (%1, %%eax, 2)        \n\t"
2037

    
2038
                        "movq 16(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(8)
2039
                        "movq 24(%0, %%eax, 4), %%mm2        \n\t" // UYVY UYVY(12)
2040
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(8)
2041
                        "movq %%mm2, %%mm4                \n\t" // UYVY UYVY(12)
2042
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(8)
2043
                        "pand %%mm7, %%mm2                \n\t" // U0V0 U0V0(12)
2044
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
2045
                        "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
2046
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
2047
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)
2048

    
2049
                        MOVNTQ" %%mm3, 8(%1, %%eax, 2)        \n\t"
2050

    
2051
                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
2052
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
2053
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
2054
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
2055
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
2056
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
2057
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
2058
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)
2059

    
2060
                        MOVNTQ" %%mm0, (%3, %%eax)        \n\t"
2061
                        MOVNTQ" %%mm2, (%2, %%eax)        \n\t"
2062

    
2063
                        "addl $8, %%eax                        \n\t"
2064
                        "cmpl %4, %%eax                        \n\t"
2065
                        " jb 1b                                \n\t"
2066
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2067
                        : "memory", "%eax"
2068
                );
2069

    
2070
                ydst += lumStride;
2071
                src  += srcStride;
2072

    
2073
                asm volatile(
2074
                        "xorl %%eax, %%eax                \n\t"
2075
                        ASMALIGN(4)
2076
                        "1:                                \n\t"
2077
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
2078
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
2079
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
2080
                        "movq 16(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(8)
2081
                        "movq 24(%0, %%eax, 4), %%mm3        \n\t" // YUYV YUYV(12)
2082
                        "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
2083
                        "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
2084
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
2085
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
2086
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
2087
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)
2088

    
2089
                        MOVNTQ" %%mm0, (%1, %%eax, 2)        \n\t"
2090
                        MOVNTQ" %%mm2, 8(%1, %%eax, 2)        \n\t"
2091

    
2092
                        "addl $8, %%eax                        \n\t"
2093
                        "cmpl %4, %%eax                        \n\t"
2094
                        " jb 1b                                \n\t"
2095

    
2096
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2097
                        : "memory", "%eax"
2098
                );
2099
#else
2100
                long i;
2101
                for(i=0; i<chromWidth; i++)
2102
                {
2103
                        udst[i]         = src[4*i+0];
2104
                        ydst[2*i+0]         = src[4*i+1];
2105
                        vdst[i]         = src[4*i+2];
2106
                        ydst[2*i+1]         = src[4*i+3];
2107
                }
2108
                ydst += lumStride;
2109
                src  += srcStride;
2110

    
2111
                for(i=0; i<chromWidth; i++)
2112
                {
2113
                        ydst[2*i+0]         = src[4*i+1];
2114
                        ydst[2*i+1]         = src[4*i+3];
2115
                }
2116
#endif
2117
                udst += chromStride;
2118
                vdst += chromStride;
2119
                ydst += lumStride;
2120
                src  += srcStride;
2121
        }
2122
#ifdef HAVE_MMX
2123
asm volatile(   EMMS" \n\t"
2124
                SFENCE" \n\t"
2125
                :::"memory");
2126
#endif
2127
}
2128

    
2129
/**
2130
 *
2131
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2132
 * problem for anyone then tell me, and I'll fix it)
2133
 * chrominance data is only taken from every second line; the others are ignored in the C version. FIXME write HQ version
2134
 */
2135
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2136
        long width, long height,
2137
        long lumStride, long chromStride, long srcStride)
2138
{
2139
        long y;
2140
        const long chromWidth= width>>1;
2141
#ifdef HAVE_MMX
2142
        for(y=0; y<height-2; y+=2)
2143
        {
2144
                long i;
2145
                for(i=0; i<2; i++)
2146
                {
2147
                        asm volatile(
2148
                                "mov %2, %%"REG_a"                \n\t"
2149
                                "movq "MANGLE(bgr2YCoeff)", %%mm6                \n\t"
2150
                                "movq "MANGLE(w1111)", %%mm5                \n\t"
2151
                                "pxor %%mm7, %%mm7                \n\t"
2152
                                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2153
                                ASMALIGN(4)
2154
                                "1:                                \n\t"
2155
                                PREFETCH" 64(%0, %%"REG_d")        \n\t"
2156
                                "movd (%0, %%"REG_d"), %%mm0        \n\t"
2157
                                "movd 3(%0, %%"REG_d"), %%mm1        \n\t"
2158
                                "punpcklbw %%mm7, %%mm0                \n\t"
2159
                                "punpcklbw %%mm7, %%mm1                \n\t"
2160
                                "movd 6(%0, %%"REG_d"), %%mm2        \n\t"
2161
                                "movd 9(%0, %%"REG_d"), %%mm3        \n\t"
2162
                                "punpcklbw %%mm7, %%mm2                \n\t"
2163
                                "punpcklbw %%mm7, %%mm3                \n\t"
2164
                                "pmaddwd %%mm6, %%mm0                \n\t"
2165
                                "pmaddwd %%mm6, %%mm1                \n\t"
2166
                                "pmaddwd %%mm6, %%mm2                \n\t"
2167
                                "pmaddwd %%mm6, %%mm3                \n\t"
2168
#ifndef FAST_BGR2YV12
2169
                                "psrad $8, %%mm0                \n\t"
2170
                                "psrad $8, %%mm1                \n\t"
2171
                                "psrad $8, %%mm2                \n\t"
2172
                                "psrad $8, %%mm3                \n\t"
2173
#endif
2174
                                "packssdw %%mm1, %%mm0                \n\t"
2175
                                "packssdw %%mm3, %%mm2                \n\t"
2176
                                "pmaddwd %%mm5, %%mm0                \n\t"
2177
                                "pmaddwd %%mm5, %%mm2                \n\t"
2178
                                "packssdw %%mm2, %%mm0                \n\t"
2179
                                "psraw $7, %%mm0                \n\t"
2180

    
2181
                                "movd 12(%0, %%"REG_d"), %%mm4        \n\t"
2182
                                "movd 15(%0, %%"REG_d"), %%mm1        \n\t"
2183
                                "punpcklbw %%mm7, %%mm4                \n\t"
2184
                                "punpcklbw %%mm7, %%mm1                \n\t"
2185
                                "movd 18(%0, %%"REG_d"), %%mm2        \n\t"
2186
                                "movd 21(%0, %%"REG_d"), %%mm3        \n\t"
2187
                                "punpcklbw %%mm7, %%mm2                \n\t"
2188
                                "punpcklbw %%mm7, %%mm3                \n\t"
2189
                                "pmaddwd %%mm6, %%mm4                \n\t"
2190
                                "pmaddwd %%mm6, %%mm1                \n\t"
2191
                                "pmaddwd %%mm6, %%mm2                \n\t"
2192
                                "pmaddwd %%mm6, %%mm3                \n\t"
2193
#ifndef FAST_BGR2YV12
2194
                                "psrad $8, %%mm4                \n\t"
2195
                                "psrad $8, %%mm1                \n\t"
2196
                                "psrad $8, %%mm2                \n\t"
2197
                                "psrad $8, %%mm3                \n\t"
2198
#endif
2199
                                "packssdw %%mm1, %%mm4                \n\t"
2200
                                "packssdw %%mm3, %%mm2                \n\t"
2201
                                "pmaddwd %%mm5, %%mm4                \n\t"
2202
                                "pmaddwd %%mm5, %%mm2                \n\t"
2203
                                "add $24, %%"REG_d"                \n\t"
2204
                                "packssdw %%mm2, %%mm4                \n\t"
2205
                                "psraw $7, %%mm4                \n\t"
2206

    
2207
                                "packuswb %%mm4, %%mm0                \n\t"
2208
                                "paddusb "MANGLE(bgr2YOffset)", %%mm0        \n\t"
2209

    
2210
                                MOVNTQ" %%mm0, (%1, %%"REG_a")        \n\t"
2211
                                "add $8, %%"REG_a"                \n\t"
2212
                                " js 1b                                \n\t"
2213
                                : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2214
                                : "%"REG_a, "%"REG_d
2215
                        );
2216
                        ydst += lumStride;
2217
                        src  += srcStride;
2218
                }
2219
                src -= srcStride*2;
2220
                asm volatile(
2221
                        "mov %4, %%"REG_a"                \n\t"
2222
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
2223
                        "movq "MANGLE(bgr2UCoeff)", %%mm6                \n\t"
2224
                        "pxor %%mm7, %%mm7                \n\t"
2225
                        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2226
                        "add %%"REG_d", %%"REG_d"        \n\t"
2227
                        ASMALIGN(4)
2228
                        "1:                                \n\t"
2229
                        PREFETCH" 64(%0, %%"REG_d")        \n\t"
2230
                        PREFETCH" 64(%1, %%"REG_d")        \n\t"
2231
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2232
                        "movq (%0, %%"REG_d"), %%mm0        \n\t"
2233
                        "movq (%1, %%"REG_d"), %%mm1        \n\t"
2234
                        "movq 6(%0, %%"REG_d"), %%mm2        \n\t"
2235
                        "movq 6(%1, %%"REG_d"), %%mm3        \n\t"
2236
                        PAVGB" %%mm1, %%mm0                \n\t"
2237
                        PAVGB" %%mm3, %%mm2                \n\t"
2238
                        "movq %%mm0, %%mm1                \n\t"
2239
                        "movq %%mm2, %%mm3                \n\t"
2240
                        "psrlq $24, %%mm0                \n\t"
2241
                        "psrlq $24, %%mm2                \n\t"
2242
                        PAVGB" %%mm1, %%mm0                \n\t"
2243
                        PAVGB" %%mm3, %%mm2                \n\t"
2244
                        "punpcklbw %%mm7, %%mm0                \n\t"
2245
                        "punpcklbw %%mm7, %%mm2                \n\t"
2246
#else
2247
                        "movd (%0, %%"REG_d"), %%mm0        \n\t"
2248
                        "movd (%1, %%"REG_d"), %%mm1        \n\t"
2249
                        "movd 3(%0, %%"REG_d"), %%mm2        \n\t"
2250
                        "movd 3(%1, %%"REG_d"), %%mm3        \n\t"
2251
                        "punpcklbw %%mm7, %%mm0                \n\t"
2252
                        "punpcklbw %%mm7, %%mm1                \n\t"
2253
                        "punpcklbw %%mm7, %%mm2                \n\t"
2254
                        "punpcklbw %%mm7, %%mm3                \n\t"
2255
                        "paddw %%mm1, %%mm0                \n\t"
2256
                        "paddw %%mm3, %%mm2                \n\t"
2257
                        "paddw %%mm2, %%mm0                \n\t"
2258
                        "movd 6(%0, %%"REG_d"), %%mm4        \n\t"
2259
                        "movd 6(%1, %%"REG_d"), %%mm1        \n\t"
2260
                        "movd 9(%0, %%"REG_d"), %%mm2        \n\t"
2261
                        "movd 9(%1, %%"REG_d"), %%mm3        \n\t"
2262
                        "punpcklbw %%mm7, %%mm4                \n\t"
2263
                        "punpcklbw %%mm7, %%mm1                \n\t"
2264
                        "punpcklbw %%mm7, %%mm2                \n\t"
2265
                        "punpcklbw %%mm7, %%mm3                \n\t"
2266
                        "paddw %%mm1, %%mm4                \n\t"
2267
                        "paddw %%mm3, %%mm2                \n\t"
2268
                        "paddw %%mm4, %%mm2                \n\t"
2269
                        "psrlw $2, %%mm0                \n\t"
2270
                        "psrlw $2, %%mm2                \n\t"
2271
#endif
2272
                        "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
2273
                        "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
2274

    
2275
                        "pmaddwd %%mm0, %%mm1                \n\t"
2276
                        "pmaddwd %%mm2, %%mm3                \n\t"
2277
                        "pmaddwd %%mm6, %%mm0                \n\t"
2278
                        "pmaddwd %%mm6, %%mm2                \n\t"
2279
#ifndef FAST_BGR2YV12
2280
                        "psrad $8, %%mm0                \n\t"
2281
                        "psrad $8, %%mm1                \n\t"
2282
                        "psrad $8, %%mm2                \n\t"
2283
                        "psrad $8, %%mm3                \n\t"
2284
#endif
2285
                        "packssdw %%mm2, %%mm0                \n\t"
2286
                        "packssdw %%mm3, %%mm1                \n\t"
2287
                        "pmaddwd %%mm5, %%mm0                \n\t"
2288
                        "pmaddwd %%mm5, %%mm1                \n\t"
2289
                        "packssdw %%mm1, %%mm0                \n\t" // V1 V0 U1 U0
2290
                        "psraw $7, %%mm0                \n\t"
2291

    
2292
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2293
                        "movq 12(%0, %%"REG_d"), %%mm4        \n\t"
2294
                        "movq 12(%1, %%"REG_d"), %%mm1        \n\t"
2295
                        "movq 18(%0, %%"REG_d"), %%mm2        \n\t"
2296
                        "movq 18(%1, %%"REG_d"), %%mm3        \n\t"
2297
                        PAVGB" %%mm1, %%mm4                \n\t"
2298
                        PAVGB" %%mm3, %%mm2                \n\t"
2299
                        "movq %%mm4, %%mm1                \n\t"
2300
                        "movq %%mm2, %%mm3                \n\t"
2301
                        "psrlq $24, %%mm4                \n\t"
2302
                        "psrlq $24, %%mm2                \n\t"
2303
                        PAVGB" %%mm1, %%mm4                \n\t"
2304
                        PAVGB" %%mm3, %%mm2                \n\t"
2305
                        "punpcklbw %%mm7, %%mm4                \n\t"
2306
                        "punpcklbw %%mm7, %%mm2                \n\t"
2307
#else
2308
                        "movd 12(%0, %%"REG_d"), %%mm4        \n\t"
2309
                        "movd 12(%1, %%"REG_d"), %%mm1        \n\t"
2310
                        "movd 15(%0, %%"REG_d"), %%mm2        \n\t"
2311
                        "movd 15(%1, %%"REG_d"), %%mm3        \n\t"
2312
                        "punpcklbw %%mm7, %%mm4                \n\t"
2313
                        "punpcklbw %%mm7, %%mm1                \n\t"
2314
                        "punpcklbw %%mm7, %%mm2                \n\t"
2315
                        "punpcklbw %%mm7, %%mm3                \n\t"
2316
                        "paddw %%mm1, %%mm4                \n\t"
2317
                        "paddw %%mm3, %%mm2                \n\t"
2318
                        "paddw %%mm2, %%mm4                \n\t"
2319
                        "movd 18(%0, %%"REG_d"), %%mm5        \n\t"
2320
                        "movd 18(%1, %%"REG_d"), %%mm1        \n\t"
2321
                        "movd 21(%0, %%"REG_d"), %%mm2        \n\t"
2322
                        "movd 21(%1, %%"REG_d"), %%mm3        \n\t"
2323
                        "punpcklbw %%mm7, %%mm5                \n\t"
2324
                        "punpcklbw %%mm7, %%mm1                \n\t"
2325
                        "punpcklbw %%mm7, %%mm2                \n\t"
2326
                        "punpcklbw %%mm7, %%mm3                \n\t"
2327
                        "paddw %%mm1, %%mm5                \n\t"
2328
                        "paddw %%mm3, %%mm2                \n\t"
2329
                        "paddw %%mm5, %%mm2                \n\t"
2330
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
2331
                        "psrlw $2, %%mm4                \n\t"
2332
                        "psrlw $2, %%mm2                \n\t"
2333
#endif
2334
                        "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
2335
                        "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
2336

    
2337
                        "pmaddwd %%mm4, %%mm1                \n\t"
2338
                        "pmaddwd %%mm2, %%mm3                \n\t"
2339
                        "pmaddwd %%mm6, %%mm4                \n\t"
2340
                        "pmaddwd %%mm6, %%mm2                \n\t"
2341
#ifndef FAST_BGR2YV12
2342
                        "psrad $8, %%mm4                \n\t"
2343
                        "psrad $8, %%mm1                \n\t"
2344
                        "psrad $8, %%mm2                \n\t"
2345
                        "psrad $8, %%mm3                \n\t"
2346
#endif
2347
                        "packssdw %%mm2, %%mm4                \n\t"
2348
                        "packssdw %%mm3, %%mm1                \n\t"
2349
                        "pmaddwd %%mm5, %%mm4                \n\t"
2350
                        "pmaddwd %%mm5, %%mm1                \n\t"
2351
                        "add $24, %%"REG_d"                \n\t"
2352
                        "packssdw %%mm1, %%mm4                \n\t" // V3 V2 U3 U2
2353
                        "psraw $7, %%mm4                \n\t"
2354

    
2355
                        "movq %%mm0, %%mm1                \n\t"
2356
                        "punpckldq %%mm4, %%mm0                \n\t"
2357
                        "punpckhdq %%mm4, %%mm1                \n\t"
2358
                        "packsswb %%mm1, %%mm0                \n\t"
2359
                        "paddb "MANGLE(bgr2UVOffset)", %%mm0        \n\t"
2360
                        "movd %%mm0, (%2, %%"REG_a")        \n\t"
2361
                        "punpckhdq %%mm0, %%mm0                \n\t"
2362
                        "movd %%mm0, (%3, %%"REG_a")        \n\t"
2363
                        "add $4, %%"REG_a"                \n\t"
2364
                        " js 1b                                \n\t"
2365
                        : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2366
                        : "%"REG_a, "%"REG_d
2367
                );
2368

    
2369
                udst += chromStride;
2370
                vdst += chromStride;
2371
                src  += srcStride*2;
2372
        }
2373

    
2374
        asm volatile(   EMMS" \n\t"
2375
                        SFENCE" \n\t"
2376
                        :::"memory");
2377
#else
2378
        y=0;
2379
#endif
2380
        for(; y<height; y+=2)
2381
        {
2382
                long i;
2383
                for(i=0; i<chromWidth; i++)
2384
                {
2385
                        unsigned int b= src[6*i+0];
2386
                        unsigned int g= src[6*i+1];
2387
                        unsigned int r= src[6*i+2];
2388

    
2389
                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2390
                        unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2391
                        unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2392

    
2393
                        udst[i]         = U;
2394
                        vdst[i]         = V;
2395
                        ydst[2*i]         = Y;
2396

    
2397
                        b= src[6*i+3];
2398
                        g= src[6*i+4];
2399
                        r= src[6*i+5];
2400

    
2401
                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2402
                        ydst[2*i+1]         = Y;
2403
                }
2404
                ydst += lumStride;
2405
                src  += srcStride;
2406

    
2407
                for(i=0; i<chromWidth; i++)
2408
                {
2409
                        unsigned int b= src[6*i+0];
2410
                        unsigned int g= src[6*i+1];
2411
                        unsigned int r= src[6*i+2];
2412

    
2413
                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2414

    
2415
                        ydst[2*i]         = Y;
2416

    
2417
                        b= src[6*i+3];
2418
                        g= src[6*i+4];
2419
                        r= src[6*i+5];
2420

    
2421
                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2422
                        ydst[2*i+1]         = Y;
2423
                }
2424
                udst += chromStride;
2425
                vdst += chromStride;
2426
                ydst += lumStride;
2427
                src  += srcStride;
2428
        }
2429
}
2430

    
2431
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2432
                            long width, long height, long src1Stride,
2433
                            long src2Stride, long dstStride){
2434
        long h;
2435

    
2436
        for(h=0; h < height; h++)
2437
        {
2438
                long w;
2439

    
2440
#ifdef HAVE_MMX
2441
#ifdef HAVE_SSE2
2442
                asm(
2443
                        "xor %%"REG_a", %%"REG_a"        \n\t"
2444
                        "1:                                \n\t"
2445
                        PREFETCH" 64(%1, %%"REG_a")        \n\t"
2446
                        PREFETCH" 64(%2, %%"REG_a")        \n\t"
2447
                        "movdqa (%1, %%"REG_a"), %%xmm0        \n\t"
2448
                        "movdqa (%1, %%"REG_a"), %%xmm1        \n\t"
2449
                        "movdqa (%2, %%"REG_a"), %%xmm2        \n\t"
2450
                        "punpcklbw %%xmm2, %%xmm0        \n\t"
2451
                        "punpckhbw %%xmm2, %%xmm1        \n\t"
2452
                        "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2453
                        "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2454
                        "add $16, %%"REG_a"                \n\t"
2455
                        "cmp %3, %%"REG_a"                \n\t"
2456
                        " jb 1b                                \n\t"
2457
                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2458
                        : "memory", "%"REG_a""
2459
                );
2460
#else
2461
                asm(
2462
                        "xor %%"REG_a", %%"REG_a"        \n\t"
2463
                        "1:                                \n\t"
2464
                        PREFETCH" 64(%1, %%"REG_a")        \n\t"
2465
                        PREFETCH" 64(%2, %%"REG_a")        \n\t"
2466
                        "movq (%1, %%"REG_a"), %%mm0        \n\t"
2467
                        "movq 8(%1, %%"REG_a"), %%mm2        \n\t"
2468
                        "movq %%mm0, %%mm1                \n\t"
2469
                        "movq %%mm2, %%mm3                \n\t"
2470
                        "movq (%2, %%"REG_a"), %%mm4        \n\t"
2471
                        "movq 8(%2, %%"REG_a"), %%mm5        \n\t"
2472
                        "punpcklbw %%mm4, %%mm0                \n\t"
2473
                        "punpckhbw %%mm4, %%mm1                \n\t"
2474
                        "punpcklbw %%mm5, %%mm2                \n\t"
2475
                        "punpckhbw %%mm5, %%mm3                \n\t"
2476
                        MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2477
                        MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2478
                        MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2479
                        MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2480
                        "add $16, %%"REG_a"                \n\t"
2481
                        "cmp %3, %%"REG_a"                \n\t"
2482
                        " jb 1b                                \n\t"
2483
                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2484
                        : "memory", "%"REG_a
2485
                );
2486
#endif
2487
                for(w= (width&(~15)); w < width; w++)
2488
                {
2489
                        dest[2*w+0] = src1[w];
2490
                        dest[2*w+1] = src2[w];
2491
                }
2492
#else
2493
                for(w=0; w < width; w++)
2494
                {
2495
                        dest[2*w+0] = src1[w];
2496
                        dest[2*w+1] = src2[w];
2497
                }
2498
#endif
2499
                dest += dstStride;
2500
                src1 += src1Stride;
2501
                src2 += src2Stride;
2502
        }
2503
#ifdef HAVE_MMX
2504
        asm(
2505
                EMMS" \n\t"
2506
                SFENCE" \n\t"
2507
                ::: "memory"
2508
                );
2509
#endif
2510
}
2511

    
2512
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2513
                        uint8_t *dst1, uint8_t *dst2,
2514
                        long width, long height,
2515
                        long srcStride1, long srcStride2,
2516
                        long dstStride1, long dstStride2)
2517
{
2518
    long y,x,w,h;
2519
    w=width/2; h=height/2;
2520
#ifdef HAVE_MMX
2521
    asm volatile(
2522
        PREFETCH" %0\n\t"
2523
        PREFETCH" %1\n\t"
2524
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2525
#endif
2526
    for(y=0;y<h;y++){
2527
        const uint8_t* s1=src1+srcStride1*(y>>1);
2528
        uint8_t* d=dst1+dstStride1*y;
2529
        x=0;
2530
#ifdef HAVE_MMX
2531
        for(;x<w-31;x+=32)
2532
        {
2533
            asm volatile(
2534
                PREFETCH" 32%1\n\t"
2535
                "movq        %1, %%mm0\n\t"
2536
                "movq        8%1, %%mm2\n\t"
2537
                "movq        16%1, %%mm4\n\t"
2538
                "movq        24%1, %%mm6\n\t"
2539
                "movq        %%mm0, %%mm1\n\t"
2540
                "movq        %%mm2, %%mm3\n\t"
2541
                "movq        %%mm4, %%mm5\n\t"
2542
                "movq        %%mm6, %%mm7\n\t"
2543
                "punpcklbw %%mm0, %%mm0\n\t"
2544
                "punpckhbw %%mm1, %%mm1\n\t"
2545
                "punpcklbw %%mm2, %%mm2\n\t"
2546
                "punpckhbw %%mm3, %%mm3\n\t"
2547
                "punpcklbw %%mm4, %%mm4\n\t"
2548
                "punpckhbw %%mm5, %%mm5\n\t"
2549
                "punpcklbw %%mm6, %%mm6\n\t"
2550
                "punpckhbw %%mm7, %%mm7\n\t"
2551
                MOVNTQ"        %%mm0, %0\n\t"
2552
                MOVNTQ"        %%mm1, 8%0\n\t"
2553
                MOVNTQ"        %%mm2, 16%0\n\t"
2554
                MOVNTQ"        %%mm3, 24%0\n\t"
2555
                MOVNTQ"        %%mm4, 32%0\n\t"
2556
                MOVNTQ"        %%mm5, 40%0\n\t"
2557
                MOVNTQ"        %%mm6, 48%0\n\t"
2558
                MOVNTQ"        %%mm7, 56%0"
2559
                :"=m"(d[2*x])
2560
                :"m"(s1[x])
2561
                :"memory");
2562
        }
2563
#endif
2564
        for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2565
    }
2566
    for(y=0;y<h;y++){
2567
        const uint8_t* s2=src2+srcStride2*(y>>1);
2568
        uint8_t* d=dst2+dstStride2*y;
2569
        x=0;
2570
#ifdef HAVE_MMX
2571
        for(;x<w-31;x+=32)
2572
        {
2573
            asm volatile(
2574
                PREFETCH" 32%1\n\t"
2575
                "movq        %1, %%mm0\n\t"
2576
                "movq        8%1, %%mm2\n\t"
2577
                "movq        16%1, %%mm4\n\t"
2578
                "movq        24%1, %%mm6\n\t"
2579
                "movq        %%mm0, %%mm1\n\t"
2580
                "movq        %%mm2, %%mm3\n\t"
2581
                "movq        %%mm4, %%mm5\n\t"
2582
                "movq        %%mm6, %%mm7\n\t"
2583
                "punpcklbw %%mm0, %%mm0\n\t"
2584
                "punpckhbw %%mm1, %%mm1\n\t"
2585
                "punpcklbw %%mm2, %%mm2\n\t"
2586
                "punpckhbw %%mm3, %%mm3\n\t"
2587
                "punpcklbw %%mm4, %%mm4\n\t"
2588
                "punpckhbw %%mm5, %%mm5\n\t"
2589
                "punpcklbw %%mm6, %%mm6\n\t"
2590
                "punpckhbw %%mm7, %%mm7\n\t"
2591
                MOVNTQ"        %%mm0, %0\n\t"
2592
                MOVNTQ"        %%mm1, 8%0\n\t"
2593
                MOVNTQ"        %%mm2, 16%0\n\t"
2594
                MOVNTQ"        %%mm3, 24%0\n\t"
2595
                MOVNTQ"        %%mm4, 32%0\n\t"
2596
                MOVNTQ"        %%mm5, 40%0\n\t"
2597
                MOVNTQ"        %%mm6, 48%0\n\t"
2598
                MOVNTQ"        %%mm7, 56%0"
2599
                :"=m"(d[2*x])
2600
                :"m"(s2[x])
2601
                :"memory");
2602
        }
2603
#endif
2604
        for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2605
    }
2606
#ifdef HAVE_MMX
2607
        asm(
2608
                EMMS" \n\t"
2609
                SFENCE" \n\t"
2610
                ::: "memory"
2611
                );
2612
#endif
2613
}
2614

    
2615
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2616
                        uint8_t *dst,
2617
                        long width, long height,
2618
                        long srcStride1, long srcStride2,
2619
                        long srcStride3, long dstStride)
2620
{
2621
    long y,x,w,h;
2622
    w=width/2; h=height;
2623
    for(y=0;y<h;y++){
2624
        const uint8_t* yp=src1+srcStride1*y;
2625
        const uint8_t* up=src2+srcStride2*(y>>2);
2626
        const uint8_t* vp=src3+srcStride3*(y>>2);
2627
        uint8_t* d=dst+dstStride*y;
2628
        x=0;
2629
#ifdef HAVE_MMX
2630
        for(;x<w-7;x+=8)
2631
        {
2632
            asm volatile(
2633
                PREFETCH" 32(%1, %0)\n\t"
2634
                PREFETCH" 32(%2, %0)\n\t"
2635
                PREFETCH" 32(%3, %0)\n\t"
2636
                "movq        (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2637
                "movq        (%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
2638
                "movq        (%3, %0), %%mm2\n\t"             /* V0V1V2V3V4V5V6V7 */
2639
                "movq        %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2640
                "movq        %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
2641
                "movq        %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
2642
                "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2643
                "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2644
                "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2645
                "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2646

    
2647
                "movq        %%mm1, %%mm6\n\t"
2648
                "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2649
                "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2650
                "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2651
                MOVNTQ"        %%mm0, (%4, %0, 8)\n\t"
2652
                MOVNTQ"        %%mm3, 8(%4, %0, 8)\n\t"
2653
                
2654
                "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2655
                "movq        8(%1, %0, 4), %%mm0\n\t"
2656
                "movq        %%mm0, %%mm3\n\t"
2657
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2658
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2659
                MOVNTQ"        %%mm0, 16(%4, %0, 8)\n\t"
2660
                MOVNTQ"        %%mm3, 24(%4, %0, 8)\n\t"
2661

    
2662
                "movq        %%mm4, %%mm6\n\t"
2663
                "movq        16(%1, %0, 4), %%mm0\n\t"
2664
                "movq        %%mm0, %%mm3\n\t"
2665
                "punpcklbw %%mm5, %%mm4\n\t"
2666
                "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2667
                "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2668
                MOVNTQ"        %%mm0, 32(%4, %0, 8)\n\t"
2669
                MOVNTQ"        %%mm3, 40(%4, %0, 8)\n\t"
2670
                
2671
                "punpckhbw %%mm5, %%mm6\n\t"
2672
                "movq        24(%1, %0, 4), %%mm0\n\t"
2673
                "movq        %%mm0, %%mm3\n\t"
2674
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2675
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2676
                MOVNTQ"        %%mm0, 48(%4, %0, 8)\n\t"
2677
                MOVNTQ"        %%mm3, 56(%4, %0, 8)\n\t"
2678

    
2679
                : "+r" (x)
2680
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
2681
                :"memory");
2682
        }
2683
#endif
2684
        for(; x<w; x++)
2685
        {
2686
            const long x2= x<<2;
2687
            d[8*x+0]=yp[x2];
2688
            d[8*x+1]=up[x];
2689
            d[8*x+2]=yp[x2+1];
2690
            d[8*x+3]=vp[x];
2691
            d[8*x+4]=yp[x2+2];
2692
            d[8*x+5]=up[x];
2693
            d[8*x+6]=yp[x2+3];
2694
            d[8*x+7]=vp[x];
2695
        }
2696
    }
2697
#ifdef HAVE_MMX
2698
        asm(
2699
                EMMS" \n\t"
2700
                SFENCE" \n\t"
2701
                ::: "memory"
2702
                );
2703
#endif
2704
}
2705

    
2706
static inline void RENAME(rgb2rgb_init)(void){
2707
        rgb15to16= RENAME(rgb15to16);
2708
        rgb15to24= RENAME(rgb15to24);
2709
        rgb15to32= RENAME(rgb15to32);
2710
        rgb16to24= RENAME(rgb16to24);
2711
        rgb16to32= RENAME(rgb16to32);
2712
        rgb16to15= RENAME(rgb16to15);
2713
        rgb24to16= RENAME(rgb24to16);
2714
        rgb24to15= RENAME(rgb24to15);
2715
        rgb24to32= RENAME(rgb24to32);
2716
        rgb32to16= RENAME(rgb32to16);
2717
        rgb32to15= RENAME(rgb32to15);
2718
        rgb32to24= RENAME(rgb32to24);
2719
        rgb24tobgr15= RENAME(rgb24tobgr15);
2720
        rgb24tobgr16= RENAME(rgb24tobgr16);
2721
        rgb24tobgr24= RENAME(rgb24tobgr24);
2722
        rgb32tobgr32= RENAME(rgb32tobgr32);
2723
        rgb32tobgr16= RENAME(rgb32tobgr16);
2724
        rgb32tobgr15= RENAME(rgb32tobgr15);
2725
        yv12toyuy2= RENAME(yv12toyuy2);
2726
        yv12touyvy= RENAME(yv12touyvy);
2727
        yuv422ptoyuy2= RENAME(yuv422ptoyuy2);
2728
        yuy2toyv12= RENAME(yuy2toyv12);
2729
//        uyvytoyv12= RENAME(uyvytoyv12);
2730
//        yvu9toyv12= RENAME(yvu9toyv12);
2731
        planar2x= RENAME(planar2x);
2732
        rgb24toyv12= RENAME(rgb24toyv12);
2733
        interleaveBytes= RENAME(interleaveBytes);
2734
        vu9_to_vu12= RENAME(vu9_to_vu12);
2735
        yvu9_to_yuy2= RENAME(yvu9_to_yuy2);
2736
}