Statistics
| Branch: | Revision:

ffmpeg / libswscale / rgb2rgb_template.c @ 4bff9ef9

History | View | Annotate | Download (66.9 KB)

1
/*
2
 *
3
 *  rgb2rgb.c, Software RGB to RGB convertor
4
 *  pluralize by Software PAL8 to RGB convertor
5
 *               Software YUV to YUV convertor
6
 *               Software YUV to RGB convertor
7
 *  Written by Nick Kurshev.
8
 *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9
 *  lots of big-endian byte-order fixes by Alex Beregszaszi
10
 */
11

    
12
#include <stddef.h>
13
#include <inttypes.h> /* for __WORDSIZE */
14

    
15
/* Take __WORDSIZE from the project's own setting when the system headers
 * did not provide it. */
#if !defined(__WORDSIZE)
// #warning You have misconfigured system and probably will lose performance!
#define __WORDSIZE MP_WORDSIZE
#endif

/* Drop any earlier definitions so that every helper macro below reflects the
 * HAVE_* feature macros that are in effect at this point. */
#undef PREFETCH
#undef PREFETCHW
#undef PAVGB
#undef MOVNTQ
#undef SFENCE
#undef EMMS
#undef MMREG_SIZE

/* Size in bytes of the SIMD register selected by the feature macros. */
#if defined(HAVE_SSE2)
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/* Prefetch and packed byte-average mnemonics.
 * Without 3DNow! or MMX2 the prefetch macros become "#" (Apple) or "/nop"
 * (everything else), and PAVGB stays undefined.
 * NOTE(review): "#" and "/nop" presumably make the assembler ignore the
 * operand that follows the macro -- confirm against the target assembler. */
#if defined(HAVE_3DNOW)
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif defined(HAVE_MMX2)
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#elif defined(__APPLE__)
#define PREFETCH  "#"
#define PREFETCHW "#"
#else
#define PREFETCH  "/nop"
#define PREFETCHW "/nop"
#endif

/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#if defined(HAVE_3DNOW)
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

/* Store and fence mnemonics: movntq/sfence with MMX2, otherwise a plain movq
 * store and the same "#" / "/nop" placeholder used for the prefetch macros. */
#if defined(HAVE_MMX2)
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#elif defined(__APPLE__)
#define MOVNTQ "movq"
#define SFENCE "#"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
70

    
71
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
72
{
73
  uint8_t *dest = dst;
74
  const uint8_t *s = src;
75
  const uint8_t *end;
76
#ifdef HAVE_MMX
77
  const uint8_t *mm_end;
78
#endif
79
  end = s + src_size;
80
#ifdef HAVE_MMX
81
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
82
  mm_end = end - 23;
83
  __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
84
  while(s < mm_end)
85
  {
86
    __asm __volatile(
87
        PREFETCH"        32%1\n\t"
88
        "movd        %1, %%mm0\n\t"
89
        "punpckldq 3%1, %%mm0\n\t"
90
        "movd        6%1, %%mm1\n\t"
91
        "punpckldq 9%1, %%mm1\n\t"
92
        "movd        12%1, %%mm2\n\t"
93
        "punpckldq 15%1, %%mm2\n\t"
94
        "movd        18%1, %%mm3\n\t"
95
        "punpckldq 21%1, %%mm3\n\t"
96
        "pand        %%mm7, %%mm0\n\t"
97
        "pand        %%mm7, %%mm1\n\t"
98
        "pand        %%mm7, %%mm2\n\t"
99
        "pand        %%mm7, %%mm3\n\t"
100
        MOVNTQ"        %%mm0, %0\n\t"
101
        MOVNTQ"        %%mm1, 8%0\n\t"
102
        MOVNTQ"        %%mm2, 16%0\n\t"
103
        MOVNTQ"        %%mm3, 24%0"
104
        :"=m"(*dest)
105
        :"m"(*s)
106
        :"memory");
107
    dest += 32;
108
    s += 24;
109
  }
110
  __asm __volatile(SFENCE:::"memory");
111
  __asm __volatile(EMMS:::"memory");
112
#endif
113
  while(s < end)
114
  {
115
#ifdef WORDS_BIGENDIAN
116
    /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
117
    *dest++ = 0;
118
    *dest++ = s[2];
119
    *dest++ = s[1];
120
    *dest++ = s[0];
121
    s+=3;
122
#else
123
    *dest++ = *s++;
124
    *dest++ = *s++;
125
    *dest++ = *s++;
126
    *dest++ = 0;
127
#endif
128
  }
129
}
130

    
131
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
132
{
133
  uint8_t *dest = dst;
134
  const uint8_t *s = src;
135
  const uint8_t *end;
136
#ifdef HAVE_MMX
137
  const uint8_t *mm_end;
138
#endif
139
  end = s + src_size;
140
#ifdef HAVE_MMX
141
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
142
  mm_end = end - 31;
143
  while(s < mm_end)
144
  {
145
    __asm __volatile(
146
        PREFETCH"        32%1\n\t"
147
        "movq        %1, %%mm0\n\t"
148
        "movq        8%1, %%mm1\n\t"
149
        "movq        16%1, %%mm4\n\t"
150
        "movq        24%1, %%mm5\n\t"
151
        "movq        %%mm0, %%mm2\n\t"
152
        "movq        %%mm1, %%mm3\n\t"
153
        "movq        %%mm4, %%mm6\n\t"
154
        "movq        %%mm5, %%mm7\n\t"
155
        "psrlq        $8, %%mm2\n\t"
156
        "psrlq        $8, %%mm3\n\t"
157
        "psrlq        $8, %%mm6\n\t"
158
        "psrlq        $8, %%mm7\n\t"
159
        "pand        %2, %%mm0\n\t"
160
        "pand        %2, %%mm1\n\t"
161
        "pand        %2, %%mm4\n\t"
162
        "pand        %2, %%mm5\n\t"
163
        "pand        %3, %%mm2\n\t"
164
        "pand        %3, %%mm3\n\t"
165
        "pand        %3, %%mm6\n\t"
166
        "pand        %3, %%mm7\n\t"
167
        "por        %%mm2, %%mm0\n\t"
168
        "por        %%mm3, %%mm1\n\t"
169
        "por        %%mm6, %%mm4\n\t"
170
        "por        %%mm7, %%mm5\n\t"
171

    
172
        "movq        %%mm1, %%mm2\n\t"
173
        "movq        %%mm4, %%mm3\n\t"
174
        "psllq        $48, %%mm2\n\t"
175
        "psllq        $32, %%mm3\n\t"
176
        "pand        %4, %%mm2\n\t"
177
        "pand        %5, %%mm3\n\t"
178
        "por        %%mm2, %%mm0\n\t"
179
        "psrlq        $16, %%mm1\n\t"
180
        "psrlq        $32, %%mm4\n\t"
181
        "psllq        $16, %%mm5\n\t"
182
        "por        %%mm3, %%mm1\n\t"
183
        "pand        %6, %%mm5\n\t"
184
        "por        %%mm5, %%mm4\n\t"
185

    
186
        MOVNTQ"        %%mm0, %0\n\t"
187
        MOVNTQ"        %%mm1, 8%0\n\t"
188
        MOVNTQ"        %%mm4, 16%0"
189
        :"=m"(*dest)
190
        :"m"(*s),"m"(mask24l),
191
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
192
        :"memory");
193
    dest += 24;
194
    s += 32;
195
  }
196
  __asm __volatile(SFENCE:::"memory");
197
  __asm __volatile(EMMS:::"memory");
198
#endif
199
  while(s < end)
200
  {
201
#ifdef WORDS_BIGENDIAN
202
    /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
203
    s++;
204
    dest[2] = *s++;
205
    dest[1] = *s++;
206
    dest[0] = *s++;
207
    dest += 3;
208
#else
209
    *dest++ = *s++;
210
    *dest++ = *s++;
211
    *dest++ = *s++;
212
    s++;
213
#endif
214
  }
215
}
216

    
217
/*
218
 Original by Strepto/Astral
219
 ported to gcc & bugfixed : A'rpi
220
 MMX2, 3DNOW optimization by Nick Kurshev
221
 32bit c version, and and&add trick by Michael Niedermayer
222
*/
223
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
224
{
225
  register const uint8_t* s=src;
226
  register uint8_t* d=dst;
227
  register const uint8_t *end;
228
  const uint8_t *mm_end;
229
  end = s + src_size;
230
#ifdef HAVE_MMX
231
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
232
  __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
233
  mm_end = end - 15;
234
  while(s<mm_end)
235
  {
236
        __asm __volatile(
237
                PREFETCH"        32%1\n\t"
238
                "movq        %1, %%mm0\n\t"
239
                "movq        8%1, %%mm2\n\t"
240
                "movq        %%mm0, %%mm1\n\t"
241
                "movq        %%mm2, %%mm3\n\t"
242
                "pand        %%mm4, %%mm0\n\t"
243
                "pand        %%mm4, %%mm2\n\t"
244
                "paddw        %%mm1, %%mm0\n\t"
245
                "paddw        %%mm3, %%mm2\n\t"
246
                MOVNTQ"        %%mm0, %0\n\t"
247
                MOVNTQ"        %%mm2, 8%0"
248
                :"=m"(*d)
249
                :"m"(*s)
250
                );
251
        d+=16;
252
        s+=16;
253
  }
254
  __asm __volatile(SFENCE:::"memory");
255
  __asm __volatile(EMMS:::"memory");
256
#endif
257
    mm_end = end - 3;
258
    while(s < mm_end)
259
    {
260
        register unsigned x= *((uint32_t *)s);
261
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
262
        d+=4;
263
        s+=4;
264
    }
265
    if(s < end)
266
    {
267
        register unsigned short x= *((uint16_t *)s);
268
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
269
    }
270
}
271

    
272
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
273
{
274
  register const uint8_t* s=src;
275
  register uint8_t* d=dst;
276
  register const uint8_t *end;
277
  const uint8_t *mm_end;
278
  end = s + src_size;
279
#ifdef HAVE_MMX
280
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
281
  __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
282
  __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
283
  mm_end = end - 15;
284
  while(s<mm_end)
285
  {
286
        __asm __volatile(
287
                PREFETCH"        32%1\n\t"
288
                "movq        %1, %%mm0\n\t"
289
                "movq        8%1, %%mm2\n\t"
290
                "movq        %%mm0, %%mm1\n\t"
291
                "movq        %%mm2, %%mm3\n\t"
292
                "psrlq        $1, %%mm0\n\t"
293
                "psrlq        $1, %%mm2\n\t"
294
                "pand        %%mm7, %%mm0\n\t"
295
                "pand        %%mm7, %%mm2\n\t"
296
                "pand        %%mm6, %%mm1\n\t"
297
                "pand        %%mm6, %%mm3\n\t"
298
                "por        %%mm1, %%mm0\n\t"
299
                "por        %%mm3, %%mm2\n\t"
300
                MOVNTQ"        %%mm0, %0\n\t"
301
                MOVNTQ"        %%mm2, 8%0"
302
                :"=m"(*d)
303
                :"m"(*s)
304
                );
305
        d+=16;
306
        s+=16;
307
  }
308
  __asm __volatile(SFENCE:::"memory");
309
  __asm __volatile(EMMS:::"memory");
310
#endif
311
    mm_end = end - 3;
312
    while(s < mm_end)
313
    {
314
        register uint32_t x= *((uint32_t *)s);
315
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
316
        s+=4;
317
        d+=4;
318
    }
319
    if(s < end)
320
    {
321
        register uint16_t x= *((uint16_t *)s);
322
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
323
        s+=2;
324
        d+=2;
325
    }
326
}
327

    
328
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
329
{
330
        const uint8_t *s = src;
331
        const uint8_t *end;
332
#ifdef HAVE_MMX
333
        const uint8_t *mm_end;
334
#endif
335
        uint16_t *d = (uint16_t *)dst;
336
        end = s + src_size;
337
#ifdef HAVE_MMX
338
        mm_end = end - 15;
339
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
340
        asm volatile(
341
                "movq %3, %%mm5                        \n\t"
342
                "movq %4, %%mm6                        \n\t"
343
                "movq %5, %%mm7                        \n\t"
344
                ASMALIGN(4)
345
                "1:                                \n\t"
346
                PREFETCH" 32(%1)                \n\t"
347
                "movd        (%1), %%mm0                \n\t"
348
                "movd        4(%1), %%mm3                \n\t"
349
                "punpckldq 8(%1), %%mm0                \n\t"
350
                "punpckldq 12(%1), %%mm3        \n\t"
351
                "movq %%mm0, %%mm1                \n\t"
352
                "movq %%mm3, %%mm4                \n\t"
353
                "pand %%mm6, %%mm0                \n\t"
354
                "pand %%mm6, %%mm3                \n\t"
355
                "pmaddwd %%mm7, %%mm0                \n\t"
356
                "pmaddwd %%mm7, %%mm3                \n\t"
357
                "pand %%mm5, %%mm1                \n\t"
358
                "pand %%mm5, %%mm4                \n\t"
359
                "por %%mm1, %%mm0                \n\t"        
360
                "por %%mm4, %%mm3                \n\t"
361
                "psrld $5, %%mm0                \n\t"
362
                "pslld $11, %%mm3                \n\t"
363
                "por %%mm3, %%mm0                \n\t"
364
                MOVNTQ"        %%mm0, (%0)                \n\t"
365
                "add $16, %1                        \n\t"
366
                "add $8, %0                        \n\t"
367
                "cmp %2, %1                        \n\t"
368
                " jb 1b                                \n\t"
369
                : "+r" (d), "+r"(s)
370
                : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
371
        );
372
#else
373
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
374
        __asm __volatile(
375
            "movq        %0, %%mm7\n\t"
376
            "movq        %1, %%mm6\n\t"
377
            ::"m"(red_16mask),"m"(green_16mask));
378
        while(s < mm_end)
379
        {
380
            __asm __volatile(
381
                PREFETCH" 32%1\n\t"
382
                "movd        %1, %%mm0\n\t"
383
                "movd        4%1, %%mm3\n\t"
384
                "punpckldq 8%1, %%mm0\n\t"
385
                "punpckldq 12%1, %%mm3\n\t"
386
                "movq        %%mm0, %%mm1\n\t"
387
                "movq        %%mm0, %%mm2\n\t"
388
                "movq        %%mm3, %%mm4\n\t"
389
                "movq        %%mm3, %%mm5\n\t"
390
                "psrlq        $3, %%mm0\n\t"
391
                "psrlq        $3, %%mm3\n\t"
392
                "pand        %2, %%mm0\n\t"
393
                "pand        %2, %%mm3\n\t"
394
                "psrlq        $5, %%mm1\n\t"
395
                "psrlq        $5, %%mm4\n\t"
396
                "pand        %%mm6, %%mm1\n\t"
397
                "pand        %%mm6, %%mm4\n\t"
398
                "psrlq        $8, %%mm2\n\t"
399
                "psrlq        $8, %%mm5\n\t"
400
                "pand        %%mm7, %%mm2\n\t"
401
                "pand        %%mm7, %%mm5\n\t"
402
                "por        %%mm1, %%mm0\n\t"
403
                "por        %%mm4, %%mm3\n\t"
404
                "por        %%mm2, %%mm0\n\t"
405
                "por        %%mm5, %%mm3\n\t"
406
                "psllq        $16, %%mm3\n\t"
407
                "por        %%mm3, %%mm0\n\t"
408
                MOVNTQ"        %%mm0, %0\n\t"
409
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
410
                d += 4;
411
                s += 16;
412
        }
413
#endif
414
        __asm __volatile(SFENCE:::"memory");
415
        __asm __volatile(EMMS:::"memory");
416
#endif
417
        while(s < end)
418
        {
419
                register int rgb = *(uint32_t*)s; s += 4;
420
                *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
421
        }
422
}
423

    
424
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
425
{
426
        const uint8_t *s = src;
427
        const uint8_t *end;
428
#ifdef HAVE_MMX
429
        const uint8_t *mm_end;
430
#endif
431
        uint16_t *d = (uint16_t *)dst;
432
        end = s + src_size;
433
#ifdef HAVE_MMX
434
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
435
        __asm __volatile(
436
            "movq        %0, %%mm7\n\t"
437
            "movq        %1, %%mm6\n\t"
438
            ::"m"(red_16mask),"m"(green_16mask));
439
        mm_end = end - 15;
440
        while(s < mm_end)
441
        {
442
            __asm __volatile(
443
                PREFETCH" 32%1\n\t"
444
                "movd        %1, %%mm0\n\t"
445
                "movd        4%1, %%mm3\n\t"
446
                "punpckldq 8%1, %%mm0\n\t"
447
                "punpckldq 12%1, %%mm3\n\t"
448
                "movq        %%mm0, %%mm1\n\t"
449
                "movq        %%mm0, %%mm2\n\t"
450
                "movq        %%mm3, %%mm4\n\t"
451
                "movq        %%mm3, %%mm5\n\t"
452
                "psllq        $8, %%mm0\n\t"
453
                "psllq        $8, %%mm3\n\t"
454
                "pand        %%mm7, %%mm0\n\t"
455
                "pand        %%mm7, %%mm3\n\t"
456
                "psrlq        $5, %%mm1\n\t"
457
                "psrlq        $5, %%mm4\n\t"
458
                "pand        %%mm6, %%mm1\n\t"
459
                "pand        %%mm6, %%mm4\n\t"
460
                "psrlq        $19, %%mm2\n\t"
461
                "psrlq        $19, %%mm5\n\t"
462
                "pand        %2, %%mm2\n\t"
463
                "pand        %2, %%mm5\n\t"
464
                "por        %%mm1, %%mm0\n\t"
465
                "por        %%mm4, %%mm3\n\t"
466
                "por        %%mm2, %%mm0\n\t"
467
                "por        %%mm5, %%mm3\n\t"
468
                "psllq        $16, %%mm3\n\t"
469
                "por        %%mm3, %%mm0\n\t"
470
                MOVNTQ"        %%mm0, %0\n\t"
471
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
472
                d += 4;
473
                s += 16;
474
        }
475
        __asm __volatile(SFENCE:::"memory");
476
        __asm __volatile(EMMS:::"memory");
477
#endif
478
        while(s < end)
479
        {
480
                register int rgb = *(uint32_t*)s; s += 4;
481
                *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
482
        }
483
}
484

    
485
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
486
{
487
        const uint8_t *s = src;
488
        const uint8_t *end;
489
#ifdef HAVE_MMX
490
        const uint8_t *mm_end;
491
#endif
492
        uint16_t *d = (uint16_t *)dst;
493
        end = s + src_size;
494
#ifdef HAVE_MMX
495
        mm_end = end - 15;
496
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
497
        asm volatile(
498
                "movq %3, %%mm5                        \n\t"
499
                "movq %4, %%mm6                        \n\t"
500
                "movq %5, %%mm7                        \n\t"
501
                ASMALIGN(4)
502
                "1:                                \n\t"
503
                PREFETCH" 32(%1)                \n\t"
504
                "movd        (%1), %%mm0                \n\t"
505
                "movd        4(%1), %%mm3                \n\t"
506
                "punpckldq 8(%1), %%mm0                \n\t"
507
                "punpckldq 12(%1), %%mm3        \n\t"
508
                "movq %%mm0, %%mm1                \n\t"
509
                "movq %%mm3, %%mm4                \n\t"
510
                "pand %%mm6, %%mm0                \n\t"
511
                "pand %%mm6, %%mm3                \n\t"
512
                "pmaddwd %%mm7, %%mm0                \n\t"
513
                "pmaddwd %%mm7, %%mm3                \n\t"
514
                "pand %%mm5, %%mm1                \n\t"
515
                "pand %%mm5, %%mm4                \n\t"
516
                "por %%mm1, %%mm0                \n\t"        
517
                "por %%mm4, %%mm3                \n\t"
518
                "psrld $6, %%mm0                \n\t"
519
                "pslld $10, %%mm3                \n\t"
520
                "por %%mm3, %%mm0                \n\t"
521
                MOVNTQ"        %%mm0, (%0)                \n\t"
522
                "add $16, %1                        \n\t"
523
                "add $8, %0                        \n\t"
524
                "cmp %2, %1                        \n\t"
525
                " jb 1b                                \n\t"
526
                : "+r" (d), "+r"(s)
527
                : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
528
        );
529
#else
530
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
531
        __asm __volatile(
532
            "movq        %0, %%mm7\n\t"
533
            "movq        %1, %%mm6\n\t"
534
            ::"m"(red_15mask),"m"(green_15mask));
535
        while(s < mm_end)
536
        {
537
            __asm __volatile(
538
                PREFETCH" 32%1\n\t"
539
                "movd        %1, %%mm0\n\t"
540
                "movd        4%1, %%mm3\n\t"
541
                "punpckldq 8%1, %%mm0\n\t"
542
                "punpckldq 12%1, %%mm3\n\t"
543
                "movq        %%mm0, %%mm1\n\t"
544
                "movq        %%mm0, %%mm2\n\t"
545
                "movq        %%mm3, %%mm4\n\t"
546
                "movq        %%mm3, %%mm5\n\t"
547
                "psrlq        $3, %%mm0\n\t"
548
                "psrlq        $3, %%mm3\n\t"
549
                "pand        %2, %%mm0\n\t"
550
                "pand        %2, %%mm3\n\t"
551
                "psrlq        $6, %%mm1\n\t"
552
                "psrlq        $6, %%mm4\n\t"
553
                "pand        %%mm6, %%mm1\n\t"
554
                "pand        %%mm6, %%mm4\n\t"
555
                "psrlq        $9, %%mm2\n\t"
556
                "psrlq        $9, %%mm5\n\t"
557
                "pand        %%mm7, %%mm2\n\t"
558
                "pand        %%mm7, %%mm5\n\t"
559
                "por        %%mm1, %%mm0\n\t"
560
                "por        %%mm4, %%mm3\n\t"
561
                "por        %%mm2, %%mm0\n\t"
562
                "por        %%mm5, %%mm3\n\t"
563
                "psllq        $16, %%mm3\n\t"
564
                "por        %%mm3, %%mm0\n\t"
565
                MOVNTQ"        %%mm0, %0\n\t"
566
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
567
                d += 4;
568
                s += 16;
569
        }
570
#endif
571
        __asm __volatile(SFENCE:::"memory");
572
        __asm __volatile(EMMS:::"memory");
573
#endif
574
        while(s < end)
575
        {
576
                register int rgb = *(uint32_t*)s; s += 4;
577
                *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
578
        }
579
}
580

    
581
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
582
{
583
        const uint8_t *s = src;
584
        const uint8_t *end;
585
#ifdef HAVE_MMX
586
        const uint8_t *mm_end;
587
#endif
588
        uint16_t *d = (uint16_t *)dst;
589
        end = s + src_size;
590
#ifdef HAVE_MMX
591
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
592
        __asm __volatile(
593
            "movq        %0, %%mm7\n\t"
594
            "movq        %1, %%mm6\n\t"
595
            ::"m"(red_15mask),"m"(green_15mask));
596
        mm_end = end - 15;
597
        while(s < mm_end)
598
        {
599
            __asm __volatile(
600
                PREFETCH" 32%1\n\t"
601
                "movd        %1, %%mm0\n\t"
602
                "movd        4%1, %%mm3\n\t"
603
                "punpckldq 8%1, %%mm0\n\t"
604
                "punpckldq 12%1, %%mm3\n\t"
605
                "movq        %%mm0, %%mm1\n\t"
606
                "movq        %%mm0, %%mm2\n\t"
607
                "movq        %%mm3, %%mm4\n\t"
608
                "movq        %%mm3, %%mm5\n\t"
609
                "psllq        $7, %%mm0\n\t"
610
                "psllq        $7, %%mm3\n\t"
611
                "pand        %%mm7, %%mm0\n\t"
612
                "pand        %%mm7, %%mm3\n\t"
613
                "psrlq        $6, %%mm1\n\t"
614
                "psrlq        $6, %%mm4\n\t"
615
                "pand        %%mm6, %%mm1\n\t"
616
                "pand        %%mm6, %%mm4\n\t"
617
                "psrlq        $19, %%mm2\n\t"
618
                "psrlq        $19, %%mm5\n\t"
619
                "pand        %2, %%mm2\n\t"
620
                "pand        %2, %%mm5\n\t"
621
                "por        %%mm1, %%mm0\n\t"
622
                "por        %%mm4, %%mm3\n\t"
623
                "por        %%mm2, %%mm0\n\t"
624
                "por        %%mm5, %%mm3\n\t"
625
                "psllq        $16, %%mm3\n\t"
626
                "por        %%mm3, %%mm0\n\t"
627
                MOVNTQ"        %%mm0, %0\n\t"
628
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
629
                d += 4;
630
                s += 16;
631
        }
632
        __asm __volatile(SFENCE:::"memory");
633
        __asm __volatile(EMMS:::"memory");
634
#endif
635
        while(s < end)
636
        {
637
                register int rgb = *(uint32_t*)s; s += 4;
638
                *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
639
        }
640
}
641

    
642
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
643
{
644
        const uint8_t *s = src;
645
        const uint8_t *end;
646
#ifdef HAVE_MMX
647
        const uint8_t *mm_end;
648
#endif
649
        uint16_t *d = (uint16_t *)dst;
650
        end = s + src_size;
651
#ifdef HAVE_MMX
652
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
653
        __asm __volatile(
654
            "movq        %0, %%mm7\n\t"
655
            "movq        %1, %%mm6\n\t"
656
            ::"m"(red_16mask),"m"(green_16mask));
657
        mm_end = end - 11;
658
        while(s < mm_end)
659
        {
660
            __asm __volatile(
661
                PREFETCH" 32%1\n\t"
662
                "movd        %1, %%mm0\n\t"
663
                "movd        3%1, %%mm3\n\t"
664
                "punpckldq 6%1, %%mm0\n\t"
665
                "punpckldq 9%1, %%mm3\n\t"
666
                "movq        %%mm0, %%mm1\n\t"
667
                "movq        %%mm0, %%mm2\n\t"
668
                "movq        %%mm3, %%mm4\n\t"
669
                "movq        %%mm3, %%mm5\n\t"
670
                "psrlq        $3, %%mm0\n\t"
671
                "psrlq        $3, %%mm3\n\t"
672
                "pand        %2, %%mm0\n\t"
673
                "pand        %2, %%mm3\n\t"
674
                "psrlq        $5, %%mm1\n\t"
675
                "psrlq        $5, %%mm4\n\t"
676
                "pand        %%mm6, %%mm1\n\t"
677
                "pand        %%mm6, %%mm4\n\t"
678
                "psrlq        $8, %%mm2\n\t"
679
                "psrlq        $8, %%mm5\n\t"
680
                "pand        %%mm7, %%mm2\n\t"
681
                "pand        %%mm7, %%mm5\n\t"
682
                "por        %%mm1, %%mm0\n\t"
683
                "por        %%mm4, %%mm3\n\t"
684
                "por        %%mm2, %%mm0\n\t"
685
                "por        %%mm5, %%mm3\n\t"
686
                "psllq        $16, %%mm3\n\t"
687
                "por        %%mm3, %%mm0\n\t"
688
                MOVNTQ"        %%mm0, %0\n\t"
689
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
690
                d += 4;
691
                s += 12;
692
        }
693
        __asm __volatile(SFENCE:::"memory");
694
        __asm __volatile(EMMS:::"memory");
695
#endif
696
        while(s < end)
697
        {
698
                const int b= *s++;
699
                const int g= *s++;
700
                const int r= *s++;
701
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
702
        }
703
}
704

    
705
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
706
{
707
        const uint8_t *s = src;
708
        const uint8_t *end;
709
#ifdef HAVE_MMX
710
        const uint8_t *mm_end;
711
#endif
712
        uint16_t *d = (uint16_t *)dst;
713
        end = s + src_size;
714
#ifdef HAVE_MMX
715
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
716
        __asm __volatile(
717
            "movq        %0, %%mm7\n\t"
718
            "movq        %1, %%mm6\n\t"
719
            ::"m"(red_16mask),"m"(green_16mask));
720
        mm_end = end - 15;
721
        while(s < mm_end)
722
        {
723
            __asm __volatile(
724
                PREFETCH" 32%1\n\t"
725
                "movd        %1, %%mm0\n\t"
726
                "movd        3%1, %%mm3\n\t"
727
                "punpckldq 6%1, %%mm0\n\t"
728
                "punpckldq 9%1, %%mm3\n\t"
729
                "movq        %%mm0, %%mm1\n\t"
730
                "movq        %%mm0, %%mm2\n\t"
731
                "movq        %%mm3, %%mm4\n\t"
732
                "movq        %%mm3, %%mm5\n\t"
733
                "psllq        $8, %%mm0\n\t"
734
                "psllq        $8, %%mm3\n\t"
735
                "pand        %%mm7, %%mm0\n\t"
736
                "pand        %%mm7, %%mm3\n\t"
737
                "psrlq        $5, %%mm1\n\t"
738
                "psrlq        $5, %%mm4\n\t"
739
                "pand        %%mm6, %%mm1\n\t"
740
                "pand        %%mm6, %%mm4\n\t"
741
                "psrlq        $19, %%mm2\n\t"
742
                "psrlq        $19, %%mm5\n\t"
743
                "pand        %2, %%mm2\n\t"
744
                "pand        %2, %%mm5\n\t"
745
                "por        %%mm1, %%mm0\n\t"
746
                "por        %%mm4, %%mm3\n\t"
747
                "por        %%mm2, %%mm0\n\t"
748
                "por        %%mm5, %%mm3\n\t"
749
                "psllq        $16, %%mm3\n\t"
750
                "por        %%mm3, %%mm0\n\t"
751
                MOVNTQ"        %%mm0, %0\n\t"
752
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
753
                d += 4;
754
                s += 12;
755
        }
756
        __asm __volatile(SFENCE:::"memory");
757
        __asm __volatile(EMMS:::"memory");
758
#endif
759
        while(s < end)
760
        {
761
                const int r= *s++;
762
                const int g= *s++;
763
                const int b= *s++;
764
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
765
        }
766
}
767

    
768
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
769
{
770
        const uint8_t *s = src;
771
        const uint8_t *end;
772
#ifdef HAVE_MMX
773
        const uint8_t *mm_end;
774
#endif
775
        uint16_t *d = (uint16_t *)dst;
776
        end = s + src_size;
777
#ifdef HAVE_MMX
778
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
779
        __asm __volatile(
780
            "movq        %0, %%mm7\n\t"
781
            "movq        %1, %%mm6\n\t"
782
            ::"m"(red_15mask),"m"(green_15mask));
783
        mm_end = end - 11;
784
        while(s < mm_end)
785
        {
786
            __asm __volatile(
787
                PREFETCH" 32%1\n\t"
788
                "movd        %1, %%mm0\n\t"
789
                "movd        3%1, %%mm3\n\t"
790
                "punpckldq 6%1, %%mm0\n\t"
791
                "punpckldq 9%1, %%mm3\n\t"
792
                "movq        %%mm0, %%mm1\n\t"
793
                "movq        %%mm0, %%mm2\n\t"
794
                "movq        %%mm3, %%mm4\n\t"
795
                "movq        %%mm3, %%mm5\n\t"
796
                "psrlq        $3, %%mm0\n\t"
797
                "psrlq        $3, %%mm3\n\t"
798
                "pand        %2, %%mm0\n\t"
799
                "pand        %2, %%mm3\n\t"
800
                "psrlq        $6, %%mm1\n\t"
801
                "psrlq        $6, %%mm4\n\t"
802
                "pand        %%mm6, %%mm1\n\t"
803
                "pand        %%mm6, %%mm4\n\t"
804
                "psrlq        $9, %%mm2\n\t"
805
                "psrlq        $9, %%mm5\n\t"
806
                "pand        %%mm7, %%mm2\n\t"
807
                "pand        %%mm7, %%mm5\n\t"
808
                "por        %%mm1, %%mm0\n\t"
809
                "por        %%mm4, %%mm3\n\t"
810
                "por        %%mm2, %%mm0\n\t"
811
                "por        %%mm5, %%mm3\n\t"
812
                "psllq        $16, %%mm3\n\t"
813
                "por        %%mm3, %%mm0\n\t"
814
                MOVNTQ"        %%mm0, %0\n\t"
815
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
816
                d += 4;
817
                s += 12;
818
        }
819
        __asm __volatile(SFENCE:::"memory");
820
        __asm __volatile(EMMS:::"memory");
821
#endif
822
        while(s < end)
823
        {
824
                const int b= *s++;
825
                const int g= *s++;
826
                const int r= *s++;
827
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
828
        }
829
}
830

    
831
/**
 * Pack 24-bit pixels (3 bytes each) into 15-bit pixels (one uint16_t each).
 *
 * For every pixel the top 5 bits of the first source byte end up in bits
 * 10-14 of the output word, the top 5 bits of the second byte in bits 5-9
 * and the top 5 bits of the third byte in bits 0-4.
 *
 * @param src      packed 24-bit input
 * @param dst      output buffer, written as 16-bit words
 * @param src_size size of the input in bytes
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint8_t *in = src;
        const uint8_t *in_end;
#ifdef HAVE_MMX
        const uint8_t *simd_end;
#endif
        uint16_t *out = (uint16_t *)dst;

        in_end = in + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        /* mm7 and mm6 keep two of the channel masks for the whole loop */
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        simd_end = in_end - 15;
        /* each iteration consumes 12 input bytes and stores 4 output words */
        while(in < simd_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psllq        $7, %%mm0\n\t"
                "psllq        $7, %%mm3\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $19, %%mm2\n\t"
                "psrlq        $19, %%mm5\n\t"
                "pand        %2, %%mm2\n\t"
                "pand        %2, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*out):"m"(*in),"m"(blue_15mask):"memory");
            out += 4;
            in  += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar path: whatever the MMX loop left over (or everything) */
        for (; in < in_end; in += 3, out++)
        {
                const int first  = in[0];
                const int second = in[1];
                const int third  = in[2];

                *out = (third >> 3) | ((second & 0xF8) << 2) | ((first & 0xF8) << 7);
        }
}
893

    
894
/*
  A less accurate approximation is used here: the input value is simply
  shifted left and the low-order bits are filled with zeroes. This method
  improves png's compression, but it cannot reproduce white exactly, since
  it does not generate an all-ones maximum value; the net effect is to
  darken the image slightly.

  The better method would be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      Leftmost Bits Repeated to Fill Open Bits
       |
   Original Bits
*/
/**
 * Expand 15-bit pixels (one uint16_t each) to 24-bit pixels (3 bytes each).
 *
 * For every input word the bytes written are, in order:
 * bits 0-4 shifted left by 3, bits 5-9 shifted right by 2 and
 * bits 10-14 shifted right by 7; the low 3 bits of each output byte are 0.
 *
 * @param src      input, read as 16-bit words
 * @param dst      output buffer, 3 bytes are written per input word
 * @param src_size size of the input in bytes
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint16_t *end;
#ifdef HAVE_MMX
        const uint16_t *mm_end;
#endif
        uint8_t *d = (uint8_t *)dst;
        /* keep the const qualifier of src when reinterpreting it as words */
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
        mm_end = end - 7;
        /* each iteration consumes 8 input words and stores 24 output bytes */
        while(s < mm_end)
        {
            /*
             * First statement: expand two groups of 4 pixels to one dword
             * per pixel. The first group is parked in mm6/mm7, the second
             * is left in mm0/mm3. Nothing is stored here; the statement
             * below picks the registers up, so the two must stay adjacent.
             */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $2, %%mm1\n\t"
                "psrlq        $7, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"

                "movq        %%mm0, %%mm6\n\t"
                "movq        %%mm3, %%mm7\n\t"

                "movq        8%1, %%mm0\n\t"
                "movq        8%1, %%mm1\n\t"
                "movq        8%1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $2, %%mm1\n\t"
                "psrlq        $7, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"

                :"=m"(*d)
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
                :"memory");
            /* Borrowed 32 to 24: squeeze the 8 dwords into 24 bytes */
            __asm __volatile(
                "movq        %%mm0, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "movq        %%mm6, %%mm0\n\t"
                "movq        %%mm7, %%mm1\n\t"

                "movq        %%mm4, %%mm6\n\t"
                "movq        %%mm5, %%mm7\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm1, %%mm3\n\t"

                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm3\n\t"
                "psrlq        $8, %%mm6\n\t"
                "psrlq        $8, %%mm7\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm1\n\t"
                "pand        %2, %%mm4\n\t"
                "pand        %2, %%mm5\n\t"
                "pand        %3, %%mm2\n\t"
                "pand        %3, %%mm3\n\t"
                "pand        %3, %%mm6\n\t"
                "pand        %3, %%mm7\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm3, %%mm1\n\t"
                "por        %%mm6, %%mm4\n\t"
                "por        %%mm7, %%mm5\n\t"

                "movq        %%mm1, %%mm2\n\t"
                "movq        %%mm4, %%mm3\n\t"
                "psllq        $48, %%mm2\n\t"
                "psllq        $32, %%mm3\n\t"
                "pand        %4, %%mm2\n\t"
                "pand        %5, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psrlq        $16, %%mm1\n\t"
                "psrlq        $32, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm3, %%mm1\n\t"
                "pand        %6, %%mm5\n\t"
                "por        %%mm5, %%mm4\n\t"

                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm1, 8%0\n\t"
                MOVNTQ"        %%mm4, 16%0"

                :"=m"(*d)
                :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
                :"memory");
                d += 24;
                s += 8;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar path: whatever the MMX loop left over (or everything) */
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x3E0)>>2;
                *d++ = (bgr&0x7C00)>>7;
        }
}
1058

    
1059
/**
 * Expand 16-bit pixels (one uint16_t each) to 24-bit pixels (3 bytes each).
 *
 * For every input word the bytes written are, in order:
 * bits 0-4 shifted left by 3, bits 5-10 shifted right by 3 and
 * bits 11-15 shifted right by 8; the vacated low bits are 0.
 *
 * @param src      input, read as 16-bit words
 * @param dst      output buffer, 3 bytes are written per input word
 * @param src_size size of the input in bytes
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint16_t *in = (const uint16_t *)src;
        const uint16_t *in_end;
#ifdef HAVE_MMX
        const uint16_t *simd_end;
#endif
        uint8_t *out = (uint8_t *)dst;

        in_end = in + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*in):"memory");
        simd_end = in_end - 7;
        /* each iteration consumes 8 input words and stores 24 output bytes */
        while(in < simd_end)
        {
            /*
             * Expand two groups of 4 pixels to one dword per pixel; the
             * first group is parked in mm6/mm7, the second stays in
             * mm0/mm3 for the statement that follows.
             */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm1\n\t"
                "psrlq        $8, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"

                "movq        %%mm0, %%mm6\n\t"
                "movq        %%mm3, %%mm7\n\t"

                "movq        8%1, %%mm0\n\t"
                "movq        8%1, %%mm1\n\t"
                "movq        8%1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm1\n\t"
                "psrlq        $8, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"
                :"=m"(*out)
                :"m"(*in),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
                :"memory");
            /* Borrowed 32 to 24 */
            __asm __volatile(
                "movq        %%mm0, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "movq        %%mm6, %%mm0\n\t"
                "movq        %%mm7, %%mm1\n\t"

                "movq        %%mm4, %%mm6\n\t"
                "movq        %%mm5, %%mm7\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm1, %%mm3\n\t"

                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm3\n\t"
                "psrlq        $8, %%mm6\n\t"
                "psrlq        $8, %%mm7\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm1\n\t"
                "pand        %2, %%mm4\n\t"
                "pand        %2, %%mm5\n\t"
                "pand        %3, %%mm2\n\t"
                "pand        %3, %%mm3\n\t"
                "pand        %3, %%mm6\n\t"
                "pand        %3, %%mm7\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm3, %%mm1\n\t"
                "por        %%mm6, %%mm4\n\t"
                "por        %%mm7, %%mm5\n\t"

                "movq        %%mm1, %%mm2\n\t"
                "movq        %%mm4, %%mm3\n\t"
                "psllq        $48, %%mm2\n\t"
                "psllq        $32, %%mm3\n\t"
                "pand        %4, %%mm2\n\t"
                "pand        %5, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psrlq        $16, %%mm1\n\t"
                "psrlq        $32, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm3, %%mm1\n\t"
                "pand        %6, %%mm5\n\t"
                "por        %%mm5, %%mm4\n\t"

                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm1, 8%0\n\t"
                MOVNTQ"        %%mm4, 16%0"

                :"=m"(*out)
                :"m"(*in),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
                :"memory");
            out += 24;
            in  += 8;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar path: whatever the MMX loop left over (or everything) */
        for (; in < in_end; in++, out += 3)
        {
                const uint16_t px = *in;

                out[0] = (px & 0x1F) << 3;
                out[1] = (px & 0x7E0) >> 3;
                out[2] = (px & 0xF800) >> 8;
        }
}
1199

    
1200
/**
 * Expand 15-bit pixels (one uint16_t each) to 32-bit pixels (4 bytes each).
 *
 * The three 5-bit fields of each input word (bits 0-4, 5-9, 10-14) are
 * widened to 8 bits with zero low bits and a zero fourth byte is added.
 * Without WORDS_BIGENDIAN the byte order is: field 0-4, field 5-9,
 * field 10-14, 0; with WORDS_BIGENDIAN the scalar path writes the
 * reverse order.
 *
 * @param src      input, read as 16-bit words
 * @param dst      output buffer, 4 bytes are written per input word
 * @param src_size size of the input in bytes
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint16_t *end;
#ifdef HAVE_MMX
        const uint16_t *mm_end;
#endif
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
        /* mm7 stays zero for the whole loop; it is the unpack filler */
        __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
        mm_end = end - 3;
        /* each iteration consumes 4 input words and stores 16 output bytes */
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $2, %%mm1\n\t"
                "psrlq        $7, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %%mm7, %%mm0\n\t"
                "punpcklwd %%mm7, %%mm1\n\t"
                "punpcklwd %%mm7, %%mm2\n\t"
                "punpckhwd %%mm7, %%mm3\n\t"
                "punpckhwd %%mm7, %%mm4\n\t"
                "punpckhwd %%mm7, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm3, 8%0\n\t"
                :"=m"(*d)
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
                :"memory");
                d += 16;
                s += 4;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar path: whatever the MMX loop left over (or everything) */
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
#ifdef WORDS_BIGENDIAN
                *d++ = 0;
                *d++ = (bgr&0x7C00)>>7;
                *d++ = (bgr&0x3E0)>>2;
                *d++ = (bgr&0x1F)<<3;
#else
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x3E0)>>2;
                *d++ = (bgr&0x7C00)>>7;
                *d++ = 0;
#endif
        }
}
1277

    
1278
/**
 * Expand 16-bit pixels (one uint16_t each) to 32-bit pixels (4 bytes each).
 *
 * The three fields of each input word (bits 0-4, 5-10, 11-15) are widened
 * to 8 bits with zero low bits and a zero fourth byte is added.
 * Without WORDS_BIGENDIAN the byte order is: field 0-4, field 5-10,
 * field 11-15, 0; with WORDS_BIGENDIAN the scalar path writes the
 * reverse order.
 *
 * @param src      input, read as 16-bit words
 * @param dst      output buffer, 4 bytes are written per input word
 * @param src_size size of the input in bytes
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint16_t *end;
#ifdef HAVE_MMX
        const uint16_t *mm_end;
#endif
        uint8_t *d = (uint8_t *)dst;
        /* keep the const qualifier of src when reinterpreting it as words */
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
        /* mm7 stays zero for the whole loop; it is the unpack filler */
        __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
        mm_end = end - 3;
        /* each iteration consumes 4 input words and stores 16 output bytes */
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm1\n\t"
                "psrlq        $8, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %%mm7, %%mm0\n\t"
                "punpcklwd %%mm7, %%mm1\n\t"
                "punpcklwd %%mm7, %%mm2\n\t"
                "punpckhwd %%mm7, %%mm3\n\t"
                "punpckhwd %%mm7, %%mm4\n\t"
                "punpckhwd %%mm7, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm3, 8%0\n\t"
                :"=m"(*d)
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
                :"memory");
                d += 16;
                s += 4;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar path: whatever the MMX loop left over (or everything) */
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
#ifdef WORDS_BIGENDIAN
                *d++ = 0;
                *d++ = (bgr&0xF800)>>8;
                *d++ = (bgr&0x7E0)>>3;
                *d++ = (bgr&0x1F)<<3;
#else
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x7E0)>>3;
                *d++ = (bgr&0xF800)>>8;
                *d++ = 0;
#endif
        }
}
1349

    
1350
/**
 * Swap the first and third byte of every 32-bit pixel.
 *
 * The scalar code copies three of the four bytes of each pixel (bytes
 * 0..2 without WORDS_BIGENDIAN, bytes 1..3 with it), exchanging the two
 * outer ones; it does not write the remaining byte of dst.
 * NOTE(review): in the MMX loop the value stored for that remaining byte
 * depends on the mask32r/mask32g/mask32b constants, which are defined
 * outside this function -- confirm that both paths are meant to treat
 * that byte the same way.
 *
 * @param src      packed 32-bit input
 * @param dst      output buffer of at least src_size bytes
 * @param src_size size of the input in bytes
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
        long i;
        long num_pixels;
#ifdef HAVE_MMX
/* TODO: unroll this loop */
        /*
         * The asm loop tests its exit condition only after handling 8 bytes
         * and compares unsigned, so it must not be entered with fewer than
         * 8 bytes: src_size-7 would be <= 0 and the loop would run far past
         * both buffers.
         */
        if (src_size >= 8)
        {
                const long mmx_bytes = src_size & ~7L; /* bytes the loop covers */

                asm volatile (
                        "xor %%"REG_a", %%"REG_a"        \n\t"
                        ASMALIGN(4)
                        "1:                                \n\t"
                        PREFETCH" 32(%0, %%"REG_a")        \n\t"
                        "movq (%0, %%"REG_a"), %%mm0        \n\t"
                        "movq %%mm0, %%mm1                \n\t"
                        "movq %%mm0, %%mm2                \n\t"
                        "pslld $16, %%mm0                \n\t"
                        "psrld $16, %%mm1                \n\t"
                        "pand "MANGLE(mask32r)", %%mm0        \n\t"
                        "pand "MANGLE(mask32g)", %%mm2        \n\t"
                        "pand "MANGLE(mask32b)", %%mm1        \n\t"
                        "por %%mm0, %%mm2                \n\t"
                        "por %%mm1, %%mm2                \n\t"
                        MOVNTQ" %%mm2, (%1, %%"REG_a")        \n\t"
                        "add $8, %%"REG_a"                \n\t"
                        "cmp %2, %%"REG_a"                \n\t"
                        " jb 1b                                \n\t"
                        :: "r" (src), "r"(dst), "r" (src_size-7)
                        /* the loop stores through dst, which the compiler cannot see */
                        : "%"REG_a, "memory"
                );

                __asm __volatile(SFENCE:::"memory");
                __asm __volatile(EMMS:::"memory");

                /* leave a possible trailing pixel to the scalar loop below */
                src      += mmx_bytes;
                dst      += mmx_bytes;
                src_size -= mmx_bytes;
        }
#endif
        num_pixels = src_size >> 2;
        for(i=0; i<num_pixels; i++)
        {
#ifdef WORDS_BIGENDIAN
          dst[4*i + 1] = src[4*i + 3];
          dst[4*i + 2] = src[4*i + 2];
          dst[4*i + 3] = src[4*i + 1];
#else
          dst[4*i + 0] = src[4*i + 2];
          dst[4*i + 1] = src[4*i + 1];
          dst[4*i + 2] = src[4*i + 0];
#endif
        }
}
1396

    
1397
/**
 * Swap the first and third byte of every 24-bit pixel.
 *
 * The scalar loop reads the third byte into a temporary before writing,
 * so a pixel is converted correctly even when src and dst are the same
 * buffer.
 *
 * @param src      packed 24-bit input
 * @param dst      output buffer of at least src_size bytes
 * @param src_size size of the input in bytes
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
        long i;
#ifdef HAVE_MMX
        /*
         * mmx_size counts upwards from 23 - src_size in steps of 24; while
         * it is negative at least 24 input bytes remain.
         */
        long mmx_size= 23 - src_size;
        /*
         * The asm loop checks its condition only after a full 24-byte step,
         * so entering it with fewer than 24 bytes would read and write past
         * the end of both buffers.
         * NOTE(review): the last load of a step ("movq 18(...)") fetches
         * source bytes 18..25, i.e. 2 bytes beyond the 24 bytes converted;
         * confirm callers provide readable padding after src.
         */
        if (mmx_size < 0)
        {
                asm volatile (
                        "movq "MANGLE(mask24r)", %%mm5        \n\t"
                        "movq "MANGLE(mask24g)", %%mm6        \n\t"
                        "movq "MANGLE(mask24b)", %%mm7        \n\t"
                        ASMALIGN(4)
                        "1:                                \n\t"
                        PREFETCH" 32(%1, %%"REG_a")        \n\t"
                        "movq   (%1, %%"REG_a"), %%mm0        \n\t" // BGR BGR BG
                        "movq   (%1, %%"REG_a"), %%mm1        \n\t" // BGR BGR BG
                        "movq  2(%1, %%"REG_a"), %%mm2        \n\t" // R BGR BGR B
                        "psllq $16, %%mm0                \n\t" // 00 BGR BGR
                        "pand %%mm5, %%mm0                \n\t"
                        "pand %%mm6, %%mm1                \n\t"
                        "pand %%mm7, %%mm2                \n\t"
                        "por %%mm0, %%mm1                \n\t"
                        "por %%mm2, %%mm1                \n\t"
                        "movq  6(%1, %%"REG_a"), %%mm0        \n\t" // BGR BGR BG
                        MOVNTQ" %%mm1,   (%2, %%"REG_a")\n\t" // RGB RGB RG
                        "movq  8(%1, %%"REG_a"), %%mm1        \n\t" // R BGR BGR B
                        "movq 10(%1, %%"REG_a"), %%mm2        \n\t" // GR BGR BGR
                        "pand %%mm7, %%mm0                \n\t"
                        "pand %%mm5, %%mm1                \n\t"
                        "pand %%mm6, %%mm2                \n\t"
                        "por %%mm0, %%mm1                \n\t"
                        "por %%mm2, %%mm1                \n\t"
                        "movq 14(%1, %%"REG_a"), %%mm0        \n\t" // R BGR BGR B
                        MOVNTQ" %%mm1,  8(%2, %%"REG_a")\n\t" // B RGB RGB R
                        "movq 16(%1, %%"REG_a"), %%mm1        \n\t" // GR BGR BGR
                        "movq 18(%1, %%"REG_a"), %%mm2        \n\t" // BGR BGR BG
                        "pand %%mm6, %%mm0                \n\t"
                        "pand %%mm7, %%mm1                \n\t"
                        "pand %%mm5, %%mm2                \n\t"
                        "por %%mm0, %%mm1                \n\t"
                        "por %%mm2, %%mm1                \n\t"
                        MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
                        "add $24, %%"REG_a"                \n\t"
                        " js 1b                                \n\t"
                        : "+a" (mmx_size)
                        : "r" (src-mmx_size), "r"(dst-mmx_size)
                        /* the loop stores through dst, which the compiler cannot see */
                        : "memory"
                );

                __asm __volatile(SFENCE:::"memory");
                __asm __volatile(EMMS:::"memory");
        }

        if(mmx_size==23) return; //finished: no bytes left (size was 0 or a multiple of 24)

        /* 23 - mmx_size bytes are still unconverted; point at them */
        src+= src_size;
        dst+= src_size;
        src_size= 23-mmx_size;
        src-= src_size;
        dst-= src_size;
#endif
        for(i=0; i<src_size; i+=3)
        {
                register uint8_t x;
                x          = src[i + 2];
                dst[i + 1] = src[i + 1];
                dst[i + 2] = src[i + 0];
                dst[i + 0] = x;
        }
}
1463

    
1464
/**
 * Interleave planar Y, U and V data into packed Y U Y V (YUY2) rows.
 *
 * Every chroma sample is combined with two luma samples, so width/2 groups
 * of 4 bytes are produced per row. The chroma pointers advance by
 * chromStride after each row y for which
 * (y & (vertLumPerChroma-1)) == vertLumPerChroma-1, i.e. one chroma row is
 * reused for vertLumPerChroma luma rows (the mask test presumably expects
 * vertLumPerChroma to be a power of two -- confirm with callers).
 *
 * NOTE(review): the MMX loop handles 8 chroma samples (16 pixels) per step
 * and always runs at least once; the __WORDSIZE >= 64 loop handles 2 chroma
 * samples per step and has no WORDS_BIGENDIAN variant. Widths that are not
 * a multiple of those step sizes are rounded up by the respective loop.
 *
 * @param ysrc             luma plane
 * @param usrc             first chroma plane (second byte of each group)
 * @param vsrc             second chroma plane (fourth byte of each group)
 * @param dst              packed output
 * @param width            width in pixels
 * @param height           number of rows
 * @param lumStride        distance in bytes between luma rows
 * @param chromStride      distance in bytes between chroma rows
 * @param dstStride        distance in bytes between output rows
 * @param vertLumPerChroma number of luma rows sharing one chroma row
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        long width, long height,
        long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
        long y;
        const long chromWidth= width>>1;
        for(y=0; y<height; y++)
        {
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
                asm volatile(
                        "xor %%"REG_a", %%"REG_a"        \n\t"
                        ASMALIGN(4)
                        "1:                                \n\t"
                        PREFETCH" 32(%1, %%"REG_a", 2)        \n\t"
                        PREFETCH" 32(%2, %%"REG_a")        \n\t"
                        PREFETCH" 32(%3, %%"REG_a")        \n\t"
                        "movq (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
                        "movq %%mm0, %%mm2                \n\t" // U(0)
                        "movq (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
                        "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)

                        "movq (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
                        "movq 8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
                        "movq %%mm3, %%mm4                \n\t" // Y(0)
                        "movq %%mm5, %%mm6                \n\t" // Y(8)
                        "punpcklbw %%mm0, %%mm3                \n\t" // YUYV YUYV(0)
                        "punpckhbw %%mm0, %%mm4                \n\t" // YUYV YUYV(4)
                        "punpcklbw %%mm2, %%mm5                \n\t" // YUYV YUYV(8)
                        "punpckhbw %%mm2, %%mm6                \n\t" // YUYV YUYV(12)

                        MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
                        MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
                        MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
                        MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"

                        "add $8, %%"REG_a"                \n\t"
                        "cmp %4, %%"REG_a"                \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
                        /* the loop stores through dst, which the compiler cannot see */
                        : "%"REG_a, "memory"
                );
#else

#if defined ARCH_ALPHA && defined HAVE_MVI
#define pl2yuy2(n)                                        \
        y1 = yc[n];                                        \
        y2 = yc2[n];                                        \
        u = uc[n];                                        \
        v = vc[n];                                        \
        asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));        \
        asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));        \
        asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
        asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
        yuv1 = (u << 8) + (v << 24);                        \
        yuv2 = yuv1 + y2;                                \
        yuv1 += y1;                                        \
        qdst[n] = yuv1;                                        \
        qdst2[n] = yuv2;

                /* this variant converts two rows (y and y+1) per pass */
                int i;
                uint64_t *qdst = (uint64_t *) dst;
                uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
                const uint32_t *yc = (uint32_t *) ysrc;
                const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
                const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
                for(i = 0; i < chromWidth; i += 8){
                        uint64_t y1, y2, yuv1, yuv2;
                        uint64_t u, v;
                        /* Prefetch */
                        asm("ldq $31,64(%0)" :: "r"(yc));
                        asm("ldq $31,64(%0)" :: "r"(yc2));
                        asm("ldq $31,64(%0)" :: "r"(uc));
                        asm("ldq $31,64(%0)" :: "r"(vc));

                        pl2yuy2(0);
                        pl2yuy2(1);
                        pl2yuy2(2);
                        pl2yuy2(3);

                        yc += 4;
                        yc2 += 4;
                        uc += 4;
                        vc += 4;
                        qdst += 4;
                        qdst2 += 4;
                }
                /* account for the second row handled above */
                y++;
                ysrc += lumStride;
                dst += dstStride;

#elif __WORDSIZE >= 64
                long i;
                uint64_t *ldst = (uint64_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i += 2){
                        /*
                         * Build both 32-bit groups in unsigned 64-bit
                         * arithmetic: a plain (vc[n] << 24) is a signed int
                         * that turns negative for values >= 128 and would
                         * sign-extend into the upper group.
                         */
                        const uint64_t k = (uint64_t)yc[0] + ((uint64_t)uc[0] << 8) +
                            ((uint64_t)yc[1] << 16) + ((uint64_t)vc[0] << 24);
                        const uint64_t l = (uint64_t)yc[2] + ((uint64_t)uc[1] << 8) +
                            ((uint64_t)yc[3] << 16) + ((uint64_t)vc[1] << 24);
                        *ldst++ = k + (l << 32);
                        yc += 4;
                        uc += 2;
                        vc += 2;
                }

#else
                long i;
                uint32_t *idst = (uint32_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i++){
                        /* unsigned arithmetic: shifting a byte by 24 must not overflow int */
#ifdef WORDS_BIGENDIAN
                        *idst++ = ((uint32_t)yc[0] << 24)+ ((uint32_t)uc[0] << 16) +
                            ((uint32_t)yc[1] << 8) + ((uint32_t)vc[0] << 0);
#else
                        *idst++ = (uint32_t)yc[0] + ((uint32_t)uc[0] << 8) +
                            ((uint32_t)yc[1] << 16) + ((uint32_t)vc[0] << 24);
#endif
                        yc += 2;
                        uc++;
                        vc++;
                }
#endif
#endif
                /* step to the next chroma row after the last luma row sharing it */
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
                {
                        usrc += chromStride;
                        vsrc += chromStride;
                }
                ysrc += lumStride;
                dst += dstStride;
        }
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
1603

    
1604
/**
1605
 *
1606
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1607
 * problem for anyone then tell me, and ill fix it)
1608
 */
1609
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1610
        long width, long height,
1611
        long lumStride, long chromStride, long dstStride)
1612
{
1613
        //FIXME interpolate chroma
1614
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1615
}
1616

    
1617
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1618
        long width, long height,
1619
        long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1620
{
1621
        long y;
1622
        const long chromWidth= width>>1;
1623
        for(y=0; y<height; y++)
1624
        {
1625
#ifdef HAVE_MMX
1626
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1627
                asm volatile(
1628
                        "xor %%"REG_a", %%"REG_a"        \n\t"
1629
                        ASMALIGN(4)
1630
                        "1:                                \n\t"
1631
                        PREFETCH" 32(%1, %%"REG_a", 2)        \n\t"
1632
                        PREFETCH" 32(%2, %%"REG_a")        \n\t"
1633
                        PREFETCH" 32(%3, %%"REG_a")        \n\t"
1634
                        "movq (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
1635
                        "movq %%mm0, %%mm2                \n\t" // U(0)
1636
                        "movq (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
1637
                        "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
1638
                        "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)
1639

    
1640
                        "movq (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
1641
                        "movq 8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
1642
                        "movq %%mm0, %%mm4                \n\t" // Y(0)
1643
                        "movq %%mm2, %%mm6                \n\t" // Y(8)
1644
                        "punpcklbw %%mm3, %%mm0                \n\t" // YUYV YUYV(0)
1645
                        "punpckhbw %%mm3, %%mm4                \n\t" // YUYV YUYV(4)
1646
                        "punpcklbw %%mm5, %%mm2                \n\t" // YUYV YUYV(8)
1647
                        "punpckhbw %%mm5, %%mm6                \n\t" // YUYV YUYV(12)
1648

    
1649
                        MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1650
                        MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1651
                        MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1652
                        MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1653

    
1654
                        "add $8, %%"REG_a"                \n\t"
1655
                        "cmp %4, %%"REG_a"                \n\t"
1656
                        " jb 1b                                \n\t"
1657
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1658
                        : "%"REG_a
1659
                );
1660
#else
1661
//FIXME adapt the alpha asm code from yv12->yuy2
1662

    
1663
#if __WORDSIZE >= 64
1664
                int i;
1665
                uint64_t *ldst = (uint64_t *) dst;
1666
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1667
                for(i = 0; i < chromWidth; i += 2){
1668
                        uint64_t k, l;
1669
                        k = uc[0] + (yc[0] << 8) +
1670
                            (vc[0] << 16) + (yc[1] << 24);
1671
                        l = uc[1] + (yc[2] << 8) +
1672
                            (vc[1] << 16) + (yc[3] << 24);
1673
                        *ldst++ = k + (l << 32);
1674
                        yc += 4;
1675
                        uc += 2;
1676
                        vc += 2;
1677
                }
1678

    
1679
#else
1680
                int i, *idst = (int32_t *) dst;
1681
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1682
                for(i = 0; i < chromWidth; i++){
1683
#ifdef WORDS_BIGENDIAN
1684
                        *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1685
                            (vc[0] << 8) + (yc[1] << 0);
1686
#else
1687
                        *idst++ = uc[0] + (yc[0] << 8) +
1688
                            (vc[0] << 16) + (yc[1] << 24);
1689
#endif
1690
                        yc += 2;
1691
                        uc++;
1692
                        vc++;
1693
                }
1694
#endif
1695
#endif
1696
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1697
                {
1698
                        usrc += chromStride;
1699
                        vsrc += chromStride;
1700
                }
1701
                ysrc += lumStride;
1702
                dst += dstStride;
1703
        }
1704
#ifdef HAVE_MMX
1705
asm(    EMMS" \n\t"
1706
        SFENCE" \n\t"
1707
        :::"memory");
1708
#endif
1709
}
1710

    
1711
/**
1712
 *
1713
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1714
 * problem for anyone then tell me, and ill fix it)
1715
 */
1716
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1717
        long width, long height,
1718
        long lumStride, long chromStride, long dstStride)
1719
{
1720
        //FIXME interpolate chroma
1721
        RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1722
}
1723

    
1724
/**
1725
 *
1726
 * width should be a multiple of 16
1727
 */
1728
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1729
        long width, long height,
1730
        long lumStride, long chromStride, long dstStride)
1731
{
1732
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1733
}
1734

    
1735
/**
1736
 *
1737
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1738
 * problem for anyone then tell me, and ill fix it)
1739
 */
1740
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1741
        long width, long height,
1742
        long lumStride, long chromStride, long srcStride)
1743
{
1744
        long y;
1745
        const long chromWidth= width>>1;
1746
        for(y=0; y<height; y+=2)
1747
        {
1748
#ifdef HAVE_MMX
1749
                asm volatile(
1750
                        "xor %%"REG_a", %%"REG_a"        \n\t"
1751
                        "pcmpeqw %%mm7, %%mm7                \n\t"
1752
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1753
                        ASMALIGN(4)
1754
                        "1:                                \n\t"
1755
                        PREFETCH" 64(%0, %%"REG_a", 4)        \n\t"
1756
                        "movq (%0, %%"REG_a", 4), %%mm0        \n\t" // YUYV YUYV(0)
1757
                        "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1758
                        "movq %%mm0, %%mm2                \n\t" // YUYV YUYV(0)
1759
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(4)
1760
                        "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
1761
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
1762
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
1763
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
1764
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
1765
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)
1766

    
1767
                        MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1768

    
1769
                        "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1770
                        "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1771
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(8)
1772
                        "movq %%mm2, %%mm4                \n\t" // YUYV YUYV(12)
1773
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
1774
                        "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
1775
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
1776
                        "pand %%mm7, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
1777
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
1778
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)
1779

    
1780
                        MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
1781

    
1782
                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
1783
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
1784
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1785
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1786
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
1787
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
1788
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
1789
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)
1790

    
1791
                        MOVNTQ" %%mm0, (%3, %%"REG_a")        \n\t"
1792
                        MOVNTQ" %%mm2, (%2, %%"REG_a")        \n\t"
1793

    
1794
                        "add $8, %%"REG_a"                \n\t"
1795
                        "cmp %4, %%"REG_a"                \n\t"
1796
                        " jb 1b                                \n\t"
1797
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1798
                        : "memory", "%"REG_a
1799
                );
1800

    
1801
                ydst += lumStride;
1802
                src  += srcStride;
1803

    
1804
                asm volatile(
1805
                        "xor %%"REG_a", %%"REG_a"        \n\t"
1806
                        ASMALIGN(4)
1807
                        "1:                                \n\t"
1808
                        PREFETCH" 64(%0, %%"REG_a", 4)        \n\t"
1809
                        "movq (%0, %%"REG_a", 4), %%mm0        \n\t" // YUYV YUYV(0)
1810
                        "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1811
                        "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1812
                        "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1813
                        "pand %%mm7, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
1814
                        "pand %%mm7, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
1815
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
1816
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
1817
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
1818
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)
1819

    
1820
                        MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1821
                        MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1822

    
1823
                        "add $8, %%"REG_a"                \n\t"
1824
                        "cmp %4, %%"REG_a"                \n\t"
1825
                        " jb 1b                                \n\t"
1826

    
1827
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1828
                        : "memory", "%"REG_a
1829
                );
1830
#else
1831
                long i;
1832
                for(i=0; i<chromWidth; i++)
1833
                {
1834
                        ydst[2*i+0]         = src[4*i+0];
1835
                        udst[i]         = src[4*i+1];
1836
                        ydst[2*i+1]         = src[4*i+2];
1837
                        vdst[i]         = src[4*i+3];
1838
                }
1839
                ydst += lumStride;
1840
                src  += srcStride;
1841

    
1842
                for(i=0; i<chromWidth; i++)
1843
                {
1844
                        ydst[2*i+0]         = src[4*i+0];
1845
                        ydst[2*i+1]         = src[4*i+2];
1846
                }
1847
#endif
1848
                udst += chromStride;
1849
                vdst += chromStride;
1850
                ydst += lumStride;
1851
                src  += srcStride;
1852
        }
1853
#ifdef HAVE_MMX
1854
asm volatile(   EMMS" \n\t"
1855
                SFENCE" \n\t"
1856
                :::"memory");
1857
#endif
1858
}
1859

    
1860
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1861
        uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1862
        long width, long height, long lumStride, long chromStride)
1863
{
1864
        /* Y Plane */
1865
        memcpy(ydst, ysrc, width*height);
1866

    
1867
        /* XXX: implement upscaling for U,V */
1868
}
1869

    
1870
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1871
{
1872
        long x,y;
1873
        
1874
        dst[0]= src[0];
1875
        
1876
        // first line
1877
        for(x=0; x<srcWidth-1; x++){
1878
                dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1879
                dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1880
        }
1881
        dst[2*srcWidth-1]= src[srcWidth-1];
1882
        
1883
        dst+= dstStride;
1884

    
1885
        for(y=1; y<srcHeight; y++){
1886
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1887
                const long mmxSize= srcWidth&~15;
1888
                asm volatile(
1889
                        "mov %4, %%"REG_a"                \n\t"
1890
                        "1:                                \n\t"
1891
                        "movq (%0, %%"REG_a"), %%mm0        \n\t"
1892
                        "movq (%1, %%"REG_a"), %%mm1        \n\t"
1893
                        "movq 1(%0, %%"REG_a"), %%mm2        \n\t"
1894
                        "movq 1(%1, %%"REG_a"), %%mm3        \n\t"
1895
                        "movq -1(%0, %%"REG_a"), %%mm4        \n\t"
1896
                        "movq -1(%1, %%"REG_a"), %%mm5        \n\t"
1897
                        PAVGB" %%mm0, %%mm5                \n\t"
1898
                        PAVGB" %%mm0, %%mm3                \n\t"
1899
                        PAVGB" %%mm0, %%mm5                \n\t"
1900
                        PAVGB" %%mm0, %%mm3                \n\t"
1901
                        PAVGB" %%mm1, %%mm4                \n\t"
1902
                        PAVGB" %%mm1, %%mm2                \n\t"
1903
                        PAVGB" %%mm1, %%mm4                \n\t"
1904
                        PAVGB" %%mm1, %%mm2                \n\t"
1905
                        "movq %%mm5, %%mm7                \n\t"
1906
                        "movq %%mm4, %%mm6                \n\t"
1907
                        "punpcklbw %%mm3, %%mm5                \n\t"
1908
                        "punpckhbw %%mm3, %%mm7                \n\t"
1909
                        "punpcklbw %%mm2, %%mm4                \n\t"
1910
                        "punpckhbw %%mm2, %%mm6                \n\t"
1911
#if 1
1912
                        MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1913
                        MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1914
                        MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1915
                        MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1916
#else
1917
                        "movq %%mm5, (%2, %%"REG_a", 2)        \n\t"
1918
                        "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1919
                        "movq %%mm4, (%3, %%"REG_a", 2)        \n\t"
1920
                        "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1921
#endif
1922
                        "add $8, %%"REG_a"                \n\t"
1923
                        " js 1b                                \n\t"
1924
                        :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1925
                           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1926
                           "g" (-mmxSize)
1927
                        : "%"REG_a
1928

    
1929
                );
1930
#else
1931
                const long mmxSize=1;
1932
#endif
1933
                dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1934
                dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1935

    
1936
                for(x=mmxSize-1; x<srcWidth-1; x++){
1937
                        dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1938
                        dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1939
                        dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1940
                        dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1941
                }
1942
                dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1943
                dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1944

    
1945
                dst+=dstStride*2;
1946
                src+=srcStride;
1947
        }
1948
        
1949
        // last line
1950
#if 1
1951
        dst[0]= src[0];
1952
        
1953
        for(x=0; x<srcWidth-1; x++){
1954
                dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1955
                dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1956
        }
1957
        dst[2*srcWidth-1]= src[srcWidth-1];
1958
#else
1959
        for(x=0; x<srcWidth; x++){
1960
                dst[2*x+0]=
1961
                dst[2*x+1]= src[x];
1962
        }
1963
#endif
1964

    
1965
#ifdef HAVE_MMX
1966
asm volatile(   EMMS" \n\t"
1967
                SFENCE" \n\t"
1968
                :::"memory");
1969
#endif
1970
}
1971

    
1972
/**
1973
 *
1974
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1975
 * problem for anyone then tell me, and ill fix it)
1976
 * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
1977
 */
1978
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1979
        long width, long height,
1980
        long lumStride, long chromStride, long srcStride)
1981
{
1982
        long y;
1983
        const long chromWidth= width>>1;
1984
        for(y=0; y<height; y+=2)
1985
        {
1986
#ifdef HAVE_MMX
1987
                asm volatile(
1988
                        "xorl %%eax, %%eax                \n\t"
1989
                        "pcmpeqw %%mm7, %%mm7                \n\t"
1990
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1991
                        ASMALIGN(4)
1992
                        "1:                                \n\t"
1993
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
1994
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // UYVY UYVY(0)
1995
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(4)
1996
                        "movq %%mm0, %%mm2                \n\t" // UYVY UYVY(0)
1997
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(4)
1998
                        "pand %%mm7, %%mm0                \n\t" // U0V0 U0V0(0)
1999
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(4)
2000
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
2001
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
2002
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
2003
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)
2004

    
2005
                        MOVNTQ" %%mm2, (%1, %%eax, 2)        \n\t"
2006

    
2007
                        "movq 16(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(8)
2008
                        "movq 24(%0, %%eax, 4), %%mm2        \n\t" // UYVY UYVY(12)
2009
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(8)
2010
                        "movq %%mm2, %%mm4                \n\t" // UYVY UYVY(12)
2011
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(8)
2012
                        "pand %%mm7, %%mm2                \n\t" // U0V0 U0V0(12)
2013
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
2014
                        "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
2015
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
2016
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)
2017

    
2018
                        MOVNTQ" %%mm3, 8(%1, %%eax, 2)        \n\t"
2019

    
2020
                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
2021
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
2022
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
2023
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
2024
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
2025
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
2026
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
2027
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)
2028

    
2029
                        MOVNTQ" %%mm0, (%3, %%eax)        \n\t"
2030
                        MOVNTQ" %%mm2, (%2, %%eax)        \n\t"
2031

    
2032
                        "addl $8, %%eax                        \n\t"
2033
                        "cmpl %4, %%eax                        \n\t"
2034
                        " jb 1b                                \n\t"
2035
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2036
                        : "memory", "%eax"
2037
                );
2038

    
2039
                ydst += lumStride;
2040
                src  += srcStride;
2041

    
2042
                asm volatile(
2043
                        "xorl %%eax, %%eax                \n\t"
2044
                        ASMALIGN(4)
2045
                        "1:                                \n\t"
2046
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
2047
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
2048
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
2049
                        "movq 16(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(8)
2050
                        "movq 24(%0, %%eax, 4), %%mm3        \n\t" // YUYV YUYV(12)
2051
                        "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
2052
                        "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
2053
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
2054
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
2055
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
2056
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)
2057

    
2058
                        MOVNTQ" %%mm0, (%1, %%eax, 2)        \n\t"
2059
                        MOVNTQ" %%mm2, 8(%1, %%eax, 2)        \n\t"
2060

    
2061
                        "addl $8, %%eax                        \n\t"
2062
                        "cmpl %4, %%eax                        \n\t"
2063
                        " jb 1b                                \n\t"
2064

    
2065
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2066
                        : "memory", "%eax"
2067
                );
2068
#else
2069
                long i;
2070
                for(i=0; i<chromWidth; i++)
2071
                {
2072
                        udst[i]         = src[4*i+0];
2073
                        ydst[2*i+0]         = src[4*i+1];
2074
                        vdst[i]         = src[4*i+2];
2075
                        ydst[2*i+1]         = src[4*i+3];
2076
                }
2077
                ydst += lumStride;
2078
                src  += srcStride;
2079

    
2080
                for(i=0; i<chromWidth; i++)
2081
                {
2082
                        ydst[2*i+0]         = src[4*i+1];
2083
                        ydst[2*i+1]         = src[4*i+3];
2084
                }
2085
#endif
2086
                udst += chromStride;
2087
                vdst += chromStride;
2088
                ydst += lumStride;
2089
                src  += srcStride;
2090
        }
2091
#ifdef HAVE_MMX
2092
asm volatile(   EMMS" \n\t"
2093
                SFENCE" \n\t"
2094
                :::"memory");
2095
#endif
2096
}
2097

    
2098
/**
2099
 *
2100
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2101
 * problem for anyone then tell me, and I'll fix it)
2102
 * chrominance data is only taken from every second line; others are ignored in the C version. FIXME: write HQ version
2103
 */
2104
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2105
        long width, long height,
2106
        long lumStride, long chromStride, long srcStride)
2107
{
2108
        long y;
2109
        const long chromWidth= width>>1;
2110
#ifdef HAVE_MMX
2111
        for(y=0; y<height-2; y+=2)
2112
        {
2113
                long i;
2114
                for(i=0; i<2; i++)
2115
                {
2116
                        asm volatile(
2117
                                "mov %2, %%"REG_a"                \n\t"
2118
                                "movq "MANGLE(bgr2YCoeff)", %%mm6                \n\t"
2119
                                "movq "MANGLE(w1111)", %%mm5                \n\t"
2120
                                "pxor %%mm7, %%mm7                \n\t"
2121
                                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2122
                                ASMALIGN(4)
2123
                                "1:                                \n\t"
2124
                                PREFETCH" 64(%0, %%"REG_b")        \n\t"
2125
                                "movd (%0, %%"REG_b"), %%mm0        \n\t"
2126
                                "movd 3(%0, %%"REG_b"), %%mm1        \n\t"
2127
                                "punpcklbw %%mm7, %%mm0                \n\t"
2128
                                "punpcklbw %%mm7, %%mm1                \n\t"
2129
                                "movd 6(%0, %%"REG_b"), %%mm2        \n\t"
2130
                                "movd 9(%0, %%"REG_b"), %%mm3        \n\t"
2131
                                "punpcklbw %%mm7, %%mm2                \n\t"
2132
                                "punpcklbw %%mm7, %%mm3                \n\t"
2133
                                "pmaddwd %%mm6, %%mm0                \n\t"
2134
                                "pmaddwd %%mm6, %%mm1                \n\t"
2135
                                "pmaddwd %%mm6, %%mm2                \n\t"
2136
                                "pmaddwd %%mm6, %%mm3                \n\t"
2137
#ifndef FAST_BGR2YV12
2138
                                "psrad $8, %%mm0                \n\t"
2139
                                "psrad $8, %%mm1                \n\t"
2140
                                "psrad $8, %%mm2                \n\t"
2141
                                "psrad $8, %%mm3                \n\t"
2142
#endif
2143
                                "packssdw %%mm1, %%mm0                \n\t"
2144
                                "packssdw %%mm3, %%mm2                \n\t"
2145
                                "pmaddwd %%mm5, %%mm0                \n\t"
2146
                                "pmaddwd %%mm5, %%mm2                \n\t"
2147
                                "packssdw %%mm2, %%mm0                \n\t"
2148
                                "psraw $7, %%mm0                \n\t"
2149

    
2150
                                "movd 12(%0, %%"REG_b"), %%mm4        \n\t"
2151
                                "movd 15(%0, %%"REG_b"), %%mm1        \n\t"
2152
                                "punpcklbw %%mm7, %%mm4                \n\t"
2153
                                "punpcklbw %%mm7, %%mm1                \n\t"
2154
                                "movd 18(%0, %%"REG_b"), %%mm2        \n\t"
2155
                                "movd 21(%0, %%"REG_b"), %%mm3        \n\t"
2156
                                "punpcklbw %%mm7, %%mm2                \n\t"
2157
                                "punpcklbw %%mm7, %%mm3                \n\t"
2158
                                "pmaddwd %%mm6, %%mm4                \n\t"
2159
                                "pmaddwd %%mm6, %%mm1                \n\t"
2160
                                "pmaddwd %%mm6, %%mm2                \n\t"
2161
                                "pmaddwd %%mm6, %%mm3                \n\t"
2162
#ifndef FAST_BGR2YV12
2163
                                "psrad $8, %%mm4                \n\t"
2164
                                "psrad $8, %%mm1                \n\t"
2165
                                "psrad $8, %%mm2                \n\t"
2166
                                "psrad $8, %%mm3                \n\t"
2167
#endif
2168
                                "packssdw %%mm1, %%mm4                \n\t"
2169
                                "packssdw %%mm3, %%mm2                \n\t"
2170
                                "pmaddwd %%mm5, %%mm4                \n\t"
2171
                                "pmaddwd %%mm5, %%mm2                \n\t"
2172
                                "add $24, %%"REG_b"                \n\t"
2173
                                "packssdw %%mm2, %%mm4                \n\t"
2174
                                "psraw $7, %%mm4                \n\t"
2175

    
2176
                                "packuswb %%mm4, %%mm0                \n\t"
2177
                                "paddusb "MANGLE(bgr2YOffset)", %%mm0        \n\t"
2178

    
2179
                                MOVNTQ" %%mm0, (%1, %%"REG_a")        \n\t"
2180
                                "add $8, %%"REG_a"                \n\t"
2181
                                " js 1b                                \n\t"
2182
                                : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2183
                                : "%"REG_a, "%"REG_b
2184
                        );
2185
                        ydst += lumStride;
2186
                        src  += srcStride;
2187
                }
2188
                src -= srcStride*2;
2189
                asm volatile(
2190
                        "mov %4, %%"REG_a"                \n\t"
2191
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
2192
                        "movq "MANGLE(bgr2UCoeff)", %%mm6                \n\t"
2193
                        "pxor %%mm7, %%mm7                \n\t"
2194
                        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2195
                        "add %%"REG_b", %%"REG_b"        \n\t"
2196
                        ASMALIGN(4)
2197
                        "1:                                \n\t"
2198
                        PREFETCH" 64(%0, %%"REG_b")        \n\t"
2199
                        PREFETCH" 64(%1, %%"REG_b")        \n\t"
2200
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2201
                        "movq (%0, %%"REG_b"), %%mm0        \n\t"
2202
                        "movq (%1, %%"REG_b"), %%mm1        \n\t"
2203
                        "movq 6(%0, %%"REG_b"), %%mm2        \n\t"
2204
                        "movq 6(%1, %%"REG_b"), %%mm3        \n\t"
2205
                        PAVGB" %%mm1, %%mm0                \n\t"
2206
                        PAVGB" %%mm3, %%mm2                \n\t"
2207
                        "movq %%mm0, %%mm1                \n\t"
2208
                        "movq %%mm2, %%mm3                \n\t"
2209
                        "psrlq $24, %%mm0                \n\t"
2210
                        "psrlq $24, %%mm2                \n\t"
2211
                        PAVGB" %%mm1, %%mm0                \n\t"
2212
                        PAVGB" %%mm3, %%mm2                \n\t"
2213
                        "punpcklbw %%mm7, %%mm0                \n\t"
2214
                        "punpcklbw %%mm7, %%mm2                \n\t"
2215
#else
2216
                        "movd (%0, %%"REG_b"), %%mm0        \n\t"
2217
                        "movd (%1, %%"REG_b"), %%mm1        \n\t"
2218
                        "movd 3(%0, %%"REG_b"), %%mm2        \n\t"
2219
                        "movd 3(%1, %%"REG_b"), %%mm3        \n\t"
2220
                        "punpcklbw %%mm7, %%mm0                \n\t"
2221
                        "punpcklbw %%mm7, %%mm1                \n\t"
2222
                        "punpcklbw %%mm7, %%mm2                \n\t"
2223
                        "punpcklbw %%mm7, %%mm3                \n\t"
2224
                        "paddw %%mm1, %%mm0                \n\t"
2225
                        "paddw %%mm3, %%mm2                \n\t"
2226
                        "paddw %%mm2, %%mm0                \n\t"
2227
                        "movd 6(%0, %%"REG_b"), %%mm4        \n\t"
2228
                        "movd 6(%1, %%"REG_b"), %%mm1        \n\t"
2229
                        "movd 9(%0, %%"REG_b"), %%mm2        \n\t"
2230
                        "movd 9(%1, %%"REG_b"), %%mm3        \n\t"
2231
                        "punpcklbw %%mm7, %%mm4                \n\t"
2232
                        "punpcklbw %%mm7, %%mm1                \n\t"
2233
                        "punpcklbw %%mm7, %%mm2                \n\t"
2234
                        "punpcklbw %%mm7, %%mm3                \n\t"
2235
                        "paddw %%mm1, %%mm4                \n\t"
2236
                        "paddw %%mm3, %%mm2                \n\t"
2237
                        "paddw %%mm4, %%mm2                \n\t"
2238
                        "psrlw $2, %%mm0                \n\t"
2239
                        "psrlw $2, %%mm2                \n\t"
2240
#endif
2241
                        "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
2242
                        "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
2243

    
2244
                        "pmaddwd %%mm0, %%mm1                \n\t"
2245
                        "pmaddwd %%mm2, %%mm3                \n\t"
2246
                        "pmaddwd %%mm6, %%mm0                \n\t"
2247
                        "pmaddwd %%mm6, %%mm2                \n\t"
2248
#ifndef FAST_BGR2YV12
2249
                        "psrad $8, %%mm0                \n\t"
2250
                        "psrad $8, %%mm1                \n\t"
2251
                        "psrad $8, %%mm2                \n\t"
2252
                        "psrad $8, %%mm3                \n\t"
2253
#endif
2254
                        "packssdw %%mm2, %%mm0                \n\t"
2255
                        "packssdw %%mm3, %%mm1                \n\t"
2256
                        "pmaddwd %%mm5, %%mm0                \n\t"
2257
                        "pmaddwd %%mm5, %%mm1                \n\t"
2258
                        "packssdw %%mm1, %%mm0                \n\t" // V1 V0 U1 U0
2259
                        "psraw $7, %%mm0                \n\t"
2260

    
2261
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2262
                        "movq 12(%0, %%"REG_b"), %%mm4        \n\t"
2263
                        "movq 12(%1, %%"REG_b"), %%mm1        \n\t"
2264
                        "movq 18(%0, %%"REG_b"), %%mm2        \n\t"
2265
                        "movq 18(%1, %%"REG_b"), %%mm3        \n\t"
2266
                        PAVGB" %%mm1, %%mm4                \n\t"
2267
                        PAVGB" %%mm3, %%mm2                \n\t"
2268
                        "movq %%mm4, %%mm1                \n\t"
2269
                        "movq %%mm2, %%mm3                \n\t"
2270
                        "psrlq $24, %%mm4                \n\t"
2271
                        "psrlq $24, %%mm2                \n\t"
2272
                        PAVGB" %%mm1, %%mm4                \n\t"
2273
                        PAVGB" %%mm3, %%mm2                \n\t"
2274
                        "punpcklbw %%mm7, %%mm4                \n\t"
2275
                        "punpcklbw %%mm7, %%mm2                \n\t"
2276
#else
2277
                        "movd 12(%0, %%"REG_b"), %%mm4        \n\t"
2278
                        "movd 12(%1, %%"REG_b"), %%mm1        \n\t"
2279
                        "movd 15(%0, %%"REG_b"), %%mm2        \n\t"
2280
                        "movd 15(%1, %%"REG_b"), %%mm3        \n\t"
2281
                        "punpcklbw %%mm7, %%mm4                \n\t"
2282
                        "punpcklbw %%mm7, %%mm1                \n\t"
2283
                        "punpcklbw %%mm7, %%mm2                \n\t"
2284
                        "punpcklbw %%mm7, %%mm3                \n\t"
2285
                        "paddw %%mm1, %%mm4                \n\t"
2286
                        "paddw %%mm3, %%mm2                \n\t"
2287
                        "paddw %%mm2, %%mm4                \n\t"
2288
                        "movd 18(%0, %%"REG_b"), %%mm5        \n\t"
2289
                        "movd 18(%1, %%"REG_b"), %%mm1        \n\t"
2290
                        "movd 21(%0, %%"REG_b"), %%mm2        \n\t"
2291
                        "movd 21(%1, %%"REG_b"), %%mm3        \n\t"
2292
                        "punpcklbw %%mm7, %%mm5                \n\t"
2293
                        "punpcklbw %%mm7, %%mm1                \n\t"
2294
                        "punpcklbw %%mm7, %%mm2                \n\t"
2295
                        "punpcklbw %%mm7, %%mm3                \n\t"
2296
                        "paddw %%mm1, %%mm5                \n\t"
2297
                        "paddw %%mm3, %%mm2                \n\t"
2298
                        "paddw %%mm5, %%mm2                \n\t"
2299
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
2300
                        "psrlw $2, %%mm4                \n\t"
2301
                        "psrlw $2, %%mm2                \n\t"
2302
#endif
2303
                        "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
2304
                        "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
2305

    
2306
                        "pmaddwd %%mm4, %%mm1                \n\t"
2307
                        "pmaddwd %%mm2, %%mm3                \n\t"
2308
                        "pmaddwd %%mm6, %%mm4                \n\t"
2309
                        "pmaddwd %%mm6, %%mm2                \n\t"
2310
#ifndef FAST_BGR2YV12
2311
                        "psrad $8, %%mm4                \n\t"
2312
                        "psrad $8, %%mm1                \n\t"
2313
                        "psrad $8, %%mm2                \n\t"
2314
                        "psrad $8, %%mm3                \n\t"
2315
#endif
2316
                        "packssdw %%mm2, %%mm4                \n\t"
2317
                        "packssdw %%mm3, %%mm1                \n\t"
2318
                        "pmaddwd %%mm5, %%mm4                \n\t"
2319
                        "pmaddwd %%mm5, %%mm1                \n\t"
2320
                        "add $24, %%"REG_b"                \n\t"
2321
                        "packssdw %%mm1, %%mm4                \n\t" // V3 V2 U3 U2
2322
                        "psraw $7, %%mm4                \n\t"
2323

    
2324
                        "movq %%mm0, %%mm1                \n\t"
2325
                        "punpckldq %%mm4, %%mm0                \n\t"
2326
                        "punpckhdq %%mm4, %%mm1                \n\t"
2327
                        "packsswb %%mm1, %%mm0                \n\t"
2328
                        "paddb "MANGLE(bgr2UVOffset)", %%mm0        \n\t"
2329
                        "movd %%mm0, (%2, %%"REG_a")        \n\t"
2330
                        "punpckhdq %%mm0, %%mm0                \n\t"
2331
                        "movd %%mm0, (%3, %%"REG_a")        \n\t"
2332
                        "add $4, %%"REG_a"                \n\t"
2333
                        " js 1b                                \n\t"
2334
                        : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2335
                        : "%"REG_a, "%"REG_b
2336
                );
2337

    
2338
                udst += chromStride;
2339
                vdst += chromStride;
2340
                src  += srcStride*2;
2341
        }
2342

    
2343
        asm volatile(   EMMS" \n\t"
2344
                        SFENCE" \n\t"
2345
                        :::"memory");
2346
#else
2347
        y=0;
2348
#endif
2349
        for(; y<height; y+=2)
2350
        {
2351
                long i;
2352
                for(i=0; i<chromWidth; i++)
2353
                {
2354
                        unsigned int b= src[6*i+0];
2355
                        unsigned int g= src[6*i+1];
2356
                        unsigned int r= src[6*i+2];
2357

    
2358
                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2359
                        unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2360
                        unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2361

    
2362
                        udst[i]         = U;
2363
                        vdst[i]         = V;
2364
                        ydst[2*i]         = Y;
2365

    
2366
                        b= src[6*i+3];
2367
                        g= src[6*i+4];
2368
                        r= src[6*i+5];
2369

    
2370
                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2371
                        ydst[2*i+1]         = Y;
2372
                }
2373
                ydst += lumStride;
2374
                src  += srcStride;
2375

    
2376
                for(i=0; i<chromWidth; i++)
2377
                {
2378
                        unsigned int b= src[6*i+0];
2379
                        unsigned int g= src[6*i+1];
2380
                        unsigned int r= src[6*i+2];
2381

    
2382
                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2383

    
2384
                        ydst[2*i]         = Y;
2385

    
2386
                        b= src[6*i+3];
2387
                        g= src[6*i+4];
2388
                        r= src[6*i+5];
2389

    
2390
                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2391
                        ydst[2*i+1]         = Y;
2392
                }
2393
                udst += chromStride;
2394
                vdst += chromStride;
2395
                ydst += lumStride;
2396
                src  += srcStride;
2397
        }
2398
}
2399

    
2400
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2401
                            long width, long height, long src1Stride,
2402
                            long src2Stride, long dstStride){
2403
        long h;
2404

    
2405
        for(h=0; h < height; h++)
2406
        {
2407
                long w;
2408

    
2409
#ifdef HAVE_MMX
2410
#ifdef HAVE_SSE2
2411
                asm(
2412
                        "xor %%"REG_a", %%"REG_a"        \n\t"
2413
                        "1:                                \n\t"
2414
                        PREFETCH" 64(%1, %%"REG_a")        \n\t"
2415
                        PREFETCH" 64(%2, %%"REG_a")        \n\t"
2416
                        "movdqa (%1, %%"REG_a"), %%xmm0        \n\t"
2417
                        "movdqa (%1, %%"REG_a"), %%xmm1        \n\t"
2418
                        "movdqa (%2, %%"REG_a"), %%xmm2        \n\t"
2419
                        "punpcklbw %%xmm2, %%xmm0        \n\t"
2420
                        "punpckhbw %%xmm2, %%xmm1        \n\t"
2421
                        "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2422
                        "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2423
                        "add $16, %%"REG_a"                \n\t"
2424
                        "cmp %3, %%"REG_a"                \n\t"
2425
                        " jb 1b                                \n\t"
2426
                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2427
                        : "memory", "%"REG_a""
2428
                );
2429
#else
2430
                asm(
2431
                        "xor %%"REG_a", %%"REG_a"        \n\t"
2432
                        "1:                                \n\t"
2433
                        PREFETCH" 64(%1, %%"REG_a")        \n\t"
2434
                        PREFETCH" 64(%2, %%"REG_a")        \n\t"
2435
                        "movq (%1, %%"REG_a"), %%mm0        \n\t"
2436
                        "movq 8(%1, %%"REG_a"), %%mm2        \n\t"
2437
                        "movq %%mm0, %%mm1                \n\t"
2438
                        "movq %%mm2, %%mm3                \n\t"
2439
                        "movq (%2, %%"REG_a"), %%mm4        \n\t"
2440
                        "movq 8(%2, %%"REG_a"), %%mm5        \n\t"
2441
                        "punpcklbw %%mm4, %%mm0                \n\t"
2442
                        "punpckhbw %%mm4, %%mm1                \n\t"
2443
                        "punpcklbw %%mm5, %%mm2                \n\t"
2444
                        "punpckhbw %%mm5, %%mm3                \n\t"
2445
                        MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2446
                        MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2447
                        MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2448
                        MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2449
                        "add $16, %%"REG_a"                \n\t"
2450
                        "cmp %3, %%"REG_a"                \n\t"
2451
                        " jb 1b                                \n\t"
2452
                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2453
                        : "memory", "%"REG_a
2454
                );
2455
#endif
2456
                for(w= (width&(~15)); w < width; w++)
2457
                {
2458
                        dest[2*w+0] = src1[w];
2459
                        dest[2*w+1] = src2[w];
2460
                }
2461
#else
2462
                for(w=0; w < width; w++)
2463
                {
2464
                        dest[2*w+0] = src1[w];
2465
                        dest[2*w+1] = src2[w];
2466
                }
2467
#endif
2468
                dest += dstStride;
2469
                src1 += src1Stride;
2470
                src2 += src2Stride;
2471
        }
2472
#ifdef HAVE_MMX
2473
        asm(
2474
                EMMS" \n\t"
2475
                SFENCE" \n\t"
2476
                ::: "memory"
2477
                );
2478
#endif
2479
}
2480

    
2481
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2482
                        uint8_t *dst1, uint8_t *dst2,
2483
                        long width, long height,
2484
                        long srcStride1, long srcStride2,
2485
                        long dstStride1, long dstStride2)
2486
{
2487
    long y,x,w,h;
2488
    w=width/2; h=height/2;
2489
#ifdef HAVE_MMX
2490
    asm volatile(
2491
        PREFETCH" %0\n\t"
2492
        PREFETCH" %1\n\t"
2493
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2494
#endif
2495
    for(y=0;y<h;y++){
2496
        const uint8_t* s1=src1+srcStride1*(y>>1);
2497
        uint8_t* d=dst1+dstStride1*y;
2498
        x=0;
2499
#ifdef HAVE_MMX
2500
        for(;x<w-31;x+=32)
2501
        {
2502
            asm volatile(
2503
                PREFETCH" 32%1\n\t"
2504
                "movq        %1, %%mm0\n\t"
2505
                "movq        8%1, %%mm2\n\t"
2506
                "movq        16%1, %%mm4\n\t"
2507
                "movq        24%1, %%mm6\n\t"
2508
                "movq        %%mm0, %%mm1\n\t"
2509
                "movq        %%mm2, %%mm3\n\t"
2510
                "movq        %%mm4, %%mm5\n\t"
2511
                "movq        %%mm6, %%mm7\n\t"
2512
                "punpcklbw %%mm0, %%mm0\n\t"
2513
                "punpckhbw %%mm1, %%mm1\n\t"
2514
                "punpcklbw %%mm2, %%mm2\n\t"
2515
                "punpckhbw %%mm3, %%mm3\n\t"
2516
                "punpcklbw %%mm4, %%mm4\n\t"
2517
                "punpckhbw %%mm5, %%mm5\n\t"
2518
                "punpcklbw %%mm6, %%mm6\n\t"
2519
                "punpckhbw %%mm7, %%mm7\n\t"
2520
                MOVNTQ"        %%mm0, %0\n\t"
2521
                MOVNTQ"        %%mm1, 8%0\n\t"
2522
                MOVNTQ"        %%mm2, 16%0\n\t"
2523
                MOVNTQ"        %%mm3, 24%0\n\t"
2524
                MOVNTQ"        %%mm4, 32%0\n\t"
2525
                MOVNTQ"        %%mm5, 40%0\n\t"
2526
                MOVNTQ"        %%mm6, 48%0\n\t"
2527
                MOVNTQ"        %%mm7, 56%0"
2528
                :"=m"(d[2*x])
2529
                :"m"(s1[x])
2530
                :"memory");
2531
        }
2532
#endif
2533
        for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2534
    }
2535
    for(y=0;y<h;y++){
2536
        const uint8_t* s2=src2+srcStride2*(y>>1);
2537
        uint8_t* d=dst2+dstStride2*y;
2538
        x=0;
2539
#ifdef HAVE_MMX
2540
        for(;x<w-31;x+=32)
2541
        {
2542
            asm volatile(
2543
                PREFETCH" 32%1\n\t"
2544
                "movq        %1, %%mm0\n\t"
2545
                "movq        8%1, %%mm2\n\t"
2546
                "movq        16%1, %%mm4\n\t"
2547
                "movq        24%1, %%mm6\n\t"
2548
                "movq        %%mm0, %%mm1\n\t"
2549
                "movq        %%mm2, %%mm3\n\t"
2550
                "movq        %%mm4, %%mm5\n\t"
2551
                "movq        %%mm6, %%mm7\n\t"
2552
                "punpcklbw %%mm0, %%mm0\n\t"
2553
                "punpckhbw %%mm1, %%mm1\n\t"
2554
                "punpcklbw %%mm2, %%mm2\n\t"
2555
                "punpckhbw %%mm3, %%mm3\n\t"
2556
                "punpcklbw %%mm4, %%mm4\n\t"
2557
                "punpckhbw %%mm5, %%mm5\n\t"
2558
                "punpcklbw %%mm6, %%mm6\n\t"
2559
                "punpckhbw %%mm7, %%mm7\n\t"
2560
                MOVNTQ"        %%mm0, %0\n\t"
2561
                MOVNTQ"        %%mm1, 8%0\n\t"
2562
                MOVNTQ"        %%mm2, 16%0\n\t"
2563
                MOVNTQ"        %%mm3, 24%0\n\t"
2564
                MOVNTQ"        %%mm4, 32%0\n\t"
2565
                MOVNTQ"        %%mm5, 40%0\n\t"
2566
                MOVNTQ"        %%mm6, 48%0\n\t"
2567
                MOVNTQ"        %%mm7, 56%0"
2568
                :"=m"(d[2*x])
2569
                :"m"(s2[x])
2570
                :"memory");
2571
        }
2572
#endif
2573
        for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2574
    }
2575
#ifdef HAVE_MMX
2576
        asm(
2577
                EMMS" \n\t"
2578
                SFENCE" \n\t"
2579
                ::: "memory"
2580
                );
2581
#endif
2582
}
2583

    
2584
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2585
                        uint8_t *dst,
2586
                        long width, long height,
2587
                        long srcStride1, long srcStride2,
2588
                        long srcStride3, long dstStride)
2589
{
2590
    long y,x,w,h;
2591
    w=width/2; h=height;
2592
    for(y=0;y<h;y++){
2593
        const uint8_t* yp=src1+srcStride1*y;
2594
        const uint8_t* up=src2+srcStride2*(y>>2);
2595
        const uint8_t* vp=src3+srcStride3*(y>>2);
2596
        uint8_t* d=dst+dstStride*y;
2597
        x=0;
2598
#ifdef HAVE_MMX
2599
        for(;x<w-7;x+=8)
2600
        {
2601
            asm volatile(
2602
                PREFETCH" 32(%1, %0)\n\t"
2603
                PREFETCH" 32(%2, %0)\n\t"
2604
                PREFETCH" 32(%3, %0)\n\t"
2605
                "movq        (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2606
                "movq        (%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
2607
                "movq        (%3, %0), %%mm2\n\t"             /* V0V1V2V3V4V5V6V7 */
2608
                "movq        %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2609
                "movq        %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
2610
                "movq        %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
2611
                "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2612
                "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2613
                "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2614
                "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2615

    
2616
                "movq        %%mm1, %%mm6\n\t"
2617
                "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2618
                "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2619
                "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2620
                MOVNTQ"        %%mm0, (%4, %0, 8)\n\t"
2621
                MOVNTQ"        %%mm3, 8(%4, %0, 8)\n\t"
2622
                
2623
                "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2624
                "movq        8(%1, %0, 4), %%mm0\n\t"
2625
                "movq        %%mm0, %%mm3\n\t"
2626
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2627
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2628
                MOVNTQ"        %%mm0, 16(%4, %0, 8)\n\t"
2629
                MOVNTQ"        %%mm3, 24(%4, %0, 8)\n\t"
2630

    
2631
                "movq        %%mm4, %%mm6\n\t"
2632
                "movq        16(%1, %0, 4), %%mm0\n\t"
2633
                "movq        %%mm0, %%mm3\n\t"
2634
                "punpcklbw %%mm5, %%mm4\n\t"
2635
                "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2636
                "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2637
                MOVNTQ"        %%mm0, 32(%4, %0, 8)\n\t"
2638
                MOVNTQ"        %%mm3, 40(%4, %0, 8)\n\t"
2639
                
2640
                "punpckhbw %%mm5, %%mm6\n\t"
2641
                "movq        24(%1, %0, 4), %%mm0\n\t"
2642
                "movq        %%mm0, %%mm3\n\t"
2643
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2644
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2645
                MOVNTQ"        %%mm0, 48(%4, %0, 8)\n\t"
2646
                MOVNTQ"        %%mm3, 56(%4, %0, 8)\n\t"
2647

    
2648
                : "+r" (x)
2649
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
2650
                :"memory");
2651
        }
2652
#endif
2653
        for(; x<w; x++)
2654
        {
2655
            const long x2= x<<2;
2656
            d[8*x+0]=yp[x2];
2657
            d[8*x+1]=up[x];
2658
            d[8*x+2]=yp[x2+1];
2659
            d[8*x+3]=vp[x];
2660
            d[8*x+4]=yp[x2+2];
2661
            d[8*x+5]=up[x];
2662
            d[8*x+6]=yp[x2+3];
2663
            d[8*x+7]=vp[x];
2664
        }
2665
    }
2666
#ifdef HAVE_MMX
2667
        asm(
2668
                EMMS" \n\t"
2669
                SFENCE" \n\t"
2670
                ::: "memory"
2671
                );
2672
#endif
2673
}