Statistics
| Branch: | Revision:

ffmpeg / postproc / rgb2rgb_template.c @ 4ee5599f

History | View | Annotate | Download (67 KB)

1
/*
2
 *
3
 *  rgb2rgb.c, Software RGB to RGB convertor
4
 *  pluralize by Software PAL8 to RGB convertor
5
 *               Software YUV to YUV convertor
6
 *               Software YUV to RGB convertor
7
 *  Written by Nick Kurshev.
8
 *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9
 *  lot of big-endian byteorder fixes by Alex Beregszaszi
10
 */
11

    
12
#include <stddef.h>
13
#include <inttypes.h> /* for __WORDSIZE */
14

    
15
#include "asmalign.h"
16

    
17
/*
 * Instruction-mnemonic selection for the inline assembly in this template.
 * Every helper macro is reset first (presumably because the template is
 * included more than once with different HAVE_* settings -- confirm against
 * the including file).
 */
#if !defined(__WORDSIZE)
// #warning You have misconfigured system and probably will lose performance!
#define __WORDSIZE MP_WORDSIZE
#endif

#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

/* Size in bytes of the multimedia register assumed by the code below. */
#if defined(HAVE_SSE2)
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/*
 * Prefetch and byte-average mnemonics.  Without 3DNow!/MMX2 the prefetch
 * macros expand to "#" (Apple) or "/nop" (elsewhere); NOTE(review): these
 * look like assembler comment starters that neutralise the rest of the
 * line -- confirm for the target assembler.  PAVGB is left undefined in
 * that case.
 */
#if defined(HAVE_3DNOW)
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif defined(HAVE_MMX2)
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#elif defined(__APPLE__)
#define PREFETCH  "#"
#define PREFETCHW "#"
#else
#define PREFETCH  "/nop"
#define PREFETCHW "/nop"
#endif

/* Instruction used to leave MMX state. */
#if !defined(HAVE_3DNOW)
#define EMMS "emms"
#else
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS "femms"
#endif

/* 64-bit store and store fence; plain movq and a no-op line without MMX2. */
#if defined(HAVE_MMX2)
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#elif defined(__APPLE__)
#define MOVNTQ "movq"
#define SFENCE "#"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
72

    
73
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
74
{
75
  uint8_t *dest = dst;
76
  const uint8_t *s = src;
77
  const uint8_t *end;
78
#ifdef HAVE_MMX
79
  const uint8_t *mm_end;
80
#endif
81
  end = s + src_size;
82
#ifdef HAVE_MMX
83
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
84
  mm_end = end - 23;
85
  __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
86
  while(s < mm_end)
87
  {
88
    __asm __volatile(
89
        PREFETCH"        32%1\n\t"
90
        "movd        %1, %%mm0\n\t"
91
        "punpckldq 3%1, %%mm0\n\t"
92
        "movd        6%1, %%mm1\n\t"
93
        "punpckldq 9%1, %%mm1\n\t"
94
        "movd        12%1, %%mm2\n\t"
95
        "punpckldq 15%1, %%mm2\n\t"
96
        "movd        18%1, %%mm3\n\t"
97
        "punpckldq 21%1, %%mm3\n\t"
98
        "pand        %%mm7, %%mm0\n\t"
99
        "pand        %%mm7, %%mm1\n\t"
100
        "pand        %%mm7, %%mm2\n\t"
101
        "pand        %%mm7, %%mm3\n\t"
102
        MOVNTQ"        %%mm0, %0\n\t"
103
        MOVNTQ"        %%mm1, 8%0\n\t"
104
        MOVNTQ"        %%mm2, 16%0\n\t"
105
        MOVNTQ"        %%mm3, 24%0"
106
        :"=m"(*dest)
107
        :"m"(*s)
108
        :"memory");
109
    dest += 32;
110
    s += 24;
111
  }
112
  __asm __volatile(SFENCE:::"memory");
113
  __asm __volatile(EMMS:::"memory");
114
#endif
115
  while(s < end)
116
  {
117
#ifdef WORDS_BIGENDIAN
118
    /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
119
    *dest++ = 0;
120
    *dest++ = s[2];
121
    *dest++ = s[1];
122
    *dest++ = s[0];
123
    s+=3;
124
#else
125
    *dest++ = *s++;
126
    *dest++ = *s++;
127
    *dest++ = *s++;
128
    *dest++ = 0;
129
#endif
130
  }
131
}
132

    
133
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
134
{
135
  uint8_t *dest = dst;
136
  const uint8_t *s = src;
137
  const uint8_t *end;
138
#ifdef HAVE_MMX
139
  const uint8_t *mm_end;
140
#endif
141
  end = s + src_size;
142
#ifdef HAVE_MMX
143
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
144
  mm_end = end - 31;
145
  while(s < mm_end)
146
  {
147
    __asm __volatile(
148
        PREFETCH"        32%1\n\t"
149
        "movq        %1, %%mm0\n\t"
150
        "movq        8%1, %%mm1\n\t"
151
        "movq        16%1, %%mm4\n\t"
152
        "movq        24%1, %%mm5\n\t"
153
        "movq        %%mm0, %%mm2\n\t"
154
        "movq        %%mm1, %%mm3\n\t"
155
        "movq        %%mm4, %%mm6\n\t"
156
        "movq        %%mm5, %%mm7\n\t"
157
        "psrlq        $8, %%mm2\n\t"
158
        "psrlq        $8, %%mm3\n\t"
159
        "psrlq        $8, %%mm6\n\t"
160
        "psrlq        $8, %%mm7\n\t"
161
        "pand        %2, %%mm0\n\t"
162
        "pand        %2, %%mm1\n\t"
163
        "pand        %2, %%mm4\n\t"
164
        "pand        %2, %%mm5\n\t"
165
        "pand        %3, %%mm2\n\t"
166
        "pand        %3, %%mm3\n\t"
167
        "pand        %3, %%mm6\n\t"
168
        "pand        %3, %%mm7\n\t"
169
        "por        %%mm2, %%mm0\n\t"
170
        "por        %%mm3, %%mm1\n\t"
171
        "por        %%mm6, %%mm4\n\t"
172
        "por        %%mm7, %%mm5\n\t"
173

    
174
        "movq        %%mm1, %%mm2\n\t"
175
        "movq        %%mm4, %%mm3\n\t"
176
        "psllq        $48, %%mm2\n\t"
177
        "psllq        $32, %%mm3\n\t"
178
        "pand        %4, %%mm2\n\t"
179
        "pand        %5, %%mm3\n\t"
180
        "por        %%mm2, %%mm0\n\t"
181
        "psrlq        $16, %%mm1\n\t"
182
        "psrlq        $32, %%mm4\n\t"
183
        "psllq        $16, %%mm5\n\t"
184
        "por        %%mm3, %%mm1\n\t"
185
        "pand        %6, %%mm5\n\t"
186
        "por        %%mm5, %%mm4\n\t"
187

    
188
        MOVNTQ"        %%mm0, %0\n\t"
189
        MOVNTQ"        %%mm1, 8%0\n\t"
190
        MOVNTQ"        %%mm4, 16%0"
191
        :"=m"(*dest)
192
        :"m"(*s),"m"(mask24l),
193
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
194
        :"memory");
195
    dest += 24;
196
    s += 32;
197
  }
198
  __asm __volatile(SFENCE:::"memory");
199
  __asm __volatile(EMMS:::"memory");
200
#endif
201
  while(s < end)
202
  {
203
#ifdef WORDS_BIGENDIAN
204
    /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
205
    s++;
206
    dest[2] = *s++;
207
    dest[1] = *s++;
208
    dest[0] = *s++;
209
    dest += 3;
210
#else
211
    *dest++ = *s++;
212
    *dest++ = *s++;
213
    *dest++ = *s++;
214
    s++;
215
#endif
216
  }
217
}
218

    
219
/*
220
 Original by Strepto/Astral
221
 ported to gcc & bugfixed : A'rpi
222
 MMX2, 3DNOW optimization by Nick Kurshev
223
 32bit c version, and and&add trick by Michael Niedermayer
224
*/
225
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
226
{
227
  register const uint8_t* s=src;
228
  register uint8_t* d=dst;
229
  register const uint8_t *end;
230
  const uint8_t *mm_end;
231
  end = s + src_size;
232
#ifdef HAVE_MMX
233
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
234
  __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
235
  mm_end = end - 15;
236
  while(s<mm_end)
237
  {
238
        __asm __volatile(
239
                PREFETCH"        32%1\n\t"
240
                "movq        %1, %%mm0\n\t"
241
                "movq        8%1, %%mm2\n\t"
242
                "movq        %%mm0, %%mm1\n\t"
243
                "movq        %%mm2, %%mm3\n\t"
244
                "pand        %%mm4, %%mm0\n\t"
245
                "pand        %%mm4, %%mm2\n\t"
246
                "paddw        %%mm1, %%mm0\n\t"
247
                "paddw        %%mm3, %%mm2\n\t"
248
                MOVNTQ"        %%mm0, %0\n\t"
249
                MOVNTQ"        %%mm2, 8%0"
250
                :"=m"(*d)
251
                :"m"(*s)
252
                );
253
        d+=16;
254
        s+=16;
255
  }
256
  __asm __volatile(SFENCE:::"memory");
257
  __asm __volatile(EMMS:::"memory");
258
#endif
259
    mm_end = end - 3;
260
    while(s < mm_end)
261
    {
262
        register unsigned x= *((uint32_t *)s);
263
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
264
        d+=4;
265
        s+=4;
266
    }
267
    if(s < end)
268
    {
269
        register unsigned short x= *((uint16_t *)s);
270
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
271
    }
272
}
273

    
274
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
275
{
276
  register const uint8_t* s=src;
277
  register uint8_t* d=dst;
278
  register const uint8_t *end;
279
  const uint8_t *mm_end;
280
  end = s + src_size;
281
#ifdef HAVE_MMX
282
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
283
  __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
284
  __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
285
  mm_end = end - 15;
286
  while(s<mm_end)
287
  {
288
        __asm __volatile(
289
                PREFETCH"        32%1\n\t"
290
                "movq        %1, %%mm0\n\t"
291
                "movq        8%1, %%mm2\n\t"
292
                "movq        %%mm0, %%mm1\n\t"
293
                "movq        %%mm2, %%mm3\n\t"
294
                "psrlq        $1, %%mm0\n\t"
295
                "psrlq        $1, %%mm2\n\t"
296
                "pand        %%mm7, %%mm0\n\t"
297
                "pand        %%mm7, %%mm2\n\t"
298
                "pand        %%mm6, %%mm1\n\t"
299
                "pand        %%mm6, %%mm3\n\t"
300
                "por        %%mm1, %%mm0\n\t"
301
                "por        %%mm3, %%mm2\n\t"
302
                MOVNTQ"        %%mm0, %0\n\t"
303
                MOVNTQ"        %%mm2, 8%0"
304
                :"=m"(*d)
305
                :"m"(*s)
306
                );
307
        d+=16;
308
        s+=16;
309
  }
310
  __asm __volatile(SFENCE:::"memory");
311
  __asm __volatile(EMMS:::"memory");
312
#endif
313
    mm_end = end - 3;
314
    while(s < mm_end)
315
    {
316
        register uint32_t x= *((uint32_t *)s);
317
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
318
        s+=4;
319
        d+=4;
320
    }
321
    if(s < end)
322
    {
323
        register uint16_t x= *((uint16_t *)s);
324
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
325
        s+=2;
326
        d+=2;
327
    }
328
}
329

    
330
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
331
{
332
        const uint8_t *s = src;
333
        const uint8_t *end;
334
#ifdef HAVE_MMX
335
        const uint8_t *mm_end;
336
#endif
337
        uint16_t *d = (uint16_t *)dst;
338
        end = s + src_size;
339
#ifdef HAVE_MMX
340
        mm_end = end - 15;
341
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
342
        asm volatile(
343
                "movq %3, %%mm5                        \n\t"
344
                "movq %4, %%mm6                        \n\t"
345
                "movq %5, %%mm7                        \n\t"
346
                ASMALIGN16
347
                "1:                                \n\t"
348
                PREFETCH" 32(%1)                \n\t"
349
                "movd        (%1), %%mm0                \n\t"
350
                "movd        4(%1), %%mm3                \n\t"
351
                "punpckldq 8(%1), %%mm0                \n\t"
352
                "punpckldq 12(%1), %%mm3        \n\t"
353
                "movq %%mm0, %%mm1                \n\t"
354
                "movq %%mm3, %%mm4                \n\t"
355
                "pand %%mm6, %%mm0                \n\t"
356
                "pand %%mm6, %%mm3                \n\t"
357
                "pmaddwd %%mm7, %%mm0                \n\t"
358
                "pmaddwd %%mm7, %%mm3                \n\t"
359
                "pand %%mm5, %%mm1                \n\t"
360
                "pand %%mm5, %%mm4                \n\t"
361
                "por %%mm1, %%mm0                \n\t"        
362
                "por %%mm4, %%mm3                \n\t"
363
                "psrld $5, %%mm0                \n\t"
364
                "pslld $11, %%mm3                \n\t"
365
                "por %%mm3, %%mm0                \n\t"
366
                MOVNTQ"        %%mm0, (%0)                \n\t"
367
                "add $16, %1                        \n\t"
368
                "add $8, %0                        \n\t"
369
                "cmp %2, %1                        \n\t"
370
                " jb 1b                                \n\t"
371
                : "+r" (d), "+r"(s)
372
                : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
373
        );
374
#else
375
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
376
        __asm __volatile(
377
            "movq        %0, %%mm7\n\t"
378
            "movq        %1, %%mm6\n\t"
379
            ::"m"(red_16mask),"m"(green_16mask));
380
        while(s < mm_end)
381
        {
382
            __asm __volatile(
383
                PREFETCH" 32%1\n\t"
384
                "movd        %1, %%mm0\n\t"
385
                "movd        4%1, %%mm3\n\t"
386
                "punpckldq 8%1, %%mm0\n\t"
387
                "punpckldq 12%1, %%mm3\n\t"
388
                "movq        %%mm0, %%mm1\n\t"
389
                "movq        %%mm0, %%mm2\n\t"
390
                "movq        %%mm3, %%mm4\n\t"
391
                "movq        %%mm3, %%mm5\n\t"
392
                "psrlq        $3, %%mm0\n\t"
393
                "psrlq        $3, %%mm3\n\t"
394
                "pand        %2, %%mm0\n\t"
395
                "pand        %2, %%mm3\n\t"
396
                "psrlq        $5, %%mm1\n\t"
397
                "psrlq        $5, %%mm4\n\t"
398
                "pand        %%mm6, %%mm1\n\t"
399
                "pand        %%mm6, %%mm4\n\t"
400
                "psrlq        $8, %%mm2\n\t"
401
                "psrlq        $8, %%mm5\n\t"
402
                "pand        %%mm7, %%mm2\n\t"
403
                "pand        %%mm7, %%mm5\n\t"
404
                "por        %%mm1, %%mm0\n\t"
405
                "por        %%mm4, %%mm3\n\t"
406
                "por        %%mm2, %%mm0\n\t"
407
                "por        %%mm5, %%mm3\n\t"
408
                "psllq        $16, %%mm3\n\t"
409
                "por        %%mm3, %%mm0\n\t"
410
                MOVNTQ"        %%mm0, %0\n\t"
411
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
412
                d += 4;
413
                s += 16;
414
        }
415
#endif
416
        __asm __volatile(SFENCE:::"memory");
417
        __asm __volatile(EMMS:::"memory");
418
#endif
419
        while(s < end)
420
        {
421
                register int rgb = *(uint32_t*)s; s += 4;
422
                *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
423
        }
424
}
425

    
426
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
427
{
428
        const uint8_t *s = src;
429
        const uint8_t *end;
430
#ifdef HAVE_MMX
431
        const uint8_t *mm_end;
432
#endif
433
        uint16_t *d = (uint16_t *)dst;
434
        end = s + src_size;
435
#ifdef HAVE_MMX
436
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
437
        __asm __volatile(
438
            "movq        %0, %%mm7\n\t"
439
            "movq        %1, %%mm6\n\t"
440
            ::"m"(red_16mask),"m"(green_16mask));
441
        mm_end = end - 15;
442
        while(s < mm_end)
443
        {
444
            __asm __volatile(
445
                PREFETCH" 32%1\n\t"
446
                "movd        %1, %%mm0\n\t"
447
                "movd        4%1, %%mm3\n\t"
448
                "punpckldq 8%1, %%mm0\n\t"
449
                "punpckldq 12%1, %%mm3\n\t"
450
                "movq        %%mm0, %%mm1\n\t"
451
                "movq        %%mm0, %%mm2\n\t"
452
                "movq        %%mm3, %%mm4\n\t"
453
                "movq        %%mm3, %%mm5\n\t"
454
                "psllq        $8, %%mm0\n\t"
455
                "psllq        $8, %%mm3\n\t"
456
                "pand        %%mm7, %%mm0\n\t"
457
                "pand        %%mm7, %%mm3\n\t"
458
                "psrlq        $5, %%mm1\n\t"
459
                "psrlq        $5, %%mm4\n\t"
460
                "pand        %%mm6, %%mm1\n\t"
461
                "pand        %%mm6, %%mm4\n\t"
462
                "psrlq        $19, %%mm2\n\t"
463
                "psrlq        $19, %%mm5\n\t"
464
                "pand        %2, %%mm2\n\t"
465
                "pand        %2, %%mm5\n\t"
466
                "por        %%mm1, %%mm0\n\t"
467
                "por        %%mm4, %%mm3\n\t"
468
                "por        %%mm2, %%mm0\n\t"
469
                "por        %%mm5, %%mm3\n\t"
470
                "psllq        $16, %%mm3\n\t"
471
                "por        %%mm3, %%mm0\n\t"
472
                MOVNTQ"        %%mm0, %0\n\t"
473
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
474
                d += 4;
475
                s += 16;
476
        }
477
        __asm __volatile(SFENCE:::"memory");
478
        __asm __volatile(EMMS:::"memory");
479
#endif
480
        while(s < end)
481
        {
482
                register int rgb = *(uint32_t*)s; s += 4;
483
                *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
484
        }
485
}
486

    
487
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
488
{
489
        const uint8_t *s = src;
490
        const uint8_t *end;
491
#ifdef HAVE_MMX
492
        const uint8_t *mm_end;
493
#endif
494
        uint16_t *d = (uint16_t *)dst;
495
        end = s + src_size;
496
#ifdef HAVE_MMX
497
        mm_end = end - 15;
498
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
499
        asm volatile(
500
                "movq %3, %%mm5                        \n\t"
501
                "movq %4, %%mm6                        \n\t"
502
                "movq %5, %%mm7                        \n\t"
503
                ASMALIGN16
504
                "1:                                \n\t"
505
                PREFETCH" 32(%1)                \n\t"
506
                "movd        (%1), %%mm0                \n\t"
507
                "movd        4(%1), %%mm3                \n\t"
508
                "punpckldq 8(%1), %%mm0                \n\t"
509
                "punpckldq 12(%1), %%mm3        \n\t"
510
                "movq %%mm0, %%mm1                \n\t"
511
                "movq %%mm3, %%mm4                \n\t"
512
                "pand %%mm6, %%mm0                \n\t"
513
                "pand %%mm6, %%mm3                \n\t"
514
                "pmaddwd %%mm7, %%mm0                \n\t"
515
                "pmaddwd %%mm7, %%mm3                \n\t"
516
                "pand %%mm5, %%mm1                \n\t"
517
                "pand %%mm5, %%mm4                \n\t"
518
                "por %%mm1, %%mm0                \n\t"        
519
                "por %%mm4, %%mm3                \n\t"
520
                "psrld $6, %%mm0                \n\t"
521
                "pslld $10, %%mm3                \n\t"
522
                "por %%mm3, %%mm0                \n\t"
523
                MOVNTQ"        %%mm0, (%0)                \n\t"
524
                "add $16, %1                        \n\t"
525
                "add $8, %0                        \n\t"
526
                "cmp %2, %1                        \n\t"
527
                " jb 1b                                \n\t"
528
                : "+r" (d), "+r"(s)
529
                : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
530
        );
531
#else
532
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
533
        __asm __volatile(
534
            "movq        %0, %%mm7\n\t"
535
            "movq        %1, %%mm6\n\t"
536
            ::"m"(red_15mask),"m"(green_15mask));
537
        while(s < mm_end)
538
        {
539
            __asm __volatile(
540
                PREFETCH" 32%1\n\t"
541
                "movd        %1, %%mm0\n\t"
542
                "movd        4%1, %%mm3\n\t"
543
                "punpckldq 8%1, %%mm0\n\t"
544
                "punpckldq 12%1, %%mm3\n\t"
545
                "movq        %%mm0, %%mm1\n\t"
546
                "movq        %%mm0, %%mm2\n\t"
547
                "movq        %%mm3, %%mm4\n\t"
548
                "movq        %%mm3, %%mm5\n\t"
549
                "psrlq        $3, %%mm0\n\t"
550
                "psrlq        $3, %%mm3\n\t"
551
                "pand        %2, %%mm0\n\t"
552
                "pand        %2, %%mm3\n\t"
553
                "psrlq        $6, %%mm1\n\t"
554
                "psrlq        $6, %%mm4\n\t"
555
                "pand        %%mm6, %%mm1\n\t"
556
                "pand        %%mm6, %%mm4\n\t"
557
                "psrlq        $9, %%mm2\n\t"
558
                "psrlq        $9, %%mm5\n\t"
559
                "pand        %%mm7, %%mm2\n\t"
560
                "pand        %%mm7, %%mm5\n\t"
561
                "por        %%mm1, %%mm0\n\t"
562
                "por        %%mm4, %%mm3\n\t"
563
                "por        %%mm2, %%mm0\n\t"
564
                "por        %%mm5, %%mm3\n\t"
565
                "psllq        $16, %%mm3\n\t"
566
                "por        %%mm3, %%mm0\n\t"
567
                MOVNTQ"        %%mm0, %0\n\t"
568
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
569
                d += 4;
570
                s += 16;
571
        }
572
#endif
573
        __asm __volatile(SFENCE:::"memory");
574
        __asm __volatile(EMMS:::"memory");
575
#endif
576
        while(s < end)
577
        {
578
                register int rgb = *(uint32_t*)s; s += 4;
579
                *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
580
        }
581
}
582

    
583
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
584
{
585
        const uint8_t *s = src;
586
        const uint8_t *end;
587
#ifdef HAVE_MMX
588
        const uint8_t *mm_end;
589
#endif
590
        uint16_t *d = (uint16_t *)dst;
591
        end = s + src_size;
592
#ifdef HAVE_MMX
593
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
594
        __asm __volatile(
595
            "movq        %0, %%mm7\n\t"
596
            "movq        %1, %%mm6\n\t"
597
            ::"m"(red_15mask),"m"(green_15mask));
598
        mm_end = end - 15;
599
        while(s < mm_end)
600
        {
601
            __asm __volatile(
602
                PREFETCH" 32%1\n\t"
603
                "movd        %1, %%mm0\n\t"
604
                "movd        4%1, %%mm3\n\t"
605
                "punpckldq 8%1, %%mm0\n\t"
606
                "punpckldq 12%1, %%mm3\n\t"
607
                "movq        %%mm0, %%mm1\n\t"
608
                "movq        %%mm0, %%mm2\n\t"
609
                "movq        %%mm3, %%mm4\n\t"
610
                "movq        %%mm3, %%mm5\n\t"
611
                "psllq        $7, %%mm0\n\t"
612
                "psllq        $7, %%mm3\n\t"
613
                "pand        %%mm7, %%mm0\n\t"
614
                "pand        %%mm7, %%mm3\n\t"
615
                "psrlq        $6, %%mm1\n\t"
616
                "psrlq        $6, %%mm4\n\t"
617
                "pand        %%mm6, %%mm1\n\t"
618
                "pand        %%mm6, %%mm4\n\t"
619
                "psrlq        $19, %%mm2\n\t"
620
                "psrlq        $19, %%mm5\n\t"
621
                "pand        %2, %%mm2\n\t"
622
                "pand        %2, %%mm5\n\t"
623
                "por        %%mm1, %%mm0\n\t"
624
                "por        %%mm4, %%mm3\n\t"
625
                "por        %%mm2, %%mm0\n\t"
626
                "por        %%mm5, %%mm3\n\t"
627
                "psllq        $16, %%mm3\n\t"
628
                "por        %%mm3, %%mm0\n\t"
629
                MOVNTQ"        %%mm0, %0\n\t"
630
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
631
                d += 4;
632
                s += 16;
633
        }
634
        __asm __volatile(SFENCE:::"memory");
635
        __asm __volatile(EMMS:::"memory");
636
#endif
637
        while(s < end)
638
        {
639
                register int rgb = *(uint32_t*)s; s += 4;
640
                *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
641
        }
642
}
643

    
644
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
645
{
646
        const uint8_t *s = src;
647
        const uint8_t *end;
648
#ifdef HAVE_MMX
649
        const uint8_t *mm_end;
650
#endif
651
        uint16_t *d = (uint16_t *)dst;
652
        end = s + src_size;
653
#ifdef HAVE_MMX
654
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
655
        __asm __volatile(
656
            "movq        %0, %%mm7\n\t"
657
            "movq        %1, %%mm6\n\t"
658
            ::"m"(red_16mask),"m"(green_16mask));
659
        mm_end = end - 11;
660
        while(s < mm_end)
661
        {
662
            __asm __volatile(
663
                PREFETCH" 32%1\n\t"
664
                "movd        %1, %%mm0\n\t"
665
                "movd        3%1, %%mm3\n\t"
666
                "punpckldq 6%1, %%mm0\n\t"
667
                "punpckldq 9%1, %%mm3\n\t"
668
                "movq        %%mm0, %%mm1\n\t"
669
                "movq        %%mm0, %%mm2\n\t"
670
                "movq        %%mm3, %%mm4\n\t"
671
                "movq        %%mm3, %%mm5\n\t"
672
                "psrlq        $3, %%mm0\n\t"
673
                "psrlq        $3, %%mm3\n\t"
674
                "pand        %2, %%mm0\n\t"
675
                "pand        %2, %%mm3\n\t"
676
                "psrlq        $5, %%mm1\n\t"
677
                "psrlq        $5, %%mm4\n\t"
678
                "pand        %%mm6, %%mm1\n\t"
679
                "pand        %%mm6, %%mm4\n\t"
680
                "psrlq        $8, %%mm2\n\t"
681
                "psrlq        $8, %%mm5\n\t"
682
                "pand        %%mm7, %%mm2\n\t"
683
                "pand        %%mm7, %%mm5\n\t"
684
                "por        %%mm1, %%mm0\n\t"
685
                "por        %%mm4, %%mm3\n\t"
686
                "por        %%mm2, %%mm0\n\t"
687
                "por        %%mm5, %%mm3\n\t"
688
                "psllq        $16, %%mm3\n\t"
689
                "por        %%mm3, %%mm0\n\t"
690
                MOVNTQ"        %%mm0, %0\n\t"
691
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
692
                d += 4;
693
                s += 12;
694
        }
695
        __asm __volatile(SFENCE:::"memory");
696
        __asm __volatile(EMMS:::"memory");
697
#endif
698
        while(s < end)
699
        {
700
                const int b= *s++;
701
                const int g= *s++;
702
                const int r= *s++;
703
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
704
        }
705
}
706

    
707
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
708
{
709
        const uint8_t *s = src;
710
        const uint8_t *end;
711
#ifdef HAVE_MMX
712
        const uint8_t *mm_end;
713
#endif
714
        uint16_t *d = (uint16_t *)dst;
715
        end = s + src_size;
716
#ifdef HAVE_MMX
717
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
718
        __asm __volatile(
719
            "movq        %0, %%mm7\n\t"
720
            "movq        %1, %%mm6\n\t"
721
            ::"m"(red_16mask),"m"(green_16mask));
722
        mm_end = end - 15;
723
        while(s < mm_end)
724
        {
725
            __asm __volatile(
726
                PREFETCH" 32%1\n\t"
727
                "movd        %1, %%mm0\n\t"
728
                "movd        3%1, %%mm3\n\t"
729
                "punpckldq 6%1, %%mm0\n\t"
730
                "punpckldq 9%1, %%mm3\n\t"
731
                "movq        %%mm0, %%mm1\n\t"
732
                "movq        %%mm0, %%mm2\n\t"
733
                "movq        %%mm3, %%mm4\n\t"
734
                "movq        %%mm3, %%mm5\n\t"
735
                "psllq        $8, %%mm0\n\t"
736
                "psllq        $8, %%mm3\n\t"
737
                "pand        %%mm7, %%mm0\n\t"
738
                "pand        %%mm7, %%mm3\n\t"
739
                "psrlq        $5, %%mm1\n\t"
740
                "psrlq        $5, %%mm4\n\t"
741
                "pand        %%mm6, %%mm1\n\t"
742
                "pand        %%mm6, %%mm4\n\t"
743
                "psrlq        $19, %%mm2\n\t"
744
                "psrlq        $19, %%mm5\n\t"
745
                "pand        %2, %%mm2\n\t"
746
                "pand        %2, %%mm5\n\t"
747
                "por        %%mm1, %%mm0\n\t"
748
                "por        %%mm4, %%mm3\n\t"
749
                "por        %%mm2, %%mm0\n\t"
750
                "por        %%mm5, %%mm3\n\t"
751
                "psllq        $16, %%mm3\n\t"
752
                "por        %%mm3, %%mm0\n\t"
753
                MOVNTQ"        %%mm0, %0\n\t"
754
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
755
                d += 4;
756
                s += 12;
757
        }
758
        __asm __volatile(SFENCE:::"memory");
759
        __asm __volatile(EMMS:::"memory");
760
#endif
761
        while(s < end)
762
        {
763
                const int r= *s++;
764
                const int g= *s++;
765
                const int b= *s++;
766
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
767
        }
768
}
769

    
770
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
771
{
772
        const uint8_t *s = src;
773
        const uint8_t *end;
774
#ifdef HAVE_MMX
775
        const uint8_t *mm_end;
776
#endif
777
        uint16_t *d = (uint16_t *)dst;
778
        end = s + src_size;
779
#ifdef HAVE_MMX
780
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
781
        __asm __volatile(
782
            "movq        %0, %%mm7\n\t"
783
            "movq        %1, %%mm6\n\t"
784
            ::"m"(red_15mask),"m"(green_15mask));
785
        mm_end = end - 11;
786
        while(s < mm_end)
787
        {
788
            __asm __volatile(
789
                PREFETCH" 32%1\n\t"
790
                "movd        %1, %%mm0\n\t"
791
                "movd        3%1, %%mm3\n\t"
792
                "punpckldq 6%1, %%mm0\n\t"
793
                "punpckldq 9%1, %%mm3\n\t"
794
                "movq        %%mm0, %%mm1\n\t"
795
                "movq        %%mm0, %%mm2\n\t"
796
                "movq        %%mm3, %%mm4\n\t"
797
                "movq        %%mm3, %%mm5\n\t"
798
                "psrlq        $3, %%mm0\n\t"
799
                "psrlq        $3, %%mm3\n\t"
800
                "pand        %2, %%mm0\n\t"
801
                "pand        %2, %%mm3\n\t"
802
                "psrlq        $6, %%mm1\n\t"
803
                "psrlq        $6, %%mm4\n\t"
804
                "pand        %%mm6, %%mm1\n\t"
805
                "pand        %%mm6, %%mm4\n\t"
806
                "psrlq        $9, %%mm2\n\t"
807
                "psrlq        $9, %%mm5\n\t"
808
                "pand        %%mm7, %%mm2\n\t"
809
                "pand        %%mm7, %%mm5\n\t"
810
                "por        %%mm1, %%mm0\n\t"
811
                "por        %%mm4, %%mm3\n\t"
812
                "por        %%mm2, %%mm0\n\t"
813
                "por        %%mm5, %%mm3\n\t"
814
                "psllq        $16, %%mm3\n\t"
815
                "por        %%mm3, %%mm0\n\t"
816
                MOVNTQ"        %%mm0, %0\n\t"
817
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
818
                d += 4;
819
                s += 12;
820
        }
821
        __asm __volatile(SFENCE:::"memory");
822
        __asm __volatile(EMMS:::"memory");
823
#endif
824
        while(s < end)
825
        {
826
                const int b= *s++;
827
                const int g= *s++;
828
                const int r= *s++;
829
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
830
        }
831
}
832

    
833
/**
 * Packs 24-bit pixels (three bytes each) into 15-bit pixels (one uint16_t each).
 *
 * For a source pixel with bytes p[0], p[1], p[2] the output word is
 *   (p[2] >> 3) | ((p[1] & 0xF8) << 2) | ((p[0] & 0xF8) << 7)
 * so the first source byte ends up in bits 10-14 and the third in bits 0-4.
 *
 * @param src      source buffer, 3 bytes per pixel
 * @param dst      destination buffer, written as uint16_t (2 bytes per pixel)
 * @param src_size size of src in bytes; a trailing partial pixel (src_size not
 *                 a multiple of 3) is ignored instead of being read past the
 *                 end of src
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end = s + src_size;
    uint16_t *d = (uint16_t *)dst;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
    /* mm7 and mm6 keep the red and green masks for the whole loop */
    __asm __volatile(
        "movq        %0, %%mm7\n\t"
        "movq        %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    /*
     * Four pixels (12 source bytes -> 8 destination bytes) per iteration.
     * "end - s > 15" is the same test as the former "s < end - 15", but it
     * never forms a pointer in front of the buffer when src_size < 15.
     */
    while (end - s > 15)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            /* mm0 = pixels 0 and 2, mm3 = pixels 1 and 3 (one per dword) */
            "movd        %1, %%mm0\n\t"
            "movd        3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq        %%mm0, %%mm1\n\t"
            "movq        %%mm0, %%mm2\n\t"
            "movq        %%mm3, %%mm4\n\t"
            "movq        %%mm3, %%mm5\n\t"
            /* first byte: shift up by 7, keep the red_15mask bits */
            "psllq        $7, %%mm0\n\t"
            "psllq        $7, %%mm3\n\t"
            "pand        %%mm7, %%mm0\n\t"
            "pand        %%mm7, %%mm3\n\t"
            /* second byte: shift down by 6, keep the green_15mask bits */
            "psrlq        $6, %%mm1\n\t"
            "psrlq        $6, %%mm4\n\t"
            "pand        %%mm6, %%mm1\n\t"
            "pand        %%mm6, %%mm4\n\t"
            /* third byte: shift down by 19, keep the blue_15mask bits */
            "psrlq        $19, %%mm2\n\t"
            "psrlq        $19, %%mm5\n\t"
            "pand        %2, %%mm2\n\t"
            "pand        %2, %%mm5\n\t"
            "por        %%mm1, %%mm0\n\t"
            "por        %%mm4, %%mm3\n\t"
            "por        %%mm2, %%mm0\n\t"
            "por        %%mm5, %%mm3\n\t"
            /* interleave: pixels 1 and 3 go into the upper word of each dword */
            "psllq        $16, %%mm3\n\t"
            "por        %%mm3, %%mm0\n\t"
            MOVNTQ"        %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path (and MMX remainder): whole pixels only. */
    while (end - s >= 3)
    {
        const int r = s[0];
        const int g = s[1];
        const int b = s[2];
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        s += 3;
    }
}
895

    
896
/*
  A less accurate approximation is used here: the input value is simply
  left-shifted and the low-order bits are filled with zeroes. This method
  improves PNG compression, but it cannot reproduce white exactly, since it
  does not generate an all-ones maximum value; the net effect is to darken
  the image slightly.

  The better method would be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      Leftmost Bits Repeated to Fill Open Bits
       |
   Original Bits
*/
919
/**
 * Expands 15-bit pixels (one uint16_t each) to 24-bit pixels (3 bytes each).
 *
 * For every source word w the three output bytes are, in order:
 *   (w & 0x1F) << 3,  (w & 0x3E0) >> 2,  (w & 0x7C00) >> 7
 * The low bits of each output byte are left at zero.
 *
 * @param src      source buffer, read as uint16_t
 * @param dst      destination buffer, 3 bytes written per source word
 * @param src_size size of src in bytes (src_size/2 words are converted)
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *s = (const uint16_t *)src;
    const uint16_t *const end = s + src_size/2;
    uint8_t *d = (uint8_t *)dst;
#ifdef HAVE_MMX
    const uint16_t *mm_end;

    __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
    /* 8 pixels per iteration: 16 bytes in, 24 bytes out */
    mm_end = end - 7;
    while (s < mm_end)
    {
        /*
         * Stage 1: widen 8 pixels to one dword each. Every field is isolated
         * with mask15b/g/r, shifted, widened from word to dword by
         * interleaving with mmx_null (presumably all zero -- confirm) and
         * OR-ed together. Pixels 0-3 are parked in mm6/mm7, pixels 4-7 stay
         * in mm0/mm3 for the next asm statement.
         */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq        %1, %%mm0\n\t"
            "movq        %1, %%mm1\n\t"
            "movq        %1, %%mm2\n\t"
            "pand        %2, %%mm0\n\t"
            "pand        %3, %%mm1\n\t"
            "pand        %4, %%mm2\n\t"
            "psllq        $3, %%mm0\n\t"
            "psrlq        $2, %%mm1\n\t"
            "psrlq        $7, %%mm2\n\t"
            "movq        %%mm0, %%mm3\n\t"
            "movq        %%mm1, %%mm4\n\t"
            "movq        %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq        $8, %%mm1\n\t"
            "psllq        $16, %%mm2\n\t"
            "por        %%mm1, %%mm0\n\t"
            "por        %%mm2, %%mm0\n\t"
            "psllq        $8, %%mm4\n\t"
            "psllq        $16, %%mm5\n\t"
            "por        %%mm4, %%mm3\n\t"
            "por        %%mm5, %%mm3\n\t"

            "movq        %%mm0, %%mm6\n\t"
            "movq        %%mm3, %%mm7\n\t"

            "movq        8%1, %%mm0\n\t"
            "movq        8%1, %%mm1\n\t"
            "movq        8%1, %%mm2\n\t"
            "pand        %2, %%mm0\n\t"
            "pand        %3, %%mm1\n\t"
            "pand        %4, %%mm2\n\t"
            "psllq        $3, %%mm0\n\t"
            "psrlq        $2, %%mm1\n\t"
            "psrlq        $7, %%mm2\n\t"
            "movq        %%mm0, %%mm3\n\t"
            "movq        %%mm1, %%mm4\n\t"
            "movq        %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq        $8, %%mm1\n\t"
            "psllq        $16, %%mm2\n\t"
            "por        %%mm1, %%mm0\n\t"
            "por        %%mm2, %%mm0\n\t"
            "psllq        $8, %%mm4\n\t"
            "psllq        $16, %%mm5\n\t"
            "por        %%mm4, %%mm3\n\t"
            "por        %%mm5, %%mm3\n\t"

            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /*
         * Stage 2 (borrowed from the 32 -> 24 conversion): squeeze the four
         * quadwords left in mm6, mm7, mm0, mm3 into 24 output bytes. This
         * statement relies on the MMX register contents of the previous one.
         */
        __asm __volatile(
            "movq        %%mm0, %%mm4\n\t"
            "movq        %%mm3, %%mm5\n\t"
            "movq        %%mm6, %%mm0\n\t"
            "movq        %%mm7, %%mm1\n\t"

            "movq        %%mm4, %%mm6\n\t"
            "movq        %%mm5, %%mm7\n\t"
            "movq        %%mm0, %%mm2\n\t"
            "movq        %%mm1, %%mm3\n\t"

            "psrlq        $8, %%mm2\n\t"
            "psrlq        $8, %%mm3\n\t"
            "psrlq        $8, %%mm6\n\t"
            "psrlq        $8, %%mm7\n\t"
            "pand        %2, %%mm0\n\t"
            "pand        %2, %%mm1\n\t"
            "pand        %2, %%mm4\n\t"
            "pand        %2, %%mm5\n\t"
            "pand        %3, %%mm2\n\t"
            "pand        %3, %%mm3\n\t"
            "pand        %3, %%mm6\n\t"
            "pand        %3, %%mm7\n\t"
            "por        %%mm2, %%mm0\n\t"
            "por        %%mm3, %%mm1\n\t"
            "por        %%mm6, %%mm4\n\t"
            "por        %%mm7, %%mm5\n\t"

            "movq        %%mm1, %%mm2\n\t"
            "movq        %%mm4, %%mm3\n\t"
            "psllq        $48, %%mm2\n\t"
            "psllq        $32, %%mm3\n\t"
            "pand        %4, %%mm2\n\t"
            "pand        %5, %%mm3\n\t"
            "por        %%mm2, %%mm0\n\t"
            "psrlq        $16, %%mm1\n\t"
            "psrlq        $32, %%mm4\n\t"
            "psllq        $16, %%mm5\n\t"
            "por        %%mm3, %%mm1\n\t"
            "pand        %6, %%mm5\n\t"
            "por        %%mm5, %%mm4\n\t"

            MOVNTQ"        %%mm0, %0\n\t"
            MOVNTQ"        %%mm1, 8%0\n\t"
            MOVNTQ"        %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path, also used for the pixels the MMX loop left over. */
    for (; s < end; s++, d += 3)
    {
        const unsigned px = *s;
        d[0] = (px & 0x1F) << 3;
        d[1] = (px & 0x3E0) >> 2;
        d[2] = (px & 0x7C00) >> 7;
    }
}
1060

    
1061
/**
 * Expands 16-bit pixels (one uint16_t each) to 24-bit pixels (3 bytes each).
 *
 * For every source word w the three output bytes are, in order:
 *   (w & 0x1F) << 3,  (w & 0x7E0) >> 3,  (w & 0xF800) >> 8
 * The low bits of each output byte are left at zero.
 *
 * @param src      source buffer, read as uint16_t
 * @param dst      destination buffer, 3 bytes written per source word
 * @param src_size size of src in bytes (src_size/2 words are converted)
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *s = (const uint16_t *)src;
    const uint16_t *const end = s + src_size/2;
    uint8_t *d = (uint8_t *)dst;
#ifdef HAVE_MMX
    const uint16_t *mm_end;

    __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
    /* 8 pixels per iteration: 16 bytes in, 24 bytes out */
    mm_end = end - 7;
    while (s < mm_end)
    {
        /*
         * Stage 1: widen 8 pixels to one dword each. Every field is isolated
         * with mask16b/g/r, shifted, widened from word to dword by
         * interleaving with mmx_null (presumably all zero -- confirm) and
         * OR-ed together. Pixels 0-3 are parked in mm6/mm7, pixels 4-7 stay
         * in mm0/mm3 for the next asm statement.
         */
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq        %1, %%mm0\n\t"
            "movq        %1, %%mm1\n\t"
            "movq        %1, %%mm2\n\t"
            "pand        %2, %%mm0\n\t"
            "pand        %3, %%mm1\n\t"
            "pand        %4, %%mm2\n\t"
            "psllq        $3, %%mm0\n\t"
            "psrlq        $3, %%mm1\n\t"
            "psrlq        $8, %%mm2\n\t"
            "movq        %%mm0, %%mm3\n\t"
            "movq        %%mm1, %%mm4\n\t"
            "movq        %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq        $8, %%mm1\n\t"
            "psllq        $16, %%mm2\n\t"
            "por        %%mm1, %%mm0\n\t"
            "por        %%mm2, %%mm0\n\t"
            "psllq        $8, %%mm4\n\t"
            "psllq        $16, %%mm5\n\t"
            "por        %%mm4, %%mm3\n\t"
            "por        %%mm5, %%mm3\n\t"

            "movq        %%mm0, %%mm6\n\t"
            "movq        %%mm3, %%mm7\n\t"

            "movq        8%1, %%mm0\n\t"
            "movq        8%1, %%mm1\n\t"
            "movq        8%1, %%mm2\n\t"
            "pand        %2, %%mm0\n\t"
            "pand        %3, %%mm1\n\t"
            "pand        %4, %%mm2\n\t"
            "psllq        $3, %%mm0\n\t"
            "psrlq        $3, %%mm1\n\t"
            "psrlq        $8, %%mm2\n\t"
            "movq        %%mm0, %%mm3\n\t"
            "movq        %%mm1, %%mm4\n\t"
            "movq        %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq        $8, %%mm1\n\t"
            "psllq        $16, %%mm2\n\t"
            "por        %%mm1, %%mm0\n\t"
            "por        %%mm2, %%mm0\n\t"
            "psllq        $8, %%mm4\n\t"
            "psllq        $16, %%mm5\n\t"
            "por        %%mm4, %%mm3\n\t"
            "por        %%mm5, %%mm3\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /*
         * Stage 2 (borrowed from the 32 -> 24 conversion): squeeze the four
         * quadwords left in mm6, mm7, mm0, mm3 into 24 output bytes. This
         * statement relies on the MMX register contents of the previous one.
         */
        __asm __volatile(
            "movq        %%mm0, %%mm4\n\t"
            "movq        %%mm3, %%mm5\n\t"
            "movq        %%mm6, %%mm0\n\t"
            "movq        %%mm7, %%mm1\n\t"

            "movq        %%mm4, %%mm6\n\t"
            "movq        %%mm5, %%mm7\n\t"
            "movq        %%mm0, %%mm2\n\t"
            "movq        %%mm1, %%mm3\n\t"

            "psrlq        $8, %%mm2\n\t"
            "psrlq        $8, %%mm3\n\t"
            "psrlq        $8, %%mm6\n\t"
            "psrlq        $8, %%mm7\n\t"
            "pand        %2, %%mm0\n\t"
            "pand        %2, %%mm1\n\t"
            "pand        %2, %%mm4\n\t"
            "pand        %2, %%mm5\n\t"
            "pand        %3, %%mm2\n\t"
            "pand        %3, %%mm3\n\t"
            "pand        %3, %%mm6\n\t"
            "pand        %3, %%mm7\n\t"
            "por        %%mm2, %%mm0\n\t"
            "por        %%mm3, %%mm1\n\t"
            "por        %%mm6, %%mm4\n\t"
            "por        %%mm7, %%mm5\n\t"

            "movq        %%mm1, %%mm2\n\t"
            "movq        %%mm4, %%mm3\n\t"
            "psllq        $48, %%mm2\n\t"
            "psllq        $32, %%mm3\n\t"
            "pand        %4, %%mm2\n\t"
            "pand        %5, %%mm3\n\t"
            "por        %%mm2, %%mm0\n\t"
            "psrlq        $16, %%mm1\n\t"
            "psrlq        $32, %%mm4\n\t"
            "psllq        $16, %%mm5\n\t"
            "por        %%mm3, %%mm1\n\t"
            "pand        %6, %%mm5\n\t"
            "por        %%mm5, %%mm4\n\t"

            MOVNTQ"        %%mm0, %0\n\t"
            MOVNTQ"        %%mm1, 8%0\n\t"
            MOVNTQ"        %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path, also used for the pixels the MMX loop left over. */
    for (; s < end; s++, d += 3)
    {
        const unsigned px = *s;
        d[0] = (px & 0x1F) << 3;
        d[1] = (px & 0x7E0) >> 3;
        d[2] = (px & 0xF800) >> 8;
    }
}
1201

    
1202
/**
 * Expands 15-bit pixels (one uint16_t each) to 32-bit pixels (4 bytes each).
 *
 * For every source word w the scalar path produces the bytes
 *   (w & 0x1F) << 3,  (w & 0x3E0) >> 2,  (w & 0x7C00) >> 7,  0
 * in that order, or in the reverse order when WORDS_BIGENDIAN is defined.
 * The low bits of each colour byte are left at zero.
 *
 * @param src      source buffer, read as uint16_t
 * @param dst      destination buffer, 4 bytes written per source word
 * @param src_size size of src in bytes (src_size/2 words are converted)
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
    /* mm7 = 0, used to widen words to dwords inside the loop */
    __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
    /* 4 pixels per iteration: 8 bytes in, 16 bytes out */
    mm_end = end - 3;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            /* three copies of the 4 pixels, one per field */
            "movq        %1, %%mm0\n\t"
            "movq        %1, %%mm1\n\t"
            "movq        %1, %%mm2\n\t"
            "pand        %2, %%mm0\n\t"
            "pand        %3, %%mm1\n\t"
            "pand        %4, %%mm2\n\t"
            "psllq        $3, %%mm0\n\t"
            "psrlq        $2, %%mm1\n\t"
            "psrlq        $7, %%mm2\n\t"
            "movq        %%mm0, %%mm3\n\t"
            "movq        %%mm1, %%mm4\n\t"
            "movq        %%mm2, %%mm5\n\t"
            /* widen: mm0-mm2 = pixels 0-1, mm3-mm5 = pixels 2-3 */
            "punpcklwd %%mm7, %%mm0\n\t"
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            /* move the 2nd and 3rd field to byte 1 and byte 2 and merge */
            "psllq        $8, %%mm1\n\t"
            "psllq        $16, %%mm2\n\t"
            "por        %%mm1, %%mm0\n\t"
            "por        %%mm2, %%mm0\n\t"
            "psllq        $8, %%mm4\n\t"
            "psllq        $16, %%mm5\n\t"
            "por        %%mm4, %%mm3\n\t"
            "por        %%mm5, %%mm3\n\t"
            MOVNTQ"        %%mm0, %0\n\t"
            MOVNTQ"        %%mm3, 8%0\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /*
     * Scalar path, also used for the pixels the MMX loop left over.
     * (A disabled single-store variant that relied on the cast-as-lvalue
     * extension used to live here; it was dead code and has been dropped.)
     */
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 0;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 0;
#endif
    }
}
1279

    
1280
/**
 * Expands 16-bit pixels (one uint16_t each) to 32-bit pixels (4 bytes each).
 *
 * For every source word w the scalar path produces the bytes
 *   (w & 0x1F) << 3,  (w & 0x7E0) >> 3,  (w & 0xF800) >> 8,  0
 * in that order, or in the reverse order when WORDS_BIGENDIAN is defined.
 * The low bits of each colour byte are left at zero.
 *
 * @param src      source buffer, read as uint16_t
 * @param dst      destination buffer, 4 bytes written per source word
 * @param src_size size of src in bytes (src_size/2 words are converted)
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *s = (const uint16_t *)src;
    const uint16_t *const end = s + src_size/2;
    uint8_t *d = (uint8_t *)dst;
#ifdef HAVE_MMX
    const uint16_t *mm_end;

    __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
    /* mm7 = 0, used to widen words to dwords inside the loop */
    __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
    /* 4 pixels per iteration: 8 bytes in, 16 bytes out */
    mm_end = end - 3;
    while (s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            /* three copies of the 4 pixels, one per field */
            "movq        %1, %%mm0\n\t"
            "movq        %1, %%mm1\n\t"
            "movq        %1, %%mm2\n\t"
            "pand        %2, %%mm0\n\t"
            "pand        %3, %%mm1\n\t"
            "pand        %4, %%mm2\n\t"
            "psllq        $3, %%mm0\n\t"
            "psrlq        $3, %%mm1\n\t"
            "psrlq        $8, %%mm2\n\t"
            "movq        %%mm0, %%mm3\n\t"
            "movq        %%mm1, %%mm4\n\t"
            "movq        %%mm2, %%mm5\n\t"
            /* widen: mm0-mm2 = pixels 0-1, mm3-mm5 = pixels 2-3 */
            "punpcklwd %%mm7, %%mm0\n\t"
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            /* move the 2nd and 3rd field to byte 1 and byte 2 and merge */
            "psllq        $8, %%mm1\n\t"
            "psllq        $16, %%mm2\n\t"
            "por        %%mm1, %%mm0\n\t"
            "por        %%mm2, %%mm0\n\t"
            "psllq        $8, %%mm4\n\t"
            "psllq        $16, %%mm5\n\t"
            "por        %%mm4, %%mm3\n\t"
            "por        %%mm5, %%mm3\n\t"
            MOVNTQ"        %%mm0, %0\n\t"
            MOVNTQ"        %%mm3, 8%0\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path, also used for the pixels the MMX loop left over. */
    for (; s < end; s++, d += 4)
    {
        const unsigned px = *s;
#ifdef WORDS_BIGENDIAN
        d[0] = 0;
        d[1] = (px & 0xF800) >> 8;
        d[2] = (px & 0x7E0) >> 3;
        d[3] = (px & 0x1F) << 3;
#else
        d[0] = (px & 0x1F) << 3;
        d[1] = (px & 0x7E0) >> 3;
        d[2] = (px & 0xF800) >> 8;
        d[3] = 0;
#endif
    }
}
1351

    
1352
/**
 * Swaps the first and third colour byte of every 32-bit pixel.
 *
 * Scalar path (little endian): dst[0] = src[2], dst[1] = src[1],
 * dst[2] = src[0]; byte 3 of the destination pixel is not written. With
 * WORDS_BIGENDIAN bytes 1..3 are swapped instead and byte 0 is not written.
 *
 * The MMX loop converts two pixels (8 bytes) per iteration. It is entered
 * only when at least 8 bytes are available, because its body always runs
 * once and its end test is an unsigned compare against src_size-7. Whatever
 * it leaves over (one pixel when src_size % 8 >= 4) is finished by the
 * scalar loop.
 * NOTE(review): in the MMX loop the fourth byte of each pixel is whatever the
 * mask32r/g/b constants let through, while the scalar loop leaves that byte
 * untouched -- confirm the intended handling against the mask definitions.
 *
 * @param src      source buffer, 4 bytes per pixel
 * @param dst      destination buffer, 4 bytes per pixel
 * @param src_size size of src in bytes; src_size >> 2 pixels are converted
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    long i = 0;
    const long num_pixels = src_size >> 2;
#ifdef HAVE_MMX
    if(src_size >= 8)
    {
/* TODO: unroll this loop */
        asm volatile (
            "xor %%"REG_a", %%"REG_a"        \n\t"
            ASMALIGN16
            "1:                                \n\t"
            PREFETCH" 32(%0, %%"REG_a")        \n\t"
            "movq (%0, %%"REG_a"), %%mm0        \n\t"
            "movq %%mm0, %%mm1                \n\t"
            "movq %%mm0, %%mm2                \n\t"
            "pslld $16, %%mm0                \n\t"
            "psrld $16, %%mm1                \n\t"
            "pand "MANGLE(mask32r)", %%mm0        \n\t"
            "pand "MANGLE(mask32g)", %%mm2        \n\t"
            "pand "MANGLE(mask32b)", %%mm1        \n\t"
            "por %%mm0, %%mm2                \n\t"
            "por %%mm1, %%mm2                \n\t"
            MOVNTQ" %%mm2, (%1, %%"REG_a")        \n\t"
            "add $8, %%"REG_a"                \n\t"
            "cmp %2, %%"REG_a"                \n\t"
            " jb 1b                                \n\t"
            :: "r" (src), "r"(dst), "r" (src_size-7)
            /* dst is written through a register operand, so the compiler
               has to be told that memory changes */
            : "%"REG_a, "memory"
        );

        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");

        /* the loop stops at the largest multiple of 8 that is <= src_size */
        i = (src_size & ~7L) >> 2;
    }
#endif
    for(; i<num_pixels; i++)
    {
#ifdef WORDS_BIGENDIAN
        dst[4*i + 1] = src[4*i + 3];
        dst[4*i + 2] = src[4*i + 2];
        dst[4*i + 3] = src[4*i + 1];
#else
        dst[4*i + 0] = src[4*i + 2];
        dst[4*i + 1] = src[4*i + 1];
        dst[4*i + 2] = src[4*i + 0];
#endif
    }
}
1398

    
1399
/**
 * Swaps the first and third byte of every 24-bit pixel
 * (dst[0] = src[2], dst[1] = src[1], dst[2] = src[0]).
 *
 * The scalar loop reads the third byte before it writes anything, so a pixel
 * is converted correctly even when src and dst are the same buffer.
 *
 * @param src      source buffer, 3 bytes per pixel
 * @param dst      destination buffer, 3 bytes per pixel
 * @param src_size size of src in bytes; a trailing partial pixel (src_size not
 *                 a multiple of 3) is ignored
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    long i;
#ifdef HAVE_MMX
    /*
     * The asm loop counts a negative index up towards zero: it starts at
     * 23 - src_size and handles 24 bytes (8 pixels) per pass while the index
     * is negative. Its body always executes once, so it must only be entered
     * when at least 24 bytes are available (index < 0); otherwise it would
     * read and write 24 bytes regardless of src_size.
     */
    long mmx_size= 23 - src_size;
    if(mmx_size < 0)
    {
        asm volatile (
            "movq "MANGLE(mask24r)", %%mm5        \n\t"
            "movq "MANGLE(mask24g)", %%mm6        \n\t"
            "movq "MANGLE(mask24b)", %%mm7        \n\t"
            ASMALIGN16
            "1:                                \n\t"
            PREFETCH" 32(%1, %%"REG_a")        \n\t"
            "movq   (%1, %%"REG_a"), %%mm0        \n\t" // BGR BGR BG
            "movq   (%1, %%"REG_a"), %%mm1        \n\t" // BGR BGR BG
            "movq  2(%1, %%"REG_a"), %%mm2        \n\t" // R BGR BGR B
            "psllq $16, %%mm0                \n\t" // 00 BGR BGR
            "pand %%mm5, %%mm0                \n\t"
            "pand %%mm6, %%mm1                \n\t"
            "pand %%mm7, %%mm2                \n\t"
            "por %%mm0, %%mm1                \n\t"
            "por %%mm2, %%mm1                \n\t"
            "movq  6(%1, %%"REG_a"), %%mm0        \n\t" // BGR BGR BG
            MOVNTQ" %%mm1,   (%2, %%"REG_a")\n\t" // RGB RGB RG
            "movq  8(%1, %%"REG_a"), %%mm1        \n\t" // R BGR BGR B
            "movq 10(%1, %%"REG_a"), %%mm2        \n\t" // GR BGR BGR
            "pand %%mm7, %%mm0                \n\t"
            "pand %%mm5, %%mm1                \n\t"
            "pand %%mm6, %%mm2                \n\t"
            "por %%mm0, %%mm1                \n\t"
            "por %%mm2, %%mm1                \n\t"
            "movq 14(%1, %%"REG_a"), %%mm0        \n\t" // R BGR BGR B
            MOVNTQ" %%mm1,  8(%2, %%"REG_a")\n\t" // B RGB RGB R
            "movq 16(%1, %%"REG_a"), %%mm1        \n\t" // GR BGR BGR
            /* NOTE(review): this 8-byte load covers offsets 18..25, i.e. it
               reads 2 bytes beyond the 24-byte group being converted */
            "movq 18(%1, %%"REG_a"), %%mm2        \n\t" // BGR BGR BG
            "pand %%mm6, %%mm0                \n\t"
            "pand %%mm7, %%mm1                \n\t"
            "pand %%mm5, %%mm2                \n\t"
            "por %%mm0, %%mm1                \n\t"
            "por %%mm2, %%mm1                \n\t"
            MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
            "add $24, %%"REG_a"                \n\t"
            " js 1b                                \n\t"
            : "+a" (mmx_size)
            : "r" (src-mmx_size), "r"(dst-mmx_size)
            /* dst is written through a register operand, so the compiler
               has to be told that memory changes */
            : "memory"
        );

        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");

        if(mmx_size==23) return; //finished, src_size was a multiple of 24

        /* 23 - mmx_size bytes are left; point src/dst at them */
        src+= src_size;
        dst+= src_size;
        src_size= 23-mmx_size;
        src-= src_size;
        dst-= src_size;
    }
#endif
    /* whole pixels only: never touch bytes past src_size */
    for(i=0; i+2<src_size; i+=3)
    {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
1465

    
1466
/**
 * Interleaves planar Y, U and V data into packed YUY2 (byte order
 * Y0 U0 Y1 V0 in memory), one output line per luma line.
 *
 * Each chroma sample is shared by two horizontally adjacent luma samples.
 * The chroma pointers advance by chromStride after every line y for which
 * (y & (vertLumPerChroma-1)) == vertLumPerChroma-1, so vertLumPerChroma is
 * presumably expected to be a power of two (confirm with callers).
 *
 * Per-path width granularity, as visible from the loops below: the MMX loop
 * converts 8 chroma samples (16 luma samples) per pass and always runs at
 * least once; the 64-bit C loop converts 2 chroma samples per pass; the
 * 32-bit C loop converts 1 chroma sample per pass.
 *
 * @param ysrc             luma plane
 * @param usrc             first chroma plane (U)
 * @param vsrc             second chroma plane (V)
 * @param dst              packed output, 2 bytes per luma sample
 * @param width            luma samples per line (chroma width is width>>1)
 * @param height           number of luma lines
 * @param lumStride        byte distance between luma lines
 * @param chromStride      byte distance between chroma lines
 * @param dstStride        byte distance between output lines
 * @param vertLumPerChroma number of luma lines sharing one chroma line
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        long width, long height,
        long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const long chromWidth= width>>1;
    for(y=0; y<height; y++)
    {
#ifdef HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
        asm volatile(
            "xor %%"REG_a", %%"REG_a"        \n\t"
            ASMALIGN16
            "1:                                \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2)        \n\t"
            PREFETCH" 32(%2, %%"REG_a")        \n\t"
            PREFETCH" 32(%3, %%"REG_a")        \n\t"
            "movq (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
            "movq %%mm0, %%mm2                \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
            "movq %%mm3, %%mm4                \n\t" // Y(0)
            "movq %%mm5, %%mm6                \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3                \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4                \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5                \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6                \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"

            "add $8, %%"REG_a"                \n\t"
            "cmp %4, %%"REG_a"                \n\t"
            " jb 1b                                \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            /* dst is written through a register operand, so the compiler
               has to be told that memory changes */
            : "%"REG_a, "memory"
        );
#else

#if defined ARCH_ALPHA && defined HAVE_MVI
/* Converts chroma pair n for two luma lines at once (this line and the next). */
#define pl2yuy2(n)                                        \
        y1 = yc[n];                                        \
        y2 = yc2[n];                                        \
        u = uc[n];                                        \
        v = vc[n];                                        \
        asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));        \
        asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));        \
        asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
        asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
        yuv1 = (u << 8) + (v << 24);                        \
        yuv2 = yuv1 + y2;                                \
        yuv1 += y1;                                        \
        qdst[n] = yuv1;                                        \
        qdst2[n] = yuv2;

        int i;
        uint64_t *qdst = (uint64_t *) dst;
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
        const uint32_t *yc = (uint32_t *) ysrc;
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
        for(i = 0; i < chromWidth; i += 8){
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
            asm("ldq $31,64(%0)" :: "r"(yc));
            asm("ldq $31,64(%0)" :: "r"(yc2));
            asm("ldq $31,64(%0)" :: "r"(uc));
            asm("ldq $31,64(%0)" :: "r"(vc));

            pl2yuy2(0);
            pl2yuy2(1);
            pl2yuy2(2);
            pl2yuy2(3);

            yc += 4;
            yc2 += 4;
            uc += 4;
            vc += 4;
            qdst += 4;
            qdst2 += 4;
        }
        /* two lines were written above: skip the second one here */
        y++;
        ysrc += lumStride;
        dst += dstStride;

#elif __WORDSIZE >= 64
        long i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            /*
             * The byte shifted by 24 is widened first: in plain int
             * arithmetic a value >= 128 would reach the sign bit and be
             * sign-extended into the upper half of the 64-bit word,
             * corrupting the second pixel pair.
             */
#ifdef WORDS_BIGENDIAN
            k = ((uint64_t)yc[0] << 24) + (uc[0] << 16) +
                (yc[1] << 8) + (vc[0] << 0);
            l = ((uint64_t)yc[2] << 24) + (uc[1] << 16) +
                (yc[3] << 8) + (vc[1] << 0);
            *ldst++ = (k << 32) + l;
#else
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + ((uint64_t)vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + ((uint64_t)vc[1] << 24);
            *ldst++ = k + (l << 32);
#endif
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        long i;
        uint32_t *idst = (uint32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i++){
            /* unsigned arithmetic: a byte >= 128 shifted by 24 does not fit in int */
#ifdef WORDS_BIGENDIAN
            *idst++ = ((uint32_t)yc[0] << 24)+ (uc[0] << 16) +
                (yc[1] << 8) + (vc[0] << 0);
#else
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + ((uint32_t)vc[0] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* advance chroma only after the last luma line that shares it */
        if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
1605

    
1606
/**
1607
 *
1608
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1609
 * problem for anyone then tell me, and ill fix it)
1610
 */
1611
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1612
        long width, long height,
1613
        long lumStride, long chromStride, long dstStride)
1614
{
1615
        //FIXME interpolate chroma
1616
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1617
}
1618

    
1619
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1620
        long width, long height,
1621
        long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1622
{
1623
        long y;
1624
        const long chromWidth= width>>1;
1625
        for(y=0; y<height; y++)
1626
        {
1627
#ifdef HAVE_MMX
1628
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1629
                asm volatile(
1630
                        "xor %%"REG_a", %%"REG_a"        \n\t"
1631
                        ASMALIGN16
1632
                        "1:                                \n\t"
1633
                        PREFETCH" 32(%1, %%"REG_a", 2)        \n\t"
1634
                        PREFETCH" 32(%2, %%"REG_a")        \n\t"
1635
                        PREFETCH" 32(%3, %%"REG_a")        \n\t"
1636
                        "movq (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
1637
                        "movq %%mm0, %%mm2                \n\t" // U(0)
1638
                        "movq (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
1639
                        "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
1640
                        "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)
1641

    
1642
                        "movq (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
1643
                        "movq 8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
1644
                        "movq %%mm0, %%mm4                \n\t" // Y(0)
1645
                        "movq %%mm2, %%mm6                \n\t" // Y(8)
1646
                        "punpcklbw %%mm3, %%mm0                \n\t" // YUYV YUYV(0)
1647
                        "punpckhbw %%mm3, %%mm4                \n\t" // YUYV YUYV(4)
1648
                        "punpcklbw %%mm5, %%mm2                \n\t" // YUYV YUYV(8)
1649
                        "punpckhbw %%mm5, %%mm6                \n\t" // YUYV YUYV(12)
1650

    
1651
                        MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1652
                        MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1653
                        MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1654
                        MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1655

    
1656
                        "add $8, %%"REG_a"                \n\t"
1657
                        "cmp %4, %%"REG_a"                \n\t"
1658
                        " jb 1b                                \n\t"
1659
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1660
                        : "%"REG_a
1661
                );
1662
#else
1663
//FIXME adapt the alpha asm code from yv12->yuy2
1664

    
1665
#if __WORDSIZE >= 64
1666
                int i;
1667
                uint64_t *ldst = (uint64_t *) dst;
1668
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1669
                for(i = 0; i < chromWidth; i += 2){
1670
                        uint64_t k, l;
1671
                        k = uc[0] + (yc[0] << 8) +
1672
                            (vc[0] << 16) + (yc[1] << 24);
1673
                        l = uc[1] + (yc[2] << 8) +
1674
                            (vc[1] << 16) + (yc[3] << 24);
1675
                        *ldst++ = k + (l << 32);
1676
                        yc += 4;
1677
                        uc += 2;
1678
                        vc += 2;
1679
                }
1680

    
1681
#else
1682
                int i, *idst = (int32_t *) dst;
1683
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1684
                for(i = 0; i < chromWidth; i++){
1685
#ifdef WORDS_BIGENDIAN
1686
                        *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1687
                            (vc[0] << 8) + (yc[1] << 0);
1688
#else
1689
                        *idst++ = uc[0] + (yc[0] << 8) +
1690
                            (vc[0] << 16) + (yc[1] << 24);
1691
#endif
1692
                        yc += 2;
1693
                        uc++;
1694
                        vc++;
1695
                }
1696
#endif
1697
#endif
1698
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1699
                {
1700
                        usrc += chromStride;
1701
                        vsrc += chromStride;
1702
                }
1703
                ysrc += lumStride;
1704
                dst += dstStride;
1705
        }
1706
#ifdef HAVE_MMX
1707
asm(    EMMS" \n\t"
1708
        SFENCE" \n\t"
1709
        :::"memory");
1710
#endif
1711
}
1712

    
1713
/**
1714
 *
1715
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1716
 * problem for anyone then tell me, and ill fix it)
1717
 */
1718
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1719
        long width, long height,
1720
        long lumStride, long chromStride, long dstStride)
1721
{
1722
        //FIXME interpolate chroma
1723
        RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1724
}
1725

    
1726
/**
1727
 *
1728
 * width should be a multiple of 16
1729
 */
1730
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1731
        long width, long height,
1732
        long lumStride, long chromStride, long dstStride)
1733
{
1734
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1735
}
1736

    
1737
/**
1738
 *
1739
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1740
 * problem for anyone then tell me, and ill fix it)
1741
 */
1742
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1743
        long width, long height,
1744
        long lumStride, long chromStride, long srcStride)
1745
{
1746
        long y;
1747
        const long chromWidth= width>>1;
1748
        for(y=0; y<height; y+=2)
1749
        {
1750
#ifdef HAVE_MMX
1751
                asm volatile(
1752
                        "xor %%"REG_a", %%"REG_a"        \n\t"
1753
                        "pcmpeqw %%mm7, %%mm7                \n\t"
1754
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1755
                        ASMALIGN16
1756
                        "1:                                \n\t"
1757
                        PREFETCH" 64(%0, %%"REG_a", 4)        \n\t"
1758
                        "movq (%0, %%"REG_a", 4), %%mm0        \n\t" // YUYV YUYV(0)
1759
                        "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1760
                        "movq %%mm0, %%mm2                \n\t" // YUYV YUYV(0)
1761
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(4)
1762
                        "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
1763
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
1764
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
1765
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
1766
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
1767
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)
1768

    
1769
                        MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1770

    
1771
                        "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1772
                        "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1773
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(8)
1774
                        "movq %%mm2, %%mm4                \n\t" // YUYV YUYV(12)
1775
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
1776
                        "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
1777
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
1778
                        "pand %%mm7, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
1779
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
1780
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)
1781

    
1782
                        MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
1783

    
1784
                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
1785
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
1786
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1787
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1788
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
1789
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
1790
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
1791
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)
1792

    
1793
                        MOVNTQ" %%mm0, (%3, %%"REG_a")        \n\t"
1794
                        MOVNTQ" %%mm2, (%2, %%"REG_a")        \n\t"
1795

    
1796
                        "add $8, %%"REG_a"                \n\t"
1797
                        "cmp %4, %%"REG_a"                \n\t"
1798
                        " jb 1b                                \n\t"
1799
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1800
                        : "memory", "%"REG_a
1801
                );
1802

    
1803
                ydst += lumStride;
1804
                src  += srcStride;
1805

    
1806
                asm volatile(
1807
                        "xor %%"REG_a", %%"REG_a"        \n\t"
1808
                        ASMALIGN16
1809
                        "1:                                \n\t"
1810
                        PREFETCH" 64(%0, %%"REG_a", 4)        \n\t"
1811
                        "movq (%0, %%"REG_a", 4), %%mm0        \n\t" // YUYV YUYV(0)
1812
                        "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1813
                        "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1814
                        "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1815
                        "pand %%mm7, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
1816
                        "pand %%mm7, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
1817
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
1818
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
1819
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
1820
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)
1821

    
1822
                        MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1823
                        MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1824

    
1825
                        "add $8, %%"REG_a"                \n\t"
1826
                        "cmp %4, %%"REG_a"                \n\t"
1827
                        " jb 1b                                \n\t"
1828

    
1829
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1830
                        : "memory", "%"REG_a
1831
                );
1832
#else
1833
                long i;
1834
                for(i=0; i<chromWidth; i++)
1835
                {
1836
                        ydst[2*i+0]         = src[4*i+0];
1837
                        udst[i]         = src[4*i+1];
1838
                        ydst[2*i+1]         = src[4*i+2];
1839
                        vdst[i]         = src[4*i+3];
1840
                }
1841
                ydst += lumStride;
1842
                src  += srcStride;
1843

    
1844
                for(i=0; i<chromWidth; i++)
1845
                {
1846
                        ydst[2*i+0]         = src[4*i+0];
1847
                        ydst[2*i+1]         = src[4*i+2];
1848
                }
1849
#endif
1850
                udst += chromStride;
1851
                vdst += chromStride;
1852
                ydst += lumStride;
1853
                src  += srcStride;
1854
        }
1855
#ifdef HAVE_MMX
1856
asm volatile(   EMMS" \n\t"
1857
                SFENCE" \n\t"
1858
                :::"memory");
1859
#endif
1860
}
1861

    
1862
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1863
        uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1864
        long width, long height, long lumStride, long chromStride)
1865
{
1866
        /* Y Plane */
1867
        memcpy(ydst, ysrc, width*height);
1868

    
1869
        /* XXX: implement upscaling for U,V */
1870
}
1871

    
1872
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1873
{
1874
        long x,y;
1875
        
1876
        dst[0]= src[0];
1877
        
1878
        // first line
1879
        for(x=0; x<srcWidth-1; x++){
1880
                dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1881
                dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1882
        }
1883
        dst[2*srcWidth-1]= src[srcWidth-1];
1884
        
1885
        dst+= dstStride;
1886

    
1887
        for(y=1; y<srcHeight; y++){
1888
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1889
                const long mmxSize= srcWidth&~15;
1890
                asm volatile(
1891
                        "mov %4, %%"REG_a"                \n\t"
1892
                        "1:                                \n\t"
1893
                        "movq (%0, %%"REG_a"), %%mm0        \n\t"
1894
                        "movq (%1, %%"REG_a"), %%mm1        \n\t"
1895
                        "movq 1(%0, %%"REG_a"), %%mm2        \n\t"
1896
                        "movq 1(%1, %%"REG_a"), %%mm3        \n\t"
1897
                        "movq -1(%0, %%"REG_a"), %%mm4        \n\t"
1898
                        "movq -1(%1, %%"REG_a"), %%mm5        \n\t"
1899
                        PAVGB" %%mm0, %%mm5                \n\t"
1900
                        PAVGB" %%mm0, %%mm3                \n\t"
1901
                        PAVGB" %%mm0, %%mm5                \n\t"
1902
                        PAVGB" %%mm0, %%mm3                \n\t"
1903
                        PAVGB" %%mm1, %%mm4                \n\t"
1904
                        PAVGB" %%mm1, %%mm2                \n\t"
1905
                        PAVGB" %%mm1, %%mm4                \n\t"
1906
                        PAVGB" %%mm1, %%mm2                \n\t"
1907
                        "movq %%mm5, %%mm7                \n\t"
1908
                        "movq %%mm4, %%mm6                \n\t"
1909
                        "punpcklbw %%mm3, %%mm5                \n\t"
1910
                        "punpckhbw %%mm3, %%mm7                \n\t"
1911
                        "punpcklbw %%mm2, %%mm4                \n\t"
1912
                        "punpckhbw %%mm2, %%mm6                \n\t"
1913
#if 1
1914
                        MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1915
                        MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1916
                        MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1917
                        MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1918
#else
1919
                        "movq %%mm5, (%2, %%"REG_a", 2)        \n\t"
1920
                        "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1921
                        "movq %%mm4, (%3, %%"REG_a", 2)        \n\t"
1922
                        "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1923
#endif
1924
                        "add $8, %%"REG_a"                \n\t"
1925
                        " js 1b                                \n\t"
1926
                        :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1927
                           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1928
                           "g" (-mmxSize)
1929
                        : "%"REG_a
1930

    
1931
                );
1932
#else
1933
                const long mmxSize=1;
1934
#endif
1935
                dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1936
                dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1937

    
1938
                for(x=mmxSize-1; x<srcWidth-1; x++){
1939
                        dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1940
                        dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1941
                        dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1942
                        dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1943
                }
1944
                dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1945
                dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1946

    
1947
                dst+=dstStride*2;
1948
                src+=srcStride;
1949
        }
1950
        
1951
        // last line
1952
#if 1
1953
        dst[0]= src[0];
1954
        
1955
        for(x=0; x<srcWidth-1; x++){
1956
                dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1957
                dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1958
        }
1959
        dst[2*srcWidth-1]= src[srcWidth-1];
1960
#else
1961
        for(x=0; x<srcWidth; x++){
1962
                dst[2*x+0]=
1963
                dst[2*x+1]= src[x];
1964
        }
1965
#endif
1966

    
1967
#ifdef HAVE_MMX
1968
asm volatile(   EMMS" \n\t"
1969
                SFENCE" \n\t"
1970
                :::"memory");
1971
#endif
1972
}
1973

    
1974
/**
1975
 *
1976
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1977
 * problem for anyone then tell me, and ill fix it)
1978
 * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
1979
 */
1980
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1981
        long width, long height,
1982
        long lumStride, long chromStride, long srcStride)
1983
{
1984
        long y;
1985
        const long chromWidth= width>>1;
1986
        for(y=0; y<height; y+=2)
1987
        {
1988
#ifdef HAVE_MMX
1989
                asm volatile(
1990
                        "xorl %%eax, %%eax                \n\t"
1991
                        "pcmpeqw %%mm7, %%mm7                \n\t"
1992
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1993
                        ASMALIGN16
1994
                        "1:                                \n\t"
1995
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
1996
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // UYVY UYVY(0)
1997
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(4)
1998
                        "movq %%mm0, %%mm2                \n\t" // UYVY UYVY(0)
1999
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(4)
2000
                        "pand %%mm7, %%mm0                \n\t" // U0V0 U0V0(0)
2001
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(4)
2002
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
2003
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
2004
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
2005
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)
2006

    
2007
                        MOVNTQ" %%mm2, (%1, %%eax, 2)        \n\t"
2008

    
2009
                        "movq 16(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(8)
2010
                        "movq 24(%0, %%eax, 4), %%mm2        \n\t" // UYVY UYVY(12)
2011
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(8)
2012
                        "movq %%mm2, %%mm4                \n\t" // UYVY UYVY(12)
2013
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(8)
2014
                        "pand %%mm7, %%mm2                \n\t" // U0V0 U0V0(12)
2015
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
2016
                        "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
2017
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
2018
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)
2019

    
2020
                        MOVNTQ" %%mm3, 8(%1, %%eax, 2)        \n\t"
2021

    
2022
                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
2023
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
2024
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
2025
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
2026
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
2027
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
2028
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
2029
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)
2030

    
2031
                        MOVNTQ" %%mm0, (%3, %%eax)        \n\t"
2032
                        MOVNTQ" %%mm2, (%2, %%eax)        \n\t"
2033

    
2034
                        "addl $8, %%eax                        \n\t"
2035
                        "cmpl %4, %%eax                        \n\t"
2036
                        " jb 1b                                \n\t"
2037
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2038
                        : "memory", "%eax"
2039
                );
2040

    
2041
                ydst += lumStride;
2042
                src  += srcStride;
2043

    
2044
                asm volatile(
2045
                        "xorl %%eax, %%eax                \n\t"
2046
                        ASMALIGN16
2047
                        "1:                                \n\t"
2048
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
2049
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
2050
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
2051
                        "movq 16(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(8)
2052
                        "movq 24(%0, %%eax, 4), %%mm3        \n\t" // YUYV YUYV(12)
2053
                        "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
2054
                        "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
2055
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
2056
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
2057
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
2058
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)
2059

    
2060
                        MOVNTQ" %%mm0, (%1, %%eax, 2)        \n\t"
2061
                        MOVNTQ" %%mm2, 8(%1, %%eax, 2)        \n\t"
2062

    
2063
                        "addl $8, %%eax                        \n\t"
2064
                        "cmpl %4, %%eax                        \n\t"
2065
                        " jb 1b                                \n\t"
2066

    
2067
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2068
                        : "memory", "%eax"
2069
                );
2070
#else
2071
                long i;
2072
                for(i=0; i<chromWidth; i++)
2073
                {
2074
                        udst[i]         = src[4*i+0];
2075
                        ydst[2*i+0]         = src[4*i+1];
2076
                        vdst[i]         = src[4*i+2];
2077
                        ydst[2*i+1]         = src[4*i+3];
2078
                }
2079
                ydst += lumStride;
2080
                src  += srcStride;
2081

    
2082
                for(i=0; i<chromWidth; i++)
2083
                {
2084
                        ydst[2*i+0]         = src[4*i+1];
2085
                        ydst[2*i+1]         = src[4*i+3];
2086
                }
2087
#endif
2088
                udst += chromStride;
2089
                vdst += chromStride;
2090
                ydst += lumStride;
2091
                src  += srcStride;
2092
        }
2093
#ifdef HAVE_MMX
2094
asm volatile(   EMMS" \n\t"
2095
                SFENCE" \n\t"
2096
                :::"memory");
2097
#endif
2098
}
2099

    
2100
/**
2101
 *
2102
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2103
 * problem for anyone then tell me, and ill fix it)
2104
 * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
2105
 */
2106
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2107
        long width, long height,
2108
        long lumStride, long chromStride, long srcStride)
2109
{
2110
        long y;
2111
        const long chromWidth= width>>1;
2112
#ifdef HAVE_MMX
2113
        for(y=0; y<height-2; y+=2)
2114
        {
2115
                long i;
2116
                for(i=0; i<2; i++)
2117
                {
2118
                        asm volatile(
2119
                                "mov %2, %%"REG_a"                \n\t"
2120
                                "movq "MANGLE(bgr2YCoeff)", %%mm6                \n\t"
2121
                                "movq "MANGLE(w1111)", %%mm5                \n\t"
2122
                                "pxor %%mm7, %%mm7                \n\t"
2123
                                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2124
                                ASMALIGN16
2125
                                "1:                                \n\t"
2126
                                PREFETCH" 64(%0, %%"REG_b")        \n\t"
2127
                                "movd (%0, %%"REG_b"), %%mm0        \n\t"
2128
                                "movd 3(%0, %%"REG_b"), %%mm1        \n\t"
2129
                                "punpcklbw %%mm7, %%mm0                \n\t"
2130
                                "punpcklbw %%mm7, %%mm1                \n\t"
2131
                                "movd 6(%0, %%"REG_b"), %%mm2        \n\t"
2132
                                "movd 9(%0, %%"REG_b"), %%mm3        \n\t"
2133
                                "punpcklbw %%mm7, %%mm2                \n\t"
2134
                                "punpcklbw %%mm7, %%mm3                \n\t"
2135
                                "pmaddwd %%mm6, %%mm0                \n\t"
2136
                                "pmaddwd %%mm6, %%mm1                \n\t"
2137
                                "pmaddwd %%mm6, %%mm2                \n\t"
2138
                                "pmaddwd %%mm6, %%mm3                \n\t"
2139
#ifndef FAST_BGR2YV12
2140
                                "psrad $8, %%mm0                \n\t"
2141
                                "psrad $8, %%mm1                \n\t"
2142
                                "psrad $8, %%mm2                \n\t"
2143
                                "psrad $8, %%mm3                \n\t"
2144
#endif
2145
                                "packssdw %%mm1, %%mm0                \n\t"
2146
                                "packssdw %%mm3, %%mm2                \n\t"
2147
                                "pmaddwd %%mm5, %%mm0                \n\t"
2148
                                "pmaddwd %%mm5, %%mm2                \n\t"
2149
                                "packssdw %%mm2, %%mm0                \n\t"
2150
                                "psraw $7, %%mm0                \n\t"
2151

    
2152
                                "movd 12(%0, %%"REG_b"), %%mm4        \n\t"
2153
                                "movd 15(%0, %%"REG_b"), %%mm1        \n\t"
2154
                                "punpcklbw %%mm7, %%mm4                \n\t"
2155
                                "punpcklbw %%mm7, %%mm1                \n\t"
2156
                                "movd 18(%0, %%"REG_b"), %%mm2        \n\t"
2157
                                "movd 21(%0, %%"REG_b"), %%mm3        \n\t"
2158
                                "punpcklbw %%mm7, %%mm2                \n\t"
2159
                                "punpcklbw %%mm7, %%mm3                \n\t"
2160
                                "pmaddwd %%mm6, %%mm4                \n\t"
2161
                                "pmaddwd %%mm6, %%mm1                \n\t"
2162
                                "pmaddwd %%mm6, %%mm2                \n\t"
2163
                                "pmaddwd %%mm6, %%mm3                \n\t"
2164
#ifndef FAST_BGR2YV12
2165
                                "psrad $8, %%mm4                \n\t"
2166
                                "psrad $8, %%mm1                \n\t"
2167
                                "psrad $8, %%mm2                \n\t"
2168
                                "psrad $8, %%mm3                \n\t"
2169
#endif
2170
                                "packssdw %%mm1, %%mm4                \n\t"
2171
                                "packssdw %%mm3, %%mm2                \n\t"
2172
                                "pmaddwd %%mm5, %%mm4                \n\t"
2173
                                "pmaddwd %%mm5, %%mm2                \n\t"
2174
                                "add $24, %%"REG_b"                \n\t"
2175
                                "packssdw %%mm2, %%mm4                \n\t"
2176
                                "psraw $7, %%mm4                \n\t"
2177

    
2178
                                "packuswb %%mm4, %%mm0                \n\t"
2179
                                "paddusb "MANGLE(bgr2YOffset)", %%mm0        \n\t"
2180

    
2181
                                MOVNTQ" %%mm0, (%1, %%"REG_a")        \n\t"
2182
                                "add $8, %%"REG_a"                \n\t"
2183
                                " js 1b                                \n\t"
2184
                                : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2185
                                : "%"REG_a, "%"REG_b
2186
                        );
2187
                        ydst += lumStride;
2188
                        src  += srcStride;
2189
                }
2190
                src -= srcStride*2;
2191
                asm volatile(
2192
                        "mov %4, %%"REG_a"                \n\t"
2193
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
2194
                        "movq "MANGLE(bgr2UCoeff)", %%mm6                \n\t"
2195
                        "pxor %%mm7, %%mm7                \n\t"
2196
                        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2197
                        "add %%"REG_b", %%"REG_b"        \n\t"
2198
                        ASMALIGN16
2199
                        "1:                                \n\t"
2200
                        PREFETCH" 64(%0, %%"REG_b")        \n\t"
2201
                        PREFETCH" 64(%1, %%"REG_b")        \n\t"
2202
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2203
                        "movq (%0, %%"REG_b"), %%mm0        \n\t"
2204
                        "movq (%1, %%"REG_b"), %%mm1        \n\t"
2205
                        "movq 6(%0, %%"REG_b"), %%mm2        \n\t"
2206
                        "movq 6(%1, %%"REG_b"), %%mm3        \n\t"
2207
                        PAVGB" %%mm1, %%mm0                \n\t"
2208
                        PAVGB" %%mm3, %%mm2                \n\t"
2209
                        "movq %%mm0, %%mm1                \n\t"
2210
                        "movq %%mm2, %%mm3                \n\t"
2211
                        "psrlq $24, %%mm0                \n\t"
2212
                        "psrlq $24, %%mm2                \n\t"
2213
                        PAVGB" %%mm1, %%mm0                \n\t"
2214
                        PAVGB" %%mm3, %%mm2                \n\t"
2215
                        "punpcklbw %%mm7, %%mm0                \n\t"
2216
                        "punpcklbw %%mm7, %%mm2                \n\t"
2217
#else
2218
                        "movd (%0, %%"REG_b"), %%mm0        \n\t"
2219
                        "movd (%1, %%"REG_b"), %%mm1        \n\t"
2220
                        "movd 3(%0, %%"REG_b"), %%mm2        \n\t"
2221
                        "movd 3(%1, %%"REG_b"), %%mm3        \n\t"
2222
                        "punpcklbw %%mm7, %%mm0                \n\t"
2223
                        "punpcklbw %%mm7, %%mm1                \n\t"
2224
                        "punpcklbw %%mm7, %%mm2                \n\t"
2225
                        "punpcklbw %%mm7, %%mm3                \n\t"
2226
                        "paddw %%mm1, %%mm0                \n\t"
2227
                        "paddw %%mm3, %%mm2                \n\t"
2228
                        "paddw %%mm2, %%mm0                \n\t"
2229
                        "movd 6(%0, %%"REG_b"), %%mm4        \n\t"
2230
                        "movd 6(%1, %%"REG_b"), %%mm1        \n\t"
2231
                        "movd 9(%0, %%"REG_b"), %%mm2        \n\t"
2232
                        "movd 9(%1, %%"REG_b"), %%mm3        \n\t"
2233
                        "punpcklbw %%mm7, %%mm4                \n\t"
2234
                        "punpcklbw %%mm7, %%mm1                \n\t"
2235
                        "punpcklbw %%mm7, %%mm2                \n\t"
2236
                        "punpcklbw %%mm7, %%mm3                \n\t"
2237
                        "paddw %%mm1, %%mm4                \n\t"
2238
                        "paddw %%mm3, %%mm2                \n\t"
2239
                        "paddw %%mm4, %%mm2                \n\t"
2240
                        "psrlw $2, %%mm0                \n\t"
2241
                        "psrlw $2, %%mm2                \n\t"
2242
#endif
2243
                        "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
2244
                        "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
2245

    
2246
                        "pmaddwd %%mm0, %%mm1                \n\t"
2247
                        "pmaddwd %%mm2, %%mm3                \n\t"
2248
                        "pmaddwd %%mm6, %%mm0                \n\t"
2249
                        "pmaddwd %%mm6, %%mm2                \n\t"
2250
#ifndef FAST_BGR2YV12
2251
                        "psrad $8, %%mm0                \n\t"
2252
                        "psrad $8, %%mm1                \n\t"
2253
                        "psrad $8, %%mm2                \n\t"
2254
                        "psrad $8, %%mm3                \n\t"
2255
#endif
2256
                        "packssdw %%mm2, %%mm0                \n\t"
2257
                        "packssdw %%mm3, %%mm1                \n\t"
2258
                        "pmaddwd %%mm5, %%mm0                \n\t"
2259
                        "pmaddwd %%mm5, %%mm1                \n\t"
2260
                        "packssdw %%mm1, %%mm0                \n\t" // V1 V0 U1 U0
2261
                        "psraw $7, %%mm0                \n\t"
2262

    
2263
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2264
                        "movq 12(%0, %%"REG_b"), %%mm4        \n\t"
2265
                        "movq 12(%1, %%"REG_b"), %%mm1        \n\t"
2266
                        "movq 18(%0, %%"REG_b"), %%mm2        \n\t"
2267
                        "movq 18(%1, %%"REG_b"), %%mm3        \n\t"
2268
                        PAVGB" %%mm1, %%mm4                \n\t"
2269
                        PAVGB" %%mm3, %%mm2                \n\t"
2270
                        "movq %%mm4, %%mm1                \n\t"
2271
                        "movq %%mm2, %%mm3                \n\t"
2272
                        "psrlq $24, %%mm4                \n\t"
2273
                        "psrlq $24, %%mm2                \n\t"
2274
                        PAVGB" %%mm1, %%mm4                \n\t"
2275
                        PAVGB" %%mm3, %%mm2                \n\t"
2276
                        "punpcklbw %%mm7, %%mm4                \n\t"
2277
                        "punpcklbw %%mm7, %%mm2                \n\t"
2278
#else
2279
                        "movd 12(%0, %%"REG_b"), %%mm4        \n\t"
2280
                        "movd 12(%1, %%"REG_b"), %%mm1        \n\t"
2281
                        "movd 15(%0, %%"REG_b"), %%mm2        \n\t"
2282
                        "movd 15(%1, %%"REG_b"), %%mm3        \n\t"
2283
                        "punpcklbw %%mm7, %%mm4                \n\t"
2284
                        "punpcklbw %%mm7, %%mm1                \n\t"
2285
                        "punpcklbw %%mm7, %%mm2                \n\t"
2286
                        "punpcklbw %%mm7, %%mm3                \n\t"
2287
                        "paddw %%mm1, %%mm4                \n\t"
2288
                        "paddw %%mm3, %%mm2                \n\t"
2289
                        "paddw %%mm2, %%mm4                \n\t"
2290
                        "movd 18(%0, %%"REG_b"), %%mm5        \n\t"
2291
                        "movd 18(%1, %%"REG_b"), %%mm1        \n\t"
2292
                        "movd 21(%0, %%"REG_b"), %%mm2        \n\t"
2293
                        "movd 21(%1, %%"REG_b"), %%mm3        \n\t"
2294
                        "punpcklbw %%mm7, %%mm5                \n\t"
2295
                        "punpcklbw %%mm7, %%mm1                \n\t"
2296
                        "punpcklbw %%mm7, %%mm2                \n\t"
2297
                        "punpcklbw %%mm7, %%mm3                \n\t"
2298
                        "paddw %%mm1, %%mm5                \n\t"
2299
                        "paddw %%mm3, %%mm2                \n\t"
2300
                        "paddw %%mm5, %%mm2                \n\t"
2301
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
2302
                        "psrlw $2, %%mm4                \n\t"
2303
                        "psrlw $2, %%mm2                \n\t"
2304
#endif
2305
                        "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
2306
                        "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
2307

    
2308
                        "pmaddwd %%mm4, %%mm1                \n\t"
2309
                        "pmaddwd %%mm2, %%mm3                \n\t"
2310
                        "pmaddwd %%mm6, %%mm4                \n\t"
2311
                        "pmaddwd %%mm6, %%mm2                \n\t"
2312
#ifndef FAST_BGR2YV12
2313
                        "psrad $8, %%mm4                \n\t"
2314
                        "psrad $8, %%mm1                \n\t"
2315
                        "psrad $8, %%mm2                \n\t"
2316
                        "psrad $8, %%mm3                \n\t"
2317
#endif
2318
                        "packssdw %%mm2, %%mm4                \n\t"
2319
                        "packssdw %%mm3, %%mm1                \n\t"
2320
                        "pmaddwd %%mm5, %%mm4                \n\t"
2321
                        "pmaddwd %%mm5, %%mm1                \n\t"
2322
                        "add $24, %%"REG_b"                \n\t"
2323
                        "packssdw %%mm1, %%mm4                \n\t" // V3 V2 U3 U2
2324
                        "psraw $7, %%mm4                \n\t"
2325

    
2326
                        "movq %%mm0, %%mm1                \n\t"
2327
                        "punpckldq %%mm4, %%mm0                \n\t"
2328
                        "punpckhdq %%mm4, %%mm1                \n\t"
2329
                        "packsswb %%mm1, %%mm0                \n\t"
2330
                        "paddb "MANGLE(bgr2UVOffset)", %%mm0        \n\t"
2331
                        "movd %%mm0, (%2, %%"REG_a")        \n\t"
2332
                        "punpckhdq %%mm0, %%mm0                \n\t"
2333
                        "movd %%mm0, (%3, %%"REG_a")        \n\t"
2334
                        "add $4, %%"REG_a"                \n\t"
2335
                        " js 1b                                \n\t"
2336
                        : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2337
                        : "%"REG_a, "%"REG_b
2338
                );
2339

    
2340
                udst += chromStride;
2341
                vdst += chromStride;
2342
                src  += srcStride*2;
2343
        }
2344

    
2345
        asm volatile(   EMMS" \n\t"
2346
                        SFENCE" \n\t"
2347
                        :::"memory");
2348
#else
2349
        y=0;
2350
#endif
2351
        for(; y<height; y+=2)
2352
        {
2353
                long i;
2354
                for(i=0; i<chromWidth; i++)
2355
                {
2356
                        unsigned int b= src[6*i+0];
2357
                        unsigned int g= src[6*i+1];
2358
                        unsigned int r= src[6*i+2];
2359

    
2360
                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2361
                        unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2362
                        unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2363

    
2364
                        udst[i]         = U;
2365
                        vdst[i]         = V;
2366
                        ydst[2*i]         = Y;
2367

    
2368
                        b= src[6*i+3];
2369
                        g= src[6*i+4];
2370
                        r= src[6*i+5];
2371

    
2372
                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2373
                        ydst[2*i+1]         = Y;
2374
                }
2375
                ydst += lumStride;
2376
                src  += srcStride;
2377

    
2378
                for(i=0; i<chromWidth; i++)
2379
                {
2380
                        unsigned int b= src[6*i+0];
2381
                        unsigned int g= src[6*i+1];
2382
                        unsigned int r= src[6*i+2];
2383

    
2384
                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2385

    
2386
                        ydst[2*i]         = Y;
2387

    
2388
                        b= src[6*i+3];
2389
                        g= src[6*i+4];
2390
                        r= src[6*i+5];
2391

    
2392
                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2393
                        ydst[2*i+1]         = Y;
2394
                }
2395
                udst += chromStride;
2396
                vdst += chromStride;
2397
                ydst += lumStride;
2398
                src  += srcStride;
2399
        }
2400
}
2401

    
2402
/**
 * Interleaves two byte planes into one.
 *
 * For each of the height rows, writes dest[2*i] = src1[i] and
 * dest[2*i+1] = src2[i] for i in [0, width).
 *
 * @param src1       first input plane (lands on even destination bytes)
 * @param src2       second input plane (lands on odd destination bytes)
 * @param dest       output plane, 2*width bytes are written per row
 * @param width      number of bytes taken from each input per row
 * @param height     number of rows
 * @param src1Stride byte distance between consecutive rows of src1
 * @param src2Stride byte distance between consecutive rows of src2
 * @param dstStride  byte distance between consecutive rows of dest
 */
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
                            long width, long height, long src1Stride,
                            long src2Stride, long dstStride){
        long h;

        for(h=0; h < height; h++)
        {
                long w;

#ifdef HAVE_MMX
                /* Both asm loops handle one 16-byte group before testing the
                   exit condition, and that test is an unsigned compare against
                   width-15. Rows shorter than 16 bytes must therefore bypass
                   the asm entirely and be handled by the scalar tail below. */
                if(width >= 16)
                {
#ifdef HAVE_SSE2
                        /* movdqa and movntdq fault on addresses that are not
                           16-byte aligned; use them only when all three row
                           pointers qualify, otherwise take the MMX loop. */
                        if(!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest)) & 15))
                        {
                                asm(
                                        "xor %%"REG_a", %%"REG_a"        \n\t"
                                        "1:                                \n\t"
                                        PREFETCH" 64(%1, %%"REG_a")        \n\t"
                                        PREFETCH" 64(%2, %%"REG_a")        \n\t"
                                        "movdqa (%1, %%"REG_a"), %%xmm0        \n\t"
                                        "movdqa (%1, %%"REG_a"), %%xmm1        \n\t"
                                        "movdqa (%2, %%"REG_a"), %%xmm2        \n\t"
                                        "punpcklbw %%xmm2, %%xmm0        \n\t"
                                        "punpckhbw %%xmm2, %%xmm1        \n\t"
                                        "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
                                        "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
                                        "add $16, %%"REG_a"                \n\t"
                                        "cmp %3, %%"REG_a"                \n\t"
                                        " jb 1b                                \n\t"
                                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
                                        : "memory", "%"REG_a
                                );
                        }
                        else
#endif
                        {
                                asm(
                                        "xor %%"REG_a", %%"REG_a"        \n\t"
                                        "1:                                \n\t"
                                        PREFETCH" 64(%1, %%"REG_a")        \n\t"
                                        PREFETCH" 64(%2, %%"REG_a")        \n\t"
                                        "movq (%1, %%"REG_a"), %%mm0        \n\t"
                                        "movq 8(%1, %%"REG_a"), %%mm2        \n\t"
                                        "movq %%mm0, %%mm1                \n\t"
                                        "movq %%mm2, %%mm3                \n\t"
                                        "movq (%2, %%"REG_a"), %%mm4        \n\t"
                                        "movq 8(%2, %%"REG_a"), %%mm5        \n\t"
                                        "punpcklbw %%mm4, %%mm0                \n\t"
                                        "punpckhbw %%mm4, %%mm1                \n\t"
                                        "punpcklbw %%mm5, %%mm2                \n\t"
                                        "punpckhbw %%mm5, %%mm3                \n\t"
                                        MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
                                        MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
                                        MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
                                        MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
                                        "add $16, %%"REG_a"                \n\t"
                                        "cmp %3, %%"REG_a"                \n\t"
                                        " jb 1b                                \n\t"
                                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
                                        : "memory", "%"REG_a
                                );
                        }
                }
                /* scalar tail: the bytes past the last complete 16-byte group
                   (the whole row when width < 16) */
                for(w= (width&(~15)); w < width; w++)
                {
                        dest[2*w+0] = src1[w];
                        dest[2*w+1] = src2[w];
                }
#else
                for(w=0; w < width; w++)
                {
                        dest[2*w+0] = src1[w];
                        dest[2*w+1] = src2[w];
                }
#endif
                dest += dstStride;
                src1 += src1Stride;
                src2 += src2Stride;
        }
#ifdef HAVE_MMX
        /* leave MMX state and order the non-temporal stores issued above */
        asm(
                EMMS" \n\t"
                SFENCE" \n\t"
                ::: "memory"
                );
#endif
}
2482

    
2483
/**
 * Doubles one plane in both directions by sample replication.
 *
 * Output row y is built from source row y>>1; every source byte s[x] is
 * written to d[2*x] and d[2*x+1].
 *
 * @param src       input plane
 * @param dst       output plane, 2*w bytes are written per row
 * @param w         number of source bytes consumed per row
 * @param h         number of output rows
 * @param srcStride byte distance between consecutive rows of src
 * @param dstStride byte distance between consecutive rows of dst
 */
static inline void RENAME(vu9_double_plane)(const uint8_t *src, uint8_t *dst,
                        long w, long h,
                        long srcStride, long dstStride)
{
    long x,y;
    for(y=0;y<h;y++){
        const uint8_t* s=src+srcStride*(y>>1);
        uint8_t* d=dst+dstStride*y;
        x=0;
#ifdef HAVE_MMX
        /* 32 source bytes -> 64 destination bytes per iteration.
           Base and index are passed in registers so that every displacement
           is spelled out in the template; prefixing a number to an "m"
           operand (e.g. 8%1) only assembles to the intended address when the
           compiler happens to print that operand without a displacement. */
        for(;x<w-31;x+=32)
        {
            asm volatile(
                PREFETCH" 32(%1, %2)\n\t"
                "movq        (%1, %2), %%mm0\n\t"
                "movq        8(%1, %2), %%mm2\n\t"
                "movq        16(%1, %2), %%mm4\n\t"
                "movq        24(%1, %2), %%mm6\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm2, %%mm3\n\t"
                "movq        %%mm4, %%mm5\n\t"
                "movq        %%mm6, %%mm7\n\t"
                "punpcklbw %%mm0, %%mm0\n\t"
                "punpckhbw %%mm1, %%mm1\n\t"
                "punpcklbw %%mm2, %%mm2\n\t"
                "punpckhbw %%mm3, %%mm3\n\t"
                "punpcklbw %%mm4, %%mm4\n\t"
                "punpckhbw %%mm5, %%mm5\n\t"
                "punpcklbw %%mm6, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm7\n\t"
                MOVNTQ"        %%mm0, (%0, %2, 2)\n\t"
                MOVNTQ"        %%mm1, 8(%0, %2, 2)\n\t"
                MOVNTQ"        %%mm2, 16(%0, %2, 2)\n\t"
                MOVNTQ"        %%mm3, 24(%0, %2, 2)\n\t"
                MOVNTQ"        %%mm4, 32(%0, %2, 2)\n\t"
                MOVNTQ"        %%mm5, 40(%0, %2, 2)\n\t"
                MOVNTQ"        %%mm6, 48(%0, %2, 2)\n\t"
                MOVNTQ"        %%mm7, 56(%0, %2, 2)"
                :: "r"(d), "r"(s), "r"(x)
                :"memory");
        }
#endif
        /* scalar path: whole row without MMX, remainder otherwise */
        for(;x<w;x++) d[2*x]=d[2*x+1]=s[x];
    }
}

/**
 * Upsamples two planes by a factor of two horizontally and vertically.
 *
 * Each plane is processed independently: height/2 output rows are produced,
 * each holding 2*(width/2) bytes, with output row y taken from source row
 * y>>1 and every source byte duplicated.
 *
 * @param src1       first input plane
 * @param src2       second input plane
 * @param dst1       output for src1
 * @param dst2       output for src2
 * @param width      twice the number of source bytes consumed per row
 * @param height     twice the number of output rows
 * @param srcStride1 row stride of src1 in bytes
 * @param srcStride2 row stride of src2 in bytes
 * @param dstStride1 row stride of dst1 in bytes
 * @param dstStride2 row stride of dst2 in bytes
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                        uint8_t *dst1, uint8_t *dst2,
                        long width, long height,
                        long srcStride1, long srcStride2,
                        long dstStride1, long dstStride2)
{
    long w,h;
    w=width/2; h=height/2;
#ifdef HAVE_MMX
    /* warm the cache with the second source row of each plane */
    asm volatile(
        PREFETCH" %0\n\t"
        PREFETCH" %1\n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    RENAME(vu9_double_plane)(src1, dst1, w, h, srcStride1, dstStride1);
    RENAME(vu9_double_plane)(src2, dst2, w, h, srcStride2, dstStride2);
#ifdef HAVE_MMX
        /* leave MMX state and order the non-temporal stores issued above */
        asm(
                EMMS" \n\t"
                SFENCE" \n\t"
                ::: "memory"
                );
#endif
}
2585

    
2586
/**
 * Packs three planes (src1 = luma, src2 = U, src3 = V) into one interleaved
 * Y U Y V byte stream.
 *
 * For every output row y the luma row y is used, while the chroma rows are
 * taken from row y>>2. Within a row, each chroma pair (U[x], V[x]) is
 * combined with four consecutive luma bytes into eight output bytes:
 *     Y[4x] U[x] Y[4x+1] V[x] Y[4x+2] U[x] Y[4x+3] V[x]
 * for x in [0, width/2).
 *
 * NOTE(review): a row therefore consumes 4*(width/2) luma bytes and emits
 * 8*(width/2) output bytes -- confirm against callers that width is given
 * in the unit this implies.
 *
 * @param src1       luma plane
 * @param src2       U plane
 * @param src3       V plane
 * @param dst        packed output
 * @param width      twice the number of chroma samples processed per row
 * @param height     number of output rows
 * @param srcStride1 row stride of src1 in bytes
 * @param srcStride2 row stride of src2 in bytes
 * @param srcStride3 row stride of src3 in bytes
 * @param dstStride  row stride of dst in bytes
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                        uint8_t *dst,
                        long width, long height,
                        long srcStride1, long srcStride2,
                        long srcStride3, long dstStride)
{
    const long chromaCount = width/2;
    long row;

    for(row=0; row<height; row++){
        const long chromaRow = row>>2;
        const uint8_t* yp = src1 + srcStride1*row;
        const uint8_t* up = src2 + srcStride2*chromaRow;
        const uint8_t* vp = src3 + srcStride3*chromaRow;
        uint8_t* d = dst + dstStride*row;
        long x = 0;

#ifdef HAVE_MMX
        /* 8 chroma pairs + 32 luma bytes -> 64 output bytes per iteration */
        while(x < chromaCount-7)
        {
            asm volatile(
                PREFETCH" 32(%1, %0)\n\t"
                PREFETCH" 32(%2, %0)\n\t"
                PREFETCH" 32(%3, %0)\n\t"
                "movq        (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq        (%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
                "movq        (%3, %0), %%mm2\n\t"             /* V0V1V2V3V4V5V6V7 */
                "movq        %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq        %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
                "movq        %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */

                /* chroma pairs 0 and 1 with luma bytes 0..7 */
                "movq        %%mm1, %%mm6\n\t"
                "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ"        %%mm0, (%4, %0, 8)\n\t"
                MOVNTQ"        %%mm3, 8(%4, %0, 8)\n\t"

                /* chroma pairs 2 and 3 with luma bytes 8..15 */
                "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq        8(%1, %0, 4), %%mm0\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ"        %%mm0, 16(%4, %0, 8)\n\t"
                MOVNTQ"        %%mm3, 24(%4, %0, 8)\n\t"

                /* chroma pairs 4 and 5 with luma bytes 16..23 */
                "movq        %%mm4, %%mm6\n\t"
                "movq        16(%1, %0, 4), %%mm0\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "punpcklbw %%mm5, %%mm4\n\t"
                "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ"        %%mm0, 32(%4, %0, 8)\n\t"
                MOVNTQ"        %%mm3, 40(%4, %0, 8)\n\t"

                /* chroma pairs 6 and 7 with luma bytes 24..31 */
                "punpckhbw %%mm5, %%mm6\n\t"
                "movq        24(%1, %0, 4), %%mm0\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ"        %%mm0, 48(%4, %0, 8)\n\t"
                MOVNTQ"        %%mm3, 56(%4, %0, 8)\n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
            x += 8;
        }
#endif
        /* scalar path: whole row without MMX, remainder otherwise */
        while(x < chromaCount)
        {
            const uint8_t* luma = yp + (x<<2);
            uint8_t* out = d + 8*x;
            const uint8_t u = up[x];
            const uint8_t v = vp[x];

            out[0] = luma[0];
            out[1] = u;
            out[2] = luma[1];
            out[3] = v;
            out[4] = luma[2];
            out[5] = u;
            out[6] = luma[3];
            out[7] = v;
            x++;
        }
    }
#ifdef HAVE_MMX
        /* leave MMX state and order the non-temporal stores issued above */
        asm(
                EMMS" \n\t"
                SFENCE" \n\t"
                ::: "memory"
                );
#endif
}