/*
 *
 *  rgb2rgb.c, Software RGB to RGB convertor
 *  pluralize by Software PAL8 to RGB convertor
 *               Software YUV to YUV convertor
 *               Software YUV to RGB convertor
 *  Written by Nick Kurshev.
 *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
 */

#include <stddef.h>
#include <inttypes.h> /* for __WORDSIZE */

#ifndef __WORDSIZE
// #warning You have a misconfigured system and will probably lose performance!
#define __WORDSIZE MP_WORDSIZE
#endif

#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif

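/*
 * RENAME() and the HAVE_MMX / HAVE_MMX2 / HAVE_3DNOW macros are supplied by
 * the file that #includes this template (rgb2rgb.c); nothing here is compiled
 * on its own. A rough sketch of the expected instantiation, shown only to
 * illustrate how the _C and _MMX variants of the converters below come into
 * being (the exact suffixes are an assumption of this note):
 *
 *   #undef  HAVE_MMX
 *   #define RENAME(a) a ## _C
 *   #include "rgb2rgb_template.c"   // plain C versions
 *
 *   #define HAVE_MMX
 *   #undef  RENAME
 *   #define RENAME(a) a ## _MMX
 *   #include "rgb2rgb_template.c"   // MMX versions
 *
 * so RENAME(rgb24to32) below becomes rgb24to32_C, rgb24to32_MMX, and so on.
 */
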
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
62
{
63
  uint8_t *dest = dst;
64
  const uint8_t *s = src;
65
  const uint8_t *end;
66
#ifdef HAVE_MMX
67
  const uint8_t *mm_end;
68
#endif
69
  end = s + src_size;
70
#ifdef HAVE_MMX
71
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
72
  mm_end = end - 23;
73
  __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
74
  while(s < mm_end)
75
  {
76
    __asm __volatile(
77
        PREFETCH"        32%1\n\t"
78
        "movd        %1, %%mm0\n\t"
79
        "punpckldq 3%1, %%mm0\n\t"
80
        "movd        6%1, %%mm1\n\t"
81
        "punpckldq 9%1, %%mm1\n\t"
82
        "movd        12%1, %%mm2\n\t"
83
        "punpckldq 15%1, %%mm2\n\t"
84
        "movd        18%1, %%mm3\n\t"
85
        "punpckldq 21%1, %%mm3\n\t"
86
        "pand        %%mm7, %%mm0\n\t"
87
        "pand        %%mm7, %%mm1\n\t"
88
        "pand        %%mm7, %%mm2\n\t"
89
        "pand        %%mm7, %%mm3\n\t"
90
        MOVNTQ"        %%mm0, %0\n\t"
91
        MOVNTQ"        %%mm1, 8%0\n\t"
92
        MOVNTQ"        %%mm2, 16%0\n\t"
93
        MOVNTQ"        %%mm3, 24%0"
94
        :"=m"(*dest)
95
        :"m"(*s)
96
        :"memory");
97
    dest += 32;
98
    s += 24;
99
  }
100
  __asm __volatile(SFENCE:::"memory");
101
  __asm __volatile(EMMS:::"memory");
102
#endif
103
  while(s < end)
104
  {
105
    *dest++ = *s++;
106
    *dest++ = *s++;
107
    *dest++ = *s++;
108
    *dest++ = 0;
109
  }
110
}
111

    
112
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
113
{
114
  uint8_t *dest = dst;
115
  const uint8_t *s = src;
116
  const uint8_t *end;
117
#ifdef HAVE_MMX
118
  const uint8_t *mm_end;
119
#endif
120
  end = s + src_size;
121
#ifdef HAVE_MMX
122
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
123
  mm_end = end - 31;
124
  while(s < mm_end)
125
  {
126
    __asm __volatile(
127
        PREFETCH"        32%1\n\t"
128
        "movq        %1, %%mm0\n\t"
129
        "movq        8%1, %%mm1\n\t"
130
        "movq        16%1, %%mm4\n\t"
131
        "movq        24%1, %%mm5\n\t"
132
        "movq        %%mm0, %%mm2\n\t"
133
        "movq        %%mm1, %%mm3\n\t"
134
        "movq        %%mm4, %%mm6\n\t"
135
        "movq        %%mm5, %%mm7\n\t"
136
        "psrlq        $8, %%mm2\n\t"
137
        "psrlq        $8, %%mm3\n\t"
138
        "psrlq        $8, %%mm6\n\t"
139
        "psrlq        $8, %%mm7\n\t"
140
        "pand        %2, %%mm0\n\t"
141
        "pand        %2, %%mm1\n\t"
142
        "pand        %2, %%mm4\n\t"
143
        "pand        %2, %%mm5\n\t"
144
        "pand        %3, %%mm2\n\t"
145
        "pand        %3, %%mm3\n\t"
146
        "pand        %3, %%mm6\n\t"
147
        "pand        %3, %%mm7\n\t"
148
        "por        %%mm2, %%mm0\n\t"
149
        "por        %%mm3, %%mm1\n\t"
150
        "por        %%mm6, %%mm4\n\t"
151
        "por        %%mm7, %%mm5\n\t"
152

    
153
        "movq        %%mm1, %%mm2\n\t"
154
        "movq        %%mm4, %%mm3\n\t"
155
        "psllq        $48, %%mm2\n\t"
156
        "psllq        $32, %%mm3\n\t"
157
        "pand        %4, %%mm2\n\t"
158
        "pand        %5, %%mm3\n\t"
159
        "por        %%mm2, %%mm0\n\t"
160
        "psrlq        $16, %%mm1\n\t"
161
        "psrlq        $32, %%mm4\n\t"
162
        "psllq        $16, %%mm5\n\t"
163
        "por        %%mm3, %%mm1\n\t"
164
        "pand        %6, %%mm5\n\t"
165
        "por        %%mm5, %%mm4\n\t"
166

    
167
        MOVNTQ"        %%mm0, %0\n\t"
168
        MOVNTQ"        %%mm1, 8%0\n\t"
169
        MOVNTQ"        %%mm4, 16%0"
170
        :"=m"(*dest)
171
        :"m"(*s),"m"(mask24l),
172
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
173
        :"memory");
174
    dest += 24;
175
    s += 32;
176
  }
177
  __asm __volatile(SFENCE:::"memory");
178
  __asm __volatile(EMMS:::"memory");
179
#endif
180
  while(s < end)
181
  {
182
    *dest++ = *s++;
183
    *dest++ = *s++;
184
    *dest++ = *s++;
185
    s++;
186
  }
187
}
188

    
189
/*
190
 Original by Strepto/Astral
191
 ported to gcc & bugfixed : A'rpi
192
 MMX2, 3DNOW optimization by Nick Kurshev
193
 32bit c version, and and&add trick by Michael Niedermayer
194
*/
195
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
196
{
197
  register const uint8_t* s=src;
198
  register uint8_t* d=dst;
199
  register const uint8_t *end;
200
  const uint8_t *mm_end;
201
  end = s + src_size;
202
#ifdef HAVE_MMX
203
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
204
  __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
205
  mm_end = end - 15;
206
  while(s<mm_end)
207
  {
208
        __asm __volatile(
209
                PREFETCH"        32%1\n\t"
210
                "movq        %1, %%mm0\n\t"
211
                "movq        8%1, %%mm2\n\t"
212
                "movq        %%mm0, %%mm1\n\t"
213
                "movq        %%mm2, %%mm3\n\t"
214
                "pand        %%mm4, %%mm0\n\t"
215
                "pand        %%mm4, %%mm2\n\t"
216
                "paddw        %%mm1, %%mm0\n\t"
217
                "paddw        %%mm3, %%mm2\n\t"
218
                MOVNTQ"        %%mm0, %0\n\t"
219
                MOVNTQ"        %%mm2, 8%0"
220
                :"=m"(*d)
221
                :"m"(*s)
222
                );
223
        d+=16;
224
        s+=16;
225
  }
226
  __asm __volatile(SFENCE:::"memory");
227
  __asm __volatile(EMMS:::"memory");
228
#endif
229
    mm_end = end - 3;
230
    while(s < mm_end)
231
    {
232
        register unsigned x= *((uint32_t *)s);
233
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
234
        d+=4;
235
        s+=4;
236
    }
237
    if(s < end)
238
    {
239
        register unsigned short x= *((uint16_t *)s);
240
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
241
    }
242
}
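
/*
 * The and&add trick used above, spelled out for one pixel. Illustrative
 * sketch only (nothing above calls it): adding the red+green field of an
 * RGB555 word back onto the whole word doubles that field in place, which
 * shifts red and green up by one bit while blue stays put; that is exactly
 * the RGB555 -> RGB565 layout, with green gaining a zero as its new low bit.
 */
#if 0
static uint16_t rgb15to16_one_pixel(uint16_t x)
{
        /* (x & 0x7FFF) keeps the whole 15-bit pixel, (x & 0x7FE0) selects the
           red+green fields; their sum moves red+green up by one bit. */
        return (uint16_t)((x & 0x7FFF) + (x & 0x7FE0));
}
#endif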
243

    
244
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
245
{
246
  register const uint8_t* s=src;
247
  register uint8_t* d=dst;
248
  register const uint8_t *end;
249
  const uint8_t *mm_end;
250
  end = s + src_size;
251
#ifdef HAVE_MMX
252
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
253
  __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
254
  __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
255
  mm_end = end - 15;
256
  while(s<mm_end)
257
  {
258
        __asm __volatile(
259
                PREFETCH"        32%1\n\t"
260
                "movq        %1, %%mm0\n\t"
261
                "movq        8%1, %%mm2\n\t"
262
                "movq        %%mm0, %%mm1\n\t"
263
                "movq        %%mm2, %%mm3\n\t"
264
                "psrlq        $1, %%mm0\n\t"
265
                "psrlq        $1, %%mm2\n\t"
266
                "pand        %%mm7, %%mm0\n\t"
267
                "pand        %%mm7, %%mm2\n\t"
268
                "pand        %%mm6, %%mm1\n\t"
269
                "pand        %%mm6, %%mm3\n\t"
270
                "por        %%mm1, %%mm0\n\t"
271
                "por        %%mm3, %%mm2\n\t"
272
                MOVNTQ"        %%mm0, %0\n\t"
273
                MOVNTQ"        %%mm2, 8%0"
274
                :"=m"(*d)
275
                :"m"(*s)
276
                );
277
        d+=16;
278
        s+=16;
279
  }
280
  __asm __volatile(SFENCE:::"memory");
281
  __asm __volatile(EMMS:::"memory");
282
#endif
283
    mm_end = end - 3;
284
    while(s < mm_end)
285
    {
286
        register uint32_t x= *((uint32_t *)s);
287
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
288
        s+=4;
289
        d+=4;
290
    }
291
    if(s < end)
292
    {
293
        register uint16_t x= *((uint16_t *)s);
294
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
295
        s+=2;
296
        d+=2;
297
    }
298
}
299

    
300
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
301
{
302
        const uint8_t *s = src;
303
        const uint8_t *end;
304
#ifdef HAVE_MMX
305
        const uint8_t *mm_end;
306
#endif
307
        uint16_t *d = (uint16_t *)dst;
308
        end = s + src_size;
309
#ifdef HAVE_MMX
310
        mm_end = end - 15;
311
#if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it's slightly faster)
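        /* How the pmaddwd variant works: mask3216br keeps the top five bits
           of blue and red in every pixel, and pmaddwd multiplies the two
           16-bit halves of each pixel by per-field constants from mul3216
           (presumably 4 for the blue half and 0x2000 for the red half; the
           constants live in rgb2rgb.c). That single multiply-add leaves blue
           and red already positioned so that, after OR-ing in the green bits
           kept by mask3216g, one shift per pixel pair (psrld $5 for the even
           pixels, pslld $11 for the odd ones) produces finished RGB565 words
           in the low and high halves of each 32-bit lane. */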
312
        asm volatile(
313
                "movq %3, %%mm5                        \n\t"
314
                "movq %4, %%mm6                        \n\t"
315
                "movq %5, %%mm7                        \n\t"
316
                ".balign 16                        \n\t"
317
                "1:                                \n\t"
318
                PREFETCH" 32(%1)                \n\t"
319
                "movd        (%1), %%mm0                \n\t"
320
                "movd        4(%1), %%mm3                \n\t"
321
                "punpckldq 8(%1), %%mm0                \n\t"
322
                "punpckldq 12(%1), %%mm3        \n\t"
323
                "movq %%mm0, %%mm1                \n\t"
324
                "movq %%mm3, %%mm4                \n\t"
325
                "pand %%mm6, %%mm0                \n\t"
326
                "pand %%mm6, %%mm3                \n\t"
327
                "pmaddwd %%mm7, %%mm0                \n\t"
328
                "pmaddwd %%mm7, %%mm3                \n\t"
329
                "pand %%mm5, %%mm1                \n\t"
330
                "pand %%mm5, %%mm4                \n\t"
331
                "por %%mm1, %%mm0                \n\t"        
332
                "por %%mm4, %%mm3                \n\t"
333
                "psrld $5, %%mm0                \n\t"
334
                "pslld $11, %%mm3                \n\t"
335
                "por %%mm3, %%mm0                \n\t"
336
                MOVNTQ"        %%mm0, (%0)                \n\t"
337
                "addl $16, %1                        \n\t"
338
                "addl $8, %0                        \n\t"
339
                "cmpl %2, %1                        \n\t"
340
                " jb 1b                                \n\t"
341
                : "+r" (d), "+r"(s)
342
                : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
343
        );
344
#else
345
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
346
        __asm __volatile(
347
            "movq        %0, %%mm7\n\t"
348
            "movq        %1, %%mm6\n\t"
349
            ::"m"(red_16mask),"m"(green_16mask));
350
        while(s < mm_end)
351
        {
352
            __asm __volatile(
353
                PREFETCH" 32%1\n\t"
354
                "movd        %1, %%mm0\n\t"
355
                "movd        4%1, %%mm3\n\t"
356
                "punpckldq 8%1, %%mm0\n\t"
357
                "punpckldq 12%1, %%mm3\n\t"
358
                "movq        %%mm0, %%mm1\n\t"
359
                "movq        %%mm0, %%mm2\n\t"
360
                "movq        %%mm3, %%mm4\n\t"
361
                "movq        %%mm3, %%mm5\n\t"
362
                "psrlq        $3, %%mm0\n\t"
363
                "psrlq        $3, %%mm3\n\t"
364
                "pand        %2, %%mm0\n\t"
365
                "pand        %2, %%mm3\n\t"
366
                "psrlq        $5, %%mm1\n\t"
367
                "psrlq        $5, %%mm4\n\t"
368
                "pand        %%mm6, %%mm1\n\t"
369
                "pand        %%mm6, %%mm4\n\t"
370
                "psrlq        $8, %%mm2\n\t"
371
                "psrlq        $8, %%mm5\n\t"
372
                "pand        %%mm7, %%mm2\n\t"
373
                "pand        %%mm7, %%mm5\n\t"
374
                "por        %%mm1, %%mm0\n\t"
375
                "por        %%mm4, %%mm3\n\t"
376
                "por        %%mm2, %%mm0\n\t"
377
                "por        %%mm5, %%mm3\n\t"
378
                "psllq        $16, %%mm3\n\t"
379
                "por        %%mm3, %%mm0\n\t"
380
                MOVNTQ"        %%mm0, %0\n\t"
381
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
382
                d += 4;
383
                s += 16;
384
        }
385
#endif
386
        __asm __volatile(SFENCE:::"memory");
387
        __asm __volatile(EMMS:::"memory");
388
#endif
389
        while(s < end)
390
        {
391
                const int src= *(const uint32_t*)s; s += 4;
392
                *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
393
//                *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
394
        }
395
}
396

    
397
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
398
{
399
        const uint8_t *s = src;
400
        const uint8_t *end;
401
#ifdef HAVE_MMX
402
        const uint8_t *mm_end;
403
#endif
404
        uint16_t *d = (uint16_t *)dst;
405
        end = s + src_size;
406
#ifdef HAVE_MMX
407
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
408
        __asm __volatile(
409
            "movq        %0, %%mm7\n\t"
410
            "movq        %1, %%mm6\n\t"
411
            ::"m"(red_16mask),"m"(green_16mask));
412
        mm_end = end - 15;
413
        while(s < mm_end)
414
        {
415
            __asm __volatile(
416
                PREFETCH" 32%1\n\t"
417
                "movd        %1, %%mm0\n\t"
418
                "movd        4%1, %%mm3\n\t"
419
                "punpckldq 8%1, %%mm0\n\t"
420
                "punpckldq 12%1, %%mm3\n\t"
421
                "movq        %%mm0, %%mm1\n\t"
422
                "movq        %%mm0, %%mm2\n\t"
423
                "movq        %%mm3, %%mm4\n\t"
424
                "movq        %%mm3, %%mm5\n\t"
425
                "psllq        $8, %%mm0\n\t"
426
                "psllq        $8, %%mm3\n\t"
427
                "pand        %%mm7, %%mm0\n\t"
428
                "pand        %%mm7, %%mm3\n\t"
429
                "psrlq        $5, %%mm1\n\t"
430
                "psrlq        $5, %%mm4\n\t"
431
                "pand        %%mm6, %%mm1\n\t"
432
                "pand        %%mm6, %%mm4\n\t"
433
                "psrlq        $19, %%mm2\n\t"
434
                "psrlq        $19, %%mm5\n\t"
435
                "pand        %2, %%mm2\n\t"
436
                "pand        %2, %%mm5\n\t"
437
                "por        %%mm1, %%mm0\n\t"
438
                "por        %%mm4, %%mm3\n\t"
439
                "por        %%mm2, %%mm0\n\t"
440
                "por        %%mm5, %%mm3\n\t"
441
                "psllq        $16, %%mm3\n\t"
442
                "por        %%mm3, %%mm0\n\t"
443
                MOVNTQ"        %%mm0, %0\n\t"
444
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
445
                d += 4;
446
                s += 16;
447
        }
448
        __asm __volatile(SFENCE:::"memory");
449
        __asm __volatile(EMMS:::"memory");
450
#endif
451
        while(s < end)
452
        {
453
                const int src= *s; s += 4;
454
                *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
455
        }
456
}
457

    
458
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
459
{
460
        const uint8_t *s = src;
461
        const uint8_t *end;
462
#ifdef HAVE_MMX
463
        const uint8_t *mm_end;
464
#endif
465
        uint16_t *d = (uint16_t *)dst;
466
        end = s + src_size;
467
#ifdef HAVE_MMX
468
        mm_end = end - 15;
469
#if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it's slightly faster)
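        /* Same pmaddwd trick as in RENAME(rgb32to16) above, with mask3215g
           and mul3215 adjusted for the 5-5-5 layout; note the shifts are
           psrld $6 / pslld $10 here instead of $5 / $11. */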
470
        asm volatile(
471
                "movq %3, %%mm5                        \n\t"
472
                "movq %4, %%mm6                        \n\t"
473
                "movq %5, %%mm7                        \n\t"
474
                ".balign 16                        \n\t"
475
                "1:                                \n\t"
476
                PREFETCH" 32(%1)                \n\t"
477
                "movd        (%1), %%mm0                \n\t"
478
                "movd        4(%1), %%mm3                \n\t"
479
                "punpckldq 8(%1), %%mm0                \n\t"
480
                "punpckldq 12(%1), %%mm3        \n\t"
481
                "movq %%mm0, %%mm1                \n\t"
482
                "movq %%mm3, %%mm4                \n\t"
483
                "pand %%mm6, %%mm0                \n\t"
484
                "pand %%mm6, %%mm3                \n\t"
485
                "pmaddwd %%mm7, %%mm0                \n\t"
486
                "pmaddwd %%mm7, %%mm3                \n\t"
487
                "pand %%mm5, %%mm1                \n\t"
488
                "pand %%mm5, %%mm4                \n\t"
489
                "por %%mm1, %%mm0                \n\t"        
490
                "por %%mm4, %%mm3                \n\t"
491
                "psrld $6, %%mm0                \n\t"
492
                "pslld $10, %%mm3                \n\t"
493
                "por %%mm3, %%mm0                \n\t"
494
                MOVNTQ"        %%mm0, (%0)                \n\t"
495
                "addl $16, %1                        \n\t"
496
                "addl $8, %0                        \n\t"
497
                "cmpl %2, %1                        \n\t"
498
                " jb 1b                                \n\t"
499
                : "+r" (d), "+r"(s)
500
                : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
501
        );
502
#else
503
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
504
        __asm __volatile(
505
            "movq        %0, %%mm7\n\t"
506
            "movq        %1, %%mm6\n\t"
507
            ::"m"(red_15mask),"m"(green_15mask));
508
        while(s < mm_end)
509
        {
510
            __asm __volatile(
511
                PREFETCH" 32%1\n\t"
512
                "movd        %1, %%mm0\n\t"
513
                "movd        4%1, %%mm3\n\t"
514
                "punpckldq 8%1, %%mm0\n\t"
515
                "punpckldq 12%1, %%mm3\n\t"
516
                "movq        %%mm0, %%mm1\n\t"
517
                "movq        %%mm0, %%mm2\n\t"
518
                "movq        %%mm3, %%mm4\n\t"
519
                "movq        %%mm3, %%mm5\n\t"
520
                "psrlq        $3, %%mm0\n\t"
521
                "psrlq        $3, %%mm3\n\t"
522
                "pand        %2, %%mm0\n\t"
523
                "pand        %2, %%mm3\n\t"
524
                "psrlq        $6, %%mm1\n\t"
525
                "psrlq        $6, %%mm4\n\t"
526
                "pand        %%mm6, %%mm1\n\t"
527
                "pand        %%mm6, %%mm4\n\t"
528
                "psrlq        $9, %%mm2\n\t"
529
                "psrlq        $9, %%mm5\n\t"
530
                "pand        %%mm7, %%mm2\n\t"
531
                "pand        %%mm7, %%mm5\n\t"
532
                "por        %%mm1, %%mm0\n\t"
533
                "por        %%mm4, %%mm3\n\t"
534
                "por        %%mm2, %%mm0\n\t"
535
                "por        %%mm5, %%mm3\n\t"
536
                "psllq        $16, %%mm3\n\t"
537
                "por        %%mm3, %%mm0\n\t"
538
                MOVNTQ"        %%mm0, %0\n\t"
539
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
540
                d += 4;
541
                s += 16;
542
        }
543
#endif
544
        __asm __volatile(SFENCE:::"memory");
545
        __asm __volatile(EMMS:::"memory");
546
#endif
547
        while(s < end)
548
        {
549
                const int src= *(const uint32_t*)s; s += 4;
550
                *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
551
        }
552
}
553

    
554
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
555
{
556
        const uint8_t *s = src;
557
        const uint8_t *end;
558
#ifdef HAVE_MMX
559
        const uint8_t *mm_end;
560
#endif
561
        uint16_t *d = (uint16_t *)dst;
562
        end = s + src_size;
563
#ifdef HAVE_MMX
564
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
565
        __asm __volatile(
566
            "movq        %0, %%mm7\n\t"
567
            "movq        %1, %%mm6\n\t"
568
            ::"m"(red_15mask),"m"(green_15mask));
569
        mm_end = end - 15;
570
        while(s < mm_end)
571
        {
572
            __asm __volatile(
573
                PREFETCH" 32%1\n\t"
574
                "movd        %1, %%mm0\n\t"
575
                "movd        4%1, %%mm3\n\t"
576
                "punpckldq 8%1, %%mm0\n\t"
577
                "punpckldq 12%1, %%mm3\n\t"
578
                "movq        %%mm0, %%mm1\n\t"
579
                "movq        %%mm0, %%mm2\n\t"
580
                "movq        %%mm3, %%mm4\n\t"
581
                "movq        %%mm3, %%mm5\n\t"
582
                "psllq        $7, %%mm0\n\t"
583
                "psllq        $7, %%mm3\n\t"
584
                "pand        %%mm7, %%mm0\n\t"
585
                "pand        %%mm7, %%mm3\n\t"
586
                "psrlq        $6, %%mm1\n\t"
587
                "psrlq        $6, %%mm4\n\t"
588
                "pand        %%mm6, %%mm1\n\t"
589
                "pand        %%mm6, %%mm4\n\t"
590
                "psrlq        $19, %%mm2\n\t"
591
                "psrlq        $19, %%mm5\n\t"
592
                "pand        %2, %%mm2\n\t"
593
                "pand        %2, %%mm5\n\t"
594
                "por        %%mm1, %%mm0\n\t"
595
                "por        %%mm4, %%mm3\n\t"
596
                "por        %%mm2, %%mm0\n\t"
597
                "por        %%mm5, %%mm3\n\t"
598
                "psllq        $16, %%mm3\n\t"
599
                "por        %%mm3, %%mm0\n\t"
600
                MOVNTQ"        %%mm0, %0\n\t"
601
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
602
                d += 4;
603
                s += 16;
604
        }
605
        __asm __volatile(SFENCE:::"memory");
606
        __asm __volatile(EMMS:::"memory");
607
#endif
608
        while(s < end)
609
        {
610
                const int src= *(const uint32_t*)s; s += 4;
611
                *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
612
        }
613
}
614

    
615
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
616
{
617
        const uint8_t *s = src;
618
        const uint8_t *end;
619
#ifdef HAVE_MMX
620
        const uint8_t *mm_end;
621
#endif
622
        uint16_t *d = (uint16_t *)dst;
623
        end = s + src_size;
624
#ifdef HAVE_MMX
625
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
626
        __asm __volatile(
627
            "movq        %0, %%mm7\n\t"
628
            "movq        %1, %%mm6\n\t"
629
            ::"m"(red_16mask),"m"(green_16mask));
630
        mm_end = end - 11;
631
        while(s < mm_end)
632
        {
633
            __asm __volatile(
634
                PREFETCH" 32%1\n\t"
635
                "movd        %1, %%mm0\n\t"
636
                "movd        3%1, %%mm3\n\t"
637
                "punpckldq 6%1, %%mm0\n\t"
638
                "punpckldq 9%1, %%mm3\n\t"
639
                "movq        %%mm0, %%mm1\n\t"
640
                "movq        %%mm0, %%mm2\n\t"
641
                "movq        %%mm3, %%mm4\n\t"
642
                "movq        %%mm3, %%mm5\n\t"
643
                "psrlq        $3, %%mm0\n\t"
644
                "psrlq        $3, %%mm3\n\t"
645
                "pand        %2, %%mm0\n\t"
646
                "pand        %2, %%mm3\n\t"
647
                "psrlq        $5, %%mm1\n\t"
648
                "psrlq        $5, %%mm4\n\t"
649
                "pand        %%mm6, %%mm1\n\t"
650
                "pand        %%mm6, %%mm4\n\t"
651
                "psrlq        $8, %%mm2\n\t"
652
                "psrlq        $8, %%mm5\n\t"
653
                "pand        %%mm7, %%mm2\n\t"
654
                "pand        %%mm7, %%mm5\n\t"
655
                "por        %%mm1, %%mm0\n\t"
656
                "por        %%mm4, %%mm3\n\t"
657
                "por        %%mm2, %%mm0\n\t"
658
                "por        %%mm5, %%mm3\n\t"
659
                "psllq        $16, %%mm3\n\t"
660
                "por        %%mm3, %%mm0\n\t"
661
                MOVNTQ"        %%mm0, %0\n\t"
662
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
663
                d += 4;
664
                s += 12;
665
        }
666
        __asm __volatile(SFENCE:::"memory");
667
        __asm __volatile(EMMS:::"memory");
668
#endif
669
        while(s < end)
670
        {
671
                const int b= *s++;
672
                const int g= *s++;
673
                const int r= *s++;
674
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
675
        }
676
}
677

    
678
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
679
{
680
        const uint8_t *s = src;
681
        const uint8_t *end;
682
#ifdef HAVE_MMX
683
        const uint8_t *mm_end;
684
#endif
685
        uint16_t *d = (uint16_t *)dst;
686
        end = s + src_size;
687
#ifdef HAVE_MMX
688
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
689
        __asm __volatile(
690
            "movq        %0, %%mm7\n\t"
691
            "movq        %1, %%mm6\n\t"
692
            ::"m"(red_16mask),"m"(green_16mask));
693
        mm_end = end - 15;
694
        while(s < mm_end)
695
        {
696
            __asm __volatile(
697
                PREFETCH" 32%1\n\t"
698
                "movd        %1, %%mm0\n\t"
699
                "movd        3%1, %%mm3\n\t"
700
                "punpckldq 6%1, %%mm0\n\t"
701
                "punpckldq 9%1, %%mm3\n\t"
702
                "movq        %%mm0, %%mm1\n\t"
703
                "movq        %%mm0, %%mm2\n\t"
704
                "movq        %%mm3, %%mm4\n\t"
705
                "movq        %%mm3, %%mm5\n\t"
706
                "psllq        $8, %%mm0\n\t"
707
                "psllq        $8, %%mm3\n\t"
708
                "pand        %%mm7, %%mm0\n\t"
709
                "pand        %%mm7, %%mm3\n\t"
710
                "psrlq        $5, %%mm1\n\t"
711
                "psrlq        $5, %%mm4\n\t"
712
                "pand        %%mm6, %%mm1\n\t"
713
                "pand        %%mm6, %%mm4\n\t"
714
                "psrlq        $19, %%mm2\n\t"
715
                "psrlq        $19, %%mm5\n\t"
716
                "pand        %2, %%mm2\n\t"
717
                "pand        %2, %%mm5\n\t"
718
                "por        %%mm1, %%mm0\n\t"
719
                "por        %%mm4, %%mm3\n\t"
720
                "por        %%mm2, %%mm0\n\t"
721
                "por        %%mm5, %%mm3\n\t"
722
                "psllq        $16, %%mm3\n\t"
723
                "por        %%mm3, %%mm0\n\t"
724
                MOVNTQ"        %%mm0, %0\n\t"
725
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
726
                d += 4;
727
                s += 12;
728
        }
729
        __asm __volatile(SFENCE:::"memory");
730
        __asm __volatile(EMMS:::"memory");
731
#endif
732
        while(s < end)
733
        {
734
                const int r= *s++;
735
                const int g= *s++;
736
                const int b= *s++;
737
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
738
        }
739
}
740

    
741
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
742
{
743
        const uint8_t *s = src;
744
        const uint8_t *end;
745
#ifdef HAVE_MMX
746
        const uint8_t *mm_end;
747
#endif
748
        uint16_t *d = (uint16_t *)dst;
749
        end = s + src_size;
750
#ifdef HAVE_MMX
751
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
752
        __asm __volatile(
753
            "movq        %0, %%mm7\n\t"
754
            "movq        %1, %%mm6\n\t"
755
            ::"m"(red_15mask),"m"(green_15mask));
756
        mm_end = end - 11;
757
        while(s < mm_end)
758
        {
759
            __asm __volatile(
760
                PREFETCH" 32%1\n\t"
761
                "movd        %1, %%mm0\n\t"
762
                "movd        3%1, %%mm3\n\t"
763
                "punpckldq 6%1, %%mm0\n\t"
764
                "punpckldq 9%1, %%mm3\n\t"
765
                "movq        %%mm0, %%mm1\n\t"
766
                "movq        %%mm0, %%mm2\n\t"
767
                "movq        %%mm3, %%mm4\n\t"
768
                "movq        %%mm3, %%mm5\n\t"
769
                "psrlq        $3, %%mm0\n\t"
770
                "psrlq        $3, %%mm3\n\t"
771
                "pand        %2, %%mm0\n\t"
772
                "pand        %2, %%mm3\n\t"
773
                "psrlq        $6, %%mm1\n\t"
774
                "psrlq        $6, %%mm4\n\t"
775
                "pand        %%mm6, %%mm1\n\t"
776
                "pand        %%mm6, %%mm4\n\t"
777
                "psrlq        $9, %%mm2\n\t"
778
                "psrlq        $9, %%mm5\n\t"
779
                "pand        %%mm7, %%mm2\n\t"
780
                "pand        %%mm7, %%mm5\n\t"
781
                "por        %%mm1, %%mm0\n\t"
782
                "por        %%mm4, %%mm3\n\t"
783
                "por        %%mm2, %%mm0\n\t"
784
                "por        %%mm5, %%mm3\n\t"
785
                "psllq        $16, %%mm3\n\t"
786
                "por        %%mm3, %%mm0\n\t"
787
                MOVNTQ"        %%mm0, %0\n\t"
788
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
789
                d += 4;
790
                s += 12;
791
        }
792
        __asm __volatile(SFENCE:::"memory");
793
        __asm __volatile(EMMS:::"memory");
794
#endif
795
        while(s < end)
796
        {
797
                const int b= *s++;
798
                const int g= *s++;
799
                const int r= *s++;
800
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
801
        }
802
}
803

    
804
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
805
{
806
        const uint8_t *s = src;
807
        const uint8_t *end;
808
#ifdef HAVE_MMX
809
        const uint8_t *mm_end;
810
#endif
811
        uint16_t *d = (uint16_t *)dst;
812
        end = s + src_size;
813
#ifdef HAVE_MMX
814
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
815
        __asm __volatile(
816
            "movq        %0, %%mm7\n\t"
817
            "movq        %1, %%mm6\n\t"
818
            ::"m"(red_15mask),"m"(green_15mask));
819
        mm_end = end - 15;
820
        while(s < mm_end)
821
        {
822
            __asm __volatile(
823
                PREFETCH" 32%1\n\t"
824
                "movd        %1, %%mm0\n\t"
825
                "movd        3%1, %%mm3\n\t"
826
                "punpckldq 6%1, %%mm0\n\t"
827
                "punpckldq 9%1, %%mm3\n\t"
828
                "movq        %%mm0, %%mm1\n\t"
829
                "movq        %%mm0, %%mm2\n\t"
830
                "movq        %%mm3, %%mm4\n\t"
831
                "movq        %%mm3, %%mm5\n\t"
832
                "psllq        $7, %%mm0\n\t"
833
                "psllq        $7, %%mm3\n\t"
834
                "pand        %%mm7, %%mm0\n\t"
835
                "pand        %%mm7, %%mm3\n\t"
836
                "psrlq        $6, %%mm1\n\t"
837
                "psrlq        $6, %%mm4\n\t"
838
                "pand        %%mm6, %%mm1\n\t"
839
                "pand        %%mm6, %%mm4\n\t"
840
                "psrlq        $19, %%mm2\n\t"
841
                "psrlq        $19, %%mm5\n\t"
842
                "pand        %2, %%mm2\n\t"
843
                "pand        %2, %%mm5\n\t"
844
                "por        %%mm1, %%mm0\n\t"
845
                "por        %%mm4, %%mm3\n\t"
846
                "por        %%mm2, %%mm0\n\t"
847
                "por        %%mm5, %%mm3\n\t"
848
                "psllq        $16, %%mm3\n\t"
849
                "por        %%mm3, %%mm0\n\t"
850
                MOVNTQ"        %%mm0, %0\n\t"
851
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
852
                d += 4;
853
                s += 12;
854
        }
855
        __asm __volatile(SFENCE:::"memory");
856
        __asm __volatile(EMMS:::"memory");
857
#endif
858
        while(s < end)
859
        {
860
                const int r= *s++;
861
                const int g= *s++;
862
                const int b= *s++;
863
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
864
        }
865
}
866

    
867
/*
  Here I use a less accurate approximation: the input value is simply
  left-shifted and the low-order bits are filled with zeroes. This method
  improves PNG compression, but it cannot reproduce white exactly, since it
  does not generate an all-ones maximum value; the net effect is to darken
  the image slightly.

  A better method would be "left bit replication" (a scalar sketch of it
  follows the function below):

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      Leftmost Bits Repeated to Fill Open Bits
       |
   Original Bits
*/
890
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
891
{
892
        const uint16_t *end;
893
#ifdef HAVE_MMX
894
        const uint16_t *mm_end;
895
#endif
896
        uint8_t *d = (uint8_t *)dst;
897
        const uint16_t *s = (uint16_t *)src;
898
        end = s + src_size/2;
899
#ifdef HAVE_MMX
900
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
901
        mm_end = end - 7;
902
        while(s < mm_end)
903
        {
904
            __asm __volatile(
905
                PREFETCH" 32%1\n\t"
906
                "movq        %1, %%mm0\n\t"
907
                "movq        %1, %%mm1\n\t"
908
                "movq        %1, %%mm2\n\t"
909
                "pand        %2, %%mm0\n\t"
910
                "pand        %3, %%mm1\n\t"
911
                "pand        %4, %%mm2\n\t"
912
                "psllq        $3, %%mm0\n\t"
913
                "psrlq        $2, %%mm1\n\t"
914
                "psrlq        $7, %%mm2\n\t"
915
                "movq        %%mm0, %%mm3\n\t"
916
                "movq        %%mm1, %%mm4\n\t"
917
                "movq        %%mm2, %%mm5\n\t"
918
                "punpcklwd %5, %%mm0\n\t"
919
                "punpcklwd %5, %%mm1\n\t"
920
                "punpcklwd %5, %%mm2\n\t"
921
                "punpckhwd %5, %%mm3\n\t"
922
                "punpckhwd %5, %%mm4\n\t"
923
                "punpckhwd %5, %%mm5\n\t"
924
                "psllq        $8, %%mm1\n\t"
925
                "psllq        $16, %%mm2\n\t"
926
                "por        %%mm1, %%mm0\n\t"
927
                "por        %%mm2, %%mm0\n\t"
928
                "psllq        $8, %%mm4\n\t"
929
                "psllq        $16, %%mm5\n\t"
930
                "por        %%mm4, %%mm3\n\t"
931
                "por        %%mm5, %%mm3\n\t"
932

    
933
                "movq        %%mm0, %%mm6\n\t"
934
                "movq        %%mm3, %%mm7\n\t"
935
                
936
                "movq        8%1, %%mm0\n\t"
937
                "movq        8%1, %%mm1\n\t"
938
                "movq        8%1, %%mm2\n\t"
939
                "pand        %2, %%mm0\n\t"
940
                "pand        %3, %%mm1\n\t"
941
                "pand        %4, %%mm2\n\t"
942
                "psllq        $3, %%mm0\n\t"
943
                "psrlq        $2, %%mm1\n\t"
944
                "psrlq        $7, %%mm2\n\t"
945
                "movq        %%mm0, %%mm3\n\t"
946
                "movq        %%mm1, %%mm4\n\t"
947
                "movq        %%mm2, %%mm5\n\t"
948
                "punpcklwd %5, %%mm0\n\t"
949
                "punpcklwd %5, %%mm1\n\t"
950
                "punpcklwd %5, %%mm2\n\t"
951
                "punpckhwd %5, %%mm3\n\t"
952
                "punpckhwd %5, %%mm4\n\t"
953
                "punpckhwd %5, %%mm5\n\t"
954
                "psllq        $8, %%mm1\n\t"
955
                "psllq        $16, %%mm2\n\t"
956
                "por        %%mm1, %%mm0\n\t"
957
                "por        %%mm2, %%mm0\n\t"
958
                "psllq        $8, %%mm4\n\t"
959
                "psllq        $16, %%mm5\n\t"
960
                "por        %%mm4, %%mm3\n\t"
961
                "por        %%mm5, %%mm3\n\t"
962

    
963
                :"=m"(*d)
964
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
965
                :"memory");
966
            /* Borrowed 32 to 24 */
967
            __asm __volatile(
968
                "movq        %%mm0, %%mm4\n\t"
969
                "movq        %%mm3, %%mm5\n\t"
970
                "movq        %%mm6, %%mm0\n\t"
971
                "movq        %%mm7, %%mm1\n\t"
972
                
973
                "movq        %%mm4, %%mm6\n\t"
974
                "movq        %%mm5, %%mm7\n\t"
975
                "movq        %%mm0, %%mm2\n\t"
976
                "movq        %%mm1, %%mm3\n\t"
977

    
978
                "psrlq        $8, %%mm2\n\t"
979
                "psrlq        $8, %%mm3\n\t"
980
                "psrlq        $8, %%mm6\n\t"
981
                "psrlq        $8, %%mm7\n\t"
982
                "pand        %2, %%mm0\n\t"
983
                "pand        %2, %%mm1\n\t"
984
                "pand        %2, %%mm4\n\t"
985
                "pand        %2, %%mm5\n\t"
986
                "pand        %3, %%mm2\n\t"
987
                "pand        %3, %%mm3\n\t"
988
                "pand        %3, %%mm6\n\t"
989
                "pand        %3, %%mm7\n\t"
990
                "por        %%mm2, %%mm0\n\t"
991
                "por        %%mm3, %%mm1\n\t"
992
                "por        %%mm6, %%mm4\n\t"
993
                "por        %%mm7, %%mm5\n\t"
994

    
995
                "movq        %%mm1, %%mm2\n\t"
996
                "movq        %%mm4, %%mm3\n\t"
997
                "psllq        $48, %%mm2\n\t"
998
                "psllq        $32, %%mm3\n\t"
999
                "pand        %4, %%mm2\n\t"
1000
                "pand        %5, %%mm3\n\t"
1001
                "por        %%mm2, %%mm0\n\t"
1002
                "psrlq        $16, %%mm1\n\t"
1003
                "psrlq        $32, %%mm4\n\t"
1004
                "psllq        $16, %%mm5\n\t"
1005
                "por        %%mm3, %%mm1\n\t"
1006
                "pand        %6, %%mm5\n\t"
1007
                "por        %%mm5, %%mm4\n\t"
1008

    
1009
                MOVNTQ"        %%mm0, %0\n\t"
1010
                MOVNTQ"        %%mm1, 8%0\n\t"
1011
                MOVNTQ"        %%mm4, 16%0"
1012

    
1013
                :"=m"(*d)
1014
                :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1015
                :"memory");
1016
                d += 24;
1017
                s += 8;
1018
        }
1019
        __asm __volatile(SFENCE:::"memory");
1020
        __asm __volatile(EMMS:::"memory");
1021
#endif
1022
        while(s < end)
1023
        {
1024
                register uint16_t bgr;
1025
                bgr = *s++;
1026
                *d++ = (bgr&0x1F)<<3;
1027
                *d++ = (bgr&0x3E0)>>2;
1028
                *d++ = (bgr&0x7C00)>>7;
1029
        }
1030
}
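
/*
 * Scalar sketch of the "left bit replication" described in the comment before
 * RENAME(rgb15to24). Illustration only; the MMX and C paths above keep the
 * simpler zero-fill behaviour. The top bits of each component are repeated
 * into the newly opened low bits, so 0x1F expands to 0xFF and white stays
 * white.
 */
#if 0
static uint8_t expand5to8(uint8_t v) /* v is a 5-bit component, 0..31 */
{
        return (uint8_t)((v << 3) | (v >> 2));
}
static uint8_t expand6to8(uint8_t v) /* v is a 6-bit component, 0..63 */
{
        return (uint8_t)((v << 2) | (v >> 4));
}
#endif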
1031

    
1032
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1033
{
1034
        const uint16_t *end;
1035
#ifdef HAVE_MMX
1036
        const uint16_t *mm_end;
1037
#endif
1038
        uint8_t *d = (uint8_t *)dst;
1039
        const uint16_t *s = (const uint16_t *)src;
1040
        end = s + src_size/2;
1041
#ifdef HAVE_MMX
1042
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
1043
        mm_end = end - 7;
1044
        while(s < mm_end)
1045
        {
1046
            __asm __volatile(
1047
                PREFETCH" 32%1\n\t"
1048
                "movq        %1, %%mm0\n\t"
1049
                "movq        %1, %%mm1\n\t"
1050
                "movq        %1, %%mm2\n\t"
1051
                "pand        %2, %%mm0\n\t"
1052
                "pand        %3, %%mm1\n\t"
1053
                "pand        %4, %%mm2\n\t"
1054
                "psllq        $3, %%mm0\n\t"
1055
                "psrlq        $3, %%mm1\n\t"
1056
                "psrlq        $8, %%mm2\n\t"
1057
                "movq        %%mm0, %%mm3\n\t"
1058
                "movq        %%mm1, %%mm4\n\t"
1059
                "movq        %%mm2, %%mm5\n\t"
1060
                "punpcklwd %5, %%mm0\n\t"
1061
                "punpcklwd %5, %%mm1\n\t"
1062
                "punpcklwd %5, %%mm2\n\t"
1063
                "punpckhwd %5, %%mm3\n\t"
1064
                "punpckhwd %5, %%mm4\n\t"
1065
                "punpckhwd %5, %%mm5\n\t"
1066
                "psllq        $8, %%mm1\n\t"
1067
                "psllq        $16, %%mm2\n\t"
1068
                "por        %%mm1, %%mm0\n\t"
1069
                "por        %%mm2, %%mm0\n\t"
1070
                "psllq        $8, %%mm4\n\t"
1071
                "psllq        $16, %%mm5\n\t"
1072
                "por        %%mm4, %%mm3\n\t"
1073
                "por        %%mm5, %%mm3\n\t"
1074
                
1075
                "movq        %%mm0, %%mm6\n\t"
1076
                "movq        %%mm3, %%mm7\n\t"
1077

    
1078
                "movq        8%1, %%mm0\n\t"
1079
                "movq        8%1, %%mm1\n\t"
1080
                "movq        8%1, %%mm2\n\t"
1081
                "pand        %2, %%mm0\n\t"
1082
                "pand        %3, %%mm1\n\t"
1083
                "pand        %4, %%mm2\n\t"
1084
                "psllq        $3, %%mm0\n\t"
1085
                "psrlq        $3, %%mm1\n\t"
1086
                "psrlq        $8, %%mm2\n\t"
1087
                "movq        %%mm0, %%mm3\n\t"
1088
                "movq        %%mm1, %%mm4\n\t"
1089
                "movq        %%mm2, %%mm5\n\t"
1090
                "punpcklwd %5, %%mm0\n\t"
1091
                "punpcklwd %5, %%mm1\n\t"
1092
                "punpcklwd %5, %%mm2\n\t"
1093
                "punpckhwd %5, %%mm3\n\t"
1094
                "punpckhwd %5, %%mm4\n\t"
1095
                "punpckhwd %5, %%mm5\n\t"
1096
                "psllq        $8, %%mm1\n\t"
1097
                "psllq        $16, %%mm2\n\t"
1098
                "por        %%mm1, %%mm0\n\t"
1099
                "por        %%mm2, %%mm0\n\t"
1100
                "psllq        $8, %%mm4\n\t"
1101
                "psllq        $16, %%mm5\n\t"
1102
                "por        %%mm4, %%mm3\n\t"
1103
                "por        %%mm5, %%mm3\n\t"
1104
                :"=m"(*d)
1105
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)                
1106
                :"memory");
1107
            /* Borrowed 32 to 24 */
1108
            __asm __volatile(
1109
                "movq        %%mm0, %%mm4\n\t"
1110
                "movq        %%mm3, %%mm5\n\t"
1111
                "movq        %%mm6, %%mm0\n\t"
1112
                "movq        %%mm7, %%mm1\n\t"
1113
                
1114
                "movq        %%mm4, %%mm6\n\t"
1115
                "movq        %%mm5, %%mm7\n\t"
1116
                "movq        %%mm0, %%mm2\n\t"
1117
                "movq        %%mm1, %%mm3\n\t"
1118

    
1119
                "psrlq        $8, %%mm2\n\t"
1120
                "psrlq        $8, %%mm3\n\t"
1121
                "psrlq        $8, %%mm6\n\t"
1122
                "psrlq        $8, %%mm7\n\t"
1123
                "pand        %2, %%mm0\n\t"
1124
                "pand        %2, %%mm1\n\t"
1125
                "pand        %2, %%mm4\n\t"
1126
                "pand        %2, %%mm5\n\t"
1127
                "pand        %3, %%mm2\n\t"
1128
                "pand        %3, %%mm3\n\t"
1129
                "pand        %3, %%mm6\n\t"
1130
                "pand        %3, %%mm7\n\t"
1131
                "por        %%mm2, %%mm0\n\t"
1132
                "por        %%mm3, %%mm1\n\t"
1133
                "por        %%mm6, %%mm4\n\t"
1134
                "por        %%mm7, %%mm5\n\t"
1135

    
1136
                "movq        %%mm1, %%mm2\n\t"
1137
                "movq        %%mm4, %%mm3\n\t"
1138
                "psllq        $48, %%mm2\n\t"
1139
                "psllq        $32, %%mm3\n\t"
1140
                "pand        %4, %%mm2\n\t"
1141
                "pand        %5, %%mm3\n\t"
1142
                "por        %%mm2, %%mm0\n\t"
1143
                "psrlq        $16, %%mm1\n\t"
1144
                "psrlq        $32, %%mm4\n\t"
1145
                "psllq        $16, %%mm5\n\t"
1146
                "por        %%mm3, %%mm1\n\t"
1147
                "pand        %6, %%mm5\n\t"
1148
                "por        %%mm5, %%mm4\n\t"
1149

    
1150
                MOVNTQ"        %%mm0, %0\n\t"
1151
                MOVNTQ"        %%mm1, 8%0\n\t"
1152
                MOVNTQ"        %%mm4, 16%0"
1153

    
1154
                :"=m"(*d)
1155
                :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1156
                :"memory");
1157
                d += 24;
1158
                s += 8;
1159
        }
1160
        __asm __volatile(SFENCE:::"memory");
1161
        __asm __volatile(EMMS:::"memory");
1162
#endif
1163
        while(s < end)
1164
        {
1165
                register uint16_t bgr;
1166
                bgr = *s++;
1167
                *d++ = (bgr&0x1F)<<3;
1168
                *d++ = (bgr&0x7E0)>>3;
1169
                *d++ = (bgr&0xF800)>>8;
1170
        }
1171
}
1172

    
1173
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1174
{
1175
        const uint16_t *end;
1176
#ifdef HAVE_MMX
1177
        const uint16_t *mm_end;
1178
#endif
1179
        uint8_t *d = (uint8_t *)dst;
1180
        const uint16_t *s = (const uint16_t *)src;
1181
        end = s + src_size/2;
1182
#ifdef HAVE_MMX
1183
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
1184
        __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
1185
        mm_end = end - 3;
1186
        while(s < mm_end)
1187
        {
1188
            __asm __volatile(
1189
                PREFETCH" 32%1\n\t"
1190
                "movq        %1, %%mm0\n\t"
1191
                "movq        %1, %%mm1\n\t"
1192
                "movq        %1, %%mm2\n\t"
1193
                "pand        %2, %%mm0\n\t"
1194
                "pand        %3, %%mm1\n\t"
1195
                "pand        %4, %%mm2\n\t"
1196
                "psllq        $3, %%mm0\n\t"
1197
                "psrlq        $2, %%mm1\n\t"
1198
                "psrlq        $7, %%mm2\n\t"
1199
                "movq        %%mm0, %%mm3\n\t"
1200
                "movq        %%mm1, %%mm4\n\t"
1201
                "movq        %%mm2, %%mm5\n\t"
1202
                "punpcklwd %%mm7, %%mm0\n\t"
1203
                "punpcklwd %%mm7, %%mm1\n\t"
1204
                "punpcklwd %%mm7, %%mm2\n\t"
1205
                "punpckhwd %%mm7, %%mm3\n\t"
1206
                "punpckhwd %%mm7, %%mm4\n\t"
1207
                "punpckhwd %%mm7, %%mm5\n\t"
1208
                "psllq        $8, %%mm1\n\t"
1209
                "psllq        $16, %%mm2\n\t"
1210
                "por        %%mm1, %%mm0\n\t"
1211
                "por        %%mm2, %%mm0\n\t"
1212
                "psllq        $8, %%mm4\n\t"
1213
                "psllq        $16, %%mm5\n\t"
1214
                "por        %%mm4, %%mm3\n\t"
1215
                "por        %%mm5, %%mm3\n\t"
1216
                MOVNTQ"        %%mm0, %0\n\t"
1217
                MOVNTQ"        %%mm3, 8%0\n\t"
1218
                :"=m"(*d)
1219
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1220
                :"memory");
1221
                d += 16;
1222
                s += 4;
1223
        }
1224
        __asm __volatile(SFENCE:::"memory");
1225
        __asm __volatile(EMMS:::"memory");
1226
#endif
1227
        while(s < end)
1228
        {
1229
#if 0 //slightly slower on athlon
1230
                int bgr= *s++;
1231
                *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1232
#else
1233
//FIXME this is very likely wrong for big-endian (and the following converters too)
1234
                register uint16_t bgr;
1235
                bgr = *s++;
1236
                *d++ = (bgr&0x1F)<<3;
1237
                *d++ = (bgr&0x3E0)>>2;
1238
                *d++ = (bgr&0x7C00)>>7;
1239
                *d++ = 0;
1240
#endif
1241
        }
1242
}
1243

    
1244
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1245
{
1246
        const uint16_t *end;
1247
#ifdef HAVE_MMX
1248
        const uint16_t *mm_end;
1249
#endif
1250
        uint8_t *d = (uint8_t *)dst;
1251
        const uint16_t *s = (uint16_t *)src;
1252
        end = s + src_size/2;
1253
#ifdef HAVE_MMX
1254
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
1255
        __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
1256
        mm_end = end - 3;
1257
        while(s < mm_end)
1258
        {
1259
            __asm __volatile(
1260
                PREFETCH" 32%1\n\t"
1261
                "movq        %1, %%mm0\n\t"
1262
                "movq        %1, %%mm1\n\t"
1263
                "movq        %1, %%mm2\n\t"
1264
                "pand        %2, %%mm0\n\t"
1265
                "pand        %3, %%mm1\n\t"
1266
                "pand        %4, %%mm2\n\t"
1267
                "psllq        $3, %%mm0\n\t"
1268
                "psrlq        $3, %%mm1\n\t"
1269
                "psrlq        $8, %%mm2\n\t"
1270
                "movq        %%mm0, %%mm3\n\t"
1271
                "movq        %%mm1, %%mm4\n\t"
1272
                "movq        %%mm2, %%mm5\n\t"
1273
                "punpcklwd %%mm7, %%mm0\n\t"
1274
                "punpcklwd %%mm7, %%mm1\n\t"
1275
                "punpcklwd %%mm7, %%mm2\n\t"
1276
                "punpckhwd %%mm7, %%mm3\n\t"
1277
                "punpckhwd %%mm7, %%mm4\n\t"
1278
                "punpckhwd %%mm7, %%mm5\n\t"
1279
                "psllq        $8, %%mm1\n\t"
1280
                "psllq        $16, %%mm2\n\t"
1281
                "por        %%mm1, %%mm0\n\t"
1282
                "por        %%mm2, %%mm0\n\t"
1283
                "psllq        $8, %%mm4\n\t"
1284
                "psllq        $16, %%mm5\n\t"
1285
                "por        %%mm4, %%mm3\n\t"
1286
                "por        %%mm5, %%mm3\n\t"
1287
                MOVNTQ"        %%mm0, %0\n\t"
1288
                MOVNTQ"        %%mm3, 8%0\n\t"
1289
                :"=m"(*d)
1290
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1291
                :"memory");
1292
                d += 16;
1293
                s += 4;
1294
        }
1295
        __asm __volatile(SFENCE:::"memory");
1296
        __asm __volatile(EMMS:::"memory");
1297
#endif
1298
        while(s < end)
1299
        {
1300
                register uint16_t bgr;
1301
                bgr = *s++;
1302
                *d++ = (bgr&0x1F)<<3;
1303
                *d++ = (bgr&0x7E0)>>3;
1304
                *d++ = (bgr&0xF800)>>8;
1305
                *d++ = 0;
1306
        }
1307
}
1308

    
1309
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1310
{
1311
#ifdef HAVE_MMX
1312
/* TODO: unroll this loop */
1313
        asm volatile (
1314
                "xorl %%eax, %%eax                \n\t"
1315
                ".balign 16                        \n\t"
1316
                "1:                                \n\t"
1317
                PREFETCH" 32(%0, %%eax)                \n\t"
1318
                "movq (%0, %%eax), %%mm0        \n\t"
1319
                "movq %%mm0, %%mm1                \n\t"
1320
                "movq %%mm0, %%mm2                \n\t"
1321
                "pslld $16, %%mm0                \n\t"
1322
                "psrld $16, %%mm1                \n\t"
1323
                "pand "MANGLE(mask32r)", %%mm0        \n\t"
1324
                "pand "MANGLE(mask32g)", %%mm2        \n\t"
1325
                "pand "MANGLE(mask32b)", %%mm1        \n\t"
1326
                "por %%mm0, %%mm2                \n\t"
1327
                "por %%mm1, %%mm2                \n\t"
1328
                MOVNTQ" %%mm2, (%1, %%eax)        \n\t"
1329
                "addl $8, %%eax                        \n\t"
1330
                "cmpl %2, %%eax                        \n\t"
1331
                " jb 1b                                \n\t"
1332
                :: "r" (src), "r"(dst), "r" (src_size-7)
1333
                : "%eax"
1334
        );
1335

    
1336
        __asm __volatile(SFENCE:::"memory");
1337
        __asm __volatile(EMMS:::"memory");
1338
#else
1339
        unsigned i;
1340
        unsigned num_pixels = src_size >> 2;
1341
        for(i=0; i<num_pixels; i++)
1342
        {
1343
#ifdef WORDS_BIGENDIAN  
1344
          dst[4*i + 1] = src[4*i + 3];
1345
          dst[4*i + 2] = src[4*i + 2];
1346
          dst[4*i + 3] = src[4*i + 1];
1347
#else
1348
          dst[4*i + 0] = src[4*i + 2];
1349
          dst[4*i + 1] = src[4*i + 1];
1350
          dst[4*i + 2] = src[4*i + 0];
1351
#endif
1352
        }
1353
#endif
1354
}
1355

    
1356
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1357
{
1358
        unsigned i;
1359
#ifdef HAVE_MMX
1360
        int mmx_size= 23 - src_size;
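        /* mmx_size starts out negative (23 - src_size) and the src/dst
           pointers passed to the asm block are biased by the same amount,
           so %%eax doubles as loop counter and byte offset; the loop counts
           up towards zero and leaves at most 23 trailing bytes for the C
           loop below. */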
1361
        asm volatile (
1362
                "movq "MANGLE(mask24r)", %%mm5        \n\t"
1363
                "movq "MANGLE(mask24g)", %%mm6        \n\t"
1364
                "movq "MANGLE(mask24b)", %%mm7        \n\t"
1365
                ".balign 16                        \n\t"
1366
                "1:                                \n\t"
1367
                PREFETCH" 32(%1, %%eax)                \n\t"
1368
                "movq   (%1, %%eax), %%mm0        \n\t" // BGR BGR BG
1369
                "movq   (%1, %%eax), %%mm1        \n\t" // BGR BGR BG
1370
                "movq  2(%1, %%eax), %%mm2        \n\t" // R BGR BGR B
1371
                "psllq $16, %%mm0                \n\t" // 00 BGR BGR
1372
                "pand %%mm5, %%mm0                \n\t"
1373
                "pand %%mm6, %%mm1                \n\t"
1374
                "pand %%mm7, %%mm2                \n\t"
1375
                "por %%mm0, %%mm1                \n\t"
1376
                "por %%mm2, %%mm1                \n\t"                
1377
                "movq  6(%1, %%eax), %%mm0        \n\t" // BGR BGR BG
1378
                MOVNTQ" %%mm1,   (%2, %%eax)        \n\t" // RGB RGB RG
1379
                "movq  8(%1, %%eax), %%mm1        \n\t" // R BGR BGR B
1380
                "movq 10(%1, %%eax), %%mm2        \n\t" // GR BGR BGR
1381
                "pand %%mm7, %%mm0                \n\t"
1382
                "pand %%mm5, %%mm1                \n\t"
1383
                "pand %%mm6, %%mm2                \n\t"
1384
                "por %%mm0, %%mm1                \n\t"
1385
                "por %%mm2, %%mm1                \n\t"                
1386
                "movq 14(%1, %%eax), %%mm0        \n\t" // R BGR BGR B
1387
                MOVNTQ" %%mm1,  8(%2, %%eax)        \n\t" // B RGB RGB R
1388
                "movq 16(%1, %%eax), %%mm1        \n\t" // GR BGR BGR
1389
                "movq 18(%1, %%eax), %%mm2        \n\t" // BGR BGR BG
1390
                "pand %%mm6, %%mm0                \n\t"
1391
                "pand %%mm7, %%mm1                \n\t"
1392
                "pand %%mm5, %%mm2                \n\t"
1393
                "por %%mm0, %%mm1                \n\t"
1394
                "por %%mm2, %%mm1                \n\t"                
1395
                MOVNTQ" %%mm1, 16(%2, %%eax)        \n\t"
1396
                "addl $24, %%eax                \n\t"
1397
                " js 1b                                \n\t"
1398
                : "+a" (mmx_size)
1399
                : "r" (src-mmx_size), "r"(dst-mmx_size)
1400
        );
1401

    
1402
        __asm __volatile(SFENCE:::"memory");
1403
        __asm __volatile(EMMS:::"memory");
1404

    
1405
        if(mmx_size==23) return; //finihsed, was multiple of 8
1406

    
1407
        src+= src_size;
1408
        dst+= src_size;
1409
        src_size= 23-mmx_size;
1410
        src-= src_size;
1411
        dst-= src_size;
1412
#endif
1413
        for(i=0; i<src_size; i+=3)
1414
        {
1415
                register uint8_t x;
1416
                x          = src[i + 2];
1417
                dst[i + 1] = src[i + 1];
1418
                dst[i + 2] = src[i + 0];
1419
                dst[i + 0] = x;
1420
        }
1421
}
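
/* Example: a 24bpp pixel stored as bytes [0x10,0x20,0x30] comes out as
 * [0x30,0x20,0x10]; the scalar tail loop above performs exactly this swap of
 * bytes 0 and 2 for the pixels the MMX loop did not cover. */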

static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
        unsigned y;
        const unsigned chromWidth= width>>1;
        for(y=0; y<height; y++)
        {
#ifdef HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory bandwidth anyway)
                asm volatile(
                        "xorl %%eax, %%eax                \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 32(%1, %%eax, 2)        \n\t"
                        PREFETCH" 32(%2, %%eax)                \n\t"
                        PREFETCH" 32(%3, %%eax)                \n\t"
                        "movq (%2, %%eax), %%mm0        \n\t" // U(0)
                        "movq %%mm0, %%mm2                \n\t" // U(0)
                        "movq (%3, %%eax), %%mm1        \n\t" // V(0)
                        "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)

                        "movq (%1, %%eax,2), %%mm3        \n\t" // Y(0)
                        "movq 8(%1, %%eax,2), %%mm5        \n\t" // Y(8)
                        "movq %%mm3, %%mm4                \n\t" // Y(0)
                        "movq %%mm5, %%mm6                \n\t" // Y(8)
                        "punpcklbw %%mm0, %%mm3                \n\t" // YUYV YUYV(0)
                        "punpckhbw %%mm0, %%mm4                \n\t" // YUYV YUYV(4)
                        "punpcklbw %%mm2, %%mm5                \n\t" // YUYV YUYV(8)
                        "punpckhbw %%mm2, %%mm6                \n\t" // YUYV YUYV(12)

                        MOVNTQ" %%mm3, (%0, %%eax, 4)        \n\t"
                        MOVNTQ" %%mm4, 8(%0, %%eax, 4)        \n\t"
                        MOVNTQ" %%mm5, 16(%0, %%eax, 4)        \n\t"
                        MOVNTQ" %%mm6, 24(%0, %%eax, 4)        \n\t"

                        "addl $8, %%eax                        \n\t"
                        "cmpl %4, %%eax                        \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
                        : "%eax"
                );
#else

#if defined ARCH_ALPHA && defined HAVE_MVI
#define pl2yuy2(n)                                        \
        y1 = yc[n];                                        \
        y2 = yc2[n];                                        \
        u = uc[n];                                        \
        v = vc[n];                                        \
        asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));        \
        asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));        \
        asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
        asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
        yuv1 = (u << 8) + (v << 24);                        \
        yuv2 = yuv1 + y2;                                \
        yuv1 += y1;                                        \
        qdst[n] = yuv1;                                        \
        qdst2[n] = yuv2;

                int i;
                uint64_t *qdst = (uint64_t *) dst;
                uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
                const uint32_t *yc = (uint32_t *) ysrc;
                const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
                const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
                for(i = 0; i < chromWidth; i += 8){
                        uint64_t y1, y2, yuv1, yuv2;
                        uint64_t u, v;
                        /* Prefetch */
                        asm("ldq $31,64(%0)" :: "r"(yc));
                        asm("ldq $31,64(%0)" :: "r"(yc2));
                        asm("ldq $31,64(%0)" :: "r"(uc));
                        asm("ldq $31,64(%0)" :: "r"(vc));

                        pl2yuy2(0);
                        pl2yuy2(1);
                        pl2yuy2(2);
                        pl2yuy2(3);

                        yc += 4;
                        yc2 += 4;
                        uc += 4;
                        vc += 4;
                        qdst += 4;
                        qdst2 += 4;
                }
                y++;
                ysrc += lumStride;
                dst += dstStride;

#elif __WORDSIZE >= 64
                int i;
                uint64_t *ldst = (uint64_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i += 2){
                        uint64_t k, l;
                        k = yc[0] + (uc[0] << 8) +
                            (yc[1] << 16) + (vc[0] << 24);
                        l = yc[2] + (uc[1] << 8) +
                            (yc[3] << 16) + (vc[1] << 24);
                        *ldst++ = k + (l << 32);
                        yc += 4;
                        uc += 2;
                        vc += 2;
                }

#else
                int i, *idst = (int32_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
                        *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                            (yc[1] << 8) + (vc[0] << 0);
#else
                        *idst++ = yc[0] + (uc[0] << 8) +
                            (yc[1] << 16) + (vc[0] << 24);
#endif
                        yc += 2;
                        uc++;
                        vc++;
                }
#endif
#endif
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
                {
                        usrc += chromStride;
                        vsrc += chromStride;
                }
                ysrc += lumStride;
                dst += dstStride;
        }
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
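
/* What the packer above produces, byte by byte: each pair of luma samples is
 * interleaved with one U and one V sample as Y0 U0 Y1 V0 (the YUY2/YUYV
 * ordering), which is exactly the little-endian word built in the C fallback,
 * yc[0] + (uc[0]<<8) + (yc[1]<<16) + (vc[0]<<24). The chroma pointers only
 * advance once every vertLumPerChroma luma lines, so a value of 2 reads 4:2:0
 * input and a value of 1 reads 4:2:2 input. */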

/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int dstStride)
{
        //FIXME interpolate chroma
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
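
/* Illustrative call, not taken from the original code: for a tightly packed
 * width x height YV12 frame (planes y, u, v) converted into the buffer out,
 * plausible stride arguments would be
 *
 *   RENAME(yv12toyuy2)(y, u, v, out, width, height,
 *                      width,        // lumStride: one byte per luma sample
 *                      width/2,      // chromStride: half-width chroma planes
 *                      width*2);     // dstStride: YUY2 uses 2 bytes per pixel
 *
 * subject to the width/height constraints documented above. */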

static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
        unsigned y;
        const unsigned chromWidth= width>>1;
        for(y=0; y<height; y++)
        {
#ifdef HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory bandwidth anyway)
                asm volatile(
                        "xorl %%eax, %%eax                \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 32(%1, %%eax, 2)        \n\t"
                        PREFETCH" 32(%2, %%eax)                \n\t"
                        PREFETCH" 32(%3, %%eax)                \n\t"
                        "movq (%2, %%eax), %%mm0        \n\t" // U(0)
                        "movq %%mm0, %%mm2                \n\t" // U(0)
                        "movq (%3, %%eax), %%mm1        \n\t" // V(0)
                        "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)

                        "movq (%1, %%eax,2), %%mm3        \n\t" // Y(0)
                        "movq 8(%1, %%eax,2), %%mm5        \n\t" // Y(8)
                        "movq %%mm0, %%mm4                \n\t" // Y(0)
                        "movq %%mm2, %%mm6                \n\t" // Y(8)
                        "punpcklbw %%mm3, %%mm0                \n\t" // YUYV YUYV(0)
                        "punpckhbw %%mm3, %%mm4                \n\t" // YUYV YUYV(4)
                        "punpcklbw %%mm5, %%mm2                \n\t" // YUYV YUYV(8)
                        "punpckhbw %%mm5, %%mm6                \n\t" // YUYV YUYV(12)

                        MOVNTQ" %%mm0, (%0, %%eax, 4)        \n\t"
                        MOVNTQ" %%mm4, 8(%0, %%eax, 4)        \n\t"
                        MOVNTQ" %%mm2, 16(%0, %%eax, 4)        \n\t"
                        MOVNTQ" %%mm6, 24(%0, %%eax, 4)        \n\t"

                        "addl $8, %%eax                        \n\t"
                        "cmpl %4, %%eax                        \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
                        : "%eax"
                );
#else
//FIXME adapt the alpha asm code from yv12->yuy2

#if __WORDSIZE >= 64
                int i;
                uint64_t *ldst = (uint64_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i += 2){
                        uint64_t k, l;
                        k = uc[0] + (yc[0] << 8) +
                            (vc[0] << 16) + (yc[1] << 24);
                        l = uc[1] + (yc[2] << 8) +
                            (vc[1] << 16) + (yc[3] << 24);
                        *ldst++ = k + (l << 32);
                        yc += 4;
                        uc += 2;
                        vc += 2;
                }

#else
                int i, *idst = (int32_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
                        *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
                            (vc[0] << 8) + (yc[1] << 0);
#else
                        *idst++ = uc[0] + (yc[0] << 8) +
                            (vc[0] << 16) + (yc[1] << 24);
#endif
                        yc += 2;
                        uc++;
                        vc++;
                }
#endif
#endif
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
                {
                        usrc += chromStride;
                        vsrc += chromStride;
                }
                ysrc += lumStride;
                dst += dstStride;
        }
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
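
/* As above, but with the UYVY byte order: the C fallback builds
 * uc[0] + (yc[0]<<8) + (vc[0]<<16) + (yc[1]<<24), i.e. U0 Y0 V0 Y1 in memory,
 * which is what the MMX path stores as well (the punpck operands are swapped
 * relative to the YUY2 packer so that chroma ends up in the even bytes). */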

/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int dstStride)
{
        //FIXME interpolate chroma
        RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/**
 *
 * width should be a multiple of 16
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int dstStride)
{
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int srcStride)
{
        unsigned y;
        const unsigned chromWidth= width>>1;
        for(y=0; y<height; y+=2)
        {
#ifdef HAVE_MMX
                asm volatile(
                        "xorl %%eax, %%eax                \n\t"
                        "pcmpeqw %%mm7, %%mm7                \n\t"
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
                        "movq %%mm0, %%mm2                \n\t" // YUYV YUYV(0)
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(4)
                        "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)

                        MOVNTQ" %%mm2, (%1, %%eax, 2)        \n\t"

                        "movq 16(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(8)
                        "movq 24(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(12)
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(8)
                        "movq %%mm2, %%mm4                \n\t" // YUYV YUYV(12)
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
                        "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
                        "pand %%mm7, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm3, 8(%1, %%eax, 2)        \n\t"

                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)

                        MOVNTQ" %%mm0, (%3, %%eax)        \n\t"
                        MOVNTQ" %%mm2, (%2, %%eax)        \n\t"

                        "addl $8, %%eax                        \n\t"
                        "cmpl %4, %%eax                        \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
                        : "memory", "%eax"
                );

                ydst += lumStride;
                src  += srcStride;

                asm volatile(
                        "xorl %%eax, %%eax                \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
                        "movq 16(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(8)
                        "movq 24(%0, %%eax, 4), %%mm3        \n\t" // YUYV YUYV(12)
                        "pand %%mm7, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
                        "pand %%mm7, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm0, (%1, %%eax, 2)        \n\t"
                        MOVNTQ" %%mm2, 8(%1, %%eax, 2)        \n\t"

                        "addl $8, %%eax                        \n\t"
                        "cmpl %4, %%eax                        \n\t"
                        " jb 1b                                \n\t"

                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
                        : "memory", "%eax"
                );
#else
                unsigned i;
                for(i=0; i<chromWidth; i++)
                {
                        ydst[2*i+0]         = src[4*i+0];
                        udst[i]         = src[4*i+1];
                        ydst[2*i+1]         = src[4*i+2];
                        vdst[i]         = src[4*i+3];
                }
                ydst += lumStride;
                src  += srcStride;

                for(i=0; i<chromWidth; i++)
                {
                        ydst[2*i+0]         = src[4*i+0];
                        ydst[2*i+1]         = src[4*i+2];
                }
#endif
                udst += chromStride;
                vdst += chromStride;
                ydst += lumStride;
                src  += srcStride;
        }
#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}
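
/* The reverse direction of the packers above: each 4-byte YUYV group
 * [Y0 U0 Y1 V0] contributes two luma bytes and one U/V pair; on the second
 * line of each pair only the luma bytes are kept, so chroma is taken from the
 * first line of every pair and the two lines' chroma is not averaged. */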

static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
        uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        unsigned int width, unsigned int height, int lumStride, int chromStride)
{
        /* Y Plane */
        memcpy(ydst, ysrc, width*height);

        /* XXX: implement upscaling for U,V */
}
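
/* Note on the missing chroma step: YVU9 stores one chroma sample per 4x4 luma
 * block while YV12 stores one per 2x2 block, so completing this conversion
 * would mean upscaling each chroma plane from width/4 x height/4 to
 * width/2 x height/2, i.e. by 2x in each direction (for example with a 2x
 * planar upscaler such as planar2x() below). */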

static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
        int x,y;

        dst[0]= src[0];

        // first line
        for(x=0; x<srcWidth-1; x++){
                dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
                dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
        }
        dst[2*srcWidth-1]= src[srcWidth-1];

        dst+= dstStride;

        for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                const int mmxSize= srcWidth&~15;
                asm volatile(
                        "movl %4, %%eax                        \n\t"
                        "1:                                \n\t"
                        "movq (%0, %%eax), %%mm0        \n\t"
                        "movq (%1, %%eax), %%mm1        \n\t"
                        "movq 1(%0, %%eax), %%mm2        \n\t"
                        "movq 1(%1, %%eax), %%mm3        \n\t"
                        "movq -1(%0, %%eax), %%mm4        \n\t"
                        "movq -1(%1, %%eax), %%mm5        \n\t"
                        PAVGB" %%mm0, %%mm5                \n\t"
                        PAVGB" %%mm0, %%mm3                \n\t"
                        PAVGB" %%mm0, %%mm5                \n\t"
                        PAVGB" %%mm0, %%mm3                \n\t"
                        PAVGB" %%mm1, %%mm4                \n\t"
                        PAVGB" %%mm1, %%mm2                \n\t"
                        PAVGB" %%mm1, %%mm4                \n\t"
                        PAVGB" %%mm1, %%mm2                \n\t"
                        "movq %%mm5, %%mm7                \n\t"
                        "movq %%mm4, %%mm6                \n\t"
                        "punpcklbw %%mm3, %%mm5                \n\t"
                        "punpckhbw %%mm3, %%mm7                \n\t"
                        "punpcklbw %%mm2, %%mm4                \n\t"
                        "punpckhbw %%mm2, %%mm6                \n\t"
#if 1
                        MOVNTQ" %%mm5, (%2, %%eax, 2)        \n\t"
                        MOVNTQ" %%mm7, 8(%2, %%eax, 2)        \n\t"
                        MOVNTQ" %%mm4, (%3, %%eax, 2)        \n\t"
                        MOVNTQ" %%mm6, 8(%3, %%eax, 2)        \n\t"
#else
                        "movq %%mm5, (%2, %%eax, 2)        \n\t"
                        "movq %%mm7, 8(%2, %%eax, 2)        \n\t"
                        "movq %%mm4, (%3, %%eax, 2)        \n\t"
                        "movq %%mm6, 8(%3, %%eax, 2)        \n\t"
#endif
                        "addl $8, %%eax                        \n\t"
                        " js 1b                                \n\t"
                        :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
                           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
                           "g" (-mmxSize)
                        : "%eax"

                );
#else
                const int mmxSize=1;
#endif
                dst[0        ]= (3*src[0] +   src[srcStride])>>2;
                dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;

                for(x=mmxSize-1; x<srcWidth-1; x++){
                        dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
                        dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
                        dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
                        dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
                }
                dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
                dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

                dst+=dstStride*2;
                src+=srcStride;
        }

        // last line
#if 1
        dst[0]= src[0];

        for(x=0; x<srcWidth-1; x++){
                dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
                dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
        }
        dst[2*srcWidth-1]= src[srcWidth-1];
#else
        for(x=0; x<srcWidth; x++){
                dst[2*x+0]=
                dst[2*x+1]= src[x];
        }
#endif

#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}
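
/* Interpolation weights used above, as a worked example: for neighbouring
 * source samples a and b, the two output samples between them are
 * (3*a + b) >> 2 and (a + 3*b) >> 2, i.e. 3/4:1/4 and 1/4:3/4 blends;
 * with a = 100 and b = 200 this yields 125 and 175. */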

/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * chrominance data is only taken from every second line, others are ignored. FIXME: write HQ version
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int srcStride)
{
        unsigned y;
        const unsigned chromWidth= width>>1;
        for(y=0; y<height; y+=2)
        {
#ifdef HAVE_MMX
                asm volatile(
                        "xorl %%eax, %%eax                \n\t"
                        "pcmpeqw %%mm7, %%mm7                \n\t"
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // UYVY UYVY(0)
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(4)
                        "movq %%mm0, %%mm2                \n\t" // UYVY UYVY(0)
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(4)
                        "pand %%mm7, %%mm0                \n\t" // U0V0 U0V0(0)
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(4)
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)

                        MOVNTQ" %%mm2, (%1, %%eax, 2)        \n\t"

                        "movq 16(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(8)
                        "movq 24(%0, %%eax, 4), %%mm2        \n\t" // UYVY UYVY(12)
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(8)
                        "movq %%mm2, %%mm4                \n\t" // UYVY UYVY(12)
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(8)
                        "pand %%mm7, %%mm2                \n\t" // U0V0 U0V0(12)
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
                        "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm3, 8(%1, %%eax, 2)        \n\t"

                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)

                        MOVNTQ" %%mm0, (%3, %%eax)        \n\t"
                        MOVNTQ" %%mm2, (%2, %%eax)        \n\t"

                        "addl $8, %%eax                        \n\t"
                        "cmpl %4, %%eax                        \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
                        : "memory", "%eax"
                );

                ydst += lumStride;
                src  += srcStride;

                asm volatile(
                        "xorl %%eax, %%eax                \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
                        "movq 16(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(8)
                        "movq 24(%0, %%eax, 4), %%mm3        \n\t" // YUYV YUYV(12)
                        "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
                        "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm0, (%1, %%eax, 2)        \n\t"
                        MOVNTQ" %%mm2, 8(%1, %%eax, 2)        \n\t"

                        "addl $8, %%eax                        \n\t"
                        "cmpl %4, %%eax                        \n\t"
                        " jb 1b                                \n\t"

                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
                        : "memory", "%eax"
                );
#else
                unsigned i;
                for(i=0; i<chromWidth; i++)
                {
                        udst[i]         = src[4*i+0];
                        ydst[2*i+0]         = src[4*i+1];
                        vdst[i]         = src[4*i+2];
                        ydst[2*i+1]         = src[4*i+3];
                }
                ydst += lumStride;
                src  += srcStride;

                for(i=0; i<chromWidth; i++)
                {
                        ydst[2*i+0]         = src[4*i+1];
                        ydst[2*i+1]         = src[4*i+3];
                }
#endif
                udst += chromStride;
                vdst += chromStride;
                ydst += lumStride;
                src  += srcStride;
        }
#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}
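
/* Same structure as yuy2toyv12() above, with the roles of the mask and the
 * shift swapped: in UYVY the chroma bytes sit in the even positions, so
 * "pand %%mm7" (mask 0x00FF...) extracts U/V and "psrlw $8" extracts luma,
 * the opposite of the YUY2 case. */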

/**
 *
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * chrominance data is only taken from every second line, others are ignored in the C version. FIXME: write HQ version
 */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int srcStride)
{
        unsigned y;
        const unsigned chromWidth= width>>1;
#ifdef HAVE_MMX
        for(y=0; y<height-2; y+=2)
        {
                unsigned i;
                for(i=0; i<2; i++)
                {
                        asm volatile(
                                "movl %2, %%eax                        \n\t"
                                "movq "MANGLE(bgr2YCoeff)", %%mm6                \n\t"
                                "movq "MANGLE(w1111)", %%mm5                \n\t"
                                "pxor %%mm7, %%mm7                \n\t"
                                "leal (%%eax, %%eax, 2), %%ebx        \n\t"
                                ".balign 16                        \n\t"
                                "1:                                \n\t"
                                PREFETCH" 64(%0, %%ebx)                \n\t"
                                "movd (%0, %%ebx), %%mm0        \n\t"
                                "movd 3(%0, %%ebx), %%mm1        \n\t"
                                "punpcklbw %%mm7, %%mm0                \n\t"
                                "punpcklbw %%mm7, %%mm1                \n\t"
                                "movd 6(%0, %%ebx), %%mm2        \n\t"
                                "movd 9(%0, %%ebx), %%mm3        \n\t"
                                "punpcklbw %%mm7, %%mm2                \n\t"
                                "punpcklbw %%mm7, %%mm3                \n\t"
                                "pmaddwd %%mm6, %%mm0                \n\t"
                                "pmaddwd %%mm6, %%mm1                \n\t"
                                "pmaddwd %%mm6, %%mm2                \n\t"
                                "pmaddwd %%mm6, %%mm3                \n\t"
#ifndef FAST_BGR2YV12
                                "psrad $8, %%mm0                \n\t"
                                "psrad $8, %%mm1                \n\t"
                                "psrad $8, %%mm2                \n\t"
                                "psrad $8, %%mm3                \n\t"
#endif
                                "packssdw %%mm1, %%mm0                \n\t"
                                "packssdw %%mm3, %%mm2                \n\t"
                                "pmaddwd %%mm5, %%mm0                \n\t"
                                "pmaddwd %%mm5, %%mm2                \n\t"
                                "packssdw %%mm2, %%mm0                \n\t"
                                "psraw $7, %%mm0                \n\t"

                                "movd 12(%0, %%ebx), %%mm4        \n\t"
                                "movd 15(%0, %%ebx), %%mm1        \n\t"
                                "punpcklbw %%mm7, %%mm4                \n\t"
                                "punpcklbw %%mm7, %%mm1                \n\t"
                                "movd 18(%0, %%ebx), %%mm2        \n\t"
                                "movd 21(%0, %%ebx), %%mm3        \n\t"
                                "punpcklbw %%mm7, %%mm2                \n\t"
                                "punpcklbw %%mm7, %%mm3                \n\t"
                                "pmaddwd %%mm6, %%mm4                \n\t"
                                "pmaddwd %%mm6, %%mm1                \n\t"
                                "pmaddwd %%mm6, %%mm2                \n\t"
                                "pmaddwd %%mm6, %%mm3                \n\t"
#ifndef FAST_BGR2YV12
                                "psrad $8, %%mm4                \n\t"
                                "psrad $8, %%mm1                \n\t"
                                "psrad $8, %%mm2                \n\t"
                                "psrad $8, %%mm3                \n\t"
#endif
                                "packssdw %%mm1, %%mm4                \n\t"
                                "packssdw %%mm3, %%mm2                \n\t"
                                "pmaddwd %%mm5, %%mm4                \n\t"
                                "pmaddwd %%mm5, %%mm2                \n\t"
                                "addl $24, %%ebx                \n\t"
                                "packssdw %%mm2, %%mm4                \n\t"
                                "psraw $7, %%mm4                \n\t"

                                "packuswb %%mm4, %%mm0                \n\t"
                                "paddusb "MANGLE(bgr2YOffset)", %%mm0        \n\t"

                                MOVNTQ" %%mm0, (%1, %%eax)        \n\t"
                                "addl $8, %%eax                        \n\t"
                                " js 1b                                \n\t"
                                : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
                                : "%eax", "%ebx"
                        );
                        ydst += lumStride;
                        src  += srcStride;
                }
                src -= srcStride*2;
                asm volatile(
                        "movl %4, %%eax                        \n\t"
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
                        "movq "MANGLE(bgr2UCoeff)", %%mm6                \n\t"
                        "pxor %%mm7, %%mm7                \n\t"
                        "leal (%%eax, %%eax, 2), %%ebx        \n\t"
                        "addl %%ebx, %%ebx                \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%ebx)                \n\t"
                        PREFETCH" 64(%1, %%ebx)                \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                        "movq (%0, %%ebx), %%mm0        \n\t"
                        "movq (%1, %%ebx), %%mm1        \n\t"
                        "movq 6(%0, %%ebx), %%mm2        \n\t"
                        "movq 6(%1, %%ebx), %%mm3        \n\t"
                        PAVGB" %%mm1, %%mm0                \n\t"
                        PAVGB" %%mm3, %%mm2                \n\t"
                        "movq %%mm0, %%mm1                \n\t"
                        "movq %%mm2, %%mm3                \n\t"
                        "psrlq $24, %%mm0                \n\t"
                        "psrlq $24, %%mm2                \n\t"
                        PAVGB" %%mm1, %%mm0                \n\t"
                        PAVGB" %%mm3, %%mm2                \n\t"
                        "punpcklbw %%mm7, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
#else
                        "movd (%0, %%ebx), %%mm0        \n\t"
                        "movd (%1, %%ebx), %%mm1        \n\t"
                        "movd 3(%0, %%ebx), %%mm2        \n\t"
                        "movd 3(%1, %%ebx), %%mm3        \n\t"
                        "punpcklbw %%mm7, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm1                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "punpcklbw %%mm7, %%mm3                \n\t"
                        "paddw %%mm1, %%mm0                \n\t"
                        "paddw %%mm3, %%mm2                \n\t"
                        "paddw %%mm2, %%mm0                \n\t"
                        "movd 6(%0, %%ebx), %%mm4        \n\t"
                        "movd 6(%1, %%ebx), %%mm1        \n\t"
                        "movd 9(%0, %%ebx), %%mm2        \n\t"
                        "movd 9(%1, %%ebx), %%mm3        \n\t"
                        "punpcklbw %%mm7, %%mm4                \n\t"
                        "punpcklbw %%mm7, %%mm1                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "punpcklbw %%mm7, %%mm3                \n\t"
                        "paddw %%mm1, %%mm4                \n\t"
                        "paddw %%mm3, %%mm2                \n\t"
                        "paddw %%mm4, %%mm2                \n\t"
                        "psrlw $2, %%mm0                \n\t"
                        "psrlw $2, %%mm2                \n\t"
#endif
                        "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
                        "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"

                        "pmaddwd %%mm0, %%mm1                \n\t"
                        "pmaddwd %%mm2, %%mm3                \n\t"
                        "pmaddwd %%mm6, %%mm0                \n\t"
                        "pmaddwd %%mm6, %%mm2                \n\t"
#ifndef FAST_BGR2YV12
                        "psrad $8, %%mm0                \n\t"
                        "psrad $8, %%mm1                \n\t"
                        "psrad $8, %%mm2                \n\t"
                        "psrad $8, %%mm3                \n\t"
#endif
                        "packssdw %%mm2, %%mm0                \n\t"
                        "packssdw %%mm3, %%mm1                \n\t"
                        "pmaddwd %%mm5, %%mm0                \n\t"
                        "pmaddwd %%mm5, %%mm1                \n\t"
                        "packssdw %%mm1, %%mm0                \n\t" // V1 V0 U1 U0
                        "psraw $7, %%mm0                \n\t"

#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                        "movq 12(%0, %%ebx), %%mm4        \n\t"
                        "movq 12(%1, %%ebx), %%mm1        \n\t"
                        "movq 18(%0, %%ebx), %%mm2        \n\t"
                        "movq 18(%1, %%ebx), %%mm3        \n\t"
                        PAVGB" %%mm1, %%mm4                \n\t"
                        PAVGB" %%mm3, %%mm2                \n\t"
                        "movq %%mm4, %%mm1                \n\t"
                        "movq %%mm2, %%mm3                \n\t"
                        "psrlq $24, %%mm4                \n\t"
                        "psrlq $24, %%mm2                \n\t"
                        PAVGB" %%mm1, %%mm4                \n\t"
                        PAVGB" %%mm3, %%mm2                \n\t"
                        "punpcklbw %%mm7, %%mm4                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
#else
                        "movd 12(%0, %%ebx), %%mm4        \n\t"
                        "movd 12(%1, %%ebx), %%mm1        \n\t"
                        "movd 15(%0, %%ebx), %%mm2        \n\t"
                        "movd 15(%1, %%ebx), %%mm3        \n\t"
                        "punpcklbw %%mm7, %%mm4                \n\t"
                        "punpcklbw %%mm7, %%mm1                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "punpcklbw %%mm7, %%mm3                \n\t"
                        "paddw %%mm1, %%mm4                \n\t"
                        "paddw %%mm3, %%mm2                \n\t"
                        "paddw %%mm2, %%mm4                \n\t"
                        "movd 18(%0, %%ebx), %%mm5        \n\t"
                        "movd 18(%1, %%ebx), %%mm1        \n\t"
                        "movd 21(%0, %%ebx), %%mm2        \n\t"
                        "movd 21(%1, %%ebx), %%mm3        \n\t"
                        "punpcklbw %%mm7, %%mm5                \n\t"
                        "punpcklbw %%mm7, %%mm1                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "punpcklbw %%mm7, %%mm3                \n\t"
                        "paddw %%mm1, %%mm5                \n\t"
                        "paddw %%mm3, %%mm2                \n\t"
                        "paddw %%mm5, %%mm2                \n\t"
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
                        "psrlw $2, %%mm4                \n\t"
                        "psrlw $2, %%mm2                \n\t"
#endif
                        "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
                        "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"

                        "pmaddwd %%mm4, %%mm1                \n\t"
                        "pmaddwd %%mm2, %%mm3                \n\t"
                        "pmaddwd %%mm6, %%mm4                \n\t"
                        "pmaddwd %%mm6, %%mm2                \n\t"
#ifndef FAST_BGR2YV12
                        "psrad $8, %%mm4                \n\t"
                        "psrad $8, %%mm1                \n\t"
                        "psrad $8, %%mm2                \n\t"
                        "psrad $8, %%mm3                \n\t"
#endif
                        "packssdw %%mm2, %%mm4                \n\t"
                        "packssdw %%mm3, %%mm1                \n\t"
                        "pmaddwd %%mm5, %%mm4                \n\t"
                        "pmaddwd %%mm5, %%mm1                \n\t"
                        "addl $24, %%ebx                \n\t"
                        "packssdw %%mm1, %%mm4                \n\t" // V3 V2 U3 U2
                        "psraw $7, %%mm4                \n\t"

                        "movq %%mm0, %%mm1                \n\t"
                        "punpckldq %%mm4, %%mm0                \n\t"
                        "punpckhdq %%mm4, %%mm1                \n\t"
                        "packsswb %%mm1, %%mm0                \n\t"
                        "paddb "MANGLE(bgr2UVOffset)", %%mm0        \n\t"

                        "movd %%mm0, (%2, %%eax)        \n\t"
                        "punpckhdq %%mm0, %%mm0                \n\t"
                        "movd %%mm0, (%3, %%eax)        \n\t"
                        "addl $4, %%eax                        \n\t"
                        " js 1b                                \n\t"
                        : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
                        : "%eax", "%ebx"
                );

                udst += chromStride;
                vdst += chromStride;
                src  += srcStride*2;
        }

        asm volatile(   EMMS" \n\t"
                        SFENCE" \n\t"
                        :::"memory");
#else
        y=0;
#endif
        for(; y<height; y+=2)
        {
                unsigned i;
                for(i=0; i<chromWidth; i++)
                {
                        unsigned int b= src[6*i+0];
                        unsigned int g= src[6*i+1];
                        unsigned int r= src[6*i+2];

                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
                        unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
                        unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;

                        udst[i]         = U;
                        vdst[i]         = V;
                        ydst[2*i]         = Y;

                        b= src[6*i+3];
                        g= src[6*i+4];
                        r= src[6*i+5];

                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
                        ydst[2*i+1]         = Y;
                }
                ydst += lumStride;
                src  += srcStride;

                for(i=0; i<chromWidth; i++)
                {
                        unsigned int b= src[6*i+0];
                        unsigned int g= src[6*i+1];
                        unsigned int r= src[6*i+2];

                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;

                        ydst[2*i]         = Y;

                        b= src[6*i+3];
                        g= src[6*i+4];
                        r= src[6*i+5];

                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
                        ydst[2*i+1]         = Y;
                }
                udst += chromStride;
                vdst += chromStride;
                ydst += lumStride;
                src  += srcStride;
        }
}
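
/* Worked example for the scalar path above, assuming the usual BT.601-style
 * values behind RY/GY/BY and RGB2YUV_SHIFT (they are defined elsewhere in
 * this code, so the exact numbers here are an assumption): black (0,0,0)
 * gives Y = 0 + 16 = 16, and white (255,255,255) lands at roughly 235, the
 * limited-range luma maximum; U and V are centred on 128 by the +128 terms. */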

void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
                            unsigned width, unsigned height, int src1Stride,
                            int src2Stride, int dstStride){
        unsigned h;

        for(h=0; h < height; h++)
        {
                unsigned w;

#ifdef HAVE_MMX
#ifdef HAVE_SSE2
                asm(
                        "xorl %%eax, %%eax                \n\t"
                        "1:                                \n\t"
                        PREFETCH" 64(%1, %%eax)                \n\t"
                        PREFETCH" 64(%2, %%eax)                \n\t"
                        "movdqa (%1, %%eax), %%xmm0        \n\t"
                        "movdqa (%1, %%eax), %%xmm1        \n\t"
                        "movdqa (%2, %%eax), %%xmm2        \n\t"
                        "punpcklbw %%xmm2, %%xmm0        \n\t"
                        "punpckhbw %%xmm2, %%xmm1        \n\t"
                        "movntdq %%xmm0, (%0, %%eax, 2)        \n\t"
                        "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
                        "addl $16, %%eax                        \n\t"
                        "cmpl %3, %%eax                        \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
                        : "memory", "%eax"
                );
#else
                asm(
                        "xorl %%eax, %%eax                \n\t"
                        "1:                                \n\t"
                        PREFETCH" 64(%1, %%eax)                \n\t"
                        PREFETCH" 64(%2, %%eax)                \n\t"
                        "movq (%1, %%eax), %%mm0        \n\t"
                        "movq 8(%1, %%eax), %%mm2        \n\t"
                        "movq %%mm0, %%mm1                \n\t"
                        "movq %%mm2, %%mm3                \n\t"
                        "movq (%2, %%eax), %%mm4        \n\t"
                        "movq 8(%2, %%eax), %%mm5        \n\t"
                        "punpcklbw %%mm4, %%mm0                \n\t"
                        "punpckhbw %%mm4, %%mm1                \n\t"
                        "punpcklbw %%mm5, %%mm2                \n\t"
                        "punpckhbw %%mm5, %%mm3                \n\t"
                        MOVNTQ" %%mm0, (%0, %%eax, 2)        \n\t"
                        MOVNTQ" %%mm1, 8(%0, %%eax, 2)        \n\t"
                        MOVNTQ" %%mm2, 16(%0, %%eax, 2)        \n\t"
                        MOVNTQ" %%mm3, 24(%0, %%eax, 2)        \n\t"
                        "addl $16, %%eax                        \n\t"
                        "cmpl %3, %%eax                        \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
                        : "memory", "%eax"
                );
#endif
                for(w= (width&(~15)); w < width; w++)
                {
                        dest[2*w+0] = src1[w];
                        dest[2*w+1] = src2[w];
                }
#else
                for(w=0; w < width; w++)
                {
                        dest[2*w+0] = src1[w];
                        dest[2*w+1] = src2[w];
                }
#endif
                dest += dstStride;
                src1 += src1Stride;
                src2 += src2Stride;
        }
#ifdef HAVE_MMX
        asm(
                EMMS" \n\t"
                SFENCE" \n\t"
                ::: "memory"
                );
#endif
}
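
/* Example: with src1 = {1,2,3} and src2 = {9,8,7}, one output row becomes
 * {1,9,2,8,3,7}. This is a generic byte interleaver; it can be used, for
 * example, to merge two separate chroma planes into a single interleaved
 * U/V plane. */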
2440

    
2441
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                        uint8_t *dst1, uint8_t *dst2,
                        unsigned width, unsigned height,
                        int srcStride1, int srcStride2,
                        int dstStride1, int dstStride2)
{
    unsigned int y,x,h;
    int w;
    w=width/2; h=height/2;
#ifdef HAVE_MMX
    asm volatile(
        PREFETCH" %0\n\t"
        PREFETCH" %1\n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    for(y=0;y<h;y++){
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x=0;
#ifdef HAVE_MMX
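        /* 32 source bytes per iteration; punpck*bw of a register with itself
           duplicates every byte, producing 64 horizontally doubled bytes */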
        for(;x<w-31;x+=32)
        {
            asm volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        8%1, %%mm2\n\t"
                "movq        16%1, %%mm4\n\t"
                "movq        24%1, %%mm6\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm2, %%mm3\n\t"
                "movq        %%mm4, %%mm5\n\t"
                "movq        %%mm6, %%mm7\n\t"
                "punpcklbw %%mm0, %%mm0\n\t"
                "punpckhbw %%mm1, %%mm1\n\t"
                "punpcklbw %%mm2, %%mm2\n\t"
                "punpckhbw %%mm3, %%mm3\n\t"
                "punpcklbw %%mm4, %%mm4\n\t"
                "punpckhbw %%mm5, %%mm5\n\t"
                "punpcklbw %%mm6, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm7\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm1, 8%0\n\t"
                MOVNTQ"        %%mm2, 16%0\n\t"
                MOVNTQ"        %%mm3, 24%0\n\t"
                MOVNTQ"        %%mm4, 32%0\n\t"
                MOVNTQ"        %%mm5, 40%0\n\t"
                MOVNTQ"        %%mm6, 48%0\n\t"
                MOVNTQ"        %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
#endif
        for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
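    /* same 2x doubling for the second source plane */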
    for(y=0;y<h;y++){
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
#ifdef HAVE_MMX
        for(;x<w-31;x+=32)
        {
            asm volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        8%1, %%mm2\n\t"
                "movq        16%1, %%mm4\n\t"
                "movq        24%1, %%mm6\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm2, %%mm3\n\t"
                "movq        %%mm4, %%mm5\n\t"
                "movq        %%mm6, %%mm7\n\t"
                "punpcklbw %%mm0, %%mm0\n\t"
                "punpckhbw %%mm1, %%mm1\n\t"
                "punpcklbw %%mm2, %%mm2\n\t"
                "punpckhbw %%mm3, %%mm3\n\t"
                "punpcklbw %%mm4, %%mm4\n\t"
                "punpckhbw %%mm5, %%mm5\n\t"
                "punpcklbw %%mm6, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm7\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm1, 8%0\n\t"
                MOVNTQ"        %%mm2, 16%0\n\t"
                MOVNTQ"        %%mm3, 24%0\n\t"
                MOVNTQ"        %%mm4, 32%0\n\t"
                MOVNTQ"        %%mm5, 40%0\n\t"
                MOVNTQ"        %%mm6, 48%0\n\t"
                MOVNTQ"        %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
#endif
        for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#ifdef HAVE_MMX
        asm(
                EMMS" \n\t"
                SFENCE" \n\t"
                ::: "memory"
                );
#endif
}

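/*
 * yvu9_to_yuy2: interleave a planar image with one chroma sample per 4x4 luma
 * block (YVU9-style, note the y>>2 row reuse below) into packed YUY2; each
 * chroma pair is shared by four consecutive luma samples.
 */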
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                        uint8_t *dst,
                        unsigned width, unsigned height,
                        int srcStride1, int srcStride2,
                        int srcStride3, int dstStride)
{
    unsigned y,x,w,h;
    w=width/2; h=height;
    for(y=0;y<h;y++){
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
#ifdef HAVE_MMX
        for(;x<w-7;x+=8)
        {
            asm volatile(
                PREFETCH" 32(%1, %0)\n\t"
                PREFETCH" 32(%2, %0)\n\t"
                PREFETCH" 32(%3, %0)\n\t"
                "movq        (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq        (%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
                "movq        (%3, %0), %%mm2\n\t"             /* V0V1V2V3V4V5V6V7 */
                "movq        %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq        %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
                "movq        %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq        %%mm1, %%mm6\n\t"
                "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ"        %%mm0, (%4, %0, 8)\n\t"
                MOVNTQ"        %%mm3, 8(%4, %0, 8)\n\t"

                "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq        8(%1, %0, 4), %%mm0\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ"        %%mm0, 16(%4, %0, 8)\n\t"
                MOVNTQ"        %%mm3, 24(%4, %0, 8)\n\t"

                "movq        %%mm4, %%mm6\n\t"
                "movq        16(%1, %0, 4), %%mm0\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "punpcklbw %%mm5, %%mm4\n\t"
                "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ"        %%mm0, 32(%4, %0, 8)\n\t"
                MOVNTQ"        %%mm3, 40(%4, %0, 8)\n\t"

                "punpckhbw %%mm5, %%mm6\n\t"
                "movq        24(%1, %0, 4), %%mm0\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ"        %%mm0, 48(%4, %0, 8)\n\t"
                MOVNTQ"        %%mm3, 56(%4, %0, 8)\n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
#endif
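        /* scalar tail/fallback: pack four luma samples and one chroma pair
           into two YUYV macropixels (8 bytes) per iteration */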
        for(; x<w; x++)
        {
            const int x2= x<<2;
            d[8*x+0]=yp[x2];
            d[8*x+1]=up[x];
            d[8*x+2]=yp[x2+1];
            d[8*x+3]=vp[x];
            d[8*x+4]=yp[x2+2];
            d[8*x+5]=up[x];
            d[8*x+6]=yp[x2+3];
            d[8*x+7]=vp[x];
        }
    }
#ifdef HAVE_MMX
        asm(
                EMMS" \n\t"
                SFENCE" \n\t"
                ::: "memory"
                );
#endif
}