Statistics
| Branch: | Revision:

ffmpeg / postproc / rgb2rgb_template.c @ caeaabe7

History | View | Annotate | Download (64.5 KB)

1
/*
2
 *
3
 *  rgb2rgb.c, Software RGB to RGB convertor
4
 *  extended by  Software PAL8 to RGB convertor
5
 *               Software YUV to YUV convertor
6
 *               Software YUV to RGB convertor
7
 *  Written by Nick Kurshev.
8
 *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9
 */
10

    
11
#include <stddef.h>
12
#include <inttypes.h> /* for __WORDSIZE */
13

    
14
/* Use the build-provided word size when the C library does not supply one. */
#if !defined(__WORDSIZE)
// #warning You have misconfigured system and probably will lose performance!
#define __WORDSIZE MP_WORDSIZE
#endif

/*
 * Drop any earlier definitions of the instruction-selection macros before
 * (re)defining them below from the HAVE_* feature switches.
 */
#undef PREFETCH
#undef PREFETCHW
#undef PAVGB
#undef MOVNTQ
#undef SFENCE
#undef EMMS
#undef MMREG_SIZE

/* Width in bytes of the widest SIMD register assumed to be available. */
#if defined(HAVE_SSE2)
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/*
 * Prefetch / byte-average / MMX-state-reset mnemonics.
 * Without 3DNow! or MMX2 the prefetch macros expand to "/nop" and PAVGB is
 * left undefined.
 */
#if defined(HAVE_3DNOW)
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS      "femms"
#else
#define EMMS      "emms"
#if defined(HAVE_MMX2)
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#define PREFETCH  "/nop"
#define PREFETCHW "/nop"
#endif
#endif

/* 64-bit store mnemonic and the matching store fence. */
#if defined(HAVE_MMX2)
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
60

    
61
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
62
{
63
  uint8_t *dest = dst;
64
  const uint8_t *s = src;
65
  const uint8_t *end;
66
#ifdef HAVE_MMX
67
  const uint8_t *mm_end;
68
#endif
69
  end = s + src_size;
70
#ifdef HAVE_MMX
71
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
72
  mm_end = end - 23;
73
  __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
74
  while(s < mm_end)
75
  {
76
    __asm __volatile(
77
        PREFETCH"        32%1\n\t"
78
        "movd        %1, %%mm0\n\t"
79
        "punpckldq 3%1, %%mm0\n\t"
80
        "movd        6%1, %%mm1\n\t"
81
        "punpckldq 9%1, %%mm1\n\t"
82
        "movd        12%1, %%mm2\n\t"
83
        "punpckldq 15%1, %%mm2\n\t"
84
        "movd        18%1, %%mm3\n\t"
85
        "punpckldq 21%1, %%mm3\n\t"
86
        "pand        %%mm7, %%mm0\n\t"
87
        "pand        %%mm7, %%mm1\n\t"
88
        "pand        %%mm7, %%mm2\n\t"
89
        "pand        %%mm7, %%mm3\n\t"
90
        MOVNTQ"        %%mm0, %0\n\t"
91
        MOVNTQ"        %%mm1, 8%0\n\t"
92
        MOVNTQ"        %%mm2, 16%0\n\t"
93
        MOVNTQ"        %%mm3, 24%0"
94
        :"=m"(*dest)
95
        :"m"(*s)
96
        :"memory");
97
    dest += 32;
98
    s += 24;
99
  }
100
  __asm __volatile(SFENCE:::"memory");
101
  __asm __volatile(EMMS:::"memory");
102
#endif
103
  while(s < end)
104
  {
105
    *dest++ = *s++;
106
    *dest++ = *s++;
107
    *dest++ = *s++;
108
    *dest++ = 0;
109
  }
110
}
111

    
112
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
113
{
114
  uint8_t *dest = dst;
115
  const uint8_t *s = src;
116
  const uint8_t *end;
117
#ifdef HAVE_MMX
118
  const uint8_t *mm_end;
119
#endif
120
  end = s + src_size;
121
#ifdef HAVE_MMX
122
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
123
  mm_end = end - 31;
124
  while(s < mm_end)
125
  {
126
    __asm __volatile(
127
        PREFETCH"        32%1\n\t"
128
        "movq        %1, %%mm0\n\t"
129
        "movq        8%1, %%mm1\n\t"
130
        "movq        16%1, %%mm4\n\t"
131
        "movq        24%1, %%mm5\n\t"
132
        "movq        %%mm0, %%mm2\n\t"
133
        "movq        %%mm1, %%mm3\n\t"
134
        "movq        %%mm4, %%mm6\n\t"
135
        "movq        %%mm5, %%mm7\n\t"
136
        "psrlq        $8, %%mm2\n\t"
137
        "psrlq        $8, %%mm3\n\t"
138
        "psrlq        $8, %%mm6\n\t"
139
        "psrlq        $8, %%mm7\n\t"
140
        "pand        %2, %%mm0\n\t"
141
        "pand        %2, %%mm1\n\t"
142
        "pand        %2, %%mm4\n\t"
143
        "pand        %2, %%mm5\n\t"
144
        "pand        %3, %%mm2\n\t"
145
        "pand        %3, %%mm3\n\t"
146
        "pand        %3, %%mm6\n\t"
147
        "pand        %3, %%mm7\n\t"
148
        "por        %%mm2, %%mm0\n\t"
149
        "por        %%mm3, %%mm1\n\t"
150
        "por        %%mm6, %%mm4\n\t"
151
        "por        %%mm7, %%mm5\n\t"
152

    
153
        "movq        %%mm1, %%mm2\n\t"
154
        "movq        %%mm4, %%mm3\n\t"
155
        "psllq        $48, %%mm2\n\t"
156
        "psllq        $32, %%mm3\n\t"
157
        "pand        %4, %%mm2\n\t"
158
        "pand        %5, %%mm3\n\t"
159
        "por        %%mm2, %%mm0\n\t"
160
        "psrlq        $16, %%mm1\n\t"
161
        "psrlq        $32, %%mm4\n\t"
162
        "psllq        $16, %%mm5\n\t"
163
        "por        %%mm3, %%mm1\n\t"
164
        "pand        %6, %%mm5\n\t"
165
        "por        %%mm5, %%mm4\n\t"
166

    
167
        MOVNTQ"        %%mm0, %0\n\t"
168
        MOVNTQ"        %%mm1, 8%0\n\t"
169
        MOVNTQ"        %%mm4, 16%0"
170
        :"=m"(*dest)
171
        :"m"(*s),"m"(mask24l),
172
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
173
        :"memory");
174
    dest += 24;
175
    s += 32;
176
  }
177
  __asm __volatile(SFENCE:::"memory");
178
  __asm __volatile(EMMS:::"memory");
179
#endif
180
  while(s < end)
181
  {
182
    *dest++ = *s++;
183
    *dest++ = *s++;
184
    *dest++ = *s++;
185
    s++;
186
  }
187
}
188

    
189
/*
190
 Original by Strepto/Astral
191
 ported to gcc & bugfixed : A'rpi
192
 MMX2, 3DNOW optimization by Nick Kurshev
193
 32bit c version, and and&add trick by Michael Niedermayer
194
*/
195
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
196
{
197
  register const uint8_t* s=src;
198
  register uint8_t* d=dst;
199
  register const uint8_t *end;
200
  const uint8_t *mm_end;
201
  end = s + src_size;
202
#ifdef HAVE_MMX
203
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
204
  __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
205
  mm_end = end - 15;
206
  while(s<mm_end)
207
  {
208
        __asm __volatile(
209
                PREFETCH"        32%1\n\t"
210
                "movq        %1, %%mm0\n\t"
211
                "movq        8%1, %%mm2\n\t"
212
                "movq        %%mm0, %%mm1\n\t"
213
                "movq        %%mm2, %%mm3\n\t"
214
                "pand        %%mm4, %%mm0\n\t"
215
                "pand        %%mm4, %%mm2\n\t"
216
                "paddw        %%mm1, %%mm0\n\t"
217
                "paddw        %%mm3, %%mm2\n\t"
218
                MOVNTQ"        %%mm0, %0\n\t"
219
                MOVNTQ"        %%mm2, 8%0"
220
                :"=m"(*d)
221
                :"m"(*s)
222
                );
223
        d+=16;
224
        s+=16;
225
  }
226
  __asm __volatile(SFENCE:::"memory");
227
  __asm __volatile(EMMS:::"memory");
228
#endif
229
    mm_end = end - 3;
230
    while(s < mm_end)
231
    {
232
        register unsigned x= *((uint32_t *)s);
233
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
234
        d+=4;
235
        s+=4;
236
    }
237
    if(s < end)
238
    {
239
        register unsigned short x= *((uint16_t *)s);
240
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
241
    }
242
}
243

    
244
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
245
{
246
  register const uint8_t* s=src;
247
  register uint8_t* d=dst;
248
  register const uint8_t *end;
249
  const uint8_t *mm_end;
250
  end = s + src_size;
251
#ifdef HAVE_MMX
252
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
253
  __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
254
  __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
255
  mm_end = end - 15;
256
  while(s<mm_end)
257
  {
258
        __asm __volatile(
259
                PREFETCH"        32%1\n\t"
260
                "movq        %1, %%mm0\n\t"
261
                "movq        8%1, %%mm2\n\t"
262
                "movq        %%mm0, %%mm1\n\t"
263
                "movq        %%mm2, %%mm3\n\t"
264
                "psrlq        $1, %%mm0\n\t"
265
                "psrlq        $1, %%mm2\n\t"
266
                "pand        %%mm7, %%mm0\n\t"
267
                "pand        %%mm7, %%mm2\n\t"
268
                "pand        %%mm6, %%mm1\n\t"
269
                "pand        %%mm6, %%mm3\n\t"
270
                "por        %%mm1, %%mm0\n\t"
271
                "por        %%mm3, %%mm2\n\t"
272
                MOVNTQ"        %%mm0, %0\n\t"
273
                MOVNTQ"        %%mm2, 8%0"
274
                :"=m"(*d)
275
                :"m"(*s)
276
                );
277
        d+=16;
278
        s+=16;
279
  }
280
  __asm __volatile(SFENCE:::"memory");
281
  __asm __volatile(EMMS:::"memory");
282
#endif
283
    mm_end = end - 3;
284
    while(s < mm_end)
285
    {
286
        register uint32_t x= *((uint32_t *)s);
287
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
288
        s+=4;
289
        d+=4;
290
    }
291
    if(s < end)
292
    {
293
        register uint16_t x= *((uint16_t *)s);
294
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
295
        s+=2;
296
        d+=2;
297
    }
298
}
299

    
300
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
301
{
302
        const uint8_t *s = src;
303
        const uint8_t *end;
304
#ifdef HAVE_MMX
305
        const uint8_t *mm_end;
306
#endif
307
        uint16_t *d = (uint16_t *)dst;
308
        end = s + src_size;
309
#ifdef HAVE_MMX
310
        mm_end = end - 15;
311
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
312
        asm volatile(
313
                "movq %3, %%mm5                        \n\t"
314
                "movq %4, %%mm6                        \n\t"
315
                "movq %5, %%mm7                        \n\t"
316
                ".balign 16                        \n\t"
317
                "1:                                \n\t"
318
                PREFETCH" 32(%1)                \n\t"
319
                "movd        (%1), %%mm0                \n\t"
320
                "movd        4(%1), %%mm3                \n\t"
321
                "punpckldq 8(%1), %%mm0                \n\t"
322
                "punpckldq 12(%1), %%mm3        \n\t"
323
                "movq %%mm0, %%mm1                \n\t"
324
                "movq %%mm3, %%mm4                \n\t"
325
                "pand %%mm6, %%mm0                \n\t"
326
                "pand %%mm6, %%mm3                \n\t"
327
                "pmaddwd %%mm7, %%mm0                \n\t"
328
                "pmaddwd %%mm7, %%mm3                \n\t"
329
                "pand %%mm5, %%mm1                \n\t"
330
                "pand %%mm5, %%mm4                \n\t"
331
                "por %%mm1, %%mm0                \n\t"        
332
                "por %%mm4, %%mm3                \n\t"
333
                "psrld $5, %%mm0                \n\t"
334
                "pslld $11, %%mm3                \n\t"
335
                "por %%mm3, %%mm0                \n\t"
336
                MOVNTQ"        %%mm0, (%0)                \n\t"
337
                "addl $16, %1                        \n\t"
338
                "addl $8, %0                        \n\t"
339
                "cmpl %2, %1                        \n\t"
340
                " jb 1b                                \n\t"
341
                : "+r" (d), "+r"(s)
342
                : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
343
        );
344
#else
345
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
346
        __asm __volatile(
347
            "movq        %0, %%mm7\n\t"
348
            "movq        %1, %%mm6\n\t"
349
            ::"m"(red_16mask),"m"(green_16mask));
350
        while(s < mm_end)
351
        {
352
            __asm __volatile(
353
                PREFETCH" 32%1\n\t"
354
                "movd        %1, %%mm0\n\t"
355
                "movd        4%1, %%mm3\n\t"
356
                "punpckldq 8%1, %%mm0\n\t"
357
                "punpckldq 12%1, %%mm3\n\t"
358
                "movq        %%mm0, %%mm1\n\t"
359
                "movq        %%mm0, %%mm2\n\t"
360
                "movq        %%mm3, %%mm4\n\t"
361
                "movq        %%mm3, %%mm5\n\t"
362
                "psrlq        $3, %%mm0\n\t"
363
                "psrlq        $3, %%mm3\n\t"
364
                "pand        %2, %%mm0\n\t"
365
                "pand        %2, %%mm3\n\t"
366
                "psrlq        $5, %%mm1\n\t"
367
                "psrlq        $5, %%mm4\n\t"
368
                "pand        %%mm6, %%mm1\n\t"
369
                "pand        %%mm6, %%mm4\n\t"
370
                "psrlq        $8, %%mm2\n\t"
371
                "psrlq        $8, %%mm5\n\t"
372
                "pand        %%mm7, %%mm2\n\t"
373
                "pand        %%mm7, %%mm5\n\t"
374
                "por        %%mm1, %%mm0\n\t"
375
                "por        %%mm4, %%mm3\n\t"
376
                "por        %%mm2, %%mm0\n\t"
377
                "por        %%mm5, %%mm3\n\t"
378
                "psllq        $16, %%mm3\n\t"
379
                "por        %%mm3, %%mm0\n\t"
380
                MOVNTQ"        %%mm0, %0\n\t"
381
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
382
                d += 4;
383
                s += 16;
384
        }
385
#endif
386
        __asm __volatile(SFENCE:::"memory");
387
        __asm __volatile(EMMS:::"memory");
388
#endif
389
        while(s < end)
390
        {
391
                const int src= *((uint32_t*)s)++;
392
                *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
393
//                *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
394
        }
395
}
396

    
397
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
398
{
399
        const uint8_t *s = src;
400
        const uint8_t *end;
401
#ifdef HAVE_MMX
402
        const uint8_t *mm_end;
403
#endif
404
        uint16_t *d = (uint16_t *)dst;
405
        end = s + src_size;
406
#ifdef HAVE_MMX
407
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
408
        __asm __volatile(
409
            "movq        %0, %%mm7\n\t"
410
            "movq        %1, %%mm6\n\t"
411
            ::"m"(red_16mask),"m"(green_16mask));
412
        mm_end = end - 15;
413
        while(s < mm_end)
414
        {
415
            __asm __volatile(
416
                PREFETCH" 32%1\n\t"
417
                "movd        %1, %%mm0\n\t"
418
                "movd        4%1, %%mm3\n\t"
419
                "punpckldq 8%1, %%mm0\n\t"
420
                "punpckldq 12%1, %%mm3\n\t"
421
                "movq        %%mm0, %%mm1\n\t"
422
                "movq        %%mm0, %%mm2\n\t"
423
                "movq        %%mm3, %%mm4\n\t"
424
                "movq        %%mm3, %%mm5\n\t"
425
                "psllq        $8, %%mm0\n\t"
426
                "psllq        $8, %%mm3\n\t"
427
                "pand        %%mm7, %%mm0\n\t"
428
                "pand        %%mm7, %%mm3\n\t"
429
                "psrlq        $5, %%mm1\n\t"
430
                "psrlq        $5, %%mm4\n\t"
431
                "pand        %%mm6, %%mm1\n\t"
432
                "pand        %%mm6, %%mm4\n\t"
433
                "psrlq        $19, %%mm2\n\t"
434
                "psrlq        $19, %%mm5\n\t"
435
                "pand        %2, %%mm2\n\t"
436
                "pand        %2, %%mm5\n\t"
437
                "por        %%mm1, %%mm0\n\t"
438
                "por        %%mm4, %%mm3\n\t"
439
                "por        %%mm2, %%mm0\n\t"
440
                "por        %%mm5, %%mm3\n\t"
441
                "psllq        $16, %%mm3\n\t"
442
                "por        %%mm3, %%mm0\n\t"
443
                MOVNTQ"        %%mm0, %0\n\t"
444
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
445
                d += 4;
446
                s += 16;
447
        }
448
        __asm __volatile(SFENCE:::"memory");
449
        __asm __volatile(EMMS:::"memory");
450
#endif
451
        while(s < end)
452
        {
453
                const int src= *((uint32_t*)s)++;
454
                *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
455
        }
456
}
457

    
458
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
459
{
460
        const uint8_t *s = src;
461
        const uint8_t *end;
462
#ifdef HAVE_MMX
463
        const uint8_t *mm_end;
464
#endif
465
        uint16_t *d = (uint16_t *)dst;
466
        end = s + src_size;
467
#ifdef HAVE_MMX
468
        mm_end = end - 15;
469
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
470
        asm volatile(
471
                "movq %3, %%mm5                        \n\t"
472
                "movq %4, %%mm6                        \n\t"
473
                "movq %5, %%mm7                        \n\t"
474
                ".balign 16                        \n\t"
475
                "1:                                \n\t"
476
                PREFETCH" 32(%1)                \n\t"
477
                "movd        (%1), %%mm0                \n\t"
478
                "movd        4(%1), %%mm3                \n\t"
479
                "punpckldq 8(%1), %%mm0                \n\t"
480
                "punpckldq 12(%1), %%mm3        \n\t"
481
                "movq %%mm0, %%mm1                \n\t"
482
                "movq %%mm3, %%mm4                \n\t"
483
                "pand %%mm6, %%mm0                \n\t"
484
                "pand %%mm6, %%mm3                \n\t"
485
                "pmaddwd %%mm7, %%mm0                \n\t"
486
                "pmaddwd %%mm7, %%mm3                \n\t"
487
                "pand %%mm5, %%mm1                \n\t"
488
                "pand %%mm5, %%mm4                \n\t"
489
                "por %%mm1, %%mm0                \n\t"        
490
                "por %%mm4, %%mm3                \n\t"
491
                "psrld $6, %%mm0                \n\t"
492
                "pslld $10, %%mm3                \n\t"
493
                "por %%mm3, %%mm0                \n\t"
494
                MOVNTQ"        %%mm0, (%0)                \n\t"
495
                "addl $16, %1                        \n\t"
496
                "addl $8, %0                        \n\t"
497
                "cmpl %2, %1                        \n\t"
498
                " jb 1b                                \n\t"
499
                : "+r" (d), "+r"(s)
500
                : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
501
        );
502
#else
503
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
504
        __asm __volatile(
505
            "movq        %0, %%mm7\n\t"
506
            "movq        %1, %%mm6\n\t"
507
            ::"m"(red_15mask),"m"(green_15mask));
508
        while(s < mm_end)
509
        {
510
            __asm __volatile(
511
                PREFETCH" 32%1\n\t"
512
                "movd        %1, %%mm0\n\t"
513
                "movd        4%1, %%mm3\n\t"
514
                "punpckldq 8%1, %%mm0\n\t"
515
                "punpckldq 12%1, %%mm3\n\t"
516
                "movq        %%mm0, %%mm1\n\t"
517
                "movq        %%mm0, %%mm2\n\t"
518
                "movq        %%mm3, %%mm4\n\t"
519
                "movq        %%mm3, %%mm5\n\t"
520
                "psrlq        $3, %%mm0\n\t"
521
                "psrlq        $3, %%mm3\n\t"
522
                "pand        %2, %%mm0\n\t"
523
                "pand        %2, %%mm3\n\t"
524
                "psrlq        $6, %%mm1\n\t"
525
                "psrlq        $6, %%mm4\n\t"
526
                "pand        %%mm6, %%mm1\n\t"
527
                "pand        %%mm6, %%mm4\n\t"
528
                "psrlq        $9, %%mm2\n\t"
529
                "psrlq        $9, %%mm5\n\t"
530
                "pand        %%mm7, %%mm2\n\t"
531
                "pand        %%mm7, %%mm5\n\t"
532
                "por        %%mm1, %%mm0\n\t"
533
                "por        %%mm4, %%mm3\n\t"
534
                "por        %%mm2, %%mm0\n\t"
535
                "por        %%mm5, %%mm3\n\t"
536
                "psllq        $16, %%mm3\n\t"
537
                "por        %%mm3, %%mm0\n\t"
538
                MOVNTQ"        %%mm0, %0\n\t"
539
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
540
                d += 4;
541
                s += 16;
542
        }
543
#endif
544
        __asm __volatile(SFENCE:::"memory");
545
        __asm __volatile(EMMS:::"memory");
546
#endif
547
        while(s < end)
548
        {
549
                const int src= *((uint32_t*)s)++;
550
                *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
551
        }
552
}
553

    
554
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
555
{
556
        const uint8_t *s = src;
557
        const uint8_t *end;
558
#ifdef HAVE_MMX
559
        const uint8_t *mm_end;
560
#endif
561
        uint16_t *d = (uint16_t *)dst;
562
        end = s + src_size;
563
#ifdef HAVE_MMX
564
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
565
        __asm __volatile(
566
            "movq        %0, %%mm7\n\t"
567
            "movq        %1, %%mm6\n\t"
568
            ::"m"(red_15mask),"m"(green_15mask));
569
        mm_end = end - 15;
570
        while(s < mm_end)
571
        {
572
            __asm __volatile(
573
                PREFETCH" 32%1\n\t"
574
                "movd        %1, %%mm0\n\t"
575
                "movd        4%1, %%mm3\n\t"
576
                "punpckldq 8%1, %%mm0\n\t"
577
                "punpckldq 12%1, %%mm3\n\t"
578
                "movq        %%mm0, %%mm1\n\t"
579
                "movq        %%mm0, %%mm2\n\t"
580
                "movq        %%mm3, %%mm4\n\t"
581
                "movq        %%mm3, %%mm5\n\t"
582
                "psllq        $7, %%mm0\n\t"
583
                "psllq        $7, %%mm3\n\t"
584
                "pand        %%mm7, %%mm0\n\t"
585
                "pand        %%mm7, %%mm3\n\t"
586
                "psrlq        $6, %%mm1\n\t"
587
                "psrlq        $6, %%mm4\n\t"
588
                "pand        %%mm6, %%mm1\n\t"
589
                "pand        %%mm6, %%mm4\n\t"
590
                "psrlq        $19, %%mm2\n\t"
591
                "psrlq        $19, %%mm5\n\t"
592
                "pand        %2, %%mm2\n\t"
593
                "pand        %2, %%mm5\n\t"
594
                "por        %%mm1, %%mm0\n\t"
595
                "por        %%mm4, %%mm3\n\t"
596
                "por        %%mm2, %%mm0\n\t"
597
                "por        %%mm5, %%mm3\n\t"
598
                "psllq        $16, %%mm3\n\t"
599
                "por        %%mm3, %%mm0\n\t"
600
                MOVNTQ"        %%mm0, %0\n\t"
601
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
602
                d += 4;
603
                s += 16;
604
        }
605
        __asm __volatile(SFENCE:::"memory");
606
        __asm __volatile(EMMS:::"memory");
607
#endif
608
        while(s < end)
609
        {
610
                const int src= *((uint32_t*)s)++;
611
                *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
612
        }
613
}
614

    
615
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
616
{
617
        const uint8_t *s = src;
618
        const uint8_t *end;
619
#ifdef HAVE_MMX
620
        const uint8_t *mm_end;
621
#endif
622
        uint16_t *d = (uint16_t *)dst;
623
        end = s + src_size;
624
#ifdef HAVE_MMX
625
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
626
        __asm __volatile(
627
            "movq        %0, %%mm7\n\t"
628
            "movq        %1, %%mm6\n\t"
629
            ::"m"(red_16mask),"m"(green_16mask));
630
        mm_end = end - 11;
631
        while(s < mm_end)
632
        {
633
            __asm __volatile(
634
                PREFETCH" 32%1\n\t"
635
                "movd        %1, %%mm0\n\t"
636
                "movd        3%1, %%mm3\n\t"
637
                "punpckldq 6%1, %%mm0\n\t"
638
                "punpckldq 9%1, %%mm3\n\t"
639
                "movq        %%mm0, %%mm1\n\t"
640
                "movq        %%mm0, %%mm2\n\t"
641
                "movq        %%mm3, %%mm4\n\t"
642
                "movq        %%mm3, %%mm5\n\t"
643
                "psrlq        $3, %%mm0\n\t"
644
                "psrlq        $3, %%mm3\n\t"
645
                "pand        %2, %%mm0\n\t"
646
                "pand        %2, %%mm3\n\t"
647
                "psrlq        $5, %%mm1\n\t"
648
                "psrlq        $5, %%mm4\n\t"
649
                "pand        %%mm6, %%mm1\n\t"
650
                "pand        %%mm6, %%mm4\n\t"
651
                "psrlq        $8, %%mm2\n\t"
652
                "psrlq        $8, %%mm5\n\t"
653
                "pand        %%mm7, %%mm2\n\t"
654
                "pand        %%mm7, %%mm5\n\t"
655
                "por        %%mm1, %%mm0\n\t"
656
                "por        %%mm4, %%mm3\n\t"
657
                "por        %%mm2, %%mm0\n\t"
658
                "por        %%mm5, %%mm3\n\t"
659
                "psllq        $16, %%mm3\n\t"
660
                "por        %%mm3, %%mm0\n\t"
661
                MOVNTQ"        %%mm0, %0\n\t"
662
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
663
                d += 4;
664
                s += 12;
665
        }
666
        __asm __volatile(SFENCE:::"memory");
667
        __asm __volatile(EMMS:::"memory");
668
#endif
669
        while(s < end)
670
        {
671
                const int b= *s++;
672
                const int g= *s++;
673
                const int r= *s++;
674
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
675
        }
676
}
677

    
678
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
679
{
680
        const uint8_t *s = src;
681
        const uint8_t *end;
682
#ifdef HAVE_MMX
683
        const uint8_t *mm_end;
684
#endif
685
        uint16_t *d = (uint16_t *)dst;
686
        end = s + src_size;
687
#ifdef HAVE_MMX
688
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
689
        __asm __volatile(
690
            "movq        %0, %%mm7\n\t"
691
            "movq        %1, %%mm6\n\t"
692
            ::"m"(red_16mask),"m"(green_16mask));
693
        mm_end = end - 15;
694
        while(s < mm_end)
695
        {
696
            __asm __volatile(
697
                PREFETCH" 32%1\n\t"
698
                "movd        %1, %%mm0\n\t"
699
                "movd        3%1, %%mm3\n\t"
700
                "punpckldq 6%1, %%mm0\n\t"
701
                "punpckldq 9%1, %%mm3\n\t"
702
                "movq        %%mm0, %%mm1\n\t"
703
                "movq        %%mm0, %%mm2\n\t"
704
                "movq        %%mm3, %%mm4\n\t"
705
                "movq        %%mm3, %%mm5\n\t"
706
                "psllq        $8, %%mm0\n\t"
707
                "psllq        $8, %%mm3\n\t"
708
                "pand        %%mm7, %%mm0\n\t"
709
                "pand        %%mm7, %%mm3\n\t"
710
                "psrlq        $5, %%mm1\n\t"
711
                "psrlq        $5, %%mm4\n\t"
712
                "pand        %%mm6, %%mm1\n\t"
713
                "pand        %%mm6, %%mm4\n\t"
714
                "psrlq        $19, %%mm2\n\t"
715
                "psrlq        $19, %%mm5\n\t"
716
                "pand        %2, %%mm2\n\t"
717
                "pand        %2, %%mm5\n\t"
718
                "por        %%mm1, %%mm0\n\t"
719
                "por        %%mm4, %%mm3\n\t"
720
                "por        %%mm2, %%mm0\n\t"
721
                "por        %%mm5, %%mm3\n\t"
722
                "psllq        $16, %%mm3\n\t"
723
                "por        %%mm3, %%mm0\n\t"
724
                MOVNTQ"        %%mm0, %0\n\t"
725
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
726
                d += 4;
727
                s += 12;
728
        }
729
        __asm __volatile(SFENCE:::"memory");
730
        __asm __volatile(EMMS:::"memory");
731
#endif
732
        while(s < end)
733
        {
734
                const int r= *s++;
735
                const int g= *s++;
736
                const int b= *s++;
737
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
738
        }
739
}
740

    
741
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
742
{
743
        const uint8_t *s = src;
744
        const uint8_t *end;
745
#ifdef HAVE_MMX
746
        const uint8_t *mm_end;
747
#endif
748
        uint16_t *d = (uint16_t *)dst;
749
        end = s + src_size;
750
#ifdef HAVE_MMX
751
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
752
        __asm __volatile(
753
            "movq        %0, %%mm7\n\t"
754
            "movq        %1, %%mm6\n\t"
755
            ::"m"(red_15mask),"m"(green_15mask));
756
        mm_end = end - 11;
757
        while(s < mm_end)
758
        {
759
            __asm __volatile(
760
                PREFETCH" 32%1\n\t"
761
                "movd        %1, %%mm0\n\t"
762
                "movd        3%1, %%mm3\n\t"
763
                "punpckldq 6%1, %%mm0\n\t"
764
                "punpckldq 9%1, %%mm3\n\t"
765
                "movq        %%mm0, %%mm1\n\t"
766
                "movq        %%mm0, %%mm2\n\t"
767
                "movq        %%mm3, %%mm4\n\t"
768
                "movq        %%mm3, %%mm5\n\t"
769
                "psrlq        $3, %%mm0\n\t"
770
                "psrlq        $3, %%mm3\n\t"
771
                "pand        %2, %%mm0\n\t"
772
                "pand        %2, %%mm3\n\t"
773
                "psrlq        $6, %%mm1\n\t"
774
                "psrlq        $6, %%mm4\n\t"
775
                "pand        %%mm6, %%mm1\n\t"
776
                "pand        %%mm6, %%mm4\n\t"
777
                "psrlq        $9, %%mm2\n\t"
778
                "psrlq        $9, %%mm5\n\t"
779
                "pand        %%mm7, %%mm2\n\t"
780
                "pand        %%mm7, %%mm5\n\t"
781
                "por        %%mm1, %%mm0\n\t"
782
                "por        %%mm4, %%mm3\n\t"
783
                "por        %%mm2, %%mm0\n\t"
784
                "por        %%mm5, %%mm3\n\t"
785
                "psllq        $16, %%mm3\n\t"
786
                "por        %%mm3, %%mm0\n\t"
787
                MOVNTQ"        %%mm0, %0\n\t"
788
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
789
                d += 4;
790
                s += 12;
791
        }
792
        __asm __volatile(SFENCE:::"memory");
793
        __asm __volatile(EMMS:::"memory");
794
#endif
795
        while(s < end)
796
        {
797
                const int b= *s++;
798
                const int g= *s++;
799
                const int r= *s++;
800
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
801
        }
802
}
803

    
804
/**
 * Convert packed 24-bit pixels to 15-bit (5-5-5) pixels while swapping the
 * first and third component of every pixel.
 *
 * In the scalar loop the top five bits of source byte 0 end up in bits 10-14,
 * those of byte 1 in bits 5-9 and those of byte 2 in bits 0-4 of each 16-bit
 * output word; bit 15 is left clear.
 *
 * @param src      packed 24-bit source pixels (3 bytes each)
 * @param dst      destination buffer, written as 16-bit words
 * @param src_size size of the source in bytes; the scalar loop always
 *                 consumes whole 3-byte pixels, so it should be a multiple of 3
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        /* Keep two of the component masks in mm7/mm6 for the whole loop. */
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /* Four pixels (12 source bytes) per pass. This is the same bound as the
           former "s < end - 15", expressed as a pointer difference so that no
           pointer in front of src is formed when the input is shorter than 15 bytes. */
        while(end - s > 15)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psllq        $7, %%mm0\n\t"
                "psllq        $7, %%mm3\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $19, %%mm2\n\t"
                "psrlq        $19, %%mm5\n\t"
                "pand        %2, %%mm2\n\t"
                "pand        %2, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar loop: the whole input without MMX, otherwise the leftover pixels. */
        while(s < end)
        {
                const int r= *s++;
                const int g= *s++;
                const int b= *s++;
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
}
866

    
867
/*
  A less accurate approximation is used here: the input value is simply
  left-shifted and the low-order bits are filled with zeroes. This method
  improves PNG's compression, but it cannot reproduce white exactly, since
  it does not generate an all-ones maximum value; the net effect is to
  darken the image slightly.

  The better method would be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      Leftmost Bits Repeated to Fill Open Bits
       |
   Original Bits
*/
890
/**
 * Expand 15-bit (5-5-5) pixels to packed 24-bit pixels.
 *
 * Each 16-bit source word yields three output bytes: bits 0-4, bits 5-9 and
 * bits 10-14, in that order, each moved to the top of its byte with the low
 * three bits left at zero.
 *
 * @param src      source pixels, read as native-endian 16-bit words
 * @param dst      destination buffer, 3 bytes are written per source pixel
 * @param src_size size of the source in bytes
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
        /* Eight pixels per pass. Same bound as the former "s < end - 7", written
           as a pointer difference so no pointer in front of src is formed for
           inputs shorter than 7 pixels. */
        while(end - s > 7)
        {
            /* Stage 1: unpack 2x4 pixels into 32-bit-per-pixel form. The first
               four pixels are parked in mm6/mm7, the second four stay in
               mm0/mm3. Nothing is stored yet although *d is listed as output. */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $2, %%mm1\n\t"
                "psrlq        $7, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"

                "movq        %%mm0, %%mm6\n\t"
                "movq        %%mm3, %%mm7\n\t"

                "movq        8%1, %%mm0\n\t"
                "movq        8%1, %%mm1\n\t"
                "movq        8%1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $2, %%mm1\n\t"
                "psrlq        $7, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"

                :"=m"(*d)
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
                :"memory");
            /* Stage 2 (borrowed from the 32 to 24 converter): squeeze the eight
               32-bit pixels left in the MMX registers into 24 output bytes.
               NOTE(review): relies on mm0/mm3/mm6/mm7 surviving between the two
               asm statements. */
            __asm __volatile(
                "movq        %%mm0, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "movq        %%mm6, %%mm0\n\t"
                "movq        %%mm7, %%mm1\n\t"

                "movq        %%mm4, %%mm6\n\t"
                "movq        %%mm5, %%mm7\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm1, %%mm3\n\t"

                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm3\n\t"
                "psrlq        $8, %%mm6\n\t"
                "psrlq        $8, %%mm7\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm1\n\t"
                "pand        %2, %%mm4\n\t"
                "pand        %2, %%mm5\n\t"
                "pand        %3, %%mm2\n\t"
                "pand        %3, %%mm3\n\t"
                "pand        %3, %%mm6\n\t"
                "pand        %3, %%mm7\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm3, %%mm1\n\t"
                "por        %%mm6, %%mm4\n\t"
                "por        %%mm7, %%mm5\n\t"

                "movq        %%mm1, %%mm2\n\t"
                "movq        %%mm4, %%mm3\n\t"
                "psllq        $48, %%mm2\n\t"
                "psllq        $32, %%mm3\n\t"
                "pand        %4, %%mm2\n\t"
                "pand        %5, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psrlq        $16, %%mm1\n\t"
                "psrlq        $32, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm3, %%mm1\n\t"
                "pand        %6, %%mm5\n\t"
                "por        %%mm5, %%mm4\n\t"

                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm1, 8%0\n\t"
                MOVNTQ"        %%mm4, 16%0"

                :"=m"(*d)
                :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
                :"memory");
                d += 24;
                s += 8;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar loop: the whole input without MMX, otherwise the leftover pixels. */
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x3E0)>>2;
                *d++ = (bgr&0x7C00)>>7;
        }
}
1031

    
1032
/**
 * Expand 16-bit (5-6-5) pixels to packed 24-bit pixels.
 *
 * Each 16-bit source word yields three output bytes: bits 0-4, bits 5-10 and
 * bits 11-15, in that order, each moved to the top of its byte with the
 * remaining low bits left at zero.
 *
 * @param src      source pixels, read as native-endian 16-bit words
 * @param dst      destination buffer, 3 bytes are written per source pixel
 * @param src_size size of the source in bytes
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
        /* Eight pixels per pass. Same bound as the former "s < end - 7", written
           as a pointer difference so no pointer in front of src is formed for
           inputs shorter than 7 pixels. */
        while(end - s > 7)
        {
            /* Stage 1: unpack 2x4 pixels into 32-bit-per-pixel form. The first
               four pixels are parked in mm6/mm7, the second four stay in
               mm0/mm3. Nothing is stored yet although *d is listed as output. */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm1\n\t"
                "psrlq        $8, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"

                "movq        %%mm0, %%mm6\n\t"
                "movq        %%mm3, %%mm7\n\t"

                "movq        8%1, %%mm0\n\t"
                "movq        8%1, %%mm1\n\t"
                "movq        8%1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm1\n\t"
                "psrlq        $8, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"
                :"=m"(*d)
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
                :"memory");
            /* Stage 2 (borrowed from the 32 to 24 converter): squeeze the eight
               32-bit pixels left in the MMX registers into 24 output bytes.
               NOTE(review): relies on mm0/mm3/mm6/mm7 surviving between the two
               asm statements. */
            __asm __volatile(
                "movq        %%mm0, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "movq        %%mm6, %%mm0\n\t"
                "movq        %%mm7, %%mm1\n\t"

                "movq        %%mm4, %%mm6\n\t"
                "movq        %%mm5, %%mm7\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm1, %%mm3\n\t"

                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm3\n\t"
                "psrlq        $8, %%mm6\n\t"
                "psrlq        $8, %%mm7\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm1\n\t"
                "pand        %2, %%mm4\n\t"
                "pand        %2, %%mm5\n\t"
                "pand        %3, %%mm2\n\t"
                "pand        %3, %%mm3\n\t"
                "pand        %3, %%mm6\n\t"
                "pand        %3, %%mm7\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm3, %%mm1\n\t"
                "por        %%mm6, %%mm4\n\t"
                "por        %%mm7, %%mm5\n\t"

                "movq        %%mm1, %%mm2\n\t"
                "movq        %%mm4, %%mm3\n\t"
                "psllq        $48, %%mm2\n\t"
                "psllq        $32, %%mm3\n\t"
                "pand        %4, %%mm2\n\t"
                "pand        %5, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psrlq        $16, %%mm1\n\t"
                "psrlq        $32, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm3, %%mm1\n\t"
                "pand        %6, %%mm5\n\t"
                "por        %%mm5, %%mm4\n\t"

                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm1, 8%0\n\t"
                MOVNTQ"        %%mm4, 16%0"

                :"=m"(*d)
                :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
                :"memory");
                d += 24;
                s += 8;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar loop: the whole input without MMX, otherwise the leftover pixels. */
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x7E0)>>3;
                *d++ = (bgr&0xF800)>>8;
        }
}
1172

    
1173
/**
 * Expand 15-bit (5-5-5) pixels to 32-bit pixels.
 *
 * Each 16-bit source word yields four output bytes: bits 0-4, bits 5-9 and
 * bits 10-14, each moved to the top of its byte, followed by a zero byte.
 *
 * @param src      source pixels, read as native-endian 16-bit words
 * @param dst      destination buffer, 4 bytes are written per source pixel
 * @param src_size size of the source in bytes
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
        /* mm7 = 0, used as the zero source for the word unpacking below. */
        __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
        /* Four pixels per pass. Same bound as the former "s < end - 3", written
           as a pointer difference so no pointer in front of src is formed for
           inputs shorter than 3 pixels. */
        while(end - s > 3)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $2, %%mm1\n\t"
                "psrlq        $7, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %%mm7, %%mm0\n\t"
                "punpcklwd %%mm7, %%mm1\n\t"
                "punpcklwd %%mm7, %%mm2\n\t"
                "punpckhwd %%mm7, %%mm3\n\t"
                "punpckhwd %%mm7, %%mm4\n\t"
                "punpckhwd %%mm7, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm3, 8%0\n\t"
                :"=m"(*d)
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
                :"memory");
                d += 16;
                s += 4;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar loop: the whole input without MMX, otherwise the leftover pixels.
           A variant doing one 32-bit store per pixel used to sit here under
           "#if 0" (noted as slightly slower on Athlon); it has been dropped. */
        while(s < end)
        {
//FIXME this is very likely wrong for bigendian (and the following converters too)
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x3E0)>>2;
                *d++ = (bgr&0x7C00)>>7;
                *d++ = 0;
        }
}
1243

    
1244
/**
 * Expand 16-bit (5-6-5) pixels to 32-bit pixels.
 *
 * Each 16-bit source word yields four output bytes: bits 0-4, bits 5-10 and
 * bits 11-15, each moved to the top of its byte, followed by a zero byte.
 *
 * @param src      source pixels, read as native-endian 16-bit words
 * @param dst      destination buffer, 4 bytes are written per source pixel
 * @param src_size size of the source in bytes
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
        /* mm7 = 0, used as the zero source for the word unpacking below. */
        __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
        /* Four pixels per pass. Same bound as the former "s < end - 3", written
           as a pointer difference so no pointer in front of src is formed for
           inputs shorter than 3 pixels. */
        while(end - s > 3)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm1\n\t"
                "psrlq        $8, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %%mm7, %%mm0\n\t"
                "punpcklwd %%mm7, %%mm1\n\t"
                "punpcklwd %%mm7, %%mm2\n\t"
                "punpckhwd %%mm7, %%mm3\n\t"
                "punpckhwd %%mm7, %%mm4\n\t"
                "punpckhwd %%mm7, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm3, 8%0\n\t"
                :"=m"(*d)
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
                :"memory");
                d += 16;
                s += 4;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar loop: the whole input without MMX, otherwise the leftover pixels. */
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x7E0)>>3;
                *d++ = (bgr&0xF800)>>8;
                *d++ = 0;
        }
}
1308

    
1309
/**
 * Swap the first and third colour byte of every 32-bit pixel.
 *
 * The MMX loop converts two pixels (8 bytes) per pass and only runs over the
 * largest multiple of 8 bytes contained in src_size; any remaining whole pixel
 * is handled by the scalar loop. Previously the MMX loop ran at least once
 * with a bound of src_size-7, which wrapped around for inputs shorter than
 * 7 bytes and skipped the last pixel of an odd pixel count.
 *
 * The byte of each destination pixel that carries no colour is written as 0.
 * NOTE(review): the MMX loop assembles its output from three masked components
 * (mask32r/g/b, defined elsewhere); assuming those are single-byte masks it
 * also produces 0 there, which is what the scalar loop now matches.
 *
 * @param src      source pixels, 4 bytes each
 * @param dst      destination buffer, same size as the source
 * @param src_size size of the source in bytes; a trailing partial pixel is ignored
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        const unsigned num_pixels = src_size >> 2;
        unsigned i = 0;
#ifdef HAVE_MMX
        /* Byte count covered by the MMX loop (whole pairs of pixels). */
        const unsigned mmx_bytes = src_size & ~7U;
        if(mmx_bytes)
        {
/* TODO: unroll this loop */
                asm volatile (
                        "xorl %%eax, %%eax                \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 32(%0, %%eax)                \n\t"
                        "movq (%0, %%eax), %%mm0        \n\t"
                        "movq %%mm0, %%mm1                \n\t"
                        "movq %%mm0, %%mm2                \n\t"
                        "pslld $16, %%mm0                \n\t"
                        "psrld $16, %%mm1                \n\t"
                        "pand "MANGLE(mask32r)", %%mm0        \n\t"
                        "pand "MANGLE(mask32g)", %%mm2        \n\t"
                        "pand "MANGLE(mask32b)", %%mm1        \n\t"
                        "por %%mm0, %%mm2                \n\t"
                        "por %%mm1, %%mm2                \n\t"
                        MOVNTQ" %%mm2, (%1, %%eax)        \n\t"
                        "addl $8, %%eax                        \n\t"
                        "cmpl %2, %%eax                        \n\t"
                        " jb 1b                                \n\t"
                        :: "r" (src), "r"(dst), "r" (mmx_bytes)
                        : "%eax", "memory" /* the loop stores through dst */
                );

                __asm __volatile(SFENCE:::"memory");
                __asm __volatile(EMMS:::"memory");
                /* Continue with the first pixel the MMX loop did not touch. */
                i = mmx_bytes >> 2;
        }
#endif
        for(; i<num_pixels; i++)
        {
#ifdef WORDS_BIGENDIAN
          dst[4*i + 0] = 0;
          dst[4*i + 1] = src[4*i + 3];
          dst[4*i + 2] = src[4*i + 2];
          dst[4*i + 3] = src[4*i + 1];
#else
          dst[4*i + 0] = src[4*i + 2];
          dst[4*i + 1] = src[4*i + 1];
          dst[4*i + 2] = src[4*i + 0];
          dst[4*i + 3] = 0;
#endif
        }
}
1355

    
1356
/**
 * Swap the first and third byte of every packed 24-bit pixel.
 *
 * The scalar loop reads a pixel's third byte before storing anything, so a
 * pixel can be converted in place.
 *
 * With MMX, blocks of 24 bytes (8 pixels) are converted by the asm loop and
 * the remainder by the scalar loop. The asm loop is only entered when at least
 * 24 bytes are available: it is a do-while construct and used to run once
 * even for shorter inputs, touching memory outside both buffers and leaving a
 * wrapped-around remainder size for the scalar loop.
 * NOTE(review): the asm loop's last loads (offsets 16 and 18) reach up to two
 * bytes past the 24-byte block being converted; this is unchanged.
 *
 * @param src      packed 24-bit source pixels
 * @param dst      destination buffer, same size as the source
 * @param src_size size of the source in bytes
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        unsigned i;
#ifdef HAVE_MMX
        /* Negative byte index that counts up towards zero inside the asm loop;
           it is negative exactly when src_size >= 24. */
        int mmx_size= 23 - src_size;
        if(mmx_size < 0)
        {
                asm volatile (
                        "movq "MANGLE(mask24r)", %%mm5        \n\t"
                        "movq "MANGLE(mask24g)", %%mm6        \n\t"
                        "movq "MANGLE(mask24b)", %%mm7        \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 32(%1, %%eax)                \n\t"
                        "movq   (%1, %%eax), %%mm0        \n\t" // BGR BGR BG
                        "movq   (%1, %%eax), %%mm1        \n\t" // BGR BGR BG
                        "movq  2(%1, %%eax), %%mm2        \n\t" // R BGR BGR B
                        "psllq $16, %%mm0                \n\t" // 00 BGR BGR
                        "pand %%mm5, %%mm0                \n\t"
                        "pand %%mm6, %%mm1                \n\t"
                        "pand %%mm7, %%mm2                \n\t"
                        "por %%mm0, %%mm1                \n\t"
                        "por %%mm2, %%mm1                \n\t"
                        "movq  6(%1, %%eax), %%mm0        \n\t" // BGR BGR BG
                        MOVNTQ" %%mm1,   (%2, %%eax)        \n\t" // RGB RGB RG
                        "movq  8(%1, %%eax), %%mm1        \n\t" // R BGR BGR B
                        "movq 10(%1, %%eax), %%mm2        \n\t" // GR BGR BGR
                        "pand %%mm7, %%mm0                \n\t"
                        "pand %%mm5, %%mm1                \n\t"
                        "pand %%mm6, %%mm2                \n\t"
                        "por %%mm0, %%mm1                \n\t"
                        "por %%mm2, %%mm1                \n\t"
                        "movq 14(%1, %%eax), %%mm0        \n\t" // R BGR BGR B
                        MOVNTQ" %%mm1,  8(%2, %%eax)        \n\t" // B RGB RGB R
                        "movq 16(%1, %%eax), %%mm1        \n\t" // GR BGR BGR
                        "movq 18(%1, %%eax), %%mm2        \n\t" // BGR BGR BG
                        "pand %%mm6, %%mm0                \n\t"
                        "pand %%mm7, %%mm1                \n\t"
                        "pand %%mm5, %%mm2                \n\t"
                        "por %%mm0, %%mm1                \n\t"
                        "por %%mm2, %%mm1                \n\t"
                        MOVNTQ" %%mm1, 16(%2, %%eax)        \n\t"
                        "addl $24, %%eax                \n\t"
                        " js 1b                                \n\t"
                        : "+a" (mmx_size)
                        : "r" (src-mmx_size), "r"(dst-mmx_size)
                        : "memory" /* the loop stores through dst */
                );

                __asm __volatile(SFENCE:::"memory");
                __asm __volatile(EMMS:::"memory");

                if(mmx_size==23) return; //finished, size was a multiple of 24

                /* Point src/dst at the 23-mmx_size bytes the asm loop left over. */
                src+= src_size;
                dst+= src_size;
                src_size= 23-mmx_size;
                src-= src_size;
                dst-= src_size;
        }
#endif
        for(i=0; i<src_size; i+=3)
        {
                register uint8_t x;
                x          = src[i + 2];
                dst[i + 1] = src[i + 1];
                dst[i + 2] = src[i + 0];
                dst[i + 0] = x;
        }
}
1422

    
1423
/**
 * Interleave planar Y, U and V data into packed output, one output line per
 * luma line, in the byte order Y0 U Y1 V (as stored by the little-endian
 * scalar paths).
 *
 * The chroma pointers advance after every line y for which
 * (y & (vertLumPerChroma-1)) == vertLumPerChroma-1, i.e. once per
 * vertLumPerChroma luma lines when that value is a power of two.
 *
 * The V sample is widened to an unsigned type before being shifted into bits
 * 24-31. In the 64-bit path it used to be shifted as a signed int, so V >= 128
 * produced a negative value whose sign extension corrupted the upper half of
 * the 64-bit store (the following two output pixels).
 *
 * @param ysrc             luma plane
 * @param usrc             U plane
 * @param vsrc             V plane
 * @param dst              packed destination
 * @param width            luma width in pixels; width/2 chroma samples are used per
 *                         line (the MMX loop works in steps of 8 chroma samples,
 *                         the 64-bit path in steps of 2)
 * @param height           number of luma lines
 * @param lumStride        bytes between luma lines
 * @param chromStride      bytes between chroma lines
 * @param dstStride        bytes between destination lines
 * @param vertLumPerChroma luma lines sharing one chroma line
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
        unsigned y;
        const unsigned chromWidth= width>>1;
        for(y=0; y<height; y++)
        {
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
                asm volatile(
                        "xorl %%eax, %%eax                \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 32(%1, %%eax, 2)        \n\t"
                        PREFETCH" 32(%2, %%eax)                \n\t"
                        PREFETCH" 32(%3, %%eax)                \n\t"
                        "movq (%2, %%eax), %%mm0        \n\t" // U(0)
                        "movq %%mm0, %%mm2                \n\t" // U(0)
                        "movq (%3, %%eax), %%mm1        \n\t" // V(0)
                        "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)

                        "movq (%1, %%eax,2), %%mm3        \n\t" // Y(0)
                        "movq 8(%1, %%eax,2), %%mm5        \n\t" // Y(8)
                        "movq %%mm3, %%mm4                \n\t" // Y(0)
                        "movq %%mm5, %%mm6                \n\t" // Y(8)
                        "punpcklbw %%mm0, %%mm3                \n\t" // YUYV YUYV(0)
                        "punpckhbw %%mm0, %%mm4                \n\t" // YUYV YUYV(4)
                        "punpcklbw %%mm2, %%mm5                \n\t" // YUYV YUYV(8)
                        "punpckhbw %%mm2, %%mm6                \n\t" // YUYV YUYV(12)

                        MOVNTQ" %%mm3, (%0, %%eax, 4)        \n\t"
                        MOVNTQ" %%mm4, 8(%0, %%eax, 4)        \n\t"
                        MOVNTQ" %%mm5, 16(%0, %%eax, 4)        \n\t"
                        MOVNTQ" %%mm6, 24(%0, %%eax, 4)        \n\t"

                        "addl $8, %%eax                        \n\t"
                        "cmpl %4, %%eax                        \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
                        : "%eax", "memory" /* the loop stores through dst */
                );
#else

#if defined ARCH_ALPHA && defined HAVE_MVI
#define pl2yuy2(n)                                        \
        y1 = yc[n];                                        \
        y2 = yc2[n];                                        \
        u = uc[n];                                        \
        v = vc[n];                                        \
        asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));        \
        asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));        \
        asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
        asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
        yuv1 = (u << 8) + (v << 24);                        \
        yuv2 = yuv1 + y2;                                \
        yuv1 += y1;                                        \
        qdst[n] = yuv1;                                        \
        qdst2[n] = yuv2;

                /* This path converts two luma lines per outer iteration, both
                   with the current chroma line. */
                int i;
                uint64_t *qdst = (uint64_t *) dst;
                uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
                const uint32_t *yc = (uint32_t *) ysrc;
                const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
                const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
                for(i = 0; i < chromWidth; i += 8){
                        uint64_t y1, y2, yuv1, yuv2;
                        uint64_t u, v;
                        /* Prefetch */
                        asm("ldq $31,64(%0)" :: "r"(yc));
                        asm("ldq $31,64(%0)" :: "r"(yc2));
                        asm("ldq $31,64(%0)" :: "r"(uc));
                        asm("ldq $31,64(%0)" :: "r"(vc));

                        pl2yuy2(0);
                        pl2yuy2(1);
                        pl2yuy2(2);
                        pl2yuy2(3);

                        yc += 4;
                        yc2 += 4;
                        uc += 4;
                        vc += 4;
                        qdst += 4;
                        qdst2 += 4;
                }
                /* Account for the extra line handled above. */
                y++;
                ysrc += lumStride;
                dst += dstStride;

#elif __WORDSIZE >= 64
                /* Two chroma samples (four luma samples) per 64-bit store. */
                unsigned i;
                uint64_t *ldst = (uint64_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i += 2){
                        uint64_t k, l;
                        k = yc[0] + (uc[0] << 8) +
                            (yc[1] << 16) + ((uint64_t)vc[0] << 24);
                        l = yc[2] + (uc[1] << 8) +
                            (yc[3] << 16) + ((uint64_t)vc[1] << 24);
                        *ldst++ = k + (l << 32);
                        yc += 4;
                        uc += 2;
                        vc += 2;
                }

#else
                /* One chroma sample (two luma samples) per 32-bit store. */
                unsigned i;
                uint32_t *idst = (uint32_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i++){
                        *idst++ = yc[0] + (uc[0] << 8) +
                            (yc[1] << 16) + ((uint32_t)vc[0] << 24);
                        yc += 2;
                        uc++;
                        vc++;
                }
#endif
#endif
                /* Move to the next chroma line once the current one has served
                   vertLumPerChroma luma lines. */
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
                {
                        usrc += chromStride;
                        vsrc += chromStride;
                }
                ysrc += lumStride;
                dst += dstStride;
        }
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
1557

    
1558
/**
1559
 *
1560
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1561
 * problem for anyone then tell me, and ill fix it)
1562
 */
1563
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1564
        unsigned int width, unsigned int height,
1565
        int lumStride, int chromStride, int dstStride)
1566
{
1567
        //FIXME interpolate chroma
1568
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1569
}
1570

    
1571
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1572
        unsigned int width, unsigned int height,
1573
        int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1574
{
1575
        unsigned y;
1576
        const unsigned chromWidth= width>>1;
1577
        for(y=0; y<height; y++)
1578
        {
1579
#if __WORDSIZE >= 64
1580
                int i;
1581
                uint64_t *ldst = (uint64_t *) dst;
1582
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1583
                for(i = 0; i < chromWidth; i += 2){
1584
                        uint64_t k, l;
1585
                        k = uc[0] + (yc[0] << 8) +
1586
                            (vc[0] << 16) + (yc[1] << 24);
1587
                        l = uc[1] + (yc[2] << 8) +
1588
                            (vc[1] << 16) + (yc[3] << 24);
1589
                        *ldst++ = k + (l << 32);
1590
                        yc += 4;
1591
                        uc += 2;
1592
                        vc += 2;
1593
                }
1594

    
1595
#else
1596
                int i, *idst = (int32_t *) dst;
1597
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1598
                for(i = 0; i < chromWidth; i++){
1599
                        *idst++ = uc[0] + (yc[0] << 8) +
1600
                            (vc[0] << 16) + (yc[1] << 24);
1601
                        yc += 2;
1602
                        uc++;
1603
                        vc++;
1604
                }
1605
#endif
1606
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1607
                {
1608
                        usrc += chromStride;
1609
                        vsrc += chromStride;
1610
                }
1611
                ysrc += lumStride;
1612
                dst += dstStride;
1613
        }
1614
}
1615

    
1616
/**
1617
 *
1618
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1619
 * problem for anyone then tell me, and ill fix it)
1620
 */
1621
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1622
        unsigned int width, unsigned int height,
1623
        int lumStride, int chromStride, int dstStride)
1624
{
1625
        //FIXME interpolate chroma
1626
        RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1627
}
1628

    
1629
/**
1630
 *
1631
 * width should be a multiple of 16
1632
 */
1633
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1634
        unsigned int width, unsigned int height,
1635
        int lumStride, int chromStride, int dstStride)
1636
{
1637
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1638
}
1639

    
1640
/**
1641
 *
1642
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1643
 * problem for anyone then tell me, and ill fix it)
1644
 */
1645
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1646
        unsigned int width, unsigned int height,
1647
        int lumStride, int chromStride, int srcStride)
1648
{
1649
        unsigned y;
1650
        const unsigned chromWidth= width>>1;
1651
        for(y=0; y<height; y+=2)
1652
        {
1653
#ifdef HAVE_MMX
1654
                asm volatile(
1655
                        "xorl %%eax, %%eax                \n\t"
1656
                        "pcmpeqw %%mm7, %%mm7                \n\t"
1657
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1658
                        ".balign 16                        \n\t"
1659
                        "1:                                \n\t"
1660
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
1661
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
1662
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
1663
                        "movq %%mm0, %%mm2                \n\t" // YUYV YUYV(0)
1664
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(4)
1665
                        "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
1666
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
1667
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
1668
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
1669
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
1670
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)
1671

    
1672
                        MOVNTQ" %%mm2, (%1, %%eax, 2)        \n\t"
1673

    
1674
                        "movq 16(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(8)
1675
                        "movq 24(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(12)
1676
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(8)
1677
                        "movq %%mm2, %%mm4                \n\t" // YUYV YUYV(12)
1678
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
1679
                        "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
1680
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
1681
                        "pand %%mm7, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
1682
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
1683
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)
1684

    
1685
                        MOVNTQ" %%mm3, 8(%1, %%eax, 2)        \n\t"
1686

    
1687
                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
1688
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
1689
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1690
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1691
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
1692
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
1693
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
1694
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)
1695

    
1696
                        MOVNTQ" %%mm0, (%3, %%eax)        \n\t"
1697
                        MOVNTQ" %%mm2, (%2, %%eax)        \n\t"
1698

    
1699
                        "addl $8, %%eax                        \n\t"
1700
                        "cmpl %4, %%eax                        \n\t"
1701
                        " jb 1b                                \n\t"
1702
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1703
                        : "memory", "%eax"
1704
                );
1705

    
1706
                ydst += lumStride;
1707
                src  += srcStride;
1708

    
1709
                asm volatile(
1710
                        "xorl %%eax, %%eax                \n\t"
1711
                        ".balign 16                        \n\t"
1712
                        "1:                                \n\t"
1713
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
1714
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
1715
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
1716
                        "movq 16(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(8)
1717
                        "movq 24(%0, %%eax, 4), %%mm3        \n\t" // YUYV YUYV(12)
1718
                        "pand %%mm7, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
1719
                        "pand %%mm7, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
1720
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
1721
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
1722
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
1723
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)
1724

    
1725
                        MOVNTQ" %%mm0, (%1, %%eax, 2)        \n\t"
1726
                        MOVNTQ" %%mm2, 8(%1, %%eax, 2)        \n\t"
1727

    
1728
                        "addl $8, %%eax                        \n\t"
1729
                        "cmpl %4, %%eax                        \n\t"
1730
                        " jb 1b                                \n\t"
1731

    
1732
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1733
                        : "memory", "%eax"
1734
                );
1735
#else
1736
                unsigned i;
1737
                for(i=0; i<chromWidth; i++)
1738
                {
1739
                        ydst[2*i+0]         = src[4*i+0];
1740
                        udst[i]         = src[4*i+1];
1741
                        ydst[2*i+1]         = src[4*i+2];
1742
                        vdst[i]         = src[4*i+3];
1743
                }
1744
                ydst += lumStride;
1745
                src  += srcStride;
1746

    
1747
                for(i=0; i<chromWidth; i++)
1748
                {
1749
                        ydst[2*i+0]         = src[4*i+0];
1750
                        ydst[2*i+1]         = src[4*i+2];
1751
                }
1752
#endif
1753
                udst += chromStride;
1754
                vdst += chromStride;
1755
                ydst += lumStride;
1756
                src  += srcStride;
1757
        }
1758
#ifdef HAVE_MMX
1759
asm volatile(   EMMS" \n\t"
1760
                SFENCE" \n\t"
1761
                :::"memory");
1762
#endif
1763
}
1764

    
1765
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1766
        uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1767
        unsigned int width, unsigned int height, int lumStride, int chromStride)
1768
{
1769
        /* Y Plane */
1770
        memcpy(ydst, ysrc, width*height);
1771

    
1772
        /* XXX: implement upscaling for U,V */
1773
}
1774

    
1775
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1776
{
1777
        int x,y;
1778
        
1779
        dst[0]= src[0];
1780
        
1781
        // first line
1782
        for(x=0; x<srcWidth-1; x++){
1783
                dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1784
                dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1785
        }
1786
        dst[2*srcWidth-1]= src[srcWidth-1];
1787
        
1788
        dst+= dstStride;
1789

    
1790
        for(y=1; y<srcHeight; y++){
1791
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1792
                const int mmxSize= srcWidth&~15;
1793
                asm volatile(
1794
                        "movl %4, %%eax                        \n\t"
1795
                        "1:                                \n\t"
1796
                        "movq (%0, %%eax), %%mm0        \n\t"
1797
                        "movq (%1, %%eax), %%mm1        \n\t"
1798
                        "movq 1(%0, %%eax), %%mm2        \n\t"
1799
                        "movq 1(%1, %%eax), %%mm3        \n\t"
1800
                        "movq -1(%0, %%eax), %%mm4        \n\t"
1801
                        "movq -1(%1, %%eax), %%mm5        \n\t"
1802
                        PAVGB" %%mm0, %%mm5                \n\t"
1803
                        PAVGB" %%mm0, %%mm3                \n\t"
1804
                        PAVGB" %%mm0, %%mm5                \n\t"
1805
                        PAVGB" %%mm0, %%mm3                \n\t"
1806
                        PAVGB" %%mm1, %%mm4                \n\t"
1807
                        PAVGB" %%mm1, %%mm2                \n\t"
1808
                        PAVGB" %%mm1, %%mm4                \n\t"
1809
                        PAVGB" %%mm1, %%mm2                \n\t"
1810
                        "movq %%mm5, %%mm7                \n\t"
1811
                        "movq %%mm4, %%mm6                \n\t"
1812
                        "punpcklbw %%mm3, %%mm5                \n\t"
1813
                        "punpckhbw %%mm3, %%mm7                \n\t"
1814
                        "punpcklbw %%mm2, %%mm4                \n\t"
1815
                        "punpckhbw %%mm2, %%mm6                \n\t"
1816
#if 1
1817
                        MOVNTQ" %%mm5, (%2, %%eax, 2)        \n\t"
1818
                        MOVNTQ" %%mm7, 8(%2, %%eax, 2)        \n\t"
1819
                        MOVNTQ" %%mm4, (%3, %%eax, 2)        \n\t"
1820
                        MOVNTQ" %%mm6, 8(%3, %%eax, 2)        \n\t"
1821
#else
1822
                        "movq %%mm5, (%2, %%eax, 2)        \n\t"
1823
                        "movq %%mm7, 8(%2, %%eax, 2)        \n\t"
1824
                        "movq %%mm4, (%3, %%eax, 2)        \n\t"
1825
                        "movq %%mm6, 8(%3, %%eax, 2)        \n\t"
1826
#endif
1827
                        "addl $8, %%eax                        \n\t"
1828
                        " js 1b                                \n\t"
1829
                        :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1830
                           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1831
                           "g" (-mmxSize)
1832
                        : "%eax"
1833

    
1834
                );
1835
#else
1836
                const int mmxSize=1;
1837
#endif
1838
                dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1839
                dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1840

    
1841
                for(x=mmxSize-1; x<srcWidth-1; x++){
1842
                        dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1843
                        dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1844
                        dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1845
                        dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1846
                }
1847
                dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1848
                dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1849

    
1850
                dst+=dstStride*2;
1851
                src+=srcStride;
1852
        }
1853
        
1854
        // last line
1855
#if 1
1856
        dst[0]= src[0];
1857
        
1858
        for(x=0; x<srcWidth-1; x++){
1859
                dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1860
                dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1861
        }
1862
        dst[2*srcWidth-1]= src[srcWidth-1];
1863
#else
1864
        for(x=0; x<srcWidth; x++){
1865
                dst[2*x+0]=
1866
                dst[2*x+1]= src[x];
1867
        }
1868
#endif
1869

    
1870
#ifdef HAVE_MMX
1871
asm volatile(   EMMS" \n\t"
1872
                SFENCE" \n\t"
1873
                :::"memory");
1874
#endif
1875
}
1876

    
1877
/**
1878
 *
1879
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1880
 * problem for anyone then tell me, and ill fix it)
1881
 * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
1882
 */
1883
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1884
        unsigned int width, unsigned int height,
1885
        int lumStride, int chromStride, int srcStride)
1886
{
1887
        unsigned y;
1888
        const unsigned chromWidth= width>>1;
1889
        for(y=0; y<height; y+=2)
1890
        {
1891
#ifdef HAVE_MMX
1892
                asm volatile(
1893
                        "xorl %%eax, %%eax                \n\t"
1894
                        "pcmpeqw %%mm7, %%mm7                \n\t"
1895
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1896
                        ".balign 16                        \n\t"
1897
                        "1:                                \n\t"
1898
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
1899
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // UYVY UYVY(0)
1900
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(4)
1901
                        "movq %%mm0, %%mm2                \n\t" // UYVY UYVY(0)
1902
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(4)
1903
                        "pand %%mm7, %%mm0                \n\t" // U0V0 U0V0(0)
1904
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(4)
1905
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
1906
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
1907
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
1908
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)
1909

    
1910
                        MOVNTQ" %%mm2, (%1, %%eax, 2)        \n\t"
1911

    
1912
                        "movq 16(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(8)
1913
                        "movq 24(%0, %%eax, 4), %%mm2        \n\t" // UYVY UYVY(12)
1914
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(8)
1915
                        "movq %%mm2, %%mm4                \n\t" // UYVY UYVY(12)
1916
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(8)
1917
                        "pand %%mm7, %%mm2                \n\t" // U0V0 U0V0(12)
1918
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
1919
                        "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
1920
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
1921
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)
1922

    
1923
                        MOVNTQ" %%mm3, 8(%1, %%eax, 2)        \n\t"
1924

    
1925
                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
1926
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
1927
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1928
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1929
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
1930
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
1931
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
1932
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)
1933

    
1934
                        MOVNTQ" %%mm0, (%3, %%eax)        \n\t"
1935
                        MOVNTQ" %%mm2, (%2, %%eax)        \n\t"
1936

    
1937
                        "addl $8, %%eax                        \n\t"
1938
                        "cmpl %4, %%eax                        \n\t"
1939
                        " jb 1b                                \n\t"
1940
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1941
                        : "memory", "%eax"
1942
                );
1943

    
1944
                ydst += lumStride;
1945
                src  += srcStride;
1946

    
1947
                asm volatile(
1948
                        "xorl %%eax, %%eax                \n\t"
1949
                        ".balign 16                        \n\t"
1950
                        "1:                                \n\t"
1951
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
1952
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
1953
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
1954
                        "movq 16(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(8)
1955
                        "movq 24(%0, %%eax, 4), %%mm3        \n\t" // YUYV YUYV(12)
1956
                        "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
1957
                        "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
1958
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
1959
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
1960
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
1961
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)
1962

    
1963
                        MOVNTQ" %%mm0, (%1, %%eax, 2)        \n\t"
1964
                        MOVNTQ" %%mm2, 8(%1, %%eax, 2)        \n\t"
1965

    
1966
                        "addl $8, %%eax                        \n\t"
1967
                        "cmpl %4, %%eax                        \n\t"
1968
                        " jb 1b                                \n\t"
1969

    
1970
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1971
                        : "memory", "%eax"
1972
                );
1973
#else
1974
                unsigned i;
1975
                for(i=0; i<chromWidth; i++)
1976
                {
1977
                        udst[i]         = src[4*i+0];
1978
                        ydst[2*i+0]         = src[4*i+1];
1979
                        vdst[i]         = src[4*i+2];
1980
                        ydst[2*i+1]         = src[4*i+3];
1981
                }
1982
                ydst += lumStride;
1983
                src  += srcStride;
1984

    
1985
                for(i=0; i<chromWidth; i++)
1986
                {
1987
                        ydst[2*i+0]         = src[4*i+1];
1988
                        ydst[2*i+1]         = src[4*i+3];
1989
                }
1990
#endif
1991
                udst += chromStride;
1992
                vdst += chromStride;
1993
                ydst += lumStride;
1994
                src  += srcStride;
1995
        }
1996
#ifdef HAVE_MMX
1997
asm volatile(   EMMS" \n\t"
1998
                SFENCE" \n\t"
1999
                :::"memory");
2000
#endif
2001
}
2002

    
2003
/**
2004
 *
2005
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2006
 * problem for anyone then tell me, and I'll fix it)
2007
 * chrominance data is only taken from every second line; the others are ignored in the C version. FIXME write HQ version
2008
 */
2009
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2010
        unsigned int width, unsigned int height,
2011
        int lumStride, int chromStride, int srcStride)
2012
{
2013
        unsigned y;
2014
        const unsigned chromWidth= width>>1;
2015
#ifdef HAVE_MMX
2016
        for(y=0; y<height-2; y+=2)
2017
        {
2018
                unsigned i;
2019
                for(i=0; i<2; i++)
2020
                {
2021
                        asm volatile(
2022
                                "movl %2, %%eax                        \n\t"
2023
                                "movq "MANGLE(bgr2YCoeff)", %%mm6                \n\t"
2024
                                "movq "MANGLE(w1111)", %%mm5                \n\t"
2025
                                "pxor %%mm7, %%mm7                \n\t"
2026
                                "leal (%%eax, %%eax, 2), %%ebx        \n\t"
2027
                                ".balign 16                        \n\t"
2028
                                "1:                                \n\t"
2029
                                PREFETCH" 64(%0, %%ebx)                \n\t"
2030
                                "movd (%0, %%ebx), %%mm0        \n\t"
2031
                                "movd 3(%0, %%ebx), %%mm1        \n\t"
2032
                                "punpcklbw %%mm7, %%mm0                \n\t"
2033
                                "punpcklbw %%mm7, %%mm1                \n\t"
2034
                                "movd 6(%0, %%ebx), %%mm2        \n\t"
2035
                                "movd 9(%0, %%ebx), %%mm3        \n\t"
2036
                                "punpcklbw %%mm7, %%mm2                \n\t"
2037
                                "punpcklbw %%mm7, %%mm3                \n\t"
2038
                                "pmaddwd %%mm6, %%mm0                \n\t"
2039
                                "pmaddwd %%mm6, %%mm1                \n\t"
2040
                                "pmaddwd %%mm6, %%mm2                \n\t"
2041
                                "pmaddwd %%mm6, %%mm3                \n\t"
2042
#ifndef FAST_BGR2YV12
2043
                                "psrad $8, %%mm0                \n\t"
2044
                                "psrad $8, %%mm1                \n\t"
2045
                                "psrad $8, %%mm2                \n\t"
2046
                                "psrad $8, %%mm3                \n\t"
2047
#endif
2048
                                "packssdw %%mm1, %%mm0                \n\t"
2049
                                "packssdw %%mm3, %%mm2                \n\t"
2050
                                "pmaddwd %%mm5, %%mm0                \n\t"
2051
                                "pmaddwd %%mm5, %%mm2                \n\t"
2052
                                "packssdw %%mm2, %%mm0                \n\t"
2053
                                "psraw $7, %%mm0                \n\t"
2054

    
2055
                                "movd 12(%0, %%ebx), %%mm4        \n\t"
2056
                                "movd 15(%0, %%ebx), %%mm1        \n\t"
2057
                                "punpcklbw %%mm7, %%mm4                \n\t"
2058
                                "punpcklbw %%mm7, %%mm1                \n\t"
2059
                                "movd 18(%0, %%ebx), %%mm2        \n\t"
2060
                                "movd 21(%0, %%ebx), %%mm3        \n\t"
2061
                                "punpcklbw %%mm7, %%mm2                \n\t"
2062
                                "punpcklbw %%mm7, %%mm3                \n\t"
2063
                                "pmaddwd %%mm6, %%mm4                \n\t"
2064
                                "pmaddwd %%mm6, %%mm1                \n\t"
2065
                                "pmaddwd %%mm6, %%mm2                \n\t"
2066
                                "pmaddwd %%mm6, %%mm3                \n\t"
2067
#ifndef FAST_BGR2YV12
2068
                                "psrad $8, %%mm4                \n\t"
2069
                                "psrad $8, %%mm1                \n\t"
2070
                                "psrad $8, %%mm2                \n\t"
2071
                                "psrad $8, %%mm3                \n\t"
2072
#endif
2073
                                "packssdw %%mm1, %%mm4                \n\t"
2074
                                "packssdw %%mm3, %%mm2                \n\t"
2075
                                "pmaddwd %%mm5, %%mm4                \n\t"
2076
                                "pmaddwd %%mm5, %%mm2                \n\t"
2077
                                "addl $24, %%ebx                \n\t"
2078
                                "packssdw %%mm2, %%mm4                \n\t"
2079
                                "psraw $7, %%mm4                \n\t"
2080

    
2081
                                "packuswb %%mm4, %%mm0                \n\t"
2082
                                "paddusb "MANGLE(bgr2YOffset)", %%mm0        \n\t"
2083

    
2084
                                MOVNTQ" %%mm0, (%1, %%eax)        \n\t"
2085
                                "addl $8, %%eax                        \n\t"
2086
                                " js 1b                                \n\t"
2087
                                : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2088
                                : "%eax", "%ebx"
2089
                        );
2090
                        ydst += lumStride;
2091
                        src  += srcStride;
2092
                }
2093
                src -= srcStride*2;
2094
                asm volatile(
2095
                        "movl %4, %%eax                        \n\t"
2096
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
2097
                        "movq "MANGLE(bgr2UCoeff)", %%mm6                \n\t"
2098
                        "pxor %%mm7, %%mm7                \n\t"
2099
                        "leal (%%eax, %%eax, 2), %%ebx        \n\t"
2100
                        "addl %%ebx, %%ebx                \n\t"
2101
                        ".balign 16                        \n\t"
2102
                        "1:                                \n\t"
2103
                        PREFETCH" 64(%0, %%ebx)                \n\t"
2104
                        PREFETCH" 64(%1, %%ebx)                \n\t"
2105
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2106
                        "movq (%0, %%ebx), %%mm0        \n\t"
2107
                        "movq (%1, %%ebx), %%mm1        \n\t"
2108
                        "movq 6(%0, %%ebx), %%mm2        \n\t"
2109
                        "movq 6(%1, %%ebx), %%mm3        \n\t"
2110
                        PAVGB" %%mm1, %%mm0                \n\t"
2111
                        PAVGB" %%mm3, %%mm2                \n\t"
2112
                        "movq %%mm0, %%mm1                \n\t"
2113
                        "movq %%mm2, %%mm3                \n\t"
2114
                        "psrlq $24, %%mm0                \n\t"
2115
                        "psrlq $24, %%mm2                \n\t"
2116
                        PAVGB" %%mm1, %%mm0                \n\t"
2117
                        PAVGB" %%mm3, %%mm2                \n\t"
2118
                        "punpcklbw %%mm7, %%mm0                \n\t"
2119
                        "punpcklbw %%mm7, %%mm2                \n\t"
2120
#else
2121
                        "movd (%0, %%ebx), %%mm0        \n\t"
2122
                        "movd (%1, %%ebx), %%mm1        \n\t"
2123
                        "movd 3(%0, %%ebx), %%mm2        \n\t"
2124
                        "movd 3(%1, %%ebx), %%mm3        \n\t"
2125
                        "punpcklbw %%mm7, %%mm0                \n\t"
2126
                        "punpcklbw %%mm7, %%mm1                \n\t"
2127
                        "punpcklbw %%mm7, %%mm2                \n\t"
2128
                        "punpcklbw %%mm7, %%mm3                \n\t"
2129
                        "paddw %%mm1, %%mm0                \n\t"
2130
                        "paddw %%mm3, %%mm2                \n\t"
2131
                        "paddw %%mm2, %%mm0                \n\t"
2132
                        "movd 6(%0, %%ebx), %%mm4        \n\t"
2133
                        "movd 6(%1, %%ebx), %%mm1        \n\t"
2134
                        "movd 9(%0, %%ebx), %%mm2        \n\t"
2135
                        "movd 9(%1, %%ebx), %%mm3        \n\t"
2136
                        "punpcklbw %%mm7, %%mm4                \n\t"
2137
                        "punpcklbw %%mm7, %%mm1                \n\t"
2138
                        "punpcklbw %%mm7, %%mm2                \n\t"
2139
                        "punpcklbw %%mm7, %%mm3                \n\t"
2140
                        "paddw %%mm1, %%mm4                \n\t"
2141
                        "paddw %%mm3, %%mm2                \n\t"
2142
                        "paddw %%mm4, %%mm2                \n\t"
2143
                        "psrlw $2, %%mm0                \n\t"
2144
                        "psrlw $2, %%mm2                \n\t"
2145
#endif
2146
                        "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
2147
                        "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
2148

    
2149
                        "pmaddwd %%mm0, %%mm1                \n\t"
2150
                        "pmaddwd %%mm2, %%mm3                \n\t"
2151
                        "pmaddwd %%mm6, %%mm0                \n\t"
2152
                        "pmaddwd %%mm6, %%mm2                \n\t"
2153
#ifndef FAST_BGR2YV12
2154
                        "psrad $8, %%mm0                \n\t"
2155
                        "psrad $8, %%mm1                \n\t"
2156
                        "psrad $8, %%mm2                \n\t"
2157
                        "psrad $8, %%mm3                \n\t"
2158
#endif
2159
                        "packssdw %%mm2, %%mm0                \n\t"
2160
                        "packssdw %%mm3, %%mm1                \n\t"
2161
                        "pmaddwd %%mm5, %%mm0                \n\t"
2162
                        "pmaddwd %%mm5, %%mm1                \n\t"
2163
                        "packssdw %%mm1, %%mm0                \n\t" // V1 V0 U1 U0
2164
                        "psraw $7, %%mm0                \n\t"
2165

    
2166
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2167
                        "movq 12(%0, %%ebx), %%mm4        \n\t"
2168
                        "movq 12(%1, %%ebx), %%mm1        \n\t"
2169
                        "movq 18(%0, %%ebx), %%mm2        \n\t"
2170
                        "movq 18(%1, %%ebx), %%mm3        \n\t"
2171
                        PAVGB" %%mm1, %%mm4                \n\t"
2172
                        PAVGB" %%mm3, %%mm2                \n\t"
2173
                        "movq %%mm4, %%mm1                \n\t"
2174
                        "movq %%mm2, %%mm3                \n\t"
2175
                        "psrlq $24, %%mm4                \n\t"
2176
                        "psrlq $24, %%mm2                \n\t"
2177
                        PAVGB" %%mm1, %%mm4                \n\t"
2178
                        PAVGB" %%mm3, %%mm2                \n\t"
2179
                        "punpcklbw %%mm7, %%mm4                \n\t"
2180
                        "punpcklbw %%mm7, %%mm2                \n\t"
2181
#else
2182
                        "movd 12(%0, %%ebx), %%mm4        \n\t"
2183
                        "movd 12(%1, %%ebx), %%mm1        \n\t"
2184
                        "movd 15(%0, %%ebx), %%mm2        \n\t"
2185
                        "movd 15(%1, %%ebx), %%mm3        \n\t"
2186
                        "punpcklbw %%mm7, %%mm4                \n\t"
2187
                        "punpcklbw %%mm7, %%mm1                \n\t"
2188
                        "punpcklbw %%mm7, %%mm2                \n\t"
2189
                        "punpcklbw %%mm7, %%mm3                \n\t"
2190
                        "paddw %%mm1, %%mm4                \n\t"
2191
                        "paddw %%mm3, %%mm2                \n\t"
2192
                        "paddw %%mm2, %%mm4                \n\t"
2193
                        "movd 18(%0, %%ebx), %%mm5        \n\t"
2194
                        "movd 18(%1, %%ebx), %%mm1        \n\t"
2195
                        "movd 21(%0, %%ebx), %%mm2        \n\t"
2196
                        "movd 21(%1, %%ebx), %%mm3        \n\t"
2197
                        "punpcklbw %%mm7, %%mm5                \n\t"
2198
                        "punpcklbw %%mm7, %%mm1                \n\t"
2199
                        "punpcklbw %%mm7, %%mm2                \n\t"
2200
                        "punpcklbw %%mm7, %%mm3                \n\t"
2201
                        "paddw %%mm1, %%mm5                \n\t"
2202
                        "paddw %%mm3, %%mm2                \n\t"
2203
                        "paddw %%mm5, %%mm2                \n\t"
2204
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
2205
                        "psrlw $2, %%mm4                \n\t"
2206
                        "psrlw $2, %%mm2                \n\t"
2207
#endif
2208
                        "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
2209
                        "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
2210

    
2211
                        "pmaddwd %%mm4, %%mm1                \n\t"
2212
                        "pmaddwd %%mm2, %%mm3                \n\t"
2213
                        "pmaddwd %%mm6, %%mm4                \n\t"
2214
                        "pmaddwd %%mm6, %%mm2                \n\t"
2215
#ifndef FAST_BGR2YV12
2216
                        "psrad $8, %%mm4                \n\t"
2217
                        "psrad $8, %%mm1                \n\t"
2218
                        "psrad $8, %%mm2                \n\t"
2219
                        "psrad $8, %%mm3                \n\t"
2220
#endif
2221
                        "packssdw %%mm2, %%mm4                \n\t"
2222
                        "packssdw %%mm3, %%mm1                \n\t"
2223
                        "pmaddwd %%mm5, %%mm4                \n\t"
2224
                        "pmaddwd %%mm5, %%mm1                \n\t"
2225
                        "addl $24, %%ebx                \n\t"
2226
                        "packssdw %%mm1, %%mm4                \n\t" // V3 V2 U3 U2
2227
                        "psraw $7, %%mm4                \n\t"
2228

    
2229
                        "movq %%mm0, %%mm1                \n\t"
2230
                        "punpckldq %%mm4, %%mm0                \n\t"
2231
                        "punpckhdq %%mm4, %%mm1                \n\t"
2232
                        "packsswb %%mm1, %%mm0                \n\t"
2233
                        "paddb "MANGLE(bgr2UVOffset)", %%mm0        \n\t"
2234

    
2235
                        "movd %%mm0, (%2, %%eax)        \n\t"
2236
                        "punpckhdq %%mm0, %%mm0                \n\t"
2237
                        "movd %%mm0, (%3, %%eax)        \n\t"
2238
                        "addl $4, %%eax                        \n\t"
2239
                        " js 1b                                \n\t"
2240
                        : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
2241
                        : "%eax", "%ebx"
2242
                );
2243

    
2244
                udst += chromStride;
2245
                vdst += chromStride;
2246
                src  += srcStride*2;
2247
        }
2248

    
2249
        asm volatile(   EMMS" \n\t"
2250
                        SFENCE" \n\t"
2251
                        :::"memory");
2252
#else
2253
        y=0;
2254
#endif
2255
        for(; y<height; y+=2)
2256
        {
2257
                unsigned i;
2258
                for(i=0; i<chromWidth; i++)
2259
                {
2260
                        unsigned int b= src[6*i+0];
2261
                        unsigned int g= src[6*i+1];
2262
                        unsigned int r= src[6*i+2];
2263

    
2264
                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2265
                        unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2266
                        unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2267

    
2268
                        udst[i]         = U;
2269
                        vdst[i]         = V;
2270
                        ydst[2*i]         = Y;
2271

    
2272
                        b= src[6*i+3];
2273
                        g= src[6*i+4];
2274
                        r= src[6*i+5];
2275

    
2276
                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2277
                        ydst[2*i+1]         = Y;
2278
                }
2279
                ydst += lumStride;
2280
                src  += srcStride;
2281

    
2282
                for(i=0; i<chromWidth; i++)
2283
                {
2284
                        unsigned int b= src[6*i+0];
2285
                        unsigned int g= src[6*i+1];
2286
                        unsigned int r= src[6*i+2];
2287

    
2288
                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2289

    
2290
                        ydst[2*i]         = Y;
2291

    
2292
                        b= src[6*i+3];
2293
                        g= src[6*i+4];
2294
                        r= src[6*i+5];
2295

    
2296
                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2297
                        ydst[2*i+1]         = Y;
2298
                }
2299
                udst += chromStride;
2300
                vdst += chromStride;
2301
                ydst += lumStride;
2302
                src  += srcStride;
2303
        }
2304
}
2305

    
2306
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2307
                            unsigned width, unsigned height, int src1Stride,
2308
                            int src2Stride, int dstStride){
2309
        unsigned h;
2310

    
2311
        for(h=0; h < height; h++)
2312
        {
2313
                unsigned w;
2314

    
2315
#ifdef HAVE_MMX
2316
#ifdef HAVE_SSE2
2317
                asm(
2318
                        "xorl %%eax, %%eax                \n\t"
2319
                        "1:                                \n\t"
2320
                        PREFETCH" 64(%1, %%eax)                \n\t"
2321
                        PREFETCH" 64(%2, %%eax)                \n\t"
2322
                        "movdqa (%1, %%eax), %%xmm0        \n\t"
2323
                        "movdqa (%1, %%eax), %%xmm1        \n\t"
2324
                        "movdqa (%2, %%eax), %%xmm2        \n\t"
2325
                        "punpcklbw %%xmm2, %%xmm0        \n\t"
2326
                        "punpckhbw %%xmm2, %%xmm1        \n\t"
2327
                        "movntdq %%xmm0, (%0, %%eax, 2)        \n\t"
2328
                        "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
2329
                        "addl $16, %%eax                        \n\t"
2330
                        "cmpl %3, %%eax                        \n\t"
2331
                        " jb 1b                                \n\t"
2332
                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2333
                        : "memory", "%eax"
2334
                );
2335
#else
2336
                asm(
2337
                        "xorl %%eax, %%eax                \n\t"
2338
                        "1:                                \n\t"
2339
                        PREFETCH" 64(%1, %%eax)                \n\t"
2340
                        PREFETCH" 64(%2, %%eax)                \n\t"
2341
                        "movq (%1, %%eax), %%mm0        \n\t"
2342
                        "movq 8(%1, %%eax), %%mm2        \n\t"
2343
                        "movq %%mm0, %%mm1                \n\t"
2344
                        "movq %%mm2, %%mm3                \n\t"
2345
                        "movq (%2, %%eax), %%mm4        \n\t"
2346
                        "movq 8(%2, %%eax), %%mm5        \n\t"
2347
                        "punpcklbw %%mm4, %%mm0                \n\t"
2348
                        "punpckhbw %%mm4, %%mm1                \n\t"
2349
                        "punpcklbw %%mm5, %%mm2                \n\t"
2350
                        "punpckhbw %%mm5, %%mm3                \n\t"
2351
                        MOVNTQ" %%mm0, (%0, %%eax, 2)        \n\t"
2352
                        MOVNTQ" %%mm1, 8(%0, %%eax, 2)        \n\t"
2353
                        MOVNTQ" %%mm2, 16(%0, %%eax, 2)        \n\t"
2354
                        MOVNTQ" %%mm3, 24(%0, %%eax, 2)        \n\t"
2355
                        "addl $16, %%eax                        \n\t"
2356
                        "cmpl %3, %%eax                        \n\t"
2357
                        " jb 1b                                \n\t"
2358
                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2359
                        : "memory", "%eax"
2360
                );
2361
#endif
2362
                for(w= (width&(~15)); w < width; w++)
2363
                {
2364
                        dest[2*w+0] = src1[w];
2365
                        dest[2*w+1] = src2[w];
2366
                }
2367
#else
2368
                for(w=0; w < width; w++)
2369
                {
2370
                        dest[2*w+0] = src1[w];
2371
                        dest[2*w+1] = src2[w];
2372
                }
2373
#endif
2374
                dest += dstStride;
2375
                src1 += src1Stride;
2376
                src2 += src2Stride;
2377
        }
2378
#ifdef HAVE_MMX
2379
        asm(
2380
                EMMS" \n\t"
2381
                SFENCE" \n\t"
2382
                ::: "memory"
2383
                );
2384
#endif
2385
}
2386

    
2387
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2388
                        uint8_t *dst1, uint8_t *dst2,
2389
                        unsigned width, unsigned height,
2390
                        int srcStride1, int srcStride2,
2391
                        int dstStride1, int dstStride2)
2392
{
2393
    unsigned int y,x,h;
2394
    int w;
2395
    w=width/2; h=height/2;
2396
#ifdef HAVE_MMX
2397
    asm volatile(
2398
        PREFETCH" %0\n\t"
2399
        PREFETCH" %1\n\t"
2400
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2401
#endif
2402
    for(y=0;y<h;y++){
2403
        const uint8_t* s1=src1+srcStride1*(y>>1);
2404
        uint8_t* d=dst1+dstStride1*y;
2405
        x=0;
2406
#ifdef HAVE_MMX
2407
        for(;x<w-31;x+=32)
2408
        {
2409
            asm volatile(
2410
                PREFETCH" 32%1\n\t"
2411
                "movq        %1, %%mm0\n\t"
2412
                "movq        8%1, %%mm2\n\t"
2413
                "movq        16%1, %%mm4\n\t"
2414
                "movq        24%1, %%mm6\n\t"
2415
                "movq        %%mm0, %%mm1\n\t"
2416
                "movq        %%mm2, %%mm3\n\t"
2417
                "movq        %%mm4, %%mm5\n\t"
2418
                "movq        %%mm6, %%mm7\n\t"
2419
                "punpcklbw %%mm0, %%mm0\n\t"
2420
                "punpckhbw %%mm1, %%mm1\n\t"
2421
                "punpcklbw %%mm2, %%mm2\n\t"
2422
                "punpckhbw %%mm3, %%mm3\n\t"
2423
                "punpcklbw %%mm4, %%mm4\n\t"
2424
                "punpckhbw %%mm5, %%mm5\n\t"
2425
                "punpcklbw %%mm6, %%mm6\n\t"
2426
                "punpckhbw %%mm7, %%mm7\n\t"
2427
                MOVNTQ"        %%mm0, %0\n\t"
2428
                MOVNTQ"        %%mm1, 8%0\n\t"
2429
                MOVNTQ"        %%mm2, 16%0\n\t"
2430
                MOVNTQ"        %%mm3, 24%0\n\t"
2431
                MOVNTQ"        %%mm4, 32%0\n\t"
2432
                MOVNTQ"        %%mm5, 40%0\n\t"
2433
                MOVNTQ"        %%mm6, 48%0\n\t"
2434
                MOVNTQ"        %%mm7, 56%0"
2435
                :"=m"(d[2*x])
2436
                :"m"(s1[x])
2437
                :"memory");
2438
        }
2439
#endif
2440
        for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2441
    }
2442
    for(y=0;y<h;y++){
2443
        const uint8_t* s2=src2+srcStride2*(y>>1);
2444
        uint8_t* d=dst2+dstStride2*y;
2445
        x=0;
2446
#ifdef HAVE_MMX
2447
        for(;x<w-31;x+=32)
2448
        {
2449
            asm volatile(
2450
                PREFETCH" 32%1\n\t"
2451
                "movq        %1, %%mm0\n\t"
2452
                "movq        8%1, %%mm2\n\t"
2453
                "movq        16%1, %%mm4\n\t"
2454
                "movq        24%1, %%mm6\n\t"
2455
                "movq        %%mm0, %%mm1\n\t"
2456
                "movq        %%mm2, %%mm3\n\t"
2457
                "movq        %%mm4, %%mm5\n\t"
2458
                "movq        %%mm6, %%mm7\n\t"
2459
                "punpcklbw %%mm0, %%mm0\n\t"
2460
                "punpckhbw %%mm1, %%mm1\n\t"
2461
                "punpcklbw %%mm2, %%mm2\n\t"
2462
                "punpckhbw %%mm3, %%mm3\n\t"
2463
                "punpcklbw %%mm4, %%mm4\n\t"
2464
                "punpckhbw %%mm5, %%mm5\n\t"
2465
                "punpcklbw %%mm6, %%mm6\n\t"
2466
                "punpckhbw %%mm7, %%mm7\n\t"
2467
                MOVNTQ"        %%mm0, %0\n\t"
2468
                MOVNTQ"        %%mm1, 8%0\n\t"
2469
                MOVNTQ"        %%mm2, 16%0\n\t"
2470
                MOVNTQ"        %%mm3, 24%0\n\t"
2471
                MOVNTQ"        %%mm4, 32%0\n\t"
2472
                MOVNTQ"        %%mm5, 40%0\n\t"
2473
                MOVNTQ"        %%mm6, 48%0\n\t"
2474
                MOVNTQ"        %%mm7, 56%0"
2475
                :"=m"(d[2*x])
2476
                :"m"(s2[x])
2477
                :"memory");
2478
        }
2479
#endif
2480
        for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2481
    }
2482
#ifdef HAVE_MMX
2483
        asm(
2484
                EMMS" \n\t"
2485
                SFENCE" \n\t"
2486
                ::: "memory"
2487
                );
2488
#endif
2489
}
2490

    
2491
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2492
                        uint8_t *dst,
2493
                        unsigned width, unsigned height,
2494
                        int srcStride1, int srcStride2,
2495
                        int srcStride3, int dstStride)
2496
{
2497
    unsigned y,x,w,h;
2498
    w=width/2; h=height;
2499
    for(y=0;y<h;y++){
2500
        const uint8_t* yp=src1+srcStride1*y;
2501
        const uint8_t* up=src2+srcStride2*(y>>2);
2502
        const uint8_t* vp=src3+srcStride3*(y>>2);
2503
        uint8_t* d=dst+dstStride*y;
2504
        x=0;
2505
#ifdef HAVE_MMX
2506
        for(;x<w-7;x+=8)
2507
        {
2508
            asm volatile(
2509
                PREFETCH" 32(%1, %0)\n\t"
2510
                PREFETCH" 32(%2, %0)\n\t"
2511
                PREFETCH" 32(%3, %0)\n\t"
2512
                "movq        (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2513
                "movq        (%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
2514
                "movq        (%3, %0), %%mm2\n\t"             /* V0V1V2V3V4V5V6V7 */
2515
                "movq        %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2516
                "movq        %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
2517
                "movq        %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
2518
                "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2519
                "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2520
                "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2521
                "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2522

    
2523
                "movq        %%mm1, %%mm6\n\t"
2524
                "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2525
                "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2526
                "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2527
                MOVNTQ"        %%mm0, (%4, %0, 8)\n\t"
2528
                MOVNTQ"        %%mm3, 8(%4, %0, 8)\n\t"
2529
                
2530
                "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2531
                "movq        8(%1, %0, 4), %%mm0\n\t"
2532
                "movq        %%mm0, %%mm3\n\t"
2533
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2534
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2535
                MOVNTQ"        %%mm0, 16(%4, %0, 8)\n\t"
2536
                MOVNTQ"        %%mm3, 24(%4, %0, 8)\n\t"
2537

    
2538
                "movq        %%mm4, %%mm6\n\t"
2539
                "movq        16(%1, %0, 4), %%mm0\n\t"
2540
                "movq        %%mm0, %%mm3\n\t"
2541
                "punpcklbw %%mm5, %%mm4\n\t"
2542
                "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2543
                "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2544
                MOVNTQ"        %%mm0, 32(%4, %0, 8)\n\t"
2545
                MOVNTQ"        %%mm3, 40(%4, %0, 8)\n\t"
2546
                
2547
                "punpckhbw %%mm5, %%mm6\n\t"
2548
                "movq        24(%1, %0, 4), %%mm0\n\t"
2549
                "movq        %%mm0, %%mm3\n\t"
2550
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2551
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2552
                MOVNTQ"        %%mm0, 48(%4, %0, 8)\n\t"
2553
                MOVNTQ"        %%mm3, 56(%4, %0, 8)\n\t"
2554

    
2555
                : "+r" (x)
2556
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
2557
                :"memory");
2558
        }
2559
#endif
2560
        for(; x<w; x++)
2561
        {
2562
            const int x2= x<<2;
2563
            d[8*x+0]=yp[x2];
2564
            d[8*x+1]=up[x];
2565
            d[8*x+2]=yp[x2+1];
2566
            d[8*x+3]=vp[x];
2567
            d[8*x+4]=yp[x2+2];
2568
            d[8*x+5]=up[x];
2569
            d[8*x+6]=yp[x2+3];
2570
            d[8*x+7]=vp[x];
2571
        }
2572
    }
2573
#ifdef HAVE_MMX
2574
        asm(
2575
                EMMS" \n\t"
2576
                SFENCE" \n\t"
2577
                ::: "memory"
2578
                );
2579
#endif
2580
}