/*
 *
 *  rgb2rgb.c, Software RGB to RGB converter
 *  pluralize by Software PAL8 to RGB converter
 *               Software YUV to YUV converter
 *               Software YUV to RGB converter
 *  Written by Nick Kurshev.
 *  palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) (under GPL)
 */

#include <stddef.h>
#include <inttypes.h> /* for __WORDSIZE */

#ifndef __WORDSIZE
// #warning You have a misconfigured system and will probably lose performance!
#define __WORDSIZE MP_WORDSIZE
#endif

#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB          "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB          "pavgb"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
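
/*
 * Note on the pattern used throughout this file: each MMX code path below
 * prefetches upcoming source data with PREFETCH, converts pixels in groups
 * inside the MMX registers, and stores the results with MOVNTQ (a
 * non-temporal, cache-bypassing store when MMX2 is available, a plain movq
 * otherwise). After each loop a single SFENCE orders the streaming stores
 * and EMMS restores the FPU state before the scalar tail handles the
 * remaining pixels.
 */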

static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end;
#ifdef HAVE_MMX
  const uint8_t *mm_end;
#endif
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
  mm_end = end - 23;
  __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
  while(s < mm_end)
  {
    __asm __volatile(
        PREFETCH"        32%1\n\t"
        "movd        %1, %%mm0\n\t"
        "punpckldq 3%1, %%mm0\n\t"
        "movd        6%1, %%mm1\n\t"
        "punpckldq 9%1, %%mm1\n\t"
        "movd        12%1, %%mm2\n\t"
        "punpckldq 15%1, %%mm2\n\t"
        "movd        18%1, %%mm3\n\t"
        "punpckldq 21%1, %%mm3\n\t"
        "pand        %%mm7, %%mm0\n\t"
        "pand        %%mm7, %%mm1\n\t"
        "pand        %%mm7, %%mm2\n\t"
        "pand        %%mm7, %%mm3\n\t"
        MOVNTQ"        %%mm0, %0\n\t"
        MOVNTQ"        %%mm1, 8%0\n\t"
        MOVNTQ"        %%mm2, 16%0\n\t"
        MOVNTQ"        %%mm3, 24%0"
        :"=m"(*dest)
        :"m"(*s)
        :"memory");
    dest += 32;
    s += 24;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  while(s < end)
  {
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = 0;
  }
}

static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end;
#ifdef HAVE_MMX
  const uint8_t *mm_end;
#endif
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
  mm_end = end - 31;
  while(s < mm_end)
  {
    __asm __volatile(
        PREFETCH"        32%1\n\t"
        "movq        %1, %%mm0\n\t"
        "movq        8%1, %%mm1\n\t"
        "movq        16%1, %%mm4\n\t"
        "movq        24%1, %%mm5\n\t"
        "movq        %%mm0, %%mm2\n\t"
        "movq        %%mm1, %%mm3\n\t"
        "movq        %%mm4, %%mm6\n\t"
        "movq        %%mm5, %%mm7\n\t"
        "psrlq        $8, %%mm2\n\t"
        "psrlq        $8, %%mm3\n\t"
        "psrlq        $8, %%mm6\n\t"
        "psrlq        $8, %%mm7\n\t"
        "pand        %2, %%mm0\n\t"
        "pand        %2, %%mm1\n\t"
        "pand        %2, %%mm4\n\t"
        "pand        %2, %%mm5\n\t"
        "pand        %3, %%mm2\n\t"
        "pand        %3, %%mm3\n\t"
        "pand        %3, %%mm6\n\t"
        "pand        %3, %%mm7\n\t"
        "por        %%mm2, %%mm0\n\t"
        "por        %%mm3, %%mm1\n\t"
        "por        %%mm6, %%mm4\n\t"
        "por        %%mm7, %%mm5\n\t"

        "movq        %%mm1, %%mm2\n\t"
        "movq        %%mm4, %%mm3\n\t"
        "psllq        $48, %%mm2\n\t"
        "psllq        $32, %%mm3\n\t"
        "pand        %4, %%mm2\n\t"
        "pand        %5, %%mm3\n\t"
        "por        %%mm2, %%mm0\n\t"
        "psrlq        $16, %%mm1\n\t"
        "psrlq        $32, %%mm4\n\t"
        "psllq        $16, %%mm5\n\t"
        "por        %%mm3, %%mm1\n\t"
        "pand        %6, %%mm5\n\t"
        "por        %%mm5, %%mm4\n\t"

        MOVNTQ"        %%mm0, %0\n\t"
        MOVNTQ"        %%mm1, 8%0\n\t"
        MOVNTQ"        %%mm4, 16%0"
        :"=m"(*dest)
        :"m"(*s),"m"(mask24l),
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
    dest += 24;
    s += 32;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  while(s < end)
  {
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = *s++;
    s++;
  }
}

/*
 Original by Strepto/Astral
 ported to gcc & bugfixed by A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32-bit C version and the and&add trick by Michael Niedermayer
*/
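
/*
 * Illustrative helper (not used by the converters below): a one-pixel view
 * of the "and & add" trick from the C fall-back in rgb15to16. Masking off
 * the top bit and adding the red/green field to itself shifts red and green
 * left by one position, turning 0RRRRRGGGGGBBBBB (RGB15) into
 * RRRRRGGGGG0BBBBB (RGB16 with the extra green bit cleared) while blue stays
 * in place. The 32-bit loop below applies the same trick to two pixels at
 * once with the masks 0x7FFF7FFF and 0x7FE07FE0.
 */
static inline uint16_t RENAME(rgb15to16_onepixel_example)(uint16_t x)
{
        /* equivalent to ((x&0x7FE0)<<1) | (x&0x001F) */
        return (uint16_t)((x&0x7FFF) + (x&0x7FE0));
}
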
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  register const uint8_t* s=src;
  register uint8_t* d=dst;
  register const uint8_t *end;
  const uint8_t *mm_end;
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
  __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
  mm_end = end - 15;
  while(s<mm_end)
  {
        __asm __volatile(
                PREFETCH"        32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        8%1, %%mm2\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm2, %%mm3\n\t"
                "pand        %%mm4, %%mm0\n\t"
                "pand        %%mm4, %%mm2\n\t"
                "paddw        %%mm1, %%mm0\n\t"
                "paddw        %%mm3, %%mm2\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm2, 8%0"
                :"=m"(*d)
                :"m"(*s)
                );
        d+=16;
        s+=16;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while(s < mm_end)
    {
        register unsigned x= *((uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    if(s < end)
    {
        register unsigned short x= *((uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}

static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  register const uint8_t* s=src;
  register uint8_t* d=dst;
  register const uint8_t *end;
  const uint8_t *mm_end;
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
  __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
  __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
  mm_end = end - 15;
  while(s<mm_end)
  {
        __asm __volatile(
                PREFETCH"        32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        8%1, %%mm2\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm2, %%mm3\n\t"
                "psrlq        $1, %%mm0\n\t"
                "psrlq        $1, %%mm2\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm3\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm3, %%mm2\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm2, 8%0"
                :"=m"(*d)
                :"m"(*s)
                );
        d+=16;
        s+=16;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while(s < mm_end)
    {
        register uint32_t x= *((uint32_t *)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    if(s < end)
    {
        register uint16_t x= *((uint16_t *)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
        s+=2;
        d+=2;
    }
}

static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
        asm volatile(
                "movq %3, %%mm5                        \n\t"
                "movq %4, %%mm6                        \n\t"
                "movq %5, %%mm7                        \n\t"
                ".balign 16                        \n\t"
                "1:                                \n\t"
                PREFETCH" 32(%1)                \n\t"
                "movd        (%1), %%mm0                \n\t"
                "movd        4(%1), %%mm3                \n\t"
                "punpckldq 8(%1), %%mm0                \n\t"
                "punpckldq 12(%1), %%mm3        \n\t"
                "movq %%mm0, %%mm1                \n\t"
                "movq %%mm3, %%mm4                \n\t"
                "pand %%mm6, %%mm0                \n\t"
                "pand %%mm6, %%mm3                \n\t"
                "pmaddwd %%mm7, %%mm0                \n\t"
                "pmaddwd %%mm7, %%mm3                \n\t"
                "pand %%mm5, %%mm1                \n\t"
                "pand %%mm5, %%mm4                \n\t"
                "por %%mm1, %%mm0                \n\t"
                "por %%mm4, %%mm3                \n\t"
                "psrld $5, %%mm0                \n\t"
                "pslld $11, %%mm3                \n\t"
                "por %%mm3, %%mm0                \n\t"
                MOVNTQ"        %%mm0, (%0)                \n\t"
                "addl $16, %1                        \n\t"
                "addl $8, %0                        \n\t"
                "cmpl %2, %1                        \n\t"
                " jb 1b                                \n\t"
                : "+r" (d), "+r"(s)
                : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
        );
#else
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $5, %%mm1\n\t"
                "psrlq        $5, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 16;
        }
#endif
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        while(s < end)
        {
                const int src= *((uint32_t *)s); s += 4;
                *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
//                *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
        }
}

static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        mm_end = end - 15;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psllq        $8, %%mm0\n\t"
                "psllq        $8, %%mm3\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm3\n\t"
                "psrlq        $5, %%mm1\n\t"
                "psrlq        $5, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $19, %%mm2\n\t"
                "psrlq        $19, %%mm5\n\t"
                "pand        %2, %%mm2\n\t"
                "pand        %2, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 16;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        while(s < end)
        {
                const int src= *((uint32_t *)s); s += 4;
                *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
        }
}

static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
        asm volatile(
                "movq %3, %%mm5                        \n\t"
                "movq %4, %%mm6                        \n\t"
                "movq %5, %%mm7                        \n\t"
                ".balign 16                        \n\t"
                "1:                                \n\t"
                PREFETCH" 32(%1)                \n\t"
                "movd        (%1), %%mm0                \n\t"
                "movd        4(%1), %%mm3                \n\t"
                "punpckldq 8(%1), %%mm0                \n\t"
                "punpckldq 12(%1), %%mm3        \n\t"
                "movq %%mm0, %%mm1                \n\t"
                "movq %%mm3, %%mm4                \n\t"
                "pand %%mm6, %%mm0                \n\t"
                "pand %%mm6, %%mm3                \n\t"
                "pmaddwd %%mm7, %%mm0                \n\t"
                "pmaddwd %%mm7, %%mm3                \n\t"
                "pand %%mm5, %%mm1                \n\t"
                "pand %%mm5, %%mm4                \n\t"
                "por %%mm1, %%mm0                \n\t"
                "por %%mm4, %%mm3                \n\t"
                "psrld $6, %%mm0                \n\t"
                "pslld $10, %%mm3                \n\t"
                "por %%mm3, %%mm0                \n\t"
                MOVNTQ"        %%mm0, (%0)                \n\t"
                "addl $16, %1                        \n\t"
                "addl $8, %0                        \n\t"
                "cmpl %2, %1                        \n\t"
                " jb 1b                                \n\t"
                : "+r" (d), "+r"(s)
                : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
        );
#else
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $9, %%mm2\n\t"
                "psrlq        $9, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 16;
        }
#endif
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        while(s < end)
        {
                const int src= *((uint32_t *)s); s += 4;
                *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
        }
}

static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        mm_end = end - 15;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psllq        $7, %%mm0\n\t"
                "psllq        $7, %%mm3\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $19, %%mm2\n\t"
                "psrlq        $19, %%mm5\n\t"
                "pand        %2, %%mm2\n\t"
                "pand        %2, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 16;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        while(s < end)
        {
                const int src= *((uint32_t *)s); s += 4;
                *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
        }
}

static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        mm_end = end - 11;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $5, %%mm1\n\t"
                "psrlq        $5, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        while(s < end)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
}

static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        mm_end = end - 15;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psllq        $8, %%mm0\n\t"
                "psllq        $8, %%mm3\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm3\n\t"
                "psrlq        $5, %%mm1\n\t"
                "psrlq        $5, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $19, %%mm2\n\t"
                "psrlq        $19, %%mm5\n\t"
                "pand        %2, %%mm2\n\t"
                "pand        %2, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        while(s < end)
        {
                const int r= *s++;
                const int g= *s++;
                const int b= *s++;
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
}

static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        mm_end = end - 11;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $9, %%mm2\n\t"
                "psrlq        $9, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        while(s < end)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
}

static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        mm_end = end - 15;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psllq        $7, %%mm0\n\t"
                "psllq        $7, %%mm3\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $19, %%mm2\n\t"
                "psrlq        $19, %%mm5\n\t"
                "pand        %2, %%mm2\n\t"
                "pand        %2, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        while(s < end)
        {
                const int r= *s++;
                const int g= *s++;
                const int b= *s++;
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
}

/*
  I use a less accurate approximation here by simply left-shifting the input
  value and filling the low-order bits with zeroes. This method improves PNG
  compression, but the scheme cannot reproduce white exactly, since it does
  not generate an all-ones maximum value; the net effect is to darken the
  image slightly.

  The better method would be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      Leftmost Bits Repeated to Fill Open Bits
       |
   Original Bits
*/
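
/*
 * Illustrative helper (not used by the converters below): "left bit
 * replication" as described above would expand a 5-bit component to 8 bits
 * by repeating its top bits in the freed low-order positions, so 0x1F maps
 * to 0xFF and full white is preserved.
 */
static inline uint8_t RENAME(replicate5to8_example)(uint8_t v)
{
        /* v must be a 5-bit value (0..31): vvvvv -> vvvvvvvv with the top 3 bits repeated */
        return (uint8_t)((v<<3) | (v>>2));
}
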
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;
#ifdef HAVE_MMX
        const uint16_t *mm_end;
#endif
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
        mm_end = end - 7;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $2, %%mm1\n\t"
                "psrlq        $7, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"

                "movq        %%mm0, %%mm6\n\t"
                "movq        %%mm3, %%mm7\n\t"

                "movq        8%1, %%mm0\n\t"
                "movq        8%1, %%mm1\n\t"
                "movq        8%1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $2, %%mm1\n\t"
                "psrlq        $7, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"

                :"=m"(*d)
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
                :"memory");
            /* Borrowed 32 to 24 */
            __asm __volatile(
                "movq        %%mm0, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "movq        %%mm6, %%mm0\n\t"
                "movq        %%mm7, %%mm1\n\t"

                "movq        %%mm4, %%mm6\n\t"
                "movq        %%mm5, %%mm7\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm1, %%mm3\n\t"

                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm3\n\t"
                "psrlq        $8, %%mm6\n\t"
                "psrlq        $8, %%mm7\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm1\n\t"
                "pand        %2, %%mm4\n\t"
                "pand        %2, %%mm5\n\t"
                "pand        %3, %%mm2\n\t"
                "pand        %3, %%mm3\n\t"
                "pand        %3, %%mm6\n\t"
                "pand        %3, %%mm7\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm3, %%mm1\n\t"
                "por        %%mm6, %%mm4\n\t"
                "por        %%mm7, %%mm5\n\t"

                "movq        %%mm1, %%mm2\n\t"
                "movq        %%mm4, %%mm3\n\t"
                "psllq        $48, %%mm2\n\t"
                "psllq        $32, %%mm3\n\t"
                "pand        %4, %%mm2\n\t"
                "pand        %5, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psrlq        $16, %%mm1\n\t"
                "psrlq        $32, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm3, %%mm1\n\t"
                "pand        %6, %%mm5\n\t"
                "por        %%mm5, %%mm4\n\t"

                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm1, 8%0\n\t"
                MOVNTQ"        %%mm4, 16%0"

                :"=m"(*d)
                :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
                :"memory");
                d += 24;
                s += 8;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x3E0)>>2;
                *d++ = (bgr&0x7C00)>>7;
        }
}

static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;
#ifdef HAVE_MMX
        const uint16_t *mm_end;
#endif
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
        mm_end = end - 7;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm1\n\t"
                "psrlq        $8, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"

                "movq        %%mm0, %%mm6\n\t"
                "movq        %%mm3, %%mm7\n\t"

                "movq        8%1, %%mm0\n\t"
                "movq        8%1, %%mm1\n\t"
                "movq        8%1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm1\n\t"
                "psrlq        $8, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"
                :"=m"(*d)
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
                :"memory");
            /* Borrowed 32 to 24 */
            __asm __volatile(
                "movq        %%mm0, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "movq        %%mm6, %%mm0\n\t"
                "movq        %%mm7, %%mm1\n\t"

                "movq        %%mm4, %%mm6\n\t"
                "movq        %%mm5, %%mm7\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm1, %%mm3\n\t"

                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm3\n\t"
                "psrlq        $8, %%mm6\n\t"
                "psrlq        $8, %%mm7\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm1\n\t"
                "pand        %2, %%mm4\n\t"
                "pand        %2, %%mm5\n\t"
                "pand        %3, %%mm2\n\t"
                "pand        %3, %%mm3\n\t"
                "pand        %3, %%mm6\n\t"
                "pand        %3, %%mm7\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm3, %%mm1\n\t"
                "por        %%mm6, %%mm4\n\t"
                "por        %%mm7, %%mm5\n\t"

                "movq        %%mm1, %%mm2\n\t"
                "movq        %%mm4, %%mm3\n\t"
                "psllq        $48, %%mm2\n\t"
                "psllq        $32, %%mm3\n\t"
                "pand        %4, %%mm2\n\t"
                "pand        %5, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psrlq        $16, %%mm1\n\t"
                "psrlq        $32, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm3, %%mm1\n\t"
                "pand        %6, %%mm5\n\t"
                "por        %%mm5, %%mm4\n\t"

                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm1, 8%0\n\t"
                MOVNTQ"        %%mm4, 16%0"

                :"=m"(*d)
                :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
                :"memory");
                d += 24;
                s += 8;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x7E0)>>3;
                *d++ = (bgr&0xF800)>>8;
        }
}

static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;
#ifdef HAVE_MMX
        const uint16_t *mm_end;
#endif
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
        __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
        mm_end = end - 3;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $2, %%mm1\n\t"
                "psrlq        $7, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %%mm7, %%mm0\n\t"
                "punpcklwd %%mm7, %%mm1\n\t"
                "punpcklwd %%mm7, %%mm2\n\t"
                "punpckhwd %%mm7, %%mm3\n\t"
                "punpckhwd %%mm7, %%mm4\n\t"
                "punpckhwd %%mm7, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm3, 8%0\n\t"
                :"=m"(*d)
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
                :"memory");
                d += 16;
                s += 4;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        while(s < end)
        {
#if 0 //slightly slower on athlon
                int bgr= *s++;
                *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
#else
//FIXME this is very likely wrong for bigendian (and the following converters too)
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x3E0)>>2;
                *d++ = (bgr&0x7C00)>>7;
                *d++ = 0;
#endif
        }
}

static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;
#ifdef HAVE_MMX
        const uint16_t *mm_end;
#endif
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
        __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
        mm_end = end - 3;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm1\n\t"
                "psrlq        $8, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %%mm7, %%mm0\n\t"
                "punpcklwd %%mm7, %%mm1\n\t"
                "punpcklwd %%mm7, %%mm2\n\t"
                "punpckhwd %%mm7, %%mm3\n\t"
                "punpckhwd %%mm7, %%mm4\n\t"
                "punpckhwd %%mm7, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm3, 8%0\n\t"
                :"=m"(*d)
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
                :"memory");
                d += 16;
                s += 4;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x7E0)>>3;
                *d++ = (bgr&0xF800)>>8;
                *d++ = 0;
        }
}
1308

    
1309
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1310
{
1311
#ifdef HAVE_MMX
1312
/* TODO: unroll this loop */
1313
        asm volatile (
1314
                "xorl %%eax, %%eax                \n\t"
1315
                ".balign 16                        \n\t"
1316
                "1:                                \n\t"
1317
                PREFETCH" 32(%0, %%eax)                \n\t"
1318
                "movq (%0, %%eax), %%mm0        \n\t"
1319
                "movq %%mm0, %%mm1                \n\t"
1320
                "movq %%mm0, %%mm2                \n\t"
1321
                "pslld $16, %%mm0                \n\t"
1322
                "psrld $16, %%mm1                \n\t"
1323
                "pand "MANGLE(mask32r)", %%mm0        \n\t"
1324
                "pand "MANGLE(mask32g)", %%mm2        \n\t"
1325
                "pand "MANGLE(mask32b)", %%mm1        \n\t"
1326
                "por %%mm0, %%mm2                \n\t"
1327
                "por %%mm1, %%mm2                \n\t"
1328
                MOVNTQ" %%mm2, (%1, %%eax)        \n\t"
1329
                "addl $8, %%eax                        \n\t"
1330
                "cmpl %2, %%eax                        \n\t"
1331
                " jb 1b                                \n\t"
1332
                :: "r" (src), "r"(dst), "r" (src_size-7)
1333
                : "%eax"
1334
        );
1335

    
1336
        __asm __volatile(SFENCE:::"memory");
1337
        __asm __volatile(EMMS:::"memory");
1338
#else
1339
        unsigned i;
1340
        unsigned num_pixels = src_size >> 2;
1341
        for(i=0; i<num_pixels; i++)
1342
        {
1343
#ifdef WORDS_BIGENDIAN  
1344
          dst[4*i + 1] = src[4*i + 3];
1345
          dst[4*i + 2] = src[4*i + 2];
1346
          dst[4*i + 3] = src[4*i + 1];
1347
#else
1348
          dst[4*i + 0] = src[4*i + 2];
1349
          dst[4*i + 1] = src[4*i + 1];
1350
          dst[4*i + 2] = src[4*i + 0];
1351
#endif
1352
        }
1353
#endif
1354
}
1355

    
1356
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1357
{
1358
        unsigned i;
1359
#ifdef HAVE_MMX
1360
        int mmx_size= 23 - src_size;
1361
        asm volatile (
1362
                "movq "MANGLE(mask24r)", %%mm5        \n\t"
1363
                "movq "MANGLE(mask24g)", %%mm6        \n\t"
1364
                "movq "MANGLE(mask24b)", %%mm7        \n\t"
1365
                ".balign 16                        \n\t"
1366
                "1:                                \n\t"
1367
                PREFETCH" 32(%1, %%eax)                \n\t"
1368
                "movq   (%1, %%eax), %%mm0        \n\t" // BGR BGR BG
1369
                "movq   (%1, %%eax), %%mm1        \n\t" // BGR BGR BG
1370
                "movq  2(%1, %%eax), %%mm2        \n\t" // R BGR BGR B
1371
                "psllq $16, %%mm0                \n\t" // 00 BGR BGR
1372
                "pand %%mm5, %%mm0                \n\t"
1373
                "pand %%mm6, %%mm1                \n\t"
1374
                "pand %%mm7, %%mm2                \n\t"
1375
                "por %%mm0, %%mm1                \n\t"
1376
                "por %%mm2, %%mm1                \n\t"                
1377
                "movq  6(%1, %%eax), %%mm0        \n\t" // BGR BGR BG
1378
                MOVNTQ" %%mm1,   (%2, %%eax)        \n\t" // RGB RGB RG
1379
                "movq  8(%1, %%eax), %%mm1        \n\t" // R BGR BGR B
1380
                "movq 10(%1, %%eax), %%mm2        \n\t" // GR BGR BGR
1381
                "pand %%mm7, %%mm0                \n\t"
1382
                "pand %%mm5, %%mm1                \n\t"
1383
                "pand %%mm6, %%mm2                \n\t"
1384
                "por %%mm0, %%mm1                \n\t"
1385
                "por %%mm2, %%mm1                \n\t"                
1386
                "movq 14(%1, %%eax), %%mm0        \n\t" // R BGR BGR B
1387
                MOVNTQ" %%mm1,  8(%2, %%eax)        \n\t" // B RGB RGB R
1388
                "movq 16(%1, %%eax), %%mm1        \n\t" // GR BGR BGR
1389
                "movq 18(%1, %%eax), %%mm2        \n\t" // BGR BGR BG
1390
                "pand %%mm6, %%mm0                \n\t"
1391
                "pand %%mm7, %%mm1                \n\t"
1392
                "pand %%mm5, %%mm2                \n\t"
1393
                "por %%mm0, %%mm1                \n\t"
1394
                "por %%mm2, %%mm1                \n\t"                
1395
                MOVNTQ" %%mm1, 16(%2, %%eax)        \n\t"
1396
                "addl $24, %%eax                \n\t"
1397
                " js 1b                                \n\t"
1398
                : "+a" (mmx_size)
1399
                : "r" (src-mmx_size), "r"(dst-mmx_size)
1400
        );
1401

    
1402
        __asm __volatile(SFENCE:::"memory");
1403
        __asm __volatile(EMMS:::"memory");
1404

    
1405
        if(mmx_size==23) return; // finished, src_size was a multiple of 8 pixels
1406

    
1407
        src+= src_size;
1408
        dst+= src_size;
1409
        src_size= 23-mmx_size;
1410
        src-= src_size;
1411
        dst-= src_size;
1412
#endif
1413
        for(i=0; i<src_size; i+=3)
1414
        {
1415
                register uint8_t x;
1416
                x          = src[i + 2];
1417
                dst[i + 1] = src[i + 1];
1418
                dst[i + 2] = src[i + 0];
1419
                dst[i + 0] = x;
1420
        }
1421
}
1422

    
1423
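/*
 * Interleaves planar YUV into packed YUY2. Each pixel pair is packed as
 * Y0 | (U<<8) | (Y1<<16) | (V<<24), i.e. Y U Y V byte order on little
 * endian (see the C fallback below). vertLumPerChroma is the number of
 * luma lines sharing one chroma line: 2 for YV12 input, 1 for YUV422P.
 */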
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1424
        unsigned int width, unsigned int height,
1425
        int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1426
{
1427
        unsigned y;
1428
        const unsigned chromWidth= width>>1;
1429
        for(y=0; y<height; y++)
1430
        {
1431
#ifdef HAVE_MMX
1432
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory bandwidth anyway)
1433
                asm volatile(
1434
                        "xorl %%eax, %%eax                \n\t"
1435
                        ".balign 16                        \n\t"
1436
                        "1:                                \n\t"
1437
                        PREFETCH" 32(%1, %%eax, 2)        \n\t"
1438
                        PREFETCH" 32(%2, %%eax)                \n\t"
1439
                        PREFETCH" 32(%3, %%eax)                \n\t"
1440
                        "movq (%2, %%eax), %%mm0        \n\t" // U(0)
1441
                        "movq %%mm0, %%mm2                \n\t" // U(0)
1442
                        "movq (%3, %%eax), %%mm1        \n\t" // V(0)
1443
                        "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
1444
                        "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)
1445

    
1446
                        "movq (%1, %%eax,2), %%mm3        \n\t" // Y(0)
1447
                        "movq 8(%1, %%eax,2), %%mm5        \n\t" // Y(8)
1448
                        "movq %%mm3, %%mm4                \n\t" // Y(0)
1449
                        "movq %%mm5, %%mm6                \n\t" // Y(8)
1450
                        "punpcklbw %%mm0, %%mm3                \n\t" // YUYV YUYV(0)
1451
                        "punpckhbw %%mm0, %%mm4                \n\t" // YUYV YUYV(4)
1452
                        "punpcklbw %%mm2, %%mm5                \n\t" // YUYV YUYV(8)
1453
                        "punpckhbw %%mm2, %%mm6                \n\t" // YUYV YUYV(12)
1454

    
1455
                        MOVNTQ" %%mm3, (%0, %%eax, 4)        \n\t"
1456
                        MOVNTQ" %%mm4, 8(%0, %%eax, 4)        \n\t"
1457
                        MOVNTQ" %%mm5, 16(%0, %%eax, 4)        \n\t"
1458
                        MOVNTQ" %%mm6, 24(%0, %%eax, 4)        \n\t"
1459

    
1460
                        "addl $8, %%eax                        \n\t"
1461
                        "cmpl %4, %%eax                        \n\t"
1462
                        " jb 1b                                \n\t"
1463
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1464
                        : "%eax"
1465
                );
1466
#else
1467

    
1468
#if defined ARCH_ALPHA && defined HAVE_MVI
1469
#define pl2yuy2(n)                                        \
1470
        y1 = yc[n];                                        \
1471
        y2 = yc2[n];                                        \
1472
        u = uc[n];                                        \
1473
        v = vc[n];                                        \
1474
        asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));        \
1475
        asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));        \
1476
        asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
1477
        asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
1478
        yuv1 = (u << 8) + (v << 24);                        \
1479
        yuv2 = yuv1 + y2;                                \
1480
        yuv1 += y1;                                        \
1481
        qdst[n] = yuv1;                                        \
1482
        qdst2[n] = yuv2;
1483

    
1484
                int i;
1485
                uint64_t *qdst = (uint64_t *) dst;
1486
                uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1487
                const uint32_t *yc = (uint32_t *) ysrc;
1488
                const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1489
                const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1490
                for(i = 0; i < chromWidth; i += 8){
1491
                        uint64_t y1, y2, yuv1, yuv2;
1492
                        uint64_t u, v;
1493
                        /* Prefetch */
1494
                        asm("ldq $31,64(%0)" :: "r"(yc));
1495
                        asm("ldq $31,64(%0)" :: "r"(yc2));
1496
                        asm("ldq $31,64(%0)" :: "r"(uc));
1497
                        asm("ldq $31,64(%0)" :: "r"(vc));
1498

    
1499
                        pl2yuy2(0);
1500
                        pl2yuy2(1);
1501
                        pl2yuy2(2);
1502
                        pl2yuy2(3);
1503

    
1504
                        yc += 4;
1505
                        yc2 += 4;
1506
                        uc += 4;
1507
                        vc += 4;
1508
                        qdst += 4;
1509
                        qdst2 += 4;
1510
                }
1511
                y++;
1512
                ysrc += lumStride;
1513
                dst += dstStride;
1514

    
1515
#elif __WORDSIZE >= 64
1516
                int i;
1517
                uint64_t *ldst = (uint64_t *) dst;
1518
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1519
                for(i = 0; i < chromWidth; i += 2){
1520
                        uint64_t k, l;
1521
                        k = yc[0] + (uc[0] << 8) +
1522
                            (yc[1] << 16) + (vc[0] << 24);
1523
                        l = yc[2] + (uc[1] << 8) +
1524
                            (yc[3] << 16) + (vc[1] << 24);
1525
                        *ldst++ = k + (l << 32);
1526
                        yc += 4;
1527
                        uc += 2;
1528
                        vc += 2;
1529
                }
1530

    
1531
#else
1532
                int i, *idst = (int32_t *) dst;
1533
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1534
                for(i = 0; i < chromWidth; i++){
1535
                        *idst++ = yc[0] + (uc[0] << 8) +
1536
                            (yc[1] << 16) + (vc[0] << 24);
1537
                        yc += 2;
1538
                        uc++;
1539
                        vc++;
1540
                }
1541
#endif
1542
#endif
1543
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1544
                {
1545
                        usrc += chromStride;
1546
                        vsrc += chromStride;
1547
                }
1548
                ysrc += lumStride;
1549
                dst += dstStride;
1550
        }
1551
#ifdef HAVE_MMX
1552
asm(    EMMS" \n\t"
1553
        SFENCE" \n\t"
1554
        :::"memory");
1555
#endif
1556
}
1557

    
1558
/**
1559
 *
1560
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1561
 * problem for anyone then tell me, and I'll fix it)
1562
 */
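/*
 * Illustrative call only (the non-template entry point name and the
 * stride values are an example, not something defined in this file):
 * converting one tightly packed YV12 frame could look like
 *
 *   yv12toyuy2(y_plane, u_plane, v_plane, out,
 *              width, height,
 *              width,      // lumStride:   one luma byte per pixel
 *              width/2,    // chromStride: chroma is subsampled 2x
 *              width*2);   // dstStride:   2 bytes per output pixel
 */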
1563
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1564
        unsigned int width, unsigned int height,
1565
        int lumStride, int chromStride, int dstStride)
1566
{
1567
        //FIXME interpolate chroma
1568
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1569
}
1570

    
1571
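/*
 * Same as yuvPlanartoyuy2 above, but packs UYVY: each pixel pair becomes
 * U | (Y0<<8) | (V<<16) | (Y1<<24) (see the C fallback below).
 */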
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1572
        unsigned int width, unsigned int height,
1573
        int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1574
{
1575
        unsigned y;
1576
        const unsigned chromWidth= width>>1;
1577
        for(y=0; y<height; y++)
1578
        {
1579
#ifdef HAVE_MMX
1580
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory bandwidth anyway)
1581
                asm volatile(
1582
                        "xorl %%eax, %%eax                \n\t"
1583
                        ".balign 16                        \n\t"
1584
                        "1:                                \n\t"
1585
                        PREFETCH" 32(%1, %%eax, 2)        \n\t"
1586
                        PREFETCH" 32(%2, %%eax)                \n\t"
1587
                        PREFETCH" 32(%3, %%eax)                \n\t"
1588
                        "movq (%2, %%eax), %%mm0        \n\t" // U(0)
1589
                        "movq %%mm0, %%mm2                \n\t" // U(0)
1590
                        "movq (%3, %%eax), %%mm1        \n\t" // V(0)
1591
                        "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
1592
                        "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)
1593

    
1594
                        "movq (%1, %%eax,2), %%mm3        \n\t" // Y(0)
1595
                        "movq 8(%1, %%eax,2), %%mm5        \n\t" // Y(8)
1596
                        "movq %%mm0, %%mm4                \n\t" // Y(0)
1597
                        "movq %%mm2, %%mm6                \n\t" // Y(8)
1598
                        "punpcklbw %%mm3, %%mm0                \n\t" // YUYV YUYV(0)
1599
                        "punpckhbw %%mm3, %%mm4                \n\t" // YUYV YUYV(4)
1600
                        "punpcklbw %%mm5, %%mm2                \n\t" // YUYV YUYV(8)
1601
                        "punpckhbw %%mm5, %%mm6                \n\t" // YUYV YUYV(12)
1602

    
1603
                        MOVNTQ" %%mm0, (%0, %%eax, 4)        \n\t"
1604
                        MOVNTQ" %%mm4, 8(%0, %%eax, 4)        \n\t"
1605
                        MOVNTQ" %%mm2, 16(%0, %%eax, 4)        \n\t"
1606
                        MOVNTQ" %%mm6, 24(%0, %%eax, 4)        \n\t"
1607

    
1608
                        "addl $8, %%eax                        \n\t"
1609
                        "cmpl %4, %%eax                        \n\t"
1610
                        " jb 1b                                \n\t"
1611
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1612
                        : "%eax"
1613
                );
1614
#else
1615
//FIXME adapt the Alpha (MVI) asm code from yv12->yuy2
1616

    
1617
#if __WORDSIZE >= 64
1618
                int i;
1619
                uint64_t *ldst = (uint64_t *) dst;
1620
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1621
                for(i = 0; i < chromWidth; i += 2){
1622
                        uint64_t k, l;
1623
                        k = uc[0] + (yc[0] << 8) +
1624
                            (vc[0] << 16) + (yc[1] << 24);
1625
                        l = uc[1] + (yc[2] << 8) +
1626
                            (vc[1] << 16) + (yc[3] << 24);
1627
                        *ldst++ = k + (l << 32);
1628
                        yc += 4;
1629
                        uc += 2;
1630
                        vc += 2;
1631
                }
1632

    
1633
#else
1634
                int i, *idst = (int32_t *) dst;
1635
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1636
                for(i = 0; i < chromWidth; i++){
1637
                        *idst++ = uc[0] + (yc[0] << 8) +
1638
                            (vc[0] << 16) + (yc[1] << 24);
1639
                        yc += 2;
1640
                        uc++;
1641
                        vc++;
1642
                }
1643
#endif
1644
#endif
1645
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1646
                {
1647
                        usrc += chromStride;
1648
                        vsrc += chromStride;
1649
                }
1650
                ysrc += lumStride;
1651
                dst += dstStride;
1652
        }
1653
#ifdef HAVE_MMX
1654
asm(    EMMS" \n\t"
1655
        SFENCE" \n\t"
1656
        :::"memory");
1657
#endif
1658
}
1659

    
1660
/**
1661
 *
1662
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1663
 * problem for anyone then tell me, and I'll fix it)
1664
 */
1665
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1666
        unsigned int width, unsigned int height,
1667
        int lumStride, int chromStride, int dstStride)
1668
{
1669
        //FIXME interpolate chroma
1670
        RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1671
}
1672

    
1673
/**
1674
 *
1675
 * width should be a multiple of 16
1676
 */
1677
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1678
        unsigned int width, unsigned int height,
1679
        int lumStride, int chromStride, int dstStride)
1680
{
1681
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1682
}
1683

    
1684
/**
1685
 *
1686
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1687
 * problem for anyone then tell me, and I'll fix it)
1688
 */
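/*
 * The MMX path below splits packed YUYV with a 0x00FF word mask:
 * "pand" keeps the Y bytes, "psrlw $8" keeps the chroma bytes, and
 * packuswb merges the halves back into full registers. Chroma is taken
 * from the even lines only; on the odd lines just the luma is extracted.
 */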
1689
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1690
        unsigned int width, unsigned int height,
1691
        int lumStride, int chromStride, int srcStride)
1692
{
1693
        unsigned y;
1694
        const unsigned chromWidth= width>>1;
1695
        for(y=0; y<height; y+=2)
1696
        {
1697
#ifdef HAVE_MMX
1698
                asm volatile(
1699
                        "xorl %%eax, %%eax                \n\t"
1700
                        "pcmpeqw %%mm7, %%mm7                \n\t"
1701
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1702
                        ".balign 16                        \n\t"
1703
                        "1:                                \n\t"
1704
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
1705
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
1706
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
1707
                        "movq %%mm0, %%mm2                \n\t" // YUYV YUYV(0)
1708
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(4)
1709
                        "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
1710
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
1711
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
1712
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
1713
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
1714
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)
1715

    
1716
                        MOVNTQ" %%mm2, (%1, %%eax, 2)        \n\t"
1717

    
1718
                        "movq 16(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(8)
1719
                        "movq 24(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(12)
1720
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(8)
1721
                        "movq %%mm2, %%mm4                \n\t" // YUYV YUYV(12)
1722
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
1723
                        "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
1724
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
1725
                        "pand %%mm7, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
1726
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
1727
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)
1728

    
1729
                        MOVNTQ" %%mm3, 8(%1, %%eax, 2)        \n\t"
1730

    
1731
                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
1732
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
1733
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1734
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1735
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
1736
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
1737
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
1738
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)
1739

    
1740
                        MOVNTQ" %%mm0, (%3, %%eax)        \n\t"
1741
                        MOVNTQ" %%mm2, (%2, %%eax)        \n\t"
1742

    
1743
                        "addl $8, %%eax                        \n\t"
1744
                        "cmpl %4, %%eax                        \n\t"
1745
                        " jb 1b                                \n\t"
1746
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1747
                        : "memory", "%eax"
1748
                );
1749

    
1750
                ydst += lumStride;
1751
                src  += srcStride;
1752

    
1753
                asm volatile(
1754
                        "xorl %%eax, %%eax                \n\t"
1755
                        ".balign 16                        \n\t"
1756
                        "1:                                \n\t"
1757
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
1758
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
1759
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
1760
                        "movq 16(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(8)
1761
                        "movq 24(%0, %%eax, 4), %%mm3        \n\t" // YUYV YUYV(12)
1762
                        "pand %%mm7, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
1763
                        "pand %%mm7, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
1764
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
1765
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
1766
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
1767
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)
1768

    
1769
                        MOVNTQ" %%mm0, (%1, %%eax, 2)        \n\t"
1770
                        MOVNTQ" %%mm2, 8(%1, %%eax, 2)        \n\t"
1771

    
1772
                        "addl $8, %%eax                        \n\t"
1773
                        "cmpl %4, %%eax                        \n\t"
1774
                        " jb 1b                                \n\t"
1775

    
1776
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1777
                        : "memory", "%eax"
1778
                );
1779
#else
1780
                unsigned i;
1781
                for(i=0; i<chromWidth; i++)
1782
                {
1783
                        ydst[2*i+0]         = src[4*i+0];
1784
                        udst[i]         = src[4*i+1];
1785
                        ydst[2*i+1]         = src[4*i+2];
1786
                        vdst[i]         = src[4*i+3];
1787
                }
1788
                ydst += lumStride;
1789
                src  += srcStride;
1790

    
1791
                for(i=0; i<chromWidth; i++)
1792
                {
1793
                        ydst[2*i+0]         = src[4*i+0];
1794
                        ydst[2*i+1]         = src[4*i+2];
1795
                }
1796
#endif
1797
                udst += chromStride;
1798
                vdst += chromStride;
1799
                ydst += lumStride;
1800
                src  += srcStride;
1801
        }
1802
#ifdef HAVE_MMX
1803
asm volatile(   EMMS" \n\t"
1804
                SFENCE" \n\t"
1805
                :::"memory");
1806
#endif
1807
}
1808

    
1809
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1810
        uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1811
        unsigned int width, unsigned int height, int lumStride, int chromStride)
1812
{
1813
        /* Y Plane */
1814
        memcpy(ydst, ysrc, width*height);
1815

    
1816
        /* XXX: implement upscaling for U,V */
1817
}
1818

    
1819
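/*
 * 2x upscale of a single plane. Interior samples are interpolated with
 * 3:1 / 1:3 weights ((3*a + b)>>2); the first and last output lines are
 * built from a single source line and the edge columns copy the nearest
 * source sample. The MMX2/3DNow path approximates the same weights with
 * two PAVGB steps per sample.
 */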
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1820
{
1821
        int x,y;
1822
        
1823
        dst[0]= src[0];
1824
        
1825
        // first line
1826
        for(x=0; x<srcWidth-1; x++){
1827
                dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1828
                dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1829
        }
1830
        dst[2*srcWidth-1]= src[srcWidth-1];
1831
        
1832
        dst+= dstStride;
1833

    
1834
        for(y=1; y<srcHeight; y++){
1835
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1836
                const int mmxSize= srcWidth&~15;
1837
                asm volatile(
1838
                        "movl %4, %%eax                        \n\t"
1839
                        "1:                                \n\t"
1840
                        "movq (%0, %%eax), %%mm0        \n\t"
1841
                        "movq (%1, %%eax), %%mm1        \n\t"
1842
                        "movq 1(%0, %%eax), %%mm2        \n\t"
1843
                        "movq 1(%1, %%eax), %%mm3        \n\t"
1844
                        "movq -1(%0, %%eax), %%mm4        \n\t"
1845
                        "movq -1(%1, %%eax), %%mm5        \n\t"
1846
                        PAVGB" %%mm0, %%mm5                \n\t"
1847
                        PAVGB" %%mm0, %%mm3                \n\t"
1848
                        PAVGB" %%mm0, %%mm5                \n\t"
1849
                        PAVGB" %%mm0, %%mm3                \n\t"
1850
                        PAVGB" %%mm1, %%mm4                \n\t"
1851
                        PAVGB" %%mm1, %%mm2                \n\t"
1852
                        PAVGB" %%mm1, %%mm4                \n\t"
1853
                        PAVGB" %%mm1, %%mm2                \n\t"
1854
                        "movq %%mm5, %%mm7                \n\t"
1855
                        "movq %%mm4, %%mm6                \n\t"
1856
                        "punpcklbw %%mm3, %%mm5                \n\t"
1857
                        "punpckhbw %%mm3, %%mm7                \n\t"
1858
                        "punpcklbw %%mm2, %%mm4                \n\t"
1859
                        "punpckhbw %%mm2, %%mm6                \n\t"
1860
#if 1
1861
                        MOVNTQ" %%mm5, (%2, %%eax, 2)        \n\t"
1862
                        MOVNTQ" %%mm7, 8(%2, %%eax, 2)        \n\t"
1863
                        MOVNTQ" %%mm4, (%3, %%eax, 2)        \n\t"
1864
                        MOVNTQ" %%mm6, 8(%3, %%eax, 2)        \n\t"
1865
#else
1866
                        "movq %%mm5, (%2, %%eax, 2)        \n\t"
1867
                        "movq %%mm7, 8(%2, %%eax, 2)        \n\t"
1868
                        "movq %%mm4, (%3, %%eax, 2)        \n\t"
1869
                        "movq %%mm6, 8(%3, %%eax, 2)        \n\t"
1870
#endif
1871
                        "addl $8, %%eax                        \n\t"
1872
                        " js 1b                                \n\t"
1873
                        :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1874
                           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1875
                           "g" (-mmxSize)
1876
                        : "%eax"
1877

    
1878
                );
1879
#else
1880
                const int mmxSize=1;
1881
#endif
1882
                dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1883
                dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1884

    
1885
                for(x=mmxSize-1; x<srcWidth-1; x++){
1886
                        dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1887
                        dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1888
                        dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1889
                        dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1890
                }
1891
                dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1892
                dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1893

    
1894
                dst+=dstStride*2;
1895
                src+=srcStride;
1896
        }
1897
        
1898
        // last line
1899
#if 1
1900
        dst[0]= src[0];
1901
        
1902
        for(x=0; x<srcWidth-1; x++){
1903
                dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1904
                dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1905
        }
1906
        dst[2*srcWidth-1]= src[srcWidth-1];
1907
#else
1908
        for(x=0; x<srcWidth; x++){
1909
                dst[2*x+0]=
1910
                dst[2*x+1]= src[x];
1911
        }
1912
#endif
1913

    
1914
#ifdef HAVE_MMX
1915
asm volatile(   EMMS" \n\t"
1916
                SFENCE" \n\t"
1917
                :::"memory");
1918
#endif
1919
}
1920

    
1921
/**
1922
 *
1923
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1924
 * problem for anyone then tell me, and I'll fix it)
1925
 * chrominance data is only taken from every second line, others are ignored. FIXME: write an HQ version
1926
 */
1927
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1928
        unsigned int width, unsigned int height,
1929
        int lumStride, int chromStride, int srcStride)
1930
{
1931
        unsigned y;
1932
        const unsigned chromWidth= width>>1;
1933
        for(y=0; y<height; y+=2)
1934
        {
1935
#ifdef HAVE_MMX
1936
                asm volatile(
1937
                        "xorl %%eax, %%eax                \n\t"
1938
                        "pcmpeqw %%mm7, %%mm7                \n\t"
1939
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1940
                        ".balign 16                        \n\t"
1941
                        "1:                                \n\t"
1942
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
1943
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // UYVY UYVY(0)
1944
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(4)
1945
                        "movq %%mm0, %%mm2                \n\t" // UYVY UYVY(0)
1946
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(4)
1947
                        "pand %%mm7, %%mm0                \n\t" // U0V0 U0V0(0)
1948
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(4)
1949
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
1950
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
1951
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
1952
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)
1953

    
1954
                        MOVNTQ" %%mm2, (%1, %%eax, 2)        \n\t"
1955

    
1956
                        "movq 16(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(8)
1957
                        "movq 24(%0, %%eax, 4), %%mm2        \n\t" // UYVY UYVY(12)
1958
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(8)
1959
                        "movq %%mm2, %%mm4                \n\t" // UYVY UYVY(12)
1960
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(8)
1961
                        "pand %%mm7, %%mm2                \n\t" // U0V0 U0V0(12)
1962
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
1963
                        "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
1964
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
1965
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)
1966

    
1967
                        MOVNTQ" %%mm3, 8(%1, %%eax, 2)        \n\t"
1968

    
1969
                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
1970
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
1971
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1972
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1973
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
1974
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
1975
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
1976
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)
1977

    
1978
                        MOVNTQ" %%mm0, (%3, %%eax)        \n\t"
1979
                        MOVNTQ" %%mm2, (%2, %%eax)        \n\t"
1980

    
1981
                        "addl $8, %%eax                        \n\t"
1982
                        "cmpl %4, %%eax                        \n\t"
1983
                        " jb 1b                                \n\t"
1984
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1985
                        : "memory", "%eax"
1986
                );
1987

    
1988
                ydst += lumStride;
1989
                src  += srcStride;
1990

    
1991
                asm volatile(
1992
                        "xorl %%eax, %%eax                \n\t"
1993
                        ".balign 16                        \n\t"
1994
                        "1:                                \n\t"
1995
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
1996
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
1997
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
1998
                        "movq 16(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(8)
1999
                        "movq 24(%0, %%eax, 4), %%mm3        \n\t" // YUYV YUYV(12)
2000
                        "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
2001
                        "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
2002
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
2003
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
2004
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
2005
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)
2006

    
2007
                        MOVNTQ" %%mm0, (%1, %%eax, 2)        \n\t"
2008
                        MOVNTQ" %%mm2, 8(%1, %%eax, 2)        \n\t"
2009

    
2010
                        "addl $8, %%eax                        \n\t"
2011
                        "cmpl %4, %%eax                        \n\t"
2012
                        " jb 1b                                \n\t"
2013

    
2014
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2015
                        : "memory", "%eax"
2016
                );
2017
#else
2018
                unsigned i;
2019
                for(i=0; i<chromWidth; i++)
2020
                {
2021
                        udst[i]         = src[4*i+0];
2022
                        ydst[2*i+0]         = src[4*i+1];
2023
                        vdst[i]         = src[4*i+2];
2024
                        ydst[2*i+1]         = src[4*i+3];
2025
                }
2026
                ydst += lumStride;
2027
                src  += srcStride;
2028

    
2029
                for(i=0; i<chromWidth; i++)
2030
                {
2031
                        ydst[2*i+0]         = src[4*i+1];
2032
                        ydst[2*i+1]         = src[4*i+3];
2033
                }
2034
#endif
2035
                udst += chromStride;
2036
                vdst += chromStride;
2037
                ydst += lumStride;
2038
                src  += srcStride;
2039
        }
2040
#ifdef HAVE_MMX
2041
asm volatile(   EMMS" \n\t"
2042
                SFENCE" \n\t"
2043
                :::"memory");
2044
#endif
2045
}
2046

    
2047
/**
2048
 *
2049
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2050
 * problem for anyone then tell me, and I'll fix it)
2051
 * chrominance data is only taken from every second line, others are ignored in the C version. FIXME: write an HQ version
2052
 */
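/*
 * The C fallback computes, per pixel,
 *   Y = ((RY*r + GY*g + BY*b) >> RGB2YUV_SHIFT) + 16
 *   U = ((RU*r + GU*g + BU*b) >> RGB2YUV_SHIFT) + 128
 *   V = ((RV*r + GV*g + BV*b) >> RGB2YUV_SHIFT) + 128
 * taking chroma from the top-left pixel of each 2x2 block, while the MMX
 * path averages each 2x2 block before applying the chroma coefficients.
 */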
2053
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2054
        unsigned int width, unsigned int height,
2055
        int lumStride, int chromStride, int srcStride)
2056
{
2057
        unsigned y;
2058
        const unsigned chromWidth= width>>1;
2059
#ifdef HAVE_MMX
2060
        for(y=0; y<height-2; y+=2)
2061
        {
2062
                unsigned i;
2063
                for(i=0; i<2; i++)
2064
                {
2065
                        asm volatile(
2066
                                "movl %2, %%eax                        \n\t"
2067
                                "movq "MANGLE(bgr2YCoeff)", %%mm6                \n\t"
2068
                                "movq "MANGLE(w1111)", %%mm5                \n\t"
2069
                                "pxor %%mm7, %%mm7                \n\t"
2070
                                "leal (%%eax, %%eax, 2), %%ebx        \n\t"
2071
                                ".balign 16                        \n\t"
2072
                                "1:                                \n\t"
2073
                                PREFETCH" 64(%0, %%ebx)                \n\t"
2074
                                "movd (%0, %%ebx), %%mm0        \n\t"
2075
                                "movd 3(%0, %%ebx), %%mm1        \n\t"
2076
                                "punpcklbw %%mm7, %%mm0                \n\t"
2077
                                "punpcklbw %%mm7, %%mm1                \n\t"
2078
                                "movd 6(%0, %%ebx), %%mm2        \n\t"
2079
                                "movd 9(%0, %%ebx), %%mm3        \n\t"
2080
                                "punpcklbw %%mm7, %%mm2                \n\t"
2081
                                "punpcklbw %%mm7, %%mm3                \n\t"
2082
                                "pmaddwd %%mm6, %%mm0                \n\t"
2083
                                "pmaddwd %%mm6, %%mm1                \n\t"
2084
                                "pmaddwd %%mm6, %%mm2                \n\t"
2085
                                "pmaddwd %%mm6, %%mm3                \n\t"
2086
#ifndef FAST_BGR2YV12
2087
                                "psrad $8, %%mm0                \n\t"
2088
                                "psrad $8, %%mm1                \n\t"
2089
                                "psrad $8, %%mm2                \n\t"
2090
                                "psrad $8, %%mm3                \n\t"
2091
#endif
2092
                                "packssdw %%mm1, %%mm0                \n\t"
2093
                                "packssdw %%mm3, %%mm2                \n\t"
2094
                                "pmaddwd %%mm5, %%mm0                \n\t"
2095
                                "pmaddwd %%mm5, %%mm2                \n\t"
2096
                                "packssdw %%mm2, %%mm0                \n\t"
2097
                                "psraw $7, %%mm0                \n\t"
2098

    
2099
                                "movd 12(%0, %%ebx), %%mm4        \n\t"
2100
                                "movd 15(%0, %%ebx), %%mm1        \n\t"
2101
                                "punpcklbw %%mm7, %%mm4                \n\t"
2102
                                "punpcklbw %%mm7, %%mm1                \n\t"
2103
                                "movd 18(%0, %%ebx), %%mm2        \n\t"
2104
                                "movd 21(%0, %%ebx), %%mm3        \n\t"
2105
                                "punpcklbw %%mm7, %%mm2                \n\t"
2106
                                "punpcklbw %%mm7, %%mm3                \n\t"
2107
                                "pmaddwd %%mm6, %%mm4                \n\t"
2108
                                "pmaddwd %%mm6, %%mm1                \n\t"
2109
                                "pmaddwd %%mm6, %%mm2                \n\t"
2110
                                "pmaddwd %%mm6, %%mm3                \n\t"
2111
#ifndef FAST_BGR2YV12
2112
                                "psrad $8, %%mm4                \n\t"
2113
                                "psrad $8, %%mm1                \n\t"
2114
                                "psrad $8, %%mm2                \n\t"
2115
                                "psrad $8, %%mm3                \n\t"
2116
#endif
2117
                                "packssdw %%mm1, %%mm4                \n\t"
2118
                                "packssdw %%mm3, %%mm2                \n\t"
2119
                                "pmaddwd %%mm5, %%mm4                \n\t"
2120
                                "pmaddwd %%mm5, %%mm2                \n\t"
2121
                                "addl $24, %%ebx                \n\t"
2122
                                "packssdw %%mm2, %%mm4                \n\t"
2123
                                "psraw $7, %%mm4                \n\t"
2124

    
2125
                                "packuswb %%mm4, %%mm0                \n\t"
2126
                                "paddusb "MANGLE(bgr2YOffset)", %%mm0        \n\t"
2127

    
2128
                                MOVNTQ" %%mm0, (%1, %%eax)        \n\t"
2129
                                "addl $8, %%eax                        \n\t"
2130
                                " js 1b                                \n\t"
2131
                                : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2132
                                : "%eax", "%ebx"
2133
                        );
2134
                        ydst += lumStride;
2135
                        src  += srcStride;
2136
                }
2137
                src -= srcStride*2;
2138
                asm volatile(
2139
                        "movl %4, %%eax                        \n\t"
2140
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
2141
                        "movq "MANGLE(bgr2UCoeff)", %%mm6                \n\t"
2142
                        "pxor %%mm7, %%mm7                \n\t"
2143
                        "leal (%%eax, %%eax, 2), %%ebx        \n\t"
2144
                        "addl %%ebx, %%ebx                \n\t"
2145
                        ".balign 16                        \n\t"
2146
                        "1:                                \n\t"
2147
                        PREFETCH" 64(%0, %%ebx)                \n\t"
2148
                        PREFETCH" 64(%1, %%ebx)                \n\t"
2149
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2150
                        "movq (%0, %%ebx), %%mm0        \n\t"
2151
                        "movq (%1, %%ebx), %%mm1        \n\t"
2152
                        "movq 6(%0, %%ebx), %%mm2        \n\t"
2153
                        "movq 6(%1, %%ebx), %%mm3        \n\t"
2154
                        PAVGB" %%mm1, %%mm0                \n\t"
2155
                        PAVGB" %%mm3, %%mm2                \n\t"
2156
                        "movq %%mm0, %%mm1                \n\t"
2157
                        "movq %%mm2, %%mm3                \n\t"
2158
                        "psrlq $24, %%mm0                \n\t"
2159
                        "psrlq $24, %%mm2                \n\t"
2160
                        PAVGB" %%mm1, %%mm0                \n\t"
2161
                        PAVGB" %%mm3, %%mm2                \n\t"
2162
                        "punpcklbw %%mm7, %%mm0                \n\t"
2163
                        "punpcklbw %%mm7, %%mm2                \n\t"
2164
#else
2165
                        "movd (%0, %%ebx), %%mm0        \n\t"
2166
                        "movd (%1, %%ebx), %%mm1        \n\t"
2167
                        "movd 3(%0, %%ebx), %%mm2        \n\t"
2168
                        "movd 3(%1, %%ebx), %%mm3        \n\t"
2169
                        "punpcklbw %%mm7, %%mm0                \n\t"
2170
                        "punpcklbw %%mm7, %%mm1                \n\t"
2171
                        "punpcklbw %%mm7, %%mm2                \n\t"
2172
                        "punpcklbw %%mm7, %%mm3                \n\t"
2173
                        "paddw %%mm1, %%mm0                \n\t"
2174
                        "paddw %%mm3, %%mm2                \n\t"
2175
                        "paddw %%mm2, %%mm0                \n\t"
2176
                        "movd 6(%0, %%ebx), %%mm4        \n\t"
2177
                        "movd 6(%1, %%ebx), %%mm1        \n\t"
2178
                        "movd 9(%0, %%ebx), %%mm2        \n\t"
2179
                        "movd 9(%1, %%ebx), %%mm3        \n\t"
2180
                        "punpcklbw %%mm7, %%mm4                \n\t"
2181
                        "punpcklbw %%mm7, %%mm1                \n\t"
2182
                        "punpcklbw %%mm7, %%mm2                \n\t"
2183
                        "punpcklbw %%mm7, %%mm3                \n\t"
2184
                        "paddw %%mm1, %%mm4                \n\t"
2185
                        "paddw %%mm3, %%mm2                \n\t"
2186
                        "paddw %%mm4, %%mm2                \n\t"
2187
                        "psrlw $2, %%mm0                \n\t"
2188
                        "psrlw $2, %%mm2                \n\t"
2189
#endif
2190
                        "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
2191
                        "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
2192

    
2193
                        "pmaddwd %%mm0, %%mm1                \n\t"
2194
                        "pmaddwd %%mm2, %%mm3                \n\t"
2195
                        "pmaddwd %%mm6, %%mm0                \n\t"
2196
                        "pmaddwd %%mm6, %%mm2                \n\t"
2197
#ifndef FAST_BGR2YV12
2198
                        "psrad $8, %%mm0                \n\t"
2199
                        "psrad $8, %%mm1                \n\t"
2200
                        "psrad $8, %%mm2                \n\t"
2201
                        "psrad $8, %%mm3                \n\t"
2202
#endif
2203
                        "packssdw %%mm2, %%mm0                \n\t"
2204
                        "packssdw %%mm3, %%mm1                \n\t"
2205
                        "pmaddwd %%mm5, %%mm0                \n\t"
2206
                        "pmaddwd %%mm5, %%mm1                \n\t"
2207
                        "packssdw %%mm1, %%mm0                \n\t" // V1 V0 U1 U0
2208
                        "psraw $7, %%mm0                \n\t"
2209

    
2210
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2211
                        "movq 12(%0, %%ebx), %%mm4        \n\t"
2212
                        "movq 12(%1, %%ebx), %%mm1        \n\t"
2213
                        "movq 18(%0, %%ebx), %%mm2        \n\t"
2214
                        "movq 18(%1, %%ebx), %%mm3        \n\t"
2215
                        PAVGB" %%mm1, %%mm4                \n\t"
2216
                        PAVGB" %%mm3, %%mm2                \n\t"
2217
                        "movq %%mm4, %%mm1                \n\t"
2218
                        "movq %%mm2, %%mm3                \n\t"
2219
                        "psrlq $24, %%mm4                \n\t"
2220
                        "psrlq $24, %%mm2                \n\t"
2221
                        PAVGB" %%mm1, %%mm4                \n\t"
2222
                        PAVGB" %%mm3, %%mm2                \n\t"
2223
                        "punpcklbw %%mm7, %%mm4                \n\t"
2224
                        "punpcklbw %%mm7, %%mm2                \n\t"
2225
#else
2226
                        "movd 12(%0, %%ebx), %%mm4        \n\t"
2227
                        "movd 12(%1, %%ebx), %%mm1        \n\t"
2228
                        "movd 15(%0, %%ebx), %%mm2        \n\t"
2229
                        "movd 15(%1, %%ebx), %%mm3        \n\t"
2230
                        "punpcklbw %%mm7, %%mm4                \n\t"
2231
                        "punpcklbw %%mm7, %%mm1                \n\t"
2232
                        "punpcklbw %%mm7, %%mm2                \n\t"
2233
                        "punpcklbw %%mm7, %%mm3                \n\t"
2234
                        "paddw %%mm1, %%mm4                \n\t"
2235
                        "paddw %%mm3, %%mm2                \n\t"
2236
                        "paddw %%mm2, %%mm4                \n\t"
2237
                        "movd 18(%0, %%ebx), %%mm5        \n\t"
2238
                        "movd 18(%1, %%ebx), %%mm1        \n\t"
2239
                        "movd 21(%0, %%ebx), %%mm2        \n\t"
2240
                        "movd 21(%1, %%ebx), %%mm3        \n\t"
2241
                        "punpcklbw %%mm7, %%mm5                \n\t"
2242
                        "punpcklbw %%mm7, %%mm1                \n\t"
2243
                        "punpcklbw %%mm7, %%mm2                \n\t"
2244
                        "punpcklbw %%mm7, %%mm3                \n\t"
2245
                        "paddw %%mm1, %%mm5                \n\t"
2246
                        "paddw %%mm3, %%mm2                \n\t"
2247
                        "paddw %%mm5, %%mm2                \n\t"
2248
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
2249
                        "psrlw $2, %%mm4                \n\t"
2250
                        "psrlw $2, %%mm2                \n\t"
2251
#endif
2252
                        "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
2253
                        "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
2254

    
2255
                        "pmaddwd %%mm4, %%mm1                \n\t"
2256
                        "pmaddwd %%mm2, %%mm3                \n\t"
2257
                        "pmaddwd %%mm6, %%mm4                \n\t"
2258
                        "pmaddwd %%mm6, %%mm2                \n\t"
2259
#ifndef FAST_BGR2YV12
2260
                        "psrad $8, %%mm4                \n\t"
2261
                        "psrad $8, %%mm1                \n\t"
2262
                        "psrad $8, %%mm2                \n\t"
2263
                        "psrad $8, %%mm3                \n\t"
2264
#endif
2265
                        "packssdw %%mm2, %%mm4                \n\t"
2266
                        "packssdw %%mm3, %%mm1                \n\t"
2267
                        "pmaddwd %%mm5, %%mm4                \n\t"
2268
                        "pmaddwd %%mm5, %%mm1                \n\t"
2269
                        "addl $24, %%ebx                \n\t"
2270
                        "packssdw %%mm1, %%mm4                \n\t" // V3 V2 U3 U2
2271
                        "psraw $7, %%mm4                \n\t"
2272

    
2273
                        "movq %%mm0, %%mm1                \n\t"
2274
                        "punpckldq %%mm4, %%mm0                \n\t"
2275
                        "punpckhdq %%mm4, %%mm1                \n\t"
2276
                        "packsswb %%mm1, %%mm0                \n\t"
2277
                        "paddb "MANGLE(bgr2UVOffset)", %%mm0        \n\t"
2278

    
2279
                        "movd %%mm0, (%2, %%eax)        \n\t"
2280
                        "punpckhdq %%mm0, %%mm0                \n\t"
2281
                        "movd %%mm0, (%3, %%eax)        \n\t"
2282
                        "addl $4, %%eax                        \n\t"
2283
                        " js 1b                                \n\t"
2284
                        : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
2285
                        : "%eax", "%ebx"
2286
                );
2287

    
2288
                udst += chromStride;
2289
                vdst += chromStride;
2290
                src  += srcStride*2;
2291
        }
2292

    
2293
        asm volatile(   EMMS" \n\t"
2294
                        SFENCE" \n\t"
2295
                        :::"memory");
2296
#else
2297
        y=0;
2298
#endif
2299
        for(; y<height; y+=2)
2300
        {
2301
                unsigned i;
2302
                for(i=0; i<chromWidth; i++)
2303
                {
2304
                        unsigned int b= src[6*i+0];
2305
                        unsigned int g= src[6*i+1];
2306
                        unsigned int r= src[6*i+2];
2307

    
2308
                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2309
                        unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2310
                        unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2311

    
2312
                        udst[i]         = U;
2313
                        vdst[i]         = V;
2314
                        ydst[2*i]         = Y;
2315

    
2316
                        b= src[6*i+3];
2317
                        g= src[6*i+4];
2318
                        r= src[6*i+5];
2319

    
2320
                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2321
                        ydst[2*i+1]         = Y;
2322
                }
2323
                ydst += lumStride;
2324
                src  += srcStride;
2325

    
2326
                for(i=0; i<chromWidth; i++)
2327
                {
2328
                        unsigned int b= src[6*i+0];
2329
                        unsigned int g= src[6*i+1];
2330
                        unsigned int r= src[6*i+2];
2331

    
2332
                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2333

    
2334
                        ydst[2*i]         = Y;
2335

    
2336
                        b= src[6*i+3];
2337
                        g= src[6*i+4];
2338
                        r= src[6*i+5];
2339

    
2340
                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2341
                        ydst[2*i+1]         = Y;
2342
                }
2343
                udst += chromStride;
2344
                vdst += chromStride;
2345
                ydst += lumStride;
2346
                src  += srcStride;
2347
        }
2348
}
2349

    
2350
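/*
 * Interleaves two byte planes into one: dest[2*w] = src1[w],
 * dest[2*w+1] = src2[w] for every row. The SSE2/MMX paths process 16
 * source bytes per iteration with punpcklbw/punpckhbw and non-temporal
 * stores; the scalar loop finishes any width that is not a multiple of 16.
 */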
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2351
                            unsigned width, unsigned height, int src1Stride,
2352
                            int src2Stride, int dstStride){
2353
        unsigned h;
2354

    
2355
        for(h=0; h < height; h++)
2356
        {
2357
                unsigned w;
2358

    
2359
#ifdef HAVE_MMX
2360
#ifdef HAVE_SSE2
2361
                asm(
2362
                        "xorl %%eax, %%eax                \n\t"
2363
                        "1:                                \n\t"
2364
                        PREFETCH" 64(%1, %%eax)                \n\t"
2365
                        PREFETCH" 64(%2, %%eax)                \n\t"
2366
                        "movdqa (%1, %%eax), %%xmm0        \n\t"
2367
                        "movdqa (%1, %%eax), %%xmm1        \n\t"
2368
                        "movdqa (%2, %%eax), %%xmm2        \n\t"
2369
                        "punpcklbw %%xmm2, %%xmm0        \n\t"
2370
                        "punpckhbw %%xmm2, %%xmm1        \n\t"
2371
                        "movntdq %%xmm0, (%0, %%eax, 2)        \n\t"
2372
                        "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
2373
                        "addl $16, %%eax                        \n\t"
2374
                        "cmpl %3, %%eax                        \n\t"
2375
                        " jb 1b                                \n\t"
2376
                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2377
                        : "memory", "%eax"
2378
                );
2379
#else
2380
                asm(
2381
                        "xorl %%eax, %%eax                \n\t"
2382
                        "1:                                \n\t"
2383
                        PREFETCH" 64(%1, %%eax)                \n\t"
2384
                        PREFETCH" 64(%2, %%eax)                \n\t"
2385
                        "movq (%1, %%eax), %%mm0        \n\t"
2386
                        "movq 8(%1, %%eax), %%mm2        \n\t"
2387
                        "movq %%mm0, %%mm1                \n\t"
2388
                        "movq %%mm2, %%mm3                \n\t"
2389
                        "movq (%2, %%eax), %%mm4        \n\t"
2390
                        "movq 8(%2, %%eax), %%mm5        \n\t"
2391
                        "punpcklbw %%mm4, %%mm0                \n\t"
2392
                        "punpckhbw %%mm4, %%mm1                \n\t"
2393
                        "punpcklbw %%mm5, %%mm2                \n\t"
2394
                        "punpckhbw %%mm5, %%mm3                \n\t"
2395
                        MOVNTQ" %%mm0, (%0, %%eax, 2)        \n\t"
2396
                        MOVNTQ" %%mm1, 8(%0, %%eax, 2)        \n\t"
2397
                        MOVNTQ" %%mm2, 16(%0, %%eax, 2)        \n\t"
2398
                        MOVNTQ" %%mm3, 24(%0, %%eax, 2)        \n\t"
2399
                        "addl $16, %%eax                        \n\t"
2400
                        "cmpl %3, %%eax                        \n\t"
2401
                        " jb 1b                                \n\t"
2402
                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2403
                        : "memory", "%eax"
2404
                );
2405
#endif
2406
                for(w= (width&(~15)); w < width; w++)
2407
                {
2408
                        dest[2*w+0] = src1[w];
2409
                        dest[2*w+1] = src2[w];
2410
                }
2411
#else
2412
                for(w=0; w < width; w++)
2413
                {
2414
                        dest[2*w+0] = src1[w];
2415
                        dest[2*w+1] = src2[w];
2416
                }
2417
#endif
2418
                dest += dstStride;
2419
                src1 += src1Stride;
2420
                src2 += src2Stride;
2421
        }
2422
#ifdef HAVE_MMX
2423
        asm(
2424
                EMMS" \n\t"
2425
                SFENCE" \n\t"
2426
                ::: "memory"
2427
                );
2428
#endif
2429
}
2430

    
2431
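/*
 * Upscales two chroma planes by 2x in both directions: each source line
 * feeds two destination lines (line index y>>1) and each source byte is
 * written twice, via "punpcklbw" of a register with itself in the MMX
 * path and d[2*x] = d[2*x+1] = s[x] in the C loop.
 */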
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2432
                        uint8_t *dst1, uint8_t *dst2,
2433
                        unsigned width, unsigned height,
2434
                        int srcStride1, int srcStride2,
2435
                        int dstStride1, int dstStride2)
2436
{
2437
    unsigned int y,x,h;
2438
    int w;
2439
    w=width/2; h=height/2;
2440
#ifdef HAVE_MMX
2441
    asm volatile(
2442
        PREFETCH" %0\n\t"
2443
        PREFETCH" %1\n\t"
2444
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2445
#endif
2446
    for(y=0;y<h;y++){
2447
        const uint8_t* s1=src1+srcStride1*(y>>1);
2448
        uint8_t* d=dst1+dstStride1*y;
2449
        x=0;
2450
#ifdef HAVE_MMX
2451
        for(;x<w-31;x+=32)
        {
            asm volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        8%1, %%mm2\n\t"
                "movq        16%1, %%mm4\n\t"
                "movq        24%1, %%mm6\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm2, %%mm3\n\t"
                "movq        %%mm4, %%mm5\n\t"
                "movq        %%mm6, %%mm7\n\t"
                "punpcklbw %%mm0, %%mm0\n\t"
                "punpckhbw %%mm1, %%mm1\n\t"
                "punpcklbw %%mm2, %%mm2\n\t"
                "punpckhbw %%mm3, %%mm3\n\t"
                "punpcklbw %%mm4, %%mm4\n\t"
                "punpckhbw %%mm5, %%mm5\n\t"
                "punpcklbw %%mm6, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm7\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm1, 8%0\n\t"
                MOVNTQ"        %%mm2, 16%0\n\t"
                MOVNTQ"        %%mm3, 24%0\n\t"
                MOVNTQ"        %%mm4, 32%0\n\t"
                MOVNTQ"        %%mm5, 40%0\n\t"
                MOVNTQ"        %%mm6, 48%0\n\t"
                MOVNTQ"        %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
#endif
        for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
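    /* same 2x2 replication for the second plane (src2 -> dst2) */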
    for(y=0;y<h;y++){
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
#ifdef HAVE_MMX
        for(;x<w-31;x+=32)
        {
            asm volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        8%1, %%mm2\n\t"
                "movq        16%1, %%mm4\n\t"
                "movq        24%1, %%mm6\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm2, %%mm3\n\t"
                "movq        %%mm4, %%mm5\n\t"
                "movq        %%mm6, %%mm7\n\t"
                "punpcklbw %%mm0, %%mm0\n\t"
                "punpckhbw %%mm1, %%mm1\n\t"
                "punpcklbw %%mm2, %%mm2\n\t"
                "punpckhbw %%mm3, %%mm3\n\t"
                "punpcklbw %%mm4, %%mm4\n\t"
                "punpckhbw %%mm5, %%mm5\n\t"
                "punpcklbw %%mm6, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm7\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm1, 8%0\n\t"
                MOVNTQ"        %%mm2, 16%0\n\t"
                MOVNTQ"        %%mm3, 24%0\n\t"
                MOVNTQ"        %%mm4, 32%0\n\t"
                MOVNTQ"        %%mm5, 40%0\n\t"
                MOVNTQ"        %%mm6, 48%0\n\t"
                MOVNTQ"        %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
#endif
        for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#ifdef HAVE_MMX
        asm(
                EMMS" \n\t"
                SFENCE" \n\t"
                ::: "memory"
                );
#endif
}

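/* Pack a full-resolution luma plane (src1) and two quarter-resolution chroma planes
 * (src2, src3) into interleaved YUY2. For every x, four luma samples share one
 * chroma pair, and the y>>2 row indexing reuses each chroma row for four luma rows:
 *
 *   d[8x..8x+7] = { Y[4x], U[x], Y[4x+1], V[x], Y[4x+2], U[x], Y[4x+3], V[x] }
 *
 * (see the scalar fallback at the end of the row loop for the exact byte layout)
 */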
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                        uint8_t *dst,
                        unsigned width, unsigned height,
                        int srcStride1, int srcStride2,
                        int srcStride3, int dstStride)
{
    unsigned y,x,w,h;
    w=width/2; h=height;
    for(y=0;y<h;y++){
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
#ifdef HAVE_MMX
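        /* each pass loads 32 luma bytes plus 8 bytes each of U and V, doubles the
           chroma bytes with punpck*bw, interleaves them with the luma and stores
           64 output bytes */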
        for(;x<w-7;x+=8)
        {
            asm volatile(
                PREFETCH" 32(%1, %0)\n\t"
                PREFETCH" 32(%2, %0)\n\t"
                PREFETCH" 32(%3, %0)\n\t"
                "movq        (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq        (%2, %0), %%mm1\n\t"          /* U0U1U2U3U4U5U6U7 */
                "movq        (%3, %0), %%mm2\n\t"          /* V0V1V2V3V4V5V6V7 */
                "movq        %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq        %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
                "movq        %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq        %%mm1, %%mm6\n\t"
                "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ"        %%mm0, (%4, %0, 8)\n\t"
                MOVNTQ"        %%mm3, 8(%4, %0, 8)\n\t"

                "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq        8(%1, %0, 4), %%mm0\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ"        %%mm0, 16(%4, %0, 8)\n\t"
                MOVNTQ"        %%mm3, 24(%4, %0, 8)\n\t"

                "movq        %%mm4, %%mm6\n\t"
                "movq        16(%1, %0, 4), %%mm0\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "punpcklbw %%mm5, %%mm4\n\t"
                "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ"        %%mm0, 32(%4, %0, 8)\n\t"
                MOVNTQ"        %%mm3, 40(%4, %0, 8)\n\t"

                "punpckhbw %%mm5, %%mm6\n\t"
                "movq        24(%1, %0, 4), %%mm0\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ"        %%mm0, 48(%4, %0, 8)\n\t"
                MOVNTQ"        %%mm3, 56(%4, %0, 8)\n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
#endif
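        /* scalar tail: write the remaining 8-byte YUYV groups that the MMX loop
           above (8 groups per pass) did not cover */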
        for(; x<w; x++)
        {
            const int x2= x<<2;
            d[8*x+0]=yp[x2];
            d[8*x+1]=up[x];
            d[8*x+2]=yp[x2+1];
            d[8*x+3]=vp[x];
            d[8*x+4]=yp[x2+2];
            d[8*x+5]=up[x];
            d[8*x+6]=yp[x2+3];
            d[8*x+7]=vp[x];
        }
    }
#ifdef HAVE_MMX
        asm(
                EMMS" \n\t"
                SFENCE" \n\t"
                ::: "memory"
                );
#endif
}