Statistics
| Branch: | Revision:

ffmpeg / postproc / rgb2rgb_template.c @ 6e1c66bc

History | View | Annotate | Download (67.3 KB)

1
/*
2
 *
3
 *  rgb2rgb.c, Software RGB to RGB convertor
4
 *  pluralize by Software PAL8 to RGB convertor
5
 *               Software YUV to YUV convertor
6
 *               Software YUV to RGB convertor
7
 *  Written by Nick Kurshev.
8
 *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9
 *  lot of big-endian byteorder fixes by Alex Beregszaszi
10
 */
11

    
12
#include <stddef.h>
13
#include <inttypes.h> /* for __WORDSIZE */
14

    
15
#ifndef __WORDSIZE
16
// #warning You have misconfigured system and probably will lose performance!
17
#define __WORDSIZE MP_WORDSIZE
18
#endif
19

    
20
#undef PREFETCH
21
#undef MOVNTQ
22
#undef EMMS
23
#undef SFENCE
24
#undef MMREG_SIZE
25
#undef PREFETCHW
26
#undef PAVGB
27

    
28
#ifdef HAVE_SSE2
29
#define MMREG_SIZE 16
30
#else
31
#define MMREG_SIZE 8
32
#endif
33

    
34
#ifdef HAVE_3DNOW
35
#define PREFETCH  "prefetch"
36
#define PREFETCHW "prefetchw"
37
#define PAVGB          "pavgusb"
38
#elif defined ( HAVE_MMX2 )
39
#define PREFETCH "prefetchnta"
40
#define PREFETCHW "prefetcht0"
41
#define PAVGB          "pavgb"
42
#else
43
#define PREFETCH "/nop"
44
#define PREFETCHW "/nop"
45
#endif
46

    
47
#ifdef HAVE_3DNOW
48
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
49
#define EMMS     "femms"
50
#else
51
#define EMMS     "emms"
52
#endif
53

    
54
#ifdef HAVE_MMX2
55
#define MOVNTQ "movntq"
56
#define SFENCE "sfence"
57
#else
58
#define MOVNTQ "movq"
59
#define SFENCE "/nop"
60
#endif
61

    
62
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
63
{
64
  uint8_t *dest = dst;
65
  const uint8_t *s = src;
66
  const uint8_t *end;
67
#ifdef HAVE_MMX
68
  const uint8_t *mm_end;
69
#endif
70
  end = s + src_size;
71
#ifdef HAVE_MMX
72
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
73
  mm_end = end - 23;
74
  __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
75
  while(s < mm_end)
76
  {
77
    __asm __volatile(
78
        PREFETCH"        32%1\n\t"
79
        "movd        %1, %%mm0\n\t"
80
        "punpckldq 3%1, %%mm0\n\t"
81
        "movd        6%1, %%mm1\n\t"
82
        "punpckldq 9%1, %%mm1\n\t"
83
        "movd        12%1, %%mm2\n\t"
84
        "punpckldq 15%1, %%mm2\n\t"
85
        "movd        18%1, %%mm3\n\t"
86
        "punpckldq 21%1, %%mm3\n\t"
87
        "pand        %%mm7, %%mm0\n\t"
88
        "pand        %%mm7, %%mm1\n\t"
89
        "pand        %%mm7, %%mm2\n\t"
90
        "pand        %%mm7, %%mm3\n\t"
91
        MOVNTQ"        %%mm0, %0\n\t"
92
        MOVNTQ"        %%mm1, 8%0\n\t"
93
        MOVNTQ"        %%mm2, 16%0\n\t"
94
        MOVNTQ"        %%mm3, 24%0"
95
        :"=m"(*dest)
96
        :"m"(*s)
97
        :"memory");
98
    dest += 32;
99
    s += 24;
100
  }
101
  __asm __volatile(SFENCE:::"memory");
102
  __asm __volatile(EMMS:::"memory");
103
#endif
104
  while(s < end)
105
  {
106
#ifdef WORDS_BIGENDIAN
107
    *dest++ = 0;
108
    *dest++ = *s++;
109
    *dest++ = *s++;
110
    *dest++ = *s++;
111
#else
112
    *dest++ = *s++;
113
    *dest++ = *s++;
114
    *dest++ = *s++;
115
    *dest++ = 0;
116
#endif
117
  }
118
}
119

    
120
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
121
{
122
  uint8_t *dest = dst;
123
  const uint8_t *s = src;
124
  const uint8_t *end;
125
#ifdef HAVE_MMX
126
  const uint8_t *mm_end;
127
#endif
128
  end = s + src_size;
129
#ifdef HAVE_MMX
130
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
131
  mm_end = end - 31;
132
  while(s < mm_end)
133
  {
134
    __asm __volatile(
135
        PREFETCH"        32%1\n\t"
136
        "movq        %1, %%mm0\n\t"
137
        "movq        8%1, %%mm1\n\t"
138
        "movq        16%1, %%mm4\n\t"
139
        "movq        24%1, %%mm5\n\t"
140
        "movq        %%mm0, %%mm2\n\t"
141
        "movq        %%mm1, %%mm3\n\t"
142
        "movq        %%mm4, %%mm6\n\t"
143
        "movq        %%mm5, %%mm7\n\t"
144
        "psrlq        $8, %%mm2\n\t"
145
        "psrlq        $8, %%mm3\n\t"
146
        "psrlq        $8, %%mm6\n\t"
147
        "psrlq        $8, %%mm7\n\t"
148
        "pand        %2, %%mm0\n\t"
149
        "pand        %2, %%mm1\n\t"
150
        "pand        %2, %%mm4\n\t"
151
        "pand        %2, %%mm5\n\t"
152
        "pand        %3, %%mm2\n\t"
153
        "pand        %3, %%mm3\n\t"
154
        "pand        %3, %%mm6\n\t"
155
        "pand        %3, %%mm7\n\t"
156
        "por        %%mm2, %%mm0\n\t"
157
        "por        %%mm3, %%mm1\n\t"
158
        "por        %%mm6, %%mm4\n\t"
159
        "por        %%mm7, %%mm5\n\t"
160

    
161
        "movq        %%mm1, %%mm2\n\t"
162
        "movq        %%mm4, %%mm3\n\t"
163
        "psllq        $48, %%mm2\n\t"
164
        "psllq        $32, %%mm3\n\t"
165
        "pand        %4, %%mm2\n\t"
166
        "pand        %5, %%mm3\n\t"
167
        "por        %%mm2, %%mm0\n\t"
168
        "psrlq        $16, %%mm1\n\t"
169
        "psrlq        $32, %%mm4\n\t"
170
        "psllq        $16, %%mm5\n\t"
171
        "por        %%mm3, %%mm1\n\t"
172
        "pand        %6, %%mm5\n\t"
173
        "por        %%mm5, %%mm4\n\t"
174

    
175
        MOVNTQ"        %%mm0, %0\n\t"
176
        MOVNTQ"        %%mm1, 8%0\n\t"
177
        MOVNTQ"        %%mm4, 16%0"
178
        :"=m"(*dest)
179
        :"m"(*s),"m"(mask24l),
180
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
181
        :"memory");
182
    dest += 24;
183
    s += 32;
184
  }
185
  __asm __volatile(SFENCE:::"memory");
186
  __asm __volatile(EMMS:::"memory");
187
#endif
188
  while(s < end)
189
  {
190
#ifdef WORDS_BIGENDIAN
191
    s++;
192
    *dest++ = *s++;
193
    *dest++ = *s++;
194
    *dest++ = *s++;
195
#else
196
    *dest++ = *s++;
197
    *dest++ = *s++;
198
    *dest++ = *s++;
199
    s++;
200
#endif
201
  }
202
}
203

    
204
/*
205
 Original by Strepto/Astral
206
 ported to gcc & bugfixed : A'rpi
207
 MMX2, 3DNOW optimization by Nick Kurshev
208
 32bit c version, and and&add trick by Michael Niedermayer
209
*/
210
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
211
{
212
  register const uint8_t* s=src;
213
  register uint8_t* d=dst;
214
  register const uint8_t *end;
215
  const uint8_t *mm_end;
216
  end = s + src_size;
217
#ifdef HAVE_MMX
218
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
219
  __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
220
  mm_end = end - 15;
221
  while(s<mm_end)
222
  {
223
        __asm __volatile(
224
                PREFETCH"        32%1\n\t"
225
                "movq        %1, %%mm0\n\t"
226
                "movq        8%1, %%mm2\n\t"
227
                "movq        %%mm0, %%mm1\n\t"
228
                "movq        %%mm2, %%mm3\n\t"
229
                "pand        %%mm4, %%mm0\n\t"
230
                "pand        %%mm4, %%mm2\n\t"
231
                "paddw        %%mm1, %%mm0\n\t"
232
                "paddw        %%mm3, %%mm2\n\t"
233
                MOVNTQ"        %%mm0, %0\n\t"
234
                MOVNTQ"        %%mm2, 8%0"
235
                :"=m"(*d)
236
                :"m"(*s)
237
                );
238
        d+=16;
239
        s+=16;
240
  }
241
  __asm __volatile(SFENCE:::"memory");
242
  __asm __volatile(EMMS:::"memory");
243
#endif
244
    mm_end = end - 3;
245
    while(s < mm_end)
246
    {
247
        register unsigned x= *((uint32_t *)s);
248
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
249
        d+=4;
250
        s+=4;
251
    }
252
    if(s < end)
253
    {
254
        register unsigned short x= *((uint16_t *)s);
255
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
256
    }
257
}
258

    
259
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
260
{
261
  register const uint8_t* s=src;
262
  register uint8_t* d=dst;
263
  register const uint8_t *end;
264
  const uint8_t *mm_end;
265
  end = s + src_size;
266
#ifdef HAVE_MMX
267
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
268
  __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
269
  __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
270
  mm_end = end - 15;
271
  while(s<mm_end)
272
  {
273
        __asm __volatile(
274
                PREFETCH"        32%1\n\t"
275
                "movq        %1, %%mm0\n\t"
276
                "movq        8%1, %%mm2\n\t"
277
                "movq        %%mm0, %%mm1\n\t"
278
                "movq        %%mm2, %%mm3\n\t"
279
                "psrlq        $1, %%mm0\n\t"
280
                "psrlq        $1, %%mm2\n\t"
281
                "pand        %%mm7, %%mm0\n\t"
282
                "pand        %%mm7, %%mm2\n\t"
283
                "pand        %%mm6, %%mm1\n\t"
284
                "pand        %%mm6, %%mm3\n\t"
285
                "por        %%mm1, %%mm0\n\t"
286
                "por        %%mm3, %%mm2\n\t"
287
                MOVNTQ"        %%mm0, %0\n\t"
288
                MOVNTQ"        %%mm2, 8%0"
289
                :"=m"(*d)
290
                :"m"(*s)
291
                );
292
        d+=16;
293
        s+=16;
294
  }
295
  __asm __volatile(SFENCE:::"memory");
296
  __asm __volatile(EMMS:::"memory");
297
#endif
298
    mm_end = end - 3;
299
    while(s < mm_end)
300
    {
301
        register uint32_t x= *((uint32_t *)s);
302
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
303
        s+=4;
304
        d+=4;
305
    }
306
    if(s < end)
307
    {
308
        register uint16_t x= *((uint16_t *)s);
309
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
310
        s+=2;
311
        d+=2;
312
    }
313
}
314

    
315
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
316
{
317
        const uint8_t *s = src;
318
        const uint8_t *end;
319
#ifdef HAVE_MMX
320
        const uint8_t *mm_end;
321
#endif
322
        uint16_t *d = (uint16_t *)dst;
323
        end = s + src_size;
324
#ifdef HAVE_MMX
325
        mm_end = end - 15;
326
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
327
        asm volatile(
328
                "movq %3, %%mm5                        \n\t"
329
                "movq %4, %%mm6                        \n\t"
330
                "movq %5, %%mm7                        \n\t"
331
                ".balign 16                        \n\t"
332
                "1:                                \n\t"
333
                PREFETCH" 32(%1)                \n\t"
334
                "movd        (%1), %%mm0                \n\t"
335
                "movd        4(%1), %%mm3                \n\t"
336
                "punpckldq 8(%1), %%mm0                \n\t"
337
                "punpckldq 12(%1), %%mm3        \n\t"
338
                "movq %%mm0, %%mm1                \n\t"
339
                "movq %%mm3, %%mm4                \n\t"
340
                "pand %%mm6, %%mm0                \n\t"
341
                "pand %%mm6, %%mm3                \n\t"
342
                "pmaddwd %%mm7, %%mm0                \n\t"
343
                "pmaddwd %%mm7, %%mm3                \n\t"
344
                "pand %%mm5, %%mm1                \n\t"
345
                "pand %%mm5, %%mm4                \n\t"
346
                "por %%mm1, %%mm0                \n\t"        
347
                "por %%mm4, %%mm3                \n\t"
348
                "psrld $5, %%mm0                \n\t"
349
                "pslld $11, %%mm3                \n\t"
350
                "por %%mm3, %%mm0                \n\t"
351
                MOVNTQ"        %%mm0, (%0)                \n\t"
352
                "add $16, %1                        \n\t"
353
                "add $8, %0                        \n\t"
354
                "cmp %2, %1                        \n\t"
355
                " jb 1b                                \n\t"
356
                : "+r" (d), "+r"(s)
357
                : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
358
        );
359
#else
360
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
361
        __asm __volatile(
362
            "movq        %0, %%mm7\n\t"
363
            "movq        %1, %%mm6\n\t"
364
            ::"m"(red_16mask),"m"(green_16mask));
365
        while(s < mm_end)
366
        {
367
            __asm __volatile(
368
                PREFETCH" 32%1\n\t"
369
                "movd        %1, %%mm0\n\t"
370
                "movd        4%1, %%mm3\n\t"
371
                "punpckldq 8%1, %%mm0\n\t"
372
                "punpckldq 12%1, %%mm3\n\t"
373
                "movq        %%mm0, %%mm1\n\t"
374
                "movq        %%mm0, %%mm2\n\t"
375
                "movq        %%mm3, %%mm4\n\t"
376
                "movq        %%mm3, %%mm5\n\t"
377
                "psrlq        $3, %%mm0\n\t"
378
                "psrlq        $3, %%mm3\n\t"
379
                "pand        %2, %%mm0\n\t"
380
                "pand        %2, %%mm3\n\t"
381
                "psrlq        $5, %%mm1\n\t"
382
                "psrlq        $5, %%mm4\n\t"
383
                "pand        %%mm6, %%mm1\n\t"
384
                "pand        %%mm6, %%mm4\n\t"
385
                "psrlq        $8, %%mm2\n\t"
386
                "psrlq        $8, %%mm5\n\t"
387
                "pand        %%mm7, %%mm2\n\t"
388
                "pand        %%mm7, %%mm5\n\t"
389
                "por        %%mm1, %%mm0\n\t"
390
                "por        %%mm4, %%mm3\n\t"
391
                "por        %%mm2, %%mm0\n\t"
392
                "por        %%mm5, %%mm3\n\t"
393
                "psllq        $16, %%mm3\n\t"
394
                "por        %%mm3, %%mm0\n\t"
395
                MOVNTQ"        %%mm0, %0\n\t"
396
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
397
                d += 4;
398
                s += 16;
399
        }
400
#endif
401
        __asm __volatile(SFENCE:::"memory");
402
        __asm __volatile(EMMS:::"memory");
403
#endif
404
        while(s < end)
405
        {
406
                // FIXME on bigendian
407
                const int src= *s; s += 4;
408
                *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
409
//                *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
410
        }
411
}
412

    
413
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
414
{
415
        const uint8_t *s = src;
416
        const uint8_t *end;
417
#ifdef HAVE_MMX
418
        const uint8_t *mm_end;
419
#endif
420
        uint16_t *d = (uint16_t *)dst;
421
        end = s + src_size;
422
#ifdef HAVE_MMX
423
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
424
        __asm __volatile(
425
            "movq        %0, %%mm7\n\t"
426
            "movq        %1, %%mm6\n\t"
427
            ::"m"(red_16mask),"m"(green_16mask));
428
        mm_end = end - 15;
429
        while(s < mm_end)
430
        {
431
            __asm __volatile(
432
                PREFETCH" 32%1\n\t"
433
                "movd        %1, %%mm0\n\t"
434
                "movd        4%1, %%mm3\n\t"
435
                "punpckldq 8%1, %%mm0\n\t"
436
                "punpckldq 12%1, %%mm3\n\t"
437
                "movq        %%mm0, %%mm1\n\t"
438
                "movq        %%mm0, %%mm2\n\t"
439
                "movq        %%mm3, %%mm4\n\t"
440
                "movq        %%mm3, %%mm5\n\t"
441
                "psllq        $8, %%mm0\n\t"
442
                "psllq        $8, %%mm3\n\t"
443
                "pand        %%mm7, %%mm0\n\t"
444
                "pand        %%mm7, %%mm3\n\t"
445
                "psrlq        $5, %%mm1\n\t"
446
                "psrlq        $5, %%mm4\n\t"
447
                "pand        %%mm6, %%mm1\n\t"
448
                "pand        %%mm6, %%mm4\n\t"
449
                "psrlq        $19, %%mm2\n\t"
450
                "psrlq        $19, %%mm5\n\t"
451
                "pand        %2, %%mm2\n\t"
452
                "pand        %2, %%mm5\n\t"
453
                "por        %%mm1, %%mm0\n\t"
454
                "por        %%mm4, %%mm3\n\t"
455
                "por        %%mm2, %%mm0\n\t"
456
                "por        %%mm5, %%mm3\n\t"
457
                "psllq        $16, %%mm3\n\t"
458
                "por        %%mm3, %%mm0\n\t"
459
                MOVNTQ"        %%mm0, %0\n\t"
460
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
461
                d += 4;
462
                s += 16;
463
        }
464
        __asm __volatile(SFENCE:::"memory");
465
        __asm __volatile(EMMS:::"memory");
466
#endif
467
        while(s < end)
468
        {
469
                // FIXME on bigendian
470
                const int src= *s; s += 4;
471
                *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
472
        }
473
}
474

    
475
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
476
{
477
        const uint8_t *s = src;
478
        const uint8_t *end;
479
#ifdef HAVE_MMX
480
        const uint8_t *mm_end;
481
#endif
482
        uint16_t *d = (uint16_t *)dst;
483
        end = s + src_size;
484
#ifdef HAVE_MMX
485
        mm_end = end - 15;
486
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
487
        asm volatile(
488
                "movq %3, %%mm5                        \n\t"
489
                "movq %4, %%mm6                        \n\t"
490
                "movq %5, %%mm7                        \n\t"
491
                ".balign 16                        \n\t"
492
                "1:                                \n\t"
493
                PREFETCH" 32(%1)                \n\t"
494
                "movd        (%1), %%mm0                \n\t"
495
                "movd        4(%1), %%mm3                \n\t"
496
                "punpckldq 8(%1), %%mm0                \n\t"
497
                "punpckldq 12(%1), %%mm3        \n\t"
498
                "movq %%mm0, %%mm1                \n\t"
499
                "movq %%mm3, %%mm4                \n\t"
500
                "pand %%mm6, %%mm0                \n\t"
501
                "pand %%mm6, %%mm3                \n\t"
502
                "pmaddwd %%mm7, %%mm0                \n\t"
503
                "pmaddwd %%mm7, %%mm3                \n\t"
504
                "pand %%mm5, %%mm1                \n\t"
505
                "pand %%mm5, %%mm4                \n\t"
506
                "por %%mm1, %%mm0                \n\t"        
507
                "por %%mm4, %%mm3                \n\t"
508
                "psrld $6, %%mm0                \n\t"
509
                "pslld $10, %%mm3                \n\t"
510
                "por %%mm3, %%mm0                \n\t"
511
                MOVNTQ"        %%mm0, (%0)                \n\t"
512
                "add $16, %1                        \n\t"
513
                "add $8, %0                        \n\t"
514
                "cmp %2, %1                        \n\t"
515
                " jb 1b                                \n\t"
516
                : "+r" (d), "+r"(s)
517
                : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
518
        );
519
#else
520
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
521
        __asm __volatile(
522
            "movq        %0, %%mm7\n\t"
523
            "movq        %1, %%mm6\n\t"
524
            ::"m"(red_15mask),"m"(green_15mask));
525
        while(s < mm_end)
526
        {
527
            __asm __volatile(
528
                PREFETCH" 32%1\n\t"
529
                "movd        %1, %%mm0\n\t"
530
                "movd        4%1, %%mm3\n\t"
531
                "punpckldq 8%1, %%mm0\n\t"
532
                "punpckldq 12%1, %%mm3\n\t"
533
                "movq        %%mm0, %%mm1\n\t"
534
                "movq        %%mm0, %%mm2\n\t"
535
                "movq        %%mm3, %%mm4\n\t"
536
                "movq        %%mm3, %%mm5\n\t"
537
                "psrlq        $3, %%mm0\n\t"
538
                "psrlq        $3, %%mm3\n\t"
539
                "pand        %2, %%mm0\n\t"
540
                "pand        %2, %%mm3\n\t"
541
                "psrlq        $6, %%mm1\n\t"
542
                "psrlq        $6, %%mm4\n\t"
543
                "pand        %%mm6, %%mm1\n\t"
544
                "pand        %%mm6, %%mm4\n\t"
545
                "psrlq        $9, %%mm2\n\t"
546
                "psrlq        $9, %%mm5\n\t"
547
                "pand        %%mm7, %%mm2\n\t"
548
                "pand        %%mm7, %%mm5\n\t"
549
                "por        %%mm1, %%mm0\n\t"
550
                "por        %%mm4, %%mm3\n\t"
551
                "por        %%mm2, %%mm0\n\t"
552
                "por        %%mm5, %%mm3\n\t"
553
                "psllq        $16, %%mm3\n\t"
554
                "por        %%mm3, %%mm0\n\t"
555
                MOVNTQ"        %%mm0, %0\n\t"
556
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
557
                d += 4;
558
                s += 16;
559
        }
560
#endif
561
        __asm __volatile(SFENCE:::"memory");
562
        __asm __volatile(EMMS:::"memory");
563
#endif
564
        while(s < end)
565
        {
566
                // FIXME on bigendian
567
                const int src= *s; s += 4;
568
                *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
569
        }
570
}
571

    
572
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
573
{
574
        const uint8_t *s = src;
575
        const uint8_t *end;
576
#ifdef HAVE_MMX
577
        const uint8_t *mm_end;
578
#endif
579
        uint16_t *d = (uint16_t *)dst;
580
        end = s + src_size;
581
#ifdef HAVE_MMX
582
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
583
        __asm __volatile(
584
            "movq        %0, %%mm7\n\t"
585
            "movq        %1, %%mm6\n\t"
586
            ::"m"(red_15mask),"m"(green_15mask));
587
        mm_end = end - 15;
588
        while(s < mm_end)
589
        {
590
            __asm __volatile(
591
                PREFETCH" 32%1\n\t"
592
                "movd        %1, %%mm0\n\t"
593
                "movd        4%1, %%mm3\n\t"
594
                "punpckldq 8%1, %%mm0\n\t"
595
                "punpckldq 12%1, %%mm3\n\t"
596
                "movq        %%mm0, %%mm1\n\t"
597
                "movq        %%mm0, %%mm2\n\t"
598
                "movq        %%mm3, %%mm4\n\t"
599
                "movq        %%mm3, %%mm5\n\t"
600
                "psllq        $7, %%mm0\n\t"
601
                "psllq        $7, %%mm3\n\t"
602
                "pand        %%mm7, %%mm0\n\t"
603
                "pand        %%mm7, %%mm3\n\t"
604
                "psrlq        $6, %%mm1\n\t"
605
                "psrlq        $6, %%mm4\n\t"
606
                "pand        %%mm6, %%mm1\n\t"
607
                "pand        %%mm6, %%mm4\n\t"
608
                "psrlq        $19, %%mm2\n\t"
609
                "psrlq        $19, %%mm5\n\t"
610
                "pand        %2, %%mm2\n\t"
611
                "pand        %2, %%mm5\n\t"
612
                "por        %%mm1, %%mm0\n\t"
613
                "por        %%mm4, %%mm3\n\t"
614
                "por        %%mm2, %%mm0\n\t"
615
                "por        %%mm5, %%mm3\n\t"
616
                "psllq        $16, %%mm3\n\t"
617
                "por        %%mm3, %%mm0\n\t"
618
                MOVNTQ"        %%mm0, %0\n\t"
619
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
620
                d += 4;
621
                s += 16;
622
        }
623
        __asm __volatile(SFENCE:::"memory");
624
        __asm __volatile(EMMS:::"memory");
625
#endif
626
        while(s < end)
627
        {
628
                // FIXME on bigendian
629
                const int src= *s; s += 4;
630
                *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
631
        }
632
}
633

    
634
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
635
{
636
        const uint8_t *s = src;
637
        const uint8_t *end;
638
#ifdef HAVE_MMX
639
        const uint8_t *mm_end;
640
#endif
641
        uint16_t *d = (uint16_t *)dst;
642
        end = s + src_size;
643
#ifdef HAVE_MMX
644
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
645
        __asm __volatile(
646
            "movq        %0, %%mm7\n\t"
647
            "movq        %1, %%mm6\n\t"
648
            ::"m"(red_16mask),"m"(green_16mask));
649
        mm_end = end - 11;
650
        while(s < mm_end)
651
        {
652
            __asm __volatile(
653
                PREFETCH" 32%1\n\t"
654
                "movd        %1, %%mm0\n\t"
655
                "movd        3%1, %%mm3\n\t"
656
                "punpckldq 6%1, %%mm0\n\t"
657
                "punpckldq 9%1, %%mm3\n\t"
658
                "movq        %%mm0, %%mm1\n\t"
659
                "movq        %%mm0, %%mm2\n\t"
660
                "movq        %%mm3, %%mm4\n\t"
661
                "movq        %%mm3, %%mm5\n\t"
662
                "psrlq        $3, %%mm0\n\t"
663
                "psrlq        $3, %%mm3\n\t"
664
                "pand        %2, %%mm0\n\t"
665
                "pand        %2, %%mm3\n\t"
666
                "psrlq        $5, %%mm1\n\t"
667
                "psrlq        $5, %%mm4\n\t"
668
                "pand        %%mm6, %%mm1\n\t"
669
                "pand        %%mm6, %%mm4\n\t"
670
                "psrlq        $8, %%mm2\n\t"
671
                "psrlq        $8, %%mm5\n\t"
672
                "pand        %%mm7, %%mm2\n\t"
673
                "pand        %%mm7, %%mm5\n\t"
674
                "por        %%mm1, %%mm0\n\t"
675
                "por        %%mm4, %%mm3\n\t"
676
                "por        %%mm2, %%mm0\n\t"
677
                "por        %%mm5, %%mm3\n\t"
678
                "psllq        $16, %%mm3\n\t"
679
                "por        %%mm3, %%mm0\n\t"
680
                MOVNTQ"        %%mm0, %0\n\t"
681
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
682
                d += 4;
683
                s += 12;
684
        }
685
        __asm __volatile(SFENCE:::"memory");
686
        __asm __volatile(EMMS:::"memory");
687
#endif
688
        while(s < end)
689
        {
690
                const int b= *s++;
691
                const int g= *s++;
692
                const int r= *s++;
693
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
694
        }
695
}
696

    
697
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
698
{
699
        const uint8_t *s = src;
700
        const uint8_t *end;
701
#ifdef HAVE_MMX
702
        const uint8_t *mm_end;
703
#endif
704
        uint16_t *d = (uint16_t *)dst;
705
        end = s + src_size;
706
#ifdef HAVE_MMX
707
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
708
        __asm __volatile(
709
            "movq        %0, %%mm7\n\t"
710
            "movq        %1, %%mm6\n\t"
711
            ::"m"(red_16mask),"m"(green_16mask));
712
        mm_end = end - 15;
713
        while(s < mm_end)
714
        {
715
            __asm __volatile(
716
                PREFETCH" 32%1\n\t"
717
                "movd        %1, %%mm0\n\t"
718
                "movd        3%1, %%mm3\n\t"
719
                "punpckldq 6%1, %%mm0\n\t"
720
                "punpckldq 9%1, %%mm3\n\t"
721
                "movq        %%mm0, %%mm1\n\t"
722
                "movq        %%mm0, %%mm2\n\t"
723
                "movq        %%mm3, %%mm4\n\t"
724
                "movq        %%mm3, %%mm5\n\t"
725
                "psllq        $8, %%mm0\n\t"
726
                "psllq        $8, %%mm3\n\t"
727
                "pand        %%mm7, %%mm0\n\t"
728
                "pand        %%mm7, %%mm3\n\t"
729
                "psrlq        $5, %%mm1\n\t"
730
                "psrlq        $5, %%mm4\n\t"
731
                "pand        %%mm6, %%mm1\n\t"
732
                "pand        %%mm6, %%mm4\n\t"
733
                "psrlq        $19, %%mm2\n\t"
734
                "psrlq        $19, %%mm5\n\t"
735
                "pand        %2, %%mm2\n\t"
736
                "pand        %2, %%mm5\n\t"
737
                "por        %%mm1, %%mm0\n\t"
738
                "por        %%mm4, %%mm3\n\t"
739
                "por        %%mm2, %%mm0\n\t"
740
                "por        %%mm5, %%mm3\n\t"
741
                "psllq        $16, %%mm3\n\t"
742
                "por        %%mm3, %%mm0\n\t"
743
                MOVNTQ"        %%mm0, %0\n\t"
744
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
745
                d += 4;
746
                s += 12;
747
        }
748
        __asm __volatile(SFENCE:::"memory");
749
        __asm __volatile(EMMS:::"memory");
750
#endif
751
        while(s < end)
752
        {
753
                const int r= *s++;
754
                const int g= *s++;
755
                const int b= *s++;
756
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
757
        }
758
}
759

    
760
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
761
{
762
        const uint8_t *s = src;
763
        const uint8_t *end;
764
#ifdef HAVE_MMX
765
        const uint8_t *mm_end;
766
#endif
767
        uint16_t *d = (uint16_t *)dst;
768
        end = s + src_size;
769
#ifdef HAVE_MMX
770
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
771
        __asm __volatile(
772
            "movq        %0, %%mm7\n\t"
773
            "movq        %1, %%mm6\n\t"
774
            ::"m"(red_15mask),"m"(green_15mask));
775
        mm_end = end - 11;
776
        while(s < mm_end)
777
        {
778
            __asm __volatile(
779
                PREFETCH" 32%1\n\t"
780
                "movd        %1, %%mm0\n\t"
781
                "movd        3%1, %%mm3\n\t"
782
                "punpckldq 6%1, %%mm0\n\t"
783
                "punpckldq 9%1, %%mm3\n\t"
784
                "movq        %%mm0, %%mm1\n\t"
785
                "movq        %%mm0, %%mm2\n\t"
786
                "movq        %%mm3, %%mm4\n\t"
787
                "movq        %%mm3, %%mm5\n\t"
788
                "psrlq        $3, %%mm0\n\t"
789
                "psrlq        $3, %%mm3\n\t"
790
                "pand        %2, %%mm0\n\t"
791
                "pand        %2, %%mm3\n\t"
792
                "psrlq        $6, %%mm1\n\t"
793
                "psrlq        $6, %%mm4\n\t"
794
                "pand        %%mm6, %%mm1\n\t"
795
                "pand        %%mm6, %%mm4\n\t"
796
                "psrlq        $9, %%mm2\n\t"
797
                "psrlq        $9, %%mm5\n\t"
798
                "pand        %%mm7, %%mm2\n\t"
799
                "pand        %%mm7, %%mm5\n\t"
800
                "por        %%mm1, %%mm0\n\t"
801
                "por        %%mm4, %%mm3\n\t"
802
                "por        %%mm2, %%mm0\n\t"
803
                "por        %%mm5, %%mm3\n\t"
804
                "psllq        $16, %%mm3\n\t"
805
                "por        %%mm3, %%mm0\n\t"
806
                MOVNTQ"        %%mm0, %0\n\t"
807
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
808
                d += 4;
809
                s += 12;
810
        }
811
        __asm __volatile(SFENCE:::"memory");
812
        __asm __volatile(EMMS:::"memory");
813
#endif
814
        while(s < end)
815
        {
816
                const int b= *s++;
817
                const int g= *s++;
818
                const int r= *s++;
819
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
820
        }
821
}
822

    
823
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
824
{
825
        const uint8_t *s = src;
826
        const uint8_t *end;
827
#ifdef HAVE_MMX
828
        const uint8_t *mm_end;
829
#endif
830
        uint16_t *d = (uint16_t *)dst;
831
        end = s + src_size;
832
#ifdef HAVE_MMX
833
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
834
        __asm __volatile(
835
            "movq        %0, %%mm7\n\t"
836
            "movq        %1, %%mm6\n\t"
837
            ::"m"(red_15mask),"m"(green_15mask));
838
        mm_end = end - 15;
839
        while(s < mm_end)
840
        {
841
            __asm __volatile(
842
                PREFETCH" 32%1\n\t"
843
                "movd        %1, %%mm0\n\t"
844
                "movd        3%1, %%mm3\n\t"
845
                "punpckldq 6%1, %%mm0\n\t"
846
                "punpckldq 9%1, %%mm3\n\t"
847
                "movq        %%mm0, %%mm1\n\t"
848
                "movq        %%mm0, %%mm2\n\t"
849
                "movq        %%mm3, %%mm4\n\t"
850
                "movq        %%mm3, %%mm5\n\t"
851
                "psllq        $7, %%mm0\n\t"
852
                "psllq        $7, %%mm3\n\t"
853
                "pand        %%mm7, %%mm0\n\t"
854
                "pand        %%mm7, %%mm3\n\t"
855
                "psrlq        $6, %%mm1\n\t"
856
                "psrlq        $6, %%mm4\n\t"
857
                "pand        %%mm6, %%mm1\n\t"
858
                "pand        %%mm6, %%mm4\n\t"
859
                "psrlq        $19, %%mm2\n\t"
860
                "psrlq        $19, %%mm5\n\t"
861
                "pand        %2, %%mm2\n\t"
862
                "pand        %2, %%mm5\n\t"
863
                "por        %%mm1, %%mm0\n\t"
864
                "por        %%mm4, %%mm3\n\t"
865
                "por        %%mm2, %%mm0\n\t"
866
                "por        %%mm5, %%mm3\n\t"
867
                "psllq        $16, %%mm3\n\t"
868
                "por        %%mm3, %%mm0\n\t"
869
                MOVNTQ"        %%mm0, %0\n\t"
870
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
871
                d += 4;
872
                s += 12;
873
        }
874
        __asm __volatile(SFENCE:::"memory");
875
        __asm __volatile(EMMS:::"memory");
876
#endif
877
        while(s < end)
878
        {
879
                const int r= *s++;
880
                const int g= *s++;
881
                const int b= *s++;
882
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
883
        }
884
}
885

    
886
/*
887
  I use here less accurate approximation by simply
888
 left-shifting the input
889
  value and filling the low order bits with
890
 zeroes. This method improves png's
891
  compression but this scheme cannot reproduce white exactly, since it does not
892
  generate an all-ones maximum value; the net effect is to darken the
893
  image slightly.
894

895
  The better method should be "left bit replication":
896

897
   4 3 2 1 0
898
   ---------
899
   1 1 0 1 1
900

901
   7 6 5 4 3  2 1 0
902
   ----------------
903
   1 1 0 1 1  1 1 0
904
   |=======|  |===|
905
       |      Leftmost Bits Repeated to Fill Open Bits
906
       |
907
   Original Bits
908
*/
909
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
910
{
911
        const uint16_t *end;
912
#ifdef HAVE_MMX
913
        const uint16_t *mm_end;
914
#endif
915
        uint8_t *d = (uint8_t *)dst;
916
        const uint16_t *s = (uint16_t *)src;
917
        end = s + src_size/2;
918
#ifdef HAVE_MMX
919
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
920
        mm_end = end - 7;
921
        while(s < mm_end)
922
        {
923
            __asm __volatile(
924
                PREFETCH" 32%1\n\t"
925
                "movq        %1, %%mm0\n\t"
926
                "movq        %1, %%mm1\n\t"
927
                "movq        %1, %%mm2\n\t"
928
                "pand        %2, %%mm0\n\t"
929
                "pand        %3, %%mm1\n\t"
930
                "pand        %4, %%mm2\n\t"
931
                "psllq        $3, %%mm0\n\t"
932
                "psrlq        $2, %%mm1\n\t"
933
                "psrlq        $7, %%mm2\n\t"
934
                "movq        %%mm0, %%mm3\n\t"
935
                "movq        %%mm1, %%mm4\n\t"
936
                "movq        %%mm2, %%mm5\n\t"
937
                "punpcklwd %5, %%mm0\n\t"
938
                "punpcklwd %5, %%mm1\n\t"
939
                "punpcklwd %5, %%mm2\n\t"
940
                "punpckhwd %5, %%mm3\n\t"
941
                "punpckhwd %5, %%mm4\n\t"
942
                "punpckhwd %5, %%mm5\n\t"
943
                "psllq        $8, %%mm1\n\t"
944
                "psllq        $16, %%mm2\n\t"
945
                "por        %%mm1, %%mm0\n\t"
946
                "por        %%mm2, %%mm0\n\t"
947
                "psllq        $8, %%mm4\n\t"
948
                "psllq        $16, %%mm5\n\t"
949
                "por        %%mm4, %%mm3\n\t"
950
                "por        %%mm5, %%mm3\n\t"
951

    
952
                "movq        %%mm0, %%mm6\n\t"
953
                "movq        %%mm3, %%mm7\n\t"
954
                
955
                "movq        8%1, %%mm0\n\t"
956
                "movq        8%1, %%mm1\n\t"
957
                "movq        8%1, %%mm2\n\t"
958
                "pand        %2, %%mm0\n\t"
959
                "pand        %3, %%mm1\n\t"
960
                "pand        %4, %%mm2\n\t"
961
                "psllq        $3, %%mm0\n\t"
962
                "psrlq        $2, %%mm1\n\t"
963
                "psrlq        $7, %%mm2\n\t"
964
                "movq        %%mm0, %%mm3\n\t"
965
                "movq        %%mm1, %%mm4\n\t"
966
                "movq        %%mm2, %%mm5\n\t"
967
                "punpcklwd %5, %%mm0\n\t"
968
                "punpcklwd %5, %%mm1\n\t"
969
                "punpcklwd %5, %%mm2\n\t"
970
                "punpckhwd %5, %%mm3\n\t"
971
                "punpckhwd %5, %%mm4\n\t"
972
                "punpckhwd %5, %%mm5\n\t"
973
                "psllq        $8, %%mm1\n\t"
974
                "psllq        $16, %%mm2\n\t"
975
                "por        %%mm1, %%mm0\n\t"
976
                "por        %%mm2, %%mm0\n\t"
977
                "psllq        $8, %%mm4\n\t"
978
                "psllq        $16, %%mm5\n\t"
979
                "por        %%mm4, %%mm3\n\t"
980
                "por        %%mm5, %%mm3\n\t"
981

    
982
                :"=m"(*d)
983
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
984
                :"memory");
985
            /* Borrowed 32 to 24 */
986
            __asm __volatile(
987
                "movq        %%mm0, %%mm4\n\t"
988
                "movq        %%mm3, %%mm5\n\t"
989
                "movq        %%mm6, %%mm0\n\t"
990
                "movq        %%mm7, %%mm1\n\t"
991
                
992
                "movq        %%mm4, %%mm6\n\t"
993
                "movq        %%mm5, %%mm7\n\t"
994
                "movq        %%mm0, %%mm2\n\t"
995
                "movq        %%mm1, %%mm3\n\t"
996

    
997
                "psrlq        $8, %%mm2\n\t"
998
                "psrlq        $8, %%mm3\n\t"
999
                "psrlq        $8, %%mm6\n\t"
1000
                "psrlq        $8, %%mm7\n\t"
1001
                "pand        %2, %%mm0\n\t"
1002
                "pand        %2, %%mm1\n\t"
1003
                "pand        %2, %%mm4\n\t"
1004
                "pand        %2, %%mm5\n\t"
1005
                "pand        %3, %%mm2\n\t"
1006
                "pand        %3, %%mm3\n\t"
1007
                "pand        %3, %%mm6\n\t"
1008
                "pand        %3, %%mm7\n\t"
1009
                "por        %%mm2, %%mm0\n\t"
1010
                "por        %%mm3, %%mm1\n\t"
1011
                "por        %%mm6, %%mm4\n\t"
1012
                "por        %%mm7, %%mm5\n\t"
1013

    
1014
                "movq        %%mm1, %%mm2\n\t"
1015
                "movq        %%mm4, %%mm3\n\t"
1016
                "psllq        $48, %%mm2\n\t"
1017
                "psllq        $32, %%mm3\n\t"
1018
                "pand        %4, %%mm2\n\t"
1019
                "pand        %5, %%mm3\n\t"
1020
                "por        %%mm2, %%mm0\n\t"
1021
                "psrlq        $16, %%mm1\n\t"
1022
                "psrlq        $32, %%mm4\n\t"
1023
                "psllq        $16, %%mm5\n\t"
1024
                "por        %%mm3, %%mm1\n\t"
1025
                "pand        %6, %%mm5\n\t"
1026
                "por        %%mm5, %%mm4\n\t"
1027

    
1028
                MOVNTQ"        %%mm0, %0\n\t"
1029
                MOVNTQ"        %%mm1, 8%0\n\t"
1030
                MOVNTQ"        %%mm4, 16%0"
1031

    
1032
                :"=m"(*d)
1033
                :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1034
                :"memory");
1035
                d += 24;
1036
                s += 8;
1037
        }
1038
        __asm __volatile(SFENCE:::"memory");
1039
        __asm __volatile(EMMS:::"memory");
1040
#endif
1041
        while(s < end)
1042
        {
1043
                register uint16_t bgr;
1044
                bgr = *s++;
1045
                *d++ = (bgr&0x1F)<<3;
1046
                *d++ = (bgr&0x3E0)>>2;
1047
                *d++ = (bgr&0x7C00)>>7;
1048
        }
1049
}
1050

    
1051
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1052
{
1053
        const uint16_t *end;
1054
#ifdef HAVE_MMX
1055
        const uint16_t *mm_end;
1056
#endif
1057
        uint8_t *d = (uint8_t *)dst;
1058
        const uint16_t *s = (const uint16_t *)src;
1059
        end = s + src_size/2;
1060
#ifdef HAVE_MMX
1061
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
1062
        mm_end = end - 7;
1063
        while(s < mm_end)
1064
        {
1065
            __asm __volatile(
1066
                PREFETCH" 32%1\n\t"
1067
                "movq        %1, %%mm0\n\t"
1068
                "movq        %1, %%mm1\n\t"
1069
                "movq        %1, %%mm2\n\t"
1070
                "pand        %2, %%mm0\n\t"
1071
                "pand        %3, %%mm1\n\t"
1072
                "pand        %4, %%mm2\n\t"
1073
                "psllq        $3, %%mm0\n\t"
1074
                "psrlq        $3, %%mm1\n\t"
1075
                "psrlq        $8, %%mm2\n\t"
1076
                "movq        %%mm0, %%mm3\n\t"
1077
                "movq        %%mm1, %%mm4\n\t"
1078
                "movq        %%mm2, %%mm5\n\t"
1079
                "punpcklwd %5, %%mm0\n\t"
1080
                "punpcklwd %5, %%mm1\n\t"
1081
                "punpcklwd %5, %%mm2\n\t"
1082
                "punpckhwd %5, %%mm3\n\t"
1083
                "punpckhwd %5, %%mm4\n\t"
1084
                "punpckhwd %5, %%mm5\n\t"
1085
                "psllq        $8, %%mm1\n\t"
1086
                "psllq        $16, %%mm2\n\t"
1087
                "por        %%mm1, %%mm0\n\t"
1088
                "por        %%mm2, %%mm0\n\t"
1089
                "psllq        $8, %%mm4\n\t"
1090
                "psllq        $16, %%mm5\n\t"
1091
                "por        %%mm4, %%mm3\n\t"
1092
                "por        %%mm5, %%mm3\n\t"
1093
                
1094
                "movq        %%mm0, %%mm6\n\t"
1095
                "movq        %%mm3, %%mm7\n\t"
1096

    
1097
                "movq        8%1, %%mm0\n\t"
1098
                "movq        8%1, %%mm1\n\t"
1099
                "movq        8%1, %%mm2\n\t"
1100
                "pand        %2, %%mm0\n\t"
1101
                "pand        %3, %%mm1\n\t"
1102
                "pand        %4, %%mm2\n\t"
1103
                "psllq        $3, %%mm0\n\t"
1104
                "psrlq        $3, %%mm1\n\t"
1105
                "psrlq        $8, %%mm2\n\t"
1106
                "movq        %%mm0, %%mm3\n\t"
1107
                "movq        %%mm1, %%mm4\n\t"
1108
                "movq        %%mm2, %%mm5\n\t"
1109
                "punpcklwd %5, %%mm0\n\t"
1110
                "punpcklwd %5, %%mm1\n\t"
1111
                "punpcklwd %5, %%mm2\n\t"
1112
                "punpckhwd %5, %%mm3\n\t"
1113
                "punpckhwd %5, %%mm4\n\t"
1114
                "punpckhwd %5, %%mm5\n\t"
1115
                "psllq        $8, %%mm1\n\t"
1116
                "psllq        $16, %%mm2\n\t"
1117
                "por        %%mm1, %%mm0\n\t"
1118
                "por        %%mm2, %%mm0\n\t"
1119
                "psllq        $8, %%mm4\n\t"
1120
                "psllq        $16, %%mm5\n\t"
1121
                "por        %%mm4, %%mm3\n\t"
1122
                "por        %%mm5, %%mm3\n\t"
1123
                :"=m"(*d)
1124
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)                
1125
                :"memory");
1126
            /* Borrowed 32 to 24 */
1127
            __asm __volatile(
1128
                "movq        %%mm0, %%mm4\n\t"
1129
                "movq        %%mm3, %%mm5\n\t"
1130
                "movq        %%mm6, %%mm0\n\t"
1131
                "movq        %%mm7, %%mm1\n\t"
1132
                
1133
                "movq        %%mm4, %%mm6\n\t"
1134
                "movq        %%mm5, %%mm7\n\t"
1135
                "movq        %%mm0, %%mm2\n\t"
1136
                "movq        %%mm1, %%mm3\n\t"
1137

    
1138
                "psrlq        $8, %%mm2\n\t"
1139
                "psrlq        $8, %%mm3\n\t"
1140
                "psrlq        $8, %%mm6\n\t"
1141
                "psrlq        $8, %%mm7\n\t"
1142
                "pand        %2, %%mm0\n\t"
1143
                "pand        %2, %%mm1\n\t"
1144
                "pand        %2, %%mm4\n\t"
1145
                "pand        %2, %%mm5\n\t"
1146
                "pand        %3, %%mm2\n\t"
1147
                "pand        %3, %%mm3\n\t"
1148
                "pand        %3, %%mm6\n\t"
1149
                "pand        %3, %%mm7\n\t"
1150
                "por        %%mm2, %%mm0\n\t"
1151
                "por        %%mm3, %%mm1\n\t"
1152
                "por        %%mm6, %%mm4\n\t"
1153
                "por        %%mm7, %%mm5\n\t"
1154

    
1155
                "movq        %%mm1, %%mm2\n\t"
1156
                "movq        %%mm4, %%mm3\n\t"
1157
                "psllq        $48, %%mm2\n\t"
1158
                "psllq        $32, %%mm3\n\t"
1159
                "pand        %4, %%mm2\n\t"
1160
                "pand        %5, %%mm3\n\t"
1161
                "por        %%mm2, %%mm0\n\t"
1162
                "psrlq        $16, %%mm1\n\t"
1163
                "psrlq        $32, %%mm4\n\t"
1164
                "psllq        $16, %%mm5\n\t"
1165
                "por        %%mm3, %%mm1\n\t"
1166
                "pand        %6, %%mm5\n\t"
1167
                "por        %%mm5, %%mm4\n\t"
1168

    
1169
                MOVNTQ"        %%mm0, %0\n\t"
1170
                MOVNTQ"        %%mm1, 8%0\n\t"
1171
                MOVNTQ"        %%mm4, 16%0"
1172

    
1173
                :"=m"(*d)
1174
                :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1175
                :"memory");
1176
                d += 24;
1177
                s += 8;
1178
        }
1179
        __asm __volatile(SFENCE:::"memory");
1180
        __asm __volatile(EMMS:::"memory");
1181
#endif
1182
        while(s < end)
1183
        {
1184
                register uint16_t bgr;
1185
                bgr = *s++;
1186
                *d++ = (bgr&0x1F)<<3;
1187
                *d++ = (bgr&0x7E0)>>3;
1188
                *d++ = (bgr&0xF800)>>8;
1189
        }
1190
}
1191

    
1192
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1193
{
1194
        const uint16_t *end;
1195
#ifdef HAVE_MMX
1196
        const uint16_t *mm_end;
1197
#endif
1198
        uint8_t *d = (uint8_t *)dst;
1199
        const uint16_t *s = (const uint16_t *)src;
1200
        end = s + src_size/2;
1201
#ifdef HAVE_MMX
1202
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
1203
        __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
1204
        mm_end = end - 3;
1205
        while(s < mm_end)
1206
        {
1207
            __asm __volatile(
1208
                PREFETCH" 32%1\n\t"
1209
                "movq        %1, %%mm0\n\t"
1210
                "movq        %1, %%mm1\n\t"
1211
                "movq        %1, %%mm2\n\t"
1212
                "pand        %2, %%mm0\n\t"
1213
                "pand        %3, %%mm1\n\t"
1214
                "pand        %4, %%mm2\n\t"
1215
                "psllq        $3, %%mm0\n\t"
1216
                "psrlq        $2, %%mm1\n\t"
1217
                "psrlq        $7, %%mm2\n\t"
1218
                "movq        %%mm0, %%mm3\n\t"
1219
                "movq        %%mm1, %%mm4\n\t"
1220
                "movq        %%mm2, %%mm5\n\t"
1221
                "punpcklwd %%mm7, %%mm0\n\t"
1222
                "punpcklwd %%mm7, %%mm1\n\t"
1223
                "punpcklwd %%mm7, %%mm2\n\t"
1224
                "punpckhwd %%mm7, %%mm3\n\t"
1225
                "punpckhwd %%mm7, %%mm4\n\t"
1226
                "punpckhwd %%mm7, %%mm5\n\t"
1227
                "psllq        $8, %%mm1\n\t"
1228
                "psllq        $16, %%mm2\n\t"
1229
                "por        %%mm1, %%mm0\n\t"
1230
                "por        %%mm2, %%mm0\n\t"
1231
                "psllq        $8, %%mm4\n\t"
1232
                "psllq        $16, %%mm5\n\t"
1233
                "por        %%mm4, %%mm3\n\t"
1234
                "por        %%mm5, %%mm3\n\t"
1235
                MOVNTQ"        %%mm0, %0\n\t"
1236
                MOVNTQ"        %%mm3, 8%0\n\t"
1237
                :"=m"(*d)
1238
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1239
                :"memory");
1240
                d += 16;
1241
                s += 4;
1242
        }
1243
        __asm __volatile(SFENCE:::"memory");
1244
        __asm __volatile(EMMS:::"memory");
1245
#endif
1246
        while(s < end)
1247
        {
1248
#if 0 //slightly slower on athlon
1249
                int bgr= *s++;
1250
                *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1251
#else
1252
//FIXME this is very likely wrong for bigendian (and the following converters too)
1253
                register uint16_t bgr;
1254
                bgr = *s++;
1255
#ifdef WORDS_BIGENDIAN
1256
                *d++ = 0;
1257
                *d++ = (bgr&0x1F)<<3;
1258
                *d++ = (bgr&0x3E0)>>2;
1259
                *d++ = (bgr&0x7C00)>>7;
1260
#else
1261
                *d++ = (bgr&0x1F)<<3;
1262
                *d++ = (bgr&0x3E0)>>2;
1263
                *d++ = (bgr&0x7C00)>>7;
1264
                *d++ = 0;
1265
#endif
1266

    
1267
#endif
1268
        }
1269
}
1270

    
1271
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1272
{
1273
        const uint16_t *end;
1274
#ifdef HAVE_MMX
1275
        const uint16_t *mm_end;
1276
#endif
1277
        uint8_t *d = (uint8_t *)dst;
1278
        const uint16_t *s = (uint16_t *)src;
1279
        end = s + src_size/2;
1280
#ifdef HAVE_MMX
1281
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
1282
        __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
1283
        mm_end = end - 3;
1284
        while(s < mm_end)
1285
        {
1286
            __asm __volatile(
1287
                PREFETCH" 32%1\n\t"
1288
                "movq        %1, %%mm0\n\t"
1289
                "movq        %1, %%mm1\n\t"
1290
                "movq        %1, %%mm2\n\t"
1291
                "pand        %2, %%mm0\n\t"
1292
                "pand        %3, %%mm1\n\t"
1293
                "pand        %4, %%mm2\n\t"
1294
                "psllq        $3, %%mm0\n\t"
1295
                "psrlq        $3, %%mm1\n\t"
1296
                "psrlq        $8, %%mm2\n\t"
1297
                "movq        %%mm0, %%mm3\n\t"
1298
                "movq        %%mm1, %%mm4\n\t"
1299
                "movq        %%mm2, %%mm5\n\t"
1300
                "punpcklwd %%mm7, %%mm0\n\t"
1301
                "punpcklwd %%mm7, %%mm1\n\t"
1302
                "punpcklwd %%mm7, %%mm2\n\t"
1303
                "punpckhwd %%mm7, %%mm3\n\t"
1304
                "punpckhwd %%mm7, %%mm4\n\t"
1305
                "punpckhwd %%mm7, %%mm5\n\t"
1306
                "psllq        $8, %%mm1\n\t"
1307
                "psllq        $16, %%mm2\n\t"
1308
                "por        %%mm1, %%mm0\n\t"
1309
                "por        %%mm2, %%mm0\n\t"
1310
                "psllq        $8, %%mm4\n\t"
1311
                "psllq        $16, %%mm5\n\t"
1312
                "por        %%mm4, %%mm3\n\t"
1313
                "por        %%mm5, %%mm3\n\t"
1314
                MOVNTQ"        %%mm0, %0\n\t"
1315
                MOVNTQ"        %%mm3, 8%0\n\t"
1316
                :"=m"(*d)
1317
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1318
                :"memory");
1319
                d += 16;
1320
                s += 4;
1321
        }
1322
        __asm __volatile(SFENCE:::"memory");
1323
        __asm __volatile(EMMS:::"memory");
1324
#endif
1325
        while(s < end)
1326
        {
1327
                register uint16_t bgr;
1328
                bgr = *s++;
1329
#ifdef WORDS_BIGENDIAN
1330
                *d++ = 0;
1331
                *d++ = (bgr&0x1F)<<3;
1332
                *d++ = (bgr&0x7E0)>>3;
1333
                *d++ = (bgr&0xF800)>>8;
1334
#else
1335
                *d++ = (bgr&0x1F)<<3;
1336
                *d++ = (bgr&0x7E0)>>3;
1337
                *d++ = (bgr&0xF800)>>8;
1338
                *d++ = 0;
1339
#endif
1340
        }
1341
}
1342

    
1343
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1344
{
1345
#ifdef HAVE_MMX
1346
/* TODO: unroll this loop */
1347
        asm volatile (
1348
                "xor %%"REG_a", %%"REG_a"        \n\t"
1349
                ".balign 16                        \n\t"
1350
                "1:                                \n\t"
1351
                PREFETCH" 32(%0, %%"REG_a")        \n\t"
1352
                "movq (%0, %%"REG_a"), %%mm0        \n\t"
1353
                "movq %%mm0, %%mm1                \n\t"
1354
                "movq %%mm0, %%mm2                \n\t"
1355
                "pslld $16, %%mm0                \n\t"
1356
                "psrld $16, %%mm1                \n\t"
1357
                "pand "MANGLE(mask32r)", %%mm0        \n\t"
1358
                "pand "MANGLE(mask32g)", %%mm2        \n\t"
1359
                "pand "MANGLE(mask32b)", %%mm1        \n\t"
1360
                "por %%mm0, %%mm2                \n\t"
1361
                "por %%mm1, %%mm2                \n\t"
1362
                MOVNTQ" %%mm2, (%1, %%"REG_a")        \n\t"
1363
                "add $8, %%"REG_a"                \n\t"
1364
                "cmp %2, %%"REG_a"                \n\t"
1365
                " jb 1b                                \n\t"
1366
                :: "r" (src), "r"(dst), "r" ((long)src_size-7)
1367
                : "%"REG_a
1368
        );
1369

    
1370
        __asm __volatile(SFENCE:::"memory");
1371
        __asm __volatile(EMMS:::"memory");
1372
#else
1373
        unsigned i;
1374
        unsigned num_pixels = src_size >> 2;
1375
        for(i=0; i<num_pixels; i++)
1376
        {
1377
#ifdef WORDS_BIGENDIAN  
1378
          dst[4*i + 1] = src[4*i + 3];
1379
          dst[4*i + 2] = src[4*i + 2];
1380
          dst[4*i + 3] = src[4*i + 1];
1381
#else
1382
          dst[4*i + 0] = src[4*i + 2];
1383
          dst[4*i + 1] = src[4*i + 1];
1384
          dst[4*i + 2] = src[4*i + 0];
1385
#endif
1386
        }
1387
#endif
1388
}
1389

    
1390
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        unsigned i;
#ifdef HAVE_MMX
        long mmx_size= 23 - src_size;
        asm volatile (
                "movq "MANGLE(mask24r)", %%mm5        \n\t"
                "movq "MANGLE(mask24g)", %%mm6        \n\t"
                "movq "MANGLE(mask24b)", %%mm7        \n\t"
                ".balign 16                        \n\t"
                "1:                                \n\t"
                PREFETCH" 32(%1, %%"REG_a")        \n\t"
                "movq   (%1, %%"REG_a"), %%mm0        \n\t" // BGR BGR BG
                "movq   (%1, %%"REG_a"), %%mm1        \n\t" // BGR BGR BG
                "movq  2(%1, %%"REG_a"), %%mm2        \n\t" // R BGR BGR B
                "psllq $16, %%mm0                \n\t" // 00 BGR BGR
                "pand %%mm5, %%mm0                \n\t"
                "pand %%mm6, %%mm1                \n\t"
                "pand %%mm7, %%mm2                \n\t"
                "por %%mm0, %%mm1                \n\t"
                "por %%mm2, %%mm1                \n\t"
                "movq  6(%1, %%"REG_a"), %%mm0        \n\t" // BGR BGR BG
                MOVNTQ" %%mm1,   (%2, %%"REG_a")\n\t" // RGB RGB RG
                "movq  8(%1, %%"REG_a"), %%mm1        \n\t" // R BGR BGR B
                "movq 10(%1, %%"REG_a"), %%mm2        \n\t" // GR BGR BGR
                "pand %%mm7, %%mm0                \n\t"
                "pand %%mm5, %%mm1                \n\t"
                "pand %%mm6, %%mm2                \n\t"
                "por %%mm0, %%mm1                \n\t"
                "por %%mm2, %%mm1                \n\t"
                "movq 14(%1, %%"REG_a"), %%mm0        \n\t" // R BGR BGR B
                MOVNTQ" %%mm1,  8(%2, %%"REG_a")\n\t" // B RGB RGB R
                "movq 16(%1, %%"REG_a"), %%mm1        \n\t" // GR BGR BGR
                "movq 18(%1, %%"REG_a"), %%mm2        \n\t" // BGR BGR BG
                "pand %%mm6, %%mm0                \n\t"
                "pand %%mm7, %%mm1                \n\t"
                "pand %%mm5, %%mm2                \n\t"
                "por %%mm0, %%mm1                \n\t"
                "por %%mm2, %%mm1                \n\t"
                MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
                "add $24, %%"REG_a"                \n\t"
                " js 1b                                \n\t"
                : "+a" (mmx_size)
                : "r" (src-mmx_size), "r"(dst-mmx_size)
        );

        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");

        if(mmx_size==23) return; //finished, src_size was a multiple of 8

        src+= src_size;
        dst+= src_size;
        src_size= 23-mmx_size;
        src-= src_size;
        dst-= src_size;
#endif
        for(i=0; i<src_size; i+=3)
        {
                register uint8_t x;
                x          = src[i + 2];
                dst[i + 1] = src[i + 1];
                dst[i + 2] = src[i + 0];
                dst[i + 0] = x;
        }
}

static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
        unsigned y;
        const unsigned chromWidth= width>>1;
        for(y=0; y<height; y++)
        {
#ifdef HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by mem anyway)
                asm volatile(
                        "xor %%"REG_a", %%"REG_a"        \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 32(%1, %%"REG_a", 2)        \n\t"
                        PREFETCH" 32(%2, %%"REG_a")        \n\t"
                        PREFETCH" 32(%3, %%"REG_a")        \n\t"
                        "movq (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
                        "movq %%mm0, %%mm2                \n\t" // U(0)
                        "movq (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
                        "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)

                        "movq (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
                        "movq 8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
                        "movq %%mm3, %%mm4                \n\t" // Y(0)
                        "movq %%mm5, %%mm6                \n\t" // Y(8)
                        "punpcklbw %%mm0, %%mm3                \n\t" // YUYV YUYV(0)
                        "punpckhbw %%mm0, %%mm4                \n\t" // YUYV YUYV(4)
                        "punpcklbw %%mm2, %%mm5                \n\t" // YUYV YUYV(8)
                        "punpckhbw %%mm2, %%mm6                \n\t" // YUYV YUYV(12)

                        MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
                        MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
                        MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
                        MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"

                        "add $8, %%"REG_a"                \n\t"
                        "cmp %4, %%"REG_a"                \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" ((long)chromWidth)
                        : "%"REG_a
                );
#else

#if defined ARCH_ALPHA && defined HAVE_MVI
#define pl2yuy2(n)                                        \
        y1 = yc[n];                                        \
        y2 = yc2[n];                                        \
        u = uc[n];                                        \
        v = vc[n];                                        \
        asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));        \
        asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));        \
        asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
        asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
        yuv1 = (u << 8) + (v << 24);                        \
        yuv2 = yuv1 + y2;                                \
        yuv1 += y1;                                        \
        qdst[n] = yuv1;                                        \
        qdst2[n] = yuv2;

                int i;
                uint64_t *qdst = (uint64_t *) dst;
                uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
                const uint32_t *yc = (uint32_t *) ysrc;
                const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
                const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
                for(i = 0; i < chromWidth; i += 8){
                        uint64_t y1, y2, yuv1, yuv2;
                        uint64_t u, v;
                        /* Prefetch */
                        asm("ldq $31,64(%0)" :: "r"(yc));
                        asm("ldq $31,64(%0)" :: "r"(yc2));
                        asm("ldq $31,64(%0)" :: "r"(uc));
                        asm("ldq $31,64(%0)" :: "r"(vc));

                        pl2yuy2(0);
                        pl2yuy2(1);
                        pl2yuy2(2);
                        pl2yuy2(3);

                        yc += 4;
                        yc2 += 4;
                        uc += 4;
                        vc += 4;
                        qdst += 4;
                        qdst2 += 4;
                }
                y++;
                ysrc += lumStride;
                dst += dstStride;

#elif __WORDSIZE >= 64
                int i;
                uint64_t *ldst = (uint64_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i += 2){
                        uint64_t k, l;
                        k = yc[0] + (uc[0] << 8) +
                            (yc[1] << 16) + (vc[0] << 24);
                        l = yc[2] + (uc[1] << 8) +
                            (yc[3] << 16) + (vc[1] << 24);
                        *ldst++ = k + (l << 32);
                        yc += 4;
                        uc += 2;
                        vc += 2;
                }

#else
                int i, *idst = (int32_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
                        *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                            (yc[1] << 8) + (vc[0] << 0);
#else
                        *idst++ = yc[0] + (uc[0] << 8) +
                            (yc[1] << 16) + (vc[0] << 24);
#endif
                        yc += 2;
                        uc++;
                        vc++;
                }
#endif
#endif
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
                {
                        usrc += chromStride;
                        vsrc += chromStride;
                }
                ysrc += lumStride;
                dst += dstStride;
        }
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}

/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int dstStride)
{
        //FIXME interpolate chroma
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

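/* Illustrative usage sketch (not taken from this file; buffer names and sizes are
 * assumptions): packing one YV12 frame into a YUY2 buffer with the wrapper above.
 * The point is the stride arithmetic: YV12 chroma is subsampled 2x2, so chromStride
 * is half of lumStride, and a packed YUY2 line occupies width*2 bytes.
 *
 *     int w= 640, h= 480;                  // multiples of 16 and 2, per the comment above
 *     uint8_t *y_p, *u_p, *v_p, *out;      // assumed to be allocated elsewhere
 *     RENAME(yv12toyuy2)(y_p, u_p, v_p, out,
 *                        w, h,
 *                        w,                // lumStride
 *                        w/2,              // chromStride
 *                        w*2);             // dstStride
 */
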
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
        unsigned y;
        const unsigned chromWidth= width>>1;
        for(y=0; y<height; y++)
        {
#ifdef HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by mem anyway)
                asm volatile(
                        "xor %%"REG_a", %%"REG_a"        \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 32(%1, %%"REG_a", 2)        \n\t"
                        PREFETCH" 32(%2, %%"REG_a")        \n\t"
                        PREFETCH" 32(%3, %%"REG_a")        \n\t"
                        "movq (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
                        "movq %%mm0, %%mm2                \n\t" // U(0)
                        "movq (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
                        "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)

                        "movq (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
                        "movq 8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
                        "movq %%mm0, %%mm4                \n\t" // UVUV UVUV(0)
                        "movq %%mm2, %%mm6                \n\t" // UVUV UVUV(8)
                        "punpcklbw %%mm3, %%mm0                \n\t" // UYVY UYVY(0)
                        "punpckhbw %%mm3, %%mm4                \n\t" // UYVY UYVY(4)
                        "punpcklbw %%mm5, %%mm2                \n\t" // UYVY UYVY(8)
                        "punpckhbw %%mm5, %%mm6                \n\t" // UYVY UYVY(12)

                        MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
                        MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
                        MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
                        MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"

                        "add $8, %%"REG_a"                \n\t"
                        "cmp %4, %%"REG_a"                \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" ((long)chromWidth)
                        : "%"REG_a
                );
#else
//FIXME adapt the Alpha asm code from yv12->yuy2

#if __WORDSIZE >= 64
                int i;
                uint64_t *ldst = (uint64_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i += 2){
                        uint64_t k, l;
                        k = uc[0] + (yc[0] << 8) +
                            (vc[0] << 16) + (yc[1] << 24);
                        l = uc[1] + (yc[2] << 8) +
                            (vc[1] << 16) + (yc[3] << 24);
                        *ldst++ = k + (l << 32);
                        yc += 4;
                        uc += 2;
                        vc += 2;
                }

#else
                int i, *idst = (int32_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
                        *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
                            (vc[0] << 8) + (yc[1] << 0);
#else
                        *idst++ = uc[0] + (yc[0] << 8) +
                            (vc[0] << 16) + (yc[1] << 24);
#endif
                        yc += 2;
                        uc++;
                        vc++;
                }
#endif
#endif
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
                {
                        usrc += chromStride;
                        vsrc += chromStride;
                }
                ysrc += lumStride;
                dst += dstStride;
        }
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}

/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int dstStride)
{
        //FIXME interpolate chroma
        RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/**
 *
 * width should be a multiple of 16
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int dstStride)
{
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int srcStride)
{
        unsigned y;
        const unsigned chromWidth= width>>1;
        for(y=0; y<height; y+=2)
        {
#ifdef HAVE_MMX
                asm volatile(
                        "xor %%"REG_a", %%"REG_a"        \n\t"
                        "pcmpeqw %%mm7, %%mm7                \n\t"
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%"REG_a", 4)        \n\t"
                        "movq (%0, %%"REG_a", 4), %%mm0        \n\t" // YUYV YUYV(0)
                        "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
                        "movq %%mm0, %%mm2                \n\t" // YUYV YUYV(0)
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(4)
                        "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)

                        MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"

                        "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
                        "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(8)
                        "movq %%mm2, %%mm4                \n\t" // YUYV YUYV(12)
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
                        "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
                        "pand %%mm7, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"

                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)

                        MOVNTQ" %%mm0, (%3, %%"REG_a")        \n\t"
                        MOVNTQ" %%mm2, (%2, %%"REG_a")        \n\t"

                        "add $8, %%"REG_a"                \n\t"
                        "cmp %4, %%"REG_a"                \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" ((long)chromWidth)
                        : "memory", "%"REG_a
                );

                ydst += lumStride;
                src  += srcStride;

                asm volatile(
                        "xor %%"REG_a", %%"REG_a"        \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%"REG_a", 4)        \n\t"
                        "movq (%0, %%"REG_a", 4), %%mm0        \n\t" // YUYV YUYV(0)
                        "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
                        "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
                        "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
                        "pand %%mm7, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
                        "pand %%mm7, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
                        MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"

                        "add $8, %%"REG_a"                \n\t"
                        "cmp %4, %%"REG_a"                \n\t"
                        " jb 1b                                \n\t"

                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" ((long)chromWidth)
                        : "memory", "%"REG_a
                );
#else
                unsigned i;
                for(i=0; i<chromWidth; i++)
                {
                        ydst[2*i+0]         = src[4*i+0];
                        udst[i]         = src[4*i+1];
                        ydst[2*i+1]         = src[4*i+2];
                        vdst[i]         = src[4*i+3];
                }
                ydst += lumStride;
                src  += srcStride;

                for(i=0; i<chromWidth; i++)
                {
                        ydst[2*i+0]         = src[4*i+0];
                        ydst[2*i+1]         = src[4*i+2];
                }
#endif
                udst += chromStride;
                vdst += chromStride;
                ydst += lumStride;
                src  += srcStride;
        }
#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}

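/* Illustrative usage sketch (not taken from this file; names are assumptions):
 * splitting a packed YUY2 buffer back into YV12 planes with the function above.
 * Note that chroma is taken from the even lines only (no vertical averaging),
 * which matches the "write HQ version" remarks elsewhere in this file.
 *
 *     int w= 640, h= 480;
 *     uint8_t *in, *y_p, *u_p, *v_p;       // assumed to be allocated elsewhere
 *     RENAME(yuy2toyv12)(in, y_p, u_p, v_p,
 *                        w, h,
 *                        w,                // lumStride
 *                        w/2,              // chromStride
 *                        w*2);             // srcStride (2 bytes per pixel in YUY2)
 */
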
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
        uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        unsigned int width, unsigned int height, int lumStride, int chromStride)
{
        /* Y Plane */
        memcpy(ydst, ysrc, width*height);

        /* XXX: implement upscaling for U,V */
}

static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
        int x,y;

        dst[0]= src[0];

        // first line
        for(x=0; x<srcWidth-1; x++){
                dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
                dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
        }
        dst[2*srcWidth-1]= src[srcWidth-1];

        dst+= dstStride;

        for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                const long mmxSize= srcWidth&~15;
                asm volatile(
                        "mov %4, %%"REG_a"                \n\t"
                        "1:                                \n\t"
                        "movq (%0, %%"REG_a"), %%mm0        \n\t"
                        "movq (%1, %%"REG_a"), %%mm1        \n\t"
                        "movq 1(%0, %%"REG_a"), %%mm2        \n\t"
                        "movq 1(%1, %%"REG_a"), %%mm3        \n\t"
                        "movq -1(%0, %%"REG_a"), %%mm4        \n\t"
                        "movq -1(%1, %%"REG_a"), %%mm5        \n\t"
                        PAVGB" %%mm0, %%mm5                \n\t"
                        PAVGB" %%mm0, %%mm3                \n\t"
                        PAVGB" %%mm0, %%mm5                \n\t"
                        PAVGB" %%mm0, %%mm3                \n\t"
                        PAVGB" %%mm1, %%mm4                \n\t"
                        PAVGB" %%mm1, %%mm2                \n\t"
                        PAVGB" %%mm1, %%mm4                \n\t"
                        PAVGB" %%mm1, %%mm2                \n\t"
                        "movq %%mm5, %%mm7                \n\t"
                        "movq %%mm4, %%mm6                \n\t"
                        "punpcklbw %%mm3, %%mm5                \n\t"
                        "punpckhbw %%mm3, %%mm7                \n\t"
                        "punpcklbw %%mm2, %%mm4                \n\t"
                        "punpckhbw %%mm2, %%mm6                \n\t"
#if 1
                        MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
                        MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
                        MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
                        MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
#else
                        "movq %%mm5, (%2, %%"REG_a", 2)        \n\t"
                        "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
                        "movq %%mm4, (%3, %%"REG_a", 2)        \n\t"
                        "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
#endif
                        "add $8, %%"REG_a"                \n\t"
                        " js 1b                                \n\t"
                        :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
                           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
                           "g" (-mmxSize)
                        : "%"REG_a

                );
#else
                const int mmxSize=1;
#endif
                dst[0        ]= (3*src[0] +   src[srcStride])>>2;
                dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;

                for(x=mmxSize-1; x<srcWidth-1; x++){
                        dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
                        dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
                        dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
                        dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
                }
                dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
                dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

                dst+=dstStride*2;
                src+=srcStride;
        }

        // last line
#if 1
        dst[0]= src[0];

        for(x=0; x<srcWidth-1; x++){
                dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
                dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
        }
        dst[2*srcWidth-1]= src[srcWidth-1];
#else
        for(x=0; x<srcWidth; x++){
                dst[2*x+0]=
                dst[2*x+1]= src[x];
        }
#endif

#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}

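/* The 2x upscaler above uses 3:1 bilinear weights: each output sample mixes the
 * nearest source sample (weight 3) with a neighbouring sample (weight 1).  The
 * MMX2/3DNow path reaches roughly the same weighting by chaining two PAVGB
 * operations, avg(avg(far, near), near) ~= (3*near + far)/4, so it can differ
 * from the scalar path by one LSB because PAVGB rounds up while the C code
 * truncates.  The horizontal kernel used for the first and last lines, for
 * reference:
 *
 *     dst[2*x+1]= (3*src[x] +   src[x+1])>>2;   // closer to src[x]
 *     dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;   // closer to src[x+1]
 */
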
/**
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * chrominance data is only taken from every second line; others are ignored. FIXME: write HQ version
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int srcStride)
{
        unsigned y;
        const unsigned chromWidth= width>>1;
        for(y=0; y<height; y+=2)
        {
#ifdef HAVE_MMX
                asm volatile(
                        "xorl %%eax, %%eax                \n\t"
                        "pcmpeqw %%mm7, %%mm7                \n\t"
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // UYVY UYVY(0)
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(4)
                        "movq %%mm0, %%mm2                \n\t" // UYVY UYVY(0)
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(4)
                        "pand %%mm7, %%mm0                \n\t" // U0V0 U0V0(0)
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(4)
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)

                        MOVNTQ" %%mm2, (%1, %%eax, 2)        \n\t"

                        "movq 16(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(8)
                        "movq 24(%0, %%eax, 4), %%mm2        \n\t" // UYVY UYVY(12)
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(8)
                        "movq %%mm2, %%mm4                \n\t" // UYVY UYVY(12)
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(8)
                        "pand %%mm7, %%mm2                \n\t" // U0V0 U0V0(12)
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
                        "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm3, 8(%1, %%eax, 2)        \n\t"

                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)

                        MOVNTQ" %%mm0, (%3, %%eax)        \n\t"
                        MOVNTQ" %%mm2, (%2, %%eax)        \n\t"

                        "addl $8, %%eax                        \n\t"
                        "cmpl %4, %%eax                        \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
                        : "memory", "%eax"
                );

                ydst += lumStride;
                src  += srcStride;

                asm volatile(
                        "xorl %%eax, %%eax                \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // UYVY UYVY(0)
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(4)
                        "movq 16(%0, %%eax, 4), %%mm2        \n\t" // UYVY UYVY(8)
                        "movq 24(%0, %%eax, 4), %%mm3        \n\t" // UYVY UYVY(12)
                        "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
                        "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm0, (%1, %%eax, 2)        \n\t"
                        MOVNTQ" %%mm2, 8(%1, %%eax, 2)        \n\t"

                        "addl $8, %%eax                        \n\t"
                        "cmpl %4, %%eax                        \n\t"
                        " jb 1b                                \n\t"

                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
                        : "memory", "%eax"
                );
#else
                unsigned i;
                for(i=0; i<chromWidth; i++)
                {
                        udst[i]         = src[4*i+0];
                        ydst[2*i+0]         = src[4*i+1];
                        vdst[i]         = src[4*i+2];
                        ydst[2*i+1]         = src[4*i+3];
                }
                ydst += lumStride;
                src  += srcStride;

                for(i=0; i<chromWidth; i++)
                {
                        ydst[2*i+0]         = src[4*i+1];
                        ydst[2*i+1]         = src[4*i+3];
                }
#endif
                udst += chromStride;
                vdst += chromStride;
                ydst += lumStride;
                src  += srcStride;
        }
#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}

/**
 *
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * chrominance data is only taken from every second line; others are ignored in the C version. FIXME: write HQ version
 */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int srcStride)
{
        unsigned y;
        const unsigned chromWidth= width>>1;
#ifdef HAVE_MMX
        for(y=0; y<height-2; y+=2)
        {
                unsigned i;
                for(i=0; i<2; i++)
                {
                        asm volatile(
                                "mov %2, %%"REG_a"                \n\t"
                                "movq "MANGLE(bgr2YCoeff)", %%mm6        \n\t"
                                "movq "MANGLE(w1111)", %%mm5                \n\t"
                                "pxor %%mm7, %%mm7                \n\t"
                                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
                                ".balign 16                        \n\t"
                                "1:                                \n\t"
                                PREFETCH" 64(%0, %%"REG_b")        \n\t"
                                "movd (%0, %%"REG_b"), %%mm0        \n\t"
                                "movd 3(%0, %%"REG_b"), %%mm1        \n\t"
                                "punpcklbw %%mm7, %%mm0                \n\t"
                                "punpcklbw %%mm7, %%mm1                \n\t"
                                "movd 6(%0, %%"REG_b"), %%mm2        \n\t"
                                "movd 9(%0, %%"REG_b"), %%mm3        \n\t"
                                "punpcklbw %%mm7, %%mm2                \n\t"
                                "punpcklbw %%mm7, %%mm3                \n\t"
                                "pmaddwd %%mm6, %%mm0                \n\t"
                                "pmaddwd %%mm6, %%mm1                \n\t"
                                "pmaddwd %%mm6, %%mm2                \n\t"
                                "pmaddwd %%mm6, %%mm3                \n\t"
#ifndef FAST_BGR2YV12
                                "psrad $8, %%mm0                \n\t"
                                "psrad $8, %%mm1                \n\t"
                                "psrad $8, %%mm2                \n\t"
                                "psrad $8, %%mm3                \n\t"
#endif
                                "packssdw %%mm1, %%mm0                \n\t"
                                "packssdw %%mm3, %%mm2                \n\t"
                                "pmaddwd %%mm5, %%mm0                \n\t"
                                "pmaddwd %%mm5, %%mm2                \n\t"
                                "packssdw %%mm2, %%mm0                \n\t"
                                "psraw $7, %%mm0                \n\t"

                                "movd 12(%0, %%"REG_b"), %%mm4        \n\t"
                                "movd 15(%0, %%"REG_b"), %%mm1        \n\t"
                                "punpcklbw %%mm7, %%mm4                \n\t"
                                "punpcklbw %%mm7, %%mm1                \n\t"
                                "movd 18(%0, %%"REG_b"), %%mm2        \n\t"
                                "movd 21(%0, %%"REG_b"), %%mm3        \n\t"
                                "punpcklbw %%mm7, %%mm2                \n\t"
                                "punpcklbw %%mm7, %%mm3                \n\t"
                                "pmaddwd %%mm6, %%mm4                \n\t"
                                "pmaddwd %%mm6, %%mm1                \n\t"
                                "pmaddwd %%mm6, %%mm2                \n\t"
                                "pmaddwd %%mm6, %%mm3                \n\t"
#ifndef FAST_BGR2YV12
                                "psrad $8, %%mm4                \n\t"
                                "psrad $8, %%mm1                \n\t"
                                "psrad $8, %%mm2                \n\t"
                                "psrad $8, %%mm3                \n\t"
#endif
                                "packssdw %%mm1, %%mm4                \n\t"
                                "packssdw %%mm3, %%mm2                \n\t"
                                "pmaddwd %%mm5, %%mm4                \n\t"
                                "pmaddwd %%mm5, %%mm2                \n\t"
                                "add $24, %%"REG_b"                \n\t"
                                "packssdw %%mm2, %%mm4                \n\t"
                                "psraw $7, %%mm4                \n\t"

                                "packuswb %%mm4, %%mm0                \n\t"
                                "paddusb "MANGLE(bgr2YOffset)", %%mm0        \n\t"

                                MOVNTQ" %%mm0, (%1, %%"REG_a")        \n\t"
                                "add $8, %%"REG_a"                \n\t"
                                " js 1b                                \n\t"
                                : : "r" (src+width*3), "r" (ydst+width), "g" ((long)-width)
                                : "%"REG_a, "%"REG_b
                        );
                        ydst += lumStride;
                        src  += srcStride;
                }
                src -= srcStride*2;
                asm volatile(
                        "mov %4, %%"REG_a"                \n\t"
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
                        "movq "MANGLE(bgr2UCoeff)", %%mm6        \n\t"
                        "pxor %%mm7, %%mm7                \n\t"
                        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
                        "add %%"REG_b", %%"REG_b"        \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%"REG_b")        \n\t"
                        PREFETCH" 64(%1, %%"REG_b")        \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                        "movq (%0, %%"REG_b"), %%mm0        \n\t"
                        "movq (%1, %%"REG_b"), %%mm1        \n\t"
                        "movq 6(%0, %%"REG_b"), %%mm2        \n\t"
                        "movq 6(%1, %%"REG_b"), %%mm3        \n\t"
                        PAVGB" %%mm1, %%mm0                \n\t"
                        PAVGB" %%mm3, %%mm2                \n\t"
                        "movq %%mm0, %%mm1                \n\t"
                        "movq %%mm2, %%mm3                \n\t"
                        "psrlq $24, %%mm0                \n\t"
                        "psrlq $24, %%mm2                \n\t"
                        PAVGB" %%mm1, %%mm0                \n\t"
                        PAVGB" %%mm3, %%mm2                \n\t"
                        "punpcklbw %%mm7, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
#else
                        "movd (%0, %%"REG_b"), %%mm0        \n\t"
                        "movd (%1, %%"REG_b"), %%mm1        \n\t"
                        "movd 3(%0, %%"REG_b"), %%mm2        \n\t"
                        "movd 3(%1, %%"REG_b"), %%mm3        \n\t"
                        "punpcklbw %%mm7, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm1                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "punpcklbw %%mm7, %%mm3                \n\t"
                        "paddw %%mm1, %%mm0                \n\t"
                        "paddw %%mm3, %%mm2                \n\t"
                        "paddw %%mm2, %%mm0                \n\t"
                        "movd 6(%0, %%"REG_b"), %%mm4        \n\t"
                        "movd 6(%1, %%"REG_b"), %%mm1        \n\t"
                        "movd 9(%0, %%"REG_b"), %%mm2        \n\t"
                        "movd 9(%1, %%"REG_b"), %%mm3        \n\t"
                        "punpcklbw %%mm7, %%mm4                \n\t"
                        "punpcklbw %%mm7, %%mm1                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "punpcklbw %%mm7, %%mm3                \n\t"
                        "paddw %%mm1, %%mm4                \n\t"
                        "paddw %%mm3, %%mm2                \n\t"
                        "paddw %%mm4, %%mm2                \n\t"
                        "psrlw $2, %%mm0                \n\t"
                        "psrlw $2, %%mm2                \n\t"
#endif
                        "movq "MANGLE(bgr2VCoeff)", %%mm1        \n\t"
                        "movq "MANGLE(bgr2VCoeff)", %%mm3        \n\t"

                        "pmaddwd %%mm0, %%mm1                \n\t"
                        "pmaddwd %%mm2, %%mm3                \n\t"
                        "pmaddwd %%mm6, %%mm0                \n\t"
                        "pmaddwd %%mm6, %%mm2                \n\t"
#ifndef FAST_BGR2YV12
                        "psrad $8, %%mm0                \n\t"
                        "psrad $8, %%mm1                \n\t"
                        "psrad $8, %%mm2                \n\t"
                        "psrad $8, %%mm3                \n\t"
#endif
                        "packssdw %%mm2, %%mm0                \n\t"
                        "packssdw %%mm3, %%mm1                \n\t"
                        "pmaddwd %%mm5, %%mm0                \n\t"
                        "pmaddwd %%mm5, %%mm1                \n\t"
                        "packssdw %%mm1, %%mm0                \n\t" // V1 V0 U1 U0
                        "psraw $7, %%mm0                \n\t"

#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                        "movq 12(%0, %%"REG_b"), %%mm4        \n\t"
                        "movq 12(%1, %%"REG_b"), %%mm1        \n\t"
                        "movq 18(%0, %%"REG_b"), %%mm2        \n\t"
                        "movq 18(%1, %%"REG_b"), %%mm3        \n\t"
                        PAVGB" %%mm1, %%mm4                \n\t"
                        PAVGB" %%mm3, %%mm2                \n\t"
                        "movq %%mm4, %%mm1                \n\t"
                        "movq %%mm2, %%mm3                \n\t"
                        "psrlq $24, %%mm4                \n\t"
                        "psrlq $24, %%mm2                \n\t"
                        PAVGB" %%mm1, %%mm4                \n\t"
                        PAVGB" %%mm3, %%mm2                \n\t"
                        "punpcklbw %%mm7, %%mm4                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
#else
                        "movd 12(%0, %%"REG_b"), %%mm4        \n\t"
                        "movd 12(%1, %%"REG_b"), %%mm1        \n\t"
                        "movd 15(%0, %%"REG_b"), %%mm2        \n\t"
                        "movd 15(%1, %%"REG_b"), %%mm3        \n\t"
                        "punpcklbw %%mm7, %%mm4                \n\t"
                        "punpcklbw %%mm7, %%mm1                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "punpcklbw %%mm7, %%mm3                \n\t"
                        "paddw %%mm1, %%mm4                \n\t"
                        "paddw %%mm3, %%mm2                \n\t"
                        "paddw %%mm2, %%mm4                \n\t"
                        "movd 18(%0, %%"REG_b"), %%mm5        \n\t"
                        "movd 18(%1, %%"REG_b"), %%mm1        \n\t"
                        "movd 21(%0, %%"REG_b"), %%mm2        \n\t"
                        "movd 21(%1, %%"REG_b"), %%mm3        \n\t"
                        "punpcklbw %%mm7, %%mm5                \n\t"
                        "punpcklbw %%mm7, %%mm1                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "punpcklbw %%mm7, %%mm3                \n\t"
                        "paddw %%mm1, %%mm5                \n\t"
                        "paddw %%mm3, %%mm2                \n\t"
                        "paddw %%mm5, %%mm2                \n\t"
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
                        "psrlw $2, %%mm4                \n\t"
                        "psrlw $2, %%mm2                \n\t"
#endif
                        "movq "MANGLE(bgr2VCoeff)", %%mm1        \n\t"
                        "movq "MANGLE(bgr2VCoeff)", %%mm3        \n\t"

                        "pmaddwd %%mm4, %%mm1                \n\t"
                        "pmaddwd %%mm2, %%mm3                \n\t"
                        "pmaddwd %%mm6, %%mm4                \n\t"
                        "pmaddwd %%mm6, %%mm2                \n\t"
#ifndef FAST_BGR2YV12
                        "psrad $8, %%mm4                \n\t"
                        "psrad $8, %%mm1                \n\t"
                        "psrad $8, %%mm2                \n\t"
                        "psrad $8, %%mm3                \n\t"
#endif
                        "packssdw %%mm2, %%mm4                \n\t"
                        "packssdw %%mm3, %%mm1                \n\t"
                        "pmaddwd %%mm5, %%mm4                \n\t"
                        "pmaddwd %%mm5, %%mm1                \n\t"
                        "add $24, %%"REG_b"                \n\t"
                        "packssdw %%mm1, %%mm4                \n\t" // V3 V2 U3 U2
                        "psraw $7, %%mm4                \n\t"

                        "movq %%mm0, %%mm1                \n\t"
                        "punpckldq %%mm4, %%mm0                \n\t"
                        "punpckhdq %%mm4, %%mm1                \n\t"
                        "packsswb %%mm1, %%mm0                \n\t"
                        "paddb "MANGLE(bgr2UVOffset)", %%mm0        \n\t"
                        "movd %%mm0, (%2, %%"REG_a")        \n\t"
                        "punpckhdq %%mm0, %%mm0                \n\t"
                        "movd %%mm0, (%3, %%"REG_a")        \n\t"
                        "add $4, %%"REG_a"                \n\t"
                        " js 1b                                \n\t"
                        : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" ((long)-chromWidth)
                        : "%"REG_a, "%"REG_b
                );

                udst += chromStride;
                vdst += chromStride;
                src  += srcStride*2;
        }

        asm volatile(   EMMS" \n\t"
                        SFENCE" \n\t"
                        :::"memory");
#else
        y=0;
#endif
        for(; y<height; y+=2)
        {
                unsigned i;
                for(i=0; i<chromWidth; i++)
                {
                        unsigned int b= src[6*i+0];
                        unsigned int g= src[6*i+1];
                        unsigned int r= src[6*i+2];

                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
                        unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
                        unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;

                        udst[i]         = U;
                        vdst[i]         = V;
                        ydst[2*i]         = Y;

                        b= src[6*i+3];
                        g= src[6*i+4];
                        r= src[6*i+5];

                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
                        ydst[2*i+1]         = Y;
                }
                ydst += lumStride;
                src  += srcStride;

                for(i=0; i<chromWidth; i++)
                {
                        unsigned int b= src[6*i+0];
                        unsigned int g= src[6*i+1];
                        unsigned int r= src[6*i+2];

                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;

                        ydst[2*i]         = Y;

                        b= src[6*i+3];
                        g= src[6*i+4];
                        r= src[6*i+5];

                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
                        ydst[2*i+1]         = Y;
                }
                udst += chromStride;
                vdst += chromStride;
                ydst += lumStride;
                src  += srcStride;
        }
}

void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2394
                            unsigned width, unsigned height, int src1Stride,
2395
                            int src2Stride, int dstStride){
2396
        unsigned h;
2397

    
2398
        for(h=0; h < height; h++)
2399
        {
2400
                unsigned w;
2401

    
2402
#ifdef HAVE_MMX
2403
#ifdef HAVE_SSE2
                asm(
                        "xor %%"REG_a", %%"REG_a"       \n\t"
                        "1:                             \n\t"
                        PREFETCH" 64(%1, %%"REG_a")     \n\t"
                        PREFETCH" 64(%2, %%"REG_a")     \n\t"
                        "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
                        "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
                        "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
                        "punpcklbw %%xmm2, %%xmm0       \n\t"
                        "punpckhbw %%xmm2, %%xmm1       \n\t"
                        "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
                        "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
                        "add $16, %%"REG_a"             \n\t"
                        "cmp %3, %%"REG_a"              \n\t"
                        " jb 1b                         \n\t"
                        ::"r"(dest), "r"(src1), "r"(src2), "r" ((long)width-15)
                        : "memory", "%"REG_a""
                );
#else
                asm(
                        "xor %%"REG_a", %%"REG_a"       \n\t"
                        "1:                             \n\t"
                        PREFETCH" 64(%1, %%"REG_a")     \n\t"
                        PREFETCH" 64(%2, %%"REG_a")     \n\t"
                        "movq (%1, %%"REG_a"), %%mm0    \n\t"
                        "movq 8(%1, %%"REG_a"), %%mm2   \n\t"
                        "movq %%mm0, %%mm1              \n\t"
                        "movq %%mm2, %%mm3              \n\t"
                        "movq (%2, %%"REG_a"), %%mm4    \n\t"
                        "movq 8(%2, %%"REG_a"), %%mm5   \n\t"
                        "punpcklbw %%mm4, %%mm0         \n\t"
                        "punpckhbw %%mm4, %%mm1         \n\t"
                        "punpcklbw %%mm5, %%mm2         \n\t"
                        "punpckhbw %%mm5, %%mm3         \n\t"
                        MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
                        MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
                        MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
                        MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
                        "add $16, %%"REG_a"             \n\t"
                        "cmp %3, %%"REG_a"              \n\t"
                        " jb 1b                         \n\t"
                        ::"r"(dest), "r"(src1), "r"(src2), "r" ((long)width-15)
                        : "memory", "%"REG_a
                );
#endif
                for(w= (width&(~15)); w < width; w++)
                {
                        dest[2*w+0] = src1[w];
                        dest[2*w+1] = src2[w];
                }
#else
                for(w=0; w < width; w++)
                {
                        dest[2*w+0] = src1[w];
                        dest[2*w+1] = src2[w];
                }
#endif
                dest += dstStride;
                src1 += src1Stride;
                src2 += src2Stride;
        }
#ifdef HAVE_MMX
        asm(
                EMMS" \n\t"
                SFENCE" \n\t"
                ::: "memory"
                );
#endif
}
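
/*
 * Usage sketch (illustrative only; the buffer names, W and H below are
 * hypothetical and not defined in this file): interleaving two quarter-size
 * chroma planes into one packed UV plane, row by row:
 *
 *     uint8_t u[W/2 * H/2], v[W/2 * H/2], uv[W * H/2];
 *     RENAME(interleaveBytes)(u, v, uv, W/2, H/2, W/2, W/2, W);
 *
 * Each destination row h then holds u[0],v[0],u[1],v[1],... of source row h,
 * exactly as the plain C fallback above spells out.
 */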

static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                        uint8_t *dst1, uint8_t *dst2,
                        unsigned width, unsigned height,
                        int srcStride1, int srcStride2,
                        int dstStride1, int dstStride2)
{
    unsigned int y,x,h;
    int w;
    w=width/2; h=height/2;
#ifdef HAVE_MMX
    asm volatile(
        PREFETCH" %0\n\t"
        PREFETCH" %1\n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    for(y=0;y<h;y++){
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x=0;
#ifdef HAVE_MMX
        for(;x<w-31;x+=32)
        {
            asm volatile(
                PREFETCH" 32%1\n\t"
                "movq      %1, %%mm0\n\t"
                "movq      8%1, %%mm2\n\t"
                "movq      16%1, %%mm4\n\t"
                "movq      24%1, %%mm6\n\t"
                "movq      %%mm0, %%mm1\n\t"
                "movq      %%mm2, %%mm3\n\t"
                "movq      %%mm4, %%mm5\n\t"
                "movq      %%mm6, %%mm7\n\t"
                "punpcklbw %%mm0, %%mm0\n\t"
                "punpckhbw %%mm1, %%mm1\n\t"
                "punpcklbw %%mm2, %%mm2\n\t"
                "punpckhbw %%mm3, %%mm3\n\t"
                "punpcklbw %%mm4, %%mm4\n\t"
                "punpckhbw %%mm5, %%mm5\n\t"
                "punpcklbw %%mm6, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm7\n\t"
                MOVNTQ"    %%mm0, %0\n\t"
                MOVNTQ"    %%mm1, 8%0\n\t"
                MOVNTQ"    %%mm2, 16%0\n\t"
                MOVNTQ"    %%mm3, 24%0\n\t"
                MOVNTQ"    %%mm4, 32%0\n\t"
                MOVNTQ"    %%mm5, 40%0\n\t"
                MOVNTQ"    %%mm6, 48%0\n\t"
                MOVNTQ"    %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
#endif
        for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    for(y=0;y<h;y++){
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
#ifdef HAVE_MMX
        for(;x<w-31;x+=32)
        {
            asm volatile(
                PREFETCH" 32%1\n\t"
                "movq      %1, %%mm0\n\t"
                "movq      8%1, %%mm2\n\t"
                "movq      16%1, %%mm4\n\t"
                "movq      24%1, %%mm6\n\t"
                "movq      %%mm0, %%mm1\n\t"
                "movq      %%mm2, %%mm3\n\t"
                "movq      %%mm4, %%mm5\n\t"
                "movq      %%mm6, %%mm7\n\t"
                "punpcklbw %%mm0, %%mm0\n\t"
                "punpckhbw %%mm1, %%mm1\n\t"
                "punpcklbw %%mm2, %%mm2\n\t"
                "punpckhbw %%mm3, %%mm3\n\t"
                "punpcklbw %%mm4, %%mm4\n\t"
                "punpckhbw %%mm5, %%mm5\n\t"
                "punpcklbw %%mm6, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm7\n\t"
                MOVNTQ"    %%mm0, %0\n\t"
                MOVNTQ"    %%mm1, 8%0\n\t"
                MOVNTQ"    %%mm2, 16%0\n\t"
                MOVNTQ"    %%mm3, 24%0\n\t"
                MOVNTQ"    %%mm4, 32%0\n\t"
                MOVNTQ"    %%mm5, 40%0\n\t"
                MOVNTQ"    %%mm6, 48%0\n\t"
                MOVNTQ"    %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
#endif
        for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#ifdef HAVE_MMX
        asm(
                EMMS" \n\t"
                SFENCE" \n\t"
                ::: "memory"
                );
#endif
}
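
/*
 * Note on the loops above: each destination row doubles the source bytes
 * horizontally (punpck(l/h)bw of a register with itself writes every byte
 * twice), and srcStride*(y>>1) reuses every source row for two destination
 * rows, i.e. a plain 2x nearest-neighbour chroma upscale. As an illustration
 * of the indexing only (the numbers are not required by this file): for a
 * 320x240 luma picture with YVU9-style 80x60 chroma planes, width=160 and
 * height=240 read 80x60 bytes per source plane and write 160x120 bytes per
 * destination plane, matching YV12-style chroma.
 */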

static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                        uint8_t *dst,
                        unsigned width, unsigned height,
                        int srcStride1, int srcStride2,
                        int srcStride3, int dstStride)
{
    unsigned long y,x,w,h;
    w=width/2; h=height;
    for(y=0;y<h;y++){
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
#ifdef HAVE_MMX
        for(;x<w-7;x+=8)
        {
            asm volatile(
                PREFETCH" 32(%1, %0)\n\t"
                PREFETCH" 32(%2, %0)\n\t"
                PREFETCH" 32(%3, %0)\n\t"
                "movq      (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq      (%2, %0), %%mm1\n\t"    /* U0U1U2U3U4U5U6U7 */
                "movq      (%3, %0), %%mm2\n\t"    /* V0V1V2V3V4V5V6V7 */
                "movq      %%mm0, %%mm3\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq      %%mm1, %%mm4\n\t"       /* U0U1U2U3U4U5U6U7 */
                "movq      %%mm2, %%mm5\n\t"       /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1\n\t"       /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2\n\t"       /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4\n\t"       /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5\n\t"       /* V4V4 V5V5 V6V6 V7V7 */

                "movq      %%mm1, %%mm6\n\t"
                "punpcklbw %%mm2, %%mm1\n\t"       /* U0V0 U0V0 U1V1 U1V1 */
                "punpcklbw %%mm1, %%mm0\n\t"       /* Y0U0 Y1V0 Y2U0 Y3V0 */
                "punpckhbw %%mm1, %%mm3\n\t"       /* Y4U1 Y5V1 Y6U1 Y7V1 */
                MOVNTQ"    %%mm0, (%4, %0, 8)\n\t"
                MOVNTQ"    %%mm3, 8(%4, %0, 8)\n\t"

                "punpckhbw %%mm2, %%mm6\n\t"       /* U2V2 U2V2 U3V3 U3V3 */
                "movq      8(%1, %0, 4), %%mm0\n\t"
                "movq      %%mm0, %%mm3\n\t"
                "punpcklbw %%mm6, %%mm0\n\t"       /* Y U2 Y V2 Y U2 Y V2 */
                "punpckhbw %%mm6, %%mm3\n\t"       /* Y U3 Y V3 Y U3 Y V3 */
                MOVNTQ"    %%mm0, 16(%4, %0, 8)\n\t"
                MOVNTQ"    %%mm3, 24(%4, %0, 8)\n\t"

                "movq      %%mm4, %%mm6\n\t"
                "movq      16(%1, %0, 4), %%mm0\n\t"
                "movq      %%mm0, %%mm3\n\t"
                "punpcklbw %%mm5, %%mm4\n\t"
                "punpcklbw %%mm4, %%mm0\n\t"       /* Y U4 Y V4 Y U4 Y V4 */
                "punpckhbw %%mm4, %%mm3\n\t"       /* Y U5 Y V5 Y U5 Y V5 */
                MOVNTQ"    %%mm0, 32(%4, %0, 8)\n\t"
                MOVNTQ"    %%mm3, 40(%4, %0, 8)\n\t"

                "punpckhbw %%mm5, %%mm6\n\t"
                "movq      24(%1, %0, 4), %%mm0\n\t"
                "movq      %%mm0, %%mm3\n\t"
                "punpcklbw %%mm6, %%mm0\n\t"       /* Y U6 Y V6 Y U6 Y V6 */
                "punpckhbw %%mm6, %%mm3\n\t"       /* Y U7 Y V7 Y U7 Y V7 */
                MOVNTQ"    %%mm0, 48(%4, %0, 8)\n\t"
                MOVNTQ"    %%mm3, 56(%4, %0, 8)\n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
#endif
        for(; x<w; x++)
        {
            const int x2= x<<2;
            d[8*x+0]=yp[x2];
            d[8*x+1]=up[x];
            d[8*x+2]=yp[x2+1];
            d[8*x+3]=vp[x];
            d[8*x+4]=yp[x2+2];
            d[8*x+5]=up[x];
            d[8*x+6]=yp[x2+3];
            d[8*x+7]=vp[x];
        }
    }
#ifdef HAVE_MMX
        asm(
                EMMS" \n\t"
                SFENCE" \n\t"
                ::: "memory"
                );
#endif
}
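
/*
 * Packing reference for the routine above, matching its scalar tail loop:
 * for every four luma bytes yp[4x..4x+3] plus one byte each from up[] and
 * vp[], eight output bytes are written in the order
 *
 *     Y0 U  Y1 V  Y2 U  Y3 V
 *
 * so each chroma sample is reused for two output pixel pairs horizontally
 * and, via srcStride*(y>>2), for four output rows vertically. The MMX block
 * produces the same byte order, 64 output bytes (32 luma samples) per
 * iteration.
 */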