/* ffmpeg / postproc / rgb2rgb.c @ 99969243 */

/*
 *  rgb2rgb.c, software RGB to RGB converter,
 *  plus: software PAL8 to RGB converter
 *        software YUV to YUV converter
 *        software YUV to RGB converter
 *  Written by Nick Kurshev.
 *  Palette stuff & YUV stuff by Michael.
 */
#include <inttypes.h>
#include "../config.h"
#include "rgb2rgb.h"
#include "../mmx_defs.h"

#ifdef HAVE_MMX
static const uint64_t mask32b  __attribute__((aligned(8))) = 0x000000FF000000FFULL;
static const uint64_t mask32g  __attribute__((aligned(8))) = 0x0000FF000000FF00ULL;
static const uint64_t mask32r  __attribute__((aligned(8))) = 0x00FF000000FF0000ULL;
static const uint64_t mask32   __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;
static const uint64_t mask24l  __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
static const uint64_t mask24h  __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
static const uint64_t mask24hh   __attribute__((aligned(8))) = 0xffff000000000000ULL;
static const uint64_t mask24hhh  __attribute__((aligned(8))) = 0xffffffff00000000ULL;
static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xffffffffffff0000ULL;
static const uint64_t mask15b  __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111  xxB */
static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000  RGx */
static const uint64_t mask15s  __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL;
static const uint64_t red_16mask   __attribute__((aligned(8))) = 0x0000f8000000f800ULL;
static const uint64_t green_16mask __attribute__((aligned(8))) = 0x000007e0000007e0ULL;
static const uint64_t blue_16mask  __attribute__((aligned(8))) = 0x0000001f0000001fULL;
static const uint64_t red_15mask   __attribute__((aligned(8))) = 0x00007c0000007c00ULL;
static const uint64_t green_15mask __attribute__((aligned(8))) = 0x000003e0000003e0ULL;
static const uint64_t blue_15mask  __attribute__((aligned(8))) = 0x0000001f0000001fULL;
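/* Each mask is duplicated across both 32-bit halves of the 64-bit constant
   because the conversion loops below handle two pixels per MMX register;
   the *_16mask constants select the 5/6/5 fields and the *_15mask constants
   the 5/5/5 fields within each dword. */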
#if 0
static volatile uint64_t __attribute__((aligned(8))) b5Dither;
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
static volatile uint64_t __attribute__((aligned(8))) r5Dither;

static uint64_t __attribute__((aligned(8))) dither4[2]={
        0x0103010301030103LL,
        0x0200020002000200LL,};

static uint64_t __attribute__((aligned(8))) dither8[2]={
        0x0602060206020602LL,
        0x0004000400040004LL,};
#endif
#endif

void rgb24to32(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end;
#ifdef HAVE_MMX
  uint8_t *mm_end;
#endif
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
  mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*4))*(MMREG_SIZE*4));
  __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
  if(mm_end == end) mm_end -= MMREG_SIZE*4;
  while(s < mm_end)
  {
    __asm __volatile(
        PREFETCH"        32%1\n\t"
        "movd        %1, %%mm0\n\t"
        "punpckldq 3%1, %%mm0\n\t"
        "movd        6%1, %%mm1\n\t"
        "punpckldq 9%1, %%mm1\n\t"
        "movd        12%1, %%mm2\n\t"
        "punpckldq 15%1, %%mm2\n\t"
        "movd        18%1, %%mm3\n\t"
        "punpckldq 21%1, %%mm3\n\t"
        "pand        %%mm7, %%mm0\n\t"
        "pand        %%mm7, %%mm1\n\t"
        "pand        %%mm7, %%mm2\n\t"
        "pand        %%mm7, %%mm3\n\t"
        MOVNTQ"        %%mm0, %0\n\t"
        MOVNTQ"        %%mm1, 8%0\n\t"
        MOVNTQ"        %%mm2, 16%0\n\t"
        MOVNTQ"        %%mm3, 24%0"
        :"=m"(*dest)
        :"m"(*s)
        :"memory");
    dest += 32;
    s += 24;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  while(s < end)
  {
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = 0;
  }
}

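/*
 * Usage sketch (illustrative, not part of the original file): dst must
 * provide 4 bytes per pixel, i.e. (src_size/3)*4 bytes; the 4th byte of
 * each output pixel is written as 0.
 */
#if 0
static void rgb24to32_example(void)
{
        static const uint8_t in[6] = {0x10,0x20,0x30, 0x40,0x50,0x60}; /* 2 rgb24 pixels */
        uint8_t out[8];                                                /* 2 rgb32 pixels */
        rgb24to32(in, out, sizeof(in));
        /* out == {0x10,0x20,0x30,0x00, 0x40,0x50,0x60,0x00} */
}
#endif
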
void rgb32to24(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end;
#ifdef HAVE_MMX
  uint8_t *mm_end;
#endif
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
  mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*4))*(MMREG_SIZE*4));
  while(s < mm_end)
  {
    __asm __volatile(
        PREFETCH"        32%1\n\t"
        "movq        %1, %%mm0\n\t"
        "movq        8%1, %%mm1\n\t"
        "movq        16%1, %%mm4\n\t"
        "movq        24%1, %%mm5\n\t"
        "movq        %%mm0, %%mm2\n\t"
        "movq        %%mm1, %%mm3\n\t"
        "movq        %%mm4, %%mm6\n\t"
        "movq        %%mm5, %%mm7\n\t"
        "psrlq        $8, %%mm2\n\t"
        "psrlq        $8, %%mm3\n\t"
        "psrlq        $8, %%mm6\n\t"
        "psrlq        $8, %%mm7\n\t"
        "pand        %2, %%mm0\n\t"
        "pand        %2, %%mm1\n\t"
        "pand        %2, %%mm4\n\t"
        "pand        %2, %%mm5\n\t"
        "pand        %3, %%mm2\n\t"
        "pand        %3, %%mm3\n\t"
        "pand        %3, %%mm6\n\t"
        "pand        %3, %%mm7\n\t"
        "por        %%mm2, %%mm0\n\t"
        "por        %%mm3, %%mm1\n\t"
        "por        %%mm6, %%mm4\n\t"
        "por        %%mm7, %%mm5\n\t"

        "movq        %%mm1, %%mm2\n\t"
        "movq        %%mm4, %%mm3\n\t"
        "psllq        $48, %%mm2\n\t"
        "psllq        $32, %%mm3\n\t"
        "pand        %4, %%mm2\n\t"
        "pand        %5, %%mm3\n\t"
        "por        %%mm2, %%mm0\n\t"
        "psrlq        $16, %%mm1\n\t"
        "psrlq        $32, %%mm4\n\t"
        "psllq        $16, %%mm5\n\t"
        "por        %%mm3, %%mm1\n\t"
        "pand        %6, %%mm5\n\t"
        "por        %%mm5, %%mm4\n\t"

        MOVNTQ"        %%mm0, %0\n\t"
        MOVNTQ"        %%mm1, 8%0\n\t"
        MOVNTQ"        %%mm4, 16%0"
        :"=m"(*dest)
        :"m"(*s),"m"(mask24l),
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
    dest += 24;
    s += 32;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  while(s < end)
  {
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = *s++;
    s++;
  }
}

/*
 Original by Strepto/Astral
 ported to GCC & bugfixed by A'rpi
 MMX2, 3DNow! optimization by Nick Kurshev
 32-bit C version and the and&add trick by Michael Niedermayer
*/
void rgb15to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
  register const char* s=src+src_size;
  register char* d=dst+src_size;
  register int offs=-src_size;
  __asm __volatile(PREFETCH"        %0"::"m"(*(s+offs)));
  __asm __volatile(
        "movq        %0, %%mm4\n\t"
        ::"m"(mask15s));
  while(offs<0)
  {
        __asm __volatile(
                PREFETCH"        32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        8%1, %%mm2\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm2, %%mm3\n\t"
                "pand        %%mm4, %%mm0\n\t"
                "pand        %%mm4, %%mm2\n\t"
                "paddw        %%mm1, %%mm0\n\t"
                "paddw        %%mm3, %%mm2\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm2, 8%0"
                :"=m"(*(d+offs))
                :"m"(*(s+offs))
                );
        offs+=16;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#else
#if 0
   const uint16_t *s1=( uint16_t * )src;
   uint16_t *d1=( uint16_t * )dst;
   const uint16_t *e=(const uint16_t *)((const uint8_t *)s1+src_size);
   while( s1<e ){
     register int x=*( s1++ );
     /* rrrrrggggggbbbbb
        0rrrrrgggggbbbbb
        0111 1111 1110 0000=0x7FE0
        0000 0000 0001 1111=0x001F */
     *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 );
   }
#else
        const unsigned *s1=( unsigned * )src;
        unsigned *d1=( unsigned * )dst;
        int i;
        int size= src_size>>2;
        for(i=0; i<size; i++)
        {
                register int x= s1[i];
//                d1[i] = x + (x&0x7FE07FE0); // faster, but needs the MSB to be 0, which might not always be true
                d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        }
#endif
#endif
}

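/*
 * Why the and&add trick above works (illustrative note): for one 15-bit
 * pixel x = 0RRRRRGGGGGBBBBB,
 *
 *      (x & 0x7FFF) + (x & 0x7FE0)
 *   == (x & 0x001F) + 2*(x & 0x7FE0)
 *   == (x & 0x001F) | ((x & 0x7FE0) << 1)
 *
 * which is the 565 layout RRRRRGGGGG0BBBBB (green widened to 6 bits with a
 * zero LSB).  Masking with 0x7FFF7FFF clears bit 15 of each 16-bit half, so
 * the low pixel's sum (at most 0x7FFF + 0x7FE0 = 0xFFDF) cannot carry into
 * the high pixel: one 32-bit add converts two pixels.
 */
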
/**
 * Palette is assumed to contain bgr32.
 */
void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
        unsigned i;
        for(i=0; i<num_pixels; i++)
                ((unsigned *)dst)[i] = ((unsigned *)palette)[ src[i] ];
}

/**
 * Palette is assumed to contain bgr32.
 */
void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
        unsigned i;
/*
        Writes 1 byte too much, and might cause alignment issues on some architectures:
        for(i=0; i<num_pixels; i++)
                *(unsigned *)(&dst[i*3]) = ((unsigned *)palette)[ src[i] ];
*/
        for(i=0; i<num_pixels; i++)
        {
                //FIXME slow?
                dst[0]= palette[ src[i]*4+0 ];
                dst[1]= palette[ src[i]*4+1 ];
                dst[2]= palette[ src[i]*4+2 ];
                dst+= 3;
        }
}

void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
        const uint8_t *s = src;
        const uint8_t *end,*mm_end;
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
        mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $5, %%mm1\n\t"
                "psrlq        $5, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 16;
        }
        while(s < end)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                s++; // skip the unused 4th byte of each bgr32 pixel
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#else
        unsigned j,i,num_pixels=src_size/4;
        uint16_t *d = (uint16_t *)dst;
        for(i=0,j=0; j<num_pixels; i+=4,j++)
        {
                const int b= src[i+0];
                const int g= src[i+1];
                const int r= src[i+2];
                d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
#endif
}

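/*
 * Worked example for the scalar 565 packing above: with byte values b, g, r,
 * (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8) places blue's top 5 bits at 4-0,
 * green's top 6 bits at 10-5 and red's top 5 bits at 15-11.  E.g.
 * b=0xFF, g=0x00, r=0xFF packs to 0xF81F.
 */
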
void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
        const uint8_t *s = src;
        const uint8_t *end,*mm_end;
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
        mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $9, %%mm2\n\t"
                "psrlq        $9, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 16;
        }
        while(s < end)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                s++; // skip the unused 4th byte of each bgr32 pixel
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#else
        unsigned j,i,num_pixels=src_size/4;
        uint16_t *d = (uint16_t *)dst;
        for(i=0,j=0; j<num_pixels; i+=4,j++)
        {
                const int b= src[i+0];
                const int g= src[i+1];
                const int r= src[i+2];
                d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
#endif
}

void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
        const uint8_t *s = src;
        const uint8_t *end,*mm_end;
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
        mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        if(mm_end == end) mm_end -= MMREG_SIZE*2;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $5, %%mm1\n\t"
                "psrlq        $5, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 12;
        }
        while(s < end)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#else
        unsigned j,i,num_pixels=src_size/3;
        uint16_t *d = (uint16_t *)dst;
        for(i=0,j=0; j<num_pixels; i+=3,j++)
        {
                const int b= src[i+0];
                const int g= src[i+1];
                const int r= src[i+2];
                d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
#endif
}

void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
#ifdef HAVE_MMX
        const uint8_t *s = src;
        const uint8_t *end,*mm_end;
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
        mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        if(mm_end == end) mm_end -= MMREG_SIZE*2;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $9, %%mm2\n\t"
                "psrlq        $9, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 12;
        }
        while(s < end)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#else
        unsigned j,i,num_pixels=src_size/3;
        uint16_t *d = (uint16_t *)dst;
        for(i=0,j=0; j<num_pixels; i+=3,j++)
        {
                const int b= src[i+0];
                const int g= src[i+1];
                const int r= src[i+2];
                d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
#endif
}

/**
 * Palette is assumed to contain bgr16; see rgb32to16 to convert the palette.
 */
void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
        unsigned i;
        for(i=0; i<num_pixels; i++)
                ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
}

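/*
 * Illustrative sketch (hypothetical helper, not in this file): per the
 * comment above, the bgr32 palette is converted once with rgb32to16 and
 * then indexed per pixel.
 */
#if 0
static void pal8_to_rgb16_example(const uint8_t *src8, uint8_t *dst16,
                                  unsigned num_pixels, const uint8_t *pal_bgr32)
{
        uint8_t pal16[256*2];                     /* 256 bgr16 entries */
        rgb32to16(pal_bgr32, pal16, 256*4);       /* convert the palette once */
        palette8torgb16(src8, dst16, num_pixels, pal16);
}
#endif
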
/**
 * Palette is assumed to contain bgr15; see rgb32to15 to convert the palette.
 */
void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
{
        unsigned i;
        for(i=0; i<num_pixels; i++)
                ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
}

void rgb32tobgr32(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        int num_pixels= src_size >> 2;
#ifdef HAVE_MMX
        // %%eax counts pixels; each iteration swaps 2 pixels (8 bytes),
        // hence the scaled (reg, %%eax, 4) addressing
        asm volatile (
                "xorl %%eax, %%eax                \n\t"
                "1:                                \n\t"
                PREFETCH" 32(%0, %%eax, 4)        \n\t"
                "movq (%0, %%eax, 4), %%mm0        \n\t"
                "movq %%mm0, %%mm1                \n\t"
                "movq %%mm0, %%mm2                \n\t"
                "pslld $16, %%mm0                \n\t"
                "psrld $16, %%mm1                \n\t"
                "pand mask32r, %%mm0                \n\t"
                "pand mask32g, %%mm2                \n\t"
                "pand mask32b, %%mm1                \n\t"
                "por %%mm0, %%mm2                \n\t"
                "por %%mm1, %%mm2                \n\t"
                MOVNTQ" %%mm2, (%1, %%eax, 4)        \n\t"
                "addl $2, %%eax                        \n\t"
                "cmpl %2, %%eax                        \n\t"
                " jb 1b                                \n\t"
                :: "r" (src), "r"(dst), "r" (num_pixels)
                : "%eax"
        );
#else
        int i;
        for(i=0; i<num_pixels; i++)
        {
                dst[4*i + 0] = src[4*i + 2];
                dst[4*i + 1] = src[4*i + 1];
                dst[4*i + 2] = src[4*i + 0];
        }
#endif
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it).
 */
void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
        int y;
        const int chromWidth= width>>1;
        for(y=0; y<height; y++)
        {
#ifdef HAVE_MMX
//FIXME: handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory anyway)
                asm volatile(
                        "xorl %%eax, %%eax                \n\t"
                        "1:                                \n\t"
                        PREFETCH" 32(%1, %%eax, 2)        \n\t"
                        PREFETCH" 32(%2, %%eax)                \n\t"
                        PREFETCH" 32(%3, %%eax)                \n\t"
                        "movq (%2, %%eax), %%mm0        \n\t" // U(0)
                        "movq %%mm0, %%mm2                \n\t" // U(0)
                        "movq (%3, %%eax), %%mm1        \n\t" // V(0)
                        "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)

                        "movq (%1, %%eax,2), %%mm3        \n\t" // Y(0)
                        "movq 8(%1, %%eax,2), %%mm5        \n\t" // Y(8)
                        "movq %%mm3, %%mm4                \n\t" // Y(0)
                        "movq %%mm5, %%mm6                \n\t" // Y(8)
                        "punpcklbw %%mm0, %%mm3                \n\t" // YUYV YUYV(0)
                        "punpckhbw %%mm0, %%mm4                \n\t" // YUYV YUYV(4)
                        "punpcklbw %%mm2, %%mm5                \n\t" // YUYV YUYV(8)
                        "punpckhbw %%mm2, %%mm6                \n\t" // YUYV YUYV(12)

                        MOVNTQ" %%mm3, (%0, %%eax, 4)        \n\t"
                        MOVNTQ" %%mm4, 8(%0, %%eax, 4)        \n\t"
                        MOVNTQ" %%mm5, 16(%0, %%eax, 4)        \n\t"
                        MOVNTQ" %%mm6, 24(%0, %%eax, 4)        \n\t"

                        "addl $8, %%eax                        \n\t"
                        "cmpl %4, %%eax                        \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
                        : "%eax"
                );
#else
                int i;
                for(i=0; i<chromWidth; i++)
                {
                        dst[4*i+0] = ysrc[2*i+0];
                        dst[4*i+1] = usrc[i];
                        dst[4*i+2] = ysrc[2*i+1];
                        dst[4*i+3] = vsrc[i];
                }
#endif
                if(y&1)
                {
                        usrc += chromStride;
                        vsrc += chromStride;
                }
                ysrc += lumStride;
                dst += dstStride;
        }
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}

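/*
 * Illustrative sketch (hypothetical helper): packing contiguous YV12 planes
 * into YUY2 for a w x h frame, with the strides that contiguous planes imply
 * (luma stride w, chroma stride w/2, 2 output bytes per pixel).
 */
#if 0
static void yv12_pack_example(const uint8_t *y, const uint8_t *u, const uint8_t *v,
                              uint8_t *yuy2, unsigned w, unsigned h)
{
        /* per the comment above: w must be a multiple of 16, h a multiple of 2 */
        yv12toyuy2(y, u, v, yuy2, w, h, w, w/2, 2*w);
}
#endif
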
/**
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (if this is a problem for anyone then tell me, and I'll fix it).
 */
void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        unsigned int width, unsigned int height,
        unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
        int y;
        const int chromWidth= width>>1;
        for(y=0; y<height; y+=2)
        {
#ifdef HAVE_MMX
                asm volatile(
                        "xorl %%eax, %%eax                \n\t"
                        "pcmpeqw %%mm7, %%mm7                \n\t"
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
                        "movq %%mm0, %%mm2                \n\t" // YUYV YUYV(0)
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(4)
                        "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)

                        MOVNTQ" %%mm2, (%1, %%eax, 2)        \n\t"

                        "movq 16(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(8)
                        "movq 24(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(12)
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(8)
                        "movq %%mm2, %%mm4                \n\t" // YUYV YUYV(12)
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
                        "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
                        "pand %%mm7, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm3, 8(%1, %%eax, 2)        \n\t"

                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)

                        MOVNTQ" %%mm0, (%3, %%eax)        \n\t"
                        MOVNTQ" %%mm2, (%2, %%eax)        \n\t"

                        "addl $8, %%eax                        \n\t"
                        "cmpl %4, %%eax                        \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
                        : "memory", "%eax"
                );

                // second line: luma only (%%mm7 still holds the 0x00FF mask from above)
                asm volatile(
                        "xorl %%eax, %%eax                \n\t"
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
                        "movq 16(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(8)
                        "movq 24(%0, %%eax, 4), %%mm3        \n\t" // YUYV YUYV(12)
                        "pand %%mm7, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
                        "pand %%mm7, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm0, (%1, %%eax, 2)        \n\t"
                        MOVNTQ" %%mm2, 8(%1, %%eax, 2)        \n\t"

                        "addl $8, %%eax                        \n\t"
                        "cmpl %4, %%eax                        \n\t"
                        " jb 1b                                \n\t"

                        ::"r"(src+srcStride), "r"(ydst+lumStride), "r"(udst), "r"(vdst), "r" (chromWidth)
                        : "memory", "%eax"
                );
#else
                int i;
                for(i=0; i<chromWidth; i++)
                {
                        ydst[2*i+0] = src[4*i+0];
                        udst[i]     = src[4*i+1];
                        ydst[2*i+1] = src[4*i+2];
                        vdst[i]     = src[4*i+3];
                }
                ydst += lumStride;
                src  += srcStride;

                for(i=0; i<chromWidth; i++)
                {
                        ydst[2*i+0] = src[4*i+0];
                        ydst[2*i+1] = src[4*i+2];
                }
#endif
                udst += chromStride;
                vdst += chromStride;
                ydst += lumStride;
                src  += srcStride;
        }
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
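
/*
 * Illustrative sketch (hypothetical helper): the reverse split of packed
 * YUY2 into contiguous YV12 planes, using the same stride conventions as
 * the packing example above.
 */
#if 0
static void yuy2_split_example(const uint8_t *yuy2, uint8_t *y, uint8_t *u, uint8_t *v,
                               unsigned w, unsigned h)
{
        yuy2toyv12(yuy2, y, u, v, w, h, w, w/2, 2*w);
}
#endif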