Statistics
| Branch: | Revision:

ffmpeg / postproc / rgb2rgb_template.c @ d8dad2a5

History | View | Annotate | Download (47.4 KB)

1
/*
2
 *
3
 *  rgb2rgb.c, Software RGB to RGB convertor
4
 *  pluralize by Software PAL8 to RGB convertor
5
 *               Software YUV to YUV convertor
6
 *               Software YUV to RGB convertor
7
 *  Written by Nick Kurshev.
8
 *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9
 */
10

    
11
#include <stddef.h>
12
#include <inttypes.h> /* for __WORDSIZE */
13

    
14
/* <inttypes.h> is expected to provide __WORDSIZE; warn at build time if it did not. */
#if !defined(__WORDSIZE)
#warning You have misconfigured system and probably will lose performance!
#endif

/*
 * Instruction-name strings that are pasted into the inline-asm templates below.
 * Every macro is dropped first so that an earlier definition cannot leak in.
 * NOTE(review): presumably this template is compiled once per CPU flavour --
 * confirm against the including file.
 */
#undef MMREG_SIZE
#undef PREFETCH
#undef PREFETCHW
#undef PAVGB
#undef EMMS
#undef MOVNTQ
#undef SFENCE

/* MMREG_SIZE is 16 when HAVE_SSE2 is defined and 8 otherwise. */
#if defined(HAVE_SSE2)
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/*
 * Prefetch and byte-average mnemonics.
 * Without 3DNow! or MMX2 the prefetch macros become "/nop" and PAVGB is left
 * undefined.
 * NOTE(review): "/nop" presumably relies on the assembler treating a leading
 * '/' as a comment so that nothing is emitted -- confirm for the target assembler.
 */
#if defined(HAVE_3DNOW)
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB          "pavgusb"
#elif defined(HAVE_MMX2)
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB          "pavgb"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#if defined(HAVE_3DNOW)
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* MMX2 selects the non-temporal store plus a store fence; otherwise a plain movq and "/nop". */
#if defined(HAVE_MMX2)
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
59

    
60
/*
 * Expand packed 3-byte pixels into 4-byte pixels.
 * Each group of three source bytes is copied unchanged and followed by a
 * zero fourth byte.
 *
 * src      - input buffer
 * dst      - output buffer; receives 4 bytes for every 3 bytes consumed
 * src_size - number of input bytes (the scalar loop always consumes 3 at a time)
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    const uint8_t *in = src;
    const uint8_t *const stop = src + src_size;
    uint8_t *out = dst;
#ifdef HAVE_MMX
    const uint8_t *simd_stop;

    __asm __volatile(PREFETCH"        %0"::"m"(*in):"memory");
    simd_stop = stop - 23;
    /* mm7 holds mask32 (defined elsewhere) for the whole loop. */
    __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
    /* 24 input bytes -> 32 output bytes per iteration. */
    for (; in < simd_stop; in += 24, out += 32)
    {
        __asm __volatile(
            PREFETCH"        32%1\n\t"
            /* gather two 3-byte pixels per register via overlapping 4-byte loads */
            "movd        %1, %%mm0\n\t"
            "punpckldq 3%1, %%mm0\n\t"
            "movd        6%1, %%mm1\n\t"
            "punpckldq 9%1, %%mm1\n\t"
            "movd        12%1, %%mm2\n\t"
            "punpckldq 15%1, %%mm2\n\t"
            "movd        18%1, %%mm3\n\t"
            "punpckldq 21%1, %%mm3\n\t"
            "pand        %%mm7, %%mm0\n\t"
            "pand        %%mm7, %%mm1\n\t"
            "pand        %%mm7, %%mm2\n\t"
            "pand        %%mm7, %%mm3\n\t"
            MOVNTQ"        %%mm0, %0\n\t"
            MOVNTQ"        %%mm1, 8%0\n\t"
            MOVNTQ"        %%mm2, 16%0\n\t"
            MOVNTQ"        %%mm3, 24%0"
            :"=m"(*out)
            :"m"(*in)
            :"memory");
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: whatever the MMX loop left over (or everything without MMX). */
    for (; in < stop; in += 3, out += 4)
    {
        out[0] = in[0];
        out[1] = in[1];
        out[2] = in[2];
        out[3] = 0;
    }
}
110

    
111
/*
 * Pack 4-byte pixels into 3-byte pixels by dropping every fourth byte.
 *
 * src      - input buffer
 * dst      - output buffer; receives 3 bytes for every 4 bytes consumed
 * src_size - number of input bytes (the scalar loop always consumes 4 at a time)
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
    const uint8_t *in = src;
    const uint8_t *const stop = src + src_size;
    uint8_t *out = dst;
#ifdef HAVE_MMX
    const uint8_t *simd_stop;

    __asm __volatile(PREFETCH"        %0"::"m"(*in):"memory");
    simd_stop = stop - 31;
    /* 32 input bytes -> 24 output bytes per iteration. */
    for (; in < simd_stop; in += 32, out += 24)
    {
        __asm __volatile(
            PREFETCH"        32%1\n\t"
            "movq        %1, %%mm0\n\t"
            "movq        8%1, %%mm1\n\t"
            "movq        16%1, %%mm4\n\t"
            "movq        24%1, %%mm5\n\t"
            "movq        %%mm0, %%mm2\n\t"
            "movq        %%mm1, %%mm3\n\t"
            "movq        %%mm4, %%mm6\n\t"
            "movq        %%mm5, %%mm7\n\t"
            "psrlq        $8, %%mm2\n\t"
            "psrlq        $8, %%mm3\n\t"
            "psrlq        $8, %%mm6\n\t"
            "psrlq        $8, %%mm7\n\t"
            "pand        %2, %%mm0\n\t"
            "pand        %2, %%mm1\n\t"
            "pand        %2, %%mm4\n\t"
            "pand        %2, %%mm5\n\t"
            "pand        %3, %%mm2\n\t"
            "pand        %3, %%mm3\n\t"
            "pand        %3, %%mm6\n\t"
            "pand        %3, %%mm7\n\t"
            "por        %%mm2, %%mm0\n\t"
            "por        %%mm3, %%mm1\n\t"
            "por        %%mm6, %%mm4\n\t"
            "por        %%mm7, %%mm5\n\t"

            /* splice the four partial results into three output quadwords */
            "movq        %%mm1, %%mm2\n\t"
            "movq        %%mm4, %%mm3\n\t"
            "psllq        $48, %%mm2\n\t"
            "psllq        $32, %%mm3\n\t"
            "pand        %4, %%mm2\n\t"
            "pand        %5, %%mm3\n\t"
            "por        %%mm2, %%mm0\n\t"
            "psrlq        $16, %%mm1\n\t"
            "psrlq        $32, %%mm4\n\t"
            "psllq        $16, %%mm5\n\t"
            "por        %%mm3, %%mm1\n\t"
            "pand        %6, %%mm5\n\t"
            "por        %%mm5, %%mm4\n\t"

            MOVNTQ"        %%mm0, %0\n\t"
            MOVNTQ"        %%mm1, 8%0\n\t"
            MOVNTQ"        %%mm4, 16%0"
            :"=m"(*out)
            :"m"(*in),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: copy three bytes, skip the fourth. */
    for (; in < stop; in += 4, out += 3)
    {
        out[0] = in[0];
        out[1] = in[1];
        out[2] = in[2];
    }
}
187

    
188
/*
189
 Original by Strepto/Astral
190
 ported to gcc & bugfixed : A'rpi
191
 MMX2, 3DNOW optimization by Nick Kurshev
192
 32bit c version, and and&add trick by Michael Niedermayer
193
*/
194
/*
 * Convert 15-bit pixels to 16-bit pixels, two bytes in and two bytes out per pixel.
 *
 * For each 16-bit value x the result is (x & 0x7FFF) + (x & 0x7FE0): bits 0-4
 * are kept in place, bits 5-14 are added to themselves (i.e. moved up by one
 * position) and the new bit 5 is zero.
 *
 * src      - input buffer
 * dst      - output buffer, same size as the input
 * src_size - number of input bytes; a trailing odd byte is ignored
 *
 * NOTE(review): the scalar path accesses the buffers through uint32_t /
 * uint16_t pointers, so it presumably expects suitably aligned buffers on
 * targets that fault on unaligned access -- confirm with callers.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  const uint8_t *s = src;
  uint8_t *d = dst;
  const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
  __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
  /*
   * 16 bytes per iteration. The remaining length is compared instead of
   * forming "end - 15", which would point before the buffer for short inputs.
   */
  while(end - s >= 16)
  {
        __asm __volatile(
                PREFETCH"        32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        8%1, %%mm2\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm2, %%mm3\n\t"
                "pand        %%mm4, %%mm0\n\t"
                "pand        %%mm4, %%mm2\n\t"
                "paddw        %%mm1, %%mm0\n\t"
                "paddw        %%mm3, %%mm2\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm2, 8%0"
                :"=m"(*d)
                :"m"(*s)
                /* the asm reads and writes 16 bytes, more than the operands describe */
                :"memory");
        d+=16;
        s+=16;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Scalar path: two pixels per 32-bit word. */
  while(end - s >= 4)
  {
        unsigned x = *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
  }
  /* At most one complete pixel is left; a lone trailing byte is not touched. */
  if(end - s >= 2)
  {
        unsigned short x = *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
  }
}
242

    
243
/*
 * Swap the first and third byte of every 3-byte pixel (BGR <-> RGB).
 *
 * src      - input buffer
 * dst      - output buffer, same size as the input; may be the same buffer as src
 * src_size - number of input bytes; trailing bytes that do not form a whole
 *            pixel are left untouched
 *
 * The previous loop compared a byte offset that advanced by 3 against the
 * pixel count (src_size/3), so only about a third of the pixels were converted.
 */
static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        /* Number of bytes covered by whole pixels. */
        const unsigned num_bytes = src_size - src_size % 3;
        unsigned i;

        for(i=0; i<num_bytes; i+=3)
        {
                /* Read the whole pixel first so that in-place conversion works too. */
                const uint8_t c0 = src[i+0];
                const uint8_t c1 = src[i+1];
                const uint8_t c2 = src[i+2];

                dst[i+0] = c2;
                dst[i+1] = c1;
                dst[i+2] = c0;
        }
}
253

    
254
/*
 * Pack 4-byte pixels into 16-bit pixels.
 * Per pixel: the top 5 bits of byte 0 go to bits 0-4, the top 6 bits of
 * byte 1 to bits 5-10, the top 5 bits of byte 2 to bits 11-15; byte 3 is
 * ignored.
 *
 * src      - input buffer
 * dst      - output buffer, written as uint16_t values (one per 4 input bytes)
 * src_size - number of input bytes (the scalar loop always consumes 4 at a time)
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *in = src;
        const uint8_t *const stop = src + src_size;
        uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
        const uint8_t *simd_stop;

        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        /* mm7 / mm6 keep the red and green masks for the whole loop. */
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        simd_stop = stop - 15;
        /* 16 input bytes (4 pixels) -> 4 output words per iteration. */
        for (; in < simd_stop; out += 4, in += 16)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $5, %%mm1\n\t"
                "psrlq        $5, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*out):"m"(*in),"m"(blue_16mask):"memory");
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: one pixel per iteration. */
        for (; in < stop; in += 4)
        {
                const int c0 = in[0];
                const int c1 = in[1];
                const int c2 = in[2];

                *out++ = (c0>>3) | ((c1&0xFC)<<3) | ((c2&0xF8)<<8);
        }
}
317

    
318
/*
 * Pack 4-byte pixels into 15-bit pixels stored in 16-bit words.
 * Per pixel: the top 5 bits of byte 0 go to bits 0-4, the top 5 bits of
 * byte 1 to bits 5-9, the top 5 bits of byte 2 to bits 10-14; byte 3 is
 * ignored.
 *
 * src      - input buffer
 * dst      - output buffer, written as uint16_t values (one per 4 input bytes)
 * src_size - number of input bytes (the scalar loop always consumes 4 at a time)
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *in = src;
        const uint8_t *const stop = src + src_size;
        uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
        const uint8_t *simd_stop;

        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        /* mm7 / mm6 keep the red and green masks for the whole loop. */
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        simd_stop = stop - 15;
        /* 16 input bytes (4 pixels) -> 4 output words per iteration. */
        for (; in < simd_stop; out += 4, in += 16)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $9, %%mm2\n\t"
                "psrlq        $9, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*out):"m"(*in),"m"(blue_15mask):"memory");
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: one pixel per iteration. */
        for (; in < stop; in += 4)
        {
                const int c0 = in[0];
                const int c1 = in[1];
                const int c2 = in[2];

                *out++ = (c0>>3) | ((c1&0xF8)<<2) | ((c2&0xF8)<<7);
        }
}
381

    
382
/*
 * Pack 3-byte pixels into 16-bit pixels.
 * Per pixel: the top 5 bits of byte 0 go to bits 0-4, the top 6 bits of
 * byte 1 to bits 5-10, the top 5 bits of byte 2 to bits 11-15.
 *
 * src      - input buffer
 * dst      - output buffer, written as uint16_t values (one per 3 input bytes)
 * src_size - number of input bytes (the scalar loop always consumes 3 at a time)
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *in = src;
        const uint8_t *const stop = src + src_size;
        uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
        const uint8_t *simd_stop;

        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        /* mm7 / mm6 keep the red and green masks for the whole loop. */
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        simd_stop = stop - 11;
        /* 12 input bytes (4 pixels) -> 4 output words per iteration. */
        for (; in < simd_stop; out += 4, in += 12)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $5, %%mm1\n\t"
                "psrlq        $5, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*out):"m"(*in),"m"(blue_16mask):"memory");
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: one pixel per iteration. */
        for (; in < stop; in += 3)
        {
                const int c0 = in[0];
                const int c1 = in[1];
                const int c2 = in[2];

                *out++ = (c0>>3) | ((c1&0xFC)<<3) | ((c2&0xF8)<<8);
        }
}
444

    
445
/*
 * Pack 3-byte pixels into 15-bit pixels stored in 16-bit words.
 * Per pixel: the top 5 bits of byte 0 go to bits 0-4, the top 5 bits of
 * byte 1 to bits 5-9, the top 5 bits of byte 2 to bits 10-14.
 *
 * src      - input buffer
 * dst      - output buffer, written as uint16_t values (one per 3 input bytes)
 * src_size - number of input bytes (the scalar loop always consumes 3 at a time)
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *in = src;
        const uint8_t *const stop = src + src_size;
        uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
        const uint8_t *simd_stop;

        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        /* mm7 / mm6 keep the red and green masks for the whole loop. */
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        simd_stop = stop - 11;
        /* 12 input bytes (4 pixels) -> 4 output words per iteration. */
        for (; in < simd_stop; out += 4, in += 12)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $9, %%mm2\n\t"
                "psrlq        $9, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*out):"m"(*in),"m"(blue_15mask):"memory");
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: one pixel per iteration. */
        for (; in < stop; in += 3)
        {
                const int c0 = in[0];
                const int c1 = in[1];
                const int c2 = in[2];

                *out++ = (c0>>3) | ((c1&0xF8)<<2) | ((c2&0xF8)<<7);
        }
}
507

    
508
/*
  A less accurate approximation is used here: the input value is simply
  shifted left and the low-order bits are filled with zeroes. This method
  improves png's compression, but it cannot reproduce white exactly, since
  it does not generate an all-ones maximum value; the net effect is to
  darken the image slightly.

  The better method would be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      Leftmost Bits Repeated to Fill Open Bits
       |
   Original Bits
*/
531
/*
 * Expand 15-bit pixels (one uint16_t each) into 3-byte pixels.
 * Per pixel the output bytes are, in order:
 *   (px & 0x1F)   << 3
 *   (px & 0x3E0)  >> 2
 *   (px & 0x7C00) >> 7
 * i.e. each 5-bit field is placed in the top bits of a byte and the low
 * three bits are zero.
 *
 * src      - input buffer, read as uint16_t values
 * dst      - output buffer; receives 3 bytes per input pixel
 * src_size - number of input bytes (src_size/2 pixels are converted)
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;
#ifdef HAVE_MMX
        const uint16_t *mm_end;
#endif
        uint8_t *d = dst;
        /* keep the const qualifier when reinterpreting the input as 16-bit pixels */
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
        mm_end = end - 7;
        /* 8 input pixels (16 bytes) -> 24 output bytes per iteration. */
        while(s < mm_end)
        {
            /*
             * First statement: expand pixels 0-3 into mm6/mm7 and pixels 4-7
             * into mm0/mm3. Nothing is stored yet; the registers are consumed
             * by the second statement, so the two must stay adjacent.
             */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $2, %%mm1\n\t"
                "psrlq        $7, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"

                "movq        %%mm0, %%mm6\n\t"
                "movq        %%mm3, %%mm7\n\t"

                "movq        8%1, %%mm0\n\t"
                "movq        8%1, %%mm1\n\t"
                "movq        8%1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $2, %%mm1\n\t"
                "psrlq        $7, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"

                :"=m"(*d)
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
                :"memory");
            /* Borrowed 32 to 24 */
            __asm __volatile(
                "movq        %%mm0, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "movq        %%mm6, %%mm0\n\t"
                "movq        %%mm7, %%mm1\n\t"

                "movq        %%mm4, %%mm6\n\t"
                "movq        %%mm5, %%mm7\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm1, %%mm3\n\t"

                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm3\n\t"
                "psrlq        $8, %%mm6\n\t"
                "psrlq        $8, %%mm7\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm1\n\t"
                "pand        %2, %%mm4\n\t"
                "pand        %2, %%mm5\n\t"
                "pand        %3, %%mm2\n\t"
                "pand        %3, %%mm3\n\t"
                "pand        %3, %%mm6\n\t"
                "pand        %3, %%mm7\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm3, %%mm1\n\t"
                "por        %%mm6, %%mm4\n\t"
                "por        %%mm7, %%mm5\n\t"

                "movq        %%mm1, %%mm2\n\t"
                "movq        %%mm4, %%mm3\n\t"
                "psllq        $48, %%mm2\n\t"
                "psllq        $32, %%mm3\n\t"
                "pand        %4, %%mm2\n\t"
                "pand        %5, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psrlq        $16, %%mm1\n\t"
                "psrlq        $32, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm3, %%mm1\n\t"
                "pand        %6, %%mm5\n\t"
                "por        %%mm5, %%mm4\n\t"

                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm1, 8%0\n\t"
                MOVNTQ"        %%mm4, 16%0"

                :"=m"(*d)
                :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
                :"memory");
                d += 24;
                s += 8;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: one pixel per iteration. */
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x3E0)>>2;
                *d++ = (bgr&0x7C00)>>7;
        }
}
672

    
673
/*
 * Expand 16-bit pixels (one uint16_t each) into 3-byte pixels.
 * Per pixel the output bytes are, in order:
 *   (px & 0x1F)   << 3
 *   (px & 0x7E0)  >> 3
 *   (px & 0xF800) >> 8
 * i.e. each field is placed in the top bits of a byte and the low bits are zero.
 *
 * src      - input buffer, read as uint16_t values
 * dst      - output buffer; receives 3 bytes per input pixel
 * src_size - number of input bytes (src_size/2 pixels are converted)
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *in = (const uint16_t *)src;
        const uint16_t *const stop = in + src_size/2;
        uint8_t *out = (uint8_t *)dst;
#ifdef HAVE_MMX
        const uint16_t *simd_stop;

        __asm __volatile(PREFETCH"        %0"::"m"(*in):"memory");
        simd_stop = stop - 7;
        /* 8 input pixels (16 bytes) -> 24 output bytes per iteration. */
        for (; in < simd_stop; out += 24, in += 8)
        {
            /*
             * First statement: expand pixels 0-3 into mm6/mm7 and pixels 4-7
             * into mm0/mm3. Nothing is stored yet; the registers are consumed
             * by the second statement, so the two must stay adjacent.
             */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm1\n\t"
                "psrlq        $8, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"

                "movq        %%mm0, %%mm6\n\t"
                "movq        %%mm3, %%mm7\n\t"

                "movq        8%1, %%mm0\n\t"
                "movq        8%1, %%mm1\n\t"
                "movq        8%1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm1\n\t"
                "psrlq        $8, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"
                :"=m"(*out)
                :"m"(*in),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
                :"memory");
            /* Borrowed 32 to 24 */
            __asm __volatile(
                "movq        %%mm0, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "movq        %%mm6, %%mm0\n\t"
                "movq        %%mm7, %%mm1\n\t"

                "movq        %%mm4, %%mm6\n\t"
                "movq        %%mm5, %%mm7\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm1, %%mm3\n\t"

                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm3\n\t"
                "psrlq        $8, %%mm6\n\t"
                "psrlq        $8, %%mm7\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm1\n\t"
                "pand        %2, %%mm4\n\t"
                "pand        %2, %%mm5\n\t"
                "pand        %3, %%mm2\n\t"
                "pand        %3, %%mm3\n\t"
                "pand        %3, %%mm6\n\t"
                "pand        %3, %%mm7\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm3, %%mm1\n\t"
                "por        %%mm6, %%mm4\n\t"
                "por        %%mm7, %%mm5\n\t"

                "movq        %%mm1, %%mm2\n\t"
                "movq        %%mm4, %%mm3\n\t"
                "psllq        $48, %%mm2\n\t"
                "psllq        $32, %%mm3\n\t"
                "pand        %4, %%mm2\n\t"
                "pand        %5, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psrlq        $16, %%mm1\n\t"
                "psrlq        $32, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm3, %%mm1\n\t"
                "pand        %6, %%mm5\n\t"
                "por        %%mm5, %%mm4\n\t"

                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm1, 8%0\n\t"
                MOVNTQ"        %%mm4, 16%0"

                :"=m"(*out)
                :"m"(*in),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
                :"memory");
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: one pixel per iteration. */
        for (; in < stop; out += 3)
        {
                const uint16_t px = *in++;

                out[0] = (px&0x1F)<<3;
                out[1] = (px&0x7E0)>>3;
                out[2] = (px&0xF800)>>8;
        }
}
813

    
814
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
815
{
816
        const uint16_t *end;
817
#ifdef HAVE_MMX
818
        const uint16_t *mm_end;
819
#endif
820
        uint8_t *d = (uint8_t *)dst;
821
        const uint16_t *s = (const uint16_t *)src;
822
        end = s + src_size/2;
823
#ifdef HAVE_MMX
824
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
825
        __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
826
        mm_end = end - 3;
827
        while(s < mm_end)
828
        {
829
            __asm __volatile(
830
                PREFETCH" 32%1\n\t"
831
                "movq        %1, %%mm0\n\t"
832
                "movq        %1, %%mm1\n\t"
833
                "movq        %1, %%mm2\n\t"
834
                "pand        %2, %%mm0\n\t"
835
                "pand        %3, %%mm1\n\t"
836
                "pand        %4, %%mm2\n\t"
837
                "psllq        $3, %%mm0\n\t"
838
                "psrlq        $2, %%mm1\n\t"
839
                "psrlq        $7, %%mm2\n\t"
840
                "movq        %%mm0, %%mm3\n\t"
841
                "movq        %%mm1, %%mm4\n\t"
842
                "movq        %%mm2, %%mm5\n\t"
843
                "punpcklwd %%mm7, %%mm0\n\t"
844
                "punpcklwd %%mm7, %%mm1\n\t"
845
                "punpcklwd %%mm7, %%mm2\n\t"
846
                "punpckhwd %%mm7, %%mm3\n\t"
847
                "punpckhwd %%mm7, %%mm4\n\t"
848
                "punpckhwd %%mm7, %%mm5\n\t"
849
                "psllq        $8, %%mm1\n\t"
850
                "psllq        $16, %%mm2\n\t"
851
                "por        %%mm1, %%mm0\n\t"
852
                "por        %%mm2, %%mm0\n\t"
853
                "psllq        $8, %%mm4\n\t"
854
                "psllq        $16, %%mm5\n\t"
855
                "por        %%mm4, %%mm3\n\t"
856
                "por        %%mm5, %%mm3\n\t"
857
                MOVNTQ"        %%mm0, %0\n\t"
858
                MOVNTQ"        %%mm3, 8%0\n\t"
859
                :"=m"(*d)
860
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
861
                :"memory");
862
                d += 16;
863
                s += 4;
864
        }
865
        __asm __volatile(SFENCE:::"memory");
866
        __asm __volatile(EMMS:::"memory");
867
#endif
868
        while(s < end)
869
        {
870
                register uint16_t bgr;
871
                bgr = *s++;
872
                *d++ = (bgr&0x1F)<<3;
873
                *d++ = (bgr&0x3E0)>>2;
874
                *d++ = (bgr&0x7C00)>>7;
875
                *d++ = 0;
876
        }
877
}
878

    
879
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
880
{
881
        const uint16_t *end;
882
#ifdef HAVE_MMX
883
        const uint16_t *mm_end;
884
#endif
885
        uint8_t *d = (uint8_t *)dst;
886
        const uint16_t *s = (uint16_t *)src;
887
        end = s + src_size/2;
888
#ifdef HAVE_MMX
889
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
890
        __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
891
        mm_end = end - 3;
892
        while(s < mm_end)
893
        {
894
            __asm __volatile(
895
                PREFETCH" 32%1\n\t"
896
                "movq        %1, %%mm0\n\t"
897
                "movq        %1, %%mm1\n\t"
898
                "movq        %1, %%mm2\n\t"
899
                "pand        %2, %%mm0\n\t"
900
                "pand        %3, %%mm1\n\t"
901
                "pand        %4, %%mm2\n\t"
902
                "psllq        $3, %%mm0\n\t"
903
                "psrlq        $3, %%mm1\n\t"
904
                "psrlq        $8, %%mm2\n\t"
905
                "movq        %%mm0, %%mm3\n\t"
906
                "movq        %%mm1, %%mm4\n\t"
907
                "movq        %%mm2, %%mm5\n\t"
908
                "punpcklwd %%mm7, %%mm0\n\t"
909
                "punpcklwd %%mm7, %%mm1\n\t"
910
                "punpcklwd %%mm7, %%mm2\n\t"
911
                "punpckhwd %%mm7, %%mm3\n\t"
912
                "punpckhwd %%mm7, %%mm4\n\t"
913
                "punpckhwd %%mm7, %%mm5\n\t"
914
                "psllq        $8, %%mm1\n\t"
915
                "psllq        $16, %%mm2\n\t"
916
                "por        %%mm1, %%mm0\n\t"
917
                "por        %%mm2, %%mm0\n\t"
918
                "psllq        $8, %%mm4\n\t"
919
                "psllq        $16, %%mm5\n\t"
920
                "por        %%mm4, %%mm3\n\t"
921
                "por        %%mm5, %%mm3\n\t"
922
                MOVNTQ"        %%mm0, %0\n\t"
923
                MOVNTQ"        %%mm3, 8%0\n\t"
924
                :"=m"(*d)
925
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
926
                :"memory");
927
                d += 16;
928
                s += 4;
929
        }
930
        __asm __volatile(SFENCE:::"memory");
931
        __asm __volatile(EMMS:::"memory");
932
#endif
933
        while(s < end)
934
        {
935
                register uint16_t bgr;
936
                bgr = *s++;
937
                *d++ = (bgr&0x1F)<<3;
938
                *d++ = (bgr&0x7E0)>>3;
939
                *d++ = (bgr&0xF800)>>8;
940
                *d++ = 0;
941
        }
942
}
943

    
944
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
945
{
946
#ifdef HAVE_MMX
947
/* TODO: unroll this loop */
948
        asm volatile (
949
                "xorl %%eax, %%eax                \n\t"
950
                ".balign 16                        \n\t"
951
                "1:                                \n\t"
952
                PREFETCH" 32(%0, %%eax)                \n\t"
953
                "movq (%0, %%eax), %%mm0        \n\t"
954
                "movq %%mm0, %%mm1                \n\t"
955
                "movq %%mm0, %%mm2                \n\t"
956
                "pslld $16, %%mm0                \n\t"
957
                "psrld $16, %%mm1                \n\t"
958
                "pand "MANGLE(mask32r)", %%mm0        \n\t"
959
                "pand "MANGLE(mask32g)", %%mm2        \n\t"
960
                "pand "MANGLE(mask32b)", %%mm1        \n\t"
961
                "por %%mm0, %%mm2                \n\t"
962
                "por %%mm1, %%mm2                \n\t"
963
                MOVNTQ" %%mm2, (%1, %%eax)        \n\t"
964
                "addl $8, %%eax                        \n\t"
965
                "cmpl %2, %%eax                        \n\t"
966
                " jb 1b                                \n\t"
967
                :: "r" (src), "r"(dst), "r" (src_size-7)
968
                : "%eax"
969
        );
970

    
971
        __asm __volatile(SFENCE:::"memory");
972
        __asm __volatile(EMMS:::"memory");
973
#else
974
        unsigned i;
975
        unsigned num_pixels = src_size >> 2;
976
        for(i=0; i<num_pixels; i++)
977
        {
978
                dst[4*i + 0] = src[4*i + 2];
979
                dst[4*i + 1] = src[4*i + 1];
980
                dst[4*i + 2] = src[4*i + 0];
981
        }
982
#endif
983
}
984

    
985
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
986
{
987
        unsigned i;
988
#ifdef HAVE_MMX
989
        int mmx_size= 23 - src_size;
990
        asm volatile (
991
                "movq "MANGLE(mask24r)", %%mm5        \n\t"
992
                "movq "MANGLE(mask24g)", %%mm6        \n\t"
993
                "movq "MANGLE(mask24b)", %%mm7        \n\t"
994
                ".balign 16                        \n\t"
995
                "1:                                \n\t"
996
                PREFETCH" 32(%1, %%eax)                \n\t"
997
                "movq   (%1, %%eax), %%mm0        \n\t" // BGR BGR BG
998
                "movq   (%1, %%eax), %%mm1        \n\t" // BGR BGR BG
999
                "movq  2(%1, %%eax), %%mm2        \n\t" // R BGR BGR B
1000
                "psllq $16, %%mm0                \n\t" // 00 BGR BGR
1001
                "pand %%mm5, %%mm0                \n\t"
1002
                "pand %%mm6, %%mm1                \n\t"
1003
                "pand %%mm7, %%mm2                \n\t"
1004
                "por %%mm0, %%mm1                \n\t"
1005
                "por %%mm2, %%mm1                \n\t"                
1006
                "movq  6(%1, %%eax), %%mm0        \n\t" // BGR BGR BG
1007
                MOVNTQ" %%mm1,   (%2, %%eax)        \n\t" // RGB RGB RG
1008
                "movq  8(%1, %%eax), %%mm1        \n\t" // R BGR BGR B
1009
                "movq 10(%1, %%eax), %%mm2        \n\t" // GR BGR BGR
1010
                "pand %%mm7, %%mm0                \n\t"
1011
                "pand %%mm5, %%mm1                \n\t"
1012
                "pand %%mm6, %%mm2                \n\t"
1013
                "por %%mm0, %%mm1                \n\t"
1014
                "por %%mm2, %%mm1                \n\t"                
1015
                "movq 14(%1, %%eax), %%mm0        \n\t" // R BGR BGR B
1016
                MOVNTQ" %%mm1,  8(%2, %%eax)        \n\t" // B RGB RGB R
1017
                "movq 16(%1, %%eax), %%mm1        \n\t" // GR BGR BGR
1018
                "movq 18(%1, %%eax), %%mm2        \n\t" // BGR BGR BG
1019
                "pand %%mm6, %%mm0                \n\t"
1020
                "pand %%mm7, %%mm1                \n\t"
1021
                "pand %%mm5, %%mm2                \n\t"
1022
                "por %%mm0, %%mm1                \n\t"
1023
                "por %%mm2, %%mm1                \n\t"                
1024
                MOVNTQ" %%mm1, 16(%2, %%eax)        \n\t"
1025
                "addl $24, %%eax                \n\t"
1026
                " js 1b                                \n\t"
1027
                : "+a" (mmx_size)
1028
                : "r" (src-mmx_size), "r"(dst-mmx_size)
1029
        );
1030

    
1031
        __asm __volatile(SFENCE:::"memory");
1032
        __asm __volatile(EMMS:::"memory");
1033

    
1034
        if(mmx_size==23) return; //finihsed, was multiple of 8
1035

    
1036
        src+= src_size;
1037
        dst+= src_size;
1038
        src_size= 23-mmx_size;
1039
        src-= src_size;
1040
        dst-= src_size;
1041
#endif
1042
        for(i=0; i<src_size; i+=3)
1043
        {
1044
                register uint8_t x;
1045
                x          = src[i + 2];
1046
                dst[i + 1] = src[i + 1];
1047
                dst[i + 2] = src[i + 0];
1048
                dst[i + 0] = x;
1049
        }
1050
}
1051

    
1052
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1053
        unsigned int width, unsigned int height,
1054
        unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
1055
{
1056
        unsigned y;
1057
        const unsigned chromWidth= width>>1;
1058
        for(y=0; y<height; y++)
1059
        {
1060
#ifdef HAVE_MMX
1061
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1062
                asm volatile(
1063
                        "xorl %%eax, %%eax                \n\t"
1064
                        ".balign 16                        \n\t"
1065
                        "1:                                \n\t"
1066
                        PREFETCH" 32(%1, %%eax, 2)        \n\t"
1067
                        PREFETCH" 32(%2, %%eax)                \n\t"
1068
                        PREFETCH" 32(%3, %%eax)                \n\t"
1069
                        "movq (%2, %%eax), %%mm0        \n\t" // U(0)
1070
                        "movq %%mm0, %%mm2                \n\t" // U(0)
1071
                        "movq (%3, %%eax), %%mm1        \n\t" // V(0)
1072
                        "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
1073
                        "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)
1074

    
1075
                        "movq (%1, %%eax,2), %%mm3        \n\t" // Y(0)
1076
                        "movq 8(%1, %%eax,2), %%mm5        \n\t" // Y(8)
1077
                        "movq %%mm3, %%mm4                \n\t" // Y(0)
1078
                        "movq %%mm5, %%mm6                \n\t" // Y(8)
1079
                        "punpcklbw %%mm0, %%mm3                \n\t" // YUYV YUYV(0)
1080
                        "punpckhbw %%mm0, %%mm4                \n\t" // YUYV YUYV(4)
1081
                        "punpcklbw %%mm2, %%mm5                \n\t" // YUYV YUYV(8)
1082
                        "punpckhbw %%mm2, %%mm6                \n\t" // YUYV YUYV(12)
1083

    
1084
                        MOVNTQ" %%mm3, (%0, %%eax, 4)        \n\t"
1085
                        MOVNTQ" %%mm4, 8(%0, %%eax, 4)        \n\t"
1086
                        MOVNTQ" %%mm5, 16(%0, %%eax, 4)        \n\t"
1087
                        MOVNTQ" %%mm6, 24(%0, %%eax, 4)        \n\t"
1088

    
1089
                        "addl $8, %%eax                        \n\t"
1090
                        "cmpl %4, %%eax                        \n\t"
1091
                        " jb 1b                                \n\t"
1092
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
1093
                        : "%eax"
1094
                );
1095
#else
1096
#if __WORDSIZE >= 64
1097
                int i;
1098
                uint64_t *ldst = (uint64_t *) dst;
1099
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1100
                for(i = 0; i < chromWidth; i += 2){
1101
                        uint64_t k, l;
1102
                        k = yc[0] + (uc[0] << 8) +
1103
                            (yc[1] << 16) + (vc[0] << 24);
1104
                        l = yc[2] + (uc[1] << 8) +
1105
                            (yc[3] << 16) + (vc[1] << 24);
1106
                        *ldst++ = k + (l << 32);
1107
                        yc += 4;
1108
                        uc += 2;
1109
                        vc += 2;
1110
                }
1111

    
1112
#else
1113
                int i, *idst = (int32_t *) dst;
1114
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1115
                for(i = 0; i < chromWidth; i++){
1116
                        *idst++ = yc[0] + (uc[0] << 8) +
1117
                            (yc[1] << 16) + (vc[0] << 24);
1118
                        yc += 2;
1119
                        uc++;
1120
                        vc++;
1121
                }
1122
#endif
1123
#endif
1124
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1125
                {
1126
                        usrc += chromStride;
1127
                        vsrc += chromStride;
1128
                }
1129
                ysrc += lumStride;
1130
                dst += dstStride;
1131
        }
1132
#ifdef HAVE_MMX
1133
asm(    EMMS" \n\t"
1134
        SFENCE" \n\t"
1135
        :::"memory");
1136
#endif
1137
}
1138

    
1139
/**
1140
 *
1141
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1142
 * problem for anyone then tell me, and ill fix it)
1143
 */
1144
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1145
        unsigned int width, unsigned int height,
1146
        unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
1147
{
1148
        //FIXME interpolate chroma
1149
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1150
}
1151

    
1152
/**
1153
 *
1154
 * width should be a multiple of 16
1155
 */
1156
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1157
        unsigned int width, unsigned int height,
1158
        unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
1159
{
1160
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1161
}
1162

    
1163
/**
1164
 *
1165
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1166
 * problem for anyone then tell me, and ill fix it)
1167
 */
1168
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1169
        unsigned int width, unsigned int height,
1170
        unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
1171
{
1172
        unsigned y;
1173
        const unsigned chromWidth= width>>1;
1174
        for(y=0; y<height; y+=2)
1175
        {
1176
#ifdef HAVE_MMX
1177
                asm volatile(
1178
                        "xorl %%eax, %%eax                \n\t"
1179
                        "pcmpeqw %%mm7, %%mm7                \n\t"
1180
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1181
                        ".balign 16                        \n\t"
1182
                        "1:                                \n\t"
1183
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
1184
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
1185
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
1186
                        "movq %%mm0, %%mm2                \n\t" // YUYV YUYV(0)
1187
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(4)
1188
                        "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
1189
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
1190
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
1191
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
1192
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
1193
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)
1194

    
1195
                        MOVNTQ" %%mm2, (%1, %%eax, 2)        \n\t"
1196

    
1197
                        "movq 16(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(8)
1198
                        "movq 24(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(12)
1199
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(8)
1200
                        "movq %%mm2, %%mm4                \n\t" // YUYV YUYV(12)
1201
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
1202
                        "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
1203
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
1204
                        "pand %%mm7, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
1205
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
1206
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)
1207

    
1208
                        MOVNTQ" %%mm3, 8(%1, %%eax, 2)        \n\t"
1209

    
1210
                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
1211
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
1212
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1213
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1214
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
1215
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
1216
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
1217
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)
1218

    
1219
                        MOVNTQ" %%mm0, (%3, %%eax)        \n\t"
1220
                        MOVNTQ" %%mm2, (%2, %%eax)        \n\t"
1221

    
1222
                        "addl $8, %%eax                        \n\t"
1223
                        "cmpl %4, %%eax                        \n\t"
1224
                        " jb 1b                                \n\t"
1225
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
1226
                        : "memory", "%eax"
1227
                );
1228

    
1229
                ydst += lumStride;
1230
                src  += srcStride;
1231

    
1232
                asm volatile(
1233
                        "xorl %%eax, %%eax                \n\t"
1234
                        ".balign 16                        \n\t"
1235
                        "1:                                \n\t"
1236
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
1237
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
1238
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
1239
                        "movq 16(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(8)
1240
                        "movq 24(%0, %%eax, 4), %%mm3        \n\t" // YUYV YUYV(12)
1241
                        "pand %%mm7, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
1242
                        "pand %%mm7, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
1243
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
1244
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
1245
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
1246
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)
1247

    
1248
                        MOVNTQ" %%mm0, (%1, %%eax, 2)        \n\t"
1249
                        MOVNTQ" %%mm2, 8(%1, %%eax, 2)        \n\t"
1250

    
1251
                        "addl $8, %%eax                        \n\t"
1252
                        "cmpl %4, %%eax                        \n\t"
1253
                        " jb 1b                                \n\t"
1254

    
1255
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
1256
                        : "memory", "%eax"
1257
                );
1258
#else
1259
                unsigned i;
1260
                for(i=0; i<chromWidth; i++)
1261
                {
1262
                        ydst[2*i+0]         = src[4*i+0];
1263
                        udst[i]         = src[4*i+1];
1264
                        ydst[2*i+1]         = src[4*i+2];
1265
                        vdst[i]         = src[4*i+3];
1266
                }
1267
                ydst += lumStride;
1268
                src  += srcStride;
1269

    
1270
                for(i=0; i<chromWidth; i++)
1271
                {
1272
                        ydst[2*i+0]         = src[4*i+0];
1273
                        ydst[2*i+1]         = src[4*i+2];
1274
                }
1275
#endif
1276
                udst += chromStride;
1277
                vdst += chromStride;
1278
                ydst += lumStride;
1279
                src  += srcStride;
1280
        }
1281
#ifdef HAVE_MMX
1282
asm volatile(   EMMS" \n\t"
1283
                SFENCE" \n\t"
1284
                :::"memory");
1285
#endif
1286
}
1287

    
1288
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1289
        uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1290
        unsigned int width, unsigned int height, unsigned int lumStride, unsigned int chromStride)
1291
{
1292
        /* Y Plane */
1293
        memcpy(ydst, ysrc, width*height);
1294

    
1295
        /* XXX: implement upscaling for U,V */
1296
}
1297

    
1298
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1299
{
1300
        int x,y;
1301
        
1302
        // first line
1303
        for(x=0; x<srcWidth; x++){
1304
                dst[2*x+0]=
1305
                dst[2*x+1]= src[x];
1306
        }
1307
        dst+= dstStride;
1308

    
1309
        for(y=1; y<srcHeight; y++){
1310
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1311
                const int mmxSize= srcWidth;
1312
                asm volatile(
1313
                        "movl %4, %%eax                        \n\t"
1314
                        "1:                                \n\t"
1315
                        "movq (%0, %%eax), %%mm0        \n\t"
1316
                        "movq (%1, %%eax), %%mm1        \n\t"
1317
                        "movq 1(%0, %%eax), %%mm2        \n\t"
1318
                        "movq 1(%1, %%eax), %%mm3        \n\t"
1319
                        "movq %%mm0, %%mm4                \n\t"
1320
                        "movq %%mm1, %%mm5                \n\t"
1321
                        PAVGB" %%mm3, %%mm0                \n\t"
1322
                        PAVGB" %%mm3, %%mm0                \n\t"
1323
                        PAVGB" %%mm4, %%mm3                \n\t"
1324
                        PAVGB" %%mm4, %%mm3                \n\t"
1325
                        PAVGB" %%mm2, %%mm1                \n\t"
1326
                        PAVGB" %%mm2, %%mm1                \n\t"
1327
                        PAVGB" %%mm5, %%mm2                \n\t"
1328
                        PAVGB" %%mm5, %%mm2                \n\t"
1329
                        "movq %%mm3, %%mm4                \n\t"
1330
                        "movq %%mm2, %%mm5                \n\t"
1331
                        "punpcklbw %%mm1, %%mm3                \n\t"
1332
                        "punpckhbw %%mm1, %%mm4                \n\t"
1333
                        "punpcklbw %%mm0, %%mm2                \n\t"
1334
                        "punpckhbw %%mm0, %%mm5                \n\t"
1335
#if 1
1336
                        MOVNTQ" %%mm3, (%2, %%eax, 2)        \n\t"
1337
                        MOVNTQ" %%mm4, 8(%2, %%eax, 2)        \n\t"
1338
                        MOVNTQ" %%mm2, (%3, %%eax, 2)        \n\t"
1339
                        MOVNTQ" %%mm5, 8(%3, %%eax, 2)        \n\t"
1340
#else
1341
                        "movq %%mm3, (%2, %%eax, 2)        \n\t"
1342
                        "movq %%mm4, 8(%2, %%eax, 2)        \n\t"
1343
                        "movq %%mm2, (%3, %%eax, 2)        \n\t"
1344
                        "movq %%mm5, 8(%3, %%eax, 2)        \n\t"
1345
#endif
1346
                        "addl $8, %%eax                        \n\t"
1347
                        " js 1b                                \n\t"
1348
                        :: "r" (src + mmxSize-1), "r" (src + srcStride + mmxSize-1),
1349
                           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1350
                           "g" (-mmxSize)
1351
                        : "%eax"
1352

    
1353
                );
1354
                dst[0]= 
1355
                dst[dstStride]= src[0];
1356
#else
1357
                dst[0]= 
1358
                dst[dstStride]= src[0];
1359

    
1360
                for(x=0; x<srcWidth-1; x++){
1361
                        dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1362
                        dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1363
                        dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1364
                        dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1365
                }
1366
#endif
1367
                dst[srcWidth*2 -1]= 
1368
                dst[srcWidth*2 -1 + dstStride]= src[srcWidth-1];
1369

    
1370
                dst+=dstStride*2;
1371
                src+=srcStride;
1372
        }
1373
        src-=srcStride;
1374
        
1375
        // last line
1376
        for(x=0; x<srcWidth; x++){
1377
                dst[2*x+0]=
1378
                dst[2*x+1]= src[x];
1379
        }
1380
#ifdef HAVE_MMX
1381
asm volatile(   EMMS" \n\t"
1382
                SFENCE" \n\t"
1383
                :::"memory");
1384
#endif
1385
}
1386

    
1387
/**
1388
 *
1389
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1390
 * problem for anyone then tell me, and ill fix it)
1391
 * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
1392
 */
1393
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1394
        unsigned int width, unsigned int height,
1395
        unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
1396
{
1397
        unsigned y;
1398
        const unsigned chromWidth= width>>1;
1399
        for(y=0; y<height; y+=2)
1400
        {
1401
#ifdef HAVE_MMX
1402
                asm volatile(
1403
                        "xorl %%eax, %%eax                \n\t"
1404
                        "pcmpeqw %%mm7, %%mm7                \n\t"
1405
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1406
                        ".balign 16                        \n\t"
1407
                        "1:                                \n\t"
1408
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
1409
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // UYVY UYVY(0)
1410
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(4)
1411
                        "movq %%mm0, %%mm2                \n\t" // UYVY UYVY(0)
1412
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(4)
1413
                        "pand %%mm7, %%mm0                \n\t" // U0V0 U0V0(0)
1414
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(4)
1415
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
1416
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
1417
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
1418
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)
1419

    
1420
                        MOVNTQ" %%mm2, (%1, %%eax, 2)        \n\t"
1421

    
1422
                        "movq 16(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(8)
1423
                        "movq 24(%0, %%eax, 4), %%mm2        \n\t" // UYVY UYVY(12)
1424
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(8)
1425
                        "movq %%mm2, %%mm4                \n\t" // UYVY UYVY(12)
1426
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(8)
1427
                        "pand %%mm7, %%mm2                \n\t" // U0V0 U0V0(12)
1428
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
1429
                        "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
1430
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
1431
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)
1432

    
1433
                        MOVNTQ" %%mm3, 8(%1, %%eax, 2)        \n\t"
1434

    
1435
                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
1436
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
1437
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1438
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1439
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
1440
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
1441
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
1442
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)
1443

    
1444
                        MOVNTQ" %%mm0, (%3, %%eax)        \n\t"
1445
                        MOVNTQ" %%mm2, (%2, %%eax)        \n\t"
1446

    
1447
                        "addl $8, %%eax                        \n\t"
1448
                        "cmpl %4, %%eax                        \n\t"
1449
                        " jb 1b                                \n\t"
1450
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
1451
                        : "memory", "%eax"
1452
                );
1453

    
1454
                ydst += lumStride;
1455
                src  += srcStride;
1456

    
1457
                asm volatile(
1458
                        "xorl %%eax, %%eax                \n\t"
1459
                        ".balign 16                        \n\t"
1460
                        "1:                                \n\t"
1461
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
1462
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
1463
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
1464
                        "movq 16(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(8)
1465
                        "movq 24(%0, %%eax, 4), %%mm3        \n\t" // YUYV YUYV(12)
1466
                        "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
1467
                        "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
1468
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
1469
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
1470
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
1471
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)
1472

    
1473
                        MOVNTQ" %%mm0, (%1, %%eax, 2)        \n\t"
1474
                        MOVNTQ" %%mm2, 8(%1, %%eax, 2)        \n\t"
1475

    
1476
                        "addl $8, %%eax                        \n\t"
1477
                        "cmpl %4, %%eax                        \n\t"
1478
                        " jb 1b                                \n\t"
1479

    
1480
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
1481
                        : "memory", "%eax"
1482
                );
1483
#else
1484
                unsigned i;
1485
                for(i=0; i<chromWidth; i++)
1486
                {
1487
                        udst[i]         = src[4*i+0];
1488
                        ydst[2*i+0]         = src[4*i+1];
1489
                        vdst[i]         = src[4*i+2];
1490
                        ydst[2*i+1]         = src[4*i+3];
1491
                }
1492
                ydst += lumStride;
1493
                src  += srcStride;
1494

    
1495
                for(i=0; i<chromWidth; i++)
1496
                {
1497
                        ydst[2*i+0]         = src[4*i+1];
1498
                        ydst[2*i+1]         = src[4*i+3];
1499
                }
1500
#endif
1501
                udst += chromStride;
1502
                vdst += chromStride;
1503
                ydst += lumStride;
1504
                src  += srcStride;
1505
        }
1506
#ifdef HAVE_MMX
1507
asm volatile(   EMMS" \n\t"
1508
                SFENCE" \n\t"
1509
                :::"memory");
1510
#endif
1511
}
1512

    
1513
/**
1514
 *
1515
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
1516
 * problem for anyone then tell me, and ill fix it)
1517
 * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
1518
 */
1519
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1520
        unsigned int width, unsigned int height,
1521
        unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
1522
{
1523
        unsigned y;
1524
        const unsigned chromWidth= width>>1;
1525
#ifdef HAVE_MMX
1526
        for(y=0; y<height-2; y+=2)
1527
        {
1528
                unsigned i;
1529
                for(i=0; i<2; i++)
1530
                {
1531
                        asm volatile(
1532
                                "movl %2, %%eax                        \n\t"
1533
                                "movq "MANGLE(bgr2YCoeff)", %%mm6                \n\t"
1534
                                "movq "MANGLE(w1111)", %%mm5                \n\t"
1535
                                "pxor %%mm7, %%mm7                \n\t"
1536
                                "leal (%%eax, %%eax, 2), %%ebx        \n\t"
1537
                                ".balign 16                        \n\t"
1538
                                "1:                                \n\t"
1539
                                PREFETCH" 64(%0, %%ebx)                \n\t"
1540
                                "movd (%0, %%ebx), %%mm0        \n\t"
1541
                                "movd 3(%0, %%ebx), %%mm1        \n\t"
1542
                                "punpcklbw %%mm7, %%mm0                \n\t"
1543
                                "punpcklbw %%mm7, %%mm1                \n\t"
1544
                                "movd 6(%0, %%ebx), %%mm2        \n\t"
1545
                                "movd 9(%0, %%ebx), %%mm3        \n\t"
1546
                                "punpcklbw %%mm7, %%mm2                \n\t"
1547
                                "punpcklbw %%mm7, %%mm3                \n\t"
1548
                                "pmaddwd %%mm6, %%mm0                \n\t"
1549
                                "pmaddwd %%mm6, %%mm1                \n\t"
1550
                                "pmaddwd %%mm6, %%mm2                \n\t"
1551
                                "pmaddwd %%mm6, %%mm3                \n\t"
1552
#ifndef FAST_BGR2YV12
1553
                                "psrad $8, %%mm0                \n\t"
1554
                                "psrad $8, %%mm1                \n\t"
1555
                                "psrad $8, %%mm2                \n\t"
1556
                                "psrad $8, %%mm3                \n\t"
1557
#endif
1558
                                "packssdw %%mm1, %%mm0                \n\t"
1559
                                "packssdw %%mm3, %%mm2                \n\t"
1560
                                "pmaddwd %%mm5, %%mm0                \n\t"
1561
                                "pmaddwd %%mm5, %%mm2                \n\t"
1562
                                "packssdw %%mm2, %%mm0                \n\t"
1563
                                "psraw $7, %%mm0                \n\t"
1564

    
1565
                                "movd 12(%0, %%ebx), %%mm4        \n\t"
1566
                                "movd 15(%0, %%ebx), %%mm1        \n\t"
1567
                                "punpcklbw %%mm7, %%mm4                \n\t"
1568
                                "punpcklbw %%mm7, %%mm1                \n\t"
1569
                                "movd 18(%0, %%ebx), %%mm2        \n\t"
1570
                                "movd 21(%0, %%ebx), %%mm3        \n\t"
1571
                                "punpcklbw %%mm7, %%mm2                \n\t"
1572
                                "punpcklbw %%mm7, %%mm3                \n\t"
1573
                                "pmaddwd %%mm6, %%mm4                \n\t"
1574
                                "pmaddwd %%mm6, %%mm1                \n\t"
1575
                                "pmaddwd %%mm6, %%mm2                \n\t"
1576
                                "pmaddwd %%mm6, %%mm3                \n\t"
1577
#ifndef FAST_BGR2YV12
1578
                                "psrad $8, %%mm4                \n\t"
1579
                                "psrad $8, %%mm1                \n\t"
1580
                                "psrad $8, %%mm2                \n\t"
1581
                                "psrad $8, %%mm3                \n\t"
1582
#endif
1583
                                "packssdw %%mm1, %%mm4                \n\t"
1584
                                "packssdw %%mm3, %%mm2                \n\t"
1585
                                "pmaddwd %%mm5, %%mm4                \n\t"
1586
                                "pmaddwd %%mm5, %%mm2                \n\t"
1587
                                "addl $24, %%ebx                \n\t"
1588
                                "packssdw %%mm2, %%mm4                \n\t"
1589
                                "psraw $7, %%mm4                \n\t"
1590

    
1591
                                "packuswb %%mm4, %%mm0                \n\t"
1592
                                "paddusb "MANGLE(bgr2YOffset)", %%mm0        \n\t"
1593

    
1594
                                MOVNTQ" %%mm0, (%1, %%eax)        \n\t"
1595
                                "addl $8, %%eax                        \n\t"
1596
                                " js 1b                                \n\t"
1597
                                : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
1598
                                : "%eax", "%ebx"
1599
                        );
1600
                        ydst += lumStride;
1601
                        src  += srcStride;
1602
                }
1603
                src -= srcStride*2;
1604
                asm volatile(
1605
                        "movl %4, %%eax                        \n\t"
1606
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
1607
                        "movq "MANGLE(bgr2UCoeff)", %%mm6                \n\t"
1608
                        "pxor %%mm7, %%mm7                \n\t"
1609
                        "leal (%%eax, %%eax, 2), %%ebx        \n\t"
1610
                        "addl %%ebx, %%ebx                \n\t"
1611
                        ".balign 16                        \n\t"
1612
                        "1:                                \n\t"
1613
                        PREFETCH" 64(%0, %%ebx)                \n\t"
1614
                        PREFETCH" 64(%1, %%ebx)                \n\t"
1615
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1616
                        "movq (%0, %%ebx), %%mm0        \n\t"
1617
                        "movq (%1, %%ebx), %%mm1        \n\t"
1618
                        "movq 6(%0, %%ebx), %%mm2        \n\t"
1619
                        "movq 6(%1, %%ebx), %%mm3        \n\t"
1620
                        PAVGB" %%mm1, %%mm0                \n\t"
1621
                        PAVGB" %%mm3, %%mm2                \n\t"
1622
                        "movq %%mm0, %%mm1                \n\t"
1623
                        "movq %%mm2, %%mm3                \n\t"
1624
                        "psrlq $24, %%mm0                \n\t"
1625
                        "psrlq $24, %%mm2                \n\t"
1626
                        PAVGB" %%mm1, %%mm0                \n\t"
1627
                        PAVGB" %%mm3, %%mm2                \n\t"
1628
                        "punpcklbw %%mm7, %%mm0                \n\t"
1629
                        "punpcklbw %%mm7, %%mm2                \n\t"
1630
#else
1631
                        "movd (%0, %%ebx), %%mm0        \n\t"
1632
                        "movd (%1, %%ebx), %%mm1        \n\t"
1633
                        "movd 3(%0, %%ebx), %%mm2        \n\t"
1634
                        "movd 3(%1, %%ebx), %%mm3        \n\t"
1635
                        "punpcklbw %%mm7, %%mm0                \n\t"
1636
                        "punpcklbw %%mm7, %%mm1                \n\t"
1637
                        "punpcklbw %%mm7, %%mm2                \n\t"
1638
                        "punpcklbw %%mm7, %%mm3                \n\t"
1639
                        "paddw %%mm1, %%mm0                \n\t"
1640
                        "paddw %%mm3, %%mm2                \n\t"
1641
                        "paddw %%mm2, %%mm0                \n\t"
1642
                        "movd 6(%0, %%ebx), %%mm4        \n\t"
1643
                        "movd 6(%1, %%ebx), %%mm1        \n\t"
1644
                        "movd 9(%0, %%ebx), %%mm2        \n\t"
1645
                        "movd 9(%1, %%ebx), %%mm3        \n\t"
1646
                        "punpcklbw %%mm7, %%mm4                \n\t"
1647
                        "punpcklbw %%mm7, %%mm1                \n\t"
1648
                        "punpcklbw %%mm7, %%mm2                \n\t"
1649
                        "punpcklbw %%mm7, %%mm3                \n\t"
1650
                        "paddw %%mm1, %%mm4                \n\t"
1651
                        "paddw %%mm3, %%mm2                \n\t"
1652
                        "paddw %%mm4, %%mm2                \n\t"
1653
                        "psrlw $2, %%mm0                \n\t"
1654
                        "psrlw $2, %%mm2                \n\t"
1655
#endif
1656
                        "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
1657
                        "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
1658

    
1659
                        "pmaddwd %%mm0, %%mm1                \n\t"
1660
                        "pmaddwd %%mm2, %%mm3                \n\t"
1661
                        "pmaddwd %%mm6, %%mm0                \n\t"
1662
                        "pmaddwd %%mm6, %%mm2                \n\t"
1663
#ifndef FAST_BGR2YV12
1664
                        "psrad $8, %%mm0                \n\t"
1665
                        "psrad $8, %%mm1                \n\t"
1666
                        "psrad $8, %%mm2                \n\t"
1667
                        "psrad $8, %%mm3                \n\t"
1668
#endif
1669
                        "packssdw %%mm2, %%mm0                \n\t"
1670
                        "packssdw %%mm3, %%mm1                \n\t"
1671
                        "pmaddwd %%mm5, %%mm0                \n\t"
1672
                        "pmaddwd %%mm5, %%mm1                \n\t"
1673
                        "packssdw %%mm1, %%mm0                \n\t" // V1 V0 U1 U0
1674
                        "psraw $7, %%mm0                \n\t"
1675

    
1676
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1677
                        "movq 12(%0, %%ebx), %%mm4        \n\t"
1678
                        "movq 12(%1, %%ebx), %%mm1        \n\t"
1679
                        "movq 18(%0, %%ebx), %%mm2        \n\t"
1680
                        "movq 18(%1, %%ebx), %%mm3        \n\t"
1681
                        PAVGB" %%mm1, %%mm4                \n\t"
1682
                        PAVGB" %%mm3, %%mm2                \n\t"
1683
                        "movq %%mm4, %%mm1                \n\t"
1684
                        "movq %%mm2, %%mm3                \n\t"
1685
                        "psrlq $24, %%mm4                \n\t"
1686
                        "psrlq $24, %%mm2                \n\t"
1687
                        PAVGB" %%mm1, %%mm4                \n\t"
1688
                        PAVGB" %%mm3, %%mm2                \n\t"
1689
                        "punpcklbw %%mm7, %%mm4                \n\t"
1690
                        "punpcklbw %%mm7, %%mm2                \n\t"
1691
#else
1692
                        "movd 12(%0, %%ebx), %%mm4        \n\t"
1693
                        "movd 12(%1, %%ebx), %%mm1        \n\t"
1694
                        "movd 15(%0, %%ebx), %%mm2        \n\t"
1695
                        "movd 15(%1, %%ebx), %%mm3        \n\t"
1696
                        "punpcklbw %%mm7, %%mm4                \n\t"
1697
                        "punpcklbw %%mm7, %%mm1                \n\t"
1698
                        "punpcklbw %%mm7, %%mm2                \n\t"
1699
                        "punpcklbw %%mm7, %%mm3                \n\t"
1700
                        "paddw %%mm1, %%mm4                \n\t"
1701
                        "paddw %%mm3, %%mm2                \n\t"
1702
                        "paddw %%mm2, %%mm4                \n\t"
1703
                        "movd 18(%0, %%ebx), %%mm5        \n\t"
1704
                        "movd 18(%1, %%ebx), %%mm1        \n\t"
1705
                        "movd 21(%0, %%ebx), %%mm2        \n\t"
1706
                        "movd 21(%1, %%ebx), %%mm3        \n\t"
1707
                        "punpcklbw %%mm7, %%mm5                \n\t"
1708
                        "punpcklbw %%mm7, %%mm1                \n\t"
1709
                        "punpcklbw %%mm7, %%mm2                \n\t"
1710
                        "punpcklbw %%mm7, %%mm3                \n\t"
1711
                        "paddw %%mm1, %%mm5                \n\t"
1712
                        "paddw %%mm3, %%mm2                \n\t"
1713
                        "paddw %%mm5, %%mm2                \n\t"
1714
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
1715
                        "psrlw $2, %%mm4                \n\t"
1716
                        "psrlw $2, %%mm2                \n\t"
1717
#endif
1718
                        "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
1719
                        "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
1720

    
1721
                        "pmaddwd %%mm4, %%mm1                \n\t"
1722
                        "pmaddwd %%mm2, %%mm3                \n\t"
1723
                        "pmaddwd %%mm6, %%mm4                \n\t"
1724
                        "pmaddwd %%mm6, %%mm2                \n\t"
1725
#ifndef FAST_BGR2YV12
1726
                        "psrad $8, %%mm4                \n\t"
1727
                        "psrad $8, %%mm1                \n\t"
1728
                        "psrad $8, %%mm2                \n\t"
1729
                        "psrad $8, %%mm3                \n\t"
1730
#endif
1731
                        "packssdw %%mm2, %%mm4                \n\t"
1732
                        "packssdw %%mm3, %%mm1                \n\t"
1733
                        "pmaddwd %%mm5, %%mm4                \n\t"
1734
                        "pmaddwd %%mm5, %%mm1                \n\t"
1735
                        "addl $24, %%ebx                \n\t"
1736
                        "packssdw %%mm1, %%mm4                \n\t" // V3 V2 U3 U2
1737
                        "psraw $7, %%mm4                \n\t"
1738

    
1739
                        "movq %%mm0, %%mm1                \n\t"
1740
                        "punpckldq %%mm4, %%mm0                \n\t"
1741
                        "punpckhdq %%mm4, %%mm1                \n\t"
1742
                        "packsswb %%mm1, %%mm0                \n\t"
1743
                        "paddb "MANGLE(bgr2UVOffset)", %%mm0        \n\t"
1744

    
1745
                        "movd %%mm0, (%2, %%eax)        \n\t"
1746
                        "punpckhdq %%mm0, %%mm0                \n\t"
1747
                        "movd %%mm0, (%3, %%eax)        \n\t"
1748
                        "addl $4, %%eax                        \n\t"
1749
                        " js 1b                                \n\t"
1750
                        : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
1751
                        : "%eax", "%ebx"
1752
                );
1753

    
1754
                udst += chromStride;
1755
                vdst += chromStride;
1756
                src  += srcStride*2;
1757
        }
1758

    
1759
        asm volatile(   EMMS" \n\t"
1760
                        SFENCE" \n\t"
1761
                        :::"memory");
1762
#else
1763
        y=0;
1764
#endif
1765
        for(; y<height; y+=2)
1766
        {
1767
                unsigned i;
1768
                for(i=0; i<chromWidth; i++)
1769
                {
1770
                        unsigned int b= src[6*i+0];
1771
                        unsigned int g= src[6*i+1];
1772
                        unsigned int r= src[6*i+2];
1773

    
1774
                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1775
                        unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
1776
                        unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
1777

    
1778
                        udst[i]         = U;
1779
                        vdst[i]         = V;
1780
                        ydst[2*i]         = Y;
1781

    
1782
                        b= src[6*i+3];
1783
                        g= src[6*i+4];
1784
                        r= src[6*i+5];
1785

    
1786
                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1787
                        ydst[2*i+1]         = Y;
1788
                }
1789
                ydst += lumStride;
1790
                src  += srcStride;
1791

    
1792
                for(i=0; i<chromWidth; i++)
1793
                {
1794
                        unsigned int b= src[6*i+0];
1795
                        unsigned int g= src[6*i+1];
1796
                        unsigned int r= src[6*i+2];
1797

    
1798
                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1799

    
1800
                        ydst[2*i]         = Y;
1801

    
1802
                        b= src[6*i+3];
1803
                        g= src[6*i+4];
1804
                        r= src[6*i+5];
1805

    
1806
                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1807
                        ydst[2*i+1]         = Y;
1808
                }
1809
                udst += chromStride;
1810
                vdst += chromStride;
1811
                ydst += lumStride;
1812
                src  += srcStride;
1813
        }
1814
}
1815

    
1816
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
1817
                            unsigned width, unsigned height, unsigned src1Stride,
1818
                            unsigned src2Stride, unsigned dstStride){
1819
        unsigned h;
1820

    
1821
        for(h=0; h < height; h++)
1822
        {
1823
                unsigned w;
1824

    
1825
#ifdef HAVE_MMX
1826
#ifdef HAVE_SSE2
1827
                asm(
1828
                        "xorl %%eax, %%eax                \n\t"
1829
                        "1:                                \n\t"
1830
                        PREFETCH" 64(%1, %%eax)                \n\t"
1831
                        PREFETCH" 64(%2, %%eax)                \n\t"
1832
                        "movdqa (%1, %%eax), %%xmm0        \n\t"
1833
                        "movdqa (%1, %%eax), %%xmm1        \n\t"
1834
                        "movdqa (%2, %%eax), %%xmm2        \n\t"
1835
                        "punpcklbw %%xmm2, %%xmm0        \n\t"
1836
                        "punpckhbw %%xmm2, %%xmm1        \n\t"
1837
                        "movntdq %%xmm0, (%0, %%eax, 2)        \n\t"
1838
                        "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
1839
                        "addl $16, %%eax                        \n\t"
1840
                        "cmpl %3, %%eax                        \n\t"
1841
                        " jb 1b                                \n\t"
1842
                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
1843
                        : "memory", "%eax"
1844
                );
1845
#else
1846
                asm(
1847
                        "xorl %%eax, %%eax                \n\t"
1848
                        "1:                                \n\t"
1849
                        PREFETCH" 64(%1, %%eax)                \n\t"
1850
                        PREFETCH" 64(%2, %%eax)                \n\t"
1851
                        "movq (%1, %%eax), %%mm0        \n\t"
1852
                        "movq 8(%1, %%eax), %%mm2        \n\t"
1853
                        "movq %%mm0, %%mm1                \n\t"
1854
                        "movq %%mm2, %%mm3                \n\t"
1855
                        "movq (%2, %%eax), %%mm4        \n\t"
1856
                        "movq 8(%2, %%eax), %%mm5        \n\t"
1857
                        "punpcklbw %%mm4, %%mm0                \n\t"
1858
                        "punpckhbw %%mm4, %%mm1                \n\t"
1859
                        "punpcklbw %%mm5, %%mm2                \n\t"
1860
                        "punpckhbw %%mm5, %%mm3                \n\t"
1861
                        MOVNTQ" %%mm0, (%0, %%eax, 2)        \n\t"
1862
                        MOVNTQ" %%mm1, 8(%0, %%eax, 2)        \n\t"
1863
                        MOVNTQ" %%mm2, 16(%0, %%eax, 2)        \n\t"
1864
                        MOVNTQ" %%mm3, 24(%0, %%eax, 2)        \n\t"
1865
                        "addl $16, %%eax                        \n\t"
1866
                        "cmpl %3, %%eax                        \n\t"
1867
                        " jb 1b                                \n\t"
1868
                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
1869
                        : "memory", "%eax"
1870
                );
1871
#endif
1872
                for(w= (width&(~15)); w < width; w++)
1873
                {
1874
                        dest[2*w+0] = src1[w];
1875
                        dest[2*w+1] = src2[w];
1876
                }
1877
#else
1878
                for(w=0; w < width; w++)
1879
                {
1880
                        dest[2*w+0] = src1[w];
1881
                        dest[2*w+1] = src2[w];
1882
                }
1883
#endif
1884
                dest += dstStride;
1885
                src1 += src1Stride;
1886
                src2 += src2Stride;
1887
        }
1888
#ifdef HAVE_MMX
1889
        asm(
1890
                EMMS" \n\t"
1891
                SFENCE" \n\t"
1892
                ::: "memory"
1893
                );
1894
#endif
1895
}