Statistics
| Branch: | Revision:

ffmpeg / libswscale / rgb2rgb_template.c @ 4fadc2b4

History | View | Annotate | Download (67.6 KB)

1
/*
2
 *
3
 *  rgb2rgb.c, Software RGB to RGB convertor
4
 *  pluralize by Software PAL8 to RGB convertor
5
 *               Software YUV to YUV convertor
6
 *               Software YUV to RGB convertor
7
 *  Written by Nick Kurshev.
8
 *  palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
9
 *  lots of big-endian byte-order fixes by Alex Beregszaszi
10
 *
11
 * This program is free software; you can redistribute it and/or modify
12
 * it under the terms of the GNU General Public License as published by
13
 * the Free Software Foundation; either version 2 of the License, or
14
 * (at your option) any later version.
15
 *
16
 * This program is distributed in the hope that it will be useful,
17
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19
 * GNU General Public License for more details.
20
 *
21
 * You should have received a copy of the GNU General Public License
22
 * along with this program; if not, write to the Free Software
23
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24
 */
25

    
26
#include <stddef.h>
27
#include <inttypes.h> /* for __WORDSIZE */
28

    
29
#ifndef __WORDSIZE
// #warning You have misconfigured system and probably will lose performance!
#define __WORDSIZE MP_WORDSIZE
#endif

/*
 * Drop any previous definitions of the instruction-selection macros before
 * redefining them for the current HAVE_* configuration (presumably because
 * this template is included more than once -- confirm against the includer).
 */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

/* Size in bytes of the widest SIMD register selected by the configuration. */
#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/*
 * Prefetch / byte-average mnemonics.  Without 3DNow! or MMX2 there is no
 * prefetch instruction, so PREFETCH/PREFETCHW expand to an assembler comment
 * that swallows the operand text following the macro.  '#' is used because
 * '/' is not a comment character for every assembler (the old code needed an
 * __APPLE__ special case for exactly that reason).
 */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/*
 * Store mnemonics: MMX2 provides the non-temporal movntq plus sfence;
 * otherwise fall back to a plain movq and an assembler comment.
 */
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
84

    
85
/**
 * Expand packed 3-byte pixels into 4-byte pixels.
 *
 * Without WORDS_BIGENDIAN every source pixel is copied unchanged and followed
 * by one zero byte; with WORDS_BIGENDIAN a zero byte is written first and the
 * three source bytes follow in reverse order.
 *
 * @param src      source buffer, 3 bytes per pixel
 * @param dst      destination buffer, 4 bytes per converted pixel
 * @param src_size number of source bytes; a trailing partial pixel (fewer
 *                 than 3 bytes) is ignored instead of being read past the end
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
  /* mm7 = mask32 (defined outside this block), ANDed into every output quadword */
  __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
  /* Each iteration converts 8 pixels (24 bytes), but the last load
     ("punpckldq 21%1") touches bytes 21..24, so 25 readable bytes are
     required; the scalar loop below finishes whatever is left. */
  while(end - s >= 25)
  {
    __asm __volatile(
        PREFETCH"        32%1\n\t"
        "movd        %1, %%mm0\n\t"
        "punpckldq 3%1, %%mm0\n\t"
        "movd        6%1, %%mm1\n\t"
        "punpckldq 9%1, %%mm1\n\t"
        "movd        12%1, %%mm2\n\t"
        "punpckldq 15%1, %%mm2\n\t"
        "movd        18%1, %%mm3\n\t"
        "punpckldq 21%1, %%mm3\n\t"
        "pand        %%mm7, %%mm0\n\t"
        "pand        %%mm7, %%mm1\n\t"
        "pand        %%mm7, %%mm2\n\t"
        "pand        %%mm7, %%mm3\n\t"
        MOVNTQ"        %%mm0, %0\n\t"
        MOVNTQ"        %%mm1, 8%0\n\t"
        MOVNTQ"        %%mm2, 16%0\n\t"
        MOVNTQ"        %%mm3, 24%0"
        :"=m"(*dest)
        :"m"(*s)
        :"memory");
    dest += 32;
    s += 24;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Scalar path: only whole pixels are converted. */
  while(end - s >= 3)
  {
#ifdef WORDS_BIGENDIAN
    /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
    *dest++ = 0;
    *dest++ = s[2];
    *dest++ = s[1];
    *dest++ = s[0];
    s+=3;
#else
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = 0;
#endif
  }
}
144

    
145
/**
 * Pack 4-byte pixels into 3-byte pixels by dropping one byte per pixel.
 *
 * Without WORDS_BIGENDIAN the first three bytes of each source pixel are
 * copied and the fourth is skipped; with WORDS_BIGENDIAN the first byte is
 * skipped and the remaining three are written in reverse order.
 *
 * @param src      source buffer, 4 bytes per pixel
 * @param dst      destination buffer, 3 bytes per converted pixel
 * @param src_size number of source bytes; a trailing partial pixel (fewer
 *                 than 4 bytes) is ignored instead of being read past the end
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
  /* One iteration reads 32 source bytes (8 pixels) and stores 24 bytes.
     The mask24* constants are defined outside this block. */
  while(end - s >= 32)
  {
    __asm __volatile(
        PREFETCH"        32%1\n\t"
        "movq        %1, %%mm0\n\t"
        "movq        8%1, %%mm1\n\t"
        "movq        16%1, %%mm4\n\t"
        "movq        24%1, %%mm5\n\t"
        "movq        %%mm0, %%mm2\n\t"
        "movq        %%mm1, %%mm3\n\t"
        "movq        %%mm4, %%mm6\n\t"
        "movq        %%mm5, %%mm7\n\t"
        "psrlq        $8, %%mm2\n\t"
        "psrlq        $8, %%mm3\n\t"
        "psrlq        $8, %%mm6\n\t"
        "psrlq        $8, %%mm7\n\t"
        "pand        %2, %%mm0\n\t"
        "pand        %2, %%mm1\n\t"
        "pand        %2, %%mm4\n\t"
        "pand        %2, %%mm5\n\t"
        "pand        %3, %%mm2\n\t"
        "pand        %3, %%mm3\n\t"
        "pand        %3, %%mm6\n\t"
        "pand        %3, %%mm7\n\t"
        "por        %%mm2, %%mm0\n\t"
        "por        %%mm3, %%mm1\n\t"
        "por        %%mm6, %%mm4\n\t"
        "por        %%mm7, %%mm5\n\t"

        "movq        %%mm1, %%mm2\n\t"
        "movq        %%mm4, %%mm3\n\t"
        "psllq        $48, %%mm2\n\t"
        "psllq        $32, %%mm3\n\t"
        "pand        %4, %%mm2\n\t"
        "pand        %5, %%mm3\n\t"
        "por        %%mm2, %%mm0\n\t"
        "psrlq        $16, %%mm1\n\t"
        "psrlq        $32, %%mm4\n\t"
        "psllq        $16, %%mm5\n\t"
        "por        %%mm3, %%mm1\n\t"
        "pand        %6, %%mm5\n\t"
        "por        %%mm5, %%mm4\n\t"

        MOVNTQ"        %%mm0, %0\n\t"
        MOVNTQ"        %%mm1, 8%0\n\t"
        MOVNTQ"        %%mm4, 16%0"
        :"=m"(*dest)
        :"m"(*s),"m"(mask24l),
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
    dest += 24;
    s += 32;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Scalar path: only whole pixels are converted. */
  while(end - s >= 4)
  {
#ifdef WORDS_BIGENDIAN
    /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
    s++;
    dest[2] = *s++;
    dest[1] = *s++;
    dest[0] = *s++;
    dest += 3;
#else
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = *s++;
    s++;
#endif
  }
}
230

    
231
/*
232
 Original by Strepto/Astral
233
 ported to gcc & bugfixed : A'rpi
234
 MMX2, 3DNOW optimization by Nick Kurshev
235
 32bit c version, and and&add trick by Michael Niedermayer
236
*/
237
/**
 * Convert 16-bit words from a 15-bit layout to a 16-bit layout.
 *
 * For every 16-bit word x the result is (x & 0x7FFF) + (x & 0x7FE0): bits
 * 5..14 are moved up by one position while bits 0..4 stay in place.
 *
 * @param src      source buffer, 2 bytes per pixel
 * @param dst      destination buffer, 2 bytes per pixel
 * @param src_size number of source bytes; a trailing odd byte is ignored
 *
 * NOTE(review): the scalar path accesses src/dst through uint32_t/uint16_t
 * pointers, so it presumably expects suitably aligned buffers -- confirm
 * against the callers.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
{
  const uint8_t *s = src;
  uint8_t *d = dst;
  const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
  /* mm4 = mask15s (defined outside this block) */
  __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
  /* 16 bytes (8 pixels) per iteration */
  while(end - s >= 16)
  {
        __asm __volatile(
                PREFETCH"        32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        8%1, %%mm2\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm2, %%mm3\n\t"
                "pand        %%mm4, %%mm0\n\t"
                "pand        %%mm4, %%mm2\n\t"
                "paddw        %%mm1, %%mm0\n\t"
                "paddw        %%mm3, %%mm2\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm2, 8%0"
                :"=m"(*d)
                :"m"(*s)
                /* the asm reads and writes 16 bytes, far more than the
                   one-byte operands describe, so tell the compiler */
                :"memory"
                );
        d+=16;
        s+=16;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
    /* two pixels at a time; no carry can cross the 16-bit boundary because
       bit 15 of each half is masked off in both addends */
    while(end - s >= 4)
    {
        unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* at most one whole pixel remains */
    if(end - s >= 2)
    {
        unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
285

    
286
/**
 * Convert 16-bit words from a 16-bit layout to a 15-bit layout.
 *
 * For every 16-bit word x the result is ((x >> 1) & 0x7FE0) | (x & 0x001F):
 * bits 6..15 are moved down by one position, bit 5 is dropped and bits 0..4
 * stay in place.
 *
 * @param src      source buffer, 2 bytes per pixel
 * @param dst      destination buffer, 2 bytes per pixel
 * @param src_size number of source bytes; a trailing odd byte is ignored
 *
 * NOTE(review): the scalar path accesses src/dst through uint32_t/uint16_t
 * pointers, so it presumably expects suitably aligned buffers -- confirm
 * against the callers.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
{
  const uint8_t *s = src;
  uint8_t *d = dst;
  const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
  /* mm7 = mask15rg, mm6 = mask15b (both defined outside this block) */
  __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
  __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
  /* 16 bytes (8 pixels) per iteration */
  while(end - s >= 16)
  {
        __asm __volatile(
                PREFETCH"        32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        8%1, %%mm2\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm2, %%mm3\n\t"
                "psrlq        $1, %%mm0\n\t"
                "psrlq        $1, %%mm2\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm3\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm3, %%mm2\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm2, 8%0"
                :"=m"(*d)
                :"m"(*s)
                /* the asm reads and writes 16 bytes, far more than the
                   one-byte operands describe, so tell the compiler */
                :"memory"
                );
        d+=16;
        s+=16;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
    /* two pixels at a time */
    while(end - s >= 4)
    {
        uint32_t x= *((const uint32_t *)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* at most one whole pixel remains */
    if(end - s >= 2)
    {
        uint16_t x= *((const uint16_t *)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}
341

    
342
/**
 * Convert 32-bit pixels to 16-bit pixels.
 *
 * For each 32-bit source value (read in native byte order) the output word is
 * built from the top 5 bits of bits 0..7 (-> bits 0..4), the top 6 bits of
 * bits 8..15 (-> bits 5..10) and the top 5 bits of bits 16..23
 * (-> bits 11..15).  Bits 24..31 are ignored.
 *
 * @param src      source buffer, 4 bytes per pixel
 * @param dst      destination buffer, 2 bytes per converted pixel
 * @param src_size number of source bytes; a trailing partial pixel (fewer
 *                 than 4 bytes) is ignored
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
        /* Converts 4 pixels (16 source bytes -> 8 destination bytes) per pass.
           The loop condition sits at label 2 and is entered first via
           "jmp 2f", so inputs shorter than 16 bytes skip the body entirely
           instead of being read/written out of bounds by a first
           unconditional pass.  mask3216g, mask3216br and mul3216 are
           defined outside this block. */
        __asm __volatile(
                "movq %3, %%mm5                        \n\t"
                "movq %4, %%mm6                        \n\t"
                "movq %5, %%mm7                        \n\t"
                "jmp 2f                                \n\t"
                ASMALIGN(4)
                "1:                                \n\t"
                PREFETCH" 32(%1)                \n\t"
                "movd        (%1), %%mm0                \n\t"
                "movd        4(%1), %%mm3                \n\t"
                "punpckldq 8(%1), %%mm0                \n\t"
                "punpckldq 12(%1), %%mm3        \n\t"
                "movq %%mm0, %%mm1                \n\t"
                "movq %%mm3, %%mm4                \n\t"
                "pand %%mm6, %%mm0                \n\t"
                "pand %%mm6, %%mm3                \n\t"
                "pmaddwd %%mm7, %%mm0                \n\t"
                "pmaddwd %%mm7, %%mm3                \n\t"
                "pand %%mm5, %%mm1                \n\t"
                "pand %%mm5, %%mm4                \n\t"
                "por %%mm1, %%mm0                \n\t"
                "por %%mm4, %%mm3                \n\t"
                "psrld $5, %%mm0                \n\t"
                "pslld $11, %%mm3                \n\t"
                "por %%mm3, %%mm0                \n\t"
                MOVNTQ"        %%mm0, (%0)                \n\t"
                "add $16, %1                        \n\t"
                "add $8, %0                        \n\t"
                "2:                                \n\t"
                "cmp %2, %1                        \n\t"
                " jb 1b                                \n\t"
                : "+r" (d), "+r"(s)
                : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
                /* src is read and dst written through plain registers */
                : "memory"
        );
#else
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $5, %%mm1\n\t"
                "psrlq        $5, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 16;
        }
#endif
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: only whole pixels are converted. */
        while(end - s >= 4)
        {
                const int rgb = *(const uint32_t*)s; s += 4;
                *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
        }
}
437

    
438
/**
 * Convert 32-bit pixels to 16-bit pixels with the outer channels swapped.
 *
 * For each 32-bit source value (read in native byte order) the output word
 * takes the top 5 bits of bits 0..7 into bits 11..15, the top 6 bits of
 * bits 8..15 into bits 5..10 and the top 5 bits of bits 16..23 into
 * bits 0..4.
 *
 * @param src      source buffer, 4 bytes per pixel
 * @param dst      destination buffer, 2 bytes per converted pixel
 * @param src_size number of source bytes
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint8_t *in = src;
        const uint8_t *const in_end = src + src_size;
        uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
        /* the vector loop needs 16 readable bytes per pass */
        const uint8_t *const simd_end = in_end - 15;

        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        /* mm7/mm6 hold red_16mask/green_16mask (defined outside this block) */
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        for(; in < simd_end; out += 4, in += 16)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psllq        $8, %%mm0\n\t"
                "psllq        $8, %%mm3\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm3\n\t"
                "psrlq        $5, %%mm1\n\t"
                "psrlq        $5, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $19, %%mm2\n\t"
                "psrlq        $19, %%mm5\n\t"
                "pand        %2, %%mm2\n\t"
                "pand        %2, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*out):"m"(*in),"m"(blue_16mask):"memory");
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar path for everything the vector loop did not consume */
        for(; in < in_end; in += 4)
        {
                const int px = *(uint32_t*)in;
                const int hi = (px & 0xF8) << 8;
                const int mid = (px & 0xFC00) >> 5;
                const int lo = (px & 0xF80000) >> 19;
                *out++ = hi + mid + lo;
        }
}
498

    
499
/**
 * Convert 32-bit pixels to 15-bit pixels stored in 16-bit words.
 *
 * For each 32-bit source value (read in native byte order) the output word is
 * built from the top 5 bits of bits 0..7 (-> bits 0..4), the top 5 bits of
 * bits 8..15 (-> bits 5..9) and the top 5 bits of bits 16..23
 * (-> bits 10..14).  Bits 24..31 are ignored.
 *
 * @param src      source buffer, 4 bytes per pixel
 * @param dst      destination buffer, 2 bytes per converted pixel
 * @param src_size number of source bytes; a trailing partial pixel (fewer
 *                 than 4 bytes) is ignored
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
        /* Converts 4 pixels (16 source bytes -> 8 destination bytes) per pass.
           The loop condition sits at label 2 and is entered first via
           "jmp 2f", so inputs shorter than 16 bytes skip the body entirely
           instead of being read/written out of bounds by a first
           unconditional pass.  mask3215g, mask3216br and mul3215 are
           defined outside this block. */
        __asm __volatile(
                "movq %3, %%mm5                        \n\t"
                "movq %4, %%mm6                        \n\t"
                "movq %5, %%mm7                        \n\t"
                "jmp 2f                                \n\t"
                ASMALIGN(4)
                "1:                                \n\t"
                PREFETCH" 32(%1)                \n\t"
                "movd        (%1), %%mm0                \n\t"
                "movd        4(%1), %%mm3                \n\t"
                "punpckldq 8(%1), %%mm0                \n\t"
                "punpckldq 12(%1), %%mm3        \n\t"
                "movq %%mm0, %%mm1                \n\t"
                "movq %%mm3, %%mm4                \n\t"
                "pand %%mm6, %%mm0                \n\t"
                "pand %%mm6, %%mm3                \n\t"
                "pmaddwd %%mm7, %%mm0                \n\t"
                "pmaddwd %%mm7, %%mm3                \n\t"
                "pand %%mm5, %%mm1                \n\t"
                "pand %%mm5, %%mm4                \n\t"
                "por %%mm1, %%mm0                \n\t"
                "por %%mm4, %%mm3                \n\t"
                "psrld $6, %%mm0                \n\t"
                "pslld $10, %%mm3                \n\t"
                "por %%mm3, %%mm0                \n\t"
                MOVNTQ"        %%mm0, (%0)                \n\t"
                "add $16, %1                        \n\t"
                "add $8, %0                        \n\t"
                "2:                                \n\t"
                "cmp %2, %1                        \n\t"
                " jb 1b                                \n\t"
                : "+r" (d), "+r"(s)
                : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
                /* src is read and dst written through plain registers */
                : "memory"
        );
#else
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $9, %%mm2\n\t"
                "psrlq        $9, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 16;
        }
#endif
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: only whole pixels are converted. */
        while(end - s >= 4)
        {
                const int rgb = *(const uint32_t*)s; s += 4;
                *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
        }
}
594

    
595
/**
 * Convert 32-bit pixels to 15-bit pixels with the outer channels swapped.
 *
 * For each 32-bit source value (read in native byte order) the output word
 * takes the top 5 bits of bits 0..7 into bits 10..14, the top 5 bits of
 * bits 8..15 into bits 5..9 and the top 5 bits of bits 16..23 into
 * bits 0..4.
 *
 * @param src      source buffer, 4 bytes per pixel
 * @param dst      destination buffer, 2 bytes per converted pixel
 * @param src_size number of source bytes
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint8_t *in = src;
        const uint8_t *const in_end = src + src_size;
        uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
        /* the vector loop needs 16 readable bytes per pass */
        const uint8_t *const simd_end = in_end - 15;

        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        /* mm7/mm6 hold red_15mask/green_15mask (defined outside this block) */
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        for(; in < simd_end; out += 4, in += 16)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        4%1, %%mm3\n\t"
                "punpckldq 8%1, %%mm0\n\t"
                "punpckldq 12%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psllq        $7, %%mm0\n\t"
                "psllq        $7, %%mm3\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $19, %%mm2\n\t"
                "psrlq        $19, %%mm5\n\t"
                "pand        %2, %%mm2\n\t"
                "pand        %2, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*out):"m"(*in),"m"(blue_15mask):"memory");
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar path for everything the vector loop did not consume */
        for(; in < in_end; in += 4)
        {
                const int px = *(uint32_t*)in;
                const int hi = (px & 0xF8) << 7;
                const int mid = (px & 0xF800) >> 6;
                const int lo = (px & 0xF80000) >> 19;
                *out++ = hi + mid + lo;
        }
}
655

    
656
/**
 * Convert packed 3-byte pixels to 16-bit pixels.
 *
 * For each pixel the first source byte supplies bits 0..4 (its top 5 bits),
 * the second byte bits 5..10 (its top 6 bits) and the third byte bits 11..15
 * (its top 5 bits) of the output word.
 *
 * @param src      source buffer, 3 bytes per pixel
 * @param dst      destination buffer, 2 bytes per converted pixel
 * @param src_size number of source bytes; a trailing partial pixel (fewer
 *                 than 3 bytes) is ignored instead of being read past the end
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        /* mm7/mm6 hold red_16mask/green_16mask (defined outside this block) */
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        /* Each pass converts 4 pixels (12 bytes), but the last load
           ("punpckldq 9%1") touches bytes 9..12, so 13 readable bytes are
           required; the scalar loop below finishes whatever is left. */
        while(end - s >= 13)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $5, %%mm1\n\t"
                "psrlq        $5, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: only whole pixels are converted. */
        while(end - s >= 3)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
}
718

    
719
/**
 * Convert packed 3-byte pixels to 16-bit pixels with the outer channels
 * swapped.
 *
 * For each pixel the first source byte supplies bits 11..15 (its top 5
 * bits), the second byte bits 5..10 (its top 6 bits) and the third byte
 * bits 0..4 (its top 5 bits) of the output word.
 *
 * @param src      source buffer, 3 bytes per pixel
 * @param dst      destination buffer, 2 bytes per converted pixel
 * @param src_size number of source bytes
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint8_t *in = src;
        const uint8_t *const in_end = src + src_size;
        uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
        /* loop bound used by the vector path; each pass consumes 12 bytes */
        const uint8_t *const simd_end = in_end - 15;

        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        /* mm7/mm6 hold red_16mask/green_16mask (defined outside this block) */
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        for(; in < simd_end; out += 4, in += 12)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psllq        $8, %%mm0\n\t"
                "psllq        $8, %%mm3\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm3\n\t"
                "psrlq        $5, %%mm1\n\t"
                "psrlq        $5, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $19, %%mm2\n\t"
                "psrlq        $19, %%mm5\n\t"
                "pand        %2, %%mm2\n\t"
                "pand        %2, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*out):"m"(*in),"m"(blue_16mask):"memory");
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar path for everything the vector loop did not consume */
        while(in < in_end)
        {
                const int first = in[0];
                const int second = in[1];
                const int third = in[2];
                in += 3;
                *out++ = (third>>3) | ((second&0xFC)<<3) | ((first&0xF8)<<8);
        }
}
781

    
782
/**
 * Convert packed 3-byte pixels to 15-bit pixels stored in 16-bit words.
 *
 * For each pixel the first source byte supplies bits 0..4, the second byte
 * bits 5..9 and the third byte bits 10..14 of the output word (the top 5
 * bits of each byte are kept).
 *
 * @param src      source buffer, 3 bytes per pixel
 * @param dst      destination buffer, 2 bytes per converted pixel
 * @param src_size number of source bytes; a trailing partial pixel (fewer
 *                 than 3 bytes) is ignored instead of being read past the end
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        /* mm7/mm6 hold red_15mask/green_15mask (defined outside this block) */
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /* Each pass converts 4 pixels (12 bytes), but the last load
           ("punpckldq 9%1") touches bytes 9..12, so 13 readable bytes are
           required; the scalar loop below finishes whatever is left. */
        while(end - s >= 13)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $9, %%mm2\n\t"
                "psrlq        $9, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: only whole pixels are converted. */
        while(end - s >= 3)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
}
844

    
845
/**
 * Convert packed 24-bit pixels (3 bytes each) to 15-bit 5-5-5 pixels.
 *
 * For every source pixel the top five bits of the first byte land in bits
 * 10-14 of the output word, those of the second byte in bits 5-9 and those
 * of the third byte in bits 0-4.
 *
 * @param src      source buffer, 3 bytes per pixel
 * @param dst      destination buffer, written as native uint16_t values
 * @param src_size size of src in bytes; a trailing partial pixel (fewer than
 *                 3 bytes) is ignored
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        /* keep the red and green masks in registers for the whole loop */
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /* each iteration consumes 12 source bytes (loads at offsets 0, 3, 6, 9) */
        mm_end = end - 15;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psllq        $7, %%mm0\n\t"
                "psllq        $7, %%mm3\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $19, %%mm2\n\t"
                "psrlq        $19, %%mm5\n\t"
                "pand        %2, %%mm2\n\t"
                "pand        %2, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar tail: only convert while a whole 3-byte pixel remains, so a
           size that is not a multiple of 3 cannot read or write out of bounds. */
        while(end - s >= 3)
        {
                const int r= *s++;
                const int g= *s++;
                const int b= *s++;
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
}
907

    
908
/*
  I use a less accurate approximation here: simply left-shifting the input
  value and filling the low-order bits with zeroes. This method improves
  PNG compression, but it cannot reproduce white exactly, since it does not
  generate an all-ones maximum value; the net effect is to darken the
  image slightly.

  The better method would be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      Leftmost Bits Repeated to Fill Open Bits
       |
   Original Bits
*/
931
/**
 * Expand 15-bit 5-5-5 pixels to packed 24-bit pixels (3 bytes each).
 *
 * For every source word, bits 0-4 go to the first output byte, bits 5-9 to
 * the second and bits 10-14 to the third. Each 5-bit field is placed in the
 * top of its byte; the low three bits are left zero.
 *
 * @param src      source buffer, read as native uint16_t values
 * @param dst      destination buffer, 3 bytes written per source pixel
 * @param src_size size of src in bytes (src_size/2 pixels are converted)
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint16_t *end;
#ifdef HAVE_MMX
        const uint16_t *mm_end;
#endif
        uint8_t *d = (uint8_t *)dst;
        /* keep the const qualifier of src, as the sibling converters do */
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
        /* each iteration converts 8 source pixels into 24 output bytes */
        mm_end = end - 7;
        while(s < mm_end)
        {
            /* stage 1: widen 2 x 4 pixels; first group is parked in mm6/mm7,
               second group is left in mm0/mm3 */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $2, %%mm1\n\t"
                "psrlq        $7, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"

                "movq        %%mm0, %%mm6\n\t"
                "movq        %%mm3, %%mm7\n\t"

                "movq        8%1, %%mm0\n\t"
                "movq        8%1, %%mm1\n\t"
                "movq        8%1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $2, %%mm1\n\t"
                "psrlq        $7, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"

                :"=m"(*d)
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
                :"memory");
            /* Borrowed 32 to 24 */
            /* stage 2: relies on the MMX registers left by stage 1 and stores
               the three packed quadwords */
            __asm __volatile(
                "movq        %%mm0, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "movq        %%mm6, %%mm0\n\t"
                "movq        %%mm7, %%mm1\n\t"

                "movq        %%mm4, %%mm6\n\t"
                "movq        %%mm5, %%mm7\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm1, %%mm3\n\t"

                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm3\n\t"
                "psrlq        $8, %%mm6\n\t"
                "psrlq        $8, %%mm7\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm1\n\t"
                "pand        %2, %%mm4\n\t"
                "pand        %2, %%mm5\n\t"
                "pand        %3, %%mm2\n\t"
                "pand        %3, %%mm3\n\t"
                "pand        %3, %%mm6\n\t"
                "pand        %3, %%mm7\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm3, %%mm1\n\t"
                "por        %%mm6, %%mm4\n\t"
                "por        %%mm7, %%mm5\n\t"

                "movq        %%mm1, %%mm2\n\t"
                "movq        %%mm4, %%mm3\n\t"
                "psllq        $48, %%mm2\n\t"
                "psllq        $32, %%mm3\n\t"
                "pand        %4, %%mm2\n\t"
                "pand        %5, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psrlq        $16, %%mm1\n\t"
                "psrlq        $32, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm3, %%mm1\n\t"
                "pand        %6, %%mm5\n\t"
                "por        %%mm5, %%mm4\n\t"

                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm1, 8%0\n\t"
                MOVNTQ"        %%mm4, 16%0"

                :"=m"(*d)
                :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
                :"memory");
                d += 24;
                s += 8;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar path: remaining pixels (all of them without MMX) */
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x3E0)>>2;
                *d++ = (bgr&0x7C00)>>7;
        }
}
1072

    
1073
/**
 * Expand 16-bit 5-6-5 pixels to packed 24-bit pixels (3 bytes each).
 *
 * For every source word, bits 0-4 go to the first output byte, bits 5-10 to
 * the second and bits 11-15 to the third, each placed in the top of its
 * byte with the low bits left zero.
 *
 * @param src      source buffer, read as native uint16_t values
 * @param dst      destination buffer, 3 bytes written per source pixel
 * @param src_size size of src in bytes (src_size/2 pixels are converted)
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint16_t *in = (const uint16_t *)src;
        const uint16_t *const in_end = in + src_size/2;
        uint8_t *out = dst;
#ifdef HAVE_MMX
        /* one MMX iteration eats 8 source pixels and emits 24 bytes */
        const uint16_t *const mmx_stop = in_end - 7;

        __asm __volatile(PREFETCH"        %0"::"m"(*in):"memory");
        for(; in < mmx_stop; in += 8, out += 24)
        {
            /* first half: widen the pixels; results stay in MMX registers */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm1\n\t"
                "psrlq        $8, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"

                "movq        %%mm0, %%mm6\n\t"
                "movq        %%mm3, %%mm7\n\t"

                "movq        8%1, %%mm0\n\t"
                "movq        8%1, %%mm1\n\t"
                "movq        8%1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm1\n\t"
                "psrlq        $8, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"
                :"=m"(*out)
                :"m"(*in),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
                :"memory");
            /* Borrowed 32 to 24 */
            /* second half: pack what the first half left in the registers */
            __asm __volatile(
                "movq        %%mm0, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "movq        %%mm6, %%mm0\n\t"
                "movq        %%mm7, %%mm1\n\t"

                "movq        %%mm4, %%mm6\n\t"
                "movq        %%mm5, %%mm7\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm1, %%mm3\n\t"

                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm3\n\t"
                "psrlq        $8, %%mm6\n\t"
                "psrlq        $8, %%mm7\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm1\n\t"
                "pand        %2, %%mm4\n\t"
                "pand        %2, %%mm5\n\t"
                "pand        %3, %%mm2\n\t"
                "pand        %3, %%mm3\n\t"
                "pand        %3, %%mm6\n\t"
                "pand        %3, %%mm7\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm3, %%mm1\n\t"
                "por        %%mm6, %%mm4\n\t"
                "por        %%mm7, %%mm5\n\t"

                "movq        %%mm1, %%mm2\n\t"
                "movq        %%mm4, %%mm3\n\t"
                "psllq        $48, %%mm2\n\t"
                "psllq        $32, %%mm3\n\t"
                "pand        %4, %%mm2\n\t"
                "pand        %5, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psrlq        $16, %%mm1\n\t"
                "psrlq        $32, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm3, %%mm1\n\t"
                "pand        %6, %%mm5\n\t"
                "por        %%mm5, %%mm4\n\t"

                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm1, 8%0\n\t"
                MOVNTQ"        %%mm4, 16%0"

                :"=m"(*out)
                :"m"(*in),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
                :"memory");
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* plain C for whatever is left (everything when MMX is disabled) */
        for(; in < in_end; in++)
        {
                const uint16_t px = *in;
                out[0] = (px&0x1F)<<3;
                out[1] = (px&0x7E0)>>3;
                out[2] = (px&0xF800)>>8;
                out += 3;
        }
}
1213

    
1214
/**
 * Expand 15-bit 5-5-5 pixels to 32-bit pixels (4 bytes each).
 *
 * Bits 0-4, 5-9 and 10-14 of every source word each become the top five
 * bits of one output byte; the fourth byte is written as zero. With
 * WORDS_BIGENDIAN the zero byte comes first and the component order is
 * reversed.
 *
 * @param src      source buffer, read as native uint16_t values
 * @param dst      destination buffer, 4 bytes written per source pixel
 * @param src_size size of src in bytes (src_size/2 pixels are converted)
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint16_t *in = (const uint16_t *)src;
        const uint16_t *const in_end = in + src_size/2;
        uint8_t *out = dst;
#ifdef HAVE_MMX
        /* one MMX iteration eats 4 source pixels and emits 16 bytes */
        const uint16_t *const mmx_stop = in_end - 3;

        __asm __volatile(PREFETCH"        %0"::"m"(*in):"memory");
        /* mm7 = 0, used as the zero half when widening words to dwords */
        __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
        for(; in < mmx_stop; in += 4, out += 16)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $2, %%mm1\n\t"
                "psrlq        $7, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %%mm7, %%mm0\n\t"
                "punpcklwd %%mm7, %%mm1\n\t"
                "punpcklwd %%mm7, %%mm2\n\t"
                "punpckhwd %%mm7, %%mm3\n\t"
                "punpckhwd %%mm7, %%mm4\n\t"
                "punpckhwd %%mm7, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm3, 8%0\n\t"
                :"=m"(*out)
                :"m"(*in),"m"(mask15b),"m"(mask15g),"m"(mask15r)
                :"memory");
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Plain C for whatever is left (everything when MMX is disabled).
           Bytes are stored one at a time; a variant building one 32-bit
           value per pixel was noted as slightly slower on Athlon and was
           already disabled. */
        for(; in < in_end; in++)
        {
                const uint16_t px = *in;
#ifdef WORDS_BIGENDIAN
                out[0] = 0;
                out[1] = (px&0x7C00)>>7;
                out[2] = (px&0x3E0)>>2;
                out[3] = (px&0x1F)<<3;
#else
                out[0] = (px&0x1F)<<3;
                out[1] = (px&0x3E0)>>2;
                out[2] = (px&0x7C00)>>7;
                out[3] = 0;
#endif
                out += 4;
        }
}
1291

    
1292
/**
 * Expand 16-bit 5-6-5 pixels to 32-bit pixels (4 bytes each).
 *
 * Bits 0-4, 5-10 and 11-15 of every source word each become the top bits of
 * one output byte; the fourth byte is written as zero. With WORDS_BIGENDIAN
 * the zero byte comes first and the component order is reversed.
 *
 * @param src      source buffer, read as native uint16_t values
 * @param dst      destination buffer, 4 bytes written per source pixel
 * @param src_size size of src in bytes (src_size/2 pixels are converted)
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
        const uint16_t *end;
#ifdef HAVE_MMX
        const uint16_t *mm_end;
#endif
        uint8_t *d = (uint8_t *)dst;
        /* keep the const qualifier of src, as the sibling converters do */
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
        /* mm7 = 0, used as the zero half when widening words to dwords */
        __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
        /* each iteration converts 4 source pixels into 16 output bytes */
        mm_end = end - 3;
        while(s < mm_end)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm1\n\t"
                "psrlq        $8, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %%mm7, %%mm0\n\t"
                "punpcklwd %%mm7, %%mm1\n\t"
                "punpcklwd %%mm7, %%mm2\n\t"
                "punpckhwd %%mm7, %%mm3\n\t"
                "punpckhwd %%mm7, %%mm4\n\t"
                "punpckhwd %%mm7, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm3, 8%0\n\t"
                :"=m"(*d)
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
                :"memory");
                d += 16;
                s += 4;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar path: remaining pixels (all of them without MMX) */
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
#ifdef WORDS_BIGENDIAN
                *d++ = 0;
                *d++ = (bgr&0xF800)>>8;
                *d++ = (bgr&0x7E0)>>3;
                *d++ = (bgr&0x1F)<<3;
#else
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x7E0)>>3;
                *d++ = (bgr&0xF800)>>8;
                *d++ = 0;
#endif
        }
}
1363

    
1364
/**
 * Swap two colour components of every 4-byte pixel.
 *
 * In the C code, bytes 0 and 2 of each pixel are exchanged and byte 1 is
 * copied; byte 3 of dst is not written. With WORDS_BIGENDIAN, bytes 1 and 3
 * are exchanged, byte 2 is copied and byte 0 of dst is not written.
 * The C code reads each pixel completely before storing it, so it is safe
 * to call with src == dst.
 *
 * With HAVE_MMX, whole groups of 8 bytes are converted by the MMX loop and
 * any remaining pixel falls through to the C code.
 * NOTE(review): what the MMX loop stores into the fourth byte depends on
 * mask32r/mask32g/mask32b, which are defined outside this chunk; confirm
 * that it matches what the C tail does if callers care about that byte.
 *
 * @param src      source buffer, 4 bytes per pixel
 * @param dst      destination buffer, same layout
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
        long i = 0;
#ifdef HAVE_MMX
/* TODO: unroll this loop */
        /* The loop below always runs its body once and compares the counter
           unsigned against src_size-7, so it must only be entered when at
           least one full 8-byte group exists. */
        if(src_size >= 8)
        {
                asm volatile (
                        "xor %%"REG_a", %%"REG_a"        \n\t"
                        ASMALIGN(4)
                        "1:                                \n\t"
                        PREFETCH" 32(%0, %%"REG_a")        \n\t"
                        "movq (%0, %%"REG_a"), %%mm0        \n\t"
                        "movq %%mm0, %%mm1                \n\t"
                        "movq %%mm0, %%mm2                \n\t"
                        "pslld $16, %%mm0                \n\t"
                        "psrld $16, %%mm1                \n\t"
                        "pand "MANGLE(mask32r)", %%mm0        \n\t"
                        "pand "MANGLE(mask32g)", %%mm2        \n\t"
                        "pand "MANGLE(mask32b)", %%mm1        \n\t"
                        "por %%mm0, %%mm2                \n\t"
                        "por %%mm1, %%mm2                \n\t"
                        MOVNTQ" %%mm2, (%1, %%"REG_a")        \n\t"
                        "add $8, %%"REG_a"                \n\t"
                        "cmp %2, %%"REG_a"                \n\t"
                        " jb 1b                                \n\t"
                        :: "r" (src), "r"(dst), "r" (src_size-7)
                        : "%"REG_a, "memory" /* the asm stores through dst */
                );

                __asm __volatile(SFENCE:::"memory");
                __asm __volatile(EMMS:::"memory");

                /* the loop above handled every complete 8-byte group */
                i = src_size & ~7L;
        }
#endif
        /* scalar path: the pixel left over after MMX, or everything without it */
        for(; i + 3 < src_size; i += 4)
        {
#ifdef WORDS_BIGENDIAN
                const uint8_t b1 = src[i + 1];
                const uint8_t b3 = src[i + 3];
                dst[i + 2] = src[i + 2];
                dst[i + 1] = b3;
                dst[i + 3] = b1;
#else
                const uint8_t b0 = src[i + 0];
                const uint8_t b2 = src[i + 2];
                dst[i + 1] = src[i + 1];
                dst[i + 0] = b2;
                dst[i + 2] = b0;
#endif
        }
}
1410

    
1411
/**
 * Exchange the first and third byte of every 3-byte pixel.
 *
 * The scalar code reads a pixel completely before storing it, so it is safe
 * to call with src == dst.
 *
 * With HAVE_MMX, groups of 24 bytes (8 pixels) are converted by the MMX
 * loop; the remainder is handled by the scalar loop.
 *
 * @param src      source buffer, 3 bytes per pixel
 * @param dst      destination buffer, same layout
 * @param src_size size of src in bytes; a trailing partial pixel is ignored
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
        long i;
#ifdef HAVE_MMX
        /* The asm counter starts at 23 - src_size and is stepped by 24 until
           it is no longer negative; src/dst are biased by the same amount so
           that (pointer + counter) walks the buffers from their start. */
        long mmx_size= 23 - src_size;
        /* The loop body runs before its exit test, so only enter it when a
           full 24-byte group exists; otherwise it would overrun both buffers. */
        if(mmx_size < 0)
        {
                asm volatile (
                        "movq "MANGLE(mask24r)", %%mm5        \n\t"
                        "movq "MANGLE(mask24g)", %%mm6        \n\t"
                        "movq "MANGLE(mask24b)", %%mm7        \n\t"
                        ASMALIGN(4)
                        "1:                                \n\t"
                        PREFETCH" 32(%1, %%"REG_a")        \n\t"
                        "movq   (%1, %%"REG_a"), %%mm0        \n\t" // BGR BGR BG
                        "movq   (%1, %%"REG_a"), %%mm1        \n\t" // BGR BGR BG
                        "movq  2(%1, %%"REG_a"), %%mm2        \n\t" // R BGR BGR B
                        "psllq $16, %%mm0                \n\t" // 00 BGR BGR
                        "pand %%mm5, %%mm0                \n\t"
                        "pand %%mm6, %%mm1                \n\t"
                        "pand %%mm7, %%mm2                \n\t"
                        "por %%mm0, %%mm1                \n\t"
                        "por %%mm2, %%mm1                \n\t"
                        "movq  6(%1, %%"REG_a"), %%mm0        \n\t" // BGR BGR BG
                        MOVNTQ" %%mm1,   (%2, %%"REG_a")\n\t" // RGB RGB RG
                        "movq  8(%1, %%"REG_a"), %%mm1        \n\t" // R BGR BGR B
                        "movq 10(%1, %%"REG_a"), %%mm2        \n\t" // GR BGR BGR
                        "pand %%mm7, %%mm0                \n\t"
                        "pand %%mm5, %%mm1                \n\t"
                        "pand %%mm6, %%mm2                \n\t"
                        "por %%mm0, %%mm1                \n\t"
                        "por %%mm2, %%mm1                \n\t"
                        "movq 14(%1, %%"REG_a"), %%mm0        \n\t" // R BGR BGR B
                        MOVNTQ" %%mm1,  8(%2, %%"REG_a")\n\t" // B RGB RGB R
                        "movq 16(%1, %%"REG_a"), %%mm1        \n\t" // GR BGR BGR
                        "movq 18(%1, %%"REG_a"), %%mm2        \n\t" // BGR BGR BG
                        "pand %%mm6, %%mm0                \n\t"
                        "pand %%mm7, %%mm1                \n\t"
                        "pand %%mm5, %%mm2                \n\t"
                        "por %%mm0, %%mm1                \n\t"
                        "por %%mm2, %%mm1                \n\t"
                        MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
                        "add $24, %%"REG_a"                \n\t"
                        " js 1b                                \n\t"
                        : "+a" (mmx_size)
                        : "r" (src-mmx_size), "r"(dst-mmx_size)
                        : "memory" /* the asm stores through dst */
                );

                __asm __volatile(SFENCE:::"memory");
                __asm __volatile(EMMS:::"memory");

                if(mmx_size==23) return; //finished, size was a multiple of 24

                /* 23-mmx_size bytes at the end of the buffers are still to do */
                src+= src_size;
                dst+= src_size;
                src_size= 23-mmx_size;
                src-= src_size;
                dst-= src_size;
        }
#endif
        /* scalar path; requires a whole pixel so a stray byte is never touched */
        for(i=0; i+2<src_size; i+=3)
        {
                register uint8_t x;
                x          = src[i + 2];
                dst[i + 1] = src[i + 1];
                dst[i + 2] = src[i + 0];
                dst[i + 0] = x;
        }
}
1477

    
1478
/**
 * Interleave planar Y, U and V data into packed output, emitting the bytes
 * Y0 U0 Y1 V0 for every pair of luma samples.
 *
 * One chroma line is shared by vertLumPerChroma luma lines: the chroma
 * pointers advance only after a line whose index has all the low bits of
 * (vertLumPerChroma-1) set, so vertLumPerChroma is expected to be a power
 * of two.
 *
 * Implementation notes:
 *  - The MMX loop converts 8 chroma samples (16 luma samples) per iteration
 *    and runs its body before testing the counter.
 *  - The ARCH_ALPHA/HAVE_MVI branch converts two lines per outer iteration.
 *  - The __WORDSIZE >= 64 branch converts 2 chroma samples per iteration and
 *    stores through a uint64_t pointer; it has no WORDS_BIGENDIAN variant.
 *
 * @param ysrc             luma plane
 * @param usrc             first chroma plane
 * @param vsrc             second chroma plane
 * @param dst              packed output, 2 bytes per luma sample
 * @param width            line width in luma samples (width/2 chroma samples)
 * @param height           number of luma lines
 * @param lumStride        byte distance between luma lines
 * @param chromStride      byte distance between chroma lines
 * @param dstStride        byte distance between output lines
 * @param vertLumPerChroma number of luma lines sharing one chroma line
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        long width, long height,
        long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
        long y;
        const long chromWidth= width>>1;
        for(y=0; y<height; y++)
        {
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
                asm volatile(
                        "xor %%"REG_a", %%"REG_a"        \n\t"
                        ASMALIGN(4)
                        "1:                                \n\t"
                        PREFETCH" 32(%1, %%"REG_a", 2)        \n\t"
                        PREFETCH" 32(%2, %%"REG_a")        \n\t"
                        PREFETCH" 32(%3, %%"REG_a")        \n\t"
                        "movq (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
                        "movq %%mm0, %%mm2                \n\t" // U(0)
                        "movq (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
                        "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)

                        "movq (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
                        "movq 8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
                        "movq %%mm3, %%mm4                \n\t" // Y(0)
                        "movq %%mm5, %%mm6                \n\t" // Y(8)
                        "punpcklbw %%mm0, %%mm3                \n\t" // YUYV YUYV(0)
                        "punpckhbw %%mm0, %%mm4                \n\t" // YUYV YUYV(4)
                        "punpcklbw %%mm2, %%mm5                \n\t" // YUYV YUYV(8)
                        "punpckhbw %%mm2, %%mm6                \n\t" // YUYV YUYV(12)

                        MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
                        MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
                        MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
                        MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"

                        "add $8, %%"REG_a"                \n\t"
                        "cmp %4, %%"REG_a"                \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
                        : "%"REG_a, "memory" /* the asm stores through dst */
                );
#else

#if defined ARCH_ALPHA && defined HAVE_MVI
#define pl2yuy2(n)                                        \
        y1 = yc[n];                                        \
        y2 = yc2[n];                                        \
        u = uc[n];                                        \
        v = vc[n];                                        \
        asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));        \
        asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));        \
        asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
        asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
        yuv1 = (u << 8) + (v << 24);                        \
        yuv2 = yuv1 + y2;                                \
        yuv1 += y1;                                        \
        qdst[n] = yuv1;                                        \
        qdst2[n] = yuv2;

                int i;
                uint64_t *qdst = (uint64_t *) dst;
                uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
                const uint32_t *yc = (uint32_t *) ysrc;
                const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
                const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
                for(i = 0; i < chromWidth; i += 8){
                        uint64_t y1, y2, yuv1, yuv2;
                        uint64_t u, v;
                        /* Prefetch */
                        asm("ldq $31,64(%0)" :: "r"(yc));
                        asm("ldq $31,64(%0)" :: "r"(yc2));
                        asm("ldq $31,64(%0)" :: "r"(uc));
                        asm("ldq $31,64(%0)" :: "r"(vc));

                        pl2yuy2(0);
                        pl2yuy2(1);
                        pl2yuy2(2);
                        pl2yuy2(3);

                        yc += 4;
                        yc2 += 4;
                        uc += 4;
                        vc += 4;
                        qdst += 4;
                        qdst2 += 4;
                }
                /* two lines were written above: skip the second one here, the
                   common code at the end of the loop advances past the first */
                y++;
                ysrc += lumStride;
                dst += dstStride;

#elif __WORDSIZE >= 64
                int i;
                uint64_t *ldst = (uint64_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i += 2){
                        uint64_t k, l;
                        /* Widen before shifting: shifting a promoted int by 24
                           would go negative for samples >= 128 and sign-extend
                           into the upper half of k, corrupting the second pair
                           once l << 32 is added. */
                        k = (uint64_t)yc[0] + ((uint64_t)uc[0] << 8) +
                            ((uint64_t)yc[1] << 16) + ((uint64_t)vc[0] << 24);
                        l = (uint64_t)yc[2] + ((uint64_t)uc[1] << 8) +
                            ((uint64_t)yc[3] << 16) + ((uint64_t)vc[1] << 24);
                        *ldst++ = k + (l << 32);
                        yc += 4;
                        uc += 2;
                        vc += 2;
                }

#else
                int i;
                uint32_t *idst = (uint32_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i++){
                        /* unsigned arithmetic avoids shifting into the sign bit */
#ifdef WORDS_BIGENDIAN
                        *idst++ = ((uint32_t)yc[0] << 24)+ ((uint32_t)uc[0] << 16) +
                            ((uint32_t)yc[1] << 8) + ((uint32_t)vc[0] << 0);
#else
                        *idst++ = (uint32_t)yc[0] + ((uint32_t)uc[0] << 8) +
                            ((uint32_t)yc[1] << 16) + ((uint32_t)vc[0] << 24);
#endif
                        yc += 2;
                        uc++;
                        vc++;
                }
#endif
#endif
                /* move to the next chroma line after the last luma line sharing it */
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
                {
                        usrc += chromStride;
                        vsrc += chromStride;
                }
                ysrc += lumStride;
                dst += dstStride;
        }
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
1617

    
1618
/**
1619
 *
1620
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1621
 * problem for anyone then tell me, and ill fix it)
1622
 */
1623
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1624
        long width, long height,
1625
        long lumStride, long chromStride, long dstStride)
1626
{
1627
        //FIXME interpolate chroma
1628
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1629
}
1630

    
1631
/**
 * Interleave separate Y, U and V planes into packed UYVY (byte order U Y V Y).
 *
 * @param ysrc,usrc,vsrc   source planes
 * @param dst              packed destination, 2 bytes per pixel
 * @param width,height     size in luma samples; width/2 chroma samples are
 *                         consumed per line
 * @param lumStride        bytes between two lines of ysrc
 * @param chromStride      bytes between two lines of usrc / vsrc
 * @param dstStride        bytes between two lines of dst
 * @param vertLumPerChroma number of luma lines sharing one chroma line; the
 *                         chroma pointers advance when
 *                         (y & (vertLumPerChroma-1)) == vertLumPerChroma-1
 *
 * The MMX loop handles 8 chroma samples (16 pixels) per iteration and the
 * 64 bit C loop 2 chroma samples per iteration, so width should be a multiple
 * of 16 (MMX) resp. 4 (64 bit C) to avoid running past the end of a line.
 * NOTE(review): the 64 bit C path assembles the pixels in little-endian order
 * only and stores through a uint64_t pointer - confirm dst alignment and
 * endianness with the callers.
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        long width, long height,
        long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
        long y;
        const long chromWidth= width>>1;
        for(y=0; y<height; y++)
        {
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
                asm volatile(
                        "xor %%"REG_a", %%"REG_a"        \n\t"
                        ASMALIGN(4)
                        "1:                                \n\t"
                        PREFETCH" 32(%1, %%"REG_a", 2)        \n\t"
                        PREFETCH" 32(%2, %%"REG_a")        \n\t"
                        PREFETCH" 32(%3, %%"REG_a")        \n\t"
                        "movq (%2, %%"REG_a"), %%mm0        \n\t" // U(0)
                        "movq %%mm0, %%mm2                \n\t" // U(0)
                        "movq (%3, %%"REG_a"), %%mm1        \n\t" // V(0)
                        "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)

                        "movq (%1, %%"REG_a",2), %%mm3        \n\t" // Y(0)
                        "movq 8(%1, %%"REG_a",2), %%mm5        \n\t" // Y(8)
                        "movq %%mm0, %%mm4                \n\t" // UVUV UVUV(0)
                        "movq %%mm2, %%mm6                \n\t" // UVUV UVUV(8)
                        "punpcklbw %%mm3, %%mm0                \n\t" // UYVY UYVY(0)
                        "punpckhbw %%mm3, %%mm4                \n\t" // UYVY UYVY(4)
                        "punpcklbw %%mm5, %%mm2                \n\t" // UYVY UYVY(8)
                        "punpckhbw %%mm5, %%mm6                \n\t" // UYVY UYVY(12)

                        MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
                        MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
                        MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
                        MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"

                        "add $8, %%"REG_a"                \n\t"
                        "cmp %4, %%"REG_a"                \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
                        /* the block stores to dst, so memory has to be listed as clobbered */
                        : "memory", "%"REG_a
                );
#else
//FIXME adapt the alpha asm code from yv12->yuy2

#if __WORDSIZE >= 64
                int i;
                uint64_t *ldst = (uint64_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i += 2){
                        uint64_t k, l;
                        /* The byte shifted into bits 24..31 must be unsigned:
                         * as an int the sum turns negative for values >= 128,
                         * gets sign-extended into k and then corrupts the
                         * upper pixel pair in k + (l << 32). */
                        k = uc[0] + (yc[0] << 8) +
                            (vc[0] << 16) + ((uint32_t) yc[1] << 24);
                        l = uc[1] + (yc[2] << 8) +
                            (vc[1] << 16) + ((uint32_t) yc[3] << 24);
                        *ldst++ = k + (l << 32);
                        yc += 4;
                        uc += 2;
                        vc += 2;
                }

#else
                int i;
                uint32_t *idst = (uint32_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i++){
                        /* unsigned top byte: avoids shifting into the sign bit of an int */
#ifdef WORDS_BIGENDIAN
                        *idst++ = ((uint32_t) uc[0] << 24)+ (yc[0] << 16) +
                            (vc[0] << 8) + (yc[1] << 0);
#else
                        *idst++ = uc[0] + (yc[0] << 8) +
                            (vc[0] << 16) + ((uint32_t) yc[1] << 24);
#endif
                        yc += 2;
                        uc++;
                        vc++;
                }
#endif
#endif
                /* step to the next chroma line after every vertLumPerChroma-th luma line */
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
                {
                        usrc += chromStride;
                        vsrc += chromStride;
                }
                ysrc += lumStride;
                dst += dstStride;
        }
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
1724

    
1725
/**
1726
 *
1727
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1728
 * problem for anyone then tell me, and ill fix it)
1729
 */
1730
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1731
        long width, long height,
1732
        long lumStride, long chromStride, long dstStride)
1733
{
1734
        //FIXME interpolate chroma
1735
        RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1736
}
1737

    
1738
/**
1739
 *
1740
 * width should be a multiple of 16
1741
 */
1742
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1743
        long width, long height,
1744
        long lumStride, long chromStride, long dstStride)
1745
{
1746
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1747
}
1748

    
1749
/**
 * Split packed YUY2 (byte order Y U Y V) into separate Y, U and V planes.
 *
 * Two source lines are consumed per loop iteration: luma is written for both
 * of them, chroma is taken from the first line only. Height should therefore
 * be a multiple of 2. The MMX loops handle 8 chroma samples (16 pixels) per
 * pass, so width should be a multiple of 16.
 *
 * @param src                 packed source
 * @param ydst,udst,vdst      destination planes
 * @param width,height        size in luma samples
 * @param lumStride           bytes between two lines of ydst
 * @param chromStride         bytes between two lines of udst / vdst
 * @param srcStride           bytes between two lines of src
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        long width, long height,
        long lumStride, long chromStride, long srcStride)
{
        long line;
        const long chromWidth= width>>1;

        for(line=0; line<height; line+=2)
        {
#ifdef HAVE_MMX
                /* first line of the pair: luma and chroma */
                asm volatile(
                        "xor %%"REG_a", %%"REG_a"        \n\t"
                        "pcmpeqw %%mm7, %%mm7                \n\t"
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
                        ASMALIGN(4)
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%"REG_a", 4)        \n\t"
                        "movq (%0, %%"REG_a", 4), %%mm0        \n\t" // YUYV YUYV(0)
                        "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
                        "movq %%mm0, %%mm2                \n\t" // YUYV YUYV(0)
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(4)
                        "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)

                        MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"

                        "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
                        "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(8)
                        "movq %%mm2, %%mm4                \n\t" // YUYV YUYV(12)
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
                        "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
                        "pand %%mm7, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"

                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)

                        MOVNTQ" %%mm0, (%3, %%"REG_a")        \n\t"
                        MOVNTQ" %%mm2, (%2, %%"REG_a")        \n\t"

                        "add $8, %%"REG_a"                \n\t"
                        "cmp %4, %%"REG_a"                \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
                        : "memory", "%"REG_a
                );

                ydst += lumStride;
                src  += srcStride;

                /* second line of the pair: luma only; mm7 still holds the mask set up above */
                asm volatile(
                        "xor %%"REG_a", %%"REG_a"        \n\t"
                        ASMALIGN(4)
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%"REG_a", 4)        \n\t"
                        "movq (%0, %%"REG_a", 4), %%mm0        \n\t" // YUYV YUYV(0)
                        "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
                        "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
                        "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
                        "pand %%mm7, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
                        "pand %%mm7, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
                        MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"

                        "add $8, %%"REG_a"                \n\t"
                        "cmp %4, %%"REG_a"                \n\t"
                        " jb 1b                                \n\t"

                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
                        : "memory", "%"REG_a
                );
#else
                long n;
                const uint8_t *in= src;
                uint8_t *lum= ydst;

                /* first line of the pair: every 4 input bytes are Y U Y V */
                for(n=0; n<chromWidth; n++, in+=4, lum+=2)
                {
                        lum[0] = in[0];
                        udst[n]= in[1];
                        lum[1] = in[2];
                        vdst[n]= in[3];
                }
                ydst += lumStride;
                src  += srcStride;

                /* second line of the pair: keep the luma, drop the chroma */
                in = src;
                lum= ydst;
                for(n=0; n<chromWidth; n++, in+=4, lum+=2)
                {
                        lum[0]= in[0];
                        lum[1]= in[2];
                }
#endif
                udst += chromStride;
                vdst += chromStride;
                ydst += lumStride;
                src  += srcStride;
        }
#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}
1873

    
1874
/**
 * Copy the luma plane of a YVU9 picture into a YV12 picture.
 *
 * Only the Y plane is handled; usrc, vsrc, udst, vdst and chromStride are
 * currently unused because chroma upscaling is not implemented.
 *
 * @param ysrc       source luma plane
 * @param ydst       destination luma plane
 * @param width      bytes to copy per line
 * @param height     number of lines
 * @param lumStride  bytes between two luma lines; NOTE(review): the signature
 *                   has a single luma stride, so it is applied to both ysrc
 *                   and ydst here - confirm against the callers
 */
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
        uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        long width, long height, long lumStride, long chromStride)
{
        long y;

        /* a negative size would be converted to a huge size_t by memcpy */
        if(width <= 0 || height <= 0)
                return;

        /* Y Plane */
        if(lumStride == width){
                /* lines are contiguous: one copy covers the whole plane */
                memcpy(ydst, ysrc, width*height);
        }else{
                /* padded lines: copy line by line so the stride is honoured */
                for(y=0; y<height; y++)
                        memcpy(ydst + y*lumStride, ysrc + y*lumStride, width);
        }

        /* XXX: implement upscaling for U,V */
}
1883

    
1884
/**
 * Upscale one 8 bit plane to twice its width and height.
 *
 * The first and last output lines are interpolated horizontally from the
 * first and last source line with 3:1 / 1:3 weights. Every other pair of
 * output lines is built from two neighbouring source lines, again with
 * 3:1 / 1:3 weights. 2*srcWidth bytes are written on each of the
 * 2*srcHeight output lines.
 *
 * @param src        source plane
 * @param dst        destination plane
 * @param srcWidth   source width in samples
 * @param srcHeight  source height in lines
 * @param srcStride  bytes between two source lines
 * @param dstStride  bytes between two destination lines
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
        long x,y;

        /* nothing to scale; also keeps the dst[2*srcWidth-1] stores in bounds */
        if(srcWidth <= 0 || srcHeight <= 0)
                return;

        dst[0]= src[0];

        // first line
        for(x=0; x<srcWidth-1; x++){
                dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
                dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
        }
        dst[2*srcWidth-1]= src[srcWidth-1];

        dst+= dstStride;

        for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                /* number of leading source columns handled by the asm loop */
                const long mmxSize= srcWidth&~15;
                /* The asm loop tests its counter at the bottom, so it would run
                 * once even for mmxSize == 0; skip it for narrow planes.
                 * NOTE(review): the first iteration loads from -1(%0), i.e.
                 * one byte before the start of the line. */
                if(mmxSize){
                asm volatile(
                        "mov %4, %%"REG_a"                \n\t"
                        "1:                                \n\t"
                        "movq (%0, %%"REG_a"), %%mm0        \n\t"
                        "movq (%1, %%"REG_a"), %%mm1        \n\t"
                        "movq 1(%0, %%"REG_a"), %%mm2        \n\t"
                        "movq 1(%1, %%"REG_a"), %%mm3        \n\t"
                        "movq -1(%0, %%"REG_a"), %%mm4        \n\t"
                        "movq -1(%1, %%"REG_a"), %%mm5        \n\t"
                        PAVGB" %%mm0, %%mm5                \n\t"
                        PAVGB" %%mm0, %%mm3                \n\t"
                        PAVGB" %%mm0, %%mm5                \n\t"
                        PAVGB" %%mm0, %%mm3                \n\t"
                        PAVGB" %%mm1, %%mm4                \n\t"
                        PAVGB" %%mm1, %%mm2                \n\t"
                        PAVGB" %%mm1, %%mm4                \n\t"
                        PAVGB" %%mm1, %%mm2                \n\t"
                        "movq %%mm5, %%mm7                \n\t"
                        "movq %%mm4, %%mm6                \n\t"
                        "punpcklbw %%mm3, %%mm5                \n\t"
                        "punpckhbw %%mm3, %%mm7                \n\t"
                        "punpcklbw %%mm2, %%mm4                \n\t"
                        "punpckhbw %%mm2, %%mm6                \n\t"
#if 1
                        MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
                        MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
                        MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
                        MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
#else
                        "movq %%mm5, (%2, %%"REG_a", 2)        \n\t"
                        "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
                        "movq %%mm4, (%3, %%"REG_a", 2)        \n\t"
                        "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
#endif
                        "add $8, %%"REG_a"                \n\t"
                        " js 1b                                \n\t"
                        :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
                           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
                           "g" (-mmxSize)
                        /* the block stores to dst, so memory has to be listed as clobbered */
                        : "memory", "%"REG_a

                );
                }
#else
                const long mmxSize=1;
#endif
                /* left border column of both output lines */
                dst[0        ]= (3*src[0] +   src[srcStride])>>2;
                dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;

                /* Continue one column before the end of the asm part, or at
                 * column 0 if the asm part was skipped (never at -1). */
                for(x= mmxSize ? mmxSize-1 : 0; x<srcWidth-1; x++){
                        dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
                        dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
                        dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
                        dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
                }
                /* right border column of both output lines */
                dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
                dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

                dst+=dstStride*2;
                src+=srcStride;
        }

        // last line
#if 1
        dst[0]= src[0];

        for(x=0; x<srcWidth-1; x++){
                dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
                dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
        }
        dst[2*srcWidth-1]= src[srcWidth-1];
#else
        for(x=0; x<srcWidth; x++){
                dst[2*x+0]=
                dst[2*x+1]= src[x];
        }
#endif

#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}
1985

    
1986
/**
 * Split packed UYVY (byte order U Y V Y) into separate Y, U and V planes.
 *
 * Two source lines are consumed per loop iteration: luma is written for both
 * of them, chroma is only taken from the first line of each pair and the
 * chroma of the second line is ignored. FIXME write HQ version
 *
 * Height should be a multiple of 2. The MMX loops handle 8 chroma samples
 * (16 pixels) per pass, so width should be a multiple of 16.
 *
 * @param src                 packed source
 * @param ydst,udst,vdst      destination planes
 * @param width,height        size in luma samples
 * @param lumStride           bytes between two lines of ydst
 * @param chromStride         bytes between two lines of udst / vdst
 * @param srcStride           bytes between two lines of src
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        long width, long height,
        long lumStride, long chromStride, long srcStride)
{
        long y;
        const long chromWidth= width>>1;
        for(y=0; y<height; y+=2)
        {
#ifdef HAVE_MMX
                /* The index register is REG_a rather than a hard-coded %eax so
                 * that its width matches the pointer and long operands, as in
                 * yuy2toyv12. */
                asm volatile(
                        "xor %%"REG_a", %%"REG_a"        \n\t"
                        "pcmpeqw %%mm7, %%mm7                \n\t"
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
                        ASMALIGN(4)
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%"REG_a", 4)        \n\t"
                        "movq (%0, %%"REG_a", 4), %%mm0        \n\t" // UYVY UYVY(0)
                        "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // UYVY UYVY(4)
                        "movq %%mm0, %%mm2                \n\t" // UYVY UYVY(0)
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(4)
                        "pand %%mm7, %%mm0                \n\t" // U0V0 U0V0(0)
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(4)
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)

                        MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"

                        "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // UYVY UYVY(8)
                        "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // UYVY UYVY(12)
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(8)
                        "movq %%mm2, %%mm4                \n\t" // UYVY UYVY(12)
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(8)
                        "pand %%mm7, %%mm2                \n\t" // U0V0 U0V0(12)
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
                        "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"

                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)

                        MOVNTQ" %%mm0, (%3, %%"REG_a")        \n\t"
                        MOVNTQ" %%mm2, (%2, %%"REG_a")        \n\t"

                        "add $8, %%"REG_a"                \n\t"
                        "cmp %4, %%"REG_a"                \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
                        : "memory", "%"REG_a
                );

                ydst += lumStride;
                src  += srcStride;

                /* second line of the pair: luma only */
                asm volatile(
                        "xor %%"REG_a", %%"REG_a"        \n\t"
                        ASMALIGN(4)
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%"REG_a", 4)        \n\t"
                        "movq (%0, %%"REG_a", 4), %%mm0        \n\t" // UYVY UYVY(0)
                        "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // UYVY UYVY(4)
                        "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // UYVY UYVY(8)
                        "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // UYVY UYVY(12)
                        "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
                        "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
                        MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"

                        "add $8, %%"REG_a"                \n\t"
                        "cmp %4, %%"REG_a"                \n\t"
                        " jb 1b                                \n\t"

                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
                        : "memory", "%"REG_a
                );
#else
                long i;
                /* first line of the pair: every 4 input bytes are U Y V Y */
                for(i=0; i<chromWidth; i++)
                {
                        udst[i]         = src[4*i+0];
                        ydst[2*i+0]         = src[4*i+1];
                        vdst[i]         = src[4*i+2];
                        ydst[2*i+1]         = src[4*i+3];
                }
                ydst += lumStride;
                src  += srcStride;

                /* second line of the pair: keep the luma, drop the chroma */
                for(i=0; i<chromWidth; i++)
                {
                        ydst[2*i+0]         = src[4*i+1];
                        ydst[2*i+1]         = src[4*i+3];
                }
#endif
                udst += chromStride;
                vdst += chromStride;
                ydst += lumStride;
                src  += srcStride;
        }
#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}
2111

    
2112
/**
2113
 *
2114
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2115
 * problem for anyone then tell me, and ill fix it)
2116
 * chrominance data is only taken from every second line; the other lines are ignored in the C version. FIXME: write HQ version
2117
 */
2118
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2119
        long width, long height,
2120
        long lumStride, long chromStride, long srcStride)
2121
{
2122
        long y;
2123
        const long chromWidth= width>>1;
2124
#ifdef HAVE_MMX
2125
        for(y=0; y<height-2; y+=2)
2126
        {
2127
                long i;
2128
                for(i=0; i<2; i++)
2129
                {
2130
                        asm volatile(
2131
                                "mov %2, %%"REG_a"                \n\t"
2132
                                "movq "MANGLE(bgr2YCoeff)", %%mm6                \n\t"
2133
                                "movq "MANGLE(w1111)", %%mm5                \n\t"
2134
                                "pxor %%mm7, %%mm7                \n\t"
2135
                                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2136
                                ASMALIGN(4)
2137
                                "1:                                \n\t"
2138
                                PREFETCH" 64(%0, %%"REG_d")        \n\t"
2139
                                "movd (%0, %%"REG_d"), %%mm0        \n\t"
2140
                                "movd 3(%0, %%"REG_d"), %%mm1        \n\t"
2141
                                "punpcklbw %%mm7, %%mm0                \n\t"
2142
                                "punpcklbw %%mm7, %%mm1                \n\t"
2143
                                "movd 6(%0, %%"REG_d"), %%mm2        \n\t"
2144
                                "movd 9(%0, %%"REG_d"), %%mm3        \n\t"
2145
                                "punpcklbw %%mm7, %%mm2                \n\t"
2146
                                "punpcklbw %%mm7, %%mm3                \n\t"
2147
                                "pmaddwd %%mm6, %%mm0                \n\t"
2148
                                "pmaddwd %%mm6, %%mm1                \n\t"
2149
                                "pmaddwd %%mm6, %%mm2                \n\t"
2150
                                "pmaddwd %%mm6, %%mm3                \n\t"
2151
#ifndef FAST_BGR2YV12
2152
                                "psrad $8, %%mm0                \n\t"
2153
                                "psrad $8, %%mm1                \n\t"
2154
                                "psrad $8, %%mm2                \n\t"
2155
                                "psrad $8, %%mm3                \n\t"
2156
#endif
2157
                                "packssdw %%mm1, %%mm0                \n\t"
2158
                                "packssdw %%mm3, %%mm2                \n\t"
2159
                                "pmaddwd %%mm5, %%mm0                \n\t"
2160
                                "pmaddwd %%mm5, %%mm2                \n\t"
2161
                                "packssdw %%mm2, %%mm0                \n\t"
2162
                                "psraw $7, %%mm0                \n\t"
2163

    
2164
                                "movd 12(%0, %%"REG_d"), %%mm4        \n\t"
2165
                                "movd 15(%0, %%"REG_d"), %%mm1        \n\t"
2166
                                "punpcklbw %%mm7, %%mm4                \n\t"
2167
                                "punpcklbw %%mm7, %%mm1                \n\t"
2168
                                "movd 18(%0, %%"REG_d"), %%mm2        \n\t"
2169
                                "movd 21(%0, %%"REG_d"), %%mm3        \n\t"
2170
                                "punpcklbw %%mm7, %%mm2                \n\t"
2171
                                "punpcklbw %%mm7, %%mm3                \n\t"
2172
                                "pmaddwd %%mm6, %%mm4                \n\t"
2173
                                "pmaddwd %%mm6, %%mm1                \n\t"
2174
                                "pmaddwd %%mm6, %%mm2                \n\t"
2175
                                "pmaddwd %%mm6, %%mm3                \n\t"
2176
#ifndef FAST_BGR2YV12
2177
                                "psrad $8, %%mm4                \n\t"
2178
                                "psrad $8, %%mm1                \n\t"
2179
                                "psrad $8, %%mm2                \n\t"
2180
                                "psrad $8, %%mm3                \n\t"
2181
#endif
2182
                                "packssdw %%mm1, %%mm4                \n\t"
2183
                                "packssdw %%mm3, %%mm2                \n\t"
2184
                                "pmaddwd %%mm5, %%mm4                \n\t"
2185
                                "pmaddwd %%mm5, %%mm2                \n\t"
2186
                                "add $24, %%"REG_d"                \n\t"
2187
                                "packssdw %%mm2, %%mm4                \n\t"
2188
                                "psraw $7, %%mm4                \n\t"
2189

    
2190
                                "packuswb %%mm4, %%mm0                \n\t"
2191
                                "paddusb "MANGLE(bgr2YOffset)", %%mm0        \n\t"
2192

    
2193
                                MOVNTQ" %%mm0, (%1, %%"REG_a")        \n\t"
2194
                                "add $8, %%"REG_a"                \n\t"
2195
                                " js 1b                                \n\t"
2196
                                : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2197
                                : "%"REG_a, "%"REG_d
2198
                        );
2199
                        ydst += lumStride;
2200
                        src  += srcStride;
2201
                }
2202
                src -= srcStride*2;
2203
                asm volatile(
2204
                        "mov %4, %%"REG_a"                \n\t"
2205
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
2206
                        "movq "MANGLE(bgr2UCoeff)", %%mm6                \n\t"
2207
                        "pxor %%mm7, %%mm7                \n\t"
2208
                        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2209
                        "add %%"REG_d", %%"REG_d"        \n\t"
2210
                        ASMALIGN(4)
2211
                        "1:                                \n\t"
2212
                        PREFETCH" 64(%0, %%"REG_d")        \n\t"
2213
                        PREFETCH" 64(%1, %%"REG_d")        \n\t"
2214
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2215
                        "movq (%0, %%"REG_d"), %%mm0        \n\t"
2216
                        "movq (%1, %%"REG_d"), %%mm1        \n\t"
2217
                        "movq 6(%0, %%"REG_d"), %%mm2        \n\t"
2218
                        "movq 6(%1, %%"REG_d"), %%mm3        \n\t"
2219
                        PAVGB" %%mm1, %%mm0                \n\t"
2220
                        PAVGB" %%mm3, %%mm2                \n\t"
2221
                        "movq %%mm0, %%mm1                \n\t"
2222
                        "movq %%mm2, %%mm3                \n\t"
2223
                        "psrlq $24, %%mm0                \n\t"
2224
                        "psrlq $24, %%mm2                \n\t"
2225
                        PAVGB" %%mm1, %%mm0                \n\t"
2226
                        PAVGB" %%mm3, %%mm2                \n\t"
2227
                        "punpcklbw %%mm7, %%mm0                \n\t"
2228
                        "punpcklbw %%mm7, %%mm2                \n\t"
2229
#else
2230
                        "movd (%0, %%"REG_d"), %%mm0        \n\t"
2231
                        "movd (%1, %%"REG_d"), %%mm1        \n\t"
2232
                        "movd 3(%0, %%"REG_d"), %%mm2        \n\t"
2233
                        "movd 3(%1, %%"REG_d"), %%mm3        \n\t"
2234
                        "punpcklbw %%mm7, %%mm0                \n\t"
2235
                        "punpcklbw %%mm7, %%mm1                \n\t"
2236
                        "punpcklbw %%mm7, %%mm2                \n\t"
2237
                        "punpcklbw %%mm7, %%mm3                \n\t"
2238
                        "paddw %%mm1, %%mm0                \n\t"
2239
                        "paddw %%mm3, %%mm2                \n\t"
2240
                        "paddw %%mm2, %%mm0                \n\t"
2241
                        "movd 6(%0, %%"REG_d"), %%mm4        \n\t"
2242
                        "movd 6(%1, %%"REG_d"), %%mm1        \n\t"
2243
                        "movd 9(%0, %%"REG_d"), %%mm2        \n\t"
2244
                        "movd 9(%1, %%"REG_d"), %%mm3        \n\t"
2245
                        "punpcklbw %%mm7, %%mm4                \n\t"
2246
                        "punpcklbw %%mm7, %%mm1                \n\t"
2247
                        "punpcklbw %%mm7, %%mm2                \n\t"
2248
                        "punpcklbw %%mm7, %%mm3                \n\t"
2249
                        "paddw %%mm1, %%mm4                \n\t"
2250
                        "paddw %%mm3, %%mm2                \n\t"
2251
                        "paddw %%mm4, %%mm2                \n\t"
2252
                        "psrlw $2, %%mm0                \n\t"
2253
                        "psrlw $2, %%mm2                \n\t"
2254
#endif
2255
                        "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
2256
                        "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
2257

    
2258
                        "pmaddwd %%mm0, %%mm1                \n\t"
2259
                        "pmaddwd %%mm2, %%mm3                \n\t"
2260
                        "pmaddwd %%mm6, %%mm0                \n\t"
2261
                        "pmaddwd %%mm6, %%mm2                \n\t"
2262
#ifndef FAST_BGR2YV12
2263
                        "psrad $8, %%mm0                \n\t"
2264
                        "psrad $8, %%mm1                \n\t"
2265
                        "psrad $8, %%mm2                \n\t"
2266
                        "psrad $8, %%mm3                \n\t"
2267
#endif
2268
                        "packssdw %%mm2, %%mm0                \n\t"
2269
                        "packssdw %%mm3, %%mm1                \n\t"
2270
                        "pmaddwd %%mm5, %%mm0                \n\t"
2271
                        "pmaddwd %%mm5, %%mm1                \n\t"
2272
                        "packssdw %%mm1, %%mm0                \n\t" // V1 V0 U1 U0
2273
                        "psraw $7, %%mm0                \n\t"
2274

    
2275
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2276
                        "movq 12(%0, %%"REG_d"), %%mm4        \n\t"
2277
                        "movq 12(%1, %%"REG_d"), %%mm1        \n\t"
2278
                        "movq 18(%0, %%"REG_d"), %%mm2        \n\t"
2279
                        "movq 18(%1, %%"REG_d"), %%mm3        \n\t"
2280
                        PAVGB" %%mm1, %%mm4                \n\t"
2281
                        PAVGB" %%mm3, %%mm2                \n\t"
2282
                        "movq %%mm4, %%mm1                \n\t"
2283
                        "movq %%mm2, %%mm3                \n\t"
2284
                        "psrlq $24, %%mm4                \n\t"
2285
                        "psrlq $24, %%mm2                \n\t"
2286
                        PAVGB" %%mm1, %%mm4                \n\t"
2287
                        PAVGB" %%mm3, %%mm2                \n\t"
2288
                        "punpcklbw %%mm7, %%mm4                \n\t"
2289
                        "punpcklbw %%mm7, %%mm2                \n\t"
2290
#else
2291
                        "movd 12(%0, %%"REG_d"), %%mm4        \n\t"
2292
                        "movd 12(%1, %%"REG_d"), %%mm1        \n\t"
2293
                        "movd 15(%0, %%"REG_d"), %%mm2        \n\t"
2294
                        "movd 15(%1, %%"REG_d"), %%mm3        \n\t"
2295
                        "punpcklbw %%mm7, %%mm4                \n\t"
2296
                        "punpcklbw %%mm7, %%mm1                \n\t"
2297
                        "punpcklbw %%mm7, %%mm2                \n\t"
2298
                        "punpcklbw %%mm7, %%mm3                \n\t"
2299
                        "paddw %%mm1, %%mm4                \n\t"
2300
                        "paddw %%mm3, %%mm2                \n\t"
2301
                        "paddw %%mm2, %%mm4                \n\t"
2302
                        "movd 18(%0, %%"REG_d"), %%mm5        \n\t"
2303
                        "movd 18(%1, %%"REG_d"), %%mm1        \n\t"
2304
                        "movd 21(%0, %%"REG_d"), %%mm2        \n\t"
2305
                        "movd 21(%1, %%"REG_d"), %%mm3        \n\t"
2306
                        "punpcklbw %%mm7, %%mm5                \n\t"
2307
                        "punpcklbw %%mm7, %%mm1                \n\t"
2308
                        "punpcklbw %%mm7, %%mm2                \n\t"
2309
                        "punpcklbw %%mm7, %%mm3                \n\t"
2310
                        "paddw %%mm1, %%mm5                \n\t"
2311
                        "paddw %%mm3, %%mm2                \n\t"
2312
                        "paddw %%mm5, %%mm2                \n\t"
2313
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
2314
                        "psrlw $2, %%mm4                \n\t"
2315
                        "psrlw $2, %%mm2                \n\t"
2316
#endif
2317
                        "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
2318
                        "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
2319

    
2320
                        "pmaddwd %%mm4, %%mm1                \n\t"
2321
                        "pmaddwd %%mm2, %%mm3                \n\t"
2322
                        "pmaddwd %%mm6, %%mm4                \n\t"
2323
                        "pmaddwd %%mm6, %%mm2                \n\t"
2324
#ifndef FAST_BGR2YV12
2325
                        "psrad $8, %%mm4                \n\t"
2326
                        "psrad $8, %%mm1                \n\t"
2327
                        "psrad $8, %%mm2                \n\t"
2328
                        "psrad $8, %%mm3                \n\t"
2329
#endif
2330
                        "packssdw %%mm2, %%mm4                \n\t"
2331
                        "packssdw %%mm3, %%mm1                \n\t"
2332
                        "pmaddwd %%mm5, %%mm4                \n\t"
2333
                        "pmaddwd %%mm5, %%mm1                \n\t"
2334
                        "add $24, %%"REG_d"                \n\t"
2335
                        "packssdw %%mm1, %%mm4                \n\t" // V3 V2 U3 U2
2336
                        "psraw $7, %%mm4                \n\t"
2337

    
2338
                        "movq %%mm0, %%mm1                \n\t"
2339
                        "punpckldq %%mm4, %%mm0                \n\t"
2340
                        "punpckhdq %%mm4, %%mm1                \n\t"
2341
                        "packsswb %%mm1, %%mm0                \n\t"
2342
                        "paddb "MANGLE(bgr2UVOffset)", %%mm0        \n\t"
2343
                        "movd %%mm0, (%2, %%"REG_a")        \n\t"
2344
                        "punpckhdq %%mm0, %%mm0                \n\t"
2345
                        "movd %%mm0, (%3, %%"REG_a")        \n\t"
2346
                        "add $4, %%"REG_a"                \n\t"
2347
                        " js 1b                                \n\t"
2348
                        : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2349
                        : "%"REG_a, "%"REG_d
2350
                );
2351

    
2352
                udst += chromStride;
2353
                vdst += chromStride;
2354
                src  += srcStride*2;
2355
        }
2356

    
2357
        asm volatile(   EMMS" \n\t"
2358
                        SFENCE" \n\t"
2359
                        :::"memory");
2360
#else
2361
        y=0;
2362
#endif
2363
        for(; y<height; y+=2)
2364
        {
2365
                long i;
2366
                for(i=0; i<chromWidth; i++)
2367
                {
2368
                        unsigned int b= src[6*i+0];
2369
                        unsigned int g= src[6*i+1];
2370
                        unsigned int r= src[6*i+2];
2371

    
2372
                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2373
                        unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2374
                        unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2375

    
2376
                        udst[i]         = U;
2377
                        vdst[i]         = V;
2378
                        ydst[2*i]         = Y;
2379

    
2380
                        b= src[6*i+3];
2381
                        g= src[6*i+4];
2382
                        r= src[6*i+5];
2383

    
2384
                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2385
                        ydst[2*i+1]         = Y;
2386
                }
2387
                ydst += lumStride;
2388
                src  += srcStride;
2389

    
2390
                for(i=0; i<chromWidth; i++)
2391
                {
2392
                        unsigned int b= src[6*i+0];
2393
                        unsigned int g= src[6*i+1];
2394
                        unsigned int r= src[6*i+2];
2395

    
2396
                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2397

    
2398
                        ydst[2*i]         = Y;
2399

    
2400
                        b= src[6*i+3];
2401
                        g= src[6*i+4];
2402
                        r= src[6*i+5];
2403

    
2404
                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2405
                        ydst[2*i+1]         = Y;
2406
                }
2407
                udst += chromStride;
2408
                vdst += chromStride;
2409
                ydst += lumStride;
2410
                src  += srcStride;
2411
        }
2412
}
2413

    
2414
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2415
                            long width, long height, long src1Stride,
2416
                            long src2Stride, long dstStride){
2417
        long h;
2418

    
2419
        for(h=0; h < height; h++)
2420
        {
2421
                long w;
2422

    
2423
#ifdef HAVE_MMX
2424
#ifdef HAVE_SSE2
2425
                asm(
2426
                        "xor %%"REG_a", %%"REG_a"        \n\t"
2427
                        "1:                                \n\t"
2428
                        PREFETCH" 64(%1, %%"REG_a")        \n\t"
2429
                        PREFETCH" 64(%2, %%"REG_a")        \n\t"
2430
                        "movdqa (%1, %%"REG_a"), %%xmm0        \n\t"
2431
                        "movdqa (%1, %%"REG_a"), %%xmm1        \n\t"
2432
                        "movdqa (%2, %%"REG_a"), %%xmm2        \n\t"
2433
                        "punpcklbw %%xmm2, %%xmm0        \n\t"
2434
                        "punpckhbw %%xmm2, %%xmm1        \n\t"
2435
                        "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2436
                        "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2437
                        "add $16, %%"REG_a"                \n\t"
2438
                        "cmp %3, %%"REG_a"                \n\t"
2439
                        " jb 1b                                \n\t"
2440
                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2441
                        : "memory", "%"REG_a""
2442
                );
2443
#else
2444
                asm(
2445
                        "xor %%"REG_a", %%"REG_a"        \n\t"
2446
                        "1:                                \n\t"
2447
                        PREFETCH" 64(%1, %%"REG_a")        \n\t"
2448
                        PREFETCH" 64(%2, %%"REG_a")        \n\t"
2449
                        "movq (%1, %%"REG_a"), %%mm0        \n\t"
2450
                        "movq 8(%1, %%"REG_a"), %%mm2        \n\t"
2451
                        "movq %%mm0, %%mm1                \n\t"
2452
                        "movq %%mm2, %%mm3                \n\t"
2453
                        "movq (%2, %%"REG_a"), %%mm4        \n\t"
2454
                        "movq 8(%2, %%"REG_a"), %%mm5        \n\t"
2455
                        "punpcklbw %%mm4, %%mm0                \n\t"
2456
                        "punpckhbw %%mm4, %%mm1                \n\t"
2457
                        "punpcklbw %%mm5, %%mm2                \n\t"
2458
                        "punpckhbw %%mm5, %%mm3                \n\t"
2459
                        MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2460
                        MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2461
                        MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2462
                        MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2463
                        "add $16, %%"REG_a"                \n\t"
2464
                        "cmp %3, %%"REG_a"                \n\t"
2465
                        " jb 1b                                \n\t"
2466
                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2467
                        : "memory", "%"REG_a
2468
                );
2469
#endif
2470
                for(w= (width&(~15)); w < width; w++)
2471
                {
2472
                        dest[2*w+0] = src1[w];
2473
                        dest[2*w+1] = src2[w];
2474
                }
2475
#else
2476
                for(w=0; w < width; w++)
2477
                {
2478
                        dest[2*w+0] = src1[w];
2479
                        dest[2*w+1] = src2[w];
2480
                }
2481
#endif
2482
                dest += dstStride;
2483
                src1 += src1Stride;
2484
                src2 += src2Stride;
2485
        }
2486
#ifdef HAVE_MMX
2487
        asm(
2488
                EMMS" \n\t"
2489
                SFENCE" \n\t"
2490
                ::: "memory"
2491
                );
2492
#endif
2493
}
2494

    
2495
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2496
                        uint8_t *dst1, uint8_t *dst2,
2497
                        long width, long height,
2498
                        long srcStride1, long srcStride2,
2499
                        long dstStride1, long dstStride2)
2500
{
2501
    long y,x,w,h;
2502
    w=width/2; h=height/2;
2503
#ifdef HAVE_MMX
2504
    asm volatile(
2505
        PREFETCH" %0\n\t"
2506
        PREFETCH" %1\n\t"
2507
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2508
#endif
2509
    for(y=0;y<h;y++){
2510
        const uint8_t* s1=src1+srcStride1*(y>>1);
2511
        uint8_t* d=dst1+dstStride1*y;
2512
        x=0;
2513
#ifdef HAVE_MMX
2514
        for(;x<w-31;x+=32)
2515
        {
2516
            asm volatile(
2517
                PREFETCH" 32%1\n\t"
2518
                "movq        %1, %%mm0\n\t"
2519
                "movq        8%1, %%mm2\n\t"
2520
                "movq        16%1, %%mm4\n\t"
2521
                "movq        24%1, %%mm6\n\t"
2522
                "movq        %%mm0, %%mm1\n\t"
2523
                "movq        %%mm2, %%mm3\n\t"
2524
                "movq        %%mm4, %%mm5\n\t"
2525
                "movq        %%mm6, %%mm7\n\t"
2526
                "punpcklbw %%mm0, %%mm0\n\t"
2527
                "punpckhbw %%mm1, %%mm1\n\t"
2528
                "punpcklbw %%mm2, %%mm2\n\t"
2529
                "punpckhbw %%mm3, %%mm3\n\t"
2530
                "punpcklbw %%mm4, %%mm4\n\t"
2531
                "punpckhbw %%mm5, %%mm5\n\t"
2532
                "punpcklbw %%mm6, %%mm6\n\t"
2533
                "punpckhbw %%mm7, %%mm7\n\t"
2534
                MOVNTQ"        %%mm0, %0\n\t"
2535
                MOVNTQ"        %%mm1, 8%0\n\t"
2536
                MOVNTQ"        %%mm2, 16%0\n\t"
2537
                MOVNTQ"        %%mm3, 24%0\n\t"
2538
                MOVNTQ"        %%mm4, 32%0\n\t"
2539
                MOVNTQ"        %%mm5, 40%0\n\t"
2540
                MOVNTQ"        %%mm6, 48%0\n\t"
2541
                MOVNTQ"        %%mm7, 56%0"
2542
                :"=m"(d[2*x])
2543
                :"m"(s1[x])
2544
                :"memory");
2545
        }
2546
#endif
2547
        for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2548
    }
2549
    for(y=0;y<h;y++){
2550
        const uint8_t* s2=src2+srcStride2*(y>>1);
2551
        uint8_t* d=dst2+dstStride2*y;
2552
        x=0;
2553
#ifdef HAVE_MMX
2554
        for(;x<w-31;x+=32)
2555
        {
2556
            asm volatile(
2557
                PREFETCH" 32%1\n\t"
2558
                "movq        %1, %%mm0\n\t"
2559
                "movq        8%1, %%mm2\n\t"
2560
                "movq        16%1, %%mm4\n\t"
2561
                "movq        24%1, %%mm6\n\t"
2562
                "movq        %%mm0, %%mm1\n\t"
2563
                "movq        %%mm2, %%mm3\n\t"
2564
                "movq        %%mm4, %%mm5\n\t"
2565
                "movq        %%mm6, %%mm7\n\t"
2566
                "punpcklbw %%mm0, %%mm0\n\t"
2567
                "punpckhbw %%mm1, %%mm1\n\t"
2568
                "punpcklbw %%mm2, %%mm2\n\t"
2569
                "punpckhbw %%mm3, %%mm3\n\t"
2570
                "punpcklbw %%mm4, %%mm4\n\t"
2571
                "punpckhbw %%mm5, %%mm5\n\t"
2572
                "punpcklbw %%mm6, %%mm6\n\t"
2573
                "punpckhbw %%mm7, %%mm7\n\t"
2574
                MOVNTQ"        %%mm0, %0\n\t"
2575
                MOVNTQ"        %%mm1, 8%0\n\t"
2576
                MOVNTQ"        %%mm2, 16%0\n\t"
2577
                MOVNTQ"        %%mm3, 24%0\n\t"
2578
                MOVNTQ"        %%mm4, 32%0\n\t"
2579
                MOVNTQ"        %%mm5, 40%0\n\t"
2580
                MOVNTQ"        %%mm6, 48%0\n\t"
2581
                MOVNTQ"        %%mm7, 56%0"
2582
                :"=m"(d[2*x])
2583
                :"m"(s2[x])
2584
                :"memory");
2585
        }
2586
#endif
2587
        for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2588
    }
2589
#ifdef HAVE_MMX
2590
        asm(
2591
                EMMS" \n\t"
2592
                SFENCE" \n\t"
2593
                ::: "memory"
2594
                );
2595
#endif
2596
}
2597

    
2598
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2599
                        uint8_t *dst,
2600
                        long width, long height,
2601
                        long srcStride1, long srcStride2,
2602
                        long srcStride3, long dstStride)
2603
{
2604
    long y,x,w,h;
2605
    w=width/2; h=height;
2606
    for(y=0;y<h;y++){
2607
        const uint8_t* yp=src1+srcStride1*y;
2608
        const uint8_t* up=src2+srcStride2*(y>>2);
2609
        const uint8_t* vp=src3+srcStride3*(y>>2);
2610
        uint8_t* d=dst+dstStride*y;
2611
        x=0;
2612
#ifdef HAVE_MMX
2613
        for(;x<w-7;x+=8)
2614
        {
2615
            asm volatile(
2616
                PREFETCH" 32(%1, %0)\n\t"
2617
                PREFETCH" 32(%2, %0)\n\t"
2618
                PREFETCH" 32(%3, %0)\n\t"
2619
                "movq        (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2620
                "movq        (%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
2621
                "movq        (%3, %0), %%mm2\n\t"             /* V0V1V2V3V4V5V6V7 */
2622
                "movq        %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2623
                "movq        %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
2624
                "movq        %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
2625
                "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2626
                "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2627
                "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2628
                "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2629

    
2630
                "movq        %%mm1, %%mm6\n\t"
2631
                "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2632
                "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2633
                "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2634
                MOVNTQ"        %%mm0, (%4, %0, 8)\n\t"
2635
                MOVNTQ"        %%mm3, 8(%4, %0, 8)\n\t"
2636
                
2637
                "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2638
                "movq        8(%1, %0, 4), %%mm0\n\t"
2639
                "movq        %%mm0, %%mm3\n\t"
2640
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2641
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2642
                MOVNTQ"        %%mm0, 16(%4, %0, 8)\n\t"
2643
                MOVNTQ"        %%mm3, 24(%4, %0, 8)\n\t"
2644

    
2645
                "movq        %%mm4, %%mm6\n\t"
2646
                "movq        16(%1, %0, 4), %%mm0\n\t"
2647
                "movq        %%mm0, %%mm3\n\t"
2648
                "punpcklbw %%mm5, %%mm4\n\t"
2649
                "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2650
                "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2651
                MOVNTQ"        %%mm0, 32(%4, %0, 8)\n\t"
2652
                MOVNTQ"        %%mm3, 40(%4, %0, 8)\n\t"
2653
                
2654
                "punpckhbw %%mm5, %%mm6\n\t"
2655
                "movq        24(%1, %0, 4), %%mm0\n\t"
2656
                "movq        %%mm0, %%mm3\n\t"
2657
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2658
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2659
                MOVNTQ"        %%mm0, 48(%4, %0, 8)\n\t"
2660
                MOVNTQ"        %%mm3, 56(%4, %0, 8)\n\t"
2661

    
2662
                : "+r" (x)
2663
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
2664
                :"memory");
2665
        }
2666
#endif
2667
        for(; x<w; x++)
2668
        {
2669
            const long x2= x<<2;
2670
            d[8*x+0]=yp[x2];
2671
            d[8*x+1]=up[x];
2672
            d[8*x+2]=yp[x2+1];
2673
            d[8*x+3]=vp[x];
2674
            d[8*x+4]=yp[x2+2];
2675
            d[8*x+5]=up[x];
2676
            d[8*x+6]=yp[x2+3];
2677
            d[8*x+7]=vp[x];
2678
        }
2679
    }
2680
#ifdef HAVE_MMX
2681
        asm(
2682
                EMMS" \n\t"
2683
                SFENCE" \n\t"
2684
                ::: "memory"
2685
                );
2686
#endif
2687
}