Statistics
| Branch: | Revision:

ffmpeg / postproc / rgb2rgb_template.c @ 399cec2f

History | View | Annotate | Download (65.9 KB)

1 fcfbc150 Michael Niedermayer
/*
2 a3aece93 Nick Kurshev
 *
3
 *  rgb2rgb.c, Software RGB to RGB converter
4 6611aa83 Nick Kurshev
 *  pluralize by Software PAL8 to RGB convertor
5
 *               Software YUV to YUV convertor
6
 *               Software YUV to RGB convertor
7 a3aece93 Nick Kurshev
 *  Written by Nick Kurshev.
8 1de97d84 Michael Niedermayer
 *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9 a3aece93 Nick Kurshev
 */
10
11 0d9f3d85 Arpi
#include <stddef.h>
12
#include <inttypes.h> /* for __WORDSIZE */
13
14
#ifndef __WORDSIZE
15 ff78c596 Arpi
// #warning You have misconfigured system and probably will lose performance!
16
#define __WORDSIZE MP_WORDSIZE
17 0d9f3d85 Arpi
#endif
18
19 1de97d84 Michael Niedermayer
#undef PREFETCH
20
#undef MOVNTQ
21
#undef EMMS
22
#undef SFENCE
23
#undef MMREG_SIZE
24
#undef PREFETCHW
25
#undef PAVGB
26
27
#ifdef HAVE_SSE2
28
#define MMREG_SIZE 16
29
#else
30
#define MMREG_SIZE 8
31
#endif
32
33
#ifdef HAVE_3DNOW
34
#define PREFETCH  "prefetch"
35
#define PREFETCHW "prefetchw"
36
#define PAVGB          "pavgusb"
37
#elif defined ( HAVE_MMX2 )
38
#define PREFETCH "prefetchnta"
39
#define PREFETCHW "prefetcht0"
40
#define PAVGB          "pavgb"
41
#else
42
#define PREFETCH "/nop"
43
#define PREFETCHW "/nop"
44 99969243 Michael Niedermayer
#endif
45 1de97d84 Michael Niedermayer
46
#ifdef HAVE_3DNOW
47
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
48
#define EMMS     "femms"
49
#else
50
#define EMMS     "emms"
51 e697a141 Michael Niedermayer
#endif
52 79811694 Nick Kurshev
53 1de97d84 Michael Niedermayer
#ifdef HAVE_MMX2
54
#define MOVNTQ "movntq"
55
#define SFENCE "sfence"
56
#else
57
#define MOVNTQ "movq"
58
#define SFENCE "/nop"
59
#endif
60
61
/*
 * Expand packed 24bpp pixels to 32bpp: each 3-byte pixel is widened to
 * 4 bytes, with the spare (high/alpha) byte cleared to 0.  Bytes are
 * copied in memory order, so the routine is component-order agnostic
 * (works for RGB24->RGB32 and BGR24->BGR32 alike).
 * src_size is the length of src in bytes.
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end;
#ifdef HAVE_MMX
  const uint8_t *mm_end;
#endif
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
  /* MMX loop consumes 24 src bytes (8 pixels) per pass; stop while a
     full group remains, the scalar loop below finishes the tail. */
  mm_end = end - 23;
  /* mm7 = mask32 (defined elsewhere; presumably clears the 4th byte of
     each dword -- TODO confirm against the masks' definition). */
  __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
  while(s < mm_end)
  {
    /* Load 8 source pixels as 4 dword pairs at 3-byte stride, mask off
       the unwanted overlap byte, store 32 output bytes. */
    __asm __volatile(
        PREFETCH"        32%1\n\t"
        "movd        %1, %%mm0\n\t"
        "punpckldq 3%1, %%mm0\n\t"
        "movd        6%1, %%mm1\n\t"
        "punpckldq 9%1, %%mm1\n\t"
        "movd        12%1, %%mm2\n\t"
        "punpckldq 15%1, %%mm2\n\t"
        "movd        18%1, %%mm3\n\t"
        "punpckldq 21%1, %%mm3\n\t"
        "pand        %%mm7, %%mm0\n\t"
        "pand        %%mm7, %%mm1\n\t"
        "pand        %%mm7, %%mm2\n\t"
        "pand        %%mm7, %%mm3\n\t"
        MOVNTQ"        %%mm0, %0\n\t"
        MOVNTQ"        %%mm1, 8%0\n\t"
        MOVNTQ"        %%mm2, 16%0\n\t"
        MOVNTQ"        %%mm3, 24%0"
        :"=m"(*dest)
        :"m"(*s)
        :"memory");
    dest += 32;
    s += 24;
  }
  __asm __volatile(SFENCE:::"memory");  /* flush non-temporal stores */
  __asm __volatile(EMMS:::"memory");    /* leave MMX state for FPU code */
#endif
  /* Scalar tail (or whole conversion without MMX): copy 3 bytes, pad 1. */
  while(s < end)
  {
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = 0;
  }
}
111 59ac5a93 Nick Kurshev
112 1de97d84 Michael Niedermayer
/*
 * Squeeze packed 32bpp pixels to 24bpp by dropping the 4th byte of each
 * pixel.  Bytes are copied in memory order, so the routine works for
 * both RGB32->RGB24 and BGR32->BGR24.
 * src_size is the length of src in bytes.
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end;
#ifdef HAVE_MMX
  const uint8_t *mm_end;
#endif
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
  /* MMX loop consumes 32 src bytes (8 pixels) per pass. */
  mm_end = end - 31;
  while(s < mm_end)
  {
    /* Compact 8 dword pixels into 24 output bytes: shift each pixel's
       top 3 bytes down over the dropped byte, then merge the quadwords.
       mask24l/h/hh/hhh/hhhh are defined elsewhere -- presumably byte
       select masks for the repacking (TODO confirm). */
    __asm __volatile(
        PREFETCH"        32%1\n\t"
        "movq        %1, %%mm0\n\t"
        "movq        8%1, %%mm1\n\t"
        "movq        16%1, %%mm4\n\t"
        "movq        24%1, %%mm5\n\t"
        "movq        %%mm0, %%mm2\n\t"
        "movq        %%mm1, %%mm3\n\t"
        "movq        %%mm4, %%mm6\n\t"
        "movq        %%mm5, %%mm7\n\t"
        "psrlq        $8, %%mm2\n\t"
        "psrlq        $8, %%mm3\n\t"
        "psrlq        $8, %%mm6\n\t"
        "psrlq        $8, %%mm7\n\t"
        "pand        %2, %%mm0\n\t"
        "pand        %2, %%mm1\n\t"
        "pand        %2, %%mm4\n\t"
        "pand        %2, %%mm5\n\t"
        "pand        %3, %%mm2\n\t"
        "pand        %3, %%mm3\n\t"
        "pand        %3, %%mm6\n\t"
        "pand        %3, %%mm7\n\t"
        "por        %%mm2, %%mm0\n\t"
        "por        %%mm3, %%mm1\n\t"
        "por        %%mm6, %%mm4\n\t"
        "por        %%mm7, %%mm5\n\t"

        "movq        %%mm1, %%mm2\n\t"
        "movq        %%mm4, %%mm3\n\t"
        "psllq        $48, %%mm2\n\t"
        "psllq        $32, %%mm3\n\t"
        "pand        %4, %%mm2\n\t"
        "pand        %5, %%mm3\n\t"
        "por        %%mm2, %%mm0\n\t"
        "psrlq        $16, %%mm1\n\t"
        "psrlq        $32, %%mm4\n\t"
        "psllq        $16, %%mm5\n\t"
        "por        %%mm3, %%mm1\n\t"
        "pand        %6, %%mm5\n\t"
        "por        %%mm5, %%mm4\n\t"

        MOVNTQ"        %%mm0, %0\n\t"
        MOVNTQ"        %%mm1, 8%0\n\t"
        MOVNTQ"        %%mm4, 16%0"
        :"=m"(*dest)
        :"m"(*s),"m"(mask24l),
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
    dest += 24;
    s += 32;
  }
  __asm __volatile(SFENCE:::"memory");  /* flush non-temporal stores */
  __asm __volatile(EMMS:::"memory");    /* leave MMX state for FPU code */
#endif
  /* Scalar tail: copy 3 bytes per pixel, skip the 4th. */
  while(s < end)
  {
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = *s++;
    s++;
  }
}
188 b238eb2e Nick Kurshev
189 a3aece93 Nick Kurshev
/*
190
 Original by Strepto/Astral
191
 ported to gcc & bugfixed : A'rpi
192 51da31f1 Nick Kurshev
 MMX2, 3DNOW optimization by Nick Kurshev
193 9b2c28e6 Michael Niedermayer
 32bit c version, and and&add trick by Michael Niedermayer
194 a3aece93 Nick Kurshev
*/
195 1de97d84 Michael Niedermayer
/*
 * Convert RGB555 (15bpp) to RGB565 (16bpp).  Blue (low 5 bits) stays in
 * place; red and green move up one bit; green's new LSB is left as 0.
 * Uses the and&add trick: (x & 0x7FFF) + (x & 0x7FE0) shifts the upper
 * 10 bits left by one via self-addition, two pixels per 32-bit word.
 * src_size is the length of src in bytes.
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  register const uint8_t* s=src;
  register uint8_t* d=dst;
  register const uint8_t *end;
  const uint8_t *mm_end;
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
  /* mm4 = mask15s (defined elsewhere; presumably 0x7FE0 per word, the
     red+green field of a 555 pixel -- TODO confirm). */
  __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
  mm_end = end - 15;  /* MMX loop eats 16 bytes (8 pixels) per pass */
  while(s<mm_end)
  {
        /* Same and&add trick, 8 pixels at a time. */
        __asm __volatile(
                PREFETCH"        32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        8%1, %%mm2\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm2, %%mm3\n\t"
                "pand        %%mm4, %%mm0\n\t"
                "pand        %%mm4, %%mm2\n\t"
                "paddw        %%mm1, %%mm0\n\t"
                "paddw        %%mm3, %%mm2\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm2, 8%0"
                :"=m"(*d)
                :"m"(*s)
                );
        d+=16;
        s+=16;
  }
  __asm __volatile(SFENCE:::"memory");  /* flush non-temporal stores */
  __asm __volatile(EMMS:::"memory");    /* leave MMX state for FPU code */
#endif
    /* 32-bit C tail: two pixels per iteration. */
    mm_end = end - 3;
    while(s < mm_end)
    {
        register unsigned x= *((uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    /* At most one odd trailing pixel. */
    if(s < end)
    {
        register unsigned short x= *((uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
243 fcfbc150 Michael Niedermayer
244 ac4d0aea Michael Niedermayer
/*
 * Convert RGB565 (16bpp) to RGB555 (15bpp).  Blue (low 5 bits, mask
 * 0x001F) stays in place; red and green move down one bit, dropping
 * green's LSB: result = ((x>>1) & 0x7FE0) | (x & 0x001F).
 * src_size is the length of src in bytes.
 */
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  register const uint8_t* s=src;
  register uint8_t* d=dst;
  register const uint8_t *end;
  const uint8_t *mm_end;
  end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"        %0"::"m"(*s));
  /* mm7 = mask15rg, mm6 = mask15b (defined elsewhere; presumably the
     red+green and blue field masks of a 555 pixel -- TODO confirm). */
  __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
  __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
  mm_end = end - 15;  /* MMX loop eats 16 bytes (8 pixels) per pass */
  while(s<mm_end)
  {
        /* Shift red+green down one bit, keep blue, recombine. */
        __asm __volatile(
                PREFETCH"        32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        8%1, %%mm2\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm2, %%mm3\n\t"
                "psrlq        $1, %%mm0\n\t"
                "psrlq        $1, %%mm2\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm3\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm3, %%mm2\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm2, 8%0"
                :"=m"(*d)
                :"m"(*s)
                );
        d+=16;
        s+=16;
  }
  __asm __volatile(SFENCE:::"memory");  /* flush non-temporal stores */
  __asm __volatile(EMMS:::"memory");    /* leave MMX state for FPU code */
#endif
    /* 32-bit C tail: two pixels per iteration. */
    mm_end = end - 3;
    while(s < mm_end)
    {
        register uint32_t x= *((uint32_t *)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    /* At most one odd trailing pixel. */
    if(s < end)
    {
        register uint16_t x= *((uint16_t *)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
        s+=2;
        d+=2;
    }
}
299
300 1de97d84 Michael Niedermayer
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
301 fcfbc150 Michael Niedermayer
{
302 53445e83 Nick Kurshev
        const uint8_t *s = src;
303 0d9f3d85 Arpi
        const uint8_t *end;
304
#ifdef HAVE_MMX
305
        const uint8_t *mm_end;
306
#endif
307 53445e83 Nick Kurshev
        uint16_t *d = (uint16_t *)dst;
308
        end = s + src_size;
309 0d9f3d85 Arpi
#ifdef HAVE_MMX
310 aeae5d53 Michael Niedermayer
        mm_end = end - 15;
311
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
312
        asm volatile(
313
                "movq %3, %%mm5                        \n\t"
314
                "movq %4, %%mm6                        \n\t"
315
                "movq %5, %%mm7                        \n\t"
316
                ".balign 16                        \n\t"
317
                "1:                                \n\t"
318
                PREFETCH" 32(%1)                \n\t"
319
                "movd        (%1), %%mm0                \n\t"
320
                "movd        4(%1), %%mm3                \n\t"
321
                "punpckldq 8(%1), %%mm0                \n\t"
322
                "punpckldq 12(%1), %%mm3        \n\t"
323
                "movq %%mm0, %%mm1                \n\t"
324
                "movq %%mm3, %%mm4                \n\t"
325
                "pand %%mm6, %%mm0                \n\t"
326
                "pand %%mm6, %%mm3                \n\t"
327
                "pmaddwd %%mm7, %%mm0                \n\t"
328
                "pmaddwd %%mm7, %%mm3                \n\t"
329
                "pand %%mm5, %%mm1                \n\t"
330
                "pand %%mm5, %%mm4                \n\t"
331
                "por %%mm1, %%mm0                \n\t"        
332
                "por %%mm4, %%mm3                \n\t"
333
                "psrld $5, %%mm0                \n\t"
334
                "pslld $11, %%mm3                \n\t"
335
                "por %%mm3, %%mm0                \n\t"
336
                MOVNTQ"        %%mm0, (%0)                \n\t"
337
                "addl $16, %1                        \n\t"
338
                "addl $8, %0                        \n\t"
339
                "cmpl %2, %1                        \n\t"
340
                " jb 1b                                \n\t"
341
                : "+r" (d), "+r"(s)
342
                : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
343
        );
344
#else
345 53445e83 Nick Kurshev
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
346
        __asm __volatile(
347
            "movq        %0, %%mm7\n\t"
348
            "movq        %1, %%mm6\n\t"
349
            ::"m"(red_16mask),"m"(green_16mask));
350
        while(s < mm_end)
351
        {
352
            __asm __volatile(
353
                PREFETCH" 32%1\n\t"
354
                "movd        %1, %%mm0\n\t"
355
                "movd        4%1, %%mm3\n\t"
356
                "punpckldq 8%1, %%mm0\n\t"
357
                "punpckldq 12%1, %%mm3\n\t"
358
                "movq        %%mm0, %%mm1\n\t"
359
                "movq        %%mm0, %%mm2\n\t"
360
                "movq        %%mm3, %%mm4\n\t"
361
                "movq        %%mm3, %%mm5\n\t"
362
                "psrlq        $3, %%mm0\n\t"
363
                "psrlq        $3, %%mm3\n\t"
364
                "pand        %2, %%mm0\n\t"
365
                "pand        %2, %%mm3\n\t"
366
                "psrlq        $5, %%mm1\n\t"
367
                "psrlq        $5, %%mm4\n\t"
368
                "pand        %%mm6, %%mm1\n\t"
369
                "pand        %%mm6, %%mm4\n\t"
370
                "psrlq        $8, %%mm2\n\t"
371
                "psrlq        $8, %%mm5\n\t"
372
                "pand        %%mm7, %%mm2\n\t"
373
                "pand        %%mm7, %%mm5\n\t"
374
                "por        %%mm1, %%mm0\n\t"
375
                "por        %%mm4, %%mm3\n\t"
376
                "por        %%mm2, %%mm0\n\t"
377
                "por        %%mm5, %%mm3\n\t"
378
                "psllq        $16, %%mm3\n\t"
379
                "por        %%mm3, %%mm0\n\t"
380
                MOVNTQ"        %%mm0, %0\n\t"
381
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
382
                d += 4;
383
                s += 16;
384
        }
385 aeae5d53 Michael Niedermayer
#endif
386 0d9f3d85 Arpi
        __asm __volatile(SFENCE:::"memory");
387
        __asm __volatile(EMMS:::"memory");
388
#endif
389 53445e83 Nick Kurshev
        while(s < end)
390
        {
391 deb2277c Michael Niedermayer
                const int src= *((uint32_t*)s)++;
392
                *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
393
//                *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
394 53445e83 Nick Kurshev
        }
395 fcfbc150 Michael Niedermayer
}
396
397 ac4d0aea Michael Niedermayer
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
398
{
399
        const uint8_t *s = src;
400
        const uint8_t *end;
401
#ifdef HAVE_MMX
402
        const uint8_t *mm_end;
403
#endif
404
        uint16_t *d = (uint16_t *)dst;
405
        end = s + src_size;
406
#ifdef HAVE_MMX
407
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
408
        __asm __volatile(
409
            "movq        %0, %%mm7\n\t"
410
            "movq        %1, %%mm6\n\t"
411
            ::"m"(red_16mask),"m"(green_16mask));
412 0598bcbb Michael Niedermayer
        mm_end = end - 15;
413 ac4d0aea Michael Niedermayer
        while(s < mm_end)
414
        {
415
            __asm __volatile(
416
                PREFETCH" 32%1\n\t"
417
                "movd        %1, %%mm0\n\t"
418
                "movd        4%1, %%mm3\n\t"
419
                "punpckldq 8%1, %%mm0\n\t"
420
                "punpckldq 12%1, %%mm3\n\t"
421
                "movq        %%mm0, %%mm1\n\t"
422
                "movq        %%mm0, %%mm2\n\t"
423
                "movq        %%mm3, %%mm4\n\t"
424
                "movq        %%mm3, %%mm5\n\t"
425
                "psllq        $8, %%mm0\n\t"
426
                "psllq        $8, %%mm3\n\t"
427
                "pand        %%mm7, %%mm0\n\t"
428
                "pand        %%mm7, %%mm3\n\t"
429
                "psrlq        $5, %%mm1\n\t"
430
                "psrlq        $5, %%mm4\n\t"
431
                "pand        %%mm6, %%mm1\n\t"
432
                "pand        %%mm6, %%mm4\n\t"
433
                "psrlq        $19, %%mm2\n\t"
434
                "psrlq        $19, %%mm5\n\t"
435
                "pand        %2, %%mm2\n\t"
436
                "pand        %2, %%mm5\n\t"
437
                "por        %%mm1, %%mm0\n\t"
438
                "por        %%mm4, %%mm3\n\t"
439
                "por        %%mm2, %%mm0\n\t"
440
                "por        %%mm5, %%mm3\n\t"
441
                "psllq        $16, %%mm3\n\t"
442
                "por        %%mm3, %%mm0\n\t"
443
                MOVNTQ"        %%mm0, %0\n\t"
444
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
445
                d += 4;
446
                s += 16;
447
        }
448
        __asm __volatile(SFENCE:::"memory");
449
        __asm __volatile(EMMS:::"memory");
450
#endif
451
        while(s < end)
452
        {
453 deb2277c Michael Niedermayer
                const int src= *((uint32_t*)s)++;
454
                *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
455 ac4d0aea Michael Niedermayer
        }
456
}
457
458 1de97d84 Michael Niedermayer
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
459 fcfbc150 Michael Niedermayer
{
460 53445e83 Nick Kurshev
        const uint8_t *s = src;
461 0d9f3d85 Arpi
        const uint8_t *end;
462
#ifdef HAVE_MMX
463
        const uint8_t *mm_end;
464
#endif
465 53445e83 Nick Kurshev
        uint16_t *d = (uint16_t *)dst;
466
        end = s + src_size;
467 0d9f3d85 Arpi
#ifdef HAVE_MMX
468 aeae5d53 Michael Niedermayer
        mm_end = end - 15;
469
#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
470
        asm volatile(
471
                "movq %3, %%mm5                        \n\t"
472
                "movq %4, %%mm6                        \n\t"
473
                "movq %5, %%mm7                        \n\t"
474
                ".balign 16                        \n\t"
475
                "1:                                \n\t"
476
                PREFETCH" 32(%1)                \n\t"
477
                "movd        (%1), %%mm0                \n\t"
478
                "movd        4(%1), %%mm3                \n\t"
479
                "punpckldq 8(%1), %%mm0                \n\t"
480
                "punpckldq 12(%1), %%mm3        \n\t"
481
                "movq %%mm0, %%mm1                \n\t"
482
                "movq %%mm3, %%mm4                \n\t"
483
                "pand %%mm6, %%mm0                \n\t"
484
                "pand %%mm6, %%mm3                \n\t"
485
                "pmaddwd %%mm7, %%mm0                \n\t"
486
                "pmaddwd %%mm7, %%mm3                \n\t"
487
                "pand %%mm5, %%mm1                \n\t"
488
                "pand %%mm5, %%mm4                \n\t"
489
                "por %%mm1, %%mm0                \n\t"        
490
                "por %%mm4, %%mm3                \n\t"
491
                "psrld $6, %%mm0                \n\t"
492
                "pslld $10, %%mm3                \n\t"
493
                "por %%mm3, %%mm0                \n\t"
494
                MOVNTQ"        %%mm0, (%0)                \n\t"
495
                "addl $16, %1                        \n\t"
496
                "addl $8, %0                        \n\t"
497
                "cmpl %2, %1                        \n\t"
498
                " jb 1b                                \n\t"
499
                : "+r" (d), "+r"(s)
500
                : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
501
        );
502
#else
503 53445e83 Nick Kurshev
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
504
        __asm __volatile(
505
            "movq        %0, %%mm7\n\t"
506
            "movq        %1, %%mm6\n\t"
507
            ::"m"(red_15mask),"m"(green_15mask));
508
        while(s < mm_end)
509
        {
510
            __asm __volatile(
511
                PREFETCH" 32%1\n\t"
512
                "movd        %1, %%mm0\n\t"
513
                "movd        4%1, %%mm3\n\t"
514
                "punpckldq 8%1, %%mm0\n\t"
515
                "punpckldq 12%1, %%mm3\n\t"
516
                "movq        %%mm0, %%mm1\n\t"
517
                "movq        %%mm0, %%mm2\n\t"
518
                "movq        %%mm3, %%mm4\n\t"
519
                "movq        %%mm3, %%mm5\n\t"
520
                "psrlq        $3, %%mm0\n\t"
521
                "psrlq        $3, %%mm3\n\t"
522
                "pand        %2, %%mm0\n\t"
523
                "pand        %2, %%mm3\n\t"
524
                "psrlq        $6, %%mm1\n\t"
525
                "psrlq        $6, %%mm4\n\t"
526
                "pand        %%mm6, %%mm1\n\t"
527
                "pand        %%mm6, %%mm4\n\t"
528
                "psrlq        $9, %%mm2\n\t"
529
                "psrlq        $9, %%mm5\n\t"
530
                "pand        %%mm7, %%mm2\n\t"
531
                "pand        %%mm7, %%mm5\n\t"
532
                "por        %%mm1, %%mm0\n\t"
533
                "por        %%mm4, %%mm3\n\t"
534
                "por        %%mm2, %%mm0\n\t"
535
                "por        %%mm5, %%mm3\n\t"
536
                "psllq        $16, %%mm3\n\t"
537
                "por        %%mm3, %%mm0\n\t"
538
                MOVNTQ"        %%mm0, %0\n\t"
539
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
540
                d += 4;
541
                s += 16;
542
        }
543 aeae5d53 Michael Niedermayer
#endif
544 0d9f3d85 Arpi
        __asm __volatile(SFENCE:::"memory");
545
        __asm __volatile(EMMS:::"memory");
546
#endif
547 53445e83 Nick Kurshev
        while(s < end)
548
        {
549 deb2277c Michael Niedermayer
                const int src= *((uint32_t*)s)++;
550
                *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
551 53445e83 Nick Kurshev
        }
552 fcfbc150 Michael Niedermayer
}
553
554 ac4d0aea Michael Niedermayer
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
555
{
556
        const uint8_t *s = src;
557
        const uint8_t *end;
558
#ifdef HAVE_MMX
559
        const uint8_t *mm_end;
560
#endif
561
        uint16_t *d = (uint16_t *)dst;
562
        end = s + src_size;
563
#ifdef HAVE_MMX
564
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
565
        __asm __volatile(
566
            "movq        %0, %%mm7\n\t"
567
            "movq        %1, %%mm6\n\t"
568
            ::"m"(red_15mask),"m"(green_15mask));
569 0598bcbb Michael Niedermayer
        mm_end = end - 15;
570 ac4d0aea Michael Niedermayer
        while(s < mm_end)
571
        {
572
            __asm __volatile(
573
                PREFETCH" 32%1\n\t"
574
                "movd        %1, %%mm0\n\t"
575
                "movd        4%1, %%mm3\n\t"
576
                "punpckldq 8%1, %%mm0\n\t"
577
                "punpckldq 12%1, %%mm3\n\t"
578
                "movq        %%mm0, %%mm1\n\t"
579
                "movq        %%mm0, %%mm2\n\t"
580
                "movq        %%mm3, %%mm4\n\t"
581
                "movq        %%mm3, %%mm5\n\t"
582
                "psllq        $7, %%mm0\n\t"
583
                "psllq        $7, %%mm3\n\t"
584
                "pand        %%mm7, %%mm0\n\t"
585
                "pand        %%mm7, %%mm3\n\t"
586
                "psrlq        $6, %%mm1\n\t"
587
                "psrlq        $6, %%mm4\n\t"
588
                "pand        %%mm6, %%mm1\n\t"
589
                "pand        %%mm6, %%mm4\n\t"
590
                "psrlq        $19, %%mm2\n\t"
591
                "psrlq        $19, %%mm5\n\t"
592
                "pand        %2, %%mm2\n\t"
593
                "pand        %2, %%mm5\n\t"
594
                "por        %%mm1, %%mm0\n\t"
595
                "por        %%mm4, %%mm3\n\t"
596
                "por        %%mm2, %%mm0\n\t"
597
                "por        %%mm5, %%mm3\n\t"
598
                "psllq        $16, %%mm3\n\t"
599
                "por        %%mm3, %%mm0\n\t"
600
                MOVNTQ"        %%mm0, %0\n\t"
601
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
602
                d += 4;
603
                s += 16;
604
        }
605
        __asm __volatile(SFENCE:::"memory");
606
        __asm __volatile(EMMS:::"memory");
607
#endif
608
        while(s < end)
609
        {
610 deb2277c Michael Niedermayer
                const int src= *((uint32_t*)s)++;
611
                *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
612 ac4d0aea Michael Niedermayer
        }
613
}
614
615 1de97d84 Michael Niedermayer
/*
 * Convert packed 24bpp (first byte = blue, per the scalar loop below)
 * to RGB565: b 8->5 bits at 0, g 8->6 bits at 5, r 8->5 bits at 11.
 * src_size is the length of src in bytes.
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        /* mm7 = red_16mask, mm6 = green_16mask (defined elsewhere). */
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        mm_end = end - 11;  /* MMX loop eats 12 src bytes (4 pixels) per pass */
        while(s < mm_end)
        {
            /* Load 4 pixels at 3-byte stride, shift/mask each colour
               field into its 565 slot, merge, store 4 output words. */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $5, %%mm1\n\t"
                "psrlq        $5, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");  /* flush non-temporal stores */
        __asm __volatile(EMMS:::"memory");    /* leave MMX state for FPU code */
#endif
        /* Scalar tail (or whole conversion without MMX). */
        while(s < end)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
}
677
678 ac4d0aea Michael Niedermayer
/*
 * Convert packed 24bpp with the opposite component order (first byte =
 * red, per the scalar loop below) to 565: the first byte lands in the
 * high 5-bit field and the third in the low one.
 * src_size is the length of src in bytes.
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        /* mm7 = red_16mask, mm6 = green_16mask (defined elsewhere). */
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        mm_end = end - 15;  /* MMX loop eats 12 src bytes (4 pixels) per pass */
        while(s < mm_end)
        {
            /* Like rgb24to16 but the first byte is shifted UP into the
               high slot (psllq $8) and the third DOWN (psrlq $19). */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psllq        $8, %%mm0\n\t"
                "psllq        $8, %%mm3\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm3\n\t"
                "psrlq        $5, %%mm1\n\t"
                "psrlq        $5, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $19, %%mm2\n\t"
                "psrlq        $19, %%mm5\n\t"
                "pand        %2, %%mm2\n\t"
                "pand        %2, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");  /* flush non-temporal stores */
        __asm __volatile(EMMS:::"memory");    /* leave MMX state for FPU code */
#endif
        /* Scalar tail (or whole conversion without MMX). */
        while(s < end)
        {
                const int r= *s++;
                const int g= *s++;
                const int b= *s++;
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
}
740
741 1de97d84 Michael Niedermayer
/*
 * Convert packed 24bpp pixels to 15-bit 5-5-5 words.
 *
 * src      - source buffer, 3 bytes per pixel
 * dst      - destination buffer, 2 bytes per pixel
 * src_size - number of source bytes to convert
 *
 * Packing (see the scalar tail): 1st source byte -> bits 0-4,
 * 2nd byte -> bits 5-9, 3rd byte -> bits 10-14 of each output word.
 * The MMX fast path produces 4 output words (12 input bytes) per
 * iteration; the scalar loop finishes the remainder.
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        /* keep the red and green field masks resident in mm7/mm6 for the
           whole loop; the blue mask is passed per iteration as operand %2 */
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /* each iteration loads 4 bytes at offset 9 (i.e. reads up to s+12),
           so stop while at least 12 input bytes remain */
        mm_end = end - 11;
        while(s < mm_end)
        {
            /* load 4 pixels (12 bytes), shift each channel into its 5-bit
               field, mask, merge, and store 4 output words with one movq */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psrlq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm3\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $9, %%mm2\n\t"
                "psrlq        $9, %%mm5\n\t"
                "pand        %%mm7, %%mm2\n\t"
                "pand        %%mm7, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 12;
        }
        /* flush non-temporal stores and leave MMX state */
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar tail: convert the remaining (<4) pixels */
        while(s < end)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
}
803
804 ac4d0aea Michael Niedermayer
/*
 * Convert packed 24bpp pixels to 15-bit 5-5-5 words with the first and
 * third channels swapped relative to RENAME(rgb24to15).
 *
 * src      - source buffer, 3 bytes per pixel
 * dst      - destination buffer, 2 bytes per pixel
 * src_size - number of source bytes to convert
 *
 * Packing (see the scalar tail): 1st source byte -> bits 10-14,
 * 2nd byte -> bits 5-9, 3rd byte -> bits 0-4 of each output word.
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
#ifdef HAVE_MMX
        const uint8_t *mm_end;
#endif
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*src):"memory");
        /* mm7 = red field mask, mm6 = green field mask; blue mask is %2 */
        __asm __volatile(
            "movq        %0, %%mm7\n\t"
            "movq        %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /* conservative bound: guarantees the 12-byte group read by each
           iteration stays inside the buffer */
        mm_end = end - 15;
        while(s < mm_end)
        {
            /* 4 pixels per iteration; note psllq $7 moves byte 0 into the
               red field, the mirror image of rgb24to15's psrlq $3 */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd        %1, %%mm0\n\t"
                "movd        3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq        %%mm0, %%mm1\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm3, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "psllq        $7, %%mm0\n\t"
                "psllq        $7, %%mm3\n\t"
                "pand        %%mm7, %%mm0\n\t"
                "pand        %%mm7, %%mm3\n\t"
                "psrlq        $6, %%mm1\n\t"
                "psrlq        $6, %%mm4\n\t"
                "pand        %%mm6, %%mm1\n\t"
                "pand        %%mm6, %%mm4\n\t"
                "psrlq        $19, %%mm2\n\t"
                "psrlq        $19, %%mm5\n\t"
                "pand        %2, %%mm2\n\t"
                "pand        %2, %%mm5\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm5, %%mm3\n\t"
                "psllq        $16, %%mm3\n\t"
                "por        %%mm3, %%mm0\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 12;
        }
        /* flush non-temporal stores and leave MMX state */
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar tail: convert the remaining pixels */
        while(s < end)
        {
                const int r= *s++;
                const int g= *s++;
                const int b= *s++;
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
}
866
867 0d9f3d85 Arpi
/*
  Here we use a less accurate approximation: the input value is simply
  left-shifted and the low-order bits are filled with zeroes. This method
  improves PNG compression, but it cannot reproduce white exactly, since it
  never generates an all-ones maximum value; the net effect is to darken the
  image slightly.

  A better method would be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      Leftmost Bits Repeated to Fill Open Bits
       |
   Original Bits
*/
890
/*
 * Expand 15-bit 5-5-5 words to packed 24bpp pixels.
 *
 * src      - source buffer, 2 bytes per pixel (read as uint16_t words)
 * dst      - destination buffer, 3 bytes per pixel
 * src_size - number of source bytes
 *
 * Each 5-bit field is widened to 8 bits by plain left-shift (no bit
 * replication -- see the accuracy note above this function), so output
 * bytes are: (w&0x1F)<<3, (w&0x3E0)>>2, (w&0x7C00)>>7.
 * The MMX path converts 8 pixels per iteration in two stages: first
 * expand to 32bpp-style dwords in mm0/mm3/mm6/mm7, then repack those to
 * 24 output bytes (the "borrowed 32 to 24" statement, which relies on the
 * MMX register state left by the first asm statement).
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;
#ifdef HAVE_MMX
        const uint16_t *mm_end;
#endif
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
        /* each iteration reads 16 bytes (%1 and 8%1), i.e. 8 input words */
        mm_end = end - 7;
        while(s < mm_end)
        {
            /* stage 1: isolate b/g/r fields of 8 words, shift each to its
               byte position, and unpack words to dwords (mmx_null = zero) */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $2, %%mm1\n\t"
                "psrlq        $7, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"

                "movq        %%mm0, %%mm6\n\t"
                "movq        %%mm3, %%mm7\n\t"
                
                "movq        8%1, %%mm0\n\t"
                "movq        8%1, %%mm1\n\t"
                "movq        8%1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $2, %%mm1\n\t"
                "psrlq        $7, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"

                :"=m"(*d)
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
                :"memory");
            /* Borrowed 32 to 24 */
            /* stage 2: squeeze the four 32bpp-style quadwords (mm0, mm3
               plus the saved mm6, mm7) into 24 output bytes */
            __asm __volatile(
                "movq        %%mm0, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "movq        %%mm6, %%mm0\n\t"
                "movq        %%mm7, %%mm1\n\t"
                
                "movq        %%mm4, %%mm6\n\t"
                "movq        %%mm5, %%mm7\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm1, %%mm3\n\t"

                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm3\n\t"
                "psrlq        $8, %%mm6\n\t"
                "psrlq        $8, %%mm7\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm1\n\t"
                "pand        %2, %%mm4\n\t"
                "pand        %2, %%mm5\n\t"
                "pand        %3, %%mm2\n\t"
                "pand        %3, %%mm3\n\t"
                "pand        %3, %%mm6\n\t"
                "pand        %3, %%mm7\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm3, %%mm1\n\t"
                "por        %%mm6, %%mm4\n\t"
                "por        %%mm7, %%mm5\n\t"

                "movq        %%mm1, %%mm2\n\t"
                "movq        %%mm4, %%mm3\n\t"
                "psllq        $48, %%mm2\n\t"
                "psllq        $32, %%mm3\n\t"
                "pand        %4, %%mm2\n\t"
                "pand        %5, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psrlq        $16, %%mm1\n\t"
                "psrlq        $32, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm3, %%mm1\n\t"
                "pand        %6, %%mm5\n\t"
                "por        %%mm5, %%mm4\n\t"

                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm1, 8%0\n\t"
                MOVNTQ"        %%mm4, 16%0"

                :"=m"(*d)
                :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
                :"memory");
                d += 24;
                s += 8;
        }
        /* flush non-temporal stores and leave MMX state */
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar tail: convert the remaining (<8) pixels */
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x3E0)>>2;
                *d++ = (bgr&0x7C00)>>7;
        }
}
1031
1032
/*
 * Expand 16-bit 5-6-5 words to packed 24bpp pixels.
 *
 * src      - source buffer, 2 bytes per pixel (read as uint16_t words)
 * dst      - destination buffer, 3 bytes per pixel
 * src_size - number of source bytes
 *
 * Same structure as RENAME(rgb15to24), adjusted for the 6-bit middle
 * field: output bytes are (w&0x1F)<<3, (w&0x7E0)>>3, (w&0xF800)>>8.
 * Fields are widened by plain left-shift (no bit replication), so full
 * white is not exactly reproducible.
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;
#ifdef HAVE_MMX
        const uint16_t *mm_end;
#endif
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
        /* each iteration reads 16 bytes (%1 and 8%1), i.e. 8 input words */
        mm_end = end - 7;
        while(s < mm_end)
        {
            /* stage 1: isolate b/g/r fields of 8 words, shift each to its
               byte position, and unpack words to dwords (mmx_null = zero) */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm1\n\t"
                "psrlq        $8, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"
                
                "movq        %%mm0, %%mm6\n\t"
                "movq        %%mm3, %%mm7\n\t"

                "movq        8%1, %%mm0\n\t"
                "movq        8%1, %%mm1\n\t"
                "movq        8%1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm1\n\t"
                "psrlq        $8, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %5, %%mm0\n\t"
                "punpcklwd %5, %%mm1\n\t"
                "punpcklwd %5, %%mm2\n\t"
                "punpckhwd %5, %%mm3\n\t"
                "punpckhwd %5, %%mm4\n\t"
                "punpckhwd %5, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"
                :"=m"(*d)
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
                :"memory");
            /* Borrowed 32 to 24 */
            /* stage 2: squeeze the four 32bpp-style quadwords (mm0, mm3
               plus the saved mm6, mm7) into 24 output bytes */
            __asm __volatile(
                "movq        %%mm0, %%mm4\n\t"
                "movq        %%mm3, %%mm5\n\t"
                "movq        %%mm6, %%mm0\n\t"
                "movq        %%mm7, %%mm1\n\t"
                
                "movq        %%mm4, %%mm6\n\t"
                "movq        %%mm5, %%mm7\n\t"
                "movq        %%mm0, %%mm2\n\t"
                "movq        %%mm1, %%mm3\n\t"

                "psrlq        $8, %%mm2\n\t"
                "psrlq        $8, %%mm3\n\t"
                "psrlq        $8, %%mm6\n\t"
                "psrlq        $8, %%mm7\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %2, %%mm1\n\t"
                "pand        %2, %%mm4\n\t"
                "pand        %2, %%mm5\n\t"
                "pand        %3, %%mm2\n\t"
                "pand        %3, %%mm3\n\t"
                "pand        %3, %%mm6\n\t"
                "pand        %3, %%mm7\n\t"
                "por        %%mm2, %%mm0\n\t"
                "por        %%mm3, %%mm1\n\t"
                "por        %%mm6, %%mm4\n\t"
                "por        %%mm7, %%mm5\n\t"

                "movq        %%mm1, %%mm2\n\t"
                "movq        %%mm4, %%mm3\n\t"
                "psllq        $48, %%mm2\n\t"
                "psllq        $32, %%mm3\n\t"
                "pand        %4, %%mm2\n\t"
                "pand        %5, %%mm3\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psrlq        $16, %%mm1\n\t"
                "psrlq        $32, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm3, %%mm1\n\t"
                "pand        %6, %%mm5\n\t"
                "por        %%mm5, %%mm4\n\t"

                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm1, 8%0\n\t"
                MOVNTQ"        %%mm4, 16%0"

                :"=m"(*d)
                :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
                :"memory");
                d += 24;
                s += 8;
        }
        /* flush non-temporal stores and leave MMX state */
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar tail: convert the remaining (<8) pixels */
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x7E0)>>3;
                *d++ = (bgr&0xF800)>>8;
        }
}
1172
1173
/*
 * Expand 15-bit 5-5-5 words to packed 32bpp pixels.
 *
 * src      - source buffer, 2 bytes per pixel (read as uint16_t words)
 * dst      - destination buffer, 4 bytes per pixel
 * src_size - number of source bytes
 *
 * Output bytes per pixel: (w&0x1F)<<3, (w&0x3E0)>>2, (w&0x7C00)>>7, 0 --
 * the 4th (filler/alpha) byte is always zero; in the MMX path this comes
 * from unpacking against the zeroed mm7.  Fields are widened by plain
 * left-shift (no bit replication).
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;
#ifdef HAVE_MMX
        const uint16_t *mm_end;
#endif
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (const uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
        /* mm7 = 0, used as the high half when unpacking words to dwords */
        __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
        /* 4 input words (8 bytes) are consumed per iteration */
        mm_end = end - 3;
        while(s < mm_end)
        {
            /* isolate b/g/r fields of 4 words, shift to byte positions,
               zero-extend to dwords and merge into 16 output bytes */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $2, %%mm1\n\t"
                "psrlq        $7, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %%mm7, %%mm0\n\t"
                "punpcklwd %%mm7, %%mm1\n\t"
                "punpcklwd %%mm7, %%mm2\n\t"
                "punpckhwd %%mm7, %%mm3\n\t"
                "punpckhwd %%mm7, %%mm4\n\t"
                "punpckhwd %%mm7, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm3, 8%0\n\t"
                :"=m"(*d)
                :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
                :"memory");
                d += 16;
                s += 4;
        }
        /* flush non-temporal stores and leave MMX state */
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar tail: convert the remaining (<4) pixels */
        while(s < end)
        {
#if 0 //slightly slower on athlon
                int bgr= *s++;
                *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
#else
//FIXME this is very likely wrong for bigendian (and the following converters too)
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x3E0)>>2;
                *d++ = (bgr&0x7C00)>>7;
                *d++ = 0;
#endif
        }
}
1243 996e1a7c Nick Kurshev
1244 0d9f3d85 Arpi
/*
 * Expand 16-bit 5-6-5 words to packed 32bpp pixels.
 *
 * src      - source buffer, 2 bytes per pixel (read as uint16_t words)
 * dst      - destination buffer, 4 bytes per pixel
 * src_size - number of source bytes
 *
 * Same structure as RENAME(rgb15to32), adjusted for the 6-bit middle
 * field: output bytes are (w&0x1F)<<3, (w&0x7E0)>>3, (w&0xF800)>>8, 0.
 * The 4th (filler/alpha) byte is always zero.
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint16_t *end;
#ifdef HAVE_MMX
        const uint16_t *mm_end;
#endif
        uint8_t *d = (uint8_t *)dst;
        const uint16_t *s = (uint16_t *)src;
        end = s + src_size/2;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"        %0"::"m"(*s):"memory");
        /* mm7 = 0, used as the high half when unpacking words to dwords */
        __asm __volatile("pxor        %%mm7,%%mm7\n\t":::"memory");
        /* 4 input words (8 bytes) are consumed per iteration */
        mm_end = end - 3;
        while(s < mm_end)
        {
            /* isolate b/g/r fields of 4 words, shift to byte positions,
               zero-extend to dwords and merge into 16 output bytes */
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movq        %1, %%mm0\n\t"
                "movq        %1, %%mm1\n\t"
                "movq        %1, %%mm2\n\t"
                "pand        %2, %%mm0\n\t"
                "pand        %3, %%mm1\n\t"
                "pand        %4, %%mm2\n\t"
                "psllq        $3, %%mm0\n\t"
                "psrlq        $3, %%mm1\n\t"
                "psrlq        $8, %%mm2\n\t"
                "movq        %%mm0, %%mm3\n\t"
                "movq        %%mm1, %%mm4\n\t"
                "movq        %%mm2, %%mm5\n\t"
                "punpcklwd %%mm7, %%mm0\n\t"
                "punpcklwd %%mm7, %%mm1\n\t"
                "punpcklwd %%mm7, %%mm2\n\t"
                "punpckhwd %%mm7, %%mm3\n\t"
                "punpckhwd %%mm7, %%mm4\n\t"
                "punpckhwd %%mm7, %%mm5\n\t"
                "psllq        $8, %%mm1\n\t"
                "psllq        $16, %%mm2\n\t"
                "por        %%mm1, %%mm0\n\t"
                "por        %%mm2, %%mm0\n\t"
                "psllq        $8, %%mm4\n\t"
                "psllq        $16, %%mm5\n\t"
                "por        %%mm4, %%mm3\n\t"
                "por        %%mm5, %%mm3\n\t"
                MOVNTQ"        %%mm0, %0\n\t"
                MOVNTQ"        %%mm3, 8%0\n\t"
                :"=m"(*d)
                :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
                :"memory");
                d += 16;
                s += 4;
        }
        /* flush non-temporal stores and leave MMX state */
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* scalar tail: convert the remaining (<4) pixels */
        while(s < end)
        {
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x7E0)>>3;
                *d++ = (bgr&0xF800)>>8;
                *d++ = 0;
        }
}
1308 fcfbc150 Michael Niedermayer
1309 1de97d84 Michael Niedermayer
/*
 * Swap the 1st and 3rd byte of each 32bpp pixel (RGBx <-> BGRx).
 *
 * src      - source buffer, 4 bytes per pixel
 * dst      - destination buffer, 4 bytes per pixel
 * src_size - number of source bytes
 *
 * NOTE(review): the C fallback writes only 3 of the 4 destination bytes
 * per pixel (the filler/alpha byte of dst is left untouched); whether the
 * MMX path preserves that byte depends on the contents of mask32g, which
 * are not visible here -- confirm against the mask definitions.
 * NOTE(review): the MMX path processes 2 pixels (8 bytes) per iteration
 * with no scalar tail; trailing bytes of a src_size that is not a
 * multiple of 8 appear to be skipped -- confirm callers' size guarantees.
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
#ifdef HAVE_MMX
/* TODO: unroll this loop */
        /* per iteration: pslld moves byte 0 to byte 2's lane, psrld moves
           byte 2 to byte 0's lane, the masks cut out each contribution */
        asm volatile (
                "xorl %%eax, %%eax                \n\t"
                ".balign 16                        \n\t"
                "1:                                \n\t"
                PREFETCH" 32(%0, %%eax)                \n\t"
                "movq (%0, %%eax), %%mm0        \n\t"
                "movq %%mm0, %%mm1                \n\t"
                "movq %%mm0, %%mm2                \n\t"
                "pslld $16, %%mm0                \n\t"
                "psrld $16, %%mm1                \n\t"
                "pand "MANGLE(mask32r)", %%mm0        \n\t"
                "pand "MANGLE(mask32g)", %%mm2        \n\t"
                "pand "MANGLE(mask32b)", %%mm1        \n\t"
                "por %%mm0, %%mm2                \n\t"
                "por %%mm1, %%mm2                \n\t"
                MOVNTQ" %%mm2, (%1, %%eax)        \n\t"
                "addl $8, %%eax                        \n\t"
                "cmpl %2, %%eax                        \n\t"
                " jb 1b                                \n\t"
                :: "r" (src), "r"(dst), "r" (src_size-7)
                : "%eax"
        );

        /* flush non-temporal stores and leave MMX state */
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#else
        unsigned i;
        unsigned num_pixels = src_size >> 2;
        for(i=0; i<num_pixels; i++)
        {
#ifdef WORDS_BIGENDIAN  
          /* byte 0 (the filler byte on big-endian) is not written */
          dst[4*i + 1] = src[4*i + 3];
          dst[4*i + 2] = src[4*i + 2];
          dst[4*i + 3] = src[4*i + 1];
#else
          /* byte 3 (the filler byte on little-endian) is not written */
          dst[4*i + 0] = src[4*i + 2];
          dst[4*i + 1] = src[4*i + 1];
          dst[4*i + 2] = src[4*i + 0];
#endif
        }
#endif
}
1355
1356 74d35835 Michael Niedermayer
/*
 * Swap the 1st and 3rd byte of each 24bpp pixel (RGB24 <-> BGR24).
 *
 * src      - source buffer, 3 bytes per pixel
 * dst      - destination buffer, 3 bytes per pixel
 * src_size - number of source bytes
 *
 * The MMX loop converts 8 pixels (24 bytes) per iteration using a
 * negative index that counts up towards zero: eax starts at
 * 23 - src_size, the pointers are pre-biased by -mmx_size, and the loop
 * runs while eax is negative ("js").  The final eax value (returned in
 * mmx_size via the "+a" constraint) tells how many trailing bytes still
 * need the scalar swap loop.
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
        unsigned i;
#ifdef HAVE_MMX
        int mmx_size= 23 - src_size;
        asm volatile (
                "movq "MANGLE(mask24r)", %%mm5        \n\t"
                "movq "MANGLE(mask24g)", %%mm6        \n\t"
                "movq "MANGLE(mask24b)", %%mm7        \n\t"
                ".balign 16                        \n\t"
                "1:                                \n\t"
                PREFETCH" 32(%1, %%eax)                \n\t"
                "movq   (%1, %%eax), %%mm0        \n\t" // BGR BGR BG
                "movq   (%1, %%eax), %%mm1        \n\t" // BGR BGR BG
                "movq  2(%1, %%eax), %%mm2        \n\t" // R BGR BGR B
                "psllq $16, %%mm0                \n\t" // 00 BGR BGR
                "pand %%mm5, %%mm0                \n\t"
                "pand %%mm6, %%mm1                \n\t"
                "pand %%mm7, %%mm2                \n\t"
                "por %%mm0, %%mm1                \n\t"
                "por %%mm2, %%mm1                \n\t"                
                "movq  6(%1, %%eax), %%mm0        \n\t" // BGR BGR BG
                MOVNTQ" %%mm1,   (%2, %%eax)        \n\t" // RGB RGB RG
                "movq  8(%1, %%eax), %%mm1        \n\t" // R BGR BGR B
                "movq 10(%1, %%eax), %%mm2        \n\t" // GR BGR BGR
                "pand %%mm7, %%mm0                \n\t"
                "pand %%mm5, %%mm1                \n\t"
                "pand %%mm6, %%mm2                \n\t"
                "por %%mm0, %%mm1                \n\t"
                "por %%mm2, %%mm1                \n\t"                
                "movq 14(%1, %%eax), %%mm0        \n\t" // R BGR BGR B
                MOVNTQ" %%mm1,  8(%2, %%eax)        \n\t" // B RGB RGB R
                "movq 16(%1, %%eax), %%mm1        \n\t" // GR BGR BGR
                "movq 18(%1, %%eax), %%mm2        \n\t" // BGR BGR BG
                "pand %%mm6, %%mm0                \n\t"
                "pand %%mm7, %%mm1                \n\t"
                "pand %%mm5, %%mm2                \n\t"
                "por %%mm0, %%mm1                \n\t"
                "por %%mm2, %%mm1                \n\t"                
                MOVNTQ" %%mm1, 16(%2, %%eax)        \n\t"
                "addl $24, %%eax                \n\t"
                " js 1b                                \n\t"
                : "+a" (mmx_size)
                : "r" (src-mmx_size), "r"(dst-mmx_size)
        );

        /* flush non-temporal stores and leave MMX state */
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");

        if(mmx_size==23) return; //finished: src_size was a multiple of 24 (8 pixels/iteration)

        /* rewind the pointers to the first unconverted pixel */
        src+= src_size;
        dst+= src_size;
        src_size= 23-mmx_size;
        src-= src_size;
        dst-= src_size;
#endif
        /* scalar swap of bytes 0 and 2 in each remaining 3-byte pixel */
        for(i=0; i<src_size; i+=3)
        {
                register uint8_t x;
                x          = src[i + 2];
                dst[i + 1] = src[i + 1];
                dst[i + 2] = src[i + 0];
                dst[i + 0] = x;
        }
}
1422
1423 b1ec5875 Michael Niedermayer
/**
 * Interleave planar Y, U and V source planes into packed YUY2
 * (byte order Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...) at dst.
 *
 * @param vertLumPerChroma number of luma lines that share one chroma line
 *        (callers pass 2 for 4:2:0 input and 1 for 4:2:2 input); must be a
 *        power of two, since it is applied as a bit mask on the line counter.
 *
 * width must be even (chromWidth = width/2); the MMX path converts 16 luma
 * pixels per iteration, so there width should be a multiple of 16.
 * NOTE(review): the scalar fallbacks assemble pixels with byte shifts and so
 * produce the YUYV byte order only on little-endian targets — presumably fine
 * for the platforms this template targets; verify before big-endian use.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
        unsigned y;
        const unsigned chromWidth= width>>1;
        for(y=0; y<height; y++)
        {
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
                asm volatile(
                        "xorl %%eax, %%eax                \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 32(%1, %%eax, 2)        \n\t"
                        PREFETCH" 32(%2, %%eax)                \n\t"
                        PREFETCH" 32(%3, %%eax)                \n\t"
                        "movq (%2, %%eax), %%mm0        \n\t" // U(0)
                        "movq %%mm0, %%mm2                \n\t" // U(0)
                        "movq (%3, %%eax), %%mm1        \n\t" // V(0)
                        "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)

                        "movq (%1, %%eax,2), %%mm3        \n\t" // Y(0)
                        "movq 8(%1, %%eax,2), %%mm5        \n\t" // Y(8)
                        "movq %%mm3, %%mm4                \n\t" // Y(0)
                        "movq %%mm5, %%mm6                \n\t" // Y(8)
                        "punpcklbw %%mm0, %%mm3                \n\t" // YUYV YUYV(0)
                        "punpckhbw %%mm0, %%mm4                \n\t" // YUYV YUYV(4)
                        "punpcklbw %%mm2, %%mm5                \n\t" // YUYV YUYV(8)
                        "punpckhbw %%mm2, %%mm6                \n\t" // YUYV YUYV(12)

                        MOVNTQ" %%mm3, (%0, %%eax, 4)        \n\t"
                        MOVNTQ" %%mm4, 8(%0, %%eax, 4)        \n\t"
                        MOVNTQ" %%mm5, 16(%0, %%eax, 4)        \n\t"
                        MOVNTQ" %%mm6, 24(%0, %%eax, 4)        \n\t"

                        "addl $8, %%eax                        \n\t"
                        "cmpl %4, %%eax                        \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
                        : "%eax"
                );
#else

#if defined ARCH_ALPHA && defined HAVE_MVI
/* Pack one group of 4 pixels for two lines at once using Alpha MVI unpack. */
#define pl2yuy2(n)                                        \
        y1 = yc[n];                                        \
        y2 = yc2[n];                                        \
        u = uc[n];                                        \
        v = vc[n];                                        \
        asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));        \
        asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));        \
        asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
        asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
        yuv1 = (u << 8) + (v << 24);                        \
        yuv2 = yuv1 + y2;                                \
        yuv1 += y1;                                        \
        qdst[n] = yuv1;                                        \
        qdst2[n] = yuv2;

                /* This path converts TWO lines per loop iteration, hence the
                 * extra y++/pointer bumps at its end.
                 * NOTE(review): that looks correct only for vertLumPerChroma==2
                 * (4:2:0 input) — confirm against the 4:2:2 caller. */
                int i;
                uint64_t *qdst = (uint64_t *) dst;
                uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
                const uint32_t *yc = (uint32_t *) ysrc;
                const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
                const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
                for(i = 0; i < chromWidth; i += 8){
                        uint64_t y1, y2, yuv1, yuv2;
                        uint64_t u, v;
                        /* Prefetch */
                        asm("ldq $31,64(%0)" :: "r"(yc));
                        asm("ldq $31,64(%0)" :: "r"(yc2));
                        asm("ldq $31,64(%0)" :: "r"(uc));
                        asm("ldq $31,64(%0)" :: "r"(vc));

                        pl2yuy2(0);
                        pl2yuy2(1);
                        pl2yuy2(2);
                        pl2yuy2(3);

                        yc += 4;
                        yc2 += 4;
                        uc += 4;
                        vc += 4;
                        qdst += 4;
                        qdst2 += 4;
                }
                y++;
                ysrc += lumStride;
                dst += dstStride;

#elif __WORDSIZE >= 64
                /* 64-bit scalar path: build two packed pixels per store. */
                int i;
                uint64_t *ldst = (uint64_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i += 2){
                        uint64_t k, l;
                        k = yc[0] + (uc[0] << 8) +
                            (yc[1] << 16) + (vc[0] << 24);
                        l = yc[2] + (uc[1] << 8) +
                            (yc[3] << 16) + (vc[1] << 24);
                        *ldst++ = k + (l << 32);
                        yc += 4;
                        uc += 2;
                        vc += 2;
                }

#else
                /* 32-bit scalar path: one packed YUYV pixel pair per store. */
                int i, *idst = (int32_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i++){
                        *idst++ = yc[0] + (uc[0] << 8) +
                            (yc[1] << 16) + (vc[0] << 24);
                        yc += 2;
                        uc++;
                        vc++;
                }
#endif
#endif
                /* Advance chroma pointers once every vertLumPerChroma luma
                 * lines (power-of-two mask instead of a modulo). */
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
                {
                        usrc += chromStride;
                        vsrc += chromStride;
                }
                ysrc += lumStride;
                dst += dstStride;
        }
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
1557
1558 dabcdbc4 Michael Niedermayer
/**
 * Convert YV12 (planar 4:2:0) to packed YUY2.
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and ill fix it)
 *
 * Each chroma line is reused unchanged for two luma lines (point sampling,
 * no interpolation — see FIXME below).
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int dstStride)
{
        //FIXME interpolate chroma
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1570
1571
/**
 * Interleave planar Y, U and V source planes into packed UYVY
 * (byte order U0 Y0 V0 Y1 U1 Y2 V1 Y3 ...) at dst.
 *
 * @param vertLumPerChroma number of luma lines that share one chroma line
 *        (2 for 4:2:0 input); must be a power of two (used as a bit mask).
 *
 * width must be even; the MMX path converts 16 luma pixels per iteration.
 * NOTE(review): the scalar fallbacks assemble pixels with byte shifts and so
 * produce the UYVY byte order only on little-endian targets — verify before
 * big-endian use.
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
        unsigned y;
        const unsigned chromWidth= width>>1;
        for(y=0; y<height; y++)
        {
#ifdef HAVE_MMX
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
                asm volatile(
                        "xorl %%eax, %%eax                \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 32(%1, %%eax, 2)        \n\t"
                        PREFETCH" 32(%2, %%eax)                \n\t"
                        PREFETCH" 32(%3, %%eax)                \n\t"
                        "movq (%2, %%eax), %%mm0        \n\t" // U(0)
                        "movq %%mm0, %%mm2                \n\t" // U(0)
                        "movq (%3, %%eax), %%mm1        \n\t" // V(0)
                        "punpcklbw %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "punpckhbw %%mm1, %%mm2                \n\t" // UVUV UVUV(8)

                        "movq (%1, %%eax,2), %%mm3        \n\t" // Y(0)
                        "movq 8(%1, %%eax,2), %%mm5        \n\t" // Y(8)
                        "movq %%mm0, %%mm4                \n\t" // UVUV UVUV(0)
                        "movq %%mm2, %%mm6                \n\t" // UVUV UVUV(8)
                        "punpcklbw %%mm3, %%mm0                \n\t" // UYVY UYVY(0)
                        "punpckhbw %%mm3, %%mm4                \n\t" // UYVY UYVY(4)
                        "punpcklbw %%mm5, %%mm2                \n\t" // UYVY UYVY(8)
                        "punpckhbw %%mm5, %%mm6                \n\t" // UYVY UYVY(12)

                        MOVNTQ" %%mm0, (%0, %%eax, 4)        \n\t"
                        MOVNTQ" %%mm4, 8(%0, %%eax, 4)        \n\t"
                        MOVNTQ" %%mm2, 16(%0, %%eax, 4)        \n\t"
                        MOVNTQ" %%mm6, 24(%0, %%eax, 4)        \n\t"

                        "addl $8, %%eax                        \n\t"
                        "cmpl %4, %%eax                        \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
                        : "%eax"
                );
#else
//FIXME adapt the alpha asm code from yv12->yuy2

#if __WORDSIZE >= 64
                /* 64-bit scalar path: build two packed pixels per store. */
                int i;
                uint64_t *ldst = (uint64_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i += 2){
                        uint64_t k, l;
                        k = uc[0] + (yc[0] << 8) +
                            (vc[0] << 16) + (yc[1] << 24);
                        l = uc[1] + (yc[2] << 8) +
                            (vc[1] << 16) + (yc[3] << 24);
                        *ldst++ = k + (l << 32);
                        yc += 4;
                        uc += 2;
                        vc += 2;
                }

#else
                /* 32-bit scalar path: one packed UYVY pixel pair per store. */
                int i, *idst = (int32_t *) dst;
                const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
                for(i = 0; i < chromWidth; i++){
                        *idst++ = uc[0] + (yc[0] << 8) +
                            (vc[0] << 16) + (yc[1] << 24);
                        yc += 2;
                        uc++;
                        vc++;
                }
#endif
#endif
                /* Advance chroma pointers once every vertLumPerChroma luma
                 * lines (power-of-two mask instead of a modulo). */
                if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
                {
                        usrc += chromStride;
                        vsrc += chromStride;
                }
                ysrc += lumStride;
                dst += dstStride;
        }
#ifdef HAVE_MMX
asm(    EMMS" \n\t"
        SFENCE" \n\t"
        :::"memory");
#endif
}
1659
1660
/**
 * Convert YV12 (planar 4:2:0) to packed UYVY.
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and ill fix it)
 *
 * Each chroma line is reused unchanged for two luma lines (point sampling,
 * no interpolation — see FIXME below).
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int dstStride)
{
        //FIXME interpolate chroma
        RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1672
1673
/**
 * Convert YUV422P (planar 4:2:2) to packed YUY2 — one chroma line per luma
 * line, so no vertical subsampling is involved.
 *
 * width should be a multiple of 16
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int dstStride)
{
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
1683
1684
/**
 * Split packed YUY2 (Y0 U0 Y1 V0 ...) into planar YV12 (separate Y, U, V).
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and ill fix it)
 *
 * Chroma is taken only from the even source lines; chroma of odd lines is
 * dropped (4:2:2 -> 4:2:0 by decimation, no averaging).
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int srcStride)
{
        unsigned y;
        const unsigned chromWidth= width>>1;
        for(y=0; y<height; y+=2)
        {
#ifdef HAVE_MMX
                /* Even line: extract Y plus the U and V planes. */
                asm volatile(
                        "xorl %%eax, %%eax                \n\t"
                        "pcmpeqw %%mm7, %%mm7                \n\t"
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
                        "movq %%mm0, %%mm2                \n\t" // YUYV YUYV(0)
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(4)
                        "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)

                        MOVNTQ" %%mm2, (%1, %%eax, 2)        \n\t"

                        "movq 16(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(8)
                        "movq 24(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(12)
                        "movq %%mm1, %%mm3                \n\t" // YUYV YUYV(8)
                        "movq %%mm2, %%mm4                \n\t" // YUYV YUYV(12)
                        "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
                        "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
                        "pand %%mm7, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm3, 8(%1, %%eax, 2)        \n\t"

                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)

                        MOVNTQ" %%mm0, (%3, %%eax)        \n\t"
                        MOVNTQ" %%mm2, (%2, %%eax)        \n\t"

                        "addl $8, %%eax                        \n\t"
                        "cmpl %4, %%eax                        \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
                        : "memory", "%eax"
                );

                ydst += lumStride;
                src  += srcStride;

                /* Odd line: extract Y only (its chroma is dropped).
                 * NOTE(review): this block reuses the 0x00FF mask left in
                 * %%mm7 by the asm block above without re-creating it; the
                 * compiler is never told %%mm7 is live across the two asm
                 * statements — fragile, confirm before reordering anything. */
                asm volatile(
                        "xorl %%eax, %%eax                \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // YUYV YUYV(0)
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // YUYV YUYV(4)
                        "movq 16(%0, %%eax, 4), %%mm2        \n\t" // YUYV YUYV(8)
                        "movq 24(%0, %%eax, 4), %%mm3        \n\t" // YUYV YUYV(12)
                        "pand %%mm7, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
                        "pand %%mm7, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
                        "pand %%mm7, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
                        "pand %%mm7, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm0, (%1, %%eax, 2)        \n\t"
                        MOVNTQ" %%mm2, 8(%1, %%eax, 2)        \n\t"

                        "addl $8, %%eax                        \n\t"
                        "cmpl %4, %%eax                        \n\t"
                        " jb 1b                                \n\t"

                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
                        : "memory", "%eax"
                );
#else
                unsigned i;
                /* Even line: de-interleave Y/U/V. */
                for(i=0; i<chromWidth; i++)
                {
                        ydst[2*i+0]         = src[4*i+0];
                        udst[i]         = src[4*i+1];
                        ydst[2*i+1]         = src[4*i+2];
                        vdst[i]         = src[4*i+3];
                }
                ydst += lumStride;
                src  += srcStride;

                /* Odd line: keep Y only. */
                for(i=0; i<chromWidth; i++)
                {
                        ydst[2*i+0]         = src[4*i+0];
                        ydst[2*i+1]         = src[4*i+2];
                }
#endif
                udst += chromStride;
                vdst += chromStride;
                ydst += lumStride;
                src  += srcStride;
        }
#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}
1808 81c0590e Arpi
1809 d661d18d Alex Beregszaszi
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1810
        uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1811 f0b62bbd Michael Niedermayer
        unsigned int width, unsigned int height, int lumStride, int chromStride)
1812 d661d18d Alex Beregszaszi
{
1813
        /* Y Plane */
1814
        memcpy(ydst, ysrc, width*height);
1815
1816
        /* XXX: implement upscaling for U,V */
1817
}
1818
1819 b241cbf2 Michael Niedermayer
/**
 * Upscale one 8-bit plane to 2x width and 2x height using 1/4:3/4 bilinear
 * weighting; the first/last output row and column duplicate the edge.
 *
 * NOTE(review): the MMX2/3DNow path reads src[-1] (the "movq -1(...)" loads)
 * and the scalar tail then starts at x = mmxSize-1; with srcWidth < 16,
 * mmxSize becomes 0 and both would go out of bounds — this code assumes
 * srcWidth >= 16, confirm callers.
 * NOTE(review): the chained PAVGB ops approximate (3*a+b)/4 with per-step
 * rounding, so the SIMD result may differ from the C fallback by rounding —
 * presumably acceptable for an upscaler.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
        int x,y;
        
        dst[0]= src[0];
        
        // first line: horizontal interpolation only, edges duplicated
        for(x=0; x<srcWidth-1; x++){
                dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
                dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
        }
        dst[2*srcWidth-1]= src[srcWidth-1];
        
        dst+= dstStride;

        for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                const int mmxSize= srcWidth&~15;
                asm volatile(
                        "movl %4, %%eax                        \n\t"
                        "1:                                \n\t"
                        "movq (%0, %%eax), %%mm0        \n\t"
                        "movq (%1, %%eax), %%mm1        \n\t"
                        "movq 1(%0, %%eax), %%mm2        \n\t"
                        "movq 1(%1, %%eax), %%mm3        \n\t"
                        "movq -1(%0, %%eax), %%mm4        \n\t"
                        "movq -1(%1, %%eax), %%mm5        \n\t"
                        PAVGB" %%mm0, %%mm5                \n\t"
                        PAVGB" %%mm0, %%mm3                \n\t"
                        PAVGB" %%mm0, %%mm5                \n\t"
                        PAVGB" %%mm0, %%mm3                \n\t"
                        PAVGB" %%mm1, %%mm4                \n\t"
                        PAVGB" %%mm1, %%mm2                \n\t"
                        PAVGB" %%mm1, %%mm4                \n\t"
                        PAVGB" %%mm1, %%mm2                \n\t"
                        "movq %%mm5, %%mm7                \n\t"
                        "movq %%mm4, %%mm6                \n\t"
                        "punpcklbw %%mm3, %%mm5                \n\t"
                        "punpckhbw %%mm3, %%mm7                \n\t"
                        "punpcklbw %%mm2, %%mm4                \n\t"
                        "punpckhbw %%mm2, %%mm6                \n\t"
#if 1
                        MOVNTQ" %%mm5, (%2, %%eax, 2)        \n\t"
                        MOVNTQ" %%mm7, 8(%2, %%eax, 2)        \n\t"
                        MOVNTQ" %%mm4, (%3, %%eax, 2)        \n\t"
                        MOVNTQ" %%mm6, 8(%3, %%eax, 2)        \n\t"
#else
                        "movq %%mm5, (%2, %%eax, 2)        \n\t"
                        "movq %%mm7, 8(%2, %%eax, 2)        \n\t"
                        "movq %%mm4, (%3, %%eax, 2)        \n\t"
                        "movq %%mm6, 8(%3, %%eax, 2)        \n\t"
#endif
                        "addl $8, %%eax                        \n\t"
                        " js 1b                                \n\t"
                        :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
                           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
                           "g" (-mmxSize)
                        : "%eax"

                );
#else
                const int mmxSize=1;
#endif
                /* left edge of the two output rows for this source row pair */
                dst[0        ]= (3*src[0] +   src[srcStride])>>2;
                dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;

                /* scalar tail (whole row in the non-SIMD build) */
                for(x=mmxSize-1; x<srcWidth-1; x++){
                        dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
                        dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
                        dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
                        dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
                }
                dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
                dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

                dst+=dstStride*2;
                src+=srcStride;
        }
        
        // last line: horizontal interpolation only, edges duplicated
#if 1
        dst[0]= src[0];
        
        for(x=0; x<srcWidth-1; x++){
                dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
                dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
        }
        dst[2*srcWidth-1]= src[srcWidth-1];
#else
        /* disabled alternative: plain pixel doubling for the last line */
        for(x=0; x<srcWidth; x++){
                dst[2*x+0]=
                dst[2*x+1]= src[x];
        }
#endif

#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}
1920
1921 81c0590e Arpi
/**
 * Split packed UYVY (U0 Y0 V0 Y1 ...) into planar YV12 (separate Y, U, V).
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and ill fix it)
 *
 * Chroma is taken only from the even source lines; chroma of odd lines is
 * dropped (4:2:2 -> 4:2:0 by decimation) FIXME write HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int srcStride)
{
        unsigned y;
        const unsigned chromWidth= width>>1;
        for(y=0; y<height; y+=2)
        {
#ifdef HAVE_MMX
                /* Even line: extract Y plus the U and V planes. */
                asm volatile(
                        "xorl %%eax, %%eax                \n\t"
                        "pcmpeqw %%mm7, %%mm7                \n\t"
                        "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // UYVY UYVY(0)
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(4)
                        "movq %%mm0, %%mm2                \n\t" // UYVY UYVY(0)
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(4)
                        "pand %%mm7, %%mm0                \n\t" // U0V0 U0V0(0)
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(4)
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
                        "packuswb %%mm1, %%mm0                \n\t" // UVUV UVUV(0)
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(0)

                        MOVNTQ" %%mm2, (%1, %%eax, 2)        \n\t"

                        "movq 16(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(8)
                        "movq 24(%0, %%eax, 4), %%mm2        \n\t" // UYVY UYVY(12)
                        "movq %%mm1, %%mm3                \n\t" // UYVY UYVY(8)
                        "movq %%mm2, %%mm4                \n\t" // UYVY UYVY(12)
                        "pand %%mm7, %%mm1                \n\t" // U0V0 U0V0(8)
                        "pand %%mm7, %%mm2                \n\t" // U0V0 U0V0(12)
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
                        "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm2, %%mm1                \n\t" // UVUV UVUV(8)
                        "packuswb %%mm4, %%mm3                \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm3, 8(%1, %%eax, 2)        \n\t"

                        "movq %%mm0, %%mm2                \n\t" // UVUV UVUV(0)
                        "movq %%mm1, %%mm3                \n\t" // UVUV UVUV(8)
                        "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
                        "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
                        "pand %%mm7, %%mm2                \n\t" // U0U0 U0U0(0)
                        "pand %%mm7, %%mm3                \n\t" // U0U0 U0U0(8)
                        "packuswb %%mm1, %%mm0                \n\t" // VVVV VVVV(0)
                        "packuswb %%mm3, %%mm2                \n\t" // UUUU UUUU(0)

                        MOVNTQ" %%mm0, (%3, %%eax)        \n\t"
                        MOVNTQ" %%mm2, (%2, %%eax)        \n\t"

                        "addl $8, %%eax                        \n\t"
                        "cmpl %4, %%eax                        \n\t"
                        " jb 1b                                \n\t"
                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
                        : "memory", "%eax"
                );

                ydst += lumStride;
                src  += srcStride;

                /* Odd line: extract Y only (high byte of each word in UYVY);
                 * its chroma is dropped. */
                asm volatile(
                        "xorl %%eax, %%eax                \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%eax, 4)        \n\t"
                        "movq (%0, %%eax, 4), %%mm0        \n\t" // UYVY UYVY(0)
                        "movq 8(%0, %%eax, 4), %%mm1        \n\t" // UYVY UYVY(4)
                        "movq 16(%0, %%eax, 4), %%mm2        \n\t" // UYVY UYVY(8)
                        "movq 24(%0, %%eax, 4), %%mm3        \n\t" // UYVY UYVY(12)
                        "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
                        "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
                        "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
                        "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
                        "packuswb %%mm1, %%mm0                \n\t" // YYYY YYYY(0)
                        "packuswb %%mm3, %%mm2                \n\t" // YYYY YYYY(8)

                        MOVNTQ" %%mm0, (%1, %%eax, 2)        \n\t"
                        MOVNTQ" %%mm2, 8(%1, %%eax, 2)        \n\t"

                        "addl $8, %%eax                        \n\t"
                        "cmpl %4, %%eax                        \n\t"
                        " jb 1b                                \n\t"

                        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
                        : "memory", "%eax"
                );
#else
                unsigned i;
                /* Even line: de-interleave U/Y/V. */
                for(i=0; i<chromWidth; i++)
                {
                        udst[i]         = src[4*i+0];
                        ydst[2*i+0]         = src[4*i+1];
                        vdst[i]         = src[4*i+2];
                        ydst[2*i+1]         = src[4*i+3];
                }
                ydst += lumStride;
                src  += srcStride;

                /* Odd line: keep Y only. */
                for(i=0; i<chromWidth; i++)
                {
                        ydst[2*i+0]         = src[4*i+1];
                        ydst[2*i+1]         = src[4*i+3];
                }
#endif
                udst += chromStride;
                vdst += chromStride;
                ydst += lumStride;
                src  += srcStride;
        }
#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}
2046
2047 1de97d84 Michael Niedermayer
/**
 * Converts packed BGR24 (byte order B,G,R) to planar YV12: one full-resolution
 * luma plane (ydst) and two half-resolution chroma planes (udst, vdst).
 *
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
 * problem for anyone then tell me, and I'll fix it)
 * chrominance data is only taken from every second line, others are ignored in the C version FIXME write HQ version
 *
 * lumStride/chromStride/srcStride are in bytes. On HAVE_MMX builds all but the
 * last two rows go through the inline-assembly fast path; the scalar C loop
 * below handles the remainder (and everything on non-MMX builds).
 * NOTE(review): assumes height >= 2 (MMX loop bound is height-2 on an
 * unsigned) — matches the "multiple of 2" contract above, but confirm callers.
 */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
        unsigned int width, unsigned int height,
        int lumStride, int chromStride, int srcStride)
{
        unsigned y;
        const unsigned chromWidth= width>>1;  /* one U/V sample per 2 luma pixels */
#ifdef HAVE_MMX
        /* Process two source rows per iteration: two luma rows, then one
           chroma row averaged from both source rows (2x2 subsampling). */
        for(y=0; y<height-2; y+=2)
        {
                unsigned i;
                /* --- luma: one asm pass per source row --- */
                for(i=0; i<2; i++)
                {
                        /* %%eax counts a negative pixel offset up to 0 ("js 1b"
                           loops while negative); %%ebx = 3*eax is the matching
                           byte offset into the 3-byte-per-pixel BGR row.
                           Operands: %0 = end of src row, %1 = end of ydst row,
                           %2 = -width. 8 Y values are produced per iteration
                           (addl $8 eax / addl $24 ebx).
                           bgr2YCoeff/w1111/bgr2YOffset are coefficient tables
                           defined elsewhere in this file — presumably BT.601
                           luma weights; not visible here, so not asserted. */
                        asm volatile(
                                "movl %2, %%eax                        \n\t"
                                "movq "MANGLE(bgr2YCoeff)", %%mm6                \n\t"
                                "movq "MANGLE(w1111)", %%mm5                \n\t"
                                "pxor %%mm7, %%mm7                \n\t"  /* mm7 = 0, for byte->word unpack */
                                "leal (%%eax, %%eax, 2), %%ebx        \n\t"  /* ebx = 3*eax (byte offset) */
                                ".balign 16                        \n\t"
                                "1:                                \n\t"
                                PREFETCH" 64(%0, %%ebx)                \n\t"
                                /* pixels 0..3: unpack B,G,R bytes to words, dot with luma coeffs */
                                "movd (%0, %%ebx), %%mm0        \n\t"
                                "movd 3(%0, %%ebx), %%mm1        \n\t"
                                "punpcklbw %%mm7, %%mm0                \n\t"
                                "punpcklbw %%mm7, %%mm1                \n\t"
                                "movd 6(%0, %%ebx), %%mm2        \n\t"
                                "movd 9(%0, %%ebx), %%mm3        \n\t"
                                "punpcklbw %%mm7, %%mm2                \n\t"
                                "punpcklbw %%mm7, %%mm3                \n\t"
                                "pmaddwd %%mm6, %%mm0                \n\t"
                                "pmaddwd %%mm6, %%mm1                \n\t"
                                "pmaddwd %%mm6, %%mm2                \n\t"
                                "pmaddwd %%mm6, %%mm3                \n\t"
#ifndef FAST_BGR2YV12
                                /* extra >>8 keeps full precision; FAST mode skips it */
                                "psrad $8, %%mm0                \n\t"
                                "psrad $8, %%mm1                \n\t"
                                "psrad $8, %%mm2                \n\t"
                                "psrad $8, %%mm3                \n\t"
#endif
                                /* horizontal add of the per-channel products via pmaddwd w1111 */
                                "packssdw %%mm1, %%mm0                \n\t"
                                "packssdw %%mm3, %%mm2                \n\t"
                                "pmaddwd %%mm5, %%mm0                \n\t"
                                "pmaddwd %%mm5, %%mm2                \n\t"
                                "packssdw %%mm2, %%mm0                \n\t"
                                "psraw $7, %%mm0                \n\t"

                                /* pixels 4..7: same computation into mm4 */
                                "movd 12(%0, %%ebx), %%mm4        \n\t"
                                "movd 15(%0, %%ebx), %%mm1        \n\t"
                                "punpcklbw %%mm7, %%mm4                \n\t"
                                "punpcklbw %%mm7, %%mm1                \n\t"
                                "movd 18(%0, %%ebx), %%mm2        \n\t"
                                "movd 21(%0, %%ebx), %%mm3        \n\t"
                                "punpcklbw %%mm7, %%mm2                \n\t"
                                "punpcklbw %%mm7, %%mm3                \n\t"
                                "pmaddwd %%mm6, %%mm4                \n\t"
                                "pmaddwd %%mm6, %%mm1                \n\t"
                                "pmaddwd %%mm6, %%mm2                \n\t"
                                "pmaddwd %%mm6, %%mm3                \n\t"
#ifndef FAST_BGR2YV12
                                "psrad $8, %%mm4                \n\t"
                                "psrad $8, %%mm1                \n\t"
                                "psrad $8, %%mm2                \n\t"
                                "psrad $8, %%mm3                \n\t"
#endif
                                "packssdw %%mm1, %%mm4                \n\t"
                                "packssdw %%mm3, %%mm2                \n\t"
                                "pmaddwd %%mm5, %%mm4                \n\t"
                                "pmaddwd %%mm5, %%mm2                \n\t"
                                "addl $24, %%ebx                \n\t"  /* advance 8 pixels * 3 bytes */
                                "packssdw %%mm2, %%mm4                \n\t"
                                "psraw $7, %%mm4                \n\t"

                                /* pack 8 Y values to bytes, add the luma offset table, store */
                                "packuswb %%mm4, %%mm0                \n\t"
                                "paddusb "MANGLE(bgr2YOffset)", %%mm0        \n\t"

                                MOVNTQ" %%mm0, (%1, %%eax)        \n\t"
                                "addl $8, %%eax                        \n\t"
                                " js 1b                                \n\t"
                                : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
                                : "%eax", "%ebx"
                        );
                        ydst += lumStride;
                        src  += srcStride;
                }
                /* rewind src: the chroma pass below reads both rows again */
                src -= srcStride*2;
                /* --- chroma: average a 2x2 pixel block from rows %0 and %1,
                   then dot with U (mm6) and V (bgr2VCoeff) coefficient tables.
                   %%eax = negative chroma-sample offset (4 samples/iteration),
                   %%ebx = 6*eax, since one chroma sample spans 2 BGR pixels
                   = 6 source bytes. --- */
                asm volatile(
                        "movl %4, %%eax                        \n\t"
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
                        "movq "MANGLE(bgr2UCoeff)", %%mm6                \n\t"
                        "pxor %%mm7, %%mm7                \n\t"
                        "leal (%%eax, %%eax, 2), %%ebx        \n\t"
                        "addl %%ebx, %%ebx                \n\t"  /* ebx = 6*eax */
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        PREFETCH" 64(%0, %%ebx)                \n\t"
                        PREFETCH" 64(%1, %%ebx)                \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                        /* fast variant: byte-average vertically with PAVGB,
                           then average horizontally-shifted copies */
                        "movq (%0, %%ebx), %%mm0        \n\t"
                        "movq (%1, %%ebx), %%mm1        \n\t"
                        "movq 6(%0, %%ebx), %%mm2        \n\t"
                        "movq 6(%1, %%ebx), %%mm3        \n\t"
                        PAVGB" %%mm1, %%mm0                \n\t"
                        PAVGB" %%mm3, %%mm2                \n\t"
                        "movq %%mm0, %%mm1                \n\t"
                        "movq %%mm2, %%mm3                \n\t"
                        "psrlq $24, %%mm0                \n\t"  /* shift 3 bytes = 1 BGR pixel */
                        "psrlq $24, %%mm2                \n\t"
                        PAVGB" %%mm1, %%mm0                \n\t"
                        PAVGB" %%mm3, %%mm2                \n\t"
                        "punpcklbw %%mm7, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
#else
                        /* plain MMX: widen to words, sum the 4 pixels of the
                           2x2 block, then >>2 to average */
                        "movd (%0, %%ebx), %%mm0        \n\t"
                        "movd (%1, %%ebx), %%mm1        \n\t"
                        "movd 3(%0, %%ebx), %%mm2        \n\t"
                        "movd 3(%1, %%ebx), %%mm3        \n\t"
                        "punpcklbw %%mm7, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm1                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "punpcklbw %%mm7, %%mm3                \n\t"
                        "paddw %%mm1, %%mm0                \n\t"
                        "paddw %%mm3, %%mm2                \n\t"
                        "paddw %%mm2, %%mm0                \n\t"
                        "movd 6(%0, %%ebx), %%mm4        \n\t"
                        "movd 6(%1, %%ebx), %%mm1        \n\t"
                        "movd 9(%0, %%ebx), %%mm2        \n\t"
                        "movd 9(%1, %%ebx), %%mm3        \n\t"
                        "punpcklbw %%mm7, %%mm4                \n\t"
                        "punpcklbw %%mm7, %%mm1                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "punpcklbw %%mm7, %%mm3                \n\t"
                        "paddw %%mm1, %%mm4                \n\t"
                        "paddw %%mm3, %%mm2                \n\t"
                        "paddw %%mm4, %%mm2                \n\t"
                        "psrlw $2, %%mm0                \n\t"
                        "psrlw $2, %%mm2                \n\t"
#endif
                        /* dot the averaged B,G,R words with V coeffs (mm1/mm3)
                           and U coeffs (mm6) simultaneously */
                        "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
                        "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"

                        "pmaddwd %%mm0, %%mm1                \n\t"
                        "pmaddwd %%mm2, %%mm3                \n\t"
                        "pmaddwd %%mm6, %%mm0                \n\t"
                        "pmaddwd %%mm6, %%mm2                \n\t"
#ifndef FAST_BGR2YV12
                        "psrad $8, %%mm0                \n\t"
                        "psrad $8, %%mm1                \n\t"
                        "psrad $8, %%mm2                \n\t"
                        "psrad $8, %%mm3                \n\t"
#endif
                        "packssdw %%mm2, %%mm0                \n\t"
                        "packssdw %%mm3, %%mm1                \n\t"
                        "pmaddwd %%mm5, %%mm0                \n\t"
                        "pmaddwd %%mm5, %%mm1                \n\t"
                        "packssdw %%mm1, %%mm0                \n\t" // V1 V0 U1 U0
                        "psraw $7, %%mm0                \n\t"

#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                        /* chroma samples 2 and 3, same PAVGB averaging */
                        "movq 12(%0, %%ebx), %%mm4        \n\t"
                        "movq 12(%1, %%ebx), %%mm1        \n\t"
                        "movq 18(%0, %%ebx), %%mm2        \n\t"
                        "movq 18(%1, %%ebx), %%mm3        \n\t"
                        PAVGB" %%mm1, %%mm4                \n\t"
                        PAVGB" %%mm3, %%mm2                \n\t"
                        "movq %%mm4, %%mm1                \n\t"
                        "movq %%mm2, %%mm3                \n\t"
                        "psrlq $24, %%mm4                \n\t"
                        "psrlq $24, %%mm2                \n\t"
                        PAVGB" %%mm1, %%mm4                \n\t"
                        PAVGB" %%mm3, %%mm2                \n\t"
                        "punpcklbw %%mm7, %%mm4                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
#else
                        /* chroma samples 2 and 3, sum-and-shift averaging;
                           mm5 is used as scratch and reloaded with w1111 below */
                        "movd 12(%0, %%ebx), %%mm4        \n\t"
                        "movd 12(%1, %%ebx), %%mm1        \n\t"
                        "movd 15(%0, %%ebx), %%mm2        \n\t"
                        "movd 15(%1, %%ebx), %%mm3        \n\t"
                        "punpcklbw %%mm7, %%mm4                \n\t"
                        "punpcklbw %%mm7, %%mm1                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "punpcklbw %%mm7, %%mm3                \n\t"
                        "paddw %%mm1, %%mm4                \n\t"
                        "paddw %%mm3, %%mm2                \n\t"
                        "paddw %%mm2, %%mm4                \n\t"
                        "movd 18(%0, %%ebx), %%mm5        \n\t"
                        "movd 18(%1, %%ebx), %%mm1        \n\t"
                        "movd 21(%0, %%ebx), %%mm2        \n\t"
                        "movd 21(%1, %%ebx), %%mm3        \n\t"
                        "punpcklbw %%mm7, %%mm5                \n\t"
                        "punpcklbw %%mm7, %%mm1                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "punpcklbw %%mm7, %%mm3                \n\t"
                        "paddw %%mm1, %%mm5                \n\t"
                        "paddw %%mm3, %%mm2                \n\t"
                        "paddw %%mm5, %%mm2                \n\t"
                        "movq "MANGLE(w1111)", %%mm5                \n\t"
                        "psrlw $2, %%mm4                \n\t"
                        "psrlw $2, %%mm2                \n\t"
#endif
                        "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
                        "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"

                        "pmaddwd %%mm4, %%mm1                \n\t"
                        "pmaddwd %%mm2, %%mm3                \n\t"
                        "pmaddwd %%mm6, %%mm4                \n\t"
                        "pmaddwd %%mm6, %%mm2                \n\t"
#ifndef FAST_BGR2YV12
                        "psrad $8, %%mm4                \n\t"
                        "psrad $8, %%mm1                \n\t"
                        "psrad $8, %%mm2                \n\t"
                        "psrad $8, %%mm3                \n\t"
#endif
                        "packssdw %%mm2, %%mm4                \n\t"
                        "packssdw %%mm3, %%mm1                \n\t"
                        "pmaddwd %%mm5, %%mm4                \n\t"
                        "pmaddwd %%mm5, %%mm1                \n\t"
                        "addl $24, %%ebx                \n\t"  /* advance 4 chroma samples * 6 bytes */
                        "packssdw %%mm1, %%mm4                \n\t" // V3 V2 U3 U2
                        "psraw $7, %%mm4                \n\t"

                        /* interleave mm0/mm4 into U3..U0 and V3..V0, bias, and
                           store 4 U bytes to %2 and 4 V bytes to %3 */
                        "movq %%mm0, %%mm1                \n\t"
                        "punpckldq %%mm4, %%mm0                \n\t"
                        "punpckhdq %%mm4, %%mm1                \n\t"
                        "packsswb %%mm1, %%mm0                \n\t"
                        "paddb "MANGLE(bgr2UVOffset)", %%mm0        \n\t"

                        "movd %%mm0, (%2, %%eax)        \n\t"
                        "punpckhdq %%mm0, %%mm0                \n\t"
                        "movd %%mm0, (%3, %%eax)        \n\t"
                        "addl $4, %%eax                        \n\t"
                        " js 1b                                \n\t"
                        : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
                        : "%eax", "%ebx"
                );

                udst += chromStride;
                vdst += chromStride;
                src  += srcStride*2;  /* ydst was already advanced twice in the luma loop */
        }

        /* leave MMX state clean; SFENCE orders the non-temporal MOVNTQ stores */
        asm volatile(   EMMS" \n\t"
                        SFENCE" \n\t"
                        :::"memory");
#else
        y=0;
#endif
        /* Scalar fallback (also finishes the last two rows on MMX builds).
           RY/GY/BY etc. are integer RGB->YUV coefficients defined elsewhere in
           this file — presumably BT.601 fixed-point weights; not asserted here.
           Chroma is taken from the even row only (no vertical averaging) —
           see the FIXME in the header comment. */
        for(; y<height; y+=2)
        {
                unsigned i;
                /* even row: Y for both pixels of each pair, plus U and V */
                for(i=0; i<chromWidth; i++)
                {
                        unsigned int b= src[6*i+0];
                        unsigned int g= src[6*i+1];
                        unsigned int r= src[6*i+2];

                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
                        unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
                        unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;

                        udst[i]         = U;
                        vdst[i]         = V;
                        ydst[2*i]         = Y;

                        b= src[6*i+3];
                        g= src[6*i+4];
                        r= src[6*i+5];

                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
                        ydst[2*i+1]         = Y;
                }
                ydst += lumStride;
                src  += srcStride;

                /* odd row: luma only, chroma of this row is ignored */
                for(i=0; i<chromWidth; i++)
                {
                        unsigned int b= src[6*i+0];
                        unsigned int g= src[6*i+1];
                        unsigned int r= src[6*i+2];

                        unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;

                        ydst[2*i]         = Y;

                        b= src[6*i+3];
                        g= src[6*i+4];
                        r= src[6*i+5];

                        Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
                        ydst[2*i+1]         = Y;
                }
                udst += chromStride;
                vdst += chromStride;
                ydst += lumStride;
                src  += srcStride;
        }
}
2349 5d55fdb4 Michael Niedermayer
2350
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2351 f0b62bbd Michael Niedermayer
                            unsigned width, unsigned height, int src1Stride,
2352
                            int src2Stride, int dstStride){
2353 0d9f3d85 Arpi
        unsigned h;
2354 5d55fdb4 Michael Niedermayer
2355
        for(h=0; h < height; h++)
2356
        {
2357 0d9f3d85 Arpi
                unsigned w;
2358 5d55fdb4 Michael Niedermayer
2359
#ifdef HAVE_MMX
2360
#ifdef HAVE_SSE2
2361
                asm(
2362
                        "xorl %%eax, %%eax                \n\t"
2363
                        "1:                                \n\t"
2364
                        PREFETCH" 64(%1, %%eax)                \n\t"
2365
                        PREFETCH" 64(%2, %%eax)                \n\t"
2366
                        "movdqa (%1, %%eax), %%xmm0        \n\t"
2367
                        "movdqa (%1, %%eax), %%xmm1        \n\t"
2368
                        "movdqa (%2, %%eax), %%xmm2        \n\t"
2369
                        "punpcklbw %%xmm2, %%xmm0        \n\t"
2370
                        "punpckhbw %%xmm2, %%xmm1        \n\t"
2371
                        "movntdq %%xmm0, (%0, %%eax, 2)        \n\t"
2372
                        "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
2373
                        "addl $16, %%eax                        \n\t"
2374
                        "cmpl %3, %%eax                        \n\t"
2375
                        " jb 1b                                \n\t"
2376
                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2377
                        : "memory", "%eax"
2378
                );
2379
#else
2380
                asm(
2381
                        "xorl %%eax, %%eax                \n\t"
2382
                        "1:                                \n\t"
2383
                        PREFETCH" 64(%1, %%eax)                \n\t"
2384
                        PREFETCH" 64(%2, %%eax)                \n\t"
2385
                        "movq (%1, %%eax), %%mm0        \n\t"
2386
                        "movq 8(%1, %%eax), %%mm2        \n\t"
2387
                        "movq %%mm0, %%mm1                \n\t"
2388
                        "movq %%mm2, %%mm3                \n\t"
2389
                        "movq (%2, %%eax), %%mm4        \n\t"
2390
                        "movq 8(%2, %%eax), %%mm5        \n\t"
2391
                        "punpcklbw %%mm4, %%mm0                \n\t"
2392
                        "punpckhbw %%mm4, %%mm1                \n\t"
2393
                        "punpcklbw %%mm5, %%mm2                \n\t"
2394
                        "punpckhbw %%mm5, %%mm3                \n\t"
2395
                        MOVNTQ" %%mm0, (%0, %%eax, 2)        \n\t"
2396
                        MOVNTQ" %%mm1, 8(%0, %%eax, 2)        \n\t