Revision c0038328

View differences:

libswscale/Makefile
12 12
                               bfin/yuv2rgb_bfin.o
13 13
OBJS-$(CONFIG_MLIB)        +=  mlib/yuv2rgb_mlib.o
14 14
OBJS-$(HAVE_ALTIVEC)       +=  ppc/yuv2rgb_altivec.o
15
OBJS-$(HAVE_MMX)           +=  x86/yuv2rgb_mmx.o
15
OBJS-$(HAVE_MMX)           +=  x86/rgb2rgb.o            \
16
                               x86/yuv2rgb_mmx.o
16 17
OBJS-$(HAVE_VIS)           +=  sparc/yuv2rgb_vis.o
17 18

  
18 19
TESTPROGS = colorspace swscale
libswscale/rgb2rgb.c
24 24
 */
25 25
#include <inttypes.h>
26 26
#include "config.h"
27
#include "libavutil/x86_cpu.h"
28 27
#include "libavutil/bswap.h"
29 28
#include "rgb2rgb.h"
30 29
#include "swscale.h"
......
95 94
                     long width, long height,
96 95
                     long lumStride, long chromStride, long srcStride);
97 96

  
98

  
99
#if ARCH_X86
100
DECLARE_ASM_CONST(8, uint64_t, mmx_ff)       = 0x00000000000000FFULL;
101
DECLARE_ASM_CONST(8, uint64_t, mmx_null)     = 0x0000000000000000ULL;
102
DECLARE_ASM_CONST(8, uint64_t, mmx_one)      = 0xFFFFFFFFFFFFFFFFULL;
103
DECLARE_ASM_CONST(8, uint64_t, mask32b)      = 0x000000FF000000FFULL;
104
DECLARE_ASM_CONST(8, uint64_t, mask32g)      = 0x0000FF000000FF00ULL;
105
DECLARE_ASM_CONST(8, uint64_t, mask32r)      = 0x00FF000000FF0000ULL;
106
DECLARE_ASM_CONST(8, uint64_t, mask32a)      = 0xFF000000FF000000ULL;
107
DECLARE_ASM_CONST(8, uint64_t, mask32)       = 0x00FFFFFF00FFFFFFULL;
108
DECLARE_ASM_CONST(8, uint64_t, mask3216br)   = 0x00F800F800F800F8ULL;
109
DECLARE_ASM_CONST(8, uint64_t, mask3216g)    = 0x0000FC000000FC00ULL;
110
DECLARE_ASM_CONST(8, uint64_t, mask3215g)    = 0x0000F8000000F800ULL;
111
DECLARE_ASM_CONST(8, uint64_t, mul3216)      = 0x2000000420000004ULL;
112
DECLARE_ASM_CONST(8, uint64_t, mul3215)      = 0x2000000820000008ULL;
113
DECLARE_ASM_CONST(8, uint64_t, mask24b)      = 0x00FF0000FF0000FFULL;
114
DECLARE_ASM_CONST(8, uint64_t, mask24g)      = 0xFF0000FF0000FF00ULL;
115
DECLARE_ASM_CONST(8, uint64_t, mask24r)      = 0x0000FF0000FF0000ULL;
116
DECLARE_ASM_CONST(8, uint64_t, mask24l)      = 0x0000000000FFFFFFULL;
117
DECLARE_ASM_CONST(8, uint64_t, mask24h)      = 0x0000FFFFFF000000ULL;
118
DECLARE_ASM_CONST(8, uint64_t, mask24hh)     = 0xffff000000000000ULL;
119
DECLARE_ASM_CONST(8, uint64_t, mask24hhh)    = 0xffffffff00000000ULL;
120
DECLARE_ASM_CONST(8, uint64_t, mask24hhhh)   = 0xffffffffffff0000ULL;
121
DECLARE_ASM_CONST(8, uint64_t, mask15b)      = 0x001F001F001F001FULL; /* 00000000 00011111  xxB */
122
DECLARE_ASM_CONST(8, uint64_t, mask15rg)     = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000  RGx */
123
DECLARE_ASM_CONST(8, uint64_t, mask15s)      = 0xFFE0FFE0FFE0FFE0ULL;
124
DECLARE_ASM_CONST(8, uint64_t, mask15g)      = 0x03E003E003E003E0ULL;
125
DECLARE_ASM_CONST(8, uint64_t, mask15r)      = 0x7C007C007C007C00ULL;
126
#define mask16b mask15b
127
DECLARE_ASM_CONST(8, uint64_t, mask16g)      = 0x07E007E007E007E0ULL;
128
DECLARE_ASM_CONST(8, uint64_t, mask16r)      = 0xF800F800F800F800ULL;
129
DECLARE_ASM_CONST(8, uint64_t, red_16mask)   = 0x0000f8000000f800ULL;
130
DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
131
DECLARE_ASM_CONST(8, uint64_t, blue_16mask)  = 0x0000001f0000001fULL;
132
DECLARE_ASM_CONST(8, uint64_t, red_15mask)   = 0x00007c0000007c00ULL;
133
DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
134
DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;
135
#endif /* ARCH_X86 */
136

  
137 97
#define RGB2YUV_SHIFT 8
138 98
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
139 99
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
......
145 105
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
146 106
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
147 107

  
148
//Note: We have C, MMX, MMX2, 3DNOW versions, there is no 3DNOW + MMX2 one.
149 108
//plain C versions
150
#define COMPILE_TEMPLATE_MMX 0
151
#define COMPILE_TEMPLATE_MMX2 0
152
#define COMPILE_TEMPLATE_AMD3DNOW 0
153
#define COMPILE_TEMPLATE_SSE2 0
154
#define RENAME(a) a ## _C
155
#include "rgb2rgb_template.c"
156

  
157
#if ARCH_X86
158

  
159
//MMX versions
160
#undef RENAME
161
#undef COMPILE_TEMPLATE_MMX
162
#define COMPILE_TEMPLATE_MMX 1
163
#define RENAME(a) a ## _MMX
164
#include "rgb2rgb_template.c"
165

  
166
//MMX2 versions
167
#undef RENAME
168
#undef COMPILE_TEMPLATE_MMX2
169
#define COMPILE_TEMPLATE_MMX2 1
170
#define RENAME(a) a ## _MMX2
171
#include "rgb2rgb_template.c"
172

  
173
//SSE2 versions
174
#undef RENAME
175
#undef COMPILE_TEMPLATE_SSE2
176
#define COMPILE_TEMPLATE_SSE2 1
177
#define RENAME(a) a ## _SSE2
178
#include "rgb2rgb_template.c"
179

  
180
//3DNOW versions
181
#undef RENAME
182
#undef COMPILE_TEMPLATE_MMX2
183
#undef COMPILE_TEMPLATE_SSE2
184
#undef COMPILE_TEMPLATE_AMD3DNOW
185
#define COMPILE_TEMPLATE_MMX2 0
186
#define COMPILE_TEMPLATE_SSE2 1
187
#define COMPILE_TEMPLATE_AMD3DNOW 1
188
#define RENAME(a) a ## _3DNOW
189 109
#include "rgb2rgb_template.c"
190 110

  
191
#endif //ARCH_X86 || ARCH_X86_64
192 111

  
193 112
/*
194 113
 RGB15->RGB16 original by Strepto/Astral
......
199 118

  
200 119
void sws_rgb2rgb_init(int flags)
201 120
{
121
    rgb2rgb_init_c();
202 122
#if HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX
203
    if (flags & SWS_CPU_CAPS_SSE2)
204
        rgb2rgb_init_SSE2();
205
    else if (flags & SWS_CPU_CAPS_MMX2)
206
        rgb2rgb_init_MMX2();
207
    else if (flags & SWS_CPU_CAPS_3DNOW)
208
        rgb2rgb_init_3DNOW();
209
    else if (flags & SWS_CPU_CAPS_MMX)
210
        rgb2rgb_init_MMX();
211
    else
123
    rgb2rgb_init_x86(flags);
212 124
#endif /* HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX */
213
        rgb2rgb_init_C();
214 125
}
215 126

  
216 127
#if LIBSWSCALE_VERSION_MAJOR < 1
libswscale/rgb2rgb.h
168 168

  
169 169
void sws_rgb2rgb_init(int flags);
170 170

  
171
void rgb2rgb_init_x86(int flags);
172

  
171 173
#endif /* SWSCALE_RGB2RGB_H */
libswscale/rgb2rgb_template.c
26 26

  
27 27
#include <stddef.h>
28 28

  
29
#undef PREFETCH
30
#undef MOVNTQ
31
#undef EMMS
32
#undef SFENCE
33
#undef MMREG_SIZE
34
#undef PAVGB
35

  
36
#if COMPILE_TEMPLATE_SSE2
37
#define MMREG_SIZE 16
38
#else
39
#define MMREG_SIZE 8
40
#endif
41

  
42
#if COMPILE_TEMPLATE_AMD3DNOW
43
#define PREFETCH  "prefetch"
44
#define PAVGB     "pavgusb"
45
#elif COMPILE_TEMPLATE_MMX2
46
#define PREFETCH "prefetchnta"
47
#define PAVGB     "pavgb"
48
#else
49
#define PREFETCH  " # nop"
50
#endif
51

  
52
#if COMPILE_TEMPLATE_AMD3DNOW
53
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
54
#define EMMS     "femms"
55
#else
56
#define EMMS     "emms"
57
#endif
58

  
59
#if COMPILE_TEMPLATE_MMX2
60
#define MOVNTQ "movntq"
61
#define SFENCE "sfence"
62
#else
63
#define MOVNTQ "movq"
64
#define SFENCE " # nop"
65
#endif
66

  
67
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
29
static inline void rgb24tobgr32_c(const uint8_t *src, uint8_t *dst, long src_size)
68 30
{
69 31
    uint8_t *dest = dst;
70 32
    const uint8_t *s = src;
71 33
    const uint8_t *end;
72
#if COMPILE_TEMPLATE_MMX
73
    const uint8_t *mm_end;
74
#endif
75 34
    end = s + src_size;
76
#if COMPILE_TEMPLATE_MMX
77
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
78
    mm_end = end - 23;
79
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
80
    while (s < mm_end) {
81
        __asm__ volatile(
82
            PREFETCH"    32%1           \n\t"
83
            "movd          %1, %%mm0    \n\t"
84
            "punpckldq    3%1, %%mm0    \n\t"
85
            "movd         6%1, %%mm1    \n\t"
86
            "punpckldq    9%1, %%mm1    \n\t"
87
            "movd        12%1, %%mm2    \n\t"
88
            "punpckldq   15%1, %%mm2    \n\t"
89
            "movd        18%1, %%mm3    \n\t"
90
            "punpckldq   21%1, %%mm3    \n\t"
91
            "por        %%mm7, %%mm0    \n\t"
92
            "por        %%mm7, %%mm1    \n\t"
93
            "por        %%mm7, %%mm2    \n\t"
94
            "por        %%mm7, %%mm3    \n\t"
95
            MOVNTQ"     %%mm0,   %0     \n\t"
96
            MOVNTQ"     %%mm1,  8%0     \n\t"
97
            MOVNTQ"     %%mm2, 16%0     \n\t"
98
            MOVNTQ"     %%mm3, 24%0"
99
            :"=m"(*dest)
100
            :"m"(*s)
101
            :"memory");
102
        dest += 32;
103
        s += 24;
104
    }
105
    __asm__ volatile(SFENCE:::"memory");
106
    __asm__ volatile(EMMS:::"memory");
107
#endif
35

  
108 36
    while (s < end) {
109 37
#if HAVE_BIGENDIAN
110 38
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
......
122 50
    }
123 51
}
124 52

  
125
#define STORE_BGR24_MMX \
126
            "psrlq         $8, %%mm2    \n\t" \
127
            "psrlq         $8, %%mm3    \n\t" \
128
            "psrlq         $8, %%mm6    \n\t" \
129
            "psrlq         $8, %%mm7    \n\t" \
130
            "pand "MANGLE(mask24l)", %%mm0\n\t" \
131
            "pand "MANGLE(mask24l)", %%mm1\n\t" \
132
            "pand "MANGLE(mask24l)", %%mm4\n\t" \
133
            "pand "MANGLE(mask24l)", %%mm5\n\t" \
134
            "pand "MANGLE(mask24h)", %%mm2\n\t" \
135
            "pand "MANGLE(mask24h)", %%mm3\n\t" \
136
            "pand "MANGLE(mask24h)", %%mm6\n\t" \
137
            "pand "MANGLE(mask24h)", %%mm7\n\t" \
138
            "por        %%mm2, %%mm0    \n\t" \
139
            "por        %%mm3, %%mm1    \n\t" \
140
            "por        %%mm6, %%mm4    \n\t" \
141
            "por        %%mm7, %%mm5    \n\t" \
142
 \
143
            "movq       %%mm1, %%mm2    \n\t" \
144
            "movq       %%mm4, %%mm3    \n\t" \
145
            "psllq        $48, %%mm2    \n\t" \
146
            "psllq        $32, %%mm3    \n\t" \
147
            "pand "MANGLE(mask24hh)", %%mm2\n\t" \
148
            "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
149
            "por        %%mm2, %%mm0    \n\t" \
150
            "psrlq        $16, %%mm1    \n\t" \
151
            "psrlq        $32, %%mm4    \n\t" \
152
            "psllq        $16, %%mm5    \n\t" \
153
            "por        %%mm3, %%mm1    \n\t" \
154
            "pand  "MANGLE(mask24hhhh)", %%mm5\n\t" \
155
            "por        %%mm5, %%mm4    \n\t" \
156
 \
157
            MOVNTQ"     %%mm0,   %0     \n\t" \
158
            MOVNTQ"     %%mm1,  8%0     \n\t" \
159
            MOVNTQ"     %%mm4, 16%0"
160

  
161

  
162
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
53
static inline void rgb32tobgr24_c(const uint8_t *src, uint8_t *dst, long src_size)
163 54
{
164 55
    uint8_t *dest = dst;
165 56
    const uint8_t *s = src;
166 57
    const uint8_t *end;
167
#if COMPILE_TEMPLATE_MMX
168
    const uint8_t *mm_end;
169
#endif
58

  
170 59
    end = s + src_size;
171
#if COMPILE_TEMPLATE_MMX
172
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
173
    mm_end = end - 31;
174
    while (s < mm_end) {
175
        __asm__ volatile(
176
            PREFETCH"    32%1           \n\t"
177
            "movq          %1, %%mm0    \n\t"
178
            "movq         8%1, %%mm1    \n\t"
179
            "movq        16%1, %%mm4    \n\t"
180
            "movq        24%1, %%mm5    \n\t"
181
            "movq       %%mm0, %%mm2    \n\t"
182
            "movq       %%mm1, %%mm3    \n\t"
183
            "movq       %%mm4, %%mm6    \n\t"
184
            "movq       %%mm5, %%mm7    \n\t"
185
            STORE_BGR24_MMX
186
            :"=m"(*dest)
187
            :"m"(*s)
188
            :"memory");
189
        dest += 24;
190
        s += 32;
191
    }
192
    __asm__ volatile(SFENCE:::"memory");
193
    __asm__ volatile(EMMS:::"memory");
194
#endif
60

  
195 61
    while (s < end) {
196 62
#if HAVE_BIGENDIAN
197 63
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
......
215 81
 MMX2, 3DNOW optimization by Nick Kurshev
216 82
 32-bit C version, and and&add trick by Michael Niedermayer
217 83
*/
218
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
84
static inline void rgb15to16_c(const uint8_t *src, uint8_t *dst, long src_size)
219 85
{
220 86
    register const uint8_t* s=src;
221 87
    register uint8_t* d=dst;
222 88
    register const uint8_t *end;
223 89
    const uint8_t *mm_end;
224 90
    end = s + src_size;
225
#if COMPILE_TEMPLATE_MMX
226
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
227
    __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
228
    mm_end = end - 15;
229
    while (s<mm_end) {
230
        __asm__ volatile(
231
            PREFETCH"  32%1         \n\t"
232
            "movq        %1, %%mm0  \n\t"
233
            "movq       8%1, %%mm2  \n\t"
234
            "movq     %%mm0, %%mm1  \n\t"
235
            "movq     %%mm2, %%mm3  \n\t"
236
            "pand     %%mm4, %%mm0  \n\t"
237
            "pand     %%mm4, %%mm2  \n\t"
238
            "paddw    %%mm1, %%mm0  \n\t"
239
            "paddw    %%mm3, %%mm2  \n\t"
240
            MOVNTQ"   %%mm0,  %0    \n\t"
241
            MOVNTQ"   %%mm2, 8%0"
242
            :"=m"(*d)
243
            :"m"(*s)
244
        );
245
        d+=16;
246
        s+=16;
247
    }
248
    __asm__ volatile(SFENCE:::"memory");
249
    __asm__ volatile(EMMS:::"memory");
250
#endif
251 91
    mm_end = end - 3;
252 92
    while (s < mm_end) {
253 93
        register unsigned x= *((const uint32_t *)s);
......
261 101
    }
262 102
}
263 103

  
264
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
104
static inline void rgb16to15_c(const uint8_t *src, uint8_t *dst, long src_size)
265 105
{
266 106
    register const uint8_t* s=src;
267 107
    register uint8_t* d=dst;
268 108
    register const uint8_t *end;
269 109
    const uint8_t *mm_end;
270 110
    end = s + src_size;
271
#if COMPILE_TEMPLATE_MMX
272
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
273
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
274
    __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
275
    mm_end = end - 15;
276
    while (s<mm_end) {
277
        __asm__ volatile(
278
            PREFETCH"  32%1         \n\t"
279
            "movq        %1, %%mm0  \n\t"
280
            "movq       8%1, %%mm2  \n\t"
281
            "movq     %%mm0, %%mm1  \n\t"
282
            "movq     %%mm2, %%mm3  \n\t"
283
            "psrlq       $1, %%mm0  \n\t"
284
            "psrlq       $1, %%mm2  \n\t"
285
            "pand     %%mm7, %%mm0  \n\t"
286
            "pand     %%mm7, %%mm2  \n\t"
287
            "pand     %%mm6, %%mm1  \n\t"
288
            "pand     %%mm6, %%mm3  \n\t"
289
            "por      %%mm1, %%mm0  \n\t"
290
            "por      %%mm3, %%mm2  \n\t"
291
            MOVNTQ"   %%mm0,  %0    \n\t"
292
            MOVNTQ"   %%mm2, 8%0"
293
            :"=m"(*d)
294
            :"m"(*s)
295
        );
296
        d+=16;
297
        s+=16;
298
    }
299
    __asm__ volatile(SFENCE:::"memory");
300
    __asm__ volatile(EMMS:::"memory");
301
#endif
111

  
302 112
    mm_end = end - 3;
303 113
    while (s < mm_end) {
304 114
        register uint32_t x= *((const uint32_t*)s);
......
312 122
    }
313 123
}
314 124

  
315
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
125
static inline void rgb32to16_c(const uint8_t *src, uint8_t *dst, long src_size)
316 126
{
317 127
    const uint8_t *s = src;
318 128
    const uint8_t *end;
319
#if COMPILE_TEMPLATE_MMX
320
    const uint8_t *mm_end;
321
#endif
322 129
    uint16_t *d = (uint16_t *)dst;
323 130
    end = s + src_size;
324
#if COMPILE_TEMPLATE_MMX
325
    mm_end = end - 15;
326
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
327
    __asm__ volatile(
328
        "movq           %3, %%mm5   \n\t"
329
        "movq           %4, %%mm6   \n\t"
330
        "movq           %5, %%mm7   \n\t"
331
        "jmp 2f                     \n\t"
332
        ".p2align        4          \n\t"
333
        "1:                         \n\t"
334
        PREFETCH"   32(%1)          \n\t"
335
        "movd         (%1), %%mm0   \n\t"
336
        "movd        4(%1), %%mm3   \n\t"
337
        "punpckldq   8(%1), %%mm0   \n\t"
338
        "punpckldq  12(%1), %%mm3   \n\t"
339
        "movq        %%mm0, %%mm1   \n\t"
340
        "movq        %%mm3, %%mm4   \n\t"
341
        "pand        %%mm6, %%mm0   \n\t"
342
        "pand        %%mm6, %%mm3   \n\t"
343
        "pmaddwd     %%mm7, %%mm0   \n\t"
344
        "pmaddwd     %%mm7, %%mm3   \n\t"
345
        "pand        %%mm5, %%mm1   \n\t"
346
        "pand        %%mm5, %%mm4   \n\t"
347
        "por         %%mm1, %%mm0   \n\t"
348
        "por         %%mm4, %%mm3   \n\t"
349
        "psrld          $5, %%mm0   \n\t"
350
        "pslld         $11, %%mm3   \n\t"
351
        "por         %%mm3, %%mm0   \n\t"
352
        MOVNTQ"      %%mm0, (%0)    \n\t"
353
        "add           $16,  %1     \n\t"
354
        "add            $8,  %0     \n\t"
355
        "2:                         \n\t"
356
        "cmp            %2,  %1     \n\t"
357
        " jb            1b          \n\t"
358
        : "+r" (d), "+r"(s)
359
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
360
    );
361
#else
362
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
363
    __asm__ volatile(
364
        "movq    %0, %%mm7    \n\t"
365
        "movq    %1, %%mm6    \n\t"
366
        ::"m"(red_16mask),"m"(green_16mask));
367
    while (s < mm_end) {
368
        __asm__ volatile(
369
            PREFETCH"    32%1           \n\t"
370
            "movd          %1, %%mm0    \n\t"
371
            "movd         4%1, %%mm3    \n\t"
372
            "punpckldq    8%1, %%mm0    \n\t"
373
            "punpckldq   12%1, %%mm3    \n\t"
374
            "movq       %%mm0, %%mm1    \n\t"
375
            "movq       %%mm0, %%mm2    \n\t"
376
            "movq       %%mm3, %%mm4    \n\t"
377
            "movq       %%mm3, %%mm5    \n\t"
378
            "psrlq         $3, %%mm0    \n\t"
379
            "psrlq         $3, %%mm3    \n\t"
380
            "pand          %2, %%mm0    \n\t"
381
            "pand          %2, %%mm3    \n\t"
382
            "psrlq         $5, %%mm1    \n\t"
383
            "psrlq         $5, %%mm4    \n\t"
384
            "pand       %%mm6, %%mm1    \n\t"
385
            "pand       %%mm6, %%mm4    \n\t"
386
            "psrlq         $8, %%mm2    \n\t"
387
            "psrlq         $8, %%mm5    \n\t"
388
            "pand       %%mm7, %%mm2    \n\t"
389
            "pand       %%mm7, %%mm5    \n\t"
390
            "por        %%mm1, %%mm0    \n\t"
391
            "por        %%mm4, %%mm3    \n\t"
392
            "por        %%mm2, %%mm0    \n\t"
393
            "por        %%mm5, %%mm3    \n\t"
394
            "psllq        $16, %%mm3    \n\t"
395
            "por        %%mm3, %%mm0    \n\t"
396
            MOVNTQ"     %%mm0, %0       \n\t"
397
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
398
        d += 4;
399
        s += 16;
400
    }
401
#endif
402
    __asm__ volatile(SFENCE:::"memory");
403
    __asm__ volatile(EMMS:::"memory");
404
#endif
131

  
405 132
    while (s < end) {
406 133
        register int rgb = *(const uint32_t*)s; s += 4;
407 134
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
408 135
    }
409 136
}
410 137

  
411
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
138
static inline void rgb32tobgr16_c(const uint8_t *src, uint8_t *dst, long src_size)
412 139
{
413 140
    const uint8_t *s = src;
414 141
    const uint8_t *end;
415
#if COMPILE_TEMPLATE_MMX
416
    const uint8_t *mm_end;
417
#endif
418 142
    uint16_t *d = (uint16_t *)dst;
419 143
    end = s + src_size;
420
#if COMPILE_TEMPLATE_MMX
421
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
422
    __asm__ volatile(
423
        "movq          %0, %%mm7    \n\t"
424
        "movq          %1, %%mm6    \n\t"
425
        ::"m"(red_16mask),"m"(green_16mask));
426
    mm_end = end - 15;
427
    while (s < mm_end) {
428
        __asm__ volatile(
429
            PREFETCH"    32%1           \n\t"
430
            "movd          %1, %%mm0    \n\t"
431
            "movd         4%1, %%mm3    \n\t"
432
            "punpckldq    8%1, %%mm0    \n\t"
433
            "punpckldq   12%1, %%mm3    \n\t"
434
            "movq       %%mm0, %%mm1    \n\t"
435
            "movq       %%mm0, %%mm2    \n\t"
436
            "movq       %%mm3, %%mm4    \n\t"
437
            "movq       %%mm3, %%mm5    \n\t"
438
            "psllq         $8, %%mm0    \n\t"
439
            "psllq         $8, %%mm3    \n\t"
440
            "pand       %%mm7, %%mm0    \n\t"
441
            "pand       %%mm7, %%mm3    \n\t"
442
            "psrlq         $5, %%mm1    \n\t"
443
            "psrlq         $5, %%mm4    \n\t"
444
            "pand       %%mm6, %%mm1    \n\t"
445
            "pand       %%mm6, %%mm4    \n\t"
446
            "psrlq        $19, %%mm2    \n\t"
447
            "psrlq        $19, %%mm5    \n\t"
448
            "pand          %2, %%mm2    \n\t"
449
            "pand          %2, %%mm5    \n\t"
450
            "por        %%mm1, %%mm0    \n\t"
451
            "por        %%mm4, %%mm3    \n\t"
452
            "por        %%mm2, %%mm0    \n\t"
453
            "por        %%mm5, %%mm3    \n\t"
454
            "psllq        $16, %%mm3    \n\t"
455
            "por        %%mm3, %%mm0    \n\t"
456
            MOVNTQ"     %%mm0, %0       \n\t"
457
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
458
        d += 4;
459
        s += 16;
460
    }
461
    __asm__ volatile(SFENCE:::"memory");
462
    __asm__ volatile(EMMS:::"memory");
463
#endif
464 144
    while (s < end) {
465 145
        register int rgb = *(const uint32_t*)s; s += 4;
466 146
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
467 147
    }
468 148
}
469 149

  
470
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
150
static inline void rgb32to15_c(const uint8_t *src, uint8_t *dst, long src_size)
471 151
{
472 152
    const uint8_t *s = src;
473 153
    const uint8_t *end;
474
#if COMPILE_TEMPLATE_MMX
475
    const uint8_t *mm_end;
476
#endif
477 154
    uint16_t *d = (uint16_t *)dst;
478 155
    end = s + src_size;
479
#if COMPILE_TEMPLATE_MMX
480
    mm_end = end - 15;
481
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
482
    __asm__ volatile(
483
        "movq           %3, %%mm5   \n\t"
484
        "movq           %4, %%mm6   \n\t"
485
        "movq           %5, %%mm7   \n\t"
486
        "jmp            2f          \n\t"
487
        ".p2align        4          \n\t"
488
        "1:                         \n\t"
489
        PREFETCH"   32(%1)          \n\t"
490
        "movd         (%1), %%mm0   \n\t"
491
        "movd        4(%1), %%mm3   \n\t"
492
        "punpckldq   8(%1), %%mm0   \n\t"
493
        "punpckldq  12(%1), %%mm3   \n\t"
494
        "movq        %%mm0, %%mm1   \n\t"
495
        "movq        %%mm3, %%mm4   \n\t"
496
        "pand        %%mm6, %%mm0   \n\t"
497
        "pand        %%mm6, %%mm3   \n\t"
498
        "pmaddwd     %%mm7, %%mm0   \n\t"
499
        "pmaddwd     %%mm7, %%mm3   \n\t"
500
        "pand        %%mm5, %%mm1   \n\t"
501
        "pand        %%mm5, %%mm4   \n\t"
502
        "por         %%mm1, %%mm0   \n\t"
503
        "por         %%mm4, %%mm3   \n\t"
504
        "psrld          $6, %%mm0   \n\t"
505
        "pslld         $10, %%mm3   \n\t"
506
        "por         %%mm3, %%mm0   \n\t"
507
        MOVNTQ"      %%mm0, (%0)    \n\t"
508
        "add           $16,  %1     \n\t"
509
        "add            $8,  %0     \n\t"
510
        "2:                         \n\t"
511
        "cmp            %2,  %1     \n\t"
512
        " jb            1b          \n\t"
513
        : "+r" (d), "+r"(s)
514
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
515
    );
516
#else
517
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
518
    __asm__ volatile(
519
        "movq          %0, %%mm7    \n\t"
520
        "movq          %1, %%mm6    \n\t"
521
        ::"m"(red_15mask),"m"(green_15mask));
522
    while (s < mm_end) {
523
        __asm__ volatile(
524
            PREFETCH"    32%1           \n\t"
525
            "movd          %1, %%mm0    \n\t"
526
            "movd         4%1, %%mm3    \n\t"
527
            "punpckldq    8%1, %%mm0    \n\t"
528
            "punpckldq   12%1, %%mm3    \n\t"
529
            "movq       %%mm0, %%mm1    \n\t"
530
            "movq       %%mm0, %%mm2    \n\t"
531
            "movq       %%mm3, %%mm4    \n\t"
532
            "movq       %%mm3, %%mm5    \n\t"
533
            "psrlq         $3, %%mm0    \n\t"
534
            "psrlq         $3, %%mm3    \n\t"
535
            "pand          %2, %%mm0    \n\t"
536
            "pand          %2, %%mm3    \n\t"
537
            "psrlq         $6, %%mm1    \n\t"
538
            "psrlq         $6, %%mm4    \n\t"
539
            "pand       %%mm6, %%mm1    \n\t"
540
            "pand       %%mm6, %%mm4    \n\t"
541
            "psrlq         $9, %%mm2    \n\t"
542
            "psrlq         $9, %%mm5    \n\t"
543
            "pand       %%mm7, %%mm2    \n\t"
544
            "pand       %%mm7, %%mm5    \n\t"
545
            "por        %%mm1, %%mm0    \n\t"
546
            "por        %%mm4, %%mm3    \n\t"
547
            "por        %%mm2, %%mm0    \n\t"
548
            "por        %%mm5, %%mm3    \n\t"
549
            "psllq        $16, %%mm3    \n\t"
550
            "por        %%mm3, %%mm0    \n\t"
551
            MOVNTQ"     %%mm0, %0       \n\t"
552
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
553
        d += 4;
554
        s += 16;
555
    }
556
#endif
557
    __asm__ volatile(SFENCE:::"memory");
558
    __asm__ volatile(EMMS:::"memory");
559
#endif
560 156
    while (s < end) {
561 157
        register int rgb = *(const uint32_t*)s; s += 4;
562 158
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
563 159
    }
564 160
}
565 161

  
566
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
162
static inline void rgb32tobgr15_c(const uint8_t *src, uint8_t *dst, long src_size)
567 163
{
568 164
    const uint8_t *s = src;
569 165
    const uint8_t *end;
570
#if COMPILE_TEMPLATE_MMX
571
    const uint8_t *mm_end;
572
#endif
573 166
    uint16_t *d = (uint16_t *)dst;
574 167
    end = s + src_size;
575
#if COMPILE_TEMPLATE_MMX
576
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
577
    __asm__ volatile(
578
        "movq          %0, %%mm7    \n\t"
579
        "movq          %1, %%mm6    \n\t"
580
        ::"m"(red_15mask),"m"(green_15mask));
581
    mm_end = end - 15;
582
    while (s < mm_end) {
583
        __asm__ volatile(
584
            PREFETCH"    32%1           \n\t"
585
            "movd          %1, %%mm0    \n\t"
586
            "movd         4%1, %%mm3    \n\t"
587
            "punpckldq    8%1, %%mm0    \n\t"
588
            "punpckldq   12%1, %%mm3    \n\t"
589
            "movq       %%mm0, %%mm1    \n\t"
590
            "movq       %%mm0, %%mm2    \n\t"
591
            "movq       %%mm3, %%mm4    \n\t"
592
            "movq       %%mm3, %%mm5    \n\t"
593
            "psllq         $7, %%mm0    \n\t"
594
            "psllq         $7, %%mm3    \n\t"
595
            "pand       %%mm7, %%mm0    \n\t"
596
            "pand       %%mm7, %%mm3    \n\t"
597
            "psrlq         $6, %%mm1    \n\t"
598
            "psrlq         $6, %%mm4    \n\t"
599
            "pand       %%mm6, %%mm1    \n\t"
600
            "pand       %%mm6, %%mm4    \n\t"
601
            "psrlq        $19, %%mm2    \n\t"
602
            "psrlq        $19, %%mm5    \n\t"
603
            "pand          %2, %%mm2    \n\t"
604
            "pand          %2, %%mm5    \n\t"
605
            "por        %%mm1, %%mm0    \n\t"
606
            "por        %%mm4, %%mm3    \n\t"
607
            "por        %%mm2, %%mm0    \n\t"
608
            "por        %%mm5, %%mm3    \n\t"
609
            "psllq        $16, %%mm3    \n\t"
610
            "por        %%mm3, %%mm0    \n\t"
611
            MOVNTQ"     %%mm0, %0       \n\t"
612
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
613
        d += 4;
614
        s += 16;
615
    }
616
    __asm__ volatile(SFENCE:::"memory");
617
    __asm__ volatile(EMMS:::"memory");
618
#endif
619 168
    while (s < end) {
620 169
        register int rgb = *(const uint32_t*)s; s += 4;
621 170
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
622 171
    }
623 172
}
624 173

  
625
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
174
static inline void rgb24tobgr16_c(const uint8_t *src, uint8_t *dst, long src_size)
626 175
{
627 176
    const uint8_t *s = src;
628 177
    const uint8_t *end;
629
#if COMPILE_TEMPLATE_MMX
630
    const uint8_t *mm_end;
631
#endif
632 178
    uint16_t *d = (uint16_t *)dst;
633 179
    end = s + src_size;
634
#if COMPILE_TEMPLATE_MMX
635
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
636
    __asm__ volatile(
637
        "movq         %0, %%mm7     \n\t"
638
        "movq         %1, %%mm6     \n\t"
639
        ::"m"(red_16mask),"m"(green_16mask));
640
    mm_end = end - 11;
641
    while (s < mm_end) {
642
        __asm__ volatile(
643
            PREFETCH"    32%1           \n\t"
644
            "movd          %1, %%mm0    \n\t"
645
            "movd         3%1, %%mm3    \n\t"
646
            "punpckldq    6%1, %%mm0    \n\t"
647
            "punpckldq    9%1, %%mm3    \n\t"
648
            "movq       %%mm0, %%mm1    \n\t"
649
            "movq       %%mm0, %%mm2    \n\t"
650
            "movq       %%mm3, %%mm4    \n\t"
651
            "movq       %%mm3, %%mm5    \n\t"
652
            "psrlq         $3, %%mm0    \n\t"
653
            "psrlq         $3, %%mm3    \n\t"
654
            "pand          %2, %%mm0    \n\t"
655
            "pand          %2, %%mm3    \n\t"
656
            "psrlq         $5, %%mm1    \n\t"
657
            "psrlq         $5, %%mm4    \n\t"
658
            "pand       %%mm6, %%mm1    \n\t"
659
            "pand       %%mm6, %%mm4    \n\t"
660
            "psrlq         $8, %%mm2    \n\t"
661
            "psrlq         $8, %%mm5    \n\t"
662
            "pand       %%mm7, %%mm2    \n\t"
663
            "pand       %%mm7, %%mm5    \n\t"
664
            "por        %%mm1, %%mm0    \n\t"
665
            "por        %%mm4, %%mm3    \n\t"
666
            "por        %%mm2, %%mm0    \n\t"
667
            "por        %%mm5, %%mm3    \n\t"
668
            "psllq        $16, %%mm3    \n\t"
669
            "por        %%mm3, %%mm0    \n\t"
670
            MOVNTQ"     %%mm0, %0       \n\t"
671
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
672
        d += 4;
673
        s += 12;
674
    }
675
    __asm__ volatile(SFENCE:::"memory");
676
    __asm__ volatile(EMMS:::"memory");
677
#endif
678 180
    while (s < end) {
679 181
        const int b = *s++;
680 182
        const int g = *s++;
......
683 185
    }
684 186
}
685 187

  
686
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
188
static inline void rgb24to16_c(const uint8_t *src, uint8_t *dst, long src_size)
687 189
{
688 190
    const uint8_t *s = src;
689 191
    const uint8_t *end;
690
#if COMPILE_TEMPLATE_MMX
691
    const uint8_t *mm_end;
692
#endif
693 192
    uint16_t *d = (uint16_t *)dst;
694 193
    end = s + src_size;
695
#if COMPILE_TEMPLATE_MMX
696
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
697
    __asm__ volatile(
698
        "movq         %0, %%mm7     \n\t"
699
        "movq         %1, %%mm6     \n\t"
700
        ::"m"(red_16mask),"m"(green_16mask));
701
    mm_end = end - 15;
702
    while (s < mm_end) {
703
        __asm__ volatile(
704
            PREFETCH"    32%1           \n\t"
705
            "movd          %1, %%mm0    \n\t"
706
            "movd         3%1, %%mm3    \n\t"
707
            "punpckldq    6%1, %%mm0    \n\t"
708
            "punpckldq    9%1, %%mm3    \n\t"
709
            "movq       %%mm0, %%mm1    \n\t"
710
            "movq       %%mm0, %%mm2    \n\t"
711
            "movq       %%mm3, %%mm4    \n\t"
712
            "movq       %%mm3, %%mm5    \n\t"
713
            "psllq         $8, %%mm0    \n\t"
714
            "psllq         $8, %%mm3    \n\t"
715
            "pand       %%mm7, %%mm0    \n\t"
716
            "pand       %%mm7, %%mm3    \n\t"
717
            "psrlq         $5, %%mm1    \n\t"
718
            "psrlq         $5, %%mm4    \n\t"
719
            "pand       %%mm6, %%mm1    \n\t"
720
            "pand       %%mm6, %%mm4    \n\t"
721
            "psrlq        $19, %%mm2    \n\t"
722
            "psrlq        $19, %%mm5    \n\t"
723
            "pand          %2, %%mm2    \n\t"
724
            "pand          %2, %%mm5    \n\t"
725
            "por        %%mm1, %%mm0    \n\t"
726
            "por        %%mm4, %%mm3    \n\t"
727
            "por        %%mm2, %%mm0    \n\t"
728
            "por        %%mm5, %%mm3    \n\t"
729
            "psllq        $16, %%mm3    \n\t"
730
            "por        %%mm3, %%mm0    \n\t"
731
            MOVNTQ"     %%mm0, %0       \n\t"
732
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
733
        d += 4;
734
        s += 12;
735
    }
736
    __asm__ volatile(SFENCE:::"memory");
737
    __asm__ volatile(EMMS:::"memory");
738
#endif
739 194
    while (s < end) {
740 195
        const int r = *s++;
741 196
        const int g = *s++;
......
744 199
    }
745 200
}
746 201

  
747
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
202
static inline void rgb24tobgr15_c(const uint8_t *src, uint8_t *dst, long src_size)
748 203
{
749 204
    const uint8_t *s = src;
750 205
    const uint8_t *end;
751
#if COMPILE_TEMPLATE_MMX
752
    const uint8_t *mm_end;
753
#endif
754 206
    uint16_t *d = (uint16_t *)dst;
755 207
    end = s + src_size;
756
#if COMPILE_TEMPLATE_MMX
757
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
758
    __asm__ volatile(
759
        "movq          %0, %%mm7    \n\t"
760
        "movq          %1, %%mm6    \n\t"
761
        ::"m"(red_15mask),"m"(green_15mask));
762
    mm_end = end - 11;
763
    while (s < mm_end) {
764
        __asm__ volatile(
765
            PREFETCH"    32%1           \n\t"
766
            "movd          %1, %%mm0    \n\t"
767
            "movd         3%1, %%mm3    \n\t"
768
            "punpckldq    6%1, %%mm0    \n\t"
769
            "punpckldq    9%1, %%mm3    \n\t"
770
            "movq       %%mm0, %%mm1    \n\t"
771
            "movq       %%mm0, %%mm2    \n\t"
772
            "movq       %%mm3, %%mm4    \n\t"
773
            "movq       %%mm3, %%mm5    \n\t"
774
            "psrlq         $3, %%mm0    \n\t"
775
            "psrlq         $3, %%mm3    \n\t"
776
            "pand          %2, %%mm0    \n\t"
777
            "pand          %2, %%mm3    \n\t"
778
            "psrlq         $6, %%mm1    \n\t"
779
            "psrlq         $6, %%mm4    \n\t"
780
            "pand       %%mm6, %%mm1    \n\t"
781
            "pand       %%mm6, %%mm4    \n\t"
782
            "psrlq         $9, %%mm2    \n\t"
783
            "psrlq         $9, %%mm5    \n\t"
784
            "pand       %%mm7, %%mm2    \n\t"
785
            "pand       %%mm7, %%mm5    \n\t"
786
            "por        %%mm1, %%mm0    \n\t"
787
            "por        %%mm4, %%mm3    \n\t"
788
            "por        %%mm2, %%mm0    \n\t"
789
            "por        %%mm5, %%mm3    \n\t"
790
            "psllq        $16, %%mm3    \n\t"
791
            "por        %%mm3, %%mm0    \n\t"
792
            MOVNTQ"     %%mm0, %0       \n\t"
793
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
794
        d += 4;
795
        s += 12;
796
    }
797
    __asm__ volatile(SFENCE:::"memory");
798
    __asm__ volatile(EMMS:::"memory");
799
#endif
800 208
    while (s < end) {
801 209
        const int b = *s++;
802 210
        const int g = *s++;
......
805 213
    }
806 214
}
807 215

  
808
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
216
static inline void rgb24to15_c(const uint8_t *src, uint8_t *dst, long src_size)
809 217
{
810 218
    const uint8_t *s = src;
811 219
    const uint8_t *end;
812
#if COMPILE_TEMPLATE_MMX
813
    const uint8_t *mm_end;
814
#endif
815 220
    uint16_t *d = (uint16_t *)dst;
816 221
    end = s + src_size;
817
#if COMPILE_TEMPLATE_MMX
818
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
819
    __asm__ volatile(
820
        "movq         %0, %%mm7     \n\t"
821
        "movq         %1, %%mm6     \n\t"
822
        ::"m"(red_15mask),"m"(green_15mask));
823
    mm_end = end - 15;
824
    while (s < mm_end) {
825
        __asm__ volatile(
826
            PREFETCH"   32%1            \n\t"
827
            "movd         %1, %%mm0     \n\t"
828
            "movd        3%1, %%mm3     \n\t"
829
            "punpckldq   6%1, %%mm0     \n\t"
830
            "punpckldq   9%1, %%mm3     \n\t"
831
            "movq      %%mm0, %%mm1     \n\t"
832
            "movq      %%mm0, %%mm2     \n\t"
833
            "movq      %%mm3, %%mm4     \n\t"
834
            "movq      %%mm3, %%mm5     \n\t"
835
            "psllq        $7, %%mm0     \n\t"
836
            "psllq        $7, %%mm3     \n\t"
837
            "pand      %%mm7, %%mm0     \n\t"
838
            "pand      %%mm7, %%mm3     \n\t"
839
            "psrlq        $6, %%mm1     \n\t"
840
            "psrlq        $6, %%mm4     \n\t"
841
            "pand      %%mm6, %%mm1     \n\t"
842
            "pand      %%mm6, %%mm4     \n\t"
843
            "psrlq       $19, %%mm2     \n\t"
844
            "psrlq       $19, %%mm5     \n\t"
845
            "pand         %2, %%mm2     \n\t"
846
            "pand         %2, %%mm5     \n\t"
847
            "por       %%mm1, %%mm0     \n\t"
848
            "por       %%mm4, %%mm3     \n\t"
849
            "por       %%mm2, %%mm0     \n\t"
850
            "por       %%mm5, %%mm3     \n\t"
851
            "psllq       $16, %%mm3     \n\t"
852
            "por       %%mm3, %%mm0     \n\t"
853
            MOVNTQ"    %%mm0, %0        \n\t"
854
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
855
        d += 4;
856
        s += 12;
857
    }
858
    __asm__ volatile(SFENCE:::"memory");
859
    __asm__ volatile(EMMS:::"memory");
860
#endif
861 222
    while (s < end) {
862 223
        const int r = *s++;
863 224
        const int g = *s++;
......
887 248
       |
888 249
   original bits
889 250
*/
890
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
251
static inline void rgb15tobgr24_c(const uint8_t *src, uint8_t *dst, long src_size)
891 252
{
892 253
    const uint16_t *end;
893
#if COMPILE_TEMPLATE_MMX
894
    const uint16_t *mm_end;
895
#endif
896 254
    uint8_t *d = dst;
897 255
    const uint16_t *s = (const uint16_t*)src;
898 256
    end = s + src_size/2;
899
#if COMPILE_TEMPLATE_MMX
900
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
901
    mm_end = end - 7;
902
    while (s < mm_end) {
903
        __asm__ volatile(
904
            PREFETCH"    32%1           \n\t"
905
            "movq          %1, %%mm0    \n\t"
906
            "movq          %1, %%mm1    \n\t"
907
            "movq          %1, %%mm2    \n\t"
908
            "pand          %2, %%mm0    \n\t"
909
            "pand          %3, %%mm1    \n\t"
910
            "pand          %4, %%mm2    \n\t"
911
            "psllq         $3, %%mm0    \n\t"
912
            "psrlq         $2, %%mm1    \n\t"
913
            "psrlq         $7, %%mm2    \n\t"
914
            "movq       %%mm0, %%mm3    \n\t"
915
            "movq       %%mm1, %%mm4    \n\t"
916
            "movq       %%mm2, %%mm5    \n\t"
917
            "punpcklwd     %5, %%mm0    \n\t"
918
            "punpcklwd     %5, %%mm1    \n\t"
919
            "punpcklwd     %5, %%mm2    \n\t"
920
            "punpckhwd     %5, %%mm3    \n\t"
921
            "punpckhwd     %5, %%mm4    \n\t"
922
            "punpckhwd     %5, %%mm5    \n\t"
923
            "psllq         $8, %%mm1    \n\t"
924
            "psllq        $16, %%mm2    \n\t"
925
            "por        %%mm1, %%mm0    \n\t"
926
            "por        %%mm2, %%mm0    \n\t"
927
            "psllq         $8, %%mm4    \n\t"
928
            "psllq        $16, %%mm5    \n\t"
929
            "por        %%mm4, %%mm3    \n\t"
930
            "por        %%mm5, %%mm3    \n\t"
931

  
932
            "movq       %%mm0, %%mm6    \n\t"
933
            "movq       %%mm3, %%mm7    \n\t"
934

  
935
            "movq         8%1, %%mm0    \n\t"
936
            "movq         8%1, %%mm1    \n\t"
937
            "movq         8%1, %%mm2    \n\t"
938
            "pand          %2, %%mm0    \n\t"
939
            "pand          %3, %%mm1    \n\t"
940
            "pand          %4, %%mm2    \n\t"
941
            "psllq         $3, %%mm0    \n\t"
942
            "psrlq         $2, %%mm1    \n\t"
943
            "psrlq         $7, %%mm2    \n\t"
944
            "movq       %%mm0, %%mm3    \n\t"
945
            "movq       %%mm1, %%mm4    \n\t"
946
            "movq       %%mm2, %%mm5    \n\t"
947
            "punpcklwd     %5, %%mm0    \n\t"
948
            "punpcklwd     %5, %%mm1    \n\t"
949
            "punpcklwd     %5, %%mm2    \n\t"
950
            "punpckhwd     %5, %%mm3    \n\t"
951
            "punpckhwd     %5, %%mm4    \n\t"
952
            "punpckhwd     %5, %%mm5    \n\t"
953
            "psllq         $8, %%mm1    \n\t"
954
            "psllq        $16, %%mm2    \n\t"
955
            "por        %%mm1, %%mm0    \n\t"
956
            "por        %%mm2, %%mm0    \n\t"
957
            "psllq         $8, %%mm4    \n\t"
958
            "psllq        $16, %%mm5    \n\t"
959
            "por        %%mm4, %%mm3    \n\t"
960
            "por        %%mm5, %%mm3    \n\t"
961

  
962
            :"=m"(*d)
963
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
964
            :"memory");
965
        /* borrowed 32 to 24 */
966
        __asm__ volatile(
967
            "movq       %%mm0, %%mm4    \n\t"
968
            "movq       %%mm3, %%mm5    \n\t"
969
            "movq       %%mm6, %%mm0    \n\t"
970
            "movq       %%mm7, %%mm1    \n\t"
971

  
972
            "movq       %%mm4, %%mm6    \n\t"
973
            "movq       %%mm5, %%mm7    \n\t"
974
            "movq       %%mm0, %%mm2    \n\t"
975
            "movq       %%mm1, %%mm3    \n\t"
976

  
977
            STORE_BGR24_MMX
978

  
979
            :"=m"(*d)
980
            :"m"(*s)
981
            :"memory");
982
        d += 24;
983
        s += 8;
984
    }
985
    __asm__ volatile(SFENCE:::"memory");
986
    __asm__ volatile(EMMS:::"memory");
987
#endif
988 257
    while (s < end) {
989 258
        register uint16_t bgr;
990 259
        bgr = *s++;
......
994 263
    }
995 264
}
996 265

  
997
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
266
static inline void rgb16tobgr24_c(const uint8_t *src, uint8_t *dst, long src_size)
998 267
{
999 268
    const uint16_t *end;
1000
#if COMPILE_TEMPLATE_MMX
1001
    const uint16_t *mm_end;
1002
#endif
1003 269
    uint8_t *d = (uint8_t *)dst;
1004 270
    const uint16_t *s = (const uint16_t *)src;
1005 271
    end = s + src_size/2;
1006
#if COMPILE_TEMPLATE_MMX
1007
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
1008
    mm_end = end - 7;
1009
    while (s < mm_end) {
1010
        __asm__ volatile(
1011
            PREFETCH"    32%1           \n\t"
1012
            "movq          %1, %%mm0    \n\t"
1013
            "movq          %1, %%mm1    \n\t"
1014
            "movq          %1, %%mm2    \n\t"
1015
            "pand          %2, %%mm0    \n\t"
1016
            "pand          %3, %%mm1    \n\t"
1017
            "pand          %4, %%mm2    \n\t"
1018
            "psllq         $3, %%mm0    \n\t"
1019
            "psrlq         $3, %%mm1    \n\t"
1020
            "psrlq         $8, %%mm2    \n\t"
1021
            "movq       %%mm0, %%mm3    \n\t"
1022
            "movq       %%mm1, %%mm4    \n\t"
1023
            "movq       %%mm2, %%mm5    \n\t"
1024
            "punpcklwd     %5, %%mm0    \n\t"
1025
            "punpcklwd     %5, %%mm1    \n\t"
1026
            "punpcklwd     %5, %%mm2    \n\t"
1027
            "punpckhwd     %5, %%mm3    \n\t"
1028
            "punpckhwd     %5, %%mm4    \n\t"
1029
            "punpckhwd     %5, %%mm5    \n\t"
1030
            "psllq         $8, %%mm1    \n\t"
1031
            "psllq        $16, %%mm2    \n\t"
1032
            "por        %%mm1, %%mm0    \n\t"
1033
            "por        %%mm2, %%mm0    \n\t"
1034
            "psllq         $8, %%mm4    \n\t"
1035
            "psllq        $16, %%mm5    \n\t"
1036
            "por        %%mm4, %%mm3    \n\t"
1037
            "por        %%mm5, %%mm3    \n\t"
1038

  
1039
            "movq       %%mm0, %%mm6    \n\t"
1040
            "movq       %%mm3, %%mm7    \n\t"
1041

  
1042
            "movq         8%1, %%mm0    \n\t"
1043
            "movq         8%1, %%mm1    \n\t"
1044
            "movq         8%1, %%mm2    \n\t"
1045
            "pand          %2, %%mm0    \n\t"
1046
            "pand          %3, %%mm1    \n\t"
1047
            "pand          %4, %%mm2    \n\t"
1048
            "psllq         $3, %%mm0    \n\t"
1049
            "psrlq         $3, %%mm1    \n\t"
1050
            "psrlq         $8, %%mm2    \n\t"
1051
            "movq       %%mm0, %%mm3    \n\t"
1052
            "movq       %%mm1, %%mm4    \n\t"
1053
            "movq       %%mm2, %%mm5    \n\t"
1054
            "punpcklwd     %5, %%mm0    \n\t"
1055
            "punpcklwd     %5, %%mm1    \n\t"
1056
            "punpcklwd     %5, %%mm2    \n\t"
1057
            "punpckhwd     %5, %%mm3    \n\t"
1058
            "punpckhwd     %5, %%mm4    \n\t"
1059
            "punpckhwd     %5, %%mm5    \n\t"
1060
            "psllq         $8, %%mm1    \n\t"
1061
            "psllq        $16, %%mm2    \n\t"
1062
            "por        %%mm1, %%mm0    \n\t"
1063
            "por        %%mm2, %%mm0    \n\t"
1064
            "psllq         $8, %%mm4    \n\t"
1065
            "psllq        $16, %%mm5    \n\t"
1066
            "por        %%mm4, %%mm3    \n\t"
1067
            "por        %%mm5, %%mm3    \n\t"
1068
            :"=m"(*d)
1069
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1070
            :"memory");
1071
        /* borrowed 32 to 24 */
1072
        __asm__ volatile(
1073
            "movq       %%mm0, %%mm4    \n\t"
1074
            "movq       %%mm3, %%mm5    \n\t"
1075
            "movq       %%mm6, %%mm0    \n\t"
1076
            "movq       %%mm7, %%mm1    \n\t"
1077

  
1078
            "movq       %%mm4, %%mm6    \n\t"
1079
            "movq       %%mm5, %%mm7    \n\t"
1080
            "movq       %%mm0, %%mm2    \n\t"
1081
            "movq       %%mm1, %%mm3    \n\t"
1082

  
1083
            STORE_BGR24_MMX
1084

  
1085
            :"=m"(*d)
1086
            :"m"(*s)
1087
            :"memory");
1088
        d += 24;
1089
        s += 8;
1090
    }
1091
    __asm__ volatile(SFENCE:::"memory");
1092
    __asm__ volatile(EMMS:::"memory");
1093
#endif
1094 272
    while (s < end) {
1095 273
        register uint16_t bgr;
1096 274
        bgr = *s++;
......
1119 297
    MOVNTQ"     %%mm0,  %0      \n\t"                               \
1120 298
    MOVNTQ"     %%mm3, 8%0      \n\t"                               \
1121 299

  
1122
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
300
static inline void rgb15to32_c(const uint8_t *src, uint8_t *dst, long src_size)
1123 301
{
1124 302
    const uint16_t *end;
1125
#if COMPILE_TEMPLATE_MMX
1126
    const uint16_t *mm_end;
1127
#endif
1128 303
    uint8_t *d = dst;
1129 304
    const uint16_t *s = (const uint16_t *)src;
1130 305
    end = s + src_size/2;
1131
#if COMPILE_TEMPLATE_MMX
1132
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
1133
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
1134
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
1135
    mm_end = end - 3;
1136
    while (s < mm_end) {
1137
        __asm__ volatile(
1138
            PREFETCH"    32%1           \n\t"
1139
            "movq          %1, %%mm0    \n\t"
1140
            "movq          %1, %%mm1    \n\t"
1141
            "movq          %1, %%mm2    \n\t"
1142
            "pand          %2, %%mm0    \n\t"
1143
            "pand          %3, %%mm1    \n\t"
1144
            "pand          %4, %%mm2    \n\t"
1145
            "psllq         $3, %%mm0    \n\t"
1146
            "psrlq         $2, %%mm1    \n\t"
1147
            "psrlq         $7, %%mm2    \n\t"
1148
            PACK_RGB32
1149
            :"=m"(*d)
1150
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1151
            :"memory");
1152
        d += 16;
1153
        s += 4;
1154
    }
1155
    __asm__ volatile(SFENCE:::"memory");
1156
    __asm__ volatile(EMMS:::"memory");
1157
#endif
1158 306
    while (s < end) {
1159 307
        register uint16_t bgr;
1160 308
        bgr = *s++;
......
1172 320
    }
1173 321
}
1174 322

  
1175
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
323
static inline void rgb16to32_c(const uint8_t *src, uint8_t *dst, long src_size)
1176 324
{
1177 325
    const uint16_t *end;
1178
#if COMPILE_TEMPLATE_MMX
1179
    const uint16_t *mm_end;
1180
#endif
1181 326
    uint8_t *d = dst;
1182 327
    const uint16_t *s = (const uint16_t*)src;
1183 328
    end = s + src_size/2;
1184
#if COMPILE_TEMPLATE_MMX
1185
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
1186
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
1187
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
1188
    mm_end = end - 3;
1189
    while (s < mm_end) {
1190
        __asm__ volatile(
1191
            PREFETCH"    32%1           \n\t"
1192
            "movq          %1, %%mm0    \n\t"
1193
            "movq          %1, %%mm1    \n\t"
1194
            "movq          %1, %%mm2    \n\t"
1195
            "pand          %2, %%mm0    \n\t"
1196
            "pand          %3, %%mm1    \n\t"
1197
            "pand          %4, %%mm2    \n\t"
1198
            "psllq         $3, %%mm0    \n\t"
1199
            "psrlq         $3, %%mm1    \n\t"
1200
            "psrlq         $8, %%mm2    \n\t"
1201
            PACK_RGB32
1202
            :"=m"(*d)
1203
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1204
            :"memory");
1205
        d += 16;
1206
        s += 4;
1207
    }
1208
    __asm__ volatile(SFENCE:::"memory");
1209
    __asm__ volatile(EMMS:::"memory");
1210
#endif
1211 329
    while (s < end) {
1212 330
        register uint16_t bgr;
1213 331
        bgr = *s++;
......
1225 343
    }
1226 344
}
1227 345

  
1228
static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, long src_size)
346
static inline void shuffle_bytes_2103_c(const uint8_t *src, uint8_t *dst, long src_size)
1229 347
{
1230
    x86_reg idx = 15 - src_size;
348
    int idx = 15 - src_size;
1231 349
    const uint8_t *s = src-idx;
1232 350
    uint8_t *d = dst-idx;
1233
#if COMPILE_TEMPLATE_MMX
1234
    __asm__ volatile(
1235
        "test          %0, %0           \n\t"
1236
        "jns           2f               \n\t"
1237
        PREFETCH"       (%1, %0)        \n\t"
1238
        "movq          %3, %%mm7        \n\t"
1239
        "pxor          %4, %%mm7        \n\t"
1240
        "movq       %%mm7, %%mm6        \n\t"
1241
        "pxor          %5, %%mm7        \n\t"
1242
        ".p2align       4               \n\t"
1243
        "1:                             \n\t"
1244
        PREFETCH"     32(%1, %0)        \n\t"
1245
        "movq           (%1, %0), %%mm0 \n\t"
1246
        "movq          8(%1, %0), %%mm1 \n\t"
1247
# if COMPILE_TEMPLATE_MMX2
1248
        "pshufw      $177, %%mm0, %%mm3 \n\t"
1249
        "pshufw      $177, %%mm1, %%mm5 \n\t"
1250
        "pand       %%mm7, %%mm0        \n\t"
1251
        "pand       %%mm6, %%mm3        \n\t"
1252
        "pand       %%mm7, %%mm1        \n\t"
1253
        "pand       %%mm6, %%mm5        \n\t"
1254
        "por        %%mm3, %%mm0        \n\t"
1255
        "por        %%mm5, %%mm1        \n\t"
1256
# else
1257
        "movq       %%mm0, %%mm2        \n\t"
1258
        "movq       %%mm1, %%mm4        \n\t"
1259
        "pand       %%mm7, %%mm0        \n\t"
1260
        "pand       %%mm6, %%mm2        \n\t"
1261
        "pand       %%mm7, %%mm1        \n\t"
1262
        "pand       %%mm6, %%mm4        \n\t"
1263
        "movq       %%mm2, %%mm3        \n\t"
1264
        "movq       %%mm4, %%mm5        \n\t"
1265
        "pslld        $16, %%mm2        \n\t"
1266
        "psrld        $16, %%mm3        \n\t"
1267
        "pslld        $16, %%mm4        \n\t"
1268
        "psrld        $16, %%mm5        \n\t"
1269
        "por        %%mm2, %%mm0        \n\t"
1270
        "por        %%mm4, %%mm1        \n\t"
1271
        "por        %%mm3, %%mm0        \n\t"
1272
        "por        %%mm5, %%mm1        \n\t"
1273
# endif
1274
        MOVNTQ"     %%mm0,  (%2, %0)    \n\t"
1275
        MOVNTQ"     %%mm1, 8(%2, %0)    \n\t"
1276
        "add          $16, %0           \n\t"
1277
        "js            1b               \n\t"
1278
        SFENCE"                         \n\t"
1279
        EMMS"                           \n\t"
1280
        "2:                             \n\t"
1281
        : "+&r"(idx)
1282
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1283
        : "memory");
1284
#endif
1285 351
    for (; idx<15; idx+=4) {
1286 352
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1287 353
        v &= 0xff00ff;
......
1289 355
    }
1290 356
}
1291 357

  
1292
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
358
static inline void rgb24tobgr24_c(const uint8_t *src, uint8_t *dst, long src_size)
1293 359
{
1294 360
    unsigned i;
1295
#if COMPILE_TEMPLATE_MMX
1296
    x86_reg mmx_size= 23 - src_size;
1297
    __asm__ volatile (
1298
        "test             %%"REG_a", %%"REG_a"          \n\t"
1299
        "jns                     2f                     \n\t"
1300
        "movq     "MANGLE(mask24r)", %%mm5              \n\t"
1301
        "movq     "MANGLE(mask24g)", %%mm6              \n\t"
1302
        "movq     "MANGLE(mask24b)", %%mm7              \n\t"
1303
        ".p2align                 4                     \n\t"
1304
        "1:                                             \n\t"
1305
        PREFETCH" 32(%1, %%"REG_a")                     \n\t"
1306
        "movq       (%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
1307
        "movq       (%1, %%"REG_a"), %%mm1              \n\t" // BGR BGR BG
1308
        "movq      2(%1, %%"REG_a"), %%mm2              \n\t" // R BGR BGR B
1309
        "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
1310
        "pand                 %%mm5, %%mm0              \n\t"
1311
        "pand                 %%mm6, %%mm1              \n\t"
1312
        "pand                 %%mm7, %%mm2              \n\t"
1313
        "por                  %%mm0, %%mm1              \n\t"
1314
        "por                  %%mm2, %%mm1              \n\t"
1315
        "movq      6(%1, %%"REG_a"), %%mm0              \n\t" // BGR BGR BG
1316
        MOVNTQ"               %%mm1,   (%2, %%"REG_a")  \n\t" // RGB RGB RG
1317
        "movq      8(%1, %%"REG_a"), %%mm1              \n\t" // R BGR BGR B
1318
        "movq     10(%1, %%"REG_a"), %%mm2              \n\t" // GR BGR BGR
1319
        "pand                 %%mm7, %%mm0              \n\t"
1320
        "pand                 %%mm5, %%mm1              \n\t"
1321
        "pand                 %%mm6, %%mm2              \n\t"
1322
        "por                  %%mm0, %%mm1              \n\t"
1323
        "por                  %%mm2, %%mm1              \n\t"
1324
        "movq     14(%1, %%"REG_a"), %%mm0              \n\t" // R BGR BGR B
1325
        MOVNTQ"               %%mm1,  8(%2, %%"REG_a")  \n\t" // B RGB RGB R
1326
        "movq     16(%1, %%"REG_a"), %%mm1              \n\t" // GR BGR BGR
1327
        "movq     18(%1, %%"REG_a"), %%mm2              \n\t" // BGR BGR BG
1328
        "pand                 %%mm6, %%mm0              \n\t"
1329
        "pand                 %%mm7, %%mm1              \n\t"
1330
        "pand                 %%mm5, %%mm2              \n\t"
1331
        "por                  %%mm0, %%mm1              \n\t"
1332
        "por                  %%mm2, %%mm1              \n\t"
1333
        MOVNTQ"               %%mm1, 16(%2, %%"REG_a")  \n\t"
1334
        "add                    $24, %%"REG_a"          \n\t"
1335
        " js                     1b                     \n\t"
1336
        "2:                                             \n\t"
1337
        : "+a" (mmx_size)
1338
        : "r" (src-mmx_size), "r"(dst-mmx_size)
1339
    );
1340

  
1341
    __asm__ volatile(SFENCE:::"memory");
1342
    __asm__ volatile(EMMS:::"memory");
1343

  
1344
    if (mmx_size==23) return; //finished, was multiple of 8
1345

  
1346
    src+= src_size;
1347
    dst+= src_size;
1348
    src_size= 23-mmx_size;
1349
    src-= src_size;
1350
    dst-= src_size;
1351
#endif
1352 361
    for (i=0; i<src_size; i+=3) {
1353 362
        register uint8_t x;
1354 363
        x          = src[i + 2];
......
1358 367
    }
1359 368
}
1360 369

  
1361
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1362
                                           long width, long height,
1363
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
370
static inline void yuvPlanartoyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
371
                                     const uint8_t *vsrc, uint8_t *dst,
372
                                     long width, long height,
373
                                     long lumStride, long chromStride,
374
                                     long dstStride, long vertLumPerChroma)
1364 375
{
1365 376
    long y;
1366
    const x86_reg chromWidth= width>>1;
377
    const int chromWidth = width >> 1;
1367 378
    for (y=0; y<height; y++) {
1368
#if COMPILE_TEMPLATE_MMX
1369
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1370
        __asm__ volatile(
1371
            "xor                 %%"REG_a", %%"REG_a"   \n\t"
1372
            ".p2align                    4              \n\t"
1373
            "1:                                         \n\t"
1374
            PREFETCH"    32(%1, %%"REG_a", 2)           \n\t"
1375
            PREFETCH"    32(%2, %%"REG_a")              \n\t"
1376
            PREFETCH"    32(%3, %%"REG_a")              \n\t"
1377
            "movq          (%2, %%"REG_a"), %%mm0       \n\t" // U(0)
1378
            "movq                    %%mm0, %%mm2       \n\t" // U(0)
1379
            "movq          (%3, %%"REG_a"), %%mm1       \n\t" // V(0)
1380
            "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
1381
            "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)
1382

  
1383
            "movq        (%1, %%"REG_a",2), %%mm3       \n\t" // Y(0)
1384
            "movq       8(%1, %%"REG_a",2), %%mm5       \n\t" // Y(8)
1385
            "movq                    %%mm3, %%mm4       \n\t" // Y(0)
1386
            "movq                    %%mm5, %%mm6       \n\t" // Y(8)
1387
            "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
1388
            "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
1389
            "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
1390
            "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)
1391

  
1392
            MOVNTQ"                  %%mm3,   (%0, %%"REG_a", 4)    \n\t"
1393
            MOVNTQ"                  %%mm4,  8(%0, %%"REG_a", 4)    \n\t"
1394
            MOVNTQ"                  %%mm5, 16(%0, %%"REG_a", 4)    \n\t"
1395
            MOVNTQ"                  %%mm6, 24(%0, %%"REG_a", 4)    \n\t"
1396

  
1397
            "add                        $8, %%"REG_a"   \n\t"
1398
            "cmp                        %4, %%"REG_a"   \n\t"
1399
            " jb                        1b              \n\t"
1400
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1401
            : "%"REG_a
1402
        );
1403
#else
1404

  
1405
#if ARCH_ALPHA && HAVE_MVI
1406
#define pl2yuy2(n)                  \
1407
    y1 = yc[n];                     \
1408
    y2 = yc2[n];                    \
1409
    u = uc[n];                      \
1410
    v = vc[n];                      \
1411
    __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1));  \
1412
    __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2));  \
1413
    __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u));    \
1414
    __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v));    \
1415
    yuv1 = (u << 8) + (v << 24);                \
1416
    yuv2 = yuv1 + y2;               \
1417
    yuv1 += y1;                     \
1418
    qdst[n]  = yuv1;                \
1419
    qdst2[n] = yuv2;
1420

  
1421
        int i;
1422
        uint64_t *qdst = (uint64_t *) dst;
1423
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1424
        const uint32_t *yc = (uint32_t *) ysrc;
1425
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1426
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1427
        for (i = 0; i < chromWidth; i += 8) {
1428
            uint64_t y1, y2, yuv1, yuv2;
1429
            uint64_t u, v;
1430
            /* Prefetch */
1431
            __asm__("ldq $31,64(%0)" :: "r"(yc));
1432
            __asm__("ldq $31,64(%0)" :: "r"(yc2));
1433
            __asm__("ldq $31,64(%0)" :: "r"(uc));
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff