 
 #include <stddef.h>
 
-#undef PREFETCH
-#undef MOVNTQ
-#undef EMMS
-#undef SFENCE
-#undef MMREG_SIZE
-#undef PAVGB
-
-#if COMPILE_TEMPLATE_SSE2
-#define MMREG_SIZE 16
-#else
-#define MMREG_SIZE 8
-#endif
-
-#if COMPILE_TEMPLATE_AMD3DNOW
-#define PREFETCH "prefetch"
-#define PAVGB "pavgusb"
-#elif COMPILE_TEMPLATE_MMX2
-#define PREFETCH "prefetchnta"
-#define PAVGB "pavgb"
-#else
-#define PREFETCH " # nop"
-#endif
-
-#if COMPILE_TEMPLATE_AMD3DNOW
-/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
-#define EMMS "femms"
-#else
-#define EMMS "emms"
-#endif
-
-#if COMPILE_TEMPLATE_MMX2
-#define MOVNTQ "movntq"
-#define SFENCE "sfence"
-#else
-#define MOVNTQ "movq"
-#define SFENCE " # nop"
-#endif
-
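The removed macros expand to instruction mnemonics that are pasted straight into the inline-asm string templates below, so one source template compiles to plain MMX, MMX2 (non-temporal), or 3DNow! code. A minimal sketch of the pattern every kernel in this file follows (illustration only, not part of the diff; copy8 is a hypothetical name, and the snippet assumes the macros above are in scope):

    /* Sketch: how the PREFETCH/MOVNTQ/SFENCE/EMMS strings are consumed. */
    static void copy8(const uint8_t *src, uint8_t *dst)
    {
        __asm__ volatile(PREFETCH" 32%1 \n\t"  /* prefetch ahead of the read    */
                         "movq %1, %%mm0 \n\t"
                         MOVNTQ" %%mm0, %0"    /* non-temporal store on MMX2,
                                                  a plain movq otherwise       */
                         : "=m"(*dst) : "m"(*src) : "memory");
        __asm__ volatile(SFENCE:::"memory");   /* order the NT stores          */
        __asm__ volatile(EMMS:::"memory");     /* hand registers back to the FPU */
    }

Non-temporal stores bypass the cache, which is why each MMX2 kernel ends with SFENCE before EMMS.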
-static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void rgb24tobgr32_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
     uint8_t *dest = dst;
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
-    const uint8_t *mm_end;
-#endif
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
-    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
-    mm_end = end - 23;
-    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
-    while (s < mm_end) {
-        __asm__ volatile(
-            PREFETCH" 32%1 \n\t"
-            "movd %1, %%mm0 \n\t"
-            "punpckldq 3%1, %%mm0 \n\t"
-            "movd 6%1, %%mm1 \n\t"
-            "punpckldq 9%1, %%mm1 \n\t"
-            "movd 12%1, %%mm2 \n\t"
-            "punpckldq 15%1, %%mm2 \n\t"
-            "movd 18%1, %%mm3 \n\t"
-            "punpckldq 21%1, %%mm3 \n\t"
-            "por %%mm7, %%mm0 \n\t"
-            "por %%mm7, %%mm1 \n\t"
-            "por %%mm7, %%mm2 \n\t"
-            "por %%mm7, %%mm3 \n\t"
-            MOVNTQ" %%mm0, %0 \n\t"
-            MOVNTQ" %%mm1, 8%0 \n\t"
-            MOVNTQ" %%mm2, 16%0 \n\t"
-            MOVNTQ" %%mm3, 24%0"
-            :"=m"(*dest)
-            :"m"(*s)
-            :"memory");
-        dest += 32;
-        s += 24;
-    }
-    __asm__ volatile(SFENCE:::"memory");
-    __asm__ volatile(EMMS:::"memory");
-#endif
+
     while (s < end) {
 #if HAVE_BIGENDIAN
         /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
...
     }
 }
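The removed MMX loop is the scalar conversion done eight pixels at a time: each movd/punpckldq pair gathers two 3-byte pixels into one quadword at a 3-byte stride, and the por with mm7 (loaded from mask32a) fills in the fourth byte. A scalar model of one pixel step (a sketch; little-endian, and reading mask32a as 0xFF000000 per pixel is my assumption):

    #include <stdint.h>

    static void pack_pixel_24to32(const uint8_t *s, uint8_t *dest)
    {
        uint32_t px = s[0] | (s[1] << 8) | ((uint32_t)s[2] << 16); /* movd/punpckldq gather */
        *(uint32_t *)dest = px | 0xFF000000u;                      /* por with mask32a      */
    }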
 
-#define STORE_BGR24_MMX \
-    "psrlq $8, %%mm2 \n\t" \
-    "psrlq $8, %%mm3 \n\t" \
-    "psrlq $8, %%mm6 \n\t" \
-    "psrlq $8, %%mm7 \n\t" \
-    "pand "MANGLE(mask24l)", %%mm0\n\t" \
-    "pand "MANGLE(mask24l)", %%mm1\n\t" \
-    "pand "MANGLE(mask24l)", %%mm4\n\t" \
-    "pand "MANGLE(mask24l)", %%mm5\n\t" \
-    "pand "MANGLE(mask24h)", %%mm2\n\t" \
-    "pand "MANGLE(mask24h)", %%mm3\n\t" \
-    "pand "MANGLE(mask24h)", %%mm6\n\t" \
-    "pand "MANGLE(mask24h)", %%mm7\n\t" \
-    "por %%mm2, %%mm0 \n\t" \
-    "por %%mm3, %%mm1 \n\t" \
-    "por %%mm6, %%mm4 \n\t" \
-    "por %%mm7, %%mm5 \n\t" \
-    \
-    "movq %%mm1, %%mm2 \n\t" \
-    "movq %%mm4, %%mm3 \n\t" \
-    "psllq $48, %%mm2 \n\t" \
-    "psllq $32, %%mm3 \n\t" \
-    "pand "MANGLE(mask24hh)", %%mm2\n\t" \
-    "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
-    "por %%mm2, %%mm0 \n\t" \
-    "psrlq $16, %%mm1 \n\t" \
-    "psrlq $32, %%mm4 \n\t" \
-    "psllq $16, %%mm5 \n\t" \
-    "por %%mm3, %%mm1 \n\t" \
-    "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
-    "por %%mm5, %%mm4 \n\t" \
-    \
-    MOVNTQ" %%mm0, %0 \n\t" \
-    MOVNTQ" %%mm1, 8%0 \n\t" \
-    MOVNTQ" %%mm4, 16%0"
-
-
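STORE_BGR24_MMX packs eight 4-byte BGR0 pixels held in mm0-mm7 down to 24 contiguous bytes: the psrlq/pand/por stage squeezes each pair of pixels to six bytes inside its quadword, and the second stage shifts the surviving bytes across register boundaries so three MOVNTQs emit exactly 24 bytes. The same repacking per pixel in scalar form (a sketch of the idea, not the macro itself; pack_bgr24 is a name I made up):

    #include <stdint.h>

    /* Pack four 32-bit BGR0 pixels (B,G,R in the low three bytes) into 12 bytes. */
    static void pack_bgr24(const uint32_t px[4], uint8_t *out)
    {
        int i;
        for (i = 0; i < 4; i++) {
            out[3*i + 0] = (uint8_t)(px[i]);        /* B */
            out[3*i + 1] = (uint8_t)(px[i] >> 8);   /* G */
            out[3*i + 2] = (uint8_t)(px[i] >> 16);  /* R */
        }
    }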
-static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void rgb32tobgr24_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
     uint8_t *dest = dst;
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
-    const uint8_t *mm_end;
-#endif
+
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
-    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
-    mm_end = end - 31;
-    while (s < mm_end) {
-        __asm__ volatile(
-            PREFETCH" 32%1 \n\t"
-            "movq %1, %%mm0 \n\t"
-            "movq 8%1, %%mm1 \n\t"
-            "movq 16%1, %%mm4 \n\t"
-            "movq 24%1, %%mm5 \n\t"
-            "movq %%mm0, %%mm2 \n\t"
-            "movq %%mm1, %%mm3 \n\t"
-            "movq %%mm4, %%mm6 \n\t"
-            "movq %%mm5, %%mm7 \n\t"
-            STORE_BGR24_MMX
-            :"=m"(*dest)
-            :"m"(*s)
-            :"memory");
-        dest += 24;
-        s += 32;
-    }
-    __asm__ volatile(SFENCE:::"memory");
-    __asm__ volatile(EMMS:::"memory");
-#endif
+
     while (s < end) {
 #if HAVE_BIGENDIAN
         /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
...
   MMX2, 3DNOW optimization by Nick Kurshev
   32-bit C version, and and&add trick by Michael Niedermayer
 */
-static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void rgb15to16_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
     register const uint8_t* s=src;
     register uint8_t* d=dst;
     register const uint8_t *end;
     const uint8_t *mm_end;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
-    __asm__ volatile(PREFETCH" %0"::"m"(*s));
-    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
-    mm_end = end - 15;
-    while (s<mm_end) {
-        __asm__ volatile(
-            PREFETCH" 32%1 \n\t"
-            "movq %1, %%mm0 \n\t"
-            "movq 8%1, %%mm2 \n\t"
-            "movq %%mm0, %%mm1 \n\t"
-            "movq %%mm2, %%mm3 \n\t"
-            "pand %%mm4, %%mm0 \n\t"
-            "pand %%mm4, %%mm2 \n\t"
-            "paddw %%mm1, %%mm0 \n\t"
-            "paddw %%mm3, %%mm2 \n\t"
-            MOVNTQ" %%mm0, %0 \n\t"
-            MOVNTQ" %%mm2, 8%0"
-            :"=m"(*d)
-            :"m"(*s)
-            );
-        d+=16;
-        s+=16;
-    }
-    __asm__ volatile(SFENCE:::"memory");
-    __asm__ volatile(EMMS:::"memory");
-#endif
     mm_end = end - 3;
     while (s < mm_end) {
         register unsigned x= *((const uint32_t *)s);
...
     }
 }
 
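The "and&add trick" credited in the comment is what the removed paddw loop computes: adding a word to a masked copy of itself doubles exactly the masked bits, i.e. shifts them left by one, so red and green move up without disturbing blue. Worked scalar example (assuming mask15s holds 0x7FE0 in each 16-bit lane, which is what this arithmetic implies):

    #include <stdint.h>

    /* RGB555 -> RGB565: x = 0RRRRRGGGGGBBBBB. */
    static uint16_t rgb15_word_to16(uint16_t x)
    {
        /* x + (x & 0x7FE0) doubles the R and G fields only; the five green
         * bits land in the top of the new six-bit field (its low bit stays 0)
         * and blue is untouched. */
        return x + (x & 0x7FE0);
    }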
-static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void rgb16to15_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
     register const uint8_t* s=src;
     register uint8_t* d=dst;
     register const uint8_t *end;
     const uint8_t *mm_end;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
-    __asm__ volatile(PREFETCH" %0"::"m"(*s));
-    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
-    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
-    mm_end = end - 15;
-    while (s<mm_end) {
-        __asm__ volatile(
-            PREFETCH" 32%1 \n\t"
-            "movq %1, %%mm0 \n\t"
-            "movq 8%1, %%mm2 \n\t"
-            "movq %%mm0, %%mm1 \n\t"
-            "movq %%mm2, %%mm3 \n\t"
-            "psrlq $1, %%mm0 \n\t"
-            "psrlq $1, %%mm2 \n\t"
-            "pand %%mm7, %%mm0 \n\t"
-            "pand %%mm7, %%mm2 \n\t"
-            "pand %%mm6, %%mm1 \n\t"
-            "pand %%mm6, %%mm3 \n\t"
-            "por %%mm1, %%mm0 \n\t"
-            "por %%mm3, %%mm2 \n\t"
-            MOVNTQ" %%mm0, %0 \n\t"
-            MOVNTQ" %%mm2, 8%0"
-            :"=m"(*d)
-            :"m"(*s)
-            );
-        d+=16;
-        s+=16;
-    }
-    __asm__ volatile(SFENCE:::"memory");
-    __asm__ volatile(EMMS:::"memory");
-#endif
+
     mm_end = end - 3;
     while (s < mm_end) {
         register uint32_t x= *((const uint32_t*)s);
...
     }
 }
 
-static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void rgb32to16_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
-    const uint8_t *mm_end;
-#endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
-    mm_end = end - 15;
-#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
-    __asm__ volatile(
-        "movq %3, %%mm5 \n\t"
-        "movq %4, %%mm6 \n\t"
-        "movq %5, %%mm7 \n\t"
-        "jmp 2f \n\t"
-        ".p2align 4 \n\t"
-        "1: \n\t"
-        PREFETCH" 32(%1) \n\t"
-        "movd (%1), %%mm0 \n\t"
-        "movd 4(%1), %%mm3 \n\t"
-        "punpckldq 8(%1), %%mm0 \n\t"
-        "punpckldq 12(%1), %%mm3 \n\t"
-        "movq %%mm0, %%mm1 \n\t"
-        "movq %%mm3, %%mm4 \n\t"
-        "pand %%mm6, %%mm0 \n\t"
-        "pand %%mm6, %%mm3 \n\t"
-        "pmaddwd %%mm7, %%mm0 \n\t"
-        "pmaddwd %%mm7, %%mm3 \n\t"
-        "pand %%mm5, %%mm1 \n\t"
-        "pand %%mm5, %%mm4 \n\t"
-        "por %%mm1, %%mm0 \n\t"
-        "por %%mm4, %%mm3 \n\t"
-        "psrld $5, %%mm0 \n\t"
-        "pslld $11, %%mm3 \n\t"
-        "por %%mm3, %%mm0 \n\t"
-        MOVNTQ" %%mm0, (%0) \n\t"
-        "add $16, %1 \n\t"
-        "add $8, %0 \n\t"
-        "2: \n\t"
-        "cmp %2, %1 \n\t"
-        " jb 1b \n\t"
-        : "+r" (d), "+r"(s)
-        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
-    );
-#else
-    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
-    __asm__ volatile(
-        "movq %0, %%mm7 \n\t"
-        "movq %1, %%mm6 \n\t"
-        ::"m"(red_16mask),"m"(green_16mask));
-    while (s < mm_end) {
-        __asm__ volatile(
-            PREFETCH" 32%1 \n\t"
-            "movd %1, %%mm0 \n\t"
-            "movd 4%1, %%mm3 \n\t"
-            "punpckldq 8%1, %%mm0 \n\t"
-            "punpckldq 12%1, %%mm3 \n\t"
-            "movq %%mm0, %%mm1 \n\t"
-            "movq %%mm0, %%mm2 \n\t"
-            "movq %%mm3, %%mm4 \n\t"
-            "movq %%mm3, %%mm5 \n\t"
-            "psrlq $3, %%mm0 \n\t"
-            "psrlq $3, %%mm3 \n\t"
-            "pand %2, %%mm0 \n\t"
-            "pand %2, %%mm3 \n\t"
-            "psrlq $5, %%mm1 \n\t"
-            "psrlq $5, %%mm4 \n\t"
-            "pand %%mm6, %%mm1 \n\t"
-            "pand %%mm6, %%mm4 \n\t"
-            "psrlq $8, %%mm2 \n\t"
-            "psrlq $8, %%mm5 \n\t"
-            "pand %%mm7, %%mm2 \n\t"
-            "pand %%mm7, %%mm5 \n\t"
-            "por %%mm1, %%mm0 \n\t"
-            "por %%mm4, %%mm3 \n\t"
-            "por %%mm2, %%mm0 \n\t"
-            "por %%mm5, %%mm3 \n\t"
-            "psllq $16, %%mm3 \n\t"
-            "por %%mm3, %%mm0 \n\t"
-            MOVNTQ" %%mm0, %0 \n\t"
-            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
-        d += 4;
-        s += 16;
-    }
-#endif
-    __asm__ volatile(SFENCE:::"memory");
-    __asm__ volatile(EMMS:::"memory");
-#endif
+
     while (s < end) {
         register int rgb = *(const uint32_t*)s; s += 4;
         *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
     }
 }
 
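The removed "#if 1" variant trades two shift/mask chains for one multiply-accumulate: after masking, each dword holds the blue field in its low word and the red field in its high word, so a single pmaddwd with per-word constants slides both fields into their 565 slots at once, and green is simply or-ed in before the final psrld $5. A scalar model of what pmaddwd computes per pixel (a sketch; the values 0x00F800F8, 0x0000FC00 and 4/0x2000 are my assumptions about mask3216br, mask3216g and mul3216):

    #include <stdint.h>

    static uint16_t rgb32_px_to16(uint32_t p)    /* p = 0x00RRGGBB */
    {
        uint32_t br = p & 0x00F800F8;            /* pand mask3216br: top 5 bits of B and R */
        uint32_t g  = p & 0x0000FC00;            /* pand mask3216g:  top 6 bits of G       */
        uint32_t x  = (br & 0xFFFF) * 4          /* pmaddwd mul3216: low word  * 4         */
                    + (br >> 16)    * 0x2000;    /*                  high word * 0x2000    */
        return (uint16_t)((x | g) >> 5);         /* por + psrld $5 -> RRRRRGGGGGGBBBBB     */
    }

rgb32to15 below is the same construction with mask3215g/mul3215 and a psrld $6 / pslld $10 finish for the 555 layout.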
-static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void rgb32tobgr16_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
-    const uint8_t *mm_end;
-#endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
-    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
-    __asm__ volatile(
-        "movq %0, %%mm7 \n\t"
-        "movq %1, %%mm6 \n\t"
-        ::"m"(red_16mask),"m"(green_16mask));
-    mm_end = end - 15;
-    while (s < mm_end) {
-        __asm__ volatile(
-            PREFETCH" 32%1 \n\t"
-            "movd %1, %%mm0 \n\t"
-            "movd 4%1, %%mm3 \n\t"
-            "punpckldq 8%1, %%mm0 \n\t"
-            "punpckldq 12%1, %%mm3 \n\t"
-            "movq %%mm0, %%mm1 \n\t"
-            "movq %%mm0, %%mm2 \n\t"
-            "movq %%mm3, %%mm4 \n\t"
-            "movq %%mm3, %%mm5 \n\t"
-            "psllq $8, %%mm0 \n\t"
-            "psllq $8, %%mm3 \n\t"
-            "pand %%mm7, %%mm0 \n\t"
-            "pand %%mm7, %%mm3 \n\t"
-            "psrlq $5, %%mm1 \n\t"
-            "psrlq $5, %%mm4 \n\t"
-            "pand %%mm6, %%mm1 \n\t"
-            "pand %%mm6, %%mm4 \n\t"
-            "psrlq $19, %%mm2 \n\t"
-            "psrlq $19, %%mm5 \n\t"
-            "pand %2, %%mm2 \n\t"
-            "pand %2, %%mm5 \n\t"
-            "por %%mm1, %%mm0 \n\t"
-            "por %%mm4, %%mm3 \n\t"
-            "por %%mm2, %%mm0 \n\t"
-            "por %%mm5, %%mm3 \n\t"
-            "psllq $16, %%mm3 \n\t"
-            "por %%mm3, %%mm0 \n\t"
-            MOVNTQ" %%mm0, %0 \n\t"
-            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
-        d += 4;
-        s += 16;
-    }
-    __asm__ volatile(SFENCE:::"memory");
-    __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         register int rgb = *(const uint32_t*)s; s += 4;
         *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
     }
 }
 
-static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void rgb32to15_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
-    const uint8_t *mm_end;
-#endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
-    mm_end = end - 15;
-#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
-    __asm__ volatile(
-        "movq %3, %%mm5 \n\t"
-        "movq %4, %%mm6 \n\t"
-        "movq %5, %%mm7 \n\t"
-        "jmp 2f \n\t"
-        ".p2align 4 \n\t"
-        "1: \n\t"
-        PREFETCH" 32(%1) \n\t"
-        "movd (%1), %%mm0 \n\t"
-        "movd 4(%1), %%mm3 \n\t"
-        "punpckldq 8(%1), %%mm0 \n\t"
-        "punpckldq 12(%1), %%mm3 \n\t"
-        "movq %%mm0, %%mm1 \n\t"
-        "movq %%mm3, %%mm4 \n\t"
-        "pand %%mm6, %%mm0 \n\t"
-        "pand %%mm6, %%mm3 \n\t"
-        "pmaddwd %%mm7, %%mm0 \n\t"
-        "pmaddwd %%mm7, %%mm3 \n\t"
-        "pand %%mm5, %%mm1 \n\t"
-        "pand %%mm5, %%mm4 \n\t"
-        "por %%mm1, %%mm0 \n\t"
-        "por %%mm4, %%mm3 \n\t"
-        "psrld $6, %%mm0 \n\t"
-        "pslld $10, %%mm3 \n\t"
-        "por %%mm3, %%mm0 \n\t"
-        MOVNTQ" %%mm0, (%0) \n\t"
-        "add $16, %1 \n\t"
-        "add $8, %0 \n\t"
-        "2: \n\t"
-        "cmp %2, %1 \n\t"
-        " jb 1b \n\t"
-        : "+r" (d), "+r"(s)
-        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
-    );
-#else
-    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
-    __asm__ volatile(
-        "movq %0, %%mm7 \n\t"
-        "movq %1, %%mm6 \n\t"
-        ::"m"(red_15mask),"m"(green_15mask));
-    while (s < mm_end) {
-        __asm__ volatile(
-            PREFETCH" 32%1 \n\t"
-            "movd %1, %%mm0 \n\t"
-            "movd 4%1, %%mm3 \n\t"
-            "punpckldq 8%1, %%mm0 \n\t"
-            "punpckldq 12%1, %%mm3 \n\t"
-            "movq %%mm0, %%mm1 \n\t"
-            "movq %%mm0, %%mm2 \n\t"
-            "movq %%mm3, %%mm4 \n\t"
-            "movq %%mm3, %%mm5 \n\t"
-            "psrlq $3, %%mm0 \n\t"
-            "psrlq $3, %%mm3 \n\t"
-            "pand %2, %%mm0 \n\t"
-            "pand %2, %%mm3 \n\t"
-            "psrlq $6, %%mm1 \n\t"
-            "psrlq $6, %%mm4 \n\t"
-            "pand %%mm6, %%mm1 \n\t"
-            "pand %%mm6, %%mm4 \n\t"
-            "psrlq $9, %%mm2 \n\t"
-            "psrlq $9, %%mm5 \n\t"
-            "pand %%mm7, %%mm2 \n\t"
-            "pand %%mm7, %%mm5 \n\t"
-            "por %%mm1, %%mm0 \n\t"
-            "por %%mm4, %%mm3 \n\t"
-            "por %%mm2, %%mm0 \n\t"
-            "por %%mm5, %%mm3 \n\t"
-            "psllq $16, %%mm3 \n\t"
-            "por %%mm3, %%mm0 \n\t"
-            MOVNTQ" %%mm0, %0 \n\t"
-            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
-        d += 4;
-        s += 16;
-    }
-#endif
-    __asm__ volatile(SFENCE:::"memory");
-    __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         register int rgb = *(const uint32_t*)s; s += 4;
         *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
     }
 }
 
-static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void rgb32tobgr15_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
-    const uint8_t *mm_end;
-#endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
-    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
-    __asm__ volatile(
-        "movq %0, %%mm7 \n\t"
-        "movq %1, %%mm6 \n\t"
-        ::"m"(red_15mask),"m"(green_15mask));
-    mm_end = end - 15;
-    while (s < mm_end) {
-        __asm__ volatile(
-            PREFETCH" 32%1 \n\t"
-            "movd %1, %%mm0 \n\t"
-            "movd 4%1, %%mm3 \n\t"
-            "punpckldq 8%1, %%mm0 \n\t"
-            "punpckldq 12%1, %%mm3 \n\t"
-            "movq %%mm0, %%mm1 \n\t"
-            "movq %%mm0, %%mm2 \n\t"
-            "movq %%mm3, %%mm4 \n\t"
-            "movq %%mm3, %%mm5 \n\t"
-            "psllq $7, %%mm0 \n\t"
-            "psllq $7, %%mm3 \n\t"
-            "pand %%mm7, %%mm0 \n\t"
-            "pand %%mm7, %%mm3 \n\t"
-            "psrlq $6, %%mm1 \n\t"
-            "psrlq $6, %%mm4 \n\t"
-            "pand %%mm6, %%mm1 \n\t"
-            "pand %%mm6, %%mm4 \n\t"
-            "psrlq $19, %%mm2 \n\t"
-            "psrlq $19, %%mm5 \n\t"
-            "pand %2, %%mm2 \n\t"
-            "pand %2, %%mm5 \n\t"
-            "por %%mm1, %%mm0 \n\t"
-            "por %%mm4, %%mm3 \n\t"
-            "por %%mm2, %%mm0 \n\t"
-            "por %%mm5, %%mm3 \n\t"
-            "psllq $16, %%mm3 \n\t"
-            "por %%mm3, %%mm0 \n\t"
-            MOVNTQ" %%mm0, %0 \n\t"
-            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
-        d += 4;
-        s += 16;
-    }
-    __asm__ volatile(SFENCE:::"memory");
-    __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         register int rgb = *(const uint32_t*)s; s += 4;
         *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
     }
 }
 
-static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void rgb24tobgr16_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
-    const uint8_t *mm_end;
-#endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
-    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
-    __asm__ volatile(
-        "movq %0, %%mm7 \n\t"
-        "movq %1, %%mm6 \n\t"
-        ::"m"(red_16mask),"m"(green_16mask));
-    mm_end = end - 11;
-    while (s < mm_end) {
-        __asm__ volatile(
-            PREFETCH" 32%1 \n\t"
-            "movd %1, %%mm0 \n\t"
-            "movd 3%1, %%mm3 \n\t"
-            "punpckldq 6%1, %%mm0 \n\t"
-            "punpckldq 9%1, %%mm3 \n\t"
-            "movq %%mm0, %%mm1 \n\t"
-            "movq %%mm0, %%mm2 \n\t"
-            "movq %%mm3, %%mm4 \n\t"
-            "movq %%mm3, %%mm5 \n\t"
-            "psrlq $3, %%mm0 \n\t"
-            "psrlq $3, %%mm3 \n\t"
-            "pand %2, %%mm0 \n\t"
-            "pand %2, %%mm3 \n\t"
-            "psrlq $5, %%mm1 \n\t"
-            "psrlq $5, %%mm4 \n\t"
-            "pand %%mm6, %%mm1 \n\t"
-            "pand %%mm6, %%mm4 \n\t"
-            "psrlq $8, %%mm2 \n\t"
-            "psrlq $8, %%mm5 \n\t"
-            "pand %%mm7, %%mm2 \n\t"
-            "pand %%mm7, %%mm5 \n\t"
-            "por %%mm1, %%mm0 \n\t"
-            "por %%mm4, %%mm3 \n\t"
-            "por %%mm2, %%mm0 \n\t"
-            "por %%mm5, %%mm3 \n\t"
-            "psllq $16, %%mm3 \n\t"
-            "por %%mm3, %%mm0 \n\t"
-            MOVNTQ" %%mm0, %0 \n\t"
-            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
-        d += 4;
-        s += 12;
-    }
-    __asm__ volatile(SFENCE:::"memory");
-    __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         const int b = *s++;
         const int g = *s++;
...
     }
 }
 
-static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void rgb24to16_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
-    const uint8_t *mm_end;
-#endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
-    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
-    __asm__ volatile(
-        "movq %0, %%mm7 \n\t"
-        "movq %1, %%mm6 \n\t"
-        ::"m"(red_16mask),"m"(green_16mask));
-    mm_end = end - 15;
-    while (s < mm_end) {
-        __asm__ volatile(
-            PREFETCH" 32%1 \n\t"
-            "movd %1, %%mm0 \n\t"
-            "movd 3%1, %%mm3 \n\t"
-            "punpckldq 6%1, %%mm0 \n\t"
-            "punpckldq 9%1, %%mm3 \n\t"
-            "movq %%mm0, %%mm1 \n\t"
-            "movq %%mm0, %%mm2 \n\t"
-            "movq %%mm3, %%mm4 \n\t"
-            "movq %%mm3, %%mm5 \n\t"
-            "psllq $8, %%mm0 \n\t"
-            "psllq $8, %%mm3 \n\t"
-            "pand %%mm7, %%mm0 \n\t"
-            "pand %%mm7, %%mm3 \n\t"
-            "psrlq $5, %%mm1 \n\t"
-            "psrlq $5, %%mm4 \n\t"
-            "pand %%mm6, %%mm1 \n\t"
-            "pand %%mm6, %%mm4 \n\t"
-            "psrlq $19, %%mm2 \n\t"
-            "psrlq $19, %%mm5 \n\t"
-            "pand %2, %%mm2 \n\t"
-            "pand %2, %%mm5 \n\t"
-            "por %%mm1, %%mm0 \n\t"
-            "por %%mm4, %%mm3 \n\t"
-            "por %%mm2, %%mm0 \n\t"
-            "por %%mm5, %%mm3 \n\t"
-            "psllq $16, %%mm3 \n\t"
-            "por %%mm3, %%mm0 \n\t"
-            MOVNTQ" %%mm0, %0 \n\t"
-            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
-        d += 4;
-        s += 12;
-    }
-    __asm__ volatile(SFENCE:::"memory");
-    __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         const int r = *s++;
         const int g = *s++;
...
     }
 }
 
-static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void rgb24tobgr15_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
-    const uint8_t *mm_end;
-#endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
-    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
-    __asm__ volatile(
-        "movq %0, %%mm7 \n\t"
-        "movq %1, %%mm6 \n\t"
-        ::"m"(red_15mask),"m"(green_15mask));
-    mm_end = end - 11;
-    while (s < mm_end) {
-        __asm__ volatile(
-            PREFETCH" 32%1 \n\t"
-            "movd %1, %%mm0 \n\t"
-            "movd 3%1, %%mm3 \n\t"
-            "punpckldq 6%1, %%mm0 \n\t"
-            "punpckldq 9%1, %%mm3 \n\t"
-            "movq %%mm0, %%mm1 \n\t"
-            "movq %%mm0, %%mm2 \n\t"
-            "movq %%mm3, %%mm4 \n\t"
-            "movq %%mm3, %%mm5 \n\t"
-            "psrlq $3, %%mm0 \n\t"
-            "psrlq $3, %%mm3 \n\t"
-            "pand %2, %%mm0 \n\t"
-            "pand %2, %%mm3 \n\t"
-            "psrlq $6, %%mm1 \n\t"
-            "psrlq $6, %%mm4 \n\t"
-            "pand %%mm6, %%mm1 \n\t"
-            "pand %%mm6, %%mm4 \n\t"
-            "psrlq $9, %%mm2 \n\t"
-            "psrlq $9, %%mm5 \n\t"
-            "pand %%mm7, %%mm2 \n\t"
-            "pand %%mm7, %%mm5 \n\t"
-            "por %%mm1, %%mm0 \n\t"
-            "por %%mm4, %%mm3 \n\t"
-            "por %%mm2, %%mm0 \n\t"
-            "por %%mm5, %%mm3 \n\t"
-            "psllq $16, %%mm3 \n\t"
-            "por %%mm3, %%mm0 \n\t"
-            MOVNTQ" %%mm0, %0 \n\t"
-            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
-        d += 4;
-        s += 12;
-    }
-    __asm__ volatile(SFENCE:::"memory");
-    __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         const int b = *s++;
         const int g = *s++;
...
     }
 }
 
-static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void rgb24to15_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#if COMPILE_TEMPLATE_MMX
-    const uint8_t *mm_end;
-#endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#if COMPILE_TEMPLATE_MMX
-    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
-    __asm__ volatile(
-        "movq %0, %%mm7 \n\t"
-        "movq %1, %%mm6 \n\t"
-        ::"m"(red_15mask),"m"(green_15mask));
-    mm_end = end - 15;
-    while (s < mm_end) {
-        __asm__ volatile(
-            PREFETCH" 32%1 \n\t"
-            "movd %1, %%mm0 \n\t"
-            "movd 3%1, %%mm3 \n\t"
-            "punpckldq 6%1, %%mm0 \n\t"
-            "punpckldq 9%1, %%mm3 \n\t"
-            "movq %%mm0, %%mm1 \n\t"
-            "movq %%mm0, %%mm2 \n\t"
-            "movq %%mm3, %%mm4 \n\t"
-            "movq %%mm3, %%mm5 \n\t"
-            "psllq $7, %%mm0 \n\t"
-            "psllq $7, %%mm3 \n\t"
-            "pand %%mm7, %%mm0 \n\t"
-            "pand %%mm7, %%mm3 \n\t"
-            "psrlq $6, %%mm1 \n\t"
-            "psrlq $6, %%mm4 \n\t"
-            "pand %%mm6, %%mm1 \n\t"
-            "pand %%mm6, %%mm4 \n\t"
-            "psrlq $19, %%mm2 \n\t"
-            "psrlq $19, %%mm5 \n\t"
-            "pand %2, %%mm2 \n\t"
-            "pand %2, %%mm5 \n\t"
-            "por %%mm1, %%mm0 \n\t"
-            "por %%mm4, %%mm3 \n\t"
-            "por %%mm2, %%mm0 \n\t"
-            "por %%mm5, %%mm3 \n\t"
-            "psllq $16, %%mm3 \n\t"
-            "por %%mm3, %%mm0 \n\t"
-            MOVNTQ" %%mm0, %0 \n\t"
-            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
-        d += 4;
-        s += 12;
-    }
-    __asm__ volatile(SFENCE:::"memory");
-    __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         const int r = *s++;
         const int g = *s++;
...
 
   original bits
 */
-static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void rgb15tobgr24_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint16_t *end;
-#if COMPILE_TEMPLATE_MMX
-    const uint16_t *mm_end;
-#endif
     uint8_t *d = dst;
     const uint16_t *s = (const uint16_t*)src;
     end = s + src_size/2;
-#if COMPILE_TEMPLATE_MMX
-    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
-    mm_end = end - 7;
-    while (s < mm_end) {
-        __asm__ volatile(
-            PREFETCH" 32%1 \n\t"
-            "movq %1, %%mm0 \n\t"
-            "movq %1, %%mm1 \n\t"
-            "movq %1, %%mm2 \n\t"
-            "pand %2, %%mm0 \n\t"
-            "pand %3, %%mm1 \n\t"
-            "pand %4, %%mm2 \n\t"
-            "psllq $3, %%mm0 \n\t"
-            "psrlq $2, %%mm1 \n\t"
-            "psrlq $7, %%mm2 \n\t"
-            "movq %%mm0, %%mm3 \n\t"
-            "movq %%mm1, %%mm4 \n\t"
-            "movq %%mm2, %%mm5 \n\t"
-            "punpcklwd %5, %%mm0 \n\t"
-            "punpcklwd %5, %%mm1 \n\t"
-            "punpcklwd %5, %%mm2 \n\t"
-            "punpckhwd %5, %%mm3 \n\t"
-            "punpckhwd %5, %%mm4 \n\t"
-            "punpckhwd %5, %%mm5 \n\t"
-            "psllq $8, %%mm1 \n\t"
-            "psllq $16, %%mm2 \n\t"
-            "por %%mm1, %%mm0 \n\t"
-            "por %%mm2, %%mm0 \n\t"
-            "psllq $8, %%mm4 \n\t"
-            "psllq $16, %%mm5 \n\t"
-            "por %%mm4, %%mm3 \n\t"
-            "por %%mm5, %%mm3 \n\t"
-
-            "movq %%mm0, %%mm6 \n\t"
-            "movq %%mm3, %%mm7 \n\t"
-
-            "movq 8%1, %%mm0 \n\t"
-            "movq 8%1, %%mm1 \n\t"
-            "movq 8%1, %%mm2 \n\t"
-            "pand %2, %%mm0 \n\t"
-            "pand %3, %%mm1 \n\t"
-            "pand %4, %%mm2 \n\t"
-            "psllq $3, %%mm0 \n\t"
-            "psrlq $2, %%mm1 \n\t"
-            "psrlq $7, %%mm2 \n\t"
-            "movq %%mm0, %%mm3 \n\t"
-            "movq %%mm1, %%mm4 \n\t"
-            "movq %%mm2, %%mm5 \n\t"
-            "punpcklwd %5, %%mm0 \n\t"
-            "punpcklwd %5, %%mm1 \n\t"
-            "punpcklwd %5, %%mm2 \n\t"
-            "punpckhwd %5, %%mm3 \n\t"
-            "punpckhwd %5, %%mm4 \n\t"
-            "punpckhwd %5, %%mm5 \n\t"
-            "psllq $8, %%mm1 \n\t"
-            "psllq $16, %%mm2 \n\t"
-            "por %%mm1, %%mm0 \n\t"
-            "por %%mm2, %%mm0 \n\t"
-            "psllq $8, %%mm4 \n\t"
-            "psllq $16, %%mm5 \n\t"
-            "por %%mm4, %%mm3 \n\t"
-            "por %%mm5, %%mm3 \n\t"
-
-            :"=m"(*d)
-            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
-            :"memory");
-        /* borrowed 32 to 24 */
-        __asm__ volatile(
-            "movq %%mm0, %%mm4 \n\t"
-            "movq %%mm3, %%mm5 \n\t"
-            "movq %%mm6, %%mm0 \n\t"
-            "movq %%mm7, %%mm1 \n\t"
-
-            "movq %%mm4, %%mm6 \n\t"
-            "movq %%mm5, %%mm7 \n\t"
-            "movq %%mm0, %%mm2 \n\t"
-            "movq %%mm1, %%mm3 \n\t"
-
-            STORE_BGR24_MMX
-
-            :"=m"(*d)
-            :"m"(*s)
-            :"memory");
-        d += 24;
-        s += 8;
-    }
-    __asm__ volatile(SFENCE:::"memory");
-    __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
...
     }
 }
 
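The removed MMX path works in two stages: widen each RGB555 word to a BGR0 dword (that is what the pand/shift steps plus the punpcklwd/punpckhwd against mmx_null do), then strip the pad byte again exactly as STORE_BGR24_MMX does, hence the "borrowed 32 to 24" comment. Per-pixel model of the widening stage (a sketch; widen555 is a name I made up, and the mask values 0x001F/0x03E0/0x7C00 are my assumptions about mask15b/mask15g/mask15r):

    #include <stdint.h>

    static uint32_t widen555(uint16_t p)
    {
        uint32_t b = (p & 0x001F) << 3;   /* pand mask15b, psllq $3 */
        uint32_t g = (p & 0x03E0) >> 2;   /* pand mask15g, psrlq $2 */
        uint32_t r = (p & 0x7C00) >> 7;   /* pand mask15r, psrlq $7 */
        return b | (g << 8) | (r << 16);  /* byte interleave via punpck*wd */
    }

Note that the low bits of each expanded component are left at zero here; there is no bit replication in this path. rgb16tobgr24 below is identical except for the 565 masks and psrlq $3 / psrlq $8 shifts.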
-static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void rgb16tobgr24_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint16_t *end;
-#if COMPILE_TEMPLATE_MMX
-    const uint16_t *mm_end;
-#endif
     uint8_t *d = (uint8_t *)dst;
     const uint16_t *s = (const uint16_t *)src;
     end = s + src_size/2;
-#if COMPILE_TEMPLATE_MMX
-    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
-    mm_end = end - 7;
-    while (s < mm_end) {
-        __asm__ volatile(
-            PREFETCH" 32%1 \n\t"
-            "movq %1, %%mm0 \n\t"
-            "movq %1, %%mm1 \n\t"
-            "movq %1, %%mm2 \n\t"
-            "pand %2, %%mm0 \n\t"
-            "pand %3, %%mm1 \n\t"
-            "pand %4, %%mm2 \n\t"
-            "psllq $3, %%mm0 \n\t"
-            "psrlq $3, %%mm1 \n\t"
-            "psrlq $8, %%mm2 \n\t"
-            "movq %%mm0, %%mm3 \n\t"
-            "movq %%mm1, %%mm4 \n\t"
-            "movq %%mm2, %%mm5 \n\t"
-            "punpcklwd %5, %%mm0 \n\t"
-            "punpcklwd %5, %%mm1 \n\t"
-            "punpcklwd %5, %%mm2 \n\t"
-            "punpckhwd %5, %%mm3 \n\t"
-            "punpckhwd %5, %%mm4 \n\t"
-            "punpckhwd %5, %%mm5 \n\t"
-            "psllq $8, %%mm1 \n\t"
-            "psllq $16, %%mm2 \n\t"
-            "por %%mm1, %%mm0 \n\t"
-            "por %%mm2, %%mm0 \n\t"
-            "psllq $8, %%mm4 \n\t"
-            "psllq $16, %%mm5 \n\t"
-            "por %%mm4, %%mm3 \n\t"
-            "por %%mm5, %%mm3 \n\t"
-
-            "movq %%mm0, %%mm6 \n\t"
-            "movq %%mm3, %%mm7 \n\t"
-
-            "movq 8%1, %%mm0 \n\t"
-            "movq 8%1, %%mm1 \n\t"
-            "movq 8%1, %%mm2 \n\t"
-            "pand %2, %%mm0 \n\t"
-            "pand %3, %%mm1 \n\t"
-            "pand %4, %%mm2 \n\t"
-            "psllq $3, %%mm0 \n\t"
-            "psrlq $3, %%mm1 \n\t"
-            "psrlq $8, %%mm2 \n\t"
-            "movq %%mm0, %%mm3 \n\t"
-            "movq %%mm1, %%mm4 \n\t"
-            "movq %%mm2, %%mm5 \n\t"
-            "punpcklwd %5, %%mm0 \n\t"
-            "punpcklwd %5, %%mm1 \n\t"
-            "punpcklwd %5, %%mm2 \n\t"
-            "punpckhwd %5, %%mm3 \n\t"
-            "punpckhwd %5, %%mm4 \n\t"
-            "punpckhwd %5, %%mm5 \n\t"
-            "psllq $8, %%mm1 \n\t"
-            "psllq $16, %%mm2 \n\t"
-            "por %%mm1, %%mm0 \n\t"
-            "por %%mm2, %%mm0 \n\t"
-            "psllq $8, %%mm4 \n\t"
-            "psllq $16, %%mm5 \n\t"
-            "por %%mm4, %%mm3 \n\t"
-            "por %%mm5, %%mm3 \n\t"
-            :"=m"(*d)
-            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
-            :"memory");
-        /* borrowed 32 to 24 */
-        __asm__ volatile(
-            "movq %%mm0, %%mm4 \n\t"
-            "movq %%mm3, %%mm5 \n\t"
-            "movq %%mm6, %%mm0 \n\t"
-            "movq %%mm7, %%mm1 \n\t"
-
-            "movq %%mm4, %%mm6 \n\t"
-            "movq %%mm5, %%mm7 \n\t"
-            "movq %%mm0, %%mm2 \n\t"
-            "movq %%mm1, %%mm3 \n\t"
-
-            STORE_BGR24_MMX
-
-            :"=m"(*d)
-            :"m"(*s)
-            :"memory");
-        d += 24;
-        s += 8;
-    }
-    __asm__ volatile(SFENCE:::"memory");
-    __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
...
     MOVNTQ" %%mm0, %0 \n\t" \
     MOVNTQ" %%mm3, 8%0 \n\t" \
 
-static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void rgb15to32_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint16_t *end;
-#if COMPILE_TEMPLATE_MMX
-    const uint16_t *mm_end;
-#endif
     uint8_t *d = dst;
     const uint16_t *s = (const uint16_t *)src;
     end = s + src_size/2;
-#if COMPILE_TEMPLATE_MMX
-    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
-    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
-    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
-    mm_end = end - 3;
-    while (s < mm_end) {
-        __asm__ volatile(
-            PREFETCH" 32%1 \n\t"
-            "movq %1, %%mm0 \n\t"
-            "movq %1, %%mm1 \n\t"
-            "movq %1, %%mm2 \n\t"
-            "pand %2, %%mm0 \n\t"
-            "pand %3, %%mm1 \n\t"
-            "pand %4, %%mm2 \n\t"
-            "psllq $3, %%mm0 \n\t"
-            "psrlq $2, %%mm1 \n\t"
-            "psrlq $7, %%mm2 \n\t"
-            PACK_RGB32
-            :"=m"(*d)
-            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
-            :"memory");
-        d += 16;
-        s += 4;
-    }
-    __asm__ volatile(SFENCE:::"memory");
-    __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
...
     }
 }
 
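rgb15to32 reuses the same per-component shifts as the 15-to-24 routine but finishes with PACK_RGB32: mm7 is zeroed (pxor) for the byte interleave, and mm6 is set to all ones (pcmpeqd), which I take to be the source of an opaque fourth byte; the macro body is largely elided above, so treat the alpha detail as an assumption. Per pixel, on top of widen555() sketched earlier, that amounts to:

    uint32_t out = widen555(p) | 0xFF000000u; /* assumed opaque alpha from mm6 */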
-static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void rgb16to32_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint16_t *end;
-#if COMPILE_TEMPLATE_MMX
-    const uint16_t *mm_end;
-#endif
     uint8_t *d = dst;
     const uint16_t *s = (const uint16_t*)src;
     end = s + src_size/2;
-#if COMPILE_TEMPLATE_MMX
-    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
-    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
-    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
-    mm_end = end - 3;
-    while (s < mm_end) {
-        __asm__ volatile(
-            PREFETCH" 32%1 \n\t"
-            "movq %1, %%mm0 \n\t"
-            "movq %1, %%mm1 \n\t"
-            "movq %1, %%mm2 \n\t"
-            "pand %2, %%mm0 \n\t"
-            "pand %3, %%mm1 \n\t"
-            "pand %4, %%mm2 \n\t"
-            "psllq $3, %%mm0 \n\t"
-            "psrlq $3, %%mm1 \n\t"
-            "psrlq $8, %%mm2 \n\t"
-            PACK_RGB32
-            :"=m"(*d)
-            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
-            :"memory");
-        d += 16;
-        s += 4;
-    }
-    __asm__ volatile(SFENCE:::"memory");
-    __asm__ volatile(EMMS:::"memory");
-#endif
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
...
     }
 }
 
-static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void shuffle_bytes_2103_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
-    x86_reg idx = 15 - src_size;
+    int idx = 15 - src_size;
     const uint8_t *s = src-idx;
     uint8_t *d = dst-idx;
-#if COMPILE_TEMPLATE_MMX
-    __asm__ volatile(
-        "test %0, %0 \n\t"
-        "jns 2f \n\t"
-        PREFETCH" (%1, %0) \n\t"
-        "movq %3, %%mm7 \n\t"
-        "pxor %4, %%mm7 \n\t"
-        "movq %%mm7, %%mm6 \n\t"
-        "pxor %5, %%mm7 \n\t"
-        ".p2align 4 \n\t"
-        "1: \n\t"
-        PREFETCH" 32(%1, %0) \n\t"
-        "movq (%1, %0), %%mm0 \n\t"
-        "movq 8(%1, %0), %%mm1 \n\t"
-# if COMPILE_TEMPLATE_MMX2
-        "pshufw $177, %%mm0, %%mm3 \n\t"
-        "pshufw $177, %%mm1, %%mm5 \n\t"
-        "pand %%mm7, %%mm0 \n\t"
-        "pand %%mm6, %%mm3 \n\t"
-        "pand %%mm7, %%mm1 \n\t"
-        "pand %%mm6, %%mm5 \n\t"
-        "por %%mm3, %%mm0 \n\t"
-        "por %%mm5, %%mm1 \n\t"
-# else
-        "movq %%mm0, %%mm2 \n\t"
-        "movq %%mm1, %%mm4 \n\t"
-        "pand %%mm7, %%mm0 \n\t"
-        "pand %%mm6, %%mm2 \n\t"
-        "pand %%mm7, %%mm1 \n\t"
-        "pand %%mm6, %%mm4 \n\t"
-        "movq %%mm2, %%mm3 \n\t"
-        "movq %%mm4, %%mm5 \n\t"
-        "pslld $16, %%mm2 \n\t"
-        "psrld $16, %%mm3 \n\t"
-        "pslld $16, %%mm4 \n\t"
-        "psrld $16, %%mm5 \n\t"
-        "por %%mm2, %%mm0 \n\t"
-        "por %%mm4, %%mm1 \n\t"
-        "por %%mm3, %%mm0 \n\t"
-        "por %%mm5, %%mm1 \n\t"
-# endif
-        MOVNTQ" %%mm0, (%2, %0) \n\t"
-        MOVNTQ" %%mm1, 8(%2, %0) \n\t"
-        "add $16, %0 \n\t"
-        "js 1b \n\t"
-        SFENCE" \n\t"
-        EMMS" \n\t"
-        "2: \n\t"
-        : "+&r"(idx)
-        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
-        : "memory");
-#endif
     for (; idx<15; idx+=4) {
         register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
         v &= 0xff00ff;
...
     }
 }
 
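shuffle_bytes_2103 swaps bytes 0 and 2 of every dword while keeping bytes 1 and 3, i.e. BGRA <-> RGBA; "2103" is the output byte order expressed as source indices. The C fallback keeps the even mask through 0xff00ff00 and rotates the other two bytes with 16-bit shifts; the MMX2 path gets the same effect from one pshufw $177 (0xB1, which swaps the two 16-bit words of each dword) plus masking. Scalar illustration of one pixel (a sketch):

    #include <stdint.h>

    static uint32_t shuffle2103(uint32_t v)
    {
        uint32_t keep = v & 0xff00ff00;       /* bytes 1 and 3 stay put      */
        v &= 0x00ff00ff;                      /* bytes 0 and 2               */
        return keep | (v >> 16) | (v << 16);  /* swap the two masked bytes   */
    }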
-static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void rgb24tobgr24_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
     unsigned i;
-#if COMPILE_TEMPLATE_MMX
-    x86_reg mmx_size= 23 - src_size;
-    __asm__ volatile (
-        "test %%"REG_a", %%"REG_a" \n\t"
-        "jns 2f \n\t"
-        "movq "MANGLE(mask24r)", %%mm5 \n\t"
-        "movq "MANGLE(mask24g)", %%mm6 \n\t"
-        "movq "MANGLE(mask24b)", %%mm7 \n\t"
-        ".p2align 4 \n\t"
-        "1: \n\t"
-        PREFETCH" 32(%1, %%"REG_a") \n\t"
-        "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
-        "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
-        "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
-        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
-        "pand %%mm5, %%mm0 \n\t"
-        "pand %%mm6, %%mm1 \n\t"
-        "pand %%mm7, %%mm2 \n\t"
-        "por %%mm0, %%mm1 \n\t"
-        "por %%mm2, %%mm1 \n\t"
-        "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
-        MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
-        "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
-        "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
-        "pand %%mm7, %%mm0 \n\t"
-        "pand %%mm5, %%mm1 \n\t"
-        "pand %%mm6, %%mm2 \n\t"
-        "por %%mm0, %%mm1 \n\t"
-        "por %%mm2, %%mm1 \n\t"
-        "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
-        MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
-        "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
-        "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
-        "pand %%mm6, %%mm0 \n\t"
-        "pand %%mm7, %%mm1 \n\t"
-        "pand %%mm5, %%mm2 \n\t"
-        "por %%mm0, %%mm1 \n\t"
-        "por %%mm2, %%mm1 \n\t"
-        MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
-        "add $24, %%"REG_a" \n\t"
-        " js 1b \n\t"
-        "2: \n\t"
-        : "+a" (mmx_size)
-        : "r" (src-mmx_size), "r"(dst-mmx_size)
-    );
-
-    __asm__ volatile(SFENCE:::"memory");
-    __asm__ volatile(EMMS:::"memory");
-
-    if (mmx_size==23) return; //finished, was multiple of 8
-
-    src+= src_size;
-    dst+= src_size;
-    src_size= 23-mmx_size;
-    src-= src_size;
-    dst-= src_size;
-#endif
     for (i=0; i<src_size; i+=3) {
         register uint8_t x;
         x = src[i + 2];
...
     }
 }
 
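The removed MMX body uses a counting trick that also appears in shuffle_bytes_2103 above: the counter starts at 23 - src_size (negative for any buffer longer than 23 bytes) and the src/dst operands are pre-biased by -mmx_size, so (%1, %%REG_a) still addresses the right byte while "add $24; js 1b" loops on the sign flag with no separate compare; the pointer fix-up afterwards hands the 0..23 leftover tail bytes to the C loop. The same indexing in C terms (a sketch):

    long idx = 23 - (long)src_size;      /* negative unless src_size <= 23 */
    const uint8_t *sbase = src - idx;    /* pre-biased base pointers        */
    uint8_t       *dbase = dst - idx;
    while (idx < 0) {
        /* convert 24 bytes at sbase+idx into dbase+idx */
        idx += 24;                       /* "add $24, idx; js 1b"           */
    }
    /* any remaining tail bytes fall through to the byte-wise C loop */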
-static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
-                                           long width, long height,
-                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
+static inline void yuvPlanartoyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
+                                     const uint8_t *vsrc, uint8_t *dst,
+                                     long width, long height,
+                                     long lumStride, long chromStride,
+                                     long dstStride, long vertLumPerChroma)
 {
     long y;
-    const x86_reg chromWidth= width>>1;
+    const int chromWidth = width >> 1;
     for (y=0; y<height; y++) {
-#if COMPILE_TEMPLATE_MMX
-        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
-        __asm__ volatile(
-            "xor %%"REG_a", %%"REG_a" \n\t"
-            ".p2align 4 \n\t"
-            "1: \n\t"
-            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
-            PREFETCH" 32(%2, %%"REG_a") \n\t"
-            PREFETCH" 32(%3, %%"REG_a") \n\t"
-            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
-            "movq %%mm0, %%mm2 \n\t" // U(0)
-            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
-            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
-            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
-
-            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
-            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
-            "movq %%mm3, %%mm4 \n\t" // Y(0)
-            "movq %%mm5, %%mm6 \n\t" // Y(8)
-            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
-            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
-            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
-            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
-
-            MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
-            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
-            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
-            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
-
-            "add $8, %%"REG_a" \n\t"
-            "cmp %4, %%"REG_a" \n\t"
-            " jb 1b \n\t"
-            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
-            : "%"REG_a
-        );
-#else
-
-#if ARCH_ALPHA && HAVE_MVI
-#define pl2yuy2(n) \
-    y1 = yc[n]; \
-    y2 = yc2[n]; \
-    u = uc[n]; \
-    v = vc[n]; \
-    __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
-    __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
-    __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
-    __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
-    yuv1 = (u << 8) + (v << 24); \
-    yuv2 = yuv1 + y2; \
-    yuv1 += y1; \
-    qdst[n] = yuv1; \
-    qdst2[n] = yuv2;
-
-        int i;
-        uint64_t *qdst = (uint64_t *) dst;
-        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
-        const uint32_t *yc = (uint32_t *) ysrc;
-        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
-        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
-        for (i = 0; i < chromWidth; i += 8) {
-            uint64_t y1, y2, yuv1, yuv2;
-            uint64_t u, v;
-            /* Prefetch */
-            __asm__("ldq $31,64(%0)" :: "r"(yc));
-            __asm__("ldq $31,64(%0)" :: "r"(yc2));
-            __asm__("ldq $31,64(%0)" :: "r"(uc));