Revision 935f50c8 libswscale/rgb2rgb_template.c

View differences:

libswscale/rgb2rgb_template.c
1364 1364

  
1365 1365
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1366 1366
{
1367
    uint8_t *d = dst, *s = (uint8_t *) src;
1368
    const uint8_t *end = s + src_size;
1367
	long idx = 15 - src_size;
1368
	uint8_t *s = (uint8_t *) src-idx, *d = dst-idx;
1369 1369
#ifdef HAVE_MMX
1370 1370
	__asm __volatile(
1371
		"	"PREFETCH" (%1)			\n"
1371
		"	test %0, %0			\n"
1372
		"	jns 2f				\n"
1373
		"	"PREFETCH" (%1, %0)		\n"
1372 1374
		"	movq %3, %%mm7			\n"
1373 1375
		"	pxor %4, %%mm7			\n"
1374 1376
		"	movq %%mm7, %%mm6		\n"
1375 1377
		"	pxor %5, %%mm7			\n"
1376
		"	jmp 2f				\n"
1377 1378
			ASMALIGN(4)
1378 1379
		"1:					\n"
1379
		"	"PREFETCH" 32(%1)		\n"
1380
		"	movq (%1), %%mm0		\n"
1381
		"	movq 8(%1), %%mm1		\n"
1380
		"	"PREFETCH" 32(%1, %0)		\n"
1381
		"	movq (%1, %0), %%mm0		\n"
1382
		"	movq 8(%1, %0), %%mm1		\n"
1382 1383
# ifdef HAVE_MMX2
1383 1384
		"	pshufw $177, %%mm0, %%mm3	\n"
1384 1385
		"	pshufw $177, %%mm1, %%mm5	\n"
......
1406 1407
		"	por %%mm3, %%mm0		\n"
1407 1408
		"	por %%mm5, %%mm1		\n"
1408 1409
# endif
1409
		"	"MOVNTQ" %%mm0, (%0)		\n"
1410
		"	"MOVNTQ" %%mm1, 8(%0)		\n"
1410
		"	"MOVNTQ" %%mm0, (%2, %0)	\n"
1411
		"	"MOVNTQ" %%mm1, 8(%2, %0)	\n"
1411 1412
		"	add $16, %0			\n"
1412
		"	add $16, %1			\n"
1413
		"2:					\n"
1414
		"	cmp %1, %2			\n"
1415
		"	ja 1b				\n"
1413
		"	js 1b				\n"
1416 1414
		"	"SFENCE"			\n"
1417 1415
		"	"EMMS"				\n"
1418
		: "+r"(d), "+r"(s)
1419
		: "r" (end-15), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1416
		"2:					\n"
1417
		: "+&r"(idx)
1418
		: "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1420 1419
		: "memory");
1421 1420
#endif
1422
	for (; s<end; s+=4, d+=4) {
1423
		int v = *(uint32_t *)s, g = v & 0xff00;
1421
	for (; idx<15; idx+=4) {
1422
		register int v = *(uint32_t *)&s[idx], g = v & 0xff00;
1424 1423
		v &= 0xff00ff;
1425
		*(uint32_t *)d = (v>>16) + g + (v<<16);
1424
		*(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1426 1425
	}
1427 1426
}
1428 1427

  

Also available in: Unified diff