Revision 6e1c66bc

View differences:

postproc/rgb2rgb.c
@@ -11,6 +11,7 @@
 #include "../config.h"
 #include "rgb2rgb.h"
 #include "swscale.h"
+#include "../cpudetect.h"
 #include "../mangle.h"
 #include "../bswap.h"
 #include "../libvo/fastmemcpy.h"
@@ -68,7 +69,7 @@
 			int srcStride1, int srcStride2,
 			int srcStride3, int dstStride);
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 static const uint64_t mmx_null  __attribute__((aligned(8))) = 0x0000000000000000ULL;
 static const uint64_t mmx_one   __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL;
 static const uint64_t mask32b  attribute_used __attribute__((aligned(8))) = 0x000000FF000000FFULL;
@@ -152,7 +153,7 @@
 #define RENAME(a) a ## _C
 #include "rgb2rgb_template.c"
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 
 //MMX versions
 #undef RENAME
@@ -181,7 +182,7 @@
 #define RENAME(a) a ## _3DNOW
 #include "rgb2rgb_template.c"
 
-#endif //ARCH_X86
+#endif //ARCH_X86 || ARCH_X86_64
 
 /*
  rgb15->rgb16 Original by Strepto/Astral
@@ -191,7 +192,7 @@
 */
 
 void sws_rgb2rgb_init(int flags){
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 	if(flags & SWS_CPU_CAPS_MMX2){
 		rgb15to16= rgb15to16_MMX2;
 		rgb15to24= rgb15to24_MMX2;
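Note: everything below leans on register-name macros such as REG_a, REG_b, REG_S, REG_D, which expand to the 64-bit register names on x86_64 and to the classic 32-bit ones elsewhere (the new #include "../cpudetect.h" above pulls in what the asm needs). A minimal sketch of what such definitions look like -- an illustration of the mechanism, not the tree's exact header:

/* Sketch (assumed layout): arch-dependent register names for inline asm. */
#if defined(ARCH_X86_64)
#define REG_a "rax"
#define REG_b "rbx"
#define REG_c "rcx"
#define REG_d "rdx"
#define REG_S "rsi"
#define REG_D "rdi"
#else
#define REG_a "eax"
#define REG_b "ebx"
#define REG_c "ecx"
#define REG_d "edx"
#define REG_S "esi"
#define REG_D "edi"
#endif

/* String pasting then yields "%rax" or "%eax" inside the asm templates,
   e.g. "xor %%"REG_a", %%"REG_a"\n\t" becomes xor %eax or xor %rax. */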
postproc/rgb2rgb_template.c
@@ -349,9 +349,9 @@
 		"pslld $11, %%mm3		\n\t"
 		"por %%mm3, %%mm0		\n\t"
 		MOVNTQ"	%%mm0, (%0)		\n\t"
-		"addl $16, %1			\n\t"
-		"addl $8, %0			\n\t"
-		"cmpl %2, %1			\n\t"
+		"add $16, %1			\n\t"
+		"add $8, %0			\n\t"
+		"cmp %2, %1			\n\t"
 		" jb 1b				\n\t"
 		: "+r" (d), "+r"(s)
 		: "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
@@ -509,9 +509,9 @@
 		"pslld $10, %%mm3		\n\t"
 		"por %%mm3, %%mm0		\n\t"
 		MOVNTQ"	%%mm0, (%0)		\n\t"
-		"addl $16, %1			\n\t"
-		"addl $8, %0			\n\t"
-		"cmpl %2, %1			\n\t"
+		"add $16, %1			\n\t"
+		"add $8, %0			\n\t"
+		"cmp %2, %1			\n\t"
 		" jb 1b				\n\t"
 		: "+r" (d), "+r"(s)
 		: "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
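Note on the pattern above: dropping the operand-size suffix (addl -> add, cmpl -> cmp) lets the assembler derive the width from the register operand, so the same template assembles against 32-bit registers on x86 and 64-bit ones on x86_64. A tiny self-contained illustration (hypothetical helper, not project code):

/* Hypothetical helper: the suffix-less "add" sizes itself from %0,
   so this one source line builds as addl (x86) or addq (x86_64). */
static void advance16(const unsigned char **p)
{
	asm("add $16, %0" : "+r" (*p));
}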
@@ -1345,11 +1345,11 @@
 #ifdef HAVE_MMX
 /* TODO: unroll this loop */
 	asm volatile (
-		"xorl %%eax, %%eax		\n\t"
+		"xor %%"REG_a", %%"REG_a"	\n\t"
 		".balign 16			\n\t"
 		"1:				\n\t"
-		PREFETCH" 32(%0, %%eax)		\n\t"
-		"movq (%0, %%eax), %%mm0	\n\t"
+		PREFETCH" 32(%0, %%"REG_a")	\n\t"
+		"movq (%0, %%"REG_a"), %%mm0	\n\t"
 		"movq %%mm0, %%mm1		\n\t"
 		"movq %%mm0, %%mm2		\n\t"
 		"pslld $16, %%mm0		\n\t"
@@ -1359,12 +1359,12 @@
 		"pand "MANGLE(mask32b)", %%mm1	\n\t"
 		"por %%mm0, %%mm2		\n\t"
 		"por %%mm1, %%mm2		\n\t"
-		MOVNTQ" %%mm2, (%1, %%eax)	\n\t"
-		"addl $8, %%eax			\n\t"
-		"cmpl %2, %%eax			\n\t"
+		MOVNTQ" %%mm2, (%1, %%"REG_a")	\n\t"
+		"add $8, %%"REG_a"		\n\t"
+		"cmp %2, %%"REG_a"		\n\t"
 		" jb 1b				\n\t"
-		:: "r" (src), "r"(dst), "r" (src_size-7)
-		: "%eax"
+		:: "r" (src), "r"(dst), "r" ((long)src_size-7)
+		: "%"REG_a
 	);
 
 	__asm __volatile(SFENCE:::"memory");
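The (long)src_size-7 cast above matters as much as the register renames: an int input operand is allocated as a 32-bit register, and comparing it against the 64-bit REG_a index would mix operand widths and not assemble on x86_64. The idiom in isolation (a sketch with a hypothetical function, assuming the REG_a macro):

#include <stdint.h>

/* Sketch: the loop bound must be pointer-sized to match the 64-bit index
   register. Assumes size > 0. */
static void clear_bytes(uint8_t *dst, int size)
{
	long bound = (long)size;  /* as int, "cmp %1, %%rax" would not assemble */
	asm volatile(
		"xor %%"REG_a", %%"REG_a"	\n\t"
		"1:				\n\t"
		"movb $0, (%0, %%"REG_a")	\n\t"
		"add $1, %%"REG_a"		\n\t"
		"cmp %1, %%"REG_a"		\n\t"
		" jb 1b				\n\t"
		:: "r" (dst), "r" (bound)
		: "%"REG_a, "memory");
}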
@@ -1391,43 +1391,43 @@
 {
 	unsigned i;
 #ifdef HAVE_MMX
-	int mmx_size= 23 - src_size;
+	long mmx_size= 23 - src_size;
 	asm volatile (
 		"movq "MANGLE(mask24r)", %%mm5	\n\t"
 		"movq "MANGLE(mask24g)", %%mm6	\n\t"
 		"movq "MANGLE(mask24b)", %%mm7	\n\t"
 		".balign 16			\n\t"
 		"1:				\n\t"
-		PREFETCH" 32(%1, %%eax)		\n\t"
-		"movq   (%1, %%eax), %%mm0	\n\t" // BGR BGR BG
-		"movq   (%1, %%eax), %%mm1	\n\t" // BGR BGR BG
-		"movq  2(%1, %%eax), %%mm2	\n\t" // R BGR BGR B
+		PREFETCH" 32(%1, %%"REG_a")	\n\t"
+		"movq   (%1, %%"REG_a"), %%mm0	\n\t" // BGR BGR BG
+		"movq   (%1, %%"REG_a"), %%mm1	\n\t" // BGR BGR BG
+		"movq  2(%1, %%"REG_a"), %%mm2	\n\t" // R BGR BGR B
 		"psllq $16, %%mm0		\n\t" // 00 BGR BGR
 		"pand %%mm5, %%mm0		\n\t"
 		"pand %%mm6, %%mm1		\n\t"
 		"pand %%mm7, %%mm2		\n\t"
 		"por %%mm0, %%mm1		\n\t"
 		"por %%mm2, %%mm1		\n\t"                
-		"movq  6(%1, %%eax), %%mm0	\n\t" // BGR BGR BG
-		MOVNTQ" %%mm1,   (%2, %%eax)	\n\t" // RGB RGB RG
-		"movq  8(%1, %%eax), %%mm1	\n\t" // R BGR BGR B
-		"movq 10(%1, %%eax), %%mm2	\n\t" // GR BGR BGR
+		"movq  6(%1, %%"REG_a"), %%mm0	\n\t" // BGR BGR BG
+		MOVNTQ" %%mm1,   (%2, %%"REG_a")\n\t" // RGB RGB RG
+		"movq  8(%1, %%"REG_a"), %%mm1	\n\t" // R BGR BGR B
+		"movq 10(%1, %%"REG_a"), %%mm2	\n\t" // GR BGR BGR
 		"pand %%mm7, %%mm0		\n\t"
 		"pand %%mm5, %%mm1		\n\t"
 		"pand %%mm6, %%mm2		\n\t"
 		"por %%mm0, %%mm1		\n\t"
 		"por %%mm2, %%mm1		\n\t"                
-		"movq 14(%1, %%eax), %%mm0	\n\t" // R BGR BGR B
-		MOVNTQ" %%mm1,  8(%2, %%eax)	\n\t" // B RGB RGB R
-		"movq 16(%1, %%eax), %%mm1	\n\t" // GR BGR BGR
-		"movq 18(%1, %%eax), %%mm2	\n\t" // BGR BGR BG
+		"movq 14(%1, %%"REG_a"), %%mm0	\n\t" // R BGR BGR B
+		MOVNTQ" %%mm1,  8(%2, %%"REG_a")\n\t" // B RGB RGB R
+		"movq 16(%1, %%"REG_a"), %%mm1	\n\t" // GR BGR BGR
+		"movq 18(%1, %%"REG_a"), %%mm2	\n\t" // BGR BGR BG
 		"pand %%mm6, %%mm0		\n\t"
 		"pand %%mm7, %%mm1		\n\t"
 		"pand %%mm5, %%mm2		\n\t"
 		"por %%mm0, %%mm1		\n\t"
 		"por %%mm2, %%mm1		\n\t"                
-		MOVNTQ" %%mm1, 16(%2, %%eax)	\n\t"
-		"addl $24, %%eax		\n\t"
+		MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
+		"add $24, %%"REG_a"		\n\t"
 		" js 1b				\n\t"
 		: "+a" (mmx_size)
 		: "r" (src-mmx_size), "r"(dst-mmx_size)
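This loop also shows why mmx_size had to become long: it is tied to REG_a via the "+a" constraint and runs as a negative index counting up toward zero ("js 1b"), with both base pointers pre-advanced by the same amount. The same idiom in plain C, for reference (a sketch, not project code):

/* Negative-index loop: advance the bases past the end once, then count a
   signed, pointer-sized index up to zero; the sign test replaces a compare. */
static void negidx_copy(const unsigned char *src, unsigned char *dst, long n)
{
	const unsigned char *s_end = src + n;
	unsigned char *d_end = dst + n;
	long i = -n;                  /* must stay signed in a full register */
	do {
		d_end[i] = s_end[i];
	} while (++i < 0);            /* mirrors "add $24 ... js 1b" above */
}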
@@ -1465,20 +1465,20 @@
 #ifdef HAVE_MMX
 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
 		asm volatile(
-			"xorl %%eax, %%eax		\n\t"
+			"xor %%"REG_a", %%"REG_a"	\n\t"
 			".balign 16			\n\t"
 			"1:				\n\t"
-			PREFETCH" 32(%1, %%eax, 2)	\n\t"
-			PREFETCH" 32(%2, %%eax)		\n\t"
-			PREFETCH" 32(%3, %%eax)		\n\t"
-			"movq (%2, %%eax), %%mm0	\n\t" // U(0)
+			PREFETCH" 32(%1, %%"REG_a", 2)	\n\t"
+			PREFETCH" 32(%2, %%"REG_a")	\n\t"
+			PREFETCH" 32(%3, %%"REG_a")	\n\t"
+			"movq (%2, %%"REG_a"), %%mm0	\n\t" // U(0)
 			"movq %%mm0, %%mm2		\n\t" // U(0)
-			"movq (%3, %%eax), %%mm1	\n\t" // V(0)
+			"movq (%3, %%"REG_a"), %%mm1	\n\t" // V(0)
 			"punpcklbw %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
 			"punpckhbw %%mm1, %%mm2		\n\t" // UVUV UVUV(8)
 
-			"movq (%1, %%eax,2), %%mm3	\n\t" // Y(0)
-			"movq 8(%1, %%eax,2), %%mm5	\n\t" // Y(8)
+			"movq (%1, %%"REG_a",2), %%mm3	\n\t" // Y(0)
+			"movq 8(%1, %%"REG_a",2), %%mm5	\n\t" // Y(8)
 			"movq %%mm3, %%mm4		\n\t" // Y(0)
 			"movq %%mm5, %%mm6		\n\t" // Y(8)
 			"punpcklbw %%mm0, %%mm3		\n\t" // YUYV YUYV(0)
@@ -1486,16 +1486,16 @@
 			"punpcklbw %%mm2, %%mm5		\n\t" // YUYV YUYV(8)
 			"punpckhbw %%mm2, %%mm6		\n\t" // YUYV YUYV(12)
 
-			MOVNTQ" %%mm3, (%0, %%eax, 4)	\n\t"
-			MOVNTQ" %%mm4, 8(%0, %%eax, 4)	\n\t"
-			MOVNTQ" %%mm5, 16(%0, %%eax, 4)	\n\t"
-			MOVNTQ" %%mm6, 24(%0, %%eax, 4)	\n\t"
+			MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
+			MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
+			MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
+			MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
 
-			"addl $8, %%eax			\n\t"
-			"cmpl %4, %%eax			\n\t"
+			"add $8, %%"REG_a"		\n\t"
+			"cmp %4, %%"REG_a"		\n\t"
 			" jb 1b				\n\t"
-			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
-			: "%eax"
+			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" ((long)chromWidth)
+			: "%"REG_a
 		);
 #else
 
@@ -1618,20 +1618,20 @@
 #ifdef HAVE_MMX
 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
 		asm volatile(
-			"xorl %%eax, %%eax		\n\t"
+			"xor %%"REG_a", %%"REG_a"	\n\t"
 			".balign 16			\n\t"
 			"1:				\n\t"
-			PREFETCH" 32(%1, %%eax, 2)	\n\t"
-			PREFETCH" 32(%2, %%eax)		\n\t"
-			PREFETCH" 32(%3, %%eax)		\n\t"
-			"movq (%2, %%eax), %%mm0	\n\t" // U(0)
+			PREFETCH" 32(%1, %%"REG_a", 2)	\n\t"
+			PREFETCH" 32(%2, %%"REG_a")	\n\t"
+			PREFETCH" 32(%3, %%"REG_a")	\n\t"
+			"movq (%2, %%"REG_a"), %%mm0	\n\t" // U(0)
 			"movq %%mm0, %%mm2		\n\t" // U(0)
-			"movq (%3, %%eax), %%mm1	\n\t" // V(0)
+			"movq (%3, %%"REG_a"), %%mm1	\n\t" // V(0)
 			"punpcklbw %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
 			"punpckhbw %%mm1, %%mm2		\n\t" // UVUV UVUV(8)
 
-			"movq (%1, %%eax,2), %%mm3	\n\t" // Y(0)
-			"movq 8(%1, %%eax,2), %%mm5	\n\t" // Y(8)
+			"movq (%1, %%"REG_a",2), %%mm3	\n\t" // Y(0)
+			"movq 8(%1, %%"REG_a",2), %%mm5	\n\t" // Y(8)
 			"movq %%mm0, %%mm4		\n\t" // Y(0)
 			"movq %%mm2, %%mm6		\n\t" // Y(8)
 			"punpcklbw %%mm3, %%mm0		\n\t" // YUYV YUYV(0)
@@ -1639,16 +1639,16 @@
 			"punpcklbw %%mm5, %%mm2		\n\t" // YUYV YUYV(8)
 			"punpckhbw %%mm5, %%mm6		\n\t" // YUYV YUYV(12)
 
-			MOVNTQ" %%mm0, (%0, %%eax, 4)	\n\t"
-			MOVNTQ" %%mm4, 8(%0, %%eax, 4)	\n\t"
-			MOVNTQ" %%mm2, 16(%0, %%eax, 4)	\n\t"
-			MOVNTQ" %%mm6, 24(%0, %%eax, 4)	\n\t"
+			MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
+			MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
+			MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
+			MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
 
-			"addl $8, %%eax			\n\t"
-			"cmpl %4, %%eax			\n\t"
+			"add $8, %%"REG_a"		\n\t"
+			"cmp %4, %%"REG_a"		\n\t"
 			" jb 1b				\n\t"
-			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
-			: "%eax"
+			::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" ((long)chromWidth)
+			: "%"REG_a
 		);
 #else
 //FIXME adapt the alpha asm code from yv12->yuy2
@@ -1740,14 +1740,14 @@
 	{
 #ifdef HAVE_MMX
 		asm volatile(
-			"xorl %%eax, %%eax		\n\t"
+			"xor %%"REG_a", %%"REG_a"	\n\t"
 			"pcmpeqw %%mm7, %%mm7		\n\t"
 			"psrlw $8, %%mm7		\n\t" // FF,00,FF,00...
 			".balign 16			\n\t"
 			"1:				\n\t"
-			PREFETCH" 64(%0, %%eax, 4)	\n\t"
-			"movq (%0, %%eax, 4), %%mm0	\n\t" // YUYV YUYV(0)
-			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(4)
+			PREFETCH" 64(%0, %%"REG_a", 4)	\n\t"
+			"movq (%0, %%"REG_a", 4), %%mm0	\n\t" // YUYV YUYV(0)
+			"movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
 			"movq %%mm0, %%mm2		\n\t" // YUYV YUYV(0)
 			"movq %%mm1, %%mm3		\n\t" // YUYV YUYV(4)
 			"psrlw $8, %%mm0		\n\t" // U0V0 U0V0(0)
@@ -1757,10 +1757,10 @@
 			"packuswb %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
 			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(0)
 
-			MOVNTQ" %%mm2, (%1, %%eax, 2)	\n\t"
+			MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
 
-			"movq 16(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(8)
-			"movq 24(%0, %%eax, 4), %%mm2	\n\t" // YUYV YUYV(12)
+			"movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
+			"movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
 			"movq %%mm1, %%mm3		\n\t" // YUYV YUYV(8)
 			"movq %%mm2, %%mm4		\n\t" // YUYV YUYV(12)
 			"psrlw $8, %%mm1		\n\t" // U0V0 U0V0(8)
@@ -1770,7 +1770,7 @@
 			"packuswb %%mm2, %%mm1		\n\t" // UVUV UVUV(8)
 			"packuswb %%mm4, %%mm3		\n\t" // YYYY YYYY(8)
 
-			MOVNTQ" %%mm3, 8(%1, %%eax, 2)	\n\t"
+			MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
 
 			"movq %%mm0, %%mm2		\n\t" // UVUV UVUV(0)
 			"movq %%mm1, %%mm3		\n\t" // UVUV UVUV(8)
@@ -1781,28 +1781,28 @@
 			"packuswb %%mm1, %%mm0		\n\t" // VVVV VVVV(0)
 			"packuswb %%mm3, %%mm2		\n\t" // UUUU UUUU(0)
 
-			MOVNTQ" %%mm0, (%3, %%eax)	\n\t"
-			MOVNTQ" %%mm2, (%2, %%eax)	\n\t"
+			MOVNTQ" %%mm0, (%3, %%"REG_a")	\n\t"
+			MOVNTQ" %%mm2, (%2, %%"REG_a")	\n\t"
 
-			"addl $8, %%eax			\n\t"
-			"cmpl %4, %%eax			\n\t"
+			"add $8, %%"REG_a"		\n\t"
+			"cmp %4, %%"REG_a"		\n\t"
 			" jb 1b				\n\t"
-			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
-			: "memory", "%eax"
+			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" ((long)chromWidth)
+			: "memory", "%"REG_a
 		);
 
 		ydst += lumStride;
 		src  += srcStride;
 
 		asm volatile(
-			"xorl %%eax, %%eax		\n\t"
+			"xor %%"REG_a", %%"REG_a"	\n\t"
 			".balign 16			\n\t"
 			"1:				\n\t"
-			PREFETCH" 64(%0, %%eax, 4)	\n\t"
-			"movq (%0, %%eax, 4), %%mm0	\n\t" // YUYV YUYV(0)
-			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(4)
-			"movq 16(%0, %%eax, 4), %%mm2	\n\t" // YUYV YUYV(8)
-			"movq 24(%0, %%eax, 4), %%mm3	\n\t" // YUYV YUYV(12)
+			PREFETCH" 64(%0, %%"REG_a", 4)	\n\t"
+			"movq (%0, %%"REG_a", 4), %%mm0	\n\t" // YUYV YUYV(0)
+			"movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
+			"movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
+			"movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
 			"pand %%mm7, %%mm0		\n\t" // Y0Y0 Y0Y0(0)
 			"pand %%mm7, %%mm1		\n\t" // Y0Y0 Y0Y0(4)
 			"pand %%mm7, %%mm2		\n\t" // Y0Y0 Y0Y0(8)
@@ -1810,15 +1810,15 @@
 			"packuswb %%mm1, %%mm0		\n\t" // YYYY YYYY(0)
 			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(8)
 
-			MOVNTQ" %%mm0, (%1, %%eax, 2)	\n\t"
-			MOVNTQ" %%mm2, 8(%1, %%eax, 2)	\n\t"
+			MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
+			MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
 
-			"addl $8, %%eax			\n\t"
-			"cmpl %4, %%eax			\n\t"
+			"add $8, %%"REG_a"		\n\t"
+			"cmp %4, %%"REG_a"		\n\t"
 			" jb 1b				\n\t"
 
-			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
-			: "memory", "%eax"
+			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" ((long)chromWidth)
+			: "memory", "%"REG_a
 		);
 #else
 		unsigned i;
@@ -1877,16 +1877,16 @@
 
 	for(y=1; y<srcHeight; y++){
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
-		const int mmxSize= srcWidth&~15;
+		const long mmxSize= srcWidth&~15;
 		asm volatile(
-			"movl %4, %%eax			\n\t"
+			"mov %4, %%"REG_a"		\n\t"
 			"1:				\n\t"
-			"movq (%0, %%eax), %%mm0	\n\t"
-			"movq (%1, %%eax), %%mm1	\n\t"
-			"movq 1(%0, %%eax), %%mm2	\n\t"
-			"movq 1(%1, %%eax), %%mm3	\n\t"
-			"movq -1(%0, %%eax), %%mm4	\n\t"
-			"movq -1(%1, %%eax), %%mm5	\n\t"
+			"movq (%0, %%"REG_a"), %%mm0	\n\t"
+			"movq (%1, %%"REG_a"), %%mm1	\n\t"
+			"movq 1(%0, %%"REG_a"), %%mm2	\n\t"
+			"movq 1(%1, %%"REG_a"), %%mm3	\n\t"
+			"movq -1(%0, %%"REG_a"), %%mm4	\n\t"
+			"movq -1(%1, %%"REG_a"), %%mm5	\n\t"
 			PAVGB" %%mm0, %%mm5		\n\t"
 			PAVGB" %%mm0, %%mm3		\n\t"
 			PAVGB" %%mm0, %%mm5		\n\t"
@@ -1902,22 +1902,22 @@
 			"punpcklbw %%mm2, %%mm4		\n\t"
 			"punpckhbw %%mm2, %%mm6		\n\t"
 #if 1
-			MOVNTQ" %%mm5, (%2, %%eax, 2)	\n\t"
-			MOVNTQ" %%mm7, 8(%2, %%eax, 2)	\n\t"
-			MOVNTQ" %%mm4, (%3, %%eax, 2)	\n\t"
-			MOVNTQ" %%mm6, 8(%3, %%eax, 2)	\n\t"
+			MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
+			MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
+			MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
+			MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
 #else
-			"movq %%mm5, (%2, %%eax, 2)	\n\t"
-			"movq %%mm7, 8(%2, %%eax, 2)	\n\t"
-			"movq %%mm4, (%3, %%eax, 2)	\n\t"
-			"movq %%mm6, 8(%3, %%eax, 2)	\n\t"
+			"movq %%mm5, (%2, %%"REG_a", 2)	\n\t"
+			"movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
+			"movq %%mm4, (%3, %%"REG_a", 2)	\n\t"
+			"movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
 #endif
-			"addl $8, %%eax			\n\t"
+			"add $8, %%"REG_a"		\n\t"
 			" js 1b				\n\t"
 			:: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
 			   "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
 			   "g" (-mmxSize)
-			: "%eax"
+			: "%"REG_a
 
 		);
 #else
@@ -2107,20 +2107,20 @@
 		for(i=0; i<2; i++)
 		{
 			asm volatile(
-				"movl %2, %%eax			\n\t"
+				"mov %2, %%"REG_a"		\n\t"
 				"movq "MANGLE(bgr2YCoeff)", %%mm6		\n\t"
 				"movq "MANGLE(w1111)", %%mm5		\n\t"
 				"pxor %%mm7, %%mm7		\n\t"
-				"leal (%%eax, %%eax, 2), %%ebx	\n\t"
+				"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
 				".balign 16			\n\t"
 				"1:				\n\t"
-				PREFETCH" 64(%0, %%ebx)		\n\t"
-				"movd (%0, %%ebx), %%mm0	\n\t"
-				"movd 3(%0, %%ebx), %%mm1	\n\t"
+				PREFETCH" 64(%0, %%"REG_b")	\n\t"
+				"movd (%0, %%"REG_b"), %%mm0	\n\t"
+				"movd 3(%0, %%"REG_b"), %%mm1	\n\t"
 				"punpcklbw %%mm7, %%mm0		\n\t"
 				"punpcklbw %%mm7, %%mm1		\n\t"
-				"movd 6(%0, %%ebx), %%mm2	\n\t"
-				"movd 9(%0, %%ebx), %%mm3	\n\t"
+				"movd 6(%0, %%"REG_b"), %%mm2	\n\t"
+				"movd 9(%0, %%"REG_b"), %%mm3	\n\t"
 				"punpcklbw %%mm7, %%mm2		\n\t"
 				"punpcklbw %%mm7, %%mm3		\n\t"
 				"pmaddwd %%mm6, %%mm0		\n\t"
@@ -2140,12 +2140,12 @@
 				"packssdw %%mm2, %%mm0		\n\t"
 				"psraw $7, %%mm0		\n\t"
 
-				"movd 12(%0, %%ebx), %%mm4	\n\t"
-				"movd 15(%0, %%ebx), %%mm1	\n\t"
+				"movd 12(%0, %%"REG_b"), %%mm4	\n\t"
+				"movd 15(%0, %%"REG_b"), %%mm1	\n\t"
 				"punpcklbw %%mm7, %%mm4		\n\t"
 				"punpcklbw %%mm7, %%mm1		\n\t"
-				"movd 18(%0, %%ebx), %%mm2	\n\t"
-				"movd 21(%0, %%ebx), %%mm3	\n\t"
+				"movd 18(%0, %%"REG_b"), %%mm2	\n\t"
+				"movd 21(%0, %%"REG_b"), %%mm3	\n\t"
 				"punpcklbw %%mm7, %%mm2		\n\t"
 				"punpcklbw %%mm7, %%mm3		\n\t"
 				"pmaddwd %%mm6, %%mm4		\n\t"
@@ -2162,39 +2162,39 @@
 				"packssdw %%mm3, %%mm2		\n\t"
 				"pmaddwd %%mm5, %%mm4		\n\t"
 				"pmaddwd %%mm5, %%mm2		\n\t"
-				"addl $24, %%ebx		\n\t"
+				"add $24, %%"REG_b"		\n\t"
 				"packssdw %%mm2, %%mm4		\n\t"
 				"psraw $7, %%mm4		\n\t"
 
 				"packuswb %%mm4, %%mm0		\n\t"
 				"paddusb "MANGLE(bgr2YOffset)", %%mm0	\n\t"
 
-				MOVNTQ" %%mm0, (%1, %%eax)	\n\t"
-				"addl $8, %%eax			\n\t"
+				MOVNTQ" %%mm0, (%1, %%"REG_a")	\n\t"
+				"add $8, %%"REG_a"		\n\t"
 				" js 1b				\n\t"
-				: : "r" (src+width*3), "r" (ydst+width), "g" (-width)
-				: "%eax", "%ebx"
+				: : "r" (src+width*3), "r" (ydst+width), "g" ((long)-width)
+				: "%"REG_a, "%"REG_b
 			);
 			ydst += lumStride;
 			src  += srcStride;
 		}
 		src -= srcStride*2;
 		asm volatile(
-			"movl %4, %%eax			\n\t"
+			"mov %4, %%"REG_a"		\n\t"
 			"movq "MANGLE(w1111)", %%mm5		\n\t"
 			"movq "MANGLE(bgr2UCoeff)", %%mm6		\n\t"
 			"pxor %%mm7, %%mm7		\n\t"
-			"leal (%%eax, %%eax, 2), %%ebx	\n\t"
-			"addl %%ebx, %%ebx		\n\t"
+			"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
+			"add %%"REG_b", %%"REG_b"	\n\t"
 			".balign 16			\n\t"
 			"1:				\n\t"
-			PREFETCH" 64(%0, %%ebx)		\n\t"
-			PREFETCH" 64(%1, %%ebx)		\n\t"
+			PREFETCH" 64(%0, %%"REG_b")	\n\t"
+			PREFETCH" 64(%1, %%"REG_b")	\n\t"
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
-			"movq (%0, %%ebx), %%mm0	\n\t"
-			"movq (%1, %%ebx), %%mm1	\n\t"
-			"movq 6(%0, %%ebx), %%mm2	\n\t"
-			"movq 6(%1, %%ebx), %%mm3	\n\t"
+			"movq (%0, %%"REG_b"), %%mm0	\n\t"
+			"movq (%1, %%"REG_b"), %%mm1	\n\t"
+			"movq 6(%0, %%"REG_b"), %%mm2	\n\t"
+			"movq 6(%1, %%"REG_b"), %%mm3	\n\t"
 			PAVGB" %%mm1, %%mm0		\n\t"
 			PAVGB" %%mm3, %%mm2		\n\t"
 			"movq %%mm0, %%mm1		\n\t"
@@ -2206,10 +2206,10 @@
 			"punpcklbw %%mm7, %%mm0		\n\t"
 			"punpcklbw %%mm7, %%mm2		\n\t"
 #else
-			"movd (%0, %%ebx), %%mm0	\n\t"
-			"movd (%1, %%ebx), %%mm1	\n\t"
-			"movd 3(%0, %%ebx), %%mm2	\n\t"
-			"movd 3(%1, %%ebx), %%mm3	\n\t"
+			"movd (%0, %%"REG_b"), %%mm0	\n\t"
+			"movd (%1, %%"REG_b"), %%mm1	\n\t"
+			"movd 3(%0, %%"REG_b"), %%mm2	\n\t"
+			"movd 3(%1, %%"REG_b"), %%mm3	\n\t"
 			"punpcklbw %%mm7, %%mm0		\n\t"
 			"punpcklbw %%mm7, %%mm1		\n\t"
 			"punpcklbw %%mm7, %%mm2		\n\t"
@@ -2217,10 +2217,10 @@
 			"paddw %%mm1, %%mm0		\n\t"
 			"paddw %%mm3, %%mm2		\n\t"
 			"paddw %%mm2, %%mm0		\n\t"
-			"movd 6(%0, %%ebx), %%mm4	\n\t"
-			"movd 6(%1, %%ebx), %%mm1	\n\t"
-			"movd 9(%0, %%ebx), %%mm2	\n\t"
-			"movd 9(%1, %%ebx), %%mm3	\n\t"
+			"movd 6(%0, %%"REG_b"), %%mm4	\n\t"
+			"movd 6(%1, %%"REG_b"), %%mm1	\n\t"
+			"movd 9(%0, %%"REG_b"), %%mm2	\n\t"
+			"movd 9(%1, %%"REG_b"), %%mm3	\n\t"
 			"punpcklbw %%mm7, %%mm4		\n\t"
 			"punpcklbw %%mm7, %%mm1		\n\t"
 			"punpcklbw %%mm7, %%mm2		\n\t"
@@ -2252,10 +2252,10 @@
 			"psraw $7, %%mm0		\n\t"
 
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
-			"movq 12(%0, %%ebx), %%mm4	\n\t"
-			"movq 12(%1, %%ebx), %%mm1	\n\t"
-			"movq 18(%0, %%ebx), %%mm2	\n\t"
-			"movq 18(%1, %%ebx), %%mm3	\n\t"
+			"movq 12(%0, %%"REG_b"), %%mm4	\n\t"
+			"movq 12(%1, %%"REG_b"), %%mm1	\n\t"
+			"movq 18(%0, %%"REG_b"), %%mm2	\n\t"
+			"movq 18(%1, %%"REG_b"), %%mm3	\n\t"
 			PAVGB" %%mm1, %%mm4		\n\t"
 			PAVGB" %%mm3, %%mm2		\n\t"
 			"movq %%mm4, %%mm1		\n\t"
@@ -2267,10 +2267,10 @@
 			"punpcklbw %%mm7, %%mm4		\n\t"
 			"punpcklbw %%mm7, %%mm2		\n\t"
 #else
-			"movd 12(%0, %%ebx), %%mm4	\n\t"
-			"movd 12(%1, %%ebx), %%mm1	\n\t"
-			"movd 15(%0, %%ebx), %%mm2	\n\t"
-			"movd 15(%1, %%ebx), %%mm3	\n\t"
+			"movd 12(%0, %%"REG_b"), %%mm4	\n\t"
+			"movd 12(%1, %%"REG_b"), %%mm1	\n\t"
+			"movd 15(%0, %%"REG_b"), %%mm2	\n\t"
+			"movd 15(%1, %%"REG_b"), %%mm3	\n\t"
 			"punpcklbw %%mm7, %%mm4		\n\t"
 			"punpcklbw %%mm7, %%mm1		\n\t"
 			"punpcklbw %%mm7, %%mm2		\n\t"
@@ -2278,10 +2278,10 @@
 			"paddw %%mm1, %%mm4		\n\t"
 			"paddw %%mm3, %%mm2		\n\t"
 			"paddw %%mm2, %%mm4		\n\t"
-			"movd 18(%0, %%ebx), %%mm5	\n\t"
-			"movd 18(%1, %%ebx), %%mm1	\n\t"
-			"movd 21(%0, %%ebx), %%mm2	\n\t"
-			"movd 21(%1, %%ebx), %%mm3	\n\t"
+			"movd 18(%0, %%"REG_b"), %%mm5	\n\t"
+			"movd 18(%1, %%"REG_b"), %%mm1	\n\t"
+			"movd 21(%0, %%"REG_b"), %%mm2	\n\t"
+			"movd 21(%1, %%"REG_b"), %%mm3	\n\t"
 			"punpcklbw %%mm7, %%mm5		\n\t"
 			"punpcklbw %%mm7, %%mm1		\n\t"
 			"punpcklbw %%mm7, %%mm2		\n\t"
@@ -2310,7 +2310,7 @@
 			"packssdw %%mm3, %%mm1		\n\t"
 			"pmaddwd %%mm5, %%mm4		\n\t"
 			"pmaddwd %%mm5, %%mm1		\n\t"
-			"addl $24, %%ebx		\n\t"
+			"add $24, %%"REG_b"		\n\t"
 			"packssdw %%mm1, %%mm4		\n\t" // V3 V2 U3 U2
 			"psraw $7, %%mm4		\n\t"
 
@@ -2319,14 +2319,13 @@
 			"punpckhdq %%mm4, %%mm1		\n\t"
 			"packsswb %%mm1, %%mm0		\n\t"
 			"paddb "MANGLE(bgr2UVOffset)", %%mm0	\n\t"
-
-			"movd %%mm0, (%2, %%eax)	\n\t"
+			"movd %%mm0, (%2, %%"REG_a")	\n\t"
 			"punpckhdq %%mm0, %%mm0		\n\t"
-			"movd %%mm0, (%3, %%eax)	\n\t"
-			"addl $4, %%eax			\n\t"
+			"movd %%mm0, (%3, %%"REG_a")	\n\t"
+			"add $4, %%"REG_a"		\n\t"
 			" js 1b				\n\t"
-			: : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
-			: "%eax", "%ebx"
+			: : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" ((long)-chromWidth)
+			: "%"REG_a, "%"REG_b
 		);
 
 		udst += chromStride;
@@ -2403,48 +2402,48 @@
 #ifdef HAVE_MMX
 #ifdef HAVE_SSE2
 		asm(
-			"xorl %%eax, %%eax		\n\t"
+			"xor %%"REG_a", %%"REG_a"	\n\t"
 			"1:				\n\t"
-			PREFETCH" 64(%1, %%eax)		\n\t"
-			PREFETCH" 64(%2, %%eax)		\n\t"
-			"movdqa (%1, %%eax), %%xmm0	\n\t"
-			"movdqa (%1, %%eax), %%xmm1	\n\t"
-			"movdqa (%2, %%eax), %%xmm2	\n\t"
+			PREFETCH" 64(%1, %%"REG_a")	\n\t"
+			PREFETCH" 64(%2, %%"REG_a")	\n\t"
+			"movdqa (%1, %%"REG_a"), %%xmm0	\n\t"
+			"movdqa (%1, %%"REG_a"), %%xmm1	\n\t"
+			"movdqa (%2, %%"REG_a"), %%xmm2	\n\t"
 			"punpcklbw %%xmm2, %%xmm0	\n\t"
 			"punpckhbw %%xmm2, %%xmm1	\n\t"
-			"movntdq %%xmm0, (%0, %%eax, 2)	\n\t"
-			"movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
-			"addl $16, %%eax			\n\t"
-			"cmpl %3, %%eax			\n\t"
+			"movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
+			"movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
+			"add $16, %%"REG_a"		\n\t"
+			"cmp %3, %%"REG_a"		\n\t"
 			" jb 1b				\n\t"
-			::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
-			: "memory", "%eax"
+			::"r"(dest), "r"(src1), "r"(src2), "r" ((long)width-15)
+			: "memory", "%"REG_a""
 		);
 #else
 		asm(
-			"xorl %%eax, %%eax		\n\t"
+			"xor %%"REG_a", %%"REG_a"	\n\t"
 			"1:				\n\t"
-			PREFETCH" 64(%1, %%eax)		\n\t"
-			PREFETCH" 64(%2, %%eax)		\n\t"
-			"movq (%1, %%eax), %%mm0	\n\t"
-			"movq 8(%1, %%eax), %%mm2	\n\t"
+			PREFETCH" 64(%1, %%"REG_a")	\n\t"
+			PREFETCH" 64(%2, %%"REG_a")	\n\t"
+			"movq (%1, %%"REG_a"), %%mm0	\n\t"
+			"movq 8(%1, %%"REG_a"), %%mm2	\n\t"
 			"movq %%mm0, %%mm1		\n\t"
 			"movq %%mm2, %%mm3		\n\t"
-			"movq (%2, %%eax), %%mm4	\n\t"
-			"movq 8(%2, %%eax), %%mm5	\n\t"
+			"movq (%2, %%"REG_a"), %%mm4	\n\t"
+			"movq 8(%2, %%"REG_a"), %%mm5	\n\t"
 			"punpcklbw %%mm4, %%mm0		\n\t"
 			"punpckhbw %%mm4, %%mm1		\n\t"
 			"punpcklbw %%mm5, %%mm2		\n\t"
 			"punpckhbw %%mm5, %%mm3		\n\t"
-			MOVNTQ" %%mm0, (%0, %%eax, 2)	\n\t"
-			MOVNTQ" %%mm1, 8(%0, %%eax, 2)	\n\t"
-			MOVNTQ" %%mm2, 16(%0, %%eax, 2)	\n\t"
-			MOVNTQ" %%mm3, 24(%0, %%eax, 2)	\n\t"
-			"addl $16, %%eax			\n\t"
-			"cmpl %3, %%eax			\n\t"
+			MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
+			MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
+			MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
+			MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
+			"add $16, %%"REG_a"		\n\t"
+			"cmp %3, %%"REG_a"		\n\t"
 			" jb 1b				\n\t"
-			::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
-			: "memory", "%eax"
+			::"r"(dest), "r"(src1), "r"(src2), "r" ((long)width-15)
+			: "memory", "%"REG_a
 		);
 #endif
 		for(w= (width&(~15)); w < width; w++)
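For orientation, both the SSE2 and MMX blocks above compute a plain byte interleave; a C reference of the same operation (a sketch, not project code):

#include <stdint.h>

/* dest[2i]=src1[i], dest[2i+1]=src2[i] - what punpcklbw/punpckhbw plus the
   paired stores achieve 16 (MMX) or 32 (SSE2) output bytes at a time. */
static void interleave_bytes_ref(uint8_t *dest, const uint8_t *src1,
                                 const uint8_t *src2, long width)
{
	long w;
	for(w=0; w<width; w++){
		dest[2*w    ]= src1[w];
		dest[2*w + 1]= src2[w];
	}
}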
@@ -2582,7 +2581,7 @@
 			int srcStride1, int srcStride2,
 			int srcStride3, int dstStride)
 {
-    unsigned y,x,w,h;
+    unsigned long y,x,w,h;
     w=width/2; h=height;
     for(y=0;y<h;y++){
 	const uint8_t* yp=src1+srcStride1*y;
postproc/swscale-example.c
@@ -104,6 +104,6 @@
 	sws_scale(dstContext, src, srcStride, 0, srcH, dst, dstStride);
 	sws_scale(outContext, dst, dstStride, 0, dstH, out, refStride);
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 	asm volatile ("emms\n\t");
 #endif
@@ -199,13 +199,13 @@
 			rgb_data[ x + y*4*W]= random();
 		}
 	}
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 	sws_rgb2rgb_init(SWS_CPU_CAPS_MMX*0);
 #else
 	sws_rgb2rgb_init(0);
 #endif
 	sws_scale(sws, rgb_src, rgb_stride, 0, H   , src, stride);
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 	asm volatile ("emms\n\t");
 #endif
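The emms here is not optional: MMX registers alias the x87 floating-point stack, so every MMX sequence must end with emms before ordinary FP code runs again; broadening the #if keeps that true on x86_64 too. The pattern as a tiny wrapper (a sketch of the idiom, not project code):

/* Reset the x87 tag word after MMX use, on either x86 flavor. */
static void end_mmx(void)
{
#if defined(ARCH_X86) || defined(ARCH_X86_64)
	asm volatile ("emms\n\t");
#endif
}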
postproc/swscale.c
@@ -145,7 +145,7 @@
 #define MIN(a,b) ((a) > (b) ? (b) : (a))
 #define MAX(a,b) ((a) < (b) ? (b) : (a))
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 static uint64_t attribute_used __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
 static uint64_t attribute_used __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
 static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
@@ -204,7 +204,7 @@
 extern const uint8_t dither_8x8_73[8][8];
 extern const uint8_t dither_8x8_220[8][8];
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 void in_asm_used_var_warning_killer()
 {
  volatile int i= bF8+bFC+w10+
@@ -679,7 +679,7 @@
 #endif //HAVE_ALTIVEC
 #endif //ARCH_POWERPC
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 
 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 #define COMPILE_MMX
@@ -692,7 +692,7 @@
 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 #define COMPILE_3DNOW
 #endif
-#endif //ARCH_X86
+#endif //ARCH_X86 || ARCH_X86_64
 
 #undef HAVE_MMX
 #undef HAVE_MMX2
@@ -716,7 +716,7 @@
 #endif
 #endif //ARCH_POWERPC
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 
 //X86 versions
 /*
@@ -758,7 +758,7 @@
 #include "swscale_template.c"
 #endif
 
-#endif //ARCH_X86
+#endif //ARCH_X86 || ARCH_X86_64
 
 // minor note: the HAVE_xyz is messed up after that line so don't use it
 
@@ -783,7 +783,7 @@
 	int minFilterSize;
 	double *filter=NULL;
 	double *filter2=NULL;
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 	if(flags & SWS_CPU_CAPS_MMX)
 		asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
 #endif
@@ -1142,17 +1142,17 @@
 	free(filter);
 }
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
 {
 	uint8_t *fragmentA;
-	int imm8OfPShufW1A;
-	int imm8OfPShufW2A;
-	int fragmentLengthA;
+	long imm8OfPShufW1A;
+	long imm8OfPShufW2A;
+	long fragmentLengthA;
 	uint8_t *fragmentB;
-	int imm8OfPShufW1B;
-	int imm8OfPShufW2B;
-	int fragmentLengthB;
+	long imm8OfPShufW1B;
+	long imm8OfPShufW2B;
+	long fragmentLengthB;
 	int fragmentPos;
 
 	int xpos, i;
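These fields switch from int to long because the asm below stores code addresses into them ("lea 0b, %0" and friends) and then subtracts pairs to get fragment lengths and patch offsets; an address only fits an int on 32-bit hosts. C-level equivalent of the arithmetic (hypothetical labels, for illustration only):

#include <stdint.h>

extern uint8_t fragment_start[];  /* hypothetical: start label of the code */
extern uint8_t fragment_end[];    /* hypothetical: end label of the code   */

/* long, not int: a difference of two code addresses is pointer-sized. */
static long fragment_length(void)
{
	return (long)(fragment_end - fragment_start);
}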
@@ -1165,9 +1165,9 @@
 		"jmp 9f				\n\t"
 	// Begin
 		"0:				\n\t"
-		"movq (%%edx, %%eax), %%mm3	\n\t" 
-		"movd (%%ecx, %%esi), %%mm0	\n\t" 
-		"movd 1(%%ecx, %%esi), %%mm1	\n\t"
+		"movq (%%"REG_d", %%"REG_a"), %%mm3\n\t" 
+		"movd (%%"REG_c", %%"REG_S"), %%mm0\n\t" 
+		"movd 1(%%"REG_c", %%"REG_S"), %%mm1\n\t"
 		"punpcklbw %%mm7, %%mm1		\n\t"
 		"punpcklbw %%mm7, %%mm0		\n\t"
 		"pshufw $0xFF, %%mm1, %%mm1	\n\t"
@@ -1175,26 +1175,26 @@
 		"pshufw $0xFF, %%mm0, %%mm0	\n\t"
 		"2:				\n\t"
 		"psubw %%mm1, %%mm0		\n\t"
-		"movl 8(%%ebx, %%eax), %%esi	\n\t"
+		"mov 8(%%"REG_b", %%"REG_a"), %%"REG_S"\n\t"
 		"pmullw %%mm3, %%mm0		\n\t"
 		"psllw $7, %%mm1		\n\t"
 		"paddw %%mm1, %%mm0		\n\t"
 
-		"movq %%mm0, (%%edi, %%eax)	\n\t"
+		"movq %%mm0, (%%"REG_D", %%"REG_a")\n\t"
 
-		"addl $8, %%eax			\n\t"
+		"add $8, %%"REG_a"		\n\t"
 	// End
 		"9:				\n\t"
 //		"int $3\n\t"
-		"leal 0b, %0			\n\t"
-		"leal 1b, %1			\n\t"
-		"leal 2b, %2			\n\t"
-		"decl %1			\n\t"
-		"decl %2			\n\t"
-		"subl %0, %1			\n\t"
-		"subl %0, %2			\n\t"
-		"leal 9b, %3			\n\t"
-		"subl %0, %3			\n\t"
+		"lea 0b, %0			\n\t"
+		"lea 1b, %1			\n\t"
+		"lea 2b, %2			\n\t"
+		"dec %1				\n\t"
+		"dec %2				\n\t"
+		"sub %0, %1			\n\t"
+		"sub %0, %2			\n\t"
+		"lea 9b, %3			\n\t"
+		"sub %0, %3			\n\t"
 
 
 		:"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
@@ -1205,34 +1205,34 @@
 		"jmp 9f				\n\t"
 	// Begin
 		"0:				\n\t"
-		"movq (%%edx, %%eax), %%mm3	\n\t" 
-		"movd (%%ecx, %%esi), %%mm0	\n\t" 
+		"movq (%%"REG_d", %%"REG_a"), %%mm3\n\t" 
+		"movd (%%"REG_c", %%"REG_S"), %%mm0\n\t" 
 		"punpcklbw %%mm7, %%mm0		\n\t"
 		"pshufw $0xFF, %%mm0, %%mm1	\n\t"
 		"1:				\n\t"
 		"pshufw $0xFF, %%mm0, %%mm0	\n\t"
 		"2:				\n\t"
 		"psubw %%mm1, %%mm0		\n\t"
-		"movl 8(%%ebx, %%eax), %%esi	\n\t"
+		"mov 8(%%"REG_b", %%"REG_a"), %%"REG_S"\n\t"
 		"pmullw %%mm3, %%mm0		\n\t"
 		"psllw $7, %%mm1		\n\t"
 		"paddw %%mm1, %%mm0		\n\t"
 
-		"movq %%mm0, (%%edi, %%eax)	\n\t"
+		"movq %%mm0, (%%"REG_D", %%"REG_a")\n\t"
 
-		"addl $8, %%eax			\n\t"
+		"add $8, %%"REG_a"		\n\t"
 	// End
 		"9:				\n\t"
 //		"int $3\n\t"
-		"leal 0b, %0			\n\t"
-		"leal 1b, %1			\n\t"
-		"leal 2b, %2			\n\t"
-		"decl %1			\n\t"
-		"decl %2			\n\t"
-		"subl %0, %1			\n\t"
-		"subl %0, %2			\n\t"
-		"leal 9b, %3			\n\t"
-		"subl %0, %3			\n\t"
+		"lea 0b, %0			\n\t"
+		"lea 1b, %1			\n\t"
+		"lea 2b, %2			\n\t"
+		"dec %1				\n\t"
+		"dec %2				\n\t"
+		"sub %0, %1			\n\t"
+		"sub %0, %2			\n\t"
+		"lea 9b, %3			\n\t"
+		"sub %0, %3			\n\t"
 
 
 		:"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
@@ -1313,7 +1313,7 @@
 	}
 	filterPos[i/2]= xpos>>16; // needed to jump to the next part
 }
-#endif // ARCH_X86
+#endif // ARCH_X86 || ARCH_X86_64
 
 static void globalInit(){
     // generating tables:
@@ -1327,7 +1327,7 @@
 static SwsFunc getSwsFunc(int flags){
     
 #ifdef RUNTIME_CPUDETECT
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 	// ordered per speed fasterst first
 	if(flags & SWS_CPU_CAPS_MMX2)
 		return swScale_MMX2;
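getSwsFunc() picks the fastest compiled-in scaler at run time from the capability flags; only the MMX2 arm is visible in this hunk. A condensed sketch of the dispatch shape (the remaining arms and the swScale_C fallback name are assumptions beyond what the excerpt shows):

/* Condensed sketch of the runtime dispatch; names past swScale_MMX2 are
   assumed, not taken from the visible hunk. */
static SwsFunc getSwsFunc_sketch(int flags)
{
#if defined(RUNTIME_CPUDETECT) && (defined(ARCH_X86) || defined(ARCH_X86_64))
	if(flags & SWS_CPU_CAPS_MMX2)
		return swScale_MMX2;   /* ordered per speed, fastest first */
	/* ... 3DNow and plain-MMX arms follow in the real function ... */
#endif
	return swScale_C;              /* assumed name for the C fallback */
}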
@@ -1755,7 +1755,7 @@
 	int unscaled, needsDither;
 	int srcFormat, dstFormat;
 	SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 	if(flags & SWS_CPU_CAPS_MMX)
 		asm volatile("emms\n\t"::: "memory");
 #endif
@@ -1995,7 +1995,7 @@
 				 (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
 				 srcFilter->chrH, dstFilter->chrH, c->param);
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 // can't downscale !!!
 		if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
 		{
@@ -2136,7 +2136,7 @@
 		}
 		else
 		{
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 			MSG_V("SwScaler: using X86-Asm scaler for horizontal scaling\n");
 #else
 			if(flags & SWS_FAST_BILINEAR)
postproc/swscale_template.c
@@ -16,6 +16,7 @@
     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
 
+#undef REAL_MOVNTQ
 #undef MOVNTQ
 #undef PAVGB
 #undef PREFETCH
@@ -54,29 +55,30 @@
 #endif
 
 #ifdef HAVE_MMX2
-#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
+#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
 #else
-#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
+#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
 #endif
+#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
 
 #ifdef HAVE_ALTIVEC
 #include "swscale_altivec_template.c"
 #endif
 
 #define YSCALEYUV2YV12X(x, offset) \
-			"xorl %%eax, %%eax		\n\t"\
+			"xor %%"REG_a", %%"REG_a"	\n\t"\
 			"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
 			"movq %%mm3, %%mm4		\n\t"\
-			"leal " offset "(%0), %%edx	\n\t"\
-			"movl (%%edx), %%esi		\n\t"\
+			"lea " offset "(%0), %%"REG_d"	\n\t"\
+			"mov (%%"REG_d"), %%"REG_S"	\n\t"\
 			".balign 16			\n\t" /* FIXME Unroll? */\
 			"1:				\n\t"\
-			"movq 8(%%edx), %%mm0		\n\t" /* filterCoeff */\
-			"movq " #x "(%%esi, %%eax, 2), %%mm2	\n\t" /* srcData */\
-			"movq 8+" #x "(%%esi, %%eax, 2), %%mm5	\n\t" /* srcData */\
-			"addl $16, %%edx		\n\t"\
-			"movl (%%edx), %%esi		\n\t"\
-			"testl %%esi, %%esi		\n\t"\
+			"movq 8(%%"REG_d"), %%mm0	\n\t" /* filterCoeff */\
+			"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
+			"movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
+			"add $16, %%"REG_d"		\n\t"\
+			"mov (%%"REG_d"), %%"REG_S"	\n\t"\
+			"test %%"REG_S", %%"REG_S"	\n\t"\
 			"pmulhw %%mm0, %%mm2		\n\t"\
 			"pmulhw %%mm0, %%mm5		\n\t"\
 			"paddw %%mm2, %%mm3		\n\t"\
			"psraw $3, %%mm3		\n\t"\
86 88
			"psraw $3, %%mm4		\n\t"\
87 89
			"packuswb %%mm4, %%mm3		\n\t"\
88
			MOVNTQ(%%mm3, (%1, %%eax))\
89
			"addl $8, %%eax			\n\t"\
90
			"cmpl %2, %%eax			\n\t"\
90
			MOVNTQ(%%mm3, (%1, %%REGa))\
91
			"add $8, %%"REG_a"		\n\t"\
92
			"cmp %2, %%"REG_a"		\n\t"\
91 93
			"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
92 94
			"movq %%mm3, %%mm4		\n\t"\
93
			"leal " offset "(%0), %%edx	\n\t"\
94
			"movl (%%edx), %%esi		\n\t"\
95
			"lea " offset "(%0), %%"REG_d"	\n\t"\
96
			"mov (%%"REG_d"), %%"REG_S"	\n\t"\
95 97
			"jb 1b				\n\t"
96 98

  
97 99
#define YSCALEYUV2YV121 \
98
			"movl %2, %%eax			\n\t"\
100
			"mov %2, %%"REG_a"		\n\t"\
99 101
			".balign 16			\n\t" /* FIXME Unroll? */\
100 102
			"1:				\n\t"\
101
			"movq (%0, %%eax, 2), %%mm0	\n\t"\
102
			"movq 8(%0, %%eax, 2), %%mm1	\n\t"\
103
			"movq (%0, %%"REG_a", 2), %%mm0	\n\t"\
104
			"movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
103 105
			"psraw $7, %%mm0		\n\t"\
104 106
			"psraw $7, %%mm1		\n\t"\
105 107
			"packuswb %%mm1, %%mm0		\n\t"\
106
			MOVNTQ(%%mm0, (%1, %%eax))\
107
			"addl $8, %%eax			\n\t"\
108
			MOVNTQ(%%mm0, (%1, %%REGa))\
109
			"add $8, %%"REG_a"		\n\t"\
108 110
			"jnc 1b				\n\t"
109 111

  
110 112
/*
......
115 117
			: "%eax", "%ebx", "%ecx", "%edx", "%esi"
116 118
*/
117 119
#define YSCALEYUV2PACKEDX \
118
		"xorl %%eax, %%eax		\n\t"\
120
		"xor %%"REG_a", %%"REG_a"	\n\t"\
119 121
		".balign 16			\n\t"\
120 122
		"nop				\n\t"\
121 123
		"1:				\n\t"\
122
		"leal "CHR_MMX_FILTER_OFFSET"(%0), %%edx	\n\t"\
123
		"movl (%%edx), %%esi		\n\t"\
124
		"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
125
		"mov (%%"REG_d"), %%"REG_S"	\n\t"\
124 126
		"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
125 127
		"movq %%mm3, %%mm4		\n\t"\
126 128
		".balign 16			\n\t"\
127 129
		"2:				\n\t"\
128
		"movq 8(%%edx), %%mm0		\n\t" /* filterCoeff */\
129
		"movq (%%esi, %%eax), %%mm2	\n\t" /* UsrcData */\
130
		"movq 4096(%%esi, %%eax), %%mm5	\n\t" /* VsrcData */\
131
		"addl $16, %%edx		\n\t"\
132
		"movl (%%edx), %%esi		\n\t"\
130
		"movq 8(%%"REG_d"), %%mm0	\n\t" /* filterCoeff */\
131
		"movq (%%"REG_S", %%"REG_a"), %%mm2	\n\t" /* UsrcData */\
132
		"movq 4096(%%"REG_S", %%"REG_a"), %%mm5	\n\t" /* VsrcData */\
133
		"add $16, %%"REG_d"		\n\t"\
134
		"mov (%%"REG_d"), %%"REG_S"	\n\t"\
133 135
		"pmulhw %%mm0, %%mm2		\n\t"\
134 136
		"pmulhw %%mm0, %%mm5		\n\t"\
135 137
		"paddw %%mm2, %%mm3		\n\t"\
136 138
		"paddw %%mm5, %%mm4		\n\t"\
137
		"testl %%esi, %%esi		\n\t"\
139
		"test %%"REG_S", %%"REG_S"	\n\t"\
138 140
		" jnz 2b			\n\t"\
139 141
\
140
		"leal "LUM_MMX_FILTER_OFFSET"(%0), %%edx	\n\t"\
141
		"movl (%%edx), %%esi		\n\t"\
142
		"lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
143
		"mov (%%"REG_d"), %%"REG_S"	\n\t"\
142 144
		"movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
143 145
		"movq %%mm1, %%mm7		\n\t"\
144 146
		".balign 16			\n\t"\
145 147
		"2:				\n\t"\
146
		"movq 8(%%edx), %%mm0		\n\t" /* filterCoeff */\
147
		"movq (%%esi, %%eax, 2), %%mm2	\n\t" /* Y1srcData */\
148
		"movq 8(%%esi, %%eax, 2), %%mm5	\n\t" /* Y2srcData */\
149
		"addl $16, %%edx		\n\t"\
150
		"movl (%%edx), %%esi		\n\t"\
148
		"movq 8(%%"REG_d"), %%mm0	\n\t" /* filterCoeff */\
149
		"movq (%%"REG_S", %%"REG_a", 2), %%mm2	\n\t" /* Y1srcData */\
150
		"movq 8(%%"REG_S", %%"REG_a", 2), %%mm5	\n\t" /* Y2srcData */\
151
		"add $16, %%"REG_d"		\n\t"\
152
		"mov (%%"REG_d"), %%"REG_S"	\n\t"\
151 153
		"pmulhw %%mm0, %%mm2		\n\t"\
152 154
		"pmulhw %%mm0, %%mm5		\n\t"\
153 155
		"paddw %%mm2, %%mm1		\n\t"\
154 156
		"paddw %%mm5, %%mm7		\n\t"\
155
		"testl %%esi, %%esi		\n\t"\
157
		"test %%"REG_S", %%"REG_S"	\n\t"\
156 158
		" jnz 2b			\n\t"\
157 159

  
158 160

  
......
202 204
		"movd %7, %%mm5			\n\t" /*uvalpha1*/\
203 205
		"punpcklwd %%mm5, %%mm5		\n\t"\
204 206
		"punpcklwd %%mm5, %%mm5		\n\t"\
205
		"xorl %%eax, %%eax		\n\t"\
207
		"xor %%"REG_a", %%"REG_a"		\n\t"\
206 208
		".balign 16			\n\t"\
207 209
		"1:				\n\t"\
208
		"movq (%0, %%eax, 2), %%mm0	\n\t" /*buf0[eax]*/\
209
		"movq (%1, %%eax, 2), %%mm1	\n\t" /*buf1[eax]*/\
210
		"movq (%2, %%eax,2), %%mm2	\n\t" /* uvbuf0[eax]*/\
211
		"movq (%3, %%eax,2), %%mm3	\n\t" /* uvbuf1[eax]*/\
210
		"movq (%0, %%"REG_a", 2), %%mm0	\n\t" /*buf0[eax]*/\
211
		"movq (%1, %%"REG_a", 2), %%mm1	\n\t" /*buf1[eax]*/\
212
		"movq (%2, %%"REG_a",2), %%mm2	\n\t" /* uvbuf0[eax]*/\
213
		"movq (%3, %%"REG_a",2), %%mm3	\n\t" /* uvbuf1[eax]*/\
212 214
		"psubw %%mm1, %%mm0		\n\t" /* buf0[eax] - buf1[eax]*/\
213 215
		"psubw %%mm3, %%mm2		\n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
214 216
		"pmulhw %%mm6, %%mm0		\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
215 217
		"pmulhw %%mm5, %%mm2		\n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
216 218
		"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
217
		"movq 4096(%2, %%eax,2), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
219
		"movq 4096(%2, %%"REG_a",2), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
218 220
		"psraw $4, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
219 221
		"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
220
		"movq 4096(%3, %%eax,2), %%mm0	\n\t" /* uvbuf1[eax+2048]*/\
222
		"movq 4096(%3, %%"REG_a",2), %%mm0	\n\t" /* uvbuf1[eax+2048]*/\
221 223
		"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
222 224
		"psubw %%mm0, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
223 225
		"psubw "MANGLE(w80)", %%mm1	\n\t" /* 8(Y-16)*/\
......
248 250
		"packuswb %%mm1, %%mm1		\n\t"
249 251
#endif
250 252

  
251
#define YSCALEYUV2PACKED(index, c) \
253
#define REAL_YSCALEYUV2PACKED(index, c) \
252 254
		"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
253 255
		"movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
254 256
		"psraw $3, %%mm0		\n\t"\
255 257
		"psraw $3, %%mm1		\n\t"\
256 258
		"movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
257 259
		"movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
258
		"xorl "#index", "#index"		\n\t"\
260
		"xor "#index", "#index"		\n\t"\
259 261
		".balign 16			\n\t"\
260 262
		"1:				\n\t"\
261 263
		"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
......
284 286
		"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
285 287
		"paddw %%mm6, %%mm7		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
286 288
                
287
#define YSCALEYUV2RGB(index, c) \
288
		"xorl "#index", "#index"	\n\t"\
289
#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
290
                
291
#define REAL_YSCALEYUV2RGB(index, c) \
292
		"xor "#index", "#index"	\n\t"\
289 293
		".balign 16			\n\t"\
290 294
		"1:				\n\t"\
291 295
		"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
......
348 352
		"packuswb %%mm6, %%mm5		\n\t"\
349 353
		"packuswb %%mm3, %%mm4		\n\t"\
350 354
		"pxor %%mm7, %%mm7		\n\t"
355
#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
351 356
                
352
#define YSCALEYUV2PACKED1(index, c) \
353
		"xorl "#index", "#index"		\n\t"\
357
#define REAL_YSCALEYUV2PACKED1(index, c) \
358
		"xor "#index", "#index"		\n\t"\
354 359
		".balign 16			\n\t"\
355 360
		"1:				\n\t"\
356 361
		"movq (%2, "#index"), %%mm3	\n\t" /* uvbuf0[eax]*/\
......
362 367
		"psraw $7, %%mm1		\n\t" \
363 368
		"psraw $7, %%mm7		\n\t" \
364 369
                
365
#define YSCALEYUV2RGB1(index, c) \
366
		"xorl "#index", "#index"	\n\t"\
370
#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
371
                
372
#define REAL_YSCALEYUV2RGB1(index, c) \
373
		"xor "#index", "#index"	\n\t"\
367 374
		".balign 16			\n\t"\
368 375
		"1:				\n\t"\
369 376
		"movq (%2, "#index"), %%mm3	\n\t" /* uvbuf0[eax]*/\
......
409 416
		"packuswb %%mm6, %%mm5		\n\t"\
410 417
		"packuswb %%mm3, %%mm4		\n\t"\
411 418
		"pxor %%mm7, %%mm7		\n\t"
419
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
412 420

  
413
#define YSCALEYUV2PACKED1b(index, c) \
414
		"xorl "#index", "#index"		\n\t"\
421
#define REAL_YSCALEYUV2PACKED1b(index, c) \
422
		"xor "#index", "#index"		\n\t"\
415 423
		".balign 16			\n\t"\
416 424
		"1:				\n\t"\
417 425
		"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
......
426 434
		"movq 8(%0, "#index", 2), %%mm7	\n\t" /*buf0[eax]*/\
427 435
		"psraw $7, %%mm1		\n\t" \
428 436
		"psraw $7, %%mm7		\n\t" 
437
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
429 438
                
430 439
// do vertical chrominance interpolation
431
#define YSCALEYUV2RGB1b(index, c) \
432
		"xorl "#index", "#index"		\n\t"\
440
#define REAL_YSCALEYUV2RGB1b(index, c) \
441
		"xor "#index", "#index"		\n\t"\
433 442
		".balign 16			\n\t"\
434 443
		"1:				\n\t"\
435 444
		"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
......
479 488
		"packuswb %%mm6, %%mm5		\n\t"\
480 489
		"packuswb %%mm3, %%mm4		\n\t"\
481 490
		"pxor %%mm7, %%mm7		\n\t"
491
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
482 492

  
483
#define WRITEBGR32(dst, dstw, index) \
493
#define REAL_WRITEBGR32(dst, dstw, index) \
484 494
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
485 495
			"movq %%mm2, %%mm1		\n\t" /* B */\
486 496
			"movq %%mm5, %%mm6		\n\t" /* R */\
......
500 510
			MOVNTQ(%%mm1, 16(dst, index, 4))\
501 511
			MOVNTQ(%%mm3, 24(dst, index, 4))\
502 512
\
503
			"addl $8, "#index"		\n\t"\
504
			"cmpl "#dstw", "#index"		\n\t"\
513
			"add $8, "#index"		\n\t"\
514
			"cmp "#dstw", "#index"		\n\t"\
505 515
			" jb 1b				\n\t"
516
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
506 517

  
507
#define WRITEBGR16(dst, dstw, index) \
518
#define REAL_WRITEBGR16(dst, dstw, index) \
508 519
			"pand "MANGLE(bF8)", %%mm2	\n\t" /* B */\
509 520
			"pand "MANGLE(bFC)", %%mm4	\n\t" /* G */\
510 521
			"pand "MANGLE(bF8)", %%mm5	\n\t" /* R */\
......
527 538
			MOVNTQ(%%mm2, (dst, index, 2))\
528 539
			MOVNTQ(%%mm1, 8(dst, index, 2))\
529 540
\
530
			"addl $8, "#index"		\n\t"\
531
			"cmpl "#dstw", "#index"		\n\t"\
541
			"add $8, "#index"		\n\t"\
542
			"cmp "#dstw", "#index"		\n\t"\
532 543
			" jb 1b				\n\t"
544
#define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)
533 545

  
534
#define WRITEBGR15(dst, dstw, index) \
546
#define REAL_WRITEBGR15(dst, dstw, index) \
535 547
			"pand "MANGLE(bF8)", %%mm2	\n\t" /* B */\
536 548
			"pand "MANGLE(bF8)", %%mm4	\n\t" /* G */\
537 549
			"pand "MANGLE(bF8)", %%mm5	\n\t" /* R */\
......
555 567
			MOVNTQ(%%mm2, (dst, index, 2))\
556 568
			MOVNTQ(%%mm1, 8(dst, index, 2))\
557 569
\
558
			"addl $8, "#index"		\n\t"\
559
			"cmpl "#dstw", "#index"		\n\t"\
570
			"add $8, "#index"		\n\t"\
571
			"cmp "#dstw", "#index"		\n\t"\
560 572
			" jb 1b				\n\t"
573
#define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)
561 574

  
562 575
#define WRITEBGR24OLD(dst, dstw, index) \
563 576
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
......
609 622
			MOVNTQ(%%mm0, (dst))\
610 623
			MOVNTQ(%%mm2, 8(dst))\
611 624
			MOVNTQ(%%mm3, 16(dst))\
612
			"addl $24, "#dst"		\n\t"\
625
			"add $24, "#dst"		\n\t"\
613 626
\
614
			"addl $8, "#index"		\n\t"\
615
			"cmpl "#dstw", "#index"		\n\t"\
627
			"add $8, "#index"		\n\t"\
628
			"cmp "#dstw", "#index"		\n\t"\
616 629
			" jb 1b				\n\t"
617 630

  
618 631
#define WRITEBGR24MMX(dst, dstw, index) \
......
662 675
			"por %%mm3, %%mm5		\n\t" /* RGBRGBRG 2 */\
663 676
			MOVNTQ(%%mm5, 16(dst))\
664 677
\
665
			"addl $24, "#dst"		\n\t"\
678
			"add $24, "#dst"		\n\t"\
666 679
\
667
			"addl $8, "#index"			\n\t"\
668
			"cmpl "#dstw", "#index"			\n\t"\
680
			"add $8, "#index"			\n\t"\
681
			"cmp "#dstw", "#index"			\n\t"\
669 682
			" jb 1b				\n\t"
670 683

  
671 684
#define WRITEBGR24MMX2(dst, dstw, index) \
......
710 723
			"por %%mm3, %%mm6		\n\t"\
711 724
			MOVNTQ(%%mm6, 16(dst))\
712 725
\
713
			"addl $24, "#dst"		\n\t"\
726
			"add $24, "#dst"		\n\t"\
714 727
\
715
			"addl $8, "#index"		\n\t"\
716
			"cmpl "#dstw", "#index"		\n\t"\
728
			"add $8, "#index"		\n\t"\
729
			"cmp "#dstw", "#index"		\n\t"\
717 730
			" jb 1b				\n\t"
718 731

  
719 732
#ifdef HAVE_MMX2
720 733
#undef WRITEBGR24
721
#define WRITEBGR24 WRITEBGR24MMX2
734
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
722 735
#else
723 736
#undef WRITEBGR24
724
#define WRITEBGR24 WRITEBGR24MMX
737
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
725 738
#endif
726 739

  
727
#define WRITEYUY2(dst, dstw, index) \
740
#define REAL_WRITEYUY2(dst, dstw, index) \
728 741
			"packuswb %%mm3, %%mm3		\n\t"\
729 742
			"packuswb %%mm4, %%mm4		\n\t"\
730 743
			"packuswb %%mm7, %%mm1		\n\t"\
......
736 749
			MOVNTQ(%%mm1, (dst, index, 2))\
737 750
			MOVNTQ(%%mm7, 8(dst, index, 2))\
738 751
\
739
			"addl $8, "#index"		\n\t"\
740
			"cmpl "#dstw", "#index"		\n\t"\
752
			"add $8, "#index"		\n\t"\
753
			"cmp "#dstw", "#index"		\n\t"\
741 754
			" jb 1b				\n\t"
755
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
742 756

  
743 757

  
744 758
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
......
751 765
		asm volatile(
752 766
				YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
753 767
				:: "r" (&c->redDither),
754
				"r" (uDest), "m" (chrDstW)
755
				: "%eax", "%edx", "%esi"
768
				"r" (uDest), "m" ((long)chrDstW)
769
				: "%"REG_a, "%"REG_d, "%"REG_S
756 770
			);
757 771

  
758 772
		asm volatile(
759 773
				YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
760 774
				:: "r" (&c->redDither),
761
				"r" (vDest), "m" (chrDstW)
762
				: "%eax", "%edx", "%esi"
775
				"r" (vDest), "m" ((long)chrDstW)
776
				: "%"REG_a, "%"REG_d, "%"REG_S
763 777
			);
764 778
	}
765 779

  
766 780
	asm volatile(
767 781
			YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
768 782
			:: "r" (&c->redDither),
769
			   "r" (dest), "m" (dstW)
770
			: "%eax", "%edx", "%esi"
783
			   "r" (dest), "m" ((long)dstW)
784
			: "%"REG_a, "%"REG_d, "%"REG_S
771 785
		);
772 786
#else
773 787
#ifdef HAVE_ALTIVEC
......
791 805
		asm volatile(
792 806
				YSCALEYUV2YV121
793 807
				:: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
794
				"g" (-chrDstW)
795
				: "%eax"
808
				"g" ((long)-chrDstW)
809
				: "%"REG_a
796 810
			);
797 811

  
798 812
		asm volatile(
799 813
				YSCALEYUV2YV121
800 814
				:: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
801
				"g" (-chrDstW)
802
				: "%eax"
815
				"g" ((long)-chrDstW)
816
				: "%"REG_a
803 817
			);
804 818
	}
805 819

  
806 820
	asm volatile(
807 821
		YSCALEYUV2YV121
808 822
		:: "r" (lumSrc + dstW), "r" (dest + dstW),
809
		"g" (-dstW)
810
		: "%eax"
823
		"g" ((long)-dstW)
824
		: "%"REG_a
811 825
	);
812 826
#else
813 827
	int i;
......
858 872
		{
859 873
			asm volatile(
860 874
				YSCALEYUV2RGBX
861
				WRITEBGR32(%4, %5, %%eax)
875
				WRITEBGR32(%4, %5, %%REGa)
862 876

  
863 877
			:: "r" (&c->redDither), 
864 878
			   "m" (dummy), "m" (dummy), "m" (dummy),
865 879
			   "r" (dest), "m" (dstW)
866
			: "%eax", "%edx", "%esi"
880
			: "%"REG_a, "%"REG_d, "%"REG_S
867 881
			);
868 882
		}
869 883
		break;
......
871 885
		{
872 886
			asm volatile(
873 887
				YSCALEYUV2RGBX
874
				"leal (%%eax, %%eax, 2), %%ebx	\n\t" //FIXME optimize
875
				"addl %4, %%ebx			\n\t"
876
				WRITEBGR24(%%ebx, %5, %%eax)
888
				"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
889
				"add %4, %%"REG_b"			\n\t"
890
				WRITEBGR24(%%REGb, %5, %%REGa)
877 891

  
878 892
			:: "r" (&c->redDither), 
879 893
			   "m" (dummy), "m" (dummy), "m" (dummy),
880 894
			   "r" (dest), "m" (dstW)
881
			: "%eax", "%ebx", "%edx", "%esi" //FIXME ebx
895
			: "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
882 896
			);
883 897
		}
884 898
		break;
......
893 907
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
894 908
#endif
895 909

  
896
				WRITEBGR15(%4, %5, %%eax)
910
				WRITEBGR15(%4, %5, %%REGa)
897 911

  
898 912
			:: "r" (&c->redDither), 
899 913
			   "m" (dummy), "m" (dummy), "m" (dummy),
900 914
			   "r" (dest), "m" (dstW)
901
			: "%eax", "%edx", "%esi"
... This diff was truncated because it exceeds the maximum size that can be displayed.
