Revision 5adf43e4

libavcodec/i386/dsputil_mmx.c
@@ -1519,28 +1519,23 @@
         LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
         LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\
 
-#define MMABS(a,z)\
+#define MMABS_MMX(a,z)\
     "pxor " #z ", " #z "              \n\t"\
     "pcmpgtw " #a ", " #z "           \n\t"\
     "pxor " #z ", " #a "              \n\t"\
     "psubw " #z ", " #a "             \n\t"
 
-#define MMABS_SUM(a,z, sum)\
-    "pxor " #z ", " #z "              \n\t"\
-    "pcmpgtw " #a ", " #z "           \n\t"\
-    "pxor " #z ", " #a "              \n\t"\
-    "psubw " #z ", " #a "             \n\t"\
-    "paddusw " #a ", " #sum "         \n\t"
-
 #define MMABS_MMX2(a,z)\
     "pxor " #z ", " #z "              \n\t"\
     "psubw " #a ", " #z "             \n\t"\
     "pmaxsw " #z ", " #a "            \n\t"
 
+#define MMABS_SUM_MMX(a,z, sum)\
+    MMABS_MMX(a,z)\
+    "paddusw " #a ", " #sum "         \n\t"
+
 #define MMABS_SUM_MMX2(a,z, sum)\
-    "pxor " #z ", " #z "              \n\t"\
-    "psubw " #a ", " #z "             \n\t"\
-    "pmaxsw " #z ", " #a "            \n\t"\
+    MMABS_MMX2(a,z)\
     "paddusw " #a ", " #sum "         \n\t"
 
 #define LOAD4(o, a, b, c, d)\
@@ -1555,178 +1550,117 @@
         "movq "#c", "#o"+32(%1)       \n\t"\
         "movq "#d", "#o"+48(%1)       \n\t"\
 
-static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
-    DECLARE_ALIGNED_8(uint64_t, temp[16]);
-    int sum=0;
-
-    assert(h==8);
-
-    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
-
-    asm volatile(
-        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
-        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
-
-        HADAMARD48
-
-        "movq %%mm7, 112(%1)            \n\t"
-
-        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
-        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
-
-        "movq 112(%1), %%mm7            \n\t"
-        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
-        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
-
-        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
-        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
-
-        HADAMARD48
-
-        "movq %%mm7, 120(%1)            \n\t"
-
-        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
-        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
-
-        "movq 120(%1), %%mm7            \n\t"
-        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
-        "movq %%mm7, %%mm5              \n\t"//FIXME remove
-        "movq %%mm6, %%mm7              \n\t"
-        "movq %%mm0, %%mm6              \n\t"
-//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
-
-        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
-//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
-
-        HADAMARD48
-        "movq %%mm7, 64(%1)             \n\t"
-        MMABS(%%mm0, %%mm7)
-        MMABS_SUM(%%mm1, %%mm7, %%mm0)
-        MMABS_SUM(%%mm2, %%mm7, %%mm0)
-        MMABS_SUM(%%mm3, %%mm7, %%mm0)
-        MMABS_SUM(%%mm4, %%mm7, %%mm0)
-        MMABS_SUM(%%mm5, %%mm7, %%mm0)
-        MMABS_SUM(%%mm6, %%mm7, %%mm0)
-        "movq 64(%1), %%mm1             \n\t"
-        MMABS_SUM(%%mm1, %%mm7, %%mm0)
-        "movq %%mm0, 64(%1)             \n\t"
-
-        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
-        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
-
-        HADAMARD48
-        "movq %%mm7, (%1)               \n\t"
-        MMABS(%%mm0, %%mm7)
-        MMABS_SUM(%%mm1, %%mm7, %%mm0)
-        MMABS_SUM(%%mm2, %%mm7, %%mm0)
-        MMABS_SUM(%%mm3, %%mm7, %%mm0)
-        MMABS_SUM(%%mm4, %%mm7, %%mm0)
-        MMABS_SUM(%%mm5, %%mm7, %%mm0)
-        MMABS_SUM(%%mm6, %%mm7, %%mm0)
-        "movq (%1), %%mm1               \n\t"
-        MMABS_SUM(%%mm1, %%mm7, %%mm0)
-        "movq 64(%1), %%mm1             \n\t"
-        MMABS_SUM(%%mm1, %%mm7, %%mm0)
-
-        "movq %%mm0, %%mm1              \n\t"
-        "psrlq $32, %%mm0               \n\t"
-        "paddusw %%mm1, %%mm0           \n\t"
-        "movq %%mm0, %%mm1              \n\t"
-        "psrlq $16, %%mm0               \n\t"
-        "paddusw %%mm1, %%mm0           \n\t"
-        "movd %%mm0, %0                 \n\t"
-
-        : "=r" (sum)
-        : "r"(temp)
-    );
-    return sum&0xFFFF;
-}
-
-static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
-    DECLARE_ALIGNED_8(uint64_t, temp[16]);
-    int sum=0;
-
-    assert(h==8);
-
-    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
-
-    asm volatile(
-        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
-        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
-
-        HADAMARD48
-
-        "movq %%mm7, 112(%1)            \n\t"
-
-        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
-        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
-
-        "movq 112(%1), %%mm7            \n\t"
-        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
-        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
-
-        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
-        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
-
-        HADAMARD48
-
-        "movq %%mm7, 120(%1)            \n\t"
-
-        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
-        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
-
-        "movq 120(%1), %%mm7            \n\t"
-        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
-        "movq %%mm7, %%mm5              \n\t"//FIXME remove
-        "movq %%mm6, %%mm7              \n\t"
-        "movq %%mm0, %%mm6              \n\t"
-//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
-
-        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
-//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
-
-        HADAMARD48
-        "movq %%mm7, 64(%1)             \n\t"
-        MMABS_MMX2(%%mm0, %%mm7)
-        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
-        "movq 64(%1), %%mm1             \n\t"
-        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
-        "movq %%mm0, 64(%1)             \n\t"
-
-        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
-        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
-
-        HADAMARD48
-        "movq %%mm7, (%1)               \n\t"
-        MMABS_MMX2(%%mm0, %%mm7)
-        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
-        "movq (%1), %%mm1               \n\t"
-        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
-        "movq 64(%1), %%mm1             \n\t"
-        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
-
-        "pshufw $0x0E, %%mm0, %%mm1     \n\t"
-        "paddusw %%mm1, %%mm0           \n\t"
-        "pshufw $0x01, %%mm0, %%mm1     \n\t"
-        "paddusw %%mm1, %%mm0           \n\t"
-        "movd %%mm0, %0                 \n\t"
-
-        : "=r" (sum)
-        : "r"(temp)
-    );
-    return sum&0xFFFF;
-}
-
+#define HSUM_MMX(a, t, dst)\
+    "movq "#a", "#t"                  \n\t"\
+    "psrlq $32, "#a"                  \n\t"\
+    "paddusw "#t", "#a"               \n\t"\
+    "movq "#a", "#t"                  \n\t"\
+    "psrlq $16, "#a"                  \n\t"\
+    "paddusw "#t", "#a"               \n\t"\
+    "movd "#a", "#dst"                \n\t"\
+
+#define HSUM_MMX2(a, t, dst)\
+    "pshufw $0x0E, "#a", "#t"         \n\t"\
+    "paddusw "#t", "#a"               \n\t"\
+    "pshufw $0x01, "#a", "#t"         \n\t"\
+    "paddusw "#t", "#a"               \n\t"\
+    "movd "#a", "#dst"                \n\t"\
+
+#define HADAMARD8_DIFF_MMX(cpu) \
+static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
+    DECLARE_ALIGNED_8(uint64_t, temp[16]);\
+    int sum=0;\
+\
+    assert(h==8);\
+\
+    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);\
+\
+    asm volatile(\
+        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
+        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)\
+\
+        HADAMARD48\
+\
+        "movq %%mm7, 112(%1)            \n\t"\
+\
+        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
+        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
+\
+        "movq 112(%1), %%mm7            \n\t"\
+        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
+        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
+\
+        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)\
+        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)\
+\
+        HADAMARD48\
+\
+        "movq %%mm7, 120(%1)            \n\t"\
+\
+        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
+        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)\
+\
+        "movq 120(%1), %%mm7            \n\t"\
+        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
+        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
+        "movq %%mm6, %%mm7              \n\t"\
+        "movq %%mm0, %%mm6              \n\t"\
+\
+        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
+\
+        HADAMARD48\
+        "movq %%mm7, 64(%1)             \n\t"\
+        MMABS(%%mm0, %%mm7)\
+        MMABS_SUM(%%mm1, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm3, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm5, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
+        "movq 64(%1), %%mm1             \n\t"\
+        MMABS_SUM(%%mm1, %%mm7, %%mm0)\
+        "movq %%mm0, 64(%1)             \n\t"\
+\
+        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
+        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)\
+\
+        HADAMARD48\
+        "movq %%mm7, (%1)               \n\t"\
+        MMABS(%%mm0, %%mm7)\
+        MMABS_SUM(%%mm1, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm3, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm5, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
+        "movq (%1), %%mm1               \n\t"\
+        MMABS_SUM(%%mm1, %%mm7, %%mm0)\
+        "movq 64(%1), %%mm1             \n\t"\
+        MMABS_SUM(%%mm1, %%mm7, %%mm0)\
+\
+        HSUM(%%mm0, %%mm1, %0)\
+\
+        : "=r" (sum)\
+        : "r"(temp)\
+    );\
+    return sum&0xFFFF;\
+}
+
+#define MMABS(a,z)         MMABS_MMX(a,z)
+#define MMABS_SUM(a,z,sum) MMABS_SUM_MMX(a,z,sum)
+#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
+HADAMARD8_DIFF_MMX(mmx)
+#undef MMABS
+#undef MMABS_SUM
+#undef HSUM
+
+#define MMABS(a,z)         MMABS_MMX2(a,z)
+#define MMABS_SUM(a,z,sum) MMABS_SUM_MMX2(a,z,sum)
+#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
+HADAMARD8_DIFF_MMX(mmx2)
+#undef MMABS
+#undef MMABS_SUM
+#undef HSUM
 
 WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
 WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
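For readers who do not have the MMX idioms memorized, the scalar sketch below (added for illustration; it is not part of the revision, and the helper names are invented) models what the macros in this diff compute on a single signed 16-bit lane: MMABS_MMX builds a sign mask with pcmpgtw and conditionally negates, MMABS_MMX2 takes max(a, -a) via pmaxsw, and the MMABS_SUM_*/HSUM_* variants accumulate with the unsigned saturating add paddusw.

#include <stdint.h>

/* Sketch only: one signed 16-bit lane of the MMX registers. */

/* MMABS_MMX: pxor z,z ; pcmpgtw a,z ; pxor z,a ; psubw z,a */
static inline uint16_t mmabs_mmx(int16_t a)
{
    uint16_t z = (a < 0) ? 0xFFFF : 0x0000; /* pcmpgtw: all-ones mask if a < 0 */
    uint16_t r = (uint16_t)a ^ z;           /* conditional ones' complement    */
    return (uint16_t)(r - z);               /* subtracting -1 adds 1 -> |a|    */
}

/* MMABS_MMX2: pxor z,z ; psubw a,z ; pmaxsw z,a  ==  max(a, -a) */
static inline uint16_t mmabs_mmx2(int16_t a)
{
    int16_t z = (int16_t)(0 - (uint16_t)a); /* psubw: 16-bit wrapping negate */
    return (uint16_t)(a > z ? a : z);       /* pmaxsw: signed maximum        */
}

/* paddusw: unsigned saturating 16-bit add, used by MMABS_SUM_* and HSUM_*. */
static inline uint16_t paddusw(uint16_t x, uint16_t y)
{
    uint32_t t = (uint32_t)x + y;
    return t > 0xFFFF ? 0xFFFF : (uint16_t)t;
}

/* HSUM_MMX (psrlq shifts) and HSUM_MMX2 (pshufw shuffles) both fold the four
 * word lanes of one register into a single saturating sum, as modelled here. */
static inline uint16_t hsum4(const uint16_t w[4])
{
    return paddusw(paddusw(w[0], w[2]), paddusw(w[1], w[3]));
}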

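The closing #define/#undef blocks instantiate the shared HADAMARD8_DIFF_MMX(cpu) body twice: the ##cpu token pasting names the generated functions hadamard8_diff_mmx and hadamard8_diff_mmx2, while the MMABS, MMABS_SUM and HSUM definitions in force at each instantiation select which instruction sequences are spliced into the inline asm. The miniature below shows the same generate-a-variant-per-CPU pattern with illustrative names only (none of them come from dsputil_mmx.c):

#include <stdint.h>
#include <stdlib.h>

/* Two interchangeable flavours of the same primitive. */
#define ABS_GENERIC(x) ((x) < 0 ? -(x) : (x))
#define ABS_LIBC(x)    abs(x)

/* One body, instantiated per flavour via token pasting. */
#define MAKE_SUM_ABS(cpu) \
static int sum_abs_##cpu(const int16_t *v, int n){ \
    int i, s = 0; \
    for (i = 0; i < n; i++) \
        s += ABS(v[i]); \
    return s; \
}

#define ABS(x) ABS_GENERIC(x)
MAKE_SUM_ABS(generic)          /* defines sum_abs_generic() */
#undef ABS

#define ABS(x) ABS_LIBC(x)
MAKE_SUM_ABS(libc)             /* defines sum_abs_libc()    */
#undef ABS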