Revision 268821e7 libavcodec/x86/vp8dsp.asm

View differences:

libavcodec/x86/vp8dsp.asm
@@ -1164,12 +1164,16 @@
 ; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular
 ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
 ; we add 1*stride to the third regular registry in the process
-%macro WRITE_4x4D 9
+; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
+; same memory region), or 8 if they cover two separate buffers (third one points to
+; a different memory region than the first two), allowing for more optimal code for
+; the 16-width case
+%macro WRITE_4x4D 10
     ; write out (4 dwords per register), start with dwords zero
     movd    [%5+%8*4], m%1
     movd         [%5], m%2
-    movd    [%5+%9*4], m%3
-    movd    [%5+%9*8], m%4
+    movd    [%7+%8*4], m%3
+    movd         [%7], m%4

     ; store dwords 1
     psrldq        m%1, 4
@@ -1178,15 +1182,23 @@
     psrldq        m%4, 4
     movd    [%6+%8*4], m%1
     movd         [%6], m%2
+%if %10 == 16
     movd    [%6+%9*4], m%3
-    movd    [%6+%9*8], m%4
+%endif
+    movd      [%7+%9], m%4

     ; write dwords 2
     psrldq        m%1, 4
     psrldq        m%2, 4
+%if %10 == 8
+    movd    [%5+%8*2], m%1
+    movd           %5, m%3
+%endif
     psrldq        m%3, 4
     psrldq        m%4, 4
+%if %10 == 16
     movd    [%5+%8*2], m%1
+%endif
     movd      [%6+%9], m%2
     movd    [%7+%8*2], m%3
     movd    [%7+%9*2], m%4
@@ -1197,7 +1209,12 @@
     psrldq        m%2, 4
     psrldq        m%3, 4
     psrldq        m%4, 4
+%if %10 == 8
+    mov     [%7+%8*4], %5d
+    movd    [%6+%8*2], m%1
+%else
     movd      [%5+%8], m%1
+%endif
     movd    [%6+%9*2], m%2
     movd    [%7+%8*2], m%3
     movd    [%7+%9*2], m%4
@@ -1335,7 +1352,7 @@
     TRANSPOSE4x4B  0, 1, 2, 3, 4
 %if mmsize == 16 ; sse2
     add            r3, r1           ; change from r4*8*stride to r0+8*stride
-    WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2
+    WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2, 16
 %else ; mmx/mmxext
     WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2
 %endif
@@ -1374,13 +1391,20 @@
 SIMPLE_LOOPFILTER sse2,   h, 6

 ;-----------------------------------------------------------------------------
-; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, int stride,
+; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
 ;                                            int flimE, int flimI, int hev_thr);
 ;-----------------------------------------------------------------------------

-%macro INNER_LOOPFILTER 4
-cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %4
-%define dst_reg     r0
+%macro INNER_LOOPFILTER 5
+%if %4 == 8 ; chroma
+cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
+%define dst8_reg    r1
+%define mstride_reg r2
+%define E_reg       r3
+%define I_reg       r4
+%define hev_thr_reg r5
+%else ; luma
+cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 %define mstride_reg r1
 %define E_reg       r2
 %define I_reg       r3
@@ -1392,6 +1416,8 @@
 %else ; x86-32, mmx/mmxext
 %define cnt_reg     r5
 %endif
+%endif
+%define dst_reg     r0
 %define stride_reg  E_reg
 %define dst2_reg    I_reg
 %ifndef m8
@@ -1418,6 +1444,8 @@
 %define flim_I   [rsp+mmsize]
 %define hev_thr  [rsp+mmsize*2]
 %define mask_res [rsp+mmsize*3]
+%define p0backup [rsp+mmsize*3]
+%define q0backup [rsp+mmsize*4]

     mova         flim_E, m0
     mova         flim_I, m1
@@ -1429,6 +1457,8 @@
 %define flim_I   m10
 %define hev_thr  m11
 %define mask_res m12
+%define p0backup m12
+%define q0backup m8

     ; splat function arguments
     SPLATB_REG   flim_E, E_reg, %1   ; E
@@ -1436,13 +1466,16 @@
     SPLATB_REG  hev_thr, hev_thr_reg, %1 ; hev_thresh
 %endif

-%if mmsize == 8 ; mmx/mmxext
+%if mmsize == 8 && %4 == 16 ; mmx/mmxext
     mov         cnt_reg, 2
 %endif
     mov      stride_reg, mstride_reg
     neg     mstride_reg
 %ifidn %2, h
     lea         dst_reg, [dst_reg + stride_reg*4-4]
+%if %4 == 8
+    lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
+%endif
 %endif

 %if mmsize == 8
@@ -1451,12 +1484,27 @@
     ; read
     lea        dst2_reg, [dst_reg + stride_reg]
 %ifidn %2, v
-    mova             m0, [dst_reg +mstride_reg*4] ; p3
-    mova             m1, [dst2_reg+mstride_reg*4] ; p2
-    mova             m2, [dst_reg +mstride_reg*2] ; p1
-    mova             m5, [dst2_reg]               ; q1
-    mova             m6, [dst2_reg+ stride_reg]   ; q2
-    mova             m7, [dst2_reg+ stride_reg*2] ; q3
+%if %4 == 8 && mmsize == 16
+%define movrow movh
+%else
+%define movrow mova
+%endif
+    movrow           m0, [dst_reg +mstride_reg*4] ; p3
+    movrow           m1, [dst2_reg+mstride_reg*4] ; p2
+    movrow           m2, [dst_reg +mstride_reg*2] ; p1
+    movrow           m5, [dst2_reg]               ; q1
+    movrow           m6, [dst2_reg+ stride_reg]   ; q2
+    movrow           m7, [dst2_reg+ stride_reg*2] ; q3
+%if mmsize == 16 && %4 == 8
+    movhps           m0, [dst8_reg+mstride_reg*4]
+    movhps           m2, [dst8_reg+mstride_reg*2]
+    add        dst8_reg, stride_reg
+    movhps           m1, [dst8_reg+mstride_reg*4]
+    movhps           m5, [dst8_reg]
+    movhps           m6, [dst8_reg+ stride_reg]
+    movhps           m7, [dst8_reg+ stride_reg*2]
+    add        dst8_reg, mstride_reg
+%endif
 %elif mmsize == 8 ; mmx/mmxext (h)
     ; read 8 rows of 8px each
     movu             m0, [dst_reg +mstride_reg*4]
@@ -1469,35 +1517,24 @@

     ; 8x8 transpose
     TRANSPOSE4x4B     0, 1, 2, 3, 7
-%ifdef m13
-    SWAP              1, 8
-%else
-    mova [rsp+mmsize*4], m1
-%endif
+    mova       q0backup, m1
     movu             m7, [dst2_reg+ stride_reg*2]
     TRANSPOSE4x4B     4, 5, 6, 7, 1
     SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
     SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
     SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
-%ifdef m13
-    SWAP              1, 8
-    SWAP              2, 8
-%else
-    mova             m1, [rsp+mmsize*4]
-    mova [rsp+mmsize*4], m2          ; store q0
-%endif
+    mova             m1, q0backup
+    mova       q0backup, m2          ; store q0
     SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
-%ifdef m14
-    SWAP              5, 12
-%else
-    mova [rsp+mmsize*3], m5          ; store p0
-%endif
+    mova       p0backup, m5          ; store p0
     SWAP              1, 4
     SWAP              2, 4
     SWAP              6, 3
     SWAP              5, 3
 %else ; sse2 (h)
+%if %4 == 16
     lea        dst8_reg, [dst_reg + stride_reg*8]
+%endif

     ; read 16 rows of 8px each, interleave
     movh             m0, [dst_reg +mstride_reg*4]
@@ -1526,10 +1563,10 @@

     ; 8x16 transpose
     TRANSPOSE4x4B     0, 1, 2, 3, 7
-%ifdef m13
+%ifdef m8
     SWAP              1, 8
 %else
-    mova [rsp+mmsize*4], m1
+    mova       q0backup, m1
 %endif
     movh             m7, [dst2_reg+ stride_reg*2]
     movh             m1, [dst8_reg+ stride_reg*2]
@@ -1538,18 +1575,18 @@
     SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
     SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
     SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
-%ifdef m13
+%ifdef m8
     SWAP              1, 8
     SWAP              2, 8
 %else
-    mova             m1, [rsp+mmsize*4]
-    mova [rsp+mmsize*4], m2          ; store q0
+    mova             m1, q0backup
+    mova       q0backup, m2          ; store q0
 %endif
     SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
-%ifdef m14
+%ifdef m12
     SWAP              5, 12
 %else
-    mova [rsp+mmsize*3], m5          ; store p0
+    mova       p0backup, m5          ; store p0
 %endif
     SWAP              1, 4
     SWAP              2, 4
@@ -1583,11 +1620,7 @@
     por              m6, m4          ; abs(q2-q1)

 %ifidn %1, mmx
-%ifdef m10
-    SWAP              4, 10
-%else
-    mova             m4, [rsp+mmsize]
-%endif
+    mova             m4, flim_I
     pxor             m3, m3
     psubusb          m0, m4
     psubusb          m1, m4
@@ -1609,11 +1642,14 @@
     ; normal_limit and high_edge_variance for p1-p0, q1-q0
     SWAP              7, 3           ; now m7 is zero
 %ifidn %2, v
-    mova             m3, [dst_reg +mstride_reg] ; p0
-%elifdef m14
+    movrow           m3, [dst_reg +mstride_reg] ; p0
+%if mmsize == 16 && %4 == 8
+    movhps           m3, [dst8_reg+mstride_reg]
+%endif
+%elifdef m12
     SWAP              3, 12
 %else
-    mova             m3, [rsp+mmsize*3]
+    mova             m3, p0backup
 %endif

     mova             m1, m2
@@ -1630,11 +1666,7 @@
     pcmpeqb          m1, m7          ; abs(p1-p0) <= I
     pcmpeqb          m6, m7          ; abs(p1-p0) <= hev_thresh
     pand             m0, m1
-%ifdef m12
-    SWAP              6, 12
-%else
-    mova [rsp+mmsize*3], m6
-%endif
+    mova       mask_res, m6
 %else ; mmxext/sse2
     pmaxub           m0, m1          ; max_I
     SWAP              1, 4           ; max_hev_thresh
@@ -1642,11 +1674,14 @@

     SWAP              6, 4           ; now m6 is I
 %ifidn %2, v
-    mova             m4, [dst_reg]   ; q0
-%elifdef m13
+    movrow           m4, [dst_reg]   ; q0
+%if mmsize == 16 && %4 == 8
+    movhps           m4, [dst8_reg]
+%endif
+%elifdef m8
     SWAP              4, 8
 %else
-    mova             m4, [rsp+mmsize*4]
+    mova             m4, q0backup
 %endif
     mova             m1, m4
     SWAP              1, 4
@@ -1662,11 +1697,7 @@
     pxor             m6, m6
     pcmpeqb          m1, m6          ; abs(q1-q0) <= I
     pcmpeqb          m7, m6          ; abs(q1-q0) <= hev_thresh
-%ifdef m12
-    SWAP              6, 12
-%else
-    mova             m6, [rsp+mmsize*3]
-%endif
+    mova             m6, mask_res
     pand             m0, m1          ; abs([pq][321]-[pq][210]) <= I
     pand             m6, m7
 %else ; mmxext/sse2
@@ -1681,7 +1712,7 @@
 %ifdef m12
     SWAP              6, 12
 %else
-    mova [rsp+mmsize*3], m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
+    mova       mask_res, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
 %endif

     ; simple_limit
@@ -1765,7 +1796,7 @@
 %ifdef m12
     SWAP              6, 12
 %else
-    mova             m6, [rsp+mmsize*3]
+    mova             m6, mask_res
 %endif
 %ifidn %1, mmx
     mova             m7, [pb_1]
@@ -1793,13 +1824,19 @@

     ; store
 %ifidn %2, v
-    mova [dst_reg+mstride_reg*2], m2
-    mova [dst_reg+mstride_reg  ], m3
-    mova       [dst_reg], m4
-    mova [dst_reg+ stride_reg  ], m5
+    movrow [dst_reg +mstride_reg*2], m2
+    movrow [dst_reg +mstride_reg  ], m3
+    movrow    [dst_reg], m4
+    movrow [dst_reg + stride_reg  ], m5
+%if mmsize == 16 && %4 == 8
+    movhps [dst8_reg+mstride_reg*2], m2
+    movhps [dst8_reg+mstride_reg  ], m3
+    movhps   [dst8_reg], m4
+    movhps [dst8_reg+ stride_reg  ], m5
+%endif
 %else ; h
-    add          dst_reg, 2
-    add         dst2_reg, 2
+    add         dst_reg, 2
+    add        dst2_reg, 2

     ; 4x8/16 transpose
     TRANSPOSE4x4B     2, 3, 4, 5, 6
@@ -1808,11 +1845,19 @@
     WRITE_4x2D        2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
 %else ; sse2 (h)
     lea        dst8_reg, [dst8_reg+mstride_reg+2]
-    WRITE_4x4D        2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg
+    WRITE_4x4D        2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
 %endif
 %endif

 %if mmsize == 8
+%if %4 == 8 ; chroma
+%ifidn %2, h
+    sub         dst_reg, 2
+%endif
+    cmp         dst_reg, dst8_reg
+    mov         dst_reg, dst8_reg
+    jnz .next8px
+%else
 %ifidn %2, h
     lea         dst_reg, [dst_reg + stride_reg*8-2]
 %else ; v
@@ -1821,6 +1866,7 @@
     dec         cnt_reg
     jg .next8px
 %endif
+%endif

 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext
     mov             rsp, stack_reg   ; restore stack pointer
@@ -1829,14 +1875,22 @@
 %endmacro

 INIT_MMX
-INNER_LOOPFILTER mmx,    v, 6, 8
-INNER_LOOPFILTER mmx,    h, 6, 8
-INNER_LOOPFILTER mmxext, v, 6, 8
-INNER_LOOPFILTER mmxext, h, 6, 8
+INNER_LOOPFILTER mmx,    v, 6, 16, 8
+INNER_LOOPFILTER mmx,    h, 6, 16, 8
+INNER_LOOPFILTER mmxext, v, 6, 16, 8
+INNER_LOOPFILTER mmxext, h, 6, 16, 8
+
+INNER_LOOPFILTER mmx,    v, 6,  8, 8
+INNER_LOOPFILTER mmx,    h, 6,  8, 8
+INNER_LOOPFILTER mmxext, v, 6,  8, 8
+INNER_LOOPFILTER mmxext, h, 6,  8, 8
+
 INIT_XMM
-INNER_LOOPFILTER sse2,   v, 5, 13
+INNER_LOOPFILTER sse2,   v, 5, 16, 13
 %ifdef m8
-INNER_LOOPFILTER sse2,   h, 5, 13
+INNER_LOOPFILTER sse2,   h, 5, 16, 13
 %else
-INNER_LOOPFILTER sse2,   h, 6, 13
+INNER_LOOPFILTER sse2,   h, 6, 16, 13
 %endif
+INNER_LOOPFILTER sse2,   v, 6,  8, 13
+INNER_LOOPFILTER sse2,   h, 6,  8, 13
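The comment added at the top of WRITE_4x4D explains its new 10th argument: the macro writes four transposed 4-byte columns across 16 rows, and the argument says whether all 16 rows belong to one buffer (16, the luma case) or whether the last 8 rows live in a second buffer (8, the chroma U/V case). A minimal C model of that idea, for illustration only (write_4cols and its parameters are hypothetical, not FFmpeg code):

    #include <stdint.h>

    /* Sketch of the WRITE_4x4D idea: 16 rows of 4 transposed bytes each.
     * With width 16, all rows are in one buffer; with width 8, rows 8..15
     * come from a separate buffer (e.g. the V plane), so the second pointer
     * cannot be derived from the first one. */
    static void write_4cols(uint8_t *buf0, uint8_t *buf1, int stride,
                            const uint8_t col[16][4], int width)
    {
        /* rows 0..7 always go to the first buffer */
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 4; x++)
                buf0[y * stride + x] = col[y][x];

        /* rows 8..15: same buffer (width 16) or the second one (width 8) */
        uint8_t *second = (width == 16) ? buf0 + 8 * stride : buf1;
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 4; x++)
                second[y * stride + x] = col[8 + y][x];
    }

When the width is 16 the later rows are just buf0 + 8*stride, which is what allows the tighter code path the comment mentions for the 16-width case.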

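At the C level, the new %4 parameter of INNER_LOOPFILTER corresponds to the optional pointer in the updated prototype comment: the existing 16y (luma) functions keep a single destination pointer, while the new 8uv (chroma) functions take U and V pointers and filter both 8-pixel-wide planes in one call. A rough sketch of the resulting declarations (symbol names assumed from the cglobal lines plus the usual ff_ prefix; argument names follow the comment and are illustrative):

    #include <stdint.h>

    /* luma: one 16px-wide plane */
    void ff_vp8_v_loop_filter16y_inner_sse2(uint8_t *dst, int stride,
                                            int flimE, int flimI, int hev_thr);

    /* chroma: U and V planes, 8px wide each, filtered together */
    void ff_vp8_v_loop_filter8uv_inner_sse2(uint8_t *dstU, uint8_t *dstV, int stride,
                                            int flimE, int flimI, int hev_thr);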