Revision 1d67b037 libavcodec/i386/h264dsp_mmx.c

View differences:

libavcodec/i386/h264dsp_mmx.c
1111 1111
     dst += 4-h*dstStride;\
1112 1112
   }\
1113 1113
}\
1114
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
1115
    int h = size;\
1114
static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
1116 1115
    int w = (size+8)>>2;\
1117 1116
    src -= 2*srcStride+2;\
1118 1117
    while(w--){\
......
1163 1162
        tmp += 4;\
1164 1163
        src += 4 - (size+5)*srcStride;\
1165 1164
    }\
1166
    tmp -= size+8;\
1167
    w = size>>4;\
1165
}\
1166
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
1167
    int w = size>>4;\
1168 1168
    do{\
1169
    h = size;\
1169
    int h = size;\
1170 1170
    asm volatile(\
1171 1171
        "1:                         \n\t"\
1172 1172
        "movq     (%0), %%mm0       \n\t"\
......
1218 1218
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
1219 1219
}\
1220 1220
\
1221
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1221
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1222 1222
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
1223 1223
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
1224 1224
    src += 8*srcStride;\
......
1237 1237
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
1238 1238
}\
1239 1239
\
1240
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){ /*
     * Two-stage driver for the combined (hv) lowpass.
     * Stage 1 (hv1) is handed tmp and src; stage 2 (hv2) is handed dst and tmp.
     * Stage 1 is always the put_ variant, whatever OPNAME is; only stage 2
     * is selected by OPNAME. size is forwarded unchanged to both stages.
     */\
1241
          put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
1242
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
1243
}\
1240 1244
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* fixed-size entry point: forwards to the 8or16 driver with size = 8 */\
1241 1245
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 8);\
1242 1246
}\
......
1306 1310
    OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
1307 1311
}\
1308 1312

  
1313

  
1309 1314
#ifdef ARCH_X86_64
1310
#define QPEL_H264_HL2_XMM(OPNAME, OP, MMX)\
1315
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
1311 1316
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
1312 1317
    int h=16;\
1313 1318
    asm volatile(\
......
1373 1378
    );\
1374 1379
}
1375 1380
#else // ARCH_X86_64
1376
#define QPEL_H264_HL2_XMM(OPNAME, OP, MMX)\
1381
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
1377 1382
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
1378 1383
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
1379 1384
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
......
1385 1390
}
1386 1391
#endif // ARCH_X86_64
1387 1392

  
1388
#define QPEL_H264_XMM(OPNAME, OP, MMX)\
1393
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
1389 1394
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
1390 1395
    int h=8;\
1391 1396
    asm volatile(\
......
1431 1436
    );\
1432 1437
    }while(--h);\
1433 1438
}\
1434
QPEL_H264_HL2_XMM(OPNAME, OP, MMX)\
1439
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
1435 1440
\
1441
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /*
     * Horizontal 6-tap lowpass over 8 rows, 8 output pixels per row.
     * For each output column x in 0..7 the asm below evaluates
     *   (  (src[x-2] + src[x+3])
     *    - 5*(src[x-1] + src[x+2])
     *    + 20*(src[x]  + src[x+1])
     *    + rounding ) >> 5
     * in 16-bit lanes, then packs to unsigned bytes with saturation and hands
     * the 8 bytes to OP together with the destination address.
     * NOTE(review): the multiplier and the rounding term come from ff_pw_5 and
     * ff_pw_16, which are defined outside this view; they are presumably
     * vectors of 16-bit 5s and 16s -- confirm.
     * NOTE(review): OP is a macro parameter (PUT_OP / AVG_MMX2_OP at the
     * instantiation sites); what it does with dst is not visible here.
     * Operands: %0 = src, %1 = dst, %2 = h (row counter),
     *           %3 = srcStride, %4 = dstStride, %5 = ff_pw_5, %6 = ff_pw_16.
     * Each row reads 16 bytes starting at src-5 (unaligned load).
     */\
1442
    int h=8; /* number of rows processed by the loop below */\
1443
    asm volatile(\
1444
        "pxor %%xmm7, %%xmm7        \n\t" /* xmm7 = 0, used to widen bytes to words */\
1445
        "movdqa %5, %%xmm6          \n\t" /* xmm6 = ff_pw_5 (multiplier, loaded once) */\
1446
        "1:                         \n\t" /* per-row loop */\
1447
        "lddqu   -5(%0), %%xmm1     \n\t" /* 16 bytes: src[-5] .. src[10] */\
1448
        "movdqa  %%xmm1, %%xmm0     \n\t"\
1449
        "punpckhbw %%xmm7, %%xmm1   \n\t" /* xmm1 = words src[3..10]  (i.e. src[x+3]) */\
1450
        "punpcklbw %%xmm7, %%xmm0   \n\t" /* xmm0 = words src[-5..2] */\
1451
        "movdqa  %%xmm1, %%xmm2     \n\t"\
1452
        "movdqa  %%xmm1, %%xmm3     \n\t"\
1453
        "movdqa  %%xmm1, %%xmm4     \n\t"\
1454
        "movdqa  %%xmm1, %%xmm5     \n\t"\
1455
        "palignr $6, %%xmm0, %%xmm5 \n\t" /* xmm5 = src[x-2] */\
1456
        "palignr $8, %%xmm0, %%xmm4 \n\t" /* xmm4 = src[x-1] */\
1457
        "palignr $10,%%xmm0, %%xmm3 \n\t" /* xmm3 = src[x]   */\
1458
        "paddw   %%xmm1, %%xmm5     \n\t" /* xmm5 = src[x-2] + src[x+3]; must precede the xmm1 palignr */\
1459
        "palignr $12,%%xmm0, %%xmm2 \n\t" /* xmm2 = src[x+1] */\
1460
        "palignr $14,%%xmm0, %%xmm1 \n\t" /* xmm1 = src[x+2] */\
1461
        "paddw   %%xmm3, %%xmm2     \n\t" /* xmm2 = src[x] + src[x+1] */\
1462
        "paddw   %%xmm4, %%xmm1     \n\t" /* xmm1 = src[x-1] + src[x+2] */\
1463
        "psllw   $2,     %%xmm2     \n\t" /* xmm2 = 4*(centre pair) */\
1464
        "psubw   %%xmm1, %%xmm2     \n\t" /* xmm2 = 4*centre - inner */\
1465
        "paddw   %6,     %%xmm5     \n\t" /* outer pair + ff_pw_16 (rounding term) */\
1466
        "pmullw  %%xmm6, %%xmm2     \n\t" /* times ff_pw_5: 20*centre - 5*inner */\
1467
        "paddw   %%xmm5, %%xmm2     \n\t" /* add outer pair and rounding */\
1468
        "psraw   $5,     %%xmm2     \n\t" /* arithmetic shift right by 5 */\
1469
        "packuswb %%xmm2, %%xmm2    \n\t" /* saturate to unsigned bytes; low 8 bytes = result row */\
1470
        OP(%%xmm2, (%1), %%xmm4, q)\
1471
        "add %3, %0                 \n\t" /* src += srcStride */\
1472
        "add %4, %1                 \n\t" /* dst += dstStride */\
1473
        "decl %2                    \n\t" /* --h */\
1474
        " jnz 1b                    \n\t"\
1475
        : "+a"(src), "+c"(dst), "+g"(h)\
1476
        : "D"((long)srcStride), "S"((long)dstStride),\
1477
          "m"(ff_pw_5), "m"(ff_pw_16)\
1478
        : "memory" /* NOTE(review): no xmm registers are listed as clobbered */\
1479
    );\
1480
}\
1481
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /*
     * 16-wide horizontal lowpass built from four calls to the 8-wide version:
     * left and right halves at the current position, then the same two
     * halves again after advancing src and dst by 8 rows.
     */\
1482
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
1483
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
1484
    src += 8*srcStride; /* step down 8 rows in the source */\
1485
    dst += 8*dstStride; /* step down 8 rows in the destination */\
1486
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
1487
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
1488
}\
1489

  
1490
#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
1436 1491
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1437 1492
    src -= 2*srcStride;\
1438 1493
    \
......
1483 1538
        );\
1484 1539
    }\
1485 1540
}\
1486
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
1541
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* fixed-size entry point: one call to the 8or16 vertical lowpass with h = 8 */\
1542
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
1543
}\
1544
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /*
     * 16-wide vertical lowpass: two calls to the 8or16 vertical lowpass with
     * h = 16, one at the current position and one offset 8 bytes to the right
     * in both dst and src.
     */\
1545
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
1546
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
1547
}
1548

  
1549
/**
 * First stage of the combined (hv) lowpass, SSE2 version.
 *
 * Walks the source in strips of 8 columns. For every strip it loads five
 * source rows, widens them from bytes to 16-bit words, and then invokes
 * QPEL_H264HV_XMM once per output row (8 rows, plus 8 more when size is 16)
 * with a rotating set of registers and an offset of row*48.
 *
 * NOTE(review): QPEL_H264HV_XMM is defined outside this view. It presumably
 * loads the next source row, applies the vertical filter and stores the
 * 16-bit result at tmp + offset bytes -- confirm against its definition.
 *
 * @param tmp        buffer handed to the asm as an input operand (register
 *                   constraint "c"); advanced by 8 int16_t per strip
 * @param src        source pixels; moved back by 2 rows and 2 columns before
 *                   the first strip
 * @param tmpStride  not referenced in this function; the per-row offsets
 *                   passed to QPEL_H264HV_XMM are the literal multiples of 48
 * @param srcStride  distance between source rows, in bytes
 * @param size       8 or 16; the second group of 8 rows only runs when it is
 *                   exactly 16
 */
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
1550
    /* number of 8-column strips: 2 when size is 8, 3 when size is 16 */
    int w = (size+8)>>3;
1551
    /* start 2 rows above and 2 columns left of the block */
    src -= 2*srcStride+2;
1552
    while(w--){
1553
        /* operands: %0 = src, %1 = tmp, %2 = srcStride, %3 = ff_pw_5, %4 = ff_pw_16 */
        asm volatile(
1554
            "pxor %%xmm7, %%xmm7        \n\t"
1555
            /* load five consecutive rows, 8 bytes each, into xmm0..xmm4 */
            "movq (%0), %%xmm0          \n\t"
1556
            "add %2, %0                 \n\t"
1557
            "movq (%0), %%xmm1          \n\t"
1558
            "add %2, %0                 \n\t"
1559
            "movq (%0), %%xmm2          \n\t"
1560
            "add %2, %0                 \n\t"
1561
            "movq (%0), %%xmm3          \n\t"
1562
            "add %2, %0                 \n\t"
1563
            "movq (%0), %%xmm4          \n\t"
1564
            "add %2, %0                 \n\t"
1565
            /* interleave with the zeroed xmm7: bytes become 16-bit words */
            "punpcklbw %%xmm7, %%xmm0   \n\t"
1566
            "punpcklbw %%xmm7, %%xmm1   \n\t"
1567
            "punpcklbw %%xmm7, %%xmm2   \n\t"
1568
            "punpcklbw %%xmm7, %%xmm3   \n\t"
1569
            "punpcklbw %%xmm7, %%xmm4   \n\t"
1570
            /* rows 0..7: the six register arguments rotate by one per row */
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
1571
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
1572
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
1573
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
1574
            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
1575
            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
1576
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
1577
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
1578
            : "+a"(src)
1579
            : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
1580
            : "memory"
1581
        );
1582
        if(size==16){
1583
            /* rows 8..15: continues the register rotation of the asm
             * statement above without reloading anything.
             * NOTE(review): this depends on xmm0..xmm5 and xmm7 keeping their
             * values between two separate asm statements, and neither
             * statement lists xmm registers as clobbers -- confirm this is
             * safe with the compilers in use. */
            asm volatile(
1584
                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1,  8*48)
1585
                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2,  9*48)
1586
                QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
1587
                QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
1588
                QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
1589
                QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
1590
                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
1591
                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
1592
                : "+a"(src)
1593
                : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
1594
                : "memory"
1595
            );
1596
        }
1597
        /* next strip: 8 int16_t further into tmp */
        tmp += 8;
1598
        /* 8 columns right and (size+5) rows back up; presumably the asm
         * advanced src by 5 explicit rows plus one row per
         * QPEL_H264HV_XMM invocation -- TODO confirm */
        src += 8 - (size+5)*srcStride;
1599
    }
1600
}
1601

  
1602
#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
1603
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
1487 1604
    int h = size;\
1488
    int w = (size+8)>>3;\
1489
    src -= 2*srcStride+2;\
1490
    while(w--){\
1491
        asm volatile(\
1492
            "pxor %%xmm7, %%xmm7        \n\t"\
1493
            "movq (%0), %%xmm0          \n\t"\
1494
            "add %2, %0                 \n\t"\
1495
            "movq (%0), %%xmm1          \n\t"\
1496
            "add %2, %0                 \n\t"\
1497
            "movq (%0), %%xmm2          \n\t"\
1498
            "add %2, %0                 \n\t"\
1499
            "movq (%0), %%xmm3          \n\t"\
1500
            "add %2, %0                 \n\t"\
1501
            "movq (%0), %%xmm4          \n\t"\
1502
            "add %2, %0                 \n\t"\
1503
            "punpcklbw %%xmm7, %%xmm0   \n\t"\
1504
            "punpcklbw %%xmm7, %%xmm1   \n\t"\
1505
            "punpcklbw %%xmm7, %%xmm2   \n\t"\
1506
            "punpcklbw %%xmm7, %%xmm3   \n\t"\
1507
            "punpcklbw %%xmm7, %%xmm4   \n\t"\
1508
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)\
1509
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)\
1510
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)\
1511
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)\
1512
            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)\
1513
            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)\
1514
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)\
1515
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)\
1516
            : "+a"(src)\
1517
            : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
1518
            : "memory"\
1519
        );\
1520
        if(size==16){\
1521
            asm volatile(\
1522
                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1,  8*48)\
1523
                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2,  9*48)\
1524
                QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)\
1525
                QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)\
1526
                QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)\
1527
                QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)\
1528
                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)\
1529
                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)\
1530
                : "+a"(src)\
1531
                : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
1532
                : "memory"\
1533
            );\
1534
        }\
1535
        tmp += 8;\
1536
        src += 8 - (size+5)*srcStride;\
1537
    }\
1538
    tmp -= size+8;\
1539
    h = size;\
1540
    if(size==16){\
1605
    if(size == 16){\
1541 1606
        asm volatile(\
1542 1607
            "1:                         \n\t"\
1543 1608
            "movdqa 32(%0), %%xmm4      \n\t"\
......
1627 1692
            : "memory"\
1628 1693
        );\
1629 1694
    }\
1630
}\
1631
\
1632
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1633
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
1634
}\
1635
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1636
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
1637
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
1695
}
1696

  
1697
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
1698
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){ /*
     * Two-stage driver for the combined (hv) lowpass, XMM flavour.
     * Stage 1 is always put_h264_qpel8or16_hv1_lowpass_sse2, whatever OPNAME
     * and MMX are; it is handed tmp and src. Stage 2 is selected by OPNAME
     * and MMX and is handed dst and tmp. size is forwarded to both stages.
     */\
1699
          put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
1700
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
1638 1701
}\
1639 1702
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1640
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 8);\
1703
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
1641 1704
}\
1642 1705
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1643
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 16);\
1706
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
1644 1707
}\
1645 1708

  
1646
#define put_pixels8_ssse3 put_pixels8_mmx2
1647
#define put_pixels16_ssse3 put_pixels16_sse2
1648
#define avg_pixels8_ssse3 avg_pixels8_mmx2
1649
#define avg_pixels16_ssse3 avg_pixels16_sse2
1650

  
1709
#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
1710
#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
1711
#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
1712
#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
1651 1713
#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
1652
#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
1653 1714
#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
1715
#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
1654 1716
#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
1655 1717

  
1718
#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
1719
#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
1720
#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
1721
#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
1656 1722
#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
1657
#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
1658 1723
#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
1724
#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
1659 1725
#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
1660 1726

  
1661
#define put_h264_qpel8_h_lowpass_ssse3 put_h264_qpel8_h_lowpass_mmx2
1662
#define put_h264_qpel16_h_lowpass_ssse3 put_h264_qpel16_h_lowpass_mmx2
1663
#define avg_h264_qpel8_h_lowpass_ssse3 avg_h264_qpel8_h_lowpass_mmx2
1664
#define avg_h264_qpel16_h_lowpass_ssse3 avg_h264_qpel16_h_lowpass_mmx2
1727
#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
1728
#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
1729
#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
1730
#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
1731

  
1732
#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
1733
#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
1734
#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
1735
#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
1736

  
1737
#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
1738
#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
1665 1739

  
1666 1740
/*
 * Instantiates all four H264_MC_* groups (C, V, H, HV) for one combination
 * of OPNAME, SIZE, MMX and ALIGN. The arguments are forwarded unchanged.
 */
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
1741
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
1742
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
1743
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
1744
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
1745

  
1746
/* 16x16 mc00 entry point, SSE2 put variant: forwards to put_pixels16_sse2
 * with a height of 16. */
static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
1747
    put_pixels16_sse2(dst, src, stride, 16);
1748
}
1749
/* 16x16 mc00 entry point, SSE2 avg variant: forwards to avg_pixels16_sse2
 * with a height of 16. */
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
1750
    avg_pixels16_sse2(dst, src, stride, 16);
1751
}
1752
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
1753
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
1754

  
1755
/*
 * Defines the mc00 function for one OPNAME/SIZE/MMX combination. The
 * generated function forwards to OPNAME pixels SIZE _ MMX with SIZE as the
 * last argument. ALIGN is accepted but not used in this expansion.
 */
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
1667 1756
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1668 1757
    OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
1669 1758
}\
1670
\
1759

  
1760
#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
1671 1761
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1672 1762
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
1673 1763
}\
......
1679 1769
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1680 1770
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
1681 1771
}\
1682
\
1772

  
1773
#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
1683 1774
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1684 1775
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
1685 1776
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
......
1695 1786
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
1696 1787
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
1697 1788
}\
1698
\
1789

  
1790
#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
1699 1791
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1700 1792
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
1701 1793
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
......
1761 1853
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
1762 1854
}\
1763 1855

  
1856
/*
 * Instantiates H264_MC for both put_ and avg_ at sizes 4, 8 and 16 for the
 * given MMX suffix, always passing 8 as the ALIGN argument.
 */
#define H264_MC_4816(MMX)\
1857
H264_MC(put_, 4, MMX, 8)\
1858
H264_MC(put_, 8, MMX, 8)\
1859
H264_MC(put_, 16,MMX, 8)\
1860
H264_MC(avg_, 4, MMX, 8)\
1861
H264_MC(avg_, 8, MMX, 8)\
1862
H264_MC(avg_, 16,MMX, 8)\
1863

  
1864
/*
 * Invokes the generator macro passed as QPEL for both put_ and avg_ at sizes
 * 8 and 16 for the given XMM suffix, always passing 16 as the fourth
 * (alignment) argument. QPEL is itself a macro name, e.g. H264_MC_V.
 */
#define H264_MC_816(QPEL, XMM)\
1865
QPEL(put_, 8, XMM, 16)\
1866
QPEL(put_, 16,XMM, 16)\
1867
QPEL(avg_, 8, XMM, 16)\
1868
QPEL(avg_, 16,XMM, 16)\
1869

  
1764 1870

  
1765 1871
#define AVG_3DNOW_OP(a,b,temp, size) \
1766 1872
"mov" #size " " #b ", " #temp "   \n\t"\
......
1778 1884
#define PAVGB "pavgb"
1779 1885
QPEL_H264(put_,       PUT_OP, mmx2)
1780 1886
QPEL_H264(avg_,  AVG_MMX2_OP, mmx2)
1887
QPEL_H264_V_XMM(put_,       PUT_OP, sse2)
1888
QPEL_H264_V_XMM(avg_,  AVG_MMX2_OP, sse2)
1889
QPEL_H264_HV_XMM(put_,       PUT_OP, sse2)
1890
QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, sse2)
1781 1891
#ifdef HAVE_SSSE3
1782
QPEL_H264_XMM(put_,       PUT_OP, ssse3)
1783
QPEL_H264_XMM(avg_,  AVG_MMX2_OP, ssse3)
1892
QPEL_H264_H_XMM(put_,       PUT_OP, ssse3)
1893
QPEL_H264_H_XMM(avg_,  AVG_MMX2_OP, ssse3)
1894
QPEL_H264_HV2_XMM(put_,       PUT_OP, ssse3)
1895
QPEL_H264_HV2_XMM(avg_,  AVG_MMX2_OP, ssse3)
1896
QPEL_H264_HV_XMM(put_,       PUT_OP, ssse3)
1897
QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, ssse3)
1784 1898
#endif
1785 1899
#undef PAVGB
1786 1900

  
1787
H264_MC(put_, 4, 3dnow, 8)
1788
H264_MC(put_, 8, 3dnow, 8)
1789
H264_MC(put_, 16,3dnow, 8)
1790
H264_MC(avg_, 4, 3dnow, 8)
1791
H264_MC(avg_, 8, 3dnow, 8)
1792
H264_MC(avg_, 16,3dnow, 8)
1793
H264_MC(put_, 4, mmx2, 8)
1794
H264_MC(put_, 8, mmx2, 8)
1795
H264_MC(put_, 16,mmx2, 8)
1796
H264_MC(avg_, 4, mmx2, 8)
1797
H264_MC(avg_, 8, mmx2, 8)
1798
H264_MC(avg_, 16,mmx2, 8)
1901
H264_MC_4816(3dnow)
1902
H264_MC_4816(mmx2)
1903
H264_MC_816(H264_MC_V, sse2)
1904
H264_MC_816(H264_MC_HV, sse2)
1799 1905
#ifdef HAVE_SSSE3
1800
// FIXME some of these only require sse2
1801
H264_MC(put_, 8, ssse3, 16)
1802
H264_MC(put_, 16,ssse3, 16)
1803
H264_MC(avg_, 8, ssse3, 16)
1804
H264_MC(avg_, 16,ssse3, 16)
1906
H264_MC_816(H264_MC_H, ssse3)
1907
H264_MC_816(H264_MC_HV, ssse3)
1805 1908
#endif
1806 1909

  
1807
// no mc20 because it wasn't significantly faster than mmx2
1808
#define put_h264_qpel8_mc20_ssse3 put_h264_qpel8_mc20_mmx2
1809
#define put_h264_qpel16_mc20_ssse3 put_h264_qpel16_mc20_mmx2
1810
#define avg_h264_qpel8_mc20_ssse3 avg_h264_qpel8_mc20_mmx2
1811
#define avg_h264_qpel16_mc20_ssse3 avg_h264_qpel16_mc20_mmx2
1812

  
1813 1910

  
1814 1911
#define H264_CHROMA_OP(S,D)
1815 1912
#define H264_CHROMA_OP4(S,D,T)

Also available in: Unified diff