Revision e8772eec libavcodec/ppc/dsputil_altivec.c

View differences:

libavcodec/ppc/dsputil_altivec.c
1311 1311
int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
1312 1312
POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
1313 1313
  int sum;
1314
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
1315 1314
  register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
1316 1315
  register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1316
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
1317 1317
  {
1318 1318
    register const_vector signed short vprod1 = (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
1319 1319
    register const_vector signed short vprod2 = (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
......
1338 1338
    {                                                                   \
1339 1339
      register vector unsigned char src1, src2, srcO;                   \
1340 1340
      register vector unsigned char dst1, dst2, dstO;                   \
1341
      register vector signed short srcV, dstV;                          \
1342
      register vector signed short but0, but1, but2, op1, op2, op3;     \
1341 1343
      src1 = vec_ld(stride * i, src);                                   \
1342 1344
      if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8)       \
1343 1345
        src2 = vec_ld((stride * i) + 16, src);                          \
......
1348 1350
      dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));           \
1349 1351
      /* promote the unsigned chars to signed shorts */                 \
1350 1352
      /* we're in the 8x8 function, we only care for the first 8 */     \
1351
      register vector signed short srcV =                               \
1352
        (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
1353
      register vector signed short dstV =                               \
1354
        (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
1353
      srcV =                                                            \
1354
        (vector signed short)vec_mergeh((vector signed char)vzero,      \
1355
        (vector signed char)srcO);                                      \
1356
      dstV =                                                            \
1357
        (vector signed short)vec_mergeh((vector signed char)vzero,      \
1358
        (vector signed char)dstO);                                      \
1355 1359
      /* substractions inside the first butterfly */                    \
1356
      register vector signed short but0 = vec_sub(srcV, dstV);          \
1357
      register vector signed short op1 = vec_perm(but0, but0, perm1);   \
1358
      register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
1359
      register vector signed short op2 = vec_perm(but1, but1, perm2);   \
1360
      register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
1361
      register vector signed short op3 = vec_perm(but2, but2, perm3);   \
1360
      but0 = vec_sub(srcV, dstV);                                       \
1361
      op1 = vec_perm(but0, but0, perm1);                                \
1362
      but1 = vec_mladd(but0, vprod1, op1);                              \
1363
      op2 = vec_perm(but1, but1, perm2);                                \
1364
      but2 = vec_mladd(but1, vprod2, op2);                              \
1365
      op3 = vec_perm(but2, but2, perm3);                                \
1362 1366
      res = vec_mladd(but2, vprod3, op3);                               \
1363 1367
    }
1364 1368
    ONEITERBUTTERFLY(0, temp0);
......
1481 1485

  
1482 1486
#define ONEITERBUTTERFLY(i, res1, res2)                                 \
1483 1487
    {                                                                   \
1484
      register vector unsigned char src1 REG_v(v22), src2 REG_v(v23); \
1485
      register vector unsigned char dst1 REG_v(v24), dst2 REG_v(v25); \
1488
      register vector unsigned char src1 REG_v(v22),                    \
1489
                                    src2 REG_v(v23),                    \
1490
                                    dst1 REG_v(v24),                    \
1491
                                    dst2 REG_v(v25),                    \
1492
                                    srcO REG_v(v22),                    \
1493
                                    dstO REG_v(v23);                    \
1494
                                                                        \
1495
      register vector signed short  srcV REG_v(v24),                    \
1496
                                    dstV REG_v(v25),                    \
1497
                                    srcW REG_v(v26),                    \
1498
                                    dstW REG_v(v27),                    \
1499
                                    but0 REG_v(v28),                    \
1500
                                    but0S REG_v(v29),                   \
1501
                                    op1 REG_v(v30),                     \
1502
                                    but1 REG_v(v22),                    \
1503
                                    op1S REG_v(v23),                    \
1504
                                    but1S REG_v(v24),                   \
1505
                                    op2 REG_v(v25),                     \
1506
                                    but2 REG_v(v26),                    \
1507
                                    op2S REG_v(v27),                    \
1508
                                    but2S REG_v(v28),                   \
1509
                                    op3 REG_v(v29),                     \
1510
                                    op3S REG_v(v30);                    \
1511
                                                                        \
1486 1512
      src1 = vec_ld(stride * i, src);                                   \
1487 1513
      src2 = vec_ld((stride * i) + 16, src);                            \
1488
      register vector unsigned char srcO REG_v(v22) = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
1514
      srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src));           \
1489 1515
      dst1 = vec_ld(stride * i, dst);                                   \
1490 1516
      dst2 = vec_ld((stride * i) + 16, dst);                            \
1491
      register vector unsigned char dstO REG_v(v23) = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
1517
      dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));           \
1492 1518
      /* promote the unsigned chars to signed shorts */                 \
1493
      register vector signed short srcV REG_v(v24) =                   \
1494
        (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
1495
      register vector signed short dstV REG_v(v25) =                   \
1496
        (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
1497
      register vector signed short srcW REG_v(v26) =                   \
1498
        (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \
1499
      register vector signed short dstW REG_v(v27) =                   \
1500
        (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \
1519
      srcV =                                                            \
1520
        (vector signed short)vec_mergeh((vector signed char)vzero,      \
1521
        (vector signed char)srcO);                                      \
1522
      dstV =                                                            \
1523
        (vector signed short)vec_mergeh((vector signed char)vzero,      \
1524
        (vector signed char)dstO);                                      \
1525
      srcW =                                                            \
1526
        (vector signed short)vec_mergel((vector signed char)vzero,      \
1527
        (vector signed char)srcO);                                      \
1528
      dstW =                                                            \
1529
        (vector signed short)vec_mergel((vector signed char)vzero,      \
1530
        (vector signed char)dstO);                                      \
1501 1531
      /* substractions inside the first butterfly */                    \
1502
      register vector signed short but0 REG_v(v28) = vec_sub(srcV, dstV); \
1503
      register vector signed short but0S REG_v(v29) = vec_sub(srcW, dstW); \
1504
      register vector signed short op1 REG_v(v30) = vec_perm(but0, but0, perm1); \
1505
      register vector signed short but1 REG_v(v22) = vec_mladd(but0, vprod1, op1); \
1506
      register vector signed short op1S REG_v(v23) = vec_perm(but0S, but0S, perm1); \
1507
      register vector signed short but1S REG_v(v24) = vec_mladd(but0S, vprod1, op1S); \
1508
      register vector signed short op2 REG_v(v25) = vec_perm(but1, but1, perm2); \
1509
      register vector signed short but2 REG_v(v26) = vec_mladd(but1, vprod2, op2); \
1510
      register vector signed short op2S REG_v(v27) = vec_perm(but1S, but1S, perm2); \
1511
      register vector signed short but2S REG_v(v28) = vec_mladd(but1S, vprod2, op2S); \
1512
      register vector signed short op3 REG_v(v29) = vec_perm(but2, but2, perm3); \
1532
      but0 = vec_sub(srcV, dstV);                                       \
1533
      but0S = vec_sub(srcW, dstW);                                      \
1534
      op1 = vec_perm(but0, but0, perm1);                                \
1535
      but1 = vec_mladd(but0, vprod1, op1);                              \
1536
      op1S = vec_perm(but0S, but0S, perm1);                             \
1537
      but1S = vec_mladd(but0S, vprod1, op1S);                           \
1538
      op2 = vec_perm(but1, but1, perm2);                                \
1539
      but2 = vec_mladd(but1, vprod2, op2);                              \
1540
      op2S = vec_perm(but1S, but1S, perm2);                             \
1541
      but2S = vec_mladd(but1S, vprod2, op2S);                           \
1542
      op3 = vec_perm(but2, but2, perm3);                                \
1513 1543
      res1 = vec_mladd(but2, vprod3, op3);                              \
1514
      register vector signed short op3S REG_v(v30) = vec_perm(but2S, but2S, perm3); \
1544
      op3S = vec_perm(but2S, but2S, perm3);                             \
1515 1545
      res2 = vec_mladd(but2S, vprod3, op3S);                            \
1516 1546
    }
1517 1547
    ONEITERBUTTERFLY(0, temp0, temp0S);
......
1526 1556
#undef ONEITERBUTTERFLY
1527 1557
  {
1528 1558
    register vector signed int vsum;
1559
    register vector signed short line0S, line1S, line2S, line3S, line4S,
1560
                                 line5S, line6S, line7S, line0BS,line2BS,
1561
                                 line1BS,line3BS,line4BS,line6BS,line5BS,
1562
                                 line7BS,line0CS,line4CS,line1CS,line5CS,
1563
                                 line2CS,line6CS,line3CS,line7CS;
1564

  
1529 1565
    register vector signed short line0 = vec_add(temp0, temp1);
1530 1566
    register vector signed short line1 = vec_sub(temp0, temp1);
1531 1567
    register vector signed short line2 = vec_add(temp2, temp3);
......
1562 1598
    vsum = vec_sum4s(vec_abs(line6C), vsum);
1563 1599
    vsum = vec_sum4s(vec_abs(line7C), vsum);
1564 1600

  
1565
    register vector signed short line0S = vec_add(temp0S, temp1S);
1566
    register vector signed short line1S = vec_sub(temp0S, temp1S);
1567
    register vector signed short line2S = vec_add(temp2S, temp3S);
1568
    register vector signed short line3S = vec_sub(temp2S, temp3S);
1569
    register vector signed short line4S = vec_add(temp4S, temp5S);
1570
    register vector signed short line5S = vec_sub(temp4S, temp5S);
1571
    register vector signed short line6S = vec_add(temp6S, temp7S);
1572
    register vector signed short line7S = vec_sub(temp6S, temp7S);
1573

  
1574
    register vector signed short line0BS = vec_add(line0S, line2S);
1575
    register vector signed short line2BS = vec_sub(line0S, line2S);
1576
    register vector signed short line1BS = vec_add(line1S, line3S);
1577
    register vector signed short line3BS = vec_sub(line1S, line3S);
1578
    register vector signed short line4BS = vec_add(line4S, line6S);
1579
    register vector signed short line6BS = vec_sub(line4S, line6S);
1580
    register vector signed short line5BS = vec_add(line5S, line7S);
1581
    register vector signed short line7BS = vec_sub(line5S, line7S);
1582

  
1583
    register vector signed short line0CS = vec_add(line0BS, line4BS);
1584
    register vector signed short line4CS = vec_sub(line0BS, line4BS);
1585
    register vector signed short line1CS = vec_add(line1BS, line5BS);
1586
    register vector signed short line5CS = vec_sub(line1BS, line5BS);
1587
    register vector signed short line2CS = vec_add(line2BS, line6BS);
1588
    register vector signed short line6CS = vec_sub(line2BS, line6BS);
1589
    register vector signed short line3CS = vec_add(line3BS, line7BS);
1590
    register vector signed short line7CS = vec_sub(line3BS, line7BS);
1601
    line0S = vec_add(temp0S, temp1S);
1602
    line1S = vec_sub(temp0S, temp1S);
1603
    line2S = vec_add(temp2S, temp3S);
1604
    line3S = vec_sub(temp2S, temp3S);
1605
    line4S = vec_add(temp4S, temp5S);
1606
    line5S = vec_sub(temp4S, temp5S);
1607
    line6S = vec_add(temp6S, temp7S);
1608
    line7S = vec_sub(temp6S, temp7S);
1609

  
1610
    line0BS = vec_add(line0S, line2S);
1611
    line2BS = vec_sub(line0S, line2S);
1612
    line1BS = vec_add(line1S, line3S);
1613
    line3BS = vec_sub(line1S, line3S);
1614
    line4BS = vec_add(line4S, line6S);
1615
    line6BS = vec_sub(line4S, line6S);
1616
    line5BS = vec_add(line5S, line7S);
1617
    line7BS = vec_sub(line5S, line7S);
1618

  
1619
    line0CS = vec_add(line0BS, line4BS);
1620
    line4CS = vec_sub(line0BS, line4BS);
1621
    line1CS = vec_add(line1BS, line5BS);
1622
    line5CS = vec_sub(line1BS, line5BS);
1623
    line2CS = vec_add(line2BS, line6BS);
1624
    line6CS = vec_sub(line2BS, line6BS);
1625
    line3CS = vec_add(line3BS, line7BS);
1626
    line7CS = vec_sub(line3BS, line7BS);
1591 1627

  
1592 1628
    vsum = vec_sum4s(vec_abs(line0CS), vsum);
1593 1629
    vsum = vec_sum4s(vec_abs(line1CS), vsum);

Also available in: Unified diff