Revision e3905ce0 libavcodec/ppc/mpegvideo_altivec.c

View differences:

libavcodec/ppc/mpegvideo_altivec.c
41 41
// transposes a matrix consisting of four vectors with four elements each
42 42
#define TRANSPOSE4(a,b,c,d) \
43 43
do { \
44
  __typeof__(a) _trans_ach = vec_mergeh(a, c); \
45
  __typeof__(a) _trans_acl = vec_mergel(a, c); \
46
  __typeof__(a) _trans_bdh = vec_mergeh(b, d); \
47
  __typeof__(a) _trans_bdl = vec_mergel(b, d); \
48
 \
49
  a = vec_mergeh(_trans_ach, _trans_bdh); \
50
  b = vec_mergel(_trans_ach, _trans_bdh); \
51
  c = vec_mergeh(_trans_acl, _trans_bdl); \
52
  d = vec_mergel(_trans_acl, _trans_bdl); \
44
    __typeof__(a) _trans_ach = vec_mergeh(a, c); \
45
    __typeof__(a) _trans_acl = vec_mergel(a, c); \
46
    __typeof__(a) _trans_bdh = vec_mergeh(b, d); \
47
    __typeof__(a) _trans_bdl = vec_mergel(b, d); \
48
                                                 \
49
    a = vec_mergeh(_trans_ach, _trans_bdh);      \
50
    b = vec_mergel(_trans_ach, _trans_bdh);      \
51
    c = vec_mergeh(_trans_acl, _trans_bdl);      \
52
    d = vec_mergel(_trans_acl, _trans_bdl);      \
53 53
} while (0)
54 54

  
55 55

  
......
58 58
// target address is four-byte aligned (which should always be the case).
59 59
#define LOAD4(vec, address) \
60 60
{ \
61
    __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \
62
    vector unsigned char _perm_vec = vec_lvsl(0,(address)); \
63
    vec = vec_ld(0, _load_addr); \
64
    vec = vec_perm(vec, vec, _perm_vec); \
65
    vec = vec_splat(vec, 0); \
61
    __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address);  \
62
    vector unsigned char _perm_vec = vec_lvsl(0,(address));     \
63
    vec = vec_ld(0, _load_addr);                                \
64
    vec = vec_perm(vec, vec, _perm_vec);                        \
65
    vec = vec_splat(vec, 0);                                    \
66 66
}
67 67

  
68 68

  
69 69
// Expands to AVV() with the single argument `a` repeated four times.
// NOTE(review): AVV is defined outside this excerpt -- presumably it builds a
// vector literal, so FOUROF(x) would yield a four-element splat; confirm.
#define FOUROF(a) AVV(a,a,a,a)
70 70

  
71 71
int dct_quantize_altivec(MpegEncContext* s,
72
                        DCTELEM* data, int n,
73
                        int qscale, int* overflow)
72
                         DCTELEM* data, int n,
73
                         int qscale, int* overflow)
74 74
{
75 75
    int lastNonZero;
76 76
    vector float row0, row1, row2, row3, row4, row5, row6, row7;
......
137 137

  
138 138
        int whichPass, whichHalf;
139 139

  
140
        for(whichPass = 1; whichPass<=2; whichPass++)
141
        {
142
            for(whichHalf = 1; whichHalf<=2; whichHalf++)
143
            {
140
        for(whichPass = 1; whichPass<=2; whichPass++) {
141
            for(whichHalf = 1; whichHalf<=2; whichHalf++) {
144 142
                vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
145 143
                vector float tmp10, tmp11, tmp12, tmp13;
146 144
                vector float z1, z2, z3, z4, z5;
......
235 233
                SWAP(row7, alt7);
236 234
            }
237 235

  
238
            if (whichPass == 1)
239
            {
236
            if (whichPass == 1) {
240 237
                // transpose the data for the second pass
241 238

  
242 239
                // First, block transpose the upper right with lower left.
......
261 258
        const vector signed int* qmat;
262 259
        vector float bias, negBias;
263 260

  
264
        if (s->mb_intra)
265
        {
261
        if (s->mb_intra) {
266 262
            vector signed int baseVector;
267 263

  
268 264
            // We must cache element 0 in the intra case
......
272 268

  
273 269
            qmat = (vector signed int*)s->q_intra_matrix[qscale];
274 270
            biasAddr = &(s->intra_quant_bias);
275
        }
276
        else
277
        {
271
        } else {
278 272
            qmat = (vector signed int*)s->q_inter_matrix[qscale];
279 273
            biasAddr = &(s->inter_quant_bias);
280 274
        }
......
439 433
        // and handle it using the vector unit if we can.  This is the permute used
440 434
        // by the altivec idct, so it is common when using the altivec dct.
441 435

  
442
        if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM))
443
        {
436
        if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) {
444 437
            TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
445 438
        }
446 439

  
......
456 449
    }
457 450

  
458 451
    // special handling of block[0]
459
    if (s->mb_intra)
460
    {
461
        if (!s->h263_aic)
462
        {
452
    if (s->mb_intra) {
453
        if (!s->h263_aic) {
463 454
            if (n < 4)
464 455
                oldBaseValue /= s->y_dc_scale;
465 456
            else
......
474 465
    // need to permute the "no" permutation case.
475 466
    if ((lastNonZero > 0) &&
476 467
        (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) &&
477
        (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM))
478
    {
468
        (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) {
479 469
        ff_block_permute(data, s->dsp.idct_permutation,
480 470
                s->intra_scantable.scantable, lastNonZero);
481 471
    }
......
483 473
    return lastNonZero;
484 474
}
485 475

  
486
/*
487
  AltiVec version of dct_unquantize_h263
488
  this code assumes `block' is 16 bytes-aligned
489
*/
476
/* AltiVec version of dct_unquantize_h263
477
   this code assumes `block' is 16-byte aligned */
490 478
void dct_unquantize_h263_altivec(MpegEncContext *s,
491 479
                                 DCTELEM *block, int n, int qscale)
492 480
{
......
517 505
    }
518 506

  
519 507
    {
520
      register const vector signed short vczero = (const vector signed short)vec_splat_s16(0);
521
      DECLARE_ALIGNED_16(short, qmul8[]) =
522
          {
523
            qmul, qmul, qmul, qmul,
524
            qmul, qmul, qmul, qmul
525
          };
526
      DECLARE_ALIGNED_16(short, qadd8[]) =
527
          {
528
            qadd, qadd, qadd, qadd,
529
            qadd, qadd, qadd, qadd
530
          };
531
      DECLARE_ALIGNED_16(short, nqadd8[]) =
532
          {
533
            -qadd, -qadd, -qadd, -qadd,
534
            -qadd, -qadd, -qadd, -qadd
535
          };
536
      register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
537
      register vector bool short blockv_null, blockv_neg;
538
      register short backup_0 = block[0];
539
      register int j = 0;
540

  
541
      qmulv = vec_ld(0, qmul8);
542
      qaddv = vec_ld(0, qadd8);
543
      nqaddv = vec_ld(0, nqadd8);
544

  
545
#if 0 // block *is* 16 bytes-aligned, it seems.
546
      // first make sure block[j] is 16 bytes-aligned
547
      for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) {
548
        level = block[j];
549
        if (level) {
550
          if (level < 0) {
551
                level = level * qmul - qadd;
552
            } else {
553
                level = level * qmul + qadd;
508
        register const vector signed short vczero = (const vector signed short)vec_splat_s16(0);
509
        DECLARE_ALIGNED_16(short, qmul8[]) =
510
            {
511
              qmul, qmul, qmul, qmul,
512
              qmul, qmul, qmul, qmul
513
            };
514
        DECLARE_ALIGNED_16(short, qadd8[]) =
515
            {
516
              qadd, qadd, qadd, qadd,
517
              qadd, qadd, qadd, qadd
518
            };
519
        DECLARE_ALIGNED_16(short, nqadd8[]) =
520
            {
521
              -qadd, -qadd, -qadd, -qadd,
522
              -qadd, -qadd, -qadd, -qadd
523
            };
524
        register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
525
        register vector bool short blockv_null, blockv_neg;
526
        register short backup_0 = block[0];
527
        register int j = 0;
528

  
529
        qmulv = vec_ld(0, qmul8);
530
        qaddv = vec_ld(0, qadd8);
531
        nqaddv = vec_ld(0, nqadd8);
532

  
533
#if 0   // block *is* 16 bytes-aligned, it seems.
534
        // first make sure block[j] is 16 bytes-aligned
535
        for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) {
536
            level = block[j];
537
            if (level) {
538
                if (level < 0) {
539
                    level = level * qmul - qadd;
540
                } else {
541
                    level = level * qmul + qadd;
542
                }
543
                block[j] = level;
554 544
            }
555
            block[j] = level;
556 545
        }
557
      }
558 546
#endif
559 547

  
560
      // vectorize all the 16 bytes-aligned blocks
561
      // of 8 elements
562
      for(; (j + 7) <= nCoeffs ; j+=8)
563
      {
564
        blockv = vec_ld(j << 1, block);
565
        blockv_neg = vec_cmplt(blockv, vczero);
566
        blockv_null = vec_cmpeq(blockv, vczero);
567
        // choose between +qadd or -qadd as the third operand
568
        temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
569
        // multiply & add (block{i,i+7} * qmul [+-] qadd)
570
        temp1 = vec_mladd(blockv, qmulv, temp1);
571
        // put 0 where block[{i,i+7} used to have 0
572
        blockv = vec_sel(temp1, blockv, blockv_null);
573
        vec_st(blockv, j << 1, block);
574
      }
575

  
576
      // if nCoeffs isn't a multiple of 8, finish the job
577
      // using good old scalar units.
578
      // (we could do it using a truncated vector,
579
      // but I'm not sure it's worth the hassle)
580
      for(; j <= nCoeffs ; j++) {
581
        level = block[j];
582
        if (level) {
583
          if (level < 0) {
584
                level = level * qmul - qadd;
585
            } else {
586
                level = level * qmul + qadd;
548
        // vectorize all the 16-byte aligned blocks
549
        // of 8 elements
550
        for(; (j + 7) <= nCoeffs ; j+=8) {
551
            blockv = vec_ld(j << 1, block);
552
            blockv_neg = vec_cmplt(blockv, vczero);
553
            blockv_null = vec_cmpeq(blockv, vczero);
554
            // choose between +qadd or -qadd as the third operand
555
            temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
556
            // multiply & add (block{i,i+7} * qmul [+-] qadd)
557
            temp1 = vec_mladd(blockv, qmulv, temp1);
558
            // put 0 where block[{i,i+7}] used to have 0
559
            blockv = vec_sel(temp1, blockv, blockv_null);
560
            vec_st(blockv, j << 1, block);
561
        }
562

  
563
        // if nCoeffs isn't a multiple of 8, finish the job
564
        // using good old scalar units.
565
        // (we could do it using a truncated vector,
566
        // but I'm not sure it's worth the hassle)
567
        for(; j <= nCoeffs ; j++) {
568
            level = block[j];
569
            if (level) {
570
                if (level < 0) {
571
                    level = level * qmul - qadd;
572
                } else {
573
                    level = level * qmul + qadd;
574
                }
575
                block[j] = level;
587 576
            }
588
            block[j] = level;
589 577
        }
590
      }
591 578

  
592
      if (i == 1)
593
      { // cheat. this avoid special-casing the first iteration
594
        block[0] = backup_0;
595
      }
579
        if (i == 1) {
580
            // cheat: this avoids special-casing the first iteration
581
            block[0] = backup_0;
582
        }
596 583
    }
597 584
POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63);
598 585
}
......
605 592
{
606 593
    if ((mm_flags & MM_ALTIVEC) == 0) return;
607 594

  
608
    if (s->avctx->lowres==0)
609
    {
595
    if (s->avctx->lowres==0) {
610 596
        if ((s->avctx->idct_algo == FF_IDCT_AUTO) ||
611
                (s->avctx->idct_algo == FF_IDCT_ALTIVEC))
612
        {
597
            (s->avctx->idct_algo == FF_IDCT_ALTIVEC)) {
613 598
            s->dsp.idct_put = idct_put_altivec;
614 599
            s->dsp.idct_add = idct_add_altivec;
615 600
            s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
......
618 603

  
619 604
    // Test to make sure that the dct required alignments are met.
620 605
    if ((((long)(s->q_intra_matrix) & 0x0f) != 0) ||
621
        (((long)(s->q_inter_matrix) & 0x0f) != 0))
622
    {
606
        (((long)(s->q_inter_matrix) & 0x0f) != 0)) {
623 607
        av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned "
624 608
                "to use AltiVec DCT. Reverting to non-AltiVec version.\n");
625 609
        return;
626 610
    }
627 611

  
628
    if (((long)(s->intra_scantable.inverse) & 0x0f) != 0)
629
    {
612
    if (((long)(s->intra_scantable.inverse) & 0x0f) != 0) {
630 613
        av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned "
631 614
                "to use AltiVec DCT. Reverting to non-AltiVec version.\n");
632 615
        return;
......
634 617

  
635 618

  
636 619
    if ((s->avctx->dct_algo == FF_DCT_AUTO) ||
637
            (s->avctx->dct_algo == FF_DCT_ALTIVEC))
638
    {
620
            (s->avctx->dct_algo == FF_DCT_ALTIVEC)) {
639 621
#if 0 /* seems to cause trouble under some circumstances */
640 622
        s->dct_quantize = dct_quantize_altivec;
641 623
#endif

Also available in: Unified diff