Revision a6b4448c libavcodec/ppc/h264_altivec.c
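This revision is a mechanical rename of the AltiVec convenience types used throughout the PowerPC code: vec_u8_t, vec_s16_t, vec_u16_t, vec_u32_t and vec_s32_t become vec_u8, vec_s16, vec_u16, vec_u32 and vec_s32. The arithmetic is untouched. For orientation, the short names are assumed to be plain aliases for the underlying AltiVec vector types, roughly as sketched below (a sketch of what the shared PPC type header presumably provides, not a copy of it):

/* Hedged sketch: assumed aliases behind the renamed types; the real
 * definitions live in the shared PPC type header, not in this file. */
#define vec_u8  vector unsigned char
#define vec_s16 vector signed short
#define vec_u16 vector unsigned short
#define vec_u32 vector unsigned int
#define vec_s32 vector signed int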


libavcodec/ppc/h264_altivec.c

@@ -189,32 +189,32 @@
                          ((8 - x) * (y)),
                              ((x) * (y))};
     register int i;
-    vec_u8_t fperm;
-    const vec_s32_t vABCD = vec_ld(0, ABCD);
-    const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
-    const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
-    const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
-    const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
+    vec_u8 fperm;
+    const vec_s32 vABCD = vec_ld(0, ABCD);
+    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
+    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
+    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
+    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
     LOAD_ZERO;
-    const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
-    const vec_u16_t v6us  = vec_splat_u16(6);
+    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
+    const vec_u16 v6us  = vec_splat_u16(6);
     register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
     register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
 
-    vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
-    vec_u8_t vsrc0uc, vsrc1uc;
-    vec_s16_t vsrc0ssH, vsrc1ssH;
-    vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
-    vec_s16_t vsrc2ssH, vsrc3ssH, psum;
-    vec_u8_t vdst, ppsum, fsum;
+    vec_u8 vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
+    vec_u8 vsrc0uc, vsrc1uc;
+    vec_s16 vsrc0ssH, vsrc1ssH;
+    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
+    vec_s16 vsrc2ssH, vsrc3ssH, psum;
+    vec_u8 vdst, ppsum, fsum;
 
     if (((unsigned long)dst) % 16 == 0) {
-        fperm = (vec_u8_t){0x10, 0x11, 0x12, 0x13,
+        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                            0x14, 0x15, 0x16, 0x17,
                            0x08, 0x09, 0x0A, 0x0B,
                            0x0C, 0x0D, 0x0E, 0x0F};
     } else {
-        fperm = (vec_u8_t){0x00, 0x01, 0x02, 0x03,
+        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                            0x04, 0x05, 0x06, 0x07,
                            0x18, 0x19, 0x1A, 0x1B,
                            0x1C, 0x1D, 0x1E, 0x1F};
@@ -233,8 +233,8 @@
     else
         vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
 
-    vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc0uc);
-    vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc1uc);
+    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
+    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
 
     if (!loadSecond) {// -> !reallyBadAlign
         for (i = 0 ; i < h ; i++) {
@@ -245,8 +245,8 @@
             vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
             vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
 
-            vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
-            vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);
+            vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc2uc);
+            vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc3uc);
 
             psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
             psum = vec_mladd(vB, vsrc1ssH, psum);
@@ -256,7 +256,7 @@
             psum = vec_sra(psum, v6us);
 
             vdst = vec_ld(0, dst);
-            ppsum = (vec_u8_t)vec_packsu(psum, psum);
+            ppsum = (vec_u8)vec_packsu(psum, psum);
             fsum = vec_perm(vdst, ppsum, fperm);
 
             vec_st(fsum, 0, dst);
@@ -268,7 +268,7 @@
             src += stride;
         }
     } else {
-        vec_u8_t vsrcDuc;
+        vec_u8 vsrcDuc;
         for (i = 0 ; i < h ; i++) {
             vsrcCuc = vec_ld(stride + 0, src);
             vsrcDuc = vec_ld(stride + 16, src);
@@ -279,8 +279,8 @@
             else
                 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
 
-            vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
-            vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);
+            vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc2uc);
+            vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc3uc);
 
             psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
             psum = vec_mladd(vB, vsrc1ssH, psum);
@@ -290,7 +290,7 @@
             psum = vec_sr(psum, v6us);
 
             vdst = vec_ld(0, dst);
-            ppsum = (vec_u8_t)vec_pack(psum, psum);
+            ppsum = (vec_u8)vec_pack(psum, psum);
             fsum = vec_perm(vdst, ppsum, fperm);
 
             vec_st(fsum, 0, dst);
@@ -309,7 +309,7 @@
                                     int src_stride1, int h)
 {
     int i;
-    vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align;
+    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
 
     mask_ = vec_lvsl(0, src2);
 
@@ -351,7 +351,7 @@
                                     int src_stride1, int h)
 {
     int i;
-    vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align;
+    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
 
     mask_ = vec_lvsl(0, src2);
 
@@ -432,23 +432,23 @@
 #define VEC_LOAD_U8_ADD_S16_STORE_U8(va)                      \
     vdst_orig = vec_ld(0, dst);                               \
     vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask);          \
-    vdst_ss = (vec_s16_t) vec_mergeh(zero_u8v, vdst);         \
+    vdst_ss = (vec_s16) vec_mergeh(zero_u8v, vdst);         \
    va = vec_add(va, vdst_ss);                                \
     va_u8 = vec_packsu(va, zero_s16v);                        \
-    va_u32 = vec_splat((vec_u32_t)va_u8, 0);                  \
+    va_u32 = vec_splat((vec_u32)va_u8, 0);                  \
     vec_ste(va_u32, element, (uint32_t*)dst);
 
 static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
 {
-    vec_s16_t va0, va1, va2, va3;
-    vec_s16_t vz0, vz1, vz2, vz3;
-    vec_s16_t vtmp0, vtmp1, vtmp2, vtmp3;
-    vec_u8_t va_u8;
-    vec_u32_t va_u32;
-    vec_s16_t vdst_ss;
-    const vec_u16_t v6us = vec_splat_u16(6);
-    vec_u8_t vdst, vdst_orig;
-    vec_u8_t vdst_mask = vec_lvsl(0, dst);
+    vec_s16 va0, va1, va2, va3;
+    vec_s16 vz0, vz1, vz2, vz3;
+    vec_s16 vtmp0, vtmp1, vtmp2, vtmp3;
+    vec_u8 va_u8;
+    vec_u32 va_u32;
+    vec_s16 vdst_ss;
+    const vec_u16 v6us = vec_splat_u16(6);
+    vec_u8 vdst, vdst_orig;
+    vec_u8 vdst_mask = vec_lvsl(0, dst);
     int element = ((unsigned long)dst & 0xf) >> 2;
     LOAD_ZERO;
 
@@ -479,40 +479,40 @@
 
 #define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,  d0, d1, d2, d3, d4, d5, d6, d7) {\
     /*        a0  = SRC(0) + SRC(4); */ \
-    vec_s16_t a0v = vec_add(s0, s4);    \
+    vec_s16 a0v = vec_add(s0, s4);    \
     /*        a2  = SRC(0) - SRC(4); */ \
-    vec_s16_t a2v = vec_sub(s0, s4);    \
+    vec_s16 a2v = vec_sub(s0, s4);    \
     /*        a4  =           (SRC(2)>>1) - SRC(6); */ \
-    vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6);    \
+    vec_s16 a4v = vec_sub(vec_sra(s2, onev), s6);    \
     /*        a6  =           (SRC(6)>>1) + SRC(2); */ \
-    vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2);    \
+    vec_s16 a6v = vec_add(vec_sra(s6, onev), s2);    \
     /*        b0  =         a0 + a6; */ \
-    vec_s16_t b0v = vec_add(a0v, a6v);  \
+    vec_s16 b0v = vec_add(a0v, a6v);  \
     /*        b2  =         a2 + a4; */ \
-    vec_s16_t b2v = vec_add(a2v, a4v);  \
+    vec_s16 b2v = vec_add(a2v, a4v);  \
     /*        b4  =         a2 - a4; */ \
-    vec_s16_t b4v = vec_sub(a2v, a4v);  \
+    vec_s16 b4v = vec_sub(a2v, a4v);  \
     /*        b6  =         a0 - a6; */ \
-    vec_s16_t b6v = vec_sub(a0v, a6v);  \
+    vec_s16 b6v = vec_sub(a0v, a6v);  \
     /* a1 =  SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
     /*        a1 =             (SRC(5)-SRC(3)) -  (SRC(7)  +  (SRC(7)>>1)); */ \
-    vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
+    vec_s16 a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
     /* a3 =  SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
     /*        a3 =             (SRC(7)+SRC(1)) -  (SRC(3)  +  (SRC(3)>>1)); */ \
-    vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
+    vec_s16 a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
     /* a5 =  SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
     /*        a5 =             (SRC(7)-SRC(1)) +   SRC(5) +   (SRC(5)>>1); */ \
-    vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
+    vec_s16 a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
     /*        a7 =                SRC(5)+SRC(3) +  SRC(1) +   (SRC(1)>>1); */ \
-    vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
+    vec_s16 a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
     /*        b1 =                  (a7>>2)  +  a1; */ \
-    vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v); \
+    vec_s16 b1v = vec_add( vec_sra(a7v, twov), a1v); \
     /*        b3 =          a3 +        (a5>>2); */ \
-    vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov)); \
+    vec_s16 b3v = vec_add(a3v, vec_sra(a5v, twov)); \
     /*        b5 =                  (a3>>2)  -   a5; */ \
-    vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v); \
+    vec_s16 b5v = vec_sub( vec_sra(a3v, twov), a5v); \
     /*        b7 =           a7 -        (a1>>2); */ \
-    vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
+    vec_s16 b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
     /* DST(0,    b0 + b7); */ \
     d0 = vec_add(b0v, b7v); \
     /* DST(1,    b2 + b5); */ \
@@ -533,17 +533,17 @@
 
 #define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
     /* unaligned load */                                       \
-    vec_u8_t hv = vec_ld( 0, dest );                           \
-    vec_u8_t lv = vec_ld( 7, dest );                           \
-    vec_u8_t dstv   = vec_perm( hv, lv, (vec_u8_t)perm_ldv );  \
-    vec_s16_t idct_sh6 = vec_sra(idctv, sixv);                 \
-    vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv);   \
-    vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16);  \
-    vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum);        \
-    vec_u8_t edgehv;                                           \
+    vec_u8 hv = vec_ld( 0, dest );                           \
+    vec_u8 lv = vec_ld( 7, dest );                           \
+    vec_u8 dstv   = vec_perm( hv, lv, (vec_u8)perm_ldv );  \
+    vec_s16 idct_sh6 = vec_sra(idctv, sixv);                 \
+    vec_u16 dst16 = (vec_u16)vec_mergeh(zero_u8v, dstv);   \
+    vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16);  \
+    vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum);        \
+    vec_u8 edgehv;                                           \
     /* unaligned store */                                      \
-    vec_u8_t bodyv  = vec_perm( idstsum8, idstsum8, perm_stv );\
-    vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv );     \
+    vec_u8 bodyv  = vec_perm( idstsum8, idstsum8, perm_stv );\
+    vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv );     \
     lv    = vec_sel( lv, bodyv, edgelv );                      \
     vec_st( lv, 7, dest );                                     \
     hv    = vec_ld( 0, dest );                                 \
@@ -553,18 +553,18 @@
  }
 
 void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
-    vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7;
-    vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7;
-    vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
+    vec_s16 s0, s1, s2, s3, s4, s5, s6, s7;
+    vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
+    vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
 
-    vec_u8_t perm_ldv = vec_lvsl(0, dst);
-    vec_u8_t perm_stv = vec_lvsr(8, dst);
+    vec_u8 perm_ldv = vec_lvsl(0, dst);
+    vec_u8 perm_stv = vec_lvsr(8, dst);
 
-    const vec_u16_t onev = vec_splat_u16(1);
-    const vec_u16_t twov = vec_splat_u16(2);
-    const vec_u16_t sixv = vec_splat_u16(6);
+    const vec_u16 onev = vec_splat_u16(1);
+    const vec_u16 twov = vec_splat_u16(2);
+    const vec_u16 sixv = vec_splat_u16(6);
 
-    const vec_u8_t sel = (vec_u8_t) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1};
+    const vec_u8 sel = (vec_u8) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1};
     LOAD_ZERO;
 
     dct[0] += 32; // rounding for the >>6 at the end
@@ -621,10 +621,10 @@
 }
 
 #define transpose4x16(r0, r1, r2, r3) {      \
-    register vec_u8_t r4;                    \
-    register vec_u8_t r5;                    \
-    register vec_u8_t r6;                    \
-    register vec_u8_t r7;                    \
+    register vec_u8 r4;                    \
+    register vec_u8 r5;                    \
+    register vec_u8 r6;                    \
+    register vec_u8 r7;                    \
                                              \
     r4 = vec_mergeh(r0, r2);  /*0, 2 set 0*/ \
     r5 = vec_mergel(r0, r2);  /*0, 2 set 1*/ \
@@ -638,8 +638,8 @@
 }
 
 static inline void write16x4(uint8_t *dst, int dst_stride,
-                             register vec_u8_t r0, register vec_u8_t r1,
-                             register vec_u8_t r2, register vec_u8_t r3) {
+                             register vec_u8 r0, register vec_u8 r1,
+                             register vec_u8 r2, register vec_u8 r3) {
     DECLARE_ALIGNED_16(unsigned char, result[64]);
     uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
     int int_dst_stride = dst_stride/4;
@@ -671,16 +671,16 @@
     \todo FIXME: see if we can't spare some vec_lvsl() by them factorizing
     out of unaligned_load() */
 #define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
-    register vec_u8_t r0  = unaligned_load(0,             src);            \
-    register vec_u8_t r1  = unaligned_load(   src_stride, src);            \
-    register vec_u8_t r2  = unaligned_load(2* src_stride, src);            \
-    register vec_u8_t r3  = unaligned_load(3* src_stride, src);            \
-    register vec_u8_t r4  = unaligned_load(4* src_stride, src);            \
-    register vec_u8_t r5  = unaligned_load(5* src_stride, src);            \
-    register vec_u8_t r6  = unaligned_load(6* src_stride, src);            \
-    register vec_u8_t r7  = unaligned_load(7* src_stride, src);            \
-    register vec_u8_t r14 = unaligned_load(14*src_stride, src);            \
-    register vec_u8_t r15 = unaligned_load(15*src_stride, src);            \
+    register vec_u8 r0  = unaligned_load(0,             src);            \
+    register vec_u8 r1  = unaligned_load(   src_stride, src);            \
+    register vec_u8 r2  = unaligned_load(2* src_stride, src);            \
+    register vec_u8 r3  = unaligned_load(3* src_stride, src);            \
+    register vec_u8 r4  = unaligned_load(4* src_stride, src);            \
+    register vec_u8 r5  = unaligned_load(5* src_stride, src);            \
+    register vec_u8 r6  = unaligned_load(6* src_stride, src);            \
+    register vec_u8 r7  = unaligned_load(7* src_stride, src);            \
+    register vec_u8 r14 = unaligned_load(14*src_stride, src);            \
+    register vec_u8 r15 = unaligned_load(15*src_stride, src);            \
                                                                            \
     r8  = unaligned_load( 8*src_stride, src);                              \
     r9  = unaligned_load( 9*src_stride, src);                              \
@@ -730,26 +730,26 @@
 }
 
 // out: o = |x-y| < a
-static inline vec_u8_t diff_lt_altivec ( register vec_u8_t x,
-                                         register vec_u8_t y,
-                                         register vec_u8_t a) {
-
-    register vec_u8_t diff = vec_subs(x, y);
-    register vec_u8_t diffneg = vec_subs(y, x);
-    register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */
-    o = (vec_u8_t)vec_cmplt(o, a);
+static inline vec_u8 diff_lt_altivec ( register vec_u8 x,
+                                         register vec_u8 y,
+                                         register vec_u8 a) {
+
+    register vec_u8 diff = vec_subs(x, y);
+    register vec_u8 diffneg = vec_subs(y, x);
+    register vec_u8 o = vec_or(diff, diffneg); /* |x-y| */
+    o = (vec_u8)vec_cmplt(o, a);
     return o;
 }
 
-static inline vec_u8_t h264_deblock_mask ( register vec_u8_t p0,
-                                           register vec_u8_t p1,
-                                           register vec_u8_t q0,
-                                           register vec_u8_t q1,
-                                           register vec_u8_t alpha,
-                                           register vec_u8_t beta) {
+static inline vec_u8 h264_deblock_mask ( register vec_u8 p0,
+                                           register vec_u8 p1,
+                                           register vec_u8 q0,
+                                           register vec_u8 q1,
+                                           register vec_u8 alpha,
+                                           register vec_u8 beta) {
 
-    register vec_u8_t mask;
-    register vec_u8_t tempmask;
+    register vec_u8 mask;
+    register vec_u8 tempmask;
 
     mask = diff_lt_altivec(p0, q0, alpha);
     tempmask = diff_lt_altivec(p1, p0, beta);
@@ -761,19 +761,19 @@
 }
 
 // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
-static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
-                                       register vec_u8_t p1,
-                                       register vec_u8_t p2,
-                                       register vec_u8_t q0,
-                                       register vec_u8_t tc0) {
-
-    register vec_u8_t average = vec_avg(p0, q0);
-    register vec_u8_t temp;
-    register vec_u8_t uncliped;
-    register vec_u8_t ones;
-    register vec_u8_t max;
-    register vec_u8_t min;
-    register vec_u8_t newp1;
+static inline vec_u8 h264_deblock_q1(register vec_u8 p0,
+                                       register vec_u8 p1,
+                                       register vec_u8 p2,
+                                       register vec_u8 q0,
+                                       register vec_u8 tc0) {
+
+    register vec_u8 average = vec_avg(p0, q0);
+    register vec_u8 temp;
+    register vec_u8 uncliped;
+    register vec_u8 ones;
+    register vec_u8 max;
+    register vec_u8 min;
+    register vec_u8 newp1;
 
     temp = vec_xor(average, p2);
     average = vec_avg(average, p2);     /*avg(p2, avg(p0, q0)) */
@@ -789,16 +789,16 @@
 
 #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) {                                           \
                                                                                                   \
-    const vec_u8_t A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4));                               \
+    const vec_u8 A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4));                               \
                                                                                                   \
-    register vec_u8_t pq0bit = vec_xor(p0,q0);                                                    \
-    register vec_u8_t q1minus;                                                                    \
-    register vec_u8_t p0minus;                                                                    \
-    register vec_u8_t stage1;                                                                     \
-    register vec_u8_t stage2;                                                                     \
-    register vec_u8_t vec160;                                                                     \
-    register vec_u8_t delta;                                                                      \
-    register vec_u8_t deltaneg;                                                                   \
+    register vec_u8 pq0bit = vec_xor(p0,q0);                                                    \
+    register vec_u8 q1minus;                                                                    \
+    register vec_u8 p0minus;                                                                    \
+    register vec_u8 stage1;                                                                     \
+    register vec_u8 stage2;                                                                     \
+    register vec_u8 vec160;                                                                     \
+    register vec_u8 delta;                                                                      \
+    register vec_u8 deltaneg;                                                                   \
                                                                                                   \
     q1minus = vec_nor(q1, q1);                 /* 255 - q1 */                                     \
     stage1 = vec_avg(p1, q1minus);             /* (p1 - q1 + 256)>>1 */                           \
@@ -821,16 +821,16 @@
 
 #define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) {            \
     DECLARE_ALIGNED_16(unsigned char, temp[16]);                                             \
-    register vec_u8_t alphavec;                                                              \
-    register vec_u8_t betavec;                                                               \
-    register vec_u8_t mask;                                                                  \
-    register vec_u8_t p1mask;                                                                \
-    register vec_u8_t q1mask;                                                                \
+    register vec_u8 alphavec;                                                              \
+    register vec_u8 betavec;                                                               \
+    register vec_u8 mask;                                                                  \
+    register vec_u8 p1mask;                                                                \
+    register vec_u8 q1mask;                                                                \
     register vector signed   char tc0vec;                                                    \
-    register vec_u8_t finaltc0;                                                              \
-    register vec_u8_t tc0masked;                                                             \
-    register vec_u8_t newp1;                                                                 \
-    register vec_u8_t newq1;                                                                 \
+    register vec_u8 finaltc0;                                                              \
+    register vec_u8 tc0masked;                                                             \
+    register vec_u8 newp1;                                                                 \
+    register vec_u8 newq1;                                                                 \
                                                                                              \
     temp[0] = alpha;                                                                         \
     temp[1] = beta;                                                                          \
@@ -844,18 +844,18 @@
     tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
     tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
     mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1)));  /* if tc0[i] >= 0 */         \
-    finaltc0 = vec_and((vec_u8_t)tc0vec, mask);     /* tc = tc0 */                           \
+    finaltc0 = vec_and((vec_u8)tc0vec, mask);     /* tc = tc0 */                           \
                                                                                              \
     p1mask = diff_lt_altivec(p2, p0, betavec);                                               \
     p1mask = vec_and(p1mask, mask);                             /* if ( |p2 - p0| < beta) */ \
-    tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec);                                           \
+    tc0masked = vec_and(p1mask, (vec_u8)tc0vec);                                           \
     finaltc0 = vec_sub(finaltc0, p1mask);                       /* tc++ */                   \
     newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked);                                      \
     /*end if*/                                                                               \
                                                                                              \
     q1mask = diff_lt_altivec(q2, q0, betavec);                                               \
     q1mask = vec_and(q1mask, mask);                             /* if ( |q2 - q0| < beta ) */\
-    tc0masked = vec_and(q1mask, (vec_u8_t)tc0vec);                                           \
+    tc0masked = vec_and(q1mask, (vec_u8)tc0vec);                                           \
     finaltc0 = vec_sub(finaltc0, q1mask);                       /* tc++ */                   \
     newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked);                                      \
     /*end if*/                                                                               \
@@ -868,12 +868,12 @@
 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
 
     if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
-        register vec_u8_t p2 = vec_ld(-3*stride, pix);
-        register vec_u8_t p1 = vec_ld(-2*stride, pix);
-        register vec_u8_t p0 = vec_ld(-1*stride, pix);
-        register vec_u8_t q0 = vec_ld(0, pix);
-        register vec_u8_t q1 = vec_ld(stride, pix);
-        register vec_u8_t q2 = vec_ld(2*stride, pix);
+        register vec_u8 p2 = vec_ld(-3*stride, pix);
+        register vec_u8 p1 = vec_ld(-2*stride, pix);
+        register vec_u8 p0 = vec_ld(-1*stride, pix);
+        register vec_u8 q0 = vec_ld(0, pix);
+        register vec_u8 q1 = vec_ld(stride, pix);
+        register vec_u8 q2 = vec_ld(2*stride, pix);
         h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
         vec_st(p1, -2*stride, pix);
         vec_st(p0, -1*stride, pix);
@@ -884,7 +884,7 @@
 
 static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
 
-    register vec_u8_t line0, line1, line2, line3, line4, line5;
+    register vec_u8 line0, line1, line2, line3, line4, line5;
     if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
         return;
     readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
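
For reference, the first hunks touch the chroma motion-compensation kernel, which vectorizes the standard H.264 chroma bilinear interpolation. A minimal scalar sketch of the put variant is shown below; the helper name, signature, and the +32 rounding are illustrative only and not taken from the file (the AltiVec code also has paths with different rounding, cf. v28ss):

/* Scalar sketch of 8-pixel-wide H.264 chroma MC. A, B, C, D correspond to
 * the ABCD table loaded into vABCD in the AltiVec version above. */
static void chroma_mc8_put_ref(uint8_t *dst, const uint8_t *src,
                               int stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B =     (x) * (8 - y);
    const int C = (8 - x) *     (y);
    const int D =     (x) *     (y);
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
        dst += stride;
        src += stride;
    }
}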

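The IDCT hunks are likewise pure renames. The store step that ALTIVEC_STORE_SUM_CLIP performs with vec_sra, vec_adds and vec_packsu corresponds to the scalar operation sketched below (helper names are illustrative; the +32 rounding bias is already folded into dct[0] before the transform, as the "dct[0] += 32" context line shows):

/* Scalar sketch of the IDCT8 add-and-clip store: shift the pre-rounded
 * IDCT result down by 6 and add it to the destination pixels, clamping
 * to [0, 255]. */
static inline uint8_t clip_uint8_ref(int v)
{
    return v < 0 ? 0 : (v > 255 ? 255 : v);
}

static void idct8_add_row_ref(uint8_t *dst, const int16_t *idct_row)
{
    int i;
    for (i = 0; i < 8; i++)
        dst[i] = clip_uint8_ref(dst[i] + (idct_row[i] >> 6));
}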