Revision c663cb0d libavcodec/ppc/h264_altivec.c


libavcodec/ppc/h264_altivec.c
@@ -596,19 +596,61 @@
     ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
 }
 
-// TODO: implement this in AltiVec
-static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride) {
-    int i, j;
-    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-    int dc = (block[0] + 32) >> 6;
-    for( j = 0; j < 8; j++ )
-    {
-        for( i = 0; i < 8; i++ )
-            dst[i] = cm[ dst[i] + dc ];
-        dst += stride;
+static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, DCTELEM *block, int stride, int size)
+{
+    vec_s16 dc16;
+    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
+    LOAD_ZERO;
+    DECLARE_ALIGNED_16(int, dc);
+    int i;
+
+    dc = (block[0] + 32) >> 6;
+    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);
+
+    if (size == 4)
+        dc16 = vec_sld(dc16, zero_s16v, 8);
+    dcplus = vec_packsu(dc16, zero_s16v);
+    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);
+
+    aligner = vec_lvsr(0, dst);
+    dcplus = vec_perm(dcplus, dcplus, aligner);
+    dcminus = vec_perm(dcminus, dcminus, aligner);
+
+    for (i = 0; i < size; i += 4) {
+        v0 = vec_ld(0, dst+0*stride);
+        v1 = vec_ld(0, dst+1*stride);
+        v2 = vec_ld(0, dst+2*stride);
+        v3 = vec_ld(0, dst+3*stride);
+
+        v0 = vec_adds(v0, dcplus);
+        v1 = vec_adds(v1, dcplus);
+        v2 = vec_adds(v2, dcplus);
+        v3 = vec_adds(v3, dcplus);
+
+        v0 = vec_subs(v0, dcminus);
+        v1 = vec_subs(v1, dcminus);
+        v2 = vec_subs(v2, dcminus);
+        v3 = vec_subs(v3, dcminus);
+
+        vec_st(v0, 0, dst+0*stride);
+        vec_st(v1, 0, dst+1*stride);
+        vec_st(v2, 0, dst+2*stride);
+        vec_st(v3, 0, dst+3*stride);
+
+        dst += 4*stride;
     }
 }
 
+static void h264_idct_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
+{
+    h264_idct_dc_add_internal(dst, block, stride, 4);
+}
+
+static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
+{
+    h264_idct_dc_add_internal(dst, block, stride, 8);
+}
+
 static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
     int i;
     for(i=0; i<16; i+=4){
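The replacement computes the rounded DC offset ((block[0] + 32) >> 6) once, splats it across a vector and applies it with saturating arithmetic: dcplus carries max(dc, 0) and dcminus carries max(-dc, 0), so a vec_adds followed by a vec_subs on the unsigned pixel bytes reproduces the clipped dst[i] + dc of the removed scalar loop without the ff_cropTbl lookup, while vec_lvsr/vec_perm rotate the DC bytes to dst's offset inside the 16-byte aligned block so the unaffected lanes see +0/-0. Below is a minimal scalar sketch of that saturating add/subtract decomposition only; the function and variable names are illustrative and not part of the patch.

/* Illustrative scalar model of the dcplus/dcminus trick, not part of the
 * revision: for unsigned 8-bit pixels, clip(p + dc) equals a saturating add
 * of max(dc, 0) followed by a saturating subtract of max(-dc, 0). */
#include <stdint.h>
#include <stdio.h>

static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

static uint8_t dc_add_model(uint8_t p, int dc)
{
    uint8_t dcplus  = clip_u8(dc);   /* models vec_packsu(dc16, zero)             */
    uint8_t dcminus = clip_u8(-dc);  /* models vec_packsu(vec_sub(zero, dc16), 0) */
    int t = p + dcplus;              /* models vec_adds: saturating add           */
    if (t > 255)
        t = 255;
    t -= dcminus;                    /* models vec_subs: saturating subtract      */
    return t < 0 ? 0 : (uint8_t)t;
}

int main(void)
{
    /* Check the model against direct clipping for every pixel value and a
     * range of DC offsets covering what (block[0] + 32) >> 6 can produce. */
    for (int dc = -512; dc <= 512; dc++)
        for (int p = 0; p < 256; p++)
            if (dc_add_model((uint8_t)p, dc) != clip_u8(p + dc)) {
                printf("mismatch: p=%d dc=%d\n", p, dc);
                return 1;
            }
    printf("dcplus/dcminus decomposition matches direct clipping\n");
    return 0;
}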
@@ -903,6 +945,8 @@
    h264_idct_add16, h264_idct_add16intra, h264_idct_add8 are implemented
         c->h264_idct_add = ff_h264_idct_add_altivec;
 */
+        c->h264_idct_dc_add= h264_idct_dc_add_altivec;
+        c->h264_idct8_dc_add = ff_h264_idct8_dc_add_altivec;
         c->h264_idct8_add = ff_h264_idct8_add_altivec;
         c->h264_idct8_add4 = ff_h264_idct8_add4_altivec;
         c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
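The two added assignments register the new routines in the decoder's function-pointer table next to the existing ff_h264_idct8_add_altivec entries, so the AltiVec versions take over from the C defaults only when this init path runs. A generic sketch of that dispatch pattern follows; the struct, field and helper names are illustrative and do not reproduce FFmpeg's actual DSPContext or its init code.

/* Illustrative function-pointer dispatch, not FFmpeg's real DSPContext:
 * the context starts with scalar defaults and an init routine overwrites
 * the entries when the SIMD path is available. */
#include <stdint.h>
#include <stdio.h>

typedef int16_t dctelem; /* stand-in for DCTELEM */

typedef struct {
    void (*h264_idct_dc_add)(uint8_t *dst, dctelem *block, int stride);
    void (*h264_idct8_dc_add)(uint8_t *dst, dctelem *block, int stride);
} example_dsp_ctx;

static void idct_dc_add_c(uint8_t *d, dctelem *b, int s)    { (void)d; (void)b; (void)s; puts("scalar 4x4 dc add"); }
static void idct8_dc_add_c(uint8_t *d, dctelem *b, int s)   { (void)d; (void)b; (void)s; puts("scalar 8x8 dc add"); }
static void idct_dc_add_simd(uint8_t *d, dctelem *b, int s) { (void)d; (void)b; (void)s; puts("SIMD 4x4 dc add"); }
static void idct8_dc_add_simd(uint8_t *d, dctelem *b, int s){ (void)d; (void)b; (void)s; puts("SIMD 8x8 dc add"); }

static void example_dsp_init(example_dsp_ctx *c, int have_simd)
{
    /* Scalar defaults first, then the SIMD overrides, mirroring how the
     * revision hooks h264_idct_dc_add_altivec / ff_h264_idct8_dc_add_altivec
     * into the context when AltiVec is detected. */
    c->h264_idct_dc_add  = idct_dc_add_c;
    c->h264_idct8_dc_add = idct8_dc_add_c;
    if (have_simd) {
        c->h264_idct_dc_add  = idct_dc_add_simd;
        c->h264_idct8_dc_add = idct8_dc_add_simd;
    }
}

int main(void)
{
    example_dsp_ctx c;
    uint8_t dst[8 * 8] = {0};
    dctelem block[64] = {0};

    example_dsp_init(&c, 1);          /* pretend the CPU probe reported SIMD */
    c.h264_idct8_dc_add(dst, block, 8);
    return 0;
}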
