Revision c663cb0d
libavcodec/ppc/h264_altivec.c  

596  596 
ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel); 
597  597 
} 
598  598  
599 
// TODO: implement this in AltiVec 

600 
static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride) { 

601 
int i, j; 

602 
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; 

603 
int dc = (block[0] + 32) >> 6; 

604 
for( j = 0; j < 8; j++ ) 

605 
{ 

606 
for( i = 0; i < 8; i++ ) 

607 
dst[i] = cm[ dst[i] + dc ]; 

608 
dst += stride; 

599 
/**
 * Add the (rounded) DC term of an IDCT block to a size x size region of dst,
 * clipping each result to [0,255], using AltiVec.
 *
 * @param dst    destination pixel block
 * @param block  coefficient block; only block[0] (the DC coefficient) is read
 * @param stride byte stride between destination rows
 * @param size   edge length of the block, 4 or 8 (compile-time constant,
 *               function is always_inline so the size==4 branch folds away)
 */
static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, DCTELEM *block, int stride, int size)
{
    vec_s16 dc16;
    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
    LOAD_ZERO;
    DECLARE_ALIGNED_16(int, dc);  // aligned so vec_lde can load it directly
    int i;

    // Scalar DC value with H.264 rounding (add 32, shift right by 6).
    dc = (block[0] + 32) >> 6;
    // Load the int into a vector and splat its low 16 bits across all 8 lanes.
    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);

    if (size == 4)
        // For 4x4 blocks only the first 4 bytes per row are touched; shift so
        // the upper half of the vector becomes zero (adding/subtracting 0
        // leaves the untouched pixels unchanged when stored back).
        dc16 = vec_sld(dc16, zero_s16v, 8);
    // Unsigned pixels cannot take a signed add directly, so split the DC into
    // two non-negative parts via saturating packs:
    //   dcplus  = max(dc, 0)   (packsu clamps negatives to 0)
    //   dcminus = max(-dc, 0)
    // Then dst + dcplus - dcminus == dst + dc, with vec_adds/vec_subs also
    // providing the [0,255] clipping for free.
    dcplus = vec_packsu(dc16, zero_s16v);
    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);

    // Rotate the DC bytes so they line up with dst's offset within its
    // 16-byte-aligned load/store window (vec_ld/vec_st ignore the low 4
    // address bits). NOTE(review): the bytes of the vector outside the
    // rotated DC region are zero, so the adjacent pixels covered by the
    // 16-byte store are read and written back unchanged — this assumes no
    // concurrent writer touches those bytes; confirm against callers.
    aligner = vec_lvsr(0, dst);
    dcplus = vec_perm(dcplus, dcplus, aligner);
    dcminus = vec_perm(dcminus, dcminus, aligner);

    // Process 4 rows per iteration: 1 pass for size==4, 2 passes for size==8.
    for (i = 0; i < size; i += 4) {
        v0 = vec_ld(0, dst+0*stride);
        v1 = vec_ld(0, dst+1*stride);
        v2 = vec_ld(0, dst+2*stride);
        v3 = vec_ld(0, dst+3*stride);

        // Saturating add of the non-negative part of dc...
        v0 = vec_adds(v0, dcplus);
        v1 = vec_adds(v1, dcplus);
        v2 = vec_adds(v2, dcplus);
        v3 = vec_adds(v3, dcplus);

        // ...then saturating subtract of the non-negative part of -dc.
        v0 = vec_subs(v0, dcminus);
        v1 = vec_subs(v1, dcminus);
        v2 = vec_subs(v2, dcminus);
        v3 = vec_subs(v3, dcminus);

        vec_st(v0, 0, dst+0*stride);
        vec_st(v1, 0, dst+1*stride);
        vec_st(v2, 0, dst+2*stride);
        vec_st(v3, 0, dst+3*stride);

        dst += 4*stride;
    }
}
611  643  
644 
/** DC-only IDCT add for a 4x4 block (AltiVec). */
static void h264_idct_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
{
    h264_idct_dc_add_internal(dst, block, stride, 4);
}

648  
649 
/** DC-only IDCT add for an 8x8 block (AltiVec). */
static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
{
    h264_idct_dc_add_internal(dst, block, stride, 8);
}

653  
612  654 
static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 
613  655 
int i; 
614  656 
for(i=0; i<16; i+=4){ 
...  ...  
903  945 
h264_idct_add16, h264_idct_add16intra, h264_idct_add8 are implemented 
904  946 
c->h264_idct_add = ff_h264_idct_add_altivec;
905  947 
*/ 
948 
c->h264_idct_dc_add = h264_idct_dc_add_altivec;

949 
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_altivec;

906  950 
c->h264_idct8_add = ff_h264_idct8_add_altivec;
907  951 
c->h264_idct8_add4 = ff_h264_idct8_add4_altivec;
908  952 
c->h264_v_loop_filter_luma = h264_v_loop_filter_luma_altivec;
Also available in: Unified diff