Revision 5cf08f23: libavcodec/i386/dsputil_mmx.c

View differences:

--- libavcodec/i386/dsputil_mmx.c
+++ libavcodec/i386/dsputil_mmx.c
@@ -45,6 +45,7 @@
 static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
 
+static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
 
 #define JUMPALIGN() __asm __volatile (".balign 8"::)
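
Note: MMX has no per-byte shift, so the byte-domain filter added below builds a per-byte x >> 2 out of psrlw $2 (a per-word shift) followed by pand with the new ff_pb_3F constant, which clears the two high bits of every byte where bits from the neighbouring byte could have landed. A minimal scalar model of that identity (srl2_per_byte is a hypothetical name used only for this sketch):

    #include <assert.h>
    #include <stdint.h>

    /* Model of "psrlw $2 ; pand ff_pb_3F": a wide shift followed by masking
     * the top two bits of every byte equals an independent >>2 in each byte. */
    uint64_t srl2_per_byte(uint64_t x)
    {
        return (x >> 2) & 0x3F3F3F3F3F3F3F3FULL;
    }

    int main(void)
    {
        uint64_t x = 0x0123456789ABCDEFULL;
        for (int i = 0; i < 8; i++) {
            uint8_t b = x >> (8 * i);
            assert((uint8_t)(srl2_per_byte(x) >> (8 * i)) == (uint8_t)(b >> 2));
        }
        return 0;
    }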
@@ -692,204 +693,265 @@
     );
 }
 
-// dst = ABS( a - b )
-#define MMABS_DIFF_MMX2(a,b,dst,z)\
-    "movq    " #b ", " #dst " \n\t"\
-    "movq    " #a ", " #z   " \n\t"\
-    "psubusw " #b ", " #z   " \n\t"\
-    "psubusw " #a ", " #dst " \n\t"\
-    "pmaxsw  " #z ", " #dst " \n\t"
-
-// a = clip( a, -tc, tc )
-#define CLIP_MMX2(a,tc,z)\
-    "pxor    " #z  ", " #z "  \n\t"\
-    "psubw   " #tc ", " #z "  \n\t"\
-    "pmaxsw  " #z  ", " #a "  \n\t"\
-    "pminsw  " #tc ", " #a "  \n\t"
-
-// in: mm0=p1, mm1=p0, mm2=q0, mm3=q1
-// out: mm7 = do we filter this pixel?
-#define H264_DEBLOCK_THRESH(alpha,beta)\
-    "pxor      %%mm7, %%mm7     \n\t"\
-    "punpcklbw %%mm7, %%mm0     \n\t"\
-    "punpcklbw %%mm7, %%mm1     \n\t"\
-    "punpcklbw %%mm7, %%mm2     \n\t"\
-    "punpcklbw %%mm7, %%mm3     \n\t"\
-    MMABS_DIFF_MMX2(%%mm1, %%mm2, %%mm5, %%mm4)\
-    "movd " #alpha ", %%mm6     \n\t"\
-    "pshufw    $0, %%mm6, %%mm6 \n\t"\
-    "pcmpgtw   %%mm5, %%mm6     \n\t" /* ABS(p0-q0) < alpha */\
-    MMABS_DIFF_MMX2(%%mm0, %%mm1, %%mm5, %%mm4)\
-    MMABS_DIFF_MMX2(%%mm3, %%mm2, %%mm7, %%mm4)\
-    "pmaxsw    %%mm7, %%mm5     \n\t"\
-    "movd  " #beta ", %%mm7     \n\t"\
-    "pshufw    $0, %%mm7, %%mm7 \n\t"\
-    "movq      %%mm7, %%mm4     \n\t"\
-    "pcmpgtw   %%mm5, %%mm7     \n\t" /* ABS(p1-p0) < beta && ABS(q1-q0) < beta */\
-    "pand      %%mm6, %%mm7     \n\t"
-
-// in: mm0=p1, mm1=p0, mm2=q0, mm3=q1, mm6=tc
-// out: mm1=p0', mm2=q0'
-#define H264_DEBLOCK_P0_Q0(pw4)\
-    "movq   " #pw4 ", %%mm4     \n\t"\
-    "movq      %%mm2, %%mm5     \n\t"\
-    "paddw     %%mm4, %%mm0     \n\t"\
-    "psubw     %%mm1, %%mm5     \n\t"\
-    "psubw     %%mm3, %%mm0     \n\t"\
-    "psllw     $2,    %%mm5     \n\t"\
-    "paddw     %%mm0, %%mm5     \n\t"\
-    "psraw     $3,    %%mm5     \n\t" /* mm5 = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */\
-    CLIP_MMX2(%%mm5, %%mm6, %%mm4)    /* delta = clip( mm5, -tc, tc ) */\
-    "paddw     %%mm5, %%mm1     \n\t" /* p0 += delta */\
-    "psubw     %%mm5, %%mm2     \n\t" /* q0 -= delta */
-
-// in: mm1=p0, mm2=q0, mm6=tc0
-// out: mm5=delta
-#define H264_DEBLOCK_DELTA_PQ1(p1,p2,z)\
-    "movq      %%mm1, %%mm5     \n\t"\
-    "pavgb     %%mm2, %%mm5     \n\t"\
-    "paddw   " #p2 ", %%mm5     \n\t"\
-    "psraw     $1, %%mm5        \n\t"\
-    "psubw   " #p1 ", %%mm5     \n\t" /* ( ( q2 + ((p0+q0+1)>>1) ) >> 1 ) - q1 */\
-    CLIP_MMX2(%%mm5, %%mm6, z)
-
-static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int tc0)
+
+// out: o = |x-y|>a
+// clobbers: t
+#define DIFF_GT_MMX(x,y,a,o,t)\
+    "movq     "#y", "#t"  \n\t"\
+    "movq     "#x", "#o"  \n\t"\
+    "psubusb  "#x", "#t"  \n\t"\
+    "psubusb  "#y", "#o"  \n\t"\
+    "por      "#t", "#o"  \n\t"\
+    "psubusb  "#a", "#o"  \n\t"
+
+// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1
+// out: mm5=beta-1, mm7=mask
+// clobbers: mm4,mm6
+#define H264_DEBLOCK_MASK(alpha1, beta1) \
+    "pshufw $0, "#alpha1", %%mm4 \n\t"\
+    "pshufw $0, "#beta1 ", %%mm5 \n\t"\
+    "packuswb  %%mm4, %%mm4      \n\t"\
+    "packuswb  %%mm5, %%mm5      \n\t"\
+    DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
+    DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
+    "por       %%mm4, %%mm7      \n\t"\
+    DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
+    "por       %%mm4, %%mm7      \n\t"\
+    "pxor      %%mm6, %%mm6      \n\t"\
+    "pcmpeqb   %%mm6, %%mm7      \n\t"
+
+// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
+// out: mm1=p0' mm2=q0'
+// clobbers: mm0,3-6
+#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
+        /* a = q0^p0^((p1-q1)>>2) */\
+        "movq    %%mm0, %%mm4  \n\t"\
+        "psubb   %%mm3, %%mm4  \n\t"\
+        "psrlw   $2,    %%mm4  \n\t"\
+        "pxor    %%mm1, %%mm4  \n\t"\
+        "pxor    %%mm2, %%mm4  \n\t"\
+        /* b = p0^(q1>>2) */\
+        "psrlw   $2,    %%mm3  \n\t"\
+        "pand "#pb_3f", %%mm3  \n\t"\
+        "movq    %%mm1, %%mm5  \n\t"\
+        "pxor    %%mm3, %%mm5  \n\t"\
+        /* c = q0^(p1>>2) */\
+        "psrlw   $2,    %%mm0  \n\t"\
+        "pand "#pb_3f", %%mm0  \n\t"\
+        "movq    %%mm2, %%mm6  \n\t"\
+        "pxor    %%mm0, %%mm6  \n\t"\
+        /* d = (c^b) & ~(b^a) & 1 */\
+        "pxor    %%mm5, %%mm6  \n\t"\
+        "pxor    %%mm4, %%mm5  \n\t"\
+        "pandn   %%mm6, %%mm5  \n\t"\
+        "pand "#pb_01", %%mm5  \n\t"\
+        /* delta = (avg(q0, p1>>2) + (d&a))
+         *       - (avg(p0, q1>>2) + (d&~a)) */\
+        "pavgb   %%mm2, %%mm0  \n\t"\
+        "movq    %%mm5, %%mm6  \n\t"\
+        "pand    %%mm4, %%mm6  \n\t"\
+        "paddusb %%mm6, %%mm0  \n\t"\
+        "pavgb   %%mm1, %%mm3  \n\t"\
+        "pandn   %%mm5, %%mm4  \n\t"\
+        "paddusb %%mm4, %%mm3  \n\t"\
+        /* p0 += clip(delta, -tc0, tc0)
+         * q0 -= clip(delta, -tc0, tc0) */\
+        "movq    %%mm0, %%mm4  \n\t"\
+        "psubusb %%mm3, %%mm0  \n\t"\
+        "psubusb %%mm4, %%mm3  \n\t"\
+        "pminub  %%mm7, %%mm0  \n\t"\
+        "pminub  %%mm7, %%mm3  \n\t"\
+        "paddusb %%mm0, %%mm1  \n\t"\
+        "paddusb %%mm3, %%mm2  \n\t"\
+        "psubusb %%mm3, %%mm1  \n\t"\
+        "psubusb %%mm0, %%mm2  \n\t"
+
+// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=mm_bone
+// out: (q1addr) = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
+// clobbers: q2, tmp, tc0
+#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
+        "movq     %%mm1,  "#tmp"   \n\t"\
+        "pavgb    %%mm2,  "#tmp"   \n\t"\
+        "pavgb    "#tmp", "#q2"    \n\t" /* avg(p2,avg(p0,q0)) */\
+        "pxor   "q2addr", "#tmp"   \n\t"\
+        "pand     %8,     "#tmp"   \n\t" /* (p2^avg(p0,q0))&1 */\
+        "psubusb  "#tmp", "#q2"    \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
+        "movq     "#p1",  "#tmp"   \n\t"\
+        "psubusb  "#tc0", "#tmp"   \n\t"\
+        "paddusb  "#p1",  "#tc0"   \n\t"\
+        "pmaxub   "#tmp", "#q2"    \n\t"\
+        "pminub   "#tc0", "#q2"    \n\t"\
+        "movq     "#q2",  "q1addr" \n\t"
+
+static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
 {
-    uint64_t tmp0, tmp1;
+    uint64_t tmp0;
+    uint64_t tc = (uint8_t)tc0[1]*0x01010000 | (uint8_t)tc0[0]*0x0101;
+    // with luma, tc0=0 doesn't mean no filtering, so we need a separate input mask
+    uint32_t mask[2] = { (tc0[0]>=0)*0xffffffff, (tc0[1]>=0)*0xffffffff };
+
     asm volatile(
-        "movd  (%2,%4),   %%mm0        \n\t" //p1
-        "movd  (%2,%4,2), %%mm1        \n\t" //p0
-        "movd  (%3),      %%mm2        \n\t" //q0
-        "movd  (%3,%4),   %%mm3        \n\t" //q1
-        H264_DEBLOCK_THRESH(%6,%7)
-        "movq      %%mm7, %0           \n\t"
-
-// filter p1 if ABS(p2-p0) < beta
-        "movd      (%2),  %%mm3        \n\t"
-        "pxor      %%mm6, %%mm6        \n\t"
-        "punpcklbw %%mm6, %%mm3        \n\t" //p2
-        MMABS_DIFF_MMX2(%%mm1, %%mm3, %%mm5, %%mm6)
-        "pcmpgtw   %%mm5, %%mm4        \n\t"
-        "pand      %%mm7, %%mm4        \n\t" // mm4 = ( ABS( p2 - p0 ) < beta && filterp )
-        "movd      %5,    %%mm6        \n\t"
-        "pshufw    $0, %%mm6, %%mm6    \n\t" //tc
-
-        H264_DEBLOCK_DELTA_PQ1(%%mm0, %%mm3, %%mm7) // delta = clip( ( p2 + ((p0+q0+1)>>1) ) >> 1 ) - p1 )
-        "pand      %%mm4, %%mm5        \n\t"
-        "paddw     %%mm0, %%mm5        \n\t"
-        "packuswb  %%mm5, %%mm5        \n\t"
-        "movd      %%mm5, (%2,%4)      \n\t" // *p1 += delta
-        "psrlw     $15, %%mm4          \n\t"
-        "paddw     %%mm6, %%mm4        \n\t" // tc++
-        "movq      %%mm4, %1           \n\t"
-
-// filter q1 if ABS(q2-q0) < beta
-        "pxor      %%mm7, %%mm7        \n\t"
-        "movd  (%3,%4),   %%mm3        \n\t" //q1
-        "movd  (%3,%4,2), %%mm4        \n\t" //q2
-        "punpcklbw %%mm7, %%mm3        \n\t"
-        "punpcklbw %%mm7, %%mm4        \n\t"
-        MMABS_DIFF_MMX2(%%mm2, %%mm4, %%mm5, %%mm7)
-        "movd      %7,    %%mm7        \n\t"
-        "pshufw    $0, %%mm7, %%mm7    \n\t"
-        "pcmpgtw   %%mm5, %%mm7        \n\t"
-
-        H264_DEBLOCK_DELTA_PQ1(%%mm3, %%mm4, %%mm4) // delta = clip( ( q2 + ((p0+q0+1)>>1) ) >> 1 ) - q1 )
-        "movq      %0,    %%mm4        \n\t"
-        "pand      %%mm4, %%mm7        \n\t" // mm7 = ( ABS( q2 - q0 ) < beta && filterp )
-        "pand      %%mm7, %%mm5        \n\t"
-        "paddw     %%mm3, %%mm5        \n\t"
-        "packuswb  %%mm5, %%mm5        \n\t"
-        "movd      %%mm5, (%3,%4)      \n\t" // *q1 += delta
-        "movq      %1, %%mm6           \n\t"
-        "psrlw     $15, %%mm7          \n\t"
-        "paddw     %%mm7, %%mm6        \n\t" // tc++
-        "movq      %0,    %%mm4        \n\t"
-        "pand      %%mm4, %%mm6        \n\t"
-
-        H264_DEBLOCK_P0_Q0(%8)
-        "packuswb  %%mm1, %%mm1        \n\t"
-        "packuswb  %%mm2, %%mm2        \n\t"
-        "movd      %%mm1, (%2,%4,2)    \n\t"
-        "movd      %%mm2, (%3)         \n\t"
-
-        : "=m"(tmp0), "=m"(tmp1)
+        "movq    (%1,%3), %%mm0    \n\t" //p1
+        "movq    (%1,%3,2), %%mm1  \n\t" //p0
+        "movq    (%2),    %%mm2    \n\t" //q0
+        "movq    (%2,%3), %%mm3    \n\t" //q1
+        H264_DEBLOCK_MASK(%6, %7)
+        "pand     %5,     %%mm7    \n\t"
+        "movq     %%mm7,  %0       \n\t"
+
+        /* filter p1 */
+        "movq     (%1),   %%mm3    \n\t" //p2
+        DIFF_GT_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
+        "pandn    %%mm7,  %%mm6    \n\t"
+        "pcmpeqb  %%mm7,  %%mm6    \n\t"
+        "pand     %%mm7,  %%mm6    \n\t" // mask & |p2-p0|<beta
+        "pshufw  $80, %4, %%mm4    \n\t"
+        "pand     %%mm7,  %%mm4    \n\t" // mask & tc0
+        "movq     %8,     %%mm7    \n\t"
+        "pand     %%mm6,  %%mm7    \n\t" // mask & |p2-p0|<beta & 1
+        "pand     %%mm4,  %%mm6    \n\t" // mask & |p2-p0|<beta & tc0
+        "paddb    %%mm4,  %%mm7    \n\t" // tc++
+        H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4)
+
+        /* filter q1 */
+        "movq    (%2,%3,2), %%mm4  \n\t" //q2
+        DIFF_GT_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1
+        "pandn    %0,     %%mm6    \n\t"
+        "pcmpeqb  %0,     %%mm6    \n\t"
+        "pand     %0,     %%mm6    \n\t"
+        "pshufw  $80, %4, %%mm5    \n\t"
+        "pand     %%mm6,  %%mm5    \n\t"
+        "pand     %8,     %%mm6    \n\t"
+        "paddb    %%mm6,  %%mm7    \n\t"
+        "movq    (%2,%3), %%mm3    \n\t"
+        H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6)
+
+        /* filter p0, q0 */
+        H264_DEBLOCK_P0_Q0(%8, %9)
+        "movq      %%mm1, (%1,%3,2) \n\t"
+        "movq      %%mm2, (%2)      \n\t"
+
+        : "=m"(tmp0)
         : "r"(pix-3*stride), "r"(pix), "r"((long)stride),
-          "r"(tc0), "r"(alpha), "r"(beta), "m"(ff_pw_4)
+          "m"(tc), "m"(*(uint64_t*)mask), "m"(alpha1), "m"(beta1),
+          "m"(mm_bone), "m"(ff_pb_3F)
     );
 }
 
-static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
+static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 {
-    int i;
-    for(i=0; i<4; i++, pix+=4) {
-        if(tc0[i] < 0)
-            continue;
-        h264_loop_filter_luma_mmx2(pix, stride, alpha, beta, tc0[i]);
-    }
+    if((tc0[0] & tc0[1]) >= 0)
+        h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
+    if((tc0[2] & tc0[3]) >= 0)
+        h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
 }
-
-static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
+static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 {
-    uint8_t trans[4*8];
+    //FIXME: could cut some load/stores by merging transpose with filter
+    // also, it only needs to transpose 6x8
+    uint8_t trans[8*8];
     int i;
-    for(i=0; i<4; i++, pix+=4*stride) {
-        if(tc0[i] < 0)
+    for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
+        if((tc0[0] & tc0[1]) < 0)
             continue;
-        //FIXME: could cut some load/stores by merging transpose with filter
-        transpose4x4(trans, pix-4, 4, stride);
-        transpose4x4(trans+4*4, pix, 4, stride);
-        h264_loop_filter_luma_mmx2(trans+4*4, 4, alpha, beta, tc0[i]);
-        transpose4x4(pix-2, trans+2*4, stride, 4);
+        transpose4x4(trans,       pix-4,          8, stride);
+        transpose4x4(trans  +4*8, pix,            8, stride);
+        transpose4x4(trans+4,     pix-4+4*stride, 8, stride);
+        transpose4x4(trans+4+4*8, pix  +4*stride, 8, stride);
+        h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
+        transpose4x4(pix-2,          trans  +2*8, stride, 8);
+        transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
     }
 }
 
-static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
+static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
 {
     asm volatile(
-        "movd    (%0),    %%mm0     \n\t"
-        "movd    (%0,%2), %%mm1     \n\t"
-        "movd    (%1),    %%mm2     \n\t"
-        "movd    (%1,%2), %%mm3     \n\t"
-        H264_DEBLOCK_THRESH(%4,%5)
+        "movq    (%0),    %%mm0     \n\t" //p1
+        "movq    (%0,%2), %%mm1     \n\t" //p0
+        "movq    (%1),    %%mm2     \n\t" //q0
+        "movq    (%1,%2), %%mm3     \n\t" //q1
+        H264_DEBLOCK_MASK(%4, %5)
         "movd      %3,    %%mm6     \n\t"
-        "pshufw $0x50, %%mm6, %%mm6 \n\t" // mm6 = tc[1], tc[1], tc[0], tc[0]
-        "pand      %%mm7, %%mm6     \n\t"
-        H264_DEBLOCK_P0_Q0(%6)
-        "packuswb  %%mm1, %%mm1     \n\t"
-        "packuswb  %%mm2, %%mm2     \n\t"
-        "movd      %%mm1, (%0,%2)   \n\t"
-        "movd      %%mm2, (%1)      \n\t"
+        "punpcklbw %%mm6, %%mm6     \n\t"
+        "pand      %%mm6, %%mm7     \n\t" // mm7 = tc&mask
+        H264_DEBLOCK_P0_Q0(%6, %7)
+        "movq      %%mm1, (%0,%2)   \n\t"
+        "movq      %%mm2, (%1)      \n\t"
+
         :: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
-           "r"(tc0[1]<<16 | tc0[0]),
-           "r"(alpha), "g"(beta), "m"(ff_pw_4)
+           "r"(*(uint32_t*)tc0),
+           "m"(alpha1), "m"(beta1), "m"(mm_bone), "m"(ff_pb_3F)
     );
 }
 
-static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
+static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 {
-    int i;
-    for(i=0; i<2; i++) {
-        h264_loop_filter_chroma_mmx2(pix, stride, alpha, beta, tc0);
-        pix += 4;
-        tc0 += 2;
-    }
+    h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
 }
 
-static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
+static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 {
-    uint8_t trans[4*4];
-    int i;
-    for(i=0; i<2; i++) {
-        //FIXME: could cut some load/stores by merging transpose with filter
-        transpose4x4(trans, pix-2, 4, stride);
-        h264_loop_filter_chroma_mmx2(trans+2*4, 4, alpha, beta, tc0);
-        transpose4x4(pix-2, trans, stride, 4);
-        pix += 4*stride;
-        tc0 += 2;
-    }
+    //FIXME: could cut some load/stores by merging transpose with filter
+    uint8_t trans[8*4];
+    transpose4x4(trans, pix-2, 8, stride);
+    transpose4x4(trans+4, pix-2+4*stride, 8, stride);
+    h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
+    transpose4x4(pix-2, trans, stride, 8);
+    transpose4x4(pix-2+4*stride, trans+4, stride, 8);
 }
 
+// p0 = (p0 + q1 + 2*p1 + 2) >> 2
+#define H264_FILTER_CHROMA4(p0, p1, q1, one) \
+    "movq    "#p0", %%mm4  \n\t"\
+    "pxor    "#q1", %%mm4  \n\t"\
+    "pand   "#one", %%mm4  \n\t" /* mm4 = (p0^q1)&1 */\
+    "pavgb   "#q1", "#p0"  \n\t"\
+    "psubusb %%mm4, "#p0"  \n\t"\
+    "pavgb   "#p1", "#p0"  \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\
+
+static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
+{
+    asm volatile(
+        "movq    (%0),    %%mm0     \n\t"
+        "movq    (%0,%2), %%mm1     \n\t"
+        "movq    (%1),    %%mm2     \n\t"
+        "movq    (%1,%2), %%mm3     \n\t"
+        H264_DEBLOCK_MASK(%3, %4)
+        "movq    %%mm1,   %%mm5     \n\t"
+        "movq    %%mm2,   %%mm6     \n\t"
+        H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
+        H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
+        "psubb   %%mm5,   %%mm1     \n\t"
+        "psubb   %%mm6,   %%mm2     \n\t"
+        "pand    %%mm7,   %%mm1     \n\t"
+        "pand    %%mm7,   %%mm2     \n\t"
+        "paddb   %%mm5,   %%mm1     \n\t"
+        "paddb   %%mm6,   %%mm2     \n\t"
+        "movq    %%mm1,   (%0,%2)   \n\t"
+        "movq    %%mm2,   (%1)      \n\t"
+        :: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
+           "m"(alpha1), "m"(beta1), "m"(mm_bone)
+    );
+}
+
+static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
+{
+    h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
+}
+
+static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
+{
+    //FIXME: could cut some load/stores by merging transpose with filter
+    uint8_t trans[8*4];
+    transpose4x4(trans, pix-2, 8, stride);
+    transpose4x4(trans+4, pix-2+4*stride, 8, stride);
+    h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
+    transpose4x4(pix-2, trans, stride, 8);
+    transpose4x4(pix-2+4*stride, trans+4, stride, 8);
+}
+
+
 #ifdef CONFIG_ENCODERS
 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
     int tmp;
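
Note: the macro comments in both the removed word-based code and the added byte-based code describe the same per-pixel arithmetic: a threshold mask from alpha/beta, a clipped delta applied to p0/q0, and a clipped update of the outer samples. A scalar C sketch of those formulas as stated in the comments (clip, deblock_mask, filter_p0_q0 and filter_q1 are hypothetical helper names for illustration, not functions from this file):

    #include <stdint.h>
    #include <stdlib.h>

    int clip(int v, int lo, int hi)
    {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* H264_DEBLOCK_MASK: filter only where |p0-q0| <= alpha-1 and
     * |p1-p0| <= beta-1 and |q1-q0| <= beta-1 (the callers in this revision
     * pass alpha-1 and beta-1 as alpha1/beta1). */
    int deblock_mask(int p1, int p0, int q0, int q1, int alpha1, int beta1)
    {
        return abs(p0 - q0) <= alpha1 && abs(p1 - p0) <= beta1 && abs(q1 - q0) <= beta1;
    }

    /* H264_DEBLOCK_P0_Q0: p0 += delta, q0 -= delta, with
     * delta = clip((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc0, tc0);
     * results saturate to 0..255, as the saturating MMX ops do. */
    void filter_p0_q0(uint8_t *p0, uint8_t *q0, int p1, int q1, int tc0)
    {
        int delta = clip((((*q0 - *p0) << 2) + (p1 - q1) + 4) >> 3, -tc0, tc0);
        int new_p0 = clip(*p0 + delta, 0, 255);
        int new_q0 = clip(*q0 - delta, 0, 255);
        *p0 = new_p0;
        *q0 = new_q0;
    }

    /* H264_DEBLOCK_Q1: the outer sample becomes
     * clip((q2 + ((p0+q0+1)>>1)) >> 1, q1-tc0, q1+tc0). */
    int filter_q1(int p0, int q0, int q1, int q2, int tc0)
    {
        return clip((q2 + ((p0 + q0 + 1) >> 1)) >> 1, q1 - tc0, q1 + tc0);
    }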
@@ -3415,6 +3477,8 @@
             c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
             c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
             c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
+            c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
+            c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
 
 #ifdef CONFIG_ENCODERS
             c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
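
Note: the chroma intra filters registered above rely on H264_FILTER_CHROMA4, whose comment gives the target formula p0 = (p0 + q1 + 2*p1 + 2) >> 2 and implements it as avg(p1, avg(p0,q1) - ((p0^q1)&1)), where pavgb rounds up. A brute-force check of that identity (pavgb and chroma4 are hypothetical helper names used only in this sketch):

    #include <assert.h>
    #include <stdint.h>

    /* pavgb computes (a + b + 1) >> 1 (rounds up). Subtracting the carry bit
     * (p0^q1)&1 before the second average removes the extra rounding, so two
     * averages reproduce the exactly rounded (p0 + q1 + 2*p1 + 2) >> 2. */
    uint8_t pavgb(uint8_t a, uint8_t b)
    {
        return (a + b + 1) >> 1;
    }

    uint8_t chroma4(uint8_t p0, uint8_t p1, uint8_t q1)
    {
        return pavgb(p1, pavgb(p0, q1) - ((p0 ^ q1) & 1));
    }

    int main(void)
    {
        for (int p0 = 0; p0 < 256; p0++)
            for (int p1 = 0; p1 < 256; p1++)
                for (int q1 = 0; q1 < 256; q1++)
                    assert(chroma4(p0, p1, q1) == ((p0 + q1 + 2 * p1 + 2) >> 2));
        return 0;
    }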
