Revision 5705b020 libavcodec/x86/h264_deblock_10bit.asm

View differences:

libavcodec/x86/h264_deblock_10bit.asm
34 34
SECTION .text
35 35

  
36 36
cextern pw_2
37
cextern pw_3
37 38
cextern pw_4
38 39

  
39 40
; out: %4 = |%1-%2|-%3
......
802 803
DEBLOCK_LUMA avx
803 804
DEBLOCK_LUMA_INTRA avx
804 805
%endif
806

  
807
; Intra chroma filter for the two pixels adjacent to the edge.
; Assuming pw_2 holds 2 in every word lane (it is declared elsewhere), this
; computes
;   p0' = (2*p1 + p0 + q1 + 2) >> 2
;   q0' = (2*q1 + q0 + p1 + 2) >> 2
; and applies each change only where the bits of the mask are set.
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
808
; out: %1=p0', %2=q0' (%6 and %7 are overwritten; %3, %4, %5 are only read)
809
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
810
    mova    %6, [pw_2]      ; %6 = rounding term
811
    paddw   %6, %3          ; %6 = p1 + round
812
    paddw   %6, %4          ; %6 = p1 + q1 + round (shared by both outputs)
813
    paddw   %7, %6, %2      ; three-operand form: %7 = p1 + q1 + q0 + round
814
    paddw   %6, %1          ; %6 = p1 + q1 + p0 + round
815
    paddw   %6, %3          ; %6 = 2*p1 + p0 + q1 + round
816
    paddw   %7, %4          ; %7 = 2*q1 + q0 + p1 + round
817
    psraw   %6, 2           ; %6 = filtered p0 (sum / 4)
818
    psraw   %7, 2           ; %7 = filtered q0 (sum / 4)
819
    psubw   %6, %1          ; %6 = filtered p0 - p0 (delta)
820
    psubw   %7, %2          ; %7 = filtered q0 - q0 (delta)
821
    pand    %6, %5          ; keep the p0 delta only where the mask is set
822
    pand    %7, %5          ; keep the q0 delta only where the mask is set
823
    paddw   %1, %6          ; p0 += masked delta
824
    paddw   %2, %7          ; q0 += masked delta
825
%endmacro
826

  
827
; Load the four rows around the edge into m0..m3.
; in: %1 = address of the q0 row; r0 = address of the p1 row; r1 = row stride
; out: m0=p1, m1=p0, m2=q0, m3=q1
%macro CHROMA_V_LOAD 1
828
    mova        m0, [r0]    ; p1
829
    mova        m1, [r0+r1] ; p0
830
    mova        m2, [%1]    ; q0
831
    mova        m3, [%1+r1] ; q1
832
%endmacro
833

  
834
; Store m1 and m2 to the rows at r0+r1 and r0+2*r1.
; r0+r1 is the row CHROMA_V_LOAD reads p0 from; in the callers visible here
; r0+2*r1 equals the q0 row address they pass to CHROMA_V_LOAD.
%macro CHROMA_V_STORE 0
835
    mova [r0+1*r1], m1      ; write back p0
836
    mova [r0+2*r1], m2      ; write back q0
837
%endmacro
838

  
839
; Emits the two chroma deblocking entry points, deblock_v_chroma_10_%1 and
; deblock_v_chroma_intra_10_%1, where %1 is the suffix appended to both names.
; LOAD_AB, LOAD_MASK, LOAD_TC and DEBLOCK_P0_Q0 are defined outside this view;
; comments on those calls describe only how they are used here.
%macro DEBLOCK_CHROMA 1
840
;-----------------------------------------------------------------------------
841
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
842
;-----------------------------------------------------------------------------
843
cglobal deblock_v_chroma_10_%1, 5,7-(mmsize/16),8*(mmsize/16)
844
    mov         r5, r0      ; r5 = pix, used as the q0 row address
845
    sub         r0, r1
846
    sub         r0, r1      ; r0 = pix - 2*stride, used as the p1 row address
847
    shl        r2d, 2       ; r2d *= 4 (presumably alpha scaled to 10-bit range; confirm)
848
    shl        r3d, 2       ; r3d *= 4 (presumably beta scaled to 10-bit range; confirm)
849
%if mmsize < 16
850
    mov         r6, 16/mmsize ; iteration count: 16 bytes per row, mmsize bytes per pass
851
.loop:
852
%endif
853
    CHROMA_V_LOAD r5        ; m0=p1, m1=p0, m2=q0, m3=q1
854
    LOAD_AB     m4, m5, r2, r3
855
    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4 ; m7 is used as the filter mask below
856
    pxor        m4, m4      ; m4 = 0
857
    LOAD_TC     m6, r4      ; m6 = tc values read via the tc0 pointer in r4
858
    psubw       m6, [pw_3]
859
    pmaxsw      m6, m4      ; m6 = max(m6 - pw_3, 0) per signed word
860
    pand        m7, m6      ; combine the mask with the clamped tc
861
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 ; called with p0, q0, p1, q1, m7 and m5/m6
862
    CHROMA_V_STORE          ; write m1/m2 back to the p0/q0 rows
863
%if mmsize < 16
864
    add         r0, mmsize  ; advance the p1 row pointer by one register width
865
    add         r5, mmsize  ; advance the q0 row pointer by one register width
866
    add         r4, mmsize/8 ; advance tc0; NOTE(review): confirm this step matches what LOAD_TC consumes per pass
867
    dec         r6
868
    jg .loop                ; repeat while the counter is still positive
869
    REP_RET
870
%else
871
    RET
872
%endif
873

  
874
;-----------------------------------------------------------------------------
875
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
876
;-----------------------------------------------------------------------------
877
cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16)
878
    mov         r4, r0      ; r4 = pix, used as the q0 row address
879
    sub         r0, r1
880
    sub         r0, r1      ; r0 = pix - 2*stride, used as the p1 row address
881
    shl        r2d, 2       ; r2d *= 4 (presumably alpha scaled to 10-bit range; confirm)
882
    shl        r3d, 2       ; r3d *= 4 (presumably beta scaled to 10-bit range; confirm)
883
%if mmsize < 16
884
    mov         r5, 16/mmsize ; iteration count: 16 bytes per row, mmsize bytes per pass
885
.loop:
886
%endif
887
    CHROMA_V_LOAD r4        ; m0=p1, m1=p0, m2=q0, m3=q1
888
    LOAD_AB     m4, m5, r2, r3
889
    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4 ; m7 is used as the filter mask below
890
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 ; p0, q0, p1, q1, mask, two temporaries
891
    CHROMA_V_STORE          ; write m1/m2 back to the p0/q0 rows
892
%if mmsize < 16
893
    add         r0, mmsize  ; advance the p1 row pointer by one register width
894
    add         r4, mmsize  ; advance the q0 row pointer by one register width
895
    dec         r5
896
    jg .loop                ; repeat while the counter is still positive
897
    REP_RET
898
%else
899
    RET
900
%endif
901
%endmacro
902

  
903
; Instantiate DEBLOCK_CHROMA once per instruction-set suffix.
; The mmxext instance is emitted only when ARCH_X86_64 is not defined.
%ifndef ARCH_X86_64
904
INIT_MMX
905
DEBLOCK_CHROMA mmxext
906
%endif
907
INIT_XMM
908
DEBLOCK_CHROMA sse2
909
INIT_AVX
910
DEBLOCK_CHROMA avx

Also available in: Unified diff