Revision 5705b020

View differences:

libavcodec/x86/h264_deblock.asm
106 106
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
107 107
%endmacro
108 108

  
109
; TRANSPOSE4x8W_LOAD: load rows and transpose them at word granularity.
; mmsize==16: the eight operands %1-%8 are forwarded unchanged to
;             TRANSPOSE4x8_LOAD together with the wd/dq/qdq unpack suffixes.
; otherwise:  %1-%8 are NOT used; four rows are read from t5 at offsets
;             0, r1, r1*2 and t6 into m0-m3 and handed to TRANSPOSE4x4W.
; NOTE(review): t6 is presumably 3*r1 prepared by the caller -- confirm.
%macro TRANSPOSE4x8W_LOAD 8
110
%if mmsize==16
111
    TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8
112
%else
113
    ; NOTE(review): SWAP is defined outside this view; presumably it permutes
    ; the m1-m4 register names before they are reloaded below -- confirm.
    SWAP  1, 4, 2, 3
114
    mova  m0, [t5]                      ; row 0
115
    mova  m1, [t5+r1]                   ; row 1
116
    mova  m2, [t5+r1*2]                 ; row 2
117
    mova  m3, [t5+t6]                   ; row at offset t6
118
    ; transpose m0-m3; the fifth operand (4) is passed as an extra register
    ; index, presumably a temporary.
    TRANSPOSE4x4W 0, 1, 2, 3, 4
119
%endif
120
%endmacro
121

  
122
; TRANSPOSE8x2W_STORE: interleave the words of m1 and m2 and write the result
; out one dword at a time, i.e. one (m1 word, m2 word) pair per destination.
;   mmsize==8: stores to %1-%4 only (%5-%8 are not referenced)
;   otherwise: stores to %1-%8
; In both cases the low-half interleave (kept in m1) feeds the first half of
; the destinations and the high-half interleave (m0) feeds the second half.
; Clobbers m0 and m1; m2 is only read.
%macro TRANSPOSE8x2W_STORE 8
123
    ; three-operand form: m0 = high-half word interleave of m1 and m2
    ; (presumably expanded to a copy + punpckhwd on non-AVX -- confirm).
    punpckhwd  m0, m1, m2
124
    punpcklwd  m1, m2                   ; m1 = low-half word interleave
125
%if mmsize==8
126
    movd       %3, m0                   ; high interleave, dword 0
127
    movd       %1, m1                   ; low interleave, dword 0
128
    psrlq      m1, 32                   ; bring dword 1 down to the low dword
129
    psrlq      m0, 32
130
    movd       %2, m1                   ; low interleave, dword 1
131
    movd       %4, m0                   ; high interleave, dword 1
132
%else
133
    movd       %5, m0                   ; high interleave, dword 0
134
    movd       %1, m1                   ; low interleave, dword 0
135
    psrldq     m1, 4                    ; shift right by one dword (4 bytes)
136
    psrldq     m0, 4
137
    movd       %2, m1                   ; low interleave, dword 1
138
    movd       %6, m0                   ; high interleave, dword 1
139
    psrldq     m1, 4
140
    psrldq     m0, 4
141
    movd       %3, m1                   ; low interleave, dword 2
142
    movd       %7, m0                   ; high interleave, dword 2
143
    psrldq     m1, 4
144
    psrldq     m0, 4
145
    movd       %4, m1                   ; low interleave, dword 3
146
    movd       %8, m0                   ; high interleave, dword 3
147
%endif
148
%endmacro
149

  
150 109
%macro SBUTTERFLY3 4
151 110
    punpckh%1  %4, %2, %3
152 111
    punpckl%1  %2, %3
libavcodec/x86/h264_deblock_10bit.asm
34 34
SECTION .text
35 35

  
36 36
cextern pw_2
37
cextern pw_3
37 38
cextern pw_4
38 39

  
39 40
; out: %4 = |%1-%2|-%3
......
802 803
DEBLOCK_LUMA avx
803 804
DEBLOCK_LUMA_INTRA avx
804 805
%endif
806

  
807
; CHROMA_DEBLOCK_P0_Q0_INTRA: masked replacement of p0/q0 by a 4-tap average.
; Assuming pw_2 holds the word constant 2 in every lane and %5 is all-ones or
; all-zeros per word (both defined outside this view -- confirm), the result is
;   p0' = mask ? (2*p1 + p0 + q1 + 2) >> 2 : p0
;   q0' = mask ? (2*q1 + q0 + p1 + 2) >> 2 : q0
; %3 (p1), %4 (q1) and %5 (mask) are only read; %6 and %7 are clobbered.
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
808
; out: %1=p0', %2=q0'
809
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
810
    mova    %6, [pw_2]                  ; %6 = pw_2 (rounding term)
811
    paddw   %6, %3                      ;    + p1
812
    paddw   %6, %4                      ;    + q1  -> term shared by both sums
813
    paddw   %7, %6, %2                  ; %7 = shared + q0 (three-operand form)
814
    paddw   %6, %1                      ; %6 = shared + p0
815
    paddw   %6, %3                      ; %6 = pw_2 + p0 + 2*p1 + q1
816
    paddw   %7, %4                      ; %7 = pw_2 + q0 + p1 + 2*q1
817
    psraw   %6, 2                       ; divide both sums by 4
818
    psraw   %7, 2
819
    psubw   %6, %1                      ; turn the new values into deltas ...
820
    psubw   %7, %2
821
    pand    %6, %5                      ; ... so the mask can zero them out
822
    pand    %7, %5
823
    paddw   %1, %6                      ; p0' = p0 + masked delta
824
    paddw   %2, %7                      ; q0' = q0 + masked delta
825
%endmacro
826

  
827
; CHROMA_V_LOAD: load the four rows around a horizontal edge into m0-m3.
;   %1 = register pointing at the q0 row
;   r0 = pointer to the p1 row, r1 = offset between consecutive rows
; The callers in this file set r0 = %1 - 2*r1 before invoking the macro.
; Uses mova, so all four addresses are expected to be aligned -- confirm
; against the callers' guarantees.
%macro CHROMA_V_LOAD 1
828
    mova        m0, [r0]    ; p1
829
    mova        m1, [r0+r1] ; p0
830
    mova        m2, [%1]    ; q0
831
    mova        m3, [%1+r1] ; q1
832
%endmacro
833

  
834
; CHROMA_V_STORE: write m1 and m2 back to the rows at r0+r1 and r0+2*r1.
; With r0 pointing at the p1 row (as CHROMA_V_LOAD uses it) these are the
; p0 and q0 rows; the p1/q1 rows are left untouched.
%macro CHROMA_V_STORE 0
835
    mova [r0+1*r1], m1                  ; p0 row
836
    mova [r0+2*r1], m2                  ; q0 row
837
%endmacro
838

  
839
; DEBLOCK_CHROMA: emit deblock_v_chroma_10_<%1> and
; deblock_v_chroma_intra_10_<%1> for the currently selected register size.
;   %1 = suffix appended to both function names (mmxext / sse2 / avx below).
; When mmsize < 16 each function loops 16/mmsize times, advancing the row
; pointers by mmsize bytes per pass, so 16 bytes of every row are processed
; either way.
; LOAD_AB, LOAD_MASK, LOAD_TC and DEBLOCK_P0_Q0 are defined outside this view;
; comments about them below are limited to how their operands are wired up.
%macro DEBLOCK_CHROMA 1
840
;-----------------------------------------------------------------------------
841
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
842
;-----------------------------------------------------------------------------
843
; The register-count expressions evaluate to 6 GPRs / 8 vector registers when
; mmsize is 16 and to 7 GPRs / 0 when mmsize is 8; the extra GPR is the loop
; counter r6, which is only used in the mmsize < 16 path.
cglobal deblock_v_chroma_10_%1, 5,7-(mmsize/16),8*(mmsize/16)
844
    mov         r5, r0                  ; r5 = pix (q0 row)
845
    sub         r0, r1
846
    sub         r0, r1                  ; r0 = pix - 2*stride (p1 row)
847
    shl        r2d, 2                   ; alpha *= 4 (presumably 8-bit -> 10-bit scale; confirm)
848
    shl        r3d, 2                   ; beta  *= 4
849
%if mmsize < 16
850
    mov         r6, 16/mmsize           ; number of passes
851
.loop:
852
%endif
853
    CHROMA_V_LOAD r5
854
    LOAD_AB     m4, m5, r2, r3
855
    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
856
    pxor        m4, m4                  ; m4 = 0, used as the clamp floor below
857
    LOAD_TC     m6, r4
858
    psubw       m6, [pw_3]              ; m6 -= pw_3 ...
859
    pmaxsw      m6, m4                  ; ... clamped to >= 0 (signed max with zero)
860
    pand        m7, m6                  ; m7 &= m6; result is DEBLOCK_P0_Q0's 5th operand
861
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
862
    CHROMA_V_STORE
863
%if mmsize < 16
864
    add         r0, mmsize              ; next mmsize bytes of each row
865
    add         r5, mmsize
866
    ; NOTE(review): tc0 advances by mmsize/8 bytes per pass; confirm this
    ; matches the number of tc0 entries LOAD_TC consumes for mmsize/2 pixels.
    add         r4, mmsize/8
867
    dec         r6
868
    jg .loop                            ; repeat while the counter is still > 0
869
    REP_RET
870
%else
871
    RET
872
%endif
873

  
874
;-----------------------------------------------------------------------------
875
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
876
;-----------------------------------------------------------------------------
877
; Same structure as above without tc0: r4 holds the q0 row pointer and r5 is
; the loop counter in the mmsize < 16 path.
cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16)
878
    mov         r4, r0                  ; r4 = pix (q0 row)
879
    sub         r0, r1
880
    sub         r0, r1                  ; r0 = pix - 2*stride (p1 row)
881
    shl        r2d, 2                   ; alpha *= 4
882
    shl        r3d, 2                   ; beta  *= 4
883
%if mmsize < 16
884
    mov         r5, 16/mmsize           ; number of passes
885
.loop:
886
%endif
887
    CHROMA_V_LOAD r4
888
    LOAD_AB     m4, m5, r2, r3
889
    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
890
    ; m7 is passed as the mask operand; m5 and m6 serve as temporaries.
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
891
    CHROMA_V_STORE
892
%if mmsize < 16
893
    add         r0, mmsize              ; next mmsize bytes of each row
894
    add         r4, mmsize
895
    dec         r5
896
    jg .loop                            ; repeat while the counter is still > 0
897
    REP_RET
898
%else
899
    RET
900
%endif
901
%endmacro
902

  
903
; Instantiate the chroma deblock functions once per instruction set.
; The mmxext variants are only assembled when ARCH_X86_64 is not defined;
; the sse2 and avx variants are always assembled.
%ifndef ARCH_X86_64
904
INIT_MMX
905
DEBLOCK_CHROMA mmxext
906
%endif
907
INIT_XMM
908
DEBLOCK_CHROMA sse2
909
INIT_AVX
910
DEBLOCK_CHROMA avx
libavcodec/x86/h264dsp_mmx.c
236 236
LF_IFUNC(h,  luma_intra,   depth, sse2)\
237 237
LF_FUNC (v,  luma,         depth, sse2)\
238 238
LF_IFUNC(v,  luma_intra,   depth, sse2)\
239
LF_FUNC (h,  chroma,       depth, sse2)\
240
LF_IFUNC(h,  chroma_intra, depth, sse2)\
241
LF_FUNC (v,  chroma,       depth, sse2)\
242
LF_IFUNC(v,  chroma_intra, depth, sse2)\
239 243
LF_FUNC (h,  luma,         depth,  avx)\
240 244
LF_IFUNC(h,  luma_intra,   depth,  avx)\
241 245
LF_FUNC (v,  luma,         depth,  avx)\
242
LF_IFUNC(v,  luma_intra,   depth,  avx)
246
LF_IFUNC(v,  luma_intra,   depth,  avx)\
247
LF_FUNC (h,  chroma,       depth,  avx)\
248
LF_IFUNC(h,  chroma_intra, depth,  avx)\
249
LF_FUNC (v,  chroma,       depth,  avx)\
250
LF_IFUNC(v,  chroma_intra, depth,  avx)
243 251

  
244 252
LF_FUNCS( uint8_t,  8)
245 253
LF_FUNCS(uint16_t, 10)
......
401 409
    if (mm_flags & AV_CPU_FLAG_MMX) {
402 410
        if (mm_flags & AV_CPU_FLAG_MMX2) {
403 411
#if ARCH_X86_32
412
            c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_mmxext;
413
            c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_mmxext;
404 414
            c->h264_v_loop_filter_luma= ff_deblock_v_luma_10_mmxext;
405 415
            c->h264_h_loop_filter_luma= ff_deblock_h_luma_10_mmxext;
406 416
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
407 417
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
408 418
#endif
409 419
            if (mm_flags&AV_CPU_FLAG_SSE2) {
420
                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
421
                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
410 422
#if HAVE_ALIGNED_STACK
411 423
                c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
412 424
                c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
......
415 427
#endif
416 428
            }
417 429
            if (mm_flags&AV_CPU_FLAG_AVX) {
430
                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_avx;
431
                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_avx;
418 432
#if HAVE_ALIGNED_STACK
419 433
                c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
420 434
                c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;

Also available in: Unified diff