Revision 19fb234e libavcodec/x86/h264_idct.asm

View differences:

libavcodec/x86/h264_idct.asm
47 47
%endif
48 48

  
49 49
cextern pw_32
50
cextern pw_1
50 51

  
51 52
SECTION .text
52 53

  
......
854 855
    add8_sse2_cycle 2, 0x21
855 856
    add8_sse2_cycle 3, 0x29
856 857
    RET
858

  
859
;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
860

  
861
; One 4-point Walsh-Hadamard transform pass across four mm registers.
; %1-%4 = data registers (one row/column each), %5 = scratch register.
; Two SUMSUB_BADC butterfly stages form the transform; the final SWAP
; restores the results to the %1..%4 register order the caller expects.
%macro WALSH4_1D 5
    SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
    SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
    SWAP %1, %4, %3
%endmacro
866

  
867
; Dequantize two mm registers of transformed DC coefficients in place.
; %1, %2 = coefficient registers (modified), %3 = right-shift amount
; (immediate 8 on the fast path, or an mm register on the big-qmul path).
; t3d holds qmul in its low word and the rounding bias (128) in its high
; word, so after interleaving each coefficient with a 1 from pw_1,
; a single pmaddwd yields coeff*qmul + 1*128 per dword.
; Clobbers m4, m5, m7.
%macro DEQUANT_MMX 3
    mova        m7, [pw_1]
    mova        m4, %1
    punpcklwd   %1, m7                 ; low words  -> (coeff, 1) pairs
    punpckhwd   m4, m7                 ; high words -> (coeff, 1) pairs
    mova        m5, %2
    punpcklwd   %2, m7
    punpckhwd   m5, m7
    movd        m7, t3d
    punpckldq   m7, m7                 ; broadcast the (qmul, 128) word pair
    pmaddwd     %1, m7                 ; dword = coeff*qmul + 128
    pmaddwd     %2, m7
    pmaddwd     m4, m7
    pmaddwd     m5, m7
    psrad       %1, %3
    psrad       %2, %3
    psrad       m4, %3
    psrad       m5, %3
    packssdw    %1, m4                 ; repack dwords to saturated words
    packssdw    %2, m5
%endmacro
888

  
889
; Scatter the four words of mm register %1 into the output block array.
; t2 = output base pointer, %2-%5 = destination block indices; each
; block's DC slot is 32 bytes apart (16 DCTELEMs per 4x4 block).
; Destroys %1; clobbers t0d, t1d.
%macro STORE_WORDS_MMX 5
    movd  t0d, %1                      ; t0d = words 0,1
    psrlq  %1, 32
    movd  t1d, %1                      ; t1d = words 2,3
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
%endmacro
900

  
901
; MMX path: dequantize all 16 DC coefficients held in m0-m3 (two rows
; per DEQUANT_MMX call) and scatter each word to its 4x4 block's DC
; position in the output array. %1 = shift amount forwarded to
; DEQUANT_MMX (immediate 8, or m6 on the big-qmul path).
%macro DEQUANT_STORE_MMX 1
    DEQUANT_MMX m0, m1, %1
    STORE_WORDS_MMX m0,  0,  1,  4,  5
    STORE_WORDS_MMX m1,  2,  3,  6,  7

    DEQUANT_MMX m2, m3, %1
    STORE_WORDS_MMX m2,  8,  9, 12, 13
    STORE_WORDS_MMX m3, 10, 11, 14, 15
%endmacro
910

  
911
; Scatter the eight words of xmm register %1 into the output block
; array. t2 = output base pointer, %2-%9 = destination block indices;
; each block's DC slot is 32 bytes apart (16 DCTELEMs per 4x4 block).
; Destroys %1; clobbers t0d, t1d.
%macro STORE_WORDS_SSE 9
    movd  t0d, %1                      ; t0d = words 0,1
    psrldq  %1, 4
    movd  t1d, %1                      ; t1d = words 2,3
    psrldq  %1, 4
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
    movd  t0d, %1                      ; t0d = words 4,5
    psrldq  %1, 4
    movd  t1d, %1                      ; t1d = words 6,7
    mov [t2+%6*32], t0w
    mov [t2+%8*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%7*32], t0w
    mov [t2+%9*32], t1w
%endmacro
932

  
933
; SSE2 path: move the Walsh-transformed rows from mm registers (m0-m3)
; into xmm registers, widen each word against pw_1 into (coeff, 1)
; pairs, multiply-accumulate with the broadcast (qmul, 128) pair via
; pmaddwd, shift right by %1 and scatter the repacked words to the
; output array. t3d = qmul with the rounding bias (128) in its high
; word. %1 = shift (immediate 8, or xmm6 on the big-qmul path).
; Clobbers xmm0-xmm5 and, through STORE_WORDS_SSE, t0d/t1d.
%macro DEQUANT_STORE_SSE2 1
    movd      xmm4, t3d
    movq      xmm5, [pw_1]
    pshufd    xmm4, xmm4, 0            ; broadcast the (qmul, 128) word pair
    movq2dq   xmm0, m0                 ; transform results still live in mm regs
    movq2dq   xmm1, m1
    movq2dq   xmm2, m2
    movq2dq   xmm3, m3
    punpcklwd xmm0, xmm5               ; widen to (coeff, 1) dword pairs
    punpcklwd xmm1, xmm5
    punpcklwd xmm2, xmm5
    punpcklwd xmm3, xmm5
    pmaddwd   xmm0, xmm4               ; dword = coeff*qmul + 128
    pmaddwd   xmm1, xmm4
    pmaddwd   xmm2, xmm4
    pmaddwd   xmm3, xmm4
    psrad     xmm0, %1
    psrad     xmm1, %1
    psrad     xmm2, %1
    psrad     xmm3, %1
    packssdw  xmm0, xmm1               ; repack dwords to saturated words
    packssdw  xmm2, xmm3
    STORE_WORDS_SSE xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS_SSE xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%endmacro
958

  
959
; Instantiate ff_h264_luma_dc_dequant_idct_<%1>:
; void func(DCTELEM *output, DCTELEM *input, int qmul)
; Applies the 2-D 4x4 inverse Hadamard transform to the 16 luma DC
; coefficients, dequantizes them by qmul with rounding, and scatters
; each result word to its 4x4 block's DC slot (32 bytes apart) in the
; output array.
; %1 = cpu suffix (mmx/sse2), %2 = xmm register count for cglobal.
%macro IDCT_DC_DEQUANT 2
cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2
    ; load the 4x4 DC block, one row per mm register
    movq        m3, [r1+24]
    movq        m2, [r1+16]
    movq        m1, [r1+ 8]
    movq        m0, [r1+ 0]
    ; 2-D inverse Hadamard: rows, transpose, columns
    WALSH4_1D    0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D    0,1,2,3,4

; shift, tmp, output, qmul
; Per-ABI temp mapping; in every case t0 lands on ecx so the variable
; shift (shr ..., t0b) below is encodable, and t2/t3 pick up the
; output pointer and qmul arguments.
%ifdef WIN64
    DECLARE_REG_TMP 0,3,1,2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg        r0, t2
%elifdef ARCH_X86_64
    DECLARE_REG_TMP 3,1,0,2
%else
    DECLARE_REG_TMP 1,3,0,2
%endif

    ; fast path: qmul fits in a 16-bit multiplicand, so the rounding
    ; bias (128) can be packed into t3d's high word and applied by the
    ; pmaddwd inside the dequant macros
    cmp        t3d, 32767
    jg .big_qmul
    add        t3d, 128 << 16          ; bias rides in the high word
%ifidn %1,mmx
    DEQUANT_STORE_MMX 8
%else
    DEQUANT_STORE_SSE2 8
%endif
    RET
.big_qmul:
    ; qmul is too large for the 16-bit multiply: pre-shift it right
    ; (by at most 7) so it fits, and shrink the final right-shift
    ; (8 minus the pre-shift) to compensate
    bsr        t0d, t3d               ; t0 = index of qmul's highest set bit
    add        t3d, 128 << 16         ; bias rides in the high word
    mov        t1d, 7
    cmp        t0d, t1d
    cmovg      t0d, t1d               ; t0 = min(t0, 7)
    inc        t1d
    shr        t3d, t0b               ; scale qmul down (t0 maps to cl)
    sub        t1d, t0d               ; remaining shift = 8 - t0
%ifidn %1,mmx
    movd        m6, t1d
    DEQUANT_STORE_MMX m6
%else
    movd      xmm6, t1d
    DEQUANT_STORE_SSE2 xmm6
%endif
    RET
%endmacro
1007

  
1008
; Instantiate both variants. NOTE(review): the sse2 variant is emitted
; under INIT_MMX as well — it performs the Walsh transform in mm
; registers and only the dequant/store stage in explicit xmm registers
; (see DEQUANT_STORE_SSE2's movq2dq), so this appears intentional;
; confirm before switching to INIT_XMM.
INIT_MMX
IDCT_DC_DEQUANT mmx, 0
IDCT_DC_DEQUANT sse2, 7

Also available in: Unified diff