Revision 2dd2f716

View differences:

libavcodec/x86/vp8dsp-init.c
196 196
extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
197 197
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
198 198
extern void ff_vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]);
199
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
199 200
#endif
200 201

  
201 202
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
......
229 230
#if HAVE_YASM
230 231
    if (mm_flags & FF_MM_MMX) {
231 232
        c->vp8_idct_dc_add                  = ff_vp8_idct_dc_add_mmx;
233
        c->vp8_idct_add                     = ff_vp8_idct_add_mmx;
232 234
        c->put_vp8_epel_pixels_tab[0][0][0]     =
233 235
        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
234 236
        c->put_vp8_epel_pixels_tab[1][0][0]     =
libavcodec/x86/vp8dsp.asm
142 142
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
143 143
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11
144 144

  
145
pw_20091: times 4 dw 20091
146
pw_17734: times 4 dw 17734
147

  
145 148
cextern pw_3
146 149
cextern pw_4
147 150
cextern pw_64
......
924 927
    RET
925 928

  
926 929
;-----------------------------------------------------------------------------
930
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
931
;-----------------------------------------------------------------------------
932

  
933
; calculate %1=%1+%2; %2=%2-%1 (the original %1, saved in %3), with %3=temp register
934
%macro SUMSUB 3
935
    mova      %3, %1
936
    paddw     %1, %2
937
    psubw     %2, %3
938
%endmacro
939

  
940
; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) (both computed from the original %1/%2; %3/%4 are temporaries)
941
;           this macro assumes that m6/m7 have words for 20091/17734 loaded;
;           17734 is 35468/2, so %1/%2 are doubled before being multiplied by m7
942
%macro VP8_MULTIPLY_SUMSUB 4
943
    mova      %3, %1
944
    mova      %4, %2
945
    pmulhw    %3, m6 ;20091(1)
946
    pmulhw    %4, m6 ;20091(2)
947
    paddw     %3, %1
948
    paddw     %4, %2
949
    psllw     %1, 1
950
    psllw     %2, 1
951
    pmulhw    %1, m7 ;35468(1)
952
    pmulhw    %2, m7 ;35468(2)
953
    psubw     %1, %4
954
    paddw     %2, %3
955
%endmacro
956

  
957
; calculate x0=%1+%3; x1=%1-%3
958
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
959
;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
960
;           all arguments are register numbers (used as m%N); %5/%6 are temporary registers
961
;           we assume m6/m7 have constant words 20091/17734 loaded in them
962
%macro VP8_IDCT_TRANSFORM4x4_1D 6
963
    SUMSUB_BA           m%3, m%1, m%5     ;t0, t1
964
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
965
    SUMSUB_BA           m%4, m%3, m%5     ;tmp0, tmp3
966
    SUMSUB_BA           m%2, m%1, m%5     ;tmp1, tmp2
967
    SWAP                 %4,  %1
968
    SWAP                 %4,  %3
969
%endmacro
970

  
971
; transpose a 4x4 table of 16-bit words held in m%1-m%4, using m%5 as an extra register
972
%macro TRANSPOSE4x4 5 ; output in %1/%4/%5/%3
973
    mova      m%5, m%1
974
    punpcklwd m%1, m%2
975
    punpckhwd m%5, m%2
976
    mova      m%2, m%3
977
    punpcklwd m%3, m%4
978
    punpckhwd m%2, m%4
979
    mova      m%4, m%1
980
    punpckldq m%1, m%3 ;col0
981
    punpckhdq m%4, m%3 ;col1
982
    mova      m%3, m%5
983
    punpckldq m%5, m%2 ;col2
984
    punpckhdq m%3, m%2 ;col3
985
    SWAP       %4,  %2
986
    SWAP       %4,  %5
987
    SWAP       %4,  %3
988
%endmacro
989

  
990
INIT_MMX
991
cglobal vp8_idct_add_mmx, 3, 3
992
    ; load block data
993
    movq         m0, [r1]
994
    movq         m1, [r1+8]
995
    movq         m2, [r1+16]
996
    movq         m3, [r1+24]
997
    movq         m6, [pw_20091]
998
    movq         m7, [pw_17734]
999

  
1000
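    ; actual IDCT: two 1-D passes, each followed by a transpose; pw_4 is added
    ; to m0 before the second pass (presumably the rounding term for the >>3 on store)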
1001
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
1002
    TRANSPOSE4x4W            0, 1, 2, 3, 4
1003
    paddw        m0, [pw_4]
1004
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
1005
    TRANSPOSE4x4W            0, 1, 2, 3, 4
1006

  
1007
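    ; store: shift each result row right by 3, add it to the matching dst row
    ; (r0 and r1=r0+2*stride cover four rows) and write back with unsigned saturation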
1008
    pxor         m4, m4
1009
    lea          r1, [r0+2*r2]
1010
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
1011
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
1012

  
1013
    RET
1014

  
1015
;-----------------------------------------------------------------------------
927 1016
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
928 1017
;-----------------------------------------------------------------------------
929 1018

  
libavcodec/x86/x86util.asm
365 365
    packuswb   %1, %1
366 366
    movh       %4, %1
367 367
%endmacro
368

  
369
%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
370
    movh       %3, [%7]
371
    movh       %4, [%7+%8]
372
    punpcklbw  %3, %5
373
    punpcklbw  %4, %5
374
    psraw      %1, %6
375
    psraw      %2, %6
376
    paddw      %3, %1
377
    paddw      %4, %2
378
    packuswb   %3, %5
379
    packuswb   %4, %5
380
    movh     [%7], %3
381
    movh  [%7+%8], %4
382
%endmacro

Also available in: Unified diff