Revision 9f3d6ca4 libavcodec/x86/h264_deblock.asm

View differences:

libavcodec/x86/h264_deblock.asm
324 324
; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
325 325
;-----------------------------------------------------------------------------
326 326
%macro DEBLOCK_LUMA 1
327
cglobal deblock_v_luma_%1, 5,5,10
327
cglobal deblock_v_luma_8_%1, 5,5,10
328 328
    movd    m8, [r4] ; tc0
329 329
    lea     r4, [r1*3]
330 330
    dec     r2d        ; alpha-1
......
369 369
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
370 370
;-----------------------------------------------------------------------------
371 371
INIT_MMX
372
cglobal deblock_h_luma_%1, 5,7
372
cglobal deblock_h_luma_8_%1, 5,7
373 373
    movsxd r10, r1d
374 374
    lea    r11, [r10+r10*2]
375 375
    lea    r6,  [r0-4]
......
396 396
%ifdef WIN64
397 397
    mov    [rsp+0x20], r4
398 398
%endif
399
    call   deblock_v_luma_%1
399
    call   deblock_v_luma_8_%1
400 400

  
401 401
    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
402 402
    add    r6, 2
......
436 436
;-----------------------------------------------------------------------------
437 437
; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
438 438
;-----------------------------------------------------------------------------
439
cglobal deblock_%2_luma_%1, 5,5
439
cglobal deblock_%2_luma_8_%1, 5,5
440 440
    lea     r4, [r1*3]
441 441
    dec     r2     ; alpha-1
442 442
    neg     r4
......
489 489
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
490 490
;-----------------------------------------------------------------------------
491 491
INIT_MMX
492
cglobal deblock_h_luma_%1, 0,5
492
cglobal deblock_h_luma_8_%1, 0,5
493 493
    mov    r0, r0mp
494 494
    mov    r3, r1m
495 495
    lea    r4, [r3*3]
......
512 512
    PUSH   dword r2m
513 513
    PUSH   dword 16
514 514
    PUSH   dword r0
515
    call   deblock_%2_luma_%1
515
    call   deblock_%2_luma_8_%1
516 516
%ifidn %2, v8
517 517
    add    dword [esp   ], 8 ; pix_tmp+0x38
518 518
    add    dword [esp+16], 2 ; tc0+2
519
    call   deblock_%2_luma_%1
519
    call   deblock_%2_luma_8_%1
520 520
%endif
521 521
    ADD    esp, 20
522 522

  
......
685 685
;-----------------------------------------------------------------------------
686 686
; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
687 687
;-----------------------------------------------------------------------------
688
cglobal deblock_%2_luma_intra_%1, 4,6,16
688
cglobal deblock_%2_luma_intra_8_%1, 4,6,16
689 689
%ifndef ARCH_X86_64
690 690
    sub     esp, 0x60
691 691
%endif
......
747 747
;-----------------------------------------------------------------------------
748 748
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
749 749
;-----------------------------------------------------------------------------
750
cglobal deblock_h_luma_intra_%1, 4,7
750
cglobal deblock_h_luma_intra_8_%1, 4,7
751 751
    movsxd r10, r1d
752 752
    lea    r11, [r10*3]
753 753
    lea    r6,  [r0-4]
......
763 763

  
764 764
    lea    r0,  [pix_tmp+0x40]
765 765
    mov    r1,  0x10
766
    call   deblock_v_luma_intra_%1
766
    call   deblock_v_luma_intra_8_%1
767 767

  
768 768
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
769 769
    lea    r5, [r6+r11]
......
776 776
    add    rsp, 0x88
777 777
    RET
778 778
%else
779
cglobal deblock_h_luma_intra_%1, 2,4
779
cglobal deblock_h_luma_intra_8_%1, 2,4
780 780
    lea    r3,  [r1*3]
781 781
    sub    r0,  4
782 782
    lea    r2,  [r0+r3]
......
795 795
    PUSH   dword r2m
796 796
    PUSH   dword 16
797 797
    PUSH   r0
798
    call   deblock_%2_luma_intra_%1
798
    call   deblock_%2_luma_intra_8_%1
799 799
%ifidn %2, v8
800 800
    add    dword [rsp], 8 ; pix_tmp+8
801
    call   deblock_%2_luma_intra_%1
801
    call   deblock_%2_luma_intra_8_%1
802 802
%endif
803 803
    ADD    esp, 16
804 804

  
......
851 851
;-----------------------------------------------------------------------------
852 852
; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
853 853
;-----------------------------------------------------------------------------
854
cglobal deblock_v_chroma_mmxext, 5,6
854
cglobal deblock_v_chroma_8_mmxext, 5,6
855 855
    CHROMA_V_START
856 856
    movq  m0, [t5]
857 857
    movq  m1, [t5+r1]
......
865 865
;-----------------------------------------------------------------------------
866 866
; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
867 867
;-----------------------------------------------------------------------------
868
cglobal deblock_h_chroma_mmxext, 5,7
868
cglobal deblock_h_chroma_8_mmxext, 5,7
869 869
%ifdef ARCH_X86_64
870 870
    %define buf0 [rsp-24]
871 871
    %define buf1 [rsp-16]
......
911 911
;-----------------------------------------------------------------------------
912 912
; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
913 913
;-----------------------------------------------------------------------------
914
cglobal deblock_v_chroma_intra_mmxext, 4,5
914
cglobal deblock_v_chroma_intra_8_mmxext, 4,5
915 915
    CHROMA_V_START
916 916
    movq  m0, [t5]
917 917
    movq  m1, [t5+r1]
......
925 925
;-----------------------------------------------------------------------------
926 926
; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
927 927
;-----------------------------------------------------------------------------
928
cglobal deblock_h_chroma_intra_mmxext, 4,6
928
cglobal deblock_h_chroma_intra_8_mmxext, 4,6
929 929
    CHROMA_H_START
930 930
    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
931 931
    call ff_chroma_intra_body_mmxext

Also available in: Unified diff