Revision 3f87f39c libavcodec/x86/h264_deblock_sse2.asm

View differences:

libavcodec/x86/h264_deblock_sse2.asm
278 278
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
279 279
;-----------------------------------------------------------------------------
280 280
INIT_XMM
281
cglobal x264_deblock_v_luma_sse2
281
cglobal x264_deblock_v_luma_sse2, 5,5,10
282 282
    movd    m8, [r4] ; tc0
283 283
    lea     r4, [r1*3]
284 284
    dec     r2d        ; alpha-1
......
318 318
    DEBLOCK_P0_Q0
319 319
    mova    [r4+2*r1], m1
320 320
    mova    [r0], m2
321
    ret
321
    RET
322 322

  
323 323
;-----------------------------------------------------------------------------
324 324
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
325 325
;-----------------------------------------------------------------------------
326 326
INIT_MMX
    ; NOTE(review): this listing is a diff view. Lines preceded by a single
    ; line number exist in only one revision; where an instruction appears
    ; twice (rax/r9/esi spelling, then r6/r5/r1d spelling) the first copy is
    ; the previous revision and the second is the replacement.
327
cglobal x264_deblock_h_luma_sse2
328
    movsxd r10, esi
327
cglobal x264_deblock_h_luma_sse2, 5,7
328
    movsxd r10, r1d
    ; r10 = stride (second argument per the prototype above), sign-extended
    ; from 32 to 64 bits so it can be used in address arithmetic.
329 329
    lea    r11, [r10+r10*2]
    ; r11 = 3*stride
330
    lea    rax, [r0-4]
331
    lea    r9,  [r0-4+r11]
330
    lea    r6,  [r0-4]
331
    lea    r5,  [r0-4+r11]
    ; r6 = pix-4, r5 = pix-4 + 3*stride: the two base pointers handed to
    ; PASS8ROWS below.
332
%ifdef WIN64
333
    sub    rsp, 0x98
334
    %define pix_tmp rsp+0x30
    ; WIN64 reserves 0x30 extra bytes below pix_tmp. [rsp+0x20] is written
    ; with r4 before the call further down; presumably the first 0x20 bytes
    ; are the callee's shadow space -- TODO confirm against the Win64 ABI.
335
%else
332 336
    sub    rsp, 0x68
333 337
    %define pix_tmp rsp
338
%endif
334 339

  
335 340
    ; transpose 6x16 -> tmp space
336
    TRANSPOSE6x8_MEM  PASS8ROWS(rax, r9, r10, r11), pix_tmp
337
    lea    rax, [rax+r10*8]
338
    lea    r9,  [r9 +r10*8]
339
    TRANSPOSE6x8_MEM  PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
341
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp
342
    lea    r6, [r6+r10*8]
343
    lea    r5, [r5+r10*8]
    ; Both base pointers are advanced by 8*stride so the second transpose
    ; reads the next 8 source rows; its output goes to pix_tmp+8.
344
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
340 345

  
341 346
    ; vertical filter
342 347
    ; alpha, beta, tc0 are still in r2d, r3d, r4
343
    ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
348
    ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
344 349
    lea    r0, [pix_tmp+0x30]
345
    mov    esi, 0x10
350
    mov    r1d, 0x10
    ; The callee receives pix = pix_tmp+0x30 and stride = 0x10, i.e. the
    ; temporary buffer is addressed as 16-byte rows and pix is its row 3.
351
%ifdef WIN64
352
    mov    [rsp+0x20], r4
    ; Stores tc0 (r4) to the stack slot at rsp+0x20 before the call.
    ; NOTE(review): presumably the fifth-argument stack slot of the Win64
    ; convention -- confirm.
353
%endif
346 354
    call   x264_deblock_v_luma_sse2
347 355

  
348 356
    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
349
    add    rax, 2
350
    add    r9,  2
357
    add    r6, 2
358
    add    r5, 2
    ; r6/r5 were left 8 rows down by the lea pair above; adding 2 moves the
    ; destination from column pix-4 to column pix-2.
351 359
    movq   m0, [pix_tmp+0x18]
352 360
    movq   m1, [pix_tmp+0x28]
353 361
    movq   m2, [pix_tmp+0x38]
354 362
    movq   m3, [pix_tmp+0x48]
    ; Offsets 0x18..0x48 are the upper 8 bytes of temp rows 1..4.
355
    TRANSPOSE8x4_STORE  PASS8ROWS(rax, r9, r10, r11)
363
    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)
356 364

  
357 365
    shl    r10, 3
358
    sub    rax, r10
359
    sub    r9,  r10
366
    sub    r6,  r10
367
    sub    r5,  r10
360 368
    shr    r10, 3
    ; r10 is temporarily scaled to 8*stride to step both pointers back by
    ; 8 rows, then restored to stride for the PASS8ROWS below.
361 369
    movq   m0, [pix_tmp+0x10]
362 370
    movq   m1, [pix_tmp+0x20]
363 371
    movq   m2, [pix_tmp+0x30]
364 372
    movq   m3, [pix_tmp+0x40]
    ; Offsets 0x10..0x40 are the lower 8 bytes of the same temp rows 1..4.
365
    TRANSPOSE8x4_STORE  PASS8ROWS(rax, r9, r10, r11)
373
    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)
366 374

  
375
%ifdef WIN64
376
    add    rsp, 0x98
377
%else
367 378
    add    rsp, 0x68
368
    ret
379
%endif
    ; Each branch releases exactly the amount reserved by the matching
    ; `sub rsp` at the top of the function.
380
    RET
369 381

  
370 382
%else
371 383

  
......
388 400
    mova    m3, [r0+r1]   ; q1
389 401
    LOAD_MASK r2, r3
390 402

  
391
    mov     r3, r4m
403
    mov     r3, r4mp
392 404
    movd    m4, [r3] ; tc0
393 405
    punpcklbw m4, m4
394 406
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
......
428 440
;-----------------------------------------------------------------------------
429 441
INIT_MMX
430 442
cglobal x264_deblock_h_luma_%1, 0,5
431
    mov    r0, r0m
443
    mov    r0, r0mp
432 444
    mov    r3, r1m
433 445
    lea    r4, [r3*3]
434 446
    sub    r0, 4
......
459 471
    ADD    esp, 20
460 472

  
461 473
    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
462
    mov    r0, r0m
474
    mov    r0, r0mp
463 475
    sub    r0, 2
464 476
    lea    r1, [r0+r4]
465 477

  
......
607 619
;-----------------------------------------------------------------------------
608 620
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
609 621
;-----------------------------------------------------------------------------
610
cglobal x264_deblock_%2_luma_intra_%1, 4,6
622
cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
611 623
%ifndef ARCH_X86_64
612 624
    sub     esp, 0x60
613 625
%endif
......
669 681
;-----------------------------------------------------------------------------
670 682
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
671 683
;-----------------------------------------------------------------------------
672
cglobal x264_deblock_h_luma_intra_%1
684
cglobal x264_deblock_h_luma_intra_%1, 4,7
    ; NOTE(review): this listing is a diff view. Lines preceded by a single
    ; line number exist in only one revision; where an instruction appears
    ; twice (rax/r9 spelling, then r6/r5 spelling) the first copy is the
    ; previous revision and the second is the replacement.
673 685
    movsxd r10, r1d
    ; r10 = stride (second argument per the prototype above), sign-extended
    ; to 64 bits.
674 686
    lea    r11, [r10*3]
    ; r11 = 3*stride
675
    lea    rax, [r0-4]
676
    lea    r9,  [r0-4+r11]
687
    lea    r6,  [r0-4]
688
    lea    r5,  [r0-4+r11]
    ; r6 = pix-4, r5 = pix-4 + 3*stride: base pointers for PASS8ROWS.
677 689
    sub    rsp, 0x88
678 690
    %define pix_tmp rsp
    ; NOTE(review): unlike x264_deblock_h_luma_sse2, no WIN64-specific
    ; reservation is visible in this function -- confirm none is required
    ; for the call below.
679 691

  
680 692
    ; transpose 8x16 -> tmp space
681
    TRANSPOSE8x8_MEM  PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
682
    lea    rax, [rax+r10*8]
683
    lea    r9,  [r9+r10*8]
684
    TRANSPOSE8x8_MEM  PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
693
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
694
    lea    r6, [r6+r10*8]
695
    lea    r5, [r5+r10*8]
    ; Advance both source pointers by 8*stride; the second transpose writes
    ; to the temp buffer at an 8-byte offset (pix_tmp+8 / pix_tmp+0x38).
696
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
685 697

  
686 698
    lea    r0,  [pix_tmp+0x40]
687 699
    mov    r1,  0x10
    ; The callee receives pix = pix_tmp+0x40 and stride = 0x10, i.e. the
    ; temporary buffer is addressed as 16-byte rows and pix is its row 4.
688 700
    call   x264_deblock_v_luma_intra_%1
689 701

  
690 702
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
691
    lea    r9, [rax+r11]
692
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
703
    lea    r5, [r6+r11]
    ; r5 is rebuilt as r6 + 3*stride rather than reused.
    ; NOTE(review): the callee is declared `4,6,16`, so presumably r5 does
    ; not survive the call -- confirm.
704
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
693 705
    shl    r10, 3
694
    sub    rax, r10
695
    sub    r9,  r10
706
    sub    r6,  r10
707
    sub    r5,  r10
696 708
    shr    r10, 3
    ; r10 is temporarily scaled to 8*stride to step both pointers back by
    ; 8 rows, then restored to stride for the final transpose.
697
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
709
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
698 710
    add    rsp, 0x88
    ; Releases the 0x88 bytes reserved at the top of the function.
699
    ret
711
    RET
700 712
%else
701 713
cglobal x264_deblock_h_luma_intra_%1, 2,4
702 714
    lea    r3,  [r1*3]
......
725 737
    ADD    esp, 16
726 738

  
727 739
    mov    r1,  r1m
728
    mov    r0,  r0m
740
    mov    r0,  r0mp
729 741
    lea    r3,  [r1*3]
730 742
    sub    r0,  4
731 743
    lea    r2,  [r0+r3]

Also available in: Unified diff