Revision baffa091

View differences:

libavcodec/x86/dsputil_mmx.c
1664 1664
static void just_return(void) { return; }
1665 1665
#endif
1666 1666

  
1667
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1668
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
1667
#if HAVE_YASM
1668
typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src,
1669
                                 x86_reg linesize, x86_reg start_y,
1670
                                 x86_reg end_y, x86_reg block_h,
1671
                                 x86_reg start_x, x86_reg end_x,
1672
                                 x86_reg block_w);
1673
extern emu_edge_core_func ff_emu_edge_core_mmx;
1674
extern emu_edge_core_func ff_emu_edge_core_sse;
1675

  
1676
static av_always_inline
1677
void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize,
1678
                      int block_w, int block_h,
1679
                      int src_x, int src_y, int w, int h,
1680
                      emu_edge_core_func *core_fn)
1681
{
1682
    int start_y, start_x, end_y, end_x, src_y_add=0;
1683

  
1684
    if(src_y>= h){
1685
        src_y_add = h-1-src_y;
1686
        src_y=h-1;
1687
    }else if(src_y<=-block_h){
1688
        src_y_add = 1-block_h-src_y;
1689
        src_y=1-block_h;
1690
    }
1691
    if(src_x>= w){
1692
        src+= (w-1-src_x);
1693
        src_x=w-1;
1694
    }else if(src_x<=-block_w){
1695
        src+= (1-block_w-src_x);
1696
        src_x=1-block_w;
1697
    }
1698

  
1699
    start_y= FFMAX(0, -src_y);
1700
    start_x= FFMAX(0, -src_x);
1701
    end_y= FFMIN(block_h, h-src_y);
1702
    end_x= FFMIN(block_w, w-src_x);
1703
    assert(start_x < end_x && block_w > 0);
1704
    assert(start_y < end_y && block_h > 0);
1705

  
1706
    // fill in the to-be-copied part plus all above/below
1707
    src += (src_y_add+start_y)*linesize + start_x;
1708
    buf += start_x;
1709
    core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w);
1710
}
1711

  
1712
#if ARCH_X86_32
1713
static av_noinline
1714
void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize,
1715
                          int block_w, int block_h,
1716
                          int src_x, int src_y, int w, int h)
1717
{
1718
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1719
                     w, h, &ff_emu_edge_core_mmx);
1720
}
1721
#endif
1722
static av_noinline
1723
void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize,
1724
                          int block_w, int block_h,
1725
                          int src_x, int src_y, int w, int h)
1726
{
1727
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1728
                     w, h, &ff_emu_edge_core_sse);
1729
}
1730
#endif /* HAVE_YASM */
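
The C wrapper above only normalizes the request before handing it to the asm core: it clamps src_x/src_y so that at least one real pixel row and column stay inside the picture, then derives the copy window [start_x,end_x) x [start_y,end_y). A rough standalone sketch of just that clamping arithmetic (not part of the patch; the frame/block sizes in main() are made up, and src_x_add is an illustrative stand-in for the pointer adjustment the real code applies directly to src):

/* sketch: replay the coordinate clamping of emulated_edge_mc() above */
#include <stdio.h>

#define FFMAX(a, b) ((a) > (b) ? (a) : (b))
#define FFMIN(a, b) ((a) > (b) ? (b) : (a))

static void clamp_block(int block_w, int block_h, int src_x, int src_y, int w, int h)
{
    int src_x_add = 0, src_y_add = 0;   /* how far the src pointer is moved */

    if      (src_y >= h)        { src_y_add = h - 1 - src_y;       src_y = h - 1;       }
    else if (src_y <= -block_h) { src_y_add = 1 - block_h - src_y; src_y = 1 - block_h; }
    if      (src_x >= w)        { src_x_add = w - 1 - src_x;       src_x = w - 1;       }
    else if (src_x <= -block_w) { src_x_add = 1 - block_w - src_x; src_x = 1 - block_w; }

    printf("src moved by (%d,%d); copy rows [%d,%d) and cols [%d,%d) of the %dx%d block\n",
           src_x_add, src_y_add,
           FFMAX(0, -src_y), FFMIN(block_h, h - src_y),
           FFMAX(0, -src_x), FFMIN(block_w, w - src_x),
           block_w, block_h);
}

int main(void)
{
    clamp_block(9, 9,  -3,  -5, 176, 144);  /* block sticks out above and left   */
    clamp_block(9, 9, 200,  10, 176, 144);  /* block entirely right of the frame */
    return 0;
}
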
1731

  
1732
typedef void emulated_edge_mc_func (uint8_t *dst, const uint8_t *src,
1733
                                    int linesize, int block_w, int block_h,
1734
                                    int src_x, int src_y, int w, int h);
1735

  
1736
static av_always_inline
1737
void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1738
         int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height,
1739
         emulated_edge_mc_func *emu_edge_fn)
1740
{
1669 1741
    const int w = 8;
1670 1742
    const int ix = ox>>(16+shift);
1671 1743
    const int iy = oy>>(16+shift);
......
1701 1773
    if( (unsigned)ix >= width-w ||
1702 1774
        (unsigned)iy >= height-h )
1703 1775
    {
1704
        ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
1776
        emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
1705 1777
        src = edge_buf;
1706 1778
    }
1707 1779

  
......
1782 1854
    }
1783 1855
}
1784 1856

  
1857
#if HAVE_YASM
1858
#if ARCH_X86_32
1859
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1860
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1861
{
1862
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1863
        width, height, &emulated_edge_mc_mmx);
1864
}
1865
#endif
1866
static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1867
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1868
{
1869
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1870
        width, height, &emulated_edge_mc_sse);
1871
}
1872
#else
1873
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1874
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1875
{
1876
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1877
        width, height, &ff_emulated_edge_mc);
1878
}
1879
#endif
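
gmc() and emulated_edge_mc() are written once with a function-pointer argument and then wrapped per instruction set. Because the generic body is av_always_inline and the wrappers are noinline, each wrapper becomes a full specialized copy in which the pointer is a compile-time constant, so the per-call indirection disappears while the source keeps a single copy of the shared logic. A minimal sketch of the pattern with made-up names (assuming a GCC-style compiler; FFmpeg's av_always_inline/av_noinline expand to the attributes used here):

#include <stdio.h>

typedef void core_func(const char *tag);

static void core_mmx_like(const char *tag) { printf("MMX-style core on %s\n", tag); }
static void core_sse_like(const char *tag) { printf("SSE-style core on %s\n", tag); }

/* shared logic; force-inlining makes 'core' a known constant in each wrapper */
static inline __attribute__((always_inline))
void edge_generic(const char *tag, core_func *core)
{
    /* the shared clamping/setup would live here */
    core(tag);
}

/* thin per-ISA wrappers: these are what the dsputil function table stores */
static __attribute__((noinline)) void edge_mmx_like(const char *tag) { edge_generic(tag, core_mmx_like); }
static __attribute__((noinline)) void edge_sse_like(const char *tag) { edge_generic(tag, core_sse_like); }

int main(void)
{
    edge_mmx_like("an 8x8 luma block");
    edge_sse_like("a 16x16 luma block");
    return 0;
}
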
1880

  
1785 1881
#define PREFETCH(name, op) \
1786 1882
static void name(void *mem, int stride, int h){\
1787 1883
    const uint8_t *p= mem;\
......
2626 2722
        SET_HPEL_FUNCS(avg, 1, 8, mmx);
2627 2723
        SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2628 2724

  
2725
#if ARCH_X86_32 || !HAVE_YASM
2629 2726
        c->gmc= gmc_mmx;
2727
#endif
2728
#if ARCH_X86_32 && HAVE_YASM
2729
        c->emulated_edge_mc = emulated_edge_mc_mmx;
2730
#endif
2630 2731

  
2631 2732
        c->add_bytes= add_bytes_mmx;
2632 2733
        c->add_bytes_l2= add_bytes_l2_mmx;
......
2913 3014
#if HAVE_YASM
2914 3015
            c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
2915 3016
            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
3017

  
3018
            c->emulated_edge_mc = emulated_edge_mc_sse;
3019
            c->gmc= gmc_sse;
2916 3020
#endif
2917 3021
        }
2918 3022
        if((mm_flags & AV_CPU_FLAG_SSSE3) && !(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)) && HAVE_YASM) // cachesplit
libavcodec/x86/dsputil_yasm.asm
421 421
    fld     dword r0m
422 422
%endif
423 423
    RET
424

  
425
; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
426
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
427
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
428
;
429
; The actual function itself is below. It basically wraps a very simple
430
; w = end_x - start_x
431
; if (w) {
432
;   if (w > 22) {
433
;     jump to the slow loop functions
434
;   } else {
435
;     jump to the fast loop functions
436
;   }
437
; }
438
;
439
; ... and then the same for left/right extend also. See below for loop
440
; function implementations. Fast are fixed-width, slow is variable-width
441

  
442
%macro EMU_EDGE_FUNC 1
443
%ifdef ARCH_X86_64
444
%define w_reg r10
445
cglobal emu_edge_core_%1, 6, 7, 1
446
    mov        r11, r5          ; save block_h
447
%else
448
%define w_reg r6
449
cglobal emu_edge_core_%1, 2, 7, 0
450
    mov         r4, r4m         ; end_y
451
    mov         r5, r5m         ; block_h
452
%endif
453

  
454
    ; start with vertical extend (top/bottom) and body pixel copy
455
    mov      w_reg, r7m
456
    sub      w_reg, r6m         ; w = end_x - start_x
457
    sub         r5, r4
458
%ifdef ARCH_X86_64
459
    sub         r4, r3
460
%else
461
    sub         r4, dword r3m
462
%endif
463
    cmp      w_reg, 22
464
    jg .slow_v_extend_loop
465
%ifdef ARCH_X86_32
466
    mov         r2, r2m         ; linesize
467
%endif
468
    sal      w_reg, 7           ; w * 128
469
%ifdef PIC
470
    lea        rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
471
    add      w_reg, rax
472
%else
473
    lea      w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
474
%endif
475
    call     w_reg              ; fast top extend, body copy and bottom extend
476
.v_extend_end:
477

  
478
    ; horizontal extend (left/right)
479
    mov      w_reg, r6m         ; start_x
480
    sub         r0, w_reg
481
%ifdef ARCH_X86_64
482
    mov         r3, r0          ; backup of buf+block_h*linesize
483
    mov         r5, r11
484
%else
485
    mov        r0m, r0          ; backup of buf+block_h*linesize
486
    mov         r5, r5m
487
%endif
488
    test     w_reg, w_reg
489
    jz .right_extend
490
    cmp      w_reg, 22
491
    jg .slow_left_extend_loop
492
    mov         r1, w_reg
493
    dec      w_reg
494
    ; FIXME we can do an if size == 1 here if that makes any speed difference, test me
495
    sar      w_reg, 1
496
    sal      w_reg, 6
497
    ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
498
    ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
499
%ifdef PIC
500
    lea        rax, [.emuedge_extend_left_2]
501
    add      w_reg, rax
502
%else
503
    lea      w_reg, [.emuedge_extend_left_2+w_reg]
504
%endif
505
    call     w_reg
506

  
507
    ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
508
.right_extend:
509
%ifdef ARCH_X86_32
510
    mov         r0, r0m
511
    mov         r5, r5m
512
%endif
513
    mov      w_reg, r7m         ; end_x
514
    mov         r1, r8m         ; block_w
515
    mov         r4, r1
516
    sub         r1, w_reg
517
    jz .h_extend_end            ; if (end_x == block_w) goto h_extend_end
518
    cmp         r1, 22
519
    jg .slow_right_extend_loop
520
    dec         r1
521
    ; FIXME we can do an if size == 1 here if that makes any speed difference, test me
522
    sar         r1, 1
523
    sal         r1, 6
524
%ifdef PIC
525
    lea        rax, [.emuedge_extend_right_2]
526
    add         r1, rax
527
%else
528
    lea         r1, [.emuedge_extend_right_2+r1]
529
%endif
530
    call        r1
531
.h_extend_end:
532
    RET
533

  
534
%ifdef ARCH_X86_64
535
%define vall  al
536
%define valh  ah
537
%define valw  ax
538
%define valw2 r10w
539
%define valw3 r3w
540
%define vald eax
541
%else
542
%define vall  bl
543
%define valh  bh
544
%define valw  bx
545
%define valw2 r6w
546
%define valw3 valw2
547
%define vald ebx
548
%define stack_offset 0x14
549
%endif
550

  
551
%endmacro
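
A C-level picture of the computed call above, for readers who do not want to trace the address arithmetic: every fixed-width variant generated further down is emitted with ALIGN 128, so the entry point for width w sits at a fixed base plus w*128, which is what "sal w_reg, 7" plus the lea computes before "call w_reg". The same dispatch written as an ordinary jump table (names and the demo in main() are made up; only widths up to 22 have fast variants, wider blocks fall through to the slow loop):

#include <stdio.h>

typedef void line_copy_fn(void);

static void copy_w1(void)  { puts("fixed 1-pixel line copy");  }
static void copy_w2(void)  { puts("fixed 2-pixel line copy");  }
/* ... one variant per width up to 22 in the real code ... */
static void copy_w22(void) { puts("fixed 22-pixel line copy"); }

static void copy_slow(int w) { printf("variable-width copy, w=%d\n", w); }

static void dispatch(int w)              /* w = end_x - start_x */
{
    static line_copy_fn *const fast[23] = {
        [1] = copy_w1, [2] = copy_w2, [22] = copy_w22, /* others omitted here */
    };
    if (w > 22)
        copy_slow(w);                    /* ".slow_v_extend_loop" path */
    else if (w > 0)
        fast[w]();                       /* "call w_reg" path          */
}

int main(void) { dispatch(2); dispatch(40); return 0; }
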
552

  
553
; macro to read/write a horizontal number of pixels (%2) to/from registers
554
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
555
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
556
;            - else if (%2 & 8)  fills 8 bytes into mm0
557
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
558
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
559
;            - if (%2 & 3 == 3)  fills 2 bytes into r10/r3, and 1 into eax
560
;              (note that we're using r3 for body/bottom because it's a shorter
561
;               opcode, and then the loop fits in 128 bytes)
562
;            - else              fills remaining bytes into rax
563
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
564
;            - if (%2 & 7 == 4)  fills 4 bytes into ebx
565
;            - else if (%2 & 4)  fills 4 bytes into mm0-7
566
;            - if (%2 & 3 == 3)  fills 2 bytes into r6, and 1 into ebx
567
;            - else              fills remaining bytes into ebx
568
; writing data out is in the same way
569
%macro READ_NUM_BYTES 3
570
%assign %%src_off 0 ; offset in source buffer
571
%assign %%smidx   0 ; mmx register idx
572
%assign %%sxidx   0 ; xmm register idx
573

  
574
%ifnidn %3, mmx
575
%rep %2/16
576
    movdqu xmm %+ %%sxidx, [r1+%%src_off]
577
%assign %%src_off %%src_off+16
578
%assign %%sxidx   %%sxidx+1
579
%endrep ; %2/16
580
%endif ; !mmx
581

  
582
%ifdef ARCH_X86_64
583
%if (%2-%%src_off) == 8
584
    mov           rax, [r1+%%src_off]
585
%assign %%src_off %%src_off+8
586
%endif ; (%2-%%src_off) == 8
587
%endif ; x86-64
588

  
589
%rep (%2-%%src_off)/8
590
    movq    mm %+ %%smidx, [r1+%%src_off]
591
%assign %%src_off %%src_off+8
592
%assign %%smidx   %%smidx+1
593
%endrep ; (%2-%%src_off)/8
594

  
595
%if (%2-%%src_off) == 4
596
    mov          vald, [r1+%%src_off]
597
%elif (%2-%%src_off) & 4
598
    movd    mm %+ %%smidx, [r1+%%src_off]
599
%assign %%src_off %%src_off+4
600
%endif ; (%2-%%src_off) ==/& 4
601

  
602
%if (%2-%%src_off) == 1
603
    mov          vall, [r1+%%src_off]
604
%elif (%2-%%src_off) == 2
605
    mov          valw, [r1+%%src_off]
606
%elif (%2-%%src_off) == 3
607
%ifidn %1, top
608
    mov         valw2, [r1+%%src_off]
609
%else ; %1 != top
610
    mov         valw3, [r1+%%src_off]
611
%endif ; %1 ==/!= top
612
    mov          vall, [r1+%%src_off+2]
613
%endif ; (%2-%%src_off) == 1/2/3
614
%endmacro ; READ_NUM_BYTES
615

  
616
%macro WRITE_NUM_BYTES 3
617
%assign %%dst_off 0 ; offset in destination buffer
618
%assign %%dmidx   0 ; mmx register idx
619
%assign %%dxidx   0 ; xmm register idx
620

  
621
%ifnidn %3, mmx
622
%rep %2/16
623
    movdqu [r0+%%dst_off], xmm %+ %%dxidx
624
%assign %%dst_off %%dst_off+16
625
%assign %%dxidx   %%dxidx+1
626
%endrep ; %2/16
627
%endif
628

  
629
%ifdef ARCH_X86_64
630
%if (%2-%%dst_off) == 8
631
    mov    [r0+%%dst_off], rax
632
%assign %%dst_off %%dst_off+8
633
%endif ; (%2-%%dst_off) == 8
634
%endif ; x86-64
635

  
636
%rep (%2-%%dst_off)/8
637
    movq   [r0+%%dst_off], mm %+ %%dmidx
638
%assign %%dst_off %%dst_off+8
639
%assign %%dmidx   %%dmidx+1
640
%endrep ; (%2-%%dst_off)/8
641

  
642
%if (%2-%%dst_off) == 4
643
    mov    [r0+%%dst_off], vald
644
%elif (%2-%%dst_off) & 4
645
    movd   [r0+%%dst_off], mm %+ %%dmidx
646
%assign %%dst_off %%dst_off+4
647
%endif ; (%2-%%dst_off) ==/& 4
648

  
649
%if (%2-%%dst_off) == 1
650
    mov    [r0+%%dst_off], vall
651
%elif (%2-%%dst_off) == 2
652
    mov    [r0+%%dst_off], valw
653
%elif (%2-%%dst_off) == 3
654
%ifidn %1, top
655
    mov    [r0+%%dst_off], valw2
656
%else ; %1 != top
657
    mov    [r0+%%dst_off], valw3
658
%endif ; %1 ==/!= top
659
    mov  [r0+%%dst_off+2], vall
660
%endif ; (%2-%%dst_off) == 1/2/3
661
%endmacro ; WRITE_NUM_BYTES
662

  
663
; vertical top/bottom extend and body copy fast loops
664
; these are function pointers to fixed-width line copy functions, i.e.
665
; they read a fixed number of pixels into set registers, and write
666
; those out into the destination buffer
667
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
668
; r6(eax/64)/r3(ebx/32)=val_reg
669
%macro VERTICAL_EXTEND 1
670
%assign %%n 1
671
%rep 22
672
ALIGN 128
673
.emuedge_v_extend_ %+ %%n:
674
    ; extend pixels above body
675
%ifdef ARCH_X86_64
676
    test           r3 , r3                   ; if (!start_y)
677
    jz .emuedge_copy_body_ %+ %%n %+ _loop   ;   goto body
678
%else ; ARCH_X86_32
679
    cmp      dword r3m, 0
680
    je .emuedge_copy_body_ %+ %%n %+ _loop
681
%endif ; ARCH_X86_64/32
682
    READ_NUM_BYTES  top,    %%n, %1          ; read bytes
683
.emuedge_extend_top_ %+ %%n %+ _loop:        ; do {
684
    WRITE_NUM_BYTES top,    %%n, %1          ;   write bytes
685
    add            r0 , r2                   ;   dst += linesize
686
%ifdef ARCH_X86_64
687
    dec            r3
688
%else ; ARCH_X86_32
689
    dec      dword r3m
690
%endif ; ARCH_X86_64/32
691
    jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)
692

  
693
    ; copy body pixels
694
.emuedge_copy_body_ %+ %%n %+ _loop:         ; do {
695
    READ_NUM_BYTES  body,   %%n, %1          ;   read bytes
696
    WRITE_NUM_BYTES body,   %%n, %1          ;   write bytes
697
    add            r0 , r2                   ;   dst += linesize
698
    add            r1 , r2                   ;   src += linesize
699
    dec            r4
700
    jnz .emuedge_copy_body_ %+ %%n %+ _loop  ; } while (--end_y)
701

  
702
    ; copy bottom pixels
703
    test           r5 , r5                   ; if (!block_h)
704
    jz .emuedge_v_extend_end_ %+ %%n         ;   goto end
705
    sub            r1 , r2                   ; src -= linesize
706
    READ_NUM_BYTES  bottom, %%n, %1          ; read bytes
707
.emuedge_extend_bottom_ %+ %%n %+ _loop:     ; do {
708
    WRITE_NUM_BYTES bottom, %%n, %1          ;   write bytes
709
    add            r0 , r2                   ;   dst += linesize
710
    dec            r5
711
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)
712

  
713
.emuedge_v_extend_end_ %+ %%n:
714
%ifdef ARCH_X86_64
715
    ret
716
%else ; ARCH_X86_32
717
    rep ret
718
%endif ; ARCH_X86_64/32
719
%assign %%n %%n+1
720
%endrep
721
%endmacro ; VERTICAL_EXTEND
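
In C terms, each of the 22 blocks generated by VERTICAL_EXTEND is a fixed-width routine that replicates the first available source row upwards, copies the body rows, and replicates the last body row downwards; by the time it is called, the counters have already been rewritten to top_rows = start_y, body_rows = end_y - start_y and bottom_rows = block_h - end_y. A compile-time-width sketch (illustrative only; N and the sizes in main() are made up):

#include <string.h>
#include <stdint.h>

#define N 8  /* one of the 22 generated widths */

static void v_extend_N(uint8_t *dst, const uint8_t *src, int linesize,
                       int top_rows, int body_rows, int bottom_rows)
{
    uint8_t line[N];
    int i;

    if (top_rows) {                      /* replicate first source row upward */
        memcpy(line, src, N);
        for (i = 0; i < top_rows; i++, dst += linesize)
            memcpy(dst, line, N);
    }
    for (i = 0; i < body_rows; i++) {    /* copy the visible body rows */
        memcpy(dst, src, N);
        dst += linesize;
        src += linesize;
    }
    if (bottom_rows) {                   /* replicate last body row downward */
        src -= linesize;
        memcpy(line, src, N);
        for (i = 0; i < bottom_rows; i++, dst += linesize)
            memcpy(dst, line, N);
    }
}

int main(void)
{
    uint8_t src[4 * 16] = { 0 }, dst[8 * 16];
    v_extend_N(dst, src, 16, 2, 4, 2);   /* 2 rows up, 4 body rows, 2 rows down */
    return 0;
}
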
722

  
723
; left/right (horizontal) fast extend functions
724
; these are essentially identical to the vertical extend ones above,
725
; just left/right separated because number of pixels to extend is
726
; obviously not the same on both sides.
727
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
728
; lowest two bytes of the register (so val*0x0101), and are splatted
729
; into each byte of mm0 as well if n_pixels >= 8
730

  
731
%macro READ_V_PIXEL 3
732
    mov        vall, %2
733
    mov        valh, vall
734
%if %1 >= 8
735
    movd        mm0, vald
736
%ifidn %3, mmx
737
    punpcklwd   mm0, mm0
738
    punpckldq   mm0, mm0
739
%else ; !mmx
740
    pshufw      mm0, mm0, 0
741
%endif ; mmx
742
%endif ; %1 >= 8
743
%endmacro
744

  
745
%macro WRITE_V_PIXEL 2
746
%assign %%dst_off 0
747
%rep %1/8
748
    movq [%2+%%dst_off], mm0
749
%assign %%dst_off %%dst_off+8
750
%endrep
751
%if %1 & 4
752
%if %1 >= 8
753
    movd [%2+%%dst_off], mm0
754
%else ; %1 < 8
755
    mov  [%2+%%dst_off]  , valw
756
    mov  [%2+%%dst_off+2], valw
757
%endif ; %1 >=/< 8
758
%assign %%dst_off %%dst_off+4
759
%endif ; %1 & 4
760
%if %1&2
761
    mov  [%2+%%dst_off], valw
762
%endif ; %1 & 2
763
%endmacro
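
READ_V_PIXEL/WRITE_V_PIXEL implement the horizontal extension by duplicating the edge pixel into the two low bytes of a general register (val*0x0101) and, for runs of 8 or more pixels, into every byte of mm0, then storing in 8/4/2-byte chunks. A plain-C rendering of that splat-and-store idea (function name and the sizes in main() are illustrative):

#include <stdint.h>
#include <string.h>

static void h_extend_row(uint8_t *dst, uint8_t edge_pixel, int n)
{
    uint16_t v2 = edge_pixel * 0x0101u;               /* val in both bytes */
    uint64_t v8 = edge_pixel * 0x0101010101010101ULL; /* splat to 8 bytes  */
    int off = 0;

    for (; off + 8 <= n; off += 8)     /* movq mm0 chunks in the asm */
        memcpy(dst + off, &v8, 8);
    if (n - off >= 4) {                /* movd / two 16-bit stores   */
        memcpy(dst + off, &v8, 4);
        off += 4;
    }
    if (n - off >= 2)                  /* single 16-bit store        */
        memcpy(dst + off, &v2, 2);
}

int main(void)
{
    uint8_t row[16];
    h_extend_row(row, 0x7f, 10);       /* extend the edge pixel over 10 bytes */
    return 0;
}
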
764

  
765
; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
766
%macro LEFT_EXTEND 1
767
%assign %%n 2
768
%rep 11
769
ALIGN 64
770
.emuedge_extend_left_ %+ %%n:          ; do {
771
    sub         r0, r2                 ;   dst -= linesize
772
    READ_V_PIXEL  %%n, [r0+r1], %1     ;   read pixels
773
    WRITE_V_PIXEL %%n, r0              ;   write pixels
774
    dec         r5
775
    jnz .emuedge_extend_left_ %+ %%n   ; } while (--block_h)
776
%ifdef ARCH_X86_64
777
    ret
778
%else ; ARCH_X86_32
779
    rep ret
780
%endif ; ARCH_X86_64/32
781
%assign %%n %%n+2
782
%endrep
783
%endmacro ; LEFT_EXTEND
784

  
785
; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r10/r6=end_x, r6/r3=val
786
%macro RIGHT_EXTEND 1
787
%assign %%n 2
788
%rep 11
789
ALIGN 64
790
.emuedge_extend_right_ %+ %%n:          ; do {
791
%ifdef ARCH_X86_64
792
    sub        r3, r2                   ;   dst -= linesize
793
    READ_V_PIXEL  %%n, [r3+w_reg-1], %1 ;   read pixels
794
    WRITE_V_PIXEL %%n, r3+r4-%%n        ;   write pixels
795
    dec       r11
796
%else ; ARCH_X86_32
797
    sub        r0, r2                   ;   dst -= linesize
798
    READ_V_PIXEL  %%n, [r0+w_reg-1], %1 ;   read pixels
799
    WRITE_V_PIXEL %%n, r0+r4-%%n        ;   write pixels
800
    dec     r5
801
%endif ; ARCH_X86_64/32
802
    jnz .emuedge_extend_right_ %+ %%n   ; } while (--block_h)
803
%ifdef ARCH_X86_64
804
    ret
805
%else ; ARCH_X86_32
806
    rep ret
807
%endif ; ARCH_X86_64/32
808
%assign %%n %%n+2
809
%endrep
810

  
811
%ifdef ARCH_X86_32
812
%define stack_offset 0x10
813
%endif
814
%endmacro ; RIGHT_EXTEND
815

  
816
; below follow the "slow" copy/extend functions. These act on a non-fixed
817
; width specified in a register, and run a loop to copy the full amount
818
; of bytes. They are optimized for copying of large amounts of pixels per
819
; line, so they unconditionally splat data into mm registers to copy 8
820
; bytes per loop iteration. It could be considered to use xmm for x86-64
821
; also, but I haven't optimized this as much (i.e. FIXME)
822
%macro V_COPY_NPX 4-5
823
%if %0 == 4
824
    test     w_reg, %4
825
    jz .%1_skip_%4_px
826
%else ; %0 == 5
827
.%1_%4_px_loop:
828
%endif
829
    %3          %2, [r1+cnt_reg]
830
    %3 [r0+cnt_reg], %2
831
    add    cnt_reg, %4
832
%if %0 == 5
833
    sub      w_reg, %4
834
    test     w_reg, %5
835
    jnz .%1_%4_px_loop
836
%endif
837
.%1_skip_%4_px:
838
%endmacro
839

  
840
%macro V_COPY_ROW 3
841
%ifidn %1, bottom
842
    sub         r1, linesize
843
%endif
844
.%1_copy_loop:
845
    xor    cnt_reg, cnt_reg
846
%ifidn %3, mmx
847
%define linesize r2m
848
    V_COPY_NPX %1,  mm0, movq,    8, 0xFFFFFFF8
849
%else ; !mmx
850
    V_COPY_NPX %1, xmm0, movdqu, 16, 0xFFFFFFF0
851
%ifdef ARCH_X86_64
852
%define linesize r2
853
    V_COPY_NPX %1, rax , mov,     8
854
%else ; ARCH_X86_32
855
%define linesize r2m
856
    V_COPY_NPX %1,  mm0, movq,    8
857
%endif ; ARCH_X86_64/32
858
%endif ; mmx
859
    V_COPY_NPX %1, vald, mov,     4
860
    V_COPY_NPX %1, valw, mov,     2
861
    V_COPY_NPX %1, vall, mov,     1
862
    mov      w_reg, cnt_reg
863
%ifidn %1, body
864
    add         r1, linesize
865
%endif
866
    add         r0, linesize
867
    dec         %2
868
    jnz .%1_copy_loop
869
%endmacro
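
V_COPY_ROW drives V_COPY_NPX to copy a variable-width row in progressively smaller pieces: 8-byte (or 16-byte SSE) chunks while the masked width test says at least one full chunk remains, then optional 4-, 2- and 1-byte tails; body rows advance the source pointer, while top/bottom rows keep re-reading the same row. A rough C equivalent (the advance_src flag and the values in main() are illustrative simplifications):

#include <stdint.h>
#include <string.h>

static void v_copy_rows(uint8_t *dst, const uint8_t *src, int linesize,
                        int w, int rows, int advance_src)
{
    while (rows--) {
        int off = 0;
        for (; w - off >= 8; off += 8)   /* V_COPY_NPX ..., movq, 8, 0xFFFFFFF8 */
            memcpy(dst + off, src + off, 8);
        if ((w - off) & 4) { memcpy(dst + off, src + off, 4); off += 4; }
        if ((w - off) & 2) { memcpy(dst + off, src + off, 2); off += 2; }
        if ((w - off) & 1) dst[off] = src[off];
        dst += linesize;
        if (advance_src)                 /* body rows; top/bottom keep src fixed */
            src += linesize;
    }
}

int main(void)
{
    uint8_t src[3 * 64], dst[3 * 64];
    memset(src, 0x55, sizeof(src));
    v_copy_rows(dst, src, 64, 37, 3, 1); /* copy 3 body rows of 37 pixels */
    return 0;
}
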
870

  
871
%macro SLOW_V_EXTEND 1
872
.slow_v_extend_loop:
873
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
874
; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
875
%ifdef ARCH_X86_64
876
    push       r11              ; save old value of block_h
877
    test        r3, r3
878
%define cnt_reg r11
879
    jz .do_body_copy            ; if (!start_y) goto do_body_copy
880
    V_COPY_ROW top, r3, %1
881
%else
882
    cmp  dword r3m, 0
883
%define cnt_reg r2
884
    je .do_body_copy            ; if (!start_y) goto do_body_copy
885
    V_COPY_ROW top, dword r3m, %1
886
%endif
887

  
888
.do_body_copy:
889
    V_COPY_ROW body, r4, %1
890

  
891
%ifdef ARCH_X86_64
892
    pop        r11              ; restore old value of block_h
893
%define cnt_reg r3
894
%endif
895
    test        r5, r5
896
%ifdef ARCH_X86_64
897
    jz .v_extend_end
898
%else
899
    jz .skip_bottom_extend
900
%endif
901
    V_COPY_ROW bottom, r5, %1
902
%ifdef ARCH_X86_32
903
.skip_bottom_extend:
904
    mov         r2, r2m
905
%endif
906
    jmp .v_extend_end
907
%endmacro
908

  
909
%macro SLOW_LEFT_EXTEND 1
910
.slow_left_extend_loop:
911
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
912
    mov         r4, 8
913
    sub         r0, linesize
914
    READ_V_PIXEL 8, [r0+w_reg], %1
915
.left_extend_8px_loop:
916
    movq [r0+r4-8], mm0
917
    add         r4, 8
918
    cmp         r4, w_reg
919
    jle .left_extend_8px_loop
920
    sub         r4, 8
921
    cmp         r4, w_reg
922
    jge .left_extend_loop_end
923
.left_extend_2px_loop:
924
    mov    [r0+r4], valw
925
    add         r4, 2
926
    cmp         r4, w_reg
927
    jl .left_extend_2px_loop
928
.left_extend_loop_end:
929
    dec         r5
930
    jnz .slow_left_extend_loop
931
%ifdef ARCH_X86_32
932
    mov         r2, r2m
933
%endif
934
    jmp .right_extend
935
%endmacro
936

  
937
%macro SLOW_RIGHT_EXTEND 1
938
.slow_right_extend_loop:
939
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
940
; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
941
%ifdef ARCH_X86_64
942
%define buf_reg r3
943
%define bh_reg r11
944
%else
945
%define buf_reg r0
946
%define bh_reg r5
947
%endif
948
    lea         r1, [r4-8]
949
    sub    buf_reg, linesize
950
    READ_V_PIXEL 8, [buf_reg+w_reg-1], %1
951
.right_extend_8px_loop:
952
    movq [buf_reg+r1], mm0
953
    sub         r1, 8
954
    cmp         r1, w_reg
955
    jge .right_extend_8px_loop
956
    add         r1, 8
957
    cmp         r1, w_reg
958
    je .right_extend_loop_end
959
.right_extend_2px_loop:
960
    sub         r1, 2
961
    mov [buf_reg+r1], valw
962
    cmp         r1, w_reg
963
    jg .right_extend_2px_loop
964
.right_extend_loop_end:
965
    dec         bh_reg
966
    jnz .slow_right_extend_loop
967
    jmp .h_extend_end
968
%endmacro
969

  
970
%macro emu_edge 1
971
EMU_EDGE_FUNC     %1
972
VERTICAL_EXTEND   %1
973
LEFT_EXTEND       %1
974
RIGHT_EXTEND      %1
975
SLOW_V_EXTEND     %1
976
SLOW_LEFT_EXTEND  %1
977
SLOW_RIGHT_EXTEND %1
978
%endmacro
979

  
980
emu_edge sse
981
%ifdef ARCH_X86_32
982
emu_edge mmx
983
%endif
