Revision 78b5c97d libavcodec/x86/fft_mmx.asm

View differences:

libavcodec/x86/fft_mmx.asm
29 29

  
30 30
%include "x86inc.asm"
31 31

  
32
; `pointer` is the storage directive used for pointer-sized struc fields:
; resq (one quadword) when ARCH_X86_64 is defined, resd (one doubleword)
; otherwise.
%ifdef ARCH_X86_64
33
%define pointer resq
34
%else
35
%define pointer resd
36
%endif
37

  
38
; Field offsets of the FFTContext object that the code in this file receives
; as its first argument. imdct_half_sse reads .nbits, .revtab, .mdctsize,
; .tcos and .tsin through these offsets.
; NOTE(review): presumably this has to mirror the C-side FFTContext layout
; field for field - confirm against the C declaration when either changes.
struc FFTContext
39
    .nbits:    resd 1
40
    .reverse:  resd 1
41
    ; pointer-sized slots: `pointer` expands to resq or resd per target
    .revtab:   pointer 1
42
    .tmpbuf:   pointer 1
43
    .mdctsize: resd 1
44
    .mdctbits: resd 1
45
    .tcos:     pointer 1
46
    .tsin:     pointer 1
47
endstruc
48

  
32 49
SECTION_RODATA
33 50

  
34 51
%define M_SQRT1_2 0.70710678118654752440
......
428 445
%define SECTION_REL
429 446
%endif
430 447

  
448
; FFT_DISPATCH suffix, reg
;   arg 1: suffix appended to `dispatch_tab` to pick the table to use.
;   arg 2: register name without size suffix; the macro appends `q` and uses
;          it as the table index.
; Loads the table entry at index (reg - 2), each entry being gprsize bytes,
; and calls it indirectly through r2. r2 is overwritten; under PIC r3 is
; overwritten as well.
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
449
    lea r2, [dispatch_tab%1]
450
    mov r2, [r2 + (%2q-2)*gprsize]
451
%ifdef PIC
452
    ; Add the address of the start of the current section ($$) to the loaded
    ; entry. NOTE(review): this implies the PIC table entries are stored
    ; relative to $$ (see SECTION_REL) - confirm against the table definition.
    lea r3, [$$]
453
    add r2, r3
454
%endif
455
    call r2
456
%endmacro ; FFT_DISPATCH
457

  
431 458
%macro DECL_FFT 2-3 ; nbits, cpu, suffix
432 459
%xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
433 460
%if %1==5
......
464 491
; On x86_32, this function does the register saving and restoring for all of fft.
465 492
; The others pass args in registers and don't spill anything.
466 493
cglobal fft_dispatch%3%2, 2,5,8, z, nbits
467
    lea r2, [dispatch_tab%3%2]
468
    mov r2, [r2 + (nbitsq-2)*gprsize]
469
%ifdef PIC
470
    lea r3, [$$]
471
    add r2, r3
472
%endif
473
    call r2
494
    FFT_DISPATCH %3%2, nbits
474 495
    RET
475 496
%endmacro ; DECL_FFT
476 497

  
......
481 502
DECL_FFT 4, _3dn2
482 503
DECL_FFT 4, _3dn2, _interleave
483 504

  
505
; Switch to XMM mode for the SSE code that follows (INIT_XMM is an x86inc
; macro) and remove any single-line macro definitions of the mnemonics
; below, so that they assemble as the plain SSE instructions from here on.
; NOTE(review): presumably these names were redefined by the elided code
; earlier in the file - confirm.
INIT_XMM
506
%undef mulps
507
%undef addps
508
%undef subps
509
%undef unpcklps
510
%undef unpckhps
511

  
512
; PREROTATER idx_neg, idx_pos, in, tcos, tsin
; Reads 4 floats from [in + idx_pos*4] (call them lo) and 4 floats from
; [in + idx_neg*4 - 0x10] (call them hi), plus 2+2 floats from each of the
; two tables, and computes with
;   E = { lo[0], lo[2], hi[0], hi[2] }
;   O = { hi[3], hi[1], lo[3], lo[1] }
;   C = { 2 floats at [tcos + idx_pos*2], 2 floats at [tcos + idx_neg*2 - 8] }
;   S = the same two loads from tsin
; the lane-wise values
;   d = O*C - E*S
;   s = E*C + O*S
; Results: xmm1 = { d0, s0, d1, s1 }, xmm0 = { d2, s2, d3, s3 }.
; Overwrites xmm0-xmm5. The loads from `in` use movaps, so both addresses
; must be 16-byte aligned.
%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
513
    movaps   xmm0, [%3+%2*4]
514
    movaps   xmm1, [%3+%1*4-0x10]
515
    movaps   xmm2, xmm0
516
    ; 0x88: elements 0 and 2 of xmm0, then elements 0 and 2 of xmm1 -> E
    shufps   xmm0, xmm1, 0x88
517
    ; 0x77: elements 3 and 1 of xmm1, then elements 3 and 1 of the saved
    ; copy of the first load -> O
    shufps   xmm1, xmm2, 0x77
518
    ; low halves of C and S come from the idx_pos side ...
    movlps   xmm4, [%4+%2*2]
519
    movlps   xmm5, [%5+%2*2+0x0]
520
    ; ... high halves from the idx_neg side
    movhps   xmm4, [%4+%1*2-0x8]
521
    movhps   xmm5, [%5+%1*2-0x8]
522
    movaps   xmm2, xmm0
523
    movaps   xmm3, xmm1
524
    mulps    xmm0, xmm5
525
    mulps    xmm1, xmm4
526
    mulps    xmm2, xmm4
527
    mulps    xmm3, xmm5
528
    ; xmm1 = O*C - E*S
    subps    xmm1, xmm0
529
    ; xmm2 = E*C + O*S
    addps    xmm2, xmm3
530
    movaps   xmm0, xmm1
531
    ; interleave d and s: lanes 0-1 into xmm1, lanes 2-3 into xmm0
    unpcklps xmm1, xmm2
532
    unpckhps xmm0, xmm2
533
%endmacro
534

  
535
; PREROTATEW addr1, addr2, xmm
; Stores the low 64 bits of the given xmm register to addr1 and its high
; 64 bits to addr2. The register itself is left unchanged.
%macro PREROTATEW 3 ;addr1, addr2, xmm
536
    movlps   %1,   %3
537
    movhps   %2,   %3
538
%endmacro
539

  
540
; CMUL j, out_a, out_b, base, tab_c, tab_s
;   j      : index register (byte offset into the tables, doubled for base)
;   out_a  : xmm register receiving B*S - A*C
;   out_b  : xmm register receiving A*S + B*C
;   base   : data pointer; A = 4 floats at [base + j*2],
;                          B = 4 floats at [base + j*2 + 0x10]
;   tab_c  : table pointer; C = 4 floats at [tab_c + j]
;   tab_s  : table pointer; S = 4 floats at [tab_s + j]
; All products are lane-wise. Overwrites xmm6 and xmm7 in addition to the two
; output registers. Every memory operand is accessed as an aligned 16-byte
; vector (movaps / mulps with a memory source).
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
541
    movaps   xmm6, [%4+%1*2]
542
    movaps   %2,   [%4+%1*2+0x10]
543
    movaps   %3,   xmm6
544
    movaps   xmm7, %2
545
    ; xmm6 = A*C, out_a = B*S, out_b = A*S, xmm7 = B*C
    mulps    xmm6, [%5+%1*1]
546
    mulps    %2,   [%6+%1*1]
547
    mulps    %3,   [%6+%1*1]
548
    mulps    xmm7, [%5+%1*1]
549
    subps    %2,   xmm6
550
    addps    %3,   xmm7
551
%endmacro
552

  
553
; POSROTATESHUF j, k, z, tcos, tsin
; Loop that runs CMUL on the 32-byte group at [z + j*2] and on the group at
; [z + k*2], recombines the four result vectors and writes them back to the
; same two groups. Each iteration moves j up by 0x10 and k down by 0x10; the
; loop repeats while the result of the j update is (signed) negative.
; Overwrites xmm0-xmm2 and xmm4-xmm7, plus the j and k registers.
; The macro defines the local label .post, so it can be expanded only once
; under any given non-local label.
%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
554
.post:
555
    CMUL     %1,   xmm0, xmm1, %3, %4, %5
556
    CMUL     %2,   xmm4, xmm5, %3, %4, %5
557
    ; 0x1b reverses the order of the four lanes
    shufps   xmm1, xmm1, 0x1b
558
    shufps   xmm5, xmm5, 0x1b
559
    ; pair the first result of the k group with the reversed second result
    ; of the j group (low half -> xmm6, high half -> xmm4)
    movaps   xmm6, xmm4
560
    unpckhps xmm4, xmm1
561
    unpcklps xmm6, xmm1
562
    ; pair the first result of the j group with the reversed second result
    ; of the k group (low half -> xmm0, high half -> xmm2)
    movaps   xmm2, xmm0
563
    unpcklps xmm0, xmm5
564
    unpckhps xmm2, xmm5
565
    movaps   [%3+%2*2],      xmm6
566
    movaps   [%3+%2*2+0x10], xmm4
567
    movaps   [%3+%1*2],      xmm0
568
    movaps   [%3+%1*2+0x10], xmm2
569
    sub      %2,   0x10
570
    ; the flags of this add drive the loop branch below
    add      %1,   0x10
571
    jl       .post
572
%endmacro
573

  
574
; imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
; On entry r0 = s, r1 = output, r2 = input.
; Three phases:
;   1. .pre loop: PREROTATER on the input, results scattered into output at
;      positions taken from s->revtab (16-bit entries, each scaled by 8).
;   2. FFT_DISPATCH on output, indexed by s->nbits.
;   3. POSROTATESHUF over output, rewriting it in place.
; On x86_64 the three table pointers live in r10-r12 for the whole function.
; On x86_32 rrevtab and rtsin share r6, so the pointers are kept on the
; stack and reloaded where needed.
cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
575
%ifdef ARCH_X86_64
576
%define rrevtab r10
577
%define rtcos   r11
578
%define rtsin   r12
579
    ; r10-r14 are used below; save them here, restore them before RET
    push  r10
580
    push  r11
581
    push  r12
582
    push  r13
583
    push  r14
584
%else
585
%define rrevtab r6
586
%define rtsin   r6
587
%define rtcos   r5
588
%endif
589
    ; r3 = s->mdctsize, used as a byte offset: r2 = input + mdctsize
    mov   r3d, [r0+FFTContext.mdctsize]
590
    add   r2, r3
591
    ; advance tcos / tsin by mdctsize/2 bytes
    shr   r3, 1
592
    mov   rtcos, [r0+FFTContext.tcos]
593
    mov   rtsin, [r0+FFTContext.tsin]
594
    add   rtcos, r3
595
    add   rtsin, r3
596
%ifndef ARCH_X86_64
597
    ; spill: rtsin's register (r6) is about to be reused for rrevtab
    push  rtcos
598
    push  rtsin
599
%endif
600
    ; advance revtab by mdctsize/4 bytes
    shr   r3, 1
601
    mov   rrevtab, [r0+FFTContext.revtab]
602
    add   rrevtab, r3
603
%ifndef ARCH_X86_64
604
    ; x86_32 stack from here on: [esp] = revtab, [esp+4] = tsin, [esp+8] = tcos
    push  rrevtab
605
%endif
606

  
607
    ; r3 = positive loop index, stepping down by 4 until it goes negative;
    ; r4 = -r3 is the matching negative index
    sub   r3, 4
608
%ifdef ARCH_X86_64
609
    xor   r4, r4
610
    sub   r4, r3
611
%endif
612
.pre:
613
%ifndef ARCH_X86_64
614
;unspill
    ; r4, r5 and r6 are overwritten by the stores at the end of each
    ; iteration, so r4 is recomputed and the table pointers reloaded here
615
    xor   r4, r4
616
    sub   r4, r3
617
    mov   rtsin, [esp+4]
618
    mov   rtcos, [esp+8]
619
%endif
620

  
621
    PREROTATER r4, r3, r2, rtcos, rtsin
622
%ifdef ARCH_X86_64
623
    ; four 16-bit revtab entries select the four 8-byte output slots:
    ; xmm0 halves go to the entries on the r4 side, xmm1 halves to the
    ; entries on the r3 side
    movzx  r5,  word [rrevtab+r4*1-4]
624
    movzx  r6,  word [rrevtab+r4*1-2]
625
    movzx  r13, word [rrevtab+r3*1]
626
    movzx  r14, word [rrevtab+r3*1+2]
627
    PREROTATEW [r1+r5 *8], [r1+r6 *8], xmm0
628
    PREROTATEW [r1+r13*8], [r1+r14*8], xmm1
629
    ; keep r4 == -r3 for the next iteration (r3 drops by 4 below)
    add    r4, 4
630
%else
631
    ; same stores as the x86_64 path, done with r4-r6 only; this destroys
    ; r4, r5 (rtcos) and r6 (rtsin)
    mov    r6, [esp]
632
    movzx  r5, word [r6+r4*1-4]
633
    movzx  r4, word [r6+r4*1-2]
634
    PREROTATEW [r1+r5*8], [r1+r4*8], xmm0
635
    movzx  r5, word [r6+r3*1]
636
    movzx  r4, word [r6+r3*1+2]
637
    PREROTATEW [r1+r5*8], [r1+r4*8], xmm1
638
%endif
639
    sub    r3, 4
640
    ; loop while r3 is still non-negative
    jns    .pre
641

  
642
    ; keep s in r5 and output in r6 across the dispatch call, then set up
    ; r0 = output, r1 = s->nbits for it
    mov  r5, r0
643
    mov  r6, r1
644
    mov  r0, r1
645
    mov  r1d, [r5+FFTContext.nbits]
646

  
647
    FFT_DISPATCH _sse, r1
648

  
649
    ; r6 = output + mdctsize bytes, r0 = mdctsize/2
    mov  r0d, [r5+FFTContext.mdctsize]
650
    add  r6, r0
651
    shr  r0, 1
652
%ifndef ARCH_X86_64
653
    ; r5/r6 are busy now, so the table pointers are reloaded into r2/r3
%define rtcos r2
654
%define rtsin r3
655
    mov  rtcos, [esp+8]
656
    mov  rtsin, [esp+4]
657
%endif
658
    ; r0 = -(mdctsize/2), counting up towards zero;
    ; r1 = mdctsize/2 - 16, counting down
    neg  r0
659
    mov  r1, -16
660
    sub  r1, r0
661
    POSROTATESHUF r0, r1, r6, rtcos, rtsin
662
%ifdef ARCH_X86_64
663
    pop  r14
664
    pop  r13
665
    pop  r12
666
    pop  r11
667
    pop  r10
668
%else
669
    ; drop the three spilled pointers
    add esp, 12
670
%endif
671
    RET

Also available in: Unified diff