Revision 5d0ddd1a libavcodec/i386/fft_3dn2.c

View differences:

libavcodec/i386/fft_3dn2.c
23 23
#include "libavutil/x86_cpu.h"
24 24
#include "libavcodec/dsputil.h"
25 25

  
26
static const int p1m1[2] __attribute__((aligned(8))) =
27
    { 0, 1 << 31 };
26
#ifdef EMULATE_3DNOWEXT
27
#define ff_fft_calc_3dn2 ff_fft_calc_3dn
28
#define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn
29
#define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn
30
#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
31
#define ff_imdct_half_3dn2 ff_imdct_half_3dn
32
#endif
28 33

  
29
static const int m1p1[2] __attribute__((aligned(8))) =
30
    { 1 << 31, 0 };
34
void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits);
35
void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits);
31 36

  
32 37
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
33 38
{
34
    int ln = s->nbits;
35
    long j;
36
    x86_reg i;
37
    long nblocks, nloops;
38
    FFTComplex *p, *cptr;
39

  
40
    asm volatile(
41
        /* FEMMS is not strictly required here, but AMD recommends it */
42
        "femms \n\t"
43
        "movq %0, %%mm7 \n\t"
44
        ::"m"(*(s->inverse ? m1p1 : p1m1))
45
    );
46

  
47
    i = 8 << ln;
48
    asm volatile(
49
        "1: \n\t"
50
        "sub $32, %0 \n\t"
51
        "movq    (%0,%1), %%mm0 \n\t"
52
        "movq  16(%0,%1), %%mm1 \n\t"
53
        "movq   8(%0,%1), %%mm2 \n\t"
54
        "movq  24(%0,%1), %%mm3 \n\t"
55
        "movq      %%mm0, %%mm4 \n\t"
56
        "movq      %%mm1, %%mm5 \n\t"
57
        "pfadd     %%mm2, %%mm0 \n\t"
58
        "pfadd     %%mm3, %%mm1 \n\t"
59
        "pfsub     %%mm2, %%mm4 \n\t"
60
        "pfsub     %%mm3, %%mm5 \n\t"
61
        "movq      %%mm0, %%mm2 \n\t"
62
        "pswapd    %%mm5, %%mm5 \n\t"
63
        "movq      %%mm4, %%mm3 \n\t"
64
        "pxor      %%mm7, %%mm5 \n\t"
65
        "pfadd     %%mm1, %%mm0 \n\t"
66
        "pfadd     %%mm5, %%mm4 \n\t"
67
        "pfsub     %%mm1, %%mm2 \n\t"
68
        "pfsub     %%mm5, %%mm3 \n\t"
69
        "movq      %%mm0,   (%0,%1) \n\t"
70
        "movq      %%mm4,  8(%0,%1) \n\t"
71
        "movq      %%mm2, 16(%0,%1) \n\t"
72
        "movq      %%mm3, 24(%0,%1) \n\t"
73
        "jg 1b \n\t"
74
        :"+r"(i)
75
        :"r"(z)
76
    );
77
    /* pass 2 .. ln-1 */
78

  
79
    nblocks = 1 << (ln-3);
80
    nloops = 1 << 2;
81
    cptr = s->exptab1;
82
    do {
83
        p = z;
84
        j = nblocks;
85
        do {
86
            i = nloops*8;
87
            asm volatile(
88
                "1: \n\t"
89
                "sub $16, %0 \n\t"
90
                "movq    (%1,%0), %%mm0 \n\t"
91
                "movq   8(%1,%0), %%mm1 \n\t"
92
                "movq    (%2,%0), %%mm2 \n\t"
93
                "movq   8(%2,%0), %%mm3 \n\t"
94
                "movq  (%3,%0,2), %%mm4 \n\t"
95
                "movq 8(%3,%0,2), %%mm5 \n\t"
96
                "pswapd    %%mm4, %%mm6 \n\t" // no need for cptr[2] & cptr[3]
97
                "pswapd    %%mm5, %%mm7 \n\t"
98
                "pfmul     %%mm2, %%mm4 \n\t" // cre*re cim*im
99
                "pfmul     %%mm3, %%mm5 \n\t"
100
                "pfmul     %%mm2, %%mm6 \n\t" // cim*re cre*im
101
                "pfmul     %%mm3, %%mm7 \n\t"
102
                "pfpnacc   %%mm6, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
103
                "pfpnacc   %%mm7, %%mm5 \n\t"
104
                "movq      %%mm0, %%mm2 \n\t"
105
                "movq      %%mm1, %%mm3 \n\t"
106
                "pfadd     %%mm4, %%mm0 \n\t"
107
                "pfadd     %%mm5, %%mm1 \n\t"
108
                "pfsub     %%mm4, %%mm2 \n\t"
109
                "pfsub     %%mm5, %%mm3 \n\t"
110
                "movq      %%mm0,  (%1,%0) \n\t"
111
                "movq      %%mm1, 8(%1,%0) \n\t"
112
                "movq      %%mm2,  (%2,%0) \n\t"
113
                "movq      %%mm3, 8(%2,%0) \n\t"
114
                "jg 1b \n\t"
115
                :"+r"(i)
116
                :"r"(p), "r"(p + nloops), "r"(cptr)
117
            );
118
            p += nloops*2;
119
        } while (--j);
120
        cptr += nloops*2;
121
        nblocks >>= 1;
122
        nloops <<= 1;
123
    } while (nblocks != 0);
39
    int n = 1<<s->nbits;
40
    int i;
41
    ff_fft_dispatch_interleave_3dn2(z, s->nbits);
124 42
    asm volatile("femms");
43
    if(n <= 8)
44
        for(i=0; i<n; i+=2)
45
            FFSWAP(FFTSample, z[i].im, z[i+1].re);
125 46
}
126 47

  
127 48
static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
......
162 83
        );
163 84
    }
164 85

  
165
    ff_fft_calc(&s->fft, z);
86
    ff_fft_calc_3dn2(&s->fft, z);
166 87

  
167 88
    /* post rotation + reordering */
168 89
    for(k = 0; k < n4; k++) {

Also available in: Unified diff