Revision 5d0ddd1a libavcodec/i386/fft_sse.c

View differences:

libavcodec/i386/fft_sse.c
22 22
#include "libavutil/x86_cpu.h"
23 23
#include "libavcodec/dsputil.h"
24 24

  
25
/* Sign-flip masks for SSE xorps: each element is either 0 (keep sign) or
 * 0x80000000 (toggle the IEEE-754 sign bit of the corresponding float lane).
 * Written as (int)(1U << 31) rather than 1 << 31 — left-shifting a signed 1
 * into the sign bit is undefined behavior; the unsigned shift plus conversion
 * yields the same bit pattern without UB.
 * NOTE(review): this revision drops the p1p1p1m1/p1p1m1p1/p1p1m1m1 tables and
 * the #if 0 print_v4sf() debug helper; the remaining two tables are still
 * referenced by the (i)MDCT code later in this file. */
static const int p1m1p1m1[4] __attribute__((aligned(16))) =
    { 0, (int)(1U << 31), 0, (int)(1U << 31) };

static const int m1m1m1m1[4] __attribute__((aligned(16))) =
    { (int)(1U << 31), (int)(1U << 31), (int)(1U << 31), (int)(1U << 31) };
31
/* FFT kernels implemented in external asm (fft_mmx.asm); the interleave
 * variant additionally leaves the output in interleaved re/im order.
 * Declared here because this file has no shared header for them. */
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
48 33

  
49
/* XXX: handle reverse case */
50 34
/**
 * Compute an in-place FFT of z (2^s->nbits complex elements) using SSE.
 *
 * This revision replaces the old hand-written three-stage inline-asm FFT
 * (pass-0/1 butterflies plus the exptab1 twiddle loop) with a single call
 * into the external asm dispatcher.
 *
 * NOTE(review): the n <= 16 fixup appears to reinterleave the re/im halves
 * that the dispatcher leaves split for small transforms — confirm against
 * ff_fft_dispatch_interleave_sse in the asm source.
 */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;

    ff_fft_dispatch_interleave_sse(z, s->nbits);

    if(n <= 16) {
        /* i counts from -8*n up to 0, addressing backwards from z+n, so the
         * loop body touches exactly the n complex (8-byte) elements of z. */
        x86_reg i = -8*n;
        asm volatile(
            "1: \n"
            "movaps     (%0,%1), %%xmm0 \n"
            "movaps      %%xmm0, %%xmm1 \n"
            "unpcklps 16(%0,%1), %%xmm0 \n"
            "unpckhps 16(%0,%1), %%xmm1 \n"
            "movaps      %%xmm0,   (%0,%1) \n"
            "movaps      %%xmm1, 16(%0,%1) \n"
            "add $32, %0 \n"
            "jl 1b \n"
            :"+r"(i)
            :"r"(z+n)
            :"memory"
        );
    }
}
59
/**
 * Permute the 2^s->nbits elements of z according to s->revtab
 * (presumably the bit-reversal table — confirm against FFT init code),
 * scattering into s->tmp_buf and copying the result back over z.
 *
 * Two elements are moved per iteration: movaps loads z[i] and z[i+1]
 * (16 aligned bytes), then movlps/movhps store the low and high 8-byte
 * halves to their permuted slots.
 */
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;
    int i;
    for(i=0; i<n; i+=2) {
        asm volatile(
            "movaps %2, %%xmm0 \n"
            "movlps %%xmm0, %0 \n"
            "movhps %%xmm0, %1 \n"
            :"=m"(s->tmp_buf[s->revtab[i]]),
             "=m"(s->tmp_buf[s->revtab[i+1]])
            :"m"(z[i])
        );
    }
    memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
}
144 75

  
145 76
static void imdct_sse(MDCTContext *s, const FFTSample *input, FFTSample *tmp)

Also available in: Unified diff