Revision 5d0ddd1a

View differences:

libavcodec/Makefile
388 388
        i386/simple_idct_mmx.o \
389 389
        i386/idct_mmx_xvid.o \
390 390
        i386/idct_sse2_xvid.o \
391

  
392
OBJS-$(HAVE_YASM) += i386/fft_mmx.o \
391 393
        i386/fft_sse.o \
392 394
        i386/fft_3dn.o \
393 395
        i386/fft_3dn2.o \
libavcodec/dsputil.h
639 639
    uint16_t *revtab;
640 640
    FFTComplex *exptab;
641 641
    FFTComplex *exptab1; /* only used by SSE code */
642
    FFTComplex *tmp_buf;
643
    void (*fft_permute)(struct FFTContext *s, FFTComplex *z);
642 644
    void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
643 645
    void (*imdct_calc)(struct MDCTContext *s, FFTSample *output,
644 646
                       const FFTSample *input, FFTSample *tmp);
......
647 649
} FFTContext;
648 650

  
649 651
int ff_fft_init(FFTContext *s, int nbits, int inverse);
650
void ff_fft_permute(FFTContext *s, FFTComplex *z);
652
void ff_fft_permute_c(FFTContext *s, FFTComplex *z);
653
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
651 654
void ff_fft_calc_c(FFTContext *s, FFTComplex *z);
652 655
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
653 656
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
654 657
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
655 658
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
656 659

  
660
static inline void ff_fft_permute(FFTContext *s, FFTComplex *z)
661
{
662
    s->fft_permute(s, z);
663
}
657 664
static inline void ff_fft_calc(FFTContext *s, FFTComplex *z)
658 665
{
659 666
    s->fft_calc(s, z);
libavcodec/fft.c
1 1
/*
2 2
 * FFT/IFFT transforms
3
 * Copyright (c) 2008 Loren Merritt
3 4
 * Copyright (c) 2002 Fabrice Bellard.
5
 * Partly based on libdjbfft by D. J. Bernstein
4 6
 *
5 7
 * This file is part of FFmpeg.
6 8
 *
......
26 28

  
27 29
#include "dsputil.h"
28 30

  
31
/* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */
32
DECLARE_ALIGNED_16(FFTSample, ff_cos_16[8]);
33
DECLARE_ALIGNED_16(FFTSample, ff_cos_32[16]);
34
DECLARE_ALIGNED_16(FFTSample, ff_cos_64[32]);
35
DECLARE_ALIGNED_16(FFTSample, ff_cos_128[64]);
36
DECLARE_ALIGNED_16(FFTSample, ff_cos_256[128]);
37
DECLARE_ALIGNED_16(FFTSample, ff_cos_512[256]);
38
DECLARE_ALIGNED_16(FFTSample, ff_cos_1024[512]);
39
DECLARE_ALIGNED_16(FFTSample, ff_cos_2048[1024]);
40
DECLARE_ALIGNED_16(FFTSample, ff_cos_4096[2048]);
41
DECLARE_ALIGNED_16(FFTSample, ff_cos_8192[4096]);
42
DECLARE_ALIGNED_16(FFTSample, ff_cos_16384[8192]);
43
DECLARE_ALIGNED_16(FFTSample, ff_cos_32768[16384]);
44
DECLARE_ALIGNED_16(FFTSample, ff_cos_65536[32768]);
45
static FFTSample *ff_cos_tabs[] = {
46
    ff_cos_16, ff_cos_32, ff_cos_64, ff_cos_128, ff_cos_256, ff_cos_512, ff_cos_1024,
47
    ff_cos_2048, ff_cos_4096, ff_cos_8192, ff_cos_16384, ff_cos_32768, ff_cos_65536,
48
};
49

  
50
static int split_radix_permutation(int i, int n, int inverse)
51
{
52
    int m;
53
    if(n <= 2) return i&1;
54
    m = n >> 1;
55
    if(!(i&m))            return split_radix_permutation(i, m, inverse)*2;
56
    m >>= 1;
57
    if(inverse == !(i&m)) return split_radix_permutation(i, m, inverse)*4 + 1;
58
    else                  return split_radix_permutation(i, m, inverse)*4 - 1;
59
}
60

  
29 61
/**
30 62
 * The size of the FFT is 2^nbits. If inverse is TRUE, inverse FFT is
31 63
 * done
......
34 66
{
35 67
    int i, j, m, n;
36 68
    float alpha, c1, s1, s2;
37
    int shuffle = 0;
69
    int split_radix = 1;
38 70
    int av_unused has_vectors;
39 71

  
72
    if (nbits < 2 || nbits > 16)
73
        goto fail;
40 74
    s->nbits = nbits;
41 75
    n = 1 << nbits;
42 76

  
77
    s->tmp_buf = NULL;
43 78
    s->exptab = av_malloc((n / 2) * sizeof(FFTComplex));
44 79
    if (!s->exptab)
45 80
        goto fail;
......
50 85

  
51 86
    s2 = inverse ? 1.0 : -1.0;
52 87

  
53
    for(i=0;i<(n/2);i++) {
54
        alpha = 2 * M_PI * (float)i / (float)n;
55
        c1 = cos(alpha);
56
        s1 = sin(alpha) * s2;
57
        s->exptab[i].re = c1;
58
        s->exptab[i].im = s1;
59
    }
88
    s->fft_permute = ff_fft_permute_c;
60 89
    s->fft_calc = ff_fft_calc_c;
61 90
    s->imdct_calc = ff_imdct_calc;
62 91
    s->imdct_half = ff_imdct_half;
63 92
    s->exptab1 = NULL;
64 93

  
65
#ifdef HAVE_MMX
94
#if defined HAVE_MMX && defined HAVE_YASM
66 95
    has_vectors = mm_support();
67
    shuffle = 1;
68
    if (has_vectors & MM_3DNOWEXT) {
69
        /* 3DNowEx for K7/K8 */
96
    if (has_vectors & MM_SSE) {
97
        /* SSE for P3/P4/K8 */
98
        s->imdct_calc = ff_imdct_calc_sse;
99
        s->imdct_half = ff_imdct_half_sse;
100
        s->fft_permute = ff_fft_permute_sse;
101
        s->fft_calc = ff_fft_calc_sse;
102
    } else if (has_vectors & MM_3DNOWEXT) {
103
        /* 3DNowEx for K7 */
70 104
        s->imdct_calc = ff_imdct_calc_3dn2;
71 105
        s->imdct_half = ff_imdct_half_3dn2;
72 106
        s->fft_calc = ff_fft_calc_3dn2;
73 107
    } else if (has_vectors & MM_3DNOW) {
74 108
        /* 3DNow! for K6-2/3 */
75 109
        s->fft_calc = ff_fft_calc_3dn;
76
    } else if (has_vectors & MM_SSE) {
77
        /* SSE for P3/P4 */
78
        s->imdct_calc = ff_imdct_calc_sse;
79
        s->imdct_half = ff_imdct_half_sse;
80
        s->fft_calc = ff_fft_calc_sse;
81
    } else {
82
        shuffle = 0;
83 110
    }
84 111
#elif defined HAVE_ALTIVEC && !defined ALTIVEC_USE_REFERENCE_C_CODE
85 112
    has_vectors = mm_support();
86 113
    if (has_vectors & MM_ALTIVEC) {
87 114
        s->fft_calc = ff_fft_calc_altivec;
88
        shuffle = 1;
115
        split_radix = 0;
89 116
    }
90 117
#endif
91 118

  
92
    /* compute constant table for HAVE_SSE version */
93
    if (shuffle) {
119
    if (split_radix) {
120
        for(j=4; j<=nbits; j++) {
121
            int m = 1<<j;
122
            double freq = 2*M_PI/m;
123
            FFTSample *tab = ff_cos_tabs[j-4];
124
            for(i=0; i<=m/4; i++)
125
                tab[i] = cos(i*freq);
126
            for(i=1; i<m/4; i++)
127
                tab[m/2-i] = tab[i];
128
        }
129
        for(i=0; i<n; i++)
130
            s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i;
131
        s->tmp_buf = av_malloc(n * sizeof(FFTComplex));
132
    } else {
94 133
        int np, nblocks, np2, l;
95 134
        FFTComplex *q;
96 135

  
136
        for(i=0; i<(n/2); i++) {
137
            alpha = 2 * M_PI * (float)i / (float)n;
138
            c1 = cos(alpha);
139
            s1 = sin(alpha) * s2;
140
            s->exptab[i].re = c1;
141
            s->exptab[i].im = s1;
142
        }
143

  
97 144
        np = 1 << nbits;
98 145
        nblocks = np >> 3;
99 146
        np2 = np >> 1;
......
116 163
            nblocks = nblocks >> 1;
117 164
        } while (nblocks != 0);
118 165
        av_freep(&s->exptab);
119
    }
120 166

  
121 167
    /* compute bit reverse table */
122 168

  
......
127 173
        }
128 174
        s->revtab[i]=m;
129 175
    }
176
    }
177

  
130 178
    return 0;
131 179
 fail:
132 180
    av_freep(&s->revtab);
133 181
    av_freep(&s->exptab);
134 182
    av_freep(&s->exptab1);
183
    av_freep(&s->tmp_buf);
135 184
    return -1;
136 185
}
137 186

  
138
/* butter fly op */
139
#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
140
{\
141
  FFTSample ax, ay, bx, by;\
142
  bx=pre1;\
143
  by=pim1;\
144
  ax=qre1;\
145
  ay=qim1;\
146
  pre = (bx + ax);\
147
  pim = (by + ay);\
148
  qre = (bx - ax);\
149
  qim = (by - ay);\
150
}
151

  
152
#define MUL16(a,b) ((a) * (b))
153

  
154
#define CMUL(pre, pim, are, aim, bre, bim) \
155
{\
156
   pre = (MUL16(are, bre) - MUL16(aim, bim));\
157
   pim = (MUL16(are, bim) + MUL16(bre, aim));\
158
}
159

  
160
/**
161
 * Do a complex FFT with the parameters defined in ff_fft_init(). The
162
 * input data must be permuted before with s->revtab table. No
163
 * 1.0/sqrt(n) normalization is done.
164
 */
165
void ff_fft_calc_c(FFTContext *s, FFTComplex *z)
166
{
167
    int ln = s->nbits;
168
    int j, np, np2;
169
    int nblocks, nloops;
170
    register FFTComplex *p, *q;
171
    FFTComplex *exptab = s->exptab;
172
    int l;
173
    FFTSample tmp_re, tmp_im;
174

  
175
    np = 1 << ln;
176

  
177
    /* pass 0 */
178

  
179
    p=&z[0];
180
    j=(np >> 1);
181
    do {
182
        BF(p[0].re, p[0].im, p[1].re, p[1].im,
183
           p[0].re, p[0].im, p[1].re, p[1].im);
184
        p+=2;
185
    } while (--j != 0);
186

  
187
    /* pass 1 */
188

  
189

  
190
    p=&z[0];
191
    j=np >> 2;
192
    if (s->inverse) {
193
        do {
194
            BF(p[0].re, p[0].im, p[2].re, p[2].im,
195
               p[0].re, p[0].im, p[2].re, p[2].im);
196
            BF(p[1].re, p[1].im, p[3].re, p[3].im,
197
               p[1].re, p[1].im, -p[3].im, p[3].re);
198
            p+=4;
199
        } while (--j != 0);
200
    } else {
201
        do {
202
            BF(p[0].re, p[0].im, p[2].re, p[2].im,
203
               p[0].re, p[0].im, p[2].re, p[2].im);
204
            BF(p[1].re, p[1].im, p[3].re, p[3].im,
205
               p[1].re, p[1].im, p[3].im, -p[3].re);
206
            p+=4;
207
        } while (--j != 0);
208
    }
209
    /* pass 2 .. ln-1 */
210

  
211
    nblocks = np >> 3;
212
    nloops = 1 << 2;
213
    np2 = np >> 1;
214
    do {
215
        p = z;
216
        q = z + nloops;
217
        for (j = 0; j < nblocks; ++j) {
218
            BF(p->re, p->im, q->re, q->im,
219
               p->re, p->im, q->re, q->im);
220

  
221
            p++;
222
            q++;
223
            for(l = nblocks; l < np2; l += nblocks) {
224
                CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im);
225
                BF(p->re, p->im, q->re, q->im,
226
                   p->re, p->im, tmp_re, tmp_im);
227
                p++;
228
                q++;
229
            }
230

  
231
            p += nloops;
232
            q += nloops;
233
        }
234
        nblocks = nblocks >> 1;
235
        nloops = nloops << 1;
236
    } while (nblocks != 0);
237
}
238

  
239 187
/**
240 188
 * Do the permutation needed BEFORE calling ff_fft_calc()
241 189
 */
242
void ff_fft_permute(FFTContext *s, FFTComplex *z)
190
void ff_fft_permute_c(FFTContext *s, FFTComplex *z)
243 191
{
244 192
    int j, k, np;
245 193
    FFTComplex tmp;
246 194
    const uint16_t *revtab = s->revtab;
195
    np = 1 << s->nbits;
196

  
197
    if (s->tmp_buf) {
198
        /* TODO: handle split-radix permute in a more optimal way, probably in-place */
199
        for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j];
200
        memcpy(z, s->tmp_buf, np * sizeof(FFTComplex));
201
        return;
202
    }
247 203

  
248 204
    /* reverse */
249
    np = 1 << s->nbits;
250 205
    for(j=0;j<np;j++) {
251 206
        k = revtab[j];
252 207
        if (k < j) {
......
262 217
    av_freep(&s->revtab);
263 218
    av_freep(&s->exptab);
264 219
    av_freep(&s->exptab1);
220
    av_freep(&s->tmp_buf);
221
}
222

  
223
#define sqrthalf (float)M_SQRT1_2
224

  
225
#define BF(x,y,a,b) {\
226
    x = a - b;\
227
    y = a + b;\
228
}
229

  
230
#define BUTTERFLIES(a0,a1,a2,a3) {\
231
    BF(t3, t5, t5, t1);\
232
    BF(a2.re, a0.re, a0.re, t5);\
233
    BF(a3.im, a1.im, a1.im, t3);\
234
    BF(t4, t6, t2, t6);\
235
    BF(a3.re, a1.re, a1.re, t4);\
236
    BF(a2.im, a0.im, a0.im, t6);\
237
}
238

  
239
// force loading all the inputs before storing any.
240
// this is slightly slower for small data, but avoids store->load aliasing
241
// for addresses separated by large powers of 2.
242
#define BUTTERFLIES_BIG(a0,a1,a2,a3) {\
243
    FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\
244
    BF(t3, t5, t5, t1);\
245
    BF(a2.re, a0.re, r0, t5);\
246
    BF(a3.im, a1.im, i1, t3);\
247
    BF(t4, t6, t2, t6);\
248
    BF(a3.re, a1.re, r1, t4);\
249
    BF(a2.im, a0.im, i0, t6);\
250
}
251

  
252
#define TRANSFORM(a0,a1,a2,a3,wre,wim) {\
253
    t1 = a2.re * wre + a2.im * wim;\
254
    t2 = a2.im * wre - a2.re * wim;\
255
    t5 = a3.re * wre - a3.im * wim;\
256
    t6 = a3.im * wre + a3.re * wim;\
257
    BUTTERFLIES(a0,a1,a2,a3)\
258
}
259

  
260
#define TRANSFORM_ZERO(a0,a1,a2,a3) {\
261
    t1 = a2.re;\
262
    t2 = a2.im;\
263
    t5 = a3.re;\
264
    t6 = a3.im;\
265
    BUTTERFLIES(a0,a1,a2,a3)\
266
}
267

  
268
/* z[0...8n-1], w[1...2n-1] */
269
#define PASS(name)\
270
static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\
271
{\
272
    FFTSample t1, t2, t3, t4, t5, t6;\
273
    int o1 = 2*n;\
274
    int o2 = 4*n;\
275
    int o3 = 6*n;\
276
    const FFTSample *wim = wre+o1;\
277
    n--;\
278
\
279
    TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]);\
280
    TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\
281
    do {\
282
        z += 2;\
283
        wre += 2;\
284
        wim -= 2;\
285
        TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]);\
286
        TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\
287
    } while(--n);\
288
}
289

  
290
PASS(pass)
291
#undef BUTTERFLIES
292
#define BUTTERFLIES BUTTERFLIES_BIG
293
PASS(pass_big)
294

  
295
#define DECL_FFT(n,n2,n4)\
296
static void fft##n(FFTComplex *z)\
297
{\
298
    fft##n2(z);\
299
    fft##n4(z+n4*2);\
300
    fft##n4(z+n4*3);\
301
    pass(z,ff_cos_##n,n4/2);\
302
}
303

  
304
static void fft4(FFTComplex *z)
305
{
306
    FFTSample t1, t2, t3, t4, t5, t6, t7, t8;
307

  
308
    BF(t3, t1, z[0].re, z[1].re);
309
    BF(t8, t6, z[3].re, z[2].re);
310
    BF(z[2].re, z[0].re, t1, t6);
311
    BF(t4, t2, z[0].im, z[1].im);
312
    BF(t7, t5, z[2].im, z[3].im);
313
    BF(z[3].im, z[1].im, t4, t8);
314
    BF(z[3].re, z[1].re, t3, t7);
315
    BF(z[2].im, z[0].im, t2, t5);
316
}
317

  
318
static void fft8(FFTComplex *z)
319
{
320
    FFTSample t1, t2, t3, t4, t5, t6, t7, t8;
321

  
322
    fft4(z);
323

  
324
    BF(t1, z[5].re, z[4].re, -z[5].re);
325
    BF(t2, z[5].im, z[4].im, -z[5].im);
326
    BF(t3, z[7].re, z[6].re, -z[7].re);
327
    BF(t4, z[7].im, z[6].im, -z[7].im);
328
    BF(t8, t1, t3, t1);
329
    BF(t7, t2, t2, t4);
330
    BF(z[4].re, z[0].re, z[0].re, t1);
331
    BF(z[4].im, z[0].im, z[0].im, t2);
332
    BF(z[6].re, z[2].re, z[2].re, t7);
333
    BF(z[6].im, z[2].im, z[2].im, t8);
334

  
335
    TRANSFORM(z[1],z[3],z[5],z[7],sqrthalf,sqrthalf);
336
}
337

  
338
#ifndef CONFIG_SMALL
339
static void fft16(FFTComplex *z)
340
{
341
    FFTSample t1, t2, t3, t4, t5, t6;
342

  
343
    fft8(z);
344
    fft4(z+8);
345
    fft4(z+12);
346

  
347
    TRANSFORM_ZERO(z[0],z[4],z[8],z[12]);
348
    TRANSFORM(z[2],z[6],z[10],z[14],sqrthalf,sqrthalf);
349
    TRANSFORM(z[1],z[5],z[9],z[13],ff_cos_16[1],ff_cos_16[3]);
350
    TRANSFORM(z[3],z[7],z[11],z[15],ff_cos_16[3],ff_cos_16[1]);
351
}
352
#else
353
DECL_FFT(16,8,4)
354
#endif
355
DECL_FFT(32,16,8)
356
DECL_FFT(64,32,16)
357
DECL_FFT(128,64,32)
358
DECL_FFT(256,128,64)
359
DECL_FFT(512,256,128)
360
#ifndef CONFIG_SMALL
361
#define pass pass_big
362
#endif
363
DECL_FFT(1024,512,256)
364
DECL_FFT(2048,1024,512)
365
DECL_FFT(4096,2048,1024)
366
DECL_FFT(8192,4096,2048)
367
DECL_FFT(16384,8192,4096)
368
DECL_FFT(32768,16384,8192)
369
DECL_FFT(65536,32768,16384)
370

  
371
static void (*fft_dispatch[])(FFTComplex*) = {
372
    fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024,
373
    fft2048, fft4096, fft8192, fft16384, fft32768, fft65536,
374
};
375

  
376
/**
377
 * Do a complex FFT with the parameters defined in ff_fft_init(). The
378
 * input data must be permuted before with s->revtab table. No
379
 * 1.0/sqrt(n) normalization is done.
380
 */
381
void ff_fft_calc_c(FFTContext *s, FFTComplex *z)
382
{
383
    fft_dispatch[s->nbits-2](z);
265 384
}
266 385

  
libavcodec/i386/fft_3dn.c
1 1
/*
2 2
 * FFT/MDCT transform with 3DNow! optimizations
3
 * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
4
 * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
3
 * Copyright (c) 2008 Loren Merritt
5 4
 *
6 5
 * This file is part of FFmpeg.
7 6
 *
......
20 19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 20
 */
22 21

  
23
#include "libavutil/x86_cpu.h"
24
#include "libavcodec/dsputil.h"
25

  
26
static const int p1m1[2] __attribute__((aligned(8))) =
27
    { 0, 1 << 31 };
28

  
29
static const int m1p1[2] __attribute__((aligned(8))) =
30
    { 1 << 31, 0 };
31

  
32
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z)
33
{
34
    int ln = s->nbits;
35
    long j;
36
    x86_reg i;
37
    long nblocks, nloops;
38
    FFTComplex *p, *cptr;
39

  
40
    asm volatile(
41
        /* FEMMS is not a must here but recommended by AMD */
42
        "femms \n\t"
43
        "movq %0, %%mm7 \n\t"
44
        ::"m"(*(s->inverse ? m1p1 : p1m1))
45
    );
46

  
47
    i = 8 << ln;
48
    asm volatile(
49
        "1: \n\t"
50
        "sub $32, %0 \n\t"
51
        "movq    (%0,%1), %%mm0 \n\t"
52
        "movq  16(%0,%1), %%mm1 \n\t"
53
        "movq   8(%0,%1), %%mm2 \n\t"
54
        "movq  24(%0,%1), %%mm3 \n\t"
55
        "movq      %%mm0, %%mm4 \n\t"
56
        "movq      %%mm1, %%mm5 \n\t"
57
        "pfadd     %%mm2, %%mm0 \n\t"
58
        "pfadd     %%mm3, %%mm1 \n\t"
59
        "pfsub     %%mm2, %%mm4 \n\t"
60
        "pfsub     %%mm3, %%mm5 \n\t"
61
        "movq      %%mm0, %%mm2 \n\t"
62
        "punpckldq %%mm5, %%mm6 \n\t"
63
        "punpckhdq %%mm6, %%mm5 \n\t"
64
        "movq      %%mm4, %%mm3 \n\t"
65
        "pxor      %%mm7, %%mm5 \n\t"
66
        "pfadd     %%mm1, %%mm0 \n\t"
67
        "pfadd     %%mm5, %%mm4 \n\t"
68
        "pfsub     %%mm1, %%mm2 \n\t"
69
        "pfsub     %%mm5, %%mm3 \n\t"
70
        "movq      %%mm0,   (%0,%1) \n\t"
71
        "movq      %%mm4,  8(%0,%1) \n\t"
72
        "movq      %%mm2, 16(%0,%1) \n\t"
73
        "movq      %%mm3, 24(%0,%1) \n\t"
74
        "jg 1b \n\t"
75
        :"+r"(i)
76
        :"r"(z)
77
    );
78
    /* pass 2 .. ln-1 */
79

  
80
    nblocks = 1 << (ln-3);
81
    nloops = 1 << 2;
82
    cptr = s->exptab1;
83
    do {
84
        p = z;
85
        j = nblocks;
86
        do {
87
            i = nloops*8;
88
            asm volatile(
89
                "1: \n\t"
90
                "sub $16, %0 \n\t"
91
                "movq    (%1,%0), %%mm0 \n\t"
92
                "movq   8(%1,%0), %%mm1 \n\t"
93
                "movq    (%2,%0), %%mm2 \n\t"
94
                "movq   8(%2,%0), %%mm3 \n\t"
95
                "movq      %%mm2, %%mm4 \n\t"
96
                "movq      %%mm3, %%mm5 \n\t"
97
                "punpckldq %%mm2, %%mm2 \n\t"
98
                "punpckldq %%mm3, %%mm3 \n\t"
99
                "punpckhdq %%mm4, %%mm4 \n\t"
100
                "punpckhdq %%mm5, %%mm5 \n\t"
101
                "pfmul   (%3,%0,2), %%mm2 \n\t" //  cre*re cim*re
102
                "pfmul  8(%3,%0,2), %%mm3 \n\t"
103
                "pfmul 16(%3,%0,2), %%mm4 \n\t" // -cim*im cre*im
104
                "pfmul 24(%3,%0,2), %%mm5 \n\t"
105
                "pfadd     %%mm2, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
106
                "pfadd     %%mm3, %%mm5 \n\t"
107
                "movq      %%mm0, %%mm2 \n\t"
108
                "movq      %%mm1, %%mm3 \n\t"
109
                "pfadd     %%mm4, %%mm0 \n\t"
110
                "pfadd     %%mm5, %%mm1 \n\t"
111
                "pfsub     %%mm4, %%mm2 \n\t"
112
                "pfsub     %%mm5, %%mm3 \n\t"
113
                "movq      %%mm0,  (%1,%0) \n\t"
114
                "movq      %%mm1, 8(%1,%0) \n\t"
115
                "movq      %%mm2,  (%2,%0) \n\t"
116
                "movq      %%mm3, 8(%2,%0) \n\t"
117
                "jg 1b \n\t"
118
                :"+r"(i)
119
                :"r"(p), "r"(p + nloops), "r"(cptr)
120
            );
121
            p += nloops*2;
122
        } while (--j);
123
        cptr += nloops*2;
124
        nblocks >>= 1;
125
        nloops <<= 1;
126
    } while (nblocks != 0);
127
    asm volatile("femms");
128
}
22
#define EMULATE_3DNOWEXT
23
#include "fft_3dn2.c"
libavcodec/i386/fft_3dn2.c
23 23
#include "libavutil/x86_cpu.h"
24 24
#include "libavcodec/dsputil.h"
25 25

  
26
static const int p1m1[2] __attribute__((aligned(8))) =
27
    { 0, 1 << 31 };
26
#ifdef EMULATE_3DNOWEXT
27
#define ff_fft_calc_3dn2 ff_fft_calc_3dn
28
#define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn
29
#define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn
30
#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
31
#define ff_imdct_half_3dn2 ff_imdct_half_3dn
32
#endif
28 33

  
29
static const int m1p1[2] __attribute__((aligned(8))) =
30
    { 1 << 31, 0 };
34
void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits);
35
void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits);
31 36

  
32 37
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
33 38
{
34
    int ln = s->nbits;
35
    long j;
36
    x86_reg i;
37
    long nblocks, nloops;
38
    FFTComplex *p, *cptr;
39

  
40
    asm volatile(
41
        /* FEMMS is not a must here but recommended by AMD */
42
        "femms \n\t"
43
        "movq %0, %%mm7 \n\t"
44
        ::"m"(*(s->inverse ? m1p1 : p1m1))
45
    );
46

  
47
    i = 8 << ln;
48
    asm volatile(
49
        "1: \n\t"
50
        "sub $32, %0 \n\t"
51
        "movq    (%0,%1), %%mm0 \n\t"
52
        "movq  16(%0,%1), %%mm1 \n\t"
53
        "movq   8(%0,%1), %%mm2 \n\t"
54
        "movq  24(%0,%1), %%mm3 \n\t"
55
        "movq      %%mm0, %%mm4 \n\t"
56
        "movq      %%mm1, %%mm5 \n\t"
57
        "pfadd     %%mm2, %%mm0 \n\t"
58
        "pfadd     %%mm3, %%mm1 \n\t"
59
        "pfsub     %%mm2, %%mm4 \n\t"
60
        "pfsub     %%mm3, %%mm5 \n\t"
61
        "movq      %%mm0, %%mm2 \n\t"
62
        "pswapd    %%mm5, %%mm5 \n\t"
63
        "movq      %%mm4, %%mm3 \n\t"
64
        "pxor      %%mm7, %%mm5 \n\t"
65
        "pfadd     %%mm1, %%mm0 \n\t"
66
        "pfadd     %%mm5, %%mm4 \n\t"
67
        "pfsub     %%mm1, %%mm2 \n\t"
68
        "pfsub     %%mm5, %%mm3 \n\t"
69
        "movq      %%mm0,   (%0,%1) \n\t"
70
        "movq      %%mm4,  8(%0,%1) \n\t"
71
        "movq      %%mm2, 16(%0,%1) \n\t"
72
        "movq      %%mm3, 24(%0,%1) \n\t"
73
        "jg 1b \n\t"
74
        :"+r"(i)
75
        :"r"(z)
76
    );
77
    /* pass 2 .. ln-1 */
78

  
79
    nblocks = 1 << (ln-3);
80
    nloops = 1 << 2;
81
    cptr = s->exptab1;
82
    do {
83
        p = z;
84
        j = nblocks;
85
        do {
86
            i = nloops*8;
87
            asm volatile(
88
                "1: \n\t"
89
                "sub $16, %0 \n\t"
90
                "movq    (%1,%0), %%mm0 \n\t"
91
                "movq   8(%1,%0), %%mm1 \n\t"
92
                "movq    (%2,%0), %%mm2 \n\t"
93
                "movq   8(%2,%0), %%mm3 \n\t"
94
                "movq  (%3,%0,2), %%mm4 \n\t"
95
                "movq 8(%3,%0,2), %%mm5 \n\t"
96
                "pswapd    %%mm4, %%mm6 \n\t" // no need for cptr[2] & cptr[3]
97
                "pswapd    %%mm5, %%mm7 \n\t"
98
                "pfmul     %%mm2, %%mm4 \n\t" // cre*re cim*im
99
                "pfmul     %%mm3, %%mm5 \n\t"
100
                "pfmul     %%mm2, %%mm6 \n\t" // cim*re cre*im
101
                "pfmul     %%mm3, %%mm7 \n\t"
102
                "pfpnacc   %%mm6, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
103
                "pfpnacc   %%mm7, %%mm5 \n\t"
104
                "movq      %%mm0, %%mm2 \n\t"
105
                "movq      %%mm1, %%mm3 \n\t"
106
                "pfadd     %%mm4, %%mm0 \n\t"
107
                "pfadd     %%mm5, %%mm1 \n\t"
108
                "pfsub     %%mm4, %%mm2 \n\t"
109
                "pfsub     %%mm5, %%mm3 \n\t"
110
                "movq      %%mm0,  (%1,%0) \n\t"
111
                "movq      %%mm1, 8(%1,%0) \n\t"
112
                "movq      %%mm2,  (%2,%0) \n\t"
113
                "movq      %%mm3, 8(%2,%0) \n\t"
114
                "jg 1b \n\t"
115
                :"+r"(i)
116
                :"r"(p), "r"(p + nloops), "r"(cptr)
117
            );
118
            p += nloops*2;
119
        } while (--j);
120
        cptr += nloops*2;
121
        nblocks >>= 1;
122
        nloops <<= 1;
123
    } while (nblocks != 0);
39
    int n = 1<<s->nbits;
40
    int i;
41
    ff_fft_dispatch_interleave_3dn2(z, s->nbits);
124 42
    asm volatile("femms");
43
    if(n <= 8)
44
        for(i=0; i<n; i+=2)
45
            FFSWAP(FFTSample, z[i].im, z[i+1].re);
125 46
}
126 47

  
127 48
static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
......
162 83
        );
163 84
    }
164 85

  
165
    ff_fft_calc(&s->fft, z);
86
    ff_fft_calc_3dn2(&s->fft, z);
166 87

  
167 88
    /* post rotation + reordering */
168 89
    for(k = 0; k < n4; k++) {
libavcodec/i386/fft_mmx.asm
1
;******************************************************************************
2
;* FFT transform with SSE/3DNow optimizations
3
;* Copyright (c) 2008 Loren Merritt
4
;*
5
;* This file is part of FFmpeg.
6
;*
7
;* FFmpeg is free software; you can redistribute it and/or
8
;* modify it under the terms of the GNU Lesser General Public
9
;* License as published by the Free Software Foundation; either
10
;* version 2.1 of the License, or (at your option) any later version.
11
;*
12
;* FFmpeg is distributed in the hope that it will be useful,
13
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
;* Lesser General Public License for more details.
16
;*
17
;* You should have received a copy of the GNU Lesser General Public
18
;* License along with FFmpeg; if not, write to the Free Software
19
;* License along with FFmpeg; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
;******************************************************************************
21

  
22
; These functions are not individually interchangeable with the C versions.
23
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
24
; in blocks as convenient to the vector size.
25
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
26

  
27
%include "x86inc.asm"
28

  
29
SECTION_RODATA
30

  
31
%define M_SQRT1_2 0.70710678118654752440
32
ps_root2: times 4 dd M_SQRT1_2
33
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
34
ps_m1p1: dd 1<<31, 0
35

  
36
%assign i 16
37
%rep 13
38
cextern ff_cos_ %+ i
39
%assign i i<<1
40
%endrep
41

  
42
%ifdef ARCH_X86_64
43
    %define pointer dq
44
%else
45
    %define pointer dd
46
%endif
47

  
48
%macro IF0 1+
49
%endmacro
50
%macro IF1 1+
51
    %1
52
%endmacro
53

  
54
section .text align=16
55

  
56
%macro T2_3DN 4 ; z0, z1, mem0, mem1
57
    mova     %1, %3
58
    mova     %2, %1
59
    pfadd    %1, %4
60
    pfsub    %2, %4
61
%endmacro
62

  
63
%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
64
    mova     %5, %3
65
    pfsub    %3, %4
66
    pfadd    %5, %4 ; {t6,t5}
67
    pxor     %3, [ps_m1p1 GLOBAL] ; {t8,t7}
68
    mova     %6, %1
69
    pswapd   %3, %3
70
    pfadd    %1, %5 ; {r0,i0}
71
    pfsub    %6, %5 ; {r2,i2}
72
    mova     %4, %2
73
    pfadd    %2, %3 ; {r1,i1}
74
    pfsub    %4, %3 ; {r3,i3}
75
    SWAP     %3, %6
76
%endmacro
77

  
78
; in:  %1={r0,i0,r1,i1} %2={r2,i2,r3,i3}
79
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
80
%macro T4_SSE 3
81
    mova     %3, %1
82
    shufps   %1, %2, 0x64 ; {r0,i0,r3,i2}
83
    shufps   %3, %2, 0xce ; {r1,i1,r2,i3}
84
    mova     %2, %1
85
    addps    %1, %3       ; {t1,t2,t6,t5}
86
    subps    %2, %3       ; {t3,t4,t8,t7}
87
    mova     %3, %1
88
    shufps   %1, %2, 0x44 ; {t1,t2,t3,t4}
89
    shufps   %3, %2, 0xbe ; {t6,t5,t7,t8}
90
    mova     %2, %1
91
    addps    %1, %3       ; {r0,i0,r1,i1}
92
    subps    %2, %3       ; {r2,i2,r3,i3}
93
    mova     %3, %1
94
    shufps   %1, %2, 0x88 ; {r0,r1,r2,r3}
95
    shufps   %3, %2, 0xdd ; {i0,i1,i2,i3}
96
    SWAP     %2, %3
97
%endmacro
98

  
99
%macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1
100
    mova     %5, %3
101
    shufps   %3, %4, 0x44 ; {r4,i4,r6,i6}
102
    shufps   %5, %4, 0xee ; {r5,i5,r7,i7}
103
    mova     %6, %3
104
    subps    %3, %5       ; {r5,i5,r7,i7}
105
    addps    %6, %5       ; {t1,t2,t3,t4}
106
    mova     %5, %3
107
    shufps   %5, %5, 0xb1 ; {i5,r5,i7,r7}
108
    mulps    %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7}
109
    mulps    %5, [ps_root2 GLOBAL]
110
    addps    %3, %5       ; {t8,t7,ta,t9}
111
    mova     %5, %6
112
    shufps   %6, %3, 0x36 ; {t3,t2,t9,t8}
113
    shufps   %5, %3, 0x9c ; {t1,t4,t7,ta}
114
    mova     %3, %6
115
    addps    %6, %5       ; {t1,t2,t9,ta}
116
    subps    %3, %5       ; {t6,t5,tc,tb}
117
    mova     %5, %6
118
    shufps   %6, %3, 0xd8 ; {t1,t9,t5,tb}
119
    shufps   %5, %3, 0x8d ; {t2,ta,t6,tc}
120
    mova     %3, %1
121
    mova     %4, %2
122
    addps    %1, %6       ; {r0,r1,r2,r3}
123
    addps    %2, %5       ; {i0,i1,i2,i3}
124
    subps    %3, %6       ; {r4,r5,r6,r7}
125
    subps    %4, %5       ; {i4,i5,i6,i7}
126
%endmacro
127

  
128
; scheduled for cpu-bound sizes
129
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
130
IF%1 mova    m4, Z(4)
131
IF%1 mova    m5, Z(5)
132
    mova     m0, %2 ; wre
133
    mova     m2, m4
134
    mova     m1, %3 ; wim
135
    mova     m3, m5
136
    mulps    m2, m0 ; r2*wre
137
IF%1 mova    m6, Z(6)
138
    mulps    m3, m1 ; i2*wim
139
IF%1 mova    m7, Z(7)
140
    mulps    m4, m1 ; r2*wim
141
    mulps    m5, m0 ; i2*wre
142
    addps    m2, m3 ; r2*wre + i2*wim
143
    mova     m3, m1
144
    mulps    m1, m6 ; r3*wim
145
    subps    m5, m4 ; i2*wre - r2*wim
146
    mova     m4, m0
147
    mulps    m3, m7 ; i3*wim
148
    mulps    m4, m6 ; r3*wre
149
    mulps    m0, m7 ; i3*wre
150
    subps    m4, m3 ; r3*wre - i3*wim
151
    mova     m3, Z(0)
152
    addps    m0, m1 ; i3*wre + r3*wim
153
    mova     m1, m4
154
    addps    m4, m2 ; t5
155
    subps    m1, m2 ; t3
156
    subps    m3, m4 ; r2
157
    addps    m4, Z(0) ; r0
158
    mova     m6, Z(2)
159
    mova   Z(4), m3
160
    mova   Z(0), m4
161
    mova     m3, m5
162
    subps    m5, m0 ; t4
163
    mova     m4, m6
164
    subps    m6, m5 ; r3
165
    addps    m5, m4 ; r1
166
    mova   Z(6), m6
167
    mova   Z(2), m5
168
    mova     m2, Z(3)
169
    addps    m3, m0 ; t6
170
    subps    m2, m1 ; i3
171
    mova     m7, Z(1)
172
    addps    m1, Z(3) ; i1
173
    mova   Z(7), m2
174
    mova   Z(3), m1
175
    mova     m4, m7
176
    subps    m7, m3 ; i2
177
    addps    m3, m4 ; i0
178
    mova   Z(5), m7
179
    mova   Z(1), m3
180
%endmacro
181

  
182
; scheduled to avoid store->load aliasing
183
%macro PASS_BIG 1 ; (!interleave)
184
    mova     m4, Z(4) ; r2
185
    mova     m5, Z(5) ; i2
186
    mova     m2, m4
187
    mova     m0, [wq] ; wre
188
    mova     m3, m5
189
    mova     m1, [wq+o1q] ; wim
190
    mulps    m2, m0 ; r2*wre
191
    mova     m6, Z(6) ; r3
192
    mulps    m3, m1 ; i2*wim
193
    mova     m7, Z(7) ; i3
194
    mulps    m4, m1 ; r2*wim
195
    mulps    m5, m0 ; i2*wre
196
    addps    m2, m3 ; r2*wre + i2*wim
197
    mova     m3, m1
198
    mulps    m1, m6 ; r3*wim
199
    subps    m5, m4 ; i2*wre - r2*wim
200
    mova     m4, m0
201
    mulps    m3, m7 ; i3*wim
202
    mulps    m4, m6 ; r3*wre
203
    mulps    m0, m7 ; i3*wre
204
    subps    m4, m3 ; r3*wre - i3*wim
205
    mova     m3, Z(0)
206
    addps    m0, m1 ; i3*wre + r3*wim
207
    mova     m1, m4
208
    addps    m4, m2 ; t5
209
    subps    m1, m2 ; t3
210
    subps    m3, m4 ; r2
211
    addps    m4, Z(0) ; r0
212
    mova     m6, Z(2)
213
    mova   Z(4), m3
214
    mova   Z(0), m4
215
    mova     m3, m5
216
    subps    m5, m0 ; t4
217
    mova     m4, m6
218
    subps    m6, m5 ; r3
219
    addps    m5, m4 ; r1
220
IF%1 mova  Z(6), m6
221
IF%1 mova  Z(2), m5
222
    mova     m2, Z(3)
223
    addps    m3, m0 ; t6
224
    subps    m2, m1 ; i3
225
    mova     m7, Z(1)
226
    addps    m1, Z(3) ; i1
227
IF%1 mova  Z(7), m2
228
IF%1 mova  Z(3), m1
229
    mova     m4, m7
230
    subps    m7, m3 ; i2
231
    addps    m3, m4 ; i0
232
IF%1 mova  Z(5), m7
233
IF%1 mova  Z(1), m3
234
%if %1==0
235
    mova     m4, m5 ; r1
236
    mova     m0, m6 ; r3
237
    unpcklps m5, m1
238
    unpckhps m4, m1
239
    unpcklps m6, m2
240
    unpckhps m0, m2
241
    mova     m1, Z(0)
242
    mova     m2, Z(4)
243
    mova   Z(2), m5
244
    mova   Z(3), m4
245
    mova   Z(6), m6
246
    mova   Z(7), m0
247
    mova     m5, m1 ; r0
248
    mova     m4, m2 ; r2
249
    unpcklps m1, m3
250
    unpckhps m5, m3
251
    unpcklps m2, m7
252
    unpckhps m4, m7
253
    mova   Z(0), m1
254
    mova   Z(1), m5
255
    mova   Z(4), m2
256
    mova   Z(5), m4
257
%endif
258
%endmacro
259

  
260
%macro PUNPCK 3
261
    mova      %3, %1
262
    punpckldq %1, %2
263
    punpckhdq %3, %2
264
%endmacro
265

  
266
INIT_XMM
267

  
268
%define Z(x) [r0+mmsize*x]
269

  
270
align 16
271
fft4_sse:
272
    mova     m0, Z(0)
273
    mova     m1, Z(1)
274
    T4_SSE   m0, m1, m2
275
    mova   Z(0), m0
276
    mova   Z(1), m1
277
    ret
278

  
279
align 16
280
fft8_sse:
281
    mova     m0, Z(0)
282
    mova     m1, Z(1)
283
    T4_SSE   m0, m1, m2
284
    mova     m2, Z(2)
285
    mova     m3, Z(3)
286
    T8_SSE   m0, m1, m2, m3, m4, m5
287
    mova   Z(0), m0
288
    mova   Z(1), m1
289
    mova   Z(2), m2
290
    mova   Z(3), m3
291
    ret
292

  
293
align 16
294
fft16_sse:
295
    mova     m0, Z(0)
296
    mova     m1, Z(1)
297
    T4_SSE   m0, m1, m2
298
    mova     m2, Z(2)
299
    mova     m3, Z(3)
300
    T8_SSE   m0, m1, m2, m3, m4, m5
301
    mova     m4, Z(4)
302
    mova     m5, Z(5)
303
    mova   Z(0), m0
304
    mova   Z(1), m1
305
    mova   Z(2), m2
306
    mova   Z(3), m3
307
    T4_SSE   m4, m5, m6
308
    mova     m6, Z(6)
309
    mova     m7, Z(7)
310
    T4_SSE   m6, m7, m0
311
    PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL]
312
    ret
313

  
314

  
315
INIT_MMX
316

  
317
%macro FFT48_3DN 1
318
align 16
319
fft4%1:
320
    T2_3DN   m0, m1, Z(0), Z(1)
321
    mova     m2, Z(2)
322
    mova     m3, Z(3)
323
    T4_3DN   m0, m1, m2, m3, m4, m5
324
    PUNPCK   m0, m1, m4
325
    PUNPCK   m2, m3, m5
326
    mova   Z(0), m0
327
    mova   Z(1), m4
328
    mova   Z(2), m2
329
    mova   Z(3), m5
330
    ret
331

  
332
align 16
333
fft8%1:
334
    T2_3DN   m0, m1, Z(0), Z(1)
335
    mova     m2, Z(2)
336
    mova     m3, Z(3)
337
    T4_3DN   m0, m1, m2, m3, m4, m5
338
    mova   Z(0), m0
339
    mova   Z(2), m2
340
    T2_3DN   m4, m5, Z(4), Z(5)
341
    T2_3DN   m6, m7, Z(6), Z(7)
342
    pswapd   m0, m5
343
    pswapd   m2, m7
344
    pxor     m0, [ps_m1p1 GLOBAL]
345
    pxor     m2, [ps_m1p1 GLOBAL]
346
    pfsub    m5, m0
347
    pfadd    m7, m2
348
    pfmul    m5, [ps_root2 GLOBAL]
349
    pfmul    m7, [ps_root2 GLOBAL]
350
    T4_3DN   m1, m3, m5, m7, m0, m2
351
    mova   Z(5), m5
352
    mova   Z(7), m7
353
    mova     m0, Z(0)
354
    mova     m2, Z(2)
355
    T4_3DN   m0, m2, m4, m6, m5, m7
356
    PUNPCK   m0, m1, m5
357
    PUNPCK   m2, m3, m7
358
    mova   Z(0), m0
359
    mova   Z(1), m5
360
    mova   Z(2), m2
361
    mova   Z(3), m7
362
    PUNPCK   m4, Z(5), m5
363
    PUNPCK   m6, Z(7), m7
364
    mova   Z(4), m4
365
    mova   Z(5), m5
366
    mova   Z(6), m6
367
    mova   Z(7), m7
368
    ret
369
%endmacro
370

  
371
FFT48_3DN _3dn2
372

  
373
%macro pswapd 2
374
%ifidn %1, %2
375
    movd [r0+12], %1
376
    punpckhdq %1, [r0+8]
377
%else
378
    movq  %1, %2
379
    psrlq %1, 32
380
    punpckldq %1, %2
381
%endif
382
%endmacro
383

  
384
FFT48_3DN _3dn
385

  
386

  
387
%define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)]
388

  
389
%macro DECL_PASS 2+ ; name, payload
390
align 16
391
%1:
392
DEFINE_ARGS z, w, n, o1, o3
393
    lea o3q, [nq*3]
394
    lea o1q, [nq*8]
395
    shl o3q, 4
396
.loop:
397
    %2
398
    add zq, mmsize*2
399
    add wq, mmsize
400
    sub nd, mmsize/8
401
    jg .loop
402
    rep ret
403
%endmacro
404

  
405
INIT_XMM
406
DECL_PASS pass_sse, PASS_BIG 1
407
DECL_PASS pass_interleave_sse, PASS_BIG 0
408

  
409
INIT_MMX
410
%define mulps pfmul
411
%define addps pfadd
412
%define subps pfsub
413
%define unpcklps punpckldq
414
%define unpckhps punpckhdq
415
DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
416
DECL_PASS pass_interleave_3dn, PASS_BIG 0
417
%define pass_3dn2 pass_3dn
418
%define pass_interleave_3dn2 pass_interleave_3dn
419

  
420

  
421
%macro DECL_FFT 2-3 ; nbits, cpu, suffix
422
%xdefine list_of_fft fft4%2, fft8%2
423
%if %1==5
424
%xdefine list_of_fft list_of_fft, fft16%2
425
%endif
426

  
427
%assign n 1<<%1
428
%rep 17-%1
429
%assign n2 n/2
430
%assign n4 n/4
431
%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2
432

  
433
align 16
434
fft %+ n %+ %3%2:
435
    call fft %+ n2 %+ %2
436
    add r0, n*4 - (n&(-2<<%1))
437
    call fft %+ n4 %+ %2
438
    add r0, n*2 - (n2&(-2<<%1))
439
    call fft %+ n4 %+ %2
440
    sub r0, n*6 + (n2&(-2<<%1))
441
    lea r1, [ff_cos_ %+ n GLOBAL]
442
    mov r2d, n4/2
443
    jmp pass%3%2
444

  
445
%assign n n*2
446
%endrep
447
%undef n
448

  
449
align 8
450
dispatch_tab%3%2: pointer list_of_fft
451

  
452
; On x86_32, this function does the register saving and restoring for all of fft.
453
; The others pass args in registers and don't spill anything.
454
cglobal ff_fft_dispatch%3%2, 2,5,0, z, nbits
455
    lea r2, [dispatch_tab%3%2 GLOBAL]
456
    mov r2, [r2 + (nbitsq-2)*gprsize]
457
    call r2
458
    RET
459
%endmacro ; DECL_FFT
460

  
461
DECL_FFT 5, _sse
462
DECL_FFT 5, _sse, _interleave
463
DECL_FFT 4, _3dn
464
DECL_FFT 4, _3dn, _interleave
465
DECL_FFT 4, _3dn2
466
DECL_FFT 4, _3dn2, _interleave
467

  
libavcodec/i386/fft_sse.c
22 22
#include "libavutil/x86_cpu.h"
23 23
#include "libavcodec/dsputil.h"
24 24

  
25
static const int p1p1p1m1[4] __attribute__((aligned(16))) =
26
    { 0, 0, 0, 1 << 31 };
27

  
28
static const int p1p1m1p1[4] __attribute__((aligned(16))) =
29
    { 0, 0, 1 << 31, 0 };
30

  
31
static const int p1p1m1m1[4] __attribute__((aligned(16))) =
32
    { 0, 0, 1 << 31, 1 << 31 };
33

  
34 25
static const int p1m1p1m1[4] __attribute__((aligned(16))) =
35 26
    { 0, 1 << 31, 0, 1 << 31 };
36 27

  
37 28
static const int m1m1m1m1[4] __attribute__((aligned(16))) =
38 29
    { 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
39 30

  
40
#if 0
41
static void print_v4sf(const char *str, __m128 a)
42
{
43
    float *p = (float *)&a;
44
    printf("%s: %f %f %f %f\n",
45
           str, p[0], p[1], p[2], p[3]);
46
}
47
#endif
31
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
32
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
48 33

  
49
/* XXX: handle reverse case */
50 34
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
51 35
{
52
    int ln = s->nbits;
53
    x86_reg i;
54
    long j;
55
    long nblocks, nloops;
56
    FFTComplex *p, *cptr;
36
    int n = 1 << s->nbits;
57 37

  
58
    asm volatile(
59
        "movaps %0, %%xmm4 \n\t"
60
        "movaps %1, %%xmm5 \n\t"
61
        ::"m"(*p1p1m1m1),
62
          "m"(*(s->inverse ? p1p1m1p1 : p1p1p1m1))
63
    );
38
    ff_fft_dispatch_interleave_sse(z, s->nbits);
64 39

  
65
    i = 8 << ln;
66
    asm volatile(
67
        "1: \n\t"
68
        "sub $32, %0 \n\t"
69
        /* do the pass 0 butterfly */
70
        "movaps   (%0,%1), %%xmm0 \n\t"
71
        "movaps    %%xmm0, %%xmm1 \n\t"
72
        "shufps     $0x4E, %%xmm0, %%xmm0 \n\t"
73
        "xorps     %%xmm4, %%xmm1 \n\t"
74
        "addps     %%xmm1, %%xmm0 \n\t"
75
        "movaps 16(%0,%1), %%xmm2 \n\t"
76
        "movaps    %%xmm2, %%xmm3 \n\t"
77
        "shufps     $0x4E, %%xmm2, %%xmm2 \n\t"
78
        "xorps     %%xmm4, %%xmm3 \n\t"
79
        "addps     %%xmm3, %%xmm2 \n\t"
80
        /* multiply third by -i */
81
        /* by toggling the sign bit */
82
        "shufps     $0xB4, %%xmm2, %%xmm2 \n\t"
83
        "xorps     %%xmm5, %%xmm2 \n\t"
84
        /* do the pass 1 butterfly */
85
        "movaps    %%xmm0, %%xmm1 \n\t"
86
        "addps     %%xmm2, %%xmm0 \n\t"
87
        "subps     %%xmm2, %%xmm1 \n\t"
88
        "movaps    %%xmm0,   (%0,%1) \n\t"
89
        "movaps    %%xmm1, 16(%0,%1) \n\t"
90
        "jg 1b \n\t"
91
        :"+r"(i)
92
        :"r"(z)
93
    );
94
    /* pass 2 .. ln-1 */
40
    if(n <= 16) {
41
        x86_reg i = -8*n;
42
        asm volatile(
43
            "1: \n"
44
            "movaps     (%0,%1), %%xmm0 \n"
45
            "movaps      %%xmm0, %%xmm1 \n"
46
            "unpcklps 16(%0,%1), %%xmm0 \n"
47
            "unpckhps 16(%0,%1), %%xmm1 \n"
48
            "movaps      %%xmm0,   (%0,%1) \n"
49
            "movaps      %%xmm1, 16(%0,%1) \n"
50
            "add $32, %0 \n"
51
            "jl 1b \n"
52
            :"+r"(i)
53
            :"r"(z+n)
54
            :"memory"
55
        );
56
    }
57
}
95 58

  
96
    nblocks = 1 << (ln-3);
97
    nloops = 1 << 2;
98
    cptr = s->exptab1;
99
    do {
100
        p = z;
101
        j = nblocks;
102
        do {
103
            i = nloops*8;
104
            asm volatile(
105
                "1: \n\t"
106
                "sub $32, %0 \n\t"
107
                "movaps    (%2,%0), %%xmm1 \n\t"
108
                "movaps    (%1,%0), %%xmm0 \n\t"
109
                "movaps  16(%2,%0), %%xmm5 \n\t"
110
                "movaps  16(%1,%0), %%xmm4 \n\t"
111
                "movaps     %%xmm1, %%xmm2 \n\t"
112
                "movaps     %%xmm5, %%xmm6 \n\t"
113
                "shufps      $0xA0, %%xmm1, %%xmm1 \n\t"
114
                "shufps      $0xF5, %%xmm2, %%xmm2 \n\t"
115
                "shufps      $0xA0, %%xmm5, %%xmm5 \n\t"
116
                "shufps      $0xF5, %%xmm6, %%xmm6 \n\t"
117
                "mulps   (%3,%0,2), %%xmm1 \n\t" //  cre*re cim*re
118
                "mulps 16(%3,%0,2), %%xmm2 \n\t" // -cim*im cre*im
119
                "mulps 32(%3,%0,2), %%xmm5 \n\t" //  cre*re cim*re
120
                "mulps 48(%3,%0,2), %%xmm6 \n\t" // -cim*im cre*im
121
                "addps      %%xmm2, %%xmm1 \n\t"
122
                "addps      %%xmm6, %%xmm5 \n\t"
123
                "movaps     %%xmm0, %%xmm3 \n\t"
124
                "movaps     %%xmm4, %%xmm7 \n\t"
125
                "addps      %%xmm1, %%xmm0 \n\t"
126
                "subps      %%xmm1, %%xmm3 \n\t"
127
                "addps      %%xmm5, %%xmm4 \n\t"
128
                "subps      %%xmm5, %%xmm7 \n\t"
129
                "movaps     %%xmm0, (%1,%0) \n\t"
130
                "movaps     %%xmm3, (%2,%0) \n\t"
131
                "movaps     %%xmm4, 16(%1,%0) \n\t"
132
                "movaps     %%xmm7, 16(%2,%0) \n\t"
133
                "jg 1b \n\t"
134
                :"+r"(i)
135
                :"r"(p), "r"(p + nloops), "r"(cptr)
136
            );
137
            p += nloops*2;
138
        } while (--j);
139
        cptr += nloops*2;
140
        nblocks >>= 1;
141
        nloops <<= 1;
142
    } while (nblocks != 0);
59
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
60
{
61
    int n = 1 << s->nbits;
62
    int i;
63
    for(i=0; i<n; i+=2) {
64
        asm volatile(
65
            "movaps %2, %%xmm0 \n"
66
            "movlps %%xmm0, %0 \n"
67
            "movhps %%xmm0, %1 \n"
68
            :"=m"(s->tmp_buf[s->revtab[i]]),
69
             "=m"(s->tmp_buf[s->revtab[i+1]])
70
            :"m"(z[i])
71
        );
72
    }
73
    memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
143 74
}
144 75

  
145 76
static void imdct_sse(MDCTContext *s, const FFTSample *input, FFTSample *tmp)

Also available in: Unified diff