Statistics
| Branch: | Revision:

ffmpeg / libavcodec / i386 / fft_3dn.c @ 40d0e665

History | View | Annotate | Download (4.24 KB)

1 82eb4b0f Zuxy Meng
/*
2
 * FFT/MDCT transform with 3DNow! optimizations
3 1e4ecf26 Loren Merritt
 * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
4 82eb4b0f Zuxy Meng
 * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
5
 *
6 b78e7197 Diego Biurrun
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or
9 82eb4b0f Zuxy Meng
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11 b78e7197 Diego Biurrun
 * version 2.1 of the License, or (at your option) any later version.
12 82eb4b0f Zuxy Meng
 *
13 b78e7197 Diego Biurrun
 * FFmpeg is distributed in the hope that it will be useful,
14 82eb4b0f Zuxy Meng
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19 b78e7197 Diego Biurrun
 * License along with FFmpeg; if not, write to the Free Software
20 82eb4b0f Zuxy Meng
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 */
22 b550bfaa Ronald S. Bultje
#include "dsputil.h"
23 40d0e665 Ramiro Polla
#include "x86_cpu.h"
24 82eb4b0f Zuxy Meng
25
/*
 * 64-bit sign masks for a pair of packed 32-bit floats.  One of them is
 * loaded into mm7 by ff_fft_calc_3dn() and XORed into a register there,
 * which flips the sign of the float whose lane holds the 1 << 31 bit:
 *   p1m1 - low lane untouched, high lane negated (used when !s->inverse)
 *   m1p1 - low lane negated, high lane untouched (used when s->inverse)
 *
 * The elements are unsigned so that setting bit 31 is well defined;
 * "1 << 31" on a signed int shifts into the sign bit, which is undefined
 * behaviour in C.  The stored bit pattern, size and 8-byte alignment
 * (required for the movq load) are unchanged.
 */
static const unsigned int p1m1[2] __attribute__((aligned(8))) =
    { 0, 1U << 31 };

static const unsigned int m1p1[2] __attribute__((aligned(8))) =
    { 1U << 31, 0 };
30
31
/**
 * In-place FFT butterfly passes implemented with 3DNow! instructions.
 *
 * The routine works on 8 << s->nbits bytes starting at z (presumably
 * 1 << s->nbits FFTComplex elements of two floats each - confirm against
 * the FFTComplex declaration).  It performs no reordering of z; it only
 * runs the butterfly passes:
 *   - passes 0 and 1 are fused into one loop over groups of 32 bytes,
 *   - passes 2 .. nbits-1 use the multiplier table s->exptab1.
 *
 * @param s  FFT context; nbits, inverse and exptab1 are read.
 *           NOTE: "1 << (ln-3)" below is only a defined shift for
 *           nbits >= 3, so smaller transforms must not reach this code.
 * @param z  data buffer, transformed in place.
 *
 * Each asm statement that touches the buffer carries a "memory" clobber:
 * the buffer is accessed through pointer operands only, so without the
 * clobber the compiler is not told that memory is read and written.
 */
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z)
{
    int ln = s->nbits;
    long j;
    x86_reg i;
    long nblocks, nloops;
    FFTComplex *p, *cptr;

    /* mm7 = sign mask used by the fused first passes; the mask depends on
     * the transform direction (s->inverse). */
    asm volatile(
        /* FEMMS is not a must here but recommended by AMD */
        "femms \n\t"
        "movq %0, %%mm7 \n\t"
        ::"m"(*(s->inverse ? m1p1 : p1m1))
    );

    /* pass 0 and 1: i is a byte offset that walks the buffer from the end
     * towards the start, 32 bytes (four 8-byte values) per iteration.  The
     * flags set by "sub" survive until "jg" because the MMX/3DNow!
     * instructions in between do not modify EFLAGS. */
    i = 8 << ln;
    asm volatile(
        "1: \n\t"
        "sub $32, %0 \n\t"
        "movq    (%0,%1), %%mm0 \n\t"   /* a0 */
        "movq  16(%0,%1), %%mm1 \n\t"   /* a2 */
        "movq   8(%0,%1), %%mm2 \n\t"   /* a1 */
        "movq  24(%0,%1), %%mm3 \n\t"   /* a3 */
        "movq      %%mm0, %%mm4 \n\t"
        "movq      %%mm1, %%mm5 \n\t"
        "pfadd     %%mm2, %%mm0 \n\t"   /* a0 + a1 */
        "pfadd     %%mm3, %%mm1 \n\t"   /* a2 + a3 */
        "pfsub     %%mm2, %%mm4 \n\t"   /* a0 - a1 */
        "pfsub     %%mm3, %%mm5 \n\t"   /* a2 - a3 */
        "movq      %%mm0, %%mm2 \n\t"
        /* swap the two 32-bit halves of mm5, using mm6 as scratch */
        "punpckldq %%mm5, %%mm6 \n\t"
        "punpckhdq %%mm6, %%mm5 \n\t"
        "movq      %%mm4, %%mm3 \n\t"
        /* negate the half selected by the mask in mm7 */
        "pxor      %%mm7, %%mm5 \n\t"
        "pfadd     %%mm1, %%mm0 \n\t"
        "pfadd     %%mm5, %%mm4 \n\t"
        "pfsub     %%mm1, %%mm2 \n\t"
        "pfsub     %%mm5, %%mm3 \n\t"
        "movq      %%mm0,   (%0,%1) \n\t"
        "movq      %%mm4,  8(%0,%1) \n\t"
        "movq      %%mm2, 16(%0,%1) \n\t"
        "movq      %%mm3, 24(%0,%1) \n\t"
        "jg 1b \n\t"
        :"+r"(i)
        :"r"(z)
        :"memory"
    );
    /* pass 2 .. ln-1 */

    nblocks = 1 << (ln-3);
    nloops = 1 << 2;
    cptr = s->exptab1;
    do {
        p = z;
        j = nblocks;
        do {
            /* One block: combine nloops values at p with the nloops values
             * at p + nloops.  i is a byte offset counting down 16 bytes
             * (two values) per iteration; the table is indexed with 2*i,
             * i.e. it supplies 32 bytes per pair of values. */
            i = nloops*8;
            asm volatile(
                "1: \n\t"
                "sub $16, %0 \n\t"
                "movq    (%1,%0), %%mm0 \n\t"
                "movq   8(%1,%0), %%mm1 \n\t"
                "movq    (%2,%0), %%mm2 \n\t"
                "movq   8(%2,%0), %%mm3 \n\t"
                "movq      %%mm2, %%mm4 \n\t"
                "movq      %%mm3, %%mm5 \n\t"
                /* broadcast the low halves into mm2/mm3 and the high
                 * halves into mm4/mm5 */
                "punpckldq %%mm2, %%mm2 \n\t"
                "punpckldq %%mm3, %%mm3 \n\t"
                "punpckhdq %%mm4, %%mm4 \n\t"
                "punpckhdq %%mm5, %%mm5 \n\t"
                "pfmul   (%3,%0,2), %%mm2 \n\t" //  cre*re cim*re
                "pfmul  8(%3,%0,2), %%mm3 \n\t"
                "pfmul 16(%3,%0,2), %%mm4 \n\t" // -cim*im cre*im
                "pfmul 24(%3,%0,2), %%mm5 \n\t"
                "pfadd     %%mm2, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
                "pfadd     %%mm3, %%mm5 \n\t"
                "movq      %%mm0, %%mm2 \n\t"
                "movq      %%mm1, %%mm3 \n\t"
                /* butterfly: sum goes back to p, difference to p + nloops */
                "pfadd     %%mm4, %%mm0 \n\t"
                "pfadd     %%mm5, %%mm1 \n\t"
                "pfsub     %%mm4, %%mm2 \n\t"
                "pfsub     %%mm5, %%mm3 \n\t"
                "movq      %%mm0,  (%1,%0) \n\t"
                "movq      %%mm1, 8(%1,%0) \n\t"
                "movq      %%mm2,  (%2,%0) \n\t"
                "movq      %%mm3, 8(%2,%0) \n\t"
                "jg 1b \n\t"
                :"+r"(i)
                :"r"(p), "r"(p + nloops), "r"(cptr)
                :"memory"
            );
            p += nloops*2;
        } while (--j);
        /* next pass: advance in the table, half as many blocks, each
         * twice as long */
        cptr += nloops*2;
        nblocks >>= 1;
        nloops <<= 1;
    } while (nblocks != 0);
    /* leave MMX/3DNow! state so that x87 code can run afterwards */
    asm volatile("femms");
}