Revision bcfa3e58 libavcodec/i386/fft_3dn2.c

View differences:

libavcodec/i386/fft_3dn2.c
1 1
/*
2 2
 * FFT/MDCT transform with Extended 3DNow! optimizations
3
 * Copyright (c) 2006 Zuxy MENG Jie.
3
 * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
4 4
 * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
5 5
 *
6 6
 * This library is free software; you can redistribute it and/or
......
134 134
}
135 135

  
136 136
#endif
137

  
138
/**
 * Inverse MDCT using Extended 3DNow! packed-float instructions.
 *
 * With n = 1 << s->nbits, the routine runs three phases:
 *   1. pre-rotation: n/4 complex values are built from input and multiplied
 *      by (tcos[k] + i*tsin[k]), then stored into tmp at index revtab[k];
 *   2. ff_fft_calc() is run on tmp;
 *   3. post-rotation by the same twiddles, followed by a reordering pass that
 *      writes sign-flipped / swapped copies into the four quarters of output.
 *
 * @param s      MDCT context; nbits, tcos, tsin and fft.revtab are read
 * @param output destination; indices 0 .. n-1 are written
 * @param input  source coefficients; indices 0 .. n/2-1 are read
 * @param tmp    scratch buffer reinterpreted as FFTComplex[]; entries
 *               0 .. n/4-1 are used after the FFT (assumes revtab[] maps
 *               into that same range and that tmp is suitably aligned for
 *               FFTComplex -- TODO confirm against the caller)
 *
 * The MMX/3DNow! state is released with emms before returning.
 */
void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
                        const FFTSample *input, FFTSample *tmp)
{
    int k, n8, n4, n2, n;
    const uint16_t *revtab = s->fft.revtab;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    const FFTSample *in1, *in2;
    FFTComplex *z = (FFTComplex *)tmp;

    n = 1 << s->nbits;
    n2 = n >> 1;
    n4 = n >> 2;
    n8 = n >> 3;

    /* pre rotation */
    /* in1 walks forward over even input indices, in2 walks backward from the
     * last element of the first half. For each k:
     *   z[revtab[k]] = (in2[-2k] + i*in1[2k]) * (tcos[k] + i*tsin[k])
     * pfpnacc produces { lo(dst) - hi(dst), lo(src) + hi(src) }, which yields
     * the real and imaginary parts of the complex product in one step. */
    in1 = input;
    in2 = input + n2 - 1;
    for(k = 0; k < n4; k++) {
        asm volatile(
            "movd       %1, %%mm0 \n\t"
            "movd       %3, %%mm1 \n\t"
            "punpckldq  %2, %%mm0 \n\t"
            "punpckldq  %4, %%mm1 \n\t"
            "movq    %%mm0, %%mm2 \n\t"
            "pfmul   %%mm1, %%mm0 \n\t"
            "pswapd  %%mm1, %%mm1 \n\t"
            "pfmul   %%mm1, %%mm2 \n\t"
            "pfpnacc %%mm2, %%mm0 \n\t"
            "movq    %%mm0, %0    \n\t"
            :"=m"(z[revtab[k]])
            :"m"(in2[-2*k]), "m"(in1[2*k]),
             "m"(tcos[k]), "m"(tsin[k])
        );
    }

    ff_fft_calc(&s->fft, z);

    /* post rotation + reordering */
    /* In place: z[k] *= (tcos[k] + i*tsin[k]), same multiply pattern as the
     * pre-rotation but with z[k] as the left operand. */
    for(k = 0; k < n4; k++) {
        asm volatile(
            "movq       %0, %%mm0 \n\t"
            "movd       %1, %%mm1 \n\t"
            "punpckldq  %2, %%mm1 \n\t"
            "movq    %%mm0, %%mm2 \n\t"
            "pfmul   %%mm1, %%mm0 \n\t"
            "pswapd  %%mm1, %%mm1 \n\t"
            "pfmul   %%mm1, %%mm2 \n\t"
            "pfpnacc %%mm2, %%mm0 \n\t"
            "movq    %%mm0, %0    \n\t"
            :"+m"(z[k])
            :"m"(tcos[k]), "m"(tsin[k])
        );
    }

    /* mm7 = float sign-bit mask in the low dword (movd clears the high one),
     * so "pxor %%mm7, reg" negates only the low float of reg. The constant is
     * written as 1U<<31 because shifting a signed 1 into the sign bit is
     * undefined behaviour in C. mm7 is expected to survive until the loop
     * below; no other code may touch MMX registers in between. */
    asm volatile("movd %0, %%mm7" ::"r"(1U<<31));
    for(k = 0; k < n8; k++) {
        /* Each movq stores two consecutive floats while the "=m" operands
         * name only the first one; the "memory" clobber tells the compiler
         * about the extra writes. */
        asm volatile(
            "movq         %4, %%mm0 \n\t"
            "pswapd       %5, %%mm1 \n\t"
            "movq      %%mm0, %%mm2 \n\t"
            "pxor      %%mm7, %%mm2 \n\t"
            "punpckldq %%mm1, %%mm2 \n\t"
            "pswapd    %%mm2, %%mm3 \n\t"
            "punpckhdq %%mm1, %%mm0 \n\t"
            "pswapd    %%mm0, %%mm4 \n\t"
            "pxor      %%mm7, %%mm0 \n\t"
            "pxor      %%mm7, %%mm4 \n\t"
            "movq      %%mm0, %0    \n\t" // { -z[n8+k].im, z[n8-1-k].re }
            "movq      %%mm4, %1    \n\t" // { -z[n8-1-k].re, z[n8+k].im }
            "movq      %%mm2, %2    \n\t" // { -z[n8+k].re, z[n8-1-k].im }
            "movq      %%mm3, %3    \n\t" // { z[n8-1-k].im, -z[n8+k].re }
            :"=m"(output[2*k]), "=m"(output[n2-2-2*k]),
             "=m"(output[n2+2*k]), "=m"(output[n-2-2*k])
            :"m"(z[n8+k]), "m"(z[n8-1-k])
            :"memory"
        );
    }
    /* Leave MMX state so that subsequent x87 floating-point code works. */
    asm volatile("emms");
}

Also available in: Unified diff