ffmpeg / libavcodec / x86 / fft_sse.c @ 0cc8a5d0
History  View  Annotate  Download (2.89 KB)
1 
/*


2 
* FFT/MDCT transform with SSE optimizations

3 
* Copyright (c) 2008 Loren Merritt

4 
*

5 
* This file is part of FFmpeg.

6 
*

7 
* FFmpeg is free software; you can redistribute it and/or

8 
* modify it under the terms of the GNU Lesser General Public

9 
* License as published by the Free Software Foundation; either

10 
* version 2.1 of the License, or (at your option) any later version.

11 
*

12 
* FFmpeg is distributed in the hope that it will be useful,

13 
* but WITHOUT ANY WARRANTY; without even the implied warranty of

14 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

15 
* Lesser General Public License for more details.

16 
*

17 
* You should have received a copy of the GNU Lesser General Public

18 
* License along with FFmpeg; if not, write to the Free Software

19 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

20 
*/

21  
22 
#include "libavutil/x86_cpu.h" 
23 
#include "libavcodec/dsputil.h" 
24 
#include "fft.h" 
25  
26 
/* Sign-bit mask for four packed 32-bit lanes; ff_imdct_calc_sse() XORs it
 * into a vector of floats to negate every lane.  The constant is spelled
 * 1U << 31 because shifting a signed 1 into the sign bit is undefined
 * behavior in C. */
DECLARE_ALIGNED(16, static const unsigned int, m1m1m1m1)[4] =
    { 1U << 31, 1U << 31, 1U << 31, 1U << 31 };
28  
29 
/* FFT kernels defined outside this file (no definition is visible here;
 * presumably the hand-written assembly implementation -- confirm). */
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
/* Variant invoked by ff_fft_calc_sse() for every transform size. */
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
31  
32 
/**
 * Run the SSE FFT dispatch routine on z and, for transforms of at most
 * 16 points, interleave the result in place.
 *
 * @param s  FFT context; only s->nbits (log2 of the transform size) is read
 * @param z  array of 1 << s->nbits complex values, modified in place.
 *           It is accessed with movaps, so it must be 16-byte aligned.
 *
 * NOTE(review): presumably the dispatch routine leaves transforms of up to
 * 16 points in a de-interleaved layout, which is why only those sizes get
 * the extra pass below -- confirm against its implementation.
 */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;

    ff_fft_dispatch_interleave_sse(z, s->nbits);

    if (n <= 16) {
        /* Byte offset relative to the END of the array (z+n).  It starts at
         * -8*n (assumes sizeof(FFTComplex) == 8 -- TODO confirm) and climbs
         * towards zero, so the "add" that advances it also sets the flags
         * tested by "jl": the loop stops once the offset reaches 0. */
        x86_reg i = -8 * n;
        __asm__ volatile(
            "1: \n"
            /* Each iteration takes two adjacent 16-byte vectors A and B and
             * rewrites them as A0 B0 A1 B1 | A2 B2 A3 B3. */
            "movaps (%0,%1), %%xmm0 \n"
            "movaps %%xmm0, %%xmm1 \n"
            "unpcklps 16(%0,%1), %%xmm0 \n"
            "unpckhps 16(%0,%1), %%xmm1 \n"
            "movaps %%xmm0, (%0,%1) \n"
            "movaps %%xmm1, 16(%0,%1) \n"
            "add $32, %0 \n"
            "jl 1b \n"
            :"+r"(i)
            :"r"(z+n)
            :"memory"
        );
    }
}
56  
57 
/**
 * Reorder z in place according to the context's permutation table.
 *
 * Element z[i] is written to s->tmp_buf[s->revtab[i]] for every i, and the
 * scratch buffer is then copied back over z.
 *
 * @param s  FFT context; s->nbits, s->revtab and s->tmp_buf are used.
 *           s->tmp_buf must hold at least 1 << s->nbits elements.
 * @param z  array of 1 << s->nbits complex values.  It is read with movaps
 *           two elements at a time, so it must be 16-byte aligned.
 */
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;
    int i;

    for (i = 0; i < n; i += 2) {
        /* One 16-byte load fetches z[i] and z[i+1]; the low and high
         * 8-byte halves are then scattered to their own destinations. */
        __asm__ volatile(
            "movaps %2, %%xmm0 \n"
            "movlps %%xmm0, %0 \n"
            "movhps %%xmm0, %1 \n"
            :"=m"(s->tmp_buf[s->revtab[i]]),
             "=m"(s->tmp_buf[s->revtab[i+1]])
            :"m"(z[i])
        );
    }
    memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
}
73  
74 
/**
 * Compute a full inverse MDCT into output.
 *
 * ff_imdct_half_sse() is called with output + n/4 as its destination; the
 * loop below then fills the first and last quarters of output from the two
 * middle quarters:
 *   - first quarter  = second quarter, reversed and negated
 *   - fourth quarter = third quarter, reversed
 *
 * @param s       context; s->mdct_size (n, in samples) is read here and s is
 *                forwarded to ff_imdct_half_sse()
 * @param output  buffer of n samples, accessed with movaps (16-byte aligned)
 * @param input   coefficients, forwarded unchanged to ff_imdct_half_sse()
 *
 * NOTE(review): presumably ff_imdct_half_sse() writes exactly the n/2
 * samples starting at output + n/4 -- confirm against its implementation.
 */
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    x86_reg j, k;
    long n = s->mdct_size;
    long n4 = n >> 2;

    ff_imdct_half_sse(s, output+n4, input);

    /* j and k are BYTE offsets.  A quarter of the buffer is n/4 samples of
     * 4 bytes each (assumes sizeof(FFTSample) == 4 -- TODO confirm), i.e.
     * n bytes.  j walks upwards from -n to 0 and k walks downwards from the
     * last 16-byte vector of a quarter to its first. */
    j = -n;
    k = n-16;
    __asm__ volatile(
        "movaps %4, %%xmm7 \n"            /* xmm7 = sign-bit mask */
        "1: \n"
        "movaps (%2,%1), %%xmm0 \n"       /* from the second quarter */
        "movaps (%3,%0), %%xmm1 \n"       /* from the third quarter */
        "shufps $0x1b, %%xmm0, %%xmm0 \n" /* reverse the four lanes */
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "xorps %%xmm7, %%xmm0 \n"         /* negate */
        "movaps %%xmm1, (%3,%1) \n"       /* into the fourth quarter */
        "movaps %%xmm0, (%2,%0) \n"       /* into the first quarter */
        "sub $16, %1 \n"
        "add $16, %0 \n"                  /* sets the flags tested by jl */
        "jl 1b \n"
        :"+r"(j), "+r"(k)
        :"r"(output+n4), "r"(output+n4*3),
         "m"(*m1m1m1m1)
        /* The stores go through register pointers that the compiler cannot
         * see as outputs, so the memory side effect must be declared. */
        :"memory"
    );
}
102 