ffmpeg / libavcodec / i386 / fft_3dn2.c @ b550bfaa
History  View  Annotate  Download (6.9 KB)
1 
/*


2 
* FFT/MDCT transform with Extended 3DNow! optimizations

3 
* Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt

4 
* Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.

5 
*

6 
* This file is part of FFmpeg.

7 
*

8 
* FFmpeg is free software; you can redistribute it and/or

9 
* modify it under the terms of the GNU Lesser General Public

10 
* License as published by the Free Software Foundation; either

11 
* version 2.1 of the License, or (at your option) any later version.

12 
*

13 
* FFmpeg is distributed in the hope that it will be useful,

14 
* but WITHOUT ANY WARRANTY; without even the implied warranty of

15 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

16 
* Lesser General Public License for more details.

17 
*

18 
* You should have received a copy of the GNU Lesser General Public

19 
* License along with FFmpeg; if not, write to the Free Software

20 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

21 
*/

22 
#include "dsputil.h" 
23  
24 
/*
 * 64-bit XOR mask with only bit 31 of element 1 set (element 0 is zero).
 * ff_fft_calc_3dn2() loads it into %mm7 when s->inverse is zero and applies
 * it with pxor, which flips the top bit of the upper 32-bit lane only.
 * NOTE(review): the name presumably reads "+1, -1" (sign per lane) -- confirm.
 *
 * The value is built from an unsigned shift because `1 << 31` overflows a
 * signed int, which is undefined behaviour in C; the bit pattern is the same.
 */
static const int p1m1[2] __attribute__((aligned(8))) =
    { 0, (int)(1U << 31) };
26  
27 
/*
 * 64-bit XOR mask with only bit 31 of element 0 set (element 1 is zero).
 * ff_fft_calc_3dn2() loads it into %mm7 when s->inverse is non-zero and
 * applies it with pxor, which flips the top bit of the lower 32-bit lane only.
 * NOTE(review): the name presumably reads "-1, +1" (sign per lane) -- confirm.
 *
 * The value is built from an unsigned shift because `1 << 31` overflows a
 * signed int, which is undefined behaviour in C; the bit pattern is the same.
 */
static const int m1p1[2] __attribute__((aligned(8))) =
    { (int)(1U << 31), 0 };
29  
30 
/**
 * In-place FFT of z using Extended 3DNow! instructions.
 *
 * Reads s->nbits (log2 of the transform size), s->inverse (selects which
 * sign mask is loaded into %mm7) and s->exptab1 (twiddle factors for the
 * passes after the first two).
 *
 * The first asm loop handles passes 0 and 1 together, 32 bytes (four 8-byte
 * elements) per iteration, walking the 8 << nbits bytes of z from the end
 * down to offset 0.  The nested do/while loops then run the remaining passes,
 * halving nblocks and doubling nloops each time.
 *
 * NOTE(review): assumes FFTComplex is a pair of 32-bit floats (8 bytes) and
 * that z is 8-byte aligned -- confirm against dsputil.h.
 * NOTE(review): nblocks = 1 << (ln-3) requires s->nbits >= 3; smaller sizes
 * are not handled here.
 *
 * @param s FFT context (nbits, inverse, exptab1 are read)
 * @param z input/output buffer of 1 << s->nbits complex values
 */
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
{
    int ln = s->nbits;
    long i, j;
    long nblocks, nloops;
    FFTComplex *p, *cptr;

    /* Clear the MMX state and keep the sign mask in %mm7 for the loop below. */
    asm volatile(
        /* FEMMS is not a must here but recommended by AMD */
        "femms \n\t"
        "movq %0, %%mm7 \n\t"
        ::"m"(*(s->inverse ? m1p1 : p1m1))
    );

    /* pass 0 and 1: i is a byte offset counting down from the end of z */
    i = 8 << ln;
    asm volatile(
        "1: \n\t"
        "sub $32, %0 \n\t"
        "movq    (%0,%1), %%mm0 \n\t"
        "movq  16(%0,%1), %%mm1 \n\t"
        "movq   8(%0,%1), %%mm2 \n\t"
        "movq  24(%0,%1), %%mm3 \n\t"
        "movq      %%mm0, %%mm4 \n\t"
        "movq      %%mm1, %%mm5 \n\t"
        "pfadd     %%mm2, %%mm0 \n\t"
        "pfadd     %%mm3, %%mm1 \n\t"
        "pfsub     %%mm2, %%mm4 \n\t"
        "pfsub     %%mm3, %%mm5 \n\t"
        "movq      %%mm0, %%mm2 \n\t"
        "pswapd    %%mm5, %%mm5 \n\t"
        "movq      %%mm4, %%mm3 \n\t"
        "pxor      %%mm7, %%mm5 \n\t" // apply the direction-dependent sign mask
        "pfadd     %%mm1, %%mm0 \n\t"
        "pfadd     %%mm5, %%mm4 \n\t"
        "pfsub     %%mm1, %%mm2 \n\t"
        "pfsub     %%mm5, %%mm3 \n\t"
        "movq      %%mm0,   (%0,%1) \n\t"
        "movq      %%mm4,  8(%0,%1) \n\t"
        "movq      %%mm2, 16(%0,%1) \n\t"
        "movq      %%mm3, 24(%0,%1) \n\t"
        "jg 1b \n\t" // flags still come from the sub above; MMX ops leave them alone
        :"+r"(i)
        :"r"(z)
        :"memory" // z[] is read and written behind the compiler's back
    );

    /* pass 2 .. ln-1 */

    nblocks = 1 << (ln-3);
    nloops = 1 << 2;
    cptr = s->exptab1;
    do {
        p = z;
        j = nblocks;
        do {
            /* i is a byte offset into the current half-block, counting down */
            i = nloops*8;
            asm volatile(
                "1: \n\t"
                "sub $16, %0 \n\t"
                "movq    (%1,%0), %%mm0 \n\t"
                "movq   8(%1,%0), %%mm1 \n\t"
                "movq    (%2,%0), %%mm2 \n\t"
                "movq   8(%2,%0), %%mm3 \n\t"
                "movq  (%3,%0,2), %%mm4 \n\t"
                "movq 8(%3,%0,2), %%mm5 \n\t"
                "pswapd    %%mm4, %%mm6 \n\t" // no need for cptr[2] & cptr[3]
                "pswapd    %%mm5, %%mm7 \n\t"
                "pfmul     %%mm2, %%mm4 \n\t" // cre*re cim*im
                "pfmul     %%mm3, %%mm5 \n\t"
                "pfmul     %%mm2, %%mm6 \n\t" // cim*re cre*im
                "pfmul     %%mm3, %%mm7 \n\t"
                "pfpnacc   %%mm6, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
                "pfpnacc   %%mm7, %%mm5 \n\t"
                "movq      %%mm0, %%mm2 \n\t"
                "movq      %%mm1, %%mm3 \n\t"
                "pfadd     %%mm4, %%mm0 \n\t"
                "pfadd     %%mm5, %%mm1 \n\t"
                "pfsub     %%mm4, %%mm2 \n\t"
                "pfsub     %%mm5, %%mm3 \n\t"
                "movq      %%mm0,  (%1,%0) \n\t"
                "movq      %%mm1, 8(%1,%0) \n\t"
                "movq      %%mm2,  (%2,%0) \n\t"
                "movq      %%mm3, 8(%2,%0) \n\t"
                "jg 1b \n\t"
                :"+r"(i)
                :"r"(p), "r"(p + nloops), "r"(cptr)
                :"memory" // p[] is read and written behind the compiler's back
            );
            p += nloops*2;
        } while (--j);
        cptr += nloops*2;
        nblocks >>= 1;
        nloops <<= 1;
    } while (nblocks != 0);
    asm volatile("femms");
}
123  
124 
/**
 * Inverse MDCT using Extended 3DNow! instructions.
 *
 * With n = 1 << s->nbits, the function runs three stages:
 *  1. pre-rotation: for k in [0, n/4) combine input[n/2-1-2*k] and input[2*k]
 *     with tcos[k]/tsin[k] and store the result at z[revtab[k]];
 *  2. ff_fft_calc() on z;
 *  3. post-rotation of z[0 .. n/4) in place, followed by a reordering loop
 *     that writes all n values of output, negating selected components.
 *
 * NOTE(review): assumes FFTSample is a 32-bit float and FFTComplex a pair of
 * them (8 bytes) -- confirm against dsputil.h.
 *
 * @param s      MDCT context (nbits, fft, fft.revtab, tcos, tsin are read)
 * @param output destination, n samples are written
 * @param input  source, n/2 samples are read
 * @param tmp    scratch buffer, used as an array of n/4 FFTComplex
 */
void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
                        const FFTSample *input, FFTSample *tmp)
{
    long k, n8, n4, n2, n;
    const uint16_t *revtab = s->fft.revtab;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    const FFTSample *in1, *in2;
    FFTComplex *z = (FFTComplex *)tmp;

    n = 1 << s->nbits;
    n2 = n >> 1;
    n4 = n >> 2;
    n8 = n >> 3;

    /* pre rotation */
    in1 = input;            /* walks upwards over the even samples   */
    in2 = input + n2 - 1;   /* walks downwards over the odd samples  */
    for(k = 0; k < n4; k++) {
        // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it
        asm volatile(
            "movd       %0, %%mm0 \n\t"
            "movd       %2, %%mm1 \n\t"
            "punpckldq  %1, %%mm0 \n\t" // mm0 = { in2[-2*k], in1[2*k] }
            "punpckldq  %3, %%mm1 \n\t" // mm1 = { tcos[k], tsin[k] }
            "movq    %%mm0, %%mm2 \n\t"
            "pfmul   %%mm1, %%mm0 \n\t"
            "pswapd  %%mm1, %%mm1 \n\t"
            "pfmul   %%mm1, %%mm2 \n\t"
            "pfpnacc %%mm2, %%mm0 \n\t"
            ::"m"(in2[-2*k]), "m"(in1[2*k]),
              "m"(tcos[k]), "m"(tsin[k])
        );
        /* %mm0 is carried over from the asm statement above */
        asm volatile(
            "movq    %%mm0, %0    \n\t"
            :"=m"(z[revtab[k]])
        );
    }

    ff_fft_calc(&s->fft, z);

    /* post rotation + reordering */
    for(k = 0; k < n4; k++) {
        asm volatile(
            "movq       %0, %%mm0 \n\t"
            "movd       %1, %%mm1 \n\t"
            "punpckldq  %2, %%mm1 \n\t" // mm1 = { tcos[k], tsin[k] }
            "movq    %%mm0, %%mm2 \n\t"
            "pfmul   %%mm1, %%mm0 \n\t"
            "pswapd  %%mm1, %%mm1 \n\t"
            "pfmul   %%mm1, %%mm2 \n\t"
            "pfpnacc %%mm2, %%mm0 \n\t"
            "movq    %%mm0, %0    \n\t"
            :"+m"(z[k])
            :"m"(tcos[k]), "m"(tsin[k])
        );
    }

    /*
     * k is a byte offset here: it starts at the last FFTComplex of the upper
     * half of z (8*(n8-1) == n-8) and steps down by 8 until it goes negative.
     * 1U avoids the signed overflow of 1<<31; %mm7 then holds a mask with
     * only bit 31 of the low lane set.
     */
    k = n-8;
    asm volatile("movd %0, %%mm7" ::"r"(1U<<31));
    asm volatile(
        "1: \n\t"
        "movq    (%4,%0), %%mm0 \n\t" // z[n8+k]
        "neg %0 \n\t"
        "pswapd -8(%4,%0), %%mm1 \n\t" // z[n8-1-k]
        "movq      %%mm0, %%mm2 \n\t"
        "pxor      %%mm7, %%mm2 \n\t"
        "punpckldq %%mm1, %%mm2 \n\t"
        "pswapd    %%mm2, %%mm3 \n\t"
        "punpckhdq %%mm1, %%mm0 \n\t"
        "pswapd    %%mm0, %%mm4 \n\t"
        "pxor      %%mm7, %%mm0 \n\t"
        "pxor      %%mm7, %%mm4 \n\t"
        "movq      %%mm3, -8(%3,%0) \n\t" // output[n-2-2*k] = { z[n8-1-k].im, -z[n8+k].re }
        "movq      %%mm4, -8(%2,%0) \n\t" // output[n2-2-2*k]= { -z[n8-1-k].re, z[n8+k].im }
        "neg %0 \n\t"
        "movq      %%mm0, (%1,%0) \n\t"   // output[2*k] = { -z[n8+k].im, z[n8-1-k].re }
        "movq      %%mm2, (%2,%0) \n\t"   // output[n2+2*k] = { -z[n8+k].re, z[n8-1-k].im }
        "sub $8, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(k)
        :"r"(output), "r"(output+n2), "r"(output+n), "r"(z+n8)
        :"memory"
    );
    asm volatile("femms");
}
210 