Revision bcfa3e58 libavcodec/i386/fft_3dn2.c
libavcodec/i386/fft_3dn2.c  

1  1 
/* 
2  2 
* FFT/MDCT transform with Extended 3DNow! optimizations 
3 
* Copyright (c) 2006 Zuxy MENG Jie.


3 
* Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt


4  4 
* Based on fft_sse.c copyright (c) 2002 Fabrice Bellard. 
5  5 
* 
6  6 
* This library is free software; you can redistribute it and/or 
...  ...  
134  134 
} 
135  135  
136  136 
#endif 
137  
138 
/**
 * Compute an inverse MDCT using Extended 3DNow! instructions.
 *
 * With n = 1 << s->nbits, the routine reads input[0 .. n/2-1], uses tmp as
 * scratch space for n/4 complex values, and writes output[0 .. n-1].
 *
 * @param s      MDCT context; nbits, tcos, tsin and fft.revtab are read,
 *               and &s->fft is passed to ff_fft_calc()
 * @param output destination buffer, n samples
 * @param input  source buffer, n/2 samples
 * @param tmp    scratch buffer, reinterpreted as an FFTComplex array
 *
 * NOTE(review): every asm block moves 32-bit lanes with movd/movq/pfmul, so
 * this assumes FFTSample is a 32-bit float and FFTComplex is { re, im } in
 * that order -- confirm against the type definitions.
 */
void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
                        const FFTSample *input, FFTSample *tmp)
{
    int k, n8, n4, n2, n;
    const uint16_t *revtab = s->fft.revtab;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    const FFTSample *in1, *in2;
    FFTComplex *z = (FFTComplex *)tmp;

    n  = 1 << s->nbits;
    n2 = n >> 1;
    n4 = n >> 2;
    n8 = n >> 3;

    /* pre rotation */
    in1 = input;            /* walks forward through the even samples      */
    in2 = input + n2 - 1;   /* walks backward from the last sample of n/2  */
    for(k = 0; k < n4; k++) {
        /* z[revtab[k]] = (in2[-2k] + i*in1[2k]) * (tcos[k] + i*tsin[k]) */
        asm volatile(
            "movd         %1, %%mm0 \n\t"
            "movd         %3, %%mm1 \n\t"
            "punpckldq    %2, %%mm0 \n\t" // mm0 = { in2[-2k], in1[2k] }
            "punpckldq    %4, %%mm1 \n\t" // mm1 = { tcos[k],  tsin[k] }
            "movq      %%mm0, %%mm2 \n\t"
            "pfmul     %%mm1, %%mm0 \n\t"
            "pswapd    %%mm1, %%mm1 \n\t" // mm1 = { tsin[k],  tcos[k] }
            "pfmul     %%mm1, %%mm2 \n\t"
            "pfpnacc   %%mm2, %%mm0 \n\t" // low: difference, high: sum
            "movq      %%mm0, %0    \n\t"
            :"=m"(z[revtab[k]])
            :"m"(in2[-2*k]), "m"(in1[2*k]),
             "m"(tcos[k]), "m"(tsin[k])
        );
    }

    ff_fft_calc(&s->fft, z);

    /* post rotation + reordering */
    for(k = 0; k < n4; k++) {
        /* z[k] *= (tcos[k] + i*tsin[k]), in place */
        asm volatile(
            "movq         %0, %%mm0 \n\t"
            "movd         %1, %%mm1 \n\t"
            "punpckldq    %2, %%mm1 \n\t" // mm1 = { tcos[k], tsin[k] }
            "movq      %%mm0, %%mm2 \n\t"
            "pfmul     %%mm1, %%mm0 \n\t"
            "pswapd    %%mm1, %%mm1 \n\t"
            "pfmul     %%mm1, %%mm2 \n\t"
            "pfpnacc   %%mm2, %%mm0 \n\t"
            "movq      %%mm0, %0    \n\t"
            :"+m"(z[k])
            :"m"(tcos[k]), "m"(tsin[k])
        );
    }

    /*
     * mm7 = sign-bit mask for the low 32-bit lane (movd clears the high lane).
     * The constant is unsigned because shifting 1 into the sign bit of a
     * signed int is undefined behaviour.
     */
    asm volatile("movd %0, %%mm7" ::"r"(1U<<31));
    for(k = 0; k < n8; k++) {
        /* Each iteration scatters two complex values into four output pairs. */
        asm volatile(
            "movq         %4, %%mm0 \n\t"
            "pswapd       %5, %%mm1 \n\t"
            "movq      %%mm0, %%mm2 \n\t"
            "pxor      %%mm7, %%mm2 \n\t"
            "punpckldq %%mm1, %%mm2 \n\t"
            "pswapd    %%mm2, %%mm3 \n\t"
            "punpckhdq %%mm1, %%mm0 \n\t"
            "pswapd    %%mm0, %%mm4 \n\t"
            "pxor      %%mm7, %%mm0 \n\t"
            "pxor      %%mm7, %%mm4 \n\t"
            "movq      %%mm0, %0    \n\t" // { -z[n8+k].im, z[n8-1-k].re }
            "movq      %%mm4, %1    \n\t" // { -z[n8-1-k].re, z[n8+k].im }
            "movq      %%mm2, %2    \n\t" // { -z[n8+k].re, z[n8-1-k].im }
            "movq      %%mm3, %3    \n\t" // { z[n8-1-k].im, -z[n8+k].re }
            :"=m"(output[2*k]), "=m"(output[n2-2-2*k]),
             "=m"(output[n2+2*k]), "=m"(output[n-2-2*k])
            :"m"(z[n8+k]), "m"(z[n8-1-k])
            :"memory"
        );
    }
    /* Leave MMX state so that x87 floating-point code can run afterwards. */
    asm volatile("emms");
}
Also available in: Unified diff