libavcodec/i386/fft_sse.c  

100  100 
i = nloops*8; 
101  101 
asm volatile( 
102  102 
"1: \n\t" 
103 
"sub $16, %0 \n\t"


103 
"sub $32, %0 \n\t"


104  104 
"movaps (%2,%0), %%xmm1 \n\t" 
105  105 
"movaps (%1,%0), %%xmm0 \n\t" 
106 
"movaps 16(%2,%0), %%xmm5 \n\t" 

107 
"movaps 16(%1,%0), %%xmm4 \n\t" 

106  108 
"movaps %%xmm1, %%xmm2 \n\t" 
109 
"movaps %%xmm5, %%xmm6 \n\t" 

107  110 
"shufps $0xA0, %%xmm1, %%xmm1 \n\t" 
108  111 
"shufps $0xF5, %%xmm2, %%xmm2 \n\t" 
112 
"shufps $0xA0, %%xmm5, %%xmm5 \n\t" 

113 
"shufps $0xF5, %%xmm6, %%xmm6 \n\t" 

109  114 
"mulps (%3,%0,2), %%xmm1 \n\t" // cre*re cim*re 
110  115 
"mulps 16(%3,%0,2), %%xmm2 \n\t" // cim*im cre*im 
116 
"mulps 32(%3,%0,2), %%xmm5 \n\t" // cre*re cim*re 

117 
"mulps 48(%3,%0,2), %%xmm6 \n\t" // cim*im cre*im 

111  118 
"addps %%xmm2, %%xmm1 \n\t" 
119 
"addps %%xmm6, %%xmm5 \n\t" 

112  120 
"movaps %%xmm0, %%xmm3 \n\t" 
121 
"movaps %%xmm4, %%xmm7 \n\t" 

113  122 
"addps %%xmm1, %%xmm0 \n\t" 
114  123 
"subps %%xmm1, %%xmm3 \n\t" 
124 
"addps %%xmm5, %%xmm4 \n\t" 

125 
"subps %%xmm5, %%xmm7 \n\t" 

115  126 
"movaps %%xmm0, (%1,%0) \n\t" 
116  127 
"movaps %%xmm3, (%2,%0) \n\t" 
128 
"movaps %%xmm4, 16(%1,%0) \n\t" 

129 
"movaps %%xmm7, 16(%2,%0) \n\t" 

117  130 
"jg 1b \n\t" 
118  131 
:"+r"(i) 
119  132 
:"r"(p), "r"(p + nloops), "r"(cptr) 
...  ...  
141  154 
n4 = n >> 2; 
142  155 
n8 = n >> 3; 
143  156  
144 
asm volatile ("movaps %0, %%xmm7\n\t"::"m"(*p1m1p1m1)); 

157 
#ifdef ARCH_X86_64 

158 
asm volatile ("movaps %0, %%xmm8\n\t"::"m"(*p1m1p1m1)); 

159 
#define P1M1P1M1 "%%xmm8" 

160 
#else 

161 
#define P1M1P1M1 "%4" 

162 
#endif 

145  163  
146  164 
/* pre rotation */ 
147  165 
in1 = input; 
148  166 
in2 = input + n2 - 4; 
149  167  
150 
/* Complex multiplication 

151 
Two complex products per iteration, we could have 4 with 8 xmm 

152 
registers, 8 with 16 xmm registers. 

153 
Maybe we should unroll more. 

154 
*/ 

155 
for (k = 0; k < n4; k += 2) { 

168 
/* Complex multiplication */ 

169 
for (k = 0; k < n4; k += 4) { 

156  170 
asm volatile ( 
157  171 
"movaps %0, %%xmm0 \n\t" // xmm0 = r0 X r1 X : in2 
158  172 
"movaps %1, %%xmm3 \n\t" // xmm3 = X i1 X i0: in1 
173 
"movaps 16+%0, %%xmm4 \n\t" // xmm4 = r0 X r1 X : in2 

174 
"movaps 16+%1, %%xmm7 \n\t" // xmm7 = X i1 X i0: in1 

159  175 
"movlps %2, %%xmm1 \n\t" // xmm1 = X X R1 R0: tcos 
160  176 
"movlps %3, %%xmm2 \n\t" // xmm2 = X X I1 I0: tsin 
177 
"movlps 8+%2, %%xmm5 \n\t" // xmm5 = X X R1 R0: tcos 

178 
"movlps 8+%3, %%xmm6 \n\t" // xmm6 = X X I1 I0: tsin 

161  179 
"shufps $95, %%xmm0, %%xmm0 \n\t" // xmm0 = r1 r1 r0 r0 
162  180 
"shufps $160,%%xmm3, %%xmm3 \n\t" // xmm3 = i1 i1 i0 i0 
181 
"shufps $95, %%xmm4, %%xmm4 \n\t" // xmm4 = r1 r1 r0 r0 

182 
"shufps $160,%%xmm7, %%xmm7 \n\t" // xmm7 = i1 i1 i0 i0 

163  183 
"unpcklps %%xmm2, %%xmm1 \n\t" // xmm1 = I1 R1 I0 R0 
184 
"unpcklps %%xmm6, %%xmm5 \n\t" // xmm5 = I1 R1 I0 R0 

164  185 
"movaps %%xmm1, %%xmm2 \n\t" // xmm2 = I1 R1 I0 R0 
165 
"xorps %%xmm7, %%xmm2 \n\t" // xmm2 = I1 R1 I0 R0 

186 
"movaps %%xmm5, %%xmm6 \n\t" // xmm6 = I1 R1 I0 R0 

187 
"xorps "P1M1P1M1", %%xmm2 \n\t" // xmm2 = I1 R1 I0 R0 

188 
"xorps "P1M1P1M1", %%xmm6 \n\t" // xmm6 = I1 R1 I0 R0 

166  189 
"mulps %%xmm1, %%xmm0 \n\t" // xmm0 = rI rR rI rR 
190 
"mulps %%xmm5, %%xmm4 \n\t" // xmm4 = rI rR rI rR 

167  191 
"shufps $177,%%xmm2, %%xmm2 \n\t" // xmm2 = R1 I1 R0 I0 
192 
"shufps $177,%%xmm6, %%xmm6 \n\t" // xmm6 = R1 I1 R0 I0 

168  193 
"mulps %%xmm2, %%xmm3 \n\t" // xmm3 = Ri Ii Ri Ii 
194 
"mulps %%xmm6, %%xmm7 \n\t" // xmm7 = Ri Ii Ri Ii 

169  195 
"addps %%xmm3, %%xmm0 \n\t" // xmm0 = result 
196 
"addps %%xmm7, %%xmm4 \n\t" // xmm4 = result 

170  197 
::"m"(in2[2*k]), "m"(in1[2*k]), 
171  198 
"m"(tcos[k]), "m"(tsin[k]) 
199 
#ifndef ARCH_X86_64 

200 
,"m"(*p1m1p1m1) 

201 
#endif 

172  202 
); 
173  203 
/* Should be in the same block, hack for gcc2.95 & gcc3 */ 
174  204 
asm ( 
175  205 
"movlps %%xmm0, %0 \n\t" 
176  206 
"movhps %%xmm0, %1 \n\t" 
177 
:"=m"(z[revtab[k]]), "=m"(z[revtab[k + 1]]) 

207 
"movlps %%xmm4, %2 \n\t" 

208 
"movhps %%xmm4, %3 \n\t" 

209 
:"=m"(z[revtab[k]]), "=m"(z[revtab[k + 1]]), 

210 
"=m"(z[revtab[k + 2]]), "=m"(z[revtab[k + 3]]) 

178  211 
); 
179  212 
} 
180  213  
181  214 
ff_fft_calc_sse(&s->fft, z); 
182  215  
183 
/* Not currently needed, added for safety */ 

184 
asm volatile ("movaps %0, %%xmm7\n\t"::"m"(*p1m1p1m1)); 

216 
#ifndef ARCH_X86_64 

217 
#undef P1M1P1M1 

218 
#define P1M1P1M1 "%3" 

219 
#endif 

185  220  
186  221 
/* post rotation + reordering */ 
187 
for (k = 0; k < n4; k += 2) {


222 
for (k = 0; k < n4; k += 4) {


188  223 
asm ( 
189  224 
"movaps %0, %%xmm0 \n\t" // xmm0 = i1 r1 i0 r0: z 
225 
"movaps 16+%0, %%xmm4 \n\t" // xmm4 = i1 r1 i0 r0: z 

190  226 
"movlps %1, %%xmm1 \n\t" // xmm1 = X X R1 R0: tcos 
227 
"movlps 8+%1, %%xmm5 \n\t" // xmm5 = X X R1 R0: tcos 

191  228 
"movaps %%xmm0, %%xmm3 \n\t" // xmm3 = i1 r1 i0 r0 
229 
"movaps %%xmm4, %%xmm7 \n\t" // xmm7 = i1 r1 i0 r0 

192  230 
"movlps %2, %%xmm2 \n\t" // xmm2 = X X I1 I0: tsin 
231 
"movlps 8+%2, %%xmm6 \n\t" // xmm6 = X X I1 I0: tsin 

193  232 
"shufps $160,%%xmm0, %%xmm0 \n\t" // xmm0 = r1 r1 r0 r0 
194  233 
"shufps $245,%%xmm3, %%xmm3 \n\t" // xmm3 = i1 i1 i0 i0 
234 
"shufps $160,%%xmm4, %%xmm4 \n\t" // xmm4 = r1 r1 r0 r0 

235 
"shufps $245,%%xmm7, %%xmm7 \n\t" // xmm7 = i1 i1 i0 i0 

195  236 
"unpcklps %%xmm2, %%xmm1 \n\t" // xmm1 = I1 R1 I0 R0 
237 
"unpcklps %%xmm6, %%xmm5 \n\t" // xmm5 = I1 R1 I0 R0 

196  238 
"movaps %%xmm1, %%xmm2 \n\t" // xmm2 = I1 R1 I0 R0 
197 
"xorps %%xmm7, %%xmm2 \n\t" // xmm2 = I1 R1 I0 R0 

239 
"movaps %%xmm5, %%xmm6 \n\t" // xmm6 = I1 R1 I0 R0 

240 
"xorps "P1M1P1M1", %%xmm2 \n\t" // xmm2 = I1 R1 I0 R0 

198  241 
"mulps %%xmm1, %%xmm0 \n\t" // xmm0 = rI rR rI rR 
242 
"xorps "P1M1P1M1", %%xmm6 \n\t" // xmm6 = I1 R1 I0 R0 

243 
"mulps %%xmm5, %%xmm4 \n\t" // xmm4 = rI rR rI rR 

199  244 
"shufps $177,%%xmm2, %%xmm2 \n\t" // xmm2 = R1 I1 R0 I0 
245 
"shufps $177,%%xmm6, %%xmm6 \n\t" // xmm6 = R1 I1 R0 I0 

200  246 
"mulps %%xmm2, %%xmm3 \n\t" // xmm3 = Ri Ii Ri Ii 
247 
"mulps %%xmm6, %%xmm7 \n\t" // xmm7 = Ri Ii Ri Ii 

201  248 
"addps %%xmm3, %%xmm0 \n\t" // xmm0 = result 
249 
"addps %%xmm7, %%xmm4 \n\t" // xmm4 = result 

202  250 
"movaps %%xmm0, %0 \n\t" 
251 
"movaps %%xmm4, 16+%0 \n\t" 

203  252 
:"+m"(z[k]) 
204  253 
:"m"(tcos[k]), "m"(tsin[k]) 
254 
#ifndef ARCH_X86_64 

255 
,"m"(*p1m1p1m1) 

256 
#endif 

205  257 
); 
206  258 
} 
207  259 
