ffmpeg / libavcodec / x86 / fft_sse.c @ 4dcc4f8e
History  View  Annotate  Download (17.3 KB)
1 
/*


2 
* FFT/MDCT transform with SSE optimizations

3 
* Copyright (c) 2008 Loren Merritt

4 
*

5 
* This file is part of FFmpeg.

6 
*

7 
* FFmpeg is free software; you can redistribute it and/or

8 
* modify it under the terms of the GNU Lesser General Public

9 
* License as published by the Free Software Foundation; either

10 
* version 2.1 of the License, or (at your option) any later version.

11 
*

12 
* FFmpeg is distributed in the hope that it will be useful,

13 
* but WITHOUT ANY WARRANTY; without even the implied warranty of

14 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

15 
* Lesser General Public License for more details.

16 
*

17 
* You should have received a copy of the GNU Lesser General Public

18 
* License along with FFmpeg; if not, write to the Free Software

19 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

20 
*/

21  
22 
#include "libavutil/x86_cpu.h" 
23 
#include "libavutil/common.h" 
24 
#include "libavcodec/dsputil.h" 
25 
#include "fft.h" 
26  
27 
DECLARE_ALIGNED(16, static const int, m1m1m1m1)[4] = 
28 
{ 1 << 31, 1 << 31, 1 << 31, 1 << 31 }; 
29  
30 
void ff_fft_dispatch_sse(FFTComplex *z, int nbits); 
31 
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits); 
32  
33 
/**
 * In-place FFT over z, SSE version.
 * z must be 16-byte aligned (movaps) and hold 1 << s->nbits complex values.
 */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;

    ff_fft_dispatch_interleave_sse(z, s->nbits);

    if(n <= 16) {
        /* NOTE(review): for these small sizes the dispatch apparently does
         * not interleave, so fix up here -- confirm against the asm kernels.
         * i is a byte offset counting from -8*n up to 0: the base operand is
         * z+n (one past the end), so (%0,%1) walks the buffer from the start
         * and "jl" loops while the offset is still negative. */
        x86_reg i = -8*n;
        __asm__ volatile(
            "1: \n"
            "movaps     (%0,%1), %%xmm0 \n"
            "movaps      %%xmm0, %%xmm1 \n"
            "unpcklps 16(%0,%1), %%xmm0 \n"
            "unpckhps 16(%0,%1), %%xmm1 \n"
            "movaps      %%xmm0,   (%0,%1) \n"
            "movaps      %%xmm1, 16(%0,%1) \n"
            "add $32, %0 \n"
            "jl 1b \n"
            :"+r"(i)
            :"r"(z+n)
            :"memory"
        );
    }
}
57  
58 
/**
 * Bit-reversal permutation of z, SSE version.
 * Scatters pairs of complex values through s->tmp_buf at their
 * bit-reversed positions (s->revtab), then copies the result back.
 */
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;
    int i;
    for(i=0; i<n; i+=2) {
        /* One aligned 16-byte load covers z[i] and z[i+1]; movlps/movhps
         * then store the two 8-byte halves independently, because the two
         * bit-reversed destinations are generally not adjacent. */
        __asm__ volatile(
            "movaps %2, %%xmm0 \n"
            "movlps %%xmm0, %0 \n"
            "movhps %%xmm0, %1 \n"
            :"=m"(s->tmp_buf[s->revtab[i]]),
             "=m"(s->tmp_buf[s->revtab[i+1]])
            :"m"(z[i])
        );
    }
    memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
}
74  
75 
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input) 
76 
{ 
77 
av_unused x86_reg i, j, k, l; 
78 
long n = 1 << s>mdct_bits; 
79 
long n2 = n >> 1; 
80 
long n4 = n >> 2; 
81 
long n8 = n >> 3; 
82 
const uint16_t *revtab = s>revtab + n8;

83 
const FFTSample *tcos = s>tcos;

84 
const FFTSample *tsin = s>tsin;

85 
FFTComplex *z = (FFTComplex *)output; 
86  
87 
/* pre rotation */

88 
for(k=n82; k>=0; k=2) { 
89 
__asm__ volatile(

90 
"movaps (%2,%1,2), %%xmm0 \n" // { z[k].re, z[k].im, z[k+1].re, z[k+1].im } 
91 
"movaps 16(%2,%0,2), %%xmm1 \n" // { z[k2].re, z[k2].im, z[k1].re, z[k1].im } 
92 
"movaps %%xmm0, %%xmm2 \n"

93 
"shufps $0x88, %%xmm1, %%xmm0 \n" // { z[k].re, z[k+1].re, z[k2].re, z[k1].re } 
94 
"shufps $0x77, %%xmm2, %%xmm1 \n" // { z[k1].im, z[k2].im, z[k+1].im, z[k].im } 
95 
"movlps (%3,%1), %%xmm4 \n"

96 
"movlps (%4,%1), %%xmm5 \n"

97 
"movhps 8(%3,%0), %%xmm4 \n" // { cos[k], cos[k+1], cos[k2], cos[k1] } 
98 
"movhps 8(%4,%0), %%xmm5 \n" // { sin[k], sin[k+1], sin[k2], sin[k1] } 
99 
"movaps %%xmm0, %%xmm2 \n"

100 
"movaps %%xmm1, %%xmm3 \n"

101 
"mulps %%xmm5, %%xmm0 \n" // re*sin 
102 
"mulps %%xmm4, %%xmm1 \n" // im*cos 
103 
"mulps %%xmm4, %%xmm2 \n" // re*cos 
104 
"mulps %%xmm5, %%xmm3 \n" // im*sin 
105 
"subps %%xmm0, %%xmm1 \n" // > re 
106 
"addps %%xmm3, %%xmm2 \n" // > im 
107 
"movaps %%xmm1, %%xmm0 \n"

108 
"unpcklps %%xmm2, %%xmm1 \n" // { z[k], z[k+1] } 
109 
"unpckhps %%xmm2, %%xmm0 \n" // { z[k2], z[k1] } 
110 
::"r"(4*k), "r"(4*k), 
111 
"r"(input+n4), "r"(tcos+n8), "r"(tsin+n8) 
112 
); 
113 
#if ARCH_X86_64

114 
// if we have enough regs, don't let gcc make the luts latencybound

115 
// but if not, latency is faster than spilling

116 
__asm__("movlps %%xmm0, %0 \n"

117 
"movhps %%xmm0, %1 \n"

118 
"movlps %%xmm1, %2 \n"

119 
"movhps %%xmm1, %3 \n"

120 
:"=m"(z[revtab[k2]]), 
121 
"=m"(z[revtab[k1]]), 
122 
"=m"(z[revtab[ k ]]),

123 
"=m"(z[revtab[ k+1]]) 
124 
); 
125 
#else

126 
__asm__("movlps %%xmm0, %0" :"=m"(z[revtab[k2]])); 
127 
__asm__("movhps %%xmm0, %0" :"=m"(z[revtab[k1]])); 
128 
__asm__("movlps %%xmm1, %0" :"=m"(z[revtab[ k ]])); 
129 
__asm__("movhps %%xmm1, %0" :"=m"(z[revtab[ k+1]])); 
130 
#endif

131 
} 
132  
133 
ff_fft_dispatch_sse(z, s>nbits); 
134  
135 
/* post rotation + reinterleave + reorder */

136  
137 
#define CMUL(j,xmm0,xmm1)\

138 
"movaps (%2,"#j",2), %%xmm6 \n"\ 
139 
"movaps 16(%2,"#j",2), "#xmm0"\n"\ 
140 
"movaps %%xmm6, "#xmm1"\n"\ 
141 
"movaps "#xmm0",%%xmm7 \n"\ 
142 
"mulps (%3,"#j"), %%xmm6 \n"\ 
143 
"mulps (%4,"#j"), "#xmm0"\n"\ 
144 
"mulps (%4,"#j"), "#xmm1"\n"\ 
145 
"mulps (%3,"#j"), %%xmm7 \n"\ 
146 
"subps %%xmm6, "#xmm0"\n"\ 
147 
"addps %%xmm7, "#xmm1"\n" 
148  
149 
j = n2; 
150 
k = n216;

151 
__asm__ volatile(

152 
"1: \n"

153 
CMUL(%0, %%xmm0, %%xmm1)

154 
CMUL(%1, %%xmm4, %%xmm5)

155 
"shufps $0x1b, %%xmm1, %%xmm1 \n"

156 
"shufps $0x1b, %%xmm5, %%xmm5 \n"

157 
"movaps %%xmm4, %%xmm6 \n"

158 
"unpckhps %%xmm1, %%xmm4 \n"

159 
"unpcklps %%xmm1, %%xmm6 \n"

160 
"movaps %%xmm0, %%xmm2 \n"

161 
"unpcklps %%xmm5, %%xmm0 \n"

162 
"unpckhps %%xmm5, %%xmm2 \n"

163 
"movaps %%xmm6, (%2,%1,2) \n"

164 
"movaps %%xmm4, 16(%2,%1,2) \n"

165 
"movaps %%xmm0, (%2,%0,2) \n"

166 
"movaps %%xmm2, 16(%2,%0,2) \n"

167 
"sub $16, %1 \n"

168 
"add $16, %0 \n"

169 
"jl 1b \n"

170 
:"+&r"(j), "+&r"(k) 
171 
:"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8) 
172 
:"memory"

173 
); 
174 
} 
175  
176 
/**
 * Full inverse MDCT, SSE version.
 * Computes the half IMDCT into the middle half of output, then mirrors it
 * into the first and last quarters (with a sign flip for the first quarter,
 * via the m1m1m1m1 mask).  output receives n floats; must be 16-byte aligned.
 */
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    x86_reg j, k;
    long n = 1 << s->mdct_bits;
    long n4 = n >> 2;

    ff_imdct_half_sse(s, output+n4, input);

    /* Byte offsets: j runs upward from -n (relative to output+3*n/4), k runs
     * downward from n-16 (relative to output+n/4).  Each iteration reverses
     * one 16-byte vector from each region; "jl" loops while j < 0, so j MUST
     * start negative. */
    j = -n;
    k = n-16;
    __asm__ volatile(
        "movaps %4, %%xmm7 \n"
        "1: \n"
        "movaps       (%2,%1), %%xmm0 \n"
        "movaps       (%3,%0), %%xmm1 \n"
        "shufps $0x1b, %%xmm0, %%xmm0 \n"   /* reverse 4 lanes */
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "xorps         %%xmm7, %%xmm0 \n"   /* negate the copy for the first quarter */
        "movaps        %%xmm1, (%3,%1) \n"
        "movaps        %%xmm0, (%2,%0) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"
        :"+r"(j), "+r"(k)
        :"r"(output+n4), "r"(output+n4*3),
         "m"(*m1m1m1m1)
    );
}
204  
205 
DECLARE_ALIGNED(16, static const float, b1)[] = { 
206 
0.500603, 0.505471, 0.515447, 0.531043, 
207 
0.553104, 0.582935, 0.622504, 0.674808, 
208 
1.169440, 0.972568, 0.839350, 0.744536, 
209 
10.190008, 3.407609, 2.057781, 1.484165, 
210 
0.502419, 0.522499, 0.566944, 0.646822, 
211 
0.788155, 1.060678, 1.722447, 5.101149, 
212 
0.509796, 0.601345, 0.899976, 2.562916, 
213 
1.000000, 1.000000, 1.306563, 0.541196, 
214 
1.000000, 0.707107, 1.000000, 0.707107 
215 
}; 
216  
217 
DECLARE_ALIGNED(16, static const int32_t, smask)[4] = { 
218 
0, 0, 0x80000000, 0x80000000 
219 
}; 
220  
221 
/* butterfly operator */
/* BUTTERFLY(a, b, c, tmp): DCT butterfly across two xmm registers.
 * Afterwards b holds a+b (the sums) and a holds (a-b)*c (the scaled
 * differences); c is a memory operand with the four scale factors and
 * tmp is a scratch register preserving the original a. */
#define BUTTERFLY(a,b,c,tmp) \
    "movaps %%" #a ", %%" #tmp " \n\t" \
    "subps %%" #b ", %%" #a " \n\t" \
    "addps %%" #tmp ", %%" #b " \n\t" \
    "mulps " #c ", %%" #a " \n\t"

/* Same as BUTTERFLY when vectors a and b overlap */
/* BUTTERFLY0(val, mask, cos, tmp, shuf): butterfly within one register.
 * A copy of val is kept in tmp; val is permuted by shuf, the mask-selected
 * lanes of the copy get their signs flipped (xorps), the two are added,
 * and the result is scaled by cos. */
#define BUTTERFLY0(val, mask, cos, tmp, shuf) \
    "movaps %%" #val ", %%" #tmp " \n\t" \
    "shufps " #shuf ", %%" #val ",%%" #val " \n\t" \
    "xorps %%" #mask ", %%" #tmp " \n\t" /* flip signs */ \
    "addps %%" #tmp ", %%" #val " \n\t" \
    "mulps %%" #cos ", %%" #val " \n\t"

/* $0x1b reverses all four lanes; $0xb1 swaps the lanes within each pair. */
#define BUTTERFLY2(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0x1b)
#define BUTTERFLY3(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0xb1)
238  
239 
/**
 * 32-point float DCT, SSE version.
 * Reads 32 floats from in (offsets 0..124 bytes) and writes 32 floats to
 * out; both are addressed with movaps, so they must be 16-byte aligned.
 * Operands: %0 = tmp1 (32-bit GP scratch), %1 = out, %2 = b1 (scale table),
 * %3 = smask (sign mask), %4 = in.
 *
 * The whole transform is one asm statement; xmm register contents carry
 * between the passes, so the instruction order is load-bearing throughout.
 * In pass 6, tmp1 is used to copy already-final float values with integer
 * movl (avoiding extra xmm traffic).
 */
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
{
    int32_t tmp1 = 0;
    __asm__ volatile(
        /* pass 1: butterflies between mirrored input vectors, scaled by b1 */

        "movaps    (%4), %%xmm0          \n\t"
        "movaps    112(%4), %%xmm1       \n\t"
        "shufps    $0x1b, %%xmm1, %%xmm1 \n\t"
        BUTTERFLY(xmm0, xmm1, (%2), xmm3)

        "movaps    64(%4), %%xmm7        \n\t"
        "movaps    48(%4), %%xmm4        \n\t"
        "shufps    $0x1b, %%xmm4, %%xmm4 \n\t"
        BUTTERFLY(xmm7, xmm4, 48(%2), xmm3)

        /* pass 2 */
        "movaps    64(%2), %%xmm2        \n\t"
        BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3)
        "movaps    %%xmm1, 48(%1)        \n\t"
        "movaps    %%xmm4, (%1)          \n\t"

        /* pass 1 (remaining input vectors) */
        "movaps    16(%4), %%xmm1        \n\t"
        "movaps    96(%4), %%xmm6        \n\t"
        "shufps    $0x1b, %%xmm6, %%xmm6 \n\t"
        BUTTERFLY(xmm1, xmm6, 16(%2), xmm3)

        "movaps    80(%4), %%xmm4        \n\t"
        "movaps    32(%4), %%xmm5        \n\t"
        "shufps    $0x1b, %%xmm5, %%xmm5 \n\t"
        BUTTERFLY(xmm4, xmm5, 32(%2), xmm3)

        /* pass 2 */
        BUTTERFLY(xmm0, xmm7, %%xmm2, xmm3)

        "movaps    80(%2), %%xmm2        \n\t"
        BUTTERFLY(xmm6, xmm5, %%xmm2, xmm3)

        BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3)

        /* pass 3 */
        "movaps    96(%2), %%xmm2        \n\t"
        "shufps    $0x1b, %%xmm1, %%xmm1 \n\t"
        BUTTERFLY(xmm0, xmm1, %%xmm2, xmm3)
        "movaps    %%xmm0, 112(%1)       \n\t"
        "movaps    %%xmm1, 96(%1)        \n\t"

        "movaps    0(%1), %%xmm0         \n\t"
        "shufps    $0x1b, %%xmm5, %%xmm5 \n\t"
        BUTTERFLY(xmm0, xmm5, %%xmm2, xmm3)

        "movaps    48(%1), %%xmm1        \n\t"
        "shufps    $0x1b, %%xmm6, %%xmm6 \n\t"
        BUTTERFLY(xmm1, xmm6, %%xmm2, xmm3)
        "movaps    %%xmm1, 48(%1)        \n\t"

        "shufps    $0x1b, %%xmm4, %%xmm4 \n\t"
        BUTTERFLY(xmm7, xmm4, %%xmm2, xmm3)

        /* pass 4: in-register butterflies, lanes fully reversed ($0x1b) */
        "movaps    (%3), %%xmm3          \n\t"
        "movaps    112(%2), %%xmm2       \n\t"

        BUTTERFLY2(xmm5, xmm3, xmm2, xmm1)

        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
        "movaps    %%xmm0, 16(%1)        \n\t"

        BUTTERFLY2(xmm6, xmm3, xmm2, xmm1)
        "movaps    %%xmm6, 32(%1)        \n\t"

        "movaps    48(%1), %%xmm0        \n\t"
        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
        "movaps    %%xmm0, 48(%1)        \n\t"

        BUTTERFLY2(xmm4, xmm3, xmm2, xmm1)

        BUTTERFLY2(xmm7, xmm3, xmm2, xmm1)

        "movaps    96(%1), %%xmm6        \n\t"
        BUTTERFLY2(xmm6, xmm3, xmm2, xmm1)

        "movaps    112(%1), %%xmm0       \n\t"
        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)

        /* pass 5: pairwise butterflies ($0xb1); mask reshuffled to flip
         * alternating lanes */
        "movaps    128(%2), %%xmm2       \n\t"
        "shufps    $0xCC, %%xmm3,%%xmm3  \n\t"

        BUTTERFLY3(xmm5, xmm3, xmm2, xmm1)
        "movaps    %%xmm5, (%1)          \n\t"

        "movaps    16(%1), %%xmm1        \n\t"
        BUTTERFLY3(xmm1, xmm3, xmm2, xmm5)
        "movaps    %%xmm1, 16(%1)        \n\t"

        BUTTERFLY3(xmm4, xmm3, xmm2, xmm5)
        "movaps    %%xmm4, 64(%1)        \n\t"

        BUTTERFLY3(xmm7, xmm3, xmm2, xmm5)
        "movaps    %%xmm7, 80(%1)        \n\t"

        "movaps    32(%1), %%xmm5        \n\t"
        BUTTERFLY3(xmm5, xmm3, xmm2, xmm7)
        "movaps    %%xmm5, 32(%1)        \n\t"

        "movaps    48(%1), %%xmm4        \n\t"
        BUTTERFLY3(xmm4, xmm3, xmm2, xmm7)
        "movaps    %%xmm4, 48(%1)        \n\t"

        BUTTERFLY3(xmm6, xmm3, xmm2, xmm7)
        "movaps    %%xmm6, 96(%1)        \n\t"

        BUTTERFLY3(xmm0, xmm3, xmm2, xmm7)
        "movaps    %%xmm0, 112(%1)       \n\t"


        /* pass 6, no SIMD...
         * scalar sum tree over out[]; also shuffles some finished values to
         * their final positions through %0 with integer movl. */
        "movss     56(%1), %%xmm3        \n\t"
        "movl      4(%1), %0             \n\t"
        "addss     60(%1), %%xmm3        \n\t"
        "movss     72(%1), %%xmm7        \n\t"
        "addss     %%xmm3, %%xmm4        \n\t"
        "movss     52(%1), %%xmm2        \n\t"
        "addss     %%xmm3, %%xmm2        \n\t"
        "movss     24(%1), %%xmm3        \n\t"
        "addss     28(%1), %%xmm3        \n\t"
        "addss     76(%1), %%xmm7        \n\t"
        "addss     %%xmm3, %%xmm1        \n\t"
        "addss     %%xmm4, %%xmm5        \n\t"
        "movss     %%xmm1, 16(%1)        \n\t"
        "movss     20(%1), %%xmm1        \n\t"
        "addss     %%xmm3, %%xmm1        \n\t"
        "movss     40(%1), %%xmm3        \n\t"
        "movss     %%xmm1, 48(%1)        \n\t"
        "addss     44(%1), %%xmm3        \n\t"
        "movss     20(%1), %%xmm1        \n\t"
        "addss     %%xmm3, %%xmm4        \n\t"
        "addss     %%xmm2, %%xmm3        \n\t"
        "addss     28(%1), %%xmm1        \n\t"
        "movss     %%xmm3, 40(%1)        \n\t"
        "addss     36(%1), %%xmm2        \n\t"
        "movss     8(%1), %%xmm3         \n\t"
        "movss     %%xmm2, 56(%1)        \n\t"
        "addss     12(%1), %%xmm3        \n\t"
        "movss     %%xmm5, 8(%1)         \n\t"
        "movss     %%xmm3, 32(%1)        \n\t"
        "movss     52(%1), %%xmm2        \n\t"
        "movss     80(%1), %%xmm3        \n\t"
        "movss     120(%1), %%xmm5       \n\t"
        "movss     %%xmm1, 80(%1)        \n\t"
        "movss     %%xmm4, 24(%1)        \n\t"
        "addss     124(%1), %%xmm5       \n\t"
        "movss     64(%1), %%xmm1        \n\t"
        "addss     60(%1), %%xmm2        \n\t"
        "addss     %%xmm5, %%xmm0        \n\t"
        "addss     116(%1), %%xmm5       \n\t"
        "movl      %0, 64(%1)            \n\t"
        "addss     %%xmm0, %%xmm6        \n\t"
        "addss     %%xmm6, %%xmm1        \n\t"
        "movl      12(%1), %0            \n\t"
        "movss     %%xmm1, 4(%1)         \n\t"
        "movss     88(%1), %%xmm1        \n\t"
        "movl      %0, 96(%1)            \n\t"
        "addss     92(%1), %%xmm1        \n\t"
        "movss     104(%1), %%xmm4       \n\t"
        "movl      28(%1), %0            \n\t"
        "addss     108(%1), %%xmm4       \n\t"
        "addss     %%xmm4, %%xmm0        \n\t"
        "addss     %%xmm1, %%xmm3        \n\t"
        "addss     84(%1), %%xmm1        \n\t"
        "addss     %%xmm5, %%xmm4        \n\t"
        "addss     %%xmm3, %%xmm6        \n\t"
        "addss     %%xmm0, %%xmm3        \n\t"
        "addss     %%xmm7, %%xmm0        \n\t"
        "addss     100(%1), %%xmm5       \n\t"
        "addss     %%xmm4, %%xmm7        \n\t"
        "movl      %0, 112(%1)           \n\t"
        "movss     %%xmm0, 28(%1)        \n\t"
        "movss     36(%1), %%xmm0        \n\t"
        "movss     %%xmm7, 36(%1)        \n\t"
        "addss     %%xmm1, %%xmm4        \n\t"
        "movss     116(%1), %%xmm7       \n\t"
        "addss     %%xmm2, %%xmm0        \n\t"
        "addss     124(%1), %%xmm7       \n\t"
        "movss     %%xmm0, 72(%1)        \n\t"
        "movss     44(%1), %%xmm0        \n\t"
        "movss     %%xmm6, 12(%1)        \n\t"
        "movss     %%xmm3, 20(%1)        \n\t"
        "addss     %%xmm0, %%xmm2        \n\t"
        "movss     %%xmm4, 44(%1)        \n\t"
        "movss     %%xmm2, 88(%1)        \n\t"
        "addss     60(%1), %%xmm0        \n\t"
        "movl      60(%1), %0            \n\t"
        "movl      %0, 120(%1)           \n\t"
        "movss     %%xmm0, 104(%1)       \n\t"
        "addss     %%xmm5, %%xmm1        \n\t"
        "addss     68(%1), %%xmm5        \n\t"
        "movss     %%xmm1, 52(%1)        \n\t"
        "movss     %%xmm5, 60(%1)        \n\t"
        "movss     68(%1), %%xmm1        \n\t"
        "movss     100(%1), %%xmm5       \n\t"
        "addss     %%xmm7, %%xmm5        \n\t"
        "addss     108(%1), %%xmm7       \n\t"
        "addss     %%xmm5, %%xmm1        \n\t"
        "movss     84(%1), %%xmm2        \n\t"
        "addss     92(%1), %%xmm2        \n\t"
        "addss     %%xmm2, %%xmm5        \n\t"
        "movss     %%xmm1, 68(%1)        \n\t"
        "addss     %%xmm7, %%xmm2        \n\t"
        "movss     76(%1), %%xmm1        \n\t"
        "movss     %%xmm2, 84(%1)        \n\t"
        "movss     %%xmm5, 76(%1)        \n\t"
        "movss     108(%1), %%xmm2       \n\t"
        "addss     %%xmm1, %%xmm7        \n\t"
        "addss     124(%1), %%xmm2       \n\t"
        "addss     %%xmm2, %%xmm1        \n\t"
        "addss     92(%1), %%xmm2        \n\t"
        "movss     %%xmm1, 100(%1)       \n\t"
        "movss     %%xmm2, 108(%1)       \n\t"
        "movss     92(%1), %%xmm2        \n\t"
        "movss     %%xmm7, 92(%1)        \n\t"
        "addss     124(%1), %%xmm2       \n\t"
        "movss     %%xmm2, 116(%1)       \n\t"
        :"+&r"(tmp1)
        :"r"(out), "r"(b1), "r"(smask), "r"(in)
        :"memory"
        );
}