ffmpeg / libavcodec / x86 / fft_sse.c @ 4dcc4f8e
/*
 * FFT/MDCT transform with SSE optimizations
 * Copyright (c) 2008 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86_cpu.h"
#include "libavutil/common.h"
#include "libavcodec/dsputil.h"
#include "fft.h"
DECLARE_ALIGNED(16, static const int, m1m1m1m1)[4] =
    { 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
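
/* FFT kernel entry points, implemented in assembly (likely fft_mmx.asm) */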
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);

void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;

    ff_fft_dispatch_interleave_sse(z, s->nbits);
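
    /* for n <= 16 the dispatch does not interleave its output: each block
     * of four coefficients is still split as {r0,r1,r2,r3} {i0,i1,i2,i3},
     * so the unpcklps/unpckhps below re-interleave them into the
     * FFTComplex {re,im} layout */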
    if(n <= 16) {
        x86_reg i = -8*n;
        __asm__ volatile(
            "1: \n"
            "movaps     (%0,%1), %%xmm0 \n"
            "movaps      %%xmm0, %%xmm1 \n"
            "unpcklps 16(%0,%1), %%xmm0 \n"
            "unpckhps 16(%0,%1), %%xmm1 \n"
            "movaps      %%xmm0,   (%0,%1) \n"
            "movaps      %%xmm1, 16(%0,%1) \n"
            "add $32, %0 \n"
            "jl 1b \n"
            :"+r"(i)
            :"r"(z+n)
            :"memory"
        );
    }
}

void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;
    int i;
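    /* scatter two consecutive FFTComplex values per iteration into their
     * bit-reversed positions in tmp_buf, then copy the result back */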
    for(i=0; i<n; i+=2) {
        __asm__ volatile(
            "movaps %2, %%xmm0 \n"
            "movlps %%xmm0, %0 \n"
            "movhps %%xmm0, %1 \n"
            :"=m"(s->tmp_buf[s->revtab[i]]),
             "=m"(s->tmp_buf[s->revtab[i+1]])
            :"m"(z[i])
        );
    }
    memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
}
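
/* Computes the nonredundant half of the inverse MDCT: pre-rotation,
 * in-place n/4-point complex FFT, post-rotation; output receives
 * n/2 = (1 << mdct_bits) / 2 samples. */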
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    av_unused x86_reg i, j, k, l;
    long n = 1 << s->mdct_bits;
    long n2 = n >> 1;
    long n4 = n >> 2;
    long n8 = n >> 3;
    const uint16_t *revtab = s->revtab + n8;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    FFTComplex *z = (FFTComplex *)output;

    /* pre rotation */
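    /* each iteration rotates two complex pairs from each end of the input
     * (mirrored around its midpoint) by their cos/sin twiddles, then
     * scatters the four results through revtab */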
    for(k=n8-2; k>=0; k-=2) {
        __asm__ volatile(
            "movaps     (%2,%1,2), %%xmm0 \n" // { z[k].re,    z[k].im,    z[k+1].re,  z[k+1].im  }
            "movaps  -16(%2,%0,2), %%xmm1 \n" // { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im }
            "movaps        %%xmm0, %%xmm2 \n"
            "shufps $0x88, %%xmm1, %%xmm0 \n" // { z[k].re,    z[k+1].re,  z[-k-2].re, z[-k-1].re }
            "shufps $0x77, %%xmm2, %%xmm1 \n" // { z[-k-1].im, z[-k-2].im, z[k+1].im,  z[k].im    }
            "movlps       (%3,%1), %%xmm4 \n"
            "movlps       (%4,%1), %%xmm5 \n"
            "movhps     -8(%3,%0), %%xmm4 \n" // { cos[k],     cos[k+1],   cos[-k-2],  cos[-k-1]  }
            "movhps     -8(%4,%0), %%xmm5 \n" // { sin[k],     sin[k+1],   sin[-k-2],  sin[-k-1]  }
            "movaps        %%xmm0, %%xmm2 \n"
            "movaps        %%xmm1, %%xmm3 \n"
            "mulps         %%xmm5, %%xmm0 \n" // re*sin
            "mulps         %%xmm4, %%xmm1 \n" // im*cos
            "mulps         %%xmm4, %%xmm2 \n" // re*cos
            "mulps         %%xmm5, %%xmm3 \n" // im*sin
            "subps         %%xmm0, %%xmm1 \n" // -> re
            "addps         %%xmm3, %%xmm2 \n" // -> im
            "movaps        %%xmm1, %%xmm0 \n"
            "unpcklps      %%xmm2, %%xmm1 \n" // { z[k],    z[k+1]  }
            "unpckhps      %%xmm2, %%xmm0 \n" // { z[-k-2], z[-k-1] }
            ::"r"(-4*k), "r"(4*k),
              "r"(input+n4), "r"(tcos+n8), "r"(tsin+n8)
        );
#if ARCH_X86_64
        // if we have enough regs, don't let gcc make the luts latency-bound
        // but if not, latency is faster than spilling
        __asm__("movlps %%xmm0, %0 \n"
            "movhps %%xmm0, %1 \n"
            "movlps %%xmm1, %2 \n"
            "movhps %%xmm1, %3 \n"
            :"=m"(z[revtab[-k-2]]),
             "=m"(z[revtab[-k-1]]),
             "=m"(z[revtab[ k  ]]),
             "=m"(z[revtab[ k+1]])
        );
#else
        __asm__("movlps %%xmm0, %0" :"=m"(z[revtab[-k-2]]));
        __asm__("movhps %%xmm0, %0" :"=m"(z[revtab[-k-1]]));
        __asm__("movlps %%xmm1, %0" :"=m"(z[revtab[ k  ]]));
        __asm__("movhps %%xmm1, %0" :"=m"(z[revtab[ k+1]]));
#endif
    }

    ff_fft_dispatch_sse(z, s->nbits);

    /* post rotation + reinterleave + reorder */
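
    /* CMUL(j, xmm0, xmm1): complex multiply of four post-FFT coefficients
     * (a vector of four re values followed by a vector of four im values
     * at offset j) by the tcos/tsin twiddles; the two rotated component
     * vectors are left in the named registers */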
#define CMUL(j,xmm0,xmm1)\
        "movaps   (%2,"#j",2), %%xmm6 \n"\
        "movaps 16(%2,"#j",2), "#xmm0"\n"\
        "movaps        %%xmm6, "#xmm1"\n"\
        "movaps        "#xmm0",%%xmm7 \n"\
        "mulps      (%3,"#j"), %%xmm6 \n"\
        "mulps      (%4,"#j"), "#xmm0"\n"\
        "mulps      (%4,"#j"), "#xmm1"\n"\
        "mulps      (%3,"#j"), %%xmm7 \n"\
        "subps         %%xmm6, "#xmm0"\n"\
        "addps         %%xmm7, "#xmm1"\n"

    j = -n2;
    k = n2-16;
    __asm__ volatile(
        "1: \n"
        CMUL(%0, %%xmm0, %%xmm1)
        CMUL(%1, %%xmm4, %%xmm5)
        "shufps    $0x1b, %%xmm1, %%xmm1 \n"
        "shufps    $0x1b, %%xmm5, %%xmm5 \n"
        "movaps   %%xmm4, %%xmm6 \n"
        "unpckhps %%xmm1, %%xmm4 \n"
        "unpcklps %%xmm1, %%xmm6 \n"
        "movaps   %%xmm0, %%xmm2 \n"
        "unpcklps %%xmm5, %%xmm0 \n"
        "unpckhps %%xmm5, %%xmm2 \n"
        "movaps   %%xmm6,   (%2,%1,2) \n"
        "movaps   %%xmm4, 16(%2,%1,2) \n"
        "movaps   %%xmm0,   (%2,%0,2) \n"
        "movaps   %%xmm2, 16(%2,%0,2) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"
        :"+&r"(j), "+&r"(k)
        :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8)
        :"memory"
    );
}

void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    x86_reg j, k;
    long n = 1 << s->mdct_bits;
    long n4 = n >> 2;

    ff_imdct_half_sse(s, output+n4, input);
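
    /* expand the half transform to the full window: the first quarter of
     * output is the reversed, sign-flipped second quarter, and the last
     * quarter is the reversed third quarter */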
    j = -n;
    k = n-16;
    __asm__ volatile(
        "movaps %4, %%xmm7 \n"
        "1: \n"
        "movaps       (%2,%1), %%xmm0 \n"
        "movaps       (%3,%0), %%xmm1 \n"
        "shufps $0x1b, %%xmm0, %%xmm0 \n"
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "xorps         %%xmm7, %%xmm0 \n"
        "movaps        %%xmm1, (%3,%1) \n"
        "movaps        %%xmm0, (%2,%0) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"
        :"+r"(j), "+r"(k)
        :"r"(output+n4), "r"(output+n4*3),
         "m"(*m1m1m1m1)
    );
}
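
/* butterfly scale factors for the 32-point DCT passes; each nontrivial
 * entry is of the form +/-0.5/cos(k*pi/64) */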
DECLARE_ALIGNED(16, static const float, b1)[] = {
     0.500603,  0.505471,  0.515447,  0.531043,
     0.553104,  0.582935,  0.622504,  0.674808,
    -1.169440, -0.972568, -0.839350, -0.744536,
   -10.190008, -3.407609, -2.057781, -1.484165,
     0.502419,  0.522499,  0.566944,  0.646822,
     0.788155,  1.060678,  1.722447,  5.101149,
     0.509796,  0.601345,  0.899976,  2.562916,
     1.000000,  1.000000,  1.306563,  0.541196,
     1.000000,  0.707107,  1.000000, -0.707107
};
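
/* flips the signs of the two high lanes: combined with the shufps in
 * BUTTERFLY0, a single addps yields sums in the low half and differences
 * in the high half of the register */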
DECLARE_ALIGNED(16, static const int32_t, smask)[4] = {
    0, 0, 0x80000000, 0x80000000
};

/* butterfly operator */
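/* a = (a - b) * c;  b = a + b  (b gets the sum, a the scaled difference) */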
#define BUTTERFLY(a,b,c,tmp)                            \
    "movaps  %%" #a    ", %%" #tmp  "             \n\t" \
    "subps   %%" #b    ", %%" #a    "             \n\t" \
    "addps   %%" #tmp  ", %%" #b    "             \n\t" \
    "mulps     " #c    ", %%" #a    "             \n\t"

/* Same as BUTTERFLY when vectors a and b overlap */
#define BUTTERFLY0(val, mask, cos, tmp, shuf)                            \
    "movaps  %%" #val  ", %%" #tmp  "             \n\t"                  \
    "shufps    " #shuf ", %%" #val  ",%%" #val "  \n\t"                  \
    "xorps   %%" #mask ", %%" #tmp  "             \n\t" /* flip signs */ \
    "addps   %%" #tmp  ", %%" #val  "             \n\t"                  \
    "mulps   %%" #cos  ", %%" #val  "             \n\t"

#define BUTTERFLY2(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0x1b)
#define BUTTERFLY3(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0xb1)
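
/* 32-point DCT as used by the MPEG audio polyphase filterbank: five
 * vectorized butterfly passes followed by a scalar recombination pass */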
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
{
    int32_t tmp1 = 0;
    __asm__ volatile(
        /* pass 1 */

        "movaps    (%4), %%xmm0           \n\t"
246
        "movaps 112(%4), %%xmm1           \n\t"
247
        "shufps   $0x1b, %%xmm1, %%xmm1   \n\t"
248
        BUTTERFLY(xmm0, xmm1, (%2), xmm3)
249

    
250
        "movaps  64(%4), %%xmm7           \n\t"
251
        "movaps  48(%4), %%xmm4           \n\t"
252
        "shufps   $0x1b, %%xmm4, %%xmm4   \n\t"
253
        BUTTERFLY(xmm7, xmm4, 48(%2), xmm3)
254

    
255

    
256
        /* pass 2 */
257
        "movaps  64(%2), %%xmm2           \n\t"
258
        BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3)
259
        "movaps  %%xmm1, 48(%1)           \n\t"
260
        "movaps  %%xmm4, (%1)             \n\t"
261

    
262
        /* pass 1 */
263
        "movaps  16(%4), %%xmm1           \n\t"
264
        "movaps  96(%4), %%xmm6           \n\t"
265
        "shufps   $0x1b, %%xmm6, %%xmm6   \n\t"
266
        BUTTERFLY(xmm1, xmm6, 16(%2), xmm3)
267

    
268
        "movaps  80(%4), %%xmm4           \n\t"
269
        "movaps  32(%4), %%xmm5           \n\t"
270
        "shufps   $0x1b, %%xmm5, %%xmm5   \n\t"
271
        BUTTERFLY(xmm4, xmm5, 32(%2), xmm3)
272

    
273
        /* pass 2 */
274
        BUTTERFLY(xmm0, xmm7, %%xmm2, xmm3)
275

    
276
        "movaps  80(%2), %%xmm2           \n\t"
277
        BUTTERFLY(xmm6, xmm5, %%xmm2, xmm3)
278

    
279
        BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3)
280

    
281
        /* pass 3 */
282
        "movaps  96(%2), %%xmm2           \n\t"
283
        "shufps   $0x1b, %%xmm1, %%xmm1   \n\t"
284
        BUTTERFLY(xmm0, xmm1, %%xmm2, xmm3)
285
        "movaps  %%xmm0, 112(%1)          \n\t"
286
        "movaps  %%xmm1,  96(%1)          \n\t"
287

    
288
        "movaps   0(%1), %%xmm0           \n\t"
289
        "shufps   $0x1b, %%xmm5, %%xmm5   \n\t"
290
        BUTTERFLY(xmm0, xmm5, %%xmm2, xmm3)
291

    
292
        "movaps  48(%1), %%xmm1           \n\t"
293
        "shufps   $0x1b, %%xmm6, %%xmm6   \n\t"
294
        BUTTERFLY(xmm1, xmm6, %%xmm2, xmm3)
295
        "movaps  %%xmm1,  48(%1)          \n\t"
296

    
297
        "shufps   $0x1b, %%xmm4, %%xmm4   \n\t"
298
        BUTTERFLY(xmm7, xmm4, %%xmm2, xmm3)
299

    
300
        /* pass 4 */
301
        "movaps    (%3), %%xmm3           \n\t"
302
        "movaps 112(%2), %%xmm2           \n\t"
303

    
304
        BUTTERFLY2(xmm5, xmm3, xmm2, xmm1)
305

    
306
        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
307
        "movaps  %%xmm0, 16(%1)           \n\t"
308

    
309
        BUTTERFLY2(xmm6, xmm3, xmm2, xmm1)
310
        "movaps  %%xmm6, 32(%1)           \n\t"
311

    
312
        "movaps  48(%1), %%xmm0           \n\t"
313
        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
314
        "movaps  %%xmm0, 48(%1)           \n\t"
315

    
316
        BUTTERFLY2(xmm4, xmm3, xmm2, xmm1)
317

    
318
        BUTTERFLY2(xmm7, xmm3, xmm2, xmm1)
319

    
320
        "movaps  96(%1), %%xmm6           \n\t"
321
        BUTTERFLY2(xmm6, xmm3, xmm2, xmm1)
322

    
323
        "movaps 112(%1), %%xmm0           \n\t"
324
        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
325

    
326
        /* pass 5 */
327
        "movaps 128(%2), %%xmm2           \n\t"
328
        "shufps   $0xCC, %%xmm3,%%xmm3    \n\t"
329

    
330
        BUTTERFLY3(xmm5, xmm3, xmm2, xmm1)
331
        "movaps  %%xmm5, (%1)             \n\t"
332

    
333
        "movaps  16(%1), %%xmm1           \n\t"
334
        BUTTERFLY3(xmm1, xmm3, xmm2, xmm5)
335
        "movaps  %%xmm1, 16(%1)           \n\t"
336

    
337
        BUTTERFLY3(xmm4, xmm3, xmm2, xmm5)
338
        "movaps  %%xmm4, 64(%1)           \n\t"
339

    
340
        BUTTERFLY3(xmm7, xmm3, xmm2, xmm5)
341
        "movaps  %%xmm7, 80(%1)           \n\t"
342

    
343
        "movaps  32(%1), %%xmm5           \n\t"
344
        BUTTERFLY3(xmm5, xmm3, xmm2, xmm7)
345
        "movaps  %%xmm5, 32(%1)           \n\t"
346

    
347
        "movaps  48(%1), %%xmm4           \n\t"
348
        BUTTERFLY3(xmm4, xmm3, xmm2, xmm7)
349
        "movaps  %%xmm4, 48(%1)           \n\t"
350

    
351
        BUTTERFLY3(xmm6, xmm3, xmm2, xmm7)
352
        "movaps  %%xmm6, 96(%1)           \n\t"
353

    
354
        BUTTERFLY3(xmm0, xmm3, xmm2, xmm7)
355
        "movaps  %%xmm0, 112(%1)          \n\t"
356

    
357

    
358
        /* pass 6, no SIMD... */
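        /* a long chain of mostly dependent scalar adds, done with
         * movss/addss plus a few movl copies through the tmp1 GPR */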
        "movss    56(%1),  %%xmm3           \n\t"
        "movl      4(%1),      %0           \n\t"
        "addss    60(%1),  %%xmm3           \n\t"
        "movss    72(%1),  %%xmm7           \n\t"
        "addss    %%xmm3,  %%xmm4           \n\t"
        "movss    52(%1),  %%xmm2           \n\t"
        "addss    %%xmm3,  %%xmm2           \n\t"
        "movss    24(%1),  %%xmm3           \n\t"
        "addss    28(%1),  %%xmm3           \n\t"
        "addss    76(%1),  %%xmm7           \n\t"
        "addss    %%xmm3,  %%xmm1           \n\t"
        "addss    %%xmm4,  %%xmm5           \n\t"
        "movss    %%xmm1,  16(%1)           \n\t"
        "movss    20(%1),  %%xmm1           \n\t"
        "addss    %%xmm3,  %%xmm1           \n\t"
        "movss    40(%1),  %%xmm3           \n\t"
        "movss    %%xmm1,  48(%1)           \n\t"
        "addss    44(%1),  %%xmm3           \n\t"
        "movss    20(%1),  %%xmm1           \n\t"
        "addss    %%xmm3,  %%xmm4           \n\t"
        "addss    %%xmm2,  %%xmm3           \n\t"
        "addss    28(%1),  %%xmm1           \n\t"
        "movss    %%xmm3,  40(%1)           \n\t"
        "addss    36(%1),  %%xmm2           \n\t"
        "movss     8(%1),  %%xmm3           \n\t"
        "movss    %%xmm2,  56(%1)           \n\t"
        "addss    12(%1),  %%xmm3           \n\t"
        "movss    %%xmm5,   8(%1)           \n\t"
        "movss    %%xmm3,  32(%1)           \n\t"
        "movss    52(%1),  %%xmm2           \n\t"
        "movss    80(%1),  %%xmm3           \n\t"
        "movss   120(%1),  %%xmm5           \n\t"
        "movss    %%xmm1,  80(%1)           \n\t"
        "movss    %%xmm4,  24(%1)           \n\t"
        "addss   124(%1),  %%xmm5           \n\t"
        "movss    64(%1),  %%xmm1           \n\t"
        "addss    60(%1),  %%xmm2           \n\t"
        "addss    %%xmm5,  %%xmm0           \n\t"
        "addss   116(%1),  %%xmm5           \n\t"
        "movl         %0,  64(%1)           \n\t"
        "addss    %%xmm0,  %%xmm6           \n\t"
        "addss    %%xmm6,  %%xmm1           \n\t"
        "movl     12(%1),      %0           \n\t"
        "movss    %%xmm1,   4(%1)           \n\t"
        "movss    88(%1),  %%xmm1           \n\t"
        "movl         %0,  96(%1)           \n\t"
        "addss    92(%1),  %%xmm1           \n\t"
        "movss   104(%1),  %%xmm4           \n\t"
        "movl     28(%1),      %0           \n\t"
        "addss   108(%1),  %%xmm4           \n\t"
        "addss    %%xmm4,  %%xmm0           \n\t"
        "addss    %%xmm1,  %%xmm3           \n\t"
        "addss    84(%1),  %%xmm1           \n\t"
        "addss    %%xmm5,  %%xmm4           \n\t"
        "addss    %%xmm3,  %%xmm6           \n\t"
        "addss    %%xmm0,  %%xmm3           \n\t"
        "addss    %%xmm7,  %%xmm0           \n\t"
        "addss   100(%1),  %%xmm5           \n\t"
        "addss    %%xmm4,  %%xmm7           \n\t"
        "movl         %0, 112(%1)           \n\t"
        "movss    %%xmm0,  28(%1)           \n\t"
        "movss    36(%1),  %%xmm0           \n\t"
        "movss    %%xmm7,  36(%1)           \n\t"
        "addss    %%xmm1,  %%xmm4           \n\t"
        "movss   116(%1),  %%xmm7           \n\t"
        "addss    %%xmm2,  %%xmm0           \n\t"
        "addss   124(%1),  %%xmm7           \n\t"
        "movss    %%xmm0,  72(%1)           \n\t"
        "movss    44(%1),  %%xmm0           \n\t"
        "movss    %%xmm6,  12(%1)           \n\t"
        "movss    %%xmm3,  20(%1)           \n\t"
        "addss    %%xmm0,  %%xmm2           \n\t"
        "movss    %%xmm4,  44(%1)           \n\t"
        "movss    %%xmm2,  88(%1)           \n\t"
        "addss    60(%1),  %%xmm0           \n\t"
        "movl     60(%1),      %0           \n\t"
        "movl         %0, 120(%1)           \n\t"
        "movss    %%xmm0, 104(%1)           \n\t"
        "addss    %%xmm5,  %%xmm1           \n\t"
        "addss    68(%1),  %%xmm5           \n\t"
        "movss    %%xmm1,  52(%1)           \n\t"
        "movss    %%xmm5,  60(%1)           \n\t"
        "movss    68(%1),  %%xmm1           \n\t"
        "movss   100(%1),  %%xmm5           \n\t"
        "addss    %%xmm7,  %%xmm5           \n\t"
        "addss   108(%1),  %%xmm7           \n\t"
        "addss    %%xmm5,  %%xmm1           \n\t"
        "movss    84(%1),  %%xmm2           \n\t"
        "addss    92(%1),  %%xmm2           \n\t"
        "addss    %%xmm2,  %%xmm5           \n\t"
        "movss    %%xmm1,  68(%1)           \n\t"
        "addss    %%xmm7,  %%xmm2           \n\t"
        "movss    76(%1),  %%xmm1           \n\t"
        "movss    %%xmm2,  84(%1)           \n\t"
        "movss    %%xmm5,  76(%1)           \n\t"
        "movss   108(%1),  %%xmm2           \n\t"
        "addss    %%xmm1,  %%xmm7           \n\t"
        "addss   124(%1),  %%xmm2           \n\t"
        "addss    %%xmm2,  %%xmm1           \n\t"
        "addss    92(%1),  %%xmm2           \n\t"
        "movss    %%xmm1, 100(%1)           \n\t"
        "movss    %%xmm2, 108(%1)           \n\t"
        "movss    92(%1),  %%xmm2           \n\t"
        "movss    %%xmm7,  92(%1)           \n\t"
        "addss   124(%1),  %%xmm2           \n\t"
        "movss    %%xmm2, 116(%1)           \n\t"
        :"+&r"(tmp1)
        :"r"(out), "r"(b1), "r"(smask), "r"(in)
        :"memory"
        );
}