Revision 4dcc4f8e libavcodec/x86/fft_sse.c

View differences:

libavcodec/x86/fft_sse.c
20 20
 */
21 21

  
22 22
#include "libavutil/x86_cpu.h"
23
#include "libavutil/common.h"
23 24
#include "libavcodec/dsputil.h"
24 25
#include "fft.h"
25 26

  
......
201 202
    );
202 203
}
203 204

  
205
/*
 * Multiplier table read by ff_dct32_float_sse() through asm operand %2.
 * Each row below is one 16-byte vector (four floats); the byte offset in the
 * row comment is the displacement used in the asm.  The table is declared
 * through DECLARE_ALIGNED(16, ...) because it is accessed with movaps/mulps
 * memory operands.
 */
DECLARE_ALIGNED(16, static const float, b1)[] = {
    /*   0: pass 1 */   0.500603,  0.505471,  0.515447,  0.531043,
    /*  16: pass 1 */   0.553104,  0.582935,  0.622504,  0.674808,
    /*  32: pass 1 */  -1.169440, -0.972568, -0.839350, -0.744536,
    /*  48: pass 1 */ -10.190008, -3.407609, -2.057781, -1.484165,
    /*  64: pass 2 */   0.502419,  0.522499,  0.566944,  0.646822,
    /*  80: pass 2 */   0.788155,  1.060678,  1.722447,  5.101149,
    /*  96: pass 3 */   0.509796,  0.601345,  0.899976,  2.562916,
    /* 112: pass 4 */   1.000000,  1.000000,  1.306563,  0.541196,
    /* 128: pass 5 */   1.000000,  0.707107,  1.000000, -0.707107
};
216

  
217
/*
 * Sign-bit mask loaded into xmm3 by ff_dct32_float_sse() (asm operand %3) and
 * applied with xorps: lanes 2 and 3 have only the IEEE-754 sign bit set, lanes
 * 0 and 1 are zero.
 *
 * The elements are raw bit patterns, so they are stored as uint32_t:
 * 0x80000000 is not representable in int32_t and would otherwise rely on an
 * implementation-defined conversion.  The table is only ever passed by
 * address to the inline asm, so the element type is not visible to callers.
 */
DECLARE_ALIGNED(16, static const uint32_t, smask)[4] = {
    0, 0, 0x80000000u, 0x80000000u
};
220

  
221
/*
 * Butterfly operator, expanded to inline-asm text (AT&T operand order).
 *
 *   va      - xmm register name; on exit holds (va - vb) * coef
 *   vb      - xmm register name; on exit holds  va + vb
 *   coef    - multiplier operand, pasted verbatim (memory reference such as
 *             48(%2), or a register written as %%xmmN)
 *   scratch - xmm register name; overwritten with the incoming value of va
 */
#define BUTTERFLY(va, vb, coef, scratch)                          \
    "movaps  %%" #va      ", %%" #scratch "             \n\t"     \
    "subps   %%" #vb      ", %%" #va      "             \n\t"     \
    "addps   %%" #scratch ", %%" #vb      "             \n\t"     \
    "mulps     " #coef    ", %%" #va      "             \n\t"
227

  
228
/*
 * Same as BUTTERFLY when vectors a and b overlap, i.e. both halves of the
 * butterfly live in the same xmm register.
 *
 *   reg     - xmm register name holding the data; receives the result
 *   signs   - xmm register name holding a sign-bit mask
 *   coef    - xmm register name holding the per-lane multipliers
 *   scratch - xmm register name; overwritten
 *   perm    - shufps immediate (written with its leading '$') that moves each
 *             lane's partner into place
 *
 * Result: reg = (shuffle(reg, perm) + (reg XOR signs)) * coef.
 */
#define BUTTERFLY0(reg, signs, coef, scratch, perm)                          \
    "movaps  %%" #reg     ", %%" #scratch "             \n\t"                \
    "shufps    " #perm    ", %%" #reg     ",%%" #reg "  \n\t"                \
    "xorps   %%" #signs   ", %%" #scratch "             \n\t" /* flip signs */ \
    "addps   %%" #scratch ", %%" #reg     "             \n\t"                \
    "mulps   %%" #coef    ", %%" #reg     "             \n\t"

/* Partner lanes are mirrored: $0x1b reverses the four lanes (3,2,1,0). */
#define BUTTERFLY2(reg, signs, coef, scratch) \
    BUTTERFLY0(reg, signs, coef, scratch, $0x1b)

/* Partner lanes are adjacent: $0xb1 swaps lanes within each pair (1,0,3,2). */
#define BUTTERFLY3(reg, signs, coef, scratch) \
    BUTTERFLY0(reg, signs, coef, scratch, $0xb1)
238

  
239
/**
 * SSE inline-asm transform of 32 floats (a 32-point DCT, going by the name).
 *
 * @param out destination, 32 floats (byte offsets 0..127 are written).  It is
 *            also used as scratch storage between passes.  Accessed with
 *            movaps, so it must be 16-byte aligned.
 * @param in  source, 32 floats (byte offsets 0..127 are read).  Accessed with
 *            movaps, so it must be 16-byte aligned.
 *
 * Asm operands: %0 = tmp1 (integer scratch used to move single floats around
 * in pass 6), %1 = out, %2 = b1 (multiplier table), %3 = smask (sign mask),
 * %4 = in.
 *
 * Passes 1-3 run BUTTERFLY across pairs of registers, passes 4-5 run the
 * in-register BUTTERFLY2/BUTTERFLY3 variants, and pass 6 is a scalar sequence
 * of adds and stores that produces the final layout in out.
 *
 * The asm overwrites xmm0-xmm7.  They are listed as clobbers whenever the
 * compiler itself is generating SSE code (__SSE__), because in that
 * configuration it may be holding live values in those registers around this
 * statement.
 */
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
{
    int32_t tmp1 = 0;
    __asm__ volatile(
        /* pass 1 */

        "movaps    (%4), %%xmm0           \n\t"
        "movaps 112(%4), %%xmm1           \n\t"
        "shufps   $0x1b, %%xmm1, %%xmm1   \n\t"
        BUTTERFLY(xmm0, xmm1, (%2), xmm3)

        "movaps  64(%4), %%xmm7           \n\t"
        "movaps  48(%4), %%xmm4           \n\t"
        "shufps   $0x1b, %%xmm4, %%xmm4   \n\t"
        BUTTERFLY(xmm7, xmm4, 48(%2), xmm3)


        /* pass 2 */
        "movaps  64(%2), %%xmm2           \n\t"
        BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3)
        /* park two vectors in out; they are reloaded in pass 3 */
        "movaps  %%xmm1, 48(%1)           \n\t"
        "movaps  %%xmm4, (%1)             \n\t"

        /* pass 1 */
        "movaps  16(%4), %%xmm1           \n\t"
        "movaps  96(%4), %%xmm6           \n\t"
        "shufps   $0x1b, %%xmm6, %%xmm6   \n\t"
        BUTTERFLY(xmm1, xmm6, 16(%2), xmm3)

        "movaps  80(%4), %%xmm4           \n\t"
        "movaps  32(%4), %%xmm5           \n\t"
        "shufps   $0x1b, %%xmm5, %%xmm5   \n\t"
        BUTTERFLY(xmm4, xmm5, 32(%2), xmm3)

        /* pass 2 */
        BUTTERFLY(xmm0, xmm7, %%xmm2, xmm3)

        "movaps  80(%2), %%xmm2           \n\t"
        BUTTERFLY(xmm6, xmm5, %%xmm2, xmm3)

        BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3)

        /* pass 3 */
        "movaps  96(%2), %%xmm2           \n\t"
        "shufps   $0x1b, %%xmm1, %%xmm1   \n\t"
        BUTTERFLY(xmm0, xmm1, %%xmm2, xmm3)
        "movaps  %%xmm0, 112(%1)          \n\t"
        "movaps  %%xmm1,  96(%1)          \n\t"

        "movaps   0(%1), %%xmm0           \n\t"
        "shufps   $0x1b, %%xmm5, %%xmm5   \n\t"
        BUTTERFLY(xmm0, xmm5, %%xmm2, xmm3)

        "movaps  48(%1), %%xmm1           \n\t"
        "shufps   $0x1b, %%xmm6, %%xmm6   \n\t"
        BUTTERFLY(xmm1, xmm6, %%xmm2, xmm3)
        "movaps  %%xmm1,  48(%1)          \n\t"

        "shufps   $0x1b, %%xmm4, %%xmm4   \n\t"
        BUTTERFLY(xmm7, xmm4, %%xmm2, xmm3)

        /* pass 4: xmm3 = sign mask, xmm2 = multipliers */
        "movaps    (%3), %%xmm3           \n\t"
        "movaps 112(%2), %%xmm2           \n\t"

        BUTTERFLY2(xmm5, xmm3, xmm2, xmm1)

        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
        "movaps  %%xmm0, 16(%1)           \n\t"

        BUTTERFLY2(xmm6, xmm3, xmm2, xmm1)
        "movaps  %%xmm6, 32(%1)           \n\t"

        "movaps  48(%1), %%xmm0           \n\t"
        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
        "movaps  %%xmm0, 48(%1)           \n\t"

        BUTTERFLY2(xmm4, xmm3, xmm2, xmm1)

        BUTTERFLY2(xmm7, xmm3, xmm2, xmm1)

        "movaps  96(%1), %%xmm6           \n\t"
        BUTTERFLY2(xmm6, xmm3, xmm2, xmm1)

        "movaps 112(%1), %%xmm0           \n\t"
        BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)

        /* pass 5 */
        "movaps 128(%2), %%xmm2           \n\t"
        /* rearrange the sign mask so that lanes 1 and 3 carry the sign bit */
        "shufps   $0xCC, %%xmm3,%%xmm3    \n\t"

        BUTTERFLY3(xmm5, xmm3, xmm2, xmm1)
        "movaps  %%xmm5, (%1)             \n\t"

        "movaps  16(%1), %%xmm1           \n\t"
        BUTTERFLY3(xmm1, xmm3, xmm2, xmm5)
        "movaps  %%xmm1, 16(%1)           \n\t"

        BUTTERFLY3(xmm4, xmm3, xmm2, xmm5)
        "movaps  %%xmm4, 64(%1)           \n\t"

        BUTTERFLY3(xmm7, xmm3, xmm2, xmm5)
        "movaps  %%xmm7, 80(%1)           \n\t"

        "movaps  32(%1), %%xmm5           \n\t"
        BUTTERFLY3(xmm5, xmm3, xmm2, xmm7)
        "movaps  %%xmm5, 32(%1)           \n\t"

        "movaps  48(%1), %%xmm4           \n\t"
        BUTTERFLY3(xmm4, xmm3, xmm2, xmm7)
        "movaps  %%xmm4, 48(%1)           \n\t"

        BUTTERFLY3(xmm6, xmm3, xmm2, xmm7)
        "movaps  %%xmm6, 96(%1)           \n\t"

        BUTTERFLY3(xmm0, xmm3, xmm2, xmm7)
        "movaps  %%xmm0, 112(%1)          \n\t"


        /* pass 6, no SIMD... */
        "movss    56(%1),  %%xmm3           \n\t"
        "movl      4(%1),      %0           \n\t"
        "addss    60(%1),  %%xmm3           \n\t"
        "movss    72(%1),  %%xmm7           \n\t"
        "addss    %%xmm3,  %%xmm4           \n\t"
        "movss    52(%1),  %%xmm2           \n\t"
        "addss    %%xmm3,  %%xmm2           \n\t"
        "movss    24(%1),  %%xmm3           \n\t"
        "addss    28(%1),  %%xmm3           \n\t"
        "addss    76(%1),  %%xmm7           \n\t"
        "addss    %%xmm3,  %%xmm1           \n\t"
        "addss    %%xmm4,  %%xmm5           \n\t"
        "movss    %%xmm1,  16(%1)           \n\t"
        "movss    20(%1),  %%xmm1           \n\t"
        "addss    %%xmm3,  %%xmm1           \n\t"
        "movss    40(%1),  %%xmm3           \n\t"
        "movss    %%xmm1,  48(%1)           \n\t"
        "addss    44(%1),  %%xmm3           \n\t"
        "movss    20(%1),  %%xmm1           \n\t"
        "addss    %%xmm3,  %%xmm4           \n\t"
        "addss    %%xmm2,  %%xmm3           \n\t"
        "addss    28(%1),  %%xmm1           \n\t"
        "movss    %%xmm3,  40(%1)           \n\t"
        "addss    36(%1),  %%xmm2           \n\t"
        "movss     8(%1),  %%xmm3           \n\t"
        "movss    %%xmm2,  56(%1)           \n\t"
        "addss    12(%1),  %%xmm3           \n\t"
        "movss    %%xmm5,   8(%1)           \n\t"
        "movss    %%xmm3,  32(%1)           \n\t"
        "movss    52(%1),  %%xmm2           \n\t"
        "movss    80(%1),  %%xmm3           \n\t"
        "movss   120(%1),  %%xmm5           \n\t"
        "movss    %%xmm1,  80(%1)           \n\t"
        "movss    %%xmm4,  24(%1)           \n\t"
        "addss   124(%1),  %%xmm5           \n\t"
        "movss    64(%1),  %%xmm1           \n\t"
        "addss    60(%1),  %%xmm2           \n\t"
        "addss    %%xmm5,  %%xmm0           \n\t"
        "addss   116(%1),  %%xmm5           \n\t"
        "movl         %0,  64(%1)           \n\t"
        "addss    %%xmm0,  %%xmm6           \n\t"
        "addss    %%xmm6,  %%xmm1           \n\t"
        "movl     12(%1),      %0           \n\t"
        "movss    %%xmm1,   4(%1)           \n\t"
        "movss    88(%1),  %%xmm1           \n\t"
        "movl         %0,  96(%1)           \n\t"
        "addss    92(%1),  %%xmm1           \n\t"
        "movss   104(%1),  %%xmm4           \n\t"
        "movl     28(%1),      %0           \n\t"
        "addss   108(%1),  %%xmm4           \n\t"
        "addss    %%xmm4,  %%xmm0           \n\t"
        "addss    %%xmm1,  %%xmm3           \n\t"
        "addss    84(%1),  %%xmm1           \n\t"
        "addss    %%xmm5,  %%xmm4           \n\t"
        "addss    %%xmm3,  %%xmm6           \n\t"
        "addss    %%xmm0,  %%xmm3           \n\t"
        "addss    %%xmm7,  %%xmm0           \n\t"
        "addss   100(%1),  %%xmm5           \n\t"
        "addss    %%xmm4,  %%xmm7           \n\t"
        "movl         %0, 112(%1)           \n\t"
        "movss    %%xmm0,  28(%1)           \n\t"
        "movss    36(%1),  %%xmm0           \n\t"
        "movss    %%xmm7,  36(%1)           \n\t"
        "addss    %%xmm1,  %%xmm4           \n\t"
        "movss   116(%1),  %%xmm7           \n\t"
        "addss    %%xmm2,  %%xmm0           \n\t"
        "addss   124(%1),  %%xmm7           \n\t"
        "movss    %%xmm0,  72(%1)           \n\t"
        "movss    44(%1),  %%xmm0           \n\t"
        "movss    %%xmm6,  12(%1)           \n\t"
        "movss    %%xmm3,  20(%1)           \n\t"
        "addss    %%xmm0,  %%xmm2           \n\t"
        "movss    %%xmm4,  44(%1)           \n\t"
        "movss    %%xmm2,  88(%1)           \n\t"
        "addss    60(%1),  %%xmm0           \n\t"
        "movl     60(%1),      %0           \n\t"
        "movl         %0, 120(%1)           \n\t"
        "movss    %%xmm0, 104(%1)           \n\t"
        "addss    %%xmm5,  %%xmm1           \n\t"
        "addss    68(%1),  %%xmm5           \n\t"
        "movss    %%xmm1,  52(%1)           \n\t"
        "movss    %%xmm5,  60(%1)           \n\t"
        "movss    68(%1),  %%xmm1           \n\t"
        "movss   100(%1),  %%xmm5           \n\t"
        "addss    %%xmm7,  %%xmm5           \n\t"
        "addss   108(%1),  %%xmm7           \n\t"
        "addss    %%xmm5,  %%xmm1           \n\t"
        "movss    84(%1),  %%xmm2           \n\t"
        "addss    92(%1),  %%xmm2           \n\t"
        "addss    %%xmm2,  %%xmm5           \n\t"
        "movss    %%xmm1,  68(%1)           \n\t"
        "addss    %%xmm7,  %%xmm2           \n\t"
        "movss    76(%1),  %%xmm1           \n\t"
        "movss    %%xmm2,  84(%1)           \n\t"
        "movss    %%xmm5,  76(%1)           \n\t"
        "movss   108(%1),  %%xmm2           \n\t"
        "addss    %%xmm1,  %%xmm7           \n\t"
        "addss   124(%1),  %%xmm2           \n\t"
        "addss    %%xmm2,  %%xmm1           \n\t"
        "addss    92(%1),  %%xmm2           \n\t"
        "movss    %%xmm1, 100(%1)           \n\t"
        "movss    %%xmm2, 108(%1)           \n\t"
        "movss    92(%1),  %%xmm2           \n\t"
        "movss    %%xmm7,  92(%1)           \n\t"
        "addss   124(%1),  %%xmm2           \n\t"
        "movss    %%xmm2, 116(%1)           \n\t"
        :"+&r"(tmp1)
        :"r"(out), "r"(b1), "r"(smask), "r"(in)
        :
#ifdef __SSE__
         /* every XMM register written above must be declared, otherwise the
          * compiler may assume values it keeps there survive the asm */
         "%xmm0", "%xmm1", "%xmm2", "%xmm3",
         "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#endif
         /* out is written through a plain "r" pointer, not an output operand */
         "memory"
        );
}

Also available in: Unified diff