ffmpeg / libavcodec / x86 / fft_mmx.asm @ 78b5c97d
;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
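;
; For instance (SSE, 4 floats per register) the buffer would hold
;   {re0,re1,re2,re3}, {im0,im1,im2,im3}, {re4,re5,re6,re7}, ...
; (an illustration of the layout described above, not a normative spec)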

%include "x86inc.asm"

%ifdef ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

struc FFTContext
    .nbits:    resd 1
    .reverse:  resd 1
    .revtab:   pointer 1
    .tmpbuf:   pointer 1
    .mdctsize: resd 1
    .mdctbits: resd 1
    .tcos:     pointer 1
    .tsin:     pointer 1
endstruc

SECTION_RODATA

%define M_SQRT1_2 0.70710678118654752440
ps_root2: times 4 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_m1p1: dd 1<<31, 0

%assign i 16
%rep 13
cextern cos_ %+ i
%assign i i<<1
%endrep

%ifdef ARCH_X86_64
    %define pointer dq
%else
    %define pointer dd
%endif

%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro

section .text align=16

%macro T2_3DN 4 ; z0, z1, mem0, mem1
    mova     %1, %3
    mova     %2, %1
    pfadd    %1, %4
    pfsub    %2, %4
%endmacro
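
; Dataflow note (a sketch, not from the original source): T2_3DN is the
; radix-2 butterfly on one packed {re,im} pair per register:
;   %1 = %3 + %4
;   %2 = %3 - %4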

%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova     %5, %3
    pfsub    %3, %4
    pfadd    %5, %4 ; {t6,t5}
    pxor     %3, [ps_m1p1] ; {t8,t7}
    mova     %6, %1
    pswapd   %3, %3
    pfadd    %1, %5 ; {r0,i0}
    pfsub    %6, %5 ; {r2,i2}
    mova     %4, %2
    pfadd    %2, %3 ; {r1,i1}
    pfsub    %4, %3 ; {r3,i3}
    SWAP     %3, %6
%endmacro

; in:  %1={r0,i0,r1,i1} %2={r2,i2,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
%macro T4_SSE 3
    mova     %3, %1
    shufps   %1, %2, 0x64 ; {r0,i0,r3,i2}
    shufps   %3, %2, 0xce ; {r1,i1,r2,i3}
    mova     %2, %1
    addps    %1, %3       ; {t1,t2,t6,t5}
    subps    %2, %3       ; {t3,t4,t8,t7}
    mova     %3, %1
    shufps   %1, %2, 0x44 ; {t1,t2,t3,t4}
    shufps   %3, %2, 0xbe ; {t6,t5,t7,t8}
    mova     %2, %1
    addps    %1, %3       ; {r0,i0,r1,i1}
    subps    %2, %3       ; {r2,i2,r3,i3}
    mova     %3, %1
    shufps   %1, %2, 0x88 ; {r0,r1,r2,r3}
    shufps   %3, %2, 0xdd ; {i0,i1,i2,i3}
    SWAP     %2, %3
%endmacro
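
; For reference, the 4-point transform computed here (a sketch, up to the
; code's sign/ordering conventions):
;   r0' = (r0+r2) + (r1+r3)    i0' = (i0+i2) + (i1+i3)
;   r1' = (r0-r2) + (i1-i3)    i1' = (i0-i2) - (r1-r3)
;   r2' = (r0+r2) - (r1+r3)    i2' = (i0+i2) - (i1+i3)
;   r3' = (r0-r2) - (i1-i3)    i3' = (i0-i2) + (r1-r3)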

%macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1
    mova     %5, %3
    shufps   %3, %4, 0x44 ; {r4,i4,r6,i6}
    shufps   %5, %4, 0xee ; {r5,i5,r7,i7}
    mova     %6, %3
    subps    %3, %5       ; {r5,i5,r7,i7}
    addps    %6, %5       ; {t1,t2,t3,t4}
    mova     %5, %3
    shufps   %5, %5, 0xb1 ; {i5,r5,i7,r7}
    mulps    %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps    %5, [ps_root2]
    addps    %3, %5       ; {t8,t7,ta,t9}
    mova     %5, %6
    shufps   %6, %3, 0x36 ; {t3,t2,t9,t8}
    shufps   %5, %3, 0x9c ; {t1,t4,t7,ta}
    mova     %3, %6
    addps    %6, %5       ; {t1,t2,t9,ta}
    subps    %3, %5       ; {t6,t5,tc,tb}
    mova     %5, %6
    shufps   %6, %3, 0xd8 ; {t1,t9,t5,tb}
    shufps   %5, %3, 0x8d ; {t2,ta,t6,tc}
    mova     %3, %1
    mova     %4, %2
    addps    %1, %6       ; {r0,r1,r2,r3}
    addps    %2, %5       ; {i0,i1,i2,i3}
    subps    %3, %6       ; {r4,r5,r6,r7}
    subps    %4, %5       ; {i4,i5,i6,i7}
%endmacro
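
; The ps_root2mppm/ps_root2 multiplies above implement the e^(+/-i*pi/4)
; twiddles of the 8-point transform: (re +/- im) * sqrt(1/2) is a rotation
; by 45 degrees, with the sign pattern folded into the constant vector.
; (Explanatory note; the exact signs follow ps_root2mppm as defined above.)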

; scheduled for cpu-bound sizes
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova    m4, Z(4)
IF%1 mova    m5, Z(5)
    mova     m0, %2 ; wre
    mova     m2, m4
    mova     m1, %3 ; wim
    mova     m3, m5
    mulps    m2, m0 ; r2*wre
IF%1 mova    m6, Z(6)
    mulps    m3, m1 ; i2*wim
IF%1 mova    m7, Z(7)
    mulps    m4, m1 ; r2*wim
    mulps    m5, m0 ; i2*wre
    addps    m2, m3 ; r2*wre + i2*wim
    mova     m3, m1
    mulps    m1, m6 ; r3*wim
    subps    m5, m4 ; i2*wre - r2*wim
    mova     m4, m0
    mulps    m3, m7 ; i3*wim
    mulps    m4, m6 ; r3*wre
    mulps    m0, m7 ; i3*wre
    subps    m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m1 ; i3*wre + r3*wim
    mova     m1, m4
    addps    m4, m2 ; t5
    subps    m1, m2 ; t3
    subps    m3, m4 ; r2
    addps    m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    mova     m3, m5
    subps    m5, m0 ; t4
    mova     m4, m6
    subps    m6, m5 ; r3
    addps    m5, m4 ; r1
    mova   Z(6), m6
    mova   Z(2), m5
    mova     m2, Z(3)
    addps    m3, m0 ; t6
    subps    m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, Z(3) ; i1
    mova   Z(7), m2
    mova   Z(3), m1
    mova     m4, m7
    subps    m7, m3 ; i2
    addps    m3, m4 ; i0
    mova   Z(5), m7
    mova   Z(1), m3
%endmacro
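
; In complex terms (inferred from the register comments above), with
; a = z2*conj(w) and b = z3*w, each butterfly of the pass computes:
;   z0' = z0 + (a+b)      z2' = z0 - (a+b)
;   z1' = z1 + i*(b-a)    z3' = z1 - i*(b-a)
; (a summary sketch of the dataflow, not part of the original code)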

; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
    mova     m4, Z(4) ; r2
    mova     m5, Z(5) ; i2
    mova     m2, m4
    mova     m0, [wq] ; wre
    mova     m3, m5
    mova     m1, [wq+o1q] ; wim
    mulps    m2, m0 ; r2*wre
    mova     m6, Z(6) ; r3
    mulps    m3, m1 ; i2*wim
    mova     m7, Z(7) ; i3
    mulps    m4, m1 ; r2*wim
    mulps    m5, m0 ; i2*wre
    addps    m2, m3 ; r2*wre + i2*wim
    mova     m3, m1
    mulps    m1, m6 ; r3*wim
    subps    m5, m4 ; i2*wre - r2*wim
    mova     m4, m0
    mulps    m3, m7 ; i3*wim
    mulps    m4, m6 ; r3*wre
    mulps    m0, m7 ; i3*wre
    subps    m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m1 ; i3*wre + r3*wim
    mova     m1, m4
    addps    m4, m2 ; t5
    subps    m1, m2 ; t3
    subps    m3, m4 ; r2
    addps    m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    mova     m3, m5
    subps    m5, m0 ; t4
    mova     m4, m6
    subps    m6, m5 ; r3
    addps    m5, m4 ; r1
IF%1 mova  Z(6), m6
IF%1 mova  Z(2), m5
    mova     m2, Z(3)
    addps    m3, m0 ; t6
    subps    m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, Z(3) ; i1
IF%1 mova  Z(7), m2
IF%1 mova  Z(3), m1
    mova     m4, m7
    subps    m7, m3 ; i2
    addps    m3, m4 ; i0
IF%1 mova  Z(5), m7
IF%1 mova  Z(1), m3
%if %1==0
    mova     m4, m5 ; r1
    mova     m0, m6 ; r3
    unpcklps m5, m1
    unpckhps m4, m1
    unpcklps m6, m2
    unpckhps m0, m2
    mova     m1, Z(0)
    mova     m2, Z(4)
    mova   Z(2), m5
    mova   Z(3), m4
    mova   Z(6), m6
    mova   Z(7), m0
    mova     m5, m1 ; r0
    mova     m4, m2 ; r2
    unpcklps m1, m3
    unpckhps m5, m3
    unpcklps m2, m7
    unpckhps m4, m7
    mova   Z(0), m1
    mova   Z(1), m5
    mova   Z(4), m2
    mova   Z(5), m4
%endif
%endmacro
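
; When %1==0 the unpck block above re-interleaves the separated
; {re,re,...}/{im,im,...} halves back into {re,im} pairs; this variant is
; what the _interleave dispatch entries use for their final pass.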

%macro PUNPCK 3
    mova      %3, %1
    punpckldq %1, %2
    punpckhdq %3, %2
%endmacro

INIT_XMM
%define mova movaps

%define Z(x) [r0+mmsize*x]

align 16
fft4_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova   Z(0), m0
    mova   Z(1), m1
    ret

align 16
fft8_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    ret

align 16
fft16_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     m4, Z(4)
    mova     m5, Z(5)
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    T4_SSE   m4, m5, m6
    mova     m6, Z(6)
    mova     m7, Z(7)
    T4_SSE   m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret


INIT_MMX

%macro FFT48_3DN 1
align 16
fft4%1:
    T2_3DN   m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DN   m0, m1, m2, m3, m4, m5
    PUNPCK   m0, m1, m4
    PUNPCK   m2, m3, m5
    mova   Z(0), m0
    mova   Z(1), m4
    mova   Z(2), m2
    mova   Z(3), m5
    ret

align 16
fft8%1:
    T2_3DN   m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DN   m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(2), m2
    T2_3DN   m4, m5, Z(4), Z(5)
    T2_3DN   m6, m7, Z(6), Z(7)
    pswapd   m0, m5
    pswapd   m2, m7
    pxor     m0, [ps_m1p1]
    pxor     m2, [ps_m1p1]
    pfsub    m5, m0
    pfadd    m7, m2
    pfmul    m5, [ps_root2]
    pfmul    m7, [ps_root2]
    T4_3DN   m1, m3, m5, m7, m0, m2
    mova   Z(5), m5
    mova   Z(7), m7
    mova     m0, Z(0)
    mova     m2, Z(2)
    T4_3DN   m0, m2, m4, m6, m5, m7
    PUNPCK   m0, m1, m5
    PUNPCK   m2, m3, m7
    mova   Z(0), m0
    mova   Z(1), m5
    mova   Z(2), m2
    mova   Z(3), m7
    PUNPCK   m4, Z(5), m5
    PUNPCK   m6, Z(7), m7
    mova   Z(4), m4
    mova   Z(5), m5
    mova   Z(6), m6
    mova   Z(7), m7
    ret
%endmacro

FFT48_3DN _3dn2

%macro pswapd 2
%ifidn %1, %2
    movd [r0+12], %1
    punpckhdq %1, [r0+8]
%else
    movq  %1, %2
    psrlq %1, 32
    punpckldq %1, %2
%endif
%endmacro
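
; Plain 3DNow lacks pswapd (a 3DNow!ext instruction), so the macro above
; emulates the dword swap; the in-place case appears to borrow the qword at
; [r0+8] as scratch, which is only safe where that buffer slot is dead.
; (An inference from the code, not an original comment.)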

FFT48_3DN _3dn

%define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)]
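
; Addressing note (inferred): DECL_PASS below sets o1q = nq*8 and o3q = nq*48,
; so Z(0..5) are reached by scaling o1q by 0/2/4, while Z(6)/Z(7) fall back
; to the precomputed o3q because x86 addressing has no *6 scale factor.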

%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS z, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zq, mmsize*2
    add wq, mmsize
    sub nd, mmsize/8
    jg .loop
    rep ret
%endmacro
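
; `rep ret` is the usual workaround for AMD K8-family branch predictors,
; which mispredict a single-byte ret that is itself a branch target.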

INIT_XMM
%define mova movaps
DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

INIT_MMX
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dn, PASS_BIG 0
%define pass_3dn2 pass_3dn
%define pass_interleave_3dn2 pass_interleave_3dn

%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif

%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH
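
; The dispatch table is indexed by nbits-2 since its first entry is the
; 4-point (1<<2) transform; in PIC builds the entries are stored
; section-relative (SECTION_REL) and rebased against $$ before the call.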

%macro DECL_FFT 2-3 ; nbits, cpu, suffix
%xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
%if %1==5
%xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
%endif

%assign n 1<<%1
%rep 17-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL

align 16
fft %+ n %+ %3%2:
    call fft %+ n2 %+ %2
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ %2
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ %2
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [cos_ %+ n]
    mov r2d, n4/2
    jmp pass%3%2

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab%3%2: pointer list_of_fft

section .text

; On x86_32, this function does the register saving and restoring for all of fft.
; The others pass args in registers and don't spill anything.
cglobal fft_dispatch%3%2, 2,5,8, z, nbits
    FFT_DISPATCH %3%2, nbits
    RET
%endmacro ; DECL_FFT
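
; The generated fftN bodies follow the split-radix recursion: one half-size
; transform on the first half of the buffer, two quarter-size transforms on
; the second half, then a combining pass over the block with the cos_N
; twiddle table; the (n&(-2<<%1)) terms appear to compensate for pointer
; movement inside non-leaf callees (an inference, not an original comment).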

DECL_FFT 5, _sse
DECL_FFT 5, _sse, _interleave
DECL_FFT 4, _3dn
DECL_FFT 4, _3dn, _interleave
DECL_FFT 4, _3dn2
DECL_FFT 4, _3dn2, _interleave

INIT_XMM
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps

%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
    movaps   xmm0, [%3+%2*4]
    movaps   xmm1, [%3+%1*4-0x10]
    movaps   xmm2, xmm0
    shufps   xmm0, xmm1, 0x88
    shufps   xmm1, xmm2, 0x77
    movlps   xmm4, [%4+%2*2]
    movlps   xmm5, [%5+%2*2+0x0]
    movhps   xmm4, [%4+%1*2-0x8]
    movhps   xmm5, [%5+%1*2-0x8]
    movaps   xmm2, xmm0
    movaps   xmm3, xmm1
    mulps    xmm0, xmm5
    mulps    xmm1, xmm4
    mulps    xmm2, xmm4
    mulps    xmm3, xmm5
    subps    xmm1, xmm0
    addps    xmm2, xmm3
    movaps   xmm0, xmm1
    unpcklps xmm1, xmm2
    unpckhps xmm0, xmm2
%endmacro
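
; Dataflow sketch (inferred): the shuffles split four {re,im} pairs, gathered
; from both ends of the input, into xmm0=re / xmm1=im; the multiplies then
; form im*cos - re*sin and re*cos + im*sin, and the unpacks leave the rotated
; pairs interleaved in xmm1 (low half) and xmm0 (high half) for PREROTATEW.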

%macro PREROTATEW 3 ;addr1, addr2, xmm
    movlps   %1,   %3
    movhps   %2,   %3
%endmacro

%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
    movaps   xmm6, [%4+%1*2]
    movaps   %2,   [%4+%1*2+0x10]
    movaps   %3,   xmm6
    movaps   xmm7, %2
    mulps    xmm6, [%5+%1*1]
    mulps    %2,   [%6+%1*1]
    mulps    %3,   [%6+%1*1]
    mulps    xmm7, [%5+%1*1]
    subps    %2,   xmm6
    addps    %3,   xmm7
%endmacro
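
; Reading off the arithmetic: with a = [%4+%1*2] and b = the following block,
; CMUL leaves %2 = b*tsin - a*tcos and %3 = a*tsin + b*tcos, a complex
; multiply with the IMDCT post-rotation's signs folded in (a/b are
; illustrative names, not from the original source).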

%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    CMUL     %1,   xmm0, xmm1, %3, %4, %5
    CMUL     %2,   xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
    movaps   xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps   xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps   [%3+%2*2],      xmm6
    movaps   [%3+%2*2+0x10], xmm4
    movaps   [%3+%1*2],      xmm0
    movaps   [%3+%1*2+0x10], xmm2
    sub      %2,   0x10
    add      %1,   0x10
    jl       .post
%endmacro

cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
%ifdef ARCH_X86_64
%define rrevtab r10
%define rtcos   r11
%define rtsin   r12
    push  r10
    push  r11
    push  r12
    push  r13
    push  r14
%else
%define rrevtab r6
%define rtsin   r6
%define rtcos   r5
%endif
    mov   r3d, [r0+FFTContext.mdctsize]
    add   r2, r3
    shr   r3, 1
    mov   rtcos, [r0+FFTContext.tcos]
    mov   rtsin, [r0+FFTContext.tsin]
    add   rtcos, r3
    add   rtsin, r3
%ifndef ARCH_X86_64
    push  rtcos
    push  rtsin
%endif
    shr   r3, 1
    mov   rrevtab, [r0+FFTContext.revtab]
    add   rrevtab, r3
%ifndef ARCH_X86_64
    push  rrevtab
%endif

    sub   r3, 4
%ifdef ARCH_X86_64
    xor   r4, r4
    sub   r4, r3
%endif
.pre:
%ifndef ARCH_X86_64
;unspill
    xor   r4, r4
    sub   r4, r3
    mov   rtsin, [esp+4]
    mov   rtcos, [esp+8]
%endif

    PREROTATER r4, r3, r2, rtcos, rtsin
%ifdef ARCH_X86_64
    movzx  r5,  word [rrevtab+r4*1-4]
    movzx  r6,  word [rrevtab+r4*1-2]
    movzx  r13, word [rrevtab+r3*1]
    movzx  r14, word [rrevtab+r3*1+2]
    PREROTATEW [r1+r5 *8], [r1+r6 *8], xmm0
    PREROTATEW [r1+r13*8], [r1+r14*8], xmm1
    add    r4, 4
%else
    mov    r6, [esp]
    movzx  r5, word [r6+r4*1-4]
    movzx  r4, word [r6+r4*1-2]
    PREROTATEW [r1+r5*8], [r1+r4*8], xmm0
    movzx  r5, word [r6+r3*1]
    movzx  r4, word [r6+r3*1+2]
    PREROTATEW [r1+r5*8], [r1+r4*8], xmm1
%endif
    sub    r3, 4
    jns    .pre

    mov  r5, r0
    mov  r6, r1
    mov  r0, r1
    mov  r1d, [r5+FFTContext.nbits]

    FFT_DISPATCH _sse, r1

    mov  r0d, [r5+FFTContext.mdctsize]
    add  r6, r0
    shr  r0, 1
%ifndef ARCH_X86_64
%define rtcos r2
%define rtsin r3
    mov  rtcos, [esp+8]
    mov  rtsin, [esp+4]
%endif
    neg  r0
    mov  r1, -16
    sub  r1, r0
    POSROTATESHUF r0, r1, r6, rtcos, rtsin
%ifdef ARCH_X86_64
    pop  r14
    pop  r13
    pop  r12
    pop  r11
    pop  r10
%else
    add esp, 12
%endif
    RET
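
; Overall flow of imdct_half_sse (summary, added for orientation): pre-rotate
; the input with the tcos/tsin tables while scattering through revtab into
; the output buffer, run the in-place FFT via FFT_DISPATCH, then post-rotate
; and shuffle the result in place with POSROTATESHUF.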