;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
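; For example (an illustrative sketch of the layout, not a normative one),
; after an SSE pass an 8-float block holds
;   z[0..3] = {re0, re1, re2, re3}
;   z[4..7] = {im0, im1, im2, im3}
; where the C code would store {re0,im0, re1,im1, re2,im2, re3,im3};
; the 3DNow code uses the same scheme with 2-element blocks.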

%include "x86inc.asm"

%ifdef ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

struc FFTContext
    .nbits:    resd 1
    .reverse:  resd 1
    .revtab:   pointer 1
    .tmpbuf:   pointer 1
    .mdctsize: resd 1
    .mdctbits: resd 1
    .tcos:     pointer 1
    .tsin:     pointer 1
endstruc
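; This struc mirrors the leading members of the C FFTContext; the field
; offsets here are assumed to match the struct layout in libavcodec/fft.h
; and must be kept in sync with it.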

SECTION_RODATA

%define M_SQRT1_2 0.70710678118654752440
ps_root2: times 4 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1: dd 0, 0, 1<<31, 0
ps_m1p1: dd 1<<31, 0

%assign i 16
%rep 13
cextern cos_ %+ i
%assign i i<<1
%endrep
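; The %rep above expands to extern declarations for the 13 twiddle tables
; cos_16, cos_32, ..., cos_65536 (one per transform size from 2^4 up to
; 2^16), which are defined on the C side.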

%ifdef ARCH_X86_64
    %define pointer dq
%else
    %define pointer dd
%endif

%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro
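; IF0 swallows its argument and IF1 emits it, so "IF%1 mova m4, Z(4)"
; inside a macro assembles the mova only when the macro parameter %1 is 1.
; PASS_SMALL uses this to make its initial loads optional, PASS_BIG its
; final stores.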

section .text align=16

%macro T2_3DN 4 ; z0, z1, mem0, mem1
    mova     %1, %3
    mova     %2, %1
    pfadd    %1, %4
    pfsub    %2, %4
%endmacro
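; i.e. a 2-point butterfly on packed {re,im} pairs:
;   %1 = %3 + %4
;   %2 = %3 - %4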

%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova     %5, %3
    pfsub    %3, %4
    pfadd    %5, %4 ; {t6,t5}
    pxor     %3, [ps_m1p1] ; {t8,t7}
    mova     %6, %1
    pswapd   %3, %3
    pfadd    %1, %5 ; {r0,i0}
    pfsub    %6, %5 ; {r2,i2}
    mova     %4, %2
    pfadd    %2, %3 ; {r1,i1}
    pfsub    %4, %3 ; {r3,i3}
    SWAP     %3, %6
%endmacro

; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
%macro T4_SSE 3
    mova     %3, %1
    addps    %1, %2       ; {t1,t2,t6,t5}
    subps    %3, %2       ; {t3,t4,-t8,t7}
    xorps    %3, [ps_p1p1m1p1]
    mova     %2, %1
    shufps   %1, %3, 0x44 ; {t1,t2,t3,t4}
    shufps   %2, %3, 0xbe ; {t6,t5,t7,t8}
    mova     %3, %1
    addps    %1, %2       ; {r0,i0,r1,i1}
    subps    %3, %2       ; {r2,i2,r3,i3}
    mova     %2, %1
    shufps   %1, %3, 0x88 ; {r0,r1,r2,r3}
    shufps   %2, %3, 0xdd ; {i0,i1,i2,i3}
%endmacro
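; In scalar terms (a sketch, with z0/z2 being %1's low/high pairs and
; z1/z3 being %2's), T4_SSE computes the 4-point transform
;   out0 = (z0+z1) + (z2+z3)        out2 = (z0+z1) - (z2+z3)
;   out1 = (z0-z1) - j*(z2-z3)      out3 = (z0-z1) + j*(z2-z3)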

; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
%macro T8_SSE 6
    mova     %6, %3
    subps    %3, %4       ; {r5,i5,r7,i7}
    addps    %6, %4       ; {t1,t2,t3,t4}
    mova     %4, %3
    shufps   %4, %4, 0xb1 ; {i5,r5,i7,r7}
    mulps    %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps    %4, [ps_root2]
    addps    %3, %4       ; {t8,t7,ta,t9}
    mova     %4, %6
    shufps   %6, %3, 0x36 ; {t3,t2,t9,t8}
    shufps   %4, %3, 0x9c ; {t1,t4,t7,ta}
    mova     %3, %6
    addps    %6, %4       ; {t1,t2,t9,ta}
    subps    %3, %4       ; {t6,t5,tc,tb}
    mova     %4, %6
    shufps   %6, %3, 0xd8 ; {t1,t9,t5,tb}
    shufps   %4, %3, 0x8d ; {t2,ta,t6,tc}
    mova     %3, %1
    mova     %5, %2
    addps    %1, %6       ; {r0,r1,r2,r3}
    addps    %2, %4       ; {i0,i1,i2,i3}
    subps    %3, %6       ; {r4,r5,r6,r7}
    subps    %5, %4       ; {i4,i5,i6,i7}
    SWAP     %4, %5
%endmacro
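; T8_SSE folds a second 4-element half (%3/%4) into a finished 4-point
; result (%1/%2) to complete an 8-point transform; the sqrt(1/2) constants
; (ps_root2, ps_root2mppm) supply the (+-1+-j)/sqrt(2) eighth-root twiddles.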

; scheduled for cpu-bound sizes
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova    m4, Z(4)
IF%1 mova    m5, Z(5)
    mova     m0, %2 ; wre
    mova     m2, m4
    mova     m1, %3 ; wim
    mova     m3, m5
    mulps    m2, m0 ; r2*wre
IF%1 mova    m6, Z2(6)
    mulps    m3, m1 ; i2*wim
IF%1 mova    m7, Z2(7)
    mulps    m4, m1 ; r2*wim
    mulps    m5, m0 ; i2*wre
    addps    m2, m3 ; r2*wre + i2*wim
    mova     m3, m1
    mulps    m1, m6 ; r3*wim
    subps    m5, m4 ; i2*wre - r2*wim
    mova     m4, m0
    mulps    m3, m7 ; i3*wim
    mulps    m4, m6 ; r3*wre
    mulps    m0, m7 ; i3*wre
    subps    m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m1 ; i3*wre + r3*wim
    mova     m1, m4
    addps    m4, m2 ; t5
    subps    m1, m2 ; t3
    subps    m3, m4 ; r2
    addps    m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    mova     m3, m5
    subps    m5, m0 ; t4
    mova     m4, m6
    subps    m6, m5 ; r3
    addps    m5, m4 ; r1
    mova  Z2(6), m6
    mova   Z(2), m5
    mova     m2, Z(3)
    addps    m3, m0 ; t6
    subps    m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, Z(3) ; i1
    mova  Z2(7), m2
    mova   Z(3), m1
    mova     m4, m7
    subps    m7, m3 ; i2
    addps    m3, m4 ; i0
    mova   Z(5), m7
    mova   Z(1), m3
%endmacro
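; Both PASS_SMALL above and PASS_BIG below compute the same combine step:
; z2 is rotated by (wre - j*wim) and z3 by (wre + j*wim), and the sums and
; differences are folded back into the four quarters (see the t3/t5 and
; r0..i3 annotations).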

; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
    mova     m4, Z(4) ; r2
    mova     m5, Z(5) ; i2
    mova     m2, m4
    mova     m0, [wq] ; wre
    mova     m3, m5
    mova     m1, [wq+o1q] ; wim
    mulps    m2, m0 ; r2*wre
    mova     m6, Z2(6) ; r3
    mulps    m3, m1 ; i2*wim
    mova     m7, Z2(7) ; i3
    mulps    m4, m1 ; r2*wim
    mulps    m5, m0 ; i2*wre
    addps    m2, m3 ; r2*wre + i2*wim
    mova     m3, m1
    mulps    m1, m6 ; r3*wim
    subps    m5, m4 ; i2*wre - r2*wim
    mova     m4, m0
    mulps    m3, m7 ; i3*wim
    mulps    m4, m6 ; r3*wre
    mulps    m0, m7 ; i3*wre
    subps    m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m1 ; i3*wre + r3*wim
    mova     m1, m4
    addps    m4, m2 ; t5
    subps    m1, m2 ; t3
    subps    m3, m4 ; r2
    addps    m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    mova     m3, m5
    subps    m5, m0 ; t4
    mova     m4, m6
    subps    m6, m5 ; r3
    addps    m5, m4 ; r1
IF%1 mova Z2(6), m6
IF%1 mova  Z(2), m5
    mova     m2, Z(3)
    addps    m3, m0 ; t6
    subps    m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, Z(3) ; i1
IF%1 mova Z2(7), m2
IF%1 mova  Z(3), m1
    mova     m4, m7
    subps    m7, m3 ; i2
    addps    m3, m4 ; i0
IF%1 mova  Z(5), m7
IF%1 mova  Z(1), m3
%if %1==0
    mova     m4, m5 ; r1
    mova     m0, m6 ; r3
    unpcklps m5, m1
    unpckhps m4, m1
    unpcklps m6, m2
    unpckhps m0, m2
    mova     m1, Z(0)
    mova     m2, Z(4)
    mova   Z(2), m5
    mova   Z(3), m4
    mova  Z2(6), m6
    mova  Z2(7), m0
    mova     m5, m1 ; r0
    mova     m4, m2 ; r2
    unpcklps m1, m3
    unpckhps m5, m3
    unpcklps m2, m7
    unpckhps m4, m7
    mova   Z(0), m1
    mova   Z(1), m5
    mova   Z(4), m2
    mova   Z(5), m4
%endif
%endmacro

%macro PUNPCK 3
    mova      %3, %1
    punpckldq %1, %2
    punpckhdq %3, %2
%endmacro
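; i.e. %1 = low dwords of {%1,%2} interleaved, %3 = high dwords of {%1,%2}
; interleaved; the 3DNow code uses this to turn {re,im} pairs into the
; {2x re},{2x im} block layout before storing.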

INIT_XMM
%define mova movaps

%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]
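; For the fixed-size leaf transforms below, r0 points at contiguous data,
; so Z and Z2 coincide; the pass code further down redefines them with
; real strides.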

align 16
fft4_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova   Z(0), m0
    mova   Z(1), m1
    ret

align 16
fft8_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    ret

align 16
fft16_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     m4, Z(4)
    mova     m5, Z(5)
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    T4_SSE   m4, m5, m6
    mova     m6, Z2(6)
    mova     m7, Z2(7)
    T4_SSE   m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret

INIT_MMX

%macro FFT48_3DN 1
align 16
fft4%1:
    T2_3DN   m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DN   m0, m1, m2, m3, m4, m5
    PUNPCK   m0, m1, m4
    PUNPCK   m2, m3, m5
    mova   Z(0), m0
    mova   Z(1), m4
    mova   Z(2), m2
    mova   Z(3), m5
    ret

align 16
fft8%1:
    T2_3DN   m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DN   m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(2), m2
    T2_3DN   m4, m5,  Z(4),  Z(5)
    T2_3DN   m6, m7, Z2(6), Z2(7)
    pswapd   m0, m5
    pswapd   m2, m7
    pxor     m0, [ps_m1p1]
    pxor     m2, [ps_m1p1]
    pfsub    m5, m0
    pfadd    m7, m2
    pfmul    m5, [ps_root2]
    pfmul    m7, [ps_root2]
    T4_3DN   m1, m3, m5, m7, m0, m2
    mova   Z(5), m5
    mova  Z2(7), m7
    mova     m0, Z(0)
    mova     m2, Z(2)
    T4_3DN   m0, m2, m4, m6, m5, m7
    PUNPCK   m0, m1, m5
    PUNPCK   m2, m3, m7
    mova   Z(0), m0
    mova   Z(1), m5
    mova   Z(2), m2
    mova   Z(3), m7
    PUNPCK   m4,  Z(5), m5
    PUNPCK   m6, Z2(7), m7
    mova   Z(4), m4
    mova   Z(5), m5
    mova  Z2(6), m6
    mova  Z2(7), m7
    ret
%endmacro

FFT48_3DN _3dn2

%macro pswapd 2
%ifidn %1, %2
    movd [r0+12], %1
    punpckhdq %1, [r0+8]
%else
    movq  %1, %2
    psrlq %1, 32
    punpckldq %1, %2
%endif
%endmacro
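; pswapd (swap the two dwords of an MMX register) is a native instruction
; in AMD's extended 3DNow!, so the _3dn2 build above uses the real one;
; this emulation is defined afterwards for the plain 3DNow build.  The
; in-place case bounces the low dword through [r0+12] as scratch.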

FFT48_3DN _3dn

%define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zq + o3q + mmsize*(x&1)]
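; In the pass loop, zq walks the data while o1q and o3q (set up in
; DECL_PASS below) hold the strides of the quarter groups: Z(0..1),
; Z(2..3) and Z(4..5) sit 2*o1q bytes apart and Z2(6..7) sits at
; o3q = 6*o1q.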

%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS z, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zq, mmsize*2
    add wq, mmsize
    sub nd, mmsize/8
    jg .loop
    rep ret
%endmacro
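; Each iteration consumes mmsize*2 bytes of data and mmsize bytes of
; twiddles.  "rep ret" is the usual substitute for a ret that is a branch
; target, which some AMD branch predictors handle badly.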

INIT_XMM
%define mova movaps
DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

INIT_MMX
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dn, PASS_BIG 0
%define pass_3dn2 pass_3dn
%define pass_interleave_3dn2 pass_interleave_3dn

%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif

%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH
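; The table is indexed by nbits-2, its first entry being the 4-point
; (nbits==2) transform.  Under PIC the table holds section-relative
; offsets (SECTION_REL), so the section base $$ is added back before the
; call.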

%macro DECL_FFT 2-3 ; nbits, cpu, suffix
%xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
%if %1==5
%xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
%endif

%assign n 1<<%1
%rep 17-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL
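; Each generated fft_N below is one split-radix step: a half-size FFT on
; the first half of the buffer, two quarter-size FFTs on the second half,
; then a twiddle pass over the whole buffer using the cos_N table.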

align 16
fft %+ n %+ %3%2:
    call fft %+ n2 %+ %2
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ %2
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ %2
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [cos_ %+ n]
    mov r2d, n4/2
    jmp pass%3%2

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab%3%2: pointer list_of_fft

section .text

; On x86_32, this function does the register saving and restoring for all of fft.
; The others pass args in registers and don't spill anything.
cglobal fft_dispatch%3%2, 2,5,8, z, nbits
    FFT_DISPATCH %3%2, nbits
    RET
%endmacro ; DECL_FFT

DECL_FFT 5, _sse
DECL_FFT 5, _sse, _interleave
DECL_FFT 4, _3dn
DECL_FFT 4, _3dn, _interleave
DECL_FFT 4, _3dn2
DECL_FFT 4, _3dn2, _interleave

INIT_XMM
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps

%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
    movaps   xmm0, [%3+%2*4]
    movaps   xmm1, [%3+%1*4-0x10]
    movaps   xmm2, xmm0
    shufps   xmm0, xmm1, 0x88
    shufps   xmm1, xmm2, 0x77
    movlps   xmm4, [%4+%2*2]
    movlps   xmm5, [%5+%2*2+0x0]
    movhps   xmm4, [%4+%1*2-0x8]
    movhps   xmm5, [%5+%1*2-0x8]
    movaps   xmm2, xmm0
    movaps   xmm3, xmm1
    mulps    xmm0, xmm5
    mulps    xmm1, xmm4
    mulps    xmm2, xmm4
    mulps    xmm3, xmm5
    subps    xmm1, xmm0
    addps    xmm2, xmm3
    movaps   xmm0, xmm1
    unpcklps xmm1, xmm2
    unpckhps xmm0, xmm2
%endmacro
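; Pre-rotation: four input values gathered from both ends of the buffer
; are deinterleaved and complex-multiplied by the (tcos, tsin) twiddles;
; xmm1/xmm0 leave as {re,im} pairs ready to be scattered through revtab
; in the .pre loop below.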

%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
    movaps   xmm6, [%4+%1*2]
    movaps   %2,   [%4+%1*2+0x10]
    movaps   %3,   xmm6
    movaps   xmm7, %2
    mulps    xmm6, [%5+%1]
    mulps    %2,   [%6+%1]
    mulps    %3,   [%6+%1]
    mulps    xmm7, [%5+%1]
    subps    %2,   xmm6
    addps    %3,   xmm7
%endmacro
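; One packed complex multiply for the post-rotation:
;   %2 = hi*tsin - lo*tcos
;   %3 = lo*tsin + hi*tcos
; where lo/hi are the two consecutive xmm blocks at [%4+%1*2] and
; tcos/tsin come from [%5+%1] and [%6+%1].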

%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    CMUL     %1,   xmm0, xmm1, %3, %4, %5
    CMUL     %2,   xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
    movaps   xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps   xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps   [%3+%2*2],      xmm6
    movaps   [%3+%2*2+0x10], xmm4
    movaps   [%3+%1*2],      xmm0
    movaps   [%3+%1*2+0x10], xmm2
    sub      %2,   0x10
    add      %1,   0x10
    jl       .post
%endmacro
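
; imdct_half: pre-rotate the input and scatter it through revtab into the
; output buffer, run an in-place FFT there, then post-rotate and mirror
; the result.  On x86_32 the tcos/tsin/revtab pointers are spilled to the
; stack because there are not enough registers.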
cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
%ifdef ARCH_X86_64
%define rrevtab r10
%define rtcos   r11
%define rtsin   r12
    push  r12
    push  r13
    push  r14
%else
%define rrevtab r6
%define rtsin   r6
%define rtcos   r5
%endif
    mov   r3d, [r0+FFTContext.mdctsize]
    add   r2, r3
    shr   r3, 1
    mov   rtcos, [r0+FFTContext.tcos]
    mov   rtsin, [r0+FFTContext.tsin]
    add   rtcos, r3
    add   rtsin, r3
%ifndef ARCH_X86_64
    push  rtcos
    push  rtsin
%endif
    shr   r3, 1
    mov   rrevtab, [r0+FFTContext.revtab]
    add   rrevtab, r3
%ifndef ARCH_X86_64
    push  rrevtab
%endif

    sub   r3, 4
%ifdef ARCH_X86_64
    xor   r4, r4
    sub   r4, r3
%endif
.pre:
%ifndef ARCH_X86_64
;unspill
    xor   r4, r4
    sub   r4, r3
    mov   rtsin, [esp+4]
    mov   rtcos, [esp+8]
%endif

    PREROTATER r4, r3, r2, rtcos, rtsin
%ifdef ARCH_X86_64
    movzx  r5,  word [rrevtab+r4-4]
    movzx  r6,  word [rrevtab+r4-2]
    movzx  r13, word [rrevtab+r3]
    movzx  r14, word [rrevtab+r3+2]
    movlps [r1+r5 *8], xmm0
    movhps [r1+r6 *8], xmm0
    movlps [r1+r13*8], xmm1
    movhps [r1+r14*8], xmm1
    add    r4, 4
%else
    mov    r6, [esp]
    movzx  r5, word [r6+r4-4]
    movzx  r4, word [r6+r4-2]
    movlps [r1+r5*8], xmm0
    movhps [r1+r4*8], xmm0
    movzx  r5, word [r6+r3]
    movzx  r4, word [r6+r3+2]
    movlps [r1+r5*8], xmm1
    movhps [r1+r4*8], xmm1
%endif
    sub    r3, 4
    jns    .pre

    mov  r5, r0
    mov  r6, r1
    mov  r0, r1
    mov  r1d, [r5+FFTContext.nbits]

    FFT_DISPATCH _sse, r1

    mov  r0d, [r5+FFTContext.mdctsize]
    add  r6, r0
    shr  r0, 1
%ifndef ARCH_X86_64
%define rtcos r2
%define rtsin r3
    mov  rtcos, [esp+8]
    mov  rtsin, [esp+4]
%endif
    neg  r0
    mov  r1, -16
    sub  r1, r0
    POSROTATESHUF r0, r1, r6, rtcos, rtsin
%ifdef ARCH_X86_64
    pop  r14
    pop  r13
    pop  r12
%else
    add esp, 12
%endif
    RET