;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size,
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
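; For example, after fft8_sse the buffer holds
;   Z(0)={r0,r1,r2,r3} Z(1)={i0,i1,i2,i3} Z(2)={r4,r5,r6,r7} Z(3)={i4,i5,i6,i7}
; rather than the C order {r0,i0,r1,i1,...}.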
%include "x86inc.asm"

%ifdef ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

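; Byte-offset mirror of the C FFTContext (libavcodec/fft.h); only the leading
; fields used below are declared, and they must stay in sync with the C struct.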
struc FFTContext
    .nbits:    resd 1
    .reverse:  resd 1
    .revtab:   pointer 1
    .tmpbuf:   pointer 1
    .mdctsize: resd 1
    .mdctbits: resd 1
    .tcos:     pointer 1
    .tsin:     pointer 1
endstruc

SECTION_RODATA

%define M_SQRT1_2 0.70710678118654752440
ps_root2: times 4 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1: dd 0, 0, 1<<31, 0
ps_m1p1: dd 1<<31, 0
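; sign masks: xorps/pxor with these flips the sign of the selected lanes
; (ps_m1p1 negates the low float of an MMX pair, ps_p1p1m1p1 the third float
; of an XMM vector)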
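
; twiddle tables ff_cos_16 ... ff_cos_65536 (cextern applies the ff_ prefix),
; defined on the C side in libavcodec/fft.c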
%assign i 16
%rep 13
cextern cos_ %+ i
%assign i i<<1
%endrep

%ifdef ARCH_X86_64
    %define pointer dq
%else
    %define pointer dd
%endif

%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro
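; IF0 swallows its argument, IF1 emits it; PASS_SMALL/PASS_BIG paste these as
; IF%1 to conditionally assemble individual loads and stores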

section .text align=16

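; radix-2 butterfly: %1 = %3 + %4, %2 = %3 - %4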
%macro T2_3DN 4 ; z0, z1, mem0, mem1
    mova     %1, %3
    mova     %2, %1
    pfadd    %1, %4
    pfsub    %2, %4
%endmacro

%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova     %5, %3
    pfsub    %3, %4
    pfadd    %5, %4 ; {t6,t5}
    pxor     %3, [ps_m1p1] ; {t8,t7}
    mova     %6, %1
    pswapd   %3, %3
    pfadd    %1, %5 ; {r0,i0}
    pfsub    %6, %5 ; {r2,i2}
    mova     %4, %2
    pfadd    %2, %3 ; {r1,i1}
    pfsub    %4, %3 ; {r3,i3}
    SWAP     %3, %6
%endmacro

; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
%macro T4_SSE 3
    mova     %3, %1
    addps    %1, %2       ; {t1,t2,t6,t5}
    subps    %3, %2       ; {t3,t4,-t8,t7}
    xorps    %3, [ps_p1p1m1p1]
    mova     %2, %1
    shufps   %1, %3, 0x44 ; {t1,t2,t3,t4}
    shufps   %2, %3, 0xbe ; {t6,t5,t7,t8}
    mova     %3, %1
    addps    %1, %2       ; {r0,i0,r1,i1}
    subps    %3, %2       ; {r2,i2,r3,i3}
    mova     %2, %1
    shufps   %1, %3, 0x88 ; {r0,r1,r2,r3}
    shufps   %2, %3, 0xdd ; {i0,i1,i2,i3}
%endmacro

; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
%macro T8_SSE 6
    mova     %6, %3
    subps    %3, %4       ; {r5,i5,r7,i7}
    addps    %6, %4       ; {t1,t2,t3,t4}
    mova     %4, %3
    shufps   %4, %4, 0xb1 ; {i5,r5,i7,r7}
    mulps    %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps    %4, [ps_root2]
    addps    %3, %4       ; {t8,t7,ta,t9}
    mova     %4, %6
    shufps   %6, %3, 0x36 ; {t3,t2,t9,t8}
    shufps   %4, %3, 0x9c ; {t1,t4,t7,ta}
    mova     %3, %6
    addps    %6, %4       ; {t1,t2,t9,ta}
    subps    %3, %4       ; {t6,t5,tc,tb}
    mova     %4, %6
    shufps   %6, %3, 0xd8 ; {t1,t9,t5,tb}
    shufps   %4, %3, 0x8d ; {t2,ta,t6,tc}
    mova     %3, %1
    mova     %5, %2
    addps    %1, %6       ; {r0,r1,r2,r3}
    addps    %2, %4       ; {i0,i1,i2,i3}
    subps    %3, %6       ; {r4,r5,r6,r7}
    subps    %5, %4       ; {i4,i5,i6,i7}
    SWAP     %4, %5
%endmacro

; scheduled for cpu-bound sizes
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova    m4, Z(4)
IF%1 mova    m5, Z(5)
    mova     m0, %2 ; wre
    mova     m2, m4
    mova     m1, %3 ; wim
    mova     m3, m5
    mulps    m2, m0 ; r2*wre
IF%1 mova    m6, Z2(6)
    mulps    m3, m1 ; i2*wim
IF%1 mova    m7, Z2(7)
    mulps    m4, m1 ; r2*wim
    mulps    m5, m0 ; i2*wre
    addps    m2, m3 ; r2*wre + i2*wim
    mova     m3, m1
    mulps    m1, m6 ; r3*wim
    subps    m5, m4 ; i2*wre - r2*wim
    mova     m4, m0
    mulps    m3, m7 ; i3*wim
    mulps    m4, m6 ; r3*wre
    mulps    m0, m7 ; i3*wre
    subps    m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m1 ; i3*wre + r3*wim
    mova     m1, m4
    addps    m4, m2 ; t5
    subps    m1, m2 ; t3
    subps    m3, m4 ; r2
    addps    m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    mova     m3, m5
    subps    m5, m0 ; t4
    mova     m4, m6
    subps    m6, m5 ; r3
    addps    m5, m4 ; r1
    mova  Z2(6), m6
    mova   Z(2), m5
    mova     m2, Z(3)
    addps    m3, m0 ; t6
    subps    m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, Z(3) ; i1
    mova  Z2(7), m2
    mova   Z(3), m1
    mova     m4, m7
    subps    m7, m3 ; i2
    addps    m3, m4 ; i0
    mova   Z(5), m7
    mova   Z(1), m3
%endmacro

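; PASS_BIG with %1==1 leaves the result in blocked {4x re, 4x im} order;
; %1==0 additionally interleaves it back to FFTComplex order for the
; _interleave variants.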
; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
    mova     m4, Z(4) ; r2
    mova     m5, Z(5) ; i2
    mova     m2, m4
    mova     m0, [wq] ; wre
    mova     m3, m5
    mova     m1, [wq+o1q] ; wim
    mulps    m2, m0 ; r2*wre
    mova     m6, Z2(6) ; r3
    mulps    m3, m1 ; i2*wim
    mova     m7, Z2(7) ; i3
    mulps    m4, m1 ; r2*wim
    mulps    m5, m0 ; i2*wre
    addps    m2, m3 ; r2*wre + i2*wim
    mova     m3, m1
    mulps    m1, m6 ; r3*wim
    subps    m5, m4 ; i2*wre - r2*wim
    mova     m4, m0
    mulps    m3, m7 ; i3*wim
    mulps    m4, m6 ; r3*wre
    mulps    m0, m7 ; i3*wre
    subps    m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m1 ; i3*wre + r3*wim
    mova     m1, m4
    addps    m4, m2 ; t5
    subps    m1, m2 ; t3
    subps    m3, m4 ; r2
    addps    m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    mova     m3, m5
    subps    m5, m0 ; t4
    mova     m4, m6
    subps    m6, m5 ; r3
    addps    m5, m4 ; r1
IF%1 mova Z2(6), m6
IF%1 mova  Z(2), m5
    mova     m2, Z(3)
    addps    m3, m0 ; t6
    subps    m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, Z(3) ; i1
IF%1 mova Z2(7), m2
IF%1 mova  Z(3), m1
    mova     m4, m7
    subps    m7, m3 ; i2
    addps    m3, m4 ; i0
IF%1 mova  Z(5), m7
IF%1 mova  Z(1), m3
%if %1==0
    mova     m4, m5 ; r1
    mova     m0, m6 ; r3
    unpcklps m5, m1
    unpckhps m4, m1
    unpcklps m6, m2
    unpckhps m0, m2
    mova     m1, Z(0)
    mova     m2, Z(4)
    mova   Z(2), m5
    mova   Z(3), m4
    mova  Z2(6), m6
    mova  Z2(7), m0
    mova     m5, m1 ; r0
    mova     m4, m2 ; r2
    unpcklps m1, m3
    unpckhps m5, m3
    unpcklps m2, m7
    unpckhps m4, m7
    mova   Z(0), m1
    mova   Z(1), m5
    mova   Z(4), m2
    mova   Z(5), m4
%endif
%endmacro

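; dword-interleave %1 with %2: low halves -> %1, high halves -> %3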
%macro PUNPCK 3
    mova      %3, %1
    punpckldq %1, %2
    punpckhdq %3, %2
%endmacro

INIT_XMM
%define mova movaps

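; the small FFTs operate on one contiguous block, so Z and Z2 coincide here;
; the big passes redefine them below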
%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]

align 16
fft4_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova   Z(0), m0
    mova   Z(1), m1
    ret

align 16
fft8_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    ret

align 16
fft16_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     m4, Z(4)
    mova     m5, Z(5)
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    T4_SSE   m4, m5, m6
    mova     m6, Z2(6)
    mova     m7, Z2(7)
    T4_SSE   m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret


INIT_MMX

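; instantiates fft4%1/fft8%1, where %1 is the cpu suffix: _3dn2 first (native
; pswapd), then _3dn using the emulation defined in between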
%macro FFT48_3DN 1
align 16
fft4%1:
    T2_3DN   m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DN   m0, m1, m2, m3, m4, m5
    PUNPCK   m0, m1, m4
    PUNPCK   m2, m3, m5
    mova   Z(0), m0
    mova   Z(1), m4
    mova   Z(2), m2
    mova   Z(3), m5
    ret

align 16
fft8%1:
    T2_3DN   m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DN   m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(2), m2
    T2_3DN   m4, m5,  Z(4),  Z(5)
    T2_3DN   m6, m7, Z2(6), Z2(7)
    pswapd   m0, m5
    pswapd   m2, m7
    pxor     m0, [ps_m1p1]
    pxor     m2, [ps_m1p1]
    pfsub    m5, m0
    pfadd    m7, m2
    pfmul    m5, [ps_root2]
    pfmul    m7, [ps_root2]
    T4_3DN   m1, m3, m5, m7, m0, m2
    mova   Z(5), m5
    mova  Z2(7), m7
    mova     m0, Z(0)
    mova     m2, Z(2)
    T4_3DN   m0, m2, m4, m6, m5, m7
    PUNPCK   m0, m1, m5
    PUNPCK   m2, m3, m7
    mova   Z(0), m0
    mova   Z(1), m5
    mova   Z(2), m2
    mova   Z(3), m7
    PUNPCK   m4,  Z(5), m5
    PUNPCK   m6, Z2(7), m7
    mova   Z(4), m4
    mova   Z(5), m5
    mova  Z2(6), m6
    mova  Z2(7), m7
    ret
%endmacro

FFT48_3DN _3dn2

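; plain 3DNow lacks pswapd (a 3DNow! extension instruction), so emulate the
; dword swap; the in-place case borrows [r0+12] as scratch, which is safe at
; the point T4_3DN uses it (Z(1) has already been consumed)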
%macro pswapd 2
%ifidn %1, %2
    movd [r0+12], %1
    punpckhdq %1, [r0+8]
%else
    movq  %1, %2
    psrlq %1, 32
    punpckldq %1, %2
%endif
%endmacro

FFT48_3DN _3dn


%define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zq + o3q + mmsize*(x&1)]
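; o1q = n*8 and o3q = n*48 bytes: (x&6)*o1q selects the quarter holding
; element x, mmsize*(x&1) its re/im block. x86 addressing cannot scale an
; index register by 6, hence the separate Z2 with the precomputed o3q = 6*o1q
; for elements 6 and 7.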
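
; DECL_PASS name, payload: emit one butterfly-pass loop over z (data),
; w (twiddles) and n (element count, N/8 on entry). 'rep ret' avoids the
; branch-misprediction penalty AMD K8 takes on a ret that is a branch target.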
%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS z, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zq, mmsize*2
    add wq, mmsize
    sub nd, mmsize/8
    jg .loop
    rep ret
%endmacro

INIT_XMM
%define mova movaps
DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

INIT_MMX
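; retarget the SSE pass sources at 3DNow: same macro bodies, MMX-sized vectors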
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dn, PASS_BIG 0
%define pass_3dn2 pass_3dn
%define pass_interleave_3dn2 pass_interleave_3dn

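; Under PIC, dispatch_tab stores $$-relative offsets instead of absolute
; addresses so the table needs no load-time relocations; FFT_DISPATCH adds the
; section base back before the indirect call. The table is indexed by nbits-2
; (fft4 is the smallest entry).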
%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif

%macro FFT_DISPATCH 2 ; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH

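; DECL_FFT nbits, cpu, suffix: emit fft(2^nbits) ... fft65536 plus their
; dispatch table (fft4/fft8/fft16 are the hand-written base cases). Each size
; is computed split-radix style: one half-size FFT and two quarter-size FFTs,
; combined by a pass over the twiddles in cos_N.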
%macro DECL_FFT 2-3 ; nbits, cpu, suffix
%xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
%if %1==5
%xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
%endif

%assign n 1<<%1
%rep 17-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL

align 16
fft %+ n %+ %3%2:
    call fft %+ n2 %+ %2
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ %2
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ %2
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [cos_ %+ n]
    mov r2d, n4/2
    jmp pass%3%2

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab%3%2: pointer list_of_fft

section .text

; On x86_32, this function does the register saving and restoring for all of fft.
; The others pass args in registers and don't spill anything.
cglobal fft_dispatch%3%2, 2,5,8, z, nbits
    FFT_DISPATCH %3%2, nbits
    RET
%endmacro ; DECL_FFT

DECL_FFT 5, _sse
DECL_FFT 5, _sse, _interleave
DECL_FFT 4, _3dn
DECL_FFT 4, _3dn, _interleave
DECL_FFT 4, _3dn2
DECL_FFT 4, _3dn2, _interleave

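; drop the 3DNow aliases: the IMDCT code below is SSE-only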
INIT_XMM
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps

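; computes four pre-rotated complex values per iteration (two in xmm0, two in
; xmm1); the caller scatters them into the output through revtab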
%macro PREROTATER 5 ; -2*k, 2*k, input+n4, tcos+n8, tsin+n8
    movaps   xmm0, [%3+%2*4]
    movaps   xmm1, [%3+%1*4-0x10]
    movaps   xmm2, xmm0
    shufps   xmm0, xmm1, 0x88
    shufps   xmm1, xmm2, 0x77
    movlps   xmm4, [%4+%2*2]
    movlps   xmm5, [%5+%2*2+0x0]
    movhps   xmm4, [%4+%1*2-0x8]
    movhps   xmm5, [%5+%1*2-0x8]
    movaps   xmm2, xmm0
    movaps   xmm3, xmm1
    mulps    xmm0, xmm5
    mulps    xmm1, xmm4
    mulps    xmm2, xmm4
    mulps    xmm3, xmm5
    subps    xmm1, xmm0
    addps    xmm2, xmm3
    movaps   xmm0, xmm1
    unpcklps xmm1, xmm2
    unpckhps xmm0, xmm2
%endmacro

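; 4-way complex multiply of z[j] by the post-rotation twiddles; the two result
; components land in %2 and %3 (xmm6/xmm7 are clobbered)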
%macro CMUL 6 ; j, xmm0, xmm1, 3, 4, 5
    movaps   xmm6, [%4+%1*2]
    movaps   %2,   [%4+%1*2+0x10]
    movaps   %3,   xmm6
    movaps   xmm7, %2
    mulps    xmm6, [%5+%1]
    mulps    %2,   [%6+%1]
    mulps    %3,   [%6+%1]
    mulps    xmm7, [%5+%1]
    subps    %2,   xmm6
    addps    %3,   xmm7
%endmacro

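; post-rotate the FFT output and store it, writing from both ends of the
; buffer toward the middle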
%macro POSROTATESHUF 5 ; j, k, z+n8, tcos+n8, tsin+n8
.post:
    CMUL     %1,   xmm0, xmm1, %3, %4, %5
    CMUL     %2,   xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
    movaps   xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps   xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps   [%3+%2*2],      xmm6
    movaps   [%3+%2*2+0x10], xmm4
    movaps   [%3+%1*2],      xmm0
    movaps   [%3+%1*2+0x10], xmm2
    sub      %2,   0x10
    add      %1,   0x10
    jl       .post
%endmacro

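; Three stages: pre-rotate the input by tcos/tsin while permuting through
; revtab, run the in-place FFT, then post-rotate and reorder into the output.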
cglobal imdct_half_sse, 3,7,8 ; FFTContext *s, FFTSample *output, const FFTSample *input
%ifdef ARCH_X86_64
%define rrevtab r10
%define rtcos   r11
%define rtsin   r12
    push  r12
    push  r13
    push  r14
%else
%define rrevtab r6
%define rtsin   r6
%define rtcos   r5
%endif
    mov   r3d, [r0+FFTContext.mdctsize]
    add   r2, r3
    shr   r3, 1
    mov   rtcos, [r0+FFTContext.tcos]
    mov   rtsin, [r0+FFTContext.tsin]
    add   rtcos, r3
    add   rtsin, r3
%ifndef ARCH_X86_64
    push  rtcos
    push  rtsin
%endif
    shr   r3, 1
    mov   rrevtab, [r0+FFTContext.revtab]
    add   rrevtab, r3
%ifndef ARCH_X86_64
    push  rrevtab
%endif

    sub   r3, 4
%ifdef ARCH_X86_64
    xor   r4, r4
    sub   r4, r3
%endif
.pre:
%ifndef ARCH_X86_64
; unspill
    xor   r4, r4
    sub   r4, r3
    mov   rtsin, [esp+4]
    mov   rtcos, [esp+8]
%endif

    PREROTATER r4, r3, r2, rtcos, rtsin
%ifdef ARCH_X86_64
    movzx  r5,  word [rrevtab+r4-4]
    movzx  r6,  word [rrevtab+r4-2]
    movzx  r13, word [rrevtab+r3]
    movzx  r14, word [rrevtab+r3+2]
    movlps [r1+r5 *8], xmm0
    movhps [r1+r6 *8], xmm0
    movlps [r1+r13*8], xmm1
    movhps [r1+r14*8], xmm1
    add    r4, 4
%else
    mov    r6, [esp]
    movzx  r5, word [r6+r4-4]
    movzx  r4, word [r6+r4-2]
    movlps [r1+r5*8], xmm0
    movhps [r1+r4*8], xmm0
    movzx  r5, word [r6+r3]
    movzx  r4, word [r6+r3+2]
    movlps [r1+r5*8], xmm1
    movhps [r1+r4*8], xmm1
%endif
    sub    r3, 4
    jns    .pre

    mov  r5, r0
    mov  r6, r1
    mov  r0, r1
    mov  r1d, [r5+FFTContext.nbits]

    FFT_DISPATCH _sse, r1

    mov  r0d, [r5+FFTContext.mdctsize]
    add  r6, r0
    shr  r0, 1
%ifndef ARCH_X86_64
%define rtcos r2
%define rtsin r3
    mov  rtcos, [esp+8]
    mov  rtsin, [esp+4]
%endif
    neg  r0
    mov  r1, -16
    sub  r1, r0
    POSROTATESHUF r0, r1, r6, rtcos, rtsin
%ifdef ARCH_X86_64
    pop  r14
    pop  r13
    pop  r12
%else
    add esp, 12
%endif
    RET