ffmpeg / libavcodec / x86 / fft_mmx.asm @ 888fa31e

;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
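; (Spelled out for one SSE-sized block: a C-order buffer r0,i0,r1,i1,r2,i2,r3,i3
;  is held here as r0,r1,r2,r3,i0,i1,i2,i3; the 2-wide 3DNow!/MMX code keeps
;  pairs instead of quadruples.)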

%include "x86inc.asm"

%ifdef ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

struc FFTContext
    .nbits:    resd 1
    .reverse:  resd 1
    .revtab:   pointer 1
    .tmpbuf:   pointer 1
    .mdctsize: resd 1
    .mdctbits: resd 1
    .tcos:     pointer 1
    .tsin:     pointer 1
endstruc
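
; This struc only names the leading FFTContext fields the assembly below
; touches; the offsets are assumed to mirror the C struct in
; libavcodec/fft.h, so any reordering there must be reflected here.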

SECTION_RODATA

%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509
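; (M_SQRT1_2, M_COS_PI_1_8 and M_COS_PI_3_8 are 1/sqrt(2), cos(pi/8) and
;  cos(3*pi/8), the twiddle values the tables below are built from.)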

align 32
ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8

ps_root2: times 8 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0

perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1p1: dd 1<<31, 0

%assign i 16
%rep 13
cextern cos_ %+ i
%assign i i<<1
%endrep
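; (declares the external cosine tables cos_16 .. cos_65536, one per
;  supported transform size)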

%ifdef ARCH_X86_64
    %define pointer dq
%else
    %define pointer dd
%endif

%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro
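
; IF0/IF1 are used as "IF%1 <instruction>" in the PASS macros below: the
; instruction is emitted only when the flag argument expands to 1.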

section .text align=16

%macro T2_3DN 4 ; z0, z1, mem0, mem1
    mova     %1, %3
    mova     %2, %1
    pfadd    %1, %4
    pfsub    %2, %4
%endmacro

%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova     %5, %3
    pfsub    %3, %4
    pfadd    %5, %4 ; {t6,t5}
    pxor     %3, [ps_m1p1] ; {t8,t7}
    mova     %6, %1
    pswapd   %3, %3
    pfadd    %1, %5 ; {r0,i0}
    pfsub    %6, %5 ; {r2,i2}
    mova     %4, %2
    pfadd    %2, %3 ; {r1,i1}
    pfsub    %4, %3 ; {r3,i3}
    SWAP     %3, %6
%endmacro
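
; The two helpers above operate on 2-wide MMX registers holding one {re,im}
; pair each: T2_3DN is a 2-point butterfly (%1 = %3 + %4, %2 = %3 - %4) and
; T4_3DN turns four such pairs (z0..z3) into a 4-point transform.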

;  in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
;      %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
%macro T8_AVX 5
    vsubps     %5, %1, %2       ; v  = %1 - %2
    vaddps     %3, %1, %2       ; w  = %1 + %2
    vmulps     %2, %5, [ps_p1p1m1p1root2]  ; v *= vals1
    vpermilps  %2, %2, [perm1]
    vblendps   %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
    vshufps    %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
    vsubps     %4, %5, %1       ; s = r - q
    vaddps     %1, %5, %1       ; u = r + q
    vpermilps  %1, %1, [perm2]  ; k  = {u1,u2,u3,u4,u6,u5,u7,u8}
    vshufps    %5, %4, %1, 0xbb
    vshufps    %3, %4, %1, 0xee
    vperm2f128 %3, %3, %5, 0x13
    vxorps     %4, %4, [ps_m1m1p1m1p1m1m1m1]  ; s *= {1,1,-1,-1,1,-1,-1,-1}
    vshufps    %2, %1, %4, 0xdd
    vshufps    %1, %1, %4, 0x88
    vperm2f128 %4, %2, %1, 0x02 ; v  = {k1,k3,s1,s3,k2,k4,s2,s4}
    vperm2f128 %1, %1, %2, 0x13 ; w  = {k6,k8,s6,s8,k5,k7,s5,s7}
    vsubps     %5, %1, %3
    vblendps   %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
    vsubps     %2, %4, %1       ; %2 = v - w
    vaddps     %1, %4, %1       ; %1 = v + w
%endmacro

; In SSE mode do one fft4 transform
; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
%macro T4_SSE 3
    subps    %3, %1, %2       ; {t3,t4,-t8,t7}
    addps    %1, %1, %2       ; {t1,t2,t6,t5}
    xorps    %3, %3, [ps_p1p1m1p1]
    shufps   %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
    shufps   %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
    subps    %3, %1, %2       ; {r2,i2,r3,i3}
    addps    %1, %1, %2       ; {r0,i0,r1,i1}
    shufps   %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
    shufps   %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
%endmacro
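
; A reminder on the shufps immediates used throughout this file: the two low
; result lanes are picked from the first source and the two high lanes from
; the second, two bits per lane.  Hence 0x88 gathers the even lanes, 0xdd the
; odd lanes, 0xb1 swaps the lanes within each pair and 0x1b reverses a register.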

; In SSE mode do one FFT8
; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
;
; In AVX mode do two FFT8
; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
%macro T8_SSE 6
    addps    %6, %3, %4       ; {t1,t2,t3,t4}
    subps    %3, %3, %4       ; {r5,i5,r7,i7}
    shufps   %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
    mulps    %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps    %4, %4, [ps_root2]
    addps    %3, %3, %4       ; {t8,t7,ta,t9}
    shufps   %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
    shufps   %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
    subps    %3, %6, %4       ; {t6,t5,tc,tb}
    addps    %6, %6, %4       ; {t1,t2,t9,ta}
    shufps   %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
    shufps   %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
    subps    %3, %1, %6       ; {r4,r5,r6,r7}
    addps    %1, %1, %6       ; {r0,r1,r2,r3}
    subps    %4, %2, %5       ; {i4,i5,i6,i7}
    addps    %2, %2, %5       ; {i0,i1,i2,i3}
%endmacro

; scheduled for cpu-bound sizes
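; (One pass iteration consumes a block of eight vectors: Z(4..7) hold the
;  real/imaginary parts of the two odd quarters, which are complex-multiplied
;  by the wre/wim twiddles and folded into Z(0..3) with add/sub butterflies.
;  PASS_BIG further down does the same work, scheduled to avoid store->load
;  aliasing.)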
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova    m4, Z(4)
IF%1 mova    m5, Z(5)
    mova     m0, %2 ; wre
    mova     m1, %3 ; wim
    mulps    m2, m4, m0 ; r2*wre
IF%1 mova    m6, Z2(6)
    mulps    m3, m5, m1 ; i2*wim
IF%1 mova    m7, Z2(7)
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m1, m1, m6 ; r3*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
    mova  Z2(6), m4
    mova   Z(2), m3
    mova     m2, Z(3)
    addps    m3, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
    mova  Z2(7), m2
    mova   Z(3), m1
    subps    m4, m7, m3 ; i2
    addps    m3, m3, m7 ; i0
    mova   Z(5), m4
    mova   Z(1), m3
%endmacro

; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
    mova     m4, Z(4) ; r2
    mova     m5, Z(5) ; i2
    mova     m0, [wq] ; wre
    mova     m1, [wq+o1q] ; wim
    mulps    m2, m4, m0 ; r2*wre
    mova     m6, Z2(6) ; r3
    mulps    m3, m5, m1 ; i2*wim
    mova     m7, Z2(7) ; i3
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    mulps    m1, m1, m6 ; r3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
IF%1 mova Z2(6), m4
IF%1 mova  Z(2), m3
    mova     m2, Z(3)
    addps    m5, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
IF%1 mova Z2(7), m2
IF%1 mova  Z(3), m1
    subps    m6, m7, m5 ; i2
    addps    m5, m5, m7 ; i0
IF%1 mova  Z(5), m6
IF%1 mova  Z(1), m5
%if %1==0
    INTERL m1, m3, m7, Z, 2
    INTERL m2, m4, m0, Z2, 6

    mova     m1, Z(0)
    mova     m2, Z(4)

    INTERL m5, m1, m3, Z, 0
    INTERL m6, m2, m7, Z, 4
%endif
%endmacro

%macro PUNPCK 3
    mova      %3, %1
    punpckldq %1, %2
    punpckhdq %3, %2
%endmacro

%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]
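; For the standalone leaf transforms that follow, z is simply r0 and the block
; is contiguous; Z/Z2/ZH are redefined further down for the pass loops, where
; the quarters are reached through the o1q/o3q strides.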

INIT_YMM

align 16
fft8_avx:
    mova      m0, Z(0)
    mova      m1, Z(1)
    T8_AVX    m0, m1, m2, m3, m4
    mova      Z(0), m0
    mova      Z(1), m1
    ret


align 16
fft16_avx:
    mova       m2, Z(2)
    mova       m3, Z(3)
    T4_SSE     m2, m3, m7

    mova       m0, Z(0)
    mova       m1, Z(1)
    T8_AVX     m0, m1, m4, m5, m7

    mova       m4, [ps_cos16_1]
    mova       m5, [ps_cos16_2]
    vmulps     m6, m2, m4
    vmulps     m7, m3, m5
    vaddps     m7, m7, m6
    vmulps     m2, m2, m5
    vmulps     m3, m3, m4
    vsubps     m3, m3, m2
    vblendps   m2, m7, m3, 0xf0
    vperm2f128 m3, m7, m3, 0x21
    vaddps     m4, m2, m3
    vsubps     m2, m3, m2
    vperm2f128 m2, m2, m2, 0x01
    vsubps     m3, m1, m2
    vaddps     m1, m1, m2
    vsubps     m5, m0, m4
    vaddps     m0, m0, m4
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    vextractf128   Z(2), m5, 0
    vextractf128  ZH(2), m3, 0
    vextractf128   Z(3), m5, 1
    vextractf128  ZH(3), m3, 1
    ret

align 16
fft32_avx:
    call fft16_avx

    mova m0, Z(4)
    mova m1, Z(5)

    T4_SSE      m0, m1, m4

    mova m2, Z(6)
    mova m3, Z(7)

    T8_SSE      m0, m1, m2, m3, m4, m6
    ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
    ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}

    vperm2f128  m4, m0, m2, 0x20
    vperm2f128  m5, m1, m3, 0x20
    vperm2f128  m6, m0, m2, 0x31
    vperm2f128  m7, m1, m3, 0x31

    PASS_SMALL 0, [cos_32], [cos_32+32]

    ret

fft32_interleave_avx:
    call fft32_avx
    mov r2d, 32
.deint_loop:
    mova     m2, Z(0)
    mova     m3, Z(1)
    vunpcklps      m0, m2, m3
    vunpckhps      m1, m2, m3
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    add r0, mmsize*2
    sub r2d, mmsize/4
    jg .deint_loop
    ret

INIT_XMM
%define movdqa  movaps

align 16
fft4_avx:
fft4_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova   Z(0), m0
    mova   Z(1), m1
    ret

align 16
fft8_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    ret

align 16
fft16_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     m4, Z(4)
    mova     m5, Z(5)
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    T4_SSE   m4, m5, m6
    mova     m6, Z2(6)
    mova     m7, Z2(7)
    T4_SSE   m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret


INIT_MMX

%macro FFT48_3DN 1
align 16
fft4%1:
    T2_3DN   m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DN   m0, m1, m2, m3, m4, m5
    PUNPCK   m0, m1, m4
    PUNPCK   m2, m3, m5
    mova   Z(0), m0
    mova   Z(1), m4
    mova   Z(2), m2
    mova   Z(3), m5
    ret

align 16
fft8%1:
    T2_3DN   m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DN   m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(2), m2
    T2_3DN   m4, m5,  Z(4),  Z(5)
    T2_3DN   m6, m7, Z2(6), Z2(7)
    pswapd   m0, m5
    pswapd   m2, m7
    pxor     m0, [ps_m1p1]
    pxor     m2, [ps_m1p1]
    pfsub    m5, m0
    pfadd    m7, m2
    pfmul    m5, [ps_root2]
    pfmul    m7, [ps_root2]
    T4_3DN   m1, m3, m5, m7, m0, m2
    mova   Z(5), m5
    mova  Z2(7), m7
    mova     m0, Z(0)
    mova     m2, Z(2)
    T4_3DN   m0, m2, m4, m6, m5, m7
    PUNPCK   m0, m1, m5
    PUNPCK   m2, m3, m7
    mova   Z(0), m0
    mova   Z(1), m5
    mova   Z(2), m2
    mova   Z(3), m7
    PUNPCK   m4,  Z(5), m5
    PUNPCK   m6, Z2(7), m7
    mova   Z(4), m4
    mova   Z(5), m5
    mova  Z2(6), m6
    mova  Z2(7), m7
    ret
%endmacro

FFT48_3DN _3dn2

%macro pswapd 2
%ifidn %1, %2
    movd [r0+12], %1
    punpckhdq %1, [r0+8]
%else
    movq  %1, %2
    psrlq %1, 32
    punpckldq %1, %2
%endif
%endmacro
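
; The macro above emulates the extended-3DNow! pswapd instruction (swap the
; two 32-bit halves of an MMX register) for the plain-3DNow! build that
; follows; when source and destination are the same register it bounces the
; low half through scratch memory at [r0+12].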

FFT48_3DN _3dn


%define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zq + o3q + mmsize*(x&1)]
%define ZH(x) [zq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
%define Z2H(x) [zq + o3q + mmsize*(x&1) + mmsize/2]

%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS z, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zq, mmsize*2
    add wq, mmsize
    sub nd, mmsize/8
    jg .loop
    rep ret
%endmacro
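
; DECL_PASS wraps a PASS_* payload in the generic driver loop: z walks the
; data, w walks the twiddle (cos) table and n counts down the remaining work,
; while o1q/o3q hold the byte strides used by the Z()/Z2() macros above.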

INIT_YMM

%macro INTERL_AVX 5
    vunpckhps      %3, %2, %1
    vunpcklps      %2, %2, %1
    vextractf128   %4(%5), %2, 0
    vextractf128  %4 %+ H(%5), %3, 0
    vextractf128   %4(%5 + 1), %2, 1
    vextractf128  %4 %+ H(%5 + 1), %3, 1
%endmacro

%define INTERL INTERL_AVX

DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0

INIT_XMM

%macro INTERL_SSE 5
    mova     %3, %2
    unpcklps %2, %1
    unpckhps %3, %1
    mova  %4(%5), %2
    mova  %4(%5+1), %3
%endmacro

%define INTERL INTERL_SSE

DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

INIT_MMX
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dn, PASS_BIG 0
%define pass_3dn2 pass_3dn
%define pass_interleave_3dn2 pass_interleave_3dn

%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif

%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH
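
; FFT_DISPATCH indexes dispatch_tab with (nbits - 2), so entry 0 is the
; 4-point transform; in PIC builds the table holds section-relative entries
; (SECTION_REL) that are rebased against $$ before the call.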

%macro DECL_FFT 2-3 ; nbits, cpu, suffix
%xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
%if %1>=5
%xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
%endif
%if %1>=6
%xdefine list_of_fft list_of_fft, fft32%3%2 SECTION_REL
%endif

%assign n 1<<%1
%rep 17-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL

align 16
fft %+ n %+ %3%2:
    call fft %+ n2 %+ %2
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ %2
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ %2
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [cos_ %+ n]
    mov r2d, n4/2
    jmp pass%3%2
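; (split-radix step: fft_n runs fft_{n/2} on the first half and fft_{n/4} on
;  each of the two remaining quarters, then tail-jumps into the pass, which
;  merges them using the cos_n twiddle table)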

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab%3%2: pointer list_of_fft

section .text

; On x86_32, this function does the register saving and restoring for all of fft.
; The others pass args in registers and don't spill anything.
cglobal fft_dispatch%3%2, 2,5,8, z, nbits
    FFT_DISPATCH %3%2, nbits
%ifidn %2, _avx
    vzeroupper
%endif
    RET
%endmacro ; DECL_FFT

DECL_FFT 6, _avx
DECL_FFT 6, _avx, _interleave
DECL_FFT 5, _sse
DECL_FFT 5, _sse, _interleave
DECL_FFT 4, _3dn
DECL_FFT 4, _3dn, _interleave
DECL_FFT 4, _3dn2
DECL_FFT 4, _3dn2, _interleave

INIT_XMM
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps

%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
    movaps   xmm0, [%3+%2*4]
    movaps   xmm1, [%3+%1*4-0x10]
    movaps   xmm2, xmm0
    shufps   xmm0, xmm1, 0x88
    shufps   xmm1, xmm2, 0x77
    movlps   xmm4, [%4+%2*2]
    movlps   xmm5, [%5+%2*2+0x0]
    movhps   xmm4, [%4+%1*2-0x8]
    movhps   xmm5, [%5+%1*2-0x8]
    movaps   xmm2, xmm0
    movaps   xmm3, xmm1
    mulps    xmm0, xmm5
    mulps    xmm1, xmm4
    mulps    xmm2, xmm4
    mulps    xmm3, xmm5
    subps    xmm1, xmm0
    addps    xmm2, xmm3
    movaps   xmm0, xmm1
    unpcklps xmm1, xmm2
    unpckhps xmm0, xmm2
%endmacro
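
; PREROTATER loads two groups of four samples placed symmetrically around the
; middle of the windowed input (offsets +2k and -2k from input+n4),
; de-interleaves them and multiplies by the tcos/tsin twiddles (the IMDCT
; pre-rotation); the .pre loop below then scatters the two result registers
; to the bit-reversed positions given by revtab.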

%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
    mulps      m6, %3, [%5+%1]
    mulps      m7, %2, [%5+%1]
    mulps      %2, %2, [%6+%1]
    mulps      %3, %3, [%6+%1]
    subps      %2, %2, m6
    addps      %3, %3, m7
%endmacro
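
; CMUL is a vectorized complex-style multiply against the twiddle tables,
; using m6/m7 as scratch; with c = [%5+%1] (tcos) and s = [%6+%1] (tsin) as
; passed by the POSROTATESHUF callers it computes
;   %2 <- %2*s - %3*c
;   %3 <- %3*s + %2*c   (using the original %2)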

%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    vmovaps      ymm1,   [%3+%1*2]
    vmovaps      ymm0,   [%3+%1*2+0x20]
    vmovaps      ymm3,   [%3+%2*2]
    vmovaps      ymm2,   [%3+%2*2+0x20]

    CMUL         %1, ymm0, ymm1, %3, %4, %5
    CMUL         %2, ymm2, ymm3, %3, %4, %5
    vshufps      ymm1, ymm1, ymm1, 0x1b
    vshufps      ymm3, ymm3, ymm3, 0x1b
    vperm2f128   ymm1, ymm1, ymm1, 0x01
    vperm2f128   ymm3, ymm3, ymm3, 0x01
    vunpcklps    ymm6, ymm2, ymm1
    vunpckhps    ymm4, ymm2, ymm1
    vunpcklps    ymm7, ymm0, ymm3
    vunpckhps    ymm5, ymm0, ymm3

    vextractf128 [%3+%1*2],      ymm7, 0
    vextractf128 [%3+%1*2+0x10], ymm5, 0
    vextractf128 [%3+%1*2+0x20], ymm7, 1
    vextractf128 [%3+%1*2+0x30], ymm5, 1

    vextractf128 [%3+%2*2],      ymm6, 0
    vextractf128 [%3+%2*2+0x10], ymm4, 0
    vextractf128 [%3+%2*2+0x20], ymm6, 1
    vextractf128 [%3+%2*2+0x30], ymm4, 1
    sub      %2,   0x20
    add      %1,   0x20
    jl       .post
%endmacro

%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    movaps   xmm1, [%3+%1*2]
    movaps   xmm0, [%3+%1*2+0x10]
    CMUL     %1,   xmm0, xmm1, %3, %4, %5
    movaps   xmm5, [%3+%2*2]
    movaps   xmm4, [%3+%2*2+0x10]
    CMUL     %2,   xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
    movaps   xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps   xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps   [%3+%2*2],      xmm6
    movaps   [%3+%2*2+0x10], xmm4
    movaps   [%3+%1*2],      xmm0
    movaps   [%3+%1*2+0x10], xmm2
    sub      %2,   0x10
    add      %1,   0x10
    jl       .post
%endmacro

%macro DECL_IMDCT 2
cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
%ifdef ARCH_X86_64
%define rrevtab r10
%define rtcos   r11
%define rtsin   r12
    push  r12
    push  r13
    push  r14
%else
%define rrevtab r6
%define rtsin   r6
%define rtcos   r5
%endif
    mov   r3d, [r0+FFTContext.mdctsize]
    add   r2, r3
    shr   r3, 1
    mov   rtcos, [r0+FFTContext.tcos]
    mov   rtsin, [r0+FFTContext.tsin]
    add   rtcos, r3
    add   rtsin, r3
%ifndef ARCH_X86_64
    push  rtcos
    push  rtsin
%endif
    shr   r3, 1
    mov   rrevtab, [r0+FFTContext.revtab]
    add   rrevtab, r3
%ifndef ARCH_X86_64
    push  rrevtab
%endif

    sub   r3, 4
%ifdef ARCH_X86_64
    xor   r4, r4
    sub   r4, r3
%endif
.pre:
%ifndef ARCH_X86_64
;unspill
    xor   r4, r4
    sub   r4, r3
    mov   rtsin, [esp+4]
    mov   rtcos, [esp+8]
%endif

    PREROTATER r4, r3, r2, rtcos, rtsin
%ifdef ARCH_X86_64
    movzx  r5,  word [rrevtab+r4-4]
    movzx  r6,  word [rrevtab+r4-2]
    movzx  r13, word [rrevtab+r3]
    movzx  r14, word [rrevtab+r3+2]
    movlps [r1+r5 *8], xmm0
    movhps [r1+r6 *8], xmm0
    movlps [r1+r13*8], xmm1
    movhps [r1+r14*8], xmm1
    add    r4, 4
%else
    mov    r6, [esp]
    movzx  r5, word [r6+r4-4]
    movzx  r4, word [r6+r4-2]
    movlps [r1+r5*8], xmm0
    movhps [r1+r4*8], xmm0
    movzx  r5, word [r6+r3]
    movzx  r4, word [r6+r3+2]
    movlps [r1+r5*8], xmm1
    movhps [r1+r4*8], xmm1
%endif
    sub    r3, 4
    jns    .pre

    mov  r5, r0
    mov  r6, r1
    mov  r0, r1
    mov  r1d, [r5+FFTContext.nbits]

    FFT_DISPATCH %1, r1

    mov  r0d, [r5+FFTContext.mdctsize]
    add  r6, r0
    shr  r0, 1
%ifndef ARCH_X86_64
%define rtcos r2
%define rtsin r3
    mov  rtcos, [esp+8]
    mov  rtsin, [esp+4]
%endif
    neg  r0
    mov  r1, -mmsize
    sub  r1, r0
    %2 r0, r1, r6, rtcos, rtsin
%ifdef ARCH_X86_64
    pop  r14
    pop  r13
    pop  r12
%else
    add esp, 12
%endif
%ifidn avx_enabled, 1
    vzeroupper
%endif
    RET
%endmacro

DECL_IMDCT _sse, POSROTATESHUF

INIT_YMM

DECL_IMDCT _avx, POSROTATESHUF_AVX