ffmpeg / libavcodec / x86 / fft_mmx.asm @ 1ee076b1

;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
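; e.g. with SSE an intermediate block is kept as {r0,r1,r2,r3} {i0,i1,i2,i3}
; (one xmm register each) instead of the C-side {r0,i0},{r1,i1},... layout;
; the pass_interleave_* variants below restore the interleaved order at the end.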

%include "x86inc.asm"

SECTION_RODATA

%define M_SQRT1_2 0.70710678118654752440
ps_root2: times 4 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_m1p1: dd 1<<31, 0
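; ps_root2mppm = sqrt(1/2) * {-1,+1,+1,-1}, the mixed-sign twiddle used in
; T8_SSE; ps_m1p1 is a sign-bit mask ({1<<31, 0}), so pxor with it negates
; only the low single of a packed pair.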

%assign i 16
%rep 13
cextern cos_ %+ i
%assign i i<<1
%endrep
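; imports the external twiddle tables cos_16, cos_32, ..., cos_65536
; (i doubles 13 times starting from 16); the tables themselves are defined
; elsewhere in libavcodec.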

%ifdef ARCH_X86_64
    %define pointer dq
%else
    %define pointer dd
%endif
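; "pointer" emits a native-width pointer (dq on x86_64, dd elsewhere), used
; for the dispatch_tab entries below.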

%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro
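; IF0 discards its argument and IF1 emits it verbatim, so a line written as
; "IF%1 insn" inside a macro assembles insn only when that parameter is 1.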

section .text align=16

%macro T2_3DN 4 ; z0, z1, mem0, mem1
    mova     %1, %3
    mova     %2, %1
    pfadd    %1, %4
    pfsub    %2, %4
%endmacro
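; T2_3DN is a plain 2-point butterfly on packed {re,im} pairs:
; %1 = %3 + %4 and %2 = %3 - %4.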

%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova     %5, %3
    pfsub    %3, %4
    pfadd    %5, %4 ; {t6,t5}
    pxor     %3, [ps_m1p1] ; {t8,t7}
    mova     %6, %1
    pswapd   %3, %3
    pfadd    %1, %5 ; {r0,i0}
    pfsub    %6, %5 ; {r2,i2}
    mova     %4, %2
    pfadd    %2, %3 ; {r1,i1}
    pfsub    %4, %3 ; {r3,i3}
    SWAP     %3, %6
%endmacro
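; T4_3DN is a 4-point transform on {re,im} mmx pairs; the multiply by +/-i
; costs no arithmetic: pxor with ps_m1p1 flips one sign and pswapd swaps the
; real and imaginary halves.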

; in:  %1={r0,i0,r1,i1} %2={r2,i2,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
%macro T4_SSE 3
    mova     %3, %1
    shufps   %1, %2, 0x64 ; {r0,i0,r3,i2}
    shufps   %3, %2, 0xce ; {r1,i1,r2,i3}
    mova     %2, %1
    addps    %1, %3       ; {t1,t2,t6,t5}
    subps    %2, %3       ; {t3,t4,t8,t7}
    mova     %3, %1
    shufps   %1, %2, 0x44 ; {t1,t2,t3,t4}
    shufps   %3, %2, 0xbe ; {t6,t5,t7,t8}
    mova     %2, %1
    addps    %1, %3       ; {r0,i0,r1,i1}
    subps    %2, %3       ; {r2,i2,r3,i3}
    mova     %3, %1
    shufps   %1, %2, 0x88 ; {r0,r1,r2,r3}
    shufps   %3, %2, 0xdd ; {i0,i1,i2,i3}
    SWAP     %2, %3
%endmacro
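; T4_SSE runs two radix-2 stages back to back: the shuffles between the
; add/sub pairs reorder the t values so the second stage combines real terms
; with imaginary differences (the +/-i twiddle), and the final shufps pair
; deinterleaves the result into planar form.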

%macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1
    mova     %5, %3
    shufps   %3, %4, 0x44 ; {r4,i4,r6,i6}
    shufps   %5, %4, 0xee ; {r5,i5,r7,i7}
    mova     %6, %3
    subps    %3, %5       ; {r5,i5,r7,i7}
    addps    %6, %5       ; {t1,t2,t3,t4}
    mova     %5, %3
    shufps   %5, %5, 0xb1 ; {i5,r5,i7,r7}
    mulps    %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps    %5, [ps_root2]
    addps    %3, %5       ; {t8,t7,ta,t9}
    mova     %5, %6
    shufps   %6, %3, 0x36 ; {t3,t2,t9,t8}
    shufps   %5, %3, 0x9c ; {t1,t4,t7,ta}
    mova     %3, %6
    addps    %6, %5       ; {t1,t2,t9,ta}
    subps    %3, %5       ; {t6,t5,tc,tb}
    mova     %5, %6
    shufps   %6, %3, 0xd8 ; {t1,t9,t5,tb}
    shufps   %5, %3, 0x8d ; {t2,ta,t6,tc}
    mova     %3, %1
    mova     %4, %2
    addps    %1, %6       ; {r0,r1,r2,r3}
    addps    %2, %5       ; {i0,i1,i2,i3}
    subps    %3, %6       ; {r4,r5,r6,r7}
    subps    %4, %5       ; {i4,i5,i6,i7}
%endmacro
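; T8_SSE extends a 4-point result to 8 points: the odd-indexed points are
; rotated by the 45-degree sqrt(1/2) twiddles via ps_root2/ps_root2mppm, then
; added to and subtracted from the lower half held in %1/%2.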

; scheduled for cpu-bound sizes
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova    m4, Z(4)
IF%1 mova    m5, Z(5)
    mova     m0, %2 ; wre
    mova     m2, m4
    mova     m1, %3 ; wim
    mova     m3, m5
    mulps    m2, m0 ; r2*wre
IF%1 mova    m6, Z(6)
    mulps    m3, m1 ; i2*wim
IF%1 mova    m7, Z(7)
    mulps    m4, m1 ; r2*wim
    mulps    m5, m0 ; i2*wre
    addps    m2, m3 ; r2*wre + i2*wim
    mova     m3, m1
    mulps    m1, m6 ; r3*wim
    subps    m5, m4 ; i2*wre - r2*wim
    mova     m4, m0
    mulps    m3, m7 ; i3*wim
    mulps    m4, m6 ; r3*wre
    mulps    m0, m7 ; i3*wre
    subps    m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m1 ; i3*wre + r3*wim
    mova     m1, m4
    addps    m4, m2 ; t5
    subps    m1, m2 ; t3
    subps    m3, m4 ; r2
    addps    m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    mova     m3, m5
    subps    m5, m0 ; t4
    mova     m4, m6
    subps    m6, m5 ; r3
    addps    m5, m4 ; r1
    mova   Z(6), m6
    mova   Z(2), m5
    mova     m2, Z(3)
    addps    m3, m0 ; t6
    subps    m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, Z(3) ; i1
    mova   Z(7), m2
    mova   Z(3), m1
    mova     m4, m7
    subps    m7, m3 ; i2
    addps    m3, m4 ; i0
    mova   Z(5), m7
    mova   Z(1), m3
%endmacro
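; PASS_SMALL is one combining pass: the two quarter-transform blocks Z(4..7)
; are rotated by the twiddles (wre, wim) and then summed/differenced against
; the half-size results in Z(0..3), all in place. Loading m4-m7 is optional
; (parameter %1) because the small-size callers already have them in registers.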

; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
    mova     m4, Z(4) ; r2
    mova     m5, Z(5) ; i2
    mova     m2, m4
    mova     m0, [wq] ; wre
    mova     m3, m5
    mova     m1, [wq+o1q] ; wim
    mulps    m2, m0 ; r2*wre
    mova     m6, Z(6) ; r3
    mulps    m3, m1 ; i2*wim
    mova     m7, Z(7) ; i3
    mulps    m4, m1 ; r2*wim
    mulps    m5, m0 ; i2*wre
    addps    m2, m3 ; r2*wre + i2*wim
    mova     m3, m1
    mulps    m1, m6 ; r3*wim
    subps    m5, m4 ; i2*wre - r2*wim
    mova     m4, m0
    mulps    m3, m7 ; i3*wim
    mulps    m4, m6 ; r3*wre
    mulps    m0, m7 ; i3*wre
    subps    m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m1 ; i3*wre + r3*wim
    mova     m1, m4
    addps    m4, m2 ; t5
    subps    m1, m2 ; t3
    subps    m3, m4 ; r2
    addps    m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    mova     m3, m5
    subps    m5, m0 ; t4
    mova     m4, m6
    subps    m6, m5 ; r3
    addps    m5, m4 ; r1
IF%1 mova  Z(6), m6
IF%1 mova  Z(2), m5
    mova     m2, Z(3)
    addps    m3, m0 ; t6
    subps    m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, Z(3) ; i1
IF%1 mova  Z(7), m2
IF%1 mova  Z(3), m1
    mova     m4, m7
    subps    m7, m3 ; i2
    addps    m3, m4 ; i0
IF%1 mova  Z(5), m7
IF%1 mova  Z(1), m3
%if %1==0
    mova     m4, m5 ; r1
    mova     m0, m6 ; r3
    unpcklps m5, m1
    unpckhps m4, m1
    unpcklps m6, m2
    unpckhps m0, m2
    mova     m1, Z(0)
    mova     m2, Z(4)
    mova   Z(2), m5
    mova   Z(3), m4
    mova   Z(6), m6
    mova   Z(7), m0
    mova     m5, m1 ; r0
    mova     m4, m2 ; r2
    unpcklps m1, m3
    unpckhps m5, m3
    unpcklps m2, m7
    unpckhps m4, m7
    mova   Z(0), m1
    mova   Z(1), m5
    mova   Z(4), m2
    mova   Z(5), m4
%endif
%endmacro
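; With %1==0 (the final, interleaving pass) the planar real/imaginary blocks
; are merged back into {re,im} pairs via unpcklps/unpckhps before the stores,
; so the output is in FFTComplex order again.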

%macro PUNPCK 3
    mova      %3, %1
    punpckldq %1, %2
    punpckhdq %3, %2
%endmacro

INIT_XMM
%define mova movaps

%define Z(x) [r0+mmsize*x]

align 16
fft4_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova   Z(0), m0
    mova   Z(1), m1
    ret

align 16
fft8_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    ret

align 16
fft16_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     m4, Z(4)
    mova     m5, Z(5)
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    T4_SSE   m4, m5, m6
    mova     m6, Z(6)
    mova     m7, Z(7)
    T4_SSE   m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret


INIT_MMX

%macro FFT48_3DN 1
align 16
fft4%1:
    T2_3DN   m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DN   m0, m1, m2, m3, m4, m5
    PUNPCK   m0, m1, m4
    PUNPCK   m2, m3, m5
    mova   Z(0), m0
    mova   Z(1), m4
    mova   Z(2), m2
    mova   Z(3), m5
    ret

align 16
fft8%1:
    T2_3DN   m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DN   m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(2), m2
    T2_3DN   m4, m5, Z(4), Z(5)
    T2_3DN   m6, m7, Z(6), Z(7)
    pswapd   m0, m5
    pswapd   m2, m7
    pxor     m0, [ps_m1p1]
    pxor     m2, [ps_m1p1]
    pfsub    m5, m0
    pfadd    m7, m2
    pfmul    m5, [ps_root2]
    pfmul    m7, [ps_root2]
    T4_3DN   m1, m3, m5, m7, m0, m2
    mova   Z(5), m5
    mova   Z(7), m7
    mova     m0, Z(0)
    mova     m2, Z(2)
    T4_3DN   m0, m2, m4, m6, m5, m7
    PUNPCK   m0, m1, m5
    PUNPCK   m2, m3, m7
    mova   Z(0), m0
    mova   Z(1), m5
    mova   Z(2), m2
    mova   Z(3), m7
    PUNPCK   m4, Z(5), m5
    PUNPCK   m6, Z(7), m7
    mova   Z(4), m4
    mova   Z(5), m5
    mova   Z(6), m6
    mova   Z(7), m7
    ret
%endmacro

FFT48_3DN _3dn2

%macro pswapd 2
%ifidn %1, %2
    movd [r0+12], %1
    punpckhdq %1, [r0+8]
%else
    movq  %1, %2
    psrlq %1, 32
    punpckldq %1, %2
%endif
%endmacro
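; pswapd (swap the two dwords of an mmx register) only exists on Extended
; 3DNow! CPUs, so this macro emulates it before FFT48_3DN is reassembled for
; plain 3DNow!. The in-place case spills the low dword to [r0+12] and pulls it
; back with punpckhdq, relying on r0 pointing at FFT data already in registers.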

FFT48_3DN _3dn


%define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)]
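; For x in 0..5 this resolves to zq + o1q*(x&~1) + mmsize*(x&1); blocks 6 and
; 7 sit at offset 6*o1q, which x86 addressing can't scale to directly, so they
; go through o3q (set to 6*o1q in DECL_PASS below). mmsize*(x&1) selects the
; imaginary half of each block pair.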

%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS z, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zq, mmsize*2
    add wq, mmsize
    sub nd, mmsize/8
    jg .loop
    rep ret
%endmacro
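; DECL_PASS args: z = data, w = twiddle pointer, n = remaining count; o1q/o3q
; are the block strides consumed by the Z() macro (o3q = 6*o1q). Because the
; loop advances zq, a transform that tail-jumps into a pass returns with its
; data pointer moved; DECL_FFT below accounts for that. The "rep ret" is the
; usual idiom to dodge AMD's branch-misprediction penalty on a bare ret that
; directly follows a conditional jump.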

INIT_XMM
%define mova movaps
DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

INIT_MMX
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dn, PASS_BIG 0
%define pass_3dn2 pass_3dn
%define pass_interleave_3dn2 pass_interleave_3dn
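; Remapping the SSE mnemonics to their 3DNow!/MMX counterparts lets the same
; PASS_SMALL/PASS_BIG macro bodies assemble into the mmx-register passes.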

%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif
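; In PIC builds the dispatch tables store section-relative entries ("- $$");
; fft_dispatch adds the section base back at runtime before calling through.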

%macro DECL_FFT 2-3 ; nbits, cpu, suffix
%xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
%if %1==5
%xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
%endif

%assign n 1<<%1
%rep 17-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL
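
; Each fftN below is one split-radix step: a half-size FFT on the first half
; of the buffer, two quarter-size FFTs on the remaining quarters, then a
; combining pass over the cos_n twiddles. The masked (n&(-2<<%1)) terms undo
; the pointer advance left behind by non-leaf sub-FFTs (see DECL_PASS above);
; the fft4/8/16 leaves return with r0 untouched.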
align 16
fft %+ n %+ %3%2:
    call fft %+ n2 %+ %2
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ %2
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ %2
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [cos_ %+ n]
    mov r2d, n4/2
    jmp pass%3%2

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab%3%2: pointer list_of_fft

section .text

; On x86_32, this function does the register saving and restoring for all of fft.
; The others pass args in registers and don't spill anything.
cglobal fft_dispatch%3%2, 2,5,8, z, nbits
    lea r2, [dispatch_tab%3%2]
    mov r2, [r2 + (nbitsq-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
    RET
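; The table is indexed with nbits-2 because its first entry is the 4-point
; (1<<2) transform; in PIC builds the stored entry is section-relative, so the
; section base ($$) is added back before the indirect call.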
%endmacro ; DECL_FFT

DECL_FFT 5, _sse
DECL_FFT 5, _sse, _interleave
DECL_FFT 4, _3dn
DECL_FFT 4, _3dn, _interleave
DECL_FFT 4, _3dn2
DECL_FFT 4, _3dn2, _interleave