1 | 5d0ddd1a | Loren Merritt | ;****************************************************************************** |
2 | ;* FFT transform with SSE/3DNow optimizations |
3 | ;* Copyright (c) 2008 Loren Merritt |
4 | ;* |
5 | 1ee076b1 | Loren Merritt | ;* This algorithm (though not any of the implementation details) is |

6 | ;* based on libdjbfft by D. J. Bernstein. |
7 | ;* |
8 | 5d0ddd1a | Loren Merritt | ;* This file is part of FFmpeg. |

9 | ;* |
10 | ;* FFmpeg is free software; you can redistribute it and/or |
11 | ;* modify it under the terms of the GNU Lesser General Public |
12 | ;* License as published by the Free Software Foundation; either |
13 | ;* version 2.1 of the License, or (at your option) any later version. |
14 | ;* |
15 | ;* FFmpeg is distributed in the hope that it will be useful, |
16 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
18 | ;* Lesser General Public License for more details. |
19 | ;* |
20 | ;* You should have received a copy of the GNU Lesser General Public |
21 | ;* License along with FFmpeg; if not, write to the Free Software |
22 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
23 | ;****************************************************************************** |
25 | ; These functions are not individually interchangeable with the C versions. |
26 | ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results |
27 | ; in blocks as convenient to the vector size. |
28 | ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) |
30 | %include "x86inc.asm" |
; Read-only constants referenced by the butterfly macros and fft8 code below.
32 | SECTION_RODATA |
; 1/sqrt(2) as a decimal literal.
34 | %define M_SQRT1_2 0.70710678118654752440 |
; Four packed single-precision copies of 1/sqrt(2) (16 bytes).
35 | ps_root2: times 4 dd M_SQRT1_2 |
; 1/sqrt(2) with the sign pattern {-,+,+,-}; T8_SSE multiplies by it to scale
; and sign-flip individual lanes with a single mulps.
36 | ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 |
; Sign bit set in the first dword only, zero in the second: pxor with this
; flips the sign of the low float of an 8-byte pair and leaves the high one.
37 | ps_m1p1: dd 1<<31, 0 |
; Declare the externally defined tables cos_16, cos_32, ..., cos_65536 via
; cextern: i starts at 16 and is doubled on each of the 13 repetitions.
39 | %assign i 16 |
40 | %rep 13 |
41 | 2966cc18 | Jason Garrett-Glaser | cextern cos_ %+ i |

42 | 5d0ddd1a | Loren Merritt | %assign i i<<1 |

43 | %endrep |
; `pointer` is the data directive used for table entries holding addresses:
; a qword (dq) when ARCH_X86_64 is defined, a dword (dd) otherwise.
; It is used for the dispatch_tab entries emitted by DECL_FFT.
45 | %ifdef ARCH_X86_64 |
46 | %define pointer dq |
47 | %else |
48 | %define pointer dd |
49 | %endif |
; Conditional-emit helpers, invoked as `IF%1 <instruction>` so that a 0/1
; macro parameter decides whether the instruction is assembled.
; IF0 swallows its argument and emits nothing.
51 | %macro IF0 1+ |
52 | %endmacro |
; IF1 emits its argument unchanged.
53 | %macro IF1 1+ |
54 | %1 |
55 | %endmacro |
57 | section .text align=16 |
; Two-point butterfly on packed floats (3DNow!):
;   %1 = %3 + %4
;   %2 = %3 - %4
; %3 is read once (into %1) and %4 is read twice; both may be memory operands.
59 | %macro T2_3DN 4 ; z0, z1, mem0, mem1 |
60 | mova %1, %3 |
61 | mova %2, %1 |
62 | pfadd %1, %4 |
63 | pfsub %2, %4 |
64 | %endmacro |
; Four-point combine for the 3DNow! path; each register holds one packed
; pair of floats, labelled {re,im} by the comments below.
; in:  %1..%4 = z0..z3, %5/%6 = scratch
; out: %1={r0,i0} %2={r1,i1} %3={r2,i2} %4={r3,i3}
; {r2,i2} is computed in tmp1 (%6); the final SWAP (x86inc.asm) exchanges the
; names %3 and %6 so that callers find it under z2 -- presumably a pure
; assembly-time rename with no instruction emitted.
66 | %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1 |
67 | mova %5, %3 |
68 | pfsub %3, %4 |
69 | pfadd %5, %4 ; {t6,t5} |
70 | 2966cc18 | Jason Garrett-Glaser | pxor %3, [ps_m1p1] ; {t8,t7} |

; tmp1 must hold the original z0 before z0 is overwritten with {r0,i0}:
; {r2,i2} below is z0 - {t6,t5}. Without this copy %6 is read uninitialised.
71 | mova %6, %1 |
72 | pswapd %3, %3 |
73 | pfadd %1, %5 ; {r0,i0} |
74 | pfsub %6, %5 ; {r2,i2} |
75 | mova %4, %2 |
76 | pfadd %2, %3 ; {r1,i1} |
77 | pfsub %4, %3 ; {r3,i3} |
78 | SWAP %3, %6 |
79 | %endmacro |
; Four-point FFT on one XMM pair, converting from interleaved complex input
; to separate real / imaginary vectors. %3 is a scratch register.
81 | ; in: %1={r0,i0,r1,i1} %2={r2,i2,r3,i3} |
82 | ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} |
83 | %macro T4_SSE 3 |
84 | mova %3, %1 |
85 | shufps %1, %2, 0x64 ; {r0,i0,r3,i2} |
86 | shufps %3, %2, 0xce ; {r1,i1,r2,i3} |
87 | mova %2, %1 |
88 | addps %1, %3 ; {t1,t2,t6,t5} |
89 | subps %2, %3 ; {t3,t4,t8,t7} |
90 | mova %3, %1 |
91 | shufps %1, %2, 0x44 ; {t1,t2,t3,t4} |
92 | shufps %3, %2, 0xbe ; {t6,t5,t7,t8} |
93 | mova %2, %1 |
94 | addps %1, %3 ; {r0,i0,r1,i1} |
95 | subps %2, %3 ; {r2,i2,r3,i3} |
96 | mova %3, %1 |
97 | shufps %1, %2, 0x88 ; {r0,r1,r2,r3} |
98 | shufps %3, %2, 0xdd ; {i0,i1,i2,i3} |
; The imaginary vector was built in %3; SWAP (x86inc.asm) exchanges the names
; %2 and %3 so it is returned as %2 -- presumably without emitting a move.
99 | SWAP %2, %3 |
100 | %endmacro |
; Second half of an eight-point FFT.
; in:  %1/%2 = real/imaginary vectors of points 0..3 (as produced by T4_SSE),
;      %3={r4,i4,r5,i5} %4={r6,i6,r7,i7} (interleaved, as loaded from memory),
;      %5/%6 = scratch
; out: %1={r0..r3} %2={i0..i3} %3={r4..r7} %4={i4..i7}
102 | %macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1 |
103 | mova %5, %3 |
104 | shufps %3, %4, 0x44 ; {r4,i4,r6,i6} |
105 | shufps %5, %4, 0xee ; {r5,i5,r7,i7} |
106 | mova %6, %3 |
107 | subps %3, %5 ; {r5,i5,r7,i7} |
108 | addps %6, %5 ; {t1,t2,t3,t4} |
109 | mova %5, %3 |
110 | shufps %5, %5, 0xb1 ; {i5,r5,i7,r7} |
111 | 2966cc18 | Jason Garrett-Glaser | mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} |

||

; NOTE(review): original lines 112-113 are not present in this listing. As
; shown, the swapped copy in %5 (line 110) is overwritten by the next mova
; before any use, and the t7..ta values named below have no visible producer.
; Confirm the missing instructions against the upstream file before relying
; on this macro.
114 | mova %5, %6 |
115 | shufps %6, %3, 0x36 ; {t3,t2,t9,t8} |
116 | shufps %5, %3, 0x9c ; {t1,t4,t7,ta} |
117 | mova %3, %6 |
118 | addps %6, %5 ; {t1,t2,t9,ta} |
119 | subps %3, %5 ; {t6,t5,tc,tb} |
120 | mova %5, %6 |
121 | shufps %6, %3, 0xd8 ; {t1,t9,t5,tb} |
122 | shufps %5, %3, 0x8d ; {t2,ta,t6,tc} |
; Final butterflies: points 0..3 get the sums, points 4..7 the differences.
123 | mova %3, %1 |
124 | mova %4, %2 |
125 | addps %1, %6 ; {r0,r1,r2,r3} |
126 | addps %2, %5 ; {i0,i1,i2,i3} |
127 | subps %3, %6 ; {r4,r5,r6,r7} |
128 | subps %4, %5 ; {i4,i5,i6,i7} |
129 | %endmacro |
131 | ; scheduled for cpu-bound sizes |
; One combining pass over the eight vectors Z(0)..Z(7). Per the comments
; below, Z(0)/Z(1) hold the re/im parts that become r0/i0, Z(2)/Z(3) those of
; r1/i1, Z(4)/Z(5) those of r2/i2 and Z(6)/Z(7) those of r3/i3.
;   %1 = 1: load m4..m7 from Z(4)..Z(7) here
;        0: the caller already has those four vectors in m4..m7
;   %2 = operand supplying the real twiddle factors (wre)
;   %3 = operand supplying the imaginary twiddle factors (wim)
; Uses m0..m7 and writes all of Z(0)..Z(7).
132 | %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim |
133 | IF%1 mova m4, Z(4) |
134 | IF%1 mova m5, Z(5) |
135 | mova m0, %2 ; wre |
136 | mova m2, m4 |
137 | mova m1, %3 ; wim |
138 | mova m3, m5 |
; Complex multiplies of (m4,m5) and (m6,m7) by the twiddles, interleaved
; with the remaining loads.
139 | mulps m2, m0 ; r2*wre |
140 | IF%1 mova m6, Z(6) |
141 | mulps m3, m1 ; i2*wim |
142 | IF%1 mova m7, Z(7) |
143 | mulps m4, m1 ; r2*wim |
144 | mulps m5, m0 ; i2*wre |
145 | addps m2, m3 ; r2*wre + i2*wim |
146 | mova m3, m1 |
147 | mulps m1, m6 ; r3*wim |
148 | subps m5, m4 ; i2*wre - r2*wim |
149 | mova m4, m0 |
150 | mulps m3, m7 ; i3*wim |
151 | mulps m4, m6 ; r3*wre |
152 | mulps m0, m7 ; i3*wre |
153 | subps m4, m3 ; r3*wre - i3*wim |
154 | mova m3, Z(0) |
155 | addps m0, m1 ; i3*wre + r3*wim |
; Butterflies against Z(0)..Z(3); each sum/difference pair is stored back.
156 | mova m1, m4 |
157 | addps m4, m2 ; t5 |
158 | subps m1, m2 ; t3 |
159 | subps m3, m4 ; r2 |
160 | addps m4, Z(0) ; r0 |
161 | mova m6, Z(2) |
162 | mova Z(4), m3 |
163 | mova Z(0), m4 |
164 | mova m3, m5 |
165 | subps m5, m0 ; t4 |
166 | mova m4, m6 |
167 | subps m6, m5 ; r3 |
168 | addps m5, m4 ; r1 |
169 | mova Z(6), m6 |
170 | mova Z(2), m5 |
171 | mova m2, Z(3) |
172 | addps m3, m0 ; t6 |
173 | subps m2, m1 ; i3 |
174 | mova m7, Z(1) |
175 | addps m1, Z(3) ; i1 |
176 | mova Z(7), m2 |
177 | mova Z(3), m1 |
178 | mova m4, m7 |
179 | subps m7, m3 ; i2 |
180 | addps m3, m4 ; i0 |
181 | mova Z(5), m7 |
182 | mova Z(1), m3 |
183 | %endmacro |
185 | ; scheduled to avoid store->load aliasing |
; Same arithmetic as PASS_SMALL, but always loads Z(4)..Z(7) itself and takes
; the twiddles from [wq] (wre) and [wq+o1q] (wim).
;   %1 = 1: store the results as separate re/im vectors into Z(0)..Z(7)
;        0: skip the Z(1),Z(2),Z(3),Z(5),Z(6),Z(7) stores and instead zip each
;           re vector with its im vector (unpcklps/unpckhps) before storing,
;           so memory ends up holding interleaved pairs
; Z(0) and Z(4) are stored unconditionally and reloaded by the %1==0 path.
; Uses m0..m7.
186 | %macro PASS_BIG 1 ; (!interleave) |
187 | mova m4, Z(4) ; r2 |
188 | mova m5, Z(5) ; i2 |
189 | mova m2, m4 |
190 | mova m0, [wq] ; wre |
191 | mova m3, m5 |
192 | mova m1, [wq+o1q] ; wim |
193 | mulps m2, m0 ; r2*wre |
194 | mova m6, Z(6) ; r3 |
195 | mulps m3, m1 ; i2*wim |
196 | mova m7, Z(7) ; i3 |
197 | mulps m4, m1 ; r2*wim |
198 | mulps m5, m0 ; i2*wre |
199 | addps m2, m3 ; r2*wre + i2*wim |
200 | mova m3, m1 |
201 | mulps m1, m6 ; r3*wim |
202 | subps m5, m4 ; i2*wre - r2*wim |
203 | mova m4, m0 |
204 | mulps m3, m7 ; i3*wim |
205 | mulps m4, m6 ; r3*wre |
206 | mulps m0, m7 ; i3*wre |
207 | subps m4, m3 ; r3*wre - i3*wim |
208 | mova m3, Z(0) |
209 | addps m0, m1 ; i3*wre + r3*wim |
210 | mova m1, m4 |
211 | addps m4, m2 ; t5 |
212 | subps m1, m2 ; t3 |
213 | subps m3, m4 ; r2 |
214 | addps m4, Z(0) ; r0 |
215 | mova m6, Z(2) |
216 | mova Z(4), m3 |
217 | mova Z(0), m4 |
218 | mova m3, m5 |
219 | subps m5, m0 ; t4 |
220 | mova m4, m6 |
221 | subps m6, m5 ; r3 |
222 | addps m5, m4 ; r1 |
223 | IF%1 mova Z(6), m6 |
224 | IF%1 mova Z(2), m5 |
225 | mova m2, Z(3) |
226 | addps m3, m0 ; t6 |
227 | subps m2, m1 ; i3 |
228 | mova m7, Z(1) |
229 | addps m1, Z(3) ; i1 |
230 | IF%1 mova Z(7), m2 |
231 | IF%1 mova Z(3), m1 |
232 | mova m4, m7 |
233 | subps m7, m3 ; i2 |
234 | addps m3, m4 ; i0 |
235 | IF%1 mova Z(5), m7 |
236 | IF%1 mova Z(1), m3 |
; Interleaving tail: at this point m5=r1, m1=i1, m6=r3, m2=i3, m3=i0, m7=i2;
; r0 and r2 are reloaded from Z(0) and Z(4).
237 | %if %1==0 |
238 | mova m4, m5 ; r1 |
239 | mova m0, m6 ; r3 |
240 | unpcklps m5, m1 |
241 | unpckhps m4, m1 |
242 | unpcklps m6, m2 |
243 | unpckhps m0, m2 |
244 | mova m1, Z(0) |
245 | mova m2, Z(4) |
246 | mova Z(2), m5 |
247 | mova Z(3), m4 |
248 | mova Z(6), m6 |
249 | mova Z(7), m0 |
250 | mova m5, m1 ; r0 |
251 | mova m4, m2 ; r2 |
252 | unpcklps m1, m3 |
253 | unpckhps m5, m3 |
254 | unpcklps m2, m7 |
255 | unpckhps m4, m7 |
256 | mova Z(0), m1 |
257 | mova Z(1), m5 |
258 | mova Z(4), m2 |
259 | mova Z(5), m4 |
260 | %endif |
261 | %endmacro |
; Zip the dwords of %1 and %2 (%3 is an output register, %2 may be memory):
;   %1 = {low dword of %1,  low dword of %2}
;   %3 = {high dword of %1, high dword of %2}   (using the original %1)
263 | %macro PUNPCK 3 |
264 | mova %3, %1 |
265 | punpckldq %1, %2 |
266 | punpckhdq %3, %2 |
267 | %endmacro |
; Configuration for the SSE kernels that follow. INIT_XMM comes from
; x86inc.asm -- presumably it binds m0..m7 to XMM registers and sets mmsize.
269 | INIT_XMM |
; mova = aligned packed-single move for this section.
270 | 45213083 | Loren Merritt | %define mova movaps |

; Z(x) = the x-th mmsize-byte vector of the buffer pointed to by r0.
272 | %define Z(x) [r0+mmsize*x] |
; fft4_sse: in-place four-point FFT on the two vectors at r0.
; In:  r0 -> {r0,i0,r1,i1},{r2,i2,r3,i3}
; Out: r0 -> {r0,r1,r2,r3},{i0,i1,i2,i3}   (layout per T4_SSE)
; Uses m0..m2.
274 | align 16 |
275 | fft4_sse: |
276 | mova m0, Z(0) |
277 | mova m1, Z(1) |
278 | T4_SSE m0, m1, m2 |
279 | mova Z(0), m0 |
280 | mova Z(1), m1 |
281 | ret |
; fft8_sse: in-place eight-point FFT on the four vectors at r0.
; T4_SSE transforms the first two vectors; T8_SSE then folds in the raw
; contents of Z(2)/Z(3). Results are written back to Z(0)..Z(3) as
; {r0..r3},{i0..i3},{r4..r7},{i4..i7} (layout per T8_SSE). Uses m0..m5.
283 | align 16 |
284 | fft8_sse: |
285 | mova m0, Z(0) |
286 | mova m1, Z(1) |
287 | T4_SSE m0, m1, m2 |
; m2 was only scratch for T4_SSE; it is reused here for the next input vector.
288 | mova m2, Z(2) |
289 | mova m3, Z(3) |
290 | T8_SSE m0, m1, m2, m3, m4, m5 |
291 | mova Z(0), m0 |
292 | mova Z(1), m1 |
293 | mova Z(2), m2 |
294 | mova Z(3), m3 |
295 | ret |
; fft16_sse: sixteen-point FFT on the eight vectors at r0.
; Z(0)..Z(3) get an eight-point transform (T4_SSE + T8_SSE), Z(4)/Z(5) and
; Z(6)/Z(7) each get a four-point transform kept in m4..m7, and PASS_SMALL
; combines everything using the cos_16 table.
; NOTE(review): the instructions following PASS_SMALL (including the return)
; are not present in this listing -- confirm against the upstream file.
297 | align 16 |
298 | fft16_sse: |
299 | mova m0, Z(0) |
300 | mova m1, Z(1) |
301 | T4_SSE m0, m1, m2 |
302 | mova m2, Z(2) |
303 | mova m3, Z(3) |
304 | T8_SSE m0, m1, m2, m3, m4, m5 |
; The loads for the next transform are issued before the stores of this one.
305 | mova m4, Z(4) |
306 | mova m5, Z(5) |
307 | mova Z(0), m0 |
308 | mova Z(1), m1 |
309 | mova Z(2), m2 |
310 | mova Z(3), m3 |
311 | T4_SSE m4, m5, m6 |
312 | mova m6, Z(6) |
313 | mova m7, Z(7) |
314 | T4_SSE m6, m7, m0 |
; First argument 0: m4..m7 are already in registers, so PASS_SMALL must not
; reload them. wre = first 16 bytes of cos_16, wim = the following 16 bytes.
315 | 2966cc18 | Jason Garrett-Glaser | PASS_SMALL 0, [cos_16], [cos_16+16] |

||

; Emit the 3DNow! four- and eight-point kernels fft4%1 and fft8%1.
;   %1 = suffix appended to the function names (e.g. _3dn, _3dn2)
; Both operate in place on the buffer at r0 through Z(x). The pswapd used
; here (directly and inside T4_3DN) is whatever `pswapd` means at the point
; of instantiation: an instruction, or the emulation macro once defined.
321 | %macro FFT48_3DN 1 |
322 | align 16 |
323 | fft4%1: |
324 | T2_3DN m0, m1, Z(0), Z(1) |
325 | mova m2, Z(2) |
326 | mova m3, Z(3) |
327 | T4_3DN m0, m1, m2, m3, m4, m5 |
; Zip the {re,im} pairs so each stored vector holds two reals or two
; imaginaries: Z(0)={r0,r1} Z(1)={i0,i1} Z(2)={r2,r3} Z(3)={i2,i3}.
328 | PUNPCK m0, m1, m4 |
329 | PUNPCK m2, m3, m5 |
330 | mova Z(0), m0 |
331 | mova Z(1), m4 |
332 | mova Z(2), m2 |
333 | mova Z(3), m5 |
334 | ret |
336 | align 16 |
337 | fft8%1: |
338 | T2_3DN m0, m1, Z(0), Z(1) |
339 | mova m2, Z(2) |
340 | mova m3, Z(3) |
341 | T4_3DN m0, m1, m2, m3, m4, m5 |
; Park two of the four-point results in memory to free m0/m2 as temporaries;
; they are reloaded further down.
342 | mova Z(0), m0 |
343 | mova Z(2), m2 |
344 | T2_3DN m4, m5, Z(4), Z(5) |
345 | T2_3DN m6, m7, Z(6), Z(7) |
346 | pswapd m0, m5 |
347 | pswapd m2, m7 |
348 | 2966cc18 | Jason Garrett-Glaser | pxor m0, [ps_m1p1] |

||

351 | pfadd m7, m2 |
352 | 2966cc18 | Jason Garrett-Glaser | pfmul m5, [ps_root2] |

||

355 | mova Z(5), m5 |
356 | mova Z(7), m7 |
357 | mova m0, Z(0) |
358 | mova m2, Z(2) |
359 | T4_3DN m0, m2, m4, m6, m5, m7 |
; Same re/im zipping as in fft4; Z(5) and Z(7) are zipped straight from memory.
360 | PUNPCK m0, m1, m5 |
361 | PUNPCK m2, m3, m7 |
362 | mova Z(0), m0 |
363 | mova Z(1), m5 |
364 | mova Z(2), m2 |
365 | mova Z(3), m7 |
366 | PUNPCK m4, Z(5), m5 |
367 | PUNPCK m6, Z(7), m7 |
368 | mova Z(4), m4 |
369 | mova Z(5), m5 |
370 | mova Z(6), m6 |
371 | mova Z(7), m7 |
372 | ret |
373 | %endmacro |
; First instantiation: no pswapd macro exists yet, so every `pswapd` inside
; FFT48_3DN assembles as the instruction itself.
375 | FFT48_3DN _3dn2 |
; Emulation of pswapd (swap the two dwords of a 64-bit register) for the
; second instantiation below.
;   %1 = destination, %2 = source
; Same-register case: bounces the low dword through memory. It writes
; 4 bytes at [r0+12] and then reads the qword at [r0+8], so that part of the
; buffer at r0 is clobbered.
; Distinct-register case: stays in registers and leaves %2 unchanged.
377 | %macro pswapd 2 |
378 | %ifidn %1, %2 |
379 | movd [r0+12], %1 |
380 | punpckhdq %1, [r0+8] |
381 | %else |
382 | movq %1, %2 |
383 | psrlq %1, 32 |
384 | punpckldq %1, %2 |
385 | %endif |
386 | %endmacro |
; Second instantiation: `pswapd` now expands to the macro above.
388 | FFT48_3DN _3dn |
; Z(x) for the pass routines, x = 0..7: vectors are addressed relative to zq
; with strides held in o1q and o3q instead of being contiguous.
;   x = 0..5: zq + o1q*(x&6) + mmsize*(x&1)   (x/6 is 0, so the o3q term drops)
;   x = 6,7 : zq + o3q       + mmsize*(x&1)   (x/6 is 1, so the o1q term drops)
391 | %define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)] |
; Define a pass routine named %1 whose loop body is the payload %2.
; Register roles come from DEFINE_ARGS (x86inc.asm): z, w, n are inputs and
; o1, o3 are computed here -- presumably mapped onto r0..r4 in that order.
;   o1q = n*8, o3q = n*3 << 4 = n*48   (the strides used by Z(x))
; Each iteration runs the payload, then advances zq by two vectors and wq by
; one, and subtracts mmsize/8 from n; the loop repeats while n stays > 0.
393 | %macro DECL_PASS 2+ ; name, payload |
394 | align 16 |
395 | %1: |
396 | DEFINE_ARGS z, w, n, o1, o3 |
397 | lea o3q, [nq*3] |
398 | lea o1q, [nq*8] |
399 | shl o3q, 4 |
400 | .loop: |
401 | %2 |
402 | add zq, mmsize*2 |
403 | add wq, mmsize |
404 | sub nd, mmsize/8 |
405 | jg .loop |
; Prefixed return placed directly after the conditional branch -- presumably
; chosen for branch-prediction reasons on some CPUs; it behaves as a plain ret.
406 | rep ret |
407 | %endmacro |
; Instantiate the pass routines for each instruction set.
; NOTE(review): DECL_FFT 5, _sse jumps to `pass_sse`, but no DECL_PASS for
; that name is visible in this listing (original line 411 is absent) --
; confirm against the upstream file.
409 | INIT_XMM |
410 | 45213083 | Loren Merritt | %define mova movaps |

412 | DECL_PASS pass_interleave_sse, PASS_BIG 0 |
; MMX/3DNow! variants: the SSE mnemonics written inside PASS_SMALL / PASS_BIG
; are remapped to their 3DNow!/MMX counterparts so the same macro bodies
; assemble for MMX registers.
414 | INIT_MMX |
415 | %define mulps pfmul |
416 | %define addps pfadd |
417 | %define subps pfsub |
418 | %define unpcklps punpckldq |
419 | %define unpckhps punpckhdq |
420 | DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q] |
421 | DECL_PASS pass_interleave_3dn, PASS_BIG 0 |
; The _3dn2 functions reuse the _3dn pass routines.
422 | %define pass_3dn2 pass_3dn |
423 | %define pass_interleave_3dn2 pass_interleave_3dn |
425 | 3d05c1fb | Reimar Döffinger | %ifdef PIC |

||

||

||

||

; Generate the larger FFT functions, a dispatch table and the public
; fft_dispatch entry point for one cpu/suffix combination.
;   %1 = nbits: the %rep loop starts at n = 1<<nbits and doubles n for
;        17-nbits iterations
;   %2 = cpu suffix (e.g. _sse), %3 = optional extra suffix (e.g. _interleave)
; NOTE(review): several original lines of this macro (including its
; %endmacro and the %ifdef matching the %endif below) are not present in this
; listing; the comments here describe only what is visible.
431 | %macro DECL_FFT 2-3 ; nbits, cpu, suffix |
; list_of_fft accumulates the comma-separated entries of the dispatch table.
432 | 3d05c1fb | Reimar Döffinger | %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL |

434 | 3d05c1fb | Reimar Döffinger | %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL |

437 | %assign n 1<<%1 |
438 | %rep 17-%1 |
439 | %assign n2 n/2 |
440 | %assign n4 n/4 |
441 | 3d05c1fb | Reimar Döffinger | %xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL |

; fft<n>: one half-size and two quarter-size sub-transforms (always the
; variants without the %3 suffix), then a tail jump into the pass routine
; with r1 pointing at cos_<n>.
; The add/sub constants reposition r0 between the calls. The
; (... & (-2<<nbits)) terms are zero for the smallest generated sizes --
; presumably they compensate for r0 having been advanced by a callee's pass
; loop; confirm.
; NOTE(review): the count register read by the pass loop (n in DECL_PASS) is
; not set in the visible lines before the jmp.
443 | align 16 |
444 | fft %+ n %+ %3%2: |
445 | call fft %+ n2 %+ %2 |
446 | add r0, n*4 - (n&(-2<<%1)) |
447 | call fft %+ n4 %+ %2 |
448 | add r0, n*2 - (n2&(-2<<%1)) |
449 | call fft %+ n4 %+ %2 |
450 | sub r0, n*6 + (n2&(-2<<%1)) |
451 | 2966cc18 | Jason Garrett-Glaser | lea r1, [cos_ %+ n] |

453 | jmp pass%3%2 |
455 | %assign n n*2 |
456 | %endrep |
457 | %undef n |
; Table of function addresses, one `pointer`-sized entry per list element.
459 | align 8 |
460 | dispatch_tab%3%2: pointer list_of_fft |
461 | |||

463 | |||

465 | ; The others pass args in registers and don't spill anything. |
466 | 3f87f39c | John Adcock | cglobal fft_dispatch%3%2, 2,5,8, z, nbits |

; Fetch entry (nbits-2) from the table whose base is in r2 (the load of that
; base is not visible here), then call it.
468 | 5d0ddd1a | Loren Merritt | mov r2, [r2 + (nbitsq-2)*gprsize] |

470 | 2966cc18 | Jason Garrett-Glaser | lea r3, [$$] |

472 | %endif |
473 | 5d0ddd1a | Loren Merritt | call r2 |
||

; Instantiate the generated FFTs. The first argument is nbits, so generation
; starts at 1<<5 = 32 points for SSE and at 1<<4 = 16 points for the 3DNow!
; variants. Each cpu gets a plain and an _interleave set.
477 | DECL_FFT 5, _sse |
478 | DECL_FFT 5, _sse, _interleave |
479 | DECL_FFT 4, _3dn |
480 | DECL_FFT 4, _3dn, _interleave |
481 | DECL_FFT 4, _3dn2 |
482 | DECL_FFT 4, _3dn2, _interleave |