/*
 * FFT transform with Altivec optimizations
 * Copyright (c) 2009 Loren Merritt
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * These functions are not individually interchangeable with the C versions.
 * While C takes arrays of FFTComplex, Altivec leaves intermediate results
 * in blocks as convenient to the vector size,
 * i.e. {4x real, 4x imaginary, 4x real, ...}
 *
 * I ignore the standard calling convention.
 * Instead, the following registers are treated as global constants:
 * v14: zero
 * v15..v18: cosines
 * v19..v29: permutations
 * r9: 16
 * r12: ff_cos_tabs
 * and the rest are free for local use.
 */
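
/*
 * A minimal C sketch of the blocked layout described above (the type and
 * helper name are illustrative, not FFmpeg API): four reals followed by
 * four imaginaries per group of four complexes.
 *
 *     typedef struct { float re, im; } FFTComplex;
 *
 *     // Repack interleaved complexes {r0,i0,r1,i1,...} into blocks of
 *     // {r0,r1,r2,r3, i0,i1,i2,i3, ...}; n must be a multiple of 4.
 *     static void interleaved_to_blocked(float *dst, const FFTComplex *src,
 *                                        unsigned n)
 *     {
 *         for (unsigned i = 0; i < n; i += 4)
 *             for (unsigned j = 0; j < 4; j++) {
 *                 dst[2*i + j]     = src[i + j].re;
 *                 dst[2*i + 4 + j] = src[i + j].im;
 *             }
 *     }
 */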

#include "config.h"
#include "asm.S"

.text

.macro addi2 ra, imm // add 32-bit immediate
.if \imm & 0xffff
    addi \ra, \ra, \imm@l
.endif
.if (\imm+0x8000)>>16
    addis \ra, \ra, \imm@ha
.endif
.endm
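
/*
 * What addi2 computes, as a C sketch: addi sign-extends its 16-bit operand,
 * so the high half (@ha) is pre-adjusted by +0x8000 to compensate. The
 * function and variable names are mine, for illustration.
 *
 *     #include <stdint.h>
 *
 *     static void split_imm(int32_t imm, int16_t *lo, int16_t *hi)
 *     {
 *         *lo = (int16_t)(imm & 0xffff);                    // @l (sign-extends)
 *         *hi = (int16_t)(((uint32_t)imm + 0x8000) >> 16);  // @ha
 *         // ((int32_t)*hi << 16) + *lo == imm, modulo 2^32
 *     }
 */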

.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
    vperm   \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm   \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vaddfp  \a0,\a2,\a3                         // {t1,t2,t6,t5}
    vsubfp  \a1,\a2,\a3                         // {t3,t4,t8,t7}
    vmrghw  \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm   \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vaddfp  \a0,\a2,\a3                         // {r0,r1,i0,i1}
    vsubfp  \a1,\a2,\a3                         // {r2,r3,i2,i3}
    vperm   \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm   \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
.endm
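
/*
 * Scalar counterpart of FFT4, a sketch in the style of FFmpeg's C fft
 * template (the t-numbering matches the vector comments above):
 *
 *     typedef struct { float re, im; } FFTComplex;
 *     #define BF(x, y, a, b) do { (x) = (a) - (b); (y) = (a) + (b); } while (0)
 *
 *     static void fft4_scalar(FFTComplex *z)
 *     {
 *         float t1, t2, t3, t4, t5, t6, t7, t8;
 *         BF(t3, t1, z[0].re, z[1].re); // t1 = r0+r1, t3 = r0-r1
 *         BF(t8, t6, z[3].re, z[2].re); // t6 = r3+r2, t8 = r3-r2
 *         BF(t4, t2, z[0].im, z[1].im); // t2 = i0+i1, t4 = i0-i1
 *         BF(t7, t5, z[2].im, z[3].im); // t5 = i2+i3, t7 = i2-i3
 *         BF(z[2].re, z[0].re, t1, t6); // r0 = t1+t6, r2 = t1-t6
 *         BF(z[3].im, z[1].im, t4, t8); // i1 = t4+t8, i3 = t4-t8
 *         BF(z[3].re, z[1].re, t3, t7); // r1 = t3+t7, r3 = t3-t7
 *         BF(z[2].im, z[0].im, t2, t5); // i0 = t2+t5, i2 = t2-t5
 *     }
 */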

.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3
    vperm   \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm   \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vperm   \b2,\b0,\b1,v20
    vperm   \b3,\b0,\b1,v21
    vaddfp  \a0,\a2,\a3                         // {t1,t2,t6,t5}
    vsubfp  \a1,\a2,\a3                         // {t3,t4,t8,t7}
    vaddfp  \b0,\b2,\b3
    vsubfp  \b1,\b2,\b3
    vmrghw  \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm   \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vmrghw  \b2,\b0,\b1
    vperm   \b3,\b0,\b1,v22
    vaddfp  \a0,\a2,\a3                         // {r0,r1,i0,i1}
    vsubfp  \a1,\a2,\a3                         // {r2,r3,i2,i3}
    vaddfp  \b0,\b2,\b3
    vsubfp  \b1,\b2,\b3
    vperm   \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm   \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
    vperm   \b2,\b0,\b1,v23
    vperm   \b3,\b0,\b1,v24
.endm

.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1
    vmrghw  \b2,\b0,\b1     // vcprm(0,s0,1,s1) // {r4,r6,i4,i6}
    vmrglw  \b3,\b0,\b1     // vcprm(2,s2,3,s3) // {r5,r7,i5,i7}
    vperm   \a2,\a0,\a1,v20         // FFT4 ...
    vperm   \a3,\a0,\a1,v21
    vaddfp  \b0,\b2,\b3                         // {t1,t3,t2,t4}
    vsubfp  \b1,\b2,\b3                         // {r5,r7,i5,i7}
    vperm   \b4,\b1,\b1,v25 // vcprm(2,3,0,1)   // {i5,i7,r5,r7}
    vaddfp  \a0,\a2,\a3
    vsubfp  \a1,\a2,\a3
    vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2)
    vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9}
    vmrghw  \a2,\a0,\a1
    vperm   \a3,\a0,\a1,v22
    vperm   \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8}
    vperm   \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta}
    vaddfp  \a0,\a2,\a3
    vsubfp  \a1,\a2,\a3
    vaddfp  \b0,\b2,\b3                         // {t1,t2,t9,ta}
    vsubfp  \b1,\b2,\b3                         // {t6,t5,tc,tb}
    vperm   \a2,\a0,\a1,v23
    vperm   \a3,\a0,\a1,v24
    vperm   \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb}
    vperm   \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc}
    vsubfp  \b0,\a2,\b2                         // {r4,r5,r6,r7}
    vsubfp  \b1,\a3,\b3                         // {i4,i5,i6,i7}
    vaddfp  \a0,\a2,\b2                         // {r0,r1,r2,r3}
    vaddfp  \a1,\a3,\b3                         // {i0,i1,i2,i3}
.endm

.macro BF d0,d1,s0,s1
    vsubfp  \d1,\s0,\s1
    vaddfp  \d0,\s0,\s1
.endm

.macro zip d0,d1,s0,s1
    vmrghw  \d0,\s0,\s1
    vmrglw  \d1,\s0,\s1
.endm

.macro def_fft4 interleave
fft4\interleave\()_altivec:
    lvx    v0, 0,r3
    lvx    v1,r9,r3
    FFT4   v0,v1,v2,v3
.ifnb \interleave
    zip    v0,v1,v2,v3
    stvx   v0, 0,r3
    stvx   v1,r9,r3
.else
    stvx   v2, 0,r3
    stvx   v3,r9,r3
.endif
    blr
.endm

.macro def_fft8 interleave
fft8\interleave\()_altivec:
    addi   r4,r3,32
    lvx    v0, 0,r3
    lvx    v1,r9,r3
    lvx    v2, 0,r4
    lvx    v3,r9,r4
    FFT8   v0,v1,v2,v3,v4,v5,v6,v7,v8
.ifnb \interleave
    zip    v4,v5,v0,v1
    zip    v6,v7,v2,v3
    stvx   v4, 0,r3
    stvx   v5,r9,r3
    stvx   v6, 0,r4
    stvx   v7,r9,r4
.else
    stvx   v0, 0,r3
    stvx   v1,r9,r3
    stvx   v2, 0,r4
    stvx   v3,r9,r4
.endif
    blr
.endm

.macro def_fft16 interleave
fft16\interleave\()_altivec:
    addi   r5,r3,64
    addi   r6,r3,96
    addi   r4,r3,32
    lvx    v0, 0,r5
    lvx    v1,r9,r5
    lvx    v2, 0,r6
    lvx    v3,r9,r6
    FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7
    lvx    v0, 0,r3
    lvx    v1,r9,r3
    lvx    v2, 0,r4
    lvx    v3,r9,r4
    FFT8   v0,v1,v2,v3,v8,v9,v10,v11,v12
    vmaddfp   v8,v4,v15,v14 // r2*wre
    vmaddfp   v9,v5,v15,v14 // i2*wre
    vmaddfp  v10,v6,v15,v14 // r3*wre
    vmaddfp  v11,v7,v15,v14 // i3*wre
    vmaddfp   v8,v5,v16,v8  // i2*wim
    vnmsubfp  v9,v4,v16,v9  // r2*wim
    vnmsubfp v10,v7,v16,v10 // i3*wim
    vmaddfp  v11,v6,v16,v11 // r3*wim
    BF     v10,v12,v10,v8
    BF     v11,v13,v9,v11
    BF     v0,v4,v0,v10
    BF     v3,v7,v3,v12
    BF     v1,v5,v1,v11
    BF     v2,v6,v2,v13
.ifnb \interleave
    zip     v8, v9,v0,v1
    zip    v10,v11,v2,v3
    zip    v12,v13,v4,v5
    zip    v14,v15,v6,v7
    stvx    v8, 0,r3
    stvx    v9,r9,r3
    stvx   v10, 0,r4
    stvx   v11,r9,r4
    stvx   v12, 0,r5
    stvx   v13,r9,r5
    stvx   v14, 0,r6
    stvx   v15,r9,r6
.else
    stvx   v0, 0,r3
    stvx   v4, 0,r5
    stvx   v3,r9,r4
    stvx   v7,r9,r6
    stvx   v1,r9,r3
    stvx   v5,r9,r5
    stvx   v2, 0,r4
    stvx   v6, 0,r6
.endif
    blr
.endm
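
/*
 * The twiddle/butterfly block above, per element, as a C sketch (same
 * FFTComplex and BF as in the earlier sketch; t-names follow the vector
 * comments):
 *
 *     static void transform(FFTComplex *z0, FFTComplex *z1,
 *                           FFTComplex *z2, FFTComplex *z3,
 *                           float wre, float wim)
 *     {
 *         float t1 = z2->re * wre + z2->im * wim; // r2*wre + i2*wim
 *         float t2 = z2->im * wre - z2->re * wim; // i2*wre - r2*wim
 *         float t5 = z3->re * wre - z3->im * wim; // r3*wre - i3*wim
 *         float t6 = z3->im * wre + z3->re * wim; // i3*wre + r3*wim
 *         float t3, t4;
 *         BF(t3, t5, t5, t1);             // t3 = t5-t1, t5 = t5+t1
 *         BF(t4, t6, t2, t6);             // t4 = t2-t6, t6 = t2+t6
 *         BF(z2->re, z0->re, z0->re, t5);
 *         BF(z3->im, z1->im, z1->im, t3);
 *         BF(z3->re, z1->re, z1->re, t4);
 *         BF(z2->im, z0->im, z0->im, t6);
 *     }
 */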

// void pass(float *z, float *wre, int n)
.macro PASS interleave, suffix
fft_pass\suffix\()_altivec:
    mtctr  r5
    slwi   r0,r5,4
    slwi   r7,r5,6   // o2
    slwi   r5,r5,5   // o1
    add   r10,r5,r7  // o3
    add    r0,r4,r0  // wim
    addi   r6,r5,16  // o1+16
    addi   r8,r7,16  // o2+16
    addi  r11,r10,16 // o3+16
1:
    lvx    v8, 0,r4  // wre
    lvx   v10, 0,r0  // wim
    sub    r0,r0,r9
    lvx    v9, 0,r0
    vperm  v9,v9,v10,v19   // vcprm(s0,3,2,1) => wim[0 .. -3]
    lvx    v4,r3,r7        // r2 = z[o2]
    lvx    v5,r3,r8        // i2 = z[o2+16]
    lvx    v6,r3,r10       // r3 = z[o3]
    lvx    v7,r3,r11       // i3 = z[o3+16]
    vmaddfp  v10,v4,v8,v14 // r2*wre
    vmaddfp  v11,v5,v8,v14 // i2*wre
    vmaddfp  v12,v6,v8,v14 // r3*wre
    vmaddfp  v13,v7,v8,v14 // i3*wre
    lvx    v0, 0,r3        // r0 = z[0]
    lvx    v3,r3,r6        // i1 = z[o1+16]
    vmaddfp  v10,v5,v9,v10 // i2*wim
    vnmsubfp v11,v4,v9,v11 // r2*wim
    vnmsubfp v12,v7,v9,v12 // i3*wim
    vmaddfp  v13,v6,v9,v13 // r3*wim
    lvx    v1,r3,r9        // i0 = z[16]
    lvx    v2,r3,r5        // r1 = z[o1]
    BF     v12,v8,v12,v10
    BF     v13,v9,v11,v13
    BF     v0,v4,v0,v12
    BF     v3,v7,v3,v8
.if !\interleave
    stvx   v0, 0,r3
    stvx   v4,r3,r7
    stvx   v3,r3,r6
    stvx   v7,r3,r11
.endif
    BF     v1,v5,v1,v13
    BF     v2,v6,v2,v9
.if !\interleave
    stvx   v1,r3,r9
    stvx   v2,r3,r5
    stvx   v5,r3,r8
    stvx   v6,r3,r10
.else
    vmrghw v8,v0,v1
    vmrglw v9,v0,v1
    stvx   v8, 0,r3
    stvx   v9,r3,r9
    vmrghw v8,v2,v3
    vmrglw v9,v2,v3
    stvx   v8,r3,r5
    stvx   v9,r3,r6
    vmrghw v8,v4,v5
    vmrglw v9,v4,v5
    stvx   v8,r3,r7
    stvx   v9,r3,r8
    vmrghw v8,v6,v7
    vmrglw v9,v6,v7
    stvx   v8,r3,r10
    stvx   v9,r3,r11
.endif
    addi   r3,r3,32
    addi   r4,r4,16
    bdnz 1b
    sub    r3,r3,r5
    blr
.endm
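
/*
 * A hedged scalar reading of fft_pass_altivec: r5 counts groups of four
 * complexes, o1/o2/o3 are the quarter offsets, and the wim pointer walks
 * backwards (the vcprm(s0,3,2,1) shuffle reverses each vector). The table
 * layout (wim starting 4n floats past wre) is my reconstruction from the
 * offset arithmetic above, not a documented interface. transform() is the
 * sketch shown after def_fft16.
 *
 *     static void fft_pass_scalar(FFTComplex *z, const float *wre, int n)
 *     {
 *         const float *wim = wre + 4 * n;
 *         int o1 = 4 * n, o2 = 8 * n, o3 = 12 * n; // offsets in complexes
 *         for (int k = 0; k < 4 * n; k++)
 *             transform(&z[k], &z[k + o1], &z[k + o2], &z[k + o3],
 *                       wre[k], wim[-k]);
 *     }
 */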

#define M_SQRT1_2      0.70710678118654752440  /* 1/sqrt(2) */

#define WORD_0  0x00,0x01,0x02,0x03
#define WORD_1  0x04,0x05,0x06,0x07
#define WORD_2  0x08,0x09,0x0a,0x0b
#define WORD_3  0x0c,0x0d,0x0e,0x0f
#define WORD_s0 0x10,0x11,0x12,0x13
#define WORD_s1 0x14,0x15,0x16,0x17
#define WORD_s2 0x18,0x19,0x1a,0x1b
#define WORD_s3 0x1c,0x1d,0x1e,0x1f

#define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d
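
/*
 * vcprm() builds the 16-byte selector that vperm consumes: each selector
 * byte indexes into the 32-byte concatenation of the two source vectors
 * (0..3 pick words of the first source, s0..s3 words of the second). A
 * byte-level C emulation, for illustration:
 *
 *     static void vperm_emulate(unsigned char d[16], const unsigned char a[16],
 *                               const unsigned char b[16],
 *                               const unsigned char sel[16])
 *     {
 *         for (int i = 0; i < 16; i++)
 *             d[i] = sel[i] < 16 ? a[sel[i]] : b[sel[i] & 15];
 *     }
 */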

    .rodata
    .align 4
fft_data:
    .float  0, 0, 0, 0
    .float  1, 0.92387953, M_SQRT1_2, 0.38268343
    .float  0, 0.38268343, M_SQRT1_2, 0.92387953
    .float  -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2
    .float   M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
    vcprm(s0,3,2,1)
    vcprm(0,1,s2,s1)
    vcprm(2,3,s0,s3)
    vcprm(2,s3,3,s2)
    vcprm(0,1,s0,s1)
    vcprm(2,3,s2,s3)
    vcprm(2,3,0,1)
    vcprm(1,2,s3,s0)
    vcprm(0,3,s2,s1)
    vcprm(0,2,s1,s3)
    vcprm(1,3,s0,s2)

.macro lvm  b, r, regs:vararg
    lvx     \r, 0, \b
    addi    \b, \b, 16
  .ifnb \regs
    lvm     \b, \regs
  .endif
.endm

.macro stvm b, r, regs:vararg
    stvx    \r, 0, \b
    addi    \b, \b, 16
  .ifnb \regs
    stvm    \b, \regs
  .endif
.endm

.macro fft_calc interleave
extfunc ff_fft_calc\interleave\()_altivec
    mflr    r0
    stp     r0, 2*PS(r1)
    stpu    r1, -(160+16*PS)(r1)
    addi    r6, r1, 16*PS
    stvm    r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    mfvrsave r0
    stw     r0, 15*PS(r1)
    li      r6, 0xfffffffc
    mtvrsave r6

    movrel  r6, fft_data
    lvm     r6, v14, v15, v16, v17, v18, v19, v20, v21
    lvm     r6, v22, v23, v24, v25, v26, v27, v28, v29

    li      r9, 16
    movrel  r12, X(ff_cos_tabs)

    movrel  r6, fft_dispatch_tab\interleave\()_altivec
    lwz     r3, 0(r3)
    subi    r3, r3, 2
    slwi    r3, r3, 2+ARCH_PPC64
    lpx     r3, r3, r6
    mtctr   r3
    mr      r3, r4
    bctrl

    addi    r6, r1, 16*PS
    lvm     r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    lwz     r6, 15*PS(r1)
    mtvrsave r6
    lp      r1, 0(r1)
    lp      r0, 2*PS(r1)
    mtlr    r0
    blr
.endm
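
/*
 * What the dispatch above does, sketched in C: the first 32-bit field of
 * the context (read by lwz r3, 0(r3)) is log2 of the transform size, and
 * entry [log2n - 2] of the table runs the transform in place (fft4 is
 * entry 0, fft8 entry 1, ...). Names here are illustrative.
 *
 *     typedef void (*fft_fn)(float *z);
 *
 *     static void fft_calc_sketch(const int *ctx, float *z,
 *                                 const fft_fn dispatch_tab[])
 *     {
 *         dispatch_tab[ctx[0] - 2](z);
 *     }
 */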

.macro DECL_FFT suffix, bits, n, n2, n4
fft\n\suffix\()_altivec:
    mflr  r0
    stp   r0,PS*(\bits-3)(r1)
    bl    fft\n2\()_altivec
    addi2 r3,\n*4
    bl    fft\n4\()_altivec
    addi2 r3,\n*2
    bl    fft\n4\()_altivec
    addi2 r3,\n*-6
    lp    r0,PS*(\bits-3)(r1)
    lp    r4,\bits*PS(r12)
    mtlr  r0
    li    r5,\n/16
    b     fft_pass\suffix\()_altivec
.endm
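
/*
 * The split-radix recursion DECL_FFT emits, in hedged scalar form: a
 * half-size FFT on the front of the buffer, two quarter-size FFTs on the
 * back (the byte offsets n*4 and n*2 are n/2 and n/4 complexes), then one
 * combine pass over n/16 vector groups. Shown for n = 32 with hypothetical
 * scalar helpers:
 *
 *     static void fft32_sketch(FFTComplex *z, const float *cos_tab)
 *     {
 *         fft16_scalar(z);                     // fft\n2
 *         fft8_scalar(z + 16);                 // fft\n4 at z + n/2
 *         fft8_scalar(z + 24);                 // fft\n4 at z + 3n/4
 *         fft_pass_scalar(z, cos_tab, 32/16);  // cos_tab = ff_cos_tabs[bits]
 *     }
 */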

.macro DECL_FFTS interleave, suffix
    .text
    def_fft4  \suffix
    def_fft8  \suffix
    def_fft16 \suffix
    PASS \interleave, \suffix
    DECL_FFT \suffix, 5,   32,   16,    8
    DECL_FFT \suffix, 6,   64,   32,   16
    DECL_FFT \suffix, 7,  128,   64,   32
    DECL_FFT \suffix, 8,  256,  128,   64
    DECL_FFT \suffix, 9,  512,  256,  128
    DECL_FFT \suffix,10, 1024,  512,  256
    DECL_FFT \suffix,11, 2048, 1024,  512
    DECL_FFT \suffix,12, 4096, 2048, 1024
    DECL_FFT \suffix,13, 8192, 4096, 2048
    DECL_FFT \suffix,14,16384, 8192, 4096
    DECL_FFT \suffix,15,32768,16384, 8192
    DECL_FFT \suffix,16,65536,32768,16384

    fft_calc \suffix

    .rodata
    .align 3
fft_dispatch_tab\suffix\()_altivec:
    PTR fft4\suffix\()_altivec
    PTR fft8\suffix\()_altivec
    PTR fft16\suffix\()_altivec
    PTR fft32\suffix\()_altivec
    PTR fft64\suffix\()_altivec
    PTR fft128\suffix\()_altivec
    PTR fft256\suffix\()_altivec
    PTR fft512\suffix\()_altivec
    PTR fft1024\suffix\()_altivec
    PTR fft2048\suffix\()_altivec
    PTR fft4096\suffix\()_altivec
    PTR fft8192\suffix\()_altivec
    PTR fft16384\suffix\()_altivec
    PTR fft32768\suffix\()_altivec
    PTR fft65536\suffix\()_altivec
.endm

DECL_FFTS 0
DECL_FFTS 1, _interleave