Revision e6b1ed69

View differences:

libavcodec/arm/fft_init_arm.c
44 44
        s->imdct_calc   = ff_imdct_calc_neon;
45 45
        s->imdct_half   = ff_imdct_half_neon;
46 46
        s->mdct_calc    = ff_mdct_calc_neon;
47
        s->permutation  = FF_MDCT_PERM_INTERLEAVE;
47
        s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
48 48
    }
49 49
}
50 50

  
libavcodec/fft.c
97 97
    if (!s->tmp_buf)
98 98
        goto fail;
99 99
    s->inverse = inverse;
100
    s->fft_permutation = FF_FFT_PERM_DEFAULT;
100 101

  
101 102
    s->fft_permute = ff_fft_permute_c;
102 103
    s->fft_calc    = ff_fft_calc_c;
......
113 114
    for(j=4; j<=nbits; j++) {
114 115
        ff_init_ff_cos_tabs(j);
115 116
    }
116
    for(i=0; i<n; i++)
117
        s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i;
117
    for(i=0; i<n; i++) {
118
        int j = i;
119
        if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS)
120
            j = (j&~3) | ((j>>1)&1) | ((j<<1)&2);
121
        s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j;
122
    }
118 123

  
119 124
    return 0;
120 125
 fail:
libavcodec/fft.h
44 44
    void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
45 45
    void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
46 46
    void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
47
    int permutation;
47
    int fft_permutation;
48
#define FF_FFT_PERM_DEFAULT   0
49
#define FF_FFT_PERM_SWAP_LSBS 1
50
    int mdct_permutation;
48 51
#define FF_MDCT_PERM_NONE       0
49 52
#define FF_MDCT_PERM_INTERLEAVE 1
50 53
};
libavcodec/mdct.c
71 71
    s->mdct_bits = nbits;
72 72
    s->mdct_size = n;
73 73
    n4 = n >> 2;
74
    s->permutation = FF_MDCT_PERM_NONE;
74
    s->mdct_permutation = FF_MDCT_PERM_NONE;
75 75

  
76 76
    if (ff_fft_init(s, s->mdct_bits - 2, inverse) < 0)
77 77
        goto fail;
......
80 80
    if (!s->tcos)
81 81
        goto fail;
82 82

  
83
    switch (s->permutation) {
83
    switch (s->mdct_permutation) {
84 84
    case FF_MDCT_PERM_NONE:
85 85
        s->tsin = s->tcos + n4;
86 86
        tstep = 1;
libavcodec/x86/fft.c
30 30
        s->imdct_half  = ff_imdct_half_sse;
31 31
        s->fft_permute = ff_fft_permute_sse;
32 32
        s->fft_calc    = ff_fft_calc_sse;
33
        s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
33 34
    } else if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) {
34 35
        /* 3DNowEx for K7 */
35 36
        s->imdct_calc = ff_imdct_calc_3dn2;
libavcodec/x86/fft_mmx.asm
51 51
%define M_SQRT1_2 0.70710678118654752440
52 52
ps_root2: times 4 dd M_SQRT1_2
53 53
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
54
ps_p1p1m1p1: dd 0, 0, 1<<31, 0
54 55
ps_m1p1: dd 1<<31, 0
55 56

  
56 57
%assign i 16
......
95 96
    SWAP     %3, %6
96 97
%endmacro
97 98

  
98
; in:  %1={r0,i0,r1,i1} %2={r2,i2,r3,i3}
99
; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
99 100
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
100 101
%macro T4_SSE 3
101 102
    mova     %3, %1
102
    shufps   %1, %2, 0x64 ; {r0,i0,r3,i2}
103
    shufps   %3, %2, 0xce ; {r1,i1,r2,i3}
103
    addps    %1, %2       ; {t1,t2,t6,t5}
104
    subps    %3, %2       ; {t3,t4,-t8,t7}
105
    xorps    %3, [ps_p1p1m1p1]
104 106
    mova     %2, %1
105
    addps    %1, %3       ; {t1,t2,t6,t5}
106
    subps    %2, %3       ; {t3,t4,t8,t7}
107
    shufps   %1, %3, 0x44 ; {t1,t2,t3,t4}
108
    shufps   %2, %3, 0xbe ; {t6,t5,t7,t8}
107 109
    mova     %3, %1
108
    shufps   %1, %2, 0x44 ; {t1,t2,t3,t4}
109
    shufps   %3, %2, 0xbe ; {t6,t5,t7,t8}
110
    addps    %1, %2       ; {r0,i0,r1,i1}
111
    subps    %3, %2       ; {r2,i2,r3,i3}
110 112
    mova     %2, %1
111
    addps    %1, %3       ; {r0,i0,r1,i1}
112
    subps    %2, %3       ; {r2,i2,r3,i3}
113
    mova     %3, %1
114
    shufps   %1, %2, 0x88 ; {r0,r1,r2,r3}
115
    shufps   %3, %2, 0xdd ; {i0,i1,i2,i3}
116
    SWAP     %2, %3
113
    shufps   %1, %3, 0x88 ; {r0,r1,r2,r3}
114
    shufps   %2, %3, 0xdd ; {i0,i1,i2,i3}
117 115
%endmacro
118 116

  
119
%macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1
120
    mova     %5, %3
121
    shufps   %3, %4, 0x44 ; {r4,i4,r6,i6}
122
    shufps   %5, %4, 0xee ; {r5,i5,r7,i7}
117
; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
118
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
119
%macro T8_SSE 6
123 120
    mova     %6, %3
124
    subps    %3, %5       ; {r5,i5,r7,i7}
125
    addps    %6, %5       ; {t1,t2,t3,t4}
126
    mova     %5, %3
127
    shufps   %5, %5, 0xb1 ; {i5,r5,i7,r7}
121
    subps    %3, %4       ; {r5,i5,r7,i7}
122
    addps    %6, %4       ; {t1,t2,t3,t4}
123
    mova     %4, %3
124
    shufps   %4, %4, 0xb1 ; {i5,r5,i7,r7}
128 125
    mulps    %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
129
    mulps    %5, [ps_root2]
130
    addps    %3, %5       ; {t8,t7,ta,t9}
131
    mova     %5, %6
126
    mulps    %4, [ps_root2]
127
    addps    %3, %4       ; {t8,t7,ta,t9}
128
    mova     %4, %6
132 129
    shufps   %6, %3, 0x36 ; {t3,t2,t9,t8}
133
    shufps   %5, %3, 0x9c ; {t1,t4,t7,ta}
130
    shufps   %4, %3, 0x9c ; {t1,t4,t7,ta}
134 131
    mova     %3, %6
135
    addps    %6, %5       ; {t1,t2,t9,ta}
136
    subps    %3, %5       ; {t6,t5,tc,tb}
137
    mova     %5, %6
132
    addps    %6, %4       ; {t1,t2,t9,ta}
133
    subps    %3, %4       ; {t6,t5,tc,tb}
134
    mova     %4, %6
138 135
    shufps   %6, %3, 0xd8 ; {t1,t9,t5,tb}
139
    shufps   %5, %3, 0x8d ; {t2,ta,t6,tc}
136
    shufps   %4, %3, 0x8d ; {t2,ta,t6,tc}
140 137
    mova     %3, %1
141
    mova     %4, %2
138
    mova     %5, %2
142 139
    addps    %1, %6       ; {r0,r1,r2,r3}
143
    addps    %2, %5       ; {i0,i1,i2,i3}
140
    addps    %2, %4       ; {i0,i1,i2,i3}
144 141
    subps    %3, %6       ; {r4,r5,r6,r7}
145
    subps    %4, %5       ; {i4,i5,i6,i7}
142
    subps    %5, %4       ; {i4,i5,i6,i7}
143
    SWAP     %4, %5
146 144
%endmacro
147 145

  
148 146
; scheduled for cpu-bound sizes

Also available in: Unified diff