Revision be449fca libavcodec/i386/fft_sse.c
libavcodec/i386/fft_sse.c  

36  36  
37  37 
if(n <= 16) { 
38  38 
x86_reg i = 8*n; 
39 
39 
__asm__ volatile(


40  40 
"1: \n" 
41  41 
"movaps (%0,%1), %%xmm0 \n" 
42  42 
"movaps %%xmm0, %%xmm1 \n" 
...  ...  
58  58 
int n = 1 << s->nbits; 
59  59 
int i; 
60  60 
for(i=0; i<n; i+=2) { 
61 
61 
__asm__ volatile(


62  62 
"movaps %2, %%xmm0 \n" 
63  63 
"movlps %%xmm0, %0 \n" 
64  64 
"movhps %%xmm0, %1 \n" 
...  ...  
84  84  
85  85 
/* pre rotation */ 
86  86 
for(k=n8-2; k>=0; k-=2) { 
87 
87 
__asm__ volatile(


88  88 
"movaps (%2,%1,2), %%xmm0 \n" // { z[k].re, z[k].im, z[k+1].re, z[k+1].im } 
89  89 
"movaps -16(%2,%0,2), %%xmm1 \n" // { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } 
90  90 
"movaps %%xmm0, %%xmm2 \n" 
...  ...  
111  111 
#ifdef ARCH_X86_64 
112  112 
// if we have enough regs, don't let gcc make the luts latency-bound 
113  113 
// but if not, latency is faster than spilling 
114 
114 
__asm__("movlps %%xmm0, %0 \n"


115  115 
"movhps %%xmm0, %1 \n" 
116  116 
"movlps %%xmm1, %2 \n" 
117  117 
"movhps %%xmm1, %3 \n" 
...  ...  
121  121 
"=m"(z[revtab[ k+1]]) 
122  122 
); 
123  123 
#else 
124 
126 
127 
124 
__asm__("movlps %%xmm0, %0" :"=m"(z[revtab[-k-2]]));


125 
__asm__("movhps %%xmm0, %0" :"=m"(z[revtab[-k-1]]));


126 
__asm__("movlps %%xmm1, %0" :"=m"(z[revtab[ k ]]));


127 
__asm__("movhps %%xmm1, %0" :"=m"(z[revtab[ k+1]]));


128  128 
#endif 
129  129 
} 
130  130  
...  ...  
146  146  
147  147 
j = -n2; 
148  148 
k = n2-16; 
149 
149 
__asm__ volatile(


150  150 
"1: \n" 
151  151 
CMUL(%0, %%xmm0, %%xmm1) 
152  152 
CMUL(%1, %%xmm4, %%xmm5) 
...  ...  
181  181  
182  182 
j = -n; 
183  183 
k = n-16; 
184 
184 
__asm__ volatile(


185  185 
"movaps %4, %%xmm7 \n" 
186  186 
"1: \n" 
187  187 
"movaps (%2,%1), %%xmm0 \n" 
