Revision d19b744a
libavcodec/x86/dsputil_mmx.c  

2193  2193 
#if HAVE_6REGS 
2194  2194 
static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1, 
2195  2195 
const float *win, int len){ 
2196 
x86_reg i = len*4;


2197 
x86_reg j = len*48;


2198 
__asm__ volatile(


2199 
"1: \n"


2200 
"pswapd (%5,%1), %%mm1 \n"


2201 
"movq (%5,%0), %%mm0 \n"


2202 
"pswapd (%4,%1), %%mm5 \n"


2203 
"movq (%3,%0), %%mm4 \n"


2204 
"movq %%mm0, %%mm2 \n"


2205 
"movq %%mm1, %%mm3 \n"


2206 
"pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]


2207 
"pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j]


2208 
"pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]


2209 
"pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i]


2210 
"pfadd %%mm3, %%mm2 \n"


2211 
"pfsub %%mm0, %%mm1 \n"


2212 
"pswapd %%mm2, %%mm2 \n"


2213 
"movq %%mm1, (%2,%0) \n"


2214 
"movq %%mm2, (%2,%1) \n"


2215 
"sub $8, %1 \n"


2216 
"add $8, %0 \n"


2217 
"jl 1b \n"


2218 
"femms \n"


2219 
:"+r"(i), "+r"(j)


2220 
:"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)


2221 
);


2196 
x86_reg i = len*4; 

2197 
x86_reg j = len*48; 

2198 
__asm__ volatile( 

2199 
"1: \n" 

2200 
"pswapd (%5,%1), %%mm1 \n" 

2201 
"movq (%5,%0), %%mm0 \n" 

2202 
"pswapd (%4,%1), %%mm5 \n" 

2203 
"movq (%3,%0), %%mm4 \n" 

2204 
"movq %%mm0, %%mm2 \n" 

2205 
"movq %%mm1, %%mm3 \n" 

2206 
"pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i] 

2207 
"pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j] 

2208 
"pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j] 

2209 
"pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i] 

2210 
"pfadd %%mm3, %%mm2 \n" 

2211 
"pfsub %%mm0, %%mm1 \n" 

2212 
"pswapd %%mm2, %%mm2 \n" 

2213 
"movq %%mm1, (%2,%0) \n" 

2214 
"movq %%mm2, (%2,%1) \n" 

2215 
"sub $8, %1 \n" 

2216 
"add $8, %0 \n" 

2217 
"jl 1b \n" 

2218 
"femms \n" 

2219 
:"+r"(i), "+r"(j) 

2220 
:"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) 

2221 
); 

2222  2222 
} 
2223  2223  
2224  2224 
static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1, 
2225  2225 
const float *win, int len){ 
2226 
x86_reg i = len*4;


2227 
x86_reg j = len*416;


2228 
__asm__ volatile(


2229 
"1: \n"


2230 
"movaps (%5,%1), %%xmm1 \n"


2231 
"movaps (%5,%0), %%xmm0 \n"


2232 
"movaps (%4,%1), %%xmm5 \n"


2233 
"movaps (%3,%0), %%xmm4 \n"


2234 
"shufps $0x1b, %%xmm1, %%xmm1 \n"


2235 
"shufps $0x1b, %%xmm5, %%xmm5 \n"


2236 
"movaps %%xmm0, %%xmm2 \n"


2237 
"movaps %%xmm1, %%xmm3 \n"


2238 
"mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]


2239 
"mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j]


2240 
"mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]


2241 
"mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i]


2242 
"addps %%xmm3, %%xmm2 \n"


2243 
"subps %%xmm0, %%xmm1 \n"


2244 
"shufps $0x1b, %%xmm2, %%xmm2 \n"


2245 
"movaps %%xmm1, (%2,%0) \n"


2246 
"movaps %%xmm2, (%2,%1) \n"


2247 
"sub $16, %1 \n"


2248 
"add $16, %0 \n"


2249 
"jl 1b \n"


2250 
:"+r"(i), "+r"(j)


2251 
:"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)


2252 
);


2226 
x86_reg i = len*4; 

2227 
x86_reg j = len*416; 

2228 
__asm__ volatile( 

2229 
"1: \n" 

2230 
"movaps (%5,%1), %%xmm1 \n" 

2231 
"movaps (%5,%0), %%xmm0 \n" 

2232 
"movaps (%4,%1), %%xmm5 \n" 

2233 
"movaps (%3,%0), %%xmm4 \n" 

2234 
"shufps $0x1b, %%xmm1, %%xmm1 \n" 

2235 
"shufps $0x1b, %%xmm5, %%xmm5 \n" 

2236 
"movaps %%xmm0, %%xmm2 \n" 

2237 
"movaps %%xmm1, %%xmm3 \n" 

2238 
"mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i] 

2239 
"mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j] 

2240 
"mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j] 

2241 
"mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i] 

2242 
"addps %%xmm3, %%xmm2 \n" 

2243 
"subps %%xmm0, %%xmm1 \n" 

2244 
"shufps $0x1b, %%xmm2, %%xmm2 \n" 

2245 
"movaps %%xmm1, (%2,%0) \n" 

2246 
"movaps %%xmm2, (%2,%1) \n" 

2247 
"sub $16, %1 \n" 

2248 
"add $16, %0 \n" 

2249 
"jl 1b \n" 

2250 
:"+r"(i), "+r"(j) 

2251 
:"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) 

2252 
); 

2253  2253 
} 
2254  2254 
#endif /* HAVE_6REGS */ 
2255  2255 
Also available in: Unified diff