Revision d19b744a libavcodec/x86/dsputil_mmx.c

View differences:

libavcodec/x86/dsputil_mmx.c
2193 2193
#if HAVE_6REGS
2194 2194
/**
 * Combine src0 and the reversed src1 with the window win, two floats at a
 * time, using 3DNow! instructions (pfmul/pfadd/pfsub, pswapd, femms).
 *
 * Following the address arithmetic below, for k = 0 .. len-1:
 *     dst[k]         = src0[k]*win[2*len-1-k] - src1[len-1-k]*win[k]
 *     dst[2*len-1-k] = src0[k]*win[k]         + src1[len-1-k]*win[2*len-1-k]
 * so dst and win are accessed over 2*len floats, src0 and src1 over len floats.
 *
 * i and j are byte offsets (the code scales len by 4, i.e. 4 bytes per float).
 * i climbs from -len*4 towards 0 and is applied to dst+len, src0+len and
 * win+len; j descends from len*4-8 and is applied to dst+len, src1 and
 * win+len. Each iteration handles one 64-bit register (two floats) at each
 * end. pswapd exchanges the two floats of a register, which reverses the
 * element order of the j-indexed data so it lines up with the i-indexed data.
 *
 * Asm operands: %0 = i, %1 = j, %2 = dst+len, %3 = src0+len, %4 = src1,
 * %5 = win+len.
 *
 * The loop is bottom-tested ("add $8, %0" followed by "jl 1b"), so the body
 * always runs at least once. Presumably callers pass a positive, even len --
 * TODO confirm against the callers.
 * femms is issued after the loop to leave MMX/3DNow! state.
 *
 * NOTE(review): the asm statement declares no clobbers, although it stores
 * through dst and overwrites mm0-mm5; check whether a "memory" clobber is
 * wanted here.
 *
 * NOTE: this listing is a diff view of a whitespace-only change. The body
 * appears twice (first with the old 8-space indentation, then with the new
 * 4-space indentation), and the bare numbers are the viewer's line numbers.
 */
static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
2195 2195
                                      const float *win, int len){
2196
        x86_reg i = -len*4;
2197
        x86_reg j = len*4-8;
2198
        __asm__ volatile(
2199
            "1: \n"
2200
            "pswapd  (%5,%1), %%mm1 \n"
2201
            "movq    (%5,%0), %%mm0 \n"
2202
            "pswapd  (%4,%1), %%mm5 \n"
2203
            "movq    (%3,%0), %%mm4 \n"
2204
            "movq      %%mm0, %%mm2 \n"
2205
            "movq      %%mm1, %%mm3 \n"
2206
            "pfmul     %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
2207
            "pfmul     %%mm5, %%mm3 \n" // src1[    j]*win[len+j]
2208
            "pfmul     %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
2209
            "pfmul     %%mm5, %%mm0 \n" // src1[    j]*win[len+i]
2210
            "pfadd     %%mm3, %%mm2 \n"
2211
            "pfsub     %%mm0, %%mm1 \n"
2212
            "pswapd    %%mm2, %%mm2 \n"
2213
            "movq      %%mm1, (%2,%0) \n"
2214
            "movq      %%mm2, (%2,%1) \n"
2215
            "sub $8, %1 \n"
2216
            "add $8, %0 \n"
2217
            "jl 1b \n"
2218
            "femms \n"
2219
            :"+r"(i), "+r"(j)
2220
            :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
2221
        );
2196
    x86_reg i = -len*4;
2197
    x86_reg j = len*4-8;
2198
    __asm__ volatile(
2199
        "1: \n"
2200
        "pswapd  (%5,%1), %%mm1 \n"
2201
        "movq    (%5,%0), %%mm0 \n"
2202
        "pswapd  (%4,%1), %%mm5 \n"
2203
        "movq    (%3,%0), %%mm4 \n"
2204
        "movq      %%mm0, %%mm2 \n"
2205
        "movq      %%mm1, %%mm3 \n"
2206
        "pfmul     %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
2207
        "pfmul     %%mm5, %%mm3 \n" // src1[    j]*win[len+j]
2208
        "pfmul     %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
2209
        "pfmul     %%mm5, %%mm0 \n" // src1[    j]*win[len+i]
2210
        "pfadd     %%mm3, %%mm2 \n"
2211
        "pfsub     %%mm0, %%mm1 \n"
2212
        "pswapd    %%mm2, %%mm2 \n"
2213
        "movq      %%mm1, (%2,%0) \n"
2214
        "movq      %%mm2, (%2,%1) \n"
2215
        "sub $8, %1 \n"
2216
        "add $8, %0 \n"
2217
        "jl 1b \n"
2218
        "femms \n"
2219
        :"+r"(i), "+r"(j)
2220
        :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
2221
    );
2222 2222
}
2223 2223

  
2224 2224
/**
 * Combine src0 and the reversed src1 with the window win, four floats at a
 * time, using SSE instructions (movaps/shufps/mulps/addps/subps).
 *
 * Following the address arithmetic below, for k = 0 .. len-1:
 *     dst[k]         = src0[k]*win[2*len-1-k] - src1[len-1-k]*win[k]
 *     dst[2*len-1-k] = src0[k]*win[k]         + src1[len-1-k]*win[2*len-1-k]
 * so dst and win are accessed over 2*len floats, src0 and src1 over len floats.
 *
 * i and j are byte offsets (the code scales len by 4, i.e. 4 bytes per float).
 * i climbs from -len*4 towards 0 and is applied to dst+len, src0+len and
 * win+len; j descends from len*4-16 and is applied to dst+len, src1 and
 * win+len. Each iteration handles one 128-bit register (four floats) at each
 * end. "shufps $0x1b" with the same register as source and destination
 * reverses the order of its four floats, so the j-indexed data lines up with
 * the i-indexed data.
 *
 * Asm operands: %0 = i, %1 = j, %2 = dst+len, %3 = src0+len, %4 = src1,
 * %5 = win+len.
 *
 * All loads and stores use movaps, which requires 16-byte-aligned addresses.
 * The loop is bottom-tested ("add $16, %0" followed by "jl 1b"), so the body
 * always runs at least once. Presumably callers pass a positive len that is a
 * multiple of 4, with suitably aligned buffers -- TODO confirm against the
 * callers.
 *
 * NOTE(review): the asm statement declares no clobbers, although it stores
 * through dst and overwrites xmm0-xmm5; check whether "memory" and register
 * clobbers are wanted here.
 *
 * NOTE: this listing is a diff view of a whitespace-only change. The body
 * appears twice (first with the old 8-space indentation, then with the new
 * 4-space indentation), and the bare numbers are the viewer's line numbers.
 */
static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
2225 2225
                                   const float *win, int len){
2226
        x86_reg i = -len*4;
2227
        x86_reg j = len*4-16;
2228
        __asm__ volatile(
2229
            "1: \n"
2230
            "movaps       (%5,%1), %%xmm1 \n"
2231
            "movaps       (%5,%0), %%xmm0 \n"
2232
            "movaps       (%4,%1), %%xmm5 \n"
2233
            "movaps       (%3,%0), %%xmm4 \n"
2234
            "shufps $0x1b, %%xmm1, %%xmm1 \n"
2235
            "shufps $0x1b, %%xmm5, %%xmm5 \n"
2236
            "movaps        %%xmm0, %%xmm2 \n"
2237
            "movaps        %%xmm1, %%xmm3 \n"
2238
            "mulps         %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
2239
            "mulps         %%xmm5, %%xmm3 \n" // src1[    j]*win[len+j]
2240
            "mulps         %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
2241
            "mulps         %%xmm5, %%xmm0 \n" // src1[    j]*win[len+i]
2242
            "addps         %%xmm3, %%xmm2 \n"
2243
            "subps         %%xmm0, %%xmm1 \n"
2244
            "shufps $0x1b, %%xmm2, %%xmm2 \n"
2245
            "movaps        %%xmm1, (%2,%0) \n"
2246
            "movaps        %%xmm2, (%2,%1) \n"
2247
            "sub $16, %1 \n"
2248
            "add $16, %0 \n"
2249
            "jl 1b \n"
2250
            :"+r"(i), "+r"(j)
2251
            :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
2252
        );
2226
    x86_reg i = -len*4;
2227
    x86_reg j = len*4-16;
2228
    __asm__ volatile(
2229
        "1: \n"
2230
        "movaps       (%5,%1), %%xmm1 \n"
2231
        "movaps       (%5,%0), %%xmm0 \n"
2232
        "movaps       (%4,%1), %%xmm5 \n"
2233
        "movaps       (%3,%0), %%xmm4 \n"
2234
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
2235
        "shufps $0x1b, %%xmm5, %%xmm5 \n"
2236
        "movaps        %%xmm0, %%xmm2 \n"
2237
        "movaps        %%xmm1, %%xmm3 \n"
2238
        "mulps         %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
2239
        "mulps         %%xmm5, %%xmm3 \n" // src1[    j]*win[len+j]
2240
        "mulps         %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
2241
        "mulps         %%xmm5, %%xmm0 \n" // src1[    j]*win[len+i]
2242
        "addps         %%xmm3, %%xmm2 \n"
2243
        "subps         %%xmm0, %%xmm1 \n"
2244
        "shufps $0x1b, %%xmm2, %%xmm2 \n"
2245
        "movaps        %%xmm1, (%2,%0) \n"
2246
        "movaps        %%xmm2, (%2,%1) \n"
2247
        "sub $16, %1 \n"
2248
        "add $16, %0 \n"
2249
        "jl 1b \n"
2250
        :"+r"(i), "+r"(j)
2251
        :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
2252
    );
2253 2253
}
2254 2254
#endif /* HAVE_6REGS */
2255 2255

  

Also available in: Unified diff