Revision f27e1d64 libavcodec/i386/dsputil_mmx.c

View differences:

libavcodec/i386/dsputil_mmx.c
         ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
 }
 
+static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
+                                   const float *win, float add_bias, int len){
+#ifdef HAVE_6REGS
+    if(add_bias == 0){
+        x86_reg i = -len*2;
+        x86_reg j = len*2-16;
+        asm volatile(
+            "1: \n"
+            "movaps       (%5,%0), %%xmm0 \n"
+            "movaps       (%5,%1), %%xmm1 \n"
+            "movaps        %%xmm0, %%xmm2 \n"
+            "movaps        %%xmm1, %%xmm3 \n"
+            "shufps $0x1b, %%xmm2, %%xmm2 \n"
+            "shufps $0x1b, %%xmm3, %%xmm3 \n"
+            "mulps        (%4,%0), %%xmm0 \n"
+            "mulps        (%4,%1), %%xmm1 \n"
+            "mulps        (%3,%0), %%xmm3 \n"
+            "mulps        (%3,%1), %%xmm2 \n"
+            "addps         %%xmm3, %%xmm0 \n"
+            "addps         %%xmm2, %%xmm1 \n"
+            "movaps        %%xmm0, (%2,%0) \n"
+            "movaps        %%xmm1, (%2,%1) \n"
+            "sub $16, %1 \n"
+            "add $16, %0 \n"
+            "jl 1b \n"
+            :"+r"(i), "+r"(j)
+            :"r"(dst+len/2), "r"(src0+len/2), "r"(src1+len/2), "r"(win+len/2)
+        );
+    }else
+#endif
+        ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
+}
+
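The SSE loop walks the buffers from both ends at once: index %0 ascends
through the first half while %1 descends through the second, and shufps
$0x1b reverses each 4-float chunk so the reversed window lines up with the
forward one. What it computes, as a minimal C sketch derived from the asm
addressing (not the literal ff_vector_fmul_window_c source; movaps
additionally requires len to be a multiple of 8 and 16-byte-aligned
pointers):

    static void vector_fmul_window_ref(float *dst, const float *src0,
                                       const float *src1, const float *win,
                                       float add_bias, int len)
    {
        int i;
        /* crossfade: src0 under the reversed window, src1 under the forward one */
        for (i = 0; i < len; i++)
            dst[i] = src0[i] * win[len - 1 - i] + src1[i] * win[i] + add_bias;
    }

The asm path is only taken for add_bias == 0; any other bias falls through
to the C implementation.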
 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
     // not bit-exact: pf2id uses different rounding than C and SSE
     asm volatile(
......
     );
 }
 
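The "not bit-exact" caveat is a rounding-mode difference: pf2id truncates
toward zero, while the C code and the SSE cvtps2pi/cvtps2dq paths round to
nearest under the default FPU/MXCSR state. For example, an input of -1.5
converts to -1 via pf2id but to -2 (round-half-to-even) via cvtps2dq, which
is why the 3DNow! converters are only registered when CODEC_FLAG_BITEXACT
is unset in the hunk near the end of this diff.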
+#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
+/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
+static av_noinline void float_to_int16_interleave2_##cpu(int16_t *dst, const float *src, long len, int channels){\
+    DECLARE_ALIGNED_16(int16_t, tmp[len*channels]);\
+    int i,j,c;\
+    float_to_int16_##cpu(tmp, src, len*channels);\
+    for(c=0; c<channels; c++){\
+        int16_t *ptmp = tmp+c*len;\
+        for(i=0, j=c; i<len; i++, j+=channels)\
+            dst[j] = ptmp[i];\
+    }\
+}\
+\
+static void float_to_int16_interleave_##cpu(int16_t *dst, const float *src, long len, int channels){\
+    if(channels==1)\
+        float_to_int16_##cpu(dst, src, len);\
+    else if(channels>2)\
+        float_to_int16_interleave2_##cpu(dst, src, len, channels);\
+    else{\
+        float *src1;\
+        asm volatile(\
+            "shl $2, %0 \n"\
+            "add %0, %1 \n"\
+            "add %0, %2 \n"\
+            "lea (%2,%0), %3 \n"\
+            "neg %0 \n"\
+            body\
+            :"+r"(len), "+r"(dst), "+r"(src), "=r"(src1)\
+        );\
+    }\
+}
+
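The wrapper dispatches on channel count: mono goes straight to
float_to_int16_##cpu, more than two channels through the temp-buffer
transpose, and stereo runs the asm body. The prologue turns len into a byte
count (shl $2), advances dst to the end of the output and src past channel
0 (so src+index spans channel 0 and src1+index spans channel 1), then
negates the index so one counter addresses all three buffers while running
up to zero. A scalar reference for the whole operation (my sketch, assuming
samples are already scaled to the int16 range, saturating as packssdw
does):

    #include <stdint.h>
    #include <math.h>   /* lrintf */

    static void float_to_int16_interleave_ref(int16_t *dst, const float *src,
                                              long len, int channels)
    {
        long i;
        int c;
        for (c = 0; c < channels; c++)
            for (i = 0; i < len; i++) {
                int v = (int)lrintf(src[c*len + i]);  /* round to nearest */
                if (v < -32768) v = -32768;           /* saturate like packssdw */
                if (v >  32767) v =  32767;
                dst[i*channels + c] = v;
            }
    }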
+FLOAT_TO_INT16_INTERLEAVE(3dnow,
+    "1:                         \n"
+    "pf2id     (%2,%0), %%mm0   \n"
+    "pf2id    8(%2,%0), %%mm1   \n"
+    "pf2id     (%3,%0), %%mm2   \n"
+    "pf2id    8(%3,%0), %%mm3   \n"
+    "packssdw    %%mm1, %%mm0   \n"
+    "packssdw    %%mm3, %%mm2   \n"
+    "movq        %%mm0, %%mm1   \n"
+    "punpcklwd   %%mm2, %%mm0   \n"
+    "punpckhwd   %%mm2, %%mm1   \n"
+    "movq        %%mm0,  (%1,%0)\n"
+    "movq        %%mm1, 8(%1,%0)\n"
+    "add $16, %0                \n"
+    "js 1b                      \n"
+    "femms                      \n"
+)
+
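Tracing four samples per channel through the body shows the in-register
interleave; both halves have to reach memory, %%mm0 to (%1,%0) and %%mm1 to
8(%1,%0), or the high four samples would be lost:

    after packssdw:  mm0 = [L0 L1 L2 L3]   mm2 = [R0 R1 R2 R3]   (saturated int16)
    after movq:      mm1 = [L0 L1 L2 L3]
    punpcklwd mm2, mm0  ->  mm0 = [L0 R0 L1 R1]
    punpckhwd mm2, mm1  ->  mm1 = [L2 R2 L3 R3]

The SSE variant below is identical except that cvtps2pi replaces pf2id
(MXCSR rounding instead of truncation) and the cleanup is emms rather than
femms.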
+FLOAT_TO_INT16_INTERLEAVE(sse,
+    "1:                         \n"
+    "cvtps2pi  (%2,%0), %%mm0   \n"
+    "cvtps2pi 8(%2,%0), %%mm1   \n"
+    "cvtps2pi  (%3,%0), %%mm2   \n"
+    "cvtps2pi 8(%3,%0), %%mm3   \n"
+    "packssdw    %%mm1, %%mm0   \n"
+    "packssdw    %%mm3, %%mm2   \n"
+    "movq        %%mm0, %%mm1   \n"
+    "punpcklwd   %%mm2, %%mm0   \n"
+    "punpckhwd   %%mm2, %%mm1   \n"
+    "movq        %%mm0,  (%1,%0)\n"
+    "movq        %%mm1, 8(%1,%0)\n"
+    "add $16, %0                \n"
+    "js 1b                      \n"
+    "emms                       \n"
+)
+
+FLOAT_TO_INT16_INTERLEAVE(sse2,
+    "1:                         \n"
+    "cvtps2dq  (%2,%0), %%xmm0  \n"
+    "cvtps2dq  (%3,%0), %%xmm1  \n"
+    "packssdw   %%xmm1, %%xmm0  \n"
+    "movhlps    %%xmm0, %%xmm1  \n"
+    "punpcklwd  %%xmm1, %%xmm0  \n"
+    "movdqa     %%xmm0, (%1,%0) \n"
+    "add $16, %0                \n"
+    "js 1b                      \n"
+)
+
+
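The SSE2 variant performs the same interleave on full xmm registers, so it
never touches the MMX state and needs no emms. A hypothetical intrinsics
rendering of one loop iteration (my sketch, not code from this revision;
4 floats per channel in, 8 interleaved int16 out):

    #include <stdint.h>
    #include <emmintrin.h>

    static void interleave8_sse2(int16_t *dst, const float *l, const float *r)
    {
        __m128i a  = _mm_cvtps_epi32(_mm_load_ps(l));  /* cvtps2dq, left  */
        __m128i b  = _mm_cvtps_epi32(_mm_load_ps(r));  /* cvtps2dq, right */
        __m128i p  = _mm_packs_epi32(a, b);            /* packssdw: L0..L3 R0..R3 */
        __m128i hi = _mm_unpackhi_epi64(p, p);         /* like movhlps: R0..R3 */
        _mm_store_si128((__m128i*)dst,
                        _mm_unpacklo_epi16(p, hi));    /* punpcklwd: L0 R0 .. L3 R3 */
    }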
 extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
 extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
 extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
......
         if(mm_flags & MM_3DNOW){
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
             c->vector_fmul = vector_fmul_3dnow;
-            if(!(avctx->flags & CODEC_FLAG_BITEXACT))
+            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                 c->float_to_int16 = float_to_int16_3dnow;
+                c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
+            }
         }
         if(mm_flags & MM_3DNOWEXT)
             c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
......
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
             c->vector_fmul = vector_fmul_sse;
             c->float_to_int16 = float_to_int16_sse;
+            c->float_to_int16_interleave = float_to_int16_interleave_sse;
             c->vector_fmul_reverse = vector_fmul_reverse_sse;
             c->vector_fmul_add_add = vector_fmul_add_add_sse;
+            c->vector_fmul_window = vector_fmul_window_sse;
         }
         if(mm_flags & MM_SSE2){
             c->float_to_int16 = float_to_int16_sse2;
+            c->float_to_int16_interleave = float_to_int16_interleave_sse2;
         }
         if(mm_flags & MM_3DNOW)
             c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
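Note the ordering of the capability checks: the MM_SSE2 block runs after
MM_SSE, so on SSE2 CPUs its converters win, and the trailing MM_3DNOW test
deliberately overrides vector_fmul_add_add_sse per the comment. A minimal
usage sketch through the dispatch table (assuming the dsputil API of this
era, where dsputil_init fills DSPContext with the fastest supported paths):

    DSPContext dsp;
    dsputil_init(&dsp, avctx);
    /* stereo float samples (pre-scaled to +-32768) -> interleaved S16 */
    dsp.float_to_int16_interleave(pcm, samples, len, 2);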
