Revision 35de5d24

View differences:

libavcodec/ppc/float_altivec.c
75 75
    vector unsigned char align = vec_lvsr(0,dst),
76 76
                         mask = vec_lvsl(0, dst);
77 77

  
78
        for (i=0; i<len-3; i+=4) {
79
            t0 = vec_ld(0, dst+i);
80
            t1 = vec_ld(15, dst+i);
81
            s0 = vec_ld(0, src0+i);
82
            s1 = vec_ld(0, src1+i);
83
            s2 = vec_ld(0, src2+i);
84
            edges = vec_perm(t1 ,t0, mask);
85
            d = vec_madd(s0,s1,s2);
86
            t1 = vec_perm(d, edges, align);
87
            t0 = vec_perm(edges, d, align);
88
            vec_st(t1, 15, dst+i);
89
            vec_st(t0, 0, dst+i);
90
        }
78
    for (i=0; i<len-3; i+=4) {
79
        t0 = vec_ld(0, dst+i);
80
        t1 = vec_ld(15, dst+i);
81
        s0 = vec_ld(0, src0+i);
82
        s1 = vec_ld(0, src1+i);
83
        s2 = vec_ld(0, src2+i);
84
        edges = vec_perm(t1 ,t0, mask);
85
        d = vec_madd(s0,s1,s2);
86
        t1 = vec_perm(d, edges, align);
87
        t0 = vec_perm(edges, d, align);
88
        vec_st(t1, 15, dst+i);
89
        vec_st(t0, 0, dst+i);
90
    }
91 91
}
92 92

  
93 93
static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len)
libavcodec/x86/dsputil_mmx.c
2128 2128
static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
2129 2129
                                  const float *src2, int len){
2130 2130
    x86_reg i = (len-4)*4;
2131
        __asm__ volatile(
2132
            "1: \n\t"
2133
            "movq    (%2,%0), %%mm0 \n\t"
2134
            "movq   8(%2,%0), %%mm1 \n\t"
2135
            "pfmul   (%3,%0), %%mm0 \n\t"
2136
            "pfmul  8(%3,%0), %%mm1 \n\t"
2137
            "pfadd   (%4,%0), %%mm0 \n\t"
2138
            "pfadd  8(%4,%0), %%mm1 \n\t"
2139
            "movq  %%mm0,   (%1,%0) \n\t"
2140
            "movq  %%mm1,  8(%1,%0) \n\t"
2141
            "sub  $16, %0 \n\t"
2142
            "jge  1b \n\t"
2143
            :"+r"(i)
2144
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
2145
            :"memory"
2146
        );
2131
    __asm__ volatile(
2132
        "1: \n\t"
2133
        "movq    (%2,%0), %%mm0 \n\t"
2134
        "movq   8(%2,%0), %%mm1 \n\t"
2135
        "pfmul   (%3,%0), %%mm0 \n\t"
2136
        "pfmul  8(%3,%0), %%mm1 \n\t"
2137
        "pfadd   (%4,%0), %%mm0 \n\t"
2138
        "pfadd  8(%4,%0), %%mm1 \n\t"
2139
        "movq  %%mm0,   (%1,%0) \n\t"
2140
        "movq  %%mm1,  8(%1,%0) \n\t"
2141
        "sub  $16, %0 \n\t"
2142
        "jge  1b \n\t"
2143
        :"+r"(i)
2144
        :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
2145
        :"memory"
2146
    );
2147 2147
    __asm__ volatile("femms");
2148 2148
}
2149 2149
static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2150 2150
                                const float *src2, int len){
2151 2151
    x86_reg i = (len-8)*4;
2152
        __asm__ volatile(
2153
            "1: \n\t"
2154
            "movaps   (%2,%0), %%xmm0 \n\t"
2155
            "movaps 16(%2,%0), %%xmm1 \n\t"
2156
            "mulps    (%3,%0), %%xmm0 \n\t"
2157
            "mulps  16(%3,%0), %%xmm1 \n\t"
2158
            "addps    (%4,%0), %%xmm0 \n\t"
2159
            "addps  16(%4,%0), %%xmm1 \n\t"
2160
            "movaps %%xmm0,   (%1,%0) \n\t"
2161
            "movaps %%xmm1, 16(%1,%0) \n\t"
2162
            "sub  $32, %0 \n\t"
2163
            "jge  1b \n\t"
2164
            :"+r"(i)
2165
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
2166
            :"memory"
2167
        );
2152
    __asm__ volatile(
2153
        "1: \n\t"
2154
        "movaps   (%2,%0), %%xmm0 \n\t"
2155
        "movaps 16(%2,%0), %%xmm1 \n\t"
2156
        "mulps    (%3,%0), %%xmm0 \n\t"
2157
        "mulps  16(%3,%0), %%xmm1 \n\t"
2158
        "addps    (%4,%0), %%xmm0 \n\t"
2159
        "addps  16(%4,%0), %%xmm1 \n\t"
2160
        "movaps %%xmm0,   (%1,%0) \n\t"
2161
        "movaps %%xmm1, 16(%1,%0) \n\t"
2162
        "sub  $32, %0 \n\t"
2163
        "jge  1b \n\t"
2164
        :"+r"(i)
2165
        :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
2166
        :"memory"
2167
    );
2168 2168
}
2169 2169

  
2170 2170
static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,

Also available in: Unified diff