Revision b1159ad9

View differences:

libavcodec/apedec.c
     do_init_filter(&f[1], buf + order * 3 + HISTORY_SIZE, order);
 }
 
-static inline void do_apply_filter(APEContext * ctx, int version, APEFilter *f, int32_t *data, int count, int order, int fracbits)
+static void do_apply_filter(APEContext * ctx, int version, APEFilter *f, int32_t *data, int count, int order, int fracbits)
 {
     int res;
     int absres;
 
     while (count--) {
         /* round fixedpoint scalar product */
-        res = (ctx->dsp.scalarproduct_int16(f->delay - order, f->coeffs, order, 0) + (1 << (fracbits - 1))) >> fracbits;
-
-        if (*data < 0)
-            ctx->dsp.add_int16(f->coeffs, f->adaptcoeffs - order, order);
-        else if (*data > 0)
-            ctx->dsp.sub_int16(f->coeffs, f->adaptcoeffs - order, order);
-
+        res = ctx->dsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order, f->adaptcoeffs - order, order, APESIGN(*data));
+        res = (res + (1 << (fracbits - 1))) >> fracbits;
         res += *data;
-
         *data++ = res;
 
         /* Update the output history */
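
The hunk above folds the three DSP calls the old filter step made (one scalarproduct_int16() plus a sign-dependent add_int16()/sub_int16() on the coefficients) into one fused call. Below is a minimal standalone C sketch of the fused operation's semantics; it is illustrative only and not part of the revision, and in the decoder the mul argument comes from APESIGN(*data):

#include <stdint.h>

/* Fused filter step: accumulate the dot product of coeffs and delay while
 * adding mul * adapt[i] to coeffs[i] in the same pass over the vectors.
 * mul = +1 reproduces one branch of the removed if/else (add_int16), mul = -1
 * the other branch (sub_int16), and mul = 0 leaves the coefficients untouched,
 * so the sign test on *data collapses into a single call. */
static int32_t fused_filter_step(int16_t *coeffs, const int16_t *delay,
                                 const int16_t *adapt, int order, int mul)
{
    int32_t res = 0;
    while (order--) {
        res       += *coeffs * *delay++;
        *coeffs++ += mul * *adapt++;
    }
    return res;
}
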
libavcodec/dsputil.c
     }
 }
 
-static void add_int16_c(int16_t * v1, int16_t * v2, int order)
-{
-    while (order--)
-       *v1++ += *v2++;
-}
-
-static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
-{
-    while (order--)
-        *v1++ -= *v2++;
-}
-
 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
 {
     int res = 0;
......
     return res;
 }
 
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+{
+    int res = 0;
+    while (order--) {
+        res   += *v1 * *v2++;
+        *v1++ += mul * *v3++;
+    }
+    return res;
+}
+
 #define W0 2048
 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
......
     c->vector_clipf = vector_clipf_c;
     c->float_to_int16 = ff_float_to_int16_c;
     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
-    c->add_int16 = add_int16_c;
-    c->sub_int16 = sub_int16_c;
     c->scalarproduct_int16 = scalarproduct_int16_c;
+    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
     c->scalarproduct_float = scalarproduct_float_c;
     c->butterflies_float = butterflies_float_c;
     c->vector_fmul_scalar = vector_fmul_scalar_c;
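
The C version added above is the reference the SIMD implementations have to match. Here is a small hedged self-test of its behaviour; the kernel is repeated, and main() plus the test values are made up so the sketch compiles on its own:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same loop as scalarproduct_and_madd_int16_c in the hunk above. */
static int32_t sp_madd_ref(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
{
    int32_t res = 0;
    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}

int main(void)
{
    int16_t v1[16], v2[16], v3[16];      /* order kept a multiple of 16 */
    for (int i = 0; i < 16; i++) {
        v1[i] = (int16_t)(i + 1);        /* 1..16 */
        v2[i] = 2;
        v3[i] = 3;
    }
    int32_t dot = sp_madd_ref(v1, v2, v3, 16, -1);
    assert(dot == 272);                  /* 2 * (1 + 2 + ... + 16): the product reads each v1[i] before updating it */
    assert(v1[0] == -2);                 /* 1 + (-1) * 3 */
    printf("dot=%d v1[0]=%d\n", dot, v1[0]);
    return 0;
}
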
libavcodec/dsputil.h
     void (*x8_setup_spatial_compensation)(uint8_t *src, uint8_t *dst, int linesize,
            int * range, int * sum,  int edges);
 
-    /* ape functions */
-    /**
-     * Add contents of the second vector to the first one.
-     * @param len length of vectors, should be multiple of 16
-     */
-    void (*add_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
-    /**
-     * Add contents of the second vector to the first one.
-     * @param len length of vectors, should be multiple of 16
-     */
-    void (*sub_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
     /**
      * Calculate scalar product of two vectors.
      * @param len length of vectors, should be multiple of 16
      * @param shift number of bits to discard from product
      */
     int32_t (*scalarproduct_int16)(int16_t *v1, int16_t *v2/*align 16*/, int len, int shift);
+    /* ape functions */
+    /**
+     * Calculate scalar product of v1 and v2,
+     * and v1[i] += v3[i] * mul
+     * @param len length of vectors, should be multiple of 16
+     */
+    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, int16_t *v2, int16_t *v3, int len, int mul);
 
     /* rv30 functions */
     qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
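
The doxygen comment added above carries the calling contract forward: v1 must be 16-byte aligned (it is loaded with aligned instructions and updated in place) and len a multiple of 16, which is what the SSE2 and AltiVec loops consume per iteration. A hedged sketch of a caller honouring that contract; alignas() stands in for FFmpeg's DECLARE_ALIGNED here, and the function pointer is passed in rather than read from a real DSPContext:

#include <stdalign.h>
#include <stdint.h>

#define ORDER 32   /* any multiple of 16 satisfies the documented constraint */

static alignas(16) int16_t coeffs[ORDER];   /* v1: 16-byte aligned, updated in place */
static int16_t delay[ORDER];                /* v2: no alignment requirement documented */
static int16_t adapt[ORDER];                /* v3: no alignment requirement documented */

typedef int32_t (*sp_madd_fn)(int16_t *v1, int16_t *v2, int16_t *v3, int len, int mul);

/* One call through a pointer of the documented type; in FFmpeg the pointer
 * would be c->scalarproduct_and_madd_int16 after the DSP init has run. */
int32_t one_step(sp_madd_fn sp_madd, int mul)
{
    return sp_madd(coeffs, delay, adapt, ORDER, mul);
}
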
libavcodec/ppc/int_altivec.c
     return u.score[3];
 }
 
-static void add_int16_altivec(int16_t * v1, int16_t * v2, int order)
-{
-    int i;
-    register vec_s16 vec, *pv;
-
-    for(i = 0; i < order; i += 8){
-        pv = (vec_s16*)v2;
-        vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
-        vec_st(vec_add(vec_ld(0, v1), vec), 0, v1);
-        v1 += 8;
-        v2 += 8;
-    }
-}
-
-static void sub_int16_altivec(int16_t * v1, int16_t * v2, int order)
-{
-    int i;
-    register vec_s16 vec, *pv;
-
-    for(i = 0; i < order; i += 8){
-        pv = (vec_s16*)v2;
-        vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
-        vec_st(vec_sub(vec_ld(0, v1), vec), 0, v1);
-        v1 += 8;
-        v2 += 8;
-    }
-}
-
 static int32_t scalarproduct_int16_altivec(int16_t * v1, int16_t * v2, int order, const int shift)
 {
     int i;
......
     return ires;
 }
 
+static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+{
+    LOAD_ZERO;
+    vec_s16 *pv1 = (vec_s16*)v1;
+    vec_s16 *pv2 = (vec_s16*)v2;
+    vec_s16 *pv3 = (vec_s16*)v3;
+    register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul};
+    register vec_s16 t0, t1, i0, i1;
+    register vec_s16 i2 = pv2[0], i3 = pv3[0];
+    register vec_s32 res = zero_s32v;
+    register vec_u8 align = vec_lvsl(0, v2);
+    int32_t ires;
+    order >>= 4;
+    do {
+        t0 = vec_perm(i2, pv2[1], align);
+        i2 = pv2[2];
+        t1 = vec_perm(pv2[1], i2, align);
+        i0 = pv1[0];
+        i1 = pv1[1];
+        res = vec_msum(t0, i0, res);
+        res = vec_msum(t1, i1, res);
+        t0 = vec_perm(i3, pv3[1], align);
+        i3 = pv3[2];
+        t1 = vec_perm(pv3[1], i3, align);
+        pv1[0] = vec_mladd(t0, muls, i0);
+        pv1[1] = vec_mladd(t1, muls, i1);
+        pv1 += 2;
+        pv2 += 2;
+        pv3 += 2;
+    } while(--order);
+    res = vec_splat(vec_sums(res, zero_s32v), 3);
+    vec_ste(res, 0, &ires);
+    return ires;
+}
+
 void int_init_altivec(DSPContext* c, AVCodecContext *avctx)
 {
     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec;
-    c->add_int16 = add_int16_altivec;
-    c->sub_int16 = sub_int16_altivec;
     c->scalarproduct_int16 = scalarproduct_int16_altivec;
+    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
 }
libavcodec/x86/dsputil_mmx.c
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
-void ff_add_int16_mmx2(int16_t * v1, int16_t * v2, int order);
-void ff_add_int16_sse2(int16_t * v1, int16_t * v2, int order);
-void ff_sub_int16_mmx2(int16_t * v1, int16_t * v2, int order);
-void ff_sub_int16_sse2(int16_t * v1, int16_t * v2, int order);
-int32_t ff_scalarproduct_int16_mmx2(int16_t * v1, int16_t * v2, int order, int shift);
-int32_t ff_scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift);
+int32_t ff_scalarproduct_int16_mmx2(int16_t *v1, int16_t *v2, int order, int shift);
+int32_t ff_scalarproduct_int16_sse2(int16_t *v1, int16_t *v2, int order, int shift);
+int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
 int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
 int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
......
         }
         if(mm_flags & FF_MM_MMX2){
 #if HAVE_YASM
-            c->add_int16 = ff_add_int16_mmx2;
-            c->sub_int16 = ff_sub_int16_mmx2;
             c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
+            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
 #endif
         }
         if(mm_flags & FF_MM_SSE){
......
             c->float_to_int16 = float_to_int16_sse2;
             c->float_to_int16_interleave = float_to_int16_interleave_sse2;
 #if HAVE_YASM
-            c->add_int16 = ff_add_int16_sse2;
-            c->sub_int16 = ff_sub_int16_sse2;
             c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
+            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
 #endif
         }
+        if((mm_flags & FF_MM_SSSE3) && !(mm_flags & (FF_MM_SSE42|FF_MM_3DNOW)) && HAVE_YASM) // cachesplit
+            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
     }
 
     if (CONFIG_ENCODERS)
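
The registration logic above picks the widest implementation the CPU flags allow and then prefers the SSSE3 variant only when neither SSE4.2 nor 3DNow is present; the // cachesplit note suggests the palignr-based version is only a win on CPUs where unaligned loads that split a cacheline are expensive. An illustrative C sketch of that selection pattern, with local flag constants standing in for FF_MM_* and the implementation pointers left abstract:

#include <stdint.h>

typedef int32_t (*sp_madd_fn)(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);

enum { CPU_MMX2 = 1, CPU_SSE2 = 2, CPU_SSSE3 = 4, CPU_SSE42 = 8, CPU_3DNOW = 16 };

static sp_madd_fn pick_sp_madd(int flags, sp_madd_fn c_ver, sp_madd_fn mmx2_ver,
                               sp_madd_fn sse2_ver, sp_madd_fn ssse3_ver)
{
    sp_madd_fn fn = c_ver;                   /* portable fallback */
    if (flags & CPU_MMX2) fn = mmx2_ver;     /* later, wider ISAs override earlier picks */
    if (flags & CPU_SSE2) fn = sse2_ver;
    /* take the SSSE3 variant only where its cacheline-split workaround should help */
    if ((flags & CPU_SSSE3) && !(flags & (CPU_SSE42 | CPU_3DNOW)))
        fn = ssse3_ver;
    return fn;
}
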
libavcodec/x86/dsputil_yasm.asm
 
 
 %macro SCALARPRODUCT 1
-; void add_int16(int16_t * v1, int16_t * v2, int order)
-cglobal add_int16_%1, 3,3,2, v1, v2, order
-    shl orderq, 1
-    add v1q, orderq
-    add v2q, orderq
-    neg orderq
-.loop:
-    movu    m0, [v2q + orderq]
-    movu    m1, [v2q + orderq + mmsize]
-    paddw   m0, [v1q + orderq]
-    paddw   m1, [v1q + orderq + mmsize]
-    mova    [v1q + orderq], m0
-    mova    [v1q + orderq + mmsize], m1
-    add     orderq, mmsize*2
-    jl .loop
-    REP_RET
-
-; void sub_int16(int16_t * v1, int16_t * v2, int order)
-cglobal sub_int16_%1, 3,3,4, v1, v2, order
-    shl orderq, 1
-    add v1q, orderq
-    add v2q, orderq
-    neg orderq
-.loop:
-    movu    m2, [v2q + orderq]
-    movu    m3, [v2q + orderq + mmsize]
-    mova    m0, [v1q + orderq]
-    mova    m1, [v1q + orderq + mmsize]
-    psubw   m0, m2
-    psubw   m1, m3
-    mova    [v1q + orderq], m0
-    mova    [v1q + orderq + mmsize], m1
-    add     orderq, mmsize*2
-    jl .loop
-    REP_RET
-
-; int scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
+; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
 cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
     shl orderq, 1
     add v1q, orderq
......
     paddd   m2, m0
     movd   eax, m2
     RET
+
+; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int16_%1, 3,4,8, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd    m7, mulm
+%if mmsize == 16
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+%else
+    pshufw  m7, m7, 0
+%endif
+    pxor    m6, m6
+    add v1q, orderq
+    add v2q, orderq
+    add v3q, orderq
+    neg orderq
+.loop:
+    movu    m0, [v2q + orderq]
+    movu    m1, [v2q + orderq + mmsize]
+    mova    m4, [v1q + orderq]
+    mova    m5, [v1q + orderq + mmsize]
+    movu    m2, [v3q + orderq]
+    movu    m3, [v3q + orderq + mmsize]
+    pmaddwd m0, m4
+    pmaddwd m1, m5
+    pmullw  m2, m7
+    pmullw  m3, m7
+    paddd   m6, m0
+    paddd   m6, m1
+    paddw   m2, m4
+    paddw   m3, m5
+    mova    [v1q + orderq], m2
+    mova    [v1q + orderq + mmsize], m3
+    add     orderq, mmsize*2
+    jl .loop
+%if mmsize == 16
+    movhlps m0, m6
+    paddd   m6, m0
+    pshuflw m0, m6, 0x4e
+%else
+    pshufw  m0, m6, 0x4e
+%endif
+    paddd   m6, m0
+    movd   eax, m6
+    RET
 %endmacro
 
 INIT_MMX
......
 INIT_XMM
 SCALARPRODUCT sse2
 
+%macro SCALARPRODUCT_LOOP 1
+align 16
+.loop%1:
+    sub     orderq, mmsize*2
+%if %1
+    mova    m1, m4
+    mova    m4, [v2q + orderq]
+    mova    m0, [v2q + orderq + mmsize]
+    palignr m1, m0, %1
+    palignr m0, m4, %1
+    mova    m3, m5
+    mova    m5, [v3q + orderq]
+    mova    m2, [v3q + orderq + mmsize]
+    palignr m3, m2, %1
+    palignr m2, m5, %1
+%else
+    mova    m0, [v2q + orderq]
+    mova    m1, [v2q + orderq + mmsize]
+    mova    m2, [v3q + orderq]
+    mova    m3, [v3q + orderq + mmsize]
+%endif
+    pmaddwd m0, [v1q + orderq]
+    pmaddwd m1, [v1q + orderq + mmsize]
+    pmullw  m2, m7
+    pmullw  m3, m7
+    paddw   m2, [v1q + orderq]
+    paddw   m3, [v1q + orderq + mmsize]
+    paddd   m6, m0
+    paddd   m6, m1
+    mova    [v1q + orderq], m2
+    mova    [v1q + orderq + mmsize], m3
+    jg .loop%1
+%if %1
+    jmp .end
+%endif
+%endmacro
+
+; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int16_ssse3, 4,5,8, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd    m7, mulm
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+    pxor    m6, m6
+    mov    r4d, v2d
+    and    r4d, 15
+    and    v2q, ~15
+    and    v3q, ~15
+    mova    m4, [v2q + orderq]
+    mova    m5, [v3q + orderq]
+    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
+    cmp    r4d, 0
+    je .loop0
+    cmp    r4d, 2
+    je .loop2
+    cmp    r4d, 4
+    je .loop4
+    cmp    r4d, 6
+    je .loop6
+    cmp    r4d, 8
+    je .loop8
+    cmp    r4d, 10
+    je .loop10
+    cmp    r4d, 12
+    je .loop12
+SCALARPRODUCT_LOOP 14
+SCALARPRODUCT_LOOP 12
+SCALARPRODUCT_LOOP 10
+SCALARPRODUCT_LOOP 8
+SCALARPRODUCT_LOOP 6
+SCALARPRODUCT_LOOP 4
+SCALARPRODUCT_LOOP 2
+SCALARPRODUCT_LOOP 0
+.end:
+    movhlps m0, m6
+    paddd   m6, m0
+    pshuflw m0, m6, 0x4e
+    paddd   m6, m0
+    movd   eax, m6
+    RET
+
 
 
 ; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
