ffmpeg / libavcodec / ppc / h264_template_altivec.c @ ed040f35

/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)(ptr) & 0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes that stride % 16 == 0 */
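
/* Bilinear 8-pixel-wide chroma interpolation core:
 *     pred = (A*s00 + B*s01 + C*s10 + D*s11 + bias) >> 6
 * with A = (8-x)(8-y), B = x(8-y), C = (8-x)y and D = xy.  BIAS1 is the
 * additive term fed into the first vec_mladd and BIAS2 is a hook applied
 * to the finished sum, so the same core serves the rounded H.264 version
 * (BIAS1 = 32, BIAS2 = noop) and the no-rounding VC-1 version (BIAS1 = 0,
 * BIAS2 = add28).  The _SIMPLE variant below covers the one-dimensional
 * cases (x == 0 or y == 0), where the filter collapses to two taps with
 * the combined weight vE = vB + vC. */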

#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

#define noop(a) a
#define add28(a) vec_add(v28ss, a)
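
/* 8xH chroma motion compensation.  ABCD holds the four bilinear weights
 * derived from the fractional position (x, y).  loadSecond is set when
 * the nine source bytes needed per row straddle a 16-byte boundary, so a
 * second aligned load is required; reallyBadAlign covers src % 16 == 15,
 * where src + 1 is itself 16-byte aligned, vec_lvsl(1, src) wraps to the
 * identity permute, and the second aligned load must be used directly
 * instead of a vec_perm of the pair. */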

#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                    int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
}
#endif

/* this code assumes that stride % 16 == 0 */
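/* Same bilinear core as above, but with the no-rounding bias used by the
 * VC-1 chroma path: the accumulator starts at 0 and 28 (v28ss = 32 - 4)
 * is added just before the final >> 6. */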
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us  = vec_splat_u16(6);
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {


            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}
#endif

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE

/* this code assumes stride % 16 == 0 */
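/* Horizontal 6-tap half-pel luma filter over a 16-pixel-wide block:
 *     out = clip(((p[-2] + p[3]) - 5*(p[-1] + p[2]) + 20*(p[0] + p[1]) + 16) >> 5)
 * computed on the high and low 8-sample halves (the A/B variables) in
 * 16-bit arithmetic.  The switch on 'align' chooses how the six shifted
 * source rows are assembled from the aligned loads: for alignments 11-15
 * one row starts exactly on a 16-byte boundary (vec_lvsl wraps to the
 * identity permute, so the aligned load is used as-is), and alignments
 * 12-15 additionally need a third load. */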
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 */
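/* Vertical 6-tap half-pel luma filter over a 16x16 block, using the same
 *     out = clip(((p[-2] + p[3]) - 5*(p[-1] + p[2]) + 20*(p[0] + p[1]) + 16) >> 5)
 * kernel applied down each column.  A sliding window of six source rows
 * (M2..P3), each split into high/low 16-bit halves, is kept in registers;
 * every iteration loads one new row and shifts the window down. */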
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB,
              srcP3ssA, srcP3ssB,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 *and* that tmp is properly aligned */
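/* Combined horizontal + vertical 6-tap filtering for the quarter-pel
 * positions that need both passes.  Pass 1 runs the horizontal filter,
 * without rounding or shifting, over 16 + 5 = 21 rows and stores the
 * 16-bit intermediates to tmp.  Pass 2 runs the vertical filter over tmp
 * in 32-bit precision (even/odd lanes via vec_mule/vec_mulo), adds the
 * combined rounding term 512 and shifts right by 10 before packing back
 * to bytes with saturation. */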
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
              tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
              ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif