/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
/* assert that the pointer is 16-byte aligned */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes that stride % 16 == 0 */
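/* CHROMA_MC8_ALTIVEC_CORE below evaluates, for one row of 8 pixels, the
 * bilinear chroma interpolation
 *     dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1]
 *               + bias) >> 6
 * with A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy; BIAS1/BIAS2 let the
 * caller pick the rounding constant, and the result is combined with dst
 * through OP_U8_ALTIVEC (put or avg, supplied by the file that includes
 * this template).  CHROMA_MC8_ALTIVEC_CORE_SIMPLE is the degenerate case
 * where x == 0 or y == 0, so only two taps (vA and vE = vB + vC) remain. */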

#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

#define noop(a) a
#define add28(a) vec_add(v28ss, a)

    
78
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
79
                                    int stride, int h, int x, int y) {
80
  POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
81
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
82
                        {((8 - x) * (8 - y)),
83
                         ((    x) * (8 - y)),
84
                         ((8 - x) * (    y)),
85
                         ((    x) * (    y))};
86
    register int i;
87
    vec_u8 fperm;
88
    const vec_s32 vABCD = vec_ld(0, ABCD);
89
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
90
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
91
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
92
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
93
    LOAD_ZERO;
94
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
95
    const vec_u16 v6us = vec_splat_u16(6);
96
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
97
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
98

    
99
    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
100
    vec_u8 vsrc0uc, vsrc1uc;
101
    vec_s16 vsrc0ssH, vsrc1ssH;
102
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
103
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
104
    vec_u8 vdst, ppsum, vfdst, fsum;
105

    
106
  POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
107

    
108
    if (((unsigned long)dst) % 16 == 0) {
109
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
110
                         0x14, 0x15, 0x16, 0x17,
111
                         0x08, 0x09, 0x0A, 0x0B,
112
                         0x0C, 0x0D, 0x0E, 0x0F};
113
    } else {
114
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
115
                         0x04, 0x05, 0x06, 0x07,
116
                         0x18, 0x19, 0x1A, 0x1B,
117
                         0x1C, 0x1D, 0x1E, 0x1F};
118
    }
119

    
120
    vsrcAuc = vec_ld(0, src);
121

    
122
    if (loadSecond)
123
        vsrcBuc = vec_ld(16, src);
124
    vsrcperm0 = vec_lvsl(0, src);
125
    vsrcperm1 = vec_lvsl(1, src);
126

    
127
    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
128
    if (reallyBadAlign)
129
        vsrc1uc = vsrcBuc;
130
    else
131
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
132

    
133
    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
134
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
135

    
136
    if (ABCD[3]) {
137
        if (!loadSecond) {// -> !reallyBadAlign
138
            for (i = 0 ; i < h ; i++) {
139
                vsrcCuc = vec_ld(stride + 0, src);
140
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
141
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
142

    
143
                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
144
            }
145
        } else {
146
            vec_u8 vsrcDuc;
147
            for (i = 0 ; i < h ; i++) {
148
                vsrcCuc = vec_ld(stride + 0, src);
149
                vsrcDuc = vec_ld(stride + 16, src);
150
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
151
                if (reallyBadAlign)
152
                    vsrc3uc = vsrcDuc;
153
                else
154
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
155

    
156
                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
157
            }
158
        }
159
    } else {
160
        const vec_s16 vE = vec_add(vB, vC);
161
        if (ABCD[2]) { // x == 0 B == 0
162
            if (!loadSecond) {// -> !reallyBadAlign
163
                for (i = 0 ; i < h ; i++) {
164
                    vsrcCuc = vec_ld(stride + 0, src);
165
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
166
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
167

    
168
                    vsrc0uc = vsrc1uc;
169
                }
170
            } else {
171
                vec_u8 vsrcDuc;
172
                for (i = 0 ; i < h ; i++) {
173
                    vsrcCuc = vec_ld(stride + 0, src);
174
                    vsrcDuc = vec_ld(stride + 15, src);
175
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
176
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
177

    
178
                    vsrc0uc = vsrc1uc;
179
                }
180
            }
181
        } else { // y == 0 C == 0
182
            if (!loadSecond) {// -> !reallyBadAlign
183
                for (i = 0 ; i < h ; i++) {
184
                    vsrcCuc = vec_ld(0, src);
185
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
186
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
187

    
188
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
189
                }
190
            } else {
191
                vec_u8 vsrcDuc;
192
                for (i = 0 ; i < h ; i++) {
193
                    vsrcCuc = vec_ld(0, src);
194
                    vsrcDuc = vec_ld(15, src);
195
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
196
                    if (reallyBadAlign)
197
                        vsrc1uc = vsrcDuc;
198
                    else
199
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
200

    
201
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
202
                }
203
            }
204
        }
205
    }
206
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
207
}
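
/* Same bilinear chroma interpolation as above, but for the VC-1/WMV3
 * "no rounding" mode: the weighted sum is rounded with +28 (v28ss = 32 - 4)
 * instead of +32 before the >> 6, via the add28 wrapper. */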

/* this code assumes that stride % 16 == 0 */
void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us  = vec_splat_u16(6);
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {

            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE

/* this code assumes stride % 16 == 0 */
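/* Horizontal half-pel lowpass: for each of 16 rows, apply the 6-tap FIR
 * (1, -5, 20, 20, -5, 1) around each pixel and round with (+16) >> 5, i.e.
 *     sum = 20*(P0+P1) - 5*(M1+P2) + (M2+P3),  dst = clip_u8((sum+16)>>5),
 * which is what v20ss/v5ss/v16ss/v5us implement below.  The switch on
 * 'align' covers the source alignments (cases 11-15) where one of the six
 * shifted reads starts exactly at the second 16-byte block, so the
 * lvsl-based permute cannot be used and srcR2 (or a third load, srcR3)
 * is taken directly. */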
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 */
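/* Vertical half-pel lowpass: the same 6-tap filter and (+16) >> 5 rounding,
 * applied down columns.  Six rows are kept in registers (M2..P2 are
 * preloaded, P3 is loaded inside the loop) and shifted down by one row per
 * iteration, so each source row is loaded only once. */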
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB,
              srcP3ssA, srcP3ssB,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
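/* Half-pel lowpass in both directions: a first pass applies the horizontal
 * 6-tap filter to 16+5 rows and stores the unrounded 16-bit intermediates
 * in tmp; a second pass runs the vertical 6-tap filter over tmp in 32-bit
 * precision (vec_mule/vec_mulo even/odd products), rounds with (+512) >> 10
 * and re-interleaves the even/odd halves with mperm before packing. */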
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
              tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
              ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}