ffmpeg / libavcodec / ppc / h264_template_altivec.c @ e3905ce0

/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
/* abort when ptr is not 16-byte aligned */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes that stride % 16 == 0 */

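/* CHROMA_MC8_ALTIVEC_CORE computes one 8-pixel row of the H.264 chroma
 * bilinear interpolation, out = (A*s00 + B*s01 + C*s10 + D*s11 + 32) >> 6,
 * merges the packed result into the aligned destination vector via fperm,
 * and applies OP_U8_ALTIVEC (put or avg, supplied by the including file)
 * before the store.  The bottom row of the 2x2 neighbourhood
 * (vsrc2ssH/vsrc3ssH) is kept so the next iteration can reuse it as its
 * top row. */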
#define CHROMA_MC8_ALTIVEC_CORE \
        vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);\
        vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8_t)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

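/* CHROMA_MC8_ALTIVEC_CORE_SIMPLE handles the degenerate case where x == 0 or
 * y == 0: only two source samples contribute, weighted by vA and
 * vE = vB + vC (one of which is zero). */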
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);\
        vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8_t)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

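/* Chroma motion compensation for an 8-pixel-wide block of height h.  The
 * bilinear weights are A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y.
 * src may be unaligned: loadSecond/reallyBadAlign plus the vec_lvsl permutes
 * rebuild the misaligned samples from aligned loads, while dst is only ever
 * read and written as full aligned 16-byte vectors. */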
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                    int stride, int h, int x, int y) {
    POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8_t fperm;
    const vec_s32_t vABCD = vec_ld(0, ABCD);
    const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
    const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
    const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
    const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
    LOAD_ZERO;
    const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16_t v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
    vec_u8_t vsrc0uc, vsrc1uc;
    vec_s16_t vsrc0ssH, vsrc1ssH;
    vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16_t vsrc2ssH, vsrc3ssH, psum;
    vec_u8_t vdst, ppsum, vfdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);

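    /* fperm merges the 8 computed bytes into one half of the aligned 16-byte
     * destination line and keeps the other half of vdst untouched, so the
     * store can always be a full aligned vec_st.  The else branch presumably
     * corresponds to dst % 16 == 8, the only other offset an 8-wide block can
     * have when stride % 16 == 0. */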
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
                              0x14, 0x15, 0x16, 0x17,
                              0x08, 0x09, 0x0A, 0x0B,
                              0x0C, 0x0D, 0x0E, 0x0F);
    } else {
        fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
                              0x04, 0x05, 0x06, 0x07,
                              0x18, 0x19, 0x1A, 0x1B,
                              0x1C, 0x1D, 0x1E, 0x1F);
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);
    vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);

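    /* ABCD[3] == x*y: nonzero means both fractional offsets are nonzero and
     * all four taps are needed; otherwise take the two-tap path with
     * vE = vB + vC (one of them being zero). */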
    if (ABCD[3]) {
        if (!loadSecond) { // -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE
            }
        } else {
            vec_u8_t vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE
            }
        }
    } else {
        const vec_s16_t vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0, B == 0
            if (!loadSecond) { // -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8_t vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0, C == 0
            if (!loadSecond) { // -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8_t vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}

#undef CHROMA_MC8_ALTIVEC_CORE

/* this code assumes stride % 16 == 0 */
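/* Horizontal luma half-pel filter for a 16x16 block: for each pixel,
 * out = clip8((src[-2] - 5*src[-1] + 20*src[0] + 20*src[1] - 5*src[2]
 *              + src[3] + 16) >> 5).
 * Each row is processed as two vectors of eight 16-bit intermediates
 * (the A and B halves below). */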
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    LOAD_ZERO;
    const vec_u8_t permM2 = vec_lvsl(-2, src);
    const vec_u8_t permM1 = vec_lvsl(-1, src);
    const vec_u8_t permP0 = vec_lvsl(+0, src);
    const vec_u8_t permP1 = vec_lvsl(+1, src);
    const vec_u8_t permP2 = vec_lvsl(+2, src);
    const vec_u8_t permP3 = vec_lvsl(+3, src);
    const vec_s16_t v5ss = vec_splat_s16(5);
    const vec_u16_t v5us = vec_splat_u16(5);
    const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB;

    vec_u8_t sum, vdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8_t srcR1 = vec_ld(-2, src);
        vec_u8_t srcR2 = vec_ld(14, src);

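        /* Build the six byte-shifted source vectors src[-2]..src[+3] from
         * aligned loads, with align == ((src - 2) % 16).  For align <= 10 the
         * two loads above cover all 21 bytes needed (default case).  When a
         * shifted pointer is itself 16-byte aligned, vec_lvsl wraps to 0, so
         * the raw second (or third) load is used directly; for align >= 12
         * the window spills into a third 16-byte line and srcR3 is loaded. */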
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 */
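/* Vertical luma half-pel filter for a 16x16 block: the same 6-tap
 * (1,-5,20,20,-5,1) kernel applied down each column.  Six source rows are
 * kept in registers as a sliding window that is rotated every iteration, so
 * only one new row is loaded per output row. */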
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8_t perm = vec_lvsl(0, src);
    const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16_t v5us = vec_splat_u16(5);
    const vec_s16_t v5ss = vec_splat_s16(5);
    const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

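    /* Pre-load the five rows from src - 2*srcStride up to src + 2*srcStride;
     * the loop below then only loads row P3 (three rows below the current
     * output row) each iteration. */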
    const vec_u8_t srcM2a = vec_ld(0, srcbis);
    const vec_u8_t srcM2b = vec_ld(16, srcbis);
    const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcM1b = vec_ld(16, srcbis);
    const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcP0b = vec_ld(16, srcbis);
    const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcP1b = vec_ld(16, srcbis);
    const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcP2b = vec_ld(16, srcbis);
    const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
    vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
    vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
    vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
    vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
    vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
    vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
    vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
    vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
    vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);

    vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB,
              srcP3ssA, srcP3ssB,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 *and* that tmp is properly aligned */
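/* Combined horizontal + vertical luma filter (the center half-pel position).
 * First pass: the horizontal 6-tap is applied to 21 rows (16 output rows
 * plus the 5 extra rows the vertical filter needs) and the raw 16-bit sums
 * are stored in tmp without rounding or shifting.  Second pass: the vertical
 * 6-tap is applied to tmp in 32-bit precision, then rounded with +512,
 * shifted right by 10 and saturated to bytes. */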
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    LOAD_ZERO;
    const vec_u8_t permM2 = vec_lvsl(-2, src);
    const vec_u8_t permM1 = vec_lvsl(-1, src);
    const vec_u8_t permP0 = vec_lvsl(+0, src);
    const vec_u8_t permP1 = vec_lvsl(+1, src);
    const vec_u8_t permP2 = vec_lvsl(+2, src);
    const vec_u8_t permP3 = vec_lvsl(+3, src);
    const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32_t v10ui = vec_splat_u32(10);
    const vec_s16_t v5ss = vec_splat_s16(5);
    const vec_s16_t v1ss = vec_splat_s16(1);
    const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8_t mperm = (const vec_u8_t)
      AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
          0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
    int16_t *tmpbis = tmp;

    vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
              tmpP2ssA, tmpP2ssB;

    vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
              ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8_t fsum, sumv, sum, vdst;
    vec_s16_t ssume, ssumo;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    src -= (2 * srcStride);
506
        vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
507
        vec_u8_t srcR1 = vec_ld(-2, src);
508
        vec_u8_t srcR2 = vec_ld(14, src);
509

    
510
        switch (align) {
511
        default: {
512
            srcM2 = vec_perm(srcR1, srcR2, permM2);
513
            srcM1 = vec_perm(srcR1, srcR2, permM1);
514
            srcP0 = vec_perm(srcR1, srcR2, permP0);
515
            srcP1 = vec_perm(srcR1, srcR2, permP1);
516
            srcP2 = vec_perm(srcR1, srcR2, permP2);
517
            srcP3 = vec_perm(srcR1, srcR2, permP3);
518
        } break;
519
        case 11: {
520
            srcM2 = vec_perm(srcR1, srcR2, permM2);
521
            srcM1 = vec_perm(srcR1, srcR2, permM1);
522
            srcP0 = vec_perm(srcR1, srcR2, permP0);
523
            srcP1 = vec_perm(srcR1, srcR2, permP1);
524
            srcP2 = vec_perm(srcR1, srcR2, permP2);
525
            srcP3 = srcR2;
526
        } break;
527
        case 12: {
528
            vec_u8_t srcR3 = vec_ld(30, src);
529
            srcM2 = vec_perm(srcR1, srcR2, permM2);
530
            srcM1 = vec_perm(srcR1, srcR2, permM1);
531
            srcP0 = vec_perm(srcR1, srcR2, permP0);
532
            srcP1 = vec_perm(srcR1, srcR2, permP1);
533
            srcP2 = srcR2;
534
            srcP3 = vec_perm(srcR2, srcR3, permP3);
535
        } break;
536
        case 13: {
537
            vec_u8_t srcR3 = vec_ld(30, src);
538
            srcM2 = vec_perm(srcR1, srcR2, permM2);
539
            srcM1 = vec_perm(srcR1, srcR2, permM1);
540
            srcP0 = vec_perm(srcR1, srcR2, permP0);
541
            srcP1 = srcR2;
542
            srcP2 = vec_perm(srcR2, srcR3, permP2);
543
            srcP3 = vec_perm(srcR2, srcR3, permP3);
544
        } break;
545
        case 14: {
546
            vec_u8_t srcR3 = vec_ld(30, src);
547
            srcM2 = vec_perm(srcR1, srcR2, permM2);
548
            srcM1 = vec_perm(srcR1, srcR2, permM1);
549
            srcP0 = srcR2;
550
            srcP1 = vec_perm(srcR2, srcR3, permP1);
551
            srcP2 = vec_perm(srcR2, srcR3, permP2);
552
            srcP3 = vec_perm(srcR2, srcR3, permP3);
553
        } break;
554
        case 15: {
555
            vec_u8_t srcR3 = vec_ld(30, src);
556
            srcM2 = vec_perm(srcR1, srcR2, permM2);
557
            srcM1 = srcR2;
558
            srcP0 = vec_perm(srcR2, srcR3, permP0);
559
            srcP1 = vec_perm(srcR2, srcR3, permP1);
560
            srcP2 = vec_perm(srcR2, srcR3, permP2);
561
            srcP3 = vec_perm(srcR2, srcR3, permP3);
562
        } break;
563
        }
564

    
565
        srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
566
        srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
567
        srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
568
        srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
569

    
570
        srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
571
        srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
572
        srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
573
        srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
574

    
575
        srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
576
        srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
577
        srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
578
        srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
579

    
580
        sum1A = vec_adds(srcP0A, srcP1A);
581
        sum1B = vec_adds(srcP0B, srcP1B);
582
        sum2A = vec_adds(srcM1A, srcP2A);
583
        sum2B = vec_adds(srcM1B, srcP2B);
584
        sum3A = vec_adds(srcM2A, srcP3A);
585
        sum3B = vec_adds(srcM2B, srcP3B);
586

    
587
        pp1A = vec_mladd(sum1A, v20ss, sum3A);
588
        pp1B = vec_mladd(sum1B, v20ss, sum3B);
589

    
590
        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
591
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
592

    
593
        psumA = vec_sub(pp1A, pp2A);
594
        psumB = vec_sub(pp1B, pp2B);
595

    
596
        vec_st(psumA, 0, tmp);
597
        vec_st(psumB, 16, tmp);
598

    
599
        src += srcStride;
600
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
601
    }
602

    
603
    tmpM2ssA = vec_ld(0, tmpbis);
604
    tmpM2ssB = vec_ld(16, tmpbis);
605
    tmpbis += tmpStride;
606
    tmpM1ssA = vec_ld(0, tmpbis);
607
    tmpM1ssB = vec_ld(16, tmpbis);
608
    tmpbis += tmpStride;
609
    tmpP0ssA = vec_ld(0, tmpbis);
610
    tmpP0ssB = vec_ld(16, tmpbis);
611
    tmpbis += tmpStride;
612
    tmpP1ssA = vec_ld(0, tmpbis);
613
    tmpP1ssB = vec_ld(16, tmpbis);
614
    tmpbis += tmpStride;
615
    tmpP2ssA = vec_ld(0, tmpbis);
616
    tmpP2ssB = vec_ld(16, tmpbis);
617
    tmpbis += tmpStride;
618

    
619
    for (i = 0 ; i < 16 ; i++) {
620
        const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
621
        const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);
622

    
623
        const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
624
        const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
625
        const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
626
        const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
627
        const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
628
        const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
629

    
630
        tmpbis += tmpStride;
631

    
632
        tmpM2ssA = tmpM1ssA;
633
        tmpM2ssB = tmpM1ssB;
634
        tmpM1ssA = tmpP0ssA;
635
        tmpM1ssB = tmpP0ssB;
636
        tmpP0ssA = tmpP1ssA;
637
        tmpP0ssB = tmpP1ssB;
638
        tmpP1ssA = tmpP2ssA;
639
        tmpP1ssB = tmpP2ssB;
640
        tmpP2ssA = tmpP3ssA;
641
        tmpP2ssB = tmpP3ssB;
642

    
643
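        /* The vertical sums can overflow 16 bits, so the second pass widens
         * to 32 bits: vec_mule/vec_mulo produce the products of the even and
         * odd halfword lanes separately, and the coefficient-1 term sum3 is
         * widened with an arithmetic shift for the even lanes and a multiply
         * by one for the odd lanes.  After adding the +512 rounding term and
         * shifting right by 10, mperm re-interleaves the even and odd results
         * back into pixel order before the store. */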
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}