/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes that stride % 16 == 0 */
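
/* CHROMA_MC8_ALTIVEC_CORE computes one row of the H.264 chroma bilinear
 * interpolation  dst = (A*s[0] + B*s[1] + C*s[stride] + D*s[stride+1] + 32) >> 6
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y (the ABCD table in
 * PREFIX_h264_chroma_mc8_altivec below).  The four vec_mladd calls accumulate
 * the weighted, zero-extended samples; fperm merges the packed 8-byte result
 * into the destination vector before the aligned store. */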
#define CHROMA_MC8_ALTIVEC_CORE \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;
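
/* CHROMA_MC8_ALTIVEC_CORE_SIMPLE is the two-tap variant used when x == 0 or
 * y == 0: two of the weights vanish and the per-row sum reduces to
 * (vA*vsrc0 + vE*vsrc1 + 32) >> 6 with vE = vB + vC. */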
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;
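
/* 8-pixel-wide chroma motion compensation.  loadSecond is set when the nine
 * source bytes needed per row may straddle two aligned 16-byte blocks;
 * reallyBadAlign flags the src % 16 == 15 case, where the vec_lvsl(1, src)
 * mask would select only from the first load, so the second load is used
 * directly instead of going through vec_perm. */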
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                    int stride, int h, int x, int y) {
    POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    
95
    vec_u8 vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
96
    vec_u8 vsrc0uc, vsrc1uc;
97
    vec_s16 vsrc0ssH, vsrc1ssH;
98
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
99
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
100
    vec_u8 vdst, ppsum, vfdst, fsum;
101

    
102
  POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
103

    
104
    if (((unsigned long)dst) % 16 == 0) {
105
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
106
                           0x14, 0x15, 0x16, 0x17,
107
                           0x08, 0x09, 0x0A, 0x0B,
108
                           0x0C, 0x0D, 0x0E, 0x0F};
109
    } else {
110
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
111
                           0x04, 0x05, 0x06, 0x07,
112
                           0x18, 0x19, 0x1A, 0x1B,
113
                           0x1C, 0x1D, 0x1E, 0x1F};
114
    }
115

    
116
    vsrcAuc = vec_ld(0, src);
117

    
118
    if (loadSecond)
119
        vsrcBuc = vec_ld(16, src);
120
    vsrcperm0 = vec_lvsl(0, src);
121
    vsrcperm1 = vec_lvsl(1, src);
122

    
123
    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
124
    if (reallyBadAlign)
125
        vsrc1uc = vsrcBuc;
126
    else
127
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
128

    
129
    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
130
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
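
    /* ABCD[3] = x*y: non-zero means the full four-tap bilinear kernel is
     * needed; otherwise x or y is zero, two weights drop out and the cheaper
     * two-tap core with vE = vB + vC is used. */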
    if (ABCD[3]) {
        if (!loadSecond) { // -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) { // -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) { // -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}

#undef CHROMA_MC8_ALTIVEC_CORE

/* this code assumes stride % 16 == 0 */
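/* Horizontal 16-wide luma half-pel filter: each output pixel applies the H.264
 * 6-tap kernel (1, -5, 20, 20, -5, 1) to src[-2..+3], adds 16 and shifts right
 * by 5 before saturating to 8 bits (v20ss, v5ss, v16ss and the vec_sra by v5us
 * below). */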
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);
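
        /* src[-2] .. src[18] (21 bytes per row) span at most three aligned
         * 16-byte blocks.  Up to align == 11 the two loads above are enough
         * (at 11, srcP3 coincides with srcR2); for align 12-15 a third load
         * srcR3 is needed, and any shifted vector that starts exactly on the
         * second block is taken as-is, since vec_perm with the wrapped lvsl
         * mask would pick from the wrong input. */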
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }
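
        /* zero-extend the bytes to 16 bits: the ...A vectors hold pixels 0-7
         * of the row, the ...B vectors pixels 8-15 */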
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 */
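/* Vertical 16-wide luma half-pel filter: the same (1, -5, 20, 20, -5, 1)
 * kernel is applied down each column.  Five rows, src - 2*srcStride through
 * src + 2*srcStride, are preloaded into registers; each iteration loads one
 * new row and slides the six-row window down by one. */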
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);
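
        /* slide the six-row window down by one row for the next iteration */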
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
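/* Combined horizontal + vertical half-pel filter.  The first pass applies the
 * horizontal 6-tap kernel to 21 source rows and stores the unrounded 16-bit
 * intermediates in tmp; the second pass runs the same kernel vertically over
 * tmp in 32-bit precision, adds 512 and shifts right by 10 before saturating
 * back to 8 bits. */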
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;
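
        /* From here on 32-bit precision is needed: vec_mule/vec_mulo give the
         * even/odd 32-bit products, the sum3 terms are sign-extended to 32 bits
         * (vec_sra by 16 on the even lanes, multiply by 1 on the odd ones), and
         * mperm re-interleaves the even/odd halves after the final packing. */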
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}