/*
 * ffmpeg / libavcodec / ppc / dsputil_h264_template_altivec.c @ 47261113
 * (repository-browser export header reduced to a comment)
 */

1
/*
2
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
3
 *
4
 * This library is free software; you can redistribute it and/or
5
 * modify it under the terms of the GNU Lesser General Public
6
 * License as published by the Free Software Foundation; either
7
 * version 2 of the License, or (at your option) any later version.
8
 *
9
 * This library is distributed in the hope that it will be useful,
10
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12
 * Lesser General Public License for more details.
13
 *
14
 * You should have received a copy of the GNU Lesser General Public
15
 * License along with this library; if not, write to the Free Software
16
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
 */
18

    
19
/* this code assume that stride % 16 == 0 */
20
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
21
  POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
22
    signed int ABCD[4] __attribute__((aligned(16))) =
23
                        {((8 - x) * (8 - y)),
24
                          ((x) * (8 - y)),
25
                          ((8 - x) * (y)),
26
                          ((x) * (y))};
27
    register int i;
28
    vector unsigned char fperm;
29
    const vector signed int vABCD = vec_ld(0, ABCD);
30
    const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
31
    const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
32
    const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
33
    const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
34
    const vector signed int vzero = vec_splat_s32(0);
35
    const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
36
    const vector unsigned short v6us = vec_splat_u16(6);
37
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
38
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
39

    
40
    vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
41
    vector unsigned char vsrc0uc, vsrc1uc;
42
    vector signed short vsrc0ssH, vsrc1ssH;
43
    vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
44
    vector signed short vsrc2ssH, vsrc3ssH, psum;
45
    vector unsigned char vdst, ppsum, vfdst, fsum;
46

    
47
  POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
48

    
49
    if (((unsigned long)dst) % 16 == 0) {
50
      fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
51
                                        0x14, 0x15, 0x16, 0x17,
52
                                        0x08, 0x09, 0x0A, 0x0B,
53
                                        0x0C, 0x0D, 0x0E, 0x0F);
54
    } else {
55
      fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
56
                                        0x04, 0x05, 0x06, 0x07,
57
                                        0x18, 0x19, 0x1A, 0x1B,
58
                                        0x1C, 0x1D, 0x1E, 0x1F);
59
    }
60

    
61
    vsrcAuc = vec_ld(0, src);
62

    
63
    if (loadSecond)
64
      vsrcBuc = vec_ld(16, src);
65
    vsrcperm0 = vec_lvsl(0, src);
66
    vsrcperm1 = vec_lvsl(1, src);
67

    
68
    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
69
    if (reallyBadAlign)
70
      vsrc1uc = vsrcBuc;
71
    else
72
      vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
73

    
74
    vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
75
                                               (vector unsigned char)vsrc0uc);
76
    vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
77
                                               (vector unsigned char)vsrc1uc);
78

    
79
    if (!loadSecond) {// -> !reallyBadAlign
80
      for (i = 0 ; i < h ; i++) {
81

    
82

    
83
        vsrcCuc = vec_ld(stride + 0, src);
84

    
85
        vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
86
        vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
87

    
88
        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
89
                                                (vector unsigned char)vsrc2uc);
90
        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
91
                                                (vector unsigned char)vsrc3uc);
92

    
93
        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
94
        psum = vec_mladd(vB, vsrc1ssH, psum);
95
        psum = vec_mladd(vC, vsrc2ssH, psum);
96
        psum = vec_mladd(vD, vsrc3ssH, psum);
97
        psum = vec_add(v32ss, psum);
98
        psum = vec_sra(psum, v6us);
99

    
100
        vdst = vec_ld(0, dst);
101
        ppsum = (vector unsigned char)vec_packsu(psum, psum);
102
        vfdst = vec_perm(vdst, ppsum, fperm);
103

    
104
        OP_U8_ALTIVEC(fsum, vfdst, vdst);
105

    
106
        vec_st(fsum, 0, dst);
107

    
108
        vsrc0ssH = vsrc2ssH;
109
        vsrc1ssH = vsrc3ssH;
110

    
111
        dst += stride;
112
        src += stride;
113
      }
114
    } else {
115
        vector unsigned char vsrcDuc;
116
      for (i = 0 ; i < h ; i++) {
117
        vsrcCuc = vec_ld(stride + 0, src);
118
        vsrcDuc = vec_ld(stride + 16, src);
119

    
120
        vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
121
        if (reallyBadAlign)
122
          vsrc3uc = vsrcDuc;
123
        else
124
          vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
125

    
126
        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
127
                                                (vector unsigned char)vsrc2uc);
128
        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
129
                                                (vector unsigned char)vsrc3uc);
130

    
131
        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
132
        psum = vec_mladd(vB, vsrc1ssH, psum);
133
        psum = vec_mladd(vC, vsrc2ssH, psum);
134
        psum = vec_mladd(vD, vsrc3ssH, psum);
135
        psum = vec_add(v32ss, psum);
136
        psum = vec_sr(psum, v6us);
137

    
138
        vdst = vec_ld(0, dst);
139
        ppsum = (vector unsigned char)vec_pack(psum, psum);
140
        vfdst = vec_perm(vdst, ppsum, fperm);
141

    
142
        OP_U8_ALTIVEC(fsum, vfdst, vdst);
143

    
144
        vec_st(fsum, 0, dst);
145

    
146
        vsrc0ssH = vsrc2ssH;
147
        vsrc1ssH = vsrc3ssH;
148

    
149
        dst += stride;
150
        src += stride;
151
      }
152
    }
153
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
154
}
155

    
156
/* this code assume stride % 16 == 0 */
/*
 * Horizontal 6-tap lowpass filter over a 16x16 block:
 *   dst[i] = clip((src[i-2] - 5*src[i-1] + 20*src[i] + 20*src[i+1]
 *                  - 5*src[i+2] + src[i+3] + 16) >> 5)
 * combined into dst through OP_U8_ALTIVEC (put or avg, chosen by the
 * including file).  dst may be unaligned; stores go through a
 * read-modify-write with a lvsr-derived select mask.
 */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
  register int i;

  const vector signed int vzero = vec_splat_s32(0);
  /* permutes extracting the six unaligned windows src-2 .. src+3
     from two (or three) aligned 16-byte loads */
  const vector unsigned char permM2 = vec_lvsl(-2, src);
  const vector unsigned char permM1 = vec_lvsl(-1, src);
  const vector unsigned char permP0 = vec_lvsl(+0, src);
  const vector unsigned char permP1 = vec_lvsl(+1, src);
  const vector unsigned char permP2 = vec_lvsl(+2, src);
  const vector unsigned char permP3 = vec_lvsl(+3, src);
  const vector signed short v5ss = vec_splat_s16(5);
  const vector unsigned short v5us = vec_splat_u16(5);                          /* final >> 5 */
  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));  /* tap weight 20 */
  const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));  /* rounding bias 16 */
  /* mask selecting which bytes of the two aligned dst vectors get the result */
  const vector unsigned char dstperm = vec_lvsr(0, dst);
  const vector unsigned char neg1 =
                                (const vector unsigned char) vec_splat_s8(-1);

  const vector unsigned char dstmask =
                                vec_perm((const vector unsigned char)vzero,
                                                               neg1, dstperm);

  vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

  /* alignment of src-2 decides how many aligned loads cover src-2 .. src+18 */
  register int align = ((((unsigned long)src) - 2) % 16);

  vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
                      srcP2A, srcP2B, srcP3A, srcP3B,
                      srcM1A, srcM1B, srcM2A, srcM2B,
                      sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
                      pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
                      psumA, psumB, sumA, sumB;

  vector unsigned char sum, dst1, dst2, vdst, fsum,
                       rsum, fdst1, fdst2;

  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

  for (i = 0 ; i < 16 ; i ++) {
    vector unsigned char srcR1 = vec_ld(-2, src);
    vector unsigned char srcR2 = vec_ld(14, src);

    /* align <= 10 (default): srcR1/srcR2 cover all six windows.
       align == 11..15: some windows start at or beyond srcR2, so a third
       load (srcR3) is needed — except the window that coincides exactly
       with srcR2, where the identity permute is replaced by a copy. */
    switch (align) {
    default: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = vec_perm(srcR1, srcR2, permP3);
    } break;
    case 11: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = srcR2;
    } break;
    case 12: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = srcR2;
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 13: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = srcR2;
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 14: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = srcR2;
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 15: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = srcR2;
      srcP0 = vec_perm(srcR2, srcR3, permP0);
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    }

    /* zero-extend all 16 pixels of each window to 16-bit lanes:
       A = high (first) 8 pixels, B = low (last) 8 pixels */
    srcP0A = (vector signed short)
                vec_mergeh((vector unsigned char)vzero, srcP0);
    srcP0B = (vector signed short)
                vec_mergel((vector unsigned char)vzero, srcP0);
    srcP1A = (vector signed short)
                vec_mergeh((vector unsigned char)vzero, srcP1);
    srcP1B = (vector signed short)
                vec_mergel((vector unsigned char)vzero, srcP1);

    srcP2A = (vector signed short)
                vec_mergeh((vector unsigned char)vzero, srcP2);
    srcP2B = (vector signed short)
                vec_mergel((vector unsigned char)vzero, srcP2);
    srcP3A = (vector signed short)
                vec_mergeh((vector unsigned char)vzero, srcP3);
    srcP3B = (vector signed short)
                vec_mergel((vector unsigned char)vzero, srcP3);

    srcM1A = (vector signed short)
                vec_mergeh((vector unsigned char)vzero, srcM1);
    srcM1B = (vector signed short)
                vec_mergel((vector unsigned char)vzero, srcM1);
    srcM2A = (vector signed short)
                vec_mergeh((vector unsigned char)vzero, srcM2);
    srcM2B = (vector signed short)
                vec_mergel((vector unsigned char)vzero, srcM2);

    /* pairwise sums of taps with equal weights: 20*(sum1), -5*(sum2), 1*(sum3) */
    sum1A = vec_adds(srcP0A, srcP1A);
    sum1B = vec_adds(srcP0B, srcP1B);
    sum2A = vec_adds(srcM1A, srcP2A);
    sum2B = vec_adds(srcM1B, srcP2B);
    sum3A = vec_adds(srcM2A, srcP3A);
    sum3B = vec_adds(srcM2B, srcP3B);

    pp1A = vec_mladd(sum1A, v20ss, v16ss);
    pp1B = vec_mladd(sum1B, v20ss, v16ss);

    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

    pp3A = vec_add(sum3A, pp1A);
    pp3B = vec_add(sum3B, pp1B);

    /* psum = 20*sum1 + sum3 + 16 - 5*sum2, then >> 5 (arithmetic: psum may be negative) */
    psumA = vec_sub(pp3A, pp2A);
    psumB = vec_sub(pp3B, pp2B);

    sumA = vec_sra(psumA, v5us);
    sumB = vec_sra(psumB, v5us);

    /* saturating pack clips to 0..255 */
    sum = vec_packsu(sumA, sumB);

    dst1 = vec_ld(0, dst);
    dst2 = vec_ld(16, dst);
    vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

    OP_U8_ALTIVEC(fsum, sum, vdst);

    /* rotate the result into store position and merge into the two
       aligned destination vectors */
    rsum = vec_perm(fsum, fsum, dstperm);
    fdst1 = vec_sel(dst1, rsum, dstmask);
    fdst2 = vec_sel(rsum, dst2, dstmask);

    vec_st(fdst1, 0, dst);
    vec_st(fdst2, 16, dst);

    src += srcStride;
    dst += dstStride;
  }
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}
324

    
325
/* this code assume stride % 16 == 0 */
/*
 * Vertical 6-tap lowpass filter over a 16x16 block:
 *   dst[x] = clip((src[x-2*S] - 5*src[x-S] + 20*src[x] + 20*src[x+S]
 *                  - 5*src[x+2*S] + src[x+3*S] + 16) >> 5)   (S = srcStride)
 * combined into dst through OP_U8_ALTIVEC.  Five rows are preloaded, then
 * the loop reads one new row per iteration and rotates the window.
 */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

  register int i;

  const vector signed int vzero = vec_splat_s32(0);
  /* all rows share the same alignment (srcStride % 16 == 0), so a single
     lvsl permute serves every row load */
  const vector unsigned char perm = vec_lvsl(0, src);
  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));  /* tap weight 20 */
  const vector unsigned short v5us = vec_splat_u16(5);                          /* final >> 5 */
  const vector signed short v5ss = vec_splat_s16(5);
  const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));  /* rounding bias 16 */
  /* unaligned dst handled by a lvsr rotate + select-mask read-modify-write */
  const vector unsigned char dstperm = vec_lvsr(0, dst);
  const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);

  uint8_t *srcbis = src - (srcStride * 2);

  /* preload the first five rows (src-2*S .. src+2*S) */
  const vector unsigned char srcM2a = vec_ld(0, srcbis);
  const vector unsigned char srcM2b = vec_ld(16, srcbis);
  const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
//  srcbis += srcStride;
  const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride);
  const vector unsigned char srcM1b = vec_ld(16, srcbis);
  const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
//  srcbis += srcStride;
  const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride);
  const vector unsigned char srcP0b = vec_ld(16, srcbis);
  const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
//  srcbis += srcStride;
  const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride);
  const vector unsigned char srcP1b = vec_ld(16, srcbis);
  const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
//  srcbis += srcStride;
  const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride);
  const vector unsigned char srcP2b = vec_ld(16, srcbis);
  const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
//  srcbis += srcStride;

  /* zero-extend each preloaded row to 16-bit lanes (A = first 8 pixels,
     B = last 8 pixels) */
  vector signed short srcM2ssA = (vector signed short)
                                vec_mergeh((vector unsigned char)vzero, srcM2);
  vector signed short srcM2ssB = (vector signed short)
                                vec_mergel((vector unsigned char)vzero, srcM2);
  vector signed short srcM1ssA = (vector signed short)
                                vec_mergeh((vector unsigned char)vzero, srcM1);
  vector signed short srcM1ssB = (vector signed short)
                                vec_mergel((vector unsigned char)vzero, srcM1);
  vector signed short srcP0ssA = (vector signed short)
                                vec_mergeh((vector unsigned char)vzero, srcP0);
  vector signed short srcP0ssB = (vector signed short)
                                vec_mergel((vector unsigned char)vzero, srcP0);
  vector signed short srcP1ssA = (vector signed short)
                                vec_mergeh((vector unsigned char)vzero, srcP1);
  vector signed short srcP1ssB = (vector signed short)
                                vec_mergel((vector unsigned char)vzero, srcP1);
  vector signed short srcP2ssA = (vector signed short)
                                vec_mergeh((vector unsigned char)vzero, srcP2);
  vector signed short srcP2ssB = (vector signed short)
                                vec_mergel((vector unsigned char)vzero, srcP2);

  vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
                      psumA, psumB, sumA, sumB,
                      srcP3ssA, srcP3ssB,
                      sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

  vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2,
                       srcP3a, srcP3b, srcP3;

  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

  for (i = 0 ; i < 16 ; i++) {
    /* load the one new row (src + 3*S relative to the current output row) */
    srcP3a = vec_ld(0, srcbis += srcStride);
    srcP3b = vec_ld(16, srcbis);
    srcP3 = vec_perm(srcP3a, srcP3b, perm);
    srcP3ssA = (vector signed short)
                                vec_mergeh((vector unsigned char)vzero, srcP3);
    srcP3ssB = (vector signed short)
                                vec_mergel((vector unsigned char)vzero, srcP3);
//    srcbis += srcStride;

    /* pairwise sums of equal-weight taps: 20*(sum1), -5*(sum2), 1*(sum3) */
    sum1A = vec_adds(srcP0ssA, srcP1ssA);
    sum1B = vec_adds(srcP0ssB, srcP1ssB);
    sum2A = vec_adds(srcM1ssA, srcP2ssA);
    sum2B = vec_adds(srcM1ssB, srcP2ssB);
    sum3A = vec_adds(srcM2ssA, srcP3ssA);
    sum3B = vec_adds(srcM2ssB, srcP3ssB);

    /* slide the 6-row window down by one row */
    srcM2ssA = srcM1ssA;
    srcM2ssB = srcM1ssB;
    srcM1ssA = srcP0ssA;
    srcM1ssB = srcP0ssB;
    srcP0ssA = srcP1ssA;
    srcP0ssB = srcP1ssB;
    srcP1ssA = srcP2ssA;
    srcP1ssB = srcP2ssB;
    srcP2ssA = srcP3ssA;
    srcP2ssB = srcP3ssB;

    pp1A = vec_mladd(sum1A, v20ss, v16ss);
    pp1B = vec_mladd(sum1B, v20ss, v16ss);

    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

    pp3A = vec_add(sum3A, pp1A);
    pp3B = vec_add(sum3B, pp1B);

    /* psum = 20*sum1 + sum3 + 16 - 5*sum2, then arithmetic >> 5 */
    psumA = vec_sub(pp3A, pp2A);
    psumB = vec_sub(pp3B, pp2B);

    sumA = vec_sra(psumA, v5us);
    sumB = vec_sra(psumB, v5us);

    /* saturating pack clips to 0..255 */
    sum = vec_packsu(sumA, sumB);

    dst1 = vec_ld(0, dst);
    dst2 = vec_ld(16, dst);
    vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

    OP_U8_ALTIVEC(fsum, sum, vdst);

    /* rotate into store position and merge into the aligned dst vectors */
    rsum = vec_perm(fsum, fsum, dstperm);
    fdst1 = vec_sel(dst1, rsum, dstmask);
    fdst2 = vec_sel(rsum, dst2, dstmask);

    vec_st(fdst1, 0, dst);
    vec_st(fdst2, 16, dst);

    dst += dstStride;
  }
  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}
457

    
458
/* this code assume stride % 16 == 0 *and* tmp is properly aligned */
/*
 * 2D (horizontal-then-vertical) 6-tap lowpass filter for a 16x16 block.
 * Pass 1: horizontal filter on 21 rows (src-2*S .. src+18*S), without
 *         rounding/shift/clip; the 16-bit intermediates go into tmp.
 * Pass 2: vertical 6-tap filter on the intermediates using 32-bit even/odd
 *         multiply products, +512 rounding, arithmetic >> 10, clip, then
 *         OP_U8_ALTIVEC into dst.
 */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
  register int i;
  const vector signed int vzero = vec_splat_s32(0);
  /* permutes extracting the six unaligned windows src-2 .. src+3 */
  const vector unsigned char permM2 = vec_lvsl(-2, src);
  const vector unsigned char permM1 = vec_lvsl(-1, src);
  const vector unsigned char permP0 = vec_lvsl(+0, src);
  const vector unsigned char permP1 = vec_lvsl(+1, src);
  const vector unsigned char permP2 = vec_lvsl(+2, src);
  const vector unsigned char permP3 = vec_lvsl(+3, src);
  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));  /* tap weight 20 */
  const vector unsigned int v10ui = vec_splat_u32(10);                          /* final >> 10 */
  const vector signed short v5ss = vec_splat_s16(5);
  const vector signed short v1ss = vec_splat_s16(1);
  const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));   /* rounding bias 512 */
  const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));  /* shift count 16 */

  /* alignment of src-2 decides how many aligned loads cover src-2 .. src+18 */
  register int align = ((((unsigned long)src) - 2) % 16);

  const vector unsigned char neg1 = (const vector unsigned char)
                                                        vec_splat_s8(-1);

  vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
                      srcP2A, srcP2B, srcP3A, srcP3B,
                      srcM1A, srcM1B, srcM2A, srcM2B,
                      sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
                      pp1A, pp1B, pp2A, pp2B, psumA, psumB;

  /* unaligned dst handled by a lvsr rotate + select-mask read-modify-write */
  const vector unsigned char dstperm = vec_lvsr(0, dst);

  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);

  /* re-interleaves the even-lane and odd-lane results of pass 2 */
  const vector unsigned char mperm = (const vector unsigned char)
    AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
        0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
  int16_t *tmpbis = tmp;

  vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
                      tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
                      tmpP2ssA, tmpP2ssB;

  vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
                    pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
                    pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
                    ssumAe, ssumAo, ssumBe, ssumBo;
  vector unsigned char fsum, sumv, sum, dst1, dst2, vdst,
                       rsum, fdst1, fdst2;
  vector signed short ssume, ssumo;

  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
  src -= (2 * srcStride);
  /* pass 1: 16 output rows need 16 + 5 = 21 horizontally filtered rows */
  for (i = 0 ; i < 21 ; i ++) {
    vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
    vector unsigned char srcR1 = vec_ld(-2, src);
    vector unsigned char srcR2 = vec_ld(14, src);

    /* align <= 10 (default): two loads cover all six windows; for
       align == 11..15 a third load is needed, except for the window that
       coincides exactly with srcR2 (identity permute replaced by a copy) */
    switch (align) {
    default: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = vec_perm(srcR1, srcR2, permP3);
    } break;
    case 11: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = srcR2;
    } break;
    case 12: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = srcR2;
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 13: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = srcR2;
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 14: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = srcR2;
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 15: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = srcR2;
      srcP0 = vec_perm(srcR2, srcR3, permP0);
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    }

    /* zero-extend to 16-bit lanes (A = first 8 pixels, B = last 8) */
    srcP0A = (vector signed short)
                            vec_mergeh((vector unsigned char)vzero, srcP0);
    srcP0B = (vector signed short)
                            vec_mergel((vector unsigned char)vzero, srcP0);
    srcP1A = (vector signed short)
                            vec_mergeh((vector unsigned char)vzero, srcP1);
    srcP1B = (vector signed short)
                            vec_mergel((vector unsigned char)vzero, srcP1);

    srcP2A = (vector signed short)
                            vec_mergeh((vector unsigned char)vzero, srcP2);
    srcP2B = (vector signed short)
                            vec_mergel((vector unsigned char)vzero, srcP2);
    srcP3A = (vector signed short)
                            vec_mergeh((vector unsigned char)vzero, srcP3);
    srcP3B = (vector signed short)
                            vec_mergel((vector unsigned char)vzero, srcP3);

    srcM1A = (vector signed short)
                            vec_mergeh((vector unsigned char)vzero, srcM1);
    srcM1B = (vector signed short)
                            vec_mergel((vector unsigned char)vzero, srcM1);
    srcM2A = (vector signed short)
                            vec_mergeh((vector unsigned char)vzero, srcM2);
    srcM2B = (vector signed short)
                            vec_mergel((vector unsigned char)vzero, srcM2);

    /* pairwise sums of equal-weight taps: 20*(sum1), -5*(sum2), 1*(sum3) */
    sum1A = vec_adds(srcP0A, srcP1A);
    sum1B = vec_adds(srcP0B, srcP1B);
    sum2A = vec_adds(srcM1A, srcP2A);
    sum2B = vec_adds(srcM1B, srcP2B);
    sum3A = vec_adds(srcM2A, srcP3A);
    sum3B = vec_adds(srcM2B, srcP3B);

    pp1A = vec_mladd(sum1A, v20ss, sum3A);
    pp1B = vec_mladd(sum1B, v20ss, sum3B);

    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

    /* intermediate = 20*sum1 + sum3 - 5*sum2; no rounding/shift yet */
    psumA = vec_sub(pp1A, pp2A);
    psumB = vec_sub(pp1B, pp2B);

    vec_st(psumA, 0, tmp);
    vec_st(psumB, 16, tmp);

    src += srcStride;
    tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
  }

  /* pass 2: preload the first five intermediate rows (two vectors each) */
  tmpM2ssA = vec_ld(0, tmpbis);
  tmpM2ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  tmpM1ssA = vec_ld(0, tmpbis);
  tmpM1ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  tmpP0ssA = vec_ld(0, tmpbis);
  tmpP0ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  tmpP1ssA = vec_ld(0, tmpbis);
  tmpP1ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  tmpP2ssA = vec_ld(0, tmpbis);
  tmpP2ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;

  for (i = 0 ; i < 16 ; i++) {
    const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
    const vector signed short tmpP3ssB = vec_ld(16, tmpbis);

    /* pairwise sums of equal-weight taps (16-bit, saturating) */
    const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
    const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
    const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
    const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
    const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
    const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

    tmpbis += tmpStride;

    /* slide the 6-row window down by one row */
    tmpM2ssA = tmpM1ssA;
    tmpM2ssB = tmpM1ssB;
    tmpM1ssA = tmpP0ssA;
    tmpM1ssB = tmpP0ssB;
    tmpP0ssA = tmpP1ssA;
    tmpP0ssB = tmpP1ssB;
    tmpP1ssA = tmpP2ssA;
    tmpP1ssB = tmpP2ssB;
    tmpP2ssA = tmpP3ssA;
    tmpP2ssB = tmpP3ssB;

    /* products in 32-bit precision, split into even/odd lanes */
    pp1Ae = vec_mule(sum1A, v20ss);
    pp1Ao = vec_mulo(sum1A, v20ss);
    pp1Be = vec_mule(sum1B, v20ss);
    pp1Bo = vec_mulo(sum1B, v20ss);

    pp2Ae = vec_mule(sum2A, v5ss);
    pp2Ao = vec_mulo(sum2A, v5ss);
    pp2Be = vec_mule(sum2B, v5ss);
    pp2Bo = vec_mulo(sum2B, v5ss);

    /* sign-extend sum3 to 32 bits: reinterpreting the shorts as ints and
       arithmetic-shifting right by 16 yields the even lanes; vec_mulo
       with 1 yields the odd lanes */
    pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
    pp3Ao = vec_mulo(sum3A, v1ss);
    pp3Be = vec_sra((vector signed int)sum3B, v16ui);
    pp3Bo = vec_mulo(sum3B, v1ss);

    /* + 512 rounding bias */
    pp1cAe = vec_add(pp1Ae, v512si);
    pp1cAo = vec_add(pp1Ao, v512si);
    pp1cBe = vec_add(pp1Be, v512si);
    pp1cBo = vec_add(pp1Bo, v512si);

    pp32Ae = vec_sub(pp3Ae, pp2Ae);
    pp32Ao = vec_sub(pp3Ao, pp2Ao);
    pp32Be = vec_sub(pp3Be, pp2Be);
    pp32Bo = vec_sub(pp3Bo, pp2Bo);

    /* sum = 20*sum1 + 512 + sum3 - 5*sum2, then arithmetic >> 10 */
    sumAe = vec_add(pp1cAe, pp32Ae);
    sumAo = vec_add(pp1cAo, pp32Ao);
    sumBe = vec_add(pp1cBe, pp32Be);
    sumBo = vec_add(pp1cBo, pp32Bo);

    ssumAe = vec_sra(sumAe, v10ui);
    ssumAo = vec_sra(sumAo, v10ui);
    ssumBe = vec_sra(sumBe, v10ui);
    ssumBo = vec_sra(sumBo, v10ui);

    /* pack back to shorts, then to clipped unsigned bytes, and restore
       the original lane order with mperm */
    ssume = vec_packs(ssumAe, ssumBe);
    ssumo = vec_packs(ssumAo, ssumBo);

    sumv = vec_packsu(ssume, ssumo);
    sum = vec_perm(sumv, sumv, mperm);

    dst1 = vec_ld(0, dst);
    dst2 = vec_ld(16, dst);
    vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

    OP_U8_ALTIVEC(fsum, sum, vdst);

    /* rotate into store position and merge into the aligned dst vectors */
    rsum = vec_perm(fsum, fsum, dstperm);
    fdst1 = vec_sel(dst1, rsum, dstmask);
    fdst2 = vec_sel(rsum, dst2, dstmask);

    vec_st(fdst1, 0, dst);
    vec_st(fdst2, 16, dst);

    dst += dstStride;
  }
  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}