Statistics
| Branch: | Revision:

ffmpeg / libavcodec / ppc / dsputil_h264_template_altivec.c @ e8772eec

History | View | Annotate | Download (26.4 KB)

1
/*
2
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
3
 *
4
 * This library is free software; you can redistribute it and/or
5
 * modify it under the terms of the GNU Lesser General Public
6
 * License as published by the Free Software Foundation; either
7
 * version 2 of the License, or (at your option) any later version.
8
 *
9
 * This library is distributed in the hope that it will be useful,
10
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12
 * Lesser General Public License for more details.
13
 *
14
 * You should have received a copy of the GNU Lesser General Public
15
 * License along with this library; if not, write to the Free Software
16
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
 */
18

    
19
/* this code assume that stride % 16 == 0 */
20
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
21
  POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
22
    signed int ABCD[4] __attribute__((aligned(16))) =
23
                        {((8 - x) * (8 - y)),
24
                          ((x) * (8 - y)),
25
                          ((8 - x) * (y)),
26
                          ((x) * (y))};
27
    register int i;
28
    vector unsigned char fperm;
29
    const vector signed int vABCD = vec_ld(0, ABCD);
30
    const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
31
    const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
32
    const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
33
    const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
34
    const vector signed int vzero = vec_splat_s32(0);
35
    const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
36
    const vector unsigned short v6us = vec_splat_u16(6);
37
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
38
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
39

    
40
    vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
41
    vector unsigned char vsrc0uc, vsrc1uc;
42
    vector signed short vsrc0ssH, vsrc1ssH;
43
    vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
44
    vector signed short vsrc2ssH, vsrc3ssH, psum;
45
    vector unsigned char vdst, ppsum, vfdst, fsum;
46

    
47
  POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
48

    
49
    if (((unsigned long)dst) % 16 == 0) {
50
      fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
51
                                        0x14, 0x15, 0x16, 0x17,
52
                                        0x08, 0x09, 0x0A, 0x0B,
53
                                        0x0C, 0x0D, 0x0E, 0x0F);
54
    } else {
55
      fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
56
                                        0x04, 0x05, 0x06, 0x07,
57
                                        0x18, 0x19, 0x1A, 0x1B,
58
                                        0x1C, 0x1D, 0x1E, 0x1F);
59
    }
60

    
61
    vsrcAuc = vec_ld(0, src);
62

    
63
    if (loadSecond)
64
      vsrcBuc = vec_ld(16, src);
65
    vsrcperm0 = vec_lvsl(0, src);
66
    vsrcperm1 = vec_lvsl(1, src);
67

    
68
    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
69
    if (reallyBadAlign)
70
      vsrc1uc = vsrcBuc;
71
    else
72
      vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
73

    
74
    vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
75
                                               (vector unsigned char)vsrc0uc);
76
    vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
77
                                               (vector unsigned char)vsrc1uc);
78

    
79
    if (!loadSecond) {// -> !reallyBadAlign
80
      for (i = 0 ; i < h ; i++) {
81

    
82

    
83
        vsrcCuc = vec_ld(stride + 0, src);
84

    
85
        vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
86
        vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
87

    
88
        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
89
                                                (vector unsigned char)vsrc2uc);
90
        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
91
                                                (vector unsigned char)vsrc3uc);
92

    
93
        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
94
        psum = vec_mladd(vB, vsrc1ssH, psum);
95
        psum = vec_mladd(vC, vsrc2ssH, psum);
96
        psum = vec_mladd(vD, vsrc3ssH, psum);
97
        psum = vec_add(v32ss, psum);
98
        psum = vec_sra(psum, v6us);
99

    
100
        vdst = vec_ld(0, dst);
101
        ppsum = (vector unsigned char)vec_packsu(psum, psum);
102
        vfdst = vec_perm(vdst, ppsum, fperm);
103

    
104
        OP_U8_ALTIVEC(fsum, vfdst, vdst);
105

    
106
        vec_st(fsum, 0, dst);
107

    
108
        vsrc0ssH = vsrc2ssH;
109
        vsrc1ssH = vsrc3ssH;
110

    
111
        dst += stride;
112
        src += stride;
113
      }
114
    } else {
115
        vector unsigned char vsrcDuc;
116
      for (i = 0 ; i < h ; i++) {
117
        vsrcCuc = vec_ld(stride + 0, src);
118
        vsrcDuc = vec_ld(stride + 16, src);
119

    
120
        vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
121
        if (reallyBadAlign)
122
          vsrc3uc = vsrcDuc;
123
        else
124
          vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
125

    
126
        vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
127
                                                (vector unsigned char)vsrc2uc);
128
        vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
129
                                                (vector unsigned char)vsrc3uc);
130

    
131
        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
132
        psum = vec_mladd(vB, vsrc1ssH, psum);
133
        psum = vec_mladd(vC, vsrc2ssH, psum);
134
        psum = vec_mladd(vD, vsrc3ssH, psum);
135
        psum = vec_add(v32ss, psum);
136
        psum = vec_sr(psum, v6us);
137

    
138
        vdst = vec_ld(0, dst);
139
        ppsum = (vector unsigned char)vec_pack(psum, psum);
140
        vfdst = vec_perm(vdst, ppsum, fperm);
141

    
142
        OP_U8_ALTIVEC(fsum, vfdst, vdst);
143

    
144
        vec_st(fsum, 0, dst);
145

    
146
        vsrc0ssH = vsrc2ssH;
147
        vsrc1ssH = vsrc3ssH;
148

    
149
        dst += stride;
150
        src += stride;
151
      }
152
    }
153
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
154
}
155

    
156
/* this code assume stride % 16 == 0 */
157
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
158
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
159
  register int i;
160

    
161
  const vector signed int vzero = vec_splat_s32(0);
162
  const vector unsigned char permM2 = vec_lvsl(-2, src);
163
  const vector unsigned char permM1 = vec_lvsl(-1, src);
164
  const vector unsigned char permP0 = vec_lvsl(+0, src);
165
  const vector unsigned char permP1 = vec_lvsl(+1, src);
166
  const vector unsigned char permP2 = vec_lvsl(+2, src);
167
  const vector unsigned char permP3 = vec_lvsl(+3, src);
168
  const vector signed short v5ss = vec_splat_s16(5);
169
  const vector unsigned short v5us = vec_splat_u16(5);
170
  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
171
  const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
172
  const vector unsigned char dstperm = vec_lvsr(0, dst);
173
  const vector unsigned char neg1 =
174
                                (const vector unsigned char) vec_splat_s8(-1);
175

    
176
  const vector unsigned char dstmask =
177
                                vec_perm((const vector unsigned char)vzero,
178
                                                               neg1, dstperm);
179

    
180
  vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
181

    
182
  register int align = ((((unsigned long)src) - 2) % 16);
183

    
184
  vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
185
                      srcP2A, srcP2B, srcP3A, srcP3B,
186
                      srcM1A, srcM1B, srcM2A, srcM2B,
187
                      sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
188
                      pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
189
                      psumA, psumB, sumA, sumB;
190

    
191
  vector unsigned char sum, dst1, dst2, vdst, fsum,
192
                       rsum, fdst1, fdst2;
193

    
194
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
195

    
196
  for (i = 0 ; i < 16 ; i ++) {
197
    vector unsigned char srcR1 = vec_ld(-2, src);
198
    vector unsigned char srcR2 = vec_ld(14, src);
199

    
200
    switch (align) {
201
    default: {
202
      srcM2 = vec_perm(srcR1, srcR2, permM2);
203
      srcM1 = vec_perm(srcR1, srcR2, permM1);
204
      srcP0 = vec_perm(srcR1, srcR2, permP0);
205
      srcP1 = vec_perm(srcR1, srcR2, permP1);
206
      srcP2 = vec_perm(srcR1, srcR2, permP2);
207
      srcP3 = vec_perm(srcR1, srcR2, permP3);
208
    } break;
209
    case 11: {
210
      srcM2 = vec_perm(srcR1, srcR2, permM2);
211
      srcM1 = vec_perm(srcR1, srcR2, permM1);
212
      srcP0 = vec_perm(srcR1, srcR2, permP0);
213
      srcP1 = vec_perm(srcR1, srcR2, permP1);
214
      srcP2 = vec_perm(srcR1, srcR2, permP2);
215
      srcP3 = srcR2;
216
    } break;
217
    case 12: {
218
      vector unsigned char srcR3 = vec_ld(30, src);
219
      srcM2 = vec_perm(srcR1, srcR2, permM2);
220
      srcM1 = vec_perm(srcR1, srcR2, permM1);
221
      srcP0 = vec_perm(srcR1, srcR2, permP0);
222
      srcP1 = vec_perm(srcR1, srcR2, permP1);
223
      srcP2 = srcR2;
224
      srcP3 = vec_perm(srcR2, srcR3, permP3);
225
    } break;
226
    case 13: {
227
      vector unsigned char srcR3 = vec_ld(30, src);
228
      srcM2 = vec_perm(srcR1, srcR2, permM2);
229
      srcM1 = vec_perm(srcR1, srcR2, permM1);
230
      srcP0 = vec_perm(srcR1, srcR2, permP0);
231
      srcP1 = srcR2;
232
      srcP2 = vec_perm(srcR2, srcR3, permP2);
233
      srcP3 = vec_perm(srcR2, srcR3, permP3);
234
    } break;
235
    case 14: {
236
      vector unsigned char srcR3 = vec_ld(30, src);
237
      srcM2 = vec_perm(srcR1, srcR2, permM2);
238
      srcM1 = vec_perm(srcR1, srcR2, permM1);
239
      srcP0 = srcR2;
240
      srcP1 = vec_perm(srcR2, srcR3, permP1);
241
      srcP2 = vec_perm(srcR2, srcR3, permP2);
242
      srcP3 = vec_perm(srcR2, srcR3, permP3);
243
    } break;
244
    case 15: {
245
      vector unsigned char srcR3 = vec_ld(30, src);
246
      srcM2 = vec_perm(srcR1, srcR2, permM2);
247
      srcM1 = srcR2;
248
      srcP0 = vec_perm(srcR2, srcR3, permP0);
249
      srcP1 = vec_perm(srcR2, srcR3, permP1);
250
      srcP2 = vec_perm(srcR2, srcR3, permP2);
251
      srcP3 = vec_perm(srcR2, srcR3, permP3);
252
    } break;
253
    }
254

    
255
    srcP0A = vec_mergeh((vector unsigned char)vzero, srcP0);
256
    srcP0B = vec_mergel((vector unsigned char)vzero, srcP0);
257
    srcP1A = vec_mergeh((vector unsigned char)vzero, srcP1);
258
    srcP1B = vec_mergel((vector unsigned char)vzero, srcP1);
259

    
260
    srcP2A = vec_mergeh((vector unsigned char)vzero, srcP2);
261
    srcP2B = vec_mergel((vector unsigned char)vzero, srcP2);
262
    srcP3A = vec_mergeh((vector unsigned char)vzero, srcP3);
263
    srcP3B = vec_mergel((vector unsigned char)vzero, srcP3);
264

    
265
    srcM1A = vec_mergeh((vector unsigned char)vzero, srcM1);
266
    srcM1B = vec_mergel((vector unsigned char)vzero, srcM1);
267
    srcM2A = vec_mergeh((vector unsigned char)vzero, srcM2);
268
    srcM2B = vec_mergel((vector unsigned char)vzero, srcM2);
269

    
270
    sum1A = vec_adds(srcP0A, srcP1A);
271
    sum1B = vec_adds(srcP0B, srcP1B);
272
    sum2A = vec_adds(srcM1A, srcP2A);
273
    sum2B = vec_adds(srcM1B, srcP2B);
274
    sum3A = vec_adds(srcM2A, srcP3A);
275
    sum3B = vec_adds(srcM2B, srcP3B);
276

    
277
    pp1A = vec_mladd(sum1A, v20ss, v16ss);
278
    pp1B = vec_mladd(sum1B, v20ss, v16ss);
279

    
280
    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
281
    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
282

    
283
    pp3A = vec_add(sum3A, pp1A);
284
    pp3B = vec_add(sum3B, pp1B);
285

    
286
    psumA = vec_sub(pp3A, pp2A);
287
    psumB = vec_sub(pp3B, pp2B);
288

    
289
    sumA = vec_sra(psumA, v5us);
290
    sumB = vec_sra(psumB, v5us);
291

    
292
    sum = vec_packsu(sumA, sumB);
293

    
294
    dst1 = vec_ld(0, dst);
295
    dst2 = vec_ld(16, dst);
296
    vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
297

    
298
    OP_U8_ALTIVEC(fsum, sum, vdst);
299

    
300
    rsum = vec_perm(fsum, fsum, dstperm);
301
    fdst1 = vec_sel(dst1, rsum, dstmask);
302
    fdst2 = vec_sel(rsum, dst2, dstmask);
303

    
304
    vec_st(fdst1, 0, dst);
305
    vec_st(fdst2, 16, dst);
306

    
307
    src += srcStride;
308
    dst += dstStride;
309
  }
310
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
311
}
312

    
313
/* this code assume stride % 16 == 0 */
/* 16x16 vertical 6-tap lowpass (H.264 luma half-pel):
 *   dst[y] = clip(((s[y]+s[y+1])*20 - (s[y-1]+s[y+2])*5
 *                 + (s[y-2]+s[y+3]) + 16) >> 5)
 * Five rows are kept live in registers and rotated each iteration so
 * each source row is loaded exactly once. */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

  register int i;

  const vector signed int vzero = vec_splat_s32(0);
  /* all rows share src's alignment (srcStride % 16 == 0), so one permute
   * serves every unaligned row load */
  const vector unsigned char perm = vec_lvsl(0, src);
  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));  /* 5 << 2 = 20 */
  const vector unsigned short v5us = vec_splat_u16(5);
  const vector signed short v5ss = vec_splat_s16(5);
  const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));  /* rounding bias 16 */
  const vector unsigned char dstperm = vec_lvsr(0, dst);
  const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
  /* select mask for the unaligned 32-byte read-modify-write store */
  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);

  /* filter window starts two rows above the destination block */
  uint8_t *srcbis = src - (srcStride * 2);

  /* preload the first five rows (M2, M1, P0, P1, P2); note srcbis is
   * advanced inside the first vec_ld argument of each group */
  const vector unsigned char srcM2a = vec_ld(0, srcbis);
  const vector unsigned char srcM2b = vec_ld(16, srcbis);
  const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
//  srcbis += srcStride;
  const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride);
  const vector unsigned char srcM1b = vec_ld(16, srcbis);
  const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
//  srcbis += srcStride;
  const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride);
  const vector unsigned char srcP0b = vec_ld(16, srcbis);
  const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
//  srcbis += srcStride;
  const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride);
  const vector unsigned char srcP1b = vec_ld(16, srcbis);
  const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
//  srcbis += srcStride;
  const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride);
  const vector unsigned char srcP2b = vec_ld(16, srcbis);
  const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
//  srcbis += srcStride;

  /* zero-extend each preloaded row to 16-bit lanes: A = pixels 0-7,
   * B = pixels 8-15 */
  vector signed short srcM2ssA = (vector signed short)
                                vec_mergeh((vector unsigned char)vzero, srcM2);
  vector signed short srcM2ssB = (vector signed short)
                                vec_mergel((vector unsigned char)vzero, srcM2);
  vector signed short srcM1ssA = (vector signed short)
                                vec_mergeh((vector unsigned char)vzero, srcM1);
  vector signed short srcM1ssB = (vector signed short)
                                vec_mergel((vector unsigned char)vzero, srcM1);
  vector signed short srcP0ssA = (vector signed short)
                                vec_mergeh((vector unsigned char)vzero, srcP0);
  vector signed short srcP0ssB = (vector signed short)
                                vec_mergel((vector unsigned char)vzero, srcP0);
  vector signed short srcP1ssA = (vector signed short)
                                vec_mergeh((vector unsigned char)vzero, srcP1);
  vector signed short srcP1ssB = (vector signed short)
                                vec_mergel((vector unsigned char)vzero, srcP1);
  vector signed short srcP2ssA = (vector signed short)
                                vec_mergeh((vector unsigned char)vzero, srcP2);
  vector signed short srcP2ssB = (vector signed short)
                                vec_mergel((vector unsigned char)vzero, srcP2);

  vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
                      psumA, psumB, sumA, sumB,
                      srcP3ssA, srcP3ssB,
                      sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

  vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2,
                       srcP3a, srcP3b, srcP3;

  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

  for (i = 0 ; i < 16 ; i++) {
    /* load the only new row of this iteration (P3, three rows below) */
    srcP3a = vec_ld(0, srcbis += srcStride);
    srcP3b = vec_ld(16, srcbis);
    srcP3 = vec_perm(srcP3a, srcP3b, perm);
    srcP3ssA = (vector signed short)
                                vec_mergeh((vector unsigned char)vzero, srcP3);
    srcP3ssB = (vector signed short)
                                vec_mergel((vector unsigned char)vzero, srcP3);
//    srcbis += srcStride;

    /* tap pairs: sum1 = p0+p1 (x20), sum2 = m1+p2 (x-5), sum3 = m2+p3 (x1) */
    sum1A = vec_adds(srcP0ssA, srcP1ssA);
    sum1B = vec_adds(srcP0ssB, srcP1ssB);
    sum2A = vec_adds(srcM1ssA, srcP2ssA);
    sum2B = vec_adds(srcM1ssB, srcP2ssB);
    sum3A = vec_adds(srcM2ssA, srcP3ssA);
    sum3B = vec_adds(srcM2ssB, srcP3ssB);

    /* rotate the 5-row window down by one for the next iteration */
    srcM2ssA = srcM1ssA;
    srcM2ssB = srcM1ssB;
    srcM1ssA = srcP0ssA;
    srcM1ssB = srcP0ssB;
    srcP0ssA = srcP1ssA;
    srcP0ssB = srcP1ssB;
    srcP1ssA = srcP2ssA;
    srcP1ssB = srcP2ssB;
    srcP2ssA = srcP3ssA;
    srcP2ssB = srcP3ssB;

    pp1A = vec_mladd(sum1A, v20ss, v16ss);
    pp1B = vec_mladd(sum1B, v20ss, v16ss);

    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

    pp3A = vec_add(sum3A, pp1A);
    pp3B = vec_add(sum3B, pp1B);

    /* psum = sum1*20 + sum3 + 16 - sum2*5, then >> 5 */
    psumA = vec_sub(pp3A, pp2A);
    psumB = vec_sub(pp3B, pp2B);

    sumA = vec_sra(psumA, v5us);
    sumB = vec_sra(psumB, v5us);

    sum = vec_packsu(sumA, sumB);  /* saturating pack clips to [0, 255] */

    /* unaligned read-modify-write of the 16 dst bytes across two lines */
    dst1 = vec_ld(0, dst);
    dst2 = vec_ld(16, dst);
    vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

    OP_U8_ALTIVEC(fsum, sum, vdst);

    rsum = vec_perm(fsum, fsum, dstperm);
    fdst1 = vec_sel(dst1, rsum, dstmask);
    fdst2 = vec_sel(rsum, dst2, dstmask);

    vec_st(fdst1, 0, dst);
    vec_st(fdst2, 16, dst);

    dst += dstStride;
  }
  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}
445

    
446
/* this code assume stride % 16 == 0 *and* tmp is properly aligned */
/* 16x16 combined horizontal+vertical 6-tap lowpass.
 * Pass 1: horizontal filter WITHOUT rounding/shift over 21 rows
 *   (16 output rows + 5 border rows), 16-bit results stored to tmp.
 * Pass 2: vertical filter over tmp in 32-bit precision,
 *   (v + 512) >> 10, saturated back to bytes. */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
  register int i;
  const vector signed int vzero = vec_splat_s32(0);
  /* per-offset permutes for the six horizontal taps; src alignment is
   * loop-invariant so these are hoisted out of the row loop */
  const vector unsigned char permM2 = vec_lvsl(-2, src);
  const vector unsigned char permM1 = vec_lvsl(-1, src);
  const vector unsigned char permP0 = vec_lvsl(+0, src);
  const vector unsigned char permP1 = vec_lvsl(+1, src);
  const vector unsigned char permP2 = vec_lvsl(+2, src);
  const vector unsigned char permP3 = vec_lvsl(+3, src);
  const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));   /* 5 << 2 = 20 */
  const vector unsigned int v10ui = vec_splat_u32(10);                           /* final >> 10 */
  const vector signed short v5ss = vec_splat_s16(5);
  const vector signed short v1ss = vec_splat_s16(1);
  const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));    /* rounding bias 512 */
  const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));   /* shift count 16 */

  /* alignment of src-2 within its 16-byte line; decides how many loads
   * cover the 21 source bytes src[-2..18] per row */
  register int align = ((((unsigned long)src) - 2) % 16);

  const vector unsigned char neg1 = (const vector unsigned char)
                                                        vec_splat_s8(-1);

  vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
                      srcP2A, srcP2B, srcP3A, srcP3B,
                      srcM1A, srcM1B, srcM2A, srcM2B,
                      sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
                      pp1A, pp1B, pp2A, pp2B, psumA, psumB;

  const vector unsigned char dstperm = vec_lvsr(0, dst);

  /* select mask for the unaligned 32-byte read-modify-write store */
  const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);

  /* interleave even/odd 16-bit lanes back into pixel order after the
   * even/odd split of pass 2 */
  const vector unsigned char mperm = (const vector unsigned char)
    AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
        0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
  int16_t *tmpbis = tmp;  /* read cursor for pass 2; tmp itself is the pass-1 write cursor */

  vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
                      tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
                      tmpP2ssA, tmpP2ssB;

  vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
                    pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
                    pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
                    ssumAe, ssumAo, ssumBe, ssumBo;
  vector unsigned char fsum, sumv, sum, dst1, dst2, vdst,
                       rsum, fdst1, fdst2;
  vector signed short ssume, ssumo;

  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
  src -= (2 * srcStride);  /* vertical window starts two rows up */
  for (i = 0 ; i < 21 ; i ++) {
    vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
    vector unsigned char srcR1 = vec_ld(-2, src);
    vector unsigned char srcR2 = vec_ld(14, src);

    /* two aligned loads provide 32 - align bytes; for align >= 12 a third
     * load is needed for the rightmost taps, and at the exact boundaries
     * a tap coincides with a whole loaded line (no permute needed) */
    switch (align) {
    default: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = vec_perm(srcR1, srcR2, permP3);
    } break;
    case 11: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = srcR2;
    } break;
    case 12: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = srcR2;
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 13: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = srcR2;
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 14: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = srcR2;
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 15: {
      vector unsigned char srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = srcR2;
      srcP0 = vec_perm(srcR2, srcR3, permP0);
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    }

    /* zero-extend each tap to 16-bit lanes: A = pixels 0-7, B = 8-15 */
    srcP0A = (vector signed short)
                            vec_mergeh((vector unsigned char)vzero, srcP0);
    srcP0B = (vector signed short)
                            vec_mergel((vector unsigned char)vzero, srcP0);
    srcP1A = (vector signed short)
                            vec_mergeh((vector unsigned char)vzero, srcP1);
    srcP1B = (vector signed short)
                            vec_mergel((vector unsigned char)vzero, srcP1);

    srcP2A = (vector signed short)
                            vec_mergeh((vector unsigned char)vzero, srcP2);
    srcP2B = (vector signed short)
                            vec_mergel((vector unsigned char)vzero, srcP2);
    srcP3A = (vector signed short)
                            vec_mergeh((vector unsigned char)vzero, srcP3);
    srcP3B = (vector signed short)
                            vec_mergel((vector unsigned char)vzero, srcP3);

    srcM1A = (vector signed short)
                            vec_mergeh((vector unsigned char)vzero, srcM1);
    srcM1B = (vector signed short)
                            vec_mergel((vector unsigned char)vzero, srcM1);
    srcM2A = (vector signed short)
                            vec_mergeh((vector unsigned char)vzero, srcM2);
    srcM2B = (vector signed short)
                            vec_mergel((vector unsigned char)vzero, srcM2);

    /* tap pairs: sum1 = p0+p1 (x20), sum2 = m1+p2 (x-5), sum3 = m2+p3 (x1) */
    sum1A = vec_adds(srcP0A, srcP1A);
    sum1B = vec_adds(srcP0B, srcP1B);
    sum2A = vec_adds(srcM1A, srcP2A);
    sum2B = vec_adds(srcM1B, srcP2B);
    sum3A = vec_adds(srcM2A, srcP3A);
    sum3B = vec_adds(srcM2B, srcP3B);

    pp1A = vec_mladd(sum1A, v20ss, sum3A);
    pp1B = vec_mladd(sum1B, v20ss, sum3B);

    pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
    pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

    /* intermediate = sum1*20 + sum3 - sum2*5 (no bias, no shift);
     * kept at full 16-bit precision for the vertical pass */
    psumA = vec_sub(pp1A, pp2A);
    psumB = vec_sub(pp1B, pp2B);

    vec_st(psumA, 0, tmp);
    vec_st(psumB, 16, tmp);

    src += srcStride;
    tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
  }

  /* pass 2: preload the first five intermediate rows (tmp is aligned,
   * so plain vec_ld suffices) */
  tmpM2ssA = vec_ld(0, tmpbis);
  tmpM2ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  tmpM1ssA = vec_ld(0, tmpbis);
  tmpM1ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  tmpP0ssA = vec_ld(0, tmpbis);
  tmpP0ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  tmpP1ssA = vec_ld(0, tmpbis);
  tmpP1ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  tmpP2ssA = vec_ld(0, tmpbis);
  tmpP2ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;

  for (i = 0 ; i < 16 ; i++) {
    const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
    const vector signed short tmpP3ssB = vec_ld(16, tmpbis);

    /* vertical tap pairs over the 16-bit intermediates; vec_adds
     * saturates, matching the scalar reference's 16-bit clamping */
    const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
    const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
    const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
    const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
    const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
    const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

    tmpbis += tmpStride;

    /* rotate the 5-row window down by one for the next iteration */
    tmpM2ssA = tmpM1ssA;
    tmpM2ssB = tmpM1ssB;
    tmpM1ssA = tmpP0ssA;
    tmpM1ssB = tmpP0ssB;
    tmpP0ssA = tmpP1ssA;
    tmpP0ssB = tmpP1ssB;
    tmpP1ssA = tmpP2ssA;
    tmpP1ssB = tmpP2ssB;
    tmpP2ssA = tmpP3ssA;
    tmpP2ssB = tmpP3ssB;

    /* widen to 32 bits via even/odd 16x16->32 multiplies */
    pp1Ae = vec_mule(sum1A, v20ss);
    pp1Ao = vec_mulo(sum1A, v20ss);
    pp1Be = vec_mule(sum1B, v20ss);
    pp1Bo = vec_mulo(sum1B, v20ss);

    pp2Ae = vec_mule(sum2A, v5ss);
    pp2Ao = vec_mulo(sum2A, v5ss);
    pp2Be = vec_mule(sum2B, v5ss);
    pp2Bo = vec_mulo(sum2B, v5ss);

    /* sum3 * 1, sign-extended to 32 bits: even lanes sit in the high
     * halfword of each word, so an arithmetic >> 16 extracts them;
     * odd lanes are extracted by multiplying by 1 */
    pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
    pp3Ao = vec_mulo(sum3A, v1ss);
    pp3Be = vec_sra((vector signed int)sum3B, v16ui);
    pp3Bo = vec_mulo(sum3B, v1ss);

    /* add the +512 rounding bias */
    pp1cAe = vec_add(pp1Ae, v512si);
    pp1cAo = vec_add(pp1Ao, v512si);
    pp1cBe = vec_add(pp1Be, v512si);
    pp1cBo = vec_add(pp1Bo, v512si);

    pp32Ae = vec_sub(pp3Ae, pp2Ae);
    pp32Ao = vec_sub(pp3Ao, pp2Ao);
    pp32Be = vec_sub(pp3Be, pp2Be);
    pp32Bo = vec_sub(pp3Bo, pp2Bo);

    /* sum = sum1*20 + sum3 + 512 - sum2*5 */
    sumAe = vec_add(pp1cAe, pp32Ae);
    sumAo = vec_add(pp1cAo, pp32Ao);
    sumBe = vec_add(pp1cBe, pp32Be);
    sumBo = vec_add(pp1cBo, pp32Bo);

    ssumAe = vec_sra(sumAe, v10ui);
    ssumAo = vec_sra(sumAo, v10ui);
    ssumBe = vec_sra(sumBe, v10ui);
    ssumBo = vec_sra(sumBo, v10ui);

    /* narrow 32->16 (signed saturate), then 16->8 (unsigned saturate,
     * clipping to [0, 255]), then restore pixel order */
    ssume = vec_packs(ssumAe, ssumBe);
    ssumo = vec_packs(ssumAo, ssumBo);

    sumv = vec_packsu(ssume, ssumo);
    sum = vec_perm(sumv, sumv, mperm);

    /* unaligned read-modify-write of the 16 dst bytes across two lines */
    dst1 = vec_ld(0, dst);
    dst2 = vec_ld(16, dst);
    vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

    OP_U8_ALTIVEC(fsum, sum, vdst);

    rsum = vec_perm(fsum, fsum, dstperm);
    fdst1 = vec_sel(dst1, rsum, dstmask);
    fdst2 = vec_sel(rsum, dst2, dstmask);

    vec_st(fdst1, 0, dst);
    vec_st(fdst2, 16, dst);

    dst += dstStride;
  }
  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}