Revision e3905ce0 libavcodec/ppc/h264_template_altivec.c

Differences in this revision are whitespace-only: the three qpel16 lowpass functions below are reindented from two-space to four-space indentation. The affected code is shown as it reads after the change.

libavcodec/ppc/h264_template_altivec.c

/* this code assume stride % 16 == 0 */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    LOAD_ZERO;
    const vec_u8_t permM2 = vec_lvsl(-2, src);
    const vec_u8_t permM1 = vec_lvsl(-1, src);
    const vec_u8_t permP0 = vec_lvsl(+0, src);
    const vec_u8_t permP1 = vec_lvsl(+1, src);
    const vec_u8_t permP2 = vec_lvsl(+2, src);
    const vec_u8_t permP3 = vec_lvsl(+3, src);
    const vec_s16_t v5ss = vec_splat_s16(5);
    const vec_u16_t v5us = vec_splat_u16(5);
    const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB;

    vec_u8_t sum, vdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8_t srcR1 = vec_ld(-2, src);
        vec_u8_t srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}
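
For reference, here is a minimal scalar sketch of what the vectorised loop above computes for one output pixel: the H.264 six-tap lowpass with taps (1, -5, 20, 20, -5, 1), rounding constant 16, a shift by 5, and saturation to [0,255] as done by vec_packsu. The helper name is hypothetical, and the PUT/AVG step performed by OP_U8_ALTIVEC is omitted.

/* Hypothetical scalar equivalent of one horizontally filtered pixel
 * (assumes an arithmetic right shift, as vec_sra performs). */
static uint8_t lowpass6_h_pixel(const uint8_t *src, int x)
{
    int v = 16
          + 20 * (src[x]     + src[x + 1])
          -  5 * (src[x - 1] + src[x + 2])
          +       src[x - 2] + src[x + 3];
    v >>= 5;                   /* matches vec_sra(psum, v5us) */
    if (v < 0)   v = 0;        /* vec_packsu saturates to [0,255] */
    if (v > 255) v = 255;
    return (uint8_t)v;
}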

/* this code assume stride % 16 == 0 */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8_t perm = vec_lvsl(0, src);
    const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16_t v5us = vec_splat_u16(5);
    const vec_s16_t v5ss = vec_splat_s16(5);
    const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8_t srcM2a = vec_ld(0, srcbis);
    const vec_u8_t srcM2b = vec_ld(16, srcbis);
    const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcM1b = vec_ld(16, srcbis);
    const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcP0b = vec_ld(16, srcbis);
    const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcP1b = vec_ld(16, srcbis);
    const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcP2b = vec_ld(16, srcbis);
    const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
    vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
    vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
    vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
    vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
    vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
    vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
    vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
    vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
    vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);

    vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB,
              srcP3ssA, srcP3ssB,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}
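
The vertical pass applies the same (1, -5, 20, 20, -5, 1) filter down each column, and the srcM2ss../srcM1ss.. reassignments slide the six-row window so only one new row is loaded per iteration. A minimal scalar sketch of one output pixel (hypothetical helper name; the OP_U8_ALTIVEC store/average step is again omitted):

/* Hypothetical scalar equivalent of one vertically filtered pixel;
 * src points at the current output row of the column being filtered. */
static uint8_t lowpass6_v_pixel(const uint8_t *src, int srcStride)
{
    int v = 16
          + 20 * (src[0]              + src[srcStride])
          -  5 * (src[-srcStride]     + src[2 * srcStride])
          +       src[-2 * srcStride] + src[3 * srcStride];
    v >>= 5;
    if (v < 0)   v = 0;
    if (v > 255) v = 255;
    return (uint8_t)v;
}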

/* this code assume stride % 16 == 0 *and* tmp is properly aligned */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    LOAD_ZERO;
    const vec_u8_t permM2 = vec_lvsl(-2, src);
    const vec_u8_t permM1 = vec_lvsl(-1, src);
    const vec_u8_t permP0 = vec_lvsl(+0, src);
    const vec_u8_t permP1 = vec_lvsl(+1, src);
    const vec_u8_t permP2 = vec_lvsl(+2, src);
    const vec_u8_t permP3 = vec_lvsl(+3, src);
    const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32_t v10ui = vec_splat_u32(10);
    const vec_s16_t v5ss = vec_splat_s16(5);
    const vec_s16_t v1ss = vec_splat_s16(1);
    const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8_t mperm = (const vec_u8_t)
      AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
          0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
    int16_t *tmpbis = tmp;

    vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
              tmpP2ssA, tmpP2ssB;

    vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
              ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8_t fsum, sumv, sum, vdst;
    vec_s16_t ssume, ssumo;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8_t srcR1 = vec_ld(-2, src);
        vec_u8_t srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}
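
The hv variant runs in two passes: the first loop stores the unrounded horizontal six-tap sums as 16-bit intermediates in tmp (21 rows, since the vertical filter needs two rows above and three below the 16 output rows), and the second loop applies the same filter vertically over tmp in 32-bit precision, adding the rounding constant 512 and shifting by 10 before saturating to [0,255]. A minimal scalar sketch with hypothetical helper names:

/* Hypothetical scalar outline of the two-pass H+V filter above. */
static int16_t hv_pass1_pixel(const uint8_t *src, int x)
{
    /* unrounded horizontal 6-tap sum, kept as a 16-bit intermediate */
    return 20 * (src[x]     + src[x + 1])
         -  5 * (src[x - 1] + src[x + 2])
         +       src[x - 2] + src[x + 3];
}

static uint8_t hv_pass2_pixel(const int16_t *tmp, int tmpStride)
{
    /* vertical 6-tap over the intermediates, round by 512, shift by 10 */
    int v = 512
          + 20 * (tmp[0]              + tmp[tmpStride])
          -  5 * (tmp[-tmpStride]     + tmp[2 * tmpStride])
          +       tmp[-2 * tmpStride] + tmp[3 * tmpStride];
    v >>= 10;
    if (v < 0)   v = 0;
    if (v > 255) v = 255;
    return (uint8_t)v;
}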
