Revision e3905ce0 libavcodec/ppc/h264_template_altivec.c
libavcodec/ppc/h264_template_altivec.c  

206  206  
207  207 
/* this code assumes stride % 16 == 0 */
208  208 
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { 
209 
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); 

210 
register int i; 

211  
212 
LOAD_ZERO; 

213 
const vec_u8_t permM2 = vec_lvsl(2, src); 

214 
const vec_u8_t permM1 = vec_lvsl(1, src); 

215 
const vec_u8_t permP0 = vec_lvsl(+0, src); 

216 
const vec_u8_t permP1 = vec_lvsl(+1, src); 

217 
const vec_u8_t permP2 = vec_lvsl(+2, src); 

218 
const vec_u8_t permP3 = vec_lvsl(+3, src); 

219 
const vec_s16_t v5ss = vec_splat_s16(5); 

220 
const vec_u16_t v5us = vec_splat_u16(5); 

221 
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 

222 
const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); 

223  
224 
vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; 

225  
226 
register int align = ((((unsigned long)src)  2) % 16); 

227  
228 
vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, 

229 
srcP2A, srcP2B, srcP3A, srcP3B, 

230 
srcM1A, srcM1B, srcM2A, srcM2B, 

231 
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, 

232 
pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, 

233 
psumA, psumB, sumA, sumB; 

234  
235 
vec_u8_t sum, vdst, fsum; 

236  
237 
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); 

238  
239 
for (i = 0 ; i < 16 ; i ++) { 

240 
vec_u8_t srcR1 = vec_ld(2, src); 

241 
vec_u8_t srcR2 = vec_ld(14, src); 

242  
243 
switch (align) { 

244 
default: { 

245 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

246 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

247 
srcP0 = vec_perm(srcR1, srcR2, permP0); 

248 
srcP1 = vec_perm(srcR1, srcR2, permP1); 

249 
srcP2 = vec_perm(srcR1, srcR2, permP2); 

250 
srcP3 = vec_perm(srcR1, srcR2, permP3); 

251 
} break; 

252 
case 11: { 

253 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

254 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

255 
srcP0 = vec_perm(srcR1, srcR2, permP0); 

256 
srcP1 = vec_perm(srcR1, srcR2, permP1); 

257 
srcP2 = vec_perm(srcR1, srcR2, permP2); 

258 
srcP3 = srcR2; 

259 
} break; 

260 
case 12: { 

261 
vec_u8_t srcR3 = vec_ld(30, src); 

262 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

263 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

264 
srcP0 = vec_perm(srcR1, srcR2, permP0); 

265 
srcP1 = vec_perm(srcR1, srcR2, permP1); 

266 
srcP2 = srcR2; 

267 
srcP3 = vec_perm(srcR2, srcR3, permP3); 

268 
} break; 

269 
case 13: { 

270 
vec_u8_t srcR3 = vec_ld(30, src); 

271 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

272 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

273 
srcP0 = vec_perm(srcR1, srcR2, permP0); 

274 
srcP1 = srcR2; 

275 
srcP2 = vec_perm(srcR2, srcR3, permP2); 

276 
srcP3 = vec_perm(srcR2, srcR3, permP3); 

277 
} break; 

278 
case 14: { 

279 
vec_u8_t srcR3 = vec_ld(30, src); 

280 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

281 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

282 
srcP0 = srcR2; 

283 
srcP1 = vec_perm(srcR2, srcR3, permP1); 

284 
srcP2 = vec_perm(srcR2, srcR3, permP2); 

285 
srcP3 = vec_perm(srcR2, srcR3, permP3); 

286 
} break; 

287 
case 15: { 

288 
vec_u8_t srcR3 = vec_ld(30, src); 

289 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

290 
srcM1 = srcR2; 

291 
srcP0 = vec_perm(srcR2, srcR3, permP0); 

292 
srcP1 = vec_perm(srcR2, srcR3, permP1); 

293 
srcP2 = vec_perm(srcR2, srcR3, permP2); 

294 
srcP3 = vec_perm(srcR2, srcR3, permP3); 

295 
} break; 

296 
} 

209 
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); 

210 
register int i; 

297  211  
298 
srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); 

299 
srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); 

300 
srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); 

301 
srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); 

212 
LOAD_ZERO; 

213 
const vec_u8_t permM2 = vec_lvsl(2, src); 

214 
const vec_u8_t permM1 = vec_lvsl(1, src); 

215 
const vec_u8_t permP0 = vec_lvsl(+0, src); 

216 
const vec_u8_t permP1 = vec_lvsl(+1, src); 

217 
const vec_u8_t permP2 = vec_lvsl(+2, src); 

218 
const vec_u8_t permP3 = vec_lvsl(+3, src); 

219 
const vec_s16_t v5ss = vec_splat_s16(5); 

220 
const vec_u16_t v5us = vec_splat_u16(5); 

221 
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 

222 
const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); 

302  223  
303 
srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); 

304 
srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); 

305 
srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); 

306 
srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); 

224 
vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; 

307  225  
308 
srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); 

309 
srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); 

310 
srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); 

311 
srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); 

226 
register int align = ((((unsigned long)src)  2) % 16); 

227  
228 
vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, 

229 
srcP2A, srcP2B, srcP3A, srcP3B, 

230 
srcM1A, srcM1B, srcM2A, srcM2B, 

231 
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, 

232 
pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, 

233 
psumA, psumB, sumA, sumB; 

234  
235 
vec_u8_t sum, vdst, fsum; 

236  
237 
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); 

238  
239 
for (i = 0 ; i < 16 ; i ++) { 

240 
vec_u8_t srcR1 = vec_ld(2, src); 

241 
vec_u8_t srcR2 = vec_ld(14, src); 

242  
243 
switch (align) { 

244 
default: { 

245 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

246 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

247 
srcP0 = vec_perm(srcR1, srcR2, permP0); 

248 
srcP1 = vec_perm(srcR1, srcR2, permP1); 

249 
srcP2 = vec_perm(srcR1, srcR2, permP2); 

250 
srcP3 = vec_perm(srcR1, srcR2, permP3); 

251 
} break; 

252 
case 11: { 

253 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

254 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

255 
srcP0 = vec_perm(srcR1, srcR2, permP0); 

256 
srcP1 = vec_perm(srcR1, srcR2, permP1); 

257 
srcP2 = vec_perm(srcR1, srcR2, permP2); 

258 
srcP3 = srcR2; 

259 
} break; 

260 
case 12: { 

261 
vec_u8_t srcR3 = vec_ld(30, src); 

262 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

263 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

264 
srcP0 = vec_perm(srcR1, srcR2, permP0); 

265 
srcP1 = vec_perm(srcR1, srcR2, permP1); 

266 
srcP2 = srcR2; 

267 
srcP3 = vec_perm(srcR2, srcR3, permP3); 

268 
} break; 

269 
case 13: { 

270 
vec_u8_t srcR3 = vec_ld(30, src); 

271 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

272 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

273 
srcP0 = vec_perm(srcR1, srcR2, permP0); 

274 
srcP1 = srcR2; 

275 
srcP2 = vec_perm(srcR2, srcR3, permP2); 

276 
srcP3 = vec_perm(srcR2, srcR3, permP3); 

277 
} break; 

278 
case 14: { 

279 
vec_u8_t srcR3 = vec_ld(30, src); 

280 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

281 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

282 
srcP0 = srcR2; 

283 
srcP1 = vec_perm(srcR2, srcR3, permP1); 

284 
srcP2 = vec_perm(srcR2, srcR3, permP2); 

285 
srcP3 = vec_perm(srcR2, srcR3, permP3); 

286 
} break; 

287 
case 15: { 

288 
vec_u8_t srcR3 = vec_ld(30, src); 

289 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

290 
srcM1 = srcR2; 

291 
srcP0 = vec_perm(srcR2, srcR3, permP0); 

292 
srcP1 = vec_perm(srcR2, srcR3, permP1); 

293 
srcP2 = vec_perm(srcR2, srcR3, permP2); 

294 
srcP3 = vec_perm(srcR2, srcR3, permP3); 

295 
} break; 

296 
} 

312  297  
313 
sum1A = vec_adds(srcP0A, srcP1A); 

314 
sum1B = vec_adds(srcP0B, srcP1B); 

315 
sum2A = vec_adds(srcM1A, srcP2A); 

316 
sum2B = vec_adds(srcM1B, srcP2B); 

317 
sum3A = vec_adds(srcM2A, srcP3A); 

318 
sum3B = vec_adds(srcM2B, srcP3B); 

298 
srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); 

299 
srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); 

300 
srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); 

301 
srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); 

319  302  
320 
pp1A = vec_mladd(sum1A, v20ss, v16ss); 

321 
pp1B = vec_mladd(sum1B, v20ss, v16ss); 

303 
srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); 

304 
srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); 

305 
srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); 

306 
srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); 

322  307  
323 
pp2A = vec_mladd(sum2A, v5ss, zero_s16v); 

324 
pp2B = vec_mladd(sum2B, v5ss, zero_s16v); 

308 
srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); 

309 
srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); 

310 
srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); 

311 
srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); 

325  312  
326 
pp3A = vec_add(sum3A, pp1A); 

327 
pp3B = vec_add(sum3B, pp1B); 

313 
sum1A = vec_adds(srcP0A, srcP1A); 

314 
sum1B = vec_adds(srcP0B, srcP1B); 

315 
sum2A = vec_adds(srcM1A, srcP2A); 

316 
sum2B = vec_adds(srcM1B, srcP2B); 

317 
sum3A = vec_adds(srcM2A, srcP3A); 

318 
sum3B = vec_adds(srcM2B, srcP3B); 

328  319  
329 
psumA = vec_sub(pp3A, pp2A);


330 
psumB = vec_sub(pp3B, pp2B);


320 
pp1A = vec_mladd(sum1A, v20ss, v16ss);


321 
pp1B = vec_mladd(sum1B, v20ss, v16ss);


331  322  
332 
sumA = vec_sra(psumA, v5us);


333 
sumB = vec_sra(psumB, v5us);


323 
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);


324 
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);


334  325  
335 
sum = vec_packsu(sumA, sumB); 

326 
pp3A = vec_add(sum3A, pp1A); 

327 
pp3B = vec_add(sum3B, pp1B); 

336  328  
337 
ASSERT_ALIGNED(dst);


338 
vdst = vec_ld(0, dst);


329 
psumA = vec_sub(pp3A, pp2A);


330 
psumB = vec_sub(pp3B, pp2B);


339  331  
340 
OP_U8_ALTIVEC(fsum, sum, vdst); 

332 
sumA = vec_sra(psumA, v5us); 

333 
sumB = vec_sra(psumB, v5us); 

341  334  
342 
vec_st(fsum, 0, dst);


335 
sum = vec_packsu(sumA, sumB);


343  336  
344 
src += srcStride; 

345 
dst += dstStride; 

346 
} 

347 
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); 

337 
ASSERT_ALIGNED(dst); 

338 
vdst = vec_ld(0, dst); 

339  
340 
OP_U8_ALTIVEC(fsum, sum, vdst); 

341  
342 
vec_st(fsum, 0, dst); 

343  
344 
src += srcStride; 

345 
dst += dstStride; 

346 
} 

347 
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); 

348  348 
} 
349  349  
350  350 
/* this code assumes stride % 16 == 0 */
351  351 
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { 
352 
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); 

353  
354 
register int i; 

355  
356 
LOAD_ZERO; 

357 
const vec_u8_t perm = vec_lvsl(0, src); 

358 
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 

359 
const vec_u16_t v5us = vec_splat_u16(5); 

360 
const vec_s16_t v5ss = vec_splat_s16(5); 

361 
const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); 

362  
363 
uint8_t *srcbis = src  (srcStride * 2); 

364  
365 
const vec_u8_t srcM2a = vec_ld(0, srcbis); 

366 
const vec_u8_t srcM2b = vec_ld(16, srcbis); 

367 
const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm); 

368 
// srcbis += srcStride;


369 
const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride); 

370 
const vec_u8_t srcM1b = vec_ld(16, srcbis); 

371 
const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm); 

372 
// srcbis += srcStride;


373 
const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride); 

374 
const vec_u8_t srcP0b = vec_ld(16, srcbis); 

375 
const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm); 

376 
// srcbis += srcStride;


377 
const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride); 

378 
const vec_u8_t srcP1b = vec_ld(16, srcbis); 

379 
const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm); 

380 
// srcbis += srcStride;


381 
const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride); 

382 
const vec_u8_t srcP2b = vec_ld(16, srcbis); 

383 
const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm); 

384 
// srcbis += srcStride;


385  
386 
vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); 

387 
vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2); 

388 
vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); 

389 
vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1); 

390 
vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); 

391 
vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0); 

392 
vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); 

393 
vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1); 

394 
vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); 

395 
vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2); 

396  
397 
vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, 

398 
psumA, psumB, sumA, sumB,


399 
srcP3ssA, srcP3ssB,


400 
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;


401  
402 
vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3; 

403  
404 
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); 

405  
406 
for (i = 0 ; i < 16 ; i++) { 

407 
srcP3a = vec_ld(0, srcbis += srcStride); 

408 
srcP3b = vec_ld(16, srcbis); 

409 
srcP3 = vec_perm(srcP3a, srcP3b, perm); 

410 
srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); 

411 
srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3); 

412 
// srcbis += srcStride;


413  
414 
sum1A = vec_adds(srcP0ssA, srcP1ssA); 

415 
sum1B = vec_adds(srcP0ssB, srcP1ssB); 

416 
sum2A = vec_adds(srcM1ssA, srcP2ssA); 

417 
sum2B = vec_adds(srcM1ssB, srcP2ssB); 

418 
sum3A = vec_adds(srcM2ssA, srcP3ssA); 

419 
sum3B = vec_adds(srcM2ssB, srcP3ssB); 

420  
421 
srcM2ssA = srcM1ssA; 

422 
srcM2ssB = srcM1ssB; 

423 
srcM1ssA = srcP0ssA; 

424 
srcM1ssB = srcP0ssB; 

425 
srcP0ssA = srcP1ssA; 

426 
srcP0ssB = srcP1ssB; 

427 
srcP1ssA = srcP2ssA; 

428 
srcP1ssB = srcP2ssB; 

429 
srcP2ssA = srcP3ssA; 

430 
srcP2ssB = srcP3ssB; 

431  
432 
pp1A = vec_mladd(sum1A, v20ss, v16ss); 

433 
pp1B = vec_mladd(sum1B, v20ss, v16ss); 

434  
435 
pp2A = vec_mladd(sum2A, v5ss, zero_s16v); 

436 
pp2B = vec_mladd(sum2B, v5ss, zero_s16v); 

437  
438 
pp3A = vec_add(sum3A, pp1A); 

439 
pp3B = vec_add(sum3B, pp1B); 

440  
441 
psumA = vec_sub(pp3A, pp2A); 

442 
psumB = vec_sub(pp3B, pp2B); 

443  
444 
sumA = vec_sra(psumA, v5us); 

445 
sumB = vec_sra(psumB, v5us); 

446  
447 
sum = vec_packsu(sumA, sumB); 

448  
449 
ASSERT_ALIGNED(dst); 

450 
vdst = vec_ld(0, dst); 

451  
452 
OP_U8_ALTIVEC(fsum, sum, vdst); 

453  
454 
vec_st(fsum, 0, dst); 

455  
456 
dst += dstStride; 

457 
} 

458 
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); 

352 
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);


353  
354 
register int i;


355  
356 
LOAD_ZERO;


357 
const vec_u8_t perm = vec_lvsl(0, src);


358 
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));


359 
const vec_u16_t v5us = vec_splat_u16(5);


360 
const vec_s16_t v5ss = vec_splat_s16(5);


361 
const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));


362  
363 
uint8_t *srcbis = src  (srcStride * 2);


364  
365 
const vec_u8_t srcM2a = vec_ld(0, srcbis);


366 
const vec_u8_t srcM2b = vec_ld(16, srcbis);


367 
const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);


368 
//srcbis += srcStride;


369 
const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);


370 
const vec_u8_t srcM1b = vec_ld(16, srcbis);


371 
const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);


372 
//srcbis += srcStride;


373 
const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);


374 
const vec_u8_t srcP0b = vec_ld(16, srcbis);


375 
const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);


376 
//srcbis += srcStride;


377 
const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);


378 
const vec_u8_t srcP1b = vec_ld(16, srcbis);


379 
const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);


380 
//srcbis += srcStride;


381 
const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);


382 
const vec_u8_t srcP2b = vec_ld(16, srcbis);


383 
const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);


384 
//srcbis += srcStride;


385  
386 
vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);


387 
vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);


388 
vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);


389 
vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);


390 
vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);


391 
vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);


392 
vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);


393 
vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);


394 
vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);


395 
vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);


396  
397 
vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,


398 
psumA, psumB, sumA, sumB, 

399 
srcP3ssA, srcP3ssB, 

400 
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; 

401  
402 
vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3;


403  
404 
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);


405  
406 
for (i = 0 ; i < 16 ; i++) {


407 
srcP3a = vec_ld(0, srcbis += srcStride);


408 
srcP3b = vec_ld(16, srcbis);


409 
srcP3 = vec_perm(srcP3a, srcP3b, perm);


410 
srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);


411 
srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);


412 
//srcbis += srcStride;


413  
414 
sum1A = vec_adds(srcP0ssA, srcP1ssA);


415 
sum1B = vec_adds(srcP0ssB, srcP1ssB);


416 
sum2A = vec_adds(srcM1ssA, srcP2ssA);


417 
sum2B = vec_adds(srcM1ssB, srcP2ssB);


418 
sum3A = vec_adds(srcM2ssA, srcP3ssA);


419 
sum3B = vec_adds(srcM2ssB, srcP3ssB);


420  
421 
srcM2ssA = srcM1ssA;


422 
srcM2ssB = srcM1ssB;


423 
srcM1ssA = srcP0ssA;


424 
srcM1ssB = srcP0ssB;


425 
srcP0ssA = srcP1ssA;


426 
srcP0ssB = srcP1ssB;


427 
srcP1ssA = srcP2ssA;


428 
srcP1ssB = srcP2ssB;


429 
srcP2ssA = srcP3ssA;


430 
srcP2ssB = srcP3ssB;


431  
432 
pp1A = vec_mladd(sum1A, v20ss, v16ss);


433 
pp1B = vec_mladd(sum1B, v20ss, v16ss);


434  
435 
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);


436 
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);


437  
438 
pp3A = vec_add(sum3A, pp1A);


439 
pp3B = vec_add(sum3B, pp1B);


440  
441 
psumA = vec_sub(pp3A, pp2A);


442 
psumB = vec_sub(pp3B, pp2B);


443  
444 
sumA = vec_sra(psumA, v5us);


445 
sumB = vec_sra(psumB, v5us);


446  
447 
sum = vec_packsu(sumA, sumB);


448  
449 
ASSERT_ALIGNED(dst);


450 
vdst = vec_ld(0, dst);


451  
452 
OP_U8_ALTIVEC(fsum, sum, vdst);


453  
454 
vec_st(fsum, 0, dst);


455  
456 
dst += dstStride;


457 
}


458 
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);


459  459 
} 
460  460  
461  461 
/* this code assumes stride % 16 == 0 *and* that tmp is properly aligned */
462  462 
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { 
463 
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); 

464 
register int i; 

465 
LOAD_ZERO; 

466 
const vec_u8_t permM2 = vec_lvsl(2, src); 

467 
const vec_u8_t permM1 = vec_lvsl(1, src); 

468 
const vec_u8_t permP0 = vec_lvsl(+0, src); 

469 
const vec_u8_t permP1 = vec_lvsl(+1, src); 

470 
const vec_u8_t permP2 = vec_lvsl(+2, src); 

471 
const vec_u8_t permP3 = vec_lvsl(+3, src); 

472 
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 

473 
const vec_u32_t v10ui = vec_splat_u32(10); 

474 
const vec_s16_t v5ss = vec_splat_s16(5); 

475 
const vec_s16_t v1ss = vec_splat_s16(1); 

476 
const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); 

477 
const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); 

478  
479 
register int align = ((((unsigned long)src)  2) % 16); 

480  
481 
vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, 

482 
srcP2A, srcP2B, srcP3A, srcP3B, 

483 
srcM1A, srcM1B, srcM2A, srcM2B, 

484 
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, 

485 
pp1A, pp1B, pp2A, pp2B, psumA, psumB; 

486  
487 
const vec_u8_t mperm = (const vec_u8_t) 

488 
AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, 

489 
0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); 

490 
int16_t *tmpbis = tmp; 

491  
492 
vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, 

493 
tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, 

494 
tmpP2ssA, tmpP2ssB; 

495  
496 
vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, 

497 
pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, 

498 
pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, 

499 
ssumAe, ssumAo, ssumBe, ssumBo; 

500 
vec_u8_t fsum, sumv, sum, vdst; 

501 
vec_s16_t ssume, ssumo; 

502  
503 
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); 

504 
src = (2 * srcStride); 

505 
for (i = 0 ; i < 21 ; i ++) { 

506 
vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; 

507 
vec_u8_t srcR1 = vec_ld(2, src); 

508 
vec_u8_t srcR2 = vec_ld(14, src); 

509  
510 
switch (align) { 

511 
default: { 

512 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

513 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

514 
srcP0 = vec_perm(srcR1, srcR2, permP0); 

515 
srcP1 = vec_perm(srcR1, srcR2, permP1); 

516 
srcP2 = vec_perm(srcR1, srcR2, permP2); 

517 
srcP3 = vec_perm(srcR1, srcR2, permP3); 

518 
} break; 

519 
case 11: { 

520 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

521 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

522 
srcP0 = vec_perm(srcR1, srcR2, permP0); 

523 
srcP1 = vec_perm(srcR1, srcR2, permP1); 

524 
srcP2 = vec_perm(srcR1, srcR2, permP2); 

525 
srcP3 = srcR2; 

526 
} break; 

527 
case 12: { 

528 
vec_u8_t srcR3 = vec_ld(30, src); 

529 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

530 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

531 
srcP0 = vec_perm(srcR1, srcR2, permP0); 

532 
srcP1 = vec_perm(srcR1, srcR2, permP1); 

533 
srcP2 = srcR2; 

534 
srcP3 = vec_perm(srcR2, srcR3, permP3); 

535 
} break; 

536 
case 13: { 

537 
vec_u8_t srcR3 = vec_ld(30, src); 

538 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

539 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

540 
srcP0 = vec_perm(srcR1, srcR2, permP0); 

541 
srcP1 = srcR2; 

542 
srcP2 = vec_perm(srcR2, srcR3, permP2); 

543 
srcP3 = vec_perm(srcR2, srcR3, permP3); 

544 
} break; 

545 
case 14: { 

546 
vec_u8_t srcR3 = vec_ld(30, src); 

547 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

548 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

549 
srcP0 = srcR2; 

550 
srcP1 = vec_perm(srcR2, srcR3, permP1); 

551 
srcP2 = vec_perm(srcR2, srcR3, permP2); 

552 
srcP3 = vec_perm(srcR2, srcR3, permP3); 

553 
} break; 

554 
case 15: { 

555 
vec_u8_t srcR3 = vec_ld(30, src); 

556 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

557 
srcM1 = srcR2; 

558 
srcP0 = vec_perm(srcR2, srcR3, permP0); 

559 
srcP1 = vec_perm(srcR2, srcR3, permP1); 

560 
srcP2 = vec_perm(srcR2, srcR3, permP2); 

561 
srcP3 = vec_perm(srcR2, srcR3, permP3); 

562 
} break; 

563 
} 

463 
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); 

464 
register int i; 

465 
LOAD_ZERO; 

466 
const vec_u8_t permM2 = vec_lvsl(2, src); 

467 
const vec_u8_t permM1 = vec_lvsl(1, src); 

468 
const vec_u8_t permP0 = vec_lvsl(+0, src); 

469 
const vec_u8_t permP1 = vec_lvsl(+1, src); 

470 
const vec_u8_t permP2 = vec_lvsl(+2, src); 

471 
const vec_u8_t permP3 = vec_lvsl(+3, src); 

472 
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 

473 
const vec_u32_t v10ui = vec_splat_u32(10); 

474 
const vec_s16_t v5ss = vec_splat_s16(5); 

475 
const vec_s16_t v1ss = vec_splat_s16(1); 

476 
const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); 

477 
const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); 

478  
479 
register int align = ((((unsigned long)src)  2) % 16); 

480  
481 
vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, 

482 
srcP2A, srcP2B, srcP3A, srcP3B, 

483 
srcM1A, srcM1B, srcM2A, srcM2B, 

484 
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, 

485 
pp1A, pp1B, pp2A, pp2B, psumA, psumB; 

486  
487 
const vec_u8_t mperm = (const vec_u8_t) 

488 
AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, 

489 
0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); 

490 
int16_t *tmpbis = tmp; 

491  
492 
vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, 

493 
tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, 

494 
tmpP2ssA, tmpP2ssB; 

495  
496 
vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, 

497 
pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, 

498 
pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, 

499 
ssumAe, ssumAo, ssumBe, ssumBo; 

500 
vec_u8_t fsum, sumv, sum, vdst; 

501 
vec_s16_t ssume, ssumo; 

502  
503 
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); 

504 
src = (2 * srcStride); 

505 
for (i = 0 ; i < 21 ; i ++) { 

506 
vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; 

507 
vec_u8_t srcR1 = vec_ld(2, src); 

508 
vec_u8_t srcR2 = vec_ld(14, src); 

509  
510 
switch (align) { 

511 
default: { 

512 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

513 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

514 
srcP0 = vec_perm(srcR1, srcR2, permP0); 

515 
srcP1 = vec_perm(srcR1, srcR2, permP1); 

516 
srcP2 = vec_perm(srcR1, srcR2, permP2); 

517 
srcP3 = vec_perm(srcR1, srcR2, permP3); 

518 
} break; 

519 
case 11: { 

520 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

521 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

522 
srcP0 = vec_perm(srcR1, srcR2, permP0); 

523 
srcP1 = vec_perm(srcR1, srcR2, permP1); 

524 
srcP2 = vec_perm(srcR1, srcR2, permP2); 

525 
srcP3 = srcR2; 

526 
} break; 

527 
case 12: { 

528 
vec_u8_t srcR3 = vec_ld(30, src); 

529 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

530 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

531 
srcP0 = vec_perm(srcR1, srcR2, permP0); 

532 
srcP1 = vec_perm(srcR1, srcR2, permP1); 

533 
srcP2 = srcR2; 

534 
srcP3 = vec_perm(srcR2, srcR3, permP3); 

535 
} break; 

536 
case 13: { 

537 
vec_u8_t srcR3 = vec_ld(30, src); 

538 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

539 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

540 
srcP0 = vec_perm(srcR1, srcR2, permP0); 

541 
srcP1 = srcR2; 

542 
srcP2 = vec_perm(srcR2, srcR3, permP2); 

543 
srcP3 = vec_perm(srcR2, srcR3, permP3); 

544 
} break; 

545 
case 14: { 

546 
vec_u8_t srcR3 = vec_ld(30, src); 

547 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

548 
srcM1 = vec_perm(srcR1, srcR2, permM1); 

549 
srcP0 = srcR2; 

550 
srcP1 = vec_perm(srcR2, srcR3, permP1); 

551 
srcP2 = vec_perm(srcR2, srcR3, permP2); 

552 
srcP3 = vec_perm(srcR2, srcR3, permP3); 

553 
} break; 

554 
case 15: { 

555 
vec_u8_t srcR3 = vec_ld(30, src); 

556 
srcM2 = vec_perm(srcR1, srcR2, permM2); 

557 
srcM1 = srcR2; 

558 
srcP0 = vec_perm(srcR2, srcR3, permP0); 

559 
srcP1 = vec_perm(srcR2, srcR3, permP1); 

560 
srcP2 = vec_perm(srcR2, srcR3, permP2); 

561 
srcP3 = vec_perm(srcR2, srcR3, permP3); 

562 
} break; 

563 
} 

564  
565 
srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); 

566 
srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); 

567 
srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); 

568 
srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); 

569  
570 
srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); 

571 
srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); 

572 
srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); 

573 
srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); 

574  
575 
srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); 

576 
srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); 

577 
srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); 

578 
srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); 

579  
580 
sum1A = vec_adds(srcP0A, srcP1A); 

581 
sum1B = vec_adds(srcP0B, srcP1B); 

582 
sum2A = vec_adds(srcM1A, srcP2A); 

583 
sum2B = vec_adds(srcM1B, srcP2B); 

584 
sum3A = vec_adds(srcM2A, srcP3A); 

585 
sum3B = vec_adds(srcM2B, srcP3B); 

586  
587 
pp1A = vec_mladd(sum1A, v20ss, sum3A); 

588 
pp1B = vec_mladd(sum1B, v20ss, sum3B); 

589  
590 
pp2A = vec_mladd(sum2A, v5ss, zero_s16v); 

591 
pp2B = vec_mladd(sum2B, v5ss, zero_s16v); 

564  592  
565 
srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); 

566 
srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); 

567 
srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); 

568 
srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); 

569  
570 
srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); 

571 
srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); 

572 
srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); 

573 
srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); 

574  
575 
srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); 

576 
srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); 

577 
srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); 

578 
srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); 

579  
580 
sum1A = vec_adds(srcP0A, srcP1A); 

581 
sum1B = vec_adds(srcP0B, srcP1B); 

582 
sum2A = vec_adds(srcM1A, srcP2A); 

583 
sum2B = vec_adds(srcM1B, srcP2B); 

584 
sum3A = vec_adds(srcM2A, srcP3A); 

585 
sum3B = vec_adds(srcM2B, srcP3B); 

586  
587 
pp1A = vec_mladd(sum1A, v20ss, sum3A); 

588 
pp1B = vec_mladd(sum1B, v20ss, sum3B); 

589  
590 
pp2A = vec_mladd(sum2A, v5ss, zero_s16v); 

591 
pp2B = vec_mladd(sum2B, v5ss, zero_s16v); 

592  
593 
psumA = vec_sub(pp1A, pp2A); 

594 
psumB = vec_sub(pp1B, pp2B); 

595  
596 
vec_st(psumA, 0, tmp); 

597 
vec_st(psumB, 16, tmp); 

598  
599 
src += srcStride; 

600 
tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ 

601 
} 

602  
603 
tmpM2ssA = vec_ld(0, tmpbis); 

604 
tmpM2ssB = vec_ld(16, tmpbis); 

605 
tmpbis += tmpStride; 

606 
tmpM1ssA = vec_ld(0, tmpbis); 

607 
tmpM1ssB = vec_ld(16, tmpbis); 

608 
tmpbis += tmpStride; 

609 
tmpP0ssA = vec_ld(0, tmpbis); 

610 
tmpP0ssB = vec_ld(16, tmpbis); 

611 
tmpbis += tmpStride; 

612 
tmpP1ssA = vec_ld(0, tmpbis); 

613 
tmpP1ssB = vec_ld(16, tmpbis); 

614 
tmpbis += tmpStride; 

615 
tmpP2ssA = vec_ld(0, tmpbis); 

616 
tmpP2ssB = vec_ld(16, tmpbis); 

617 
tmpbis += tmpStride; 

618  
619 
for (i = 0 ; i < 16 ; i++) { 

620 
const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis); 

621 
const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis); 

622  
623 
const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA); 

624 
const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB); 

625 
const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA); 

626 
const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB); 

627 
const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA); 

628 
const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB); 

593 
psumA = vec_sub(pp1A, pp2A); 

594 
psumB = vec_sub(pp1B, pp2B); 

629  595  
596 
vec_st(psumA, 0, tmp); 

597 
vec_st(psumB, 16, tmp); 

598  
599 
src += srcStride; 

600 
tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ 

601 
} 

602  
603 
tmpM2ssA = vec_ld(0, tmpbis); 

604 
tmpM2ssB = vec_ld(16, tmpbis); 

605 
tmpbis += tmpStride; 

606 
tmpM1ssA = vec_ld(0, tmpbis); 

607 
tmpM1ssB = vec_ld(16, tmpbis); 

608 
tmpbis += tmpStride; 

609 
tmpP0ssA = vec_ld(0, tmpbis); 

610 
tmpP0ssB = vec_ld(16, tmpbis); 

611 
tmpbis += tmpStride; 

612 
tmpP1ssA = vec_ld(0, tmpbis); 

613 
tmpP1ssB = vec_ld(16, tmpbis); 

614 
tmpbis += tmpStride; 

615 
tmpP2ssA = vec_ld(0, tmpbis); 

616 
tmpP2ssB = vec_ld(16, tmpbis); 

630  617 
tmpbis += tmpStride; 
631  618  
632 
tmpM2ssA = tmpM1ssA; 

633 
tmpM2ssB = tmpM1ssB; 

634 
tmpM1ssA = tmpP0ssA; 

635 
tmpM1ssB = tmpP0ssB; 

636 
tmpP0ssA = tmpP1ssA; 

637 
tmpP0ssB = tmpP1ssB; 

638 
tmpP1ssA = tmpP2ssA; 

639 
tmpP1ssB = tmpP2ssB; 

640 
tmpP2ssA = tmpP3ssA; 

641 
tmpP2ssB = tmpP3ssB; 

642  
643 
pp1Ae = vec_mule(sum1A, v20ss); 

644 
pp1Ao = vec_mulo(sum1A, v20ss); 

645 
pp1Be = vec_mule(sum1B, v20ss); 

646 
pp1Bo = vec_mulo(sum1B, v20ss); 

647  
648 
pp2Ae = vec_mule(sum2A, v5ss); 

649 
pp2Ao = vec_mulo(sum2A, v5ss); 

650 
pp2Be = vec_mule(sum2B, v5ss); 

651 
pp2Bo = vec_mulo(sum2B, v5ss); 

652  
653 
pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui); 

654 
pp3Ao = vec_mulo(sum3A, v1ss); 

655 
pp3Be = vec_sra((vec_s32_t)sum3B, v16ui); 

656 
pp3Bo = vec_mulo(sum3B, v1ss); 

657  
658 
pp1cAe = vec_add(pp1Ae, v512si); 

659 
pp1cAo = vec_add(pp1Ao, v512si); 

660 
pp1cBe = vec_add(pp1Be, v512si); 

661 
pp1cBo = vec_add(pp1Bo, v512si); 

662  
663 
pp32Ae = vec_sub(pp3Ae, pp2Ae); 

664 
pp32Ao = vec_sub(pp3Ao, pp2Ao); 

665 
pp32Be = vec_sub(pp3Be, pp2Be); 

666 
pp32Bo = vec_sub(pp3Bo, pp2Bo); 

667  
668 
sumAe = vec_add(pp1cAe, pp32Ae); 

669 
sumAo = vec_add(pp1cAo, pp32Ao); 

670 
sumBe = vec_add(pp1cBe, pp32Be); 

671 
sumBo = vec_add(pp1cBo, pp32Bo); 

672  
673 
ssumAe = vec_sra(sumAe, v10ui); 

674 
ssumAo = vec_sra(sumAo, v10ui); 

675 
ssumBe = vec_sra(sumBe, v10ui); 

676 
ssumBo = vec_sra(sumBo, v10ui); 

677  
678 
ssume = vec_packs(ssumAe, ssumBe); 

679 
ssumo = vec_packs(ssumAo, ssumBo); 

680  
681 
sumv = vec_packsu(ssume, ssumo); 

682 
sum = vec_perm(sumv, sumv, mperm); 

683  
684 
ASSERT_ALIGNED(dst); 

685 
vdst = vec_ld(0, dst); 

686  
687 
OP_U8_ALTIVEC(fsum, sum, vdst); 

688  
689 
vec_st(fsum, 0, dst); 

690  
691 
dst += dstStride; 

692 
} 

693 
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); 

619 
for (i = 0 ; i < 16 ; i++) { 

620 
const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis); 

621 
const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis); 

622  
623 
const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA); 

624 
const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB); 

625 
const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA); 

626 
const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB); 

627 
const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA); 

628 
const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB); 

629  
630 
tmpbis += tmpStride; 

631  
632 
tmpM2ssA = tmpM1ssA; 

633 
tmpM2ssB = tmpM1ssB; 

634 
tmpM1ssA = tmpP0ssA; 

635 
tmpM1ssB = tmpP0ssB; 

636 
tmpP0ssA = tmpP1ssA; 

637 
tmpP0ssB = tmpP1ssB; 

638 
tmpP1ssA = tmpP2ssA; 

639 
tmpP1ssB = tmpP2ssB; 

640 
tmpP2ssA = tmpP3ssA; 

641 
tmpP2ssB = tmpP3ssB; 

642  
643 
pp1Ae = vec_mule(sum1A, v20ss); 

644 
pp1Ao = vec_mulo(sum1A, v20ss); 

645 
pp1Be = vec_mule(sum1B, v20ss); 

646 
pp1Bo = vec_mulo(sum1B, v20ss); 

647  
648 
pp2Ae = vec_mule(sum2A, v5ss); 

649 
pp2Ao = vec_mulo(sum2A, v5ss); 

650 
pp2Be = vec_mule(sum2B, v5ss); 

651 
pp2Bo = vec_mulo(sum2B, v5ss); 

652  
653 
pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui); 

654 
pp3Ao = vec_mulo(sum3A, v1ss); 

655 
pp3Be = vec_sra((vec_s32_t)sum3B, v16ui); 

656 
pp3Bo = vec_mulo(sum3B, v1ss); 

657  
658 
pp1cAe = vec_add(pp1Ae, v512si); 

659 
pp1cAo = vec_add(pp1Ao, v512si); 

660 
pp1cBe = vec_add(pp1Be, v512si); 

661 
pp1cBo = vec_add(pp1Bo, v512si); 

662  
663 
pp32Ae = vec_sub(pp3Ae, pp2Ae); 

664 
pp32Ao = vec_sub(pp3Ao, pp2Ao); 

665 
pp32Be = vec_sub(pp3Be, pp2Be); 

666 
pp32Bo = vec_sub(pp3Bo, pp2Bo); 

667  
668 
sumAe = vec_add(pp1cAe, pp32Ae); 

669 
sumAo = vec_add(pp1cAo, pp32Ao); 

670 
sumBe = vec_add(pp1cBe, pp32Be); 

671 
sumBo = vec_add(pp1cBo, pp32Bo); 

672  
673 
ssumAe = vec_sra(sumAe, v10ui); 

674 
ssumAo = vec_sra(sumAo, v10ui); 

675 
ssumBe = vec_sra(sumBe, v10ui); 

676 
ssumBo = vec_sra(sumBo, v10ui); 

677  
678 
ssume = vec_packs(ssumAe, ssumBe); 

679 
ssumo = vec_packs(ssumAo, ssumBo); 

680  
681 
sumv = vec_packsu(ssume, ssumo); 

682 
sum = vec_perm(sumv, sumv, mperm); 

683  
684 
ASSERT_ALIGNED(dst); 

685 
vdst = vec_ld(0, dst); 

686  
687 
OP_U8_ALTIVEC(fsum, sum, vdst); 

688  
689 
vec_st(fsum, 0, dst); 

690  
691 
dst += dstStride; 

692 
} 

693 
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); 

694  694 
} 
