ffmpeg / libavcodec / ppc / snow_altivec.c @ 19032450
History  View  Annotate  Download (22.5 KB)
1 
/*


2 
* Altivec optimized snow DSP utils

3 
* Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>

4 
*

5 
* This file is part of FFmpeg.

6 
*

7 
* FFmpeg is free software; you can redistribute it and/or

8 
* modify it under the terms of the GNU Lesser General Public

9 
* License as published by the Free Software Foundation; either

10 
* version 2.1 of the License, or (at your option) any later version.

11 
*

12 
* FFmpeg is distributed in the hope that it will be useful,

13 
* but WITHOUT ANY WARRANTY; without even the implied warranty of

14 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

15 
* Lesser General Public License for more details.

16 
*

17 
* You should have received a copy of the GNU Lesser General Public

18 
* License along with FFmpeg; if not, write to the Free Software

19 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

20 
*/

21  
22 
#include "dsputil.h" 
23  
24 
#include "gcc_fixes.h" 
25 
#include "dsputil_altivec.h" 
26 
#include "snow.h" 
27  
28 
#undef NDEBUG

29 
#include <assert.h> 
30  
31  
32  
33 
//FIXME remove this replication

34 
/* Return the cached line pointer if present, otherwise load it on demand. */
#define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))

35  
36 
/**
 * Return the buffer backing line `line` of the slice buffer, attaching one
 * from the free stack if the line has none yet.
 *
 * @param buf  slice buffer; its line[] table and data_stack are updated
 * @param line index into buf->line[]
 * @return     buf->line[line] (already present or newly attached)
 */
static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
{
    DWTELEM * buffer;

    /* There must be at least one free buffer left on the stack. */
    assert(buf->data_stack_top >= 0);

    /* Already loaded: nothing to do. */
    if (buf->line[line])
        return buf->line[line];

    /* Pop the top free buffer and bind it to this line. */
    buffer = buf->data_stack[buf->data_stack_top];
    buf->data_stack_top--;
    buf->line[line] = buffer;

    return buffer;
}
57  
58  
59 
//altivec code

60  
61 
/**
 * AltiVec implementation of the horizontal 9/7 integer compose step for one
 * line of coefficients, performed in place.
 *
 * On entry the line is split in two halves: the first w2 = (width+1)/2
 * elements at b[0..] and the remaining ones starting at b + w2. Four lifting
 * passes update the halves (the last one writes into a temporary buffer),
 * after which both halves are interleaved back into b.
 *
 * NOTE(review): the vector paths store through (vector signed int *)b, so b
 * is presumably 16-byte aligned - confirm against the callers.
 *
 * @param b     line of coefficients, modified in place
 * @param width number of coefficients in the line
 */
void ff_snow_horizontal_compose97i_altivec(DWTELEM *b, int width)
{
    const int w2= (width+1)>>1;
    DECLARE_ALIGNED_16(DWTELEM, temp[(width>>1)]);
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;
    vector signed int t1, t2, x, y, tmp1, tmp2;
    vector signed int *vbuf, *vtmp;
    vector unsigned char align;

    { // Lift 0
        DWTELEM * const ref = b + w2 - 1;
        DWTELEM b_0 = b[0]; /* saved: b[0] is recomputed after the loops */
        vbuf = (vector signed int *)b;

        /* Prime t1 with ref[0..3]; ref may be unaligned, hence ld/ld/perm. */
        tmp1 = vec_ld (0, ref);
        align = vec_lvsl (0, ref);
        tmp2 = vec_ld (15, ref);
        t1= vec_perm(tmp1, tmp2, align);

        /*
         * 16 elements per iteration (4 vectors). Scalar equivalent:
         *   b[i] = b[i] - ((3 * (ref[i] + ref[i+1]) + 4) >> 3);
         * Loads for the next vector are interleaved with the arithmetic.
         */
        for (i=0; i<w_l-15; i+=16) {
            tmp1 = vec_ld (0, ref+4+i);
            tmp2 = vec_ld (15, ref+4+i);

            t2 = vec_perm(tmp1, tmp2, align);

            /* vec_sld(t1,t2,4) is ref[i+1..i+4]: neighbour sum, then *3 */
            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);

            tmp1 = vec_ld (0, ref+8+i);

            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));

            tmp2 = vec_ld (15, ref+8+i);

            *vbuf = vec_sub(*vbuf, y);

            t1=t2;

            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);

            tmp1 = vec_ld (0, ref+12+i);

            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));

            tmp2 = vec_ld (15, ref+12+i);

            *vbuf = vec_sub(*vbuf, y);

            t1=t2;

            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);

            tmp1 = vec_ld (0, ref+16+i);

            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));

            tmp2 = vec_ld (15, ref+16+i);

            *vbuf = vec_sub(*vbuf, y);

            t1=t2;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);

            vbuf++;

            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));
            *vbuf = vec_sub(*vbuf, y);

            t1=t2;

            vbuf++;
        }

        /* Scalar tail for the elements the vector loop did not cover. */
        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
        /* b[0] uses ref[1] twice in place of ref[0] + ref[1]. */
        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
    }

    { // Lift 1
        DWTELEM * const dst = b+w2;

        /* Scalar prologue until dst + i reaches a 16-byte boundary. */
        i = 0;
        for(; (((long)&dst[i]) & 0xF) && i<w_r; i++){
            dst[i] = dst[i] - (b[i] + b[i + 1]);
        }

        align = vec_lvsl(0, b+i);
        tmp1 = vec_ld(0, b+i);
        vbuf = (vector signed int*) (dst + i);
        tmp2 = vec_ld(15, b+i);

        t1 = vec_perm(tmp1, tmp2, align);

        /*
         * 4 elements per iteration. Scalar equivalent:
         *   dst[i] = dst[i] - (b[i] + b[i + 1]);
         */
        for (; i<w_r-3; i+=4) {
            tmp1 = vec_ld(0, b+4+i);
            tmp2 = vec_ld(15, b+4+i);

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1, vec_sld(t1,t2,4));
            *vbuf = vec_sub (*vbuf, y);

            vbuf++;

            t1 = t2;
        }

        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }

    { // Lift 2
        DWTELEM * const ref = b+w2 - 1;
        DWTELEM b_0 = b[0]; /* saved: b[0] is recomputed after the loops */
        vbuf= (vector signed int *) b;

        tmp1 = vec_ld (0, ref);
        align = vec_lvsl (0, ref);
        tmp2 = vec_ld (15, ref);
        t1= vec_perm(tmp1, tmp2, align);

        /*
         * 16 elements per iteration (4 vectors). Scalar equivalent:
         *   b[i] = b[i] - (((8 - (ref[i] + ref[i+1])) - (b[i] << 2)) >> 4);
         */
        i = 0;
        for (; i<w_l-15; i+=16) {
            tmp1 = vec_ld (0, ref+4+i);
            tmp2 = vec_ld (15, ref+4+i);

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            tmp1 = vec_ld (0, ref+8+i);

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));

            tmp2 = vec_ld (15, ref+8+i);

            *vbuf = vec_sub( *vbuf, y);

            t1 = t2;

            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            tmp1 = vec_ld (0, ref+12+i);

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));

            tmp2 = vec_ld (15, ref+12+i);

            *vbuf = vec_sub( *vbuf, y);

            t1 = t2;

            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            tmp1 = vec_ld (0, ref+16+i);

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));

            tmp2 = vec_ld (15, ref+16+i);

            *vbuf = vec_sub( *vbuf, y);

            t1 = t2;

            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            t1 = t2;

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));
            *vbuf = vec_sub( *vbuf, y);

            vbuf++;
        }

        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
        /* b[0] uses ref[1] twice in place of ref[0] + ref[1]. */
        b[0] = b_0 - (((-2 * ref[1] + W_BO) - 4 * b_0) >> W_BS);
    }

    { // Lift 3
        DWTELEM * const src = b+w2;

        vbuf = (vector signed int *)b;
        vtmp = (vector signed int *)temp;

        i = 0;
        align = vec_lvsl(0, src);

        /*
         * 4 elements per iteration, result goes to temp. Scalar equivalent:
         *   temp[i] = src[i] - ((-3 * (b[i] + b[i+1])) >> 1);
         */
        for (; i<w_r-3; i+=4) {
            tmp1 = vec_ld(0,src+i);
            t1 = vec_add(vbuf[0],vec_sld(vbuf[0],vbuf[1],4));
            tmp2 = vec_ld(15,src+i);
            /* negate first so that the *3 below yields -3 * (b[i]+b[i+1]) */
            t1 = vec_sub(vec_splat_s32(0),t1);
            t1 = vec_add(t1,vec_add(t1,t1));
            t2 = vec_perm(tmp1 ,tmp2 ,align);
            t1 = vec_sra(t1,vec_splat_u32(1));
            vbuf++;
            *vtmp = vec_sub(t2,t1);
            vtmp++;
        }

        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -3, 0, 1);
    }

    {
    //Interleave
        int a;
        vector signed int *t = (vector signed int *)temp,
                          *v = (vector signed int *)b;

        /* Sets i to the starting index for the backwards interleave. */
        snow_interleave_line_header(&i, width, b, temp);

        /* Scalar steps, walking down until i is 14 modulo 16. */
        for (; (i & 0xE) != 0xE; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        /*
         * Vector steps, 16 output elements at a time, walking downwards and
         * writing the highest vectors first so that not-yet-consumed inputs
         * in the lower part of b are not overwritten.
         */
        for (i-=14; i>=0; i-=16){
           a=i/4;

           v[a+3]=vec_mergel(v[(a>>1)+1],t[(a>>1)+1]);
           v[a+2]=vec_mergeh(v[(a>>1)+1],t[(a>>1)+1]);
           v[a+1]=vec_mergel(v[a>>1],t[a>>1]);
           v[a]=vec_mergeh(v[a>>1],t[a>>1]);

        }

    }
}
367  
368 
/**
 * AltiVec implementation of the vertical 9/7 integer compose step across six
 * lines. b1..b4 are updated in place; b0 and b5 are only read.
 *
 * The first width/4 groups of four elements are processed with vector code
 * (the line pointers are accessed as vector signed int arrays, so they are
 * presumably 16-byte aligned - confirm against the callers); the remaining
 * elements use the scalar formulas with the W_* constants.
 *
 * @param b0..b5 six consecutive lines of coefficients
 * @param width  number of coefficients per line
 */
void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width)
{
    int i, w4 = width/4;
    vector signed int *v0, *v1,*v2,*v3,*v4,*v5;
    vector signed int t1, t2;

    v0=(vector signed int *)b0;
    v1=(vector signed int *)b1;
    v2=(vector signed int *)b2;
    v3=(vector signed int *)b3;
    v4=(vector signed int *)b4;
    v5=(vector signed int *)b5;

    /*
     * Vector part. Scalar equivalent per element:
     *   b4[i] -= (3*(b3[i] + b5[i])+4)>>3;
     *   b3[i] -= ((b2[i] + b4[i]));
     *   b2[i] += ((b1[i] + b3[i])+4*b2[i]+8)>>4;
     *   b1[i] += (3*(b0[i] + b2[i]))>>1;
     * Each step consumes the value produced by the previous one.
     */
    for (i=0; i< w4;i++)
    {
        t1 = vec_add(v3[i], v5[i]);
        t2 = vec_add(t1, vec_add(t1,t1));
        t1 = vec_add(t2, vec_splat_s32(4));
        v4[i] = vec_sub(v4[i], vec_sra(t1,vec_splat_u32(3)));

        v3[i] = vec_sub(v3[i], vec_add(v2[i], v4[i]));

        t1 = vec_add(vec_splat_s32(8), vec_add(v1[i], v3[i]));
        t2 = vec_sl(v2[i], vec_splat_u32(2));
        v2[i] = vec_add(v2[i], vec_sra(vec_add(t1,t2),vec_splat_u32(4)));
        t1 = vec_add(v0[i], v2[i]);
        t2 = vec_add(t1, vec_add(t1,t1));
        v1[i] = vec_add(v1[i], vec_sra(t2,vec_splat_u32(1)));
    }

    /* Scalar tail: elements past the last full group of four. */
    for(i*=4; i < width; i++)
    {
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
}

415 

416 
/*
 * Load 16 bytes of row y from each of the four prediction blocks into
 * b3, b2, b1 and b0. The addresses may be unaligned, so each load is done as
 * two vec_ld (offsets 0 and 15) combined with a vec_lvsl permute.
 *
 * Expects in scope: block, y, src_stride, tmp1, tmp2, align, b0..b3.
 */
#define LOAD_BLOCKS \
    tmp1 = vec_ld(0, &block[3][y*src_stride]);\
    align = vec_lvsl(0, &block[3][y*src_stride]);\
    tmp2 = vec_ld(15, &block[3][y*src_stride]);\
\
    b3 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, &block[2][y*src_stride]);\
    align = vec_lvsl(0, &block[2][y*src_stride]);\
    tmp2 = vec_ld(15, &block[2][y*src_stride]);\
\
    b2 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, &block[1][y*src_stride]);\
    align = vec_lvsl(0, &block[1][y*src_stride]);\
    tmp2 = vec_ld(15, &block[1][y*src_stride]);\
\
    b1 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, &block[0][y*src_stride]);\
    align = vec_lvsl(0, &block[0][y*src_stride]);\
    tmp2 = vec_ld(15, &block[0][y*src_stride]);\
\
    b0 = vec_perm(tmp1,tmp2,align);

440 

441 
/*
 * Load 16 bytes from each of the four OBMC weight pointers obmc1..obmc4 into
 * ob1..ob4, using the same two-load-plus-permute sequence as LOAD_BLOCKS so
 * that unaligned addresses are handled.
 *
 * Expects in scope: obmc1..obmc4, tmp1, tmp2, align, ob1..ob4.
 */
#define LOAD_OBMCS \
    tmp1 = vec_ld(0, obmc1);\
    align = vec_lvsl(0, obmc1);\
    tmp2 = vec_ld(15, obmc1);\
\
    ob1 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, obmc2);\
    align = vec_lvsl(0, obmc2);\
    tmp2 = vec_ld(15, obmc2);\
\
    ob2 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, obmc3);\
    align = vec_lvsl(0, obmc3);\
    tmp2 = vec_ld(15, obmc3);\
\
    ob3 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, obmc4);\
    align = vec_lvsl(0, obmc4);\
    tmp2 = vec_ld(15, obmc4);\
\
    ob4 = vec_perm(tmp1,tmp2,align);

465 

466 
/* interleave logic
 * h1 <- [ a,b,a,b, a,b,a,b, a,b,a,b, a,b,a,b ]
 * h2 <- [ c,d,c,d, c,d,c,d, c,d,c,d, c,d,c,d ]
 * h  <- [ a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d ]
 */

471 

472 
/*
 * Weighted sums for pixels 0..7 of the current row.
 *
 * The high halves of ob1..ob4 are interleaved so that every 32-bit lane
 * holds the four weights of one pixel (ob1,ob2,ob3,ob4); the high halves of
 * b3,b2,b1,b0 are interleaved the same way. vec_msum with a zero accumulator
 * then yields, per lane,
 *   ob1*b3 + ob2*b2 + ob3*b1 + ob4*b0
 * v[0] receives pixels 0..3 and v[1] pixels 4..7.
 *
 * Expects in scope: ob1..ob4, b0..b3, h1, h2, l1, l2, ih, ih1, il, il1, v.
 */
#define STEPS_0_1\
    h1 = (vector unsigned short)\
         vec_mergeh(ob1, ob2);\
\
    h2 = (vector unsigned short)\
         vec_mergeh(ob3, ob4);\
\
    ih = (vector unsigned char)\
         vec_mergeh(h1,h2);\
\
    l1 = (vector unsigned short) vec_mergeh(b3, b2);\
\
    ih1 = (vector unsigned char) vec_mergel(h1, h2);\
\
    l2 = (vector unsigned short) vec_mergeh(b1, b0);\
\
    il = (vector unsigned char) vec_mergeh(l1, l2);\
\
    v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
\
    il1 = (vector unsigned char) vec_mergel(l1, l2);\
\
    v[1] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));

495 

496 
/*
 * Scalar final step for one row, used when dst + src_x is not 16-byte
 * aligned. For each of the b_w weighted sums in vbuf:
 *  - add != 0: add the slice-buffer value, round and shift down by
 *    FRAC_BITS, clamp to 0..255 and store the byte in dst8;
 *  - add == 0: subtract the weighted sum from the slice-buffer value.
 *
 * Expects in scope: x, y, b_w, add, vbuf, dst, src_x, dst8, src_stride.
 */
#define FINAL_STEP_SCALAR\
        for(x=0; x<b_w; x++)\
            if(add){\
                vbuf[x] += dst[x + src_x];\
                vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;\
                if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);\
                dst8[x + y*src_stride] = vbuf[x];\
            }else{\
                dst[x + src_x] -= vbuf[x];\
            }

506 

507 
/**
 * OBMC accumulation for 8-pixel-wide blocks when the destination column is
 * not 16-byte aligned: the weighted sums are computed with AltiVec and the
 * final add/subtract/clamp step is done with scalar code.
 *
 * @param obmc        OBMC weight table
 * @param obmc_stride stride of the weight table (also used to locate the
 *                    other three weight quadrants)
 * @param block       four prediction blocks, indexed block[0..3]
 * @param b_w, b_h    block width (8 here, per the dispatcher) and height
 * @param src_x,src_y position of the block in the slice buffer
 * @param src_stride  stride of the prediction blocks and of dst8
 * @param sb          slice buffer providing the destination lines
 * @param add         non-zero: add, round, clamp and write to dst8;
 *                    zero: subtract from the slice buffer
 * @param dst8        8-bit output, written only when add is non-zero
 */
static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
                                      const int obmc_stride,
                                      uint8_t * * block, int b_w,
                                      int b_h, int src_x, int src_y,
                                      int src_stride, slice_buffer * sb,
                                      int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;

    DECLARE_ALIGNED_16(int, vbuf[16]);
    vector signed int *v = (vector signed int *)vbuf;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        /* The four weight rows for this y, one per quadrant of the table. */
        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);

        //FIXME i could avoid some loads!

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1
        STEPS_0_1

        FINAL_STEP_SCALAR

    }

}

551 

552 
/*
 * Weighted sums for pixels 8..15 of the current row: same computation as
 * STEPS_0_1 but starting from the low halves (vec_mergel) of ob1..ob4 and
 * b0..b3. v[2] receives pixels 8..11 and v[3] pixels 12..15.
 *
 * Expects in scope: ob1..ob4, b0..b3, h1, h2, l1, l2, ih, ih1, il, il1, v.
 */
#define STEPS_2_3\
    h1 = (vector unsigned short) vec_mergel(ob1, ob2);\
\
    h2 = (vector unsigned short) vec_mergel(ob3, ob4);\
\
    ih = (vector unsigned char) vec_mergeh(h1,h2);\
\
    l1 = (vector unsigned short) vec_mergel(b3, b2);\
\
    l2 = (vector unsigned short) vec_mergel(b1, b0);\
\
    ih1 = (vector unsigned char) vec_mergel(h1,h2);\
\
    il = (vector unsigned char) vec_mergeh(l1,l2);\
\
    v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
\
    il1 = (vector unsigned char) vec_mergel(l1,l2);\
\
    v[3] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));

572 

573 

574 
/**
 * OBMC accumulation for 16-pixel-wide blocks when the destination column is
 * not 16-byte aligned: the weighted sums are computed with AltiVec and the
 * final add/subtract/clamp step is done with scalar code.
 *
 * Parameters are the same as for inner_add_yblock_bw_8_obmc_16_altivec; the
 * dispatcher calls this variant only with b_w == 16.
 */
static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
                                      const int obmc_stride,
                                      uint8_t * * block, int b_w,
                                      int b_h, int src_x, int src_y,
                                      int src_stride, slice_buffer * sb,
                                      int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;
    /* STEPS_0_1 and STEPS_2_3 always write v[0..3], i.e. 16 ints, so the
     * buffer size must not depend on b_w. */
    DECLARE_ALIGNED_16(int, vbuf[16]);
    vector signed int *v = (vector signed int *)vbuf;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        /* The four weight rows for this y, one per quadrant of the table. */
        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1 2 3
        STEPS_0_1

        STEPS_2_3

        FINAL_STEP_SCALAR

    }
}

616 

617 
/*
 * Vector final step for one row, used when dst + src_x is 16-byte aligned
 * (d points at it). Mirrors FINAL_STEP_SCALAR:
 *  - add != 0: v += d; v = (v + (1<<7)) >> 8; lanes with any bit set outside
 *    the low 8 bits are replaced by ~(v >> 15) (0 for negative values, all
 *    ones - i.e. 255 once stored as a byte - for positive overflow); the
 *    results are then stored byte-wise to dst8 through vbuf;
 *  - add == 0: d -= v.
 *
 * The rounding constant and shift are hard-coded as 1<<7 and 8.
 * NOTE(review): this matches FINAL_STEP_SCALAR only if FRAC_BITS == 8 -
 * confirm. The shift by 15 (instead of 31) relies on |v| < 32768 after the
 * >> 8 - presumably always true for these sums; confirm.
 *
 * Clamp mask construction: all-ones << 8 gives 0xFFFFFF00 per lane; AND with
 * v isolates the out-of-range bits; comparing against zero marks the
 * in-range lanes, which keep v while the others take the clamp value.
 *
 * Expects in scope: x, y, b_w, add, v, d, vbuf, mask, vs, dst8, src_stride.
 */
#define FINAL_STEP_VEC \
\
    if(add)\
        {\
            for(x=0; x<b_w/4; x++)\
            {\
                v[x] = vec_add(v[x], d[x]);\
                v[x] = vec_sra(vec_add(v[x],\
                                       vec_sl( vec_splat_s32(1),\
                                               vec_splat_u32(7))),\
                               vec_splat_u32(8));\
\
                mask = (vector bool int) vec_sl((vector signed int)\
                        vec_cmpeq(v[x],v[x]),vec_splat_u32(8));\
                mask = (vector bool int) vec_and(v[x],(vector signed int)mask);\
\
                mask = (vector bool int)\
                        vec_cmpeq((vector signed int)mask,\
                                  (vector signed int)vec_splat_u32(0));\
\
                vs = vec_sra(v[x],vec_splat_u32(15));\
\
                vs = vec_nor(vs,vs);\
\
                v[x]= vec_sel(vs,v[x],mask);\
            }\
\
            for(x=0; x<b_w; x++)\
                dst8[x + y*src_stride] = vbuf[x];\
\
        }\
         else\
            for(x=0; x<b_w/4; x++)\
                d[x] = vec_sub(d[x], v[x]);

653 

654 
/**
 * OBMC accumulation for 8-pixel-wide blocks when the destination column is
 * 16-byte aligned (the dispatcher calls it only when (src_x & 15) == 0):
 * both the weighted sums and the final add/subtract/clamp step use AltiVec.
 *
 * @param obmc        OBMC weight table
 * @param obmc_stride stride of the weight table (also used to locate the
 *                    other three weight quadrants)
 * @param block       four prediction blocks, indexed block[0..3]
 * @param b_w, b_h    block width (8 here, per the dispatcher) and height
 * @param src_x,src_y position of the block in the slice buffer
 * @param src_stride  stride of the prediction blocks and of dst8
 * @param sb          slice buffer providing the destination lines
 * @param add         non-zero: add, round, clamp and write to dst8;
 *                    zero: subtract from the slice buffer
 * @param dst8        8-bit output, written only when add is non-zero
 */
static void inner_add_yblock_a_bw_8_obmc_16_altivec(uint8_t *obmc,
                                      const int obmc_stride,
                                      uint8_t * * block, int b_w,
                                      int b_h, int src_x, int src_y,
                                      int src_stride, slice_buffer * sb,
                                      int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector bool int mask;
    vector signed int vs;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;

    DECLARE_ALIGNED_16(int, vbuf[16]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        /* The four weight rows for this y, one per quadrant of the table. */
        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        /* Vector view of the destination, used by FINAL_STEP_VEC. */
        d = (vector signed int *)(dst + src_x);

        //FIXME i could avoid some loads!

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1
        STEPS_0_1

        FINAL_STEP_VEC

    }

}

700 

701 
/**
 * OBMC accumulation for 16-pixel-wide blocks when the destination column is
 * 16-byte aligned (the dispatcher calls it only when (src_x & 15) == 0 and
 * b_w == 16): both the weighted sums and the final add/subtract/clamp step
 * use AltiVec.
 *
 * Parameters are the same as for inner_add_yblock_a_bw_8_obmc_16_altivec.
 */
static void inner_add_yblock_a_bw_16_obmc_32_altivec(uint8_t *obmc,
                                      const int obmc_stride,
                                      uint8_t * * block, int b_w,
                                      int b_h, int src_x, int src_y,
                                      int src_stride, slice_buffer * sb,
                                      int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector bool int mask;
    vector signed int vs;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;
    /* STEPS_0_1 and STEPS_2_3 always write v[0..3], i.e. 16 ints, so the
     * buffer size must not depend on b_w. */
    DECLARE_ALIGNED_16(int, vbuf[16]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        /* The four weight rows for this y, one per quadrant of the table. */
        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        /* Vector view of the destination, used by FINAL_STEP_VEC. */
        d = (vector signed int *)(dst + src_x);

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1 2 3
        STEPS_0_1

        STEPS_2_3

        FINAL_STEP_VEC

    }
}

745 

746 

747 
void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride,

748 
uint8_t * * block, int b_w, int b_h,

749 
int src_x, int src_y, int src_stride,

750 
slice_buffer * sb, int add,

751 
uint8_t * dst8)

752 
{

753 
if (src_x&15) {

754 
if (b_w == 16)

755 
inner_add_yblock_bw_16_obmc_32_altivec(obmc, obmc_stride, block,

756 
b_w, b_h, src_x, src_y,

757 
src_stride, sb, add, dst8);

758 
else if (b_w == 8)

759 
inner_add_yblock_bw_8_obmc_16_altivec(obmc, obmc_stride, block,

760 
b_w, b_h, src_x, src_y,

761 
src_stride, sb, add, dst8);

762 
else

763 
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,

764 
src_y, src_stride, sb, add, dst8);

765 
} else {

766 
if (b_w == 16)

767 
inner_add_yblock_a_bw_16_obmc_32_altivec(obmc, obmc_stride, block,

768 
b_w, b_h, src_x, src_y,

769 
src_stride, sb, add, dst8);

770 
else if (b_w == 8)

771 
inner_add_yblock_a_bw_8_obmc_16_altivec(obmc, obmc_stride, block,

772 
b_w, b_h, src_x, src_y,

773 
src_stride, sb, add, dst8);

774 
else

775 
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,

776 
src_y, src_stride, sb, add, dst8);

777 
}

778 
}

779 

780 

781 
/**
 * Install the AltiVec snow routines into the DSP context.
 *
 * @param c     DSP context whose snow function pointers are overwritten
 * @param avctx codec context (not used here)
 */
void snow_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec;
    c->vertical_compose97i = ff_snow_vertical_compose97i_altivec;
    c->inner_add_yblock = ff_snow_inner_add_yblock_altivec;
}

787 
