;****************************************************************************** |
;* MMX/SSSE3-optimized functions for H264 chroma MC |

;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>, |

;* 2005-2008 Loren Merritt |

;* |

;* This file is part of Libav. |

;* |

;* Libav is free software; you can redistribute it and/or |

;* modify it under the terms of the GNU Lesser General Public |

;* License as published by the Free Software Foundation; either |

;* version 2.1 of the License, or (at your option) any later version. |

;* |

;* Libav is distributed in the hope that it will be useful, |

;* but WITHOUT ANY WARRANTY; without even the implied warranty of |

;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

;* Lesser General Public License for more details. |

;* |

;* You should have received a copy of the GNU Lesser General Public |

;* License along with Libav; if not, write to the Free Software |

;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |

;****************************************************************************** |

%include "x86inc.asm" |

%include "x86util.asm" |

SECTION_RODATA

; RV40 rounding-bias lookup tables.
;
; Every entry is one 8-byte constant: a 16-bit word replicated four times.
; The rv40 code paths address an entry as [table + rnd_bias*8], where
; rnd_bias = ((r5 & 6) * 4 + r4) >> 1 is derived from the mx (r4) and
; my (r5) arguments. Each table holds 16 entries; they are listed below
; four per line.

; Emit four consecutive table entries, one per argument.
%macro RND_RV40_ROW 4
%rep 4
    times 4 dw %1
%rotate 1
%endrep
%endmacro

; bias for the 2-D (bilinear) filter, reached through rnd_2d_rv40
rnd_rv40_2d_tbl:
    RND_RV40_ROW  0, 16, 32, 16
    RND_RV40_ROW 32, 28, 32, 28
    RND_RV40_ROW  0, 32, 16, 32
    RND_RV40_ROW 32, 28, 32, 28

; bias for the 1-D filter, reached through rnd_1d_rv40
rnd_rv40_1d_tbl:
    RND_RV40_ROW  0,  2,  4,  2
    RND_RV40_ROW  4,  3,  4,  3
    RND_RV40_ROW  0,  4,  2,  4
    RND_RV40_ROW  4,  3,  4,  3

cextern pw_3 |

cextern pw_4 |

cextern pw_8 |

cextern pw_28 |

cextern pw_32 |

cextern pw_64 |

SECTION .text |

; Unfiltered 8-byte-wide block transfer, used when mx == 0 and my == 0.
;
; In:       r0 = dst, r1 = src, r2 = stride, r3d = number of rows
; Clobbers: r4, mm0, mm1, flags; r0 and r1 end up past the processed rows.
;
; Four rows are handled per loop pass and the loop only exits when r3d
; reaches exactly zero, so r3d must be a non-zero multiple of 4.
; CHROMAMC_AVG is invoked as "CHROMAMC_AVG reg, [dst]" right before each
; store; whatever it is %defined to at expansion time decides between
; plain copy and averaging with the destination.
%macro mv0_pixels_mc8 0
    lea           r4, [r2*2]          ; r4 = 2 * stride (one row pair)
.next4rows:
%rep 2                                ; two row pairs = four rows per pass
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    add           r1, r4
%endrep
    sub          r3d, 4
    jne .next4rows
%endmacro

%macro chroma_mc8_mmx_func 3 |

; put/avg_h264_chroma_mc8_mmx_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, |

; int stride, int h, int mx, int my) |

cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 |

%ifdef ARCH_X86_64 |

movsxd r2, r2d |

%endif |

mov r6d, r5d |

or r6d, r4d |

jne .at_least_one_non_zero |

; mx == 0 AND my == 0 - no filter needed |

mv0_pixels_mc8 |

REP_RET |

.at_least_one_non_zero |

%ifidn %2, rv40 |

%ifdef PIC |

%define rnd_1d_rv40 r11 |

%define rnd_2d_rv40 r11 |

%else ; no-PIC |

%define rnd_1d_rv40 rnd_rv40_1d_tbl |

%define rnd_2d_rv40 rnd_rv40_2d_tbl |

%endif |

%ifdef ARCH_X86_64 |

mov r10, r5 |

and r10, 6 ; &~1 for mx/my=[0,7] |

lea r10, [r10*4+r4] |

sar r10d, 1 |

%define rnd_bias r10 |

%define dest_reg r0 |

%else ; x86-32 |

mov r0, r5 |

and r0, 6 ; &~1 for mx/my=[0,7] |

lea r0, [r0*4+r4] |

sar r0d, 1 |

%define rnd_bias r0 |

%define dest_reg r5 |

%endif |

%else ; vc1, h264 |

%define rnd_bias 0 |

%define dest_reg r0 |

%endif |

test r5d, r5d |

mov r6, 1 |

je .my_is_zero |

test r4d, r4d |

mov r6, r2 ; dxy = x ? 1 : stride |

jne .both_non_zero |

.my_is_zero |

; mx == 0 XOR my == 0 - 1 dimensional filter only |

or r4d, r5d ; x + y |

146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 | |

movd m5, r4d |

movq m4, [pw_8] |

movq m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3 |

punpcklwd m5, m5 |

punpckldq m5, m5 ; mm5 = B = x |

pxor m7, m7 |

psubw m4, m5 ; mm4 = A = 8-x |

163 |
164 |
165 |
167 |
168 |
169 |
170 |
171 |
punpckhbw m3, m7 |

pmullw m0, m4 ; [mm0,mm1] = A * src[0..7] |

pmullw m1, m4 |

pmullw m2, m5 ; [mm2,mm3] = B * src[1..8] |

pmullw m3, m5 |

178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 | |

add dest_reg, r2 |

add r1, r2 |

dec r3d |

jne .next1drow |

REP_RET |

.both_non_zero ; general case, bilinear |

movd m4, r4d ; x |

movd m6, r5d ; y |

%ifidn %2, rv40 |

%ifdef PIC |

lea r11, [rnd_rv40_2d_tbl] |

%endif |

%ifndef ARCH_X86_64 |

mov r5, r0m |

%endif |

%endif |

mov r6, rsp ; backup stack pointer |

and rsp, ~(mmsize-1) ; align stack |

sub rsp, 16 ; AA and DD |

punpcklwd m4, m4 |

punpcklwd m6, m6 |

punpckldq m4, m4 ; mm4 = x words |

punpckldq m6, m6 ; mm6 = y words |

movq m5, m4 |

pmullw m4, m6 ; mm4 = x * y |

psllw m5, 3 |

psllw m6, 3 |

movq m7, m5 |

paddw m7, m6 |

movq [rsp+8], m4 ; DD = x * y |

psubw m5, m4 ; mm5 = B = 8x - xy |

psubw m6, m4 ; mm6 = C = 8y - xy |

paddw m4, [pw_64] |

psubw m4, m7 ; mm4 = A = xy - (8x+8y) + 64 |

pxor m7, m7 |

movq [rsp ], m4 |

movq m0, [r1 ] ; mm0 = src[0..7] |

movq m1, [r1+1] ; mm1 = src[1..8] |

.next2drow |

add r1, r2 |

movq m2, m0 |

movq m3, m1 |

punpckhbw m0, m7 |

punpcklbw m1, m7 |

punpcklbw m2, m7 |

punpckhbw m3, m7 |

pmullw m0, [rsp] |

pmullw m2, [rsp] |

pmullw m1, m5 |

pmullw m3, m5 |

paddw m2, m1 ; mm2 = A * src[0..3] + B * src[1..4] |

paddw m3, m0 ; mm3 = A * src[4..7] + B * src[5..8] |

movq m0, [r1] |

movq m1, m0 |

punpcklbw m0, m7 |

punpckhbw m1, m7 |

pmullw m0, m6 |

pmullw m1, m6 |

paddw m2, m0 |

paddw m3, m1 ; [mm2,mm3] += C * src[0..7] |

movq m1, [r1+1] |

movq m0, m1 |

movq m4, m1 |

punpcklbw m0, m7 |

punpckhbw m4, m7 |

pmullw m0, [rsp+8] |

pmullw m4, [rsp+8] |

paddw m2, m0 |

paddw m3, m4 ; [mm2,mm3] += D * src[1..8] |

movq m0, [r1] |

paddw m2, [rnd_2d_%2+rnd_bias*8] |

paddw m3, [rnd_2d_%2+rnd_bias*8] |

psrlw m2, 6 |

psrlw m3, 6 |

packuswb m2, m3 |

CHROMAMC_AVG m2, [dest_reg] |

movq [dest_reg], m2 ; dst[0..7] = ([mm2,mm3] + rnd) >> 6 |

273 |
274 |
275 |
276 |
277 |
%endmacro |

%macro chroma_mc4_mmx_func 3 |

cglobal %1_%2_chroma_mc4_%3, 6, 6, 0 |

%ifdef ARCH_X86_64 |

movsxd r2, r2d |

%endif |

pxor m7, m7 |

movd m2, r4d ; x |

movd m3, r5d ; y |

movq m4, [pw_8] |

movq m5, [pw_8] |

punpcklwd m2, m2 |

punpcklwd m3, m3 |

punpcklwd m2, m2 |

punpcklwd m3, m3 |

psubw m4, m2 |

psubw m5, m3 |

%ifidn %2, rv40 |

%ifdef PIC |

lea r11, [rnd_rv40_2d_tbl] |

%define rnd_2d_rv40 r11 |

%else |

%define rnd_2d_rv40 rnd_rv40_2d_tbl |

%endif |

and r5, 6 ; &~1 for mx/my=[0,7] |

lea r5, [r5*4+r4] |

sar r5d, 1 |

%define rnd_bias r5 |

%else ; vc1, h264 |

%define rnd_bias 0 |

%endif |

movd m0, [r1 ] |

movd m6, [r1+1] |

add r1, r2 |

punpcklbw m0, m7 |

punpcklbw m6, m7 |

pmullw m0, m4 |

pmullw m6, m2 |

paddw m6, m0 |

.next2rows |

movd m0, [r1 ] |

movd m1, [r1+1] |

add r1, r2 |

punpcklbw m0, m7 |

punpcklbw m1, m7 |

pmullw m0, m4 |

pmullw m1, m2 |

paddw m1, m0 |

movq m0, m1 |

pmullw m6, m5 |

pmullw m1, m3 |

paddw m6, [rnd_2d_%2+rnd_bias*8] |

paddw m1, m6 |

psrlw m1, 6 |

packuswb m1, m1 |

CHROMAMC_AVG4 m1, m6, [r0] |

movd [r0], m1 |

add r0, r2 |

movd m6, [r1 ] |

movd m1, [r1+1] |

add r1, r2 |

346 |
347 |
348 |
349 |
paddw m1, m6 |

350 |
movq m6, m1 |

351 |
pmullw m0, m5 |

352 |
pmullw m1, m3 |

353 |
paddw m0, [rnd_2d_%2+rnd_bias*8] |

354 |
paddw m1, m0 |

355 |
psrlw m1, 6 |

356 |
packuswb m1, m1 |

357 |
CHROMAMC_AVG4 m1, m0, [r0] |

358 |
movd [r0], m1 |

359 |
add r0, r2 |

360 |
sub r3d, 2 |

361 |
jnz .next2rows |

362 |
REP_RET |

363 |
%endmacro |

364 | |

365 |
; 2-pixel-wide bilinear chroma interpolation.
;
; Macro arguments: %1 = put/avg, %2 = codec (selects rnd_2d_%2),
;                  %3 = instruction-set suffix of the generated symbol.
; Registers as used below: r0 = dst, r1 = src, r2 = stride, r3d = rows,
;                          r4d = x, r5d = y.
; One output row (2 bytes) is produced per loop pass.
%macro chroma_mc2_mmx_func 3
cglobal %1_%2_chroma_mc2_%3, 6, 7, 0
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif

    ; Build the four weights as two packed dwords using integer arithmetic:
    ;   A = (8-x)*(8-y)   B = x*(8-y)   C = (8-x)*y   D = x*y
    mov          r6d, r4d
    shl          r4d, 16
    sub          r4d, r6d
    add          r4d, 8               ; x<<16 | (8-x)
    imul         r5d, r4d             ; x*y<<16 | y*(8-x)         = D<<16 | C
    shl          r4d, 3
    sub          r4d, r5d             ; x*(8-y)<<16 | (8-x)*(8-y) = B<<16 | A

    movd          m5, r4d
    movd          m6, r5d
    punpckldq     m5, m5              ; m5 = {A,B,A,B}
    punpckldq     m6, m6              ; m6 = {C,D,C,D}
    pxor          m7, m7              ; m7 = 0, used for byte->word unpacking
    movd          m2, [r1]
    punpcklbw     m2, m7
    pshufw        m2, m2, 0x94        ; m2 = src[0,1,1,2] of the first row

.nextrow:
    add           r1, r2              ; step to the row below
    movq          m1, m2
    pmaddwd       m1, m5              ; m1 = A * src[0,1] + B * src[1,2]
    movd          m0, [r1]
    punpcklbw     m0, m7
    pshufw        m0, m0, 0x94        ; m0 = src[0,1,1,2] of the lower row
    movq          m2, m0              ; keep it as upper row of the next pass
    pmaddwd       m0, m6
    paddw         m1, [rnd_2d_%2]
    paddw         m1, m0              ; m1 += C * src[0,1] + D * src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    packuswb      m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd         r5d, m1
    mov         [r0], r5w             ; only the two low bytes are stored
    add           r0, r2
    sub          r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

410 | |

411 |
; Rounding constants per codec. The function macros reference them as
; rnd_1d_%2 / rnd_2d_%2, with %2 being the codec name they were given.
%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28

; Candidate expansions for CHROMAMC_AVG / CHROMAMC_AVG4.

; Accepts 2 or 3 arguments and emits nothing (used for the "put" functions).
%macro NOTHING 2-3
%endmacro

; %1 = PAVG(%1, %2); PAVG has to be %defined to an averaging instruction.
%macro DIRECT_AVG 2
    PAVG          %1, %2
%endmacro

; Load %3 into the scratch register %2 with movd, then %1 = PAVG(%1, %2).
%macro COPY_AVG 3
    movd          %2, %3
    PAVG          %1, %2
%endmacro

425 | |

426 |
INIT_MMX

; "put" functions: both averaging hooks expand to nothing.
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, mmx_rnd
chroma_mc8_mmx_func put, vc1,  mmx_nornd
chroma_mc8_mmx_func put, rv40, mmx
chroma_mc4_mmx_func put, h264, mmx
chroma_mc4_mmx_func put, rv40, mmx
chroma_mc2_mmx_func put, h264, mmx2

; "avg" functions: the result is averaged with the destination via PAVG.
%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG

; MMX2 flavour, averaging with pavgb
%define PAVG pavgb
chroma_mc8_mmx_func avg, h264, mmx2_rnd
chroma_mc8_mmx_func avg, vc1,  mmx2_nornd
chroma_mc8_mmx_func avg, rv40, mmx2
chroma_mc4_mmx_func avg, h264, mmx2
chroma_mc4_mmx_func avg, rv40, mmx2
chroma_mc2_mmx_func avg, h264, mmx2

; 3DNow! flavour, averaging with pavgusb
%define PAVG pavgusb
chroma_mc8_mmx_func avg, h264, 3dnow_rnd
chroma_mc8_mmx_func avg, vc1,  3dnow_nornd
chroma_mc8_mmx_func avg, rv40, 3dnow
chroma_mc4_mmx_func avg, h264, 3dnow
chroma_mc4_mmx_func avg, rv40, 3dnow

452 | |

453 |
; Common tail of the two 1-D loops in chroma_mc8_ssse3_func.
;
; %1 = put/avg.
; In:  m0, m2 = interleaved source bytes for two output rows,
;      m7 = packed byte weights, m6 = rounding words,
;      r0 = dst, r1 = src, r2 = stride, r3d = rows left.
; Out: two rows stored, r0/r1 advanced by two rows, r3d reduced by 2.
;      The flags still come from "sub r3d, 2" (lea does not modify them),
;      so the caller can branch on them directly.
%macro chroma_mc8_ssse3_1d_rows 1
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    lea           r1, [r1+r2*2]
%endmacro

; 8-pixel-wide chroma interpolation built on pmaddubsw.
;
; Macro arguments: %1 = put/avg, %2 = codec (selects rnd_1d_%2 / rnd_2d_%2),
;                  %3 = suffix of the generated symbol.
; Registers as used below: r0 = dst, r1 = src, r2 = stride, r3d = rows,
;                          r4d = x (mx), r5d = y (my).
; Dispatches to one of four paths: plain copy (x == y == 0), horizontal
; only (y == 0), vertical only (x == 0) and full bilinear. The filtered
; paths produce two rows per loop pass.
%macro chroma_mc8_ssse3_func 3
cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test         r5d, r5d
    je .my_is_zero
    test         r4d, r4d
    je .mx_is_zero

    ; general case, bilinear: pack the weights as byte pairs
    mov          r6d, r4d
    shl          r4d, 8
    sub           r4, r6
    add           r4, 8               ; x*255+8 = x<<8 | (8-x)
    mov           r6, 8
    sub          r6d, r5d
    imul          r6, r4              ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d             ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movdqa        m5, [rnd_2d_%2]
    pshuflw       m7, m7, 0
    pshuflw       m6, m6, 0
    movlhps       m7, m7              ; m7 = upper-row weight pair in every word
    movlhps       m6, m6              ; m6 = lower-row weight pair in every word

    movq          m0, [r1     ]
    movq          m1, [r1   +1]
    punpcklbw     m0, m1              ; m0 = first row as (src[i], src[i+1]) pairs
    add           r1, r2
.next2rows:
    movq          m1, [r1     ]
    movq          m2, [r1   +1]
    movq          m3, [r1+r2  ]
    movq          m4, [r1+r2+1]
    lea           r1, [r1+r2*2]
    punpcklbw     m1, m2
    punpcklbw     m3, m4
    movdqa        m2, m1
    movdqa        m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    movdqa        m0, m4              ; last row loaded becomes next pass' top row
    psrlw         m1, 6
    psrlw         m3, 6
%ifidn %1, avg
    movq          m2, [r0   ]
    movhps        m2, [r0+r2]
%endif
    packuswb      m1, m3
    CHROMAMC_AVG  m1, m2
    movq     [r0   ], m1
    movhps   [r0+r2], m1
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET

.my_is_zero:
    ; horizontal filter only
    mov          r5d, r4d
    shl          r4d, 8
    add           r4, 8
    sub           r4, r5              ; 255*x+8 = x<<8 | (8-x)
    movd          m7, r4d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2xrows:
    movq          m0, [r1     ]
    movq          m1, [r1   +1]
    movq          m2, [r1+r2  ]
    movq          m3, [r1+r2+1]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    chroma_mc8_ssse3_1d_rows %1
    jg .next2xrows
    REP_RET

.mx_is_zero:
    ; vertical filter only
    mov          r4d, r5d
    shl          r5d, 8
    add           r5, 8
    sub           r5, r4              ; 255*y+8 = y<<8 | (8-y)
    movd          m7, r5d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2yrows:
    movq          m0, [r1     ]
    movq          m1, [r1+r2  ]
    movdqa        m2, m1
    movq          m3, [r1+r2*2]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    chroma_mc8_ssse3_1d_rows %1
    jg .next2yrows
    REP_RET
%endmacro

601 | |

602 |
; 4-pixel-wide bilinear chroma interpolation built on pmaddubsw (MMX regs).
;
; Macro arguments: %1 = put/avg, %2 = codec, %3 = suffix of the symbol.
; Registers as used below: r0 = dst, r1 = src, r2 = stride, r3d = rows,
;                          r4d = x, r5d = y.
; The rounding constant is always pw_32 here; %2 only affects the symbol
; name. Two rows are produced per loop pass.
%macro chroma_mc4_ssse3_func 3
cglobal %1_%2_chroma_mc4_%3, 6, 7, 0
%ifdef ARCH_X86_64
    movsxd        r2, r2d
%endif
    ; pack the weights as byte pairs
    mov           r6, r4
    shl          r4d, 8
    sub          r4d, r6d
    add          r4d, 8               ; x*255+8 = x<<8 | (8-x)
    mov           r6, 8
    sub          r6d, r5d
    imul         r6d, r4d             ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d             ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movq          m5, [pw_32]
    pshufw        m7, m7, 0           ; m7 = upper-row weight pair in every word
    pshufw        m6, m6, 0           ; m6 = lower-row weight pair in every word

    movd          m0, [r1     ]
    punpcklbw     m0, [r1   +1]       ; m0 = first row as (src[i], src[i+1]) pairs
    add           r1, r2
.next2rows:
    movd          m1, [r1     ]
    movd          m3, [r1+r2  ]
    punpcklbw     m1, [r1   +1]
    punpcklbw     m3, [r1+r2+1]
    lea           r1, [r1+r2*2]
    movq          m2, m1
    movq          m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    movq          m0, m4              ; last row loaded becomes next pass' top row
    psrlw         m1, 6
    psrlw         m3, 6
    packuswb      m1, m1
    packuswb      m3, m3
    CHROMAMC_AVG  m1, [r0   ]
    CHROMAMC_AVG  m3, [r0+r2]
    movd     [r0   ], m1
    movd     [r0+r2], m3
    sub          r3d, 2
    lea           r0, [r0+r2*2]       ; lea keeps the flags of the sub for jg
    jg .next2rows
    REP_RET
%endmacro

655 | |

656 |
; SSSE3 "put" functions: no averaging with the destination.
%define CHROMAMC_AVG NOTHING
INIT_XMM
chroma_mc8_ssse3_func put, h264, ssse3_rnd
chroma_mc8_ssse3_func put, vc1,  ssse3_nornd
INIT_MMX
chroma_mc4_ssse3_func put, h264, ssse3

; SSSE3 "avg" functions: average with the destination using pavgb.
%define CHROMAMC_AVG DIRECT_AVG
%define PAVG         pavgb
INIT_XMM
chroma_mc8_ssse3_func avg, h264, ssse3_rnd
chroma_mc8_ssse3_func avg, vc1,  ssse3_nornd
INIT_MMX
chroma_mc4_ssse3_func avg, h264, ssse3