Revision dd68d4db libavcodec/x86/h264_intrapred.asm

View differences:

libavcodec/x86/h264_intrapred.asm
SECTION_RODATA

tm_shuf: times 8 db 0x03, 0x80
plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
             db  1,  2,  3,  4,  5,  6,  7,  8
plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
             db  1,  2,  3,  4,  0,  0,  0,  0
pw_0to7:     dw  0,  1,  2,  3,  4,  5,  6,  7
pw_1to8:     dw  1,  2,  3,  4,  5,  6,  7,  8
pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4
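
; (The constants added in this revision are weight tables: plane_shuf and
; plane8_shuf hold signed byte weights for the ssse3 pmaddubsw paths, and the
; pw_* tables hold word weights for the pmullw paths of the plane predictors
; introduced below.)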

SECTION .text

cextern pb_1
cextern pb_3
cextern pw_5
cextern pw_16
cextern pw_17
cextern pw_32

;-----------------------------------------------------------------------------
; void pred16x16_vertical(uint8_t *src, int stride)
......
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
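
; The macro below takes three arguments: the instruction set (mmx, mmx2,
; sse2, ssse3), the number of XMM registers to request from cglobal, and the
; codec variant (h264, rv40, svq3), the variants differing only in how the
; H/V gradients are scaled. As a reference, here is a minimal C sketch of the
; scalar predictor being vectorized (modelled on pred16x16_plane_compat() in
; FFmpeg's h264pred.c; illustrative only, not part of this revision):
;
;   static void pred16x16_plane(uint8_t *src, int stride)
;   {
;       int k, x, y, H = 0, V = 0;
;       const uint8_t *top = src - stride;            /* row above the block */
;       for (k = 1; k <= 8; k++) {
;           H += k * (top[7 + k] - top[7 - k]);       /* top-row gradient    */
;           V += k * (src[(7 + k) * stride - 1] -     /* left-column (x=-1)  */
;                     src[(7 - k) * stride - 1]);     /* gradient            */
;       }
;       H = (5 * H + 32) >> 6;                        /* h264 scaling; rv40  */
;       V = (5 * V + 32) >> 6;                        /* and svq3 differ     */
;       {
;           int a = 16 * (src[15 * stride - 1] + top[15] + 1) - 7 * (H + V);
;           for (y = 0; y < 16; y++, a += V)
;               for (x = 0; x < 16; x++)              /* av_clip_uint8():    */
;                   src[y * stride + x] =             /* clamp to 0..255     */
;                       av_clip_uint8((a + x * H) >> 5);
;       }
;   }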

%macro H264_PRED16x16_PLANE 3
cglobal pred16x16_plane_%3_%1, 2, 7, %2
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    movh         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m4, m4
    movh         m1, [r0+r1  +3 ]
    movh         m2, [r0+r1  +8 ]
    movh         m3, [r0+r1  +12]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    pmullw       m0, [pw_m8tom1  ]
    pmullw       m1, [pw_m8tom1+8]
    pmullw       m2, [pw_1to8    ]
    pmullw       m3, [pw_1to8  +8]
    paddw        m0, m2
    paddw        m1, m3
%else ; mmsize == 16
%ifidn %1, sse2
    pxor         m2, m2
    movh         m1, [r0+r1  +8]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m8tom1]
    pmullw       m1, [pw_1to8]
    paddw        m0, m1
%else ; ssse3
    movhps       m0, [r0+r1  +8]
    pmaddubsw    m0, [plane_shuf] ; H coefficients
%endif
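;     (In the ssse3 path just above, pmaddubsw multiplies the 16 unsigned
;     top-row bytes in m0 by the signed weights in plane_shuf, {-8..-1,1..8},
;     and sums adjacent pairs, so eight partial H terms fall out of a single
;     instruction; the unweighted centre pixel top[7] is never loaded.)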
    movhlps      m1, m0
%endif
    paddw        m0, m1
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 32
%elifidn %1, mmx2
    pshufw       m1, m0, 0xE
%else ; mmsize == 16
    pshuflw      m1, m0, 0xE
%endif
    paddw        m0, m1
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 16
%elifidn %1, mmx2
    pshufw       m1, m0, 0x1
%else
    pshuflw      m1, m0, 0x1
%endif
    paddw        m0, m1           ; sum of H coefficients
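;     The low word of m0 now holds H = sum(k=1..8) k*(top[7+k] - top[7-k]),
;     where top = src - stride (the row above the block).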

%ifidn %3, h264
    pmullw       m0, [pw_5]
    paddw        m0, [pw_32]
    psraw        m0, 6
%elifidn %3, rv40
    pmullw       m0, [pw_5]
    psraw        m0, 6
%elifidn %3, svq3
    movd         r3, m0
    movsx        r3, r3w
    test         r3, r3
    lea          r4, [r3+3]
    cmovs        r3, r4
    sar          r3, 2           ; H/4
    lea          r3, [r3*5]      ; 5*(H/4)
    test         r3, r3
    lea          r4, [r3+15]
    cmovs        r3, r4
    sar          r3, 4           ; (5*(H/4))/16
    movd         m0, r3d
%endif
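;     Scalar forms of the three scalings above:
;       h264: H = (5*H + 32) >> 6
;       rv40: H = (5*H) >> 6
;       svq3: H = 5*(H/4)/16 with C truncating division; the test/lea/cmovs
;             pairs add 3 (or 15) to negative values before the arithmetic
;             shift so the result rounds toward zero.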

    lea          r4, [r0+r2*8-1]
    lea          r3, [r0+r2*4-1]
    add          r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r5, [r5+r6*2]

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]

    movzx     e_reg, byte [r3        ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r4+r2     ]
    sub         r10, e_reg
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif

    lea       e_reg, [r3+r1*4]
    lea          r3, [r4+r2*4]

    movzx        r4, byte [e_reg+r2  ]
    movzx        r6, byte [r3        ]
    sub          r6, r4
%ifdef ARCH_X86_64
    lea          r6, [r10+r6*2]
    lea          r5, [r5+r6*2]
    add          r5, r6
%else
    lea          r5, [r5+r6*4]
    lea          r5, [r5+r6*2]
%endif

    movzx        r4, byte [e_reg     ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r3   +r2  ]
    sub         r10, r4
    sub          r5, r10
%else
    movzx        r6, byte [r3   +r2  ]
    sub          r6, r4
    lea          r5, [r5+r6*8]
    sub          r5, r6
%endif

    movzx        r4, byte [e_reg+r1  ]
    movzx        r6, byte [r3   +r2*2]
    sub          r6, r4
%ifdef ARCH_X86_64
    add          r6, r10
%endif
    lea          r5, [r5+r6*8]

    movzx        r4, byte [e_reg+r2*2]
    movzx        r6, byte [r3   +r1  ]
    sub          r6, r4
    lea          r5, [r5+r6*4]
    add          r5, r6           ; sum of V coefficients
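;     r5 now holds V = sum(k=1..8) k*(src[(7+k)*stride-1] - src[(7-k)*stride-1]);
;     the movzx/lea/add ladder above applies the weights 1..8 without multiplies.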

%ifndef ARCH_X86_64
    mov          r0, r0m
%endif

%ifidn %3, h264
    lea          r5, [r5*5+32]
    sar          r5, 6
%elifidn %3, rv40
    lea          r5, [r5*5]
    sar          r5, 6
%elifidn %3, svq3
    test         r5, r5
    lea          r6, [r5+3]
    cmovs        r5, r6
    sar          r5, 2            ; V/4
    lea          r5, [r5*5]       ; 5*(V/4)
    test         r5, r5
    lea          r6, [r5+15]
    cmovs        r5, r6
    sar          r5, 4            ; (5*(V/4))/16
%endif

    movzx        r4, byte [r0+r1  +15]
    movzx        r3, byte [r3+r2*2   ]
    lea          r3, [r3+r4+1]
    shl          r3, 4
    movd        r1d, m0
    movsx       r1d, r1w
    add         r1d, r5d
    add         r3d, r1d
    shl         r1d, 3
    sub         r3d, r1d          ; a
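;     r3d now holds a = 16*(src[15*stride-1] + src[15-stride] + 1) - 7*(H+V);
;     the add/shl/sub triple computes the -7*(H+V) term without a multiply.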

    movd         m1, r5d
    movd         m3, r3d
%ifidn %1, mmx
    punpcklwd    m0, m0
    punpcklwd    m1, m1
    punpcklwd    m3, m3
    punpckldq    m0, m0
    punpckldq    m1, m1
    punpckldq    m3, m3
%elifidn %1, mmx2
    pshufw       m0, m0, 0x0
    pshufw       m1, m1, 0x0
    pshufw       m3, m3, 0x0
%else
    pshuflw      m0, m0, 0x0
    pshuflw      m1, m1, 0x0
    pshuflw      m3, m3, 0x0
    punpcklqdq   m0, m0           ; splat H (words)
    punpcklqdq   m1, m1           ; splat V (words)
    punpcklqdq   m3, m3           ; splat a (words)
%endif
%ifidn %3, svq3
    SWAP          0, 1
%endif
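;     (For svq3, H and V are deliberately swapped; the scalar reference does
;     the same, where the swap is noted as required for bit-exact output.)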
    mova         m2, m0
%if mmsize == 8
    mova         m5, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
%if mmsize == 16
    psllw        m2, 3
%else
    psllw        m5, 3
    psllw        m2, 2
    mova         m6, m5
    paddw        m6, m2
%endif
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
    paddw        m2, m0           ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
    paddw        m5, m0           ; a + {8,9,10,11}*H
    paddw        m6, m0           ; a + {12,13,14,15}*H
%endif
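
;     Note: for mmsize == 8 the column split differs from the comments above;
;     there m0/m2 hold a + {0..3}*H / a + {4..7}*H, with m5/m6 holding the
;     upper eight columns. Each output pixel in the loop below is
;     clip((a + x*H + y*V) >> 5): psraw by 5 plus packuswb performs the shift
;     and the unsigned-byte clip, and adding m1 (V) after each row advances y.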

    mov          r4, 8
.loop
    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova       [r0], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova     [r0+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    mova         m3, m0           ; b[0..7]
    mova         m4, m2           ; b[8..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova    [r0+r2], m3
%if mmsize == 8
    mova         m3, m5           ; b[8..11]
    mova         m4, m6           ; b[12..15]
    psraw        m3, 5
    psraw        m4, 5
    packuswb     m3, m4
    mova  [r0+r2+8], m3
%endif
    paddw        m0, m1
    paddw        m2, m1
%if mmsize == 8
    paddw        m5, m1
    paddw        m6, m1
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED16x16_PLANE mmx,   0, h264
H264_PRED16x16_PLANE mmx,   0, rv40
H264_PRED16x16_PLANE mmx,   0, svq3
H264_PRED16x16_PLANE mmx2,  0, h264
H264_PRED16x16_PLANE mmx2,  0, rv40
H264_PRED16x16_PLANE mmx2,  0, svq3
INIT_XMM
H264_PRED16x16_PLANE sse2,  8, h264
H264_PRED16x16_PLANE sse2,  8, rv40
H264_PRED16x16_PLANE sse2,  8, svq3
H264_PRED16x16_PLANE ssse3, 8, h264
H264_PRED16x16_PLANE ssse3, 8, rv40
H264_PRED16x16_PLANE ssse3, 8, svq3

;-----------------------------------------------------------------------------
; void pred8x8_plane(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
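
; Reference: a minimal C sketch of the scalar 8x8 (chroma) plane predictor
; (modelled on pred8x8_plane() in FFmpeg's h264pred.c; illustrative only,
; not part of this revision):
;
;   static void pred8x8_plane(uint8_t *src, int stride)
;   {
;       int k, x, y, H = 0, V = 0;
;       const uint8_t *top = src - stride;
;       for (k = 1; k <= 4; k++) {
;           H += k * (top[3 + k] - top[3 - k]);
;           V += k * (src[(3 + k) * stride - 1] - src[(3 - k) * stride - 1]);
;       }
;       H = (17 * H + 16) >> 5;
;       V = (17 * V + 16) >> 5;
;       {
;           int a = 16 * (src[7 * stride - 1] + top[7] + 1) - 3 * (H + V);
;           for (y = 0; y < 8; y++, a += V)
;               for (x = 0; x < 8; x++)
;                   src[y * stride + x] = av_clip_uint8((a + x * H) >> 5);
;       }
;   }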

%macro H264_PRED8x8_PLANE 2
cglobal pred8x8_plane_%1, 2, 7, %2
    mov          r2, r1           ; +stride
    neg          r1               ; -stride

    movd         m0, [r0+r1  -1]
%if mmsize == 8
    pxor         m2, m2
    movh         m1, [r0+r1  +4 ]
    punpcklbw    m0, m2
    punpcklbw    m1, m2
    pmullw       m0, [pw_m4to4]
    pmullw       m1, [pw_m4to4+8]
%else ; mmsize == 16
%ifidn %1, sse2
    pxor         m2, m2
    movd         m1, [r0+r1  +4]
    punpckldq    m0, m1
    punpcklbw    m0, m2
    pmullw       m0, [pw_m4to4]
%else ; ssse3
    movhps       m0, [r0+r1  +4]   ; this reads 4 bytes more than necessary
    pmaddubsw    m0, [plane8_shuf] ; H coefficients
%endif
    movhlps      m1, m0
%endif
    paddw        m0, m1

%ifnidn %1, ssse3
%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 32
%elifidn %1, mmx2
    pshufw       m1, m0, 0xE
%else ; mmsize == 16
    pshuflw      m1, m0, 0xE
%endif
    paddw        m0, m1
%endif ; !ssse3

%ifidn %1, mmx
    mova         m1, m0
    psrlq        m1, 16
%elifidn %1, mmx2
    pshufw       m1, m0, 0x1
%else
    pshuflw      m1, m0, 0x1
%endif
    paddw        m0, m1           ; sum of H coefficients

    pmullw       m0, [pw_17]
    paddw        m0, [pw_16]
    psraw        m0, 5
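;     i.e. H = (17*H + 16) >> 5: the 8x8 variant scales the gradient by
;     roughly 17/32 instead of the 5/64 used for 16x16.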

    lea          r4, [r0+r2*4-1]
    lea          r3, [r0     -1]
    add          r4, r2

%ifdef ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
%endif

    movzx     e_reg, byte [r3+r2*2   ]
    movzx        r5, byte [r4+r1     ]
    sub          r5, e_reg

    movzx     e_reg, byte [r3        ]
%ifdef ARCH_X86_64
    movzx       r10, byte [r4+r2     ]
    sub         r10, e_reg
    sub          r5, r10
%else
    movzx        r6, byte [r4+r2     ]
    sub          r6, e_reg
    lea          r5, [r5+r6*4]
    sub          r5, r6
%endif

    movzx     e_reg, byte [r3+r1     ]
    movzx        r6, byte [r4+r2*2   ]
    sub          r6, e_reg
%ifdef ARCH_X86_64
    add          r6, r10
%endif
    lea          r5, [r5+r6*4]

    movzx     e_reg, byte [r3+r2     ]
    movzx        r6, byte [r4        ]
    sub          r6, e_reg
    lea          r6, [r5+r6*2]

    lea          r5, [r6*9+16]
    lea          r5, [r5+r6*8]
    sar          r5, 5
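;     r6 holds V = sum(k=1..4) k*(src[(3+k)*stride-1] - src[(3-k)*stride-1]);
;     the two lea plus sar evaluate (17*V + 16) >> 5 as ((9*V + 16) + 8*V) >> 5.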

%ifndef ARCH_X86_64
    mov          r0, r0m
%endif

    movzx        r3, byte [r4+r2*2  ]
    movzx        r4, byte [r0+r1  +7]
    lea          r3, [r3+r4+1]
    shl          r3, 4
    movd        r1d, m0
    movsx       r1d, r1w
    add         r1d, r5d
    sub         r3d, r1d
    add         r1d, r1d
    sub         r3d, r1d          ; a
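;     r3d now holds a = 16*(src[7*stride-1] + src[7-stride] + 1) - 3*(H+V);
;     subtract, double, subtract again yields the -3*(H+V) term.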

    movd         m1, r5d
    movd         m3, r3d
%ifidn %1, mmx
    punpcklwd    m0, m0
    punpcklwd    m1, m1
    punpcklwd    m3, m3
    punpckldq    m0, m0
    punpckldq    m1, m1
    punpckldq    m3, m3
%elifidn %1, mmx2
    pshufw       m0, m0, 0x0
    pshufw       m1, m1, 0x0
    pshufw       m3, m3, 0x0
%else
    pshuflw      m0, m0, 0x0
    pshuflw      m1, m1, 0x0
    pshuflw      m3, m3, 0x0
    punpcklqdq   m0, m0           ; splat H (words)
    punpcklqdq   m1, m1           ; splat V (words)
    punpcklqdq   m3, m3           ; splat a (words)
%endif
%if mmsize == 8
    mova         m2, m0
%endif
    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
    psllw        m2, 2
    paddw        m2, m0           ; a + {4,5,6,7}*H
%endif

    mov          r4, 4
ALIGN 16
.loop
%if mmsize == 16
    mova         m3, m0           ; b[0..7]
    paddw        m0, m1
    psraw        m3, 5
    mova         m4, m0           ; V+b[0..7]
    paddw        m0, m1
    psraw        m4, 5
    packuswb     m3, m4
    movh       [r0], m3
    movhps  [r0+r2], m3
%else ; mmsize == 8
    mova         m3, m0           ; b[0..3]
    mova         m4, m2           ; b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m3, 5
    psraw        m4, 5
    mova         m5, m0           ; V+b[0..3]
    mova         m6, m2           ; V+b[4..7]
    paddw        m0, m1
    paddw        m2, m1
    psraw        m5, 5
    psraw        m6, 5
    packuswb     m3, m4
    packuswb     m5, m6
    mova       [r0], m3
    mova    [r0+r2], m5
%endif

    lea          r0, [r0+r2*2]
    dec          r4
    jg .loop
    REP_RET
%endmacro

INIT_MMX
H264_PRED8x8_PLANE mmx,   0
H264_PRED8x8_PLANE mmx2,  0
INIT_XMM
H264_PRED8x8_PLANE sse2,  8
H264_PRED8x8_PLANE ssse3, 8

;-----------------------------------------------------------------------------
; void pred8x8_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------