Revision d2bb7db1 libavcodec/i386/idct_mmx.c

View differences:

libavcodec/i386/idct_mmx.c
598 598
declare_idct (ff_mmx_idct, mmx_table,
599 599
	      mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
600 600

  
601

  
602

  
603
/* in/out: mma=mma+mmb, mmb=mmb-mma */
/* Packed-word butterfly without a scratch register: after a += b,
 * b is doubled and a subtracted, giving 2b - (a+b) = (original b) - (original a). */
#define SUMSUB_BA( a, b ) \
    "paddw "#b", "#a" \n\t"\
    "paddw "#b", "#b" \n\t"\
    "psubw "#a", "#b" \n\t"
608

  
609
/* Two independent SUMSUB_BA butterflies with their instructions interleaved:
 * a=a+b, b=b-a and c=c+d, d=d-c (packed words, same no-scratch trick). */
#define SUMSUB_BADC( a, b, c, d ) \
    "paddw "#b", "#a" \n\t"\
    "paddw "#d", "#c" \n\t"\
    "paddw "#b", "#b" \n\t"\
    "paddw "#d", "#d" \n\t"\
    "psubw "#a", "#b" \n\t"\
    "psubw "#c", "#d" \n\t"
616

  
617
/* Butterfly with one input halved on each side (arithmetic >>1):
 * out: b = a + (b>>1),  a = (a>>1) - (original b).
 * t is clobbered as a scratch register holding the original b. */
#define SUMSUBD2_AB( a, b, t ) \
    "movq  "#b", "#t" \n\t"\
    "psraw  $1 , "#b" \n\t"\
    "paddw "#a", "#b" \n\t"\
    "psraw  $1 , "#a" \n\t"\
    "psubw "#t", "#a" \n\t"
623

  
624
/* One 1-D pass of the 4-point inverse transform over packed words.
 * The names suggest s02/d02 are the sum/difference of elements 0,2 and
 * s13/d13 of elements 1,3 — see the register-mapping comments at the call
 * sites in ff_h264_idct_add_mmx2. t is scratch (clobbered by SUMSUBD2_AB). */
#define IDCT4_1D( s02, s13, d02, d13, t ) \
    SUMSUB_BA  ( s02, d02 )\
    SUMSUBD2_AB( s13, d13, t )\
    SUMSUB_BADC( d13, s02, s13, d02 )
628

  
629
/* Interleave the elements of a and b at granularity n (n = wd for 16-bit
 * words, dq for 32-bit dwords): low halves interleave into a, high halves
 * into t; b is preserved. The letter comments track element positions
 * (a holds abcd, b holds efgh before the shuffle). */
#define SBUTTERFLY( a, b, t, n )  \
    "movq   "#a", "#t"       \n\t" /* abcd */\
    "punpckl"#n"  "#b", "#a" \n\t" /* aebf */\
    "punpckh"#n"  "#b", "#t" \n\t" /* cgdh */
633

  
634
/* Transpose a 4x4 matrix of 16-bit words held in a,b,c,d (t = scratch).
 * Per the element-tracking comments, the output rows land in permuted
 * registers: a=aeim, d=bfjn, t=cgko, c=dhlp — callers must account for
 * this permutation (see the "in: ... out: ..." note at the call site). */
#define TRANSPOSE4( a, b, c, d, t ) \
    SBUTTERFLY( a, b, t, wd ) /* a=aebf t=cgdh */\
    SBUTTERFLY( c, d, b, wd ) /* c=imjn b=kolp */\
    SBUTTERFLY( a, c, d, dq ) /* a=aeim d=bfjn */\
    SBUTTERFLY( t, b, c, dq ) /* t=cgko c=dhlp */
639

  
640
/* Add four 16-bit residuals in p to the four pixels at (%0):
 * descale p by >>6 (arithmetic), load 4 dst bytes, widen them to words by
 * unpacking against z (z must hold zero), saturating signed add, pack back
 * to bytes with unsigned saturation, store. Asm operand %0 must be the
 * destination pointer; p and t are clobbered. */
#define STORE_DIFF_4P( p, t, z ) \
        "psraw      $6,     "#p" \n\t"\
        "movd       (%0),   "#t" \n\t"\
        "punpcklbw "#z",    "#t" \n\t"\
        "paddsw    "#t",    "#p" \n\t"\
        "packuswb  "#z",    "#p" \n\t"\
        "movd      "#p",    (%0) \n\t"
647

  
648
/* Four packed 16-bit words of value 32: the rounding bias added before the
 * final >>6 descale performed in STORE_DIFF_4P. 8-byte aligned for movq. */
static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
649

  
650
/**
 * Apply the 4x4 inverse transform to the 16 coefficients in block and add
 * the result to the 4x4 pixel area at dst (rows are stride bytes apart).
 * Row pass -> transpose -> column pass -> descale/add, all in MMX registers.
 *
 * @param dst    top-left pixel of the destination 4x4 area (read-modified)
 * @param block  16 int16_t coefficients, one 4-element row per 8 bytes
 * @param stride byte distance between destination rows
 */
void ff_h264_idct_add_mmx2(uint8_t *dst, int16_t *block, int stride)
{
    /* Load dct coeffs: one row of four words per register, mm0..mm3. */
    asm volatile(
        "movq   (%0), %%mm0 \n\t"
        "movq  8(%0), %%mm1 \n\t"
        "movq 16(%0), %%mm2 \n\t"
        "movq 24(%0), %%mm3 \n\t"
    :: "r"(block) );

    asm volatile(
        /* First (horizontal) 1-D pass.
         * mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13 */
        IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )

        "movq      %0,    %%mm6 \n\t" /* mm6 = ff_pw_32 rounding bias */
        /* Transpose to switch to the vertical pass; output registers are
         * permuted by TRANSPOSE4. in: 1,4,0,2  out: 1,2,3,0 */
        TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )

        /* Fold the +32 rounding bias in before the second pass, so the
         * final >>6 in STORE_DIFF_4P rounds instead of truncating. */
        "paddw     %%mm6, %%mm3 \n\t"

        /* Second (vertical) 1-D pass.
         * mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13 */
        IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )

        "pxor %%mm7, %%mm7    \n\t" /* mm7 = 0, zero reg for STORE_DIFF_4P */
    :: "m"(ff_pw_32));

    /* Descale each result row and add it to dst, advancing dst by stride
     * between rows. mm1 is scratch, mm7 stays zero throughout. */
    asm volatile(
    STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
        "add %1, %0             \n\t"
    STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
        "add %1, %0             \n\t"
    STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
        "add %1, %0             \n\t"
    STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
        : "+r"(dst)
        : "r" ((long)stride)
    );
    /* NOTE(review): no emms is issued here — presumably the caller is
     * responsible for restoring x87/MMX state; confirm against the
     * surrounding file's conventions. */
}

Also available in: Unified diff