Revision 6d4c49a2 libavcodec/x86/dsputil_mmx.c

View differences:

libavcodec/x86/dsputil_mmx.c
579 579
        dst[i+0] += src[i+0];
580 580
}
581 581

  
582
/**
 * Byte-wise sum of two buffers: dst[i] = src1[i] + src2[i] for 0 <= i < w.
 *
 * The inline asm handles 16 bytes per pass with two MMX registers (paddb
 * adds each byte lane independently); the C loop after it finishes
 * whatever bytes remain.
 *
 * asm operands: %0 = i (byte offset, read and written), %1 = dst,
 *               %2 = src1, %3 = src2, %4 = w-15.
 *
 * NOTE(review): the asm stores through dst but declares no "memory"
 * clobber (and no mm0/mm1 clobbers) -- confirm this is intentional.
 */
static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
583
    x86_reg i=0;
584
    __asm__ volatile(
585
        "jmp 2f                         \n\t" /* check the bound before the first pass */
586
        "1:                             \n\t"
587
        "movq   (%2, %0), %%mm0         \n\t" /* mm0 = src1[i .. i+7] */
588
        "movq  8(%2, %0), %%mm1         \n\t" /* mm1 = src1[i+8 .. i+15] */
589
        "paddb  (%3, %0), %%mm0         \n\t" /* add the matching src2 bytes, lane by lane */
590
        "paddb 8(%3, %0), %%mm1         \n\t"
591
        "movq %%mm0,  (%1, %0)          \n\t" /* dst[i .. i+7] = mm0 */
592
        "movq %%mm1, 8(%1, %0)          \n\t" /* dst[i+8 .. i+15] = mm1 */
593
        "add $16, %0                    \n\t"
594
        "2:                             \n\t"
595
        "cmp %4, %0                     \n\t"
596
        " js 1b                         \n\t" /* loop while i - (w-15) is negative */
597
        : "+r" (i)
598
        : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15)
599
    );
600
    for(; i<w; i++) /* tail: bytes the 16-byte loop did not cover */
601
        dst[i] = src1[i] + src2[i];
602
}
603

  
604 582
#if HAVE_7REGS && HAVE_TEN_OPERANDS
605 583
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
606 584
    x86_reg w2 = -w;
......
876 854
    }
877 855
}
878 856

  
879
/**
 * Expands to add_png_paeth_prediction_<cpu>(dst, src, top, w, bpp).
 *
 * cpu:  suffix pasted onto the generated function's name.
 * abs3: asm string fragment that must replace mm3, mm4 and mm5 with their
 *       per-word absolute values (see the ABS3_* macros).
 *
 * The generated loop works on 4 bytes per pass (movd), widened to 16-bit
 * words, and advances by bpp bytes. With a = dst[i-bpp] (left),
 * b = top[i] (above) and c = top[i-bpp] (above-left) it forms
 * pa = |c-b|, pb = |c-a| and pc = |2c-a-b|, selects a, b or c from those
 * distances via compare masks, adds src[i], masks each word with
 * ff_pw_255 and stores the packed bytes to dst[i]. The selection rule
 * matches the PNG Paeth predictor.
 *
 * asm operands: %0 = i (starts at -bpp), %1 = dst, %2 = top, %3 = src,
 *               %4 = bpp, %5 = end (w-3), %6 = ff_pw_255.
 *
 * NOTE(review): each pass loads and stores a full 4 bytes whatever bpp is
 * -- presumably callers guarantee bpp <= 4 and enough readable/writable
 * padding around the buffers; confirm against the caller.
 */
#define PAETH(cpu, abs3)\
880
static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
881
{\
882
    x86_reg i = -bpp;\
883
    x86_reg end = w-3;\
884
    __asm__ volatile(\
885
        "pxor      %%mm7, %%mm7 \n" /* mm7 = 0, used to widen bytes to words */\
886
        "movd    (%1,%0), %%mm0 \n" /* mm0 = dst[-bpp ..]: left pixel */\
887
        "movd    (%2,%0), %%mm1 \n" /* mm1 = top[-bpp ..]: above-left pixel */\
888
        "punpcklbw %%mm7, %%mm0 \n"\
889
        "punpcklbw %%mm7, %%mm1 \n"\
890
        "add       %4, %0 \n" /* i = 0 */\
891
        "1: \n"\
892
        "movq      %%mm1, %%mm2 \n" /* mm2 = c (top loaded on the previous pass) */\
893
        "movd    (%2,%0), %%mm1 \n" /* mm1 = b = top[i] */\
894
        "movq      %%mm2, %%mm3 \n"\
895
        "punpcklbw %%mm7, %%mm1 \n"\
896
        "movq      %%mm2, %%mm4 \n"\
897
        "psubw     %%mm1, %%mm3 \n" /* mm3 = c - b */\
898
        "psubw     %%mm0, %%mm4 \n" /* mm4 = c - a */\
899
        "movq      %%mm3, %%mm5 \n"\
900
        "paddw     %%mm4, %%mm5 \n" /* mm5 = 2c - a - b */\
901
        abs3 /* mm3 = pa, mm4 = pb, mm5 = pc */\
902
        "movq      %%mm4, %%mm6 \n"\
903
        "pminsw    %%mm5, %%mm6 \n" /* mm6 = min(pb, pc) */\
904
        "pcmpgtw   %%mm6, %%mm3 \n" /* mm3 = mask: pa > min(pb, pc) */\
905
        "pcmpgtw   %%mm5, %%mm4 \n" /* mm4 = mask: pb > pc */\
906
        "movq      %%mm4, %%mm6 \n"\
907
        "pand      %%mm3, %%mm4 \n" /* mm4 = mask: pick c */\
908
        "pandn     %%mm3, %%mm6 \n" /* mm6 = mask: pick b */\
909
        "pandn     %%mm0, %%mm3 \n" /* mm3 = a where a is picked, else 0 */\
910
        "movd    (%3,%0), %%mm0 \n" /* mm0 = src[i] */\
911
        "pand      %%mm1, %%mm6 \n"\
912
        "pand      %%mm4, %%mm2 \n"\
913
        "punpcklbw %%mm7, %%mm0 \n"\
914
        "movq      %6,    %%mm5 \n"\
915
        "paddw     %%mm6, %%mm0 \n"\
916
        "paddw     %%mm2, %%mm3 \n"\
917
        "paddw     %%mm3, %%mm0 \n" /* mm0 = src[i] + selected predictor */\
918
        "pand      %%mm5, %%mm0 \n" /* mask each word with ff_pw_255 */\
919
        "movq      %%mm0, %%mm3 \n"\
920
        "packuswb  %%mm3, %%mm3 \n"\
921
        "movd      %%mm3, (%1,%0) \n" /* store 4 bytes; mm0 stays as the next left pixel */\
922
        "add       %4, %0 \n"\
923
        "cmp       %5, %0 \n"\
924
        "jle 1b \n" /* loop while i <= end */\
925
        :"+r"(i)\
926
        :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
927
         "m"(ff_pw_255)\
928
        :"memory"\
929
    );\
930
}
931

  
932
/**
 * abs3 fragment for PAETH built from psubw/pmaxsw: each of mm3, mm4 and
 * mm5 is replaced by max(x, 0 - x) per 16-bit word.
 * Expects mm7 == 0 on entry, uses mm6 and mm7 as scratch, and zeroes mm7
 * again as its last instruction.
 */
#define ABS3_MMX2\
933
        "psubw     %%mm5, %%mm7 \n"\
934
        "pmaxsw    %%mm7, %%mm5 \n"\
935
        "pxor      %%mm6, %%mm6 \n"\
936
        "pxor      %%mm7, %%mm7 \n"\
937
        "psubw     %%mm3, %%mm6 \n"\
938
        "psubw     %%mm4, %%mm7 \n"\
939
        "pmaxsw    %%mm6, %%mm3 \n"\
940
        "pmaxsw    %%mm7, %%mm4 \n"\
941
        "pxor      %%mm7, %%mm7 \n"
942

  
943
/**
 * abs3 fragment for PAETH: pabsw replaces mm3, mm4 and mm5 with their
 * per-word absolute values; no other register is touched.
 */
#define ABS3_SSSE3\
944
        "pabsw     %%mm3, %%mm3 \n"\
945
        "pabsw     %%mm4, %%mm4 \n"\
946
        "pabsw     %%mm5, %%mm5 \n"
947

  
948
/* Instantiate add_png_paeth_prediction_mmx2, plus
 * add_png_paeth_prediction_ssse3 when HAVE_SSSE3 is set. */
PAETH(mmx2, ABS3_MMX2)
949
#if HAVE_SSSE3
950
PAETH(ssse3, ABS3_SSSE3)
951
#endif
952

  
953 857
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
954 858
        "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
955 859
        "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
......
2537 2441
#endif
2538 2442

  
2539 2443
        c->add_bytes= add_bytes_mmx;
2540
        c->add_bytes_l2= add_bytes_l2_mmx;
2541 2444

  
2542 2445
        if (!h264_high_depth)
2543 2446
        c->draw_edges = draw_edges_mmx;
......
2658 2561
                c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2659 2562
#endif
2660 2563

  
2661
            c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
2662 2564
        } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
2663 2565
            c->prefetch = prefetch_3dnow;
2664 2566

  
......
2772 2674
            H264_QPEL_FUNCS(3, 2, ssse3);
2773 2675
            H264_QPEL_FUNCS(3, 3, ssse3);
2774 2676
            }
2775
            c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
2776 2677
#if HAVE_YASM
2777 2678
            if (!h264_high_depth) {
2778 2679
            c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;

Also available in: Unified diff