Revision e8600e5e

View differences:

libavcodec/i386/dsputil_mmx.c
2569 2569
extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
2570 2570
extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
2571 2571
extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
2572
extern void ff_snow_inner_add_yblock_sse2(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
2573
                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
2574
extern void ff_snow_inner_add_yblock_mmx(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
2575
                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
2572 2576
#endif
2573 2577

  
2574 2578
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
......
2962 2966
        if(mm_flags & MM_SSE2){
2963 2967
            c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
2964 2968
            c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
2969
            c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
2965 2970
        }
2966 2971
        else{
2967 2972
            c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
2968 2973
            c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
2974
            c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
2969 2975
        }
2970 2976
#endif
2971 2977
    }
libavcodec/i386/snowdsp_mmx.c
653 653
        "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5):
654 654
        "%"REG_a"","%"REG_b"","%"REG_c"");
655 655
}
656

  
657
/* Shared prologue for the SSE2 inner_add_yblock kernels.
 * Sets up the slice-buffer line array, loads loop-invariant operands into
 * registers, zeroes xmm7 (used as the zero half for byte->word unpacks) and
 * builds 0x00000080 in every dword of xmm3 (all-ones << 31 then >> 24), the
 * rounding bias added before the later "psrad $8" shifts.
 * Register roles (from the mov operands; operand list is in the _end_common2
 * macro): REG_c = src_stride (%6), REG_b = row counter b_h (%5),
 * REG_S = obmc pointer (%3); each iteration reloads REG_D from the current
 * dst_array entry (%1) and offsets it by src_x<<2 (%2). */
#define snow_inner_add_yblock_sse2_header \
    DWTELEM * * dst_array = sb->line + src_y;\
    asm volatile(\
             "mov  %6, %%"REG_c"             \n\t"\
             "mov  %5, %%"REG_b"             \n\t"\
             "mov  %3, %%"REG_S"             \n\t"\
             "pxor %%xmm7, %%xmm7            \n\t" /* 0 */\
             "pcmpeqd %%xmm3, %%xmm3         \n\t"\
             "pslld $31, %%xmm3              \n\t"\
             "psrld $24, %%xmm3              \n\t" /* FRAC_BITS >> 1 */\
             "1:                             \n\t"\
             "mov %1, %%"REG_D"              \n\t"\
             "mov (%%"REG_D"), %%"REG_D"     \n\t"\
             "add %2, %%"REG_D"              \n\t"
671

  
672
/* Weighted load for the 8-wide kernel: fetch the pointer block[ptr_offset],
 * read 8 source bytes from its current row (out_reg1) and 8 from the row
 * src_stride further on (out_reg2), zero-extend both to words, and multiply
 * by the obmc weights at s_offset and s_offset+16 (two consecutive obmc
 * rows of a 16-byte-stride obmc table). */
#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
             "movq (%%"REG_d"), %%"out_reg1" \n\t"\
             "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
             "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
             "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
             "punpcklbw %%xmm7, %%xmm0       \n\t"\
             "punpcklbw %%xmm7, %%xmm4       \n\t"\
             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
             "pmullw %%xmm4, %%"out_reg2"    \n\t"
684

  
685
/* Weighted load for the 16-wide kernel: fetch block[ptr_offset], read 16
 * source bytes of one row as two movq halves (out_reg1 = bytes 0..7,
 * out_reg2 = bytes 8..15), zero-extend to words and multiply by the obmc
 * weights at s_offset and s_offset+8. */
#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
             "movq (%%"REG_d"), %%"out_reg1" \n\t"\
             "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
             "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
             "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
             "punpcklbw %%xmm7, %%xmm0       \n\t"\
             "punpcklbw %%xmm7, %%xmm4       \n\t"\
             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
             "pmullw %%xmm4, %%"out_reg2"    \n\t"
697

  
698
/* 8-wide accumulate step: compute one more weighted contribution into
 * xmm2/xmm6 and add it (unsigned saturating) to the running sums in
 * xmm1/xmm5 produced by _start_8. */
#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
             snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
             "paddusw %%xmm2, %%xmm1         \n\t"\
             "paddusw %%xmm6, %%xmm5         \n\t"
702

  
703
/* 16-wide accumulate step: same as _accum_8 but using the 16-wide weighted
 * load; saturating-add into the xmm1/xmm5 accumulators. */
#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
             snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
             "paddusw %%xmm2, %%xmm1         \n\t"\
             "paddusw %%xmm6, %%xmm5         \n\t"
707

  
708
/* Per-iteration pointer advance shared by both SSE2 epilogues: step the obmc
 * pointer by 32 bytes, advance dst8 (%0) by src_stride (REG_c), and bump all
 * four block[] pointers (stored at REG_a .. REG_a+3*PTR_SIZE) by src_stride
 * in memory. */
#define snow_inner_add_yblock_sse2_end_common1\
             "add $32, %%"REG_S"             \n\t"\
             "add %%"REG_c", %0              \n\t"\
             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
             "add %%"REG_c", (%%"REG_a")     \n\t"
715

  
716
/* Close the asm statement: loop back to label 1 while the row counter
 * (REG_b, decremented by the caller macro) is non-zero, then declare
 * operands.  Outputs: dst8 (%0) and dst_array (%1), updated in the loop.
 * Inputs: src_x<<2 (%2), obmc (%3), block pinned to eax/rax (%4, addressed
 * as REG_a), b_h (%5), src_stride (%6).
 * NOTE(review): the asm stores through dst8 and updates block[] in memory,
 * but no "memory" clobber is declared — confirm the compiler never caches
 * those buffers across this statement.
 * NOTE(review): clobbering REG_b (ebx) conflicts with the GOT pointer on
 * 32-bit -fPIC builds — verify build configuration. */
#define snow_inner_add_yblock_sse2_end_common2\
             "jnz 1b                         \n\t"\
             :"+m"(dst8),"+m"(dst_array)\
             :\
             "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"rm"((long)src_stride):\
             "%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
722

  
723
/* Epilogue for the 8-wide kernel, which consumes TWO rows per iteration:
 * temporarily double src_stride in REG_c (sal $1) so the common pointer
 * advance skips two rows, step dst_array by two line pointers, restore the
 * stride (sar $1), and subtract 2 from the row counter before looping. */
#define snow_inner_add_yblock_sse2_end_8\
             "sal $1, %%"REG_c"              \n\t"\
             "add $"PTR_SIZE"*2, %1          \n\t"\
             snow_inner_add_yblock_sse2_end_common1\
             "sar $1, %%"REG_c"              \n\t"\
             "sub $2, %%"REG_b"              \n\t"\
             snow_inner_add_yblock_sse2_end_common2
730

  
731
/* Epilogue for the 16-wide kernel: one row per iteration — advance
 * dst_array by one line pointer, run the common pointer advance, and
 * decrement the row counter. */
#define snow_inner_add_yblock_sse2_end_16\
             "add $"PTR_SIZE"*1, %1          \n\t"\
             snow_inner_add_yblock_sse2_end_common1\
             "dec %%"REG_b"                  \n\t"\
             snow_inner_add_yblock_sse2_end_common2
736

  
737
/* SSE2 inner add_yblock for b_w == 8 with a 16-byte obmc stride, processing
 * two output rows per loop iteration (hence the even-b_h requirement; the
 * wrapper routes odd heights to the MMX version).  Accumulates the four
 * obmc-weighted block contributions into xmm1 (row 0) and xmm5 (row 1),
 * adds them to the DWT coefficients from the slice buffer, rounds (xmm3),
 * shifts down by FRAC_BITS and stores 8 clamped bytes per row into dst8. */
static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_8("2", "8")
snow_inner_add_yblock_sse2_accum_8("1", "128")
snow_inner_add_yblock_sse2_accum_8("0", "136")

             /* Row 0: widen the word accumulator in xmm1 to dwords and add
              * the slice-buffer coefficients at REG_D (low half in xmm0,
              * high half in xmm2), then apply the rounding bias. */
             "mov %0, %%"REG_d"              \n\t"
             "movdqa (%%"REG_D"), %%xmm0     \n\t"
             "movdqa %%xmm1, %%xmm2          \n\t"

             "punpckhwd %%xmm7, %%xmm1       \n\t"
             "punpcklwd %%xmm7, %%xmm2       \n\t"
             "paddd %%xmm2, %%xmm0           \n\t"
             "movdqa 16(%%"REG_D"), %%xmm2   \n\t"
             "paddd %%xmm1, %%xmm2           \n\t"
             "paddd %%xmm3, %%xmm0           \n\t"
             "paddd %%xmm3, %%xmm2           \n\t"

             /* Point REG_D at the NEXT slice-buffer line for row 1. */
             "mov %1, %%"REG_D"              \n\t"
             "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
             "add %2, %%"REG_D"              \n\t"

             /* Row 1: same widening/add/round using the xmm5 accumulator. */
             "movdqa (%%"REG_D"), %%xmm4     \n\t"
             "movdqa %%xmm5, %%xmm6          \n\t"
             "punpckhwd %%xmm7, %%xmm5       \n\t"
             "punpcklwd %%xmm7, %%xmm6       \n\t"
             "paddd %%xmm6, %%xmm4           \n\t"
             "movdqa 16(%%"REG_D"), %%xmm6   \n\t"
             "paddd %%xmm5, %%xmm6           \n\t"
             "paddd %%xmm3, %%xmm4           \n\t"
             "paddd %%xmm3, %%xmm6           \n\t"

             /* Shift down, pack dwords->words->unsigned bytes, store row 0. */
             "psrad $8, %%xmm0               \n\t" /* FRAC_BITS. */
             "psrad $8, %%xmm2               \n\t" /* FRAC_BITS. */
             "packssdw %%xmm2, %%xmm0        \n\t"
             "packuswb %%xmm7, %%xmm0        \n\t"
             "movq %%xmm0, (%%"REG_d")       \n\t"

             /* Store row 1 at dst8 + src_stride. */
             "psrad $8, %%xmm4               \n\t" /* FRAC_BITS. */
             "psrad $8, %%xmm6               \n\t" /* FRAC_BITS. */
             "packssdw %%xmm6, %%xmm4        \n\t"
             "packuswb %%xmm7, %%xmm4        \n\t"
             "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
snow_inner_add_yblock_sse2_end_8
}
784

  
785
/* SSE2 inner add_yblock for b_w == 16 (32-byte obmc stride), one row per
 * iteration.  Accumulates the four obmc-weighted block contributions into
 * xmm1 (pixels 0..7) and xmm5 (pixels 8..15), adds the slice-buffer DWT
 * coefficients, rounds, shifts by FRAC_BITS and stores 16 clamped bytes
 * into dst8. */
static void inner_add_yblock_bw_16_obmc_32_sse2(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_16("2", "16")
snow_inner_add_yblock_sse2_accum_16("1", "512")
snow_inner_add_yblock_sse2_accum_16("0", "528")

             /* Widen the word accumulators to dwords, add the four 16-byte
              * groups of slice-buffer coefficients, and apply rounding. */
             "mov %0, %%"REG_d"              \n\t"
             "movdqa %%xmm1, %%xmm0          \n\t"
             "movdqa %%xmm5, %%xmm4          \n\t"
             "punpcklwd %%xmm7, %%xmm0       \n\t"
             "paddd (%%"REG_D"), %%xmm0      \n\t"
             "punpckhwd %%xmm7, %%xmm1       \n\t"
             "paddd 16(%%"REG_D"), %%xmm1    \n\t"
             "punpcklwd %%xmm7, %%xmm4       \n\t"
             "paddd 32(%%"REG_D"), %%xmm4    \n\t"
             "punpckhwd %%xmm7, %%xmm5       \n\t"
             "paddd 48(%%"REG_D"), %%xmm5    \n\t"
             "paddd %%xmm3, %%xmm0           \n\t"
             "paddd %%xmm3, %%xmm1           \n\t"
             "paddd %%xmm3, %%xmm4           \n\t"
             "paddd %%xmm3, %%xmm5           \n\t"
             "psrad $8, %%xmm0               \n\t" /* FRAC_BITS. */
             "psrad $8, %%xmm1               \n\t" /* FRAC_BITS. */
             "psrad $8, %%xmm4               \n\t" /* FRAC_BITS. */
             "psrad $8, %%xmm5               \n\t" /* FRAC_BITS. */

             /* Pack dwords -> words -> unsigned bytes (16 results). */
             "packssdw %%xmm1, %%xmm0        \n\t"
             "packssdw %%xmm5, %%xmm4        \n\t"
             "packuswb %%xmm4, %%xmm0        \n\t"

             /* Unaligned store — dst8 alignment is not guaranteed here. */
             "movdqu %%xmm0, (%%"REG_d")       \n\t"

snow_inner_add_yblock_sse2_end_16
}
821

  
822
/* MMX counterpart of snow_inner_add_yblock_sse2_header: identical register
 * setup, with mm7 as the unpack zero and 0x00000080 (rounding bias for the
 * later "psrad $8") replicated in both dwords of mm3.
 * REG_c = src_stride (%6), REG_b = row counter (%5), REG_S = obmc (%3);
 * REG_D is reloaded each iteration from dst_array (%1) plus src_x<<2 (%2). */
#define snow_inner_add_yblock_mmx_header \
    DWTELEM * * dst_array = sb->line + src_y;\
    asm volatile(\
             "mov  %6, %%"REG_c"             \n\t"\
             "mov  %5, %%"REG_b"             \n\t"\
             "mov  %3, %%"REG_S"             \n\t"\
             "pxor %%mm7, %%mm7              \n\t" /* 0 */\
             "pcmpeqd %%mm3, %%mm3           \n\t"\
             "pslld $31, %%mm3               \n\t"\
             "psrld $24, %%mm3               \n\t" /* FRAC_BITS >> 1 */\
             "1:                             \n\t"\
             "mov %1, %%"REG_D"              \n\t"\
             "mov (%%"REG_D"), %%"REG_D"     \n\t"\
             "add %2, %%"REG_D"              \n\t"
836

  
837
/* MMX weighted load: fetch block[ptr_offset], read 8 source bytes at
 * d_offset as two movd halves (4 bytes each into out_reg1/out_reg2),
 * zero-extend to words, and multiply by the obmc weights at s_offset and
 * s_offset+4. */
#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
             "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
             "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
             "punpcklbw %%mm7, %%"out_reg1" \n\t"\
             "punpcklbw %%mm7, %%"out_reg2" \n\t"\
             "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
             "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
             "punpcklbw %%mm7, %%mm0       \n\t"\
             "punpcklbw %%mm7, %%mm4       \n\t"\
             "pmullw %%mm0, %%"out_reg1"    \n\t"\
             "pmullw %%mm4, %%"out_reg2"    \n\t"
849

  
850
/* MMX accumulate step: compute one more weighted contribution into mm2/mm6
 * and saturating-add it to the running sums in mm1/mm5. */
#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
             snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
             "paddusw %%mm2, %%mm1         \n\t"\
             "paddusw %%mm6, %%mm5         \n\t"
854

  
855
/* MMX output stage: widen the word accumulators (mm1/mm5) to dwords, add
 * the four 8-byte groups of slice-buffer coefficients starting at
 * read_offset(REG_D), apply the rounding bias (mm3), shift down by
 * FRAC_BITS, pack dwords->words->unsigned bytes, and store 8 result bytes
 * at write_offset(dst8). */
#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
             "mov %0, %%"REG_d"              \n\t"\
             "movq %%mm1, %%mm0              \n\t"\
             "movq %%mm5, %%mm4              \n\t"\
             "punpcklwd %%mm7, %%mm0         \n\t"\
             "paddd "read_offset"(%%"REG_D"), %%mm0 \n\t"\
             "punpckhwd %%mm7, %%mm1         \n\t"\
             "paddd "read_offset"+8(%%"REG_D"), %%mm1 \n\t"\
             "punpcklwd %%mm7, %%mm4         \n\t"\
             "paddd "read_offset"+16(%%"REG_D"), %%mm4 \n\t"\
             "punpckhwd %%mm7, %%mm5         \n\t"\
             "paddd "read_offset"+24(%%"REG_D"), %%mm5 \n\t"\
             "paddd %%mm3, %%mm0             \n\t"\
             "paddd %%mm3, %%mm1             \n\t"\
             "paddd %%mm3, %%mm4             \n\t"\
             "paddd %%mm3, %%mm5             \n\t"\
             "psrad $8, %%mm0                \n\t"\
             "psrad $8, %%mm1                \n\t"\
             "psrad $8, %%mm4                \n\t"\
             "psrad $8, %%mm5                \n\t"\
\
             "packssdw %%mm1, %%mm0          \n\t"\
             "packssdw %%mm5, %%mm4          \n\t"\
             "packuswb %%mm4, %%mm0          \n\t"\
             "movq %%mm0, "write_offset"(%%"REG_d") \n\t"
880

  
881
/* MMX epilogue: advance the obmc pointer by s_step bytes, step the four
 * block[] pointers and dst8 by src_stride, advance dst_array by one line,
 * decrement the row counter and loop.  Operand list matches the SSE2
 * version: %0 dst8, %1 dst_array, %2 src_x<<2, %3 obmc, %4 block (eax/rax),
 * %5 b_h, %6 src_stride.
 * NOTE(review): same caveats as the SSE2 epilogue — stores through dst8
 * with no "memory" clobber, and REG_b (ebx) in the clobber list conflicts
 * with 32-bit -fPIC; confirm build assumptions. */
#define snow_inner_add_yblock_mmx_end(s_step)\
             "add $"s_step", %%"REG_S"             \n\t"\
             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
             "add %%"REG_c", (%%"REG_a")     \n\t"\
             "add $"PTR_SIZE"*1, %1          \n\t"\
             "add %%"REG_c", %0              \n\t"\
             "dec %%"REG_b"                  \n\t"\
             "jnz 1b                         \n\t"\
             :"+m"(dst8),"+m"(dst_array)\
             :\
             "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"rm"((long)src_stride):\
             "%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
895

  
896
/* MMX inner add_yblock for b_w == 8 with a 16-byte obmc stride: one row of
 * 8 pixels per iteration, built from the shared macros (header, one start,
 * three accumulates, output mix, epilogue with a 16-byte obmc step). */
static void inner_add_yblock_bw_8_obmc_16_mmx(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "8", "0")
snow_inner_add_yblock_mmx_accum("1", "128", "0")
snow_inner_add_yblock_mmx_accum("0", "136", "0")
snow_inner_add_yblock_mmx_mix("0", "0")
snow_inner_add_yblock_mmx_end("16")
}
906

  
907
/* MMX inner add_yblock for b_w == 16 (32-byte obmc stride): each iteration
 * processes one row as two 8-pixel halves — the first group handles pixels
 * 0..7 (source/dest offset 0), the second pixels 8..15 (offset 8, obmc
 * offsets shifted by 8 and slice-buffer read offset 32 bytes further) —
 * then the epilogue steps the obmc pointer by the full 32-byte row. */
static void inner_add_yblock_bw_16_obmc_32_mmx(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "16", "0")
snow_inner_add_yblock_mmx_accum("1", "512", "0")
snow_inner_add_yblock_mmx_accum("0", "528", "0")
snow_inner_add_yblock_mmx_mix("0", "0")

snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
snow_inner_add_yblock_mmx_accum("2", "24", "8")
snow_inner_add_yblock_mmx_accum("1", "520", "8")
snow_inner_add_yblock_mmx_accum("0", "536", "8")
snow_inner_add_yblock_mmx_mix("32", "8")
snow_inner_add_yblock_mmx_end("32")
}
923

  
924
/* Public SSE2 entry point: pick the specialised kernel matching the block
 * geometry, falling back to the generic C implementation for any shape the
 * SIMD kernels do not cover. */
void ff_snow_inner_add_yblock_sse2(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){

    if (b_w == 16) {
        inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
        return;
    }
    if (b_w == 8 && obmc_stride == 16) {
        /* The SSE2 8-wide kernel handles two rows per pass, so an odd
           height is routed to the MMX kernel instead. */
        if (b_h & 1)
            inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
        else
            inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
        return;
    }
    ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
}
937

  
938
/* Public MMX entry point: dispatch on block width, using the generic C
 * implementation when no MMX kernel matches the geometry. */
void ff_snow_inner_add_yblock_mmx(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    if (b_w == 16) {
        inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
        return;
    }
    if (b_w == 8 && obmc_stride == 16) {
        inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
        return;
    }
    ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
}

Also available in: Unified diff