Revision 3178ee4c

View differences:

libavcodec/i386/dsputil_mmx.c
651 651

  
652 652
WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
653 653

  
654
#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
655
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
656

  
654 657
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
655 658
        "paddw " #m4 ", " #m3 "		\n\t" /* x1 */\
656 659
        "movq " #pw_20 ", %%mm4		\n\t" /* 20 */\
......
672 675
        "packuswb %%mm5, %%mm5		\n\t"\
673 676
        OP(%%mm5, out, %%mm7, d)
674 677

  
675
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP)\
678
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
676 679
void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
677 680
    uint64_t temp;\
678 681
\
......
738 741
        "psraw $5, %%mm3		\n\t"\
739 742
        "movq %7, %%mm1			\n\t"\
740 743
        "packuswb %%mm3, %%mm1		\n\t"\
741
        OP(%%mm1, (%1),%%mm4, q)\
744
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
742 745
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
743 746
        \
744 747
        "movq 9(%0), %%mm1		\n\t" /* JKLMNOPQ */\
......
784 787
        "paddw %%mm3, %%mm4		\n\t" /* 20a - 6b + 3c - d */\
785 788
        "psraw $5, %%mm4		\n\t"\
786 789
        "packuswb %%mm4, %%mm0		\n\t"\
787
        OP(%%mm0, 8(%1), %%mm4, q)\
790
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
788 791
        \
789 792
        "addl %3, %0			\n\t"\
790 793
        "addl %4, %1			\n\t"\
......
828 831
            "psraw $5, %%mm0		\n\t"\
829 832
            "psraw $5, %%mm1		\n\t"\
830 833
            "packuswb %%mm1, %%mm0	\n\t"\
831
            OP(%%mm0, (%1), %%mm1, q)\
834
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
832 835
            "movq 16(%0), %%mm0		\n\t"\
833 836
            "movq 24(%0), %%mm1		\n\t"\
834 837
            "paddw %2, %%mm0		\n\t"\
......
836 839
            "psraw $5, %%mm0		\n\t"\
837 840
            "psraw $5, %%mm1		\n\t"\
838 841
            "packuswb %%mm1, %%mm0	\n\t"\
839
            OP(%%mm0, 8(%1), %%mm1, q)\
842
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
840 843
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
841 844
        );\
842 845
        dst+=dstStride;\
......
844 847
    }\
845 848
}\
846 849
\
847
void OPNAME ## mpeg4_qpel16_v_lowpass_mmx(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
848
    uint64_t temp[17*4];\
849
    uint64_t *temp_ptr= temp;\
850
    int count= 17;\
851
\
852
    /*FIXME unroll */\
853
    asm volatile(\
854
        "pxor %%mm7, %%mm7		\n\t"\
855
        "1:				\n\t"\
856
        "movq (%0), %%mm0		\n\t"\
857
        "movq (%0), %%mm1		\n\t"\
858
        "movq 8(%0), %%mm2		\n\t"\
859
        "movq 8(%0), %%mm3		\n\t"\
860
        "punpcklbw %%mm7, %%mm0		\n\t"\
861
        "punpckhbw %%mm7, %%mm1		\n\t"\
862
        "punpcklbw %%mm7, %%mm2		\n\t"\
863
        "punpckhbw %%mm7, %%mm3		\n\t"\
864
        "movq %%mm0, (%1)		\n\t"\
865
        "movq %%mm1, 17*8(%1)		\n\t"\
866
        "movq %%mm2, (%1, %4)		\n\t"\
867
        "movq %%mm3, (%1, %5)		\n\t"\
868
        "addl $8, %1			\n\t"\
869
        "addl %3, %0			\n\t"\
870
        "decl %2			\n\t"\
871
        " jnz 1b			\n\t"\
872
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
873
        : "r" (srcStride), "r"(2*8*17), "r"(3*8*17)\
874
    );\
875
    \
876
    temp_ptr= temp;\
877
    count=4;\
878
    \
879
/*FIXME reorder for speed */\
880
/*FIXME remove push/pop gcc 2.95 bug workaround here and in the other 3 lowpass filters */\
881
    asm volatile(\
882
        /*"pxor %%mm7, %%mm7		\n\t"*/\
883
        "pushl %0			\n\t"\
884
        "pushl %1			\n\t"\
885
        "pushl %2			\n\t"\
886
        "1:				\n\t"\
887
        "movq (%0), %%mm0		\n\t"\
888
        "movq 8(%0), %%mm1		\n\t"\
889
        "movq 16(%0), %%mm2		\n\t"\
890
        "movq 24(%0), %%mm3		\n\t"\
891
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
892
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
893
        "addl %4, %1			\n\t"\
894
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
895
        \
896
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
897
        "addl %4, %1			\n\t"\
898
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
899
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
900
        "addl %4, %1			\n\t"\
901
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
902
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
903
        "addl %4, %1			\n\t"\
904
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
905
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
906
        "addl %4, %1			\n\t"\
907
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
908
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
909
        "addl %4, %1			\n\t"\
910
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
911
        \
912
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
913
        "addl %4, %1			\n\t"  \
914
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
915
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
916
        \
917
        "addl $136, %0			\n\t"\
918
        "addl %8, %1			\n\t"\
919
        "decl %2			\n\t"\
920
        " jnz 1b			\n\t"\
921
        "popl %2			\n\t"\
922
        "popl %1			\n\t"\
923
        "popl %0			\n\t"\
924
        \
925
        :: "r"(temp_ptr), "r"(dst), "r"(count),\
926
         "r"(dstStride), "r"(2*dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-14*dstStride)\
927
    );\
928
}\
929 850
void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
930 851
    uint64_t temp;\
931 852
\
......
983 904
        "paddw %%mm1, %%mm3		\n\t" /* 20a - 6b + 3c - d */\
984 905
        "psraw $5, %%mm3		\n\t"\
985 906
        "packuswb %%mm3, %%mm0		\n\t"\
986
        OP(%%mm0, (%1), %%mm4, q)\
907
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
987 908
        \
988 909
        "addl %3, %0			\n\t"\
989 910
        "addl %4, %1			\n\t"\
......
1019 940
            "psraw $5, %%mm0		\n\t"\
1020 941
            "psraw $5, %%mm1		\n\t"\
1021 942
            "packuswb %%mm1, %%mm0	\n\t"\
1022
            OP(%%mm0, (%1), %%mm1, q)\
943
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1023 944
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
1024 945
        );\
1025 946
        dst+=dstStride;\
1026 947
        src+=srcStride;\
1027 948
    }\
949
}
950

  
951
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
952
\
953
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
954
    uint64_t temp[17*4];\
955
    uint64_t *temp_ptr= temp;\
956
    int count= 17;\
957
\
958
    /*FIXME unroll */\
959
    asm volatile(\
960
        "pxor %%mm7, %%mm7		\n\t"\
961
        "1:				\n\t"\
962
        "movq (%0), %%mm0		\n\t"\
963
        "movq (%0), %%mm1		\n\t"\
964
        "movq 8(%0), %%mm2		\n\t"\
965
        "movq 8(%0), %%mm3		\n\t"\
966
        "punpcklbw %%mm7, %%mm0		\n\t"\
967
        "punpckhbw %%mm7, %%mm1		\n\t"\
968
        "punpcklbw %%mm7, %%mm2		\n\t"\
969
        "punpckhbw %%mm7, %%mm3		\n\t"\
970
        "movq %%mm0, (%1)		\n\t"\
971
        "movq %%mm1, 17*8(%1)		\n\t"\
972
        "movq %%mm2, (%1, %4)		\n\t"\
973
        "movq %%mm3, (%1, %5)		\n\t"\
974
        "addl $8, %1			\n\t"\
975
        "addl %3, %0			\n\t"\
976
        "decl %2			\n\t"\
977
        " jnz 1b			\n\t"\
978
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
979
        : "r" (srcStride), "r"(2*8*17), "r"(3*8*17)\
980
    );\
981
    \
982
    temp_ptr= temp;\
983
    count=4;\
984
    \
985
/*FIXME reorder for speed */\
986
/*FIXME remove push/pop gcc 2.95 bug workaround here and in the other 3 lowpass filters */\
987
    asm volatile(\
988
        /*"pxor %%mm7, %%mm7		\n\t"*/\
989
        "pushl %0			\n\t"\
990
        "pushl %1			\n\t"\
991
        "pushl %2			\n\t"\
992
        "1:				\n\t"\
993
        "movq (%0), %%mm0		\n\t"\
994
        "movq 8(%0), %%mm1		\n\t"\
995
        "movq 16(%0), %%mm2		\n\t"\
996
        "movq 24(%0), %%mm3		\n\t"\
997
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
998
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
999
        "addl %4, %1			\n\t"\
1000
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
1001
        \
1002
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1003
        "addl %4, %1			\n\t"\
1004
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1005
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
1006
        "addl %4, %1			\n\t"\
1007
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
1008
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
1009
        "addl %4, %1			\n\t"\
1010
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1011
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
1012
        "addl %4, %1			\n\t"\
1013
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1014
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
1015
        "addl %4, %1			\n\t"\
1016
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
1017
        \
1018
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
1019
        "addl %4, %1			\n\t"  \
1020
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1021
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
1022
        \
1023
        "addl $136, %0			\n\t"\
1024
        "addl %8, %1			\n\t"\
1025
        "decl %2			\n\t"\
1026
        " jnz 1b			\n\t"\
1027
        "popl %2			\n\t"\
1028
        "popl %1			\n\t"\
1029
        "popl %0			\n\t"\
1030
        \
1031
        :: "r"(temp_ptr), "r"(dst), "r"(count),\
1032
         "r"(dstStride), "r"(2*dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-14*dstStride)\
1033
    );\
1028 1034
}\
1029 1035
\
1030
void OPNAME ## mpeg4_qpel8_v_lowpass_mmx(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1036
void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1031 1037
    uint64_t temp[9*4];\
1032 1038
    uint64_t *temp_ptr= temp;\
1033 1039
    int count= 9;\
......
1089 1095
        :: "r"(temp_ptr), "r"(dst), "r"(count),\
1090 1096
         "r"(dstStride), "r"(2*dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-6*dstStride)\
1091 1097
    );\
1092
}
1093

  
1094
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
1098
}\
1095 1099
\
1096 1100
static void OPNAME ## qpel8_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
1097
    put_pixels8_mmx(dst, src, stride, 8);\
1101
    OPNAME ## pixels8_mmx(dst, src, stride, 8);\
1098 1102
}\
1099 1103
\
1100 1104
static void OPNAME ## qpel8_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
......
1118 1122
static void OPNAME ## qpel8_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1119 1123
    uint64_t temp[32];\
1120 1124
    uint8_t * const half= (uint8_t*)temp;\
1121
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(half, src, 8, stride);\
1125
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1122 1126
    OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
1123 1127
}\
1124 1128
\
1125 1129
static void OPNAME ## qpel8_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1126
    OPNAME ## mpeg4_qpel8_v_lowpass_mmx(dst, src, stride, stride);\
1130
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
1127 1131
}\
1128 1132
\
1129 1133
static void OPNAME ## qpel8_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1130 1134
    uint64_t temp[32];\
1131 1135
    uint8_t * const half= (uint8_t*)temp;\
1132
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(half, src, 8, stride);\
1136
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1133 1137
    OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
1134 1138
}\
1135 1139
static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
......
1138 1142
    uint8_t * const halfV= ((uint8_t*)half);\
1139 1143
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1140 1144
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1141
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src, 8, stride);\
1142
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
1145
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\
1146
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1143 1147
    OPNAME ## pixels8_l4_mmx(dst, src, (uint8_t*)half, stride, 8);\
1144 1148
}\
1145 1149
static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
......
1148 1152
    uint8_t * const halfV= ((uint8_t*)half);\
1149 1153
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1150 1154
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1151
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src+1, 8, stride);\
1152
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
1155
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\
1156
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1153 1157
    OPNAME ## pixels8_l4_mmx(dst, src+1, (uint8_t*)half, stride, 8);\
1154 1158
}\
1155 1159
static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
......
1158 1162
    uint8_t * const halfV= ((uint8_t*)half);\
1159 1163
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1160 1164
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1161
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src, 8, stride);\
1162
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
1165
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\
1166
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1163 1167
    OPNAME ## pixels8_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 8);\
1164 1168
}\
1165 1169
static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
......
1168 1172
    uint8_t * const halfV= ((uint8_t*)half);\
1169 1173
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1170 1174
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src  , 8, stride, 9);\
1171
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src+1, 8, stride);\
1172
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
1175
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\
1176
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1173 1177
    OPNAME ## pixels8_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 8);\
1174 1178
}\
1175 1179
static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
......
1177 1181
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
1178 1182
    uint8_t * const halfHV= ((uint8_t*)half);\
1179 1183
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1180
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
1184
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1181 1185
    OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
1182 1186
}\
1183 1187
static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
......
1185 1189
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
1186 1190
    uint8_t * const halfHV= ((uint8_t*)half);\
1187 1191
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1188
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
1192
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1189 1193
    OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
1190 1194
}\
1191 1195
static void OPNAME ## qpel8_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
......
1194 1198
    uint8_t * const halfV= ((uint8_t*)half);\
1195 1199
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1196 1200
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1197
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src, 8, stride);\
1198
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
1201
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src, 8, stride);\
1202
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1199 1203
    OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\
1200 1204
}\
1201 1205
static void OPNAME ## qpel8_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
......
1204 1208
    uint8_t * const halfV= ((uint8_t*)half);\
1205 1209
    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1206 1210
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1207
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src+1, 8, stride);\
1208
    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
1211
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfV, src+1, 8, stride);\
1212
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1209 1213
    OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\
1210 1214
}\
1211 1215
static void OPNAME ## qpel8_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1212 1216
    uint64_t half[9*2];\
1213 1217
    uint8_t * const halfH= ((uint8_t*)half);\
1214 1218
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1215
    OPNAME ## mpeg4_qpel8_v_lowpass_mmx(dst, halfH, stride, 8);\
1219
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1216 1220
}\
1217 1221
static void OPNAME ## qpel16_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
1218
    put_pixels16_mmx(dst, src, stride, 16);\
1222
    OPNAME ## pixels16_mmx(dst, src, stride, 16);\
1219 1223
}\
1220 1224
\
1221 1225
static void OPNAME ## qpel16_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
......
1239 1243
static void OPNAME ## qpel16_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1240 1244
    uint64_t temp[32];\
1241 1245
    uint8_t * const half= (uint8_t*)temp;\
1242
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(half, src, 16, stride);\
1246
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1243 1247
    OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
1244 1248
}\
1245 1249
\
1246 1250
static void OPNAME ## qpel16_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1247
    OPNAME ## mpeg4_qpel16_v_lowpass_mmx(dst, src, stride, stride);\
1251
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
1248 1252
}\
1249 1253
\
1250 1254
static void OPNAME ## qpel16_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1251 1255
    uint64_t temp[32];\
1252 1256
    uint8_t * const half= (uint8_t*)temp;\
1253
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(half, src, 16, stride);\
1257
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1254 1258
    OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
1255 1259
}\
1256 1260
static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
......
1259 1263
    uint8_t * const halfV= ((uint8_t*)half);\
1260 1264
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1261 1265
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1262
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src, 16, stride);\
1263
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
1266
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\
1267
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1264 1268
    OPNAME ## pixels16_l4_mmx(dst, src, (uint8_t*)half, stride, 16);\
1265 1269
}\
1266 1270
static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
......
1269 1273
    uint8_t * const halfV= ((uint8_t*)half);\
1270 1274
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1271 1275
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1272
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src+1, 16, stride);\
1273
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
1276
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\
1277
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1274 1278
    OPNAME ## pixels16_l4_mmx(dst, src+1, (uint8_t*)half, stride, 16);\
1275 1279
}\
1276 1280
static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
......
1279 1283
    uint8_t * const halfV= ((uint8_t*)half);\
1280 1284
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1281 1285
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1282
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src, 16, stride);\
1283
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
1286
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\
1287
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1284 1288
    OPNAME ## pixels16_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 16);\
1285 1289
}\
1286 1290
static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
......
1289 1293
    uint8_t * const halfV= ((uint8_t*)half);\
1290 1294
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1291 1295
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src  , 16, stride, 17);\
1292
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src+1, 16, stride);\
1293
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
1296
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\
1297
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1294 1298
    OPNAME ## pixels16_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 16);\
1295 1299
}\
1296 1300
static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
......
1298 1302
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
1299 1303
    uint8_t * const halfHV= ((uint8_t*)half);\
1300 1304
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1301
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
1305
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1302 1306
    OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
1303 1307
}\
1304 1308
static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
......
1306 1310
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
1307 1311
    uint8_t * const halfHV= ((uint8_t*)half);\
1308 1312
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1309
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
1313
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1310 1314
    OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
1311 1315
}\
1312 1316
static void OPNAME ## qpel16_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
......
1315 1319
    uint8_t * const halfV= ((uint8_t*)half);\
1316 1320
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1317 1321
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1318
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src, 16, stride);\
1319
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
1322
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src, 16, stride);\
1323
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1320 1324
    OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\
1321 1325
}\
1322 1326
static void OPNAME ## qpel16_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
......
1325 1329
    uint8_t * const halfV= ((uint8_t*)half);\
1326 1330
    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1327 1331
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1328
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src+1, 16, stride);\
1329
    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
1332
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfV, src+1, 16, stride);\
1333
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1330 1334
    OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\
1331 1335
}\
1332 1336
static void OPNAME ## qpel16_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1333 1337
    uint64_t half[17*2];\
1334 1338
    uint8_t * const halfH= ((uint8_t*)half);\
1335 1339
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1336
    OPNAME ## mpeg4_qpel16_v_lowpass_mmx(dst, halfH, stride, 16);\
1340
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1337 1341
}
1338 1342

  
1339 1343

  
1340 1344
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "	\n\t"
1341
#define AVG_OP(a,b,temp, size) \
1345
#define AVG_3DNOW_OP(a,b,temp, size) \
1342 1346
"mov" #size " " #b ", " #temp "	\n\t"\
1343 1347
"pavgusb " #temp ", " #a "	\n\t"\
1344 1348
"mov" #size " " #a ", " #b "	\n\t"
1345

  
1346
QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP)
1347
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_OP)
1348
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
1349
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
1350
QPEL_OP(avg_       , ff_pw_16, _       , AVG_OP, 3dnow)
1351
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
1352

  
1353
#undef AVG_OP
1354
#define AVG_OP(a,b,temp, size) \
1349
#define AVG_MMX2_OP(a,b,temp, size) \
1355 1350
"mov" #size " " #b ", " #temp "	\n\t"\
1356 1351
"pavgb " #temp ", " #a "	\n\t"\
1357 1352
"mov" #size " " #a ", " #b "	\n\t"
1353

  
1354
QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
1355
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
1356
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1357
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
1358
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
1359
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
1358 1360
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
1359
QPEL_OP(avg_       , ff_pw_16, _       , AVG_OP, mmx2)
1361
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
1360 1362
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1361 1363

  
1362 1364
#if 0
......
1485 1487
            c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
1486 1488
            c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
1487 1489
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
1490

  
1488 1491
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
1489 1492
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
1490 1493
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
libavcodec/i386/dsputil_mmx_rnd.h
657 657
	"movq	(%1), %%mm0		\n\t"
658 658
	"movq	(%2), %%mm1		\n\t"
659 659
	"movq	64(%2), %%mm2		\n\t"
660
	"movq	136(%4), %%mm4		\n\t"
660
	"movq	136(%2), %%mm4		\n\t"
661 661
	"punpckhbw %%mm7, %%mm0		\n\t"
662 662
	"punpckhbw %%mm7, %%mm1		\n\t"
663 663
	"punpckhbw %%mm7, %%mm2		\n\t"
......
670 670
	"packuswb  %%mm4, %%mm3		\n\t"
671 671
	"movq	(%0), %%mm4		\n\t"
672 672
        PAVGB(%%mm3, %%mm4, %%mm0, %%mm5)
673
	"movq	%%mm3, (%0)		\n\t"
673
	"movq	%%mm0, (%0)		\n\t"
674 674
        "addl	%4, %0			\n\t"
675 675
        "addl	%4, %1			\n\t"
676 676
        "addl	$8, %2			\n\t" 
......
705 705
	"movq	(%1), %%mm0		\n\t"
706 706
	"movq	(%2), %%mm1		\n\t"
707 707
	"movq	256(%2), %%mm2		\n\t"
708
	"movq	528(%4), %%mm4		\n\t"
708
	"movq	528(%2), %%mm4		\n\t"
709 709
	"punpckhbw %%mm7, %%mm0		\n\t"
710 710
	"punpckhbw %%mm7, %%mm1		\n\t"
711 711
	"punpckhbw %%mm7, %%mm2		\n\t"
......
718 718
	"packuswb  %%mm4, %%mm3		\n\t"
719 719
	"movq	(%0), %%mm4		\n\t"
720 720
        PAVGB(%%mm3, %%mm4, %%mm0, %%mm5)
721
	"movq	%%mm3, (%0)		\n\t"
721
	"movq	%%mm0, (%0)		\n\t"
722 722
	"movq	8(%1), %%mm0		\n\t"
723 723
	"movq	8(%2), %%mm1		\n\t"
724 724
	"movq	264(%2), %%mm2		\n\t"
......
735 735
	"movq	8(%1), %%mm0		\n\t"
736 736
	"movq	8(%2), %%mm1		\n\t"
737 737
	"movq	264(%2), %%mm2		\n\t"
738
	"movq	536(%4), %%mm4		\n\t"
738
	"movq	536(%2), %%mm4		\n\t"
739 739
	"punpckhbw %%mm7, %%mm0		\n\t"
740 740
	"punpckhbw %%mm7, %%mm1		\n\t"
741 741
	"punpckhbw %%mm7, %%mm2		\n\t"
......
748 748
	"packuswb  %%mm4, %%mm3		\n\t"
749 749
	"movq	8(%0), %%mm4		\n\t"
750 750
        PAVGB(%%mm3, %%mm4, %%mm0, %%mm5)
751
	"movq	%%mm3, 8(%0)		\n\t"
751
	"movq	%%mm0, 8(%0)		\n\t"
752 752
        "addl	%4, %0			\n\t"
753 753
        "addl	%4, %1			\n\t"
754 754
        "addl	$16, %2			\n\t" 

Also available in: Unified diff