Revision 97d1d009 libavcodec/i386/dsputil_mmx.c

View differences: libavcodec/i386/dsputil_mmx.c

@@ -74 +74 @@
 #define JUMPALIGN() asm volatile (ASMALIGN(3)::)
 #define MOVQ_ZERO(regd)  asm volatile ("pxor %%" #regd ", %%" #regd ::)
 
-#define MOVQ_WONE(regd) \
-    asm volatile ( \
-    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
-    "psrlw $15, %%" #regd ::)
-
 #define MOVQ_BFE(regd) \
     asm volatile ( \
     "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
@@ -220 +215 @@
 /***********************************/
 /* standard MMX */
 
-#ifdef CONFIG_ENCODERS
-static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
-{
-    asm volatile(
-        "mov $-128, %%"REG_a"           \n\t"
-        "pxor %%mm7, %%mm7              \n\t"
-        ASMALIGN(4)
-        "1:                             \n\t"
-        "movq (%0), %%mm0               \n\t"
-        "movq (%0, %2), %%mm2           \n\t"
-        "movq %%mm0, %%mm1              \n\t"
-        "movq %%mm2, %%mm3              \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "punpcklbw %%mm7, %%mm2         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "movq %%mm0, (%1, %%"REG_a")    \n\t"
-        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
-        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
-        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
-        "add %3, %0                     \n\t"
-        "add $32, %%"REG_a"             \n\t"
-        "js 1b                          \n\t"
-        : "+r" (pixels)
-        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
-        : "%"REG_a
-    );
-}
-
-static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
-{
-    asm volatile(
-        "pxor %%mm7, %%mm7              \n\t"
-        "mov $-128, %%"REG_a"           \n\t"
-        ASMALIGN(4)
-        "1:                             \n\t"
-        "movq (%0), %%mm0               \n\t"
-        "movq (%1), %%mm2               \n\t"
-        "movq %%mm0, %%mm1              \n\t"
-        "movq %%mm2, %%mm3              \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "punpcklbw %%mm7, %%mm2         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "psubw %%mm2, %%mm0             \n\t"
-        "psubw %%mm3, %%mm1             \n\t"
-        "movq %%mm0, (%2, %%"REG_a")    \n\t"
-        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
-        "add %3, %0                     \n\t"
-        "add %3, %1                     \n\t"
-        "add $16, %%"REG_a"             \n\t"
-        "jnz 1b                         \n\t"
-        : "+r" (s1), "+r" (s2)
-        : "r" (block+64), "r" ((long)stride)
-        : "%"REG_a
-    );
-}
-#endif //CONFIG_ENCODERS
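
For readers who do not want to decode the MMX, a plain C sketch of what the two removed routines compute (helper names are ours, not FFmpeg's): get_pixels widens an 8x8 block of unsigned bytes into the 16-bit DCTELEM layout the forward DCT expects, and diff_pixels stores the per-pixel difference of two 8x8 blocks.

    static void get_pixels_c_sketch(DCTELEM *block, const uint8_t *pixels, int line_size)
    {
        for (int y = 0; y < 8; y++) {
            for (int x = 0; x < 8; x++)
                block[y * 8 + x] = pixels[x];      /* zero-extend byte to int16 */
            pixels += line_size;
        }
    }

    static void diff_pixels_c_sketch(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
    {
        for (int y = 0; y < 8; y++) {
            for (int x = 0; x < 8; x++)
                block[y * 8 + x] = s1[x] - s2[x];  /* signed difference fits in int16 */
            s1 += stride;
            s2 += stride;
        }
    }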
@@ -282 +218 @@
 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
 {
     const DCTELEM *p;
@@ -544 +480 @@
         );
 }
 
-#ifdef CONFIG_ENCODERS
-static int pix_sum16_mmx(uint8_t * pix, int line_size){
-    const int h=16;
-    int sum;
-    long index= -line_size*h;
-
-    asm volatile(
-                "pxor %%mm7, %%mm7              \n\t"
-                "pxor %%mm6, %%mm6              \n\t"
-                "1:                             \n\t"
-                "movq (%2, %1), %%mm0           \n\t"
-                "movq (%2, %1), %%mm1           \n\t"
-                "movq 8(%2, %1), %%mm2          \n\t"
-                "movq 8(%2, %1), %%mm3          \n\t"
-                "punpcklbw %%mm7, %%mm0         \n\t"
-                "punpckhbw %%mm7, %%mm1         \n\t"
-                "punpcklbw %%mm7, %%mm2         \n\t"
-                "punpckhbw %%mm7, %%mm3         \n\t"
-                "paddw %%mm0, %%mm1             \n\t"
-                "paddw %%mm2, %%mm3             \n\t"
-                "paddw %%mm1, %%mm3             \n\t"
-                "paddw %%mm3, %%mm6             \n\t"
-                "add %3, %1                     \n\t"
-                " js 1b                         \n\t"
-                "movq %%mm6, %%mm5              \n\t"
-                "psrlq $32, %%mm6               \n\t"
-                "paddw %%mm5, %%mm6             \n\t"
-                "movq %%mm6, %%mm5              \n\t"
-                "psrlq $16, %%mm6               \n\t"
-                "paddw %%mm5, %%mm6             \n\t"
-                "movd %%mm6, %0                 \n\t"
-                "andl $0xFFFF, %0               \n\t"
-                : "=&r" (sum), "+r" (index)
-                : "r" (pix - index), "r" ((long)line_size)
-        );
-
-        return sum;
-}
-#endif //CONFIG_ENCODERS
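
pix_sum16 simply sums every pixel of a 16x16 block, which the encoder uses for block-mean style decisions. A scalar sketch (helper name ours):

    static int pix_sum16_c_sketch(const uint8_t *pix, int line_size)
    {
        int sum = 0;
        for (int y = 0; y < 16; y++) {
            for (int x = 0; x < 16; x++)
                sum += pix[x];
            pix += line_size;
        }
        return sum;
    }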
 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
     long i=0;
     asm volatile(

@@ -800 +696 @@
     }
 }
 
803
#ifdef CONFIG_ENCODERS
804
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
805
    int tmp;
806
  asm volatile (
807
      "movl $16,%%ecx\n"
808
      "pxor %%mm0,%%mm0\n"
809
      "pxor %%mm7,%%mm7\n"
810
      "1:\n"
811
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
812
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */
813

  
814
      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */
815

  
816
      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
817
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
818

  
819
      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
820
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
821
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
822

  
823
      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
824
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
825

  
826
      "pmaddwd %%mm3,%%mm3\n"
827
      "pmaddwd %%mm4,%%mm4\n"
828

  
829
      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
830
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
831
      "paddd %%mm3,%%mm4\n"
832
      "paddd %%mm2,%%mm7\n"
833

  
834
      "add %2, %0\n"
835
      "paddd %%mm4,%%mm7\n"
836
      "dec %%ecx\n"
837
      "jnz 1b\n"
838

  
839
      "movq %%mm7,%%mm1\n"
840
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
841
      "paddd %%mm7,%%mm1\n"
842
      "movd %%mm1,%1\n"
843
      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
844
    return tmp;
845
}
846

  
847
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
848
    int tmp;
849
  asm volatile (
850
      "movl %4,%%ecx\n"
851
      "shr $1,%%ecx\n"
852
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
853
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
854
      "1:\n"
855
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
856
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
857
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
858
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */
859

  
860
      /* todo: mm1-mm2, mm3-mm4 */
861
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
862
      /*       OR the results to get absolute difference */
863
      "movq %%mm1,%%mm5\n"
864
      "movq %%mm3,%%mm6\n"
865
      "psubusb %%mm2,%%mm1\n"
866
      "psubusb %%mm4,%%mm3\n"
867
      "psubusb %%mm5,%%mm2\n"
868
      "psubusb %%mm6,%%mm4\n"
869

  
870
      "por %%mm1,%%mm2\n"
871
      "por %%mm3,%%mm4\n"
872

  
873
      /* now convert to 16-bit vectors so we can square them */
874
      "movq %%mm2,%%mm1\n"
875
      "movq %%mm4,%%mm3\n"
876

  
877
      "punpckhbw %%mm0,%%mm2\n"
878
      "punpckhbw %%mm0,%%mm4\n"
879
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
880
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
881

  
882
      "pmaddwd %%mm2,%%mm2\n"
883
      "pmaddwd %%mm4,%%mm4\n"
884
      "pmaddwd %%mm1,%%mm1\n"
885
      "pmaddwd %%mm3,%%mm3\n"
886

  
887
      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
888
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */
889

  
890
      "paddd %%mm2,%%mm1\n"
891
      "paddd %%mm4,%%mm3\n"
892
      "paddd %%mm1,%%mm7\n"
893
      "paddd %%mm3,%%mm7\n"
894

  
895
      "decl %%ecx\n"
896
      "jnz 1b\n"
897

  
898
      "movq %%mm7,%%mm1\n"
899
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
900
      "paddd %%mm7,%%mm1\n"
901
      "movd %%mm1,%2\n"
902
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
903
      : "r" ((long)line_size) , "m" (h)
904
      : "%ecx");
905
    return tmp;
906
}
907

  
908
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
909
    int tmp;
910
  asm volatile (
911
      "movl %4,%%ecx\n"
912
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
913
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
914
      "1:\n"
915
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
916
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
917
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
918
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */
919

  
920
      /* todo: mm1-mm2, mm3-mm4 */
921
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
922
      /*       OR the results to get absolute difference */
923
      "movq %%mm1,%%mm5\n"
924
      "movq %%mm3,%%mm6\n"
925
      "psubusb %%mm2,%%mm1\n"
926
      "psubusb %%mm4,%%mm3\n"
927
      "psubusb %%mm5,%%mm2\n"
928
      "psubusb %%mm6,%%mm4\n"
929

  
930
      "por %%mm1,%%mm2\n"
931
      "por %%mm3,%%mm4\n"
932

  
933
      /* now convert to 16-bit vectors so we can square them */
934
      "movq %%mm2,%%mm1\n"
935
      "movq %%mm4,%%mm3\n"
936

  
937
      "punpckhbw %%mm0,%%mm2\n"
938
      "punpckhbw %%mm0,%%mm4\n"
939
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
940
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
941

  
942
      "pmaddwd %%mm2,%%mm2\n"
943
      "pmaddwd %%mm4,%%mm4\n"
944
      "pmaddwd %%mm1,%%mm1\n"
945
      "pmaddwd %%mm3,%%mm3\n"
946

  
947
      "add %3,%0\n"
948
      "add %3,%1\n"
949

  
950
      "paddd %%mm2,%%mm1\n"
951
      "paddd %%mm4,%%mm3\n"
952
      "paddd %%mm1,%%mm7\n"
953
      "paddd %%mm3,%%mm7\n"
954

  
955
      "decl %%ecx\n"
956
      "jnz 1b\n"
957

  
958
      "movq %%mm7,%%mm1\n"
959
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
960
      "paddd %%mm7,%%mm1\n"
961
      "movd %%mm1,%2\n"
962
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
963
      : "r" ((long)line_size) , "m" (h)
964
      : "%ecx");
965
    return tmp;
966
}
967

  
968
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
969
    int tmp;
970
  asm volatile (
971
      "shr $1,%2\n"
972
      "pxor %%xmm0,%%xmm0\n"    /* mm0 = 0 */
973
      "pxor %%xmm7,%%xmm7\n"    /* mm7 holds the sum */
974
      "1:\n"
975
      "movdqu (%0),%%xmm1\n"    /* mm1 = pix1[0][0-15] */
976
      "movdqu (%1),%%xmm2\n"    /* mm2 = pix2[0][0-15] */
977
      "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
978
      "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
979

  
980
      /* todo: mm1-mm2, mm3-mm4 */
981
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
982
      /*       OR the results to get absolute difference */
983
      "movdqa %%xmm1,%%xmm5\n"
984
      "movdqa %%xmm3,%%xmm6\n"
985
      "psubusb %%xmm2,%%xmm1\n"
986
      "psubusb %%xmm4,%%xmm3\n"
987
      "psubusb %%xmm5,%%xmm2\n"
988
      "psubusb %%xmm6,%%xmm4\n"
989

  
990
      "por %%xmm1,%%xmm2\n"
991
      "por %%xmm3,%%xmm4\n"
992

  
993
      /* now convert to 16-bit vectors so we can square them */
994
      "movdqa %%xmm2,%%xmm1\n"
995
      "movdqa %%xmm4,%%xmm3\n"
996

  
997
      "punpckhbw %%xmm0,%%xmm2\n"
998
      "punpckhbw %%xmm0,%%xmm4\n"
999
      "punpcklbw %%xmm0,%%xmm1\n"  /* mm1 now spread over (mm1,mm2) */
1000
      "punpcklbw %%xmm0,%%xmm3\n"  /* mm4 now spread over (mm3,mm4) */
1001

  
1002
      "pmaddwd %%xmm2,%%xmm2\n"
1003
      "pmaddwd %%xmm4,%%xmm4\n"
1004
      "pmaddwd %%xmm1,%%xmm1\n"
1005
      "pmaddwd %%xmm3,%%xmm3\n"
1006

  
1007
      "lea (%0,%4,2), %0\n"        /* pix1 += 2*line_size */
1008
      "lea (%1,%4,2), %1\n"        /* pix2 += 2*line_size */
1009

  
1010
      "paddd %%xmm2,%%xmm1\n"
1011
      "paddd %%xmm4,%%xmm3\n"
1012
      "paddd %%xmm1,%%xmm7\n"
1013
      "paddd %%xmm3,%%xmm7\n"
1014

  
1015
      "decl %2\n"
1016
      "jnz 1b\n"
1017

  
1018
      "movdqa %%xmm7,%%xmm1\n"
1019
      "psrldq $8, %%xmm7\n"        /* shift hi qword to lo */
1020
      "paddd %%xmm1,%%xmm7\n"
1021
      "movdqa %%xmm7,%%xmm1\n"
1022
      "psrldq $4, %%xmm7\n"        /* shift hi dword to lo */
1023
      "paddd %%xmm1,%%xmm7\n"
1024
      "movd %%xmm7,%3\n"
1025
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
1026
      : "r" ((long)line_size));
1027
    return tmp;
1028
}
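
The sse8/sse16 variants above all compute the same kind of quantity, the sum of squared differences over a w x h block; only the block width and the instruction set differ. A scalar sketch for comparison (helper name ours, width passed explicitly, abs-free so no extra headers needed):

    static int sse_c_sketch(const uint8_t *pix1, const uint8_t *pix2, int line_size, int w, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                int d = pix1[x] - pix2[x];
                sum += d * d;                /* accumulate squared error */
            }
            pix1 += line_size;
            pix2 += line_size;
        }
        return sum;
    }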
1029

  
1030
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
1031
    int tmp;
1032
  asm volatile (
1033
      "movl %3,%%ecx\n"
1034
      "pxor %%mm7,%%mm7\n"
1035
      "pxor %%mm6,%%mm6\n"
1036

  
1037
      "movq (%0),%%mm0\n"
1038
      "movq %%mm0, %%mm1\n"
1039
      "psllq $8, %%mm0\n"
1040
      "psrlq $8, %%mm1\n"
1041
      "psrlq $8, %%mm0\n"
1042
      "movq %%mm0, %%mm2\n"
1043
      "movq %%mm1, %%mm3\n"
1044
      "punpcklbw %%mm7,%%mm0\n"
1045
      "punpcklbw %%mm7,%%mm1\n"
1046
      "punpckhbw %%mm7,%%mm2\n"
1047
      "punpckhbw %%mm7,%%mm3\n"
1048
      "psubw %%mm1, %%mm0\n"
1049
      "psubw %%mm3, %%mm2\n"
1050

  
1051
      "add %2,%0\n"
1052

  
1053
      "movq (%0),%%mm4\n"
1054
      "movq %%mm4, %%mm1\n"
1055
      "psllq $8, %%mm4\n"
1056
      "psrlq $8, %%mm1\n"
1057
      "psrlq $8, %%mm4\n"
1058
      "movq %%mm4, %%mm5\n"
1059
      "movq %%mm1, %%mm3\n"
1060
      "punpcklbw %%mm7,%%mm4\n"
1061
      "punpcklbw %%mm7,%%mm1\n"
1062
      "punpckhbw %%mm7,%%mm5\n"
1063
      "punpckhbw %%mm7,%%mm3\n"
1064
      "psubw %%mm1, %%mm4\n"
1065
      "psubw %%mm3, %%mm5\n"
1066
      "psubw %%mm4, %%mm0\n"
1067
      "psubw %%mm5, %%mm2\n"
1068
      "pxor %%mm3, %%mm3\n"
1069
      "pxor %%mm1, %%mm1\n"
1070
      "pcmpgtw %%mm0, %%mm3\n\t"
1071
      "pcmpgtw %%mm2, %%mm1\n\t"
1072
      "pxor %%mm3, %%mm0\n"
1073
      "pxor %%mm1, %%mm2\n"
1074
      "psubw %%mm3, %%mm0\n"
1075
      "psubw %%mm1, %%mm2\n"
1076
      "paddw %%mm0, %%mm2\n"
1077
      "paddw %%mm2, %%mm6\n"
1078

  
1079
      "add %2,%0\n"
1080
      "1:\n"
1081

  
1082
      "movq (%0),%%mm0\n"
1083
      "movq %%mm0, %%mm1\n"
1084
      "psllq $8, %%mm0\n"
1085
      "psrlq $8, %%mm1\n"
1086
      "psrlq $8, %%mm0\n"
1087
      "movq %%mm0, %%mm2\n"
1088
      "movq %%mm1, %%mm3\n"
1089
      "punpcklbw %%mm7,%%mm0\n"
1090
      "punpcklbw %%mm7,%%mm1\n"
1091
      "punpckhbw %%mm7,%%mm2\n"
1092
      "punpckhbw %%mm7,%%mm3\n"
1093
      "psubw %%mm1, %%mm0\n"
1094
      "psubw %%mm3, %%mm2\n"
1095
      "psubw %%mm0, %%mm4\n"
1096
      "psubw %%mm2, %%mm5\n"
1097
      "pxor %%mm3, %%mm3\n"
1098
      "pxor %%mm1, %%mm1\n"
1099
      "pcmpgtw %%mm4, %%mm3\n\t"
1100
      "pcmpgtw %%mm5, %%mm1\n\t"
1101
      "pxor %%mm3, %%mm4\n"
1102
      "pxor %%mm1, %%mm5\n"
1103
      "psubw %%mm3, %%mm4\n"
1104
      "psubw %%mm1, %%mm5\n"
1105
      "paddw %%mm4, %%mm5\n"
1106
      "paddw %%mm5, %%mm6\n"
1107

  
1108
      "add %2,%0\n"
1109

  
1110
      "movq (%0),%%mm4\n"
1111
      "movq %%mm4, %%mm1\n"
1112
      "psllq $8, %%mm4\n"
1113
      "psrlq $8, %%mm1\n"
1114
      "psrlq $8, %%mm4\n"
1115
      "movq %%mm4, %%mm5\n"
1116
      "movq %%mm1, %%mm3\n"
1117
      "punpcklbw %%mm7,%%mm4\n"
1118
      "punpcklbw %%mm7,%%mm1\n"
1119
      "punpckhbw %%mm7,%%mm5\n"
1120
      "punpckhbw %%mm7,%%mm3\n"
1121
      "psubw %%mm1, %%mm4\n"
1122
      "psubw %%mm3, %%mm5\n"
1123
      "psubw %%mm4, %%mm0\n"
1124
      "psubw %%mm5, %%mm2\n"
1125
      "pxor %%mm3, %%mm3\n"
1126
      "pxor %%mm1, %%mm1\n"
1127
      "pcmpgtw %%mm0, %%mm3\n\t"
1128
      "pcmpgtw %%mm2, %%mm1\n\t"
1129
      "pxor %%mm3, %%mm0\n"
1130
      "pxor %%mm1, %%mm2\n"
1131
      "psubw %%mm3, %%mm0\n"
1132
      "psubw %%mm1, %%mm2\n"
1133
      "paddw %%mm0, %%mm2\n"
1134
      "paddw %%mm2, %%mm6\n"
1135

  
1136
      "add %2,%0\n"
1137
      "subl $2, %%ecx\n"
1138
      " jnz 1b\n"
1139

  
1140
      "movq %%mm6, %%mm0\n"
1141
      "punpcklwd %%mm7,%%mm0\n"
1142
      "punpckhwd %%mm7,%%mm6\n"
1143
      "paddd %%mm0, %%mm6\n"
1144

  
1145
      "movq %%mm6,%%mm0\n"
1146
      "psrlq $32, %%mm6\n"
1147
      "paddd %%mm6,%%mm0\n"
1148
      "movd %%mm0,%1\n"
1149
      : "+r" (pix1), "=r"(tmp)
1150
      : "r" ((long)line_size) , "g" (h-2)
1151
      : "%ecx");
1152
      return tmp;
1153
}
1154

  
1155
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
1156
    int tmp;
1157
    uint8_t * pix= pix1;
1158
  asm volatile (
1159
      "movl %3,%%ecx\n"
1160
      "pxor %%mm7,%%mm7\n"
1161
      "pxor %%mm6,%%mm6\n"
1162

  
1163
      "movq (%0),%%mm0\n"
1164
      "movq 1(%0),%%mm1\n"
1165
      "movq %%mm0, %%mm2\n"
1166
      "movq %%mm1, %%mm3\n"
1167
      "punpcklbw %%mm7,%%mm0\n"
1168
      "punpcklbw %%mm7,%%mm1\n"
1169
      "punpckhbw %%mm7,%%mm2\n"
1170
      "punpckhbw %%mm7,%%mm3\n"
1171
      "psubw %%mm1, %%mm0\n"
1172
      "psubw %%mm3, %%mm2\n"
1173

  
1174
      "add %2,%0\n"
1175

  
1176
      "movq (%0),%%mm4\n"
1177
      "movq 1(%0),%%mm1\n"
1178
      "movq %%mm4, %%mm5\n"
1179
      "movq %%mm1, %%mm3\n"
1180
      "punpcklbw %%mm7,%%mm4\n"
1181
      "punpcklbw %%mm7,%%mm1\n"
1182
      "punpckhbw %%mm7,%%mm5\n"
1183
      "punpckhbw %%mm7,%%mm3\n"
1184
      "psubw %%mm1, %%mm4\n"
1185
      "psubw %%mm3, %%mm5\n"
1186
      "psubw %%mm4, %%mm0\n"
1187
      "psubw %%mm5, %%mm2\n"
1188
      "pxor %%mm3, %%mm3\n"
1189
      "pxor %%mm1, %%mm1\n"
1190
      "pcmpgtw %%mm0, %%mm3\n\t"
1191
      "pcmpgtw %%mm2, %%mm1\n\t"
1192
      "pxor %%mm3, %%mm0\n"
1193
      "pxor %%mm1, %%mm2\n"
1194
      "psubw %%mm3, %%mm0\n"
1195
      "psubw %%mm1, %%mm2\n"
1196
      "paddw %%mm0, %%mm2\n"
1197
      "paddw %%mm2, %%mm6\n"
1198

  
1199
      "add %2,%0\n"
1200
      "1:\n"
1201

  
1202
      "movq (%0),%%mm0\n"
1203
      "movq 1(%0),%%mm1\n"
1204
      "movq %%mm0, %%mm2\n"
1205
      "movq %%mm1, %%mm3\n"
1206
      "punpcklbw %%mm7,%%mm0\n"
1207
      "punpcklbw %%mm7,%%mm1\n"
1208
      "punpckhbw %%mm7,%%mm2\n"
1209
      "punpckhbw %%mm7,%%mm3\n"
1210
      "psubw %%mm1, %%mm0\n"
1211
      "psubw %%mm3, %%mm2\n"
1212
      "psubw %%mm0, %%mm4\n"
1213
      "psubw %%mm2, %%mm5\n"
1214
      "pxor %%mm3, %%mm3\n"
1215
      "pxor %%mm1, %%mm1\n"
1216
      "pcmpgtw %%mm4, %%mm3\n\t"
1217
      "pcmpgtw %%mm5, %%mm1\n\t"
1218
      "pxor %%mm3, %%mm4\n"
1219
      "pxor %%mm1, %%mm5\n"
1220
      "psubw %%mm3, %%mm4\n"
1221
      "psubw %%mm1, %%mm5\n"
1222
      "paddw %%mm4, %%mm5\n"
1223
      "paddw %%mm5, %%mm6\n"
1224

  
1225
      "add %2,%0\n"
1226

  
1227
      "movq (%0),%%mm4\n"
1228
      "movq 1(%0),%%mm1\n"
1229
      "movq %%mm4, %%mm5\n"
1230
      "movq %%mm1, %%mm3\n"
1231
      "punpcklbw %%mm7,%%mm4\n"
1232
      "punpcklbw %%mm7,%%mm1\n"
1233
      "punpckhbw %%mm7,%%mm5\n"
1234
      "punpckhbw %%mm7,%%mm3\n"
1235
      "psubw %%mm1, %%mm4\n"
1236
      "psubw %%mm3, %%mm5\n"
1237
      "psubw %%mm4, %%mm0\n"
1238
      "psubw %%mm5, %%mm2\n"
1239
      "pxor %%mm3, %%mm3\n"
1240
      "pxor %%mm1, %%mm1\n"
1241
      "pcmpgtw %%mm0, %%mm3\n\t"
1242
      "pcmpgtw %%mm2, %%mm1\n\t"
1243
      "pxor %%mm3, %%mm0\n"
1244
      "pxor %%mm1, %%mm2\n"
1245
      "psubw %%mm3, %%mm0\n"
1246
      "psubw %%mm1, %%mm2\n"
1247
      "paddw %%mm0, %%mm2\n"
1248
      "paddw %%mm2, %%mm6\n"
1249

  
1250
      "add %2,%0\n"
1251
      "subl $2, %%ecx\n"
1252
      " jnz 1b\n"
1253

  
1254
      "movq %%mm6, %%mm0\n"
1255
      "punpcklwd %%mm7,%%mm0\n"
1256
      "punpckhwd %%mm7,%%mm6\n"
1257
      "paddd %%mm0, %%mm6\n"
1258

  
1259
      "movq %%mm6,%%mm0\n"
1260
      "psrlq $32, %%mm6\n"
1261
      "paddd %%mm6,%%mm0\n"
1262
      "movd %%mm0,%1\n"
1263
      : "+r" (pix1), "=r"(tmp)
1264
      : "r" ((long)line_size) , "g" (h-2)
1265
      : "%ecx");
1266
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
1267
}
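
The two hf_noise routines are dense; roughly, they measure how much the horizontal gradient of the image changes from one row to the next, which nsse then uses as a "noise" estimate. A rough scalar sketch of that idea (helper name ours; the prologue and boundary handling of the MMX code differ in detail, so this is an illustration, not the reference implementation):

    #include <stdlib.h>   /* abs() */

    static int hf_noise_c_sketch(const uint8_t *pix, int w, int line_size, int h)
    {
        int sum = 0;
        for (int y = 0; y < h - 1; y++) {
            for (int x = 0; x < w - 1; x++) {
                int d0 = pix[x] - pix[x + 1];                         /* gradient in row y   */
                int d1 = pix[x + line_size] - pix[x + 1 + line_size]; /* gradient in row y+1 */
                sum += abs(d0 - d1);
            }
            pix += line_size;
        }
        return sum;
    }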
1268

  
1269
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1270
    MpegEncContext *c = p;
1271
    int score1, score2;
1272

  
1273
    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
1274
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
1275
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
1276

  
1277
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1278
    else  return score1 + FFABS(score2)*8;
1279
}
1280

  
1281
static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1282
    MpegEncContext *c = p;
1283
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
1284
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
1285

  
1286
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1287
    else  return score1 + FFABS(score2)*8;
1288
}
1289

  
1290
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1291
    int tmp;
1292

  
1293
    assert( (((int)pix) & 7) == 0);
1294
    assert((line_size &7) ==0);
1295

  
1296
#define SUM(in0, in1, out0, out1) \
1297
      "movq (%0), %%mm2\n"\
1298
      "movq 8(%0), %%mm3\n"\
1299
      "add %2,%0\n"\
1300
      "movq %%mm2, " #out0 "\n"\
1301
      "movq %%mm3, " #out1 "\n"\
1302
      "psubusb " #in0 ", %%mm2\n"\
1303
      "psubusb " #in1 ", %%mm3\n"\
1304
      "psubusb " #out0 ", " #in0 "\n"\
1305
      "psubusb " #out1 ", " #in1 "\n"\
1306
      "por %%mm2, " #in0 "\n"\
1307
      "por %%mm3, " #in1 "\n"\
1308
      "movq " #in0 ", %%mm2\n"\
1309
      "movq " #in1 ", %%mm3\n"\
1310
      "punpcklbw %%mm7, " #in0 "\n"\
1311
      "punpcklbw %%mm7, " #in1 "\n"\
1312
      "punpckhbw %%mm7, %%mm2\n"\
1313
      "punpckhbw %%mm7, %%mm3\n"\
1314
      "paddw " #in1 ", " #in0 "\n"\
1315
      "paddw %%mm3, %%mm2\n"\
1316
      "paddw %%mm2, " #in0 "\n"\
1317
      "paddw " #in0 ", %%mm6\n"
1318

  
1319

  
1320
  asm volatile (
1321
      "movl %3,%%ecx\n"
1322
      "pxor %%mm6,%%mm6\n"
1323
      "pxor %%mm7,%%mm7\n"
1324
      "movq (%0),%%mm0\n"
1325
      "movq 8(%0),%%mm1\n"
1326
      "add %2,%0\n"
1327
      "subl $2, %%ecx\n"
1328
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1329
      "1:\n"
1330

  
1331
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1332

  
1333
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1334

  
1335
      "subl $2, %%ecx\n"
1336
      "jnz 1b\n"
1337

  
1338
      "movq %%mm6,%%mm0\n"
1339
      "psrlq $32, %%mm6\n"
1340
      "paddw %%mm6,%%mm0\n"
1341
      "movq %%mm0,%%mm6\n"
1342
      "psrlq $16, %%mm0\n"
1343
      "paddw %%mm6,%%mm0\n"
1344
      "movd %%mm0,%1\n"
1345
      : "+r" (pix), "=r"(tmp)
1346
      : "r" ((long)line_size) , "m" (h)
1347
      : "%ecx");
1348
    return tmp & 0xFFFF;
1349
}
1350
#undef SUM
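
vsad_intra16 accumulates the absolute differences between vertically adjacent pixels of a single 16-pixel-wide block. A scalar sketch of that quantity (helper name ours; the MMX version above folds and masks its 16-bit partial sums, so boundary details differ slightly):

    static int vsad_intra16_c_sketch(const uint8_t *pix, int line_size, int h)
    {
        int sum = 0;
        for (int y = 0; y < h - 1; y++) {
            for (int x = 0; x < 16; x++)
                sum += abs(pix[x] - pix[x + line_size]);   /* vertical neighbour difference */
            pix += line_size;
        }
        return sum;
    }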
1351

  
1352
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
1353
    int tmp;
1354

  
1355
    assert( (((int)pix) & 7) == 0);
1356
    assert((line_size &7) ==0);
1357

  
1358
#define SUM(in0, in1, out0, out1) \
1359
      "movq (%0), " #out0 "\n"\
1360
      "movq 8(%0), " #out1 "\n"\
1361
      "add %2,%0\n"\
1362
      "psadbw " #out0 ", " #in0 "\n"\
1363
      "psadbw " #out1 ", " #in1 "\n"\
1364
      "paddw " #in1 ", " #in0 "\n"\
1365
      "paddw " #in0 ", %%mm6\n"
1366

  
1367
  asm volatile (
1368
      "movl %3,%%ecx\n"
1369
      "pxor %%mm6,%%mm6\n"
1370
      "pxor %%mm7,%%mm7\n"
1371
      "movq (%0),%%mm0\n"
1372
      "movq 8(%0),%%mm1\n"
1373
      "add %2,%0\n"
1374
      "subl $2, %%ecx\n"
1375
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1376
      "1:\n"
1377

  
1378
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1379

  
1380
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1381

  
1382
      "subl $2, %%ecx\n"
1383
      "jnz 1b\n"
1384

  
1385
      "movd %%mm6,%1\n"
1386
      : "+r" (pix), "=r"(tmp)
1387
      : "r" ((long)line_size) , "m" (h)
1388
      : "%ecx");
1389
    return tmp;
1390
}
1391
#undef SUM
1392

  
1393
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1394
    int tmp;
1395

  
1396
    assert( (((int)pix1) & 7) == 0);
1397
    assert( (((int)pix2) & 7) == 0);
1398
    assert((line_size &7) ==0);
1399

  
1400
#define SUM(in0, in1, out0, out1) \
1401
      "movq (%0),%%mm2\n"\
1402
      "movq (%1)," #out0 "\n"\
1403
      "movq 8(%0),%%mm3\n"\
1404
      "movq 8(%1)," #out1 "\n"\
1405
      "add %3,%0\n"\
1406
      "add %3,%1\n"\
1407
      "psubb " #out0 ", %%mm2\n"\
1408
      "psubb " #out1 ", %%mm3\n"\
1409
      "pxor %%mm7, %%mm2\n"\
1410
      "pxor %%mm7, %%mm3\n"\
1411
      "movq %%mm2, " #out0 "\n"\
1412
      "movq %%mm3, " #out1 "\n"\
1413
      "psubusb " #in0 ", %%mm2\n"\
1414
      "psubusb " #in1 ", %%mm3\n"\
1415
      "psubusb " #out0 ", " #in0 "\n"\
1416
      "psubusb " #out1 ", " #in1 "\n"\
1417
      "por %%mm2, " #in0 "\n"\
1418
      "por %%mm3, " #in1 "\n"\
1419
      "movq " #in0 ", %%mm2\n"\
1420
      "movq " #in1 ", %%mm3\n"\
1421
      "punpcklbw %%mm7, " #in0 "\n"\
1422
      "punpcklbw %%mm7, " #in1 "\n"\
1423
      "punpckhbw %%mm7, %%mm2\n"\
1424
      "punpckhbw %%mm7, %%mm3\n"\
1425
      "paddw " #in1 ", " #in0 "\n"\
1426
      "paddw %%mm3, %%mm2\n"\
1427
      "paddw %%mm2, " #in0 "\n"\
1428
      "paddw " #in0 ", %%mm6\n"
1429

  
1430

  
1431
  asm volatile (
1432
      "movl %4,%%ecx\n"
1433
      "pxor %%mm6,%%mm6\n"
1434
      "pcmpeqw %%mm7,%%mm7\n"
1435
      "psllw $15, %%mm7\n"
1436
      "packsswb %%mm7, %%mm7\n"
1437
      "movq (%0),%%mm0\n"
1438
      "movq (%1),%%mm2\n"
1439
      "movq 8(%0),%%mm1\n"
1440
      "movq 8(%1),%%mm3\n"
1441
      "add %3,%0\n"
1442
      "add %3,%1\n"
1443
      "subl $2, %%ecx\n"
1444
      "psubb %%mm2, %%mm0\n"
1445
      "psubb %%mm3, %%mm1\n"
1446
      "pxor %%mm7, %%mm0\n"
1447
      "pxor %%mm7, %%mm1\n"
1448
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1449
      "1:\n"
1450

  
1451
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1452

  
1453
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1454

  
1455
      "subl $2, %%ecx\n"
1456
      "jnz 1b\n"
1457

  
1458
      "movq %%mm6,%%mm0\n"
1459
      "psrlq $32, %%mm6\n"
1460
      "paddw %%mm6,%%mm0\n"
1461
      "movq %%mm0,%%mm6\n"
1462
      "psrlq $16, %%mm0\n"
1463
      "paddw %%mm6,%%mm0\n"
1464
      "movd %%mm0,%2\n"
1465
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1466
      : "r" ((long)line_size) , "m" (h)
1467
      : "%ecx");
1468
    return tmp & 0x7FFF;
1469
}
1470
#undef SUM
1471

  
1472
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1473
    int tmp;
1474

  
1475
    assert( (((int)pix1) & 7) == 0);
1476
    assert( (((int)pix2) & 7) == 0);
1477
    assert((line_size &7) ==0);
1478

  
1479
#define SUM(in0, in1, out0, out1) \
1480
      "movq (%0)," #out0 "\n"\
1481
      "movq (%1),%%mm2\n"\
1482
      "movq 8(%0)," #out1 "\n"\
1483
      "movq 8(%1),%%mm3\n"\
1484
      "add %3,%0\n"\
1485
      "add %3,%1\n"\
1486
      "psubb %%mm2, " #out0 "\n"\
1487
      "psubb %%mm3, " #out1 "\n"\
1488
      "pxor %%mm7, " #out0 "\n"\
1489
      "pxor %%mm7, " #out1 "\n"\
1490
      "psadbw " #out0 ", " #in0 "\n"\
1491
      "psadbw " #out1 ", " #in1 "\n"\
1492
      "paddw " #in1 ", " #in0 "\n"\
1493
      "paddw " #in0 ", %%mm6\n"
1494

  
1495
  asm volatile (
1496
      "movl %4,%%ecx\n"
1497
      "pxor %%mm6,%%mm6\n"
1498
      "pcmpeqw %%mm7,%%mm7\n"
1499
      "psllw $15, %%mm7\n"
1500
      "packsswb %%mm7, %%mm7\n"
1501
      "movq (%0),%%mm0\n"
1502
      "movq (%1),%%mm2\n"
1503
      "movq 8(%0),%%mm1\n"
1504
      "movq 8(%1),%%mm3\n"
1505
      "add %3,%0\n"
1506
      "add %3,%1\n"
1507
      "subl $2, %%ecx\n"
1508
      "psubb %%mm2, %%mm0\n"
1509
      "psubb %%mm3, %%mm1\n"
1510
      "pxor %%mm7, %%mm0\n"
1511
      "pxor %%mm7, %%mm1\n"
1512
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1513
      "1:\n"
1514

  
1515
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1516

  
1517
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1518

  
1519
      "subl $2, %%ecx\n"
1520
      "jnz 1b\n"
1521

  
1522
      "movd %%mm6,%2\n"
1523
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1524
      : "r" ((long)line_size) , "m" (h)
1525
      : "%ecx");
1526
    return tmp;
1527
}
1528
#undef SUM
1529

  
1530
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1531
    long i=0;
1532
    asm volatile(
1533
        "1:                             \n\t"
1534
        "movq  (%2, %0), %%mm0          \n\t"
1535
        "movq  (%1, %0), %%mm1          \n\t"
1536
        "psubb %%mm0, %%mm1             \n\t"
1537
        "movq %%mm1, (%3, %0)           \n\t"
1538
        "movq 8(%2, %0), %%mm0          \n\t"
1539
        "movq 8(%1, %0), %%mm1          \n\t"
1540
        "psubb %%mm0, %%mm1             \n\t"
1541
        "movq %%mm1, 8(%3, %0)          \n\t"
1542
        "add $16, %0                    \n\t"
1543
        "cmp %4, %0                     \n\t"
1544
        " jb 1b                         \n\t"
1545
        : "+r" (i)
1546
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
1547
    );
1548
    for(; i<w; i++)
1549
        dst[i+0] = src1[i+0]-src2[i+0];
1550
}
1551

  
1552
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
1553
    long i=0;
1554
    uint8_t l, lt;
1555

  
1556
    asm volatile(
1557
        "1:                             \n\t"
1558
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
1559
        "movq  (%1, %0), %%mm1          \n\t" // T
1560
        "movq  -1(%2, %0), %%mm2        \n\t" // L
1561
        "movq  (%2, %0), %%mm3          \n\t" // X
1562
        "movq %%mm2, %%mm4              \n\t" // L
1563
        "psubb %%mm0, %%mm2             \n\t"
1564
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
1565
        "movq %%mm4, %%mm5              \n\t" // L
1566
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
1567
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
1568
        "pminub %%mm2, %%mm4            \n\t"
1569
        "pmaxub %%mm1, %%mm4            \n\t"
1570
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
1571
        "movq %%mm3, (%3, %0)           \n\t"
1572
        "add $8, %0                     \n\t"
1573
        "cmp %4, %0                     \n\t"
1574
        " jb 1b                         \n\t"
1575
        : "+r" (i)
1576
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
1577
    );
1578

  
1579
    l= *left;
1580
    lt= *left_top;
1581

  
1582
    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
1583

  
1584
    *left_top= src1[w-1];
1585
    *left    = src2[w-1];
1586
}
1587

  
@@ -1588 +699 @@
 #define PAETH(cpu, abs3)\
 void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
 {\
@@ -1659 +770 @@
 PAETH(ssse3, ABS3_SSSE3)
 #endif
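
The PAETH macro family implements the standard Paeth predictor from the PNG specification; for reference, the scalar selection rule that the SIMD abs3/min tricks reproduce looks like this (helper name ours):

    static uint8_t paeth_predict(uint8_t left, uint8_t top, uint8_t topleft)
    {
        int p  = left + top - topleft;   /* initial estimate */
        int pa = abs(p - left);
        int pb = abs(p - top);
        int pc = abs(p - topleft);
        if (pa <= pb && pa <= pc) return left;
        if (pb <= pc)             return top;
        return topleft;
    }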
1662
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
1663
    "mov"#m" "#p1", "#a"              \n\t"\
1664
    "mov"#m" "#p2", "#t"              \n\t"\
1665
    "punpcklbw "#a", "#t"             \n\t"\
1666
    "punpcklbw "#a", "#a"             \n\t"\
1667
    "psubw     "#t", "#a"             \n\t"\
1668

  
1669
#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
1670
    uint8_t *p1b=p1, *p2b=p2;\
1671
    asm volatile(\
1672
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
1673
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
1674
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
1675
        "add %4, %1                   \n\t"\
1676
        "add %4, %2                   \n\t"\
1677
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
1678
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
1679
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
1680
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
1681
        "mov"#m1" "#mm"0, %0          \n\t"\
1682
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
1683
        "mov"#m1" %0, "#mm"0          \n\t"\
1684
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
1685
        : "r"((long)stride), "r"((long)stride*3)\
1686
    );\
1687
}
1688
    //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)
1689

  
1690
#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
1691
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
1692

  
1693
#define LBUTTERFLY2(a1,b1,a2,b2)\
1694
    "paddw " #b1 ", " #a1 "           \n\t"\
1695
    "paddw " #b2 ", " #a2 "           \n\t"\
1696
    "paddw " #b1 ", " #b1 "           \n\t"\
1697
    "paddw " #b2 ", " #b2 "           \n\t"\
1698
    "psubw " #a1 ", " #b1 "           \n\t"\
1699
    "psubw " #a2 ", " #b2 "           \n\t"
1700

  
1701
#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
1702
        LBUTTERFLY2(m0, m1, m2, m3)\
1703
        LBUTTERFLY2(m4, m5, m6, m7)\
1704
        LBUTTERFLY2(m0, m2, m1, m3)\
1705
        LBUTTERFLY2(m4, m6, m5, m7)\
1706
        LBUTTERFLY2(m0, m4, m1, m5)\
1707
        LBUTTERFLY2(m2, m6, m3, m7)\
1708

  
1709
#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
1710

  
1711
#define MMABS_MMX(a,z)\
1712
    "pxor " #z ", " #z "              \n\t"\
1713
    "pcmpgtw " #a ", " #z "           \n\t"\
1714
    "pxor " #z ", " #a "              \n\t"\
1715
    "psubw " #z ", " #a "             \n\t"
1716

  
1717
#define MMABS_MMX2(a,z)\
1718
    "pxor " #z ", " #z "              \n\t"\
1719
    "psubw " #a ", " #z "             \n\t"\
1720
    "pmaxsw " #z ", " #a "            \n\t"
1721

  
1722
#define MMABS_SSSE3(a,z)\
1723
    "pabsw " #a ", " #a "             \n\t"
1724

  
1725
#define MMABS_SUM(a,z, sum)\
1726
    MMABS(a,z)\
1727
    "paddusw " #a ", " #sum "         \n\t"
1728

  
1729
#define MMABS_SUM_8x8_NOSPILL\
1730
    MMABS(%%xmm0, %%xmm8)\
1731
    MMABS(%%xmm1, %%xmm9)\
1732
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
1733
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
1734
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
1735
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
1736
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
1737
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
1738
    "paddusw %%xmm1, %%xmm0           \n\t"
1739

  
1740
#ifdef ARCH_X86_64
1741
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
1742
#else
1743
#define MMABS_SUM_8x8_SSE2\
1744
    "movdqa %%xmm7, (%1)              \n\t"\
1745
    MMABS(%%xmm0, %%xmm7)\
1746
    MMABS(%%xmm1, %%xmm7)\
1747
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
1748
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
1749
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
1750
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
1751
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
1752
    "movdqa (%1), %%xmm2              \n\t"\
1753
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
1754
    "paddusw %%xmm1, %%xmm0           \n\t"
1755
#endif
1756

  
1757
#define LOAD4(o, a, b, c, d)\
1758
    "movq "#o"(%1),    "#a"           \n\t"\
1759
    "movq "#o"+8(%1),  "#b"           \n\t"\
1760
    "movq "#o"+16(%1), "#c"           \n\t"\
1761
    "movq "#o"+24(%1), "#d"           \n\t"\
1762

  
1763
#define STORE4(o, a, b, c, d)\
1764
    "movq "#a", "#o"(%1)              \n\t"\
1765
    "movq "#b", "#o"+8(%1)            \n\t"\
1766
    "movq "#c", "#o"+16(%1)           \n\t"\
1767
    "movq "#d", "#o"+24(%1)           \n\t"\
1768

  
1769
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
1770
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
1771
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
1772
#define HSUM_MMX(a, t, dst)\
1773
    "movq "#a", "#t"                  \n\t"\
1774
    "psrlq $32, "#a"                  \n\t"\
1775
    "paddusw "#t", "#a"               \n\t"\
1776
    "movq "#a", "#t"                  \n\t"\
1777
    "psrlq $16, "#a"                  \n\t"\
1778
    "paddusw "#t", "#a"               \n\t"\
1779
    "movd "#a", "#dst"                \n\t"\
1780

  
1781
#define HSUM_MMX2(a, t, dst)\
1782
    "pshufw $0x0E, "#a", "#t"         \n\t"\
1783
    "paddusw "#t", "#a"               \n\t"\
1784
    "pshufw $0x01, "#a", "#t"         \n\t"\
1785
    "paddusw "#t", "#a"               \n\t"\
1786
    "movd "#a", "#dst"                \n\t"\
1787

  
1788
#define HSUM_SSE2(a, t, dst)\
1789
    "movhlps "#a", "#t"               \n\t"\
1790
    "paddusw "#t", "#a"               \n\t"\
1791
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
1792
    "paddusw "#t", "#a"               \n\t"\
1793
    "pshuflw $0x01, "#a", "#t"        \n\t"\
1794
    "paddusw "#t", "#a"               \n\t"\
1795
    "movd "#a", "#dst"                \n\t"\
1796

  
1797
#define HADAMARD8_DIFF_MMX(cpu) \
1798
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1799
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
1800
    int sum;\
1801
\
1802
    assert(h==8);\
1803
\
1804
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
1805
\
1806
    asm volatile(\
1807
        HADAMARD48\
1808
\
1809
        "movq %%mm7, 96(%1)             \n\t"\
1810
\
1811
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1812
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
1813
\
1814
        "movq 96(%1), %%mm7             \n\t"\
1815
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1816
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
1817
\
1818
        : "=r" (sum)\
1819
        : "r"(temp)\
1820
    );\
1821
\
1822
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1823
\
1824
    asm volatile(\
1825
        HADAMARD48\
1826
\
1827
        "movq %%mm7, 96(%1)             \n\t"\
1828
\
1829
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1830
        STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
1831
\
1832
        "movq 96(%1), %%mm7             \n\t"\
1833
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1834
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
1835
        "movq %%mm6, %%mm7              \n\t"\
1836
        "movq %%mm0, %%mm6              \n\t"\
1837
\
1838
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
1839
\
1840
        HADAMARD48\
1841
        "movq %%mm7, 64(%1)             \n\t"\
1842
        MMABS(%%mm0, %%mm7)\
1843
        MMABS(%%mm1, %%mm7)\
1844
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1845
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1846
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1847
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1848
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1849
        "movq 64(%1), %%mm2             \n\t"\
1850
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1851
        "paddusw %%mm1, %%mm0           \n\t"\
1852
        "movq %%mm0, 64(%1)             \n\t"\
1853
\
1854
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
1855
        LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
1856
\
1857
        HADAMARD48\
1858
        "movq %%mm7, (%1)               \n\t"\
1859
        MMABS(%%mm0, %%mm7)\
1860
        MMABS(%%mm1, %%mm7)\
1861
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1862
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1863
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1864
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1865
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1866
        "movq (%1), %%mm2               \n\t"\
1867
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1868
        "paddusw 64(%1), %%mm0          \n\t"\
1869
        "paddusw %%mm1, %%mm0           \n\t"\
1870
\
1871
        HSUM(%%mm0, %%mm1, %0)\
1872
\
1873
        : "=r" (sum)\
1874
        : "r"(temp)\
1875
    );\
1876
    return sum&0xFFFF;\
1877
}\
1878
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1879

  
1880
#define HADAMARD8_DIFF_SSE2(cpu) \
1881
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1882
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1883
    int sum;\
1884
\
1885
    assert(h==8);\
1886
\
1887
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1888
\
1889
    asm volatile(\
1890
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1891
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1892
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1893
        MMABS_SUM_8x8\
1894
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1895
        : "=r" (sum)\
1896
        : "r"(temp)\
1897
    );\
1898
    return sum&0xFFFF;\
1899
}\
1900
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1901

  
1902
#define MMABS(a,z)         MMABS_MMX(a,z)
1903
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
1904
HADAMARD8_DIFF_MMX(mmx)
1905
#undef MMABS
1906
#undef HSUM
1907

  
1908
#define MMABS(a,z)         MMABS_MMX2(a,z)
1909
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
1910
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
1911
HADAMARD8_DIFF_MMX(mmx2)
1912
HADAMARD8_DIFF_SSE2(sse2)
1913
#undef MMABS
1914
#undef MMABS_SUM_8x8
1915
#undef HSUM
1916

  
1917
#ifdef HAVE_SSSE3
1918
#define MMABS(a,z)         MMABS_SSSE3(a,z)
1919
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
1920
HADAMARD8_DIFF_SSE2(ssse3)
1921
#undef MMABS
1922
#undef MMABS_SUM_8x8
1923
#endif
1924

  
1925
#define DCT_SAD4(m,mm,o)\
1926
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
1927
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
1928
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
1929
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
1930
    MMABS_SUM(mm##2, mm##6, mm##0)\
1931
    MMABS_SUM(mm##3, mm##7, mm##1)\
1932
    MMABS_SUM(mm##4, mm##6, mm##0)\
1933
    MMABS_SUM(mm##5, mm##7, mm##1)\
1934

  
1935
#define DCT_SAD_MMX\
1936
    "pxor %%mm0, %%mm0                \n\t"\
1937
    "pxor %%mm1, %%mm1                \n\t"\
1938
    DCT_SAD4(q, %%mm, 0)\
1939
    DCT_SAD4(q, %%mm, 8)\
1940
    DCT_SAD4(q, %%mm, 64)\
1941
    DCT_SAD4(q, %%mm, 72)\
1942
    "paddusw %%mm1, %%mm0             \n\t"\
1943
    HSUM(%%mm0, %%mm1, %0)
1944

  
1945
#define DCT_SAD_SSE2\
1946
    "pxor %%xmm0, %%xmm0              \n\t"\
1947
    "pxor %%xmm1, %%xmm1              \n\t"\
1948
    DCT_SAD4(dqa, %%xmm, 0)\
1949
    DCT_SAD4(dqa, %%xmm, 64)\
1950
    "paddusw %%xmm1, %%xmm0           \n\t"\
1951
    HSUM(%%xmm0, %%xmm1, %0)
1952

  
1953
#define DCT_SAD_FUNC(cpu) \
1954
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1955
    int sum;\
1956
    asm volatile(\
1957
        DCT_SAD\
1958
        :"=r"(sum)\
1959
        :"r"(block)\
1960
    );\
1961
    return sum&0xFFFF;\
1962
}
1963

  
1964
#define DCT_SAD       DCT_SAD_MMX
1965
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1966
#define MMABS(a,z)    MMABS_MMX(a,z)
1967
DCT_SAD_FUNC(mmx)
1968
#undef MMABS
1969
#undef HSUM
1970

  
1971
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1972
#define MMABS(a,z)    MMABS_MMX2(a,z)
1973
DCT_SAD_FUNC(mmx2)
1974
#undef HSUM
1975
#undef DCT_SAD
1976

  
1977
#define DCT_SAD       DCT_SAD_SSE2
1978
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1979
DCT_SAD_FUNC(sse2)
1980
#undef MMABS
1981

  
1982
#ifdef HAVE_SSSE3
1983
#define MMABS(a,z)    MMABS_SSSE3(a,z)
1984
DCT_SAD_FUNC(ssse3)
1985
#undef MMABS
1986
#endif
1987
#undef HSUM
1988
#undef DCT_SAD
1989

  
1990
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
1991
    int sum;
1992
    long i=size;
1993
    asm volatile(
1994
        "pxor %%mm4, %%mm4 \n"
1995
        "1: \n"
1996
        "sub $8, %0 \n"
1997
        "movq (%2,%0), %%mm2 \n"
1998
        "movq (%3,%0,2), %%mm0 \n"
1999
        "movq 8(%3,%0,2), %%mm1 \n"
2000
        "punpckhbw %%mm2, %%mm3 \n"
2001
        "punpcklbw %%mm2, %%mm2 \n"
2002
        "psraw $8, %%mm3 \n"
2003
        "psraw $8, %%mm2 \n"
2004
        "psubw %%mm3, %%mm1 \n"
2005
        "psubw %%mm2, %%mm0 \n"
2006
        "pmaddwd %%mm1, %%mm1 \n"
2007
        "pmaddwd %%mm0, %%mm0 \n"
2008
        "paddd %%mm1, %%mm4 \n"
2009
        "paddd %%mm0, %%mm4 \n"
2010
        "jg 1b \n"
2011
        "movq %%mm4, %%mm3 \n"
2012
        "psrlq $32, %%mm3 \n"
2013
        "paddd %%mm3, %%mm4 \n"
2014
        "movd %%mm4, %1 \n"
2015
        :"+r"(i), "=r"(sum)
2016
        :"r"(pix1), "r"(pix2)
2017
    );
2018
    return sum;
2019
}
2020

  
2021
#endif //CONFIG_ENCODERS
2022

  
2023 773
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
2024 774
        "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
2025 775
        "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
......
2858 1608
    }
2859 1609
}
2860 1610

  
-#ifdef CONFIG_ENCODERS
-
-#define PHADDD(a, t)\
-    "movq "#a", "#t"                  \n\t"\
-    "psrlq $32, "#a"                  \n\t"\
-    "paddd "#t", "#a"                 \n\t"
-/*
-   pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
-   pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
-   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
- */
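
Stated as scalar C on 16-bit signed lanes, the three multiply-high variants named in the comment behave roughly as follows (illustrative sketch; int16_t from <stdint.h>, and the right shift of a negative product is assumed to be arithmetic):

    static inline int16_t mulhw_c(int a, int b)   { return (a * b) >> 16; }            /* MMX pmulhw     */
    static inline int16_t mulhrw_c(int a, int b)  { return (a * b + 0x8000) >> 16; }   /* 3DNow! pmulhrw */
    static inline int16_t mulhrsw_c(int a, int b) { return (a * b + 0x4000) >> 15; }   /* SSSE3 pmulhrsw */

The plain-MMX PMULHRW macro that follows approximates the rounding variants with pmulhw plus an add-and-shift of the rounding constant supplied through SET_RND/SCALE_OFFSET.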
-#define PMULHRW(x, y, s, o)\
-    "pmulhw " #s ", "#x "            \n\t"\
-    "pmulhw " #s ", "#y "            \n\t"\
-    "paddw " #o ", "#x "             \n\t"\
-    "paddw " #o ", "#y "             \n\t"\
-    "psraw $1, "#x "                 \n\t"\
-    "psraw $1, "#y "                 \n\t"
2879
#define DEF(x) x ## _mmx
2880
#define SET_RND MOVQ_WONE
2881
#define SCALE_OFFSET 1
2882

  
2883
#include "dsputil_mmx_qns.h"
2884

  
2885
#undef DEF
2886
#undef SET_RND
2887
#undef SCALE_OFFSET
2888
#undef PMULHRW
2889

  
2890
#define DEF(x) x ## _3dnow
2891
#define SET_RND(x)
2892
#define SCALE_OFFSET 0
2893
#define PMULHRW(x, y, s, o)\
2894
    "pmulhrw " #s ", "#x "           \n\t"\
2895
    "pmulhrw " #s ", "#y "           \n\t"
2896

  
2897
#include "dsputil_mmx_qns.h"
2898

  
2899
#undef DEF
2900
#undef SET_RND
2901
#undef SCALE_OFFSET
2902
#undef PMULHRW
2903

  
2904
#ifdef HAVE_SSSE3
2905
#undef PHADDD
2906
#define DEF(x) x ## _ssse3
2907
#define SET_RND(x)
2908
#define SCALE_OFFSET -1
2909
#define PHADDD(a, t)\
2910
    "pshufw $0x0E, "#a", "#t"         \n\t"\
2911
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
2912
#define PMULHRW(x, y, s, o)\
2913
    "pmulhrsw " #s ", "#x "          \n\t"\
2914
    "pmulhrsw " #s ", "#y "          \n\t"
2915

  
2916
#include "dsputil_mmx_qns.h"
2917

  
2918
#undef DEF
2919
#undef SET_RND
2920
#undef SCALE_OFFSET
2921
#undef PMULHRW
2922
#undef PHADDD
2923
#endif //HAVE_SSSE3
2924

  
2925
#endif /* CONFIG_ENCODERS */
2926

  
 #define PREFETCH(name, op) \
 static void name(void *mem, int stride, int h){\
     const uint8_t *p= mem;\
@@ -2954 +1638 @@
     avg_pixels16_mmx(dst, src, stride, 16);
 }
 
-/* FLAC specific */
-void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
-                                   double *autoc);
-
 /* VC1 specific */
 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);
 
@@ -3320 +2000 @@
     if (mm_flags & MM_MMX) {
         const int idct_algo= avctx->idct_algo;
 
-#ifdef CONFIG_ENCODERS
-        const int dct_algo = avctx->dct_algo;
-        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
-            if(mm_flags & MM_SSE2){
-                c->fdct = ff_fdct_sse2;
-            }else if(mm_flags & MM_MMXEXT){
-                c->fdct = ff_fdct_mmx2;
-            }else{
-                c->fdct = ff_fdct_mmx;
-            }
-        }
-#endif //CONFIG_ENCODERS
         if(avctx->lowres==0){
             if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
                 c->idct_put= ff_simple_idct_put_mmx;
@@ -3382 +2050 @@
             }
         }
 
-#ifdef CONFIG_ENCODERS
-        c->get_pixels = get_pixels_mmx;
-        c->diff_pixels = diff_pixels_mmx;
-#endif //CONFIG_ENCODERS
         c->put_pixels_clamped = put_pixels_clamped_mmx;
         c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
         c->add_pixels_clamped = add_pixels_clamped_mmx;
         c->clear_blocks = clear_blocks_mmx;
-#ifdef CONFIG_ENCODERS
-        c->pix_sum = pix_sum16_mmx;
-#endif //CONFIG_ENCODERS
 
3397 2058
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
3398 2059
        c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
......
3413 2074

  
3414 2075
        c->add_bytes= add_bytes_mmx;
3415 2076
        c->add_bytes_l2= add_bytes_l2_mmx;
3416
#ifdef CONFIG_ENCODERS
3417
        c->diff_bytes= diff_bytes_mmx;
3418
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;
3419

  
3420
        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
3421
        c->hadamard8_diff[1]= hadamard8_diff_mmx;
3422

  
3423
        c->pix_norm1 = pix_norm1_mmx;
3424
        c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
3425
          c->sse[1] = sse8_mmx;
3426
        c->vsad[4]= vsad_intra16_mmx;
3427

  
3428
        c->nsse[0] = nsse16_mmx;
3429
        c->nsse[1] = nsse8_mmx;
3430
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3431
            c->vsad[0] = vsad16_mmx;
3432
        }
3433

  
3434
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
3435
            c->try_8x8basis= try_8x8basis_mmx;
3436
        }
3437
        c->add_8x8basis= add_8x8basis_mmx;
3438

  
3439
        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
3440

  
3441
#endif //CONFIG_ENCODERS
3442 2077

  
3443 2078
        if (ENABLE_ANY_H263) {
3444 2079
            c->h263_v_loop_filter= h263_v_loop_filter_mmx;
......
3472 2107
            c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
3473 2108
            c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
3474 2109

  
3475
#ifdef CONFIG_ENCODERS
3476
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
3477
            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
3478
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
3479
            c->vsad[4]= vsad_intra16_mmx2;
3480
#endif //CONFIG_ENCODERS
3481

  
3482 2110
            c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
3483 2111
            c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
3484 2112

  
......
3489 2117
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
3490 2118
                c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
3491 2119
                c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
3492
#ifdef CONFIG_ENCODERS
3493
                c->vsad[0] = vsad16_mmx2;
3494
#endif //CONFIG_ENCODERS
3495 2120
            }
3496 2121

  
3497 2122
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
......
3568 2193
                ff_vc1dsp_init_mmx(c, avctx);
3569 2194

  
3570 2195
            c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
3571
#ifdef CONFIG_ENCODERS
3572
            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
3573
#endif //CONFIG_ENCODERS
3574 2196
        } else if (mm_flags & MM_3DNOW) {
3575 2197
            c->prefetch = prefetch_3dnow;
3576 2198

  
......
3666 2288
        }
3667 2289
#endif
3668 2290

  
3669
#ifdef CONFIG_ENCODERS
3670
        if(mm_flags & MM_SSE2){
3671
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
3672
            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
3673
            c->hadamard8_diff[1]= hadamard8_diff_sse2;
3674
            if (ENABLE_FLAC_ENCODER)
3675
                c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
3676
        }
3677

  
3678
#ifdef HAVE_SSSE3
3679
        if(mm_flags & MM_SSSE3){
3680
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
... This diff was truncated because it exceeds the maximum size that can be displayed.
