909 
909 
/* These helper macros are not used past this point. */
#undef H264_CHROMA_OP
#undef H264_CHROMA_MC8_TMPL

911 
911 


912 
/***********************************/
/* weighted prediction */


914 


915 
/**
 * Apply a weight and offset in place to a block of 8-bit samples using
 * MMX/MMX2 instructions.
 *
 * Every processed sample p is replaced by
 *     clip_uint8((p * weight + (offset << log2_denom)
 *                 + ((1 << log2_denom) >> 1)) >> log2_denom)
 * where the multiply and adds are done in wrapping signed 16-bit lanes and
 * the final clip to 0..255 comes from packuswb.
 *
 * @param dst        top-left sample of the block; modified in place
 * @param stride     byte distance between vertically adjacent rows
 * @param log2_denom arithmetic right shift applied after weighting
 * @param weight     multiplier (only its low 16 bits reach the lanes)
 * @param offset     additive offset, scaled by 1 << log2_denom internally
 * @param w          block width; samples are processed 4 per step, so a
 *                   width that is not a multiple of 4 is rounded up
 * @param h          block height; rows are processed 2 per step, so an odd
 *                   height is rounded up
 *
 * The constants are loaded into mm4..mm7 by the first asm statement and are
 * expected to survive until the loop's asm statements run. No emms is
 * executed here; MMX state is left live on return.
 */
static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
{
    int x, y;

    /* Pre-scale the offset and add the round-to-nearest term for the shift. */
    offset <<= log2_denom;
    offset += (1 << log2_denom) >> 1;

    /*
     * mm4 = weight in all four words, mm5 = offset in all four words,
     * mm6 = shift count, mm7 = 0 (used for byte<->word unpack/pack).
     * The sources use "rm" rather than "g": movd has no immediate form, so
     * an immediate operand must never be offered to it.
     */
    __asm__ volatile(
        "movd    %0, %%mm4        \n\t"
        "movd    %1, %%mm5        \n\t"
        "movd    %2, %%mm6        \n\t"
        "pshufw  $0, %%mm4, %%mm4 \n\t"
        "pshufw  $0, %%mm5, %%mm5 \n\t"
        "pxor    %%mm7, %%mm7     \n\t"
        :: "rm"(weight), "rm"(offset), "rm"(log2_denom)
    );

    for (y = 0; y < h; y += 2) {
        for (x = 0; x < w; x += 4) {
            /* Four samples from this row (%0) and four from the next (%1). */
            __asm__ volatile(
                "movd      %0,    %%mm0 \n\t"
                "movd      %1,    %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "pmullw    %%mm4, %%mm0 \n\t"
                "pmullw    %%mm4, %%mm1 \n\t"
                "paddw     %%mm5, %%mm0 \n\t"
                "paddw     %%mm5, %%mm1 \n\t"
                "psraw     %%mm6, %%mm0 \n\t"
                "psraw     %%mm6, %%mm1 \n\t"
                "packuswb  %%mm7, %%mm0 \n\t"
                "packuswb  %%mm7, %%mm1 \n\t"
                "movd      %%mm0, %0    \n\t"
                "movd      %%mm1, %1    \n\t"
                : "+m"(*(uint32_t*)(dst+x)),
                  "+m"(*(uint32_t*)(dst+x+stride))
            );
        }
        dst += 2*stride;
    }
}


953 


954 
/**
 * Blend a block of 8-bit samples from src into dst with per-source weights
 * and a combined offset, using MMX/MMX2 instructions.
 *
 * Every processed sample is replaced by
 *     clip_uint8((dst * weightd + src * weights + offset)
 *                >> (log2_denom + 1))
 * with offset = ((offsets + offsetd + 1) | 1) << log2_denom. The multiply
 * and adds are done in wrapping signed 16-bit lanes and the final clip to
 * 0..255 comes from packuswb.
 *
 * @param dst        first operand and destination; modified in place
 * @param src        second operand; only read
 * @param stride     byte distance between rows, shared by dst and src
 * @param log2_denom weight denominator exponent; the shift is log2_denom + 1
 * @param weightd    multiplier for dst samples (low 16 bits are used)
 * @param weights    multiplier for src samples (low 16 bits are used)
 * @param offsetd    offset associated with dst
 * @param offsets    offset associated with src
 * @param w          block width; samples are processed 4 per step, so a
 *                   width that is not a multiple of 4 is rounded up
 * @param h          block height in rows
 *
 * The constants are loaded into mm3..mm7 by the first asm statement and are
 * expected to survive until the loop's asm statements run. No emms is
 * executed here; MMX state is left live on return.
 */
static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets, int w, int h)
{
    int x, y;
    /*
     * With t = offsets + offsetd + 1, (t | 1) equals 2 * (t >> 1) + 1, so
     *   (t | 1) << log2_denom
     *     == ((t >> 1) << (log2_denom + 1)) + (1 << log2_denom)
     * i.e. the rounded average of the two offsets, pre-scaled for the final
     * shift, plus the round-to-nearest term for that shift.
     */
    int offset = ((offsets + offsetd + 1) | 1) << log2_denom;

    /*
     * mm3 = weightd, mm4 = weights, mm5 = offset (each in all four words),
     * mm6 = shift count, mm7 = 0 for the byte->word unpack.
     * The sources use "rm" rather than "g": movd has no immediate form, so
     * an immediate operand must never be offered to it.
     */
    __asm__ volatile(
        "movd    %0, %%mm3        \n\t"
        "movd    %1, %%mm4        \n\t"
        "movd    %2, %%mm5        \n\t"
        "movd    %3, %%mm6        \n\t"
        "pshufw  $0, %%mm3, %%mm3 \n\t"
        "pshufw  $0, %%mm4, %%mm4 \n\t"
        "pshufw  $0, %%mm5, %%mm5 \n\t"
        "pxor    %%mm7, %%mm7     \n\t"
        :: "rm"(weightd), "rm"(weights), "rm"(offset), "rm"(log2_denom+1)
    );

    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x += 4) {
            /* %0 = four dst samples (read and written), %1 = four src samples. */
            __asm__ volatile(
                "movd      %0,    %%mm0 \n\t"
                "movd      %1,    %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "pmullw    %%mm3, %%mm0 \n\t"
                "pmullw    %%mm4, %%mm1 \n\t"
                "paddw     %%mm5, %%mm0 \n\t"
                "paddw     %%mm1, %%mm0 \n\t"
                "psraw     %%mm6, %%mm0 \n\t"
                "packuswb  %%mm0, %%mm0 \n\t"
                "movd      %%mm0, %0    \n\t"
                : "+m"(*(uint32_t*)(dst+x))
                :  "m"(*(uint32_t*)(src+x))
            );
        }
        src += stride;
        dst += stride;
    }
}


991 


992 
/*
 * Define a pair of fixed-size wrappers around the generic WxH helpers:
 *   ff_h264_biweight_<W>x<H>_mmx2(dst, src, stride, log2_denom,
 *                                 weightd, weights, offsetd, offsets)
 *   ff_h264_weight_<W>x<H>_mmx2(dst, stride, log2_denom, weight, offset)
 * Each wrapper forwards its arguments unchanged and supplies W and H as the
 * block width and height.
 */
#define H264_WEIGHT(W,H) \
static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
    ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offsetd, offsets, W, H); \
} \
static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
    ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
}


999 


1000 
/* Instantiate the weight/biweight wrappers for each W x H block size below. */
H264_WEIGHT(16,16)
H264_WEIGHT(16, 8)
H264_WEIGHT( 8,16)
H264_WEIGHT( 8, 8)
H264_WEIGHT( 8, 4)
H264_WEIGHT( 4, 8)
H264_WEIGHT( 4, 4)
H264_WEIGHT( 4, 2)


1008 
