Revision b10fa1bb libavcodec/x86/dsputil_mmx.c
libavcodec/x86/dsputil_mmx.c  

2384  2384 
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); 
2385  2385 
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); 
2386  2386 
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); 
2387 
void ff_add_int16_mmx2(int16_t * v1, int16_t * v2, int order); 

2388 
void ff_add_int16_sse2(int16_t * v1, int16_t * v2, int order); 

2389 
void ff_sub_int16_mmx2(int16_t * v1, int16_t * v2, int order); 

2390 
void ff_sub_int16_sse2(int16_t * v1, int16_t * v2, int order); 

2391 
int32_t ff_scalarproduct_int16_mmx2(int16_t * v1, int16_t * v2, int order, int shift); 

2392 
int32_t ff_scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift); 

2387  2393 
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top); 
2388  2394 
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left); 
2389  2395 
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left); 
...  ...  
2507  2513 
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8); 
2508  2514  
2509  2515  
2510 
static void add_int16_sse2(int16_t * v1, int16_t * v2, int order) 

2511 
{ 

2512 
x86_reg o = (order << 1); 

2513 
v1 += order; 

2514 
v2 += order; 

2515 
__asm__ volatile( 

2516 
"1: \n\t" 

2517 
"movdqu (%1,%2), %%xmm0 \n\t" 

2518 
"movdqu 16(%1,%2), %%xmm1 \n\t" 

2519 
"paddw (%0,%2), %%xmm0 \n\t" 

2520 
"paddw 16(%0,%2), %%xmm1 \n\t" 

2521 
"movdqa %%xmm0, (%0,%2) \n\t" 

2522 
"movdqa %%xmm1, 16(%0,%2) \n\t" 

2523 
"add $32, %2 \n\t" 

2524 
"js 1b \n\t" 

2525 
: "+r"(v1), "+r"(v2), "+r"(o) 

2526 
); 

2527 
} 

2528  
2529 
static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order) 

2530 
{ 

2531 
x86_reg o = (order << 1); 

2532 
v1 += order; 

2533 
v2 += order; 

2534 
__asm__ volatile( 

2535 
"1: \n\t" 

2536 
"movdqa (%0,%2), %%xmm0 \n\t" 

2537 
"movdqa 16(%0,%2), %%xmm2 \n\t" 

2538 
"movdqu (%1,%2), %%xmm1 \n\t" 

2539 
"movdqu 16(%1,%2), %%xmm3 \n\t" 

2540 
"psubw %%xmm1, %%xmm0 \n\t" 

2541 
"psubw %%xmm3, %%xmm2 \n\t" 

2542 
"movdqa %%xmm0, (%0,%2) \n\t" 

2543 
"movdqa %%xmm2, 16(%0,%2) \n\t" 

2544 
"add $32, %2 \n\t" 

2545 
"js 1b \n\t" 

2546 
: "+r"(v1), "+r"(v2), "+r"(o) 

2547 
); 

2548 
} 

2549  
2550 
static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift) 

2551 
{ 

2552 
int res = 0; 

2553 
DECLARE_ALIGNED_16(xmm_reg, sh); 

2554 
x86_reg o = (order << 1); 

2555  
2556 
v1 += order; 

2557 
v2 += order; 

2558 
sh.a = shift; 

2559 
__asm__ volatile( 

2560 
"pxor %%xmm7, %%xmm7 \n\t" 

2561 
"1: \n\t" 

2562 
"movdqu (%0,%3), %%xmm0 \n\t" 

2563 
"movdqu 16(%0,%3), %%xmm1 \n\t" 

2564 
"pmaddwd (%1,%3), %%xmm0 \n\t" 

2565 
"pmaddwd 16(%1,%3), %%xmm1 \n\t" 

2566 
"paddd %%xmm0, %%xmm7 \n\t" 

2567 
"paddd %%xmm1, %%xmm7 \n\t" 

2568 
"add $32, %3 \n\t" 

2569 
"js 1b \n\t" 

2570 
"movhlps %%xmm7, %%xmm2 \n\t" 

2571 
"paddd %%xmm2, %%xmm7 \n\t" 

2572 
"psrad %4, %%xmm7 \n\t" 

2573 
"pshuflw $0x4E, %%xmm7,%%xmm2 \n\t" 

2574 
"paddd %%xmm2, %%xmm7 \n\t" 

2575 
"movd %%xmm7, %2 \n\t" 

2576 
: "+r"(v1), "+r"(v2), "=r"(res), "+r"(o) 

2577 
: "m"(sh) 

2578 
); 

2579 
return res; 

2580 
} 

2581  
2582  2516 
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) 
2583  2517 
{ 
2584  2518 
mm_flags = mm_support(); 
...  ...  
3015  2949 
c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
3016  2950 
} 
3017  2951 
} 
2952 
if(mm_flags & FF_MM_MMX2){ 

2953 
#if HAVE_YASM 

2954 
c->add_int16 = ff_add_int16_mmx2;

2955 
c->sub_int16 = ff_sub_int16_mmx2;

2956 
c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;

2957 
#endif 

2958 
} 

3018  2959 
if(mm_flags & FF_MM_SSE){ 
3019  2960 
c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
3020  2961 
c->ac3_downmix = ac3_downmix_sse;
...  ...  
3033  2974 
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
3034  2975 
c->float_to_int16 = float_to_int16_sse2;
3035  2976 
c->float_to_int16_interleave = float_to_int16_interleave_sse2;
3036 
c->add_int16 = add_int16_sse2;

3037 
c->sub_int16 = sub_int16_sse2;

3038 
c->scalarproduct_int16 = scalarproduct_int16_sse2;

2977 
#if HAVE_YASM 

2978 
c->add_int16 = ff_add_int16_sse2;

2979 
c->sub_int16 = ff_sub_int16_sse2;

2980 
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;

2981 
#endif 

3039  2982 
} 
3040  2983 
} 
3041  2984 
Also available in: Unified diff