Revision fe2ff6d2 libavcodec/x86/dsputil_mmx.c
libavcodec/x86/dsputil_mmx.c  

2349  2349 
} 
2350  2350 
#endif /* HAVE_6REGS */ 
2351  2351  
2352 
static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) 

2353 
{ 

2354 
x86_reg i = 4*len; 

2355 
__asm__ volatile( 

2356 
"movss %3, %%xmm4 \n" 

2357 
"shufps $0, %%xmm4, %%xmm4 \n" 

2358 
"1: \n" 

2359 
"cvtpi2ps (%2,%0), %%xmm0 \n" 

2360 
"cvtpi2ps 8(%2,%0), %%xmm1 \n" 

2361 
"cvtpi2ps 16(%2,%0), %%xmm2 \n" 

2362 
"cvtpi2ps 24(%2,%0), %%xmm3 \n" 

2363 
"movlhps %%xmm1, %%xmm0 \n" 

2364 
"movlhps %%xmm3, %%xmm2 \n" 

2365 
"mulps %%xmm4, %%xmm0 \n" 

2366 
"mulps %%xmm4, %%xmm2 \n" 

2367 
"movaps %%xmm0, (%1,%0) \n" 

2368 
"movaps %%xmm2, 16(%1,%0) \n" 

2369 
"add $32, %0 \n" 

2370 
"jl 1b \n" 

2371 
:"+r"(i) 

2372 
:"r"(dst+len), "r"(src+len), "m"(mul) 

2373 
); 

2374 
} 

2375  
2376 
static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) 

2377 
{ 

2378 
x86_reg i = 4*len; 

2379 
__asm__ volatile( 

2380 
"movss %3, %%xmm4 \n" 

2381 
"shufps $0, %%xmm4, %%xmm4 \n" 

2382 
"1: \n" 

2383 
"cvtdq2ps (%2,%0), %%xmm0 \n" 

2384 
"cvtdq2ps 16(%2,%0), %%xmm1 \n" 

2385 
"mulps %%xmm4, %%xmm0 \n" 

2386 
"mulps %%xmm4, %%xmm1 \n" 

2387 
"movaps %%xmm0, (%1,%0) \n" 

2388 
"movaps %%xmm1, 16(%1,%0) \n" 

2389 
"add $32, %0 \n" 

2390 
"jl 1b \n" 

2391 
:"+r"(i) 

2392 
:"r"(dst+len), "r"(src+len), "m"(mul) 

2393 
); 

2394 
} 

2395  
2396  2352 
static void vector_clipf_sse(float *dst, const float *src, float min, float max, 
2397  2353 
int len) 
2398  2354 
{ 
...  ...  
2427  2383 
); 
2428  2384 
} 
2429  2385  
2430 
/**
 * dst[i] = int16 of src[i] (saturated by packssdw) for len samples.
 * 3DNow! version using pf2id; 8 samples per iteration, femms restores
 * FPU state at the end.
 * NOTE(review): no scalar tail -- assumes len is a multiple of 8;
 * confirm against callers.
 */
static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    // not bitexact: pf2id uses different rounding than C and SSE
    __asm__ volatile(
        "add %0 , %0 \n\t"                  /* %0 = 2*len = dst size in bytes */
        "lea (%2,%0,2) , %2 \n\t"           /* src += 4*len bytes: end pointer */
        "add %0 , %1 \n\t"                  /* dst += 2*len bytes: end pointer */
        "neg %0 \n\t"                       /* negative counter rising to 0 */
        "1: \n\t"
        "pf2id (%2,%0,2) , %%mm0 \n\t"      /* 2 floats -> 2 int32 */
        "pf2id 8(%2,%0,2) , %%mm1 \n\t"
        "pf2id 16(%2,%0,2) , %%mm2 \n\t"
        "pf2id 24(%2,%0,2) , %%mm3 \n\t"
        "packssdw %%mm1 , %%mm0 \n\t"       /* saturate 4 int32 -> 4 int16 */
        "packssdw %%mm3 , %%mm2 \n\t"
        "movq %%mm0 , (%1,%0) \n\t"
        "movq %%mm2 , 8(%1,%0) \n\t"
        "add $16 , %0 \n\t"                 /* 16 dst bytes = 8 samples done */
        " js 1b \n\t"
        "femms \n\t"                        /* fast exit from MMX/3DNow! state */
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
}

2453 
/**
 * dst[i] = int16 of src[i] (saturated by packssdw) for len samples.
 * SSE1 version: cvtps2pi converts through MMX registers, so the loop
 * ends with emms; 8 samples per iteration.
 * NOTE(review): no scalar tail -- assumes len is a multiple of 8;
 * confirm against callers.
 */
static void float_to_int16_sse(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    __asm__ volatile(
        "add %0 , %0 \n\t"                  /* %0 = 2*len = dst size in bytes */
        "lea (%2,%0,2) , %2 \n\t"           /* src += 4*len bytes: end pointer */
        "add %0 , %1 \n\t"                  /* dst += 2*len bytes: end pointer */
        "neg %0 \n\t"                       /* negative counter rising to 0 */
        "1: \n\t"
        "cvtps2pi (%2,%0,2) , %%mm0 \n\t"   /* 2 floats -> 2 int32 (MMX reg) */
        "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
        "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
        "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
        "packssdw %%mm1 , %%mm0 \n\t"       /* saturate 4 int32 -> 4 int16 */
        "packssdw %%mm3 , %%mm2 \n\t"
        "movq %%mm0 , (%1,%0) \n\t"
        "movq %%mm2 , 8(%1,%0) \n\t"
        "add $16 , %0 \n\t"                 /* 16 dst bytes = 8 samples done */
        " js 1b \n\t"
        "emms \n\t"                         /* clear MMX state after cvtps2pi */
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
}

2475  
2476 
/**
 * dst[i] = int16 of src[i] (saturated by packssdw) for len samples.
 * SSE2 version: cvtps2dq converts 4 floats at a time, no MMX state is
 * touched so no emms is needed; 8 samples per iteration.
 * NOTE(review): movdqa store assumes dst is 16-byte aligned, and there
 * is no scalar tail -- assumes len is a multiple of 8; confirm callers.
 */
static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    __asm__ volatile(
        "add %0 , %0 \n\t"                  /* %0 = 2*len = dst size in bytes */
        "lea (%2,%0,2) , %2 \n\t"           /* src += 4*len bytes: end pointer */
        "add %0 , %1 \n\t"                  /* dst += 2*len bytes: end pointer */
        "neg %0 \n\t"                       /* negative counter rising to 0 */
        "1: \n\t"
        "cvtps2dq (%2,%0,2) , %%xmm0 \n\t"  /* 4 floats -> 4 int32 */
        "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
        "packssdw %%xmm1 , %%xmm0 \n\t"     /* saturate 8 int32 -> 8 int16 */
        "movdqa %%xmm0 , (%1,%0) \n\t"      /* aligned 16-byte store */
        "add $16 , %0 \n\t"                 /* 16 dst bytes = 8 samples done */
        " js 1b \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
}

2493  
2494  2386 
void ff_vp3_idct_mmx(int16_t *input_data); 
2495  2387 
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); 
2496  2388 
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); 
...  ...  
2504  2396 
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block); 
2505  2397 
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block); 
2506  2398  
2507 
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); 

2508 
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); 

2509 
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); 

2510  2399 
int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift); 
2511  2400 
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift); 
2512  2401 
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul); 
...  ...  
2516  2405 
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left); 
2517  2406 
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left); 
2518  2407  
2519 
#if !HAVE_YASM
/* Without yasm the external asm 6-channel interleave routines are not
 * built; redirect them to the generic misc path with channels fixed
 * at 6 so the dispatchers below still link. */
#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#endif
/* No dedicated SSE2 6-channel routine exists; reuse the SSE one. */
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse

2525  
2526 
/**
 * FLOAT_TO_INT16_INTERLEAVE(cpu, body) expands to two functions:
 *  - float_to_int16_interleave_misc_##cpu: any-channel-count fallback;
 *    converts each channel into an aligned stack temp via
 *    float_to_int16_##cpu, then interleaves with a scalar C loop.
 *  - float_to_int16_interleave_##cpu: dispatcher with fast paths for
 *    mono (plain conversion), stereo (the inline-asm `body`; on entry
 *    %0 = negative byte counter, %1 = dst, %2/%3 = src0/src1, all
 *    advanced to their end addresses by the prologue below) and
 *    6 channels (external ff_float_to_int16_interleave6_##cpu).
 */
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
    int i,j,c;\
    for(c=0; c<channels; c++){\
        float_to_int16_##cpu(tmp, src[c], len);\
        for(i=0, j=c; i<len; i++, j+=channels)\
            dst[j] = tmp[i];\
    }\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
    if(channels==1)\
        float_to_int16_##cpu(dst, src[0], len);\
    else if(channels==2){\
        x86_reg reglen = len; \
        const float *src0 = src[0];\
        const float *src1 = src[1];\
        __asm__ volatile(\
            "shl $2, %0 \n"\
            "add %0, %1 \n"\
            "add %0, %2 \n"\
            "add %0, %3 \n"\
            "neg %0 \n"\
            body\
            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
        );\
    }else if(channels==6){\
        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
    }else\
        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}

2559  
2560 
/* 3DNow! instantiation.  Stereo body: pf2id converts 4 samples from
 * each channel, packssdw saturates to int16, punpcklwd/punpckhwd
 * interleave L/R pairs, femms exits MMX state.  4 frames (16 dst
 * bytes) per iteration. */
FLOAT_TO_INT16_INTERLEAVE(3dnow,
    "1: \n"
    "pf2id (%2,%0), %%mm0 \n"
    "pf2id 8(%2,%0), %%mm1 \n"
    "pf2id (%3,%0), %%mm2 \n"
    "pf2id 8(%3,%0), %%mm3 \n"
    "packssdw %%mm1, %%mm0 \n"
    "packssdw %%mm3, %%mm2 \n"
    "movq %%mm0, %%mm1 \n"
    "punpcklwd %%mm2, %%mm0 \n"
    "punpckhwd %%mm2, %%mm1 \n"
    "movq %%mm0, (%1,%0)\n"
    "movq %%mm1, 8(%1,%0)\n"
    "add $16, %0 \n"
    "js 1b \n"
    "femms \n"
)

2577  
2578 
/* SSE1 instantiation.  Same structure as the 3DNow! body but converts
 * with cvtps2pi (goes through MMX registers, rounding per MXCSR), so
 * the loop ends with emms instead of femms. */
FLOAT_TO_INT16_INTERLEAVE(sse,
    "1: \n"
    "cvtps2pi (%2,%0), %%mm0 \n"
    "cvtps2pi 8(%2,%0), %%mm1 \n"
    "cvtps2pi (%3,%0), %%mm2 \n"
    "cvtps2pi 8(%3,%0), %%mm3 \n"
    "packssdw %%mm1, %%mm0 \n"
    "packssdw %%mm3, %%mm2 \n"
    "movq %%mm0, %%mm1 \n"
    "punpcklwd %%mm2, %%mm0 \n"
    "punpckhwd %%mm2, %%mm1 \n"
    "movq %%mm0, (%1,%0)\n"
    "movq %%mm1, 8(%1,%0)\n"
    "add $16, %0 \n"
    "js 1b \n"
    "emms \n"
)

2595  
2596 
/* SSE2 instantiation.  cvtps2dq converts 4 samples per channel at
 * once; after packssdw, xmm0 holds L0..L3,R0..R3 as int16, so
 * movhlps + punpcklwd produce the interleaved L/R order.  No MMX
 * state is used, hence no emms.
 * NOTE(review): movdqa assumes dst is 16-byte aligned -- confirm. */
FLOAT_TO_INT16_INTERLEAVE(sse2,
    "1: \n"
    "cvtps2dq (%2,%0), %%xmm0 \n"
    "cvtps2dq (%3,%0), %%xmm1 \n"
    "packssdw %%xmm1, %%xmm0 \n"
    "movhlps %%xmm0, %%xmm1 \n"
    "punpcklwd %%xmm1, %%xmm0 \n"
    "movdqa %%xmm0, (%1,%0) \n"
    "add $16, %0 \n"
    "js 1b \n"
)

2607  
2608 
/**
 * 3DNow!ext interleave dispatcher: only the 6-channel case has a
 * dedicated 3DNow!ext routine; every other channel count is handled
 * exactly as in the plain 3DNow! dispatcher.
 */
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
    if(channels!=6)
        float_to_int16_interleave_3dnow(dst, src, len, channels);
    else
        ff_float_to_int16_interleave6_3dn2(dst, src, len);
}

2614  
2615  2408 
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); 
2616  2409  
2617  2410 
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) 
...  ...  
2968  2761 
if(mm_flags & AV_CPU_FLAG_3DNOW){ 
2969  2762 
c>vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; 
2970  2763 
c>vector_fmul = vector_fmul_3dnow; 
2971 
if(!(avctx>flags & CODEC_FLAG_BITEXACT)){ 

2972 
c>float_to_int16 = float_to_int16_3dnow; 

2973 
c>float_to_int16_interleave = float_to_int16_interleave_3dnow; 

2974 
} 

2975  2764 
} 
2976  2765 
if(mm_flags & AV_CPU_FLAG_3DNOWEXT){ 
2977  2766 
c>vector_fmul_reverse = vector_fmul_reverse_3dnow2; 
2978  2767 
#if HAVE_6REGS 
2979  2768 
c>vector_fmul_window = vector_fmul_window_3dnow2; 
2980  2769 
#endif 
2981 
if(!(avctx>flags & CODEC_FLAG_BITEXACT)){ 

2982 
c>float_to_int16_interleave = float_to_int16_interleave_3dn2; 

2983 
} 

2984  2770 
} 
2985  2771 
if(mm_flags & AV_CPU_FLAG_MMX2){ 
2986  2772 
#if HAVE_YASM 
...  ...  
2997  2783 
#if HAVE_6REGS 
2998  2784 
c>vector_fmul_window = vector_fmul_window_sse; 
2999  2785 
#endif 
3000 
c>int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; 

3001  2786 
c>vector_clipf = vector_clipf_sse; 
3002 
c>float_to_int16 = float_to_int16_sse; 

3003 
c>float_to_int16_interleave = float_to_int16_interleave_sse; 

3004  2787 
#if HAVE_YASM 
3005  2788 
c>scalarproduct_float = ff_scalarproduct_float_sse; 
3006  2789 
#endif 
...  ...  
3008  2791 
if(mm_flags & AV_CPU_FLAG_3DNOW) 
3009  2792 
c>vector_fmul_add = vector_fmul_add_3dnow; // faster than sse 
3010  2793 
if(mm_flags & AV_CPU_FLAG_SSE2){ 
3011 
c>int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; 

3012 
c>float_to_int16 = float_to_int16_sse2; 

3013 
c>float_to_int16_interleave = float_to_int16_interleave_sse2; 

3014  2794 
#if HAVE_YASM 
3015  2795 
c>scalarproduct_int16 = ff_scalarproduct_int16_sse2; 
3016  2796 
c>scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; 
Also available in: Unified diff