Revision fe2ff6d2 libavcodec/x86/dsputil_mmx.c

View differences:

libavcodec/x86/dsputil_mmx.c
2349 2349
}
2350 2350
#endif /* HAVE_6REGS */
2351 2351

  
2352
static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
2353
{
2354
    x86_reg i = -4*len;
2355
    __asm__ volatile(
2356
        "movss  %3, %%xmm4 \n"
2357
        "shufps $0, %%xmm4, %%xmm4 \n"
2358
        "1: \n"
2359
        "cvtpi2ps   (%2,%0), %%xmm0 \n"
2360
        "cvtpi2ps  8(%2,%0), %%xmm1 \n"
2361
        "cvtpi2ps 16(%2,%0), %%xmm2 \n"
2362
        "cvtpi2ps 24(%2,%0), %%xmm3 \n"
2363
        "movlhps  %%xmm1,    %%xmm0 \n"
2364
        "movlhps  %%xmm3,    %%xmm2 \n"
2365
        "mulps    %%xmm4,    %%xmm0 \n"
2366
        "mulps    %%xmm4,    %%xmm2 \n"
2367
        "movaps   %%xmm0,   (%1,%0) \n"
2368
        "movaps   %%xmm2, 16(%1,%0) \n"
2369
        "add $32, %0 \n"
2370
        "jl 1b \n"
2371
        :"+r"(i)
2372
        :"r"(dst+len), "r"(src+len), "m"(mul)
2373
    );
2374
}
2375

  
2376
static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
2377
{
2378
    x86_reg i = -4*len;
2379
    __asm__ volatile(
2380
        "movss  %3, %%xmm4 \n"
2381
        "shufps $0, %%xmm4, %%xmm4 \n"
2382
        "1: \n"
2383
        "cvtdq2ps   (%2,%0), %%xmm0 \n"
2384
        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
2385
        "mulps    %%xmm4,    %%xmm0 \n"
2386
        "mulps    %%xmm4,    %%xmm1 \n"
2387
        "movaps   %%xmm0,   (%1,%0) \n"
2388
        "movaps   %%xmm1, 16(%1,%0) \n"
2389
        "add $32, %0 \n"
2390
        "jl 1b \n"
2391
        :"+r"(i)
2392
        :"r"(dst+len), "r"(src+len), "m"(mul)
2393
    );
2394
}
2395

  
2396 2352
static void vector_clipf_sse(float *dst, const float *src, float min, float max,
2397 2353
                             int len)
2398 2354
{
......
2427 2383
    );
2428 2384
}
2429 2385

  
2430
static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
2431
    x86_reg reglen = len;
2432
    // not bit-exact: pf2id uses different rounding than C and SSE
2433
    __asm__ volatile(
2434
        "add        %0          , %0        \n\t"
2435
        "lea         (%2,%0,2)  , %2        \n\t"
2436
        "add        %0          , %1        \n\t"
2437
        "neg        %0                      \n\t"
2438
        "1:                                 \n\t"
2439
        "pf2id       (%2,%0,2)  , %%mm0     \n\t"
2440
        "pf2id      8(%2,%0,2)  , %%mm1     \n\t"
2441
        "pf2id     16(%2,%0,2)  , %%mm2     \n\t"
2442
        "pf2id     24(%2,%0,2)  , %%mm3     \n\t"
2443
        "packssdw   %%mm1       , %%mm0     \n\t"
2444
        "packssdw   %%mm3       , %%mm2     \n\t"
2445
        "movq       %%mm0       ,  (%1,%0)  \n\t"
2446
        "movq       %%mm2       , 8(%1,%0)  \n\t"
2447
        "add        $16         , %0        \n\t"
2448
        " js 1b                             \n\t"
2449
        "femms                              \n\t"
2450
        :"+r"(reglen), "+r"(dst), "+r"(src)
2451
    );
2452
}
2453
static void float_to_int16_sse(int16_t *dst, const float *src, long len){
2454
    x86_reg reglen = len;
2455
    __asm__ volatile(
2456
        "add        %0          , %0        \n\t"
2457
        "lea         (%2,%0,2)  , %2        \n\t"
2458
        "add        %0          , %1        \n\t"
2459
        "neg        %0                      \n\t"
2460
        "1:                                 \n\t"
2461
        "cvtps2pi    (%2,%0,2)  , %%mm0     \n\t"
2462
        "cvtps2pi   8(%2,%0,2)  , %%mm1     \n\t"
2463
        "cvtps2pi  16(%2,%0,2)  , %%mm2     \n\t"
2464
        "cvtps2pi  24(%2,%0,2)  , %%mm3     \n\t"
2465
        "packssdw   %%mm1       , %%mm0     \n\t"
2466
        "packssdw   %%mm3       , %%mm2     \n\t"
2467
        "movq       %%mm0       ,  (%1,%0)  \n\t"
2468
        "movq       %%mm2       , 8(%1,%0)  \n\t"
2469
        "add        $16         , %0        \n\t"
2470
        " js 1b                             \n\t"
2471
        "emms                               \n\t"
2472
        :"+r"(reglen), "+r"(dst), "+r"(src)
2473
    );
2474
}
2475

  
2476
static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
2477
    x86_reg reglen = len;
2478
    __asm__ volatile(
2479
        "add        %0          , %0        \n\t"
2480
        "lea         (%2,%0,2)  , %2        \n\t"
2481
        "add        %0          , %1        \n\t"
2482
        "neg        %0                      \n\t"
2483
        "1:                                 \n\t"
2484
        "cvtps2dq    (%2,%0,2)  , %%xmm0    \n\t"
2485
        "cvtps2dq  16(%2,%0,2)  , %%xmm1    \n\t"
2486
        "packssdw   %%xmm1      , %%xmm0    \n\t"
2487
        "movdqa     %%xmm0      ,  (%1,%0)  \n\t"
2488
        "add        $16         , %0        \n\t"
2489
        " js 1b                             \n\t"
2490
        :"+r"(reglen), "+r"(dst), "+r"(src)
2491
    );
2492
}
2493

  
2494 2386
void ff_vp3_idct_mmx(int16_t *input_data);
2495 2387
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2496 2388
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
......
2504 2396
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2505 2397
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2506 2398

  
2507
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
2508
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
2509
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
2510 2399
int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
2511 2400
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
2512 2401
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
......
2516 2405
int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
2517 2406
int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
2518 2407

  
2519
#if !HAVE_YASM
2520
#define ff_float_to_int16_interleave6_sse(a,b,c)   float_to_int16_interleave_misc_sse(a,b,c,6)
2521
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
2522
#define ff_float_to_int16_interleave6_3dn2(a,b,c)  float_to_int16_interleave_misc_3dnow(a,b,c,6)
2523
#endif
2524
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
2525

  
2526
/**
 * Template expanding to a pair of float->int16 interleaving functions for a
 * given instruction set <cpu>:
 *
 *  - float_to_int16_interleave_misc_<cpu>(): generic fallback for any
 *    channel count; converts each channel into a stack temporary with
 *    float_to_int16_<cpu>() and interleaves it with scalar stores.
 *
 *  - float_to_int16_interleave_<cpu>(): dispatcher. Mono is a plain
 *    conversion; stereo runs the inline-asm 'body' with %0 = negated byte
 *    counter, %1 = dst, %2 = src[0], %3 = src[1], all pre-advanced past the
 *    end by the shl/add/neg prologue; 6 channels go to the dedicated
 *    ff_float_to_int16_interleave6_<cpu>() routine; anything else uses the
 *    misc fallback.
 *
 * NOTE(review): 'body' must consume the operands exactly as set up by the
 * prologue, and the asm declares no "memory" clobber -- same latent issue
 * as the plain float_to_int16_* functions.
 */
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
    int i,j,c;\
    for(c=0; c<channels; c++){\
        float_to_int16_##cpu(tmp, src[c], len);\
        for(i=0, j=c; i<len; i++, j+=channels)\
            dst[j] = tmp[i];\
    }\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
    if(channels==1)\
        float_to_int16_##cpu(dst, src[0], len);\
    else if(channels==2){\
        x86_reg reglen = len; \
        const float *src0 = src[0];\
        const float *src1 = src[1];\
        __asm__ volatile(\
            "shl $2, %0 \n"\
            "add %0, %1 \n"\
            "add %0, %2 \n"\
            "add %0, %3 \n"\
            "neg %0 \n"\
            body\
            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
        );\
    }else if(channels==6){\
        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
    }else\
        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}
2559

  
2560
/* Stereo interleave body, 3DNow! flavour: pf2id converts 4 floats from each
 * channel, packssdw saturates to int16, punpck{l,h}wd interleaves the L/R
 * words, and 16 output bytes are stored per iteration (movq pair + add $16).
 * femms restores FPU state after MMX-register usage. */
FLOAT_TO_INT16_INTERLEAVE(3dnow,
    "1:                         \n"
    "pf2id     (%2,%0), %%mm0   \n"
    "pf2id    8(%2,%0), %%mm1   \n"
    "pf2id     (%3,%0), %%mm2   \n"
    "pf2id    8(%3,%0), %%mm3   \n"
    "packssdw    %%mm1, %%mm0   \n"
    "packssdw    %%mm3, %%mm2   \n"
    "movq        %%mm0, %%mm1   \n"
    "punpcklwd   %%mm2, %%mm0   \n"
    "punpckhwd   %%mm2, %%mm1   \n"
    "movq        %%mm0,  (%1,%0)\n"
    "movq        %%mm1, 8(%1,%0)\n"
    "add $16, %0                \n"
    "js 1b                      \n"
    "femms                      \n"
)
2577

  
2578
/* Stereo interleave body, SSE flavour: identical structure to the 3DNow!
 * version but converts with cvtps2pi (MMX-register destination) instead of
 * pf2id, and ends with emms rather than femms to restore FPU state. */
FLOAT_TO_INT16_INTERLEAVE(sse,
    "1:                         \n"
    "cvtps2pi  (%2,%0), %%mm0   \n"
    "cvtps2pi 8(%2,%0), %%mm1   \n"
    "cvtps2pi  (%3,%0), %%mm2   \n"
    "cvtps2pi 8(%3,%0), %%mm3   \n"
    "packssdw    %%mm1, %%mm0   \n"
    "packssdw    %%mm3, %%mm2   \n"
    "movq        %%mm0, %%mm1   \n"
    "punpcklwd   %%mm2, %%mm0   \n"
    "punpckhwd   %%mm2, %%mm1   \n"
    "movq        %%mm0,  (%1,%0)\n"
    "movq        %%mm1, 8(%1,%0)\n"
    "add $16, %0                \n"
    "js 1b                      \n"
    "emms                       \n"
)
2595

  
2596
/* Stereo interleave body, SSE2 flavour: cvtps2dq converts 4 floats per
 * channel in XMM registers, packssdw saturates and concatenates L|R,
 * movhlps + punpcklwd interleave the word lanes, and one aligned movdqa
 * stores 16 bytes per iteration. Pure XMM code, so no emms is needed. */
FLOAT_TO_INT16_INTERLEAVE(sse2,
    "1:                         \n"
    "cvtps2dq  (%2,%0), %%xmm0  \n"
    "cvtps2dq  (%3,%0), %%xmm1  \n"
    "packssdw   %%xmm1, %%xmm0  \n"
    "movhlps    %%xmm0, %%xmm1  \n"
    "punpcklwd  %%xmm1, %%xmm0  \n"
    "movdqa     %%xmm0, (%1,%0) \n"
    "add $16, %0                \n"
    "js 1b                      \n"
)
2607

  
2608
/* Dispatcher for the 3DNow!ext build: 6-channel audio gets the dedicated
 * assembly interleaver, every other channel count reuses the plain 3DNow!
 * path (there is no separate 3dn2 version of the generic code). */
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
    if (channels != 6) {
        float_to_int16_interleave_3dnow(dst, src, len, channels);
        return;
    }
    ff_float_to_int16_interleave6_3dn2(dst, src, len);
}
2614

  
2615 2408
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
2616 2409

  
2617 2410
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
......
2968 2761
        if(mm_flags & AV_CPU_FLAG_3DNOW){
2969 2762
            c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2970 2763
            c->vector_fmul = vector_fmul_3dnow;
2971
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2972
                c->float_to_int16 = float_to_int16_3dnow;
2973
                c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
2974
            }
2975 2764
        }
2976 2765
        if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
2977 2766
            c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
2978 2767
#if HAVE_6REGS
2979 2768
            c->vector_fmul_window = vector_fmul_window_3dnow2;
2980 2769
#endif
2981
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2982
                c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
2983
            }
2984 2770
        }
2985 2771
        if(mm_flags & AV_CPU_FLAG_MMX2){
2986 2772
#if HAVE_YASM
......
2997 2783
#if HAVE_6REGS
2998 2784
            c->vector_fmul_window = vector_fmul_window_sse;
2999 2785
#endif
3000
            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
3001 2786
            c->vector_clipf = vector_clipf_sse;
3002
            c->float_to_int16 = float_to_int16_sse;
3003
            c->float_to_int16_interleave = float_to_int16_interleave_sse;
3004 2787
#if HAVE_YASM
3005 2788
            c->scalarproduct_float = ff_scalarproduct_float_sse;
3006 2789
#endif
......
3008 2791
        if(mm_flags & AV_CPU_FLAG_3DNOW)
3009 2792
            c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
3010 2793
        if(mm_flags & AV_CPU_FLAG_SSE2){
3011
            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
3012
            c->float_to_int16 = float_to_int16_sse2;
3013
            c->float_to_int16_interleave = float_to_int16_interleave_sse2;
3014 2794
#if HAVE_YASM
3015 2795
            c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
3016 2796
            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;

Also available in: Unified diff