Revision b10fa1bb libavcodec/x86/dsputil_mmx.c

--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2384,6 +2384,12 @@
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
+void ff_add_int16_mmx2(int16_t * v1, int16_t * v2, int order);
+void ff_add_int16_sse2(int16_t * v1, int16_t * v2, int order);
+void ff_sub_int16_mmx2(int16_t * v1, int16_t * v2, int order);
+void ff_sub_int16_sse2(int16_t * v1, int16_t * v2, int order);
+int32_t ff_scalarproduct_int16_mmx2(int16_t * v1, int16_t * v2, int order, int shift);
+int32_t ff_scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift);
 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
 int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
 int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
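The six ff_* symbols added here are external implementations written in yasm; as the init hunks below show, they are only wired up when HAVE_YASM is set. Their contract matches the inline-asm versions this revision removes: v1 is updated in place over order elements, and the scalar product accumulates v1[i]*v2[i] with a right shift. A minimal scalar C sketch of that contract (function names are illustrative, inferred from the asm below, not part of this revision):

    #include <stdint.h>

    /* Scalar sketch; the real routines consume 32 bytes (16 elements)
     * per iteration, so order is assumed to be a multiple of 16 and
     * v1 16-byte aligned for the movdqa stores. */
    static void add_int16_ref(int16_t *v1, const int16_t *v2, int order)
    {
        while (order--)
            *v1++ += *v2++;
    }

    static void sub_int16_ref(int16_t *v1, const int16_t *v2, int order)
    {
        while (order--)
            *v1++ -= *v2++;
    }

    static int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
                                           int order, int shift)
    {
        int32_t res = 0;
        while (order--)
            res += (*v1++ * *v2++) >> shift;  /* note: the SSE2 asm shifts
                                                 the dword partial sums,
                                                 not each product */
        return res;
    }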
@@ -2507,78 +2513,6 @@
                                   int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
 
 
-static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
-    x86_reg o = -(order << 1);
-    v1 += order;
-    v2 += order;
-    __asm__ volatile(
-        "1:                          \n\t"
-        "movdqu   (%1,%2),   %%xmm0  \n\t"
-        "movdqu 16(%1,%2),   %%xmm1  \n\t"
-        "paddw    (%0,%2),   %%xmm0  \n\t"
-        "paddw  16(%0,%2),   %%xmm1  \n\t"
-        "movdqa   %%xmm0,    (%0,%2) \n\t"
-        "movdqa   %%xmm1,  16(%0,%2) \n\t"
-        "add      $32,       %2      \n\t"
-        "js       1b                 \n\t"
-        : "+r"(v1), "+r"(v2), "+r"(o)
-    );
-}
-
-static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
-    x86_reg o = -(order << 1);
-    v1 += order;
-    v2 += order;
-    __asm__ volatile(
-        "1:                           \n\t"
-        "movdqa    (%0,%2),   %%xmm0  \n\t"
-        "movdqa  16(%0,%2),   %%xmm2  \n\t"
-        "movdqu    (%1,%2),   %%xmm1  \n\t"
-        "movdqu  16(%1,%2),   %%xmm3  \n\t"
-        "psubw     %%xmm1,    %%xmm0  \n\t"
-        "psubw     %%xmm3,    %%xmm2  \n\t"
-        "movdqa    %%xmm0,    (%0,%2) \n\t"
-        "movdqa    %%xmm2,  16(%0,%2) \n\t"
-        "add       $32,       %2      \n\t"
-        "js        1b                 \n\t"
-        : "+r"(v1), "+r"(v2), "+r"(o)
-    );
-}
-
-static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
-{
-    int res = 0;
-    DECLARE_ALIGNED_16(xmm_reg, sh);
-    x86_reg o = -(order << 1);
-
-    v1 += order;
-    v2 += order;
-    sh.a = shift;
-    __asm__ volatile(
-        "pxor      %%xmm7,  %%xmm7        \n\t"
-        "1:                               \n\t"
-        "movdqu    (%0,%3), %%xmm0        \n\t"
-        "movdqu  16(%0,%3), %%xmm1        \n\t"
-        "pmaddwd   (%1,%3), %%xmm0        \n\t"
-        "pmaddwd 16(%1,%3), %%xmm1        \n\t"
-        "paddd     %%xmm0,  %%xmm7        \n\t"
-        "paddd     %%xmm1,  %%xmm7        \n\t"
-        "add       $32,     %3            \n\t"
-        "js        1b                     \n\t"
-        "movhlps   %%xmm7,  %%xmm2        \n\t"
-        "paddd     %%xmm2,  %%xmm7        \n\t"
-        "psrad     %4,      %%xmm7        \n\t"
-        "pshuflw   $0x4E,   %%xmm7,%%xmm2 \n\t"
-        "paddd     %%xmm2,  %%xmm7        \n\t"
-        "movd      %%xmm7,  %2            \n\t"
-        : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
-        : "m"(sh)
-    );
-    return res;
-}
-
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     mm_flags = mm_support();
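All three removed loops use the same idiom: v1 and v2 are advanced to the end of their arrays and a negative byte offset is counted up toward zero, so "add $32, %2" both steps the offset and sets the sign flag, letting "js 1b" close the loop with no separate compare. A scalar C rendering of that control flow (a sketch for illustration, element-indexed where the asm is byte-indexed):

    static void add_int16_negidx(int16_t *v1, const int16_t *v2, int order)
    {
        int o = -order;        /* the asm uses -(order << 1): bytes */
        v1 += order;           /* point one past the last element */
        v2 += order;
        do {
            v1[o] += v2[o];    /* negative offsets walk up to the end */
            o++;
        } while (o < 0);       /* offset reaching zero ends the loop,
                                  just as "js 1b" falls through */
    }

The scalarproduct variant additionally reduces the four dword partial sums left in xmm7 after the loop: movhlps/paddd folds the high pair onto the low pair, psrad applies the shift, and pshuflw/paddd folds the remaining two dwords before the final movd.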
@@ -3015,6 +2949,13 @@
                 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
             }
         }
+        if(mm_flags & FF_MM_MMX2){
+#if HAVE_YASM
+            c->add_int16 = ff_add_int16_mmx2;
+            c->sub_int16 = ff_sub_int16_mmx2;
+            c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
+#endif
+        }
         if(mm_flags & FF_MM_SSE){
            c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
            c->ac3_downmix = ac3_downmix_sse;
@@ -3033,9 +2974,11 @@
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
             c->float_to_int16 = float_to_int16_sse2;
             c->float_to_int16_interleave = float_to_int16_interleave_sse2;
-            c->add_int16 = add_int16_sse2;
-            c->sub_int16 = sub_int16_sse2;
-            c->scalarproduct_int16 = scalarproduct_int16_sse2;
+#if HAVE_YASM
+            c->add_int16 = ff_add_int16_sse2;
+            c->sub_int16 = ff_sub_int16_sse2;
+            c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
+#endif
         }
     }
 
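After dsputil_init_mmx() runs, callers go through the DSPContext function pointers and get whichever assignment won: the MMX2 versions are set first and then overridden by the SSE2 versions in the later block, and both blocks compile to nothing without HAVE_YASM. A hypothetical call site (buffer names invented for illustration; the alignment and size constraints follow from the movdqa stores and the 32-byte loop step):

    /* "excitation" and "pitch_vec" are made-up names for this sketch;
     * v1 must be 16-byte aligned and order a multiple of 16. */
    DECLARE_ALIGNED_16(int16_t, excitation[64]);
    DECLARE_ALIGNED_16(int16_t, pitch_vec[64]);

    c->add_int16(excitation, pitch_vec, 64);   /* excitation[i] += pitch_vec[i] */
    int32_t corr = c->scalarproduct_int16(excitation, pitch_vec, 64, 6);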