Revision b10fa1bb

View differences (this revision replaces the inline-asm SSE2 add_int16, sub_int16 and scalarproduct_int16 routines in dsputil_mmx.c with yasm implementations in dsputil_yasm.asm and adds MMX2 versions):

libavcodec/x86/dsputil_mmx.c
@@ -2384,6 +2384,12 @@
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
+void ff_add_int16_mmx2(int16_t * v1, int16_t * v2, int order);
+void ff_add_int16_sse2(int16_t * v1, int16_t * v2, int order);
+void ff_sub_int16_mmx2(int16_t * v1, int16_t * v2, int order);
+void ff_sub_int16_sse2(int16_t * v1, int16_t * v2, int order);
+int32_t ff_scalarproduct_int16_mmx2(int16_t * v1, int16_t * v2, int order, int shift);
+int32_t ff_scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift);
 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
 int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
 int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
@@ -2507,78 +2513,6 @@
                                   int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
 
 
-static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
-    x86_reg o = -(order << 1);
-    v1 += order;
-    v2 += order;
-    __asm__ volatile(
-        "1:                          \n\t"
-        "movdqu   (%1,%2),   %%xmm0  \n\t"
-        "movdqu 16(%1,%2),   %%xmm1  \n\t"
-        "paddw    (%0,%2),   %%xmm0  \n\t"
-        "paddw  16(%0,%2),   %%xmm1  \n\t"
-        "movdqa   %%xmm0,    (%0,%2) \n\t"
-        "movdqa   %%xmm1,  16(%0,%2) \n\t"
-        "add      $32,       %2      \n\t"
-        "js       1b                 \n\t"
-        : "+r"(v1), "+r"(v2), "+r"(o)
-    );
-}
-
-static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
-    x86_reg o = -(order << 1);
-    v1 += order;
-    v2 += order;
-    __asm__ volatile(
-        "1:                           \n\t"
-        "movdqa    (%0,%2),   %%xmm0  \n\t"
-        "movdqa  16(%0,%2),   %%xmm2  \n\t"
-        "movdqu    (%1,%2),   %%xmm1  \n\t"
-        "movdqu  16(%1,%2),   %%xmm3  \n\t"
-        "psubw     %%xmm1,    %%xmm0  \n\t"
-        "psubw     %%xmm3,    %%xmm2  \n\t"
-        "movdqa    %%xmm0,    (%0,%2) \n\t"
-        "movdqa    %%xmm2,  16(%0,%2) \n\t"
-        "add       $32,       %2      \n\t"
-        "js        1b                 \n\t"
-        : "+r"(v1), "+r"(v2), "+r"(o)
-    );
-}
-
-static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
-{
-    int res = 0;
-    DECLARE_ALIGNED_16(xmm_reg, sh);
-    x86_reg o = -(order << 1);
-
-    v1 += order;
-    v2 += order;
-    sh.a = shift;
-    __asm__ volatile(
-        "pxor      %%xmm7,  %%xmm7        \n\t"
-        "1:                               \n\t"
-        "movdqu    (%0,%3), %%xmm0        \n\t"
-        "movdqu  16(%0,%3), %%xmm1        \n\t"
-        "pmaddwd   (%1,%3), %%xmm0        \n\t"
-        "pmaddwd 16(%1,%3), %%xmm1        \n\t"
-        "paddd     %%xmm0,  %%xmm7        \n\t"
-        "paddd     %%xmm1,  %%xmm7        \n\t"
-        "add       $32,     %3            \n\t"
-        "js        1b                     \n\t"
-        "movhlps   %%xmm7,  %%xmm2        \n\t"
-        "paddd     %%xmm2,  %%xmm7        \n\t"
-        "psrad     %4,      %%xmm7        \n\t"
-        "pshuflw   $0x4E,   %%xmm7,%%xmm2 \n\t"
-        "paddd     %%xmm2,  %%xmm7        \n\t"
-        "movd      %%xmm7,  %2            \n\t"
-        : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
-        : "m"(sh)
-    );
-    return res;
-}
-
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     mm_flags = mm_support();
@@ -3015,6 +2949,13 @@
                 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
             }
         }
+        if(mm_flags & FF_MM_MMX2){
+#if HAVE_YASM
+            c->add_int16 = ff_add_int16_mmx2;
+            c->sub_int16 = ff_sub_int16_mmx2;
+            c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
+#endif
+        }
         if(mm_flags & FF_MM_SSE){
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
             c->ac3_downmix = ac3_downmix_sse;
@@ -3033,9 +2974,11 @@
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
             c->float_to_int16 = float_to_int16_sse2;
             c->float_to_int16_interleave = float_to_int16_interleave_sse2;
-            c->add_int16 = add_int16_sse2;
-            c->sub_int16 = sub_int16_sse2;
-            c->scalarproduct_int16 = scalarproduct_int16_sse2;
+#if HAVE_YASM
+            c->add_int16 = ff_add_int16_sse2;
+            c->sub_int16 = ff_sub_int16_sse2;
+            c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
+#endif
         }
     }
 
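The hunks above only show how dsputil_init_mmx() wires the new ff_* symbols into the DSPContext function pointers. As a rough, non-authoritative sketch of how calling code reaches them (the example() function, the buffer size and the include line are assumptions made for this page, not part of the revision): v1 must be 16-byte aligned because the asm stores to it with movdqa/mova, v2 is only read with unaligned loads, and order should be a multiple of the per-iteration block (16 int16 elements for the SSE2 versions, 8 for the MMX2 ones).

#include "dsputil.h"   /* assumed include path, as used inside libavcodec */

static int32_t example(AVCodecContext *avctx)
{
    DECLARE_ALIGNED_16(int16_t, v1[32]);   /* aligned: the asm stores to v1 with mova/movdqa */
    DECLARE_ALIGNED_16(int16_t, v2[32]);   /* only loaded with movu, alignment not required  */
    DSPContext dsp;
    int i;

    for (i = 0; i < 32; i++) {
        v1[i] = i;
        v2[i] = 32 - i;
    }

    dsputil_init(&dsp, avctx);              /* selects the mmx2/sse2 versions when available */

    dsp.add_int16(v1, v2, 32);              /* v1[i] += v2[i] */
    dsp.sub_int16(v1, v2, 32);              /* v1[i] -= v2[i] */
    return dsp.scalarproduct_int16(v1, v2, 32, 0);   /* dot product of v1 and v2 */
}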

  
libavcodec/x86/dsputil_yasm.asm
@@ -99,6 +99,81 @@
 
 
 
+%macro SCALARPRODUCT 1
+; void add_int16(int16_t * v1, int16_t * v2, int order)
+cglobal add_int16_%1, 3,3,2, v1, v2, order
+    shl orderq, 1
+    add v1q, orderq
+    add v2q, orderq
+    neg orderq
+.loop:
+    movu    m0, [v2q + orderq]
+    movu    m1, [v2q + orderq + mmsize]
+    paddw   m0, [v1q + orderq]
+    paddw   m1, [v1q + orderq + mmsize]
+    mova    [v1q + orderq], m0
+    mova    [v1q + orderq + mmsize], m1
+    add     orderq, mmsize*2
+    jl .loop
+    REP_RET
+
+; void sub_int16(int16_t * v1, int16_t * v2, int order)
+cglobal sub_int16_%1, 3,3,4, v1, v2, order
+    shl orderq, 1
+    add v1q, orderq
+    add v2q, orderq
+    neg orderq
+.loop:
+    movu    m2, [v2q + orderq]
+    movu    m3, [v2q + orderq + mmsize]
+    mova    m0, [v1q + orderq]
+    mova    m1, [v1q + orderq + mmsize]
+    psubw   m0, m2
+    psubw   m1, m3
+    mova    [v1q + orderq], m0
+    mova    [v1q + orderq + mmsize], m1
+    add     orderq, mmsize*2
+    jl .loop
+    REP_RET
+
+; int scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
+cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
+    shl orderq, 1
+    add v1q, orderq
+    add v2q, orderq
+    neg orderq
+    movd    m3, shiftm
+    pxor    m2, m2
+.loop:
+    movu    m0, [v1q + orderq]
+    movu    m1, [v1q + orderq + mmsize]
+    pmaddwd m0, [v2q + orderq]
+    pmaddwd m1, [v2q + orderq + mmsize]
+    paddd   m2, m0
+    paddd   m2, m1
+    add     orderq, mmsize*2
+    jl .loop
+%if mmsize == 16
+    movhlps m0, m2
+    paddd   m2, m0
+    psrad   m2, m3
+    pshuflw m0, m2, 0x4e
+%else
+    psrad   m2, m3
+    pshufw  m0, m2, 0x4e
+%endif
+    paddd   m2, m0
+    movd   eax, m2
+    RET
+%endmacro
+
+INIT_MMX
+SCALARPRODUCT mmx2
+INIT_XMM
+SCALARPRODUCT sse2
+
+
+
 ; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
 cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
     movq    mm0, [topq]
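The SCALARPRODUCT macro is expanded twice, under INIT_MMX for the _mmx2 variants (mmsize = 8) and under INIT_XMM for the _sse2 variants (mmsize = 16). For orientation only, a minimal plain-C sketch of what the three generated routines compute follows; the *_ref names are invented for this page, and the asm applies the right shift to accumulated partial sums (the psrad after the loop) rather than to the final total, so for nonzero shift the exact rounding and overflow behaviour can differ from this scalar form.

#include <stdint.h>

/* Illustrative scalar equivalents of the routines generated above (not part of this revision). */
static void add_int16_ref(int16_t *v1, const int16_t *v2, int order)
{
    int i;
    for (i = 0; i < order; i++)
        v1[i] += v2[i];                 /* in place: v1 += v2 */
}

static void sub_int16_ref(int16_t *v1, const int16_t *v2, int order)
{
    int i;
    for (i = 0; i < order; i++)
        v1[i] -= v2[i];                 /* in place: v1 -= v2 */
}

static int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
                                        int order, int shift)
{
    int32_t sum = 0;
    int i;
    for (i = 0; i < order; i++)
        sum += v1[i] * v2[i];           /* 16x16 -> 32-bit products, accumulated */
    return sum >> shift;                /* shift applied once here; the asm shifts partial sums */
}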
