Revision f27e1d64

View differences:

libavcodec/dsputil.c
3930 3930
        dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
3931 3931
}
3932 3932

  
3933
/**
 * Plain C reference for the windowed combination of two float vectors.
 *
 * For every i in [0, len):
 *     dst[i] = src0[i]*win[len-1-i] + src1[i]*win[i] + add_bias
 * i.e. src0 is weighted by the window read backwards and src1 by the same
 * window read forwards.
 *
 * @param dst      output vector, len elements are written
 * @param src0     first input, len elements
 * @param src1     second input, len elements
 * @param win      window coefficients, len elements
 * @param add_bias constant added to every output element
 * @param len      number of elements to process
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
3934
    int i;
3935
    for(i=0; i<len; i++)
3936
        dst[i] = src0[i]*win[len-i-1] + src1[i]*win[i] + add_bias;
3937
}
3938

  
3939
/**
 * Convert one float to a 16-bit sample value by inspecting its bit pattern.
 *
 * The raw 32-bit pattern of *src is used directly: when none of bits 16..19
 * is set, the pattern minus 0x8000 is returned and the caller's store into an
 * int16_t keeps the low 16 bits. When any of those bits is set the value is
 * treated as out of range and replaced by (0x43c0ffff - pattern)>>31 before
 * the 0x8000 offset is subtracted.
 * NOTE(review): this relies on >> of a negative value being an arithmetic
 * shift, which is implementation-defined in C.
 *
 * @param src pointer to the float to convert
 * @return the converted value, meant to be truncated to int16_t by the caller
 */
static av_always_inline int float_to_int16_one(const float *src){
    /* Fetch the bit pattern through a union: dereferencing a
     * (const int32_t*) cast of a float pointer breaks the strict-aliasing
     * rule and may be miscompiled by an optimizing compiler. */
    union { float f; int32_t i; } bits;
    int_fast32_t tmp;

    bits.f = *src;
    tmp    = bits.i;
    if(tmp & 0xf0000){
        tmp = (0x43c0ffff - tmp)>>31;
        // is this faster on some gcc/cpu combinations?
//      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//      else                 tmp = 0;
    }
    return tmp - 0x8000;
}
3949

  
3933 3950
/*
 * ff_float_to_int16_c: convert len floats from src into 16-bit samples in dst.
 *
 * This hunk shows both sides of the change. The lines numbered 3935-3941 are
 * the removed inline bit-pattern conversion loop; the lines numbered
 * 3952-3954 are the replacement, which converts each element with
 * float_to_int16_one(src+i) and stores the result in dst[i].
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
3934 3951
    int i;
3935
    for(i=0; i<len; i++) {
3936
        int_fast32_t tmp = ((const int32_t*)src)[i];
3937
        if(tmp & 0xf0000){
3938
            tmp = (0x43c0ffff - tmp)>>31;
3939
            // is this faster on some gcc/cpu combinations?
3940
//          if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3941
//          else                 tmp = 0;
3952
    for(i=0; i<len; i++)
3953
        dst[i] = float_to_int16_one(src+i);
3954
}
3955

  
3956
/*
 * ff_float_to_int16_interleave_c: convert planar float input to interleaved
 * 16-bit output.
 *
 * src holds the channels back to back, len floats each (channel c starts at
 * src + c*len). Sample i of channel c is converted with float_to_int16_one()
 * and written to dst[i*channels + c]. The channels==2 case is written out
 * explicitly; every other channel count goes through the generic nested loop.
 *
 * The lines numbered "3942 3962" and "3943" are left over from the removed
 * tail of the previous ff_float_to_int16_c body shown by the diff view.
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float *src, long len, int channels){
3957
    int i,j,c;
3958
    if(channels==2){
3959
        for(i=0; i<len; i++){
3960
            dst[2*i]   = float_to_int16_one(src+i);
3961
            dst[2*i+1] = float_to_int16_one(src+i+len);
3942 3962
        }
3943
        dst[i] = tmp - 0x8000;
3963
    }else{
3964
        for(c=0; c<channels; c++, src+=len)
3965
            for(i=0, j=c; i<len; i++, j+=channels)
3966
                dst[j] = float_to_int16_one(src+i);
3944 3967
    }
3945 3968
}
3946 3969

  
......
4450 4473
    c->vector_fmul = vector_fmul_c;
4451 4474
    c->vector_fmul_reverse = vector_fmul_reverse_c;
4452 4475
    c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4476
    c->vector_fmul_window = ff_vector_fmul_window_c;
4453 4477
    c->float_to_int16 = ff_float_to_int16_c;
4478
    c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4454 4479
    c->add_int16 = add_int16_c;
4455 4480
    c->sub_int16 = sub_int16_c;
4456 4481
    c->scalarproduct_int16 = scalarproduct_int16_c;
libavcodec/dsputil.h
63 63

  
64 64
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1,
65 65
                              const float *src2, int src3, int blocksize, int step);
66
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
67
                             const float *win, float add_bias, int len);
66 68
void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
67 69

  
68 70
/* encoding scans */
......
364 366
    void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
365 367
    /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
366 368
    void (*vector_fmul_add_add)(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step);
369
    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
370
    void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len);
367 371

  
368 372
    /* C version: convert floats from the range [384.0,386.0] to ints in [-32768,32767]
369 373
     * simd versions: convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */
370 374
    void (*float_to_int16)(int16_t *dst, const float *src, long len);
375
    void (*float_to_int16_interleave)(int16_t *dst, const float *src, long len, int channels);
371 376

  
372 377
    /* (I)DCT */
373 378
    void (*fdct)(DCTELEM *block/* align 16*/);
libavcodec/i386/dsputil_mmx.c
2022 2022
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
2023 2023
}
2024 2024

  
2025
/**
 * SSE version of ff_vector_fmul_window_c().
 *
 * Only the add_bias == 0 case is vectorized, and only when HAVE_6REGS is
 * defined (the asm needs six general-purpose register operands); everything
 * else falls back to the C implementation.
 *
 * The asm walks the arrays from both ends towards the middle: i is a negative
 * byte offset counting up from the start, j a byte offset counting down from
 * the last 16-byte group, both relative to pointers placed at the midpoint.
 * Each iteration loads one window group from each side, reverses a copy of
 * each with shufps $0x1b, and produces four outputs on each side as
 * src1*win + src0*reversed(win from the other side).
 * movaps requires every accessed address to be 16-byte aligned.
 *
 * @param dst      output vector (len floats)
 * @param src0     input weighted by the reversed window
 * @param src1     input weighted by the forward window
 * @param win      window coefficients (len floats)
 * @param add_bias constant bias; non-zero selects the C fallback
 * @param len      number of elements
 */
static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
                                   const float *win, float add_bias, int len){
#ifdef HAVE_6REGS
    if(add_bias == 0){
        x86_reg i = -len*2;
        x86_reg j = len*2-16;
        asm volatile(
            "1: \n"
            "movaps       (%5,%0), %%xmm0 \n"
            "movaps       (%5,%1), %%xmm1 \n"
            "movaps        %%xmm0, %%xmm2 \n"
            "movaps        %%xmm1, %%xmm3 \n"
            "shufps $0x1b, %%xmm2, %%xmm2 \n"
            "shufps $0x1b, %%xmm3, %%xmm3 \n"
            "mulps        (%4,%0), %%xmm0 \n"
            "mulps        (%4,%1), %%xmm1 \n"
            "mulps        (%3,%0), %%xmm3 \n"
            "mulps        (%3,%1), %%xmm2 \n"
            "addps         %%xmm3, %%xmm0 \n"
            "addps         %%xmm2, %%xmm1 \n"
            "movaps        %%xmm0, (%2,%0) \n"
            "movaps        %%xmm1, (%2,%1) \n"
            "sub $16, %1 \n"
            "add $16, %0 \n"
            "jl 1b \n"
            :"+r"(i), "+r"(j)
            :"r"(dst+len/2), "r"(src0+len/2), "r"(src1+len/2), "r"(win+len/2)
            /* The asm reads and writes memory that is not described by any
             * operand, so the compiler must be told not to cache or reorder
             * accesses to those buffers around it. */
            :"memory"
        );
    }else
#endif
        ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
}
2057

  
2025 2058
static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
2026 2059
    // not bit-exact: pf2id uses different rounding than C and SSE
2027 2060
    asm volatile(
......
2083 2116
    );
2084 2117
}
2085 2118

  
2119
/**
 * Generate float_to_int16_interleave_<cpu>() plus its multi-channel helper.
 *
 * cpu  - suffix of the existing float_to_int16_<cpu>() converter to build on
 * body - asm text of the stereo inner loop; it runs with
 *          %0 = negative byte offset counting up to 0,
 *          %1 = end of dst, %2 = end of the first channel in src,
 *          %3 = end of the second channel (src1)
 *
 * Dispatch in the generated function:
 *   channels == 1 : plain float_to_int16_<cpu>()
 *   channels  > 2 : helper converts all channels into a temporary buffer of
 *                   len*channels samples, then scatters them to dst
 *   otherwise     : the stereo asm loop
 * NOTE(review): the helper's temporary is a variable-length array on the
 * stack; callers are presumably expected to keep len*channels small.
 */
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
static av_noinline void float_to_int16_interleave2_##cpu(int16_t *dst, const float *src, long len, int channels){\
    DECLARE_ALIGNED_16(int16_t, tmp[len*channels]);\
    int i,j,c;\
    float_to_int16_##cpu(tmp, src, len*channels);\
    for(c=0; c<channels; c++){\
        int16_t *ptmp = tmp+c*len;\
        for(i=0, j=c; i<len; i++, j+=channels)\
            dst[j] = ptmp[i];\
    }\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float *src, long len, int channels){\
    if(channels==1)\
        float_to_int16_##cpu(dst, src, len);\
    else if(channels>2)\
        float_to_int16_interleave2_##cpu(dst, src, len, channels);\
    else{\
        float *src1;\
        asm volatile(\
            "shl $2, %0 \n"\
            "add %0, %1 \n"\
            "add %0, %2 \n"\
            "lea (%2,%0), %3 \n"\
            "neg %0 \n"\
            body\
            :"+r"(len), "+r"(dst), "+r"(src), "=r"(src1)\
            /* the loop loads from src and stores to dst through plain */\
            /* register operands, so declare the memory side effect   */\
            ::"memory"\
        );\
    }\
}
2150

  
2151
FLOAT_TO_INT16_INTERLEAVE(3dnow,
2152
    "1:                         \n"
2153
    "pf2id     (%2,%0), %%mm0   \n"
2154
    "pf2id    8(%2,%0), %%mm1   \n"
2155
    "pf2id     (%3,%0), %%mm2   \n"
2156
    "pf2id    8(%3,%0), %%mm3   \n"
2157
    "packssdw    %%mm1, %%mm0   \n"
2158
    "packssdw    %%mm3, %%mm2   \n"
2159
    "movq        %%mm0, %%mm1   \n"
2160
    "punpcklwd   %%mm2, %%mm0   \n"
2161
    "punpckhwd   %%mm2, %%mm1   \n"
2162
    "movq        %%mm0,  (%1,%0)\n"
2163
    "movq        %%mm0, 8(%1,%0)\n"
2164
    "add $16, %0                \n"
2165
    "js 1b                      \n"
2166
    "femms                      \n"
2167
)
2168

  
2169
FLOAT_TO_INT16_INTERLEAVE(sse,
2170
    "1:                         \n"
2171
    "cvtps2pi  (%2,%0), %%mm0   \n"
2172
    "cvtps2pi 8(%2,%0), %%mm1   \n"
2173
    "cvtps2pi  (%3,%0), %%mm2   \n"
2174
    "cvtps2pi 8(%3,%0), %%mm3   \n"
2175
    "packssdw    %%mm1, %%mm0   \n"
2176
    "packssdw    %%mm3, %%mm2   \n"
2177
    "movq        %%mm0, %%mm1   \n"
2178
    "punpcklwd   %%mm2, %%mm0   \n"
2179
    "punpckhwd   %%mm2, %%mm1   \n"
2180
    "movq        %%mm0,  (%1,%0)\n"
2181
    "movq        %%mm0, 8(%1,%0)\n"
2182
    "add $16, %0                \n"
2183
    "js 1b                      \n"
2184
    "emms                       \n"
2185
)
2186

  
2187
/*
 * SSE2 stereo loop for FLOAT_TO_INT16_INTERLEAVE. Per iteration:
 *   - cvtps2dq converts four floats of the first channel (%2) and four of
 *     the second channel (%3) to 32-bit integers,
 *   - packssdw narrows both to 16 bit with saturation, first channel in the
 *     low half of xmm0 and second channel in the high half,
 *   - movhlps copies the second-channel half into the low half of xmm1,
 *   - punpcklwd interleaves the two halves word by word,
 *   - movdqa stores the eight interleaved samples (needs a 16-byte aligned
 *     destination address).
 * %0 is a negative byte offset advanced by 16 until it reaches zero.
 */
FLOAT_TO_INT16_INTERLEAVE(sse2,
2188
    "1:                         \n"
2189
    "cvtps2dq  (%2,%0), %%xmm0  \n"
2190
    "cvtps2dq  (%3,%0), %%xmm1  \n"
2191
    "packssdw   %%xmm1, %%xmm0  \n"
2192
    "movhlps    %%xmm0, %%xmm1  \n"
2193
    "punpcklwd  %%xmm1, %%xmm0  \n"
2194
    "movdqa     %%xmm0, (%1,%0) \n"
2195
    "add $16, %0                \n"
2196
    "js 1b                      \n"
2197
)
2198

  
2199

  
2086 2200
extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
2087 2201
extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
2088 2202
extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
......
2519 2633
        if(mm_flags & MM_3DNOW){
2520 2634
            c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2521 2635
            c->vector_fmul = vector_fmul_3dnow;
2522
            if(!(avctx->flags & CODEC_FLAG_BITEXACT))
2636
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2523 2637
                c->float_to_int16 = float_to_int16_3dnow;
2638
                c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
2639
            }
2524 2640
        }
2525 2641
        if(mm_flags & MM_3DNOWEXT)
2526 2642
            c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
......
2528 2644
            c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2529 2645
            c->vector_fmul = vector_fmul_sse;
2530 2646
            c->float_to_int16 = float_to_int16_sse;
2647
            c->float_to_int16_interleave = float_to_int16_interleave_sse;
2531 2648
            c->vector_fmul_reverse = vector_fmul_reverse_sse;
2532 2649
            c->vector_fmul_add_add = vector_fmul_add_add_sse;
2650
            c->vector_fmul_window = vector_fmul_window_sse;
2533 2651
        }
2534 2652
        if(mm_flags & MM_SSE2){
2535 2653
            c->float_to_int16 = float_to_int16_sse2;
2654
            c->float_to_int16_interleave = float_to_int16_interleave_sse2;
2536 2655
        }
2537 2656
        if(mm_flags & MM_3DNOW)
2538 2657
            c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
libavcodec/vorbis_dec.c
149 149
    uint_fast8_t mode_count;
150 150
    vorbis_mode *modes;
151 151
    uint_fast8_t mode_number; // mode number for the current packet
152
    uint_fast8_t previous_window;
152 153
    float *channel_residues;
153 154
    float *channel_floors;
154 155
    float *saved;
155
    uint_fast16_t saved_start;
156 156
    float *ret;
157 157
    float *buf;
158 158
    float *buf_tmp;
......
903 903
    vc->ret             = av_malloc((vc->blocksize[1]/2)*vc->audio_channels * sizeof(float));
904 904
    vc->buf             = av_malloc( vc->blocksize[1]                       * sizeof(float));
905 905
    vc->buf_tmp         = av_malloc( vc->blocksize[1]                       * sizeof(float));
906
    vc->saved_start=0;
906
    vc->previous_window=0;
907 907

  
908 908
    ff_mdct_init(&vc->mdct[0], bl0, 1);
909 909
    ff_mdct_init(&vc->mdct[1], bl1, 1);
......
1394 1394
    }
1395 1395
}
1396 1396

  
1397
/**
 * Copy len samples from src to dst while applying the output normalization.
 *
 * @param dst      destination, len floats are written
 * @param src      source, len floats are read
 * @param len      number of samples; nothing is written when len <= 0
 * @param exp_bias when non-zero, added to the raw 32-bit pattern of every
 *                 sample (scaling it by a power of two through the exponent
 *                 field); add_bias is ignored in that case
 * @param add_bias added arithmetically to every sample when exp_bias is zero
 */
static void copy_normalize(float *dst, const float *src, int len, int exp_bias, float add_bias)
{
    int i;
    if(exp_bias) {
        for(i=0; i<len; i++) {
            /* Move the bit pattern with memcpy: accessing float storage
             * through a uint32_t pointer violates strict aliasing. */
            uint32_t bits;
            memcpy(&bits, &src[i], sizeof(bits));
            bits += (uint32_t)exp_bias; // dst[i]=src[i]*(1<<bias)
            memcpy(&dst[i], &bits, sizeof(bits));
        }
    } else {
        for(i=0; i<len; i++)
            dst[i] = src[i] + add_bias;
    }
}
1408

  
1397 1409
// Decode the audio packet using the functions above
1398 1410

  
1399 1411
static int vorbis_parse_audio_packet(vorbis_context *vc) {
1400 1412
    GetBitContext *gb=&vc->gb;
1401 1413

  
1402
    uint_fast8_t previous_window=0,next_window=0;
1414
    uint_fast8_t previous_window=vc->previous_window;
1403 1415
    uint_fast8_t mode_number;
1416
    uint_fast8_t blockflag;
1404 1417
    uint_fast16_t blocksize;
1405 1418
    int_fast32_t i,j;
1406 1419
    uint_fast8_t no_residue[vc->audio_channels];
......
1411 1424
    uint_fast8_t res_chan[vc->audio_channels];
1412 1425
    uint_fast8_t res_num=0;
1413 1426
    int_fast16_t retlen=0;
1414
    uint_fast16_t saved_start=0;
1415 1427
    float fadd_bias = vc->add_bias;
1416 1428

  
1417 1429
    if (get_bits1(gb)) {
......
1429 1441

  
1430 1442
    AV_DEBUG(" Mode number: %d , mapping: %d , blocktype %d \n", mode_number, vc->modes[mode_number].mapping, vc->modes[mode_number].blockflag);
1431 1443

  
1432
    if (vc->modes[mode_number].blockflag) {
1433
        previous_window=get_bits1(gb);
1434
        next_window=get_bits1(gb);
1444
    blockflag=vc->modes[mode_number].blockflag;
1445
    blocksize=vc->blocksize[blockflag];
1446
    if (blockflag) {
1447
        skip_bits(gb, 2); // previous_window, next_window
1435 1448
    }
1436 1449

  
1437
    blocksize=vc->blocksize[vc->modes[mode_number].blockflag];
1438 1450
    memset(ch_res_ptr, 0, sizeof(float)*vc->audio_channels*blocksize/2); //FIXME can this be removed ?
1439 1451
    memset(ch_floor_ptr, 0, sizeof(float)*vc->audio_channels*blocksize/2); //FIXME can this be removed ?
1440 1452

  
......
1504 1516

  
1505 1517
// MDCT, overlap/add, save data for next overlapping  FPMATH
1506 1518

  
1519
    retlen = (blocksize + vc->blocksize[previous_window])/4;
1507 1520
    for(j=0;j<vc->audio_channels;++j) {
1508
        uint_fast8_t step=vc->audio_channels;
1509
        uint_fast16_t k;
1510
        float *saved=vc->saved+j*vc->blocksize[1]/2;
1511
        float *ret=vc->ret;
1512
        const float *lwin=vc->win[1];
1513
        const float *swin=vc->win[0];
1521
        uint_fast16_t bs0=vc->blocksize[0];
1522
        uint_fast16_t bs1=vc->blocksize[1];
1523
        float *saved=vc->saved+j*bs1/2;
1524
        float *ret=vc->ret+j*retlen;
1514 1525
        float *buf=vc->buf;
1515
        float *buf_tmp=vc->buf_tmp;
1516

  
1517
        ch_floor_ptr=vc->channel_floors+j*blocksize/2;
1518

  
1519
        saved_start=vc->saved_start;
1526
        const float *win=vc->win[blockflag&previous_window];
1520 1527

  
1521
        vc->mdct[0].fft.imdct_calc(&vc->mdct[vc->modes[mode_number].blockflag], buf, ch_floor_ptr, buf_tmp);
1528
        vc->mdct[0].fft.imdct_calc(&vc->mdct[blockflag], buf, vc->channel_floors+j*blocksize/2, vc->buf_tmp);
1522 1529

  
1523
        //FIXME process channels together, to allow faster simd vector_fmul_add_add?
1524
        if (vc->modes[mode_number].blockflag) {
1525
            // -- overlap/add
1526
            if (previous_window) {
1527
                vc->dsp.vector_fmul_add_add(ret+j, buf, lwin, saved, vc->add_bias, vc->blocksize[1]/2, step);
1528
                retlen=vc->blocksize[1]/2;
1529
            } else {
1530
                int len = (vc->blocksize[1]-vc->blocksize[0])/4;
1531
                buf += len;
1532
                vc->dsp.vector_fmul_add_add(ret+j, buf, swin, saved, vc->add_bias, vc->blocksize[0]/2, step);
1533
                k = vc->blocksize[0]/2*step + j;
1534
                buf += vc->blocksize[0]/2;
1535
                if(vc->exp_bias){
1536
                    for(i=0; i<len; i++, k+=step)
1537
                        ((uint32_t*)ret)[k] = ((uint32_t*)buf)[i] + vc->exp_bias; // ret[k]=buf[i]*(1<<bias)
1538
                } else {
1539
                    for(i=0; i<len; i++, k+=step)
1540
                        ret[k] = buf[i] + fadd_bias;
1541
                }
1542
                buf=vc->buf;
1543
                retlen=vc->blocksize[0]/2+len;
1544
            }
1545
            // -- save
1546
            if (next_window) {
1547
                buf += vc->blocksize[1]/2;
1548
                vc->dsp.vector_fmul_reverse(saved, buf, lwin, vc->blocksize[1]/2);
1549
                saved_start=0;
1550
            } else {
1551
                saved_start=(vc->blocksize[1]-vc->blocksize[0])/4;
1552
                buf += vc->blocksize[1]/2;
1553
                for(i=0; i<saved_start; i++)
1554
                    ((uint32_t*)saved)[i] = ((uint32_t*)buf)[i] + vc->exp_bias;
1555
                vc->dsp.vector_fmul_reverse(saved+saved_start, buf+saved_start, swin, vc->blocksize[0]/2);
1556
            }
1530
        if(blockflag == previous_window) {
1531
            vc->dsp.vector_fmul_window(ret, saved, buf, win, fadd_bias, blocksize/2);
1532
        } else if(blockflag > previous_window) {
1533
            vc->dsp.vector_fmul_window(ret, saved, buf+(bs1-bs0)/4, win, fadd_bias, bs0/2);
1534
            copy_normalize(ret+bs0/2, buf+(bs1+bs0)/4, (bs1-bs0)/4, vc->exp_bias, fadd_bias);
1557 1535
        } else {
1558
            // --overlap/add
1559
            if(vc->add_bias) {
1560
                for(k=j, i=0;i<saved_start;++i, k+=step)
1561
                    ret[k] = saved[i] + fadd_bias;
1562
            } else {
1563
                for(k=j, i=0;i<saved_start;++i, k+=step)
1564
                    ret[k] = saved[i];
1565
            }
1566
            vc->dsp.vector_fmul_add_add(ret+k, buf, swin, saved+saved_start, vc->add_bias, vc->blocksize[0]/2, step);
1567
            retlen=saved_start+vc->blocksize[0]/2;
1568
            // -- save
1569
            buf += vc->blocksize[0]/2;
1570
            vc->dsp.vector_fmul_reverse(saved, buf, swin, vc->blocksize[0]/2);
1571
            saved_start=0;
1536
            copy_normalize(ret, saved, (bs1-bs0)/4, vc->exp_bias, fadd_bias);
1537
            vc->dsp.vector_fmul_window(ret+(bs1-bs0)/4, saved+(bs1-bs0)/4, buf, win, fadd_bias, bs0/2);
1572 1538
        }
1539
        memcpy(saved, buf+blocksize/2, blocksize/2*sizeof(float));
1573 1540
    }
1574
    vc->saved_start=saved_start;
1575 1541

  
1576
    return retlen*vc->audio_channels;
1542
    vc->previous_window = blockflag;
1543
    return retlen;
1577 1544
}
1578 1545

  
1579 1546
// Return the decoded audio packet through the standard api
......
1610 1577

  
1611 1578
    AV_DEBUG("parsed %d bytes %d bits, returned %d samples (*ch*bits) \n", get_bits_count(gb)/8, get_bits_count(gb)%8, len);
1612 1579

  
1613
    vc->dsp.float_to_int16(data, vc->ret, len);
1614
    *data_size=len*2;
1580
    vc->dsp.float_to_int16_interleave(data, vc->ret, len, vc->audio_channels);
1581
    *data_size=len*2*vc->audio_channels;
1615 1582

  
1616 1583
    return buf_size ;
1617 1584
}
libavutil/x86_cpu.h
68 68
#    define HAVE_7REGS 1
69 69
#endif
70 70

  
71
/* HAVE_6REGS is defined on x86-64, and on 32-bit x86 only when EBX or EBP is
 * reported as available. It guards inline asm that needs six register
 * operands, such as vector_fmul_window_sse. */
#if defined(ARCH_X86_64) || (defined(ARCH_X86_32) && (defined(HAVE_EBX_AVAILABLE) || defined(HAVE_EBP_AVAILABLE)))
72
#    define HAVE_6REGS 1
73
#endif
74

  
71 75
#if defined(ARCH_X86_64) && defined(PIC)
72 76
#    define BROKEN_RELOCATIONS 1
73 77
#endif

Also available in: Unified diff