Statistics
| Branch: | Revision:

ffmpeg / libavcodec / wmavoice.c @ 72415b2a

History | View | Annotate | Download (61.6 KB)

1
/*
2
 * Windows Media Audio Voice decoder.
3
 * Copyright (c) 2009 Ronald S. Bultje
4
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 */
21

    
22
/**
23
 * @file libavcodec/wmavoice.c
24
 * @brief Windows Media Audio Voice compatible decoder
25
 * @author Ronald S. Bultje <rsbultje@gmail.com>
26
 */
27

    
28
#include <math.h>
29
#include "avcodec.h"
30
#include "get_bits.h"
31
#include "put_bits.h"
32
#include "wmavoice_data.h"
33
#include "celp_math.h"
34
#include "celp_filters.h"
35
#include "acelp_vectors.h"
36
#include "acelp_filters.h"
37
#include "lsp.h"
38
#include "libavutil/lzo.h"
39

    
40
#define MAX_BLOCKS           8   ///< maximum number of blocks per frame
41
#define MAX_LSPS             16  ///< maximum filter order
42
#define MAX_FRAMES           3   ///< maximum number of frames per superframe
43
#define MAX_FRAMESIZE        160 ///< maximum number of samples per frame
44
#define MAX_SIGNAL_HISTORY   416 ///< maximum excitation signal history
45
#define MAX_SFRAMESIZE       (MAX_FRAMESIZE * MAX_FRAMES)
46
                                 ///< maximum number of samples per superframe
47
#define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
48
                                 ///< was split over two packets
49
#define VLC_NBITS            6   ///< number of bits to read per VLC iteration
50

    
51
/**
52
 * Frame type VLC coding.
53
 */
54
static VLC frame_type_vlc;
55

    
56
/**
57
 * Adaptive codebook types.
58
 */
59
enum {
    /** no adaptive codebook (only hardcoded fixed) */
    ACB_TYPE_NONE       = 0,
    /**
     * adaptive codebook with per-frame pitch, which we interpolate to get
     * a per-sample pitch. Signal is generated using an asymmetric sinc
     * window function.
     * @note see #wmavoice_ipol1_coeffs
     */
    ACB_TYPE_ASYMMETRIC = 1,
    /**
     * per-block pitch with signal generation using a Hamming sinc window
     * function.
     * @note see #wmavoice_ipol2_coeffs
     */
    ACB_TYPE_HAMMING    = 2
};
70

    
71
/**
72
 * Fixed codebook types.
73
 */
74
enum {
    /**
     * comfort noise during silence, generated from a hardcoded (fixed)
     * codebook with per-frame (low) gain values
     */
    FCB_TYPE_SILENCE    = 0,
    /** hardcoded (fixed) codebook with per-block gain values */
    FCB_TYPE_HARDCODED  = 1,
    /**
     * pitch-adaptive window (AW) pulse signals, used in particular for
     * low-bitrate streams
     */
    FCB_TYPE_AW_PULSES  = 2,
    /**
     * innovation (fixed) codebook pulse sets in combinations of either
     * single pulses or pulse pairs
     */
    FCB_TYPE_EXC_PULSES = 3,
};
86

    
87
/**
88
 * Description of frame types.
89
 */
90
static const struct frame_type_desc {
91
    uint8_t n_blocks;     ///< amount of blocks per frame (each block
92
                          ///< contains 160/#n_blocks samples)
93
    uint8_t log_n_blocks; ///< log2(#n_blocks)
94
    uint8_t acb_type;     ///< Adaptive codebook type (ACB_TYPE_*)
95
    uint8_t fcb_type;     ///< Fixed codebook type (FCB_TYPE_*)
96
    uint8_t dbl_pulses;   ///< how many pulse vectors have pulse pairs
97
                          ///< (rather than just one single pulse)
98
                          ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
99
    uint16_t frame_size;  ///< the amount of bits that make up the block
100
                          ///< data (per frame)
101
} frame_descs[17] = {
102
    { 1, 0, ACB_TYPE_NONE,       FCB_TYPE_SILENCE,    0,   0 },
103
    { 2, 1, ACB_TYPE_NONE,       FCB_TYPE_HARDCODED,  0,  28 },
104
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES,  0,  46 },
105
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2,  80 },
106
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 104 },
107
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0, 108 },
108
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 132 },
109
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 168 },
110
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0,  64 },
111
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2,  80 },
112
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 104 },
113
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0, 108 },
114
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2, 132 },
115
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 168 },
116
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0, 176 },
117
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2, 208 },
118
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 256 }
119
};
120

    
121
/**
122
 * WMA Voice decoding context.
123
 */
124
typedef struct {
125
    /**
126
     * @defgroup struct_global Global values
127
     * Global values, specified in the stream header / extradata or used
128
     * all over.
129
     * @{
130
     */
131
    GetBitContext gb;             ///< packet bitreader. During decoder init,
132
                                  ///< it contains the extradata from the
133
                                  ///< demuxer. During decoding, it contains
134
                                  ///< packet data.
135
    int8_t vbm_tree[25];          ///< converts VLC codes to frame type
136

    
137
    int spillover_bitsize;        ///< number of bits used to specify
138
                                  ///< #spillover_nbits in the packet header
139
                                  ///< = ceil(log2(ctx->block_align << 3))
140
    int history_nsamples;         ///< number of samples in history for signal
141
                                  ///< prediction (through ACB)
142

    
143
    int do_apf;                   ///< whether to apply the averaged
144
                                  ///< projection filter (APF)
145

    
146
    int lsps;                     ///< number of LSPs per frame [10 or 16]
147
    int lsp_q_mode;               ///< defines quantizer defaults [0, 1]
148
    int lsp_def_mode;             ///< defines different sets of LSP defaults
149
                                  ///< [0, 1]
150
    int frame_lsp_bitsize;        ///< size (in bits) of LSPs, when encoded
151
                                  ///< per-frame (independent coding)
152
    int sframe_lsp_bitsize;       ///< size (in bits) of LSPs, when encoded
153
                                  ///< per superframe (residual coding)
154

    
155
    int min_pitch_val;            ///< base value for pitch parsing code
156
    int max_pitch_val;            ///< max value + 1 for pitch parsing
157
    int pitch_nbits;              ///< number of bits used to specify the
158
                                  ///< pitch value in the frame header
159
    int block_pitch_nbits;        ///< number of bits used to specify the
160
                                  ///< first block's pitch value
161
    int block_pitch_range;        ///< range of the block pitch
162
    int block_delta_pitch_nbits;  ///< number of bits used to specify the
163
                                  ///< delta pitch between this and the last
164
                                  ///< block's pitch value, used in all but
165
                                  ///< first block
166
    int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
167
                                  ///< from -this to +this-1)
168
    uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
169
                                  ///< conversion
170

    
171
    /**
172
     * @}
173
     * @defgroup struct_packet Packet values
174
     * Packet values, specified in the packet header or related to a packet.
175
     * A packet is considered to be a single unit of data provided to this
176
     * decoder by the demuxer.
177
     * @{
178
     */
179
    int spillover_nbits;          ///< number of bits of the previous packet's
180
                                  ///< last superframe preceding this
181
                                  ///< packet's first full superframe (useful
182
                                  ///< for re-synchronization also)
183
    int has_residual_lsps;        ///< if set, superframes contain one set of
184
                                  ///< LSPs that cover all frames, encoded as
185
                                  ///< independent and residual LSPs; if not
186
                                  ///< set, each frame contains its own, fully
187
                                  ///< independent, LSPs
188
    int skip_bits_next;           ///< number of bits to skip at the next call
189
                                  ///< to #wmavoice_decode_packet() (since
190
                                  ///< they're part of the previous superframe)
191

    
192
    uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + FF_INPUT_BUFFER_PADDING_SIZE];
193
                                  ///< cache for superframe data split over
194
                                  ///< multiple packets
195
    int sframe_cache_size;        ///< set to >0 if we have data from an
196
                                  ///< (incomplete) superframe from a previous
197
                                  ///< packet that spilled over in the current
198
                                  ///< packet; specifies the amount of bits in
199
                                  ///< #sframe_cache
200
    PutBitContext pb;             ///< bitstream writer for #sframe_cache
201

    
202
    /**
203
     * @}
204
     * @defgroup struct_frame Frame and superframe values
205
     * Superframe and frame data - these can change from frame to frame,
206
     * although some of them do in that case serve as a cache / history for
207
     * the next frame or superframe.
208
     * @{
209
     */
210
    double prev_lsps[MAX_LSPS];   ///< LSPs of the last frame of the previous
211
                                  ///< superframe
212
    int last_pitch_val;           ///< pitch value of the previous frame
213
    int last_acb_type;            ///< frame type [0-2] of the previous frame
214
    int pitch_diff_sh16;          ///< ((cur_pitch_val - #last_pitch_val)
215
                                  ///< << 16) / #MAX_FRAMESIZE
216
    float silence_gain;           ///< set for use in blocks if #ACB_TYPE_NONE
217

    
218
    int aw_idx_is_ext;            ///< whether the AW index was encoded in
219
                                  ///< 8 bits (instead of 6)
220
    int aw_pulse_range;           ///< the range over which #aw_pulse_set1()
221
                                  ///< can apply the pulse, relative to the
222
                                  ///< value in aw_first_pulse_off. The exact
223
                                  ///< position of the first AW-pulse is within
224
                                  ///< [pulse_off, pulse_off + this], and
225
                                  ///< depends on bitstream values; [16 or 24]
226
    int aw_n_pulses[2];           ///< number of AW-pulses in each block; note
227
                                  ///< that this number can be negative (in
228
                                  ///< which case it basically means "zero")
229
    int aw_first_pulse_off[2];    ///< index of first sample to which to
230
                                  ///< apply AW-pulses, or -0xff if unset
231
    int aw_next_pulse_off_cache;  ///< the position (relative to start of the
232
                                  ///< second block) at which pulses should
233
                                  ///< start to be positioned, serves as a
234
                                  ///< cache for pitch-adaptive window pulses
235
                                  ///< between blocks
236

    
237
    int frame_cntr;               ///< current frame index [0 - 0xFFFE]; is
238
                                  ///< only used for comfort noise in #pRNG()
239
    float gain_pred_err[6];       ///< cache for gain prediction
240
    float excitation_history[MAX_SIGNAL_HISTORY];
241
                                  ///< cache of the signal of previous
242
                                  ///< superframes, used as a history for
243
                                  ///< signal generation
244
    float synth_history[MAX_LSPS]; ///< see #excitation_history
245
    /**
246
     * @}
247
     */
248
} WMAVoiceContext;
249

    
250
/**
251
 * Sets up the variable bit mode (VBM) tree from container extradata.
252
 * @param gb bit I/O context.
253
 *           The bit context (s->gb) should be loaded with byte 23-46 of the
254
 *           container extradata (i.e. the ones containing the VBM tree).
255
 * @param vbm_tree pointer to array to which the decoded VBM tree will be
256
 *                 written.
257
 * @return 0 on success, <0 on error.
258
 */
259
static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
{
    /* code length (in bits) of each of the 22 frame type VLC entries */
    static const uint8_t bits[] = {
         2,  2,  2,  4,  4,  4,
         6,  6,  6,  8,  8,  8,
        10, 10, 10, 12, 12, 12,
        14, 14, 14, 14
    };
    /* VLC codewords matching the lengths in bits[] */
    static const uint16_t codes[] = {
          0x0000, 0x0001, 0x0002,        //              00/01/10
          0x000c, 0x000d, 0x000e,        //           11+00/01/10
          0x003c, 0x003d, 0x003e,        //         1111+00/01/10
          0x00fc, 0x00fd, 0x00fe,        //       111111+00/01/10
          0x03fc, 0x03fd, 0x03fe,        //     11111111+00/01/10
          0x0ffc, 0x0ffd, 0x0ffe,        //   1111111111+00/01/10
          0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
    };
    int cntr[8], n, res;

    /* vbm_tree is a function parameter and therefore has pointer type, so
     * sizeof(vbm_tree) would be the size of a pointer rather than of the
     * 25-entry array; spell out the element count so that every slot is
     * initialized to 0xff (-1 as int8_t). */
    memset(vbm_tree, 0xff, 25 * sizeof(vbm_tree[0]));
    memset(cntr,     0,    sizeof(cntr));

    /* each of the 17 frame types is assigned to one of 8 groups by a 3-bit
     * value; cntr[] tracks how many entries each group already holds */
    for (n = 0; n < 17; n++) {
        res = get_bits(gb, 3);
        if (cntr[res] > 3) // should be >= 3 + (res == 7))
            return -1;
        vbm_tree[res * 3 + cntr[res]++] = n;
    }
    INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
                    bits, 1, 1, codes, 2, 2, 132);
    return 0;
}
290

    
291
/**
292
 * Set up decoder with parameters from demuxer (extradata etc.).
293
 */
294
static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
{
    WMAVoiceContext *s = ctx->priv_data;
    int idx, flags, pitch_range, use_lsp16;

    /**
     * Extradata layout:
     * - byte  0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
     * - byte 19-22: flags field (little-endian; decoded below),
     * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
     *               rest is 0).
     * The code below reads the flags as a 32-bit LE value at offset 18 and
     * starts the VBM tree bit reader at offset 22.
     */
    if (ctx->extradata_size != 46) {
        av_log(ctx, AV_LOG_ERROR,
               "Invalid extradata size %d (should be 46)\n",
               ctx->extradata_size);
        return -1;
    }

    /* unpack the flags field */
    flags                = AV_RL32(ctx->extradata + 18);
    use_lsp16            = flags & 0x1000;
    s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
    s->do_apf            = flags & 0x1;
    s->lsp_q_mode        = (flags & 0x2000) != 0;
    s->lsp_def_mode      = (flags & 0x4000) != 0;

    /* LSP count and the matching per-frame / per-superframe bit sizes */
    s->lsps               = use_lsp16 ? 16 : 10;
    s->frame_lsp_bitsize  = use_lsp16 ? 34 : 24;
    s->sframe_lsp_bitsize = use_lsp16 ? 60 : 48;

    /* seed the LSP history with values evenly spaced over (0, pi) */
    for (idx = 0; idx < s->lsps; idx++)
        s->prev_lsps[idx] = M_PI * (idx + 1.0) / (s->lsps + 1.0);

    /* parse the VBM tree from the tail of the extradata */
    init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
    if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
        av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
        return -1;
    }

    /* pitch limits are derived from the sample rate */
    s->min_pitch_val    = ((ctx->sample_rate << 8)      /  400 + 50) >> 8;
    s->max_pitch_val    = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
    pitch_range         = s->max_pitch_val - s->min_pitch_val;
    s->pitch_nbits      = av_ceil_log2(pitch_range);
    s->last_pitch_val   = 40;
    s->last_acb_type    = ACB_TYPE_NONE;
    s->history_nsamples = s->max_pitch_val + 8;

    /* reject sample rates whose pitch range does not fit our buffers */
    if (s->min_pitch_val < 1 || s->history_nsamples > MAX_SIGNAL_HISTORY) {
        int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
            max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;

        av_log(ctx, AV_LOG_ERROR,
               "Unsupported samplerate %d (min=%d, max=%d)\n",
               ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz

        return -1;
    }

    /* per-block pitch coding parameters */
    s->block_conv_table[0]      = s->min_pitch_val;
    s->block_conv_table[1]      = (pitch_range * 25) >> 6;
    s->block_conv_table[2]      = (pitch_range * 44) >> 6;
    s->block_conv_table[3]      = s->max_pitch_val - 1;
    s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
    s->block_delta_pitch_nbits  = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
    s->block_pitch_range        = s->block_conv_table[2] +
                                  s->block_conv_table[3] + 1 +
                                  2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
    s->block_pitch_nbits        = av_ceil_log2(s->block_pitch_range);

    ctx->sample_fmt             = SAMPLE_FMT_FLT;

    return 0;
}
371

    
372
/**
373
 * Dequantize LSPs
374
 * @param lsps output pointer to the array that will hold the LSPs
375
 * @param num number of LSPs to be dequantized
376
 * @param values quantized values, contains n_stages values
377
 * @param sizes range (i.e. max value) of each quantized value
378
 * @param n_stages number of dequantization runs
379
 * @param table dequantization table to be used
380
 * @param mul_q LSF multiplier
381
 * @param base_q base (lowest) LSF values
382
 */
383
static void dequant_lsps(double *lsps, int num,
                         const uint16_t *values,
                         const uint16_t *sizes,
                         int n_stages, const uint8_t *table,
                         const double *mul_q,
                         const double *base_q)
{
    const uint8_t *stage_tab = table; /* start of the current stage's table */
    int stage, i;

    /* the output accumulates the contribution of every stage */
    memset(lsps, 0, num * sizeof(*lsps));

    for (stage = 0; stage < n_stages; stage++) {
        /* row selected by this stage's quantized value; num entries long */
        const uint8_t *row  = &stage_tab[values[stage] * num];
        const double offset = base_q[stage];
        const double scale  = mul_q[stage];

        for (i = 0; i < num; i++)
            lsps[i] += offset + scale * row[i];

        /* skip over all sizes[stage] rows to reach the next stage's table */
        stage_tab += sizes[stage] * num;
    }
}
403

    
404
/**
405
 * @defgroup lsp_dequant LSP dequantization routines
406
 * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
407
 * @note we assume enough bits are available, caller should check.
408
 * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
409
 * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
410
 * @{
411
 */
412
/**
413
 * Parse 10 independently-coded LSPs.
414
 */
415
static void dequant_lsp10i(GetBitContext *gb, double *lsps)
{
    /* bit width of each of the four codebook indices (24 bits total) */
    static const uint8_t idx_bits[4] = { 8, 6, 5, 5 };
    static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
    static const double mul_lsf[4] = {
        5.2187144800e-3,    1.4626986422e-3,
        9.6179549166e-4,    1.1325736225e-3
    };
    static const double base_lsf[4] = {
        M_PI * -2.15522e-1, M_PI * -6.1646e-2,
        M_PI * -3.3486e-2,  M_PI * -5.7408e-2
    };
    uint16_t v[4];
    int stage;

    /* read the indices in bitstream order */
    for (stage = 0; stage < 4; stage++)
        v[stage] = get_bits(gb, idx_bits[stage]);

    /* sum the four stages into all 10 LSPs */
    dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
                 mul_lsf, base_lsf);
}
436

    
437
/**
438
 * Parse 10 independently-coded LSPs, and then derive the tables to
439
 * generate LSPs for the other frames from them (residual coding).
440
 */
441
static void dequant_lsp10r(GetBitContext *gb,
                           double *i_lsps, const double *old,
                           double *a1, double *a2, int q_mode)
{
    /* bit width of each of the three residual codebook indices */
    static const uint8_t idx_bits[3] = { 7, 6, 6 };
    static const uint16_t vec_sizes[3] = { 128, 64, 64 };
    static const double mul_lsf[3] = {
        2.5807601174e-3,    1.2354460219e-3,   1.1763821673e-3
    };
    static const double base_lsf[3] = {
        M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
    };
    /* q_mode selects which interpolation coefficient table is used */
    const float (*ipol_tab)[2][10] = q_mode ?
        wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a;
    const float (*coefs)[10];
    uint16_t interpol, v[3];
    int k;

    /* the independently-coded LSPs come first in the bitstream */
    dequant_lsp10i(gb, i_lsps);

    /* then 5 bits of interpolation index and the three residual indices */
    interpol = get_bits(gb, 5);
    for (k = 0; k < 3; k++)
        v[k] = get_bits(gb, idx_bits[k]);

    /* a1 holds two sets of 10 values, each a weighted blend of the
     * previous LSPs (old) and the freshly decoded ones (i_lsps) */
    coefs = ipol_tab[interpol];
    for (k = 0; k < 10; k++) {
        double delta = old[k] - i_lsps[k];

        a1[k]        = coefs[0][k] * delta + i_lsps[k];
        a1[10 + k]   = coefs[1][k] * delta + i_lsps[k];
    }

    /* a2 receives 20 dequantized residual values */
    dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
                 mul_lsf, base_lsf);
}
473

    
474
/**
475
 * Parse 16 independently-coded LSPs.
476
 */
477
static void dequant_lsp16i(GetBitContext *gb, double *lsps)
{
    /* bit width of each of the five codebook indices (34 bits total) */
    static const uint8_t idx_bits[5] = { 8, 6, 7, 6, 7 };
    static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
    static const double mul_lsf[5] = {
        3.3439586280e-3,    6.9908173703e-4,
        3.3216608306e-3,    1.0334960326e-3,
        3.1899104283e-3
    };
    static const double base_lsf[5] = {
        M_PI * -1.27576e-1, M_PI * -2.4292e-2,
        M_PI * -1.28094e-1, M_PI * -3.2128e-2,
        M_PI * -1.29816e-1
    };
    uint16_t v[5];
    int k;

    /* read the indices in bitstream order */
    for (k = 0; k < 5; k++)
        v[k] = get_bits(gb, idx_bits[k]);

    /* LSPs 0-4: two stages (indices 0 and 1) */
    dequant_lsps(lsps,      5, v,     vec_sizes,     2,
                 wmavoice_dq_lsp16i1, mul_lsf,       base_lsf);
    /* LSPs 5-9: two stages (indices 2 and 3) */
    dequant_lsps(lsps + 5,  5, v + 2, vec_sizes + 2, 2,
                 wmavoice_dq_lsp16i2, mul_lsf + 2,   base_lsf + 2);
    /* LSPs 10-15: a single stage (index 4) */
    dequant_lsps(lsps + 10, 6, v + 4, vec_sizes + 4, 1,
                 wmavoice_dq_lsp16i3, mul_lsf + 4,   base_lsf + 4);
}
505

    
506
/**
507
 * Parse 16 independently-coded LSPs, and then derive the tables to
508
 * generate LSPs for the other frames from them (residual coding).
509
 */
510
static void dequant_lsp16r(GetBitContext *gb,
                           double *i_lsps, const double *old,
                           double *a1, double *a2, int q_mode)
{
    static const uint16_t vec_sizes[3] = { 128, 128, 128 };
    static const double mul_lsf[3] = {
        1.2232979501e-3,   1.4062241527e-3,   1.6114744851e-3
    };
    static const double base_lsf[3] = {
        M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
    };
    /* q_mode selects which interpolation coefficient table is used */
    const float (*ipol_tab)[2][16] = q_mode ?
        wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a;
    const float (*coefs)[16];
    uint16_t interpol, v[3];
    int k;

    /* the independently-coded LSPs come first in the bitstream */
    dequant_lsp16i(gb, i_lsps);

    /* then 5 bits of interpolation index and three 7-bit residual indices */
    interpol = get_bits(gb, 5);
    for (k = 0; k < 3; k++)
        v[k] = get_bits(gb, 7);

    /* a1 holds two sets of 16 values, each a weighted blend of the
     * previous LSPs (old) and the freshly decoded ones (i_lsps) */
    coefs = ipol_tab[interpol];
    for (k = 0; k < 16; k++) {
        double delta = old[k] - i_lsps[k];

        a1[k]        = coefs[0][k] * delta + i_lsps[k];
        a1[16 + k]   = coefs[1][k] * delta + i_lsps[k];
    }

    /* a2 receives 32 residual values from three single-stage codebooks,
     * covering 10, 10 and 12 entries respectively */
    dequant_lsps(a2,      10, v,     vec_sizes,     1,
                 wmavoice_dq_lsp16r1, mul_lsf,      base_lsf);
    dequant_lsps(a2 + 10, 10, v + 1, vec_sizes + 1, 1,
                 wmavoice_dq_lsp16r2, mul_lsf + 1,  base_lsf + 1);
    dequant_lsps(a2 + 20, 12, v + 2, vec_sizes + 2, 1,
                 wmavoice_dq_lsp16r3, mul_lsf + 2,  base_lsf + 2);
}
546

    
547
/**
548
 * @}
549
 * @defgroup aw Pitch-adaptive window coding functions
550
 * The next few functions are for pitch-adaptive window coding.
551
 * @{
552
 */
553
/**
554
 * Parse the offset of the first pitch-adaptive window pulses, and
555
 * the distribution of pulses between the two blocks in this frame.
556
 * @param s WMA Voice decoding context private data
557
 * @param gb bit I/O context
558
 * @param pitch pitch for each block in this frame
559
 */
560
static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
                            const int *pitch)
{
    static const int16_t start_offset[94] = {
        -11,  -9,  -7,  -5,  -3,  -1,   1,   3,   5,   7,   9,  11,
         13,  15,  18,  17,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  35,  37,  39,  41,  43,
         45,  47,  49,  51,  53,  55,  57,  59,  61,  63,  65,  67,
         69,  71,  73,  75,  77,  79,  81,  83,  85,  87,  89,  91,
         93,  95,  97,  99, 101, 103, 105, 107, 109, 111, 113, 115,
        117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
        141, 143, 145, 147, 149, 151, 153, 155, 157, 159
    };
    int code, first, offset;

    /* position of pulse: a 6-bit code, where values from 54 upwards are
     * extended by 2 more bits (resulting index range is 0-93) */
    code             = get_bits(gb, 6);
    s->aw_idx_is_ext = code >= 54;
    if (s->aw_idx_is_ext)
        code += (code - 54) * 3 + get_bits(gb, 2);
    first = start_offset[code];

    /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
     * the distribution of the pulses in each block contained in this frame. */
    s->aw_pulse_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;

    /* step a negative start offset forward by whole pitch periods */
    offset = first;
    while (offset < 0)
        offset += pitch[0];

    s->aw_n_pulses[0]        = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
    s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
    offset                  += s->aw_n_pulses[0] * pitch[0];
    s->aw_n_pulses[1]        = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
    s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;

    /* the remaining correction only applies when the pulse train starts
     * before the second block */
    if (first >= MAX_FRAMESIZE / 2)
        return;

    /* if continuing from a position before the block, reset position to
     * start of block (when corrected for the range over which it can be
     * spread in aw_pulse_set1()). */
    while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
        s->aw_first_pulse_off[1] -= pitch[1];

    if (first < 0) {
        while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
            s->aw_first_pulse_off[0] -= pitch[0];
    }
}
603

    
604
/**
605
 * Apply second set of pitch-adaptive window pulses.
606
 * @param s WMA Voice decoding context private data
607
 * @param gb bit I/O context
608
 * @param block_idx block index in frame [0, 1]
609
 * @param fcb structure containing fixed codebook vector info
610
 */
611
static void aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
                          int block_idx, AMRFixed *fcb)
{
    /* Backing store for the use mask: 2 words of leading padding, 5 words
     * that are actually used, 2 words of trailing padding. The leading
     * padding is needed because pulse_off (and thus idx in the exclusion
     * loop below) can be as low as 1 - s->aw_pulse_range, i.e. negative,
     * which makes (idx >> 4) equal to -1 or -2. */
    uint16_t use_mask_mem[9];
    uint16_t *use_mask = use_mask_mem + 2;
    /* in this function, idx is the index in the 80-bit (+ padding) use_mask
     * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
     * of idx are the position of the bit within a particular item in the
     * array (0 being the most significant bit, and 15 being the least
     * significant bit), and the remainder (>> 4) is the index in the
     * use_mask[]-array. This is faster and uses less memory than using a
     * 80-byte/80-int array. */
    int pulse_off = s->aw_first_pulse_off[block_idx],
        pulse_start, n, idx, range, aidx, start_off = 0;

    /* set offset of first pulse to within this block */
    if (s->aw_n_pulses[block_idx] > 0)
        while (pulse_off + s->aw_pulse_range < 1)
            pulse_off += fcb->pitch_lag;

    /* find range per pulse */
    if (s->aw_n_pulses[0] > 0) {
        if (block_idx == 0) {
            range = 32;
        } else /* block_idx = 1 */ {
            range = 8;
            if (s->aw_n_pulses[block_idx] > 0)
                pulse_off = s->aw_next_pulse_off_cache;
        }
    } else
        range = 16;
    pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;

    /* aw_pulse_set1() already applies pulses around pulse_off (to be exact,
     * in the range of [pulse_off, pulse_off + s->aw_pulse_range]), and thus
     * we exclude that range from being pulsed again in this function. */
    memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
    memset( use_mask,   -1, 5 * sizeof(use_mask[0]));
    memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
    if (s->aw_n_pulses[block_idx] > 0)
        for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
            int excl_range         = s->aw_pulse_range; // always 16 or 24
            uint16_t *use_mask_ptr = &use_mask[idx >> 4];
            int first_sh           = 16 - (idx & 15);
            /* clear the tail of the first word, then up to two more words */
            *use_mask_ptr++       &= 0xFFFF << first_sh;
            excl_range            -= first_sh;
            if (excl_range >= 16) {
                *use_mask_ptr++    = 0;
                *use_mask_ptr     &= 0xFFFF >> (excl_range - 16);
            } else
                *use_mask_ptr     &= 0xFFFF >> excl_range;
        }

    /* find the 'aidx'th offset that is not excluded */
    aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
    for (n = 0; n <= aidx; pulse_start++) {
        for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
        if (idx >= MAX_FRAMESIZE / 2) { // find from zero
            if (use_mask[0])      idx = 0x0F;
            else if (use_mask[1]) idx = 0x1F;
            else if (use_mask[2]) idx = 0x2F;
            else if (use_mask[3]) idx = 0x3F;
            else if (use_mask[4]) idx = 0x4F;
            else                  return; // no free position left
            idx -= av_log2_16bit(use_mask[idx >> 4]);
        }
        if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
            /* position is free: claim it and count it */
            use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
            n++;
            start_off = idx;
        }
    }

    /* add a single pulse at the selected position; 1 bit gives the sign */
    fcb->x[fcb->n] = start_off;
    fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
    fcb->n++;

    /* set offset for next block, relative to start of that block */
    n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
    s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
}
690

    
691
/**
692
 * Apply first set of pitch-adaptive window pulses.
693
 * @param s WMA Voice decoding context private data
694
 * @param gb bit I/O context
695
 * @param block_idx block index in frame [0, 1]
696
 * @param fcb storage location for fixed codebook pulse info
697
 */
698
static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
699
                          int block_idx, AMRFixed *fcb)
700
{
701
    int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
702
    float v;
703

    
704
    if (s->aw_n_pulses[block_idx] > 0) {
705
        int n, v_mask, i_mask, sh, n_pulses;
706

    
707
        if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
708
            n_pulses = 3;
709
            v_mask   = 8;
710
            i_mask   = 7;
711
            sh       = 4;
712
        } else { // 4 pulses, 1:sign + 2:index each
713
            n_pulses = 4;
714
            v_mask   = 4;
715
            i_mask   = 3;
716
            sh       = 3;
717
        }
718

    
719
        for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
720
            fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
721
            fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
722
                                 s->aw_first_pulse_off[block_idx];
723
            while (fcb->x[fcb->n] < 0)
724
                fcb->x[fcb->n] += fcb->pitch_lag;
725
            if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
726
                fcb->n++;
727
        }
728
    } else {
729
        int num2 = (val & 0x1FF) >> 1, delta, idx;
730

    
731
        if (num2 < 1 * 79)      { delta = 1; idx = num2 + 1; }
732
        else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
733
        else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
734
        else                    { delta = 7; idx = num2 + 1 - 3 * 75; }
735
        v = (val & 0x200) ? -1.0 : 1.0;
736

    
737
        fcb->no_repeat_mask |= 3 << fcb->n;
738
        fcb->x[fcb->n]       = idx - delta;
739
        fcb->y[fcb->n]       = v;
740
        fcb->x[fcb->n + 1]   = idx;
741
        fcb->y[fcb->n + 1]   = (val & 1) ? -v : v;
742
        fcb->n              += 2;
743
    }
744
}
745

    
746
/**
 * @}
 *
 * Generate a pseudo-random number from frame_cntr and block_num, which will
 * lie in the range [0, 1000 - block_size) (so it can be used as an index in
 * a table of size 1000 of which you want to read block_size entries).
 *
 * @param frame_cntr current frame number
 * @param block_num current block index
 * @param block_size amount of entries we want to read from a table
 *                   that has 1000 entries
 * @return a (non-)random number, the remainder of a division by
 *         (1000 - block_size).
 */
static int pRNG(int frame_cntr, int block_num, int block_size)
{
    /* Lookup table to simplify the calculation of the quotient:
     *   y = (x % 9) * 5 + 6;
     *   z = (49995 * x) / y;
     * Since y only has 9 possible values, the division can be replaced by
     * a LUT and FASTDIV-style multiplications. For each value of y, z is
     * rewritten as:
     *   z = x * (49995 / y) + x * ((49995 % y) / y)
     * Each row of the table belongs to one value of y; the first number
     * is 49995 / y and the second is the FASTDIV variant of
     * 49995 % y / y. */
    static const unsigned int div_tbl[9][2] = {
        { 8332,  3 * 715827883U }, // y =  6
        { 4545,  0 * 390451573U }, // y = 11
        { 3124, 11 * 268435456U }, // y = 16
        { 2380, 15 * 204522253U }, // y = 21
        { 1922, 23 * 165191050U }, // y = 26
        { 1612, 23 * 138547333U }, // y = 31
        { 1388, 27 * 119304648U }, // y = 36
        { 1219, 16 * 104755300U }, // y = 41
        { 1086, 39 *  93368855U }  // y = 46
    };
    unsigned int seed, row, quot;

    seed = MUL16(block_num, 1877) + frame_cntr;
    /* A single subtraction acts as a modulo (%) here, assuming
     * block_num <= 8 and frame_cntr < 0xFFFF, in which case the seed is at
     * most 8*1877+0xFFFE=0x13AA6. */
    if (seed >= 0xFFFF)
        seed -= 0xFFFF;

    row  = seed - 9 * MULH(477218589, seed); // seed % 9
    // quot = seed * 49995 / (row * 5 + 6), truncated to 16 bits
    quot = (uint16_t) (seed * div_tbl[row][0] + UMULH(seed, div_tbl[row][1]));

    return quot % (1000 - block_size);
}
790

    
791
/**
792
 * Parse hardcoded signal for a single block.
793
 * @note see #synth_block().
794
 */
795
static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
796
                                 int block_idx, int size,
797
                                 const struct frame_type_desc *frame_desc,
798
                                 float *excitation)
799
{
800
    float gain;
801
    int n, r_idx;
802

    
803
    assert(size <= MAX_FRAMESIZE);
804

    
805
    /* Set the offset from which we start reading wmavoice_std_codebook */
806
    if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
807
        r_idx = pRNG(s->frame_cntr, block_idx, size);
808
        gain  = s->silence_gain;
809
    } else /* FCB_TYPE_HARDCODED */ {
810
        r_idx = get_bits(gb, 8);
811
        gain  = wmavoice_gain_universal[get_bits(gb, 6)];
812
    }
813

    
814
    /* Clear gain prediction parameters */
815
    memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
816

    
817
    /* Apply gain to hardcoded codebook and use that as excitation signal */
818
    for (n = 0; n < size; n++)
819
        excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
820
}
821

    
822
/**
823
 * Parse FCB/ACB signal for a single block.
824
 * @note see #synth_block().
825
 */
826
static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
827
                                int block_idx, int size,
828
                                int block_pitch_sh2,
829
                                const struct frame_type_desc *frame_desc,
830
                                float *excitation)
831
{
832
    static const float gain_coeff[6] = {
833
        0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
834
    };
835
    float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
836
    int n, idx, gain_weight;
837
    AMRFixed fcb;
838

    
839
    assert(size <= MAX_FRAMESIZE / 2);
840
    memset(pulses, 0, sizeof(*pulses) * size);
841

    
842
    fcb.pitch_lag      = block_pitch_sh2 >> 2;
843
    fcb.pitch_fac      = 1.0;
844
    fcb.no_repeat_mask = 0;
845
    fcb.n              = 0;
846

    
847
    /* For the other frame types, this is where we apply the innovation
848
     * (fixed) codebook pulses of the speech signal. */
849
    if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
850
        aw_pulse_set1(s, gb, block_idx, &fcb);
851
        aw_pulse_set2(s, gb, block_idx, &fcb);
852
    } else /* FCB_TYPE_EXC_PULSES */ {
853
        int offset_nbits = 5 - frame_desc->log_n_blocks;
854

    
855
        fcb.no_repeat_mask = -1;
856
        /* similar to ff_decode_10_pulses_35bits(), but with single pulses
857
         * (instead of double) for a subset of pulses */
858
        for (n = 0; n < 5; n++) {
859
            float sign;
860
            int pos1, pos2;
861

    
862
            sign           = get_bits1(gb) ? 1.0 : -1.0;
863
            pos1           = get_bits(gb, offset_nbits);
864
            fcb.x[fcb.n]   = n + 5 * pos1;
865
            fcb.y[fcb.n++] = sign;
866
            if (n < frame_desc->dbl_pulses) {
867
                pos2           = get_bits(gb, offset_nbits);
868
                fcb.x[fcb.n]   = n + 5 * pos2;
869
                fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
870
            }
871
        }
872
    }
873
    ff_set_fixed_vector(pulses, &fcb, 1.0, size);
874

    
875
    /* Calculate gain for adaptive & fixed codebook signal.
876
     * see ff_amr_set_fixed_gain(). */
877
    idx = get_bits(gb, 7);
878
    fcb_gain = expf(ff_dot_productf(s->gain_pred_err, gain_coeff, 6) -
879
                    5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
880
    acb_gain = wmavoice_gain_codebook_acb[idx];
881
    pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
882
                        -2.9957322736 /* log(0.05) */,
883
                         1.6094379124 /* log(5.0)  */);
884

    
885
    gain_weight = 8 >> frame_desc->log_n_blocks;
886
    memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
887
            sizeof(*s->gain_pred_err) * (6 - gain_weight));
888
    for (n = 0; n < gain_weight; n++)
889
        s->gain_pred_err[n] = pred_err;
890

    
891
    /* Calculation of adaptive codebook */
892
    if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
893
        int len;
894
        for (n = 0; n < size; n += len) {
895
            int next_idx_sh16;
896
            int abs_idx    = block_idx * size + n;
897
            int pitch_sh16 = (s->last_pitch_val << 16) +
898
                             s->pitch_diff_sh16 * abs_idx;
899
            int pitch      = (pitch_sh16 + 0x6FFF) >> 16;
900
            int idx_sh16   = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
901
            idx            = idx_sh16 >> 16;
902
            if (s->pitch_diff_sh16) {
903
                if (s->pitch_diff_sh16 > 0) {
904
                    next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
905
                } else
906
                    next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
907
                len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
908
                              1, size - n);
909
            } else
910
                len = size;
911

    
912
            ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
913
                                  wmavoice_ipol1_coeffs, 17,
914
                                  idx, 9, len);
915
        }
916
    } else /* ACB_TYPE_HAMMING */ {
917
        int block_pitch = block_pitch_sh2 >> 2;
918
        idx             = block_pitch_sh2 & 3;
919
        if (idx) {
920
            ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
921
                                  wmavoice_ipol2_coeffs, 4,
922
                                  idx, 8, size);
923
        } else
924
            av_memcpy_backptr(excitation, sizeof(float) * block_pitch,
925
                              sizeof(float) * size);
926
    }
927

    
928
    /* Interpolate ACB/FCB and use as excitation signal */
929
    ff_weighted_vector_sumf(excitation, excitation, pulses,
930
                            acb_gain, fcb_gain, size);
931
}
932

    
933
/**
934
 * Parse data in a single block.
935
 * @note we assume enough bits are available, caller should check.
936
 *
937
 * @param s WMA Voice decoding context private data
938
 * @param gb bit I/O context
939
 * @param block_idx index of the to-be-read block
940
 * @param size amount of samples to be read in this block
941
 * @param block_pitch_sh2 pitch for this block << 2
942
 * @param lsps LSPs for (the end of) this frame
943
 * @param prev_lsps LSPs for the last frame
944
 * @param frame_desc frame type descriptor
945
 * @param excitation target memory for the ACB+FCB interpolated signal
946
 * @param synth target memory for the speech synthesis filter output
947
 * @return 0 on success, <0 on error.
948
 */
949
static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
950
                        int block_idx, int size,
951
                        int block_pitch_sh2,
952
                        const double *lsps, const double *prev_lsps,
953
                        const struct frame_type_desc *frame_desc,
954
                        float *excitation, float *synth)
955
{
956
    double i_lsps[MAX_LSPS];
957
    float lpcs[MAX_LSPS];
958
    float fac;
959
    int n;
960

    
961
    if (frame_desc->acb_type == ACB_TYPE_NONE)
962
        synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
963
    else
964
        synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
965
                            frame_desc, excitation);
966

    
967
    /* convert interpolated LSPs to LPCs */
968
    fac = (block_idx + 0.5) / frame_desc->n_blocks;
969
    for (n = 0; n < s->lsps; n++) // LSF -> LSP
970
        i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
971
    ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
972

    
973
    /* Speech synthesis */
974
    ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
975
}
976

    
977
/**
978
 * Synthesize output samples for a single frame.
979
 * @note we assume enough bits are available, caller should check.
980
 *
981
 * @param ctx WMA Voice decoder context
982
 * @param gb bit I/O context (s->gb or one for cross-packet superframes)
983
 * @param samples pointer to output sample buffer, has space for at least 160
984
 *                samples
985
 * @param lsps LSP array
986
 * @param prev_lsps array of previous frame's LSPs
987
 * @param excitation target buffer for excitation signal
988
 * @param synth target buffer for synthesized speech data
989
 * @return 0 on success, <0 on error.
990
 */
991
static int synth_frame(AVCodecContext *ctx, GetBitContext *gb,
992
                       float *samples,
993
                       const double *lsps, const double *prev_lsps,
994
                       float *excitation, float *synth)
995
{
996
    WMAVoiceContext *s = ctx->priv_data;
997
    int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val;
998
    int pitch[MAX_BLOCKS], last_block_pitch;
999

    
1000
    /* Parse frame type ("frame header"), see frame_descs */
1001
    int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)],
1002
        block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1003

    
1004
    if (bd_idx < 0) {
1005
        av_log(ctx, AV_LOG_ERROR,
1006
               "Invalid frame type VLC code, skipping\n");
1007
        return -1;
1008
    }
1009

    
1010
    /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1011
    if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1012
        /* Pitch is provided per frame, which is interpreted as the pitch of
1013
         * the last sample of the last block of this frame. We can interpolate
1014
         * the pitch of other blocks (and even pitch-per-sample) by gradually
1015
         * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1016
        n_blocks_x2      = frame_descs[bd_idx].n_blocks << 1;
1017
        log_n_blocks_x2  = frame_descs[bd_idx].log_n_blocks + 1;
1018
        cur_pitch_val    = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1019
        cur_pitch_val    = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
1020
        if (s->last_acb_type == ACB_TYPE_NONE ||
1021
            20 * abs(cur_pitch_val - s->last_pitch_val) >
1022
                (cur_pitch_val + s->last_pitch_val))
1023
            s->last_pitch_val = cur_pitch_val;
1024

    
1025
        /* pitch per block */
1026
        for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1027
            int fac = n * 2 + 1;
1028

    
1029
            pitch[n] = (MUL16(fac,                 cur_pitch_val) +
1030
                        MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1031
                        frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1032
        }
1033

    
1034
        /* "pitch-diff-per-sample" for calculation of pitch per sample */
1035
        s->pitch_diff_sh16 =
1036
            ((cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
1037
    }
1038

    
1039
    /* Global gain (if silence) and pitch-adaptive window coordinates */
1040
    switch (frame_descs[bd_idx].fcb_type) {
1041
    case FCB_TYPE_SILENCE:
1042
        s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
1043
        break;
1044
    case FCB_TYPE_AW_PULSES:
1045
        aw_parse_coords(s, gb, pitch);
1046
        break;
1047
    }
1048

    
1049
    for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1050
        int bl_pitch_sh2;
1051

    
1052
        /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1053
        switch (frame_descs[bd_idx].acb_type) {
1054
        case ACB_TYPE_HAMMING: {
1055
            /* Pitch is given per block. Per-block pitches are encoded as an
1056
             * absolute value for the first block, and then delta values
1057
             * relative to this value) for all subsequent blocks. The scale of
1058
             * this pitch value is semi-logaritmic compared to its use in the
1059
             * decoder, so we convert it to normal scale also. */
1060
            int block_pitch,
1061
                t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1062
                t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1063
                t3 =  s->block_conv_table[3] - s->block_conv_table[2] + 1;
1064

    
1065
            if (n == 0) {
1066
                block_pitch = get_bits(gb, s->block_pitch_nbits);
1067
            } else
1068
                block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1069
                                 get_bits(gb, s->block_delta_pitch_nbits);
1070
            /* Convert last_ so that any next delta is within _range */
1071
            last_block_pitch = av_clip(block_pitch,
1072
                                       s->block_delta_pitch_hrange,
1073
                                       s->block_pitch_range -
1074
                                           s->block_delta_pitch_hrange);
1075

    
1076
            /* Convert semi-log-style scale back to normal scale */
1077
            if (block_pitch < t1) {
1078
                bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1079
            } else {
1080
                block_pitch -= t1;
1081
                if (block_pitch < t2) {
1082
                    bl_pitch_sh2 =
1083
                        (s->block_conv_table[1] << 2) + (block_pitch << 1);
1084
                } else {
1085
                    block_pitch -= t2;
1086
                    if (block_pitch < t3) {
1087
                        bl_pitch_sh2 =
1088
                            (s->block_conv_table[2] + block_pitch) << 2;
1089
                    } else
1090
                        bl_pitch_sh2 = s->block_conv_table[3] << 2;
1091
                }
1092
            }
1093
            pitch[n] = bl_pitch_sh2 >> 2;
1094
            break;
1095
        }
1096

    
1097
        case ACB_TYPE_ASYMMETRIC: {
1098
            bl_pitch_sh2 = pitch[n] << 2;
1099
            break;
1100
        }
1101

    
1102
        default: // ACB_TYPE_NONE has no pitch
1103
            bl_pitch_sh2 = 0;
1104
            break;
1105
        }
1106

    
1107
        synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1108
                    lsps, prev_lsps, &frame_descs[bd_idx],
1109
                    &excitation[n * block_nsamples],
1110
                    &synth[n * block_nsamples]);
1111
    }
1112

    
1113
    /* Averaging projection filter, if applicable. Else, just copy samples
1114
     * from synthesis buffer */
1115
    if (s->do_apf) {
1116
        // FIXME this is where APF would take place, currently not implemented
1117
        av_log_missing_feature(ctx, "APF", 0);
1118
        s->do_apf = 0;
1119
    } //else
1120
        for (n = 0; n < 160; n++)
1121
            samples[n] = av_clipf(synth[n], -1.0, 1.0);
1122

    
1123
    /* Cache values for next frame */
1124
    s->frame_cntr++;
1125
    if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1126
    s->last_acb_type = frame_descs[bd_idx].acb_type;
1127
    switch (frame_descs[bd_idx].acb_type) {
1128
    case ACB_TYPE_NONE:
1129
        s->last_pitch_val = 0;
1130
        break;
1131
    case ACB_TYPE_ASYMMETRIC:
1132
        s->last_pitch_val = cur_pitch_val;
1133
        break;
1134
    case ACB_TYPE_HAMMING:
1135
        s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1136
        break;
1137
    }
1138

    
1139
    return 0;
1140
}
1141

    
1142
/**
1143
 * Ensure minimum value for first item, maximum value for last value,
1144
 * proper spacing between each value and proper ordering.
1145
 *
1146
 * @param lsps array of LSPs
1147
 * @param num size of LSP array
1148
 *
1149
 * @note basically a double version of #ff_acelp_reorder_lsf(), might be
1150
 *       useful to put in a generic location later on. Parts are also
1151
 *       present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
1152
 *       which is in float.
1153
 */
1154
static void stabilize_lsps(double *lsps, int num)
1155
{
1156
    int n, m, l;
1157

    
1158
    /* set minimum value for first, maximum value for last and minimum
1159
     * spacing between LSF values.
1160
     * Very similar to ff_set_min_dist_lsf(), but in double. */
1161
    lsps[0]       = FFMAX(lsps[0],       0.0015 * M_PI);
1162
    for (n = 1; n < num; n++)
1163
        lsps[n]   = FFMAX(lsps[n],       lsps[n - 1] + 0.0125 * M_PI);
1164
    lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);
1165

    
1166
    /* reorder (looks like one-time / non-recursed bubblesort).
1167
     * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
1168
    for (n = 1; n < num; n++) {
1169
        if (lsps[n] < lsps[n - 1]) {
1170
            for (m = 1; m < num; m++) {
1171
                double tmp = lsps[m];
1172
                for (l = m - 1; l >= 0; l--) {
1173
                    if (lsps[l] <= tmp) break;
1174
                    lsps[l + 1] = lsps[l];
1175
                }
1176
                lsps[l + 1] = tmp;
1177
            }
1178
            break;
1179
        }
1180
    }
1181
}
1182

    
1183
/**
1184
 * Test if there's enough bits to read 1 superframe.
1185
 *
1186
 * @param orig_gb bit I/O context used for reading. This function
1187
 *                does not modify the state of the bitreader; it
1188
 *                only uses it to copy the current stream position
1189
 * @param s WMA Voice decoding context private data
1190
 * @return -1 if unsupported, 1 on not enough bits or 0 if OK.
1191
 */
1192
static int check_bits_for_superframe(GetBitContext *orig_gb,
1193
                                     WMAVoiceContext *s)
1194
{
1195
    GetBitContext s_gb, *gb = &s_gb;
1196
    int n, need_bits, bd_idx;
1197
    const struct frame_type_desc *frame_desc;
1198

    
1199
    /* initialize a copy */
1200
    init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
1201
    skip_bits_long(gb, get_bits_count(orig_gb));
1202
    assert(get_bits_left(gb) == get_bits_left(orig_gb));
1203

    
1204
    /* superframe header */
1205
    if (get_bits_left(gb) < 14)
1206
        return 1;
1207
    if (!get_bits1(gb))
1208
        return -1;                        // WMAPro-in-WMAVoice superframe
1209
    if (get_bits1(gb)) skip_bits(gb, 12); // number of  samples in superframe
1210
    if (s->has_residual_lsps) {           // residual LSPs (for all frames)
1211
        if (get_bits_left(gb) < s->sframe_lsp_bitsize)
1212
            return 1;
1213
        skip_bits_long(gb, s->sframe_lsp_bitsize);
1214
    }
1215

    
1216
    /* frames */
1217
    for (n = 0; n < MAX_FRAMES; n++) {
1218
        int aw_idx_is_ext = 0;
1219

    
1220
        if (!s->has_residual_lsps) {     // independent LSPs (per-frame)
1221
           if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1;
1222
           skip_bits_long(gb, s->frame_lsp_bitsize);
1223
        }
1224
        bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)];
1225
        if (bd_idx < 0)
1226
            return -1;                   // invalid frame type VLC code
1227
        frame_desc = &frame_descs[bd_idx];
1228
        if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1229
            if (get_bits_left(gb) < s->pitch_nbits)
1230
                return 1;
1231
            skip_bits_long(gb, s->pitch_nbits);
1232
        }
1233
        if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1234
            skip_bits(gb, 8);
1235
        } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1236
            int tmp = get_bits(gb, 6);
1237
            if (tmp >= 0x36) {
1238
                skip_bits(gb, 2);
1239
                aw_idx_is_ext = 1;
1240
            }
1241
        }
1242

    
1243
        /* blocks */
1244
        if (frame_desc->acb_type == ACB_TYPE_HAMMING) {
1245
            need_bits = s->block_pitch_nbits +
1246
                (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits;
1247
        } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1248
            need_bits = 2 * !aw_idx_is_ext;
1249
        } else
1250
            need_bits = 0;
1251
        need_bits += frame_desc->frame_size;
1252
        if (get_bits_left(gb) < need_bits)
1253
            return 1;
1254
        skip_bits_long(gb, need_bits);
1255
    }
1256

    
1257
    return 0;
1258
}
1259

    
1260
/**
1261
 * Synthesize output samples for a single superframe. If we have any data
1262
 * cached in s->sframe_cache, that will be used instead of whatever is loaded
1263
 * in s->gb.
1264
 *
1265
 * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1266
 * to give a total of 480 samples per frame. See #synth_frame() for frame
1267
 * parsing. In addition to 3 frames, superframes can also contain the LSPs
1268
 * (if these are globally specified for all frames (residually); they can
1269
 * also be specified individually per-frame. See the s->has_residual_lsps
1270
 * option), and can specify the number of samples encoded in this superframe
1271
 * (if less than 480), usually used to prevent blanks at track boundaries.
1272
 *
1273
 * @param ctx WMA Voice decoder context
1274
 * @param samples pointer to output buffer for voice samples
1275
 * @param data_size pointer containing the size of #samples on input, and the
1276
 *                  amount of #samples filled on output
1277
 * @return 0 on success, <0 on error or 1 if there was not enough data to
1278
 *         fully parse the superframe
1279
 */
1280
static int synth_superframe(AVCodecContext *ctx,
1281
                            float *samples, int *data_size)
1282
{
1283
    WMAVoiceContext *s = ctx->priv_data;
1284
    GetBitContext *gb = &s->gb, s_gb;
1285
    int n, res, n_samples = 480;
1286
    double lsps[MAX_FRAMES][MAX_LSPS];
1287
    const double *mean_lsf = s->lsps == 16 ?
1288
        wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
1289
    float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1290
    float synth[MAX_LSPS + MAX_SFRAMESIZE];
1291

    
1292
    memcpy(synth,      s->synth_history,
1293
           s->lsps             * sizeof(*synth));
1294
    memcpy(excitation, s->excitation_history,
1295
           s->history_nsamples * sizeof(*excitation));
1296

    
1297
    if (s->sframe_cache_size > 0) {
1298
        gb = &s_gb;
1299
        init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
1300
        s->sframe_cache_size = 0;
1301
    }
1302

    
1303
    if ((res = check_bits_for_superframe(gb, s)) == 1) return 1;
1304

    
1305
    /* First bit is speech/music bit, it differentiates between WMAVoice
1306
     * speech samples (the actual codec) and WMAVoice music samples, which
1307
     * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1308
     * the wild yet. */
1309
    if (!get_bits1(gb)) {
1310
        av_log_missing_feature(ctx, "WMAPro-in-WMAVoice support", 1);
1311
        return -1;
1312
    }
1313

    
1314
    /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1315
    if (get_bits1(gb)) {
1316
        if ((n_samples = get_bits(gb, 12)) > 480) {
1317
            av_log(ctx, AV_LOG_ERROR,
1318
                   "Superframe encodes >480 samples (%d), not allowed\n",
1319
                   n_samples);
1320
            return -1;
1321
        }
1322
    }
1323
    /* Parse LSPs, if global for the superframe (can also be per-frame). */
1324
    if (s->has_residual_lsps) {
1325
        double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1326

    
1327
        for (n = 0; n < s->lsps; n++)
1328
            prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1329

    
1330
        if (s->lsps == 10) {
1331
            dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1332
        } else /* s->lsps == 16 */
1333
            dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1334

    
1335
        for (n = 0; n < s->lsps; n++) {
1336
            lsps[0][n]  = mean_lsf[n] + (a1[n]           - a2[n * 2]);
1337
            lsps[1][n]  = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1338
            lsps[2][n] += mean_lsf[n];
1339
        }
1340
        for (n = 0; n < 3; n++)
1341
            stabilize_lsps(lsps[n], s->lsps);
1342
    }
1343

    
1344
    /* Parse frames, optionally preceeded by per-frame (independent) LSPs. */
1345
    for (n = 0; n < 3; n++) {
1346
        if (!s->has_residual_lsps) {
1347
            int m;
1348

    
1349
            if (s->lsps == 10) {
1350
                dequant_lsp10i(gb, lsps[n]);
1351
            } else /* s->lsps == 16 */
1352
                dequant_lsp16i(gb, lsps[n]);
1353

    
1354
            for (m = 0; m < s->lsps; m++)
1355
                lsps[n][m] += mean_lsf[m];
1356
            stabilize_lsps(lsps[n], s->lsps);
1357
        }
1358

    
1359
        if ((res = synth_frame(ctx, gb,
1360
                               &samples[n * MAX_FRAMESIZE],
1361
                               lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1362
                               &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1363
                               &synth[s->lsps + n * MAX_FRAMESIZE])))
1364
            return res;
1365
    }
1366

    
1367
    /* Statistics? FIXME - we don't check for length, a slight overrun
1368
     * will be caught by internal buffer padding, and anything else
1369
     * will be skipped, not read. */
1370
    if (get_bits1(gb)) {
1371
        res = get_bits(gb, 4);
1372
        skip_bits(gb, 10 * (res + 1));
1373
    }
1374

    
1375
    /* Specify nr. of output samples */
1376
    *data_size = n_samples * sizeof(float);
1377

    
1378
    /* Update history */
1379
    memcpy(s->prev_lsps,           lsps[2],
1380
           s->lsps             * sizeof(*s->prev_lsps));
1381
    memcpy(s->synth_history,      &synth[MAX_SFRAMESIZE],
1382
           s->lsps             * sizeof(*synth));
1383
    memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1384
           s->history_nsamples * sizeof(*excitation));
1385

    
1386
    return 0;
1387
}
1388

    
1389
/**
1390
 * Parse the packet header at the start of each packet (input data to this
1391
 * decoder).
1392
 *
1393
 * @param s WMA Voice decoding context private data
1394
 * @return 1 if not enough bits were available, or 0 on success.
1395
 */
1396
static int parse_packet_header(WMAVoiceContext *s)
1397
{
1398
    GetBitContext *gb = &s->gb;
1399
    unsigned int res;
1400

    
1401
    if (get_bits_left(gb) < 11)
1402
        return 1;
1403
    skip_bits(gb, 4);          // packet sequence number
1404
    s->has_residual_lsps = get_bits1(gb);
1405
    do {
1406
        res = get_bits(gb, 6); // number of superframes per packet
1407
                               // (minus first one if there is spillover)
1408
        if (get_bits_left(gb) < 6 * (res == 0x3F) + s->spillover_bitsize)
1409
            return 1;
1410
    } while (res == 0x3F);
1411
    s->spillover_nbits   = get_bits(gb, s->spillover_bitsize);
1412

    
1413
    return 0;
1414
}
1415

    
1416
/**
1417
 * Copy (unaligned) bits from gb/data/size to pb.
1418
 *
1419
 * @param pb target buffer to copy bits into
1420
 * @param data source buffer to copy bits from
1421
 * @param size size of the source data, in bytes
1422
 * @param gb bit I/O context specifying the current position in the source.
1423
 *           data. This function might use this to align the bit position to
1424
 *           a whole-byte boundary before calling #ff_copy_bits() on aligned
1425
 *           source data
1426
 * @param nbits the amount of bits to copy from source to target
1427
 *
1428
 * @note after calling this function, the current position in the input bit
1429
 *       I/O context is undefined.
1430
 */
1431
static void copy_bits(PutBitContext *pb,
1432
                      const uint8_t *data, int size,
1433
                      GetBitContext *gb, int nbits)
1434
{
1435
    int rmn_bytes, rmn_bits;
1436

    
1437
    rmn_bits = rmn_bytes = get_bits_left(gb);
1438
    if (rmn_bits < nbits)
1439
        return;
1440
    rmn_bits &= 7; rmn_bytes >>= 3;
1441
    if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1442
        put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1443
    ff_copy_bits(pb, data + size - rmn_bytes,
1444
                 FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1445
}
1446

    
1447
/**
1448
 * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1449
 * and we expect that the demuxer / application provides it to us as such
1450
 * (else you'll probably get garbage as output). Every packet has a size of
1451
 * ctx->block_align bytes, starts with a packet header (see
1452
 * #parse_packet_header()), and then a series of superframes. Superframe
1453
 * boundaries may exceed packets, i.e. superframes can split data over
1454
 * multiple (two) packets.
1455
 *
1456
 * For more information about frames, see #synth_superframe().
1457
 */
1458
static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
1459
                                  int *data_size, AVPacket *avpkt)
1460
{
1461
    WMAVoiceContext *s = ctx->priv_data;
1462
    GetBitContext *gb = &s->gb;
1463
    int size, res, pos;
1464

    
1465
    if (*data_size < 480 * sizeof(float)) {
1466
        av_log(ctx, AV_LOG_ERROR,
1467
               "Output buffer too small (%d given - %lu needed)\n",
1468
               *data_size, 480 * sizeof(float));
1469
        return -1;
1470
    }
1471
    *data_size = 0;
1472

    
1473
    /* Packets are sometimes a multiple of ctx->block_align, with a packet
1474
     * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
1475
     * feeds us ASF packets, which may concatenate multiple "codec" packets
1476
     * in a single "muxer" packet, so we artificially emulate that by
1477
     * capping the packet size at ctx->block_align. */
1478
    for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1479
    if (!size)
1480
        return 0;
1481
    init_get_bits(&s->gb, avpkt->data, size << 3);
1482

    
1483
    /* size == ctx->block_align is used to indicate whether we are dealing with
1484
     * a new packet or a packet of which we already read the packet header
1485
     * previously. */
1486
    if (size == ctx->block_align) { // new packet header
1487
        if ((res = parse_packet_header(s)) < 0)
1488
            return res;
1489

    
1490
        /* If the packet header specifies a s->spillover_nbits, then we want
1491
         * to push out all data of the previous packet (+ spillover) before
1492
         * continuing to parse new superframes in the current packet. */
1493
        if (s->spillover_nbits > 0) {
1494
            if (s->sframe_cache_size > 0) {
1495
                int cnt = get_bits_count(gb);
1496
                copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1497
                flush_put_bits(&s->pb);
1498
                s->sframe_cache_size += s->spillover_nbits;
1499
                if ((res = synth_superframe(ctx, data, data_size)) == 0 &&
1500
                    *data_size > 0) {
1501
                    cnt += s->spillover_nbits;
1502
                    s->skip_bits_next = cnt & 7;
1503
                    return cnt >> 3;
1504
                } else
1505
                    skip_bits_long (gb, s->spillover_nbits - cnt +
1506
                                    get_bits_count(gb)); // resync
1507
            } else
1508
                skip_bits_long(gb, s->spillover_nbits);  // resync
1509
        }
1510
    } else if (s->skip_bits_next)
1511
        skip_bits(gb, s->skip_bits_next);
1512

    
1513
    /* Try parsing superframes in current packet */
1514
    s->sframe_cache_size = 0;
1515
    s->skip_bits_next = 0;
1516
    pos = get_bits_left(gb);
1517
    if ((res = synth_superframe(ctx, data, data_size)) < 0) {
1518
        return res;
1519
    } else if (*data_size > 0) {
1520
        int cnt = get_bits_count(gb);
1521
        s->skip_bits_next = cnt & 7;
1522
        return cnt >> 3;
1523
    } else if ((s->sframe_cache_size = pos) > 0) {
1524
        /* rewind bit reader to start of last (incomplete) superframe... */
1525
        init_get_bits(gb, avpkt->data, size << 3);
1526
        skip_bits_long(gb, (size << 3) - pos);
1527
        assert(get_bits_left(gb) == pos);
1528

    
1529
        /* ...and cache it for spillover in next packet */
1530
        init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
1531
        copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
1532
        // FIXME bad - just copy bytes as whole and add use the
1533
        // skip_bits_next field
1534
    }
1535

    
1536
    return size;
1537
}
1538

    
1539
/**
 * Reset the decoder state: forget any cached superframe data and pending
 * bit skip, clear the excitation / synthesis history and the gain
 * prediction errors, and re-seed the previous LSPs.
 *
 * @param ctx codec context whose priv_data is the WMAVoiceContext to reset
 */
static av_cold void wmavoice_flush(AVCodecContext *ctx)
{
    WMAVoiceContext *s = ctx->priv_data;
    int i;

    /* wipe signal history and gain prediction state */
    memset(s->gain_pred_err,      0,
           sizeof(s->gain_pred_err));
    memset(s->synth_history,      0,
           MAX_LSPS           * sizeof(*s->synth_history));
    memset(s->excitation_history, 0,
           MAX_SIGNAL_HISTORY * sizeof(*s->excitation_history));

    /* previous LSPs: s->lsps values evenly spaced inside (0, pi) */
    for (i = 0; i < s->lsps; i++)
        s->prev_lsps[i] = M_PI * (i + 1.0) / (s->lsps + 1.0);

    /* nothing cached from, or to be skipped in, an earlier packet */
    s->skip_bits_next    = 0;
    s->sframe_cache_size = 0;
}
1555

    
1556
AVCodec wmavoice_decoder = {
1557
    "wmavoice",
1558
    AVMEDIA_TYPE_AUDIO,
1559
    CODEC_ID_WMAVOICE,
1560
    sizeof(WMAVoiceContext),
1561
    wmavoice_decode_init,
1562
    NULL,
1563
    NULL,
1564
    wmavoice_decode_packet,
1565
    CODEC_CAP_SUBFRAMES,
1566
    .flush     = wmavoice_flush,
1567
    .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
1568
};