ffmpeg / libavcodec / wmavoice.c @ 2d2b5a14
1
/*
2
 * Windows Media Audio Voice decoder.
3
 * Copyright (c) 2009 Ronald S. Bultje
4
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
 */
21

    
22
/**
23
 * @file
24
 * @brief Windows Media Audio Voice compatible decoder
25
 * @author Ronald S. Bultje <rsbultje@gmail.com>
26
 */
27

    
28
#include <math.h>
29
#include "avcodec.h"
30
#include "get_bits.h"
31
#include "put_bits.h"
32
#include "wmavoice_data.h"
33
#include "celp_math.h"
34
#include "celp_filters.h"
35
#include "acelp_vectors.h"
36
#include "acelp_filters.h"
37
#include "lsp.h"
38
#include "libavutil/lzo.h"
39
#include "dct.h"
40
#include "rdft.h"
41
#include "sinewin.h"
42

    
43
#define MAX_BLOCKS           8   ///< maximum number of blocks per frame
44
#define MAX_LSPS             16  ///< maximum filter order
45
#define MAX_LSPS_ALIGN16     16  ///< same as #MAX_LSPS; needs to be multiple
46
                                 ///< of 16 for ASM input buffer alignment
47
#define MAX_FRAMES           3   ///< maximum number of frames per superframe
48
#define MAX_FRAMESIZE        160 ///< maximum number of samples per frame
49
#define MAX_SIGNAL_HISTORY   416 ///< maximum excitation signal history
50
#define MAX_SFRAMESIZE       (MAX_FRAMESIZE * MAX_FRAMES)
51
                                 ///< maximum number of samples per superframe
52
#define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
53
                                 ///< was split over two packets
54
#define VLC_NBITS            6   ///< number of bits to read per VLC iteration
55

    
56
/**
57
 * Frame type VLC coding.
58
 */
59
static VLC frame_type_vlc;
60

    
61
/**
62
 * Adaptive codebook types.
63
 */
64
enum {
65
    ACB_TYPE_NONE       = 0, ///< no adaptive codebook (only hardcoded fixed)
66
    ACB_TYPE_ASYMMETRIC = 1, ///< adaptive codebook with per-frame pitch, which
67
                             ///< we interpolate to get a per-sample pitch.
68
                             ///< Signal is generated using an asymmetric sinc
69
                             ///< window function
70
                             ///< @note see #wmavoice_ipol1_coeffs
71
    ACB_TYPE_HAMMING    = 2  ///< Per-block pitch with signal generation using
72
                             ///< a Hamming sinc window function
73
                             ///< @note see #wmavoice_ipol2_coeffs
74
};
75

    
76
/**
77
 * Fixed codebook types.
78
 */
79
enum {
80
    FCB_TYPE_SILENCE    = 0, ///< comfort noise during silence
81
                             ///< generated from a hardcoded (fixed) codebook
82
                             ///< with per-frame (low) gain values
83
    FCB_TYPE_HARDCODED  = 1, ///< hardcoded (fixed) codebook with per-block
84
                             ///< gain values
85
    FCB_TYPE_AW_PULSES  = 2, ///< Pitch-adaptive window (AW) pulse signals,
86
                             ///< used in particular for low-bitrate streams
87
    FCB_TYPE_EXC_PULSES = 3, ///< Innovation (fixed) codebook pulse sets in
88
                             ///< combinations of either single pulses or
89
                             ///< pulse pairs
90
};
91

    
92
/**
93
 * Description of frame types.
94
 */
95
static const struct frame_type_desc {
96
    uint8_t n_blocks;     ///< amount of blocks per frame (each block
97
                          ///< contains 160/#n_blocks samples)
98
    uint8_t log_n_blocks; ///< log2(#n_blocks)
99
    uint8_t acb_type;     ///< Adaptive codebook type (ACB_TYPE_*)
100
    uint8_t fcb_type;     ///< Fixed codebook type (FCB_TYPE_*)
101
    uint8_t dbl_pulses;   ///< how many pulse vectors have pulse pairs
102
                          ///< (rather than just one single pulse)
103
                          ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
104
    uint16_t frame_size;  ///< the amount of bits that make up the block
105
                          ///< data (per frame)
106
} frame_descs[17] = {
107
    { 1, 0, ACB_TYPE_NONE,       FCB_TYPE_SILENCE,    0,   0 },
108
    { 2, 1, ACB_TYPE_NONE,       FCB_TYPE_HARDCODED,  0,  28 },
109
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES,  0,  46 },
110
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2,  80 },
111
    { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 104 },
112
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0, 108 },
113
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 132 },
114
    { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 168 },
115
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0,  64 },
116
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2,  80 },
117
    { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 104 },
118
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0, 108 },
119
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2, 132 },
120
    { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 168 },
121
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0, 176 },
122
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2, 208 },
123
    { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 256 }
124
};
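/* Reading guide (illustrative): the last entry above, for instance, describes
 * a frame split into 8 blocks of 160/8 = 20 samples each, using a Hamming
 * windowed adaptive codebook and an excitation-pulse fixed codebook in which
 * 5 of the pulse vectors carry pulse pairs, for 256 bits of block data. */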
125

    
126
/**
127
 * WMA Voice decoding context.
128
 */
129
typedef struct {
130
    /**
131
     * @defgroup struct_global Global values
132
     * Global values, specified in the stream header / extradata or used
133
     * all over.
134
     * @{
135
     */
136
    GetBitContext gb;             ///< packet bitreader. During decoder init,
137
                                  ///< it contains the extradata from the
138
                                  ///< demuxer. During decoding, it contains
139
                                  ///< packet data.
140
    int8_t vbm_tree[25];          ///< converts VLC codes to frame type
141

    
142
    int spillover_bitsize;        ///< number of bits used to specify
143
                                  ///< #spillover_nbits in the packet header
144
                                  ///< = ceil(log2(ctx->block_align << 3))
145
    int history_nsamples;         ///< number of samples in history for signal
146
                                  ///< prediction (through ACB)
147

    
148
    /* postfilter specific values */
149
    int do_apf;                   ///< whether to apply the averaged
150
                                  ///< projection filter (APF)
151
    int denoise_strength;         ///< strength of denoising in Wiener filter
152
                                  ///< [0-11]
153
    int denoise_tilt_corr;        ///< Whether to apply tilt correction to the
154
                                  ///< Wiener filter coefficients (postfilter)
155
    int dc_level;                 ///< Predicted amount of DC noise, based
156
                                  ///< on which a DC removal filter is used
157

    
158
    int lsps;                     ///< number of LSPs per frame [10 or 16]
159
    int lsp_q_mode;               ///< defines quantizer defaults [0, 1]
160
    int lsp_def_mode;             ///< defines different sets of LSP defaults
161
                                  ///< [0, 1]
162
    int frame_lsp_bitsize;        ///< size (in bits) of LSPs, when encoded
163
                                  ///< per-frame (independent coding)
164
    int sframe_lsp_bitsize;       ///< size (in bits) of LSPs, when encoded
165
                                  ///< per superframe (residual coding)
166

    
167
    int min_pitch_val;            ///< base value for pitch parsing code
168
    int max_pitch_val;            ///< max value + 1 for pitch parsing
169
    int pitch_nbits;              ///< number of bits used to specify the
170
                                  ///< pitch value in the frame header
171
    int block_pitch_nbits;        ///< number of bits used to specify the
172
                                  ///< first block's pitch value
173
    int block_pitch_range;        ///< range of the block pitch
174
    int block_delta_pitch_nbits;  ///< number of bits used to specify the
175
                                  ///< delta pitch between this and the last
176
                                  ///< block's pitch value, used in all but
177
                                  ///< first block
178
    int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
179
                                  ///< from -this to +this-1)
180
    uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
181
                                  ///< conversion
182

    
183
    /**
184
     * @}
185
     * @defgroup struct_packet Packet values
186
     * Packet values, specified in the packet header or related to a packet.
187
     * A packet is considered to be a single unit of data provided to this
188
     * decoder by the demuxer.
189
     * @{
190
     */
191
    int spillover_nbits;          ///< number of bits of the previous packet's
192
                                  ///< last superframe preceding this
193
                                  ///< packet's first full superframe (useful
194
                                  ///< for re-synchronization also)
195
    int has_residual_lsps;        ///< if set, superframes contain one set of
196
                                  ///< LSPs that cover all frames, encoded as
197
                                  ///< independent and residual LSPs; if not
198
                                  ///< set, each frame contains its own, fully
199
                                  ///< independent, LSPs
200
    int skip_bits_next;           ///< number of bits to skip at the next call
201
                                  ///< to #wmavoice_decode_packet() (since
202
                                  ///< they're part of the previous superframe)
203

    
204
    uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + FF_INPUT_BUFFER_PADDING_SIZE];
205
                                  ///< cache for superframe data split over
206
                                  ///< multiple packets
207
    int sframe_cache_size;        ///< set to >0 if we have data from an
208
                                  ///< (incomplete) superframe from a previous
209
                                  ///< packet that spilled over in the current
210
                                  ///< packet; specifies the amount of bits in
211
                                  ///< #sframe_cache
212
    PutBitContext pb;             ///< bitstream writer for #sframe_cache
213

    
214
    /**
215
     * @}
216
     * @defgroup struct_frame Frame and superframe values
217
     * Superframe and frame data - these can change from frame to frame,
218
 * although some of them also serve as a cache / history for
219
     * the next frame or superframe.
220
     * @{
221
     */
222
    double prev_lsps[MAX_LSPS];   ///< LSPs of the last frame of the previous
223
                                  ///< superframe
224
    int last_pitch_val;           ///< pitch value of the previous frame
225
    int last_acb_type;            ///< frame type [0-2] of the previous frame
226
    int pitch_diff_sh16;          ///< ((cur_pitch_val - #last_pitch_val)
227
                                  ///< << 16) / #MAX_FRAMESIZE
228
    float silence_gain;           ///< set for use in blocks if #ACB_TYPE_NONE
229

    
230
    int aw_idx_is_ext;            ///< whether the AW index was encoded in
231
                                  ///< 8 bits (instead of 6)
232
    int aw_pulse_range;           ///< the range over which #aw_pulse_set1()
233
                                  ///< can apply the pulse, relative to the
234
                                  ///< value in aw_first_pulse_off. The exact
235
                                  ///< position of the first AW-pulse is within
236
                                  ///< [pulse_off, pulse_off + this], and
237
                                  ///< depends on bitstream values; [16 or 24]
238
    int aw_n_pulses[2];           ///< number of AW-pulses in each block; note
239
                                  ///< that this number can be negative (in
240
                                  ///< which case it basically means "zero")
241
    int aw_first_pulse_off[2];    ///< index of first sample to which to
242
                                  ///< apply AW-pulses, or -0xff if unset
243
    int aw_next_pulse_off_cache;  ///< the position (relative to start of the
244
                                  ///< second block) at which pulses should
245
                                  ///< start to be positioned, serves as a
246
                                  ///< cache for pitch-adaptive window pulses
247
                                  ///< between blocks
248

    
249
    int frame_cntr;               ///< current frame index [0 - 0xFFFE]; is
250
                                  ///< only used for comfort noise in #pRNG()
251
    float gain_pred_err[6];       ///< cache for gain prediction
252
    float excitation_history[MAX_SIGNAL_HISTORY];
253
                                  ///< cache of the signal of previous
254
                                  ///< superframes, used as a history for
255
                                  ///< signal generation
256
    float synth_history[MAX_LSPS]; ///< see #excitation_history
257
    /**
258
     * @}
259
     * @defgroup post_filter Postfilter values
260
     * Variables used for postfilter implementation, mostly history for
261
     * smoothing and so on, and context variables for FFT/iFFT.
262
     * @{
263
     */
264
    RDFTContext rdft, irdft;      ///< contexts for FFT-calculation in the
265
                                  ///< postfilter (for denoise filter)
266
    DCTContext dct, dst;          ///< contexts for phase shift (in Hilbert
267
                                  ///< transform, part of postfilter)
268
    float sin[511], cos[511];     ///< 8-bit cosine/sine windows over [-pi,pi]
269
                                  ///< range
270
    float postfilter_agc;         ///< gain control memory, used in
271
                                  ///< #adaptive_gain_control()
272
    float dcf_mem[2];             ///< DC filter history
273
    float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
274
                                  ///< zero filter output (i.e. excitation)
275
                                  ///< by postfilter
276
    float denoise_filter_cache[MAX_FRAMESIZE];
277
    int   denoise_filter_cache_size; ///< samples in #denoise_filter_cache
278
    DECLARE_ALIGNED(16, float, tilted_lpcs_pf)[0x80];
279
                                  ///< aligned buffer for LPC tilting
280
    DECLARE_ALIGNED(16, float, denoise_coeffs_pf)[0x80];
281
                                  ///< aligned buffer for denoise coefficients
282
    DECLARE_ALIGNED(16, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
283
                                  ///< aligned buffer for postfilter speech
284
                                  ///< synthesis
285
    /**
286
     * @}
287
     */
288
} WMAVoiceContext;
289

    
290
/**
291
 * Set up the variable bit mode (VBM) tree from container extradata.
292
 * @param gb bit I/O context.
293
 *           The bit context (s->gb) should be loaded with byte 23-46 of the
294
 *           container extradata (i.e. the ones containing the VBM tree).
295
 * @param vbm_tree pointer to array to which the decoded VBM tree will be
296
 *                 written.
297
 * @return 0 on success, <0 on error.
298
 */
299
static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
300
{
301
    static const uint8_t bits[] = {
302
         2,  2,  2,  4,  4,  4,
303
         6,  6,  6,  8,  8,  8,
304
        10, 10, 10, 12, 12, 12,
305
        14, 14, 14, 14
306
    };
307
    static const uint16_t codes[] = {
308
          0x0000, 0x0001, 0x0002,        //              00/01/10
309
          0x000c, 0x000d, 0x000e,        //           11+00/01/10
310
          0x003c, 0x003d, 0x003e,        //         1111+00/01/10
311
          0x00fc, 0x00fd, 0x00fe,        //       111111+00/01/10
312
          0x03fc, 0x03fd, 0x03fe,        //     11111111+00/01/10
313
          0x0ffc, 0x0ffd, 0x0ffe,        //   1111111111+00/01/10
314
          0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
315
    };
316
    int cntr[8], n, res;
317

    
318
    memset(vbm_tree, 0xff, sizeof(vbm_tree));
319
    memset(cntr,     0,    sizeof(cntr));
320
    for (n = 0; n < 17; n++) {
321
        res = get_bits(gb, 3);
322
        if (cntr[res] > 3) // should be >= 3 + (res == 7)
323
            return -1;
324
        vbm_tree[res * 3 + cntr[res]++] = n;
325
    }
326
    INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
327
                    bits, 1, 1, codes, 2, 2, 132);
328
    return 0;
329
}
330

    
331
/**
332
 * Set up decoder with parameters from demuxer (extradata etc.).
333
 */
334
static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
335
{
336
    int n, flags, pitch_range, lsp16_flag;
337
    WMAVoiceContext *s = ctx->priv_data;
338

    
339
    /**
340
     * Extradata layout:
341
     * - byte  0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
342
     * - byte 19-22: flags field (annoyingly in LE; see below for known
343
     *               values),
344
     * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
345
     *               rest is 0).
346
     */
347
    if (ctx->extradata_size != 46) {
348
        av_log(ctx, AV_LOG_ERROR,
349
               "Invalid extradata size %d (should be 46)\n",
350
               ctx->extradata_size);
351
        return -1;
352
    }
353
    flags                = AV_RL32(ctx->extradata + 18);
354
    s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
355
    s->do_apf            =    flags & 0x1;
356
    if (s->do_apf) {
357
        ff_rdft_init(&s->rdft,  7, DFT_R2C);
358
        ff_rdft_init(&s->irdft, 7, IDFT_C2R);
359
        ff_dct_init(&s->dct,  6, DCT_I);
360
        ff_dct_init(&s->dst,  6, DST_I);
361

    
362
        ff_sine_window_init(s->cos, 256);
363
        memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
364
        for (n = 0; n < 255; n++) {
365
            s->sin[n]       = -s->sin[510 - n];
366
            s->cos[510 - n] =  s->cos[n];
367
        }
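        /* Note: the resulting 511-entry tables are indexed as 255 + x with
         * x in [-255, 255] (see calc_input_response()); cos[] ends up
         * (approximately) even-symmetric and sin[] odd-symmetric around
         * index 255. */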
368
    }
369
    s->denoise_strength  =   (flags >> 2) & 0xF;
370
    if (s->denoise_strength >= 12) {
371
        av_log(ctx, AV_LOG_ERROR,
372
               "Invalid denoise filter strength %d (max=11)\n",
373
               s->denoise_strength);
374
        return -1;
375
    }
376
    s->denoise_tilt_corr = !!(flags & 0x40);
377
    s->dc_level          =   (flags >> 7) & 0xF;
378
    s->lsp_q_mode        = !!(flags & 0x2000);
379
    s->lsp_def_mode      = !!(flags & 0x4000);
380
    lsp16_flag           =    flags & 0x1000;
381
    if (lsp16_flag) {
382
        s->lsps               = 16;
383
        s->frame_lsp_bitsize  = 34;
384
        s->sframe_lsp_bitsize = 60;
385
    } else {
386
        s->lsps               = 10;
387
        s->frame_lsp_bitsize  = 24;
388
        s->sframe_lsp_bitsize = 48;
389
    }
390
    for (n = 0; n < s->lsps; n++)
391
        s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
392

    
393
    init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
394
    if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
395
        av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
396
        return -1;
397
    }
398

    
399
    s->min_pitch_val    = ((ctx->sample_rate << 8)      /  400 + 50) >> 8;
400
    s->max_pitch_val    = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
401
    pitch_range         = s->max_pitch_val - s->min_pitch_val;
402
    s->pitch_nbits      = av_ceil_log2(pitch_range);
403
    s->last_pitch_val   = 40;
404
    s->last_acb_type    = ACB_TYPE_NONE;
405
    s->history_nsamples = s->max_pitch_val + 8;
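    /* Worked example (assuming a 16000 Hz stream): min_pitch_val = 40,
     * max_pitch_val = 296, so pitch_range = 256, pitch_nbits = 8 and
     * history_nsamples = 304, which fits within MAX_SIGNAL_HISTORY. */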
406

    
407
    if (s->min_pitch_val < 1 || s->history_nsamples > MAX_SIGNAL_HISTORY) {
408
        int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
409
            max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
410

    
411
        av_log(ctx, AV_LOG_ERROR,
412
               "Unsupported samplerate %d (min=%d, max=%d)\n",
413
               ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
414

    
415
        return -1;
416
    }
417

    
418
    s->block_conv_table[0]      = s->min_pitch_val;
419
    s->block_conv_table[1]      = (pitch_range * 25) >> 6;
420
    s->block_conv_table[2]      = (pitch_range * 44) >> 6;
421
    s->block_conv_table[3]      = s->max_pitch_val - 1;
422
    s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
423
    s->block_delta_pitch_nbits  = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
424
    s->block_pitch_range        = s->block_conv_table[2] +
425
                                  s->block_conv_table[3] + 1 +
426
                                  2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
427
    s->block_pitch_nbits        = av_ceil_log2(s->block_pitch_range);
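    /* Continuing the 16 kHz example above: block_conv_table = { 40, 100, 176,
     * 295 }, block_delta_pitch_hrange = 32 (so block_delta_pitch_nbits = 6)
     * and block_pitch_range = 512, giving block_pitch_nbits = 9. */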
428

    
429
    ctx->sample_fmt             = AV_SAMPLE_FMT_FLT;
430

    
431
    return 0;
432
}
433

    
434
/**
435
 * @defgroup postfilter Postfilter functions
436
 * Postfilter functions (gain control, Wiener denoise filter, DC filter,
437
 * Kalman smoothing, plus surrounding code to wrap it)
438
 * @{
439
 */
440
/**
441
 * Adaptive gain control (as used in postfilter).
442
 *
443
 * Identical to #ff_adaptive_gain_control() in acelp_vectors.c, except
444
 * that the energy here is calculated using sum(abs(...)), whereas the
445
 * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
446
 *
447
 * @param out output buffer for filtered samples
448
 * @param in input buffer containing the samples as they are after the
449
 *           postfilter steps so far
450
 * @param speech_synth input buffer containing speech synth before postfilter
451
 * @param size input buffer size
452
 * @param alpha exponential filter factor
453
 * @param gain_mem pointer to filter memory (single float)
454
 */
455
static void adaptive_gain_control(float *out, const float *in,
456
                                  const float *speech_synth,
457
                                  int size, float alpha, float *gain_mem)
458
{
459
    int i;
460
    float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
461
    float mem = *gain_mem;
462

    
463
    for (i = 0; i < size; i++) {
464
        speech_energy     += fabsf(speech_synth[i]);
465
        postfilter_energy += fabsf(in[i]);
466
    }
467
    gain_scale_factor = (1.0 - alpha) * speech_energy / postfilter_energy;
468

    
469
    for (i = 0; i < size; i++) {
470
        mem = alpha * mem + gain_scale_factor;
471
        out[i] = in[i] * mem;
472
    }
473

    
474
    *gain_mem = mem;
475
}
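/* Note: since gain_scale_factor = (1 - alpha) * speech_energy /
 * postfilter_energy, the recursion above makes mem converge towards
 * speech_energy / postfilter_energy, i.e. the filtered output is scaled back
 * towards the energy of the synthesized speech. */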
476

    
477
/**
478
 * Kalman smoothing function.
479
 *
480
 * This function looks pitch +/- 3 samples back into the history to find
481
 * the best fitting curve (that one giving the optimal gain of the two
482
 * signals, i.e. the highest dot product between the two), and then
483
 * uses that signal history to smoothen the output of the speech synthesis
484
 * filter.
485
 *
486
 * @param s WMA Voice decoding context
487
 * @param pitch pitch of the speech signal
488
 * @param in input speech signal
489
 * @param out output pointer for smoothened signal
490
 * @param size input/output buffer size
491
 *
492
 * @returns -1 if no smoothening took place, e.g. because no optimal
493
 *          fit could be found, or 0 on success.
494
 */
495
static int kalman_smoothen(WMAVoiceContext *s, int pitch,
496
                           const float *in, float *out, int size)
497
{
498
    int n;
499
    float optimal_gain = 0, dot;
500
    const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
501
                *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
502
                *best_hist_ptr;
503

    
504
    /* find best fitting point in history */
505
    do {
506
        dot = ff_dot_productf(in, ptr, size);
507
        if (dot > optimal_gain) {
508
            optimal_gain  = dot;
509
            best_hist_ptr = ptr;
510
        }
511
    } while (--ptr >= end);
512

    
513
    if (optimal_gain <= 0)
514
        return -1;
515
    dot = ff_dot_productf(best_hist_ptr, best_hist_ptr, size);
516
    if (dot <= 0) // would be 1.0
517
        return -1;
518

    
519
    if (optimal_gain <= dot) {
520
        dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
521
    } else
522
        dot = 0.625;
523

    
524
    /* actual smoothing */
525
    for (n = 0; n < size; n++)
526
        out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
527

    
528
    return 0;
529
}
530

    
531
/**
532
 * Get the tilt factor of a formant filter from its transfer function
533
 * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
534
 *      but somehow (??) it does a speech synthesis filter in the
535
 *      middle, which is missing here
536
 *
537
 * @param lpcs LPC coefficients
538
 * @param n_lpcs Size of LPC buffer
539
 * @returns the tilt factor
540
 */
541
static float tilt_factor(const float *lpcs, int n_lpcs)
542
{
543
    float rh0, rh1;
544

    
545
    rh0 = 1.0     + ff_dot_productf(lpcs,  lpcs,    n_lpcs);
546
    rh1 = lpcs[0] + ff_dot_productf(lpcs, &lpcs[1], n_lpcs - 1);
547

    
548
    return rh1 / rh0;
549
}
550

    
551
/**
552
 * Derive denoise filter coefficients (in real domain) from the LPCs.
553
 */
554
static void calc_input_response(WMAVoiceContext *s, float *lpcs,
555
                                int fcb_type, float *coeffs, int remainder)
556
{
557
    float last_coeff, min = 15.0, max = -15.0;
558
    float irange, angle_mul, gain_mul, range, sq;
559
    int n, idx;
560

    
561
    /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
562
    s->rdft.rdft_calc(&s->rdft, lpcs);
563
#define log_range(var, assign) do { \
564
        float tmp = log10f(assign);  var = tmp; \
565
        max       = FFMAX(max, tmp); min = FFMIN(min, tmp); \
566
    } while (0)
567
    log_range(last_coeff,  lpcs[1]         * lpcs[1]);
568
    for (n = 1; n < 64; n++)
569
        log_range(lpcs[n], lpcs[n * 2]     * lpcs[n * 2] +
570
                           lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
571
    log_range(lpcs[0],     lpcs[0]         * lpcs[0]);
572
#undef log_range
573
    range    = max - min;
574
    lpcs[64] = last_coeff;
575

    
576
    /* Now, use this spectrum to pick out these frequencies with higher
577
     * (relative) power/energy (which we then take to be "not noise"),
578
     * and set up a table (still in lpc[]) of (relative) gains per frequency.
579
     * These frequencies will be maintained, while others ("noise") will be
580
     * decreased in the filter output. */
581
    irange    = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
582
    gain_mul  = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
583
                                                          (5.0 / 14.7));
584
    angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
585
    for (n = 0; n <= 64; n++) {
586
        float pwr;
587

    
588
        idx = FFMAX(0, lrint((max - lpcs[n]) * irange) - 1);
589
        pwr = wmavoice_denoise_power_table[s->denoise_strength][idx];
590
        lpcs[n] = angle_mul * pwr;
591

    
592
        /* 70.57 =~ 1/log10(1.0331663) */
593
        idx = (pwr * gain_mul - 0.0295) * 70.570526123;
594
        if (idx > 127) { // fallback if index falls outside table range
595
            coeffs[n] = wmavoice_energy_table[127] *
596
                        powf(1.0331663, idx - 127);
597
        } else
598
            coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
599
    }
600

    
601
    /* calculate the Hilbert transform of the gains, which we do (since this
602
     * is a sine input) by doing a phase shift (in theory, H(sin())=cos()).
603
     * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
604
     * "moment" of the LPCs in this filter. */
605
    s->dct.dct_calc(&s->dct, lpcs);
606
    s->dst.dct_calc(&s->dst, lpcs);
607

    
608
    /* Split out the coefficient indexes into phase/magnitude pairs */
609
    idx = 255 + av_clip(lpcs[64],               -255, 255);
610
    coeffs[0]  = coeffs[0]  * s->cos[idx];
611
    idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
612
    last_coeff = coeffs[64] * s->cos[idx];
613
    for (n = 63;; n--) {
614
        idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
615
        coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
616
        coeffs[n * 2]     = coeffs[n] * s->cos[idx];
617

    
618
        if (!--n) break;
619

    
620
        idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
621
        coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
622
        coeffs[n * 2]     = coeffs[n] * s->cos[idx];
623
    }
624
    coeffs[1] = last_coeff;
625

    
626
    /* move into real domain */
627
    s->irdft.rdft_calc(&s->irdft, coeffs);
628

    
629
    /* tilt correction and normalize scale */
630
    memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
631
    if (s->denoise_tilt_corr) {
632
        float tilt_mem = 0;
633

    
634
        coeffs[remainder - 1] = 0;
635
        ff_tilt_compensation(&tilt_mem,
636
                             -1.8 * tilt_factor(coeffs, remainder - 1),
637
                             coeffs, remainder);
638
    }
639
    sq = (1.0 / 64.0) * sqrtf(1 / ff_dot_productf(coeffs, coeffs, remainder));
640
    for (n = 0; n < remainder; n++)
641
        coeffs[n] *= sq;
642
}
643

    
644
/**
645
 * This function applies a Wiener filter on the (noisy) speech signal as
646
 * a means to denoise it.
647
 *
648
 * - take RDFT of LPCs to get the power spectrum of the noise + speech;
649
 * - using this power spectrum, calculate (for each frequency) the Wiener
650
 *    filter gain, which depends on the frequency power and desired level
651
 *    of noise subtraction (when set too high, this leads to artifacts)
652
 *    We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
653
 *    of 4-8kHz);
654
 * - by doing a phase shift, calculate the Hilbert transform of this array
655
 *    of per-frequency filter-gains to get the filtering coefficients;
656
 * - smoothen/normalize/de-tilt these filter coefficients as desired;
657
 * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
658
 *    to get the denoised speech signal;
659
 * - the leftover (i.e. output of the IRDFT on denoised speech data beyond
660
 *    the frame boundary) are saved and applied to subsequent frames by an
661
 *    overlap-add method (otherwise you get clicking-artifacts).
662
 *
663
 * @param s WMA Voice decoding context
664
 * @param fcb_type Frame (codebook) type
665
 * @param synth_pf input: the noisy speech signal, output: denoised speech
666
 *                 data; should be 16-byte aligned (for ASM purposes)
667
 * @param size size of the speech data
668
 * @param lpcs LPCs used to synthesize this frame's speech data
669
 */
670
static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
671
                           float *synth_pf, int size,
672
                           const float *lpcs)
673
{
674
    int remainder, lim, n;
675

    
676
    if (fcb_type != FCB_TYPE_SILENCE) {
677
        float *tilted_lpcs = s->tilted_lpcs_pf,
678
              *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
679

    
680
        tilted_lpcs[0]           = 1.0;
681
        memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
682
        memset(&tilted_lpcs[s->lsps + 1], 0,
683
               sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
684
        ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
685
                             tilted_lpcs, s->lsps + 2);
686

    
687
        /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
688
         * size is applied to the next frame. All input beyond this is zero,
689
         * and thus all output beyond this will go towards zero, hence we can
690
         * limit to min(size-1, 127-size) as a performance consideration. */
691
        remainder = FFMIN(127 - size, size - 1);
692
        calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
693

    
694
        /* apply coefficients (in frequency spectrum domain), i.e. complex
695
         * number multiplication */
696
        memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
697
        s->rdft.rdft_calc(&s->rdft, synth_pf);
698
        s->rdft.rdft_calc(&s->rdft, coeffs);
699
        synth_pf[0] *= coeffs[0];
700
        synth_pf[1] *= coeffs[1];
701
        for (n = 1; n < 64; n++) {
702
            float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
703
            synth_pf[n * 2]     = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
704
            synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
705
        }
706
        s->irdft.rdft_calc(&s->irdft, synth_pf);
707
    }
708

    
709
    /* merge filter output with the history of previous runs */
710
    if (s->denoise_filter_cache_size) {
711
        lim = FFMIN(s->denoise_filter_cache_size, size);
712
        for (n = 0; n < lim; n++)
713
            synth_pf[n] += s->denoise_filter_cache[n];
714
        s->denoise_filter_cache_size -= lim;
715
        memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
716
                sizeof(s->denoise_filter_cache[0]) * s->denoise_filter_cache_size);
717
    }
718

    
719
    /* move remainder of filter output into a cache for future runs */
720
    if (fcb_type != FCB_TYPE_SILENCE) {
721
        lim = FFMIN(remainder, s->denoise_filter_cache_size);
722
        for (n = 0; n < lim; n++)
723
            s->denoise_filter_cache[n] += synth_pf[size + n];
724
        if (lim < remainder) {
725
            memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
726
                   sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
727
            s->denoise_filter_cache_size = remainder;
728
        }
729
    }
730
}
731

    
732
/**
733
 * Averaging projection filter, the postfilter used in WMAVoice.
734
 *
735
 * This uses the following steps:
736
 * - A zero-synthesis filter (generate excitation from synth signal)
737
 * - Kalman smoothing on excitation, based on pitch
738
 * - Re-synthesized smoothened output
739
 * - Iterative Wiener denoise filter
740
 * - Adaptive gain filter
741
 * - DC filter
742
 *
743
 * @param s WMAVoice decoding context
744
 * @param synth Speech synthesis output (before postfilter)
745
 * @param samples Output buffer for filtered samples
746
 * @param size Buffer size of synth & samples
747
 * @param lpcs Generated LPCs used for speech synthesis
748
 * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned)
749
 * @param fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
750
 * @param pitch Pitch of the input signal
751
 */
752
static void postfilter(WMAVoiceContext *s, const float *synth,
753
                       float *samples,    int size,
754
                       const float *lpcs, float *zero_exc_pf,
755
                       int fcb_type,      int pitch)
756
{
757
    float synth_filter_in_buf[MAX_FRAMESIZE / 2],
758
          *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
759
          *synth_filter_in = zero_exc_pf;
760

    
761
    assert(size <= MAX_FRAMESIZE / 2);
762

    
763
    /* generate excitation from input signal */
764
    ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
765

    
766
    if (fcb_type >= FCB_TYPE_AW_PULSES &&
767
        !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
768
        synth_filter_in = synth_filter_in_buf;
769

    
770
    /* re-synthesize speech after smoothening, and keep history */
771
    ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
772
                                 synth_filter_in, size, s->lsps);
773
    memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
774
           sizeof(synth_pf[0]) * s->lsps);
775

    
776
    wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
777

    
778
    adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
779
                          &s->postfilter_agc);
780

    
781
    if (s->dc_level > 8) {
782
        /* remove ultra-low frequency DC noise / highpass filter;
783
         * coefficients are identical to those used in SIPR decoding,
784
         * and very closely resemble those used in AMR-NB decoding. */
785
        ff_acelp_apply_order_2_transfer_function(samples, samples,
786
            (const float[2]) { -1.99997,      1.0 },
787
            (const float[2]) { -1.9330735188, 0.93589198496 },
788
            0.93980580475, s->dcf_mem, size);
789
    }
790
}
791
/**
792
 * @}
793
 */
794

    
795
/**
796
 * Dequantize LSPs
797
 * @param lsps output pointer to the array that will hold the LSPs
798
 * @param num number of LSPs to be dequantized
799
 * @param values quantized values, contains n_stages values
800
 * @param sizes range (i.e. max value) of each quantized value
801
 * @param n_stages number of dequantization runs
802
 * @param table dequantization table to be used
803
 * @param mul_q LSF multiplier
804
 * @param base_q base (lowest) LSF values
805
 */
806
static void dequant_lsps(double *lsps, int num,
807
                         const uint16_t *values,
808
                         const uint16_t *sizes,
809
                         int n_stages, const uint8_t *table,
810
                         const double *mul_q,
811
                         const double *base_q)
812
{
813
    int n, m;
814

    
815
    memset(lsps, 0, num * sizeof(*lsps));
816
    for (n = 0; n < n_stages; n++) {
817
        const uint8_t *t_off = &table[values[n] * num];
818
        double base = base_q[n], mul = mul_q[n];
819

    
820
        for (m = 0; m < num; m++)
821
            lsps[m] += base + mul * t_off[m];
822

    
823
        table += sizes[n] * num;
824
    }
825
}
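/* In other words, for each output index m the stages accumulate as
 * lsps[m] = sum_n (base_q[n] + mul_q[n] * table_n[values[n] * num + m]),
 * where table_n is the sub-table selected for stage n. */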
826

    
827
/**
828
 * @defgroup lsp_dequant LSP dequantization routines
829
 * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
830
 * @note we assume enough bits are available, caller should check.
831
 * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
832
 * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
833
 * @{
834
 */
835
/**
836
 * Parse 10 independently-coded LSPs.
837
 */
838
static void dequant_lsp10i(GetBitContext *gb, double *lsps)
839
{
840
    static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
841
    static const double mul_lsf[4] = {
842
        5.2187144800e-3,    1.4626986422e-3,
843
        9.6179549166e-4,    1.1325736225e-3
844
    };
845
    static const double base_lsf[4] = {
846
        M_PI * -2.15522e-1, M_PI * -6.1646e-2,
847
        M_PI * -3.3486e-2,  M_PI * -5.7408e-2
848
    };
849
    uint16_t v[4];
850

    
851
    v[0] = get_bits(gb, 8);
852
    v[1] = get_bits(gb, 6);
853
    v[2] = get_bits(gb, 5);
854
    v[3] = get_bits(gb, 5);
855

    
856
    dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
857
                 mul_lsf, base_lsf);
858
}
859

    
860
/**
861
 * Parse 10 independently-coded LSPs, and then derive the tables to
862
 * generate LSPs for the other frames from them (residual coding).
863
 */
864
static void dequant_lsp10r(GetBitContext *gb,
865
                           double *i_lsps, const double *old,
866
                           double *a1, double *a2, int q_mode)
867
{
868
    static const uint16_t vec_sizes[3] = { 128, 64, 64 };
869
    static const double mul_lsf[3] = {
870
        2.5807601174e-3,    1.2354460219e-3,   1.1763821673e-3
871
    };
872
    static const double base_lsf[3] = {
873
        M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
874
    };
875
    const float (*ipol_tab)[2][10] = q_mode ?
876
        wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a;
877
    uint16_t interpol, v[3];
878
    int n;
879

    
880
    dequant_lsp10i(gb, i_lsps);
881

    
882
    interpol = get_bits(gb, 5);
883
    v[0]     = get_bits(gb, 7);
884
    v[1]     = get_bits(gb, 6);
885
    v[2]     = get_bits(gb, 6);
886

    
887
    for (n = 0; n < 10; n++) {
888
        double delta = old[n] - i_lsps[n];
889
        a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
890
        a1[10 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
891
    }
892

    
893
    dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
894
                 mul_lsf, base_lsf);
895
}
896

    
897
/**
898
 * Parse 16 independently-coded LSPs.
899
 */
900
static void dequant_lsp16i(GetBitContext *gb, double *lsps)
901
{
902
    static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
903
    static const double mul_lsf[5] = {
904
        3.3439586280e-3,    6.9908173703e-4,
905
        3.3216608306e-3,    1.0334960326e-3,
906
        3.1899104283e-3
907
    };
908
    static const double base_lsf[5] = {
909
        M_PI * -1.27576e-1, M_PI * -2.4292e-2,
910
        M_PI * -1.28094e-1, M_PI * -3.2128e-2,
911
        M_PI * -1.29816e-1
912
    };
913
    uint16_t v[5];
914

    
915
    v[0] = get_bits(gb, 8);
916
    v[1] = get_bits(gb, 6);
917
    v[2] = get_bits(gb, 7);
918
    v[3] = get_bits(gb, 6);
919
    v[4] = get_bits(gb, 7);
920

    
921
    dequant_lsps( lsps,     5,  v,     vec_sizes,    2,
922
                 wmavoice_dq_lsp16i1,  mul_lsf,     base_lsf);
923
    dequant_lsps(&lsps[5],  5, &v[2], &vec_sizes[2], 2,
924
                 wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
925
    dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
926
                 wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
927
}
928

    
929
/**
930
 * Parse 16 independently-coded LSPs, and then derive the tables to
931
 * generate LSPs for the other frames from them (residual coding).
932
 */
933
static void dequant_lsp16r(GetBitContext *gb,
934
                           double *i_lsps, const double *old,
935
                           double *a1, double *a2, int q_mode)
936
{
937
    static const uint16_t vec_sizes[3] = { 128, 128, 128 };
938
    static const double mul_lsf[3] = {
939
        1.2232979501e-3,   1.4062241527e-3,   1.6114744851e-3
940
    };
941
    static const double base_lsf[3] = {
942
        M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
943
    };
944
    const float (*ipol_tab)[2][16] = q_mode ?
945
        wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a;
946
    uint16_t interpol, v[3];
947
    int n;
948

    
949
    dequant_lsp16i(gb, i_lsps);
950

    
951
    interpol = get_bits(gb, 5);
952
    v[0]     = get_bits(gb, 7);
953
    v[1]     = get_bits(gb, 7);
954
    v[2]     = get_bits(gb, 7);
955

    
956
    for (n = 0; n < 16; n++) {
957
        double delta = old[n] - i_lsps[n];
958
        a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
959
        a1[16 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
960
    }
961

    
962
    dequant_lsps( a2,     10,  v,     vec_sizes,    1,
963
                 wmavoice_dq_lsp16r1,  mul_lsf,     base_lsf);
964
    dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
965
                 wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
966
    dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
967
                 wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
968
}
969

    
970
/**
971
 * @}
972
 * @defgroup aw Pitch-adaptive window coding functions
973
 * The next few functions are for pitch-adaptive window coding.
974
 * @{
975
 */
976
/**
977
 * Parse the offset of the first pitch-adaptive window pulses, and
978
 * the distribution of pulses between the two blocks in this frame.
979
 * @param s WMA Voice decoding context private data
980
 * @param gb bit I/O context
981
 * @param pitch pitch for each block in this frame
982
 */
983
static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
984
                            const int *pitch)
985
{
986
    static const int16_t start_offset[94] = {
987
        -11,  -9,  -7,  -5,  -3,  -1,   1,   3,   5,   7,   9,  11,
988
         13,  15,  18,  17,  19,  20,  21,  22,  23,  24,  25,  26,
989
         27,  28,  29,  30,  31,  32,  33,  35,  37,  39,  41,  43,
990
         45,  47,  49,  51,  53,  55,  57,  59,  61,  63,  65,  67,
991
         69,  71,  73,  75,  77,  79,  81,  83,  85,  87,  89,  91,
992
         93,  95,  97,  99, 101, 103, 105, 107, 109, 111, 113, 115,
993
        117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
994
        141, 143, 145, 147, 149, 151, 153, 155, 157, 159
995
    };
996
    int bits, offset;
997

    
998
    /* position of pulse */
999
    s->aw_idx_is_ext = 0;
1000
    if ((bits = get_bits(gb, 6)) >= 54) {
1001
        s->aw_idx_is_ext = 1;
1002
        bits += (bits - 54) * 3 + get_bits(gb, 2);
1003
    }
1004

    
1005
    /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1006
     * the distribution of the pulses in each block contained in this frame. */
1007
    s->aw_pulse_range        = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1008
    for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1009
    s->aw_n_pulses[0]        = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1010
    s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1011
    offset                  += s->aw_n_pulses[0] * pitch[0];
1012
    s->aw_n_pulses[1]        = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1013
    s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1014

    
1015
    /* if continuing from a position before the block, reset position to
1016
     * start of block (when corrected for the range over which it can be
1017
     * spread in aw_pulse_set1()). */
1018
    if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1019
        while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1020
            s->aw_first_pulse_off[1] -= pitch[1];
1021
        if (start_offset[bits] < 0)
1022
            while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1023
                s->aw_first_pulse_off[0] -= pitch[0];
1024
    }
1025
}
1026

    
1027
/**
1028
 * Apply second set of pitch-adaptive window pulses.
1029
 * @param s WMA Voice decoding context private data
1030
 * @param gb bit I/O context
1031
 * @param block_idx block index in frame [0, 1]
1032
 * @param fcb structure containing fixed codebook vector info
1033
 */
1034
static void aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
1035
                          int block_idx, AMRFixed *fcb)
1036
{
1037
    uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1038
    uint16_t *use_mask = use_mask_mem + 2;
1039
    /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1040
     * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1041
     * of idx are the position of the bit within a particular item in the
1042
     * array (0 being the most significant bit, and 15 being the least
1043
     * significant bit), and the remainder (>> 4) is the index in the
1044
     * use_mask[]-array. This is faster and uses less memory than using a
1045
     * 80-byte/80-int array. */
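    /* For example, idx = 37 refers to use_mask[37 >> 4] = use_mask[2], bit
     * 37 & 15 = 5 counted from the MSB, i.e. the mask 0x8000 >> 5 = 0x0400. */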
1046
    int pulse_off = s->aw_first_pulse_off[block_idx],
1047
        pulse_start, n, idx, range, aidx, start_off = 0;
1048

    
1049
    /* set offset of first pulse to within this block */
1050
    if (s->aw_n_pulses[block_idx] > 0)
1051
        while (pulse_off + s->aw_pulse_range < 1)
1052
            pulse_off += fcb->pitch_lag;
1053

    
1054
    /* find range per pulse */
1055
    if (s->aw_n_pulses[0] > 0) {
1056
        if (block_idx == 0) {
1057
            range = 32;
1058
        } else /* block_idx = 1 */ {
1059
            range = 8;
1060
            if (s->aw_n_pulses[block_idx] > 0)
1061
                pulse_off = s->aw_next_pulse_off_cache;
1062
        }
1063
    } else
1064
        range = 16;
1065
    pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1066

    
1067
    /* aw_pulse_set1() already applies pulses around pulse_off (to be exactly,
1068
     * in the range of [pulse_off, pulse_off + s->aw_pulse_range], and thus
1069
     * we exclude that range from being pulsed again in this function. */
1070
    memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1071
    memset( use_mask,   -1, 5 * sizeof(use_mask[0]));
1072
    memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1073
    if (s->aw_n_pulses[block_idx] > 0)
1074
        for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1075
            int excl_range         = s->aw_pulse_range; // always 16 or 24
1076
            uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1077
            int first_sh           = 16 - (idx & 15);
1078
            *use_mask_ptr++       &= 0xFFFF << first_sh;
1079
            excl_range            -= first_sh;
1080
            if (excl_range >= 16) {
1081
                *use_mask_ptr++    = 0;
1082
                *use_mask_ptr     &= 0xFFFF >> (excl_range - 16);
1083
            } else
1084
                *use_mask_ptr     &= 0xFFFF >> excl_range;
1085
        }
1086

    
1087
    /* find the 'aidx'th offset that is not excluded */
1088
    aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1089
    for (n = 0; n <= aidx; pulse_start++) {
1090
        for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1091
        if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1092
            if (use_mask[0])      idx = 0x0F;
1093
            else if (use_mask[1]) idx = 0x1F;
1094
            else if (use_mask[2]) idx = 0x2F;
1095
            else if (use_mask[3]) idx = 0x3F;
1096
            else if (use_mask[4]) idx = 0x4F;
1097
            else                  return;
1098
            idx -= av_log2_16bit(use_mask[idx >> 4]);
1099
        }
1100
        if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1101
            use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1102
            n++;
1103
            start_off = idx;
1104
        }
1105
    }
1106

    
1107
    fcb->x[fcb->n] = start_off;
1108
    fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1109
    fcb->n++;
1110

    
1111
    /* set offset for next block, relative to start of that block */
1112
    n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1113
    s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1114
}
1115

    
1116
/**
1117
 * Apply first set of pitch-adaptive window pulses.
1118
 * @param s WMA Voice decoding context private data
1119
 * @param gb bit I/O context
1120
 * @param block_idx block index in frame [0, 1]
1121
 * @param fcb storage location for fixed codebook pulse info
1122
 */
1123
static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
1124
                          int block_idx, AMRFixed *fcb)
1125
{
1126
    int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1127
    float v;
1128

    
1129
    if (s->aw_n_pulses[block_idx] > 0) {
1130
        int n, v_mask, i_mask, sh, n_pulses;
1131

    
1132
        if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1133
            n_pulses = 3;
1134
            v_mask   = 8;
1135
            i_mask   = 7;
1136
            sh       = 4;
1137
        } else { // 4 pulses, 1:sign + 2:index each
1138
            n_pulses = 4;
1139
            v_mask   = 4;
1140
            i_mask   = 3;
1141
            sh       = 3;
1142
        }
1143

    
1144
        for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1145
            fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1146
            fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1147
                                 s->aw_first_pulse_off[block_idx];
1148
            while (fcb->x[fcb->n] < 0)
1149
                fcb->x[fcb->n] += fcb->pitch_lag;
1150
            if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1151
                fcb->n++;
1152
        }
1153
    } else {
1154
        int num2 = (val & 0x1FF) >> 1, delta, idx;
1155

    
1156
        if (num2 < 1 * 79)      { delta = 1; idx = num2 + 1; }
1157
        else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1158
        else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1159
        else                    { delta = 7; idx = num2 + 1 - 3 * 75; }
1160
        v = (val & 0x200) ? -1.0 : 1.0;
1161

    
1162
        fcb->no_repeat_mask |= 3 << fcb->n;
1163
        fcb->x[fcb->n]       = idx - delta;
1164
        fcb->y[fcb->n]       = v;
1165
        fcb->x[fcb->n + 1]   = idx;
1166
        fcb->y[fcb->n + 1]   = (val & 1) ? -v : v;
1167
        fcb->n              += 2;
1168
    }
1169
}
1170

    
1171
/**
1172
 * @}
1173
 *
1174
 * Generate a random number from frame_cntr and block_num, which will lie
1175
 * in the range [0, 1000 - block_size] (so it can be used as an index in a
1176
 * table of size 1000 of which you want to read block_size entries).
1177
 *
1178
 * @param frame_cntr current frame number
1179
 * @param block_num current block index
1180
 * @param block_size amount of entries we want to read from a table
1181
 *                   that has 1000 entries
1182
 * @return a (non-)random number in the [0, 1000 - block_size] range.
1183
 */
1184
static int pRNG(int frame_cntr, int block_num, int block_size)
1185
{
1186
    /* array to simplify the calculation of z:
1187
     * y = (x % 9) * 5 + 6;
1188
     * z = (49995 * x) / y;
1189
     * Since y only has 9 values, we can remove the division by using a
1190
     * LUT and using FASTDIV-style divisions. For each of the 9 values
1191
     * of y, we can rewrite z as:
1192
     * z = x * (49995 / y) + x * ((49995 % y) / y)
1193
     * In this table, each col represents one possible value of y, the
1194
     * first number is 49995 / y, and the second is the FASTDIV variant
1195
     * of 49995 % y / y. */
1196
    static const unsigned int div_tbl[9][2] = {
1197
        { 8332,  3 * 715827883U }, // y =  6
1198
        { 4545,  0 * 390451573U }, // y = 11
1199
        { 3124, 11 * 268435456U }, // y = 16
1200
        { 2380, 15 * 204522253U }, // y = 21
1201
        { 1922, 23 * 165191050U }, // y = 26
1202
        { 1612, 23 * 138547333U }, // y = 31
1203
        { 1388, 27 * 119304648U }, // y = 36
1204
        { 1219, 16 * 104755300U }, // y = 41
1205
        { 1086, 39 *  93368855U }  // y = 46
1206
    };
1207
    unsigned int z, y, x = MUL16(block_num, 1877) + frame_cntr;
1208
    if (x >= 0xFFFF) x -= 0xFFFF;   // max value of x is 8*1877+0xFFFE=0x13AA6,
1209
                                    // so this is effectively a modulo (%)
1210
    y = x - 9 * MULH(477218589, x); // x % 9
1211
    z = (uint16_t) (x * div_tbl[y][0] + UMULH(x, div_tbl[y][1]));
1212
                                    // z = x * 49995 / (y * 5 + 6)
1213
    return z % (1000 - block_size);
1214
}
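/* Worked example: pRNG(1, 0, 80) gives x = 1, so x % 9 = 1 and the divisor
 * y = 1 * 5 + 6 = 11; z = 1 * 4545 + 0 = 4545 (= 49995 / 11), and the
 * function returns 4545 % (1000 - 80) = 865. */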

/**
 * Parse hardcoded signal for a single block.
 * @note see #synth_block().
 */
static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
                                  int block_idx, int size,
                                  const struct frame_type_desc *frame_desc,
                                  float *excitation)
{
    float gain;
    int n, r_idx;

    assert(size <= MAX_FRAMESIZE);

    /* Set the offset from which we start reading wmavoice_std_codebook */
    if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
        r_idx = pRNG(s->frame_cntr, block_idx, size);
        gain  = s->silence_gain;
    } else /* FCB_TYPE_HARDCODED */ {
        r_idx = get_bits(gb, 8);
        gain  = wmavoice_gain_universal[get_bits(gb, 6)];
    }

    /* Clear gain prediction parameters */
    memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));

    /* Apply gain to hardcoded codebook and use that as excitation signal */
    for (n = 0; n < size; n++)
        excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
}
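
/* Note (added for clarity, not in the original comments): assuming the
 * 1000-entry codebook size mentioned in the pRNG() documentation, both
 * branches above stay inside wmavoice_std_codebook: for silence frames
 * pRNG() returns an offset in [0, 1000 - size], and for hardcoded frames
 * r_idx is an 8-bit value (<= 255) with size <= 160, so r_idx + n <= 414. */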

/**
 * Parse FCB/ACB signal for a single block.
 * @note see #synth_block().
 */
static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
                                int block_idx, int size,
                                int block_pitch_sh2,
                                const struct frame_type_desc *frame_desc,
                                float *excitation)
{
    static const float gain_coeff[6] = {
        0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
    };
    float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
    int n, idx, gain_weight;
    AMRFixed fcb;

    assert(size <= MAX_FRAMESIZE / 2);
    memset(pulses, 0, sizeof(*pulses) * size);

    fcb.pitch_lag      = block_pitch_sh2 >> 2;
    fcb.pitch_fac      = 1.0;
    fcb.no_repeat_mask = 0;
    fcb.n              = 0;

    /* For the other frame types, this is where we apply the innovation
     * (fixed) codebook pulses of the speech signal. */
    if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
        aw_pulse_set1(s, gb, block_idx, &fcb);
        aw_pulse_set2(s, gb, block_idx, &fcb);
    } else /* FCB_TYPE_EXC_PULSES */ {
        int offset_nbits = 5 - frame_desc->log_n_blocks;

        fcb.no_repeat_mask = -1;
        /* similar to ff_decode_10_pulses_35bits(), but with single pulses
         * (instead of double) for a subset of pulses */
        for (n = 0; n < 5; n++) {
            float sign;
            int pos1, pos2;

            sign           = get_bits1(gb) ? 1.0 : -1.0;
            pos1           = get_bits(gb, offset_nbits);
            fcb.x[fcb.n]   = n + 5 * pos1;
            fcb.y[fcb.n++] = sign;
            if (n < frame_desc->dbl_pulses) {
                pos2           = get_bits(gb, offset_nbits);
                fcb.x[fcb.n]   = n + 5 * pos2;
                fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
            }
        }
    }
    ff_set_fixed_vector(pulses, &fcb, 1.0, size);
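
    /* Note (explanatory, not in the original source): in the
     * FCB_TYPE_EXC_PULSES branch above, the excitation pulses are laid out
     * on 5 interleaved tracks, i.e. pulse n may only sit at positions
     * n, n + 5, n + 10, ... (fcb.x[] = n + 5 * pos). The first dbl_pulses
     * tracks carry two pulses each, the remaining tracks a single pulse,
     * similar to the ACELP track layout of ff_decode_10_pulses_35bits(). */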

    /* Calculate gain for adaptive & fixed codebook signal.
     * see ff_amr_set_fixed_gain(). */
    idx = get_bits(gb, 7);
    fcb_gain = expf(ff_dot_productf(s->gain_pred_err, gain_coeff, 6) -
                    5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
    acb_gain = wmavoice_gain_codebook_acb[idx];
    pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
                        -2.9957322736 /* log(0.05) */,
                         1.6094379124 /* log(5.0)  */);

    gain_weight = 8 >> frame_desc->log_n_blocks;
    memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
            sizeof(*s->gain_pred_err) * (6 - gain_weight));
    for (n = 0; n < gain_weight; n++)
        s->gain_pred_err[n] = pred_err;
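
    /* Explanatory note (added, not from the original source): the fixed
     * codebook gain is predicted in the log domain from the six most recent
     * (clipped) codebook gain values kept in s->gain_pred_err, roughly
     *   fcb_gain = exp(dot(gain_coeff, gain_pred_err) - 5.2409
     *                  + wmavoice_gain_codebook_fcb[idx]).
     * The shift/fill above writes gain_weight copies of the new value into
     * that history, so frames with more (shorter) blocks let each block
     * occupy proportionally fewer history slots. */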

    /* Calculation of adaptive codebook */
    if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
        int len;
        for (n = 0; n < size; n += len) {
            int next_idx_sh16;
            int abs_idx    = block_idx * size + n;
            int pitch_sh16 = (s->last_pitch_val << 16) +
                             s->pitch_diff_sh16 * abs_idx;
            int pitch      = (pitch_sh16 + 0x6FFF) >> 16;
            int idx_sh16   = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
            idx            = idx_sh16 >> 16;
            if (s->pitch_diff_sh16) {
                if (s->pitch_diff_sh16 > 0) {
                    next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
                } else
                    next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
                len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
                              1, size - n);
            } else
                len = size;

            ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
                                  wmavoice_ipol1_coeffs, 17,
                                  idx, 9, len);
        }
    } else /* ACB_TYPE_HAMMING */ {
        int block_pitch = block_pitch_sh2 >> 2;
        idx             = block_pitch_sh2 & 3;
        if (idx) {
            ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
                                  wmavoice_ipol2_coeffs, 4,
                                  idx, 8, size);
        } else
            av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
                              sizeof(float) * size);
    }

    /* Interpolate ACB/FCB and use as excitation signal */
    ff_weighted_vector_sumf(excitation, excitation, pulses,
                            acb_gain, fcb_gain, size);
}

/**
 * Parse data in a single block.
 * @note we assume enough bits are available, caller should check.
 *
 * @param s WMA Voice decoding context private data
 * @param gb bit I/O context
 * @param block_idx index of the to-be-read block
 * @param size number of samples to be read in this block
 * @param block_pitch_sh2 pitch for this block << 2
 * @param lsps LSPs for (the end of) this frame
 * @param prev_lsps LSPs for the last frame
 * @param frame_desc frame type descriptor
 * @param excitation target memory for the ACB+FCB interpolated signal
 * @param synth target memory for the speech synthesis filter output
 */
static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
                        int block_idx, int size,
                        int block_pitch_sh2,
                        const double *lsps, const double *prev_lsps,
                        const struct frame_type_desc *frame_desc,
                        float *excitation, float *synth)
{
    double i_lsps[MAX_LSPS];
    float lpcs[MAX_LSPS];
    float fac;
    int n;

    if (frame_desc->acb_type == ACB_TYPE_NONE)
        synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
    else
        synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
                            frame_desc, excitation);

    /* convert interpolated LSPs to LPCs */
    fac = (block_idx + 0.5) / frame_desc->n_blocks;
    for (n = 0; n < s->lsps; n++) // LSF -> LSP
        i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
    ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);

    /* Speech synthesis */
    ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
}
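
/* Example of the per-block LSP interpolation above (added for clarity, not
 * in the original comments): with e.g. 4 blocks per frame, fac takes the
 * values 0.125, 0.375, 0.625 and 0.875, so each block's LPC filter is
 * derived from LSPs sampled at that block's centre, sliding from the
 * previous frame's LSPs towards the current frame's LSPs. */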

/**
 * Synthesize output samples for a single frame.
 * @note we assume enough bits are available, caller should check.
 *
 * @param ctx WMA Voice decoder context
 * @param gb bit I/O context (s->gb or one for cross-packet superframes)
 * @param frame_idx Frame number within superframe [0-2]
 * @param samples pointer to output sample buffer, has space for at least 160
 *                samples
 * @param lsps LSP array
 * @param prev_lsps array of previous frame's LSPs
 * @param excitation target buffer for excitation signal
 * @param synth target buffer for synthesized speech data
 * @return 0 on success, <0 on error.
 */
static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
                       float *samples,
                       const double *lsps, const double *prev_lsps,
                       float *excitation, float *synth)
{
    WMAVoiceContext *s = ctx->priv_data;
    int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val;
    int pitch[MAX_BLOCKS], last_block_pitch;

    /* Parse frame type ("frame header"), see frame_descs */
    int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)],
        block_nsamples;

    if (bd_idx < 0) {
        av_log(ctx, AV_LOG_ERROR,
               "Invalid frame type VLC code, skipping\n");
        return -1;
    }
    /* only index frame_descs[] after bd_idx has been validated */
    block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;

    /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
    if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
        /* Pitch is provided per frame, which is interpreted as the pitch of
         * the last sample of the last block of this frame. We can interpolate
         * the pitch of other blocks (and even pitch-per-sample) by gradually
         * incrementing/decrementing the previous frame's pitch
         * (s->last_pitch_val) towards cur_pitch_val. */
        n_blocks_x2      = frame_descs[bd_idx].n_blocks << 1;
        log_n_blocks_x2  = frame_descs[bd_idx].log_n_blocks + 1;
        cur_pitch_val    = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
        cur_pitch_val    = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
        if (s->last_acb_type == ACB_TYPE_NONE ||
            20 * abs(cur_pitch_val - s->last_pitch_val) >
                (cur_pitch_val + s->last_pitch_val))
            s->last_pitch_val = cur_pitch_val;

        /* pitch per block */
        for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
            int fac = n * 2 + 1;

            pitch[n] = (MUL16(fac,                 cur_pitch_val) +
                        MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
                        frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
        }
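
        /* Worked example of the interpolation above (illustrative only, not
         * from the original source): with 2 blocks per frame
         * (n_blocks_x2 = 4, log_n_blocks_x2 = 2), last_pitch_val = 40 and
         * cur_pitch_val = 44, we get pitch[0] = (1*44 + 3*40 + 2) >> 2 = 41
         * and pitch[1] = (3*44 + 1*40 + 2) >> 2 = 43, i.e. per-block pitches
         * sampled partway along the slide from the old to the new value. */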

        /* "pitch-diff-per-sample" for calculation of pitch per sample */
        s->pitch_diff_sh16 =
            ((cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
    }

    /* Global gain (if silence) and pitch-adaptive window coordinates */
    switch (frame_descs[bd_idx].fcb_type) {
    case FCB_TYPE_SILENCE:
        s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
        break;
    case FCB_TYPE_AW_PULSES:
        aw_parse_coords(s, gb, pitch);
        break;
    }

    for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
        int bl_pitch_sh2;

        /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
        switch (frame_descs[bd_idx].acb_type) {
        case ACB_TYPE_HAMMING: {
            /* Pitch is given per block. Per-block pitches are encoded as an
             * absolute value for the first block, and then delta values
             * (relative to this value) for all subsequent blocks. The scale
             * of this pitch value is semi-logarithmic compared to its use in
             * the decoder, so we convert it back to a normal scale here. */
            int block_pitch,
                t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
                t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
                t3 =  s->block_conv_table[3] - s->block_conv_table[2] + 1;

            if (n == 0) {
                block_pitch = get_bits(gb, s->block_pitch_nbits);
            } else
                block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
                                 get_bits(gb, s->block_delta_pitch_nbits);
            /* Clamp last_block_pitch so that any following delta value stays
             * within the valid block pitch range */
            last_block_pitch = av_clip(block_pitch,
                                       s->block_delta_pitch_hrange,
                                       s->block_pitch_range -
                                           s->block_delta_pitch_hrange);

            /* Convert semi-log-style scale back to normal scale */
            if (block_pitch < t1) {
                bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
            } else {
                block_pitch -= t1;
                if (block_pitch < t2) {
                    bl_pitch_sh2 =
                        (s->block_conv_table[1] << 2) + (block_pitch << 1);
                } else {
                    block_pitch -= t2;
                    if (block_pitch < t3) {
                        bl_pitch_sh2 =
                            (s->block_conv_table[2] + block_pitch) << 2;
                    } else
                        bl_pitch_sh2 = s->block_conv_table[3] << 2;
                }
            }
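
            /* Note on the conversion above (added for clarity, not in the
             * original source): bl_pitch_sh2 is the block pitch in quarter-
             * sample units. The first range of coded values maps to steps of
             * one (1/4 sample resolution), the second to steps of two
             * (1/2 sample), the third to steps of four (whole samples), and
             * anything beyond that is clipped to block_conv_table[3]. */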
            pitch[n] = bl_pitch_sh2 >> 2;
            break;
        }

        case ACB_TYPE_ASYMMETRIC: {
            bl_pitch_sh2 = pitch[n] << 2;
            break;
        }

        default: // ACB_TYPE_NONE has no pitch
            bl_pitch_sh2 = 0;
            break;
        }

        synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
                    lsps, prev_lsps, &frame_descs[bd_idx],
                    &excitation[n * block_nsamples],
                    &synth[n * block_nsamples]);
    }

    /* Averaging projection filter, if applicable. Else, just copy samples
     * from synthesis buffer */
    if (s->do_apf) {
        double i_lsps[MAX_LSPS];
        float lpcs[MAX_LSPS];

        for (n = 0; n < s->lsps; n++) // LSF -> LSP
            i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
        ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
        postfilter(s, synth, samples, 80, lpcs,
                   &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
                   frame_descs[bd_idx].fcb_type, pitch[0]);

        for (n = 0; n < s->lsps; n++) // LSF -> LSP
            i_lsps[n] = cos(lsps[n]);
        ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
        postfilter(s, &synth[80], &samples[80], 80, lpcs,
                   &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
                   frame_descs[bd_idx].fcb_type, pitch[0]);
    } else
        memcpy(samples, synth, 160 * sizeof(synth[0]));

    /* Cache values for next frame */
    s->frame_cntr++;
    if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
    s->last_acb_type = frame_descs[bd_idx].acb_type;
    switch (frame_descs[bd_idx].acb_type) {
    case ACB_TYPE_NONE:
        s->last_pitch_val = 0;
        break;
    case ACB_TYPE_ASYMMETRIC:
        s->last_pitch_val = cur_pitch_val;
        break;
    case ACB_TYPE_HAMMING:
        s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
        break;
    }

    return 0;
}

/**
 * Ensure a minimum value for the first item, a maximum value for the last
 * item, proper spacing between the values and proper ordering.
 *
 * @param lsps array of LSPs
 * @param num size of LSP array
 *
 * @note basically a double version of #ff_acelp_reorder_lsf(), might be
 *       useful to put in a generic location later on. Parts are also
 *       present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
 *       which is in float.
 */
static void stabilize_lsps(double *lsps, int num)
{
    int n, m, l;

    /* set minimum value for first, maximum value for last and minimum
     * spacing between LSF values.
     * Very similar to ff_set_min_dist_lsf(), but in double. */
    lsps[0]       = FFMAX(lsps[0],       0.0015 * M_PI);
    for (n = 1; n < num; n++)
        lsps[n]   = FFMAX(lsps[n],       lsps[n - 1] + 0.0125 * M_PI);
    lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);

    /* reorder (a single insertion-sort pass, only run if the values turn
     * out to be out of order).
     * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
    for (n = 1; n < num; n++) {
        if (lsps[n] < lsps[n - 1]) {
            for (m = 1; m < num; m++) {
                double tmp = lsps[m];
                for (l = m - 1; l >= 0; l--) {
                    if (lsps[l] <= tmp) break;
                    lsps[l + 1] = lsps[l];
                }
                lsps[l + 1] = tmp;
            }
            break;
        }
    }
}
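
/* Note (added, not from the original source): keeping the LSP frequencies
 * strictly increasing and away from 0 and pi is what keeps the synthesis
 * filter derived from them stable; the 0.0125 * M_PI term above is the
 * minimum spacing enforced between neighbouring values before the final
 * reordering pass. */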

/**
 * Test if there are enough bits to read 1 superframe.
 *
 * @param orig_gb bit I/O context used for reading. This function
 *                does not modify the state of the bitreader; it
 *                only uses it to copy the current stream position
 * @param s WMA Voice decoding context private data
 * @return -1 if unsupported, 1 if there are not enough bits or 0 if OK.
 */
static int check_bits_for_superframe(GetBitContext *orig_gb,
                                     WMAVoiceContext *s)
{
    GetBitContext s_gb, *gb = &s_gb;
    int n, need_bits, bd_idx;
    const struct frame_type_desc *frame_desc;

    /* initialize a copy */
    init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
    skip_bits_long(gb, get_bits_count(orig_gb));
    assert(get_bits_left(gb) == get_bits_left(orig_gb));

    /* superframe header */
    if (get_bits_left(gb) < 14)
        return 1;
    if (!get_bits1(gb))
        return -1;                        // WMAPro-in-WMAVoice superframe
    if (get_bits1(gb)) skip_bits(gb, 12); // number of samples in superframe
    if (s->has_residual_lsps) {           // residual LSPs (for all frames)
        if (get_bits_left(gb) < s->sframe_lsp_bitsize)
            return 1;
        skip_bits_long(gb, s->sframe_lsp_bitsize);
    }

    /* frames */
    for (n = 0; n < MAX_FRAMES; n++) {
        int aw_idx_is_ext = 0;

        if (!s->has_residual_lsps) {     // independent LSPs (per-frame)
            if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1;
            skip_bits_long(gb, s->frame_lsp_bitsize);
        }
        bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)];
        if (bd_idx < 0)
            return -1;                   // invalid frame type VLC code
        frame_desc = &frame_descs[bd_idx];
        if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
            if (get_bits_left(gb) < s->pitch_nbits)
                return 1;
            skip_bits_long(gb, s->pitch_nbits);
        }
        if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
            skip_bits(gb, 8);
        } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
            int tmp = get_bits(gb, 6);
            if (tmp >= 0x36) {
                skip_bits(gb, 2);
                aw_idx_is_ext = 1;
            }
        }

        /* blocks */
        if (frame_desc->acb_type == ACB_TYPE_HAMMING) {
            need_bits = s->block_pitch_nbits +
                (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits;
        } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
            need_bits = 2 * !aw_idx_is_ext;
        } else
            need_bits = 0;
        need_bits += frame_desc->frame_size;
        if (get_bits_left(gb) < need_bits)
            return 1;
        skip_bits_long(gb, need_bits);
    }

    return 0;
}
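
/* Note (added for clarity, not part of the original comments): this walks
 * the same bitstream layout that synth_superframe() / synth_frame() parse,
 * but only skips over the fields, and it does so on a throwaway copy of the
 * bit reader, so the caller's read position is left untouched regardless of
 * the outcome. */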

/**
 * Synthesize output samples for a single superframe. If we have any data
 * cached in s->sframe_cache, that will be used instead of whatever is loaded
 * in s->gb.
 *
 * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
 * for a total of 480 samples per superframe. See #synth_frame() for frame
 * parsing. In addition to the 3 frames, superframes can also contain the
 * LSPs, if these are specified globally (residually) for all frames rather
 * than individually per-frame (see the s->has_residual_lsps option), and can
 * specify the number of samples encoded in this superframe (if less than
 * 480), which is usually used to prevent blanks at track boundaries.
 *
 * @param ctx WMA Voice decoder context
 * @param samples pointer to output buffer for voice samples
 * @param data_size pointer containing the size of #samples on input, and the
 *                  number of bytes written to #samples on output
 * @return 0 on success, <0 on error or 1 if there was not enough data to
 *         fully parse the superframe
 */
static int synth_superframe(AVCodecContext *ctx,
                            float *samples, int *data_size)
{
    WMAVoiceContext *s = ctx->priv_data;
    GetBitContext *gb = &s->gb, s_gb;
    int n, res, n_samples = 480;
    double lsps[MAX_FRAMES][MAX_LSPS];
    const double *mean_lsf = s->lsps == 16 ?
        wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
    float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
    float synth[MAX_LSPS + MAX_SFRAMESIZE];

    memcpy(synth,      s->synth_history,
           s->lsps             * sizeof(*synth));
    memcpy(excitation, s->excitation_history,
           s->history_nsamples * sizeof(*excitation));

    if (s->sframe_cache_size > 0) {
        gb = &s_gb;
        init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
        s->sframe_cache_size = 0;
    }

    if ((res = check_bits_for_superframe(gb, s)) == 1) return 1;

    /* The first bit is the speech/music bit: it differentiates between
     * WMAVoice speech samples (the actual codec) and WMAVoice music samples,
     * which are really WMAPro-in-WMAVoice-superframes. I've never seen those
     * in the wild yet. */
    if (!get_bits1(gb)) {
        av_log_missing_feature(ctx, "WMAPro-in-WMAVoice support", 1);
        return -1;
    }

    /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
    if (get_bits1(gb)) {
        if ((n_samples = get_bits(gb, 12)) > 480) {
            av_log(ctx, AV_LOG_ERROR,
                   "Superframe encodes >480 samples (%d), not allowed\n",
                   n_samples);
            return -1;
        }
    }
    /* Parse LSPs, if global for the superframe (can also be per-frame). */
    if (s->has_residual_lsps) {
        double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];

        for (n = 0; n < s->lsps; n++)
            prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];

        if (s->lsps == 10) {
            dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
        } else /* s->lsps == 16 */
            dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);

        for (n = 0; n < s->lsps; n++) {
            lsps[0][n]  = mean_lsf[n] + (a1[n]           - a2[n * 2]);
            lsps[1][n]  = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
            lsps[2][n] += mean_lsf[n];
        }
        for (n = 0; n < 3; n++)
            stabilize_lsps(lsps[n], s->lsps);
    }

    /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
    for (n = 0; n < 3; n++) {
        if (!s->has_residual_lsps) {
            int m;

            if (s->lsps == 10) {
                dequant_lsp10i(gb, lsps[n]);
            } else /* s->lsps == 16 */
                dequant_lsp16i(gb, lsps[n]);

            for (m = 0; m < s->lsps; m++)
                lsps[n][m] += mean_lsf[m];
            stabilize_lsps(lsps[n], s->lsps);
        }

        if ((res = synth_frame(ctx, gb, n,
                               &samples[n * MAX_FRAMESIZE],
                               lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
                               &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
                               &synth[s->lsps + n * MAX_FRAMESIZE])))
            return res;
    }

    /* Statistics? FIXME - we don't check for length, a slight overrun
     * will be caught by internal buffer padding, and anything else
     * will be skipped, not read. */
    if (get_bits1(gb)) {
        res = get_bits(gb, 4);
        skip_bits(gb, 10 * (res + 1));
    }

    /* Specify nr. of output samples */
    *data_size = n_samples * sizeof(float);

    /* Update history */
    memcpy(s->prev_lsps,           lsps[2],
           s->lsps             * sizeof(*s->prev_lsps));
    memcpy(s->synth_history,      &synth[MAX_SFRAMESIZE],
           s->lsps             * sizeof(*synth));
    memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
           s->history_nsamples * sizeof(*excitation));
    if (s->do_apf)
        memmove(s->zero_exc_pf,       &s->zero_exc_pf[MAX_SFRAMESIZE],
                s->history_nsamples * sizeof(*s->zero_exc_pf));

    return 0;
}

/**
 * Parse the packet header at the start of each packet (input data to this
 * decoder).
 *
 * @param s WMA Voice decoding context private data
 * @return 1 if not enough bits were available, or 0 on success.
 */
static int parse_packet_header(WMAVoiceContext *s)
{
    GetBitContext *gb = &s->gb;
    unsigned int res;

    if (get_bits_left(gb) < 11)
        return 1;
    skip_bits(gb, 4);          // packet sequence number
    s->has_residual_lsps = get_bits1(gb);
    do {
        res = get_bits(gb, 6); // number of superframes per packet
                               // (minus first one if there is spillover)
        if (get_bits_left(gb) < 6 * (res == 0x3F) + s->spillover_bitsize)
            return 1;
    } while (res == 0x3F);
    s->spillover_nbits   = get_bits(gb, s->spillover_bitsize);

    return 0;
}
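
/* Packet header layout, as parsed above (summary added for clarity, not part
 * of the original comments):
 *   4 bits  packet sequence number (skipped)
 *   1 bit   has_residual_lsps flag
 *   6 bits  superframe count; the value 0x3F means "more follow" and the
 *           field is repeated
 *   N bits  spillover_nbits (N = s->spillover_bitsize), the number of bits
 *           of the previous packet's last superframe that spill over into
 *           this packet. */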

/**
 * Copy (unaligned) bits from gb/data/size to pb.
 *
 * @param pb target buffer to copy bits into
 * @param data source buffer to copy bits from
 * @param size size of the source data, in bytes
 * @param gb bit I/O context specifying the current position in the source
 *           data. This function might use this to align the bit position to
 *           a whole-byte boundary before calling #ff_copy_bits() on aligned
 *           source data
 * @param nbits the number of bits to copy from source to target
 *
 * @note after calling this function, the current position in the input bit
 *       I/O context is undefined.
 */
static void copy_bits(PutBitContext *pb,
                      const uint8_t *data, int size,
                      GetBitContext *gb, int nbits)
{
    int rmn_bytes, rmn_bits;

    rmn_bits = rmn_bytes = get_bits_left(gb);
    if (rmn_bits < nbits)
        return;
    rmn_bits &= 7; rmn_bytes >>= 3;
    if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
        put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
    ff_copy_bits(pb, data + size - rmn_bytes,
                 FFMIN(nbits - rmn_bits, rmn_bytes << 3));
}
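
/* Note (added, not from the original source): the source read position in gb
 * is usually not byte aligned, so the leftover 1-7 bits up to the next byte
 * boundary are copied bit-by-bit through put_bits() first; the remainder is
 * then handed to ff_copy_bits(), which works on the byte-aligned tail of the
 * data buffer. */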

/**
 * Packet decoding: a packet is whatever the (ASF) demuxer outputs, and we
 * expect the demuxer / application to provide it to us as such (else you'll
 * probably get garbage as output). Every packet has a size of
 * ctx->block_align bytes and starts with a packet header (see
 * #parse_packet_header()), followed by a series of superframes. Superframe
 * boundaries may cross packet boundaries, i.e. a superframe's data can be
 * split over (at most two) consecutive packets.
 *
 * For more information about frames, see #synth_superframe().
 */
static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
                                  int *data_size, AVPacket *avpkt)
{
    WMAVoiceContext *s = ctx->priv_data;
    GetBitContext *gb = &s->gb;
    int size, res, pos;

    if (*data_size < 480 * sizeof(float)) {
        av_log(ctx, AV_LOG_ERROR,
               "Output buffer too small (%d given - %zu needed)\n",
               *data_size, 480 * sizeof(float));
        return -1;
    }
    *data_size = 0;

    /* Packets are sometimes a multiple of ctx->block_align, with a packet
     * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
     * feeds us ASF packets, which may concatenate multiple "codec" packets
     * in a single "muxer" packet, so we artificially emulate that by
     * capping the packet size at ctx->block_align. */
    for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
    if (!size)
        return 0;
    init_get_bits(&s->gb, avpkt->data, size << 3);

    /* size == ctx->block_align is used to indicate whether we are dealing with
     * a new packet or a packet of which we already read the packet header
     * previously. */
    if (size == ctx->block_align) { // new packet header
        if ((res = parse_packet_header(s)) < 0)
            return res;

        /* If the packet header specifies a s->spillover_nbits, then we want
         * to push out all data of the previous packet (+ spillover) before
         * continuing to parse new superframes in the current packet. */
        if (s->spillover_nbits > 0) {
            if (s->sframe_cache_size > 0) {
                int cnt = get_bits_count(gb);
                copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
                flush_put_bits(&s->pb);
                s->sframe_cache_size += s->spillover_nbits;
                if ((res = synth_superframe(ctx, data, data_size)) == 0 &&
                    *data_size > 0) {
                    cnt += s->spillover_nbits;
                    s->skip_bits_next = cnt & 7;
                    return cnt >> 3;
                } else
                    skip_bits_long(gb, s->spillover_nbits - cnt +
                                   get_bits_count(gb)); // resync
            } else
                skip_bits_long(gb, s->spillover_nbits);  // resync
        }
    } else if (s->skip_bits_next)
        skip_bits(gb, s->skip_bits_next);

    /* Try parsing superframes in current packet */
    s->sframe_cache_size = 0;
    s->skip_bits_next = 0;
    pos = get_bits_left(gb);
    if ((res = synth_superframe(ctx, data, data_size)) < 0) {
        return res;
    } else if (*data_size > 0) {
        int cnt = get_bits_count(gb);
        s->skip_bits_next = cnt & 7;
        return cnt >> 3;
    } else if ((s->sframe_cache_size = pos) > 0) {
        /* rewind bit reader to start of last (incomplete) superframe... */
        init_get_bits(gb, avpkt->data, size << 3);
        skip_bits_long(gb, (size << 3) - pos);
        assert(get_bits_left(gb) == pos);

        /* ...and cache it for spillover in next packet */
        init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
        copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
        // FIXME bad - just copy bytes as a whole and use the
        // skip_bits_next field
    }

    return size;
}
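
/* Note on the return value (added for clarity, not part of the original
 * comments): when a superframe was fully decoded, the function returns the
 * number of whole bytes consumed (cnt >> 3) and remembers the leftover bit
 * offset in s->skip_bits_next for the next call; when the packet ends in an
 * incomplete superframe, its remaining bits are stashed in s->sframe_cache
 * and combined with the spillover bits of the following packet. */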

static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
{
    WMAVoiceContext *s = ctx->priv_data;

    if (s->do_apf) {
        ff_rdft_end(&s->rdft);
        ff_rdft_end(&s->irdft);
        ff_dct_end(&s->dct);
        ff_dct_end(&s->dst);
    }

    return 0;
}

static av_cold void wmavoice_flush(AVCodecContext *ctx)
{
    WMAVoiceContext *s = ctx->priv_data;
    int n;

    s->postfilter_agc    = 0;
    s->sframe_cache_size = 0;
    s->skip_bits_next    = 0;
    for (n = 0; n < s->lsps; n++)
        s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
    memset(s->excitation_history, 0,
           sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
    memset(s->synth_history,      0,
           sizeof(*s->synth_history)      * MAX_LSPS);
    memset(s->gain_pred_err,      0,
           sizeof(s->gain_pred_err));

    if (s->do_apf) {
        memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
               sizeof(*s->synth_filter_out_buf) * s->lsps);
        memset(s->dcf_mem,              0,
               sizeof(*s->dcf_mem)              * 2);
        memset(s->zero_exc_pf,          0,
               sizeof(*s->zero_exc_pf)          * s->history_nsamples);
        memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
    }
}

AVCodec ff_wmavoice_decoder = {
    "wmavoice",
    AVMEDIA_TYPE_AUDIO,
    CODEC_ID_WMAVOICE,
    sizeof(WMAVoiceContext),
    wmavoice_decode_init,
    NULL,
    wmavoice_decode_end,
    wmavoice_decode_packet,
    CODEC_CAP_SUBFRAMES,
    .flush     = wmavoice_flush,
    .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
};