ffmpeg / libavcodec / wmavoice.c @ d36beb3f
History  View  Annotate  Download (80 KB)
1 
/*


2 
* Windows Media Audio Voice decoder.

3 
* Copyright (c) 2009 Ronald S. Bultje

4 
*

5 
* This file is part of FFmpeg.

6 
*

7 
* FFmpeg is free software; you can redistribute it and/or

8 
* modify it under the terms of the GNU Lesser General Public

9 
* License as published by the Free Software Foundation; either

10 
* version 2.1 of the License, or (at your option) any later version.

11 
*

12 
* FFmpeg is distributed in the hope that it will be useful,

13 
* but WITHOUT ANY WARRANTY; without even the implied warranty of

14 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

15 
* Lesser General Public License for more details.

16 
*

17 
* You should have received a copy of the GNU Lesser General Public

18 
* License along with FFmpeg; if not, write to the Free Software

19 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

20 
*/

21  
22 
/**

23 
* @file

24 
* @brief Windows Media Audio Voice compatible decoder

25 
* @author Ronald S. Bultje <rsbultje@gmail.com>

26 
*/

27  
28 
#include <math.h> 
29 
#include "avcodec.h" 
30 
#include "get_bits.h" 
31 
#include "put_bits.h" 
32 
#include "wmavoice_data.h" 
33 
#include "celp_math.h" 
34 
#include "celp_filters.h" 
35 
#include "acelp_vectors.h" 
36 
#include "acelp_filters.h" 
37 
#include "lsp.h" 
38 
#include "libavutil/lzo.h" 
39 
#include "avfft.h" 
40 
#include "fft.h" 
41  
#define MAX_BLOCKS           8   ///< maximum number of blocks per frame
#define MAX_LSPS             16  ///< maximum filter order
#define MAX_LSPS_ALIGN16     16  ///< same as #MAX_LSPS; needs to be multiple
                                 ///< of 16 for ASM input buffer alignment
#define MAX_FRAMES           3   ///< maximum number of frames per superframe
#define MAX_FRAMESIZE        160 ///< maximum number of samples per frame
#define MAX_SIGNAL_HISTORY   416 ///< maximum excitation signal history
#define MAX_SFRAMESIZE       (MAX_FRAMESIZE * MAX_FRAMES)
                                 ///< maximum number of samples per superframe
#define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
                                 ///< was split over two packets
#define VLC_NBITS            6   ///< number of bits to read per VLC iteration
54  
55 
/**

56 
* Frame type VLC coding.

57 
*/

58 
static VLC frame_type_vlc;

59  
/**
 * Adaptive codebook types.
 */
enum {
    ACB_TYPE_NONE       = 0, ///< no adaptive codebook (only hardcoded fixed)
    ACB_TYPE_ASYMMETRIC = 1, ///< adaptive codebook with per-frame pitch, which
                             ///< we interpolate to get a per-sample pitch.
                             ///< Signal is generated using an asymmetric sinc
                             ///< window function
                             ///< @note see #wmavoice_ipol1_coeffs
    ACB_TYPE_HAMMING    = 2  ///< Per-block pitch with signal generation using
                             ///< a Hamming sinc window function
                             ///< @note see #wmavoice_ipol2_coeffs
};
74  
/**
 * Fixed codebook types.
 */
enum {
    FCB_TYPE_SILENCE    = 0, ///< comfort noise during silence
                             ///< generated from a hardcoded (fixed) codebook
                             ///< with per-frame (low) gain values
    FCB_TYPE_HARDCODED  = 1, ///< hardcoded (fixed) codebook with per-block
                             ///< gain values
    FCB_TYPE_AW_PULSES  = 2, ///< Pitch-adaptive window (AW) pulse signals,
                             ///< used in particular for low-bitrate streams
    FCB_TYPE_EXC_PULSES = 3, ///< Innovation (fixed) codebook pulse sets in
                             ///< combinations of either single pulses or
                             ///< pulse pairs
};
90  
91 
/**

92 
* Description of frame types.

93 
*/

94 
static const struct frame_type_desc { 
95 
uint8_t n_blocks; ///< amount of blocks per frame (each block

96 
///< (contains 160/#n_blocks samples)

97 
uint8_t log_n_blocks; ///< log2(#n_blocks)

98 
uint8_t acb_type; ///< Adaptive codebook type (ACB_TYPE_*)

99 
uint8_t fcb_type; ///< Fixed codebook type (FCB_TYPE_*)

100 
uint8_t dbl_pulses; ///< how many pulse vectors have pulse pairs

101 
///< (rather than just one single pulse)

102 
///< only if #fcb_type == #FCB_TYPE_EXC_PULSES

103 
uint16_t frame_size; ///< the amount of bits that make up the block

104 
///< data (per frame)

105 
} frame_descs[17] = {

106 
{ 1, 0, ACB_TYPE_NONE, FCB_TYPE_SILENCE, 0, 0 }, 
107 
{ 2, 1, ACB_TYPE_NONE, FCB_TYPE_HARDCODED, 0, 28 }, 
108 
{ 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES, 0, 46 }, 
109 
{ 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 80 }, 
110 
{ 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 104 }, 
111 
{ 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0, 108 }, 
112 
{ 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 132 }, 
113 
{ 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 168 }, 
114 
{ 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 64 }, 
115 
{ 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 80 }, 
116 
{ 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 104 }, 
117 
{ 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 108 }, 
118 
{ 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 132 }, 
119 
{ 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 168 }, 
120 
{ 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 176 }, 
121 
{ 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 208 }, 
122 
{ 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 256 } 
123 
}; 
124  
125 
/**

126 
* WMA Voice decoding context.

127 
*/

128 
typedef struct { 
129 
/**

130 
* @defgroup struct_global Global values

131 
* Global values, specified in the stream header / extradata or used

132 
* all over.

133 
* @{

134 
*/

135 
GetBitContext gb; ///< packet bitreader. During decoder init,

136 
///< it contains the extradata from the

137 
///< demuxer. During decoding, it contains

138 
///< packet data.

139 
int8_t vbm_tree[25]; ///< converts VLC codes to frame type 
140  
141 
int spillover_bitsize; ///< number of bits used to specify 
142 
///< #spillover_nbits in the packet header

143 
///< = ceil(log2(ctx>block_align << 3))

144 
int history_nsamples; ///< number of samples in history for signal 
145 
///< prediction (through ACB)

146  
147 
/* postfilter specific values */

148 
int do_apf; ///< whether to apply the averaged 
149 
///< projection filter (APF)

150 
int denoise_strength; ///< strength of denoising in Wiener filter 
151 
///< [011]

152 
int denoise_tilt_corr; ///< Whether to apply tilt correction to the 
153 
///< Wiener filter coefficients (postfilter)

154 
int dc_level; ///< Predicted amount of DC noise, based 
155 
///< on which a DC removal filter is used

156  
157 
int lsps; ///< number of LSPs per frame [10 or 16] 
158 
int lsp_q_mode; ///< defines quantizer defaults [0, 1] 
159 
int lsp_def_mode; ///< defines different sets of LSP defaults 
160 
///< [0, 1]

161 
int frame_lsp_bitsize; ///< size (in bits) of LSPs, when encoded 
162 
///< perframe (independent coding)

163 
int sframe_lsp_bitsize; ///< size (in bits) of LSPs, when encoded 
164 
///< per superframe (residual coding)

165  
166 
int min_pitch_val; ///< base value for pitch parsing code 
167 
int max_pitch_val; ///< max value + 1 for pitch parsing 
168 
int pitch_nbits; ///< number of bits used to specify the 
169 
///< pitch value in the frame header

170 
int block_pitch_nbits; ///< number of bits used to specify the 
171 
///< first block's pitch value

172 
int block_pitch_range; ///< range of the block pitch 
173 
int block_delta_pitch_nbits; ///< number of bits used to specify the 
174 
///< delta pitch between this and the last

175 
///< block's pitch value, used in all but

176 
///< first block

177 
int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is 
178 
///< from this to +this1)

179 
uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale 
180 
///< conversion

181  
182 
/**

183 
* @}

184 
* @defgroup struct_packet Packet values

185 
* Packet values, specified in the packet header or related to a packet.

186 
* A packet is considered to be a single unit of data provided to this

187 
* decoder by the demuxer.

188 
* @{

189 
*/

190 
int spillover_nbits; ///< number of bits of the previous packet's 
191 
///< last superframe preceeding this

192 
///< packet's first full superframe (useful

193 
///< for resynchronization also)

194 
int has_residual_lsps; ///< if set, superframes contain one set of 
195 
///< LSPs that cover all frames, encoded as

196 
///< independent and residual LSPs; if not

197 
///< set, each frame contains its own, fully

198 
///< independent, LSPs

199 
int skip_bits_next; ///< number of bits to skip at the next call 
200 
///< to #wmavoice_decode_packet() (since

201 
///< they're part of the previous superframe)

202  
203 
uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + FF_INPUT_BUFFER_PADDING_SIZE]; 
204 
///< cache for superframe data split over

205 
///< multiple packets

206 
int sframe_cache_size; ///< set to >0 if we have data from an 
207 
///< (incomplete) superframe from a previous

208 
///< packet that spilled over in the current

209 
///< packet; specifies the amount of bits in

210 
///< #sframe_cache

211 
PutBitContext pb; ///< bitstream writer for #sframe_cache

212  
213 
/**

214 
* @}

215 
* @defgroup struct_frame Frame and superframe values

216 
* Superframe and frame data  these can change from frame to frame,

217 
* although some of them do in that case serve as a cache / history for

218 
* the next frame or superframe.

219 
* @{

220 
*/

221 
double prev_lsps[MAX_LSPS]; ///< LSPs of the last frame of the previous 
222 
///< superframe

223 
int last_pitch_val; ///< pitch value of the previous frame 
224 
int last_acb_type; ///< frame type [02] of the previous frame 
225 
int pitch_diff_sh16; ///< ((cur_pitch_val  #last_pitch_val) 
226 
///< << 16) / #MAX_FRAMESIZE

227 
float silence_gain; ///< set for use in blocks if #ACB_TYPE_NONE 
228  
229 
int aw_idx_is_ext; ///< whether the AW index was encoded in 
230 
///< 8 bits (instead of 6)

231 
int aw_pulse_range; ///< the range over which #aw_pulse_set1() 
232 
///< can apply the pulse, relative to the

233 
///< value in aw_first_pulse_off. The exact

234 
///< position of the first AWpulse is within

235 
///< [pulse_off, pulse_off + this], and

236 
///< depends on bitstream values; [16 or 24]

237 
int aw_n_pulses[2]; ///< number of AWpulses in each block; note 
238 
///< that this number can be negative (in

239 
///< which case it basically means "zero")

240 
int aw_first_pulse_off[2]; ///< index of first sample to which to 
241 
///< apply AWpulses, or 0xff if unset

242 
int aw_next_pulse_off_cache; ///< the position (relative to start of the 
243 
///< second block) at which pulses should

244 
///< start to be positioned, serves as a

245 
///< cache for pitchadaptive window pulses

246 
///< between blocks

247  
248 
int frame_cntr; ///< current frame index [0  0xFFFE]; is 
249 
///< only used for comfort noise in #pRNG()

250 
float gain_pred_err[6]; ///< cache for gain prediction 
251 
float excitation_history[MAX_SIGNAL_HISTORY];

252 
///< cache of the signal of previous

253 
///< superframes, used as a history for

254 
///< signal generation

255 
float synth_history[MAX_LSPS]; ///< see #excitation_history 
256 
/**

257 
* @}

258 
* @defgroup post_filter Postfilter values

259 
* Variables used for postfilter implementation, mostly history for

260 
* smoothing and so on, and context variables for FFT/iFFT.

261 
* @{

262 
*/

263 
RDFTContext rdft, irdft; ///< contexts for FFTcalculation in the

264 
///< postfilter (for denoise filter)

265 
DCTContext dct, dst; ///< contexts for phase shift (in Hilbert

266 
///< transform, part of postfilter)

267 
float sin[511], cos[511]; ///< 8bit cosine/sine windows over [pi,pi] 
268 
///< range

269 
float postfilter_agc; ///< gain control memory, used in 
270 
///< #adaptive_gain_control()

271 
float dcf_mem[2]; ///< DC filter history 
272 
float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];

273 
///< zero filter output (i.e. excitation)

274 
///< by postfilter

275 
float denoise_filter_cache[MAX_FRAMESIZE];

276 
int denoise_filter_cache_size; ///< samples in #denoise_filter_cache 
277 
DECLARE_ALIGNED(16, float, tilted_lpcs_pf)[0x80]; 
278 
///< aligned buffer for LPC tilting

279 
DECLARE_ALIGNED(16, float, denoise_coeffs_pf)[0x80]; 
280 
///< aligned buffer for denoise coefficients

281 
DECLARE_ALIGNED(16, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16]; 
282 
///< aligned buffer for postfilter speech

283 
///< synthesis

284 
/**

285 
* @}

286 
*/

287 
} WMAVoiceContext; 
288  
289 
/**

290 
* Set up the variable bit mode (VBM) tree from container extradata.

291 
* @param gb bit I/O context.

292 
* The bit context (s>gb) should be loaded with byte 2346 of the

293 
* container extradata (i.e. the ones containing the VBM tree).

294 
* @param vbm_tree pointer to array to which the decoded VBM tree will be

295 
* written.

296 
* @return 0 on success, <0 on error.

297 
*/

298 
static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25]) 
299 
{ 
300 
static const uint8_t bits[] = { 
301 
2, 2, 2, 4, 4, 4, 
302 
6, 6, 6, 8, 8, 8, 
303 
10, 10, 10, 12, 12, 12, 
304 
14, 14, 14, 14 
305 
}; 
306 
static const uint16_t codes[] = { 
307 
0x0000, 0x0001, 0x0002, // 00/01/10 
308 
0x000c, 0x000d, 0x000e, // 11+00/01/10 
309 
0x003c, 0x003d, 0x003e, // 1111+00/01/10 
310 
0x00fc, 0x00fd, 0x00fe, // 111111+00/01/10 
311 
0x03fc, 0x03fd, 0x03fe, // 11111111+00/01/10 
312 
0x0ffc, 0x0ffd, 0x0ffe, // 1111111111+00/01/10 
313 
0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx 
314 
}; 
315 
int cntr[8], n, res; 
316  
317 
memset(vbm_tree, 0xff, sizeof(vbm_tree)); 
318 
memset(cntr, 0, sizeof(cntr)); 
319 
for (n = 0; n < 17; n++) { 
320 
res = get_bits(gb, 3);

321 
if (cntr[res] > 3) // should be >= 3 + (res == 7)) 
322 
return 1; 
323 
vbm_tree[res * 3 + cntr[res]++] = n;

324 
} 
325 
INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),

326 
bits, 1, 1, codes, 2, 2, 132); 
327 
return 0; 
328 
} 
329  
330 
/**

331 
* Set up decoder with parameters from demuxer (extradata etc.).

332 
*/

333 
static av_cold int wmavoice_decode_init(AVCodecContext *ctx) 
334 
{ 
335 
int n, flags, pitch_range, lsp16_flag;

336 
WMAVoiceContext *s = ctx>priv_data; 
337  
338 
/**

339 
* Extradata layout:

340 
*  byte 018: WMAProinWMAVoice extradata (see wmaprodec.c),

341 
*  byte 1922: flags field (annoyingly in LE; see below for known

342 
* values),

343 
*  byte 2346: variable bitmode tree (really just 17 * 3 bits,

344 
* rest is 0).

345 
*/

346 
if (ctx>extradata_size != 46) { 
347 
av_log(ctx, AV_LOG_ERROR, 
348 
"Invalid extradata size %d (should be 46)\n",

349 
ctx>extradata_size); 
350 
return 1; 
351 
} 
352 
flags = AV_RL32(ctx>extradata + 18);

353 
s>spillover_bitsize = 3 + av_ceil_log2(ctx>block_align);

354 
s>do_apf = flags & 0x1;

355 
if (s>do_apf) {

356 
ff_rdft_init(&s>rdft, 7, DFT_R2C);

357 
ff_rdft_init(&s>irdft, 7, IDFT_C2R);

358 
ff_dct_init(&s>dct, 6, DCT_I);

359 
ff_dct_init(&s>dst, 6, DST_I);

360  
361 
ff_sine_window_init(s>cos, 256);

362 
memcpy(&s>sin[255], s>cos, 256 * sizeof(s>cos[0])); 
363 
for (n = 0; n < 255; n++) { 
364 
s>sin[n] = s>sin[510  n];

365 
s>cos[510  n] = s>cos[n];

366 
} 
367 
} 
368 
s>denoise_strength = (flags >> 2) & 0xF; 
369 
if (s>denoise_strength >= 12) { 
370 
av_log(ctx, AV_LOG_ERROR, 
371 
"Invalid denoise filter strength %d (max=11)\n",

372 
s>denoise_strength); 
373 
return 1; 
374 
} 
375 
s>denoise_tilt_corr = !!(flags & 0x40);

376 
s>dc_level = (flags >> 7) & 0xF; 
377 
s>lsp_q_mode = !!(flags & 0x2000);

378 
s>lsp_def_mode = !!(flags & 0x4000);

379 
lsp16_flag = flags & 0x1000;

380 
if (lsp16_flag) {

381 
s>lsps = 16;

382 
s>frame_lsp_bitsize = 34;

383 
s>sframe_lsp_bitsize = 60;

384 
} else {

385 
s>lsps = 10;

386 
s>frame_lsp_bitsize = 24;

387 
s>sframe_lsp_bitsize = 48;

388 
} 
389 
for (n = 0; n < s>lsps; n++) 
390 
s>prev_lsps[n] = M_PI * (n + 1.0) / (s>lsps + 1.0); 
391  
392 
init_get_bits(&s>gb, ctx>extradata + 22, (ctx>extradata_size  22) << 3); 
393 
if (decode_vbmtree(&s>gb, s>vbm_tree) < 0) { 
394 
av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");

395 
return 1; 
396 
} 
397  
398 
s>min_pitch_val = ((ctx>sample_rate << 8) / 400 + 50) >> 8; 
399 
s>max_pitch_val = ((ctx>sample_rate << 8) * 37 / 2000 + 50) >> 8; 
400 
pitch_range = s>max_pitch_val  s>min_pitch_val; 
401 
s>pitch_nbits = av_ceil_log2(pitch_range); 
402 
s>last_pitch_val = 40;

403 
s>last_acb_type = ACB_TYPE_NONE; 
404 
s>history_nsamples = s>max_pitch_val + 8;

405  
406 
if (s>min_pitch_val < 1  s>history_nsamples > MAX_SIGNAL_HISTORY) { 
407 
int min_sr = ((((1 << 8)  50) * 400) + 0xFF) >> 8, 
408 
max_sr = ((((MAX_SIGNAL_HISTORY  8) << 8) + 205) * 2000 / 37) >> 8; 
409  
410 
av_log(ctx, AV_LOG_ERROR, 
411 
"Unsupported samplerate %d (min=%d, max=%d)\n",

412 
ctx>sample_rate, min_sr, max_sr); // 32222097 Hz

413  
414 
return 1; 
415 
} 
416  
417 
s>block_conv_table[0] = s>min_pitch_val;

418 
s>block_conv_table[1] = (pitch_range * 25) >> 6; 
419 
s>block_conv_table[2] = (pitch_range * 44) >> 6; 
420 
s>block_conv_table[3] = s>max_pitch_val  1; 
421 
s>block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF; 
422 
s>block_delta_pitch_nbits = 1 + av_ceil_log2(s>block_delta_pitch_hrange);

423 
s>block_pitch_range = s>block_conv_table[2] +

424 
s>block_conv_table[3] + 1 + 
425 
2 * (s>block_conv_table[1]  2 * s>min_pitch_val); 
426 
s>block_pitch_nbits = av_ceil_log2(s>block_pitch_range); 
427  
428 
ctx>sample_fmt = AV_SAMPLE_FMT_FLT; 
429  
430 
return 0; 
431 
} 
432  
/**
 * @defgroup postfilter Postfilter functions
 * Postfilter functions (gain control, wiener denoise filter, DC filter,
 * kalman smoothening, plus surrounding code to wrap it)
 * @{
 */
/**
 * Adaptive gain control (as used in postfilter).
 *
 * Identical to #ff_adaptive_gain_control() in acelp_vectors.c, except
 * that the energy here is calculated using sum(abs(...)), whereas the
 * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
 *
 * If the postfilter input carries no energy at all, the gain scale
 * factor is taken as 0 instead of dividing by zero, so the filter
 * memory simply decays and never picks up NaN/Inf.
 *
 * @param out output buffer for filtered samples
 * @param in input buffer containing the samples as they are after the
 *           postfilter steps so far
 * @param speech_synth input buffer containing speech synth before postfilter
 * @param size input buffer size
 * @param alpha exponential filter factor
 * @param gain_mem pointer to filter memory (single float)
 */
static void adaptive_gain_control(float *out, const float *in,
                                  const float *speech_synth,
                                  int size, float alpha, float *gain_mem)
{
    int i;
    float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
    float mem = *gain_mem;

    for (i = 0; i < size; i++) {
        speech_energy     += fabsf(speech_synth[i]);
        postfilter_energy += fabsf(in[i]);
    }
    /* guard the division: silent postfilter output would yield NaN/Inf */
    gain_scale_factor = postfilter_energy == 0.0 ? 0.0 :
                        (1.0 - alpha) * speech_energy / postfilter_energy;

    for (i = 0; i < size; i++) {
        mem    = alpha * mem + gain_scale_factor;
        out[i] = in[i] * mem;
    }

    *gain_mem = mem;
}
475  
476 
/**

477 
* Kalman smoothing function.

478 
*

479 
* This function looks back pitch +/ 3 samples back into history to find

480 
* the best fitting curve (that one giving the optimal gain of the two

481 
* signals, i.e. the highest dot product between the two), and then

482 
* uses that signal history to smoothen the output of the speech synthesis

483 
* filter.

484 
*

485 
* @param s WMA Voice decoding context

486 
* @param pitch pitch of the speech signal

487 
* @param in input speech signal

488 
* @param out output pointer for smoothened signal

489 
* @param size input/output buffer size

490 
*

491 
* @returns 1 if no smoothening took place, e.g. because no optimal

492 
* fit could be found, or 0 on success.

493 
*/

494 
static int kalman_smoothen(WMAVoiceContext *s, int pitch, 
495 
const float *in, float *out, int size) 
496 
{ 
497 
int n;

498 
float optimal_gain = 0, dot; 
499 
const float *ptr = &in[FFMAX(s>min_pitch_val, pitch  3)], 
500 
*end = &in[FFMIN(s>max_pitch_val, pitch + 3)],

501 
*best_hist_ptr; 
502  
503 
/* find best fitting point in history */

504 
do {

505 
dot = ff_dot_productf(in, ptr, size); 
506 
if (dot > optimal_gain) {

507 
optimal_gain = dot; 
508 
best_hist_ptr = ptr; 
509 
} 
510 
} while (ptr >= end);

511  
512 
if (optimal_gain <= 0) 
513 
return 1; 
514 
dot = ff_dot_productf(best_hist_ptr, best_hist_ptr, size); 
515 
if (dot <= 0) // would be 1.0 
516 
return 1; 
517  
518 
if (optimal_gain <= dot) {

519 
dot = dot / (dot + 0.6 * optimal_gain); // 0.6251.000 
520 
} else

521 
dot = 0.625; 
522  
523 
/* actual smoothing */

524 
for (n = 0; n < size; n++) 
525 
out[n] = best_hist_ptr[n] + dot * (in[n]  best_hist_ptr[n]); 
526  
527 
return 0; 
528 
} 
529  
/**
 * Get the tilt factor of a formant filter from its transfer function
 * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
 *      but somehow (??) it does a speech synthesis filter in the
 *      middle, which is missing here
 *
 * @param lpcs LPC coefficients
 * @param n_lpcs Size of LPC buffer
 * @returns the tilt factor
 */
static float tilt_factor(const float *lpcs, int n_lpcs)
{
    float rh0, rh1;

    /* rh0/rh1: lag-0 and lag-1 products of the sequence { 1, lpcs[...] } */
    rh0 = 1.0     + ff_dot_productf(lpcs,  lpcs,    n_lpcs);
    rh1 = lpcs[0] + ff_dot_productf(lpcs, &lpcs[1], n_lpcs - 1);

    return rh1 / rh0;
}
549  
550 
/**

551 
* Derive denoise filter coefficients (in real domain) from the LPCs.

552 
*/

553 
static void calc_input_response(WMAVoiceContext *s, float *lpcs, 
554 
int fcb_type, float *coeffs, int remainder) 
555 
{ 
556 
float last_coeff, min = 15.0, max = 15.0; 
557 
float irange, angle_mul, gain_mul, range, sq;

558 
int n, idx;

559  
560 
/* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */

561 
ff_rdft_calc(&s>rdft, lpcs); 
562 
#define log_range(var, assign) do { \ 
563 
float tmp = log10f(assign); var = tmp; \

564 
max = FFMAX(max, tmp); min = FFMIN(min, tmp); \ 
565 
} while (0) 
566 
log_range(last_coeff, lpcs[1] * lpcs[1]); 
567 
for (n = 1; n < 64; n++) 
568 
log_range(lpcs[n], lpcs[n * 2] * lpcs[n * 2] + 
569 
lpcs[n * 2 + 1] * lpcs[n * 2 + 1]); 
570 
log_range(lpcs[0], lpcs[0] * lpcs[0]); 
571 
#undef log_range

572 
range = max  min; 
573 
lpcs[64] = last_coeff;

574  
575 
/* Now, use this spectrum to pick out these frequencies with higher

576 
* (relative) power/energy (which we then take to be "not noise"),

577 
* and set up a table (still in lpc[]) of (relative) gains per frequency.

578 
* These frequencies will be maintained, while others ("noise") will be

579 
* decreased in the filter output. */

580 
irange = 64.0 / range; // so irange*(maxvalue) is in the range [0, 63] 
581 
gain_mul = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) : 
582 
(5.0 / 14.7)); 
583 
angle_mul = gain_mul * (8.0 * M_LN10 / M_PI); 
584 
for (n = 0; n <= 64; n++) { 
585 
float pwr;

586  
587 
idx = FFMAX(0, lrint((max  lpcs[n]) * irange)  1); 
588 
pwr = wmavoice_denoise_power_table[s>denoise_strength][idx]; 
589 
lpcs[n] = angle_mul * pwr; 
590  
591 
/* 70.57 =~ 1/log10(1.0331663) */

592 
idx = (pwr * gain_mul  0.0295) * 70.570526123; 
593 
if (idx > 127) { // fallback if index falls outside table range 
594 
coeffs[n] = wmavoice_energy_table[127] *

595 
powf(1.0331663, idx  127); 
596 
} else

597 
coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];

598 
} 
599  
600 
/* calculate the Hilbert transform of the gains, which we do (since this

601 
* is a sinus input) by doing a phase shift (in theory, H(sin())=cos()).

602 
* Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the

603 
* "moment" of the LPCs in this filter. */

604 
ff_dct_calc(&s>dct, lpcs); 
605 
ff_dct_calc(&s>dst, lpcs); 
606  
607 
/* Split out the coefficient indexes into phase/magnitude pairs */

608 
idx = 255 + av_clip(lpcs[64], 255, 255); 
609 
coeffs[0] = coeffs[0] * s>cos[idx]; 
610 
idx = 255 + av_clip(lpcs[64]  2 * lpcs[63], 255, 255); 
611 
last_coeff = coeffs[64] * s>cos[idx];

612 
for (n = 63;; n) { 
613 
idx = 255 + av_clip(lpcs[64]  2 * lpcs[n  1], 255, 255); 
614 
coeffs[n * 2 + 1] = coeffs[n] * s>sin[idx]; 
615 
coeffs[n * 2] = coeffs[n] * s>cos[idx];

616  
617 
if (!n) break; 
618  
619 
idx = 255 + av_clip( lpcs[64]  2 * lpcs[n  1], 255, 255); 
620 
coeffs[n * 2 + 1] = coeffs[n] * s>sin[idx]; 
621 
coeffs[n * 2] = coeffs[n] * s>cos[idx];

622 
} 
623 
coeffs[1] = last_coeff;

624  
625 
/* move into real domain */

626 
ff_rdft_calc(&s>irdft, coeffs); 
627  
628 
/* tilt correction and normalize scale */

629 
memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128  remainder)); 
630 
if (s>denoise_tilt_corr) {

631 
float tilt_mem = 0; 
632  
633 
coeffs[remainder  1] = 0; 
634 
ff_tilt_compensation(&tilt_mem, 
635 
1.8 * tilt_factor(coeffs, remainder  1), 
636 
coeffs, remainder); 
637 
} 
638 
sq = (1.0 / 64.0) * sqrtf(1 / ff_dot_productf(coeffs, coeffs, remainder)); 
639 
for (n = 0; n < remainder; n++) 
640 
coeffs[n] *= sq; 
641 
} 
642  
643 
/**

644 
* This function applies a Wiener filter on the (noisy) speech signal as

645 
* a means to denoise it.

646 
*

647 
*  take RDFT of LPCs to get the power spectrum of the noise + speech;

648 
*  using this power spectrum, calculate (for each frequency) the Wiener

649 
* filter gain, which depends on the frequency power and desired level

650 
* of noise subtraction (when set too high, this leads to artifacts)

651 
* We can do this symmetrically over the Xaxis (so 04kHz is the inverse

652 
* of 48kHz);

653 
*  by doing a phase shift, calculate the Hilbert transform of this array

654 
* of perfrequency filtergains to get the filtering coefficients;

655 
*  smoothen/normalize/detilt these filter coefficients as desired;

656 
*  take RDFT of noisy sound, apply the coefficients and take its IRDFT

657 
* to get the denoised speech signal;

658 
*  the leftover (i.e. output of the IRDFT on denoised speech data beyond

659 
* the frame boundary) are saved and applied to subsequent frames by an

660 
* overlapadd method (otherwise you get clickingartifacts).

661 
*

662 
* @param s WMA Voice decoding context

663 
* @param fcb_type Frame (codebook) type

664 
* @param synth_pf input: the noisy speech signal, output: denoised speech

665 
* data; should be 16byte aligned (for ASM purposes)

666 
* @param size size of the speech data

667 
* @param lpcs LPCs used to synthesize this frame's speech data

668 
*/

669 
static void wiener_denoise(WMAVoiceContext *s, int fcb_type, 
670 
float *synth_pf, int size, 
671 
const float *lpcs) 
672 
{ 
673 
int remainder, lim, n;

674  
675 
if (fcb_type != FCB_TYPE_SILENCE) {

676 
float *tilted_lpcs = s>tilted_lpcs_pf,

677 
*coeffs = s>denoise_coeffs_pf, tilt_mem = 0;

678  
679 
tilted_lpcs[0] = 1.0; 
680 
memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s>lsps); 
681 
memset(&tilted_lpcs[s>lsps + 1], 0, 
682 
sizeof(tilted_lpcs[0]) * (128  s>lsps  1)); 
683 
ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s>lsps), 
684 
tilted_lpcs, s>lsps + 2);

685  
686 
/* The IRDFT output (127 samples for 7bit filter) beyond the frame

687 
* size is applied to the next frame. All input beyond this is zero,

688 
* and thus all output beyond this will go towards zero, hence we can

689 
* limit to min(size1, 127size) as a performance consideration. */

690 
remainder = FFMIN(127  size, size  1); 
691 
calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder); 
692  
693 
/* apply coefficients (in frequency spectrum domain), i.e. complex

694 
* number multiplication */

695 
memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128  size)); 
696 
ff_rdft_calc(&s>rdft, synth_pf); 
697 
ff_rdft_calc(&s>rdft, coeffs); 
698 
synth_pf[0] *= coeffs[0]; 
699 
synth_pf[1] *= coeffs[1]; 
700 
for (n = 1; n < 64; n++) { 
701 
float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1]; 
702 
synth_pf[n * 2] = v1 * coeffs[n * 2]  v2 * coeffs[n * 2 + 1]; 
703 
synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1]; 
704 
} 
705 
ff_rdft_calc(&s>irdft, synth_pf); 
706 
} 
707  
708 
/* merge filter output with the history of previous runs */

709 
if (s>denoise_filter_cache_size) {

710 
lim = FFMIN(s>denoise_filter_cache_size, size); 
711 
for (n = 0; n < lim; n++) 
712 
synth_pf[n] += s>denoise_filter_cache[n]; 
713 
s>denoise_filter_cache_size = lim; 
714 
memmove(s>denoise_filter_cache, &s>denoise_filter_cache[size], 
715 
sizeof(s>denoise_filter_cache[0]) * s>denoise_filter_cache_size); 
716 
} 
717  
718 
/* move remainder of filter output into a cache for future runs */

719 
if (fcb_type != FCB_TYPE_SILENCE) {

720 
lim = FFMIN(remainder, s>denoise_filter_cache_size); 
721 
for (n = 0; n < lim; n++) 
722 
s>denoise_filter_cache[n] += synth_pf[size + n]; 
723 
if (lim < remainder) {

724 
memcpy(&s>denoise_filter_cache[lim], &synth_pf[size + lim], 
725 
sizeof(s>denoise_filter_cache[0]) * (remainder  lim)); 
726 
s>denoise_filter_cache_size = remainder; 
727 
} 
728 
} 
729 
} 
730  
731 
/**

732 
* Averaging projection filter, the postfilter used in WMAVoice.

733 
*

734 
* This uses the following steps:

735 
*  A zerosynthesis filter (generate excitation from synth signal)

736 
*  Kalman smoothing on excitation, based on pitch

737 
*  Resynthesized smoothened output

738 
*  Iterative Wiener denoise filter

739 
*  Adaptive gain filter

740 
*  DC filter

741 
*

742 
* @param s WMAVoice decoding context

743 
* @param synth Speech synthesis output (before postfilter)

744 
* @param samples Output buffer for filtered samples

745 
* @param size Buffer size of synth & samples

746 
* @param lpcs Generated LPCs used for speech synthesis

747 
* @param zero_exc_pf destination for zero synthesis filter (16byte aligned)

748 
* @param fcb_type Frame type (silence, hardcoded, AWpulses or FCBpulses)

749 
* @param pitch Pitch of the input signal

750 
*/

751 
static void postfilter(WMAVoiceContext *s, const float *synth, 
752 
float *samples, int size, 
753 
const float *lpcs, float *zero_exc_pf, 
754 
int fcb_type, int pitch) 
755 
{ 
756 
float synth_filter_in_buf[MAX_FRAMESIZE / 2], 
757 
*synth_pf = &s>synth_filter_out_buf[MAX_LSPS_ALIGN16], 
758 
*synth_filter_in = zero_exc_pf; 
759  
760 
assert(size <= MAX_FRAMESIZE / 2);

761  
762 
/* generate excitation from input signal */

763 
ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s>lsps); 
764  
765 
if (fcb_type >= FCB_TYPE_AW_PULSES &&

766 
!kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size)) 
767 
synth_filter_in = synth_filter_in_buf; 
768  
769 
/* resynthesize speech after smoothening, and keep history */

770 
ff_celp_lp_synthesis_filterf(synth_pf, lpcs, 
771 
synth_filter_in, size, s>lsps); 
772 
memcpy(&synth_pf[s>lsps], &synth_pf[size  s>lsps], 
773 
sizeof(synth_pf[0]) * s>lsps); 
774  
775 
wiener_denoise(s, fcb_type, synth_pf, size, lpcs); 
776  
777 
adaptive_gain_control(samples, synth_pf, synth, size, 0.99, 
778 
&s>postfilter_agc); 
779  
780 
if (s>dc_level > 8) { 
781 
/* remove ultralow frequency DC noise / highpass filter;

782 
* coefficients are identical to those used in SIPR decoding,

783 
* and very closely resemble those used in AMRNB decoding. */

784 
ff_acelp_apply_order_2_transfer_function(samples, samples, 
785 
(const float[2]) { 1.99997, 1.0 }, 
786 
(const float[2]) { 1.9330735188, 0.93589198496 }, 
787 
0.93980580475, s>dcf_mem, size); 
788 
} 
789 
} 
790 
/**

791 
* @}

792 
*/

793  
/**
 * Dequantize LSPs
 *
 * Each stage n selects the row values[n] from its own sub-table (sizes[n]
 * rows of num entries each, stored back to back in table) and adds
 * base_q[n] + mul_q[n] * entry to every LSP.
 *
 * @param lsps output pointer to the array that will hold the LSPs
 * @param num number of LSPs to be dequantized
 * @param values quantized values, contains n_stages values
 * @param sizes range (i.e. max value) of each quantized value
 * @param n_stages number of dequantization runs
 * @param table dequantization table to be used
 * @param mul_q LSF multiplier
 * @param base_q base (lowest) LSF values
 */
static void dequant_lsps(double *lsps, int num,
                         const uint16_t *values,
                         const uint16_t *sizes,
                         int n_stages, const uint8_t *table,
                         const double *mul_q,
                         const double *base_q)
{
    int n, m;

    memset(lsps, 0, num * sizeof(*lsps));
    for (n = 0; n < n_stages; n++) {
        const uint8_t *t_off = &table[values[n] * num];
        double base = base_q[n], mul = mul_q[n];

        for (m = 0; m < num; m++)
            lsps[m] += base + mul * t_off[m];

        /* advance to the next stage's sub-table */
        table += sizes[n] * num;
    }
}
825  
826 
/**

827 
* @defgroup lsp_dequant LSP dequantization routines

828 
* LSP dequantization routines, for 10/16LSPs and independent/residual coding.

829 
* @note we assume enough bits are available, caller should check.

830 
* lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;

831 
* lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.

832 
* @{

833 
*/

834 
/**

835 
* Parse 10 independentlycoded LSPs.

836 
*/

837 
static void dequant_lsp10i(GetBitContext *gb, double *lsps) 
838 
{ 
839 
static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 }; 
840 
static const double mul_lsf[4] = { 
841 
5.2187144800e3, 1.4626986422e3, 
842 
9.6179549166e4, 1.1325736225e3 
843 
}; 
844 
static const double base_lsf[4] = { 
845 
M_PI * 2.15522e1, M_PI * 6.1646e2, 
846 
M_PI * 3.3486e2, M_PI * 5.7408e2 
847 
}; 
848 
uint16_t v[4];

849  
850 
v[0] = get_bits(gb, 8); 
851 
v[1] = get_bits(gb, 6); 
852 
v[2] = get_bits(gb, 5); 
853 
v[3] = get_bits(gb, 5); 
854  
855 
dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i, 
856 
mul_lsf, base_lsf); 
857 
} 
858  
859 
/**

860 
* Parse 10 independentlycoded LSPs, and then derive the tables to

861 
* generate LSPs for the other frames from them (residual coding).

862 
*/

863 
static void dequant_lsp10r(GetBitContext *gb, 
864 
double *i_lsps, const double *old, 
865 
double *a1, double *a2, int q_mode) 
866 
{ 
867 
static const uint16_t vec_sizes[3] = { 128, 64, 64 }; 
868 
static const double mul_lsf[3] = { 
869 
2.5807601174e3, 1.2354460219e3, 1.1763821673e3 
870 
}; 
871 
static const double base_lsf[3] = { 
872 
M_PI * 1.07448e1, M_PI * 5.2706e2, M_PI * 5.1634e2 
873 
}; 
874 
const float (*ipol_tab)[2][10] = q_mode ? 
875 
wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a; 
876 
uint16_t interpol, v[3];

877 
int n;

878  
879 
dequant_lsp10i(gb, i_lsps); 
880  
881 
interpol = get_bits(gb, 5);

882 
v[0] = get_bits(gb, 7); 
883 
v[1] = get_bits(gb, 6); 
884 
v[2] = get_bits(gb, 6); 
885  
886 
for (n = 0; n < 10; n++) { 
887 
double delta = old[n]  i_lsps[n];

888 
a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];

889 
a1[10 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n]; 
890 
} 
891  
892 
dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r, 
893 
mul_lsf, base_lsf); 
894 
} 
895  
896 
/**

897 
* Parse 16 independentlycoded LSPs.

898 
*/

899 
static void dequant_lsp16i(GetBitContext *gb, double *lsps) 
900 
{ 
901 
static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 }; 
902 
static const double mul_lsf[5] = { 
903 
3.3439586280e3, 6.9908173703e4, 
904 
3.3216608306e3, 1.0334960326e3, 
905 
3.1899104283e3 
906 
}; 
907 
static const double base_lsf[5] = { 
908 
M_PI * 1.27576e1, M_PI * 2.4292e2, 
909 
M_PI * 1.28094e1, M_PI * 3.2128e2, 
910 
M_PI * 1.29816e1 
911 
}; 
912 
uint16_t v[5];

913  
914 
v[0] = get_bits(gb, 8); 
915 
v[1] = get_bits(gb, 6); 
916 
v[2] = get_bits(gb, 7); 
917 
v[3] = get_bits(gb, 6); 
918 
v[4] = get_bits(gb, 7); 
919  
920 
dequant_lsps( lsps, 5, v, vec_sizes, 2, 
921 
wmavoice_dq_lsp16i1, mul_lsf, base_lsf); 
922 
dequant_lsps(&lsps[5], 5, &v[2], &vec_sizes[2], 2, 
923 
wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]); 
924 
dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1, 
925 
wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]); 
926 
} 
927  
928 
/**

929 
* Parse 16 independentlycoded LSPs, and then derive the tables to

930 
* generate LSPs for the other frames from them (residual coding).

931 
*/

932 
static void dequant_lsp16r(GetBitContext *gb, 
933 
double *i_lsps, const double *old, 
934 
double *a1, double *a2, int q_mode) 
935 
{ 
936 
static const uint16_t vec_sizes[3] = { 128, 128, 128 }; 
937 
static const double mul_lsf[3] = { 
938 
1.2232979501e3, 1.4062241527e3, 1.6114744851e3 
939 
}; 
940 
static const double base_lsf[3] = { 
941 
M_PI * 5.5830e2, M_PI * 5.2908e2, M_PI * 5.4776e2 
942 
}; 
943 
const float (*ipol_tab)[2][16] = q_mode ? 
944 
wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a; 
945 
uint16_t interpol, v[3];

946 
int n;

947  
948 
dequant_lsp16i(gb, i_lsps); 
949  
950 
interpol = get_bits(gb, 5);

951 
v[0] = get_bits(gb, 7); 
952 
v[1] = get_bits(gb, 7); 
953 
v[2] = get_bits(gb, 7); 
954  
955 
for (n = 0; n < 16; n++) { 
956 
double delta = old[n]  i_lsps[n];

957 
a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];

958 
a1[16 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n]; 
959 
} 
960  
961 
dequant_lsps( a2, 10, v, vec_sizes, 1, 
962 
wmavoice_dq_lsp16r1, mul_lsf, base_lsf); 
963 
dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1, 
964 
wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]); 
965 
dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1, 
966 
wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]); 
967 
} 
968  
969 
/**

970 
* @}

971 
* @defgroup aw Pitchadaptive window coding functions

972 
* The next few functions are for pitchadaptive window coding.

973 
* @{

974 
*/

975 
/**

976 
* Parse the offset of the first pitchadaptive window pulses, and

977 
* the distribution of pulses between the two blocks in this frame.

978 
* @param s WMA Voice decoding context private data

979 
* @param gb bit I/O context

980 
* @param pitch pitch for each block in this frame

981 
*/

982 
static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb, 
983 
const int *pitch) 
984 
{ 
985 
static const int16_t start_offset[94] = { 
986 
11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 
987 
13, 15, 18, 17, 19, 20, 21, 22, 23, 24, 25, 26, 
988 
27, 28, 29, 30, 31, 32, 33, 35, 37, 39, 41, 43, 
989 
45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 
990 
69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 
991 
93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 
992 
117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139, 
993 
141, 143, 145, 147, 149, 151, 153, 155, 157, 159 
994 
}; 
995 
int bits, offset;

996  
997 
/* position of pulse */

998 
s>aw_idx_is_ext = 0;

999 
if ((bits = get_bits(gb, 6)) >= 54) { 
1000 
s>aw_idx_is_ext = 1;

1001 
bits += (bits  54) * 3 + get_bits(gb, 2); 
1002 
} 
1003  
1004 
/* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count

1005 
* the distribution of the pulses in each block contained in this frame. */

1006 
s>aw_pulse_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16; 
1007 
for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ; 
1008 
s>aw_n_pulses[0] = (pitch[0]  1 + MAX_FRAMESIZE / 2  offset) / pitch[0]; 
1009 
s>aw_first_pulse_off[0] = offset  s>aw_pulse_range / 2; 
1010 
offset += s>aw_n_pulses[0] * pitch[0]; 
1011 
s>aw_n_pulses[1] = (pitch[1]  1 + MAX_FRAMESIZE  offset) / pitch[1]; 
1012 
s>aw_first_pulse_off[1] = offset  (MAX_FRAMESIZE + s>aw_pulse_range) / 2; 
1013  
1014 
/* if continuing from a position before the block, reset position to

1015 
* start of block (when corrected for the range over which it can be

1016 
* spread in aw_pulse_set1()). */

1017 
if (start_offset[bits] < MAX_FRAMESIZE / 2) { 
1018 
while (s>aw_first_pulse_off[1]  pitch[1] + s>aw_pulse_range > 0) 
1019 
s>aw_first_pulse_off[1] = pitch[1]; 
1020 
if (start_offset[bits] < 0) 
1021 
while (s>aw_first_pulse_off[0]  pitch[0] + s>aw_pulse_range > 0) 
1022 
s>aw_first_pulse_off[0] = pitch[0]; 
1023 
} 
1024 
} 
1025  
1026 
/**

1027 
* Apply second set of pitchadaptive window pulses.

1028 
* @param s WMA Voice decoding context private data

1029 
* @param gb bit I/O context

1030 
* @param block_idx block index in frame [0, 1]

1031 
* @param fcb structure containing fixed codebook vector info

1032 
*/

1033 
static void aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb, 
1034 
int block_idx, AMRFixed *fcb)

1035 
{ 
1036 
uint16_t use_mask_mem[9]; // only 5 are used, rest is padding 
1037 
uint16_t *use_mask = use_mask_mem + 2;

1038 
/* in this function, idx is the index in the 80bit (+ padding) use_mask

1039 
* bitarray. Since use_mask consists of 16bit values, the lower 4 bits

1040 
* of idx are the position of the bit within a particular item in the

1041 
* array (0 being the most significant bit, and 15 being the least

1042 
* significant bit), and the remainder (>> 4) is the index in the

1043 
* use_mask[]array. This is faster and uses less memory than using a

1044 
* 80byte/80int array. */

1045 
int pulse_off = s>aw_first_pulse_off[block_idx],

1046 
pulse_start, n, idx, range, aidx, start_off = 0;

1047  
1048 
/* set offset of first pulse to within this block */

1049 
if (s>aw_n_pulses[block_idx] > 0) 
1050 
while (pulse_off + s>aw_pulse_range < 1) 
1051 
pulse_off += fcb>pitch_lag; 
1052  
1053 
/* find range per pulse */

1054 
if (s>aw_n_pulses[0] > 0) { 
1055 
if (block_idx == 0) { 
1056 
range = 32;

1057 
} else /* block_idx = 1 */ { 
1058 
range = 8;

1059 
if (s>aw_n_pulses[block_idx] > 0) 
1060 
pulse_off = s>aw_next_pulse_off_cache; 
1061 
} 
1062 
} else

1063 
range = 16;

1064 
pulse_start = s>aw_n_pulses[block_idx] > 0 ? pulse_off  range / 2 : 0; 
1065  
1066 
/* aw_pulse_set1() already applies pulses around pulse_off (to be exactly,

1067 
* in the range of [pulse_off, pulse_off + s>aw_pulse_range], and thus

1068 
* we exclude that range from being pulsed again in this function. */

1069 
memset(&use_mask[2], 0, 2 * sizeof(use_mask[0])); 
1070 
memset( use_mask, 1, 5 * sizeof(use_mask[0])); 
1071 
memset(&use_mask[5], 0, 2 * sizeof(use_mask[0])); 
1072 
if (s>aw_n_pulses[block_idx] > 0) 
1073 
for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb>pitch_lag) { 
1074 
int excl_range = s>aw_pulse_range; // always 16 or 24 
1075 
uint16_t *use_mask_ptr = &use_mask[idx >> 4];

1076 
int first_sh = 16  (idx & 15); 
1077 
*use_mask_ptr++ &= 0xFFFF << first_sh;

1078 
excl_range = first_sh; 
1079 
if (excl_range >= 16) { 
1080 
*use_mask_ptr++ = 0;

1081 
*use_mask_ptr &= 0xFFFF >> (excl_range  16); 
1082 
} else

1083 
*use_mask_ptr &= 0xFFFF >> excl_range;

1084 
} 
1085  
1086 
/* find the 'aidx'th offset that is not excluded */

1087 
aidx = get_bits(gb, s>aw_n_pulses[0] > 0 ? 5  2 * block_idx : 4); 
1088 
for (n = 0; n <= aidx; pulse_start++) { 
1089 
for (idx = pulse_start; idx < 0; idx += fcb>pitch_lag) ; 
1090 
if (idx >= MAX_FRAMESIZE / 2) { // find from zero 
1091 
if (use_mask[0]) idx = 0x0F; 
1092 
else if (use_mask[1]) idx = 0x1F; 
1093 
else if (use_mask[2]) idx = 0x2F; 
1094 
else if (use_mask[3]) idx = 0x3F; 
1095 
else if (use_mask[4]) idx = 0x4F; 
1096 
else return; 
1097 
idx = av_log2_16bit(use_mask[idx >> 4]);

1098 
} 
1099 
if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) { 
1100 
use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15)); 
1101 
n++; 
1102 
start_off = idx; 
1103 
} 
1104 
} 
1105  
1106 
fcb>x[fcb>n] = start_off; 
1107 
fcb>y[fcb>n] = get_bits1(gb) ? 1.0 : 1.0; 
1108 
fcb>n++; 
1109  
1110 
/* set offset for next block, relative to start of that block */

1111 
n = (MAX_FRAMESIZE / 2  start_off) % fcb>pitch_lag;

1112 
s>aw_next_pulse_off_cache = n ? fcb>pitch_lag  n : 0;

1113 
} 
1114  
1115 
/**

1116 
* Apply first set of pitchadaptive window pulses.

1117 
* @param s WMA Voice decoding context private data

1118 
* @param gb bit I/O context

1119 
* @param block_idx block index in frame [0, 1]

1120 
* @param fcb storage location for fixed codebook pulse info

1121 
*/

1122 
static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb, 
1123 
int block_idx, AMRFixed *fcb)

1124 
{ 
1125 
int val = get_bits(gb, 12  2 * (s>aw_idx_is_ext && !block_idx)); 
1126 
float v;

1127  
1128 
if (s>aw_n_pulses[block_idx] > 0) { 
1129 
int n, v_mask, i_mask, sh, n_pulses;

1130  
1131 
if (s>aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each 
1132 
n_pulses = 3;

1133 
v_mask = 8;

1134 
i_mask = 7;

1135 
sh = 4;

1136 
} else { // 4 pulses, 1:sign + 2:index each 
1137 
n_pulses = 4;

1138 
v_mask = 4;

1139 
i_mask = 3;

1140 
sh = 3;

1141 
} 
1142  
1143 
for (n = n_pulses  1; n >= 0; n, val >>= sh) { 
1144 
fcb>y[fcb>n] = (val & v_mask) ? 1.0 : 1.0; 
1145 
fcb>x[fcb>n] = (val & i_mask) * n_pulses + n + 
1146 
s>aw_first_pulse_off[block_idx]; 
1147 
while (fcb>x[fcb>n] < 0) 
1148 
fcb>x[fcb>n] += fcb>pitch_lag; 
1149 
if (fcb>x[fcb>n] < MAX_FRAMESIZE / 2) 
1150 
fcb>n++; 
1151 
} 
1152 
} else {

1153 
int num2 = (val & 0x1FF) >> 1, delta, idx; 
1154  
1155 
if (num2 < 1 * 79) { delta = 1; idx = num2 + 1; } 
1156 
else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1  1 * 77; } 
1157 
else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1  2 * 76; } 
1158 
else { delta = 7; idx = num2 + 1  3 * 75; } 
1159 
v = (val & 0x200) ? 1.0 : 1.0; 
1160  
1161 
fcb>no_repeat_mask = 3 << fcb>n;

1162 
fcb>x[fcb>n] = idx  delta; 
1163 
fcb>y[fcb>n] = v; 
1164 
fcb>x[fcb>n + 1] = idx;

1165 
fcb>y[fcb>n + 1] = (val & 1) ? v : v; 
1166 
fcb>n += 2;

1167 
} 
1168 
} 
1169  
/**
 * @}
 *
 * Generate a random number from frame_cntr and block_idx, which will lie
 * in the range [0, 1000 - block_size] (so it can be used as an index in a
 * table of size 1000 of which you want to read block_size entries).
 *
 * @param frame_cntr current frame number
 * @param block_num  current block index
 * @param block_size amount of entries we want to read from a table
 *                   that has 1000 entries
 * @return a (non-)random number in the [0, 1000 - block_size] range.
 */
static int pRNG(int frame_cntr, int block_num, int block_size)
{
    /* array to simplify the calculation of z:
     * y = (x % 9) * 5 + 6;
     * z = (49995 * x) / y;
     * Since y only has 9 values, we can remove the division by using a
     * LUT and using FASTDIV-style divisions. For each of the 9 values
     * of y, we can rewrite z as:
     * z = x * (49995 / y) + x * ((49995 % y) / y)
     * In this table, each col represents one possible value of y, the
     * first number is 49995 / y, and the second is the FASTDIV variant
     * of 49995 % y / y. */
    static const unsigned int div_tbl[9][2] = {
        { 8332,  3 * 715827883U }, // y =  6
        { 4545,  0 * 390451573U }, // y = 11
        { 3124, 11 * 268435456U }, // y = 16
        { 2380, 15 * 204522253U }, // y = 21
        { 1922, 23 * 165191050U }, // y = 26
        { 1612, 23 * 138547333U }, // y = 31
        { 1388, 27 * 119304648U }, // y = 36
        { 1219, 16 * 104755300U }, // y = 41
        { 1086, 39 *  93368855U }  // y = 46
    };
    unsigned int z, y, x = MUL16(block_num, 1877) + frame_cntr;
    if (x >= 0xFFFF) x -= 0xFFFF;   // max value of x is 8*1877+0xFFFE=0x13AA6,
                                    // so this is effectively a modulo (%)
    y = x - 9 * MULH(477218589, x); // x % 9
    z = (uint16_t) (x * div_tbl[y][0] + UMULH(x, div_tbl[y][1]));
                                    // z = x * 49995 / (y * 5 + 6)
    return z % (1000 - block_size);
}
1214  
1215 
/**

1216 
* Parse hardcoded signal for a single block.

1217 
* @note see #synth_block().

1218 
*/

1219 
static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb, 
1220 
int block_idx, int size, 
1221 
const struct frame_type_desc *frame_desc, 
1222 
float *excitation)

1223 
{ 
1224 
float gain;

1225 
int n, r_idx;

1226  
1227 
assert(size <= MAX_FRAMESIZE); 
1228  
1229 
/* Set the offset from which we start reading wmavoice_std_codebook */

1230 
if (frame_desc>fcb_type == FCB_TYPE_SILENCE) {

1231 
r_idx = pRNG(s>frame_cntr, block_idx, size); 
1232 
gain = s>silence_gain; 
1233 
} else /* FCB_TYPE_HARDCODED */ { 
1234 
r_idx = get_bits(gb, 8);

1235 
gain = wmavoice_gain_universal[get_bits(gb, 6)];

1236 
} 
1237  
1238 
/* Clear gain prediction parameters */

1239 
memset(s>gain_pred_err, 0, sizeof(s>gain_pred_err)); 
1240  
1241 
/* Apply gain to hardcoded codebook and use that as excitation signal */

1242 
for (n = 0; n < size; n++) 
1243 
excitation[n] = wmavoice_std_codebook[r_idx + n] * gain; 
1244 
} 
1245  
1246 
/**

1247 
* Parse FCB/ACB signal for a single block.

1248 
* @note see #synth_block().

1249 
*/

1250 
static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb, 
1251 
int block_idx, int size, 
1252 
int block_pitch_sh2,

1253 
const struct frame_type_desc *frame_desc, 
1254 
float *excitation)

1255 
{ 
1256 
static const float gain_coeff[6] = { 
1257 
0.8169, 0.06545, 0.1726, 0.0185, 0.0359, 0.0458 
1258 
}; 
1259 
float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain; 
1260 
int n, idx, gain_weight;

1261 
AMRFixed fcb; 
1262  
1263 
assert(size <= MAX_FRAMESIZE / 2);

1264 
memset(pulses, 0, sizeof(*pulses) * size); 
1265  
1266 
fcb.pitch_lag = block_pitch_sh2 >> 2;

1267 
fcb.pitch_fac = 1.0; 
1268 
fcb.no_repeat_mask = 0;

1269 
fcb.n = 0;

1270  
1271 
/* For the other frame types, this is where we apply the innovation

1272 
* (fixed) codebook pulses of the speech signal. */

1273 
if (frame_desc>fcb_type == FCB_TYPE_AW_PULSES) {

1274 
aw_pulse_set1(s, gb, block_idx, &fcb); 
1275 
aw_pulse_set2(s, gb, block_idx, &fcb); 
1276 
} else /* FCB_TYPE_EXC_PULSES */ { 
1277 
int offset_nbits = 5  frame_desc>log_n_blocks; 
1278  
1279 
fcb.no_repeat_mask = 1;

1280 
/* similar to ff_decode_10_pulses_35bits(), but with single pulses

1281 
* (instead of double) for a subset of pulses */

1282 
for (n = 0; n < 5; n++) { 
1283 
float sign;

1284 
int pos1, pos2;

1285  
1286 
sign = get_bits1(gb) ? 1.0 : 1.0; 
1287 
pos1 = get_bits(gb, offset_nbits); 
1288 
fcb.x[fcb.n] = n + 5 * pos1;

1289 
fcb.y[fcb.n++] = sign; 
1290 
if (n < frame_desc>dbl_pulses) {

1291 
pos2 = get_bits(gb, offset_nbits); 
1292 
fcb.x[fcb.n] = n + 5 * pos2;

1293 
fcb.y[fcb.n++] = (pos1 < pos2) ? sign : sign; 
1294 
} 
1295 
} 
1296 
} 
1297 
ff_set_fixed_vector(pulses, &fcb, 1.0, size); 
1298  
1299 
/* Calculate gain for adaptive & fixed codebook signal.

1300 
* see ff_amr_set_fixed_gain(). */

1301 
idx = get_bits(gb, 7);

1302 
fcb_gain = expf(ff_dot_productf(s>gain_pred_err, gain_coeff, 6) 

1303 
5.2409161640 + wmavoice_gain_codebook_fcb[idx]); 
1304 
acb_gain = wmavoice_gain_codebook_acb[idx]; 
1305 
pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx], 
1306 
2.9957322736 /* log(0.05) */, 
1307 
1.6094379124 /* log(5.0) */); 
1308  
1309 
gain_weight = 8 >> frame_desc>log_n_blocks;

1310 
memmove(&s>gain_pred_err[gain_weight], s>gain_pred_err, 
1311 
sizeof(*s>gain_pred_err) * (6  gain_weight)); 
1312 
for (n = 0; n < gain_weight; n++) 
1313 
s>gain_pred_err[n] = pred_err; 
1314  
1315 
/* Calculation of adaptive codebook */

1316 
if (frame_desc>acb_type == ACB_TYPE_ASYMMETRIC) {

1317 
int len;

1318 
for (n = 0; n < size; n += len) { 
1319 
int next_idx_sh16;

1320 
int abs_idx = block_idx * size + n;

1321 
int pitch_sh16 = (s>last_pitch_val << 16) + 
1322 
s>pitch_diff_sh16 * abs_idx; 
1323 
int pitch = (pitch_sh16 + 0x6FFF) >> 16; 
1324 
int idx_sh16 = ((pitch << 16)  pitch_sh16) * 8 + 0x58000; 
1325 
idx = idx_sh16 >> 16;

1326 
if (s>pitch_diff_sh16) {

1327 
if (s>pitch_diff_sh16 > 0) { 
1328 
next_idx_sh16 = (idx_sh16) &~ 0xFFFF;

1329 
} else

1330 
next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF; 
1331 
len = av_clip((idx_sh16  next_idx_sh16) / s>pitch_diff_sh16 / 8,

1332 
1, size  n);

1333 
} else

1334 
len = size; 
1335  
1336 
ff_acelp_interpolatef(&excitation[n], &excitation[n  pitch], 
1337 
wmavoice_ipol1_coeffs, 17,

1338 
idx, 9, len);

1339 
} 
1340 
} else /* ACB_TYPE_HAMMING */ { 
1341 
int block_pitch = block_pitch_sh2 >> 2; 
1342 
idx = block_pitch_sh2 & 3;

1343 
if (idx) {

1344 
ff_acelp_interpolatef(excitation, &excitation[block_pitch], 
1345 
wmavoice_ipol2_coeffs, 4,

1346 
idx, 8, size);

1347 
} else

1348 
av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch, 
1349 
sizeof(float) * size); 
1350 
} 
1351  
1352 
/* Interpolate ACB/FCB and use as excitation signal */

1353 
ff_weighted_vector_sumf(excitation, excitation, pulses, 
1354 
acb_gain, fcb_gain, size); 
1355 
} 
1356  
1357 
/**

1358 
* Parse data in a single block.

1359 
* @note we assume enough bits are available, caller should check.

1360 
*

1361 
* @param s WMA Voice decoding context private data

1362 
* @param gb bit I/O context

1363 
* @param block_idx index of the toberead block

1364 
* @param size amount of samples to be read in this block

1365 
* @param block_pitch_sh2 pitch for this block << 2

1366 
* @param lsps LSPs for (the end of) this frame

1367 
* @param prev_lsps LSPs for the last frame

1368 
* @param frame_desc frame type descriptor

1369 
* @param excitation target memory for the ACB+FCB interpolated signal

1370 
* @param synth target memory for the speech synthesis filter output

1371 
* @return 0 on success, <0 on error.

1372 
*/

1373 
static void synth_block(WMAVoiceContext *s, GetBitContext *gb, 
1374 
int block_idx, int size, 
1375 
int block_pitch_sh2,

1376 
const double *lsps, const double *prev_lsps, 
1377 
const struct frame_type_desc *frame_desc, 
1378 
float *excitation, float *synth) 
1379 
{ 
1380 
double i_lsps[MAX_LSPS];

1381 
float lpcs[MAX_LSPS];

1382 
float fac;

1383 
int n;

1384  
1385 
if (frame_desc>acb_type == ACB_TYPE_NONE)

1386 
synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation); 
1387 
else

1388 
synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2, 
1389 
frame_desc, excitation); 
1390  
1391 
/* convert interpolated LSPs to LPCs */

1392 
fac = (block_idx + 0.5) / frame_desc>n_blocks; 
1393 
for (n = 0; n < s>lsps; n++) // LSF > LSP 
1394 
i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n]  prev_lsps[n])); 
1395 
ff_acelp_lspd2lpc(i_lsps, lpcs, s>lsps >> 1);

1396  
1397 
/* Speech synthesis */

1398 
ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s>lsps); 
1399 
} 
1400  
1401 
/**

1402 
* Synthesize output samples for a single frame.

1403 
* @note we assume enough bits are available, caller should check.

1404 
*

1405 
* @param ctx WMA Voice decoder context

1406 
* @param gb bit I/O context (s>gb or one for crosspacket superframes)

1407 
* @param frame_idx Frame number within superframe [02]

1408 
* @param samples pointer to output sample buffer, has space for at least 160

1409 
* samples

1410 
* @param lsps LSP array

1411 
* @param prev_lsps array of previous frame's LSPs

1412 
* @param excitation target buffer for excitation signal

1413 
* @param synth target buffer for synthesized speech data

1414 
* @return 0 on success, <0 on error.

1415 
*/

1416 
static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx, 
1417 
float *samples,

1418 
const double *lsps, const double *prev_lsps, 
1419 
float *excitation, float *synth) 
1420 
{ 
1421 
WMAVoiceContext *s = ctx>priv_data; 
1422 
int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val;

1423 
int pitch[MAX_BLOCKS], last_block_pitch;

1424  
1425 
/* Parse frame type ("frame header"), see frame_descs */

1426 
int bd_idx = s>vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], 
1427 
block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks; 
1428  
1429 
if (bd_idx < 0) { 
1430 
av_log(ctx, AV_LOG_ERROR, 
1431 
"Invalid frame type VLC code, skipping\n");

1432 
return 1; 
1433 
} 
1434  
1435 
/* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitchperframe") */

1436 
if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {

1437 
/* Pitch is provided per frame, which is interpreted as the pitch of

1438 
* the last sample of the last block of this frame. We can interpolate

1439 
* the pitch of other blocks (and even pitchpersample) by gradually

1440 
* incrementing/decrementing prev_frame_pitch to cur_pitch_val. */

1441 
n_blocks_x2 = frame_descs[bd_idx].n_blocks << 1;

1442 
log_n_blocks_x2 = frame_descs[bd_idx].log_n_blocks + 1;

1443 
cur_pitch_val = s>min_pitch_val + get_bits(gb, s>pitch_nbits); 
1444 
cur_pitch_val = FFMIN(cur_pitch_val, s>max_pitch_val  1);

1445 
if (s>last_acb_type == ACB_TYPE_NONE 

1446 
20 * abs(cur_pitch_val  s>last_pitch_val) >

1447 
(cur_pitch_val + s>last_pitch_val)) 
1448 
s>last_pitch_val = cur_pitch_val; 
1449  
1450 
/* pitch per block */

1451 
for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) { 
1452 
int fac = n * 2 + 1; 
1453  
1454 
pitch[n] = (MUL16(fac, cur_pitch_val) + 
1455 
MUL16((n_blocks_x2  fac), s>last_pitch_val) + 
1456 
frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2; 
1457 
} 
1458  
1459 
/* "pitchdiffpersample" for calculation of pitch per sample */

1460 
s>pitch_diff_sh16 = 
1461 
((cur_pitch_val  s>last_pitch_val) << 16) / MAX_FRAMESIZE;

1462 
} 
1463  
1464 
/* Global gain (if silence) and pitchadaptive window coordinates */

1465 
switch (frame_descs[bd_idx].fcb_type) {

1466 
case FCB_TYPE_SILENCE:

1467 
s>silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];

1468 
break;

1469 
case FCB_TYPE_AW_PULSES:

1470 
aw_parse_coords(s, gb, pitch); 
1471 
break;

1472 
} 
1473  
1474 
for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) { 
1475 
int bl_pitch_sh2;

1476  
1477 
/* Pitch calculation for ACB_TYPE_HAMMING ("pitchperblock") */

1478 
switch (frame_descs[bd_idx].acb_type) {

1479 
case ACB_TYPE_HAMMING: {

1480 
/* Pitch is given per block. Perblock pitches are encoded as an

1481 
* absolute value for the first block, and then delta values

1482 
* relative to this value) for all subsequent blocks. The scale of

1483 
* this pitch value is semilogaritmic compared to its use in the

1484 
* decoder, so we convert it to normal scale also. */

1485 
int block_pitch,

1486 
t1 = (s>block_conv_table[1]  s>block_conv_table[0]) << 2, 
1487 
t2 = (s>block_conv_table[2]  s>block_conv_table[1]) << 1, 
1488 
t3 = s>block_conv_table[3]  s>block_conv_table[2] + 1; 
1489  
1490 
if (n == 0) { 
1491 
block_pitch = get_bits(gb, s>block_pitch_nbits); 
1492 
} else

1493 
block_pitch = last_block_pitch  s>block_delta_pitch_hrange + 
1494 
get_bits(gb, s>block_delta_pitch_nbits); 
1495 
/* Convert last_ so that any next delta is within _range */

1496 
last_block_pitch = av_clip(block_pitch, 
1497 
s>block_delta_pitch_hrange, 
1498 
s>block_pitch_range  
1499 
s>block_delta_pitch_hrange); 
1500  
1501 
/* Convert semilogstyle scale back to normal scale */

1502 
if (block_pitch < t1) {

1503 
bl_pitch_sh2 = (s>block_conv_table[0] << 2) + block_pitch; 
1504 
} else {

1505 
block_pitch = t1; 
1506 
if (block_pitch < t2) {

1507 
bl_pitch_sh2 = 
1508 
(s>block_conv_table[1] << 2) + (block_pitch << 1); 
1509 
} else {

1510 
block_pitch = t2; 
1511 
if (block_pitch < t3) {

1512 
bl_pitch_sh2 = 
1513 
(s>block_conv_table[2] + block_pitch) << 2; 
1514 
} else

1515 
bl_pitch_sh2 = s>block_conv_table[3] << 2; 
1516 
} 
1517 
} 
1518 
pitch[n] = bl_pitch_sh2 >> 2;

1519 
break;

1520 
} 
1521  
1522 
case ACB_TYPE_ASYMMETRIC: {

1523 
bl_pitch_sh2 = pitch[n] << 2;

1524 
break;

1525 
} 
1526  
1527 
default: // ACB_TYPE_NONE has no pitch 
1528 
bl_pitch_sh2 = 0;

1529 
break;

1530 
} 
1531  
1532 
synth_block(s, gb, n, block_nsamples, bl_pitch_sh2, 
1533 
lsps, prev_lsps, &frame_descs[bd_idx], 
1534 
&excitation[n * block_nsamples], 
1535 
&synth[n * block_nsamples]); 
1536 
} 
1537  
1538 
/* Averaging projection filter, if applicable. Else, just copy samples

1539 
* from synthesis buffer */

1540 
if (s>do_apf) {

1541 
double i_lsps[MAX_LSPS];

1542 
float lpcs[MAX_LSPS];

1543  
1544 
for (n = 0; n < s>lsps; n++) // LSF > LSP 
1545 
i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n])); 
1546 
ff_acelp_lspd2lpc(i_lsps, lpcs, s>lsps >> 1);

1547 
postfilter(s, synth, samples, 80, lpcs,

1548 
&s>zero_exc_pf[s>history_nsamples + MAX_FRAMESIZE * frame_idx], 
1549 
frame_descs[bd_idx].fcb_type, pitch[0]);

1550  
1551 
for (n = 0; n < s>lsps; n++) // LSF > LSP 
1552 
i_lsps[n] = cos(lsps[n]); 
1553 
ff_acelp_lspd2lpc(i_lsps, lpcs, s>lsps >> 1);

1554 
postfilter(s, &synth[80], &samples[80], 80, lpcs, 
1555 
&s>zero_exc_pf[s>history_nsamples + MAX_FRAMESIZE * frame_idx + 80],

1556 
frame_descs[bd_idx].fcb_type, pitch[0]);

1557 
} else

1558 
memcpy(samples, synth, 160 * sizeof(synth[0])); 
1559  
1560 
/* Cache values for next frame */

1561 
s>frame_cntr++; 
1562 
if (s>frame_cntr >= 0xFFFF) s>frame_cntr = 0xFFFF; // i.e. modulo (%) 
1563 
s>last_acb_type = frame_descs[bd_idx].acb_type; 
1564 
switch (frame_descs[bd_idx].acb_type) {

1565 
case ACB_TYPE_NONE:

1566 
s>last_pitch_val = 0;

1567 
break;

1568 
case ACB_TYPE_ASYMMETRIC:

1569 
s>last_pitch_val = cur_pitch_val; 
1570 
break;

1571 
case ACB_TYPE_HAMMING:

1572 
s>last_pitch_val = pitch[frame_descs[bd_idx].n_blocks  1];

1573 
break;

1574 
} 
1575  
1576 
return 0; 
1577 
} 
1578  
1579 
/**

1580 
* Ensure minimum value for first item, maximum value for last value,

1581 
* proper spacing between each value and proper ordering.

1582 
*

1583 
* @param lsps array of LSPs

1584 
* @param num size of LSP array

1585 
*

1586 
* @note basically a double version of #ff_acelp_reorder_lsf(), might be

1587 
* useful to put in a generic location later on. Parts are also

1588 
* present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),

1589 
* which is in float.

1590 
*/

1591 
static void stabilize_lsps(double *lsps, int num) 
1592 
{ 
1593 
int n, m, l;

1594  
1595 
/* set minimum value for first, maximum value for last and minimum

1596 
* spacing between LSF values.

1597 
* Very similar to ff_set_min_dist_lsf(), but in double. */

1598 
lsps[0] = FFMAX(lsps[0], 0.0015 * M_PI); 
1599 
for (n = 1; n < num; n++) 
1600 
lsps[n] = FFMAX(lsps[n], lsps[n  1] + 0.0125 * M_PI); 
1601 
lsps[num  1] = FFMIN(lsps[num  1], 0.9985 * M_PI); 
1602  
1603 
/* reorder (looks like onetime / nonrecursed bubblesort).

1604 
* Very similar to ff_sort_nearly_sorted_floats(), but in double. */

1605 
for (n = 1; n < num; n++) { 
1606 
if (lsps[n] < lsps[n  1]) { 
1607 
for (m = 1; m < num; m++) { 
1608 
double tmp = lsps[m];

1609 
for (l = m  1; l >= 0; l) { 
1610 
if (lsps[l] <= tmp) break; 
1611 
lsps[l + 1] = lsps[l];

1612 
} 
1613 
lsps[l + 1] = tmp;

1614 
} 
1615 
break;

1616 
} 
1617 
} 
1618 
} 
1619  
1620 
/**

1621 
* Test if there's enough bits to read 1 superframe.

1622 
*

1623 
* @param orig_gb bit I/O context used for reading. This function

1624 
* does not modify the state of the bitreader; it

1625 
* only uses it to copy the current stream position

1626 
* @param s WMA Voice decoding context private data

1627 
* @return 1 if unsupported, 1 on not enough bits or 0 if OK.

1628 
*/

1629 
static int check_bits_for_superframe(GetBitContext *orig_gb, 
1630 
WMAVoiceContext *s) 
1631 
{ 
1632 
GetBitContext s_gb, *gb = &s_gb; 
1633 
int n, need_bits, bd_idx;

1634 
const struct frame_type_desc *frame_desc; 
1635  
1636 
/* initialize a copy */

1637 
init_get_bits(gb, orig_gb>buffer, orig_gb>size_in_bits); 
1638 
skip_bits_long(gb, get_bits_count(orig_gb)); 
1639 
assert(get_bits_left(gb) == get_bits_left(orig_gb)); 
1640  
1641 
/* superframe header */

1642 
if (get_bits_left(gb) < 14) 
1643 
return 1; 
1644 
if (!get_bits1(gb))

1645 
return 1; // WMAProinWMAVoice superframe 
1646 
if (get_bits1(gb)) skip_bits(gb, 12); // number of samples in superframe 
1647 
if (s>has_residual_lsps) { // residual LSPs (for all frames) 
1648 
if (get_bits_left(gb) < s>sframe_lsp_bitsize)

1649 
return 1; 
1650 
skip_bits_long(gb, s>sframe_lsp_bitsize); 
1651 
} 
1652  
1653 
/* frames */

1654 
for (n = 0; n < MAX_FRAMES; n++) { 
1655 
int aw_idx_is_ext = 0; 
1656  
1657 
if (!s>has_residual_lsps) { // independent LSPs (perframe) 
1658 
if (get_bits_left(gb) < s>frame_lsp_bitsize) return 1; 
1659 
skip_bits_long(gb, s>frame_lsp_bitsize); 
1660 
} 
1661 
bd_idx = s>vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)]; 
1662 
if (bd_idx < 0) 
1663 
return 1; // invalid frame type VLC code 
1664 
frame_desc = &frame_descs[bd_idx]; 
1665 
if (frame_desc>acb_type == ACB_TYPE_ASYMMETRIC) {

1666 
if (get_bits_left(gb) < s>pitch_nbits)

1667 
return 1; 
1668 
skip_bits_long(gb, s>pitch_nbits); 
1669 
} 
1670 
if (frame_desc>fcb_type == FCB_TYPE_SILENCE) {

1671 
skip_bits(gb, 8);

1672 
} else if (frame_desc>fcb_type == FCB_TYPE_AW_PULSES) { 
1673 
int tmp = get_bits(gb, 6); 
1674 
if (tmp >= 0x36) { 
1675 
skip_bits(gb, 2);

1676 
aw_idx_is_ext = 1;

1677 
} 
1678 
} 
1679  
1680 
/* blocks */

1681 
if (frame_desc>acb_type == ACB_TYPE_HAMMING) {

1682 
need_bits = s>block_pitch_nbits + 
1683 
(frame_desc>n_blocks  1) * s>block_delta_pitch_nbits;

1684 
} else if (frame_desc>fcb_type == FCB_TYPE_AW_PULSES) { 
1685 
need_bits = 2 * !aw_idx_is_ext;

1686 
} else

1687 
need_bits = 0;

1688 
need_bits += frame_desc>frame_size; 
1689 
if (get_bits_left(gb) < need_bits)

1690 
return 1; 
1691 
skip_bits_long(gb, need_bits); 
1692 
} 
1693  
1694 
return 0; 
1695 
} 
1696  
1697 
/**

1698 
* Synthesize output samples for a single superframe. If we have any data

1699 
* cached in s>sframe_cache, that will be used instead of whatever is loaded

1700 
* in s>gb.

1701 
*

1702 
* WMA Voice superframes contain 3 frames, each containing 160 audio samples,

1703 
* to give a total of 480 samples per frame. See #synth_frame() for frame

1704 
* parsing. In addition to 3 frames, superframes can also contain the LSPs

1705 
* (if these are globally specified for all frames (residually); they can

1706 
* also be specified individually perframe. See the s>has_residual_lsps

1707 
* option), and can specify the number of samples encoded in this superframe

1708 
* (if less than 480), usually used to prevent blanks at track boundaries.

1709 
*

1710 
* @param ctx WMA Voice decoder context

1711 
* @param samples pointer to output buffer for voice samples

1712 
* @param data_size pointer containing the size of #samples on input, and the

1713 
* amount of #samples filled on output

1714 
* @return 0 on success, <0 on error or 1 if there was not enough data to

1715 
* fully parse the superframe

1716 
*/

1717 
static int synth_superframe(AVCodecContext *ctx, 
1718 
float *samples, int *data_size) 
1719 
{ 
1720 
WMAVoiceContext *s = ctx>priv_data; 
1721 
GetBitContext *gb = &s>gb, s_gb; 
1722 
int n, res, n_samples = 480; 
1723 
double lsps[MAX_FRAMES][MAX_LSPS];

1724 
const double *mean_lsf = s>lsps == 16 ? 
1725 
wmavoice_mean_lsf16[s>lsp_def_mode] : wmavoice_mean_lsf10[s>lsp_def_mode]; 
1726 
float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12]; 
1727 
float synth[MAX_LSPS + MAX_SFRAMESIZE];

1728  
1729 
memcpy(synth, s>synth_history, 
1730 
s>lsps * sizeof(*synth));

1731 
memcpy(excitation, s>excitation_history, 
1732 
s>history_nsamples * sizeof(*excitation));

1733  
1734 
if (s>sframe_cache_size > 0) { 
1735 
gb = &s_gb; 
1736 
init_get_bits(gb, s>sframe_cache, s>sframe_cache_size); 
1737 
s>sframe_cache_size = 0;

1738 
} 
1739  
1740 
if ((res = check_bits_for_superframe(gb, s)) == 1) return 1; 
1741  
1742 
/* First bit is speech/music bit, it differentiates between WMAVoice

1743 
* speech samples (the actual codec) and WMAVoice music samples, which

1744 
* are really WMAProinWMAVoicesuperframes. I've never seen those in

1745 
* the wild yet. */

1746 
if (!get_bits1(gb)) {

1747 
av_log_missing_feature(ctx, "WMAProinWMAVoice support", 1); 
1748 
return 1; 
1749 
} 
1750  
1751 
/* (optional) nr. of samples in superframe; always <= 480 and >= 0 */

1752 
if (get_bits1(gb)) {

1753 
if ((n_samples = get_bits(gb, 12)) > 480) { 
1754 
av_log(ctx, AV_LOG_ERROR, 
1755 
"Superframe encodes >480 samples (%d), not allowed\n",

1756 
n_samples); 
1757 
return 1; 
1758 
} 
1759 
} 
1760 
/* Parse LSPs, if global for the superframe (can also be perframe). */

1761 
if (s>has_residual_lsps) {

1762 
double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2]; 
1763  
1764 
for (n = 0; n < s>lsps; n++) 
1765 
prev_lsps[n] = s>prev_lsps[n]  mean_lsf[n]; 
1766  
1767 
if (s>lsps == 10) { 
1768 
dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s>lsp_q_mode);

1769 
} else /* s>lsps == 16 */ 
1770 
dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s>lsp_q_mode);

1771  
1772 
for (n = 0; n < s>lsps; n++) { 
1773 
lsps[0][n] = mean_lsf[n] + (a1[n]  a2[n * 2]); 
1774 
lsps[1][n] = mean_lsf[n] + (a1[s>lsps + n]  a2[n * 2 + 1]); 
1775 
lsps[2][n] += mean_lsf[n];

1776 
} 
1777 
for (n = 0; n < 3; n++) 
1778 
stabilize_lsps(lsps[n], s>lsps); 
1779 
} 
1780  
1781 
/* Parse frames, optionally preceeded by perframe (independent) LSPs. */

1782 
for (n = 0; n < 3; n++) { 
1783 
if (!s>has_residual_lsps) {

1784 
int m;

1785  
1786 
if (s>lsps == 10) { 
1787 
dequant_lsp10i(gb, lsps[n]); 
1788 
} else /* s>lsps == 16 */ 
1789 
dequant_lsp16i(gb, lsps[n]); 
1790  
1791 
for (m = 0; m < s>lsps; m++) 
1792 
lsps[n][m] += mean_lsf[m]; 
1793 
stabilize_lsps(lsps[n], s>lsps); 
1794 
} 
1795  
1796 
if ((res = synth_frame(ctx, gb, n,

1797 
&samples[n * MAX_FRAMESIZE], 
1798 
lsps[n], n == 0 ? s>prev_lsps : lsps[n  1], 
1799 
&excitation[s>history_nsamples + n * MAX_FRAMESIZE], 
1800 
&synth[s>lsps + n * MAX_FRAMESIZE]))) 
1801 
return res;

1802 
} 
1803  
1804 
/* Statistics? FIXME  we don't check for length, a slight overrun

1805 
* will be caught by internal buffer padding, and anything else

1806 
* will be skipped, not read. */

1807 
if (get_bits1(gb)) {

1808 
res = get_bits(gb, 4);

1809 
skip_bits(gb, 10 * (res + 1)); 
1810 
} 
1811  
1812 
/* Specify nr. of output samples */

1813 
*data_size = n_samples * sizeof(float); 
1814  
1815 
/* Update history */

1816 
memcpy(s>prev_lsps, lsps[2],

1817 
s>lsps * sizeof(*s>prev_lsps));

1818 
memcpy(s>synth_history, &synth[MAX_SFRAMESIZE], 
1819 
s>lsps * sizeof(*synth));

1820 
memcpy(s>excitation_history, &excitation[MAX_SFRAMESIZE], 
1821 
s>history_nsamples * sizeof(*excitation));

1822 
if (s>do_apf)

1823 
memmove(s>zero_exc_pf, &s>zero_exc_pf[MAX_SFRAMESIZE], 
1824 
s>history_nsamples * sizeof(*s>zero_exc_pf));

1825  
1826 
return 0; 
1827 
} 
1828  
1829 
/**

1830 
* Parse the packet header at the start of each packet (input data to this

1831 
* decoder).

1832 
*

1833 
* @param s WMA Voice decoding context private data

1834 
* @return 1 if not enough bits were available, or 0 on success.

1835 
*/

1836 
static int parse_packet_header(WMAVoiceContext *s) 
1837 
{ 
1838 
GetBitContext *gb = &s>gb; 
1839 
unsigned int res; 
1840  
1841 
if (get_bits_left(gb) < 11) 
1842 
return 1; 
1843 
skip_bits(gb, 4); // packet sequence number 
1844 
s>has_residual_lsps = get_bits1(gb); 
1845 
do {

1846 
res = get_bits(gb, 6); // number of superframes per packet 
1847 
// (minus first one if there is spillover)

1848 
if (get_bits_left(gb) < 6 * (res == 0x3F) + s>spillover_bitsize) 
1849 
return 1; 
1850 
} while (res == 0x3F); 
1851 
s>spillover_nbits = get_bits(gb, s>spillover_bitsize); 
1852  
1853 
return 0; 
1854 
} 
1855  
1856 
/**

1857 
* Copy (unaligned) bits from gb/data/size to pb.

1858 
*

1859 
* @param pb target buffer to copy bits into

1860 
* @param data source buffer to copy bits from

1861 
* @param size size of the source data, in bytes

1862 
* @param gb bit I/O context specifying the current position in the source.

1863 
* data. This function might use this to align the bit position to

1864 
* a wholebyte boundary before calling #ff_copy_bits() on aligned

1865 
* source data

1866 
* @param nbits the amount of bits to copy from source to target

1867 
*

1868 
* @note after calling this function, the current position in the input bit

1869 
* I/O context is undefined.

1870 
*/

1871 
static void copy_bits(PutBitContext *pb, 
1872 
const uint8_t *data, int size, 
1873 
GetBitContext *gb, int nbits)

1874 
{ 
1875 
int rmn_bytes, rmn_bits;

1876  
1877 
rmn_bits = rmn_bytes = get_bits_left(gb); 
1878 
if (rmn_bits < nbits)

1879 
return;

1880 
rmn_bits &= 7; rmn_bytes >>= 3; 
1881 
if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0) 
1882 
put_bits(pb, rmn_bits, get_bits(gb, rmn_bits)); 
1883 
ff_copy_bits(pb, data + size  rmn_bytes, 
1884 
FFMIN(nbits  rmn_bits, rmn_bytes << 3));

1885 
} 
1886  
1887 
/**

1888 
* Packet decoding: a packet is anything that the (ASF) demuxer contains,

1889 
* and we expect that the demuxer / application provides it to us as such

1890 
* (else you'll probably get garbage as output). Every packet has a size of

1891 
* ctx>block_align bytes, starts with a packet header (see

1892 
* #parse_packet_header()), and then a series of superframes. Superframe

1893 
* boundaries may exceed packets, i.e. superframes can split data over

1894 
* multiple (two) packets.

1895 
*

1896 
* For more information about frames, see #synth_superframe().

1897 
*/

1898 
static int wmavoice_decode_packet(AVCodecContext *ctx, void *data, 
1899 
int *data_size, AVPacket *avpkt)

1900 
{ 
1901 
WMAVoiceContext *s = ctx>priv_data; 
1902 
GetBitContext *gb = &s>gb; 
1903 
int size, res, pos;

1904  
1905 
if (*data_size < 480 * sizeof(float)) { 
1906 
av_log(ctx, AV_LOG_ERROR, 
1907 
"Output buffer too small (%d given  %zu needed)\n",

1908 
*data_size, 480 * sizeof(float)); 
1909 
return 1; 
1910 
} 
1911 
*data_size = 0;

1912  
1913 
/* Packets are sometimes a multiple of ctx>block_align, with a packet

1914 
* header at each ctx>block_align bytes. However, FFmpeg's ASF demuxer

1915 
* feeds us ASF packets, which may concatenate multiple "codec" packets

1916 
* in a single "muxer" packet, so we artificially emulate that by

1917 
* capping the packet size at ctx>block_align. */

1918 
for (size = avpkt>size; size > ctx>block_align; size = ctx>block_align);

1919 
if (!size)

1920 
return 0; 
1921 
init_get_bits(&s>gb, avpkt>data, size << 3);

1922  
1923 
/* size == ctx>block_align is used to indicate whether we are dealing with

1924 
* a new packet or a packet of which we already read the packet header

1925 
* previously. */

1926 
if (size == ctx>block_align) { // new packet header 
1927 
if ((res = parse_packet_header(s)) < 0) 
1928 
return res;

1929  
1930 
/* If the packet header specifies a s>spillover_nbits, then we want

1931 
* to push out all data of the previous packet (+ spillover) before

1932 
* continuing to parse new superframes in the current packet. */

1933 
if (s>spillover_nbits > 0) { 
1934 
if (s>sframe_cache_size > 0) { 
1935 
int cnt = get_bits_count(gb);

1936 
copy_bits(&s>pb, avpkt>data, size, gb, s>spillover_nbits); 
1937 
flush_put_bits(&s>pb); 
1938 
s>sframe_cache_size += s>spillover_nbits; 
1939 
if ((res = synth_superframe(ctx, data, data_size)) == 0 && 
1940 
*data_size > 0) {

1941 
cnt += s>spillover_nbits; 
1942 
s>skip_bits_next = cnt & 7;

1943 
return cnt >> 3; 
1944 
} else

1945 
skip_bits_long (gb, s>spillover_nbits  cnt + 
1946 
get_bits_count(gb)); // resync

1947 
} else

1948 
skip_bits_long(gb, s>spillover_nbits); // resync

1949 
} 
1950 
} else if (s>skip_bits_next) 
1951 
skip_bits(gb, s>skip_bits_next); 
1952  
1953 
/* Try parsing superframes in current packet */

1954 
s>sframe_cache_size = 0;

1955 
s>skip_bits_next = 0;

1956 
pos = get_bits_left(gb); 
1957 
if ((res = synth_superframe(ctx, data, data_size)) < 0) { 
1958 
return res;

1959 
} else if (*data_size > 0) { 
1960 
int cnt = get_bits_count(gb);

1961 
s>skip_bits_next = cnt & 7;

1962 
return cnt >> 3; 
1963 
} else if ((s>sframe_cache_size = pos) > 0) { 
1964 
/* rewind bit reader to start of last (incomplete) superframe... */

1965 
init_get_bits(gb, avpkt>data, size << 3);

1966 
skip_bits_long(gb, (size << 3)  pos);

1967 
assert(get_bits_left(gb) == pos); 
1968  
1969 
/* ...and cache it for spillover in next packet */

1970 
init_put_bits(&s>pb, s>sframe_cache, SFRAME_CACHE_MAXSIZE); 
1971 
copy_bits(&s>pb, avpkt>data, size, gb, s>sframe_cache_size); 
1972 
// FIXME bad  just copy bytes as whole and add use the

1973 
// skip_bits_next field

1974 
} 
1975  
1976 
return size;

1977 
} 
1978  
1979 
static av_cold int wmavoice_decode_end(AVCodecContext *ctx) 
1980 
{ 
1981 
WMAVoiceContext *s = ctx>priv_data; 
1982  
1983 
if (s>do_apf) {

1984 
ff_rdft_end(&s>rdft); 
1985 
ff_rdft_end(&s>irdft); 
1986 
ff_dct_end(&s>dct); 
1987 
ff_dct_end(&s>dst); 
1988 
} 
1989  
1990 
return 0; 
1991 
} 
1992  
1993 
static av_cold void wmavoice_flush(AVCodecContext *ctx) 
1994 
{ 
1995 
WMAVoiceContext *s = ctx>priv_data; 
1996 
int n;

1997  
1998 
s>postfilter_agc = 0;

1999 
s>sframe_cache_size = 0;

2000 
s>skip_bits_next = 0;

2001 
for (n = 0; n < s>lsps; n++) 
2002 
s>prev_lsps[n] = M_PI * (n + 1.0) / (s>lsps + 1.0); 
2003 
memset(s>excitation_history, 0,

2004 
sizeof(*s>excitation_history) * MAX_SIGNAL_HISTORY);

2005 
memset(s>synth_history, 0,

2006 
sizeof(*s>synth_history) * MAX_LSPS);

2007 
memset(s>gain_pred_err, 0,

2008 
sizeof(s>gain_pred_err));

2009  
2010 
if (s>do_apf) {

2011 
memset(&s>synth_filter_out_buf[MAX_LSPS_ALIGN16  s>lsps], 0,

2012 
sizeof(*s>synth_filter_out_buf) * s>lsps);

2013 
memset(s>dcf_mem, 0,

2014 
sizeof(*s>dcf_mem) * 2); 
2015 
memset(s>zero_exc_pf, 0,

2016 
sizeof(*s>zero_exc_pf) * s>history_nsamples);

2017 
memset(s>denoise_filter_cache, 0, sizeof(s>denoise_filter_cache)); 
2018 
} 
2019 
} 
2020  
2021 
AVCodec ff_wmavoice_decoder = { 
2022 
"wmavoice",

2023 
AVMEDIA_TYPE_AUDIO, 
2024 
CODEC_ID_WMAVOICE, 
2025 
sizeof(WMAVoiceContext),

2026 
wmavoice_decode_init, 
2027 
NULL,

2028 
wmavoice_decode_end, 
2029 
wmavoice_decode_packet, 
2030 
CODEC_CAP_SUBFRAMES, 
2031 
.flush = wmavoice_flush, 
2032 
.long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),

2033 
}; 