FFmpeg
|
Windows Media Audio Voice compatible decoder. More...
#include <math.h>
#include "libavutil/channel_layout.h"
#include "libavutil/float_dsp.h"
#include "libavutil/mem.h"
#include "avcodec.h"
#include "internal.h"
#include "get_bits.h"
#include "put_bits.h"
#include "wmavoice_data.h"
#include "celp_filters.h"
#include "acelp_vectors.h"
#include "acelp_filters.h"
#include "lsp.h"
#include "dct.h"
#include "rdft.h"
#include "sinewin.h"
Go to the source code of this file.
Data Structures | |
struct | frame_type_desc |
Description of frame types. More... | |
struct | WMAVoiceContext |
WMA Voice decoding context. More... | |
Macros | |
#define | MAX_BLOCKS 8 |
maximum number of blocks per frame More... | |
#define | MAX_LSPS 16 |
maximum filter order More... | |
#define | MAX_LSPS_ALIGN16 16 |
same as MAX_LSPS; needs to be a multiple of 16 for ASM input buffer alignment More... | |
#define | MAX_FRAMES 3 |
maximum number of frames per superframe More... | |
#define | MAX_FRAMESIZE 160 |
maximum number of samples per frame More... | |
#define | MAX_SIGNAL_HISTORY 416 |
maximum excitation signal history More... | |
#define | MAX_SFRAMESIZE (MAX_FRAMESIZE * MAX_FRAMES) |
maximum number of samples per superframe More... | |
#define | SFRAME_CACHE_MAXSIZE 256 |
maximum cache size for frame data that was split over two packets More... | |
#define | VLC_NBITS 6 |
number of bits to read per VLC iteration More... | |
#define | log_range(var, assign) |
Enumerations | |
enum | { ACB_TYPE_NONE = 0, ACB_TYPE_ASYMMETRIC = 1, ACB_TYPE_HAMMING = 2 } |
Adaptive codebook types. More... | |
enum | { FCB_TYPE_SILENCE = 0, FCB_TYPE_HARDCODED = 1, FCB_TYPE_AW_PULSES = 2, FCB_TYPE_EXC_PULSES = 3 } |
Fixed codebook types. More... | |
Functions | |
static av_cold int | decode_vbmtree (GetBitContext *gb, int8_t vbm_tree[25]) |
Set up the variable bit mode (VBM) tree from container extradata. More... | |
static av_cold int | wmavoice_decode_init (AVCodecContext *ctx) |
Set up decoder with parameters from demuxer (extradata etc.). More... | |
static void | dequant_lsps (double *lsps, int num, const uint16_t *values, const uint16_t *sizes, int n_stages, const uint8_t *table, const double *mul_q, const double *base_q) |
Dequantize LSPs. More... | |
static int | pRNG (int frame_cntr, int block_num, int block_size) |
Generate a random number from frame_cntr and block_num, which will lie in the range [0, 1000 - block_size] (so it can be used as an index in a table of size 1000 of which you want to read block_size entries). More... | |
static void | synth_block_hardcoded (WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, const struct frame_type_desc *frame_desc, float *excitation) |
Parse hardcoded signal for a single block. More... | |
static void | synth_block_fcb_acb (WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, int block_pitch_sh2, const struct frame_type_desc *frame_desc, float *excitation) |
Parse FCB/ACB signal for a single block. More... | |
static void | synth_block (WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, int block_pitch_sh2, const double *lsps, const double *prev_lsps, const struct frame_type_desc *frame_desc, float *excitation, float *synth) |
Parse data in a single block. More... | |
static int | synth_frame (AVCodecContext *ctx, GetBitContext *gb, int frame_idx, float *samples, const double *lsps, const double *prev_lsps, float *excitation, float *synth) |
Synthesize output samples for a single frame. More... | |
static void | stabilize_lsps (double *lsps, int num) |
Ensure minimum value for first item, maximum value for last value, proper spacing between each value and proper ordering. More... | |
static int | check_bits_for_superframe (GetBitContext *orig_gb, WMAVoiceContext *s) |
Test if there's enough bits to read 1 superframe. More... | |
static int | synth_superframe (AVCodecContext *ctx, AVFrame *frame, int *got_frame_ptr) |
Synthesize output samples for a single superframe. More... | |
static int | parse_packet_header (WMAVoiceContext *s) |
Parse the packet header at the start of each packet (input data to this decoder). More... | |
static void | copy_bits (PutBitContext *pb, const uint8_t *data, int size, GetBitContext *gb, int nbits) |
Copy (unaligned) bits from gb/data/size to pb. More... | |
static int | wmavoice_decode_packet (AVCodecContext *ctx, void *data, int *got_frame_ptr, AVPacket *avpkt) |
Packet decoding: a packet is anything that the (ASF) demuxer contains, and we expect that the demuxer / application provides it to us as such (else you'll probably get garbage as output). More... | |
static av_cold int | wmavoice_decode_end (AVCodecContext *ctx) |
static av_cold void | wmavoice_flush (AVCodecContext *ctx) |
Postfilter functions | |
Postfilter functions (gain control, wiener denoise filter, DC filter, kalman smoothening, plus surrounding code to wrap it) | |
static void | adaptive_gain_control (float *out, const float *in, const float *speech_synth, int size, float alpha, float *gain_mem) |
Adaptive gain control (as used in postfilter). More... | |
static int | kalman_smoothen (WMAVoiceContext *s, int pitch, const float *in, float *out, int size) |
Kalman smoothing function. More... | |
static float | tilt_factor (const float *lpcs, int n_lpcs) |
Get the tilt factor of a formant filter from its transfer function. More... | |
static void | calc_input_response (WMAVoiceContext *s, float *lpcs, int fcb_type, float *coeffs, int remainder) |
Derive denoise filter coefficients (in real domain) from the LPCs. More... | |
static void | wiener_denoise (WMAVoiceContext *s, int fcb_type, float *synth_pf, int size, const float *lpcs) |
This function applies a Wiener filter on the (noisy) speech signal as a means to denoise it. More... | |
static void | postfilter (WMAVoiceContext *s, const float *synth, float *samples, int size, const float *lpcs, float *zero_exc_pf, int fcb_type, int pitch) |
Averaging projection filter, the postfilter used in WMAVoice. More... | |
LSP dequantization routines | |
LSP dequantization routines, for 10/16LSPs and independent/residual coding.
| |
static void | dequant_lsp10i (GetBitContext *gb, double *lsps) |
Parse 10 independently-coded LSPs. More... | |
static void | dequant_lsp10r (GetBitContext *gb, double *i_lsps, const double *old, double *a1, double *a2, int q_mode) |
Parse 10 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames from them (residual coding). More... | |
static void | dequant_lsp16i (GetBitContext *gb, double *lsps) |
Parse 16 independently-coded LSPs. More... | |
static void | dequant_lsp16r (GetBitContext *gb, double *i_lsps, const double *old, double *a1, double *a2, int q_mode) |
Parse 16 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames from them (residual coding). More... | |
Pitch-adaptive window coding functions | |
The next few functions are for pitch-adaptive window coding. | |
static void | aw_parse_coords (WMAVoiceContext *s, GetBitContext *gb, const int *pitch) |
Parse the offset of the first pitch-adaptive window pulses, and the distribution of pulses between the two blocks in this frame. More... | |
static void | aw_pulse_set2 (WMAVoiceContext *s, GetBitContext *gb, int block_idx, AMRFixed *fcb) |
Apply second set of pitch-adaptive window pulses. More... | |
static void | aw_pulse_set1 (WMAVoiceContext *s, GetBitContext *gb, int block_idx, AMRFixed *fcb) |
Apply first set of pitch-adaptive window pulses. More... | |
Variables | |
static VLC | frame_type_vlc |
Frame type VLC coding. More... | |
static const struct frame_type_desc | frame_descs [17] |
AVCodec | ff_wmavoice_decoder |
Detailed Description
Windows Media Audio Voice compatible decoder.
Definition in file wmavoice.c.
Macro Definition Documentation
#define log_range | ( | var, | |
assign | |||
) |
Referenced by calc_input_response().
#define MAX_BLOCKS 8 |
maximum number of blocks per frame
Definition at line 46 of file wmavoice.c.
Referenced by synth_frame().
#define MAX_FRAMES 3 |
maximum number of frames per superframe
Definition at line 50 of file wmavoice.c.
Referenced by check_bits_for_superframe(), and synth_superframe().
#define MAX_FRAMESIZE 160 |
maximum number of samples per frame
Definition at line 51 of file wmavoice.c.
Referenced by aw_parse_coords(), aw_pulse_set1(), aw_pulse_set2(), postfilter(), synth_block_fcb_acb(), synth_block_hardcoded(), synth_frame(), and synth_superframe().
#define MAX_LSPS 16 |
maximum filter order
Definition at line 47 of file wmavoice.c.
Referenced by synth_block(), synth_frame(), synth_superframe(), and wmavoice_flush().
#define MAX_LSPS_ALIGN16 16 |
same as MAX_LSPS; needs to be multiple
of 16 for ASM input buffer alignment
Definition at line 48 of file wmavoice.c.
Referenced by postfilter(), and wmavoice_flush().
#define MAX_SFRAMESIZE (MAX_FRAMESIZE * MAX_FRAMES) |
maximum number of samples per superframe
Definition at line 53 of file wmavoice.c.
Referenced by synth_superframe().
#define MAX_SIGNAL_HISTORY 416 |
maximum excitation signal history
Definition at line 52 of file wmavoice.c.
Referenced by synth_superframe(), wmavoice_decode_init(), and wmavoice_flush().
#define SFRAME_CACHE_MAXSIZE 256 |
maximum cache size for frame data that
was split over two packets
Definition at line 55 of file wmavoice.c.
Referenced by wmavoice_decode_packet().
#define VLC_NBITS 6 |
number of bits to read per VLC iteration
Definition at line 57 of file wmavoice.c.
Referenced by decode_vbmtree().
Enumeration Type Documentation
anonymous enum |
Adaptive codebook types.
Enumerator | |
---|---|
ACB_TYPE_NONE |
no adaptive codebook (only hardcoded fixed) |
ACB_TYPE_ASYMMETRIC |
adaptive codebook with per-frame pitch, which we interpolate to get a per-sample pitch. Signal is generated using an asymmetric sinc window function
|
ACB_TYPE_HAMMING |
Per-block pitch with signal generation using a Hamming sinc window function.
|
Definition at line 67 of file wmavoice.c.
anonymous enum |
Fixed codebook types.
Definition at line 82 of file wmavoice.c.
Function Documentation
|
static |
Adaptive gain control (as used in postfilter).
Identical to ff_adaptive_gain_control() in acelp_vectors.c, except that the energy here is calculated using sum(abs(...)), whereas the other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
- Parameters
-
out | output buffer for filtered samples |
in | input buffer containing the samples as they are after the postfilter steps so far |
speech_synth | input buffer containing speech synth before postfilter |
size | input buffer size |
alpha | exponential filter factor |
gain_mem | pointer to filter memory (single float) |
Definition at line 469 of file wmavoice.c.
Referenced by postfilter().
|
static |
Parse the offset of the first pitch-adaptive window pulses, and the distribution of pulses between the two blocks in this frame.
- Parameters
-
s WMA Voice decoding context private data gb bit I/O context pitch pitch for each block in this frame
Definition at line 998 of file wmavoice.c.
Referenced by synth_frame().
|
static |
Apply first set of pitch-adaptive window pulses.
- Parameters
-
s WMA Voice decoding context private data gb bit I/O context block_idx block index in frame [0, 1] fcb storage location for fixed codebook pulse info
Definition at line 1138 of file wmavoice.c.
Referenced by synth_block_fcb_acb().
|
static |
Apply second set of pitch-adaptive window pulses.
- Parameters
-
s WMA Voice decoding context private data gb bit I/O context block_idx block index in frame [0, 1] fcb structure containing fixed codebook vector info
Definition at line 1049 of file wmavoice.c.
Referenced by synth_block_fcb_acb().
|
static |
Derive denoise filter coefficients (in real domain) from the LPCs.
Definition at line 568 of file wmavoice.c.
Referenced by wiener_denoise().
|
static |
Test if there's enough bits to read 1 superframe.
- Parameters
-
orig_gb bit I/O context used for reading. This function does not modify the state of the bitreader; it only uses it to copy the current stream position s WMA Voice decoding context private data
- Returns
- -1 if unsupported, 1 on not enough bits or 0 if OK.
Definition at line 1647 of file wmavoice.c.
Referenced by synth_superframe().
|
static |
Copy (unaligned) bits from gb/data/size to pb.
- Parameters
-
pb | target buffer to copy bits into |
data | source buffer to copy bits from |
size | size of the source data, in bytes |
gb | bit I/O context specifying the current position in the source data. This function might use this to align the bit position to a whole-byte boundary before calling avpriv_copy_bits() on aligned source data |
nbits | the amount of bits to copy from source to target |
- Note
- after calling this function, the current position in the input bit I/O context is undefined.
Definition at line 1898 of file wmavoice.c.
Referenced by wmavoice_decode_packet().
|
static |
Set up the variable bit mode (VBM) tree from container extradata.
- Parameters
-
gb bit I/O context. The bit context (s->gb) should be loaded with byte 23-46 of the container extradata (i.e. the ones containing the VBM tree). vbm_tree pointer to array to which the decoded VBM tree will be written.
- Returns
- 0 on success, <0 on error.
Definition at line 304 of file wmavoice.c.
Referenced by wmavoice_decode_init().
|
static |
Parse 10 independently-coded LSPs.
Definition at line 853 of file wmavoice.c.
Referenced by dequant_lsp10r(), and synth_superframe().
|
static |
Parse 10 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames from them (residual coding).
Definition at line 879 of file wmavoice.c.
Referenced by synth_superframe().
|
static |
Parse 16 independently-coded LSPs.
Definition at line 915 of file wmavoice.c.
Referenced by dequant_lsp16r(), and synth_superframe().
|
static |
Parse 16 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames from them (residual coding).
Definition at line 948 of file wmavoice.c.
Referenced by synth_superframe().
|
static |
Dequantize LSPs.
- Parameters
-
lsps output pointer to the array that will hold the LSPs num number of LSPs to be dequantized values quantized values, contains n_stages values sizes range (i.e. max value) of each quantized value n_stages number of dequantization runs table dequantization table to be used mul_q LSF multiplier base_q base (lowest) LSF values
Definition at line 821 of file wmavoice.c.
Referenced by dequant_lsp10i(), dequant_lsp10r(), dequant_lsp16i(), and dequant_lsp16r().
|
static |
Kalman smoothing function.
This function looks pitch +/- 3 samples back into history to find the best fitting curve (the one giving the optimal gain of the two signals, i.e. the highest dot product between the two), and then uses that signal history to smoothen the output of the speech synthesis filter.
- Parameters
-
s | WMA Voice decoding context |
pitch | pitch of the speech signal |
in | input speech signal |
out | output pointer for smoothened signal |
size | input/output buffer size |
- Returns
- -1 if no smoothening took place, e.g. because no optimal fit could be found, or 0 on success.
Definition at line 509 of file wmavoice.c.
Referenced by postfilter().
|
static |
Parse the packet header at the start of each packet (input data to this decoder).
- Parameters
-
s WMA Voice decoding context private data
- Returns
- 1 if not enough bits were available, or 0 on success.
Definition at line 1863 of file wmavoice.c.
Referenced by wmavoice_decode_packet().
|
static |
Averaging projection filter, the postfilter used in WMAVoice.
This uses the following steps:
- A zero-synthesis filter (generate excitation from synth signal)
- Kalman smoothing on excitation, based on pitch
- Re-synthesized smoothened output
- Iterative Wiener denoise filter
- Adaptive gain filter
- DC filter
- Parameters
-
s WMAVoice decoding context synth Speech synthesis output (before postfilter) samples Output buffer for filtered samples size Buffer size of synth & samples lpcs Generated LPCs used for speech synthesis zero_exc_pf destination for zero synthesis filter (16-byte aligned) fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses) pitch Pitch of the input signal
Definition at line 767 of file wmavoice.c.
Referenced by synth_frame().
|
static |
Generate a random number from frame_cntr and block_num, which will lie in the range [0, 1000 - block_size] (so it can be used as an index in a table of size 1000 of which you want to read block_size entries).
- Parameters
-
frame_cntr current frame number block_num current block index block_size amount of entries we want to read from a table that has 1000 entries
- Returns
- a (non-)random number in the [0, 1000 - block_size] range.
Definition at line 1199 of file wmavoice.c.
Referenced by synth_block_hardcoded().
|
static |
Ensure minimum value for first item, maximum value for last value, proper spacing between each value and proper ordering.
- Parameters
-
lsps array of LSPs num size of LSP array
- Note
- basically a double version of ff_acelp_reorder_lsf(), might be useful to put in a generic location later on. Parts are also present in ff_set_min_dist_lsf() + ff_sort_nearly_sorted_floats(), which is in float.
Definition at line 1609 of file wmavoice.c.
Referenced by synth_superframe().
|
static |
Parse data in a single block.
- Note
- we assume enough bits are available, caller should check.
- Parameters
-
s WMA Voice decoding context private data gb bit I/O context block_idx index of the to-be-read block size amount of samples to be read in this block block_pitch_sh2 pitch for this block << 2 lsps LSPs for (the end of) this frame prev_lsps LSPs for the last frame frame_desc frame type descriptor excitation target memory for the ACB+FCB interpolated signal synth target memory for the speech synthesis filter output
- Returns
- 0 on success, <0 on error.
Definition at line 1390 of file wmavoice.c.
Referenced by synth_frame().
|
static |
Parse FCB/ACB signal for a single block.
- Note
- see synth_block().
Definition at line 1266 of file wmavoice.c.
Referenced by synth_block().
|
static |
Parse hardcoded signal for a single block.
- Note
- see synth_block().
Definition at line 1235 of file wmavoice.c.
Referenced by synth_block().
|
static |
Synthesize output samples for a single frame.
- Note
- we assume enough bits are available, caller should check.
- Parameters
-
ctx WMA Voice decoder context gb bit I/O context (s->gb or one for cross-packet superframes) frame_idx Frame number within superframe [0-2] samples pointer to output sample buffer, has space for at least 160 samples lsps LSP array prev_lsps array of previous frame's LSPs excitation target buffer for excitation signal synth target buffer for synthesized speech data
- Returns
- 0 on success, <0 on error.
Definition at line 1433 of file wmavoice.c.
Referenced by synth_superframe().
|
static |
Synthesize output samples for a single superframe.
If we have any data cached in s->sframe_cache, that will be used instead of whatever is loaded in s->gb.
WMA Voice superframes contain 3 frames, each containing 160 audio samples, to give a total of 480 samples per superframe. See synth_frame() for frame parsing. In addition to 3 frames, superframes can also contain the LSPs (if these are globally specified for all frames, i.e. residually coded; they can also be specified individually per-frame, see the s->has_residual_lsps option), and can specify the number of samples encoded in this superframe (if less than 480), usually used to prevent blanks at track boundaries.
- Parameters
-
ctx WMA Voice decoder context
- Returns
- 0 on success, <0 on error or 1 if there was not enough data to fully parse the superframe
Definition at line 1732 of file wmavoice.c.
Referenced by wmavoice_decode_packet().
|
static |
Get the tilt factor of a formant filter from its transfer function.
- See also
- tilt_factor() in amrnbdec.c, which does essentially the same, but somehow (??) it does a speech synthesis filter in the middle, which is missing here
- Parameters
-
lpcs LPC coefficients n_lpcs Size of LPC buffer
- Returns
- the tilt factor
Definition at line 555 of file wmavoice.c.
Referenced by calc_input_response(), and wiener_denoise().
|
static |
This function applies a Wiener filter on the (noisy) speech signal as a means to denoise it.
- take RDFT of LPCs to get the power spectrum of the noise + speech;
- using this power spectrum, calculate (for each frequency) the Wiener filter gain, which depends on the frequency power and desired level of noise subtraction (when set too high, this leads to artifacts). We can do this symmetrically over the X-axis (so 0-4kHz is the inverse of 4-8kHz);
- by doing a phase shift, calculate the Hilbert transform of this array of per-frequency filter-gains to get the filtering coefficients;
- smoothen/normalize/de-tilt these filter coefficients as desired;
- take RDFT of noisy sound, apply the coefficients and take its IRDFT to get the denoised speech signal;
- the leftover (i.e. output of the IRDFT on denoised speech data beyond the frame boundary) are saved and applied to subsequent frames by an overlap-add method (otherwise you get clicking-artifacts).
- Parameters
-
s WMA Voice decoding context fcb_type Frame (codebook) type synth_pf input: the noisy speech signal, output: denoised speech data; should be 16-byte aligned (for ASM purposes) size size of the speech data lpcs LPCs used to synthesize this frame's speech data
Definition at line 685 of file wmavoice.c.
Referenced by postfilter().
|
static |
Definition at line 2002 of file wmavoice.c.
|
static |
Set up decoder with parameters from demuxer (extradata etc.).
Extradata layout:
- byte 0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
- byte 19-22: flags field (annoyingly in LE; see below for known values),
- byte 23-46: variable bitmode tree (really just 17 * 3 bits, rest is 0).
Definition at line 338 of file wmavoice.c.
|
static |
Packet decoding: a packet is anything that the (ASF) demuxer contains, and we expect that the demuxer / application provides it to us as such (else you'll probably get garbage as output).
Every packet has a size of ctx->block_align bytes, starts with a packet header (see parse_packet_header()), and then a series of superframes. Superframe boundaries may exceed packets, i.e. superframes can split data over multiple (two) packets.
For more information about frames, see synth_superframe().
Definition at line 1927 of file wmavoice.c.
|
static |
Definition at line 2016 of file wmavoice.c.
Variable Documentation
AVCodec ff_wmavoice_decoder |
Definition at line 2044 of file wmavoice.c.
|
static |
Referenced by check_bits_for_superframe(), and synth_frame().
|
static |
Definition at line 62 of file wmavoice.c.
Generated on Fri Dec 20 2024 06:56:17 for FFmpeg by 1.8.11