dcase2013_ed_vuegenetal: functions/fe

annotate functions/fe_funcs/FE.m @ 0:2fadb31a9d55 tip

Import code by Vuegen et al

author	Dan Stowell <dan.stowell@elec.qmul.ac.uk>
date	Fri, 11 Oct 2013 12:02:43 +0100
parents
children

rev	line source
% Yet another feature extractor, this time with CHiME in mind.
% Some cleanup, more support for customised audio parameters.
% Updated 9th Aug 2011
%
% Outputs have been changed. Currently no logarithms are taken here any
% more.
%
% Usage:
%   [feats, energies, frameaudio, frameFFT] = FE(sam, audioconf)
%
% Input:
% - 'sam' is the audio, given as either column or row channel vectors.
%   (Longer dimension is treated as time, shorter as channel count.)
% - 'audioconf' is as defined in getconfigs.m . All of its parameters
%   ARE respected now, so pass a temporary, edited copy if you want
%   to change the behaviour. Fields that are missing from 'audioconf'
%   are filled in from the defaults listed at the top of the function.
%   If 'audioconf' is omitted entirely, the defaults are used as such.
%
% Output:
% - 'feats' is a [bands x frames x featchannels] array of mel features.
%   If audioconf.melbands is zero, FFT magnitudes are returned instead.
% - 'energies' is a [frames x audiochannels] matrix of frame energies
%   (sum of squared samples of each preprocessed, unwindowed frame).
% - 'frameaudio' is a [framelen x frames x audiochannels] array of chopped
%   audio data (with preprocessing but without the window function).
% - 'frameFFT' is an [Nfft/2+1 x frames x audiochannels] array of frame
%   FFTs. The windowing function has been applied, the constant (DC) bin
%   has been set to zero, and the result has been truncated to
%   Nfft/2 + 1 bands. However, no abs is taken. You can do this in the
%   calling function, or pick the abs values from 'feats' by using zero
%   melbands.
%
% The main feature output respects audioconf.featchannels, which should
% be either the same as audioconf.channels (the number of input streams)
% or 1 (downmixed to mono by taking the mean of feature channels). Other
% outputs use original audio channels, because their averaging is not as
% well defined. Note that there is a significant difference between
% averaging the audio (causing waveform level phase attenuation) and the
% abs-FFT or Mel features (phase-invariant energy mean). If the former is
% what you need, downmix the audio in the calling function.
%
% Some warnings are shown if audio parameters are missing or they do not
% match with the data.

function [feats, energies, frameaudio, frameFFT] = FE(sam, audioconf)

verbose = 0;

% Default config. To guarantee intended operation, you should always
% pass your own, though.

defconf.channels = 2;           % input channels, in CHiME always 2
defconf.featchannels = 1;       % feature level channels
defconf.fs = 16000;             % sampling rate for internal processing
defconf.maxf = 8000;            % maximum frequency to be considered
defconf.minf = 64;              % minimum frequency to be considered
defconf.melbands = 26;          % mel band count (0 to disable)
defconf.framelen_ms = 25;       % millisecond length of each frame
defconf.framestep_ms = 10;      % millisecond step between frames
defconf.windowfunc = 'hamming'; % window function name
defconf.preemphasis = 0.97;     % 0 to disable
defconf.dcremoval = true;       % DC removal in the feature extractor
defconf.Nfft = 0;               % Number of FFT bands (0 to calculate from framelength)

if nargin < 2
    if verbose
        disp('No audioconf given, using defaults.')
    end
    audioconf = defconf;
else
    % Fill in every field the caller left out with its default value.
    fldnames = fieldnames(defconf);
    for fl = 1:length(fldnames)
        fld = fldnames{fl};
        if ~isfield(audioconf, fld)
            if verbose
                fprintf('Field %s missing, copying from defaults.\n', fld)
            end
            % Dynamic field access must use the current field name.
            audioconf.(fld) = defconf.(fld);
        end
    end
end

% Fetch the shorthand variables.
featbands = audioconf.melbands;
featchans = audioconf.featchannels;
fs = audioconf.fs;
fhigh = audioconf.maxf;
flow = audioconf.minf;

% Frame length and step in samples (rounded up to whole samples).
framelen = ceil(fs * audioconf.framelen_ms / 1000);
frameshift = ceil(fs * audioconf.framestep_ms / 1000);

% FFT size: next power of two above the frame length unless given.
if audioconf.Nfft == 0
    Nfft = 2^nextpow2(framelen);
else
    Nfft = audioconf.Nfft;
end

% The window function is looked up by name and evaluated for one frame.
winfunc = str2func(audioconf.windowfunc);
win = winfunc(framelen);

% Zero mel bands means "return FFT magnitudes instead of mel features".
melmode = (featbands ~= 0);

% Switch audio to columns.
if size(sam, 1) < size(sam, 2)
    sam = sam';
end

samlen = size(sam, 1);
samchans = size(sam, 2);

if samchans ~= audioconf.channels
    if verbose
        fprintf('Warning: Audio has %i channels, config states %i.\n', samchans, audioconf.channels);
    end
end

if melmode
    % mel_matrix is an external helper; its output is transposed here so
    % that it can be multiplied with [FFT bins x frames] magnitudes.
    melmat = mel_matrix(fs, featbands, Nfft, 1, fhigh, flow)';
    if size(melmat, 1) ~= featbands
        fprintf('Mel matrix has %i bands (config: %i).\n', size(melmat, 1), featbands);
    end
    if size(melmat, 2) ~= (Nfft/2 + 1)
        fprintf('Mel matrix has %i FFT coeffs (expected: %i).\n', size(melmat, 2), Nfft/2 + 1);
    end
end

% Truncate to full frames, get the number.
numframes = floor((samlen - framelen + frameshift) / frameshift);
sam = sam(1:(numframes*frameshift + framelen - frameshift), :);

% DC removal - introduces a 1-unit filter delay, thus we discard the
% first sample. Note that this behaviour has changed from earlier
% versions of FE.
if audioconf.dcremoval
    samf = filter([1;-1], [1;-0.999], [zeros(1, samchans); sam]);
    sam = samf(2:end, :);
end
samtrlen = size(sam, 1); % trimmed length

% Pre-emphasis if nonzero. Can be done for the whole audio at once.
if (audioconf.preemphasis > 0)
    sam = [zeros(1, samchans); sam(2:samtrlen, :) - audioconf.preemphasis * sam(1:(end-1), :)];
end

if melmode
    tmpfeats = zeros(featbands, numframes, samchans);
else
    tmpfeats = zeros(Nfft/2 + 1, numframes, samchans);
end

energies = zeros(numframes, samchans);
frameaudio = zeros(framelen, numframes, samchans);
frameFFT = zeros(Nfft/2 + 1, numframes, samchans);

% Build the frame index matrix once; it is the same for every channel.
%
% The index matrix (framelen x numframes) is the sum of
%   1) constant columns, each holding the frame's start sample,
%   2) an increasing sample index column (1...frame length),
%   3) -1, because the two parts above are both one-based.
%
%      [start1     start2    ]   [ 1   1  ]
%      [ ...        ...      ] + [...  ...] - 1
%      [start1     start2    ]   [frl  frl]
%
% Each column thus picks the samples belonging to one frame. A scalar
% channel jump is added inside the loop to reach the right column of
% 'sam' through linear indexing.
%
% Start indices are generated from the frame count directly, so that
% exactly 'numframes' starts exist for any frame shift.
framestarts = 1 + (0:numframes-1) * frameshift;
sampleoffs = (1:framelen)';
frameidx = framestarts(ones(framelen, 1), :) + sampleoffs(:, ones(1, numframes)) - 1;

% Window replicated over all frames (also channel independent).
winmat = win(:, ones(1, numframes));

% Process channels one by one. Trying to perform these ops simultaneously
% for all channels might be possible but tricky.
for c = 1:samchans

    % Pick frame audio for this channel.
    fra = sam(frameidx + (c-1)*samtrlen);
    frameaudio(:,:,c) = fra;

    % Calculate the energies.
    energies(:,c) = sum(fra.^2, 1)';

    % Apply window function, take FFT.
    fFFT = fft(winmat .* fra, Nfft);
    % Truncate and reset constant factor, but do not take abs yet.
    fFFT(1,:) = 0;
    fFFT = fFFT(1:Nfft/2+1, :);

    % Store the returned FFTs with phase.
    frameFFT(:,:,c) = fFFT;

    if melmode
        tmpfeats(:,:,c) = melmat * abs(fFFT);
    else
        tmpfeats(:,:,c) = abs(fFFT);
    end
end

% Flatten the features if downmixing to 1 is defined.
if featchans == 1
    if samchans > 1
        feats = mean(tmpfeats, 3);
    else
        feats = tmpfeats;
    end
else
    if samchans ~= featchans
        fprintf('Requested %i feature channels for %i audio - not defined. Returning %i.\n', featchans, samchans, samchans)
    end
    feats = tmpfeats;
end

Mercurial > hg > dcase2013_ed_vuegenetal

annotate functions/fe_funcs/FE.m @ 0:2fadb31a9d55 tip