% Yet another feature extractor, this time with CHiME in mind.
% Some cleanup, more support for customised audio parameters.
% Updated 9th Aug 2011
%
% Outputs have been changed. Logarithms are no longer taken here.
%
% Input:
% - 'sam' is the audio, with channels stored as either column or row
%   vectors. (Longer dimension is treated as time, shorter as channel count.)
% - 'audioconf' is as defined in getconfigs.m. All of its parameters
%   ARE respected now, so pass a temporary, edited copy if you want
%   to change the behaviour.
%
% Output:
% - 'feats' is a [bands x frames x featchannels] array of mel features.
%   If audioconf.melbands is zero, FFT magnitudes are returned instead.
% - 'energies' is a [frames x audiochannels] matrix of frame energies.
% - 'frameaudio' is a [framelen x frames x audiochannels] array of chopped
%   audio data (with preprocessing but without the window function).
% - 'frameFFT' is an [FFTlow x frames x audiochannels] array of frame FFTs.
%   The windowing function has been applied, and the result has been
%   truncated to Nfft/2 + 1 bands. However, no abs is taken. You can
%   do this in the calling function, or pick the abs values from 'feats'
%   by using zero melbands.
%
% The main feature output respects audioconf.featchannels, which should
% be either the same as audioconf.channels (the number of input streams)
% or 1 (downmixed to mono by taking the mean of feature channels). Other
% outputs use original audio channels, because their averaging is not as
% well defined. Note that there is a significant difference between
% averaging the audio (causing waveform level phase attenuation) and the
% abs-FFT or Mel features (phase-invariant energy mean).
% If the former is what you need, downmix the audio in the calling function.
%
% Some warnings are shown if audio parameters are missing or they do not
% match with the data.

function [feats, energies, frameaudio, frameFFT] = FE(sam, audioconf)
% FE  Chop audio into frames and compute FFT-magnitude or mel-band features.
%
%   [feats, energies, frameaudio, frameFFT] = FE(sam, audioconf)
%
%   sam        - audio matrix; the longer dimension is treated as time and
%                the shorter one as the channel count.
%   audioconf  - (optional) struct of audio parameters. Any field listed in
%                'defconf' below that is missing from audioconf is copied
%                from the defaults.
%
%   feats      - [bands x frames x channels] features: melmat * abs(FFT)
%                when audioconf.melbands is nonzero, abs(FFT) otherwise.
%                Averaged over channels when audioconf.featchannels == 1.
%   energies   - [frames x audiochannels] sum of squared samples per frame.
%   frameaudio - [framelen x frames x audiochannels] framed audio after
%                DC removal / pre-emphasis, before windowing.
%   frameFFT   - [Nfft/2+1 x frames x audiochannels] complex windowed FFTs
%                with the constant (DC) bin set to zero.

verbose = 0;

% Default config. To guarantee intended operation, you should always
% pass your own, though.

defconf.channels = 2;           % input channels, in CHiME always 2
defconf.featchannels = 1;       % feature level channels
defconf.fs = 16000;             % sampling rate for internal processing
defconf.maxf = 8000;            % maximum frequency to be considered
defconf.minf = 64;              % minimum frequency to be considered
defconf.melbands = 26;          % mel band count (0 to disable)
defconf.framelen_ms = 25;       % millisecond length of each frame
defconf.framestep_ms = 10;      % millisecond step between frames
defconf.windowfunc = 'hamming'; % window function name
defconf.preemphasis = 0.97;     % 0 to disable
defconf.dcremoval = true;       % DC removal in the feature extractor
defconf.Nfft = 0;               % Number of FFT bands (0 to calculate from framelength)

if nargin < 2
    if verbose
        disp('No audioconf given, using defaults.')
    end
    audioconf = defconf;
else
    % Fill in every field the caller left out. (The previous code indexed
    % the structs with an undefined variable here and crashed whenever a
    % partial config was passed.)
    fldnames = fieldnames(defconf);
    for fl = 1:length(fldnames)
        fname = fldnames{fl};
        if ~isfield(audioconf, fname)
            if verbose
                fprintf('Field %s missing, copying from defaults.\n', fname)
            end
            audioconf.(fname) = defconf.(fname);
        end
    end
end

% Fetch the shorthand variables.
featbands = audioconf.melbands;
featchans = audioconf.featchannels;
fs = audioconf.fs;
fhigh = audioconf.maxf;
flow = audioconf.minf;

% Frame length and step in samples, rounded up.
framelen = ceil(fs * audioconf.framelen_ms / 1000);
frameshift = ceil(fs * audioconf.framestep_ms / 1000);

if audioconf.Nfft == 0
    Nfft = 2^nextpow2(framelen);
else
    Nfft = audioconf.Nfft;
end

% The window function is looked up by name and called with the frame length.
winfunc = str2func(audioconf.windowfunc);
win = winfunc(framelen);

melmode = (featbands ~= 0);

% Switch audio to columns.
if size(sam, 1) < size(sam, 2)
    sam = sam';
end

samlen = size(sam, 1);
samchans = size(sam, 2);

if samchans ~= audioconf.channels
    if verbose
        fprintf('Warning: Audio has %i channels, config states %i.\n', samchans, audioconf.channels);
    end
end

if melmode
    % mel_matrix is defined elsewhere; its transposed result is expected to
    % be [featbands x (Nfft/2 + 1)], which the checks below report on.
    melmat = mel_matrix(fs, featbands, Nfft, 1, fhigh, flow)';
    if size(melmat, 1) ~= featbands
        fprintf('Mel matrix has %i bands (config: %i).\n', size(melmat, 1), featbands);
    end
    if size(melmat, 2) ~= (Nfft/2 + 1)
        fprintf('Mel matrix has %i FFT coeffs (expected: %i).\n', size(melmat, 2), Nfft/2 + 1);
    end
end

% Truncate to full frames, get the number. Audio shorter than one frame
% yields zero frames (never a negative count), and the trim length is
% capped so that the indexing stays within the available samples.
numframes = max(0, floor((samlen - framelen + frameshift) / frameshift));
trimlen = min(samlen, (numframes - 1) * frameshift + framelen);
sam = sam(1:trimlen, :);

% DC removal - introduces a 1-unit filter delay, thus we discard the
% first sample. Note that this behaviour has changed from earlier
% versions of FE.
if audioconf.dcremoval
    samf = filter([1; -1], [1; -0.999], [zeros(1, samchans); sam]);
    sam = samf(2:end, :);
end
samtrlen = size(sam, 1);  % trimmed length

% Pre-emphasis if nonzero. Can be done for the whole audio at once.
if (audioconf.preemphasis > 0)
    sam = [zeros(1, samchans); sam(2:samtrlen, :) - audioconf.preemphasis * sam(1:(end-1), :)];
end

if melmode
    tmpfeats = zeros(featbands, numframes, samchans);
else
    tmpfeats = zeros(Nfft/2 + 1, numframes, samchans);
end

energies = zeros(numframes, samchans);
frameaudio = zeros(framelen, numframes, samchans);
frameFFT = zeros(Nfft/2 + 1, numframes, samchans);

% Row indices of every frame sample, built once for all channels:
% column k holds the sample numbers start_k, start_k + 1, ...,
% start_k + framelen - 1 of frame k. Computing the starts from the frame
% count (rather than from a range end point) always gives exactly
% 'numframes' columns, also when frameshift is 1.
framestarts = 1 + (0:numframes-1) * frameshift;   % 1 x numframes
offsets = (0:framelen-1)';                        % framelen x 1
frameidx = framestarts(ones(framelen, 1), :) + offsets(:, ones(1, numframes));

% Window replicated for each frame (first column of 'win' is used).
winmat = win(:, ones(1, numframes));

% Process channels one by one.
for c = 1:samchans

    % Pick this channel's frame audio as a [framelen x numframes] matrix.
    fra = reshape(sam(frameidx(:), c), framelen, numframes);
    frameaudio(:, :, c) = fra;

    % Calculate the energies.
    energies(:, c) = sum(fra.^2, 1)';

    % Apply window function, take FFT along the sample dimension.
    fFFT = fft(winmat .* fra, Nfft, 1);
    % Truncate and reset constant factor, but do not take abs yet.
    fFFT(1, :) = 0;
    fFFT = fFFT(1:Nfft/2+1, :);

    % Store the returned FFTs with phase.
    frameFFT(:, :, c) = fFFT;

    if melmode
        tmpfeats(:, :, c) = melmat * abs(fFFT);
    else
        tmpfeats(:, :, c) = abs(fFFT);
    end
end

% Flatten the features if downmixing to 1 is defined.
if featchans == 1
    if samchans > 1
        feats = mean(tmpfeats, 3);
    else
        feats = tmpfeats;
    end
else
    if samchans ~= featchans
        fprintf('Requested %i feature channels for %i audio - not defined. Returning %i.\n', featchans, samchans, samchans)
    end
    feats = tmpfeats;
end