% FE  Frame-based FFT / mel feature extractor, written with CHiME in mind.
%
%   [feats, energies, frameaudio, frameFFT] = FE(sam, audioconf)
%
% Some cleanup, more support for customised audio parameters.
% Updated 9th Aug 2011
%
% Outputs have been changed: no logarithms are taken here any more.
%
% Input:
% - 'sam' is the audio, with channels stored either as columns or as rows.
%   (The longer dimension is treated as time, the shorter one as the
%   channel count.)
% - 'audioconf' is a struct of audio parameters (as defined in
%   getconfigs.m). All of its parameters are respected, so pass a
%   temporary, edited copy if you want to change the behaviour. Fields
%   that are missing are filled in from the defaults listed at the top of
%   the function body. If 'audioconf' is omitted, the defaults are used.
%
% Output:
% - 'feats' is a [bands x frames x featchannels] array of mel features.
%   If audioconf.melbands is zero, FFT magnitudes are returned instead.
% - 'energies' is a [frames x audiochannels] matrix of frame energies
%   (sum of squared samples, computed before windowing).
% - 'frameaudio' is a [framelen x frames x audiochannels] array of chopped
%   audio data (with preprocessing but without the window function).
% - 'frameFFT' is an [Nfft/2+1 x frames x audiochannels] array of frame
%   FFTs. The window function has been applied, the DC bin has been set
%   to zero, and the result has been truncated to Nfft/2 + 1 bins.
%   However, no abs is taken. You can do this in the calling function, or
%   pick the abs values from 'feats' by using zero melbands.
%
% The main feature output respects audioconf.featchannels, which should
% be either the same as audioconf.channels (the number of input streams)
% or 1 (downmixed to mono by taking the mean of the feature channels).
% The other outputs use the original audio channels, because their
% averaging is not as well defined. Note that there is a significant
% difference between averaging the audio (causing waveform level phase
% attenuation) and averaging the abs-FFT or mel features (phase-invariant
% energy mean). If the former is what you need, downmix the audio in the
% calling function.
%
% Diagnostic messages about missing audio parameters or a channel count
% mismatch are printed only when the internal 'verbose' flag is set;
% mel matrix size mismatches and undefined feature channel requests are
% always reported.

function [feats, energies, frameaudio, frameFFT] = FE(sam, audioconf)

verbose = 0;

% Default config. To guarantee intended operation, you should always
% pass your own, though.
defconf.channels = 2;         % input channels, in CHiME always 2
defconf.featchannels = 1;     % feature level channels
defconf.fs = 16000;           % sampling rate for internal processing
defconf.maxf = 8000;          % maximum frequency to be considered
defconf.minf = 64;            % minimum frequency to be considered
defconf.melbands = 26;        % mel band count (0 to disable)
defconf.framelen_ms = 25;     % millisecond length of each frame
defconf.framestep_ms = 10;    % millisecond step between frames
defconf.windowfunc = 'hamming';  % window function name
defconf.preemphasis = 0.97;   % 0 to disable
defconf.dcremoval = true;     % DC removal in the feature extractor
defconf.Nfft = 0;             % Number of FFT bands (0 to calculate from framelength)

if nargin < 2
    if verbose
        disp('No audioconf given, using defaults.')
    end
    audioconf = defconf;
else
    % Fill in every field the caller left out with its default value.
    fldnames = fieldnames(defconf);
    for fl = 1:length(fldnames)
        fname = fldnames{fl};
        if ~isfield(audioconf, fname)
            if verbose
                fprintf('Field %s missing, copying from defaults.\n', fname)
            end
            audioconf.(fname) = defconf.(fname);
        end
    end
end

% Fetch the shorthand variables.
featbands = audioconf.melbands;
featchans = audioconf.featchannels;
fs = audioconf.fs;
fhigh = audioconf.maxf;
flow = audioconf.minf;

% Frame length and step in samples (rounded up to whole samples).
framelen = ceil(fs * audioconf.framelen_ms / 1000);
frameshift = ceil(fs * audioconf.framestep_ms / 1000);

% FFT size: next power of two above the frame length unless given.
if audioconf.Nfft == 0
    Nfft = 2^nextpow2(framelen);
else
    Nfft = audioconf.Nfft;
end
numbins = Nfft/2 + 1;

% The window function is looked up by name and evaluated at the frame
% length. Only its first column is used further down.
winfunc = str2func(audioconf.windowfunc);
win = winfunc(framelen);

% Zero mel bands means "return plain FFT magnitudes".
melmode = (featbands ~= 0);

% Switch audio to columns (time along the first dimension). A plain
% transpose is used so that the samples themselves are never altered.
if size(sam, 1) < size(sam, 2)
    sam = sam.';
end

samlen = size(sam, 1);
samchans = size(sam, 2);

if samchans ~= audioconf.channels
    if verbose
        fprintf('Warning: Audio has %i channels, config states %i.\n', samchans, audioconf.channels);
    end
end

if melmode
    % mel_matrix is defined outside this file; its result is transposed
    % here and is expected to come out as [featbands x Nfft/2+1].
    melmat = mel_matrix(fs, featbands, Nfft, 1, fhigh, flow)';
    if size(melmat, 1) ~= featbands
        fprintf('Mel matrix has %i bands (config: %i).\n', size(melmat, 1), featbands);
    end
    if size(melmat, 2) ~= numbins
        fprintf('Mel matrix has %i FFT coeffs (expected: %i).\n', size(melmat, 2), numbins);
    end
end

% Truncate to full frames, get the number.
numframes = floor((samlen - framelen + frameshift) / frameshift);
sam = sam(1:(numframes*frameshift + framelen - frameshift), :);

% DC removal - introduces a 1-unit filter delay, thus a zero row is
% prepended and the first output sample is discarded. Note that this
% behaviour has changed from earlier versions of FE.
if audioconf.dcremoval
    samf = filter([1; -1], [1; -0.999], [zeros(1, samchans); sam]);
    sam = samf(2:end, :);
end
samtrlen = size(sam, 1); % trimmed length

% Pre-emphasis if nonzero. Can be done for the whole audio at once.
% The first sample of every channel is set to zero.
if (audioconf.preemphasis > 0)
    sam = [zeros(1, samchans); sam(2:samtrlen, :) - audioconf.preemphasis * sam(1:(end-1), :)];
end

if melmode
    tmpfeats = zeros(featbands, numframes, samchans);
else
    tmpfeats = zeros(numbins, numframes, samchans);
end

energies = zeros(numframes, samchans);
frameaudio = zeros(framelen, numframes, samchans);
frameFFT = zeros(numbins, numframes, samchans);

% Build the (framelen x numframes) sample index matrix once; it is the
% same for every channel. It is the sum of
% 1) constant columns, each holding the frame's (one-based) start sample,
% 2) an increasing sample offset column (1...frame length),
% 3) -1, because both of the above are one-based.
%
%   [start1  start2 ]   [ 1    1  ]         [ start1       start2      ]
%   [ ...     ...   ] + [...  ... ] - 1  =  [   ...          ...       ]
%   [start1  start2 ]   [frl  frl ]         [start1+frl-1 start2+frl-1 ]
%
% Start samples are generated from the frame count directly, so exactly
% 'numframes' columns are produced for any frame shift (including 1).
ind1 = (0:numframes-1) * frameshift + 1;
ind2 = (1:framelen)';
frameidx = ind1(ones(framelen, 1), :) + ind2(:, ones(1, numframes)) - 1;

% Window replicated over all frames.
winmat = win(:, ones(1, numframes));

% Process channels one by one.
for c = 1:samchans

    % Jump to the correct channel column in linear indexing and fetch
    % the samples of every frame at once.
    fra = sam(frameidx + (c-1)*samtrlen);
    frameaudio(:,:,c) = fra;

    % Frame energies of the unwindowed audio.
    energies(:,c) = sum(fra.^2, 1)';

    % Apply window function, take FFT (zero-padded to Nfft).
    fFFT = fft(winmat .* fra, Nfft);
    % Zero the DC bin and keep the lower half, but do not take abs yet.
    fFFT(1,:) = 0;
    fFFT = fFFT(1:numbins, :);

    % Store the returned FFTs with phase.
    frameFFT(:,:,c) = fFFT;

    if melmode
        tmpfeats(:,:,c) = melmat * abs(fFFT);
    else
        tmpfeats(:,:,c) = abs(fFFT);
    end
end

% Flatten the features if downmixing to 1 is defined.
if featchans == 1
    if samchans > 1
        feats = mean(tmpfeats, 3);
    else
        feats = tmpfeats;
    end
else
    if samchans ~= featchans
        fprintf('Requested %i feature channels for %i audio - not defined. Returning %i.\n', featchans, samchans, samchans)
    end
    feats = tmpfeats;
end