function [fea, feaNam, feaSiz] = ComputeFeatures(wav, par)
% Computes a frame-wise feature matrix from an audio file by stacking the
% requested feature families (MFCCs, spectral descriptors, LPC, Gabor
% matching-pursuit features, ...) on top of each other.
%
% [fea, feaNam, feaSiz] = ComputeFeatures(wav, par)
%
% Called with no input arguments, the function runs its unit test.
%
% Input
%   -wav: file name locating a .wav audio signal.
%   -par: struct of parameters with the following fields
%       .fs (22050): sampling frequency
%       .fft_size (1024): length of fft
%       .hopsize (512): step between consecutive fft windows
%       .usePreEmphasis (false): apply a pre-emphasis (high pass) filter
%       .feaNam ({'mfcc','dmfcc','nme','hos','zcr','sro','scn','sfl',
%                 'lpc','mpf'}): cell array of feature names to compute,
%                 or a single name given as a string
%     The whole struct is forwarded to ma_mfcc, LPCFeatures and
%     GaborFeatures, so it may carry further fields for them, e.g.
%       .num_ceps_coeffs (13): number of cepstral coefficients
%       .mel_filt_bank ([0 11025 23]): extrema and number of mel frequency
%                                      bands
%       .use_first_coeff (false): retain 1st MFCC coefficient
%     NOTE(review): the defaults of these three fields are not set in this
%     file; presumably the helpers apply them - confirm.
% Output
%   -fea: matrix of features (one column per frame); the blocks of rows
%         follow the order of feaNam
%   -feaNam: cell array containing the names of the computed features
%   -feaSiz: row vector with the number of rows of fea contributed by each
%            entry of feaNam

%% Unit test
if ~nargin, [fea, feaNam, feaSiz] = unitTest; return, end

%% Defaults
if ~exist('par','var') || isempty(par), par = struct; end

def.fs = 22050;                 %sampling rate
def.fft_size = 1024;            %size of window
def.hopsize = 512;              %step size
def.usePreEmphasis = false;     %use pre-emphasis (high pass filter)
def.feaNam = {'mfcc','dmfcc','nme','hos','zcr','sro','scn','sfl','lpc','mpf'};

par = setdefaultoptions(par,def);   %set default options

% A single feature name may be passed as a string: wrap it in a cell.
% A cell array must NOT be wrapped again, otherwise the switch below would
% receive a cell instead of a string.
if ischar(par.feaNam), par.feaNam = {par.feaNam}; end
feaNam = par.feaNam;    %assigned here so it is defined even for an empty list

%% Compute features
s = preprocessAudio(wav,par);       %preprocess audio file

feaSiz = [];
fea = [];
mfcc = [];              %cache: 'mfcc', 'dmfcc' and 'nme' share one ma_mfcc call
mel = [];
mfccReady = false;
for iFea = 1:length(par.feaNam)
    name = par.feaNam{iFea};
    switch name
        case {'mfcc','dmfcc','nme'}     %MFCCs and related
            if ~mfccReady
                [mfcc,~,mel] = ma_mfcc(s,par);
                mfccReady = true;
            end
            switch name
                case 'mfcc'
                    x = mfcc;
                case 'dmfcc'
                    x = derivative(mfcc);
                case 'nme'
                    %energy in each mel band normalized by the total energy
                    %of the frame (column-wise scaling, without building an
                    %nFrames-by-nFrames diagonal matrix)
                    x = bsxfun(@times, mel, 1./sum(mel,1));
            end
        case 'hos'      %Higher order statistics (see Chi2003Ba)
            %one global value, repeated fix(length(s)/hopsize)-1 times
            %NOTE(review): presumably this matches the number of frames of
            %the other features - confirm
            x = (kurtosis(s)/(var(s)^2))*ones(1,fix(length(s)/par.hopsize)-1);
        case 'zcr'
            x = zcr(s,par.fft_size,par.hopsize,par.fs)';    %zero crossing rate
        case 'sro'
            x = SpectralRollOff(s,par.fft_size,par.hopsize,0.80,par.fs);    %spectral roll-off
        case 'scn'
            x = SpectralCentroid(s,par.fft_size,par.hopsize,par.fs)';      %spectral centroid
        case 'sfl'
            x = SpectralFlux(s,par.fft_size,par.hopsize,par.fs)';          %spectral flux
        case 'lpc'
            x = LPCFeatures(s,par);     %LPC features
        case 'mpf'
            x = GaborFeatures(s,par);   %Gabor features
        otherwise
            %without this branch an unknown name would silently re-append
            %the previous feature block
            error('ComputeFeatures:unknownFeature', ...
                'Unknown feature name at position %d of par.feaNam.', iFea);
    end
    feaSiz = [feaSiz, size(x,1)]; %#ok<AGROW>
    fea = [fea; x]; %#ok<AGROW>
end

function s = preprocessAudio(wav,par)
% Reads the audio file wav and returns a mono, optionally pre-emphasised,
% peak-normalized column signal.
%   -wav: file name of the audio file
%   -par: parameter struct; only par.usePreEmphasis is read here
if ~isempty(which('audioread'))
    s = audioread(wav);     %read file (current MATLAB releases)
else
    s = wavread(wav);       %read file (legacy releases without audioread)
end
s = s(1:2:end,:);           %subsample audio by 2 (from 44.1kHz to 22.05kHz)
if size(s,2)>1, s = mean(s,2); end  %convert to mono
if par.usePreEmphasis       %apply pre-emphasis filter that highlights high frequencies
    h = [1, -15/16];        %see Fundamentals of speech processing (Rabiner, Juang)
    s = filter(h,1,s);
end
peak = max(abs(s));
if peak > 0, s = s/peak; end    %normalize audio (skip silent/empty signals to avoid NaN)

function dmfcc = derivative(mfcc)
% First derivative along time (columns) of each row of mfcc, computed as the
% central difference (x(n+1)-x(n-1))/2. The first and last samples are
% replicated before filtering, so the output has the same size as the input.
dmfcc = zeros(size(mfcc));
for iRow = 1:size(mfcc,1)
    padded = [mfcc(iRow,1) mfcc(iRow,:) mfcc(iRow,end)];
    temp = conv(padded,[1/2,0,-1/2],'same');
    dmfcc(iRow,:) = temp(2:end-1);      %drop the two padding samples
end

function [fea, feaNam, feaSizes] = unitTest
% Runs ComputeFeatures with default parameters on the file 'bus01.wav',
% which must be reachable from the current folder or the MATLAB path.
clear, clc, close all
file = 'bus01.wav';
[fea, feaNam, feaSizes] = ComputeFeatures(file);