comparison functions/fe_funcs/FE.m @ 0:2fadb31a9d55 tip

Import code by Vuegen et al
author Dan Stowell <dan.stowell@elec.qmul.ac.uk>
date Fri, 11 Oct 2013 12:02:43 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2fadb31a9d55
1 % Yet another feature extractor, this time with CHiME in mind.
2 % Some cleanup, more support for customised audio parameters.
3 % Updated 9th Aug 2011
4 %
5 % Outputs have been changed. Currently no logarithms are taken here any
6 % more.
7 %
8 % Input:
9 % - 'sam' is the audio, with channels given as either column or row vectors.
10 % (Longer dimension is treated as time, shorter as channel count.)
11 % - 'audioconf' is as defined in getconfigs.m . All of its parameters
12 % ARE respected now, so pass a temporary, edited copy if you want
13 % to change the behaviour.
14 %
15 % Output:
16 % - 'feats' is a [bands x frames x featchannels] array of mel features.
17 % If audioconf.melbands is zero, FFT magnitudes are returned instead.
18 % - 'energies' is a [frames x audiochannels] matrix of frame energies
19 % - 'frameaudio' is a [framelen x frames x audiochannels] array of chopped
20 % audio data (with preprocessing but without the window function).
21 % - 'frameFFT' is an [FFTlow x frames x audiochannels] array of frame FFTs.
22 % The windowing function has been applied, and the result has been
23 % truncated to Nfft/2 + 1 bands. However, no abs is taken. You can
24 % do this in the calling function, or pick the abs values from 'feats'
25 % by using zero melbands.
26 %
27 % The main feature output respects audioconf.featchannels, which should
28 % be either the same as audioconf.channels (the number of input streams)
29 % or 1 (downmixed to mono by taking the mean of feature channels). Other
30 % outputs use original audio channels, because their averaging is not as
31 % well defined. Note that there is a significant difference between
32 % averaging the audio (causing waveform level phase attenuation) and the
33 % abs-FFT or Mel features (phase-invariant energy mean). If the former is
34 % what you need, downmix the audio in the calling function.
35 %
36 % Some warnings are shown if audio parameters are missing or they do not
37 % match with the data.
38
function [feats, energies, frameaudio, frameFFT] = FE(sam, audioconf)
% FE  Frame-based FFT / mel feature extractor.
%
% Chops the audio into overlapping frames, optionally applies DC removal
% and pre-emphasis, and returns per-frame features, energies, the chopped
% frame audio and the complex frame FFTs.
%
% Inputs:
%   sam       - audio matrix; the longer dimension is treated as time and
%               the shorter one as the channel count.
%   audioconf - (optional) struct of audio parameters. Every field that is
%               missing from it is copied from the defaults defined below.
%
% Outputs:
%   feats      - [bands x frames x featchannels] mel features, or FFT
%                magnitudes when audioconf.melbands is 0.
%   energies   - [frames x audiochannels] sum of squared samples per frame.
%   frameaudio - [framelen x frames x audiochannels] preprocessed frames,
%                without the window function.
%   frameFFT   - [Nfft/2+1 x frames x audiochannels] complex FFTs of the
%                windowed frames, with the first (DC) bin set to zero.

verbose = 0;

% Default config. To guarantee intended operation, you should always
% pass your own, though.
defconf.channels = 2;            % input channels, in CHiME always 2
defconf.featchannels = 1;        % feature level channels
defconf.fs = 16000;              % sampling rate for internal processing
defconf.maxf = 8000;             % maximum frequency to be considered
defconf.minf = 64;               % minimum frequency to be considered
defconf.melbands = 26;           % mel band count (0 to disable)
defconf.framelen_ms = 25;        % millisecond length of each frame
defconf.framestep_ms = 10;       % millisecond step between frames
defconf.windowfunc = 'hamming';  % window function name
defconf.preemphasis = 0.97;      % 0 to disable
defconf.dcremoval = true;        % DC removal in the feature extractor
defconf.Nfft = 0;                % Number of FFT bands (0 to calculate from framelength)

if nargin < 2
    if verbose
        disp('No audioconf given, using defaults.')
    end
    audioconf = defconf;
else
    % Fill in every field the caller left out. The field name has to be
    % taken from the list being iterated (dynamic field reference).
    fldnames = fieldnames(defconf);
    for fl = 1:length(fldnames)
        fname = fldnames{fl};
        if ~isfield(audioconf, fname)
            if verbose
                fprintf('Field %s missing, copying from defaults.\n', fname)
            end
            audioconf.(fname) = defconf.(fname);
        end
    end
end

% Fetch the shorthand variables.
featbands = audioconf.melbands;
featchans = audioconf.featchannels;
fs = audioconf.fs;
fhigh = audioconf.maxf;
flow = audioconf.minf;

% Frame length and step in samples, rounded up to whole samples.
framelen = ceil(fs * audioconf.framelen_ms / 1000);
frameshift = ceil(fs * audioconf.framestep_ms / 1000);

% FFT size: next power of two that holds a frame, unless given explicitly.
if audioconf.Nfft == 0
    Nfft = 2^nextpow2(framelen);
else
    Nfft = audioconf.Nfft;
end

% The window function is looked up by name and evaluated for one frame.
winfunc = str2func(audioconf.windowfunc);
win = winfunc(framelen);

% Zero mel bands means "return plain FFT magnitudes".
melmode = (featbands ~= 0);

% Switch audio to columns.
if size(sam, 1) < size(sam, 2)
    sam = sam';
end

samlen = size(sam, 1);
samchans = size(sam, 2);

if samchans ~= audioconf.channels
    if verbose
        fprintf('Warning: Audio has %i channels, config states %i.\n', samchans, audioconf.channels);
    end
end

if melmode
    % mel_matrix is defined outside this file; its result is transposed
    % here and checked to be [featbands x (Nfft/2 + 1)].
    melmat = mel_matrix(fs, featbands, Nfft, 1, fhigh, flow)';
    if size(melmat, 1) ~= featbands
        fprintf('Mel matrix has %i bands (config: %i).\n', size(melmat, 1), featbands);
    end
    if size(melmat, 2) ~= (Nfft/2 + 1)
        fprintf('Mel matrix has %i FFT coeffs (expected: %i).\n', size(melmat, 2), Nfft/2 + 1);
    end
end

% Truncate to full frames, get the number.
numframes = floor((samlen - framelen + frameshift) / frameshift);
sam = sam(1:(numframes*frameshift + framelen - frameshift), :);

% DC removal - introduces a 1-unit filter delay, thus we prepend one zero
% row before filtering and discard the first output sample afterwards.
if audioconf.dcremoval
    samf = filter([1;-1], [1;-0.999], [zeros(1, samchans); sam]);
    sam = samf(2:end, :);
end
samtrlen = size(sam, 1); % trimmed length

% Pre-emphasis if nonzero. Can be done for the whole audio at once.
% The first output sample is set to zero.
if (audioconf.preemphasis > 0)
    sam = [zeros(1, samchans); sam(2:samtrlen, :) - audioconf.preemphasis * sam(1:(end-1), :)];
end

if melmode
    tmpfeats = zeros(featbands, numframes, samchans);
else
    tmpfeats = zeros(Nfft/2 + 1, numframes, samchans);
end

energies = zeros(numframes, samchans);
frameaudio = zeros(framelen, numframes, samchans);
frameFFT = zeros(Nfft/2 + 1, numframes, samchans);

% Index matrix (framelen x numframes) for the first channel. Column k
% holds the sample numbers of frame k: its one-based start sample plus the
% zero-based offsets 0 ... framelen-1. It is the same for every channel,
% so it is built once; other channels add a constant jump in linear
% indexing. Computing the starts from numframes guarantees exactly
% numframes columns, also when frameshift is a single sample.
framestarts = (0:numframes-1) * frameshift + 1;
sampleoffs = (0:framelen-1)';
frameidx = framestarts(ones(framelen, 1), :) + sampleoffs(:, ones(1, numframes));

% Window replicated for all frames (also channel independent).
winmat = win(:, ones(1, numframes));

% Process channels one by one.
for c = 1:samchans

    % Fetch this channel's frames; (c-1)*samtrlen jumps to column c of
    % 'sam' in linear indexing.
    fra = sam(frameidx + (c-1)*samtrlen);
    frameaudio(:,:,c) = fra;

    % Calculate the energies.
    energies(:,c) = sum(fra.^2, 1)';

    % Apply window function, take FFT.
    fFFT = fft(winmat .* fra, Nfft);
    % Reset the constant (DC) bin and truncate, but do not take abs yet.
    fFFT(1,:) = 0;
    fFFT = fFFT(1:Nfft/2+1, :);

    % Store the returned FFTs with phase.
    frameFFT(:,:,c) = fFFT;

    if melmode
        tmpfeats(:,:,c) = melmat * abs(fFFT);
    else
        tmpfeats(:,:,c) = abs(fFFT);
    end
end

% Flatten the features if downmixing to 1 is defined.
if featchans == 1
    if samchans > 1
        feats = mean(tmpfeats, 3);
    else
        feats = tmpfeats;
    end
else
    if samchans ~= featchans
        fprintf('Requested %i feature channels for %i audio - not defined. Returning %i.\n', featchans, samchans, samchans)
    end
    feats = tmpfeats;
end