dan@0
|
1 % Yet another feature extractor, this time with CHiME in mind.
|
dan@0
|
2 % Some cleanup, more support for customised audio parameters.
|
dan@0
|
3 % Updated 9th Aug 2011
|
dan@0
|
4 %
|
dan@0
|
5 % Outputs have been changed. Currently no logarithms are taken here any
|
dan@0
|
6 % more.
|
dan@0
|
7 %
|
dan@0
|
8 % Input:
|
dan@0
|
9 % - 'sam' is the audio, with channels given as either column or row vectors.
|
dan@0
|
10 % (Longer dimension is treated as time, shorter as channel count.)
|
dan@0
|
11 % - 'audioconf' is as defined in getconfigs.m . All of its parameters
|
dan@0
|
12 % ARE respected now, so pass a temporary, edited copy if you want
|
dan@0
|
13 % to change the behaviour.
|
dan@0
|
14 %
|
dan@0
|
15 % Output:
|
dan@0
|
16 % - 'feats' is a [bands x frames x featchannels] array of mel features.
|
dan@0
|
17 % If audioconf.melbands is zero, FFT magnitudes are returned instead.
|
dan@0
|
18 % - 'energies' is a [frames x audiochannels] matrix of frame energies
|
dan@0
|
19 % - 'frameaudio' is a [framelen x frames x audiochannels] array of chopped
|
dan@0
|
20 % audio data (with preprocessing but without the window function).
|
dan@0
|
21 % - 'frameFFT' is an [FFTlow x frames x audiochannels] array of frame FFTs.
|
dan@0
|
22 % The windowing function has been applied, and the result has been
|
dan@0
|
23 % truncated to Nfft/2 + 1 bands. However, no abs is taken. You can
|
dan@0
|
24 % do this in the calling function, or pick the abs values from 'feats'
|
dan@0
|
25 % by using zero melbands.
|
dan@0
|
26 %
|
dan@0
|
27 % The main feature output respects audioconf.featchannels, which should
|
dan@0
|
28 % be either the same as audioconf.channels (the number of input streams)
|
dan@0
|
29 % or 1 (downmixed to mono by taking the mean of feature channels). Other
|
dan@0
|
30 % outputs use original audio channels, because their averaging is not as
|
dan@0
|
31 % well defined. Note that there is a significant difference between
|
dan@0
|
32 % averaging the audio (causing waveform level phase attenuation) and the
|
dan@0
|
33 % abs-FFT or Mel features (phase-invariant energy mean). If the former is
|
dan@0
|
34 % what you need, downmix the audio in the calling function.
|
dan@0
|
35 %
|
dan@0
|
36 % Some warnings are shown if audio parameters are missing or they do not
|
dan@0
|
37 % match with the data.
|
dan@0
|
38
|
function [feats, energies, frameaudio, frameFFT] = FE(sam, audioconf)
% FE  Chop audio into frames and compute mel (or abs-FFT) features.
%
%   [feats, energies, frameaudio, frameFFT] = FE(sam, audioconf)
%
%   sam        - audio matrix; the longer dimension is treated as time,
%                the shorter one as the channel count.
%   audioconf  - (optional) struct of audio parameters. Any field that is
%                missing is filled in from the defaults listed below.
%
%   feats      - [bands x frames x featchannels] features; abs-FFT values
%                are returned when audioconf.melbands is zero.
%   energies   - [frames x audiochannels] sum of squared samples per frame.
%   frameaudio - [framelen x frames x audiochannels] preprocessed frames
%                (no window function applied).
%   frameFFT   - [Nfft/2+1 x frames x audiochannels] complex windowed FFTs
%                with the DC bin set to zero.

verbose = 0;

% Default config. To guarantee intended operation, you should always
% pass your own, though.

defconf.channels     = 2;         % input channels, in CHiME always 2
defconf.featchannels = 1;         % feature level channels
defconf.fs           = 16000;     % sampling rate for internal processing
defconf.maxf         = 8000;      % maximum frequency to be considered
defconf.minf         = 64;        % minimum frequency to be considered
defconf.melbands     = 26;        % mel band count (0 to disable)
defconf.framelen_ms  = 25;        % millisecond length of each frame
defconf.framestep_ms = 10;        % millisecond step between frames
defconf.windowfunc   = 'hamming'; % window function name
defconf.preemphasis  = 0.97;      % 0 to disable
defconf.dcremoval    = true;      % DC removal in the feature extractor
defconf.Nfft         = 0;         % Number of FFT bands (0 to calculate from framelength)

if nargin < 2
    if verbose
        disp('No audioconf given, using defaults.')
    end
    audioconf = defconf;
else
    % Fill in every field the caller left out. The field name must be
    % taken from the list being iterated (the earlier code referenced an
    % undefined variable here and failed for any partial config).
    fldnames = fieldnames(defconf);
    for fl = 1:length(fldnames)
        fld = fldnames{fl};
        if ~isfield(audioconf, fld)
            if verbose
                fprintf('Field %s missing, copying from defaults.\n', fld)
            end
            audioconf.(fld) = defconf.(fld);
        end
    end
end

% Fetch the shorthand variables.
featbands = audioconf.melbands;
featchans = audioconf.featchannels;
fs        = audioconf.fs;
fhigh     = audioconf.maxf;
flow      = audioconf.minf;

% Frame length and step in samples, rounded up to whole samples.
framelen   = ceil(fs * audioconf.framelen_ms / 1000);
frameshift = ceil(fs * audioconf.framestep_ms / 1000);

% FFT size: next power of two above the frame length unless given.
if audioconf.Nfft == 0
    Nfft = 2^nextpow2(framelen);
else
    Nfft = audioconf.Nfft;
end

% The window function is looked up by name and called with the frame length.
winfunc = str2func(audioconf.windowfunc);
win = winfunc(framelen);

% Zero mel bands means "return abs-FFT values instead of mel features".
melmode = (featbands ~= 0);

% Switch audio to columns.
if size(sam, 1) < size(sam, 2)
    sam = sam';
end

samlen   = size(sam, 1);
samchans = size(sam, 2);

if samchans ~= audioconf.channels
    if verbose
        fprintf('Warning: Audio has %i channels, config states %i.\n', samchans, audioconf.channels);
    end
end

if melmode
    % mel_matrix is defined outside this file; after the transpose its
    % result is expected to be [featbands x (Nfft/2 + 1)], which the two
    % checks below report on (they only print, they do not abort).
    melmat = mel_matrix(fs, featbands, Nfft, 1, fhigh, flow)';
    if size(melmat, 1) ~= featbands
        fprintf('Mel matrix has %i bands (config: %i).\n', size(melmat, 1), featbands);
    end
    if size(melmat, 2) ~= (Nfft/2 + 1)
        fprintf('Mel matrix has %i FFT coeffs (expected: %i).\n', size(melmat, 2), Nfft/2 + 1);
    end
end

% Truncate to full frames, get the number.
numframes = floor((samlen - framelen + frameshift) / frameshift);
sam = sam(1:(numframes*frameshift + framelen - frameshift), :);

% DC removal - introduces a 1-unit filter delay, thus we prepend one zero
% row before filtering and discard the first output sample afterwards.
if audioconf.dcremoval
    samf = filter([1; -1], [1; -0.999], [zeros(1, samchans); sam]);
    sam = samf(2:end, :);
end
samtrlen = size(sam, 1); % trimmed length

% Pre-emphasis if nonzero. Can be done for the whole audio at once.
% The first output sample of each channel is set to zero.
if (audioconf.preemphasis > 0)
    sam = [zeros(1, samchans); sam(2:samtrlen, :) - audioconf.preemphasis * sam(1:(end-1), :)];
end

if melmode
    tmpfeats = zeros(featbands, numframes, samchans);
else
    tmpfeats = zeros(Nfft/2 + 1, numframes, samchans);
end

energies   = zeros(numframes, samchans);
frameaudio = zeros(framelen, numframes, samchans);
frameFFT   = zeros(Nfft/2 + 1, numframes, samchans);

% Frame index matrix (framelen x numframes) for the first channel. It is
% the sum of
%   1) the start sample of each frame, repeated down every column,
%   2) the 1...framelen offset within a frame, repeated across columns,
%   3) -1, because both of the above are one-based.
% Building the start indices from numframes (rather than from a range
% bounded by the trimmed length) always yields exactly numframes columns,
% also when frameshift is a single sample.
ind1 = 1 + (0:numframes-1) * frameshift;   % starting sample of each frame
ind2 = (1:framelen)';                      % 1...frame length
frameidx = ind1(ones(framelen, 1), :) + ind2(:, ones(1, numframes)) - 1;

% Window replicated for every frame; identical for all channels.
winmat = win(:, ones(1, numframes));

% Process channels one by one.
for c = 1:samchans

    % Jump to the correct channel in linear indexing and pick the samples
    % of every frame at once.
    fra = sam(frameidx + (c-1)*samtrlen);
    frameaudio(:,:,c) = fra;

    % Calculate the energies (before windowing).
    energies(:,c) = sum(fra.^2, 1)';

    % Apply window function, take FFT (zero-padded to Nfft).
    fFFT = fft(winmat .* fra, Nfft);
    % Reset the constant (DC) bin and truncate to Nfft/2 + 1 bins, but do
    % not take abs yet.
    fFFT(1,:) = 0;
    fFFT = fFFT(1:Nfft/2+1, :);

    % Store the returned FFTs with phase.
    frameFFT(:,:,c) = fFFT;

    if melmode
        tmpfeats(:,:,c) = melmat * abs(fFFT);
    else
        tmpfeats(:,:,c) = abs(fFFT);
    end
end

% Flatten the features if downmixing to 1 is defined.
if featchans == 1
    if samchans > 1
        feats = mean(tmpfeats, 3);
    else
        feats = tmpfeats;
    end
else
    if samchans ~= featchans
        fprintf('Requested %i feature channels for %i audio - not defined. Returning %i.\n', featchans, samchans, samchans)
    end
    feats = tmpfeats;
end