Mercurial > hg > dcase2013_ed_vuegenetal
comparison functions/fe_funcs/FE.m @ 0:2fadb31a9d55 tip
Import code by Vuegen et al
author | Dan Stowell <dan.stowell@elec.qmul.ac.uk> |
---|---|
date | Fri, 11 Oct 2013 12:02:43 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2fadb31a9d55 |
---|---|
% Yet another feature extractor, this time with CHiME in mind.
% Some cleanup, more support for customised audio parameters.
% Updated 9th Aug 2011
%
% Outputs have been changed. Currently no logarithms are taken here any
% more.
%
% Input:
% - 'sam' is the audio, with channels stored either as columns or as rows.
%   (Longer dimension is treated as time, shorter as channel count.)
% - 'audioconf' is as defined in getconfigs.m . All of its parameters
%   ARE respected now, so pass a temporary, edited copy if you want
%   to change the behaviour. Any field missing from 'audioconf' is copied
%   from the default configuration defined at the top of the function.
%
% Output:
% - 'feats' is a [bands x frames x featchannels] array of mel features.
%   If audioconf.melbands is zero, FFT magnitudes are returned instead.
% - 'energies' is a [frames x audiochannels] matrix of frame energies
% - 'frameaudio' is a [framelen x frames x audiochannels] array of chopped
%   audio data (with preprocessing but without the window function).
% - 'frameFFT' is an [FFTlow x frames x audiochannels] array of frame FFTs.
%   The windowing function has been applied, and the result has been
%   truncated to Nfft/2 + 1 bands. However, no abs is taken. You can
%   do this in the calling function, or pick the abs values from 'feats'
%   by using zero melbands.
%
% The main feature output respects audioconf.featchannels, which should
% be either the same as audioconf.channels (the number of input streams)
% or 1 (downmixed to mono by taking the mean of feature channels). Other
% outputs use original audio channels, because their averaging is not as
% well defined. Note that there is a significant difference between
% averaging the audio (causing waveform level phase attenuation) and the
% abs-FFT or Mel features (phase-invariant energy mean). If the former is
% what you need, downmix the audio in the calling function.
%
% Some warnings are shown if audio parameters are missing or they do not
% match with the data.

function [feats, energies, frameaudio, frameFFT] = FE(sam, audioconf)

verbose = 0;

% Default config. To guarantee intended operation, you should always
% pass your own, though.

defconf.channels = 2;           % input channels, in CHiME always 2
defconf.featchannels = 1;       % feature level channels
defconf.fs = 16000;             % sampling rate for internal processing
defconf.maxf = 8000;            % maximum frequency to be considered
defconf.minf = 64;              % minimum frequency to be considered
defconf.melbands = 26;          % mel band count (0 to disable)
defconf.framelen_ms = 25;       % millisecond length of each frame
defconf.framestep_ms = 10;      % millisecond step between frames
defconf.windowfunc = 'hamming'; % window function name
defconf.preemphasis = 0.97;     % 0 to disable
defconf.dcremoval = true;       % DC removal in the feature extractor
defconf.Nfft = 0;               % Number of FFT bands (0 to calculate from framelength)

if nargin < 2
    if verbose
        disp('No audioconf given, using defaults.')
    end
    audioconf = defconf;
else
    % Fill in every field the caller left out with its default value.
    fldnames = fieldnames(defconf);
    for fl = 1:length(fldnames)
        fname = fldnames{fl};
        if ~isfield(audioconf, fname)
            if verbose
                fprintf('Field %s missing, copying from defaults.\n', fname)
            end
            audioconf.(fname) = defconf.(fname);
        end
    end
end

% Fetch the shorthand variables.
featbands = audioconf.melbands;
featchans = audioconf.featchannels;
fs = audioconf.fs;
fhigh = audioconf.maxf;
flow = audioconf.minf;

% Frame length and step in samples (rounded up to whole samples).
framelen = ceil(fs * audioconf.framelen_ms / 1000);
frameshift = ceil(fs * audioconf.framestep_ms / 1000);

if audioconf.Nfft == 0
    Nfft = 2^nextpow2(framelen);
else
    Nfft = audioconf.Nfft;
end

% The window function is looked up by name and evaluated for one frame.
winfunc = str2func(audioconf.windowfunc);
win = winfunc(framelen);

% Zero mel bands means "return FFT magnitudes instead of mel features".
melmode = (featbands ~= 0);

% Switch audio to columns (non-conjugating transpose).
if size(sam, 1) < size(sam, 2)
    sam = sam.';
end

samlen = size(sam, 1);
samchans = size(sam, 2);

if samchans ~= audioconf.channels
    if verbose
        fprintf('Warning: Audio has %i channels, config states %i.\n', samchans, audioconf.channels);
    end
end

if melmode
    melmat = mel_matrix(fs, featbands, Nfft, 1, fhigh, flow).';
    if size(melmat, 1) ~= featbands
        fprintf('Mel matrix has %i bands (config: %i).\n', size(melmat, 1), featbands);
    end
    if size(melmat, 2) ~= (Nfft/2 + 1)
        fprintf('Mel matrix has %i FFT coeffs (expected: %i).\n', size(melmat, 2), Nfft/2 + 1);
    end
end

% Truncate to full frames, get the number.
numframes = floor((samlen-framelen+frameshift) / frameshift);
sam = sam(1:(numframes*frameshift+framelen-frameshift), :);

% DC removal - introduces a 1-unit filter delay, thus we discard the
% first sample. Note that this behaviour has changed from earlier
% versions of FE.
if audioconf.dcremoval
    samf = filter([1;-1], [1;-0.999], [zeros(1,samchans);sam]);
    sam = samf(2:end, :);
end
samtrlen = size(sam, 1); % trimmed length

% Pre-emphasis if nonzero. Can be done for the whole audio at once.
if (audioconf.preemphasis > 0)
    sam = [zeros(1, samchans); sam(2:samtrlen, :) - audioconf.preemphasis * sam(1:(end-1), :)];
end

if melmode
    tmpfeats = zeros(featbands, numframes, samchans);
else
    tmpfeats = zeros(Nfft/2 + 1, numframes, samchans);
end

energies = zeros(numframes, samchans);
frameaudio = zeros(framelen, numframes, samchans);
frameFFT = zeros(Nfft/2+1, numframes, samchans);

% Build the (framelen x numframes) index matrix once; it is the same for
% every channel apart from a constant linear-index offset. It is the sum of
%   1) constant columns holding each frame's start sample,
%   2) an increasing 1...framelen column repeated for every frame,
%   3) -1 because both of the above are one-based.
% The start samples are derived directly from numframes so that exactly
% numframes columns are produced for any frame shift, including 1.
framestarts = 1 + (0:numframes-1) * frameshift;
sampleoffs = (1:framelen)';
frameidx = framestarts(ones(framelen,1),:) + sampleoffs(:,ones(1,numframes)) - 1;

% Window replicated for all frames (column indexing keeps this working
% when the window function returns a matrix whose first column is used).
winmat = win(:,ones(1,numframes));

% Process channels one by one.
for c = 1:samchans

    % Pick frame audio; (c-1)*samtrlen jumps to channel c in linear
    % indexing of the column-major 'sam' matrix.
    fra = sam(frameidx + (c-1)*samtrlen);
    frameaudio(:,:,c) = fra;

    % Calculate the energies.
    energies(:,c) = sum(fra.^2,1)';

    % Apply window function, take FFT.
    fFFT = fft(winmat .* fra, Nfft);
    % Truncate and reset constant factor, but do not take abs yet.
    fFFT(1,:) = 0;
    fFFT = fFFT(1:Nfft/2+1,:);

    % Store the returned FFTs with phase.
    frameFFT(:,:,c) = fFFT;

    if melmode
        tmpfeats(:,:,c) = melmat * abs(fFFT);
    else
        tmpfeats(:,:,c) = abs(fFFT);
    end
end

% Flatten the features if downmixing to 1 is defined.
if featchans == 1
    if samchans > 1
        feats = mean(tmpfeats, 3);
    else
        feats = tmpfeats;
    end
else
    if samchans ~= featchans
        fprintf('Requested %i feature channels for %i audio - not defined. Returning %i.\n', featchans, samchans, samchans)
    end
    feats = tmpfeats;
end