Mercurial > hg > gpsynth
comparison src/feature_extractor.cpp @ 0:add35537fdbb tip
Initial import
author | irh <ian.r.hobson@gmail.com> |
---|---|
date | Thu, 25 Aug 2011 11:05:55 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:add35537fdbb |
---|---|
1 // Copyright 2011, Ian Hobson. | |
2 // | |
3 // This file is part of gpsynth. | |
4 // | |
5 // gpsynth is free software: you can redistribute it and/or modify | |
6 // it under the terms of the GNU General Public License as published by | |
7 // the Free Software Foundation, either version 3 of the License, or | |
8 // (at your option) any later version. | |
9 // | |
10 // gpsynth is distributed in the hope that it will be useful, | |
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 // GNU General Public License for more details. | |
14 // | |
15 // You should have received a copy of the GNU General Public License | |
16 // along with gpsynth in the file COPYING. | |
17 // If not, see http://www.gnu.org/licenses/. | |
18 | |
19 #include "feature_extractor.hpp" | |
20 | |
21 #include "statistics.hpp" | |
22 #include "std_ex.hpp" | |
23 #include "window_makers.hpp" | |
24 | |
25 #include "sndfile.hh" | |
26 | |
27 #include "boost/bind.hpp" | |
28 #include "boost/filesystem.hpp" | |
29 | |
30 #include <algorithm> | |
31 #include <cmath> | |
32 #include <iostream> | |
33 #include <stdexcept> | |
34 #include <sstream> | |
35 | |
36 namespace bf = boost::filesystem; | |
37 | |
38 namespace dsp { | |
39 | |
40 FeatureExtractor::FeatureExtractor(const std::string& file_path /* = "" */, | |
41 int window_size /* = 2048 */, | |
42 int hop_size /* = 512 */) | |
43 : window_size_(-1), // initialized by SetFrameSize | |
44 hop_size_(hop_size), | |
45 bin_frequencies_(window_size), | |
46 spectrum_analyzer_(window_size), | |
47 mfcc_analyzer_(window_size, 40, 13, 30, 10000) | |
48 { | |
49 SetWindowSize(window_size); | |
50 if (!file_path.empty()) { | |
51 LoadFile(file_path); | |
52 } | |
53 } | |
54 | |
55 void FeatureExtractor::ClearCaches() { | |
56 spectrum_.clear(); | |
57 magnitude_spectrum_.clear(); | |
58 log_magnitude_spectrum_.clear(); | |
59 spectral_centroid_.clear(); | |
60 spectral_spread_.clear(); | |
61 spectral_flux_.clear(); | |
62 pitch_.clear(); | |
63 energy_.clear(); | |
64 mfccs_.clear(); | |
65 delta_mfccs_.clear(); | |
66 double_delta_mfccs_.clear(); | |
67 } | |
68 | |
69 void FeatureExtractor::SetWindowSize(int window_size) { | |
70 if (window_size_ != window_size) { | |
71 window_size_ = window_size; | |
72 frame_.resize(window_size); | |
73 // generate new window | |
74 window_.resize(window_size); | |
75 dsp::HammingWindow(window_size, window_.begin()); | |
76 ClearCaches(); | |
77 } | |
78 } | |
79 | |
80 void FeatureExtractor::SetHopSize(int hop_size) { | |
81 if (hop_size_ != hop_size) { | |
82 hop_size_ = hop_size; | |
83 ClearCaches(); | |
84 } | |
85 } | |
86 | |
87 void FeatureExtractor::LoadFile(const std::string& file_path) { | |
88 SndfileHandle file(file_path); | |
89 if (file.error()) { | |
90 std::stringstream message; | |
91 message << "FeatureExtractor::ExtractFeaturesFromFile: " | |
92 << "Error reading from file '" << file_path << "' " | |
93 << '(' << file.error() << ')'; | |
94 throw std::runtime_error(message.str()); | |
95 } | |
96 sf_count_t frames = file.frames(); | |
97 int channels = file.channels(); | |
98 file_data_.resize(frames * channels); | |
99 // read the audio data in to the buffer | |
100 file.readf(&file_data_[0], frames); | |
101 // only supporting mono at the moment, remove extra channels | |
102 if (channels > 1) { | |
103 for (int i = 0; i < frames; i++) { | |
104 file_data_[i] = file_data_[i * channels]; | |
105 } | |
106 file_data_.resize(frames); | |
107 } | |
108 // success, clear caches | |
109 ClearCaches(); | |
110 // update file info | |
111 file_path_ = file_path; | |
112 sample_rate_ = file.samplerate(); | |
113 frames_ = static_cast<int>(file_data_.size()) / hop_size_; | |
114 if (frames_ == 0) { | |
115 frames_++; | |
116 } | |
117 // bin info | |
118 bin_size_ = sample_rate_ / static_cast<Value>(window_size_); | |
119 std::generate(bin_frequencies_.begin(), bin_frequencies_.end(), | |
120 stdx::Stepper<Value>(bin_size_)); | |
121 } | |
122 | |
123 Value FeatureExtractor::Duration() const { | |
124 return file_data_.size() / static_cast<Value>(sample_rate_); | |
125 } | |
126 | |
127 const std::vector<ComplexValueList>& FeatureExtractor::Spectrum() { | |
128 if (spectrum_.empty()) { | |
129 const int frame_size = window_size_; | |
130 const int half_frame_size = frame_size / 2; | |
131 const int hop_size = hop_size_; | |
132 spectrum_.resize(frames_); | |
133 ValueList::const_iterator data_begin = file_data_.begin(); | |
134 ValueList::const_iterator data_end = file_data_.end(); | |
135 for (int i = 0; i < frames_; i++) { | |
136 //std::cout << "frame:" << i << '\n'; | |
137 // find the frame bounds | |
138 ValueList::const_iterator read_start = data_begin + (i * hop_size); | |
139 ValueList::const_iterator read_end = read_start + frame_size; | |
140 if (read_end > data_end) { | |
141 read_end = data_end; | |
142 } | |
143 // extract the frame and apply window | |
144 std::transform(read_start, read_end, window_.begin(), | |
145 frame_.begin(), std::multiplies<Value>()); | |
146 // on the last frame we'll need to fill the rest of the frame with zeros | |
147 std::size_t data_read = read_end - read_start; | |
148 if (data_read < frame_size) { | |
149 std::fill(frame_.begin() + data_read, frame_.end(), 0); | |
150 } | |
151 // get the spectrum of the current frame | |
152 std::copy(frame_.begin(), frame_.end(), spectrum_analyzer_.Input()); | |
153 spectrum_analyzer_.Execute(); | |
154 spectrum_[i].assign(spectrum_analyzer_.Output(), | |
155 spectrum_analyzer_.Output() + half_frame_size); | |
156 } | |
157 } | |
158 return spectrum_; | |
159 } | |
160 | |
161 const std::vector<ValueList>& FeatureExtractor::MagnitudeSpectrum() { | |
162 if (magnitude_spectrum_.empty()) { | |
163 // get complex spectrum | |
164 const std::vector<ComplexValueList>& spectrum = Spectrum(); | |
165 // get magnitude spectrum for each frame | |
166 std::size_t frames = spectrum.size(); | |
167 magnitude_spectrum_.resize(frames); | |
168 for (int i = 0; i < frames; i++) { | |
169 // take the absolute value of each complex bin | |
170 magnitude_spectrum_[i].resize(spectrum[i].size()); | |
171 std::transform(spectrum[i].begin(), spectrum[i].end(), | |
172 magnitude_spectrum_[i].begin(), std::abs<Value>); | |
173 } | |
174 } | |
175 return magnitude_spectrum_; | |
176 } | |
177 | |
178 const std::vector<ValueList>& FeatureExtractor::LogMagnitudeSpectrum() { | |
179 if (log_magnitude_spectrum_.empty()) { | |
180 // get magnitude spectrum | |
181 const std::vector<ValueList>& magnitude_spectrum = MagnitudeSpectrum(); | |
182 // get magnitude spectrum for each frame | |
183 std::size_t frames = magnitude_spectrum.size(); | |
184 log_magnitude_spectrum_.resize(frames); | |
185 for (int i = 0; i < frames; i++) { | |
186 log_magnitude_spectrum_[i].resize(magnitude_spectrum[i].size()); | |
187 // copy magnitudes to log magnitude buffer, avoiding zeros | |
188 std::replace_copy(magnitude_spectrum[i].begin(), | |
189 magnitude_spectrum[i].end(), | |
190 log_magnitude_spectrum_[i].begin(), | |
191 0.0, | |
192 std::numeric_limits<Value>::min()); | |
193 // take the log of each magnitude (ugly casting for cmath function...) | |
194 std::transform(log_magnitude_spectrum_[i].begin(), | |
195 log_magnitude_spectrum_[i].end(), | |
196 log_magnitude_spectrum_[i].begin(), | |
197 static_cast<double(*)(double)>(std::log10)); | |
198 } | |
199 } | |
200 return log_magnitude_spectrum_; | |
201 } | |
202 | |
203 const ValueList& FeatureExtractor::SpectralCentroid() { | |
204 if (spectral_centroid_.empty()) { | |
205 MagnitudeSpectrum(); | |
206 std::size_t frames = magnitude_spectrum_.size(); | |
207 spectral_centroid_.resize(frames); | |
208 std::size_t bins = magnitude_spectrum_[0].size(); | |
209 ValueList weighted_frequencies(bins); | |
210 for (int i = 0; i < frames; i++) { | |
211 const ValueList& magnitudes = magnitude_spectrum_[i]; | |
212 // get bin frequencies weighted by bin magnitudes | |
213 std::transform(magnitudes.begin(), | |
214 magnitudes.end(), | |
215 bin_frequencies_.begin(), | |
216 weighted_frequencies.begin(), | |
217 std::multiplies<Value>()); | |
218 // divide mean of weighted frequencies by mean of bin magnitudes | |
219 Value magnitude = stats::Sum(magnitudes); | |
220 if (magnitude > 0) { | |
221 spectral_centroid_[i] = stats::Sum(weighted_frequencies) / magnitude; | |
222 } else { | |
223 spectral_centroid_[i] = 0; | |
224 } | |
225 } | |
226 } | |
227 return spectral_centroid_; | |
228 } | |
229 | |
230 | |
231 const ValueList& FeatureExtractor::SpectralSpread() { | |
232 if (spectral_spread_.empty()) { | |
233 SpectralCentroid(); | |
234 std::size_t frames = magnitude_spectrum_.size(); | |
235 spectral_spread_.resize(frames); | |
236 std::size_t bins = magnitude_spectrum_[0].size(); | |
237 ValueList weighted_difference(bins); | |
238 for (int i = 0; i < frames; i++) { | |
239 Value centroid = spectral_centroid_[i]; | |
240 const ValueList& magnitudes = magnitude_spectrum_[i]; | |
241 for (int bin = 0; bin < bins; bin++) { | |
242 Value difference = bin_frequencies_[bin] - centroid; | |
243 weighted_difference[bin] = difference * difference * magnitudes[bin]; | |
244 } | |
245 Value magnitude = stats::Sum(magnitudes); | |
246 if (magnitude > 0) { | |
247 Value variance = stats::Sum(weighted_difference) / magnitude; | |
248 spectral_spread_[i] = std::sqrt(variance); | |
249 } else { | |
250 spectral_spread_[i] = 0; | |
251 } | |
252 } | |
253 } | |
254 return spectral_spread_; | |
255 } | |
256 | |
257 const ValueList& FeatureExtractor::Pitch() { | |
258 if (pitch_.empty()) { | |
259 MagnitudeSpectrum(); | |
260 pitch_.resize(frames_); | |
261 std::deque<Value> recent_pitches; | |
262 const int moving_average_size = 8; | |
263 for (int i = 0; i < frames_; i++) { | |
264 const ValueList& mags = magnitude_spectrum_[i]; | |
265 // find the peak bin number | |
266 std::ptrdiff_t peak_bin = std::distance(mags.begin(), | |
267 std::max_element(mags.begin(), | |
268 mags.end())); | |
269 // interpolate peak frequency | |
270 double offset; | |
271 double centre = mags[peak_bin]; | |
272 if (centre == 0) { | |
273 pitch_[i] = 0; | |
274 } else { | |
275 if (peak_bin == 0) { | |
276 // parabolic interpolation of first 2 bin magnitudes | |
277 double above = mags[peak_bin + 1]; | |
278 offset = above / (2 * (2 * centre - above)); | |
279 } else if (peak_bin == mags.size() - 1) { | |
280 // parabolic interpolation of last 2 bin magnitudes | |
281 double below = mags[peak_bin - 1]; | |
282 offset = (-below) / (2 * (2 * centre - below)); | |
283 } else { | |
284 double below = mags[peak_bin - 1]; | |
285 double above = mags[peak_bin + 1]; | |
286 // gaussian interpolation | |
287 offset = std::log(above / below) | |
288 / (2 * std::log((centre * centre) / (above * below))); | |
289 } | |
290 Value frame_pitch = bin_size_ * (peak_bin + offset); | |
291 if (recent_pitches.size() == moving_average_size) { | |
292 recent_pitches.pop_front(); | |
293 } | |
294 recent_pitches.push_back(frame_pitch); | |
295 pitch_[i] = stats::Mean(recent_pitches); | |
296 } | |
297 } | |
298 } | |
299 return pitch_; | |
300 } | |
301 | |
302 const ValueList& FeatureExtractor::Energy() { | |
303 if (energy_.empty()) { | |
304 const int frame_size = window_size_; | |
305 const int hop_size = hop_size_; | |
306 energy_.resize(frames_); | |
307 ValueList::const_iterator data_begin = file_data_.begin(); | |
308 ValueList::const_iterator data_end = file_data_.end(); | |
309 for (int i = 0; i < frames_; i++) { | |
310 // find the frame bounds | |
311 ValueList::const_iterator read_start = data_begin + (i * hop_size); | |
312 ValueList::const_iterator read_end = read_start + frame_size; | |
313 if (read_end > data_end) { | |
314 read_end = data_end; | |
315 } | |
316 Value energy(0); | |
317 std::ptrdiff_t samples = std::distance(read_start, read_end); | |
318 while (read_start != read_end) { | |
319 energy += std::pow(*read_start++, 2); | |
320 } | |
321 energy_[i] = energy / samples; | |
322 } | |
323 } | |
324 return energy_; | |
325 } | |
326 | |
327 const std::vector<ValueList>& FeatureExtractor::MFCCs() { | |
328 if (mfccs_.empty()) { | |
329 const std::vector<ValueList>& magnitude_spectrum = MagnitudeSpectrum(); | |
330 mfcc_analyzer_.SetSampleRate(sample_rate_); | |
331 mfccs_.resize(frames_); | |
332 for (int i = 0; i < frames_; i++) { | |
333 mfccs_[i] = mfcc_analyzer_.ExtractMFCCS(magnitude_spectrum[i]); | |
334 } | |
335 } | |
336 return mfccs_; | |
337 } | |
338 | |
339 const std::vector<ValueList>& FeatureExtractor::DeltaMFCCs() { | |
340 if (delta_mfccs_.empty()) { | |
341 const std::vector<ValueList>& mfccs = MFCCs(); | |
342 // resize buffers | |
343 const std::size_t number_of_mfccs = mfccs[0].size(); | |
344 ValueList empty_values(number_of_mfccs, 0); | |
345 delta_mfccs_.resize(mfccs.size(), empty_values); | |
346 // calculate deltas based on 2 neighbouring frames in either direction | |
347 for (int i = 2, last_frame = frames_ - 2; i < last_frame; i++) { | |
348 const ValueList& before_2 = mfccs[i - 2]; | |
349 const ValueList& before_1 = mfccs[i - 1]; | |
350 const ValueList& after_1 = mfccs[i + 1]; | |
351 const ValueList& after_2 = mfccs[i + 2]; | |
352 ValueList& delta_frame = delta_mfccs_[i]; | |
353 for (int m = 0; m < number_of_mfccs; m++) { | |
354 Value delta = after_1[m] - before_1[m] | |
355 + 2.0 * (after_2[m] - before_2[m]); | |
356 delta_frame[m] = delta / 10.0; | |
357 } | |
358 } | |
359 } | |
360 return delta_mfccs_; | |
361 } | |
362 | |
363 const std::vector<ValueList>& FeatureExtractor::DoubleDeltaMFCCs() { | |
364 if (double_delta_mfccs_.empty()) { | |
365 const std::vector<ValueList>& delta_mfccs = DeltaMFCCs(); | |
366 // resize buffers | |
367 const std::size_t number_of_mfccs = delta_mfccs[0].size(); | |
368 ValueList empty_values(number_of_mfccs, 0); | |
369 double_delta_mfccs_.resize(delta_mfccs.size(), empty_values); | |
370 // calculate deltas based on 2 neighbouring frames in either direction | |
371 for (int i = 2, last_frame = frames_ - 2; i < last_frame; i++) { | |
372 const ValueList& before_2 = delta_mfccs[i - 2]; | |
373 const ValueList& before_1 = delta_mfccs[i - 1]; | |
374 const ValueList& after_1 = delta_mfccs[i + 1]; | |
375 const ValueList& after_2 = delta_mfccs[i + 2]; | |
376 ValueList& double_delta_frame = double_delta_mfccs_[i]; | |
377 for (int m = 0; m < number_of_mfccs; m++) { | |
378 Value delta = after_1[m] - before_1[m] | |
379 + 2.0 * (after_2[m] - before_2[m]); | |
380 double_delta_frame[m] = delta / 10.0; | |
381 } | |
382 } | |
383 } | |
384 return double_delta_mfccs_; | |
385 } | |
386 | |
387 const std::vector<ValueList>& FeatureExtractor::SpectralFlux() { | |
388 if (spectral_flux_.empty()) { | |
389 // get magnitude spectrum | |
390 const std::vector<ValueList>& spectrum = MagnitudeSpectrum(); | |
391 // get magnitude spectrum for each frame | |
392 std::size_t frames = spectrum.size(); | |
393 std::size_t bins = spectrum[0].size(); | |
394 spectral_flux_.resize(frames); | |
395 // first frame always zeroed | |
396 spectral_flux_[0].assign(bins, 0); | |
397 for (int i = 1; i < frames; i++) { | |
398 const ValueList& spectrum_frame = spectrum[i]; | |
399 const ValueList& previous_spectrum_frame = spectrum[i - 1]; | |
400 ValueList& flux_frame = spectral_flux_[i]; | |
401 flux_frame.resize(bins); | |
402 // get the difference in magnitudes for each bin | |
403 for (int bin = 0; bin < bins; bin++) { | |
404 flux_frame[bin] = spectrum_frame[bin] - previous_spectrum_frame[bin]; | |
405 } | |
406 //std::cout << i << ": " << stats::Mean(flux_frame) << '\n'; | |
407 } | |
408 } | |
409 return spectral_flux_; | |
410 } | |
411 | |
412 } // dsp namespace |