comparison src/feature_extractor.cpp @ 0:add35537fdbb tip

Initial import
author irh <ian.r.hobson@gmail.com>
date Thu, 25 Aug 2011 11:05:55 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:add35537fdbb
1 // Copyright 2011, Ian Hobson.
2 //
3 // This file is part of gpsynth.
4 //
5 // gpsynth is free software: you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation, either version 3 of the License, or
8 // (at your option) any later version.
9 //
10 // gpsynth is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
14 //
15 // You should have received a copy of the GNU General Public License
16 // along with gpsynth in the file COPYING.
17 // If not, see http://www.gnu.org/licenses/.
18
19 #include "feature_extractor.hpp"
20
21 #include "statistics.hpp"
22 #include "std_ex.hpp"
23 #include "window_makers.hpp"
24
25 #include "sndfile.hh"
26
27 #include "boost/bind.hpp"
28 #include "boost/filesystem.hpp"
29
30 #include <algorithm>
31 #include <cmath>
32 #include <iostream>
33 #include <stdexcept>
34 #include <sstream>
35
36 namespace bf = boost::filesystem;
37
38 namespace dsp {
39
40 FeatureExtractor::FeatureExtractor(const std::string& file_path /* = "" */,
41 int window_size /* = 2048 */,
42 int hop_size /* = 512 */)
43 : window_size_(-1), // initialized by SetFrameSize
44 hop_size_(hop_size),
45 bin_frequencies_(window_size),
46 spectrum_analyzer_(window_size),
47 mfcc_analyzer_(window_size, 40, 13, 30, 10000)
48 {
49 SetWindowSize(window_size);
50 if (!file_path.empty()) {
51 LoadFile(file_path);
52 }
53 }
54
55 void FeatureExtractor::ClearCaches() {
56 spectrum_.clear();
57 magnitude_spectrum_.clear();
58 log_magnitude_spectrum_.clear();
59 spectral_centroid_.clear();
60 spectral_spread_.clear();
61 spectral_flux_.clear();
62 pitch_.clear();
63 energy_.clear();
64 mfccs_.clear();
65 delta_mfccs_.clear();
66 double_delta_mfccs_.clear();
67 }
68
69 void FeatureExtractor::SetWindowSize(int window_size) {
70 if (window_size_ != window_size) {
71 window_size_ = window_size;
72 frame_.resize(window_size);
73 // generate new window
74 window_.resize(window_size);
75 dsp::HammingWindow(window_size, window_.begin());
76 ClearCaches();
77 }
78 }
79
80 void FeatureExtractor::SetHopSize(int hop_size) {
81 if (hop_size_ != hop_size) {
82 hop_size_ = hop_size;
83 ClearCaches();
84 }
85 }
86
87 void FeatureExtractor::LoadFile(const std::string& file_path) {
88 SndfileHandle file(file_path);
89 if (file.error()) {
90 std::stringstream message;
91 message << "FeatureExtractor::ExtractFeaturesFromFile: "
92 << "Error reading from file '" << file_path << "' "
93 << '(' << file.error() << ')';
94 throw std::runtime_error(message.str());
95 }
96 sf_count_t frames = file.frames();
97 int channels = file.channels();
98 file_data_.resize(frames * channels);
99 // read the audio data in to the buffer
100 file.readf(&file_data_[0], frames);
101 // only supporting mono at the moment, remove extra channels
102 if (channels > 1) {
103 for (int i = 0; i < frames; i++) {
104 file_data_[i] = file_data_[i * channels];
105 }
106 file_data_.resize(frames);
107 }
108 // success, clear caches
109 ClearCaches();
110 // update file info
111 file_path_ = file_path;
112 sample_rate_ = file.samplerate();
113 frames_ = static_cast<int>(file_data_.size()) / hop_size_;
114 if (frames_ == 0) {
115 frames_++;
116 }
117 // bin info
118 bin_size_ = sample_rate_ / static_cast<Value>(window_size_);
119 std::generate(bin_frequencies_.begin(), bin_frequencies_.end(),
120 stdx::Stepper<Value>(bin_size_));
121 }
122
123 Value FeatureExtractor::Duration() const {
124 return file_data_.size() / static_cast<Value>(sample_rate_);
125 }
126
127 const std::vector<ComplexValueList>& FeatureExtractor::Spectrum() {
128 if (spectrum_.empty()) {
129 const int frame_size = window_size_;
130 const int half_frame_size = frame_size / 2;
131 const int hop_size = hop_size_;
132 spectrum_.resize(frames_);
133 ValueList::const_iterator data_begin = file_data_.begin();
134 ValueList::const_iterator data_end = file_data_.end();
135 for (int i = 0; i < frames_; i++) {
136 //std::cout << "frame:" << i << '\n';
137 // find the frame bounds
138 ValueList::const_iterator read_start = data_begin + (i * hop_size);
139 ValueList::const_iterator read_end = read_start + frame_size;
140 if (read_end > data_end) {
141 read_end = data_end;
142 }
143 // extract the frame and apply window
144 std::transform(read_start, read_end, window_.begin(),
145 frame_.begin(), std::multiplies<Value>());
146 // on the last frame we'll need to fill the rest of the frame with zeros
147 std::size_t data_read = read_end - read_start;
148 if (data_read < frame_size) {
149 std::fill(frame_.begin() + data_read, frame_.end(), 0);
150 }
151 // get the spectrum of the current frame
152 std::copy(frame_.begin(), frame_.end(), spectrum_analyzer_.Input());
153 spectrum_analyzer_.Execute();
154 spectrum_[i].assign(spectrum_analyzer_.Output(),
155 spectrum_analyzer_.Output() + half_frame_size);
156 }
157 }
158 return spectrum_;
159 }
160
161 const std::vector<ValueList>& FeatureExtractor::MagnitudeSpectrum() {
162 if (magnitude_spectrum_.empty()) {
163 // get complex spectrum
164 const std::vector<ComplexValueList>& spectrum = Spectrum();
165 // get magnitude spectrum for each frame
166 std::size_t frames = spectrum.size();
167 magnitude_spectrum_.resize(frames);
168 for (int i = 0; i < frames; i++) {
169 // take the absolute value of each complex bin
170 magnitude_spectrum_[i].resize(spectrum[i].size());
171 std::transform(spectrum[i].begin(), spectrum[i].end(),
172 magnitude_spectrum_[i].begin(), std::abs<Value>);
173 }
174 }
175 return magnitude_spectrum_;
176 }
177
178 const std::vector<ValueList>& FeatureExtractor::LogMagnitudeSpectrum() {
179 if (log_magnitude_spectrum_.empty()) {
180 // get magnitude spectrum
181 const std::vector<ValueList>& magnitude_spectrum = MagnitudeSpectrum();
182 // get magnitude spectrum for each frame
183 std::size_t frames = magnitude_spectrum.size();
184 log_magnitude_spectrum_.resize(frames);
185 for (int i = 0; i < frames; i++) {
186 log_magnitude_spectrum_[i].resize(magnitude_spectrum[i].size());
187 // copy magnitudes to log magnitude buffer, avoiding zeros
188 std::replace_copy(magnitude_spectrum[i].begin(),
189 magnitude_spectrum[i].end(),
190 log_magnitude_spectrum_[i].begin(),
191 0.0,
192 std::numeric_limits<Value>::min());
193 // take the log of each magnitude (ugly casting for cmath function...)
194 std::transform(log_magnitude_spectrum_[i].begin(),
195 log_magnitude_spectrum_[i].end(),
196 log_magnitude_spectrum_[i].begin(),
197 static_cast<double(*)(double)>(std::log10));
198 }
199 }
200 return log_magnitude_spectrum_;
201 }
202
203 const ValueList& FeatureExtractor::SpectralCentroid() {
204 if (spectral_centroid_.empty()) {
205 MagnitudeSpectrum();
206 std::size_t frames = magnitude_spectrum_.size();
207 spectral_centroid_.resize(frames);
208 std::size_t bins = magnitude_spectrum_[0].size();
209 ValueList weighted_frequencies(bins);
210 for (int i = 0; i < frames; i++) {
211 const ValueList& magnitudes = magnitude_spectrum_[i];
212 // get bin frequencies weighted by bin magnitudes
213 std::transform(magnitudes.begin(),
214 magnitudes.end(),
215 bin_frequencies_.begin(),
216 weighted_frequencies.begin(),
217 std::multiplies<Value>());
218 // divide mean of weighted frequencies by mean of bin magnitudes
219 Value magnitude = stats::Sum(magnitudes);
220 if (magnitude > 0) {
221 spectral_centroid_[i] = stats::Sum(weighted_frequencies) / magnitude;
222 } else {
223 spectral_centroid_[i] = 0;
224 }
225 }
226 }
227 return spectral_centroid_;
228 }
229
230
231 const ValueList& FeatureExtractor::SpectralSpread() {
232 if (spectral_spread_.empty()) {
233 SpectralCentroid();
234 std::size_t frames = magnitude_spectrum_.size();
235 spectral_spread_.resize(frames);
236 std::size_t bins = magnitude_spectrum_[0].size();
237 ValueList weighted_difference(bins);
238 for (int i = 0; i < frames; i++) {
239 Value centroid = spectral_centroid_[i];
240 const ValueList& magnitudes = magnitude_spectrum_[i];
241 for (int bin = 0; bin < bins; bin++) {
242 Value difference = bin_frequencies_[bin] - centroid;
243 weighted_difference[bin] = difference * difference * magnitudes[bin];
244 }
245 Value magnitude = stats::Sum(magnitudes);
246 if (magnitude > 0) {
247 Value variance = stats::Sum(weighted_difference) / magnitude;
248 spectral_spread_[i] = std::sqrt(variance);
249 } else {
250 spectral_spread_[i] = 0;
251 }
252 }
253 }
254 return spectral_spread_;
255 }
256
257 const ValueList& FeatureExtractor::Pitch() {
258 if (pitch_.empty()) {
259 MagnitudeSpectrum();
260 pitch_.resize(frames_);
261 std::deque<Value> recent_pitches;
262 const int moving_average_size = 8;
263 for (int i = 0; i < frames_; i++) {
264 const ValueList& mags = magnitude_spectrum_[i];
265 // find the peak bin number
266 std::ptrdiff_t peak_bin = std::distance(mags.begin(),
267 std::max_element(mags.begin(),
268 mags.end()));
269 // interpolate peak frequency
270 double offset;
271 double centre = mags[peak_bin];
272 if (centre == 0) {
273 pitch_[i] = 0;
274 } else {
275 if (peak_bin == 0) {
276 // parabolic interpolation of first 2 bin magnitudes
277 double above = mags[peak_bin + 1];
278 offset = above / (2 * (2 * centre - above));
279 } else if (peak_bin == mags.size() - 1) {
280 // parabolic interpolation of last 2 bin magnitudes
281 double below = mags[peak_bin - 1];
282 offset = (-below) / (2 * (2 * centre - below));
283 } else {
284 double below = mags[peak_bin - 1];
285 double above = mags[peak_bin + 1];
286 // gaussian interpolation
287 offset = std::log(above / below)
288 / (2 * std::log((centre * centre) / (above * below)));
289 }
290 Value frame_pitch = bin_size_ * (peak_bin + offset);
291 if (recent_pitches.size() == moving_average_size) {
292 recent_pitches.pop_front();
293 }
294 recent_pitches.push_back(frame_pitch);
295 pitch_[i] = stats::Mean(recent_pitches);
296 }
297 }
298 }
299 return pitch_;
300 }
301
302 const ValueList& FeatureExtractor::Energy() {
303 if (energy_.empty()) {
304 const int frame_size = window_size_;
305 const int hop_size = hop_size_;
306 energy_.resize(frames_);
307 ValueList::const_iterator data_begin = file_data_.begin();
308 ValueList::const_iterator data_end = file_data_.end();
309 for (int i = 0; i < frames_; i++) {
310 // find the frame bounds
311 ValueList::const_iterator read_start = data_begin + (i * hop_size);
312 ValueList::const_iterator read_end = read_start + frame_size;
313 if (read_end > data_end) {
314 read_end = data_end;
315 }
316 Value energy(0);
317 std::ptrdiff_t samples = std::distance(read_start, read_end);
318 while (read_start != read_end) {
319 energy += std::pow(*read_start++, 2);
320 }
321 energy_[i] = energy / samples;
322 }
323 }
324 return energy_;
325 }
326
327 const std::vector<ValueList>& FeatureExtractor::MFCCs() {
328 if (mfccs_.empty()) {
329 const std::vector<ValueList>& magnitude_spectrum = MagnitudeSpectrum();
330 mfcc_analyzer_.SetSampleRate(sample_rate_);
331 mfccs_.resize(frames_);
332 for (int i = 0; i < frames_; i++) {
333 mfccs_[i] = mfcc_analyzer_.ExtractMFCCS(magnitude_spectrum[i]);
334 }
335 }
336 return mfccs_;
337 }
338
339 const std::vector<ValueList>& FeatureExtractor::DeltaMFCCs() {
340 if (delta_mfccs_.empty()) {
341 const std::vector<ValueList>& mfccs = MFCCs();
342 // resize buffers
343 const std::size_t number_of_mfccs = mfccs[0].size();
344 ValueList empty_values(number_of_mfccs, 0);
345 delta_mfccs_.resize(mfccs.size(), empty_values);
346 // calculate deltas based on 2 neighbouring frames in either direction
347 for (int i = 2, last_frame = frames_ - 2; i < last_frame; i++) {
348 const ValueList& before_2 = mfccs[i - 2];
349 const ValueList& before_1 = mfccs[i - 1];
350 const ValueList& after_1 = mfccs[i + 1];
351 const ValueList& after_2 = mfccs[i + 2];
352 ValueList& delta_frame = delta_mfccs_[i];
353 for (int m = 0; m < number_of_mfccs; m++) {
354 Value delta = after_1[m] - before_1[m]
355 + 2.0 * (after_2[m] - before_2[m]);
356 delta_frame[m] = delta / 10.0;
357 }
358 }
359 }
360 return delta_mfccs_;
361 }
362
363 const std::vector<ValueList>& FeatureExtractor::DoubleDeltaMFCCs() {
364 if (double_delta_mfccs_.empty()) {
365 const std::vector<ValueList>& delta_mfccs = DeltaMFCCs();
366 // resize buffers
367 const std::size_t number_of_mfccs = delta_mfccs[0].size();
368 ValueList empty_values(number_of_mfccs, 0);
369 double_delta_mfccs_.resize(delta_mfccs.size(), empty_values);
370 // calculate deltas based on 2 neighbouring frames in either direction
371 for (int i = 2, last_frame = frames_ - 2; i < last_frame; i++) {
372 const ValueList& before_2 = delta_mfccs[i - 2];
373 const ValueList& before_1 = delta_mfccs[i - 1];
374 const ValueList& after_1 = delta_mfccs[i + 1];
375 const ValueList& after_2 = delta_mfccs[i + 2];
376 ValueList& double_delta_frame = double_delta_mfccs_[i];
377 for (int m = 0; m < number_of_mfccs; m++) {
378 Value delta = after_1[m] - before_1[m]
379 + 2.0 * (after_2[m] - before_2[m]);
380 double_delta_frame[m] = delta / 10.0;
381 }
382 }
383 }
384 return double_delta_mfccs_;
385 }
386
387 const std::vector<ValueList>& FeatureExtractor::SpectralFlux() {
388 if (spectral_flux_.empty()) {
389 // get magnitude spectrum
390 const std::vector<ValueList>& spectrum = MagnitudeSpectrum();
391 // get magnitude spectrum for each frame
392 std::size_t frames = spectrum.size();
393 std::size_t bins = spectrum[0].size();
394 spectral_flux_.resize(frames);
395 // first frame always zeroed
396 spectral_flux_[0].assign(bins, 0);
397 for (int i = 1; i < frames; i++) {
398 const ValueList& spectrum_frame = spectrum[i];
399 const ValueList& previous_spectrum_frame = spectrum[i - 1];
400 ValueList& flux_frame = spectral_flux_[i];
401 flux_frame.resize(bins);
402 // get the difference in magnitudes for each bin
403 for (int bin = 0; bin < bins; bin++) {
404 flux_frame[bin] = spectrum_frame[bin] - previous_spectrum_frame[bin];
405 }
406 //std::cout << i << ": " << stats::Mean(flux_frame) << '\n';
407 }
408 }
409 return spectral_flux_;
410 }
411
412 } // dsp namespace