diff src/feature_extractor.cpp @ 0:add35537fdbb tip

Initial import
author irh <ian.r.hobson@gmail.com>
date Thu, 25 Aug 2011 11:05:55 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/feature_extractor.cpp	Thu Aug 25 11:05:55 2011 +0100
@@ -0,0 +1,412 @@
+//  Copyright 2011, Ian Hobson.
+//
+//  This file is part of gpsynth.
+//
+//  gpsynth is free software: you can redistribute it and/or modify
+//  it under the terms of the GNU General Public License as published by
+//  the Free Software Foundation, either version 3 of the License, or
+//  (at your option) any later version.
+//
+//  gpsynth is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//  GNU General Public License for more details.
+//
+//  You should have received a copy of the GNU General Public License
+//  along with gpsynth in the file COPYING. 
+//  If not, see http://www.gnu.org/licenses/.
+
+#include "feature_extractor.hpp"
+
+#include "statistics.hpp"
+#include "std_ex.hpp"
+#include "window_makers.hpp"
+
+#include "sndfile.hh"
+
+#include "boost/bind.hpp"
+#include "boost/filesystem.hpp"
+
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <stdexcept>
+#include <sstream>
+
+namespace bf = boost::filesystem;
+
+namespace dsp {
+
+FeatureExtractor::FeatureExtractor(const std::string& file_path /* = "" */,
+                                   int window_size /* = 2048 */,
+                                   int hop_size /* = 512 */) 
+: window_size_(-1), // initialized by SetFrameSize
+  hop_size_(hop_size),
+  bin_frequencies_(window_size),
+  spectrum_analyzer_(window_size),
+  mfcc_analyzer_(window_size, 40, 13, 30, 10000)
+{
+  SetWindowSize(window_size);
+  if (!file_path.empty()) {
+    LoadFile(file_path);
+  }
+}
+
+void FeatureExtractor::ClearCaches() {
+  spectrum_.clear();
+  magnitude_spectrum_.clear();
+  log_magnitude_spectrum_.clear();
+  spectral_centroid_.clear();
+  spectral_spread_.clear();
+  spectral_flux_.clear();
+  pitch_.clear();
+  energy_.clear();
+  mfccs_.clear();
+  delta_mfccs_.clear();
+  double_delta_mfccs_.clear();
+}
+  
+void FeatureExtractor::SetWindowSize(int window_size) {
+  if (window_size_ != window_size) {
+    window_size_ = window_size;
+    frame_.resize(window_size);
+    // generate new window
+    window_.resize(window_size);
+    dsp::HammingWindow(window_size, window_.begin());
+    ClearCaches();
+  }
+}
+  
+void FeatureExtractor::SetHopSize(int hop_size) {
+  if (hop_size_ != hop_size) {
+    hop_size_ = hop_size;
+    ClearCaches();
+  }
+}
+  
+void FeatureExtractor::LoadFile(const std::string& file_path) {
+  SndfileHandle file(file_path);
+  if (file.error()) {
+    std::stringstream message;
+    message << "FeatureExtractor::ExtractFeaturesFromFile: "
+            << "Error reading from file '" << file_path << "' "
+            << '(' << file.error() << ')';
+    throw std::runtime_error(message.str());
+  }
+  sf_count_t frames = file.frames();
+  int channels = file.channels();
+  file_data_.resize(frames * channels);
+  // read the audio data in to the buffer
+  file.readf(&file_data_[0], frames);
+  // only supporting mono at the moment, remove extra channels
+  if (channels > 1) {
+    for (int i = 0; i < frames; i++) {
+      file_data_[i] = file_data_[i * channels];
+    }
+    file_data_.resize(frames);
+  }
+  // success, clear caches
+  ClearCaches();
+  // update file info
+  file_path_ = file_path;
+  sample_rate_ = file.samplerate();
+  frames_ = static_cast<int>(file_data_.size()) / hop_size_;
+  if (frames_ == 0) {
+    frames_++;
+  }
+  // bin info
+  bin_size_ = sample_rate_ / static_cast<Value>(window_size_);
+  std::generate(bin_frequencies_.begin(), bin_frequencies_.end(),
+                stdx::Stepper<Value>(bin_size_));
+}
+  
+Value FeatureExtractor::Duration() const {
+  return file_data_.size() / static_cast<Value>(sample_rate_);
+}
+
+const std::vector<ComplexValueList>& FeatureExtractor::Spectrum() {
+  if (spectrum_.empty()) {
+    const int frame_size = window_size_;
+    const int half_frame_size = frame_size / 2;
+    const int hop_size = hop_size_;
+    spectrum_.resize(frames_);
+    ValueList::const_iterator data_begin = file_data_.begin();
+    ValueList::const_iterator data_end = file_data_.end();
+    for (int i = 0; i < frames_; i++) {
+      //std::cout << "frame:" << i << '\n';
+      // find the frame bounds
+      ValueList::const_iterator read_start = data_begin + (i * hop_size);
+      ValueList::const_iterator read_end = read_start + frame_size;
+      if (read_end > data_end) {
+        read_end = data_end;
+      }
+      // extract the frame and apply window
+      std::transform(read_start, read_end, window_.begin(), 
+                     frame_.begin(), std::multiplies<Value>());
+      // on the last frame we'll need to fill the rest of the frame with zeros
+      std::size_t data_read = read_end - read_start;
+      if (data_read < frame_size) {
+        std::fill(frame_.begin() + data_read, frame_.end(), 0);
+      }
+      // get the spectrum of the current frame
+      std::copy(frame_.begin(), frame_.end(), spectrum_analyzer_.Input());
+      spectrum_analyzer_.Execute();
+      spectrum_[i].assign(spectrum_analyzer_.Output(),
+                          spectrum_analyzer_.Output() + half_frame_size);
+    }
+  }
+  return spectrum_;
+}
+
+const std::vector<ValueList>& FeatureExtractor::MagnitudeSpectrum() {
+  if (magnitude_spectrum_.empty()) {
+    // get complex spectrum
+    const std::vector<ComplexValueList>& spectrum = Spectrum();
+    // get magnitude spectrum for each frame
+    std::size_t frames = spectrum.size();
+    magnitude_spectrum_.resize(frames);
+    for (int i = 0; i < frames; i++) {
+      // take the absolute value of each complex bin
+      magnitude_spectrum_[i].resize(spectrum[i].size());
+      std::transform(spectrum[i].begin(), spectrum[i].end(),
+                     magnitude_spectrum_[i].begin(), std::abs<Value>);
+    }
+  }
+  return magnitude_spectrum_;
+}
+  
+const std::vector<ValueList>& FeatureExtractor::LogMagnitudeSpectrum() {
+  if (log_magnitude_spectrum_.empty()) {
+    // get magnitude spectrum
+    const std::vector<ValueList>& magnitude_spectrum = MagnitudeSpectrum();
+    // get magnitude spectrum for each frame
+    std::size_t frames = magnitude_spectrum.size();
+    log_magnitude_spectrum_.resize(frames);
+    for (int i = 0; i < frames; i++) {
+      log_magnitude_spectrum_[i].resize(magnitude_spectrum[i].size());
+      // copy magnitudes to log magnitude buffer, avoiding zeros
+      std::replace_copy(magnitude_spectrum[i].begin(),
+                        magnitude_spectrum[i].end(), 
+                        log_magnitude_spectrum_[i].begin(),
+                        0.0,
+                        std::numeric_limits<Value>::min());
+      // take the log of each magnitude (ugly casting for cmath function...)
+      std::transform(log_magnitude_spectrum_[i].begin(),
+                     log_magnitude_spectrum_[i].end(),
+                     log_magnitude_spectrum_[i].begin(),
+                     static_cast<double(*)(double)>(std::log10));
+    }
+  }
+  return log_magnitude_spectrum_;
+}
+
+const ValueList& FeatureExtractor::SpectralCentroid() {
+  if (spectral_centroid_.empty()) {
+    MagnitudeSpectrum();
+    std::size_t frames = magnitude_spectrum_.size();
+    spectral_centroid_.resize(frames);
+    std::size_t bins = magnitude_spectrum_[0].size();
+    ValueList weighted_frequencies(bins);
+    for (int i = 0; i < frames; i++) {
+      const ValueList& magnitudes = magnitude_spectrum_[i];
+      // get bin frequencies weighted by bin magnitudes
+      std::transform(magnitudes.begin(),
+                     magnitudes.end(),
+                     bin_frequencies_.begin(),
+                     weighted_frequencies.begin(),
+                     std::multiplies<Value>());
+      // divide mean of weighted frequencies by mean of bin magnitudes
+      Value magnitude = stats::Sum(magnitudes);
+      if (magnitude > 0) {
+        spectral_centroid_[i] = stats::Sum(weighted_frequencies) / magnitude;
+      } else {
+        spectral_centroid_[i] = 0;
+      }
+    }
+  }
+  return spectral_centroid_;
+}
+
+
+const ValueList& FeatureExtractor::SpectralSpread() {
+  if (spectral_spread_.empty()) {
+    SpectralCentroid();
+    std::size_t frames = magnitude_spectrum_.size();
+    spectral_spread_.resize(frames);
+    std::size_t bins = magnitude_spectrum_[0].size();
+    ValueList weighted_difference(bins);
+    for (int i = 0; i < frames; i++) {
+      Value centroid = spectral_centroid_[i];
+      const ValueList& magnitudes = magnitude_spectrum_[i];
+      for (int bin = 0; bin < bins; bin++) {
+        Value difference = bin_frequencies_[bin] - centroid;
+        weighted_difference[bin] = difference * difference * magnitudes[bin];
+      }
+      Value magnitude = stats::Sum(magnitudes);
+      if (magnitude > 0) {
+        Value variance = stats::Sum(weighted_difference) / magnitude;
+        spectral_spread_[i] = std::sqrt(variance);
+      } else {
+        spectral_spread_[i] = 0;
+      }
+    }
+  }
+  return spectral_spread_;
+}
+
+const ValueList& FeatureExtractor::Pitch() {
+  if (pitch_.empty()) {
+    MagnitudeSpectrum();
+    pitch_.resize(frames_);
+    std::deque<Value> recent_pitches;
+    const int moving_average_size = 8;
+    for (int i = 0; i < frames_; i++) {
+      const ValueList& mags = magnitude_spectrum_[i];
+      // find the peak bin number 
+      std::ptrdiff_t peak_bin = std::distance(mags.begin(),
+                                              std::max_element(mags.begin(),
+                                                               mags.end()));
+      // interpolate peak frequency
+      double offset;
+      double centre = mags[peak_bin];
+      if (centre == 0) {
+        pitch_[i] = 0;
+      } else {
+        if (peak_bin == 0) {
+          // parabolic interpolation of first 2 bin magnitudes
+          double above = mags[peak_bin + 1];
+          offset = above / (2 * (2 * centre - above));        
+        } else if (peak_bin == mags.size() - 1) {
+          // parabolic interpolation of last 2 bin magnitudes
+          double below = mags[peak_bin - 1];
+          offset = (-below) / (2 * (2 * centre - below));
+        } else {
+          double below = mags[peak_bin - 1];
+          double above = mags[peak_bin + 1];
+          // gaussian interpolation
+          offset = std::log(above / below) 
+                   / (2 * std::log((centre * centre) / (above * below)));
+        }
+        Value frame_pitch = bin_size_ * (peak_bin + offset);
+        if (recent_pitches.size() == moving_average_size) {
+          recent_pitches.pop_front();
+        }
+        recent_pitches.push_back(frame_pitch);
+        pitch_[i] = stats::Mean(recent_pitches);
+      }
+    }
+  }
+  return pitch_;
+}
+  
+const ValueList& FeatureExtractor::Energy() {
+  if (energy_.empty()) {
+    const int frame_size = window_size_;
+    const int hop_size = hop_size_;
+    energy_.resize(frames_);
+    ValueList::const_iterator data_begin = file_data_.begin();
+    ValueList::const_iterator data_end = file_data_.end();
+    for (int i = 0; i < frames_; i++) {
+      // find the frame bounds
+      ValueList::const_iterator read_start = data_begin + (i * hop_size);
+      ValueList::const_iterator read_end = read_start + frame_size;
+      if (read_end > data_end) {
+        read_end = data_end;
+      }
+      Value energy(0);
+      std::ptrdiff_t samples = std::distance(read_start, read_end);
+      while (read_start != read_end) {
+        energy += std::pow(*read_start++, 2);
+      }
+      energy_[i] = energy / samples;
+    }
+  }
+  return energy_;
+}
+
+const std::vector<ValueList>& FeatureExtractor::MFCCs() {
+  if (mfccs_.empty()) {
+    const std::vector<ValueList>& magnitude_spectrum = MagnitudeSpectrum();
+    mfcc_analyzer_.SetSampleRate(sample_rate_);
+    mfccs_.resize(frames_);
+    for (int i = 0; i < frames_; i++) {
+      mfccs_[i] = mfcc_analyzer_.ExtractMFCCS(magnitude_spectrum[i]);
+    }
+  }
+  return mfccs_;
+}
+  
+const std::vector<ValueList>& FeatureExtractor::DeltaMFCCs() {
+  if (delta_mfccs_.empty()) {
+    const std::vector<ValueList>& mfccs = MFCCs();
+    // resize buffers
+    const std::size_t number_of_mfccs = mfccs[0].size();
+    ValueList empty_values(number_of_mfccs, 0);
+    delta_mfccs_.resize(mfccs.size(), empty_values);
+    // calculate deltas based on 2 neighbouring frames in either direction
+    for (int i = 2, last_frame = frames_ - 2; i < last_frame; i++) {
+      const ValueList& before_2 = mfccs[i - 2];
+      const ValueList& before_1 = mfccs[i - 1];
+      const ValueList& after_1 = mfccs[i + 1];
+      const ValueList& after_2 = mfccs[i + 2];
+      ValueList& delta_frame = delta_mfccs_[i];
+      for (int m = 0; m < number_of_mfccs; m++) {
+        Value delta = after_1[m] - before_1[m]
+                      + 2.0 * (after_2[m] - before_2[m]);
+        delta_frame[m] = delta / 10.0;
+      }
+    }
+  }
+  return delta_mfccs_;
+}
+
+const std::vector<ValueList>& FeatureExtractor::DoubleDeltaMFCCs() {
+  if (double_delta_mfccs_.empty()) {
+    const std::vector<ValueList>& delta_mfccs = DeltaMFCCs();
+    // resize buffers
+    const std::size_t number_of_mfccs = delta_mfccs[0].size();
+    ValueList empty_values(number_of_mfccs, 0);
+    double_delta_mfccs_.resize(delta_mfccs.size(), empty_values);
+    // calculate deltas based on 2 neighbouring frames in either direction
+    for (int i = 2, last_frame = frames_ - 2; i < last_frame; i++) {
+      const ValueList& before_2 = delta_mfccs[i - 2];
+      const ValueList& before_1 = delta_mfccs[i - 1];
+      const ValueList& after_1 = delta_mfccs[i + 1];
+      const ValueList& after_2 = delta_mfccs[i + 2];
+      ValueList& double_delta_frame = double_delta_mfccs_[i];
+      for (int m = 0; m < number_of_mfccs; m++) {
+        Value delta = after_1[m] - before_1[m]
+                      + 2.0 * (after_2[m] - before_2[m]);
+        double_delta_frame[m] = delta / 10.0;
+      }
+    }
+  }
+  return double_delta_mfccs_;
+}
+  
+const std::vector<ValueList>& FeatureExtractor::SpectralFlux() {
+  if (spectral_flux_.empty()) {
+    // get magnitude spectrum
+    const std::vector<ValueList>& spectrum = MagnitudeSpectrum();
+    // get magnitude spectrum for each frame
+    std::size_t frames = spectrum.size();
+    std::size_t bins = spectrum[0].size();
+    spectral_flux_.resize(frames);
+    // first frame always zeroed
+    spectral_flux_[0].assign(bins, 0);
+    for (int i = 1; i < frames; i++) {
+      const ValueList& spectrum_frame = spectrum[i];
+      const ValueList& previous_spectrum_frame = spectrum[i - 1];
+      ValueList& flux_frame = spectral_flux_[i];
+      flux_frame.resize(bins);
+      // get the difference in magnitudes for each bin
+      for (int bin = 0; bin < bins; bin++) {
+        flux_frame[bin] = spectrum_frame[bin] - previous_spectrum_frame[bin];
+      }
+      //std::cout << i << ": " << stats::Mean(flux_frame) << '\n';
+    }
+  }
+  return spectral_flux_;
+}
+
+} // dsp namespace