view src/feature_extractor.cpp @ 0:add35537fdbb tip

Initial import
author irh <ian.r.hobson@gmail.com>
date Thu, 25 Aug 2011 11:05:55 +0100
parents
children
line wrap: on
line source
//  Copyright 2011, Ian Hobson.
//
//  This file is part of gpsynth.
//
//  gpsynth is free software: you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation, either version 3 of the License, or
//  (at your option) any later version.
//
//  gpsynth is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with gpsynth in the file COPYING. 
//  If not, see http://www.gnu.org/licenses/.

#include "feature_extractor.hpp"

#include "statistics.hpp"
#include "std_ex.hpp"
#include "window_makers.hpp"

#include "sndfile.hh"

#include "boost/bind.hpp"
#include "boost/filesystem.hpp"

#include <algorithm>
#include <cmath>
#include <iostream>
#include <stdexcept>
#include <sstream>

namespace bf = boost::filesystem;

namespace dsp {

FeatureExtractor::FeatureExtractor(const std::string& file_path /* = "" */,
                                   int window_size /* = 2048 */,
                                   int hop_size /* = 512 */) 
: window_size_(-1), // initialized by SetFrameSize
  hop_size_(hop_size),
  bin_frequencies_(window_size),
  spectrum_analyzer_(window_size),
  mfcc_analyzer_(window_size, 40, 13, 30, 10000)
{
  SetWindowSize(window_size);
  if (!file_path.empty()) {
    LoadFile(file_path);
  }
}

void FeatureExtractor::ClearCaches() {
  spectrum_.clear();
  magnitude_spectrum_.clear();
  log_magnitude_spectrum_.clear();
  spectral_centroid_.clear();
  spectral_spread_.clear();
  spectral_flux_.clear();
  pitch_.clear();
  energy_.clear();
  mfccs_.clear();
  delta_mfccs_.clear();
  double_delta_mfccs_.clear();
}
  
void FeatureExtractor::SetWindowSize(int window_size) {
  if (window_size_ != window_size) {
    window_size_ = window_size;
    frame_.resize(window_size);
    // generate new window
    window_.resize(window_size);
    dsp::HammingWindow(window_size, window_.begin());
    ClearCaches();
  }
}
  
void FeatureExtractor::SetHopSize(int hop_size) {
  if (hop_size_ != hop_size) {
    hop_size_ = hop_size;
    ClearCaches();
  }
}
  
void FeatureExtractor::LoadFile(const std::string& file_path) {
  SndfileHandle file(file_path);
  if (file.error()) {
    std::stringstream message;
    message << "FeatureExtractor::ExtractFeaturesFromFile: "
            << "Error reading from file '" << file_path << "' "
            << '(' << file.error() << ')';
    throw std::runtime_error(message.str());
  }
  sf_count_t frames = file.frames();
  int channels = file.channels();
  file_data_.resize(frames * channels);
  // read the audio data in to the buffer
  file.readf(&file_data_[0], frames);
  // only supporting mono at the moment, remove extra channels
  if (channels > 1) {
    for (int i = 0; i < frames; i++) {
      file_data_[i] = file_data_[i * channels];
    }
    file_data_.resize(frames);
  }
  // success, clear caches
  ClearCaches();
  // update file info
  file_path_ = file_path;
  sample_rate_ = file.samplerate();
  frames_ = static_cast<int>(file_data_.size()) / hop_size_;
  if (frames_ == 0) {
    frames_++;
  }
  // bin info
  bin_size_ = sample_rate_ / static_cast<Value>(window_size_);
  std::generate(bin_frequencies_.begin(), bin_frequencies_.end(),
                stdx::Stepper<Value>(bin_size_));
}
  
Value FeatureExtractor::Duration() const {
  return file_data_.size() / static_cast<Value>(sample_rate_);
}

const std::vector<ComplexValueList>& FeatureExtractor::Spectrum() {
  if (spectrum_.empty()) {
    const int frame_size = window_size_;
    const int half_frame_size = frame_size / 2;
    const int hop_size = hop_size_;
    spectrum_.resize(frames_);
    ValueList::const_iterator data_begin = file_data_.begin();
    ValueList::const_iterator data_end = file_data_.end();
    for (int i = 0; i < frames_; i++) {
      //std::cout << "frame:" << i << '\n';
      // find the frame bounds
      ValueList::const_iterator read_start = data_begin + (i * hop_size);
      ValueList::const_iterator read_end = read_start + frame_size;
      if (read_end > data_end) {
        read_end = data_end;
      }
      // extract the frame and apply window
      std::transform(read_start, read_end, window_.begin(), 
                     frame_.begin(), std::multiplies<Value>());
      // on the last frame we'll need to fill the rest of the frame with zeros
      std::size_t data_read = read_end - read_start;
      if (data_read < frame_size) {
        std::fill(frame_.begin() + data_read, frame_.end(), 0);
      }
      // get the spectrum of the current frame
      std::copy(frame_.begin(), frame_.end(), spectrum_analyzer_.Input());
      spectrum_analyzer_.Execute();
      spectrum_[i].assign(spectrum_analyzer_.Output(),
                          spectrum_analyzer_.Output() + half_frame_size);
    }
  }
  return spectrum_;
}

const std::vector<ValueList>& FeatureExtractor::MagnitudeSpectrum() {
  if (magnitude_spectrum_.empty()) {
    // get complex spectrum
    const std::vector<ComplexValueList>& spectrum = Spectrum();
    // get magnitude spectrum for each frame
    std::size_t frames = spectrum.size();
    magnitude_spectrum_.resize(frames);
    for (int i = 0; i < frames; i++) {
      // take the absolute value of each complex bin
      magnitude_spectrum_[i].resize(spectrum[i].size());
      std::transform(spectrum[i].begin(), spectrum[i].end(),
                     magnitude_spectrum_[i].begin(), std::abs<Value>);
    }
  }
  return magnitude_spectrum_;
}
  
const std::vector<ValueList>& FeatureExtractor::LogMagnitudeSpectrum() {
  if (log_magnitude_spectrum_.empty()) {
    // get magnitude spectrum
    const std::vector<ValueList>& magnitude_spectrum = MagnitudeSpectrum();
    // get magnitude spectrum for each frame
    std::size_t frames = magnitude_spectrum.size();
    log_magnitude_spectrum_.resize(frames);
    for (int i = 0; i < frames; i++) {
      log_magnitude_spectrum_[i].resize(magnitude_spectrum[i].size());
      // copy magnitudes to log magnitude buffer, avoiding zeros
      std::replace_copy(magnitude_spectrum[i].begin(),
                        magnitude_spectrum[i].end(), 
                        log_magnitude_spectrum_[i].begin(),
                        0.0,
                        std::numeric_limits<Value>::min());
      // take the log of each magnitude (ugly casting for cmath function...)
      std::transform(log_magnitude_spectrum_[i].begin(),
                     log_magnitude_spectrum_[i].end(),
                     log_magnitude_spectrum_[i].begin(),
                     static_cast<double(*)(double)>(std::log10));
    }
  }
  return log_magnitude_spectrum_;
}

const ValueList& FeatureExtractor::SpectralCentroid() {
  if (spectral_centroid_.empty()) {
    MagnitudeSpectrum();
    std::size_t frames = magnitude_spectrum_.size();
    spectral_centroid_.resize(frames);
    std::size_t bins = magnitude_spectrum_[0].size();
    ValueList weighted_frequencies(bins);
    for (int i = 0; i < frames; i++) {
      const ValueList& magnitudes = magnitude_spectrum_[i];
      // get bin frequencies weighted by bin magnitudes
      std::transform(magnitudes.begin(),
                     magnitudes.end(),
                     bin_frequencies_.begin(),
                     weighted_frequencies.begin(),
                     std::multiplies<Value>());
      // divide mean of weighted frequencies by mean of bin magnitudes
      Value magnitude = stats::Sum(magnitudes);
      if (magnitude > 0) {
        spectral_centroid_[i] = stats::Sum(weighted_frequencies) / magnitude;
      } else {
        spectral_centroid_[i] = 0;
      }
    }
  }
  return spectral_centroid_;
}


const ValueList& FeatureExtractor::SpectralSpread() {
  if (spectral_spread_.empty()) {
    SpectralCentroid();
    std::size_t frames = magnitude_spectrum_.size();
    spectral_spread_.resize(frames);
    std::size_t bins = magnitude_spectrum_[0].size();
    ValueList weighted_difference(bins);
    for (int i = 0; i < frames; i++) {
      Value centroid = spectral_centroid_[i];
      const ValueList& magnitudes = magnitude_spectrum_[i];
      for (int bin = 0; bin < bins; bin++) {
        Value difference = bin_frequencies_[bin] - centroid;
        weighted_difference[bin] = difference * difference * magnitudes[bin];
      }
      Value magnitude = stats::Sum(magnitudes);
      if (magnitude > 0) {
        Value variance = stats::Sum(weighted_difference) / magnitude;
        spectral_spread_[i] = std::sqrt(variance);
      } else {
        spectral_spread_[i] = 0;
      }
    }
  }
  return spectral_spread_;
}

const ValueList& FeatureExtractor::Pitch() {
  if (pitch_.empty()) {
    MagnitudeSpectrum();
    pitch_.resize(frames_);
    std::deque<Value> recent_pitches;
    const int moving_average_size = 8;
    for (int i = 0; i < frames_; i++) {
      const ValueList& mags = magnitude_spectrum_[i];
      // find the peak bin number 
      std::ptrdiff_t peak_bin = std::distance(mags.begin(),
                                              std::max_element(mags.begin(),
                                                               mags.end()));
      // interpolate peak frequency
      double offset;
      double centre = mags[peak_bin];
      if (centre == 0) {
        pitch_[i] = 0;
      } else {
        if (peak_bin == 0) {
          // parabolic interpolation of first 2 bin magnitudes
          double above = mags[peak_bin + 1];
          offset = above / (2 * (2 * centre - above));        
        } else if (peak_bin == mags.size() - 1) {
          // parabolic interpolation of last 2 bin magnitudes
          double below = mags[peak_bin - 1];
          offset = (-below) / (2 * (2 * centre - below));
        } else {
          double below = mags[peak_bin - 1];
          double above = mags[peak_bin + 1];
          // gaussian interpolation
          offset = std::log(above / below) 
                   / (2 * std::log((centre * centre) / (above * below)));
        }
        Value frame_pitch = bin_size_ * (peak_bin + offset);
        if (recent_pitches.size() == moving_average_size) {
          recent_pitches.pop_front();
        }
        recent_pitches.push_back(frame_pitch);
        pitch_[i] = stats::Mean(recent_pitches);
      }
    }
  }
  return pitch_;
}
  
const ValueList& FeatureExtractor::Energy() {
  if (energy_.empty()) {
    const int frame_size = window_size_;
    const int hop_size = hop_size_;
    energy_.resize(frames_);
    ValueList::const_iterator data_begin = file_data_.begin();
    ValueList::const_iterator data_end = file_data_.end();
    for (int i = 0; i < frames_; i++) {
      // find the frame bounds
      ValueList::const_iterator read_start = data_begin + (i * hop_size);
      ValueList::const_iterator read_end = read_start + frame_size;
      if (read_end > data_end) {
        read_end = data_end;
      }
      Value energy(0);
      std::ptrdiff_t samples = std::distance(read_start, read_end);
      while (read_start != read_end) {
        energy += std::pow(*read_start++, 2);
      }
      energy_[i] = energy / samples;
    }
  }
  return energy_;
}

const std::vector<ValueList>& FeatureExtractor::MFCCs() {
  if (mfccs_.empty()) {
    const std::vector<ValueList>& magnitude_spectrum = MagnitudeSpectrum();
    mfcc_analyzer_.SetSampleRate(sample_rate_);
    mfccs_.resize(frames_);
    for (int i = 0; i < frames_; i++) {
      mfccs_[i] = mfcc_analyzer_.ExtractMFCCS(magnitude_spectrum[i]);
    }
  }
  return mfccs_;
}
  
const std::vector<ValueList>& FeatureExtractor::DeltaMFCCs() {
  if (delta_mfccs_.empty()) {
    const std::vector<ValueList>& mfccs = MFCCs();
    // resize buffers
    const std::size_t number_of_mfccs = mfccs[0].size();
    ValueList empty_values(number_of_mfccs, 0);
    delta_mfccs_.resize(mfccs.size(), empty_values);
    // calculate deltas based on 2 neighbouring frames in either direction
    for (int i = 2, last_frame = frames_ - 2; i < last_frame; i++) {
      const ValueList& before_2 = mfccs[i - 2];
      const ValueList& before_1 = mfccs[i - 1];
      const ValueList& after_1 = mfccs[i + 1];
      const ValueList& after_2 = mfccs[i + 2];
      ValueList& delta_frame = delta_mfccs_[i];
      for (int m = 0; m < number_of_mfccs; m++) {
        Value delta = after_1[m] - before_1[m]
                      + 2.0 * (after_2[m] - before_2[m]);
        delta_frame[m] = delta / 10.0;
      }
    }
  }
  return delta_mfccs_;
}

const std::vector<ValueList>& FeatureExtractor::DoubleDeltaMFCCs() {
  if (double_delta_mfccs_.empty()) {
    const std::vector<ValueList>& delta_mfccs = DeltaMFCCs();
    // resize buffers
    const std::size_t number_of_mfccs = delta_mfccs[0].size();
    ValueList empty_values(number_of_mfccs, 0);
    double_delta_mfccs_.resize(delta_mfccs.size(), empty_values);
    // calculate deltas based on 2 neighbouring frames in either direction
    for (int i = 2, last_frame = frames_ - 2; i < last_frame; i++) {
      const ValueList& before_2 = delta_mfccs[i - 2];
      const ValueList& before_1 = delta_mfccs[i - 1];
      const ValueList& after_1 = delta_mfccs[i + 1];
      const ValueList& after_2 = delta_mfccs[i + 2];
      ValueList& double_delta_frame = double_delta_mfccs_[i];
      for (int m = 0; m < number_of_mfccs; m++) {
        Value delta = after_1[m] - before_1[m]
                      + 2.0 * (after_2[m] - before_2[m]);
        double_delta_frame[m] = delta / 10.0;
      }
    }
  }
  return double_delta_mfccs_;
}
  
const std::vector<ValueList>& FeatureExtractor::SpectralFlux() {
  if (spectral_flux_.empty()) {
    // get magnitude spectrum
    const std::vector<ValueList>& spectrum = MagnitudeSpectrum();
    // get magnitude spectrum for each frame
    std::size_t frames = spectrum.size();
    std::size_t bins = spectrum[0].size();
    spectral_flux_.resize(frames);
    // first frame always zeroed
    spectral_flux_[0].assign(bins, 0);
    for (int i = 1; i < frames; i++) {
      const ValueList& spectrum_frame = spectrum[i];
      const ValueList& previous_spectrum_frame = spectrum[i - 1];
      ValueList& flux_frame = spectral_flux_[i];
      flux_frame.resize(bins);
      // get the difference in magnitudes for each bin
      for (int bin = 0; bin < bins; bin++) {
        flux_frame[bin] = spectrum_frame[bin] - previous_spectrum_frame[bin];
      }
      //std::cout << i << ": " << stats::Mean(flux_frame) << '\n';
    }
  }
  return spectral_flux_;
}

} // dsp namespace