Mercurial > hg > gpsynth
view src/feature_extractor.cpp @ 0:add35537fdbb tip
Initial import
author | irh <ian.r.hobson@gmail.com> |
---|---|
date | Thu, 25 Aug 2011 11:05:55 +0100 |
parents | |
children |
line wrap: on
line source
// Copyright 2011, Ian Hobson. // // This file is part of gpsynth. // // gpsynth is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // gpsynth is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with gpsynth in the file COPYING. // If not, see http://www.gnu.org/licenses/. #include "feature_extractor.hpp" #include "statistics.hpp" #include "std_ex.hpp" #include "window_makers.hpp" #include "sndfile.hh" #include "boost/bind.hpp" #include "boost/filesystem.hpp" #include <algorithm> #include <cmath> #include <iostream> #include <stdexcept> #include <sstream> namespace bf = boost::filesystem; namespace dsp { FeatureExtractor::FeatureExtractor(const std::string& file_path /* = "" */, int window_size /* = 2048 */, int hop_size /* = 512 */) : window_size_(-1), // initialized by SetFrameSize hop_size_(hop_size), bin_frequencies_(window_size), spectrum_analyzer_(window_size), mfcc_analyzer_(window_size, 40, 13, 30, 10000) { SetWindowSize(window_size); if (!file_path.empty()) { LoadFile(file_path); } } void FeatureExtractor::ClearCaches() { spectrum_.clear(); magnitude_spectrum_.clear(); log_magnitude_spectrum_.clear(); spectral_centroid_.clear(); spectral_spread_.clear(); spectral_flux_.clear(); pitch_.clear(); energy_.clear(); mfccs_.clear(); delta_mfccs_.clear(); double_delta_mfccs_.clear(); } void FeatureExtractor::SetWindowSize(int window_size) { if (window_size_ != window_size) { window_size_ = window_size; frame_.resize(window_size); // generate new window window_.resize(window_size); dsp::HammingWindow(window_size, window_.begin()); ClearCaches(); } } void FeatureExtractor::SetHopSize(int hop_size) { if (hop_size_ != hop_size) { hop_size_ = hop_size; ClearCaches(); } } void FeatureExtractor::LoadFile(const std::string& file_path) { SndfileHandle file(file_path); if (file.error()) { std::stringstream message; message << "FeatureExtractor::ExtractFeaturesFromFile: " << "Error reading from file '" << file_path << "' " << '(' << file.error() << ')'; throw std::runtime_error(message.str()); } sf_count_t frames = file.frames(); int channels = file.channels(); file_data_.resize(frames * channels); // read the audio data in to the buffer file.readf(&file_data_[0], frames); // only supporting mono at the moment, remove extra channels if (channels > 1) { for (int i = 0; i < frames; i++) { file_data_[i] = file_data_[i * channels]; } file_data_.resize(frames); } // success, clear caches ClearCaches(); // update file info file_path_ = file_path; sample_rate_ = file.samplerate(); frames_ = static_cast<int>(file_data_.size()) / hop_size_; if (frames_ == 0) { frames_++; } // bin info bin_size_ = sample_rate_ / static_cast<Value>(window_size_); std::generate(bin_frequencies_.begin(), bin_frequencies_.end(), stdx::Stepper<Value>(bin_size_)); } Value FeatureExtractor::Duration() const { return file_data_.size() / static_cast<Value>(sample_rate_); } const std::vector<ComplexValueList>& FeatureExtractor::Spectrum() { if (spectrum_.empty()) { const int frame_size = window_size_; const int half_frame_size = frame_size / 2; const int hop_size = hop_size_; spectrum_.resize(frames_); ValueList::const_iterator data_begin = file_data_.begin(); ValueList::const_iterator data_end = file_data_.end(); for (int i = 0; i < frames_; i++) { //std::cout << "frame:" << i << '\n'; // find the frame bounds ValueList::const_iterator read_start = data_begin + (i * hop_size); ValueList::const_iterator read_end = read_start + frame_size; if (read_end > data_end) { read_end = data_end; } // extract the frame and apply window std::transform(read_start, read_end, window_.begin(), frame_.begin(), std::multiplies<Value>()); // on the last frame we'll need to fill the rest of the frame with zeros std::size_t data_read = read_end - read_start; if (data_read < frame_size) { std::fill(frame_.begin() + data_read, frame_.end(), 0); } // get the spectrum of the current frame std::copy(frame_.begin(), frame_.end(), spectrum_analyzer_.Input()); spectrum_analyzer_.Execute(); spectrum_[i].assign(spectrum_analyzer_.Output(), spectrum_analyzer_.Output() + half_frame_size); } } return spectrum_; } const std::vector<ValueList>& FeatureExtractor::MagnitudeSpectrum() { if (magnitude_spectrum_.empty()) { // get complex spectrum const std::vector<ComplexValueList>& spectrum = Spectrum(); // get magnitude spectrum for each frame std::size_t frames = spectrum.size(); magnitude_spectrum_.resize(frames); for (int i = 0; i < frames; i++) { // take the absolute value of each complex bin magnitude_spectrum_[i].resize(spectrum[i].size()); std::transform(spectrum[i].begin(), spectrum[i].end(), magnitude_spectrum_[i].begin(), std::abs<Value>); } } return magnitude_spectrum_; } const std::vector<ValueList>& FeatureExtractor::LogMagnitudeSpectrum() { if (log_magnitude_spectrum_.empty()) { // get magnitude spectrum const std::vector<ValueList>& magnitude_spectrum = MagnitudeSpectrum(); // get magnitude spectrum for each frame std::size_t frames = magnitude_spectrum.size(); log_magnitude_spectrum_.resize(frames); for (int i = 0; i < frames; i++) { log_magnitude_spectrum_[i].resize(magnitude_spectrum[i].size()); // copy magnitudes to log magnitude buffer, avoiding zeros std::replace_copy(magnitude_spectrum[i].begin(), magnitude_spectrum[i].end(), log_magnitude_spectrum_[i].begin(), 0.0, std::numeric_limits<Value>::min()); // take the log of each magnitude (ugly casting for cmath function...) std::transform(log_magnitude_spectrum_[i].begin(), log_magnitude_spectrum_[i].end(), log_magnitude_spectrum_[i].begin(), static_cast<double(*)(double)>(std::log10)); } } return log_magnitude_spectrum_; } const ValueList& FeatureExtractor::SpectralCentroid() { if (spectral_centroid_.empty()) { MagnitudeSpectrum(); std::size_t frames = magnitude_spectrum_.size(); spectral_centroid_.resize(frames); std::size_t bins = magnitude_spectrum_[0].size(); ValueList weighted_frequencies(bins); for (int i = 0; i < frames; i++) { const ValueList& magnitudes = magnitude_spectrum_[i]; // get bin frequencies weighted by bin magnitudes std::transform(magnitudes.begin(), magnitudes.end(), bin_frequencies_.begin(), weighted_frequencies.begin(), std::multiplies<Value>()); // divide mean of weighted frequencies by mean of bin magnitudes Value magnitude = stats::Sum(magnitudes); if (magnitude > 0) { spectral_centroid_[i] = stats::Sum(weighted_frequencies) / magnitude; } else { spectral_centroid_[i] = 0; } } } return spectral_centroid_; } const ValueList& FeatureExtractor::SpectralSpread() { if (spectral_spread_.empty()) { SpectralCentroid(); std::size_t frames = magnitude_spectrum_.size(); spectral_spread_.resize(frames); std::size_t bins = magnitude_spectrum_[0].size(); ValueList weighted_difference(bins); for (int i = 0; i < frames; i++) { Value centroid = spectral_centroid_[i]; const ValueList& magnitudes = magnitude_spectrum_[i]; for (int bin = 0; bin < bins; bin++) { Value difference = bin_frequencies_[bin] - centroid; weighted_difference[bin] = difference * difference * magnitudes[bin]; } Value magnitude = stats::Sum(magnitudes); if (magnitude > 0) { Value variance = stats::Sum(weighted_difference) / magnitude; spectral_spread_[i] = std::sqrt(variance); } else { spectral_spread_[i] = 0; } } } return spectral_spread_; } const ValueList& FeatureExtractor::Pitch() { if (pitch_.empty()) { MagnitudeSpectrum(); pitch_.resize(frames_); std::deque<Value> recent_pitches; const int moving_average_size = 8; for (int i = 0; i < frames_; i++) { const ValueList& mags = magnitude_spectrum_[i]; // find the peak bin number std::ptrdiff_t peak_bin = std::distance(mags.begin(), std::max_element(mags.begin(), mags.end())); // interpolate peak frequency double offset; double centre = mags[peak_bin]; if (centre == 0) { pitch_[i] = 0; } else { if (peak_bin == 0) { // parabolic interpolation of first 2 bin magnitudes double above = mags[peak_bin + 1]; offset = above / (2 * (2 * centre - above)); } else if (peak_bin == mags.size() - 1) { // parabolic interpolation of last 2 bin magnitudes double below = mags[peak_bin - 1]; offset = (-below) / (2 * (2 * centre - below)); } else { double below = mags[peak_bin - 1]; double above = mags[peak_bin + 1]; // gaussian interpolation offset = std::log(above / below) / (2 * std::log((centre * centre) / (above * below))); } Value frame_pitch = bin_size_ * (peak_bin + offset); if (recent_pitches.size() == moving_average_size) { recent_pitches.pop_front(); } recent_pitches.push_back(frame_pitch); pitch_[i] = stats::Mean(recent_pitches); } } } return pitch_; } const ValueList& FeatureExtractor::Energy() { if (energy_.empty()) { const int frame_size = window_size_; const int hop_size = hop_size_; energy_.resize(frames_); ValueList::const_iterator data_begin = file_data_.begin(); ValueList::const_iterator data_end = file_data_.end(); for (int i = 0; i < frames_; i++) { // find the frame bounds ValueList::const_iterator read_start = data_begin + (i * hop_size); ValueList::const_iterator read_end = read_start + frame_size; if (read_end > data_end) { read_end = data_end; } Value energy(0); std::ptrdiff_t samples = std::distance(read_start, read_end); while (read_start != read_end) { energy += std::pow(*read_start++, 2); } energy_[i] = energy / samples; } } return energy_; } const std::vector<ValueList>& FeatureExtractor::MFCCs() { if (mfccs_.empty()) { const std::vector<ValueList>& magnitude_spectrum = MagnitudeSpectrum(); mfcc_analyzer_.SetSampleRate(sample_rate_); mfccs_.resize(frames_); for (int i = 0; i < frames_; i++) { mfccs_[i] = mfcc_analyzer_.ExtractMFCCS(magnitude_spectrum[i]); } } return mfccs_; } const std::vector<ValueList>& FeatureExtractor::DeltaMFCCs() { if (delta_mfccs_.empty()) { const std::vector<ValueList>& mfccs = MFCCs(); // resize buffers const std::size_t number_of_mfccs = mfccs[0].size(); ValueList empty_values(number_of_mfccs, 0); delta_mfccs_.resize(mfccs.size(), empty_values); // calculate deltas based on 2 neighbouring frames in either direction for (int i = 2, last_frame = frames_ - 2; i < last_frame; i++) { const ValueList& before_2 = mfccs[i - 2]; const ValueList& before_1 = mfccs[i - 1]; const ValueList& after_1 = mfccs[i + 1]; const ValueList& after_2 = mfccs[i + 2]; ValueList& delta_frame = delta_mfccs_[i]; for (int m = 0; m < number_of_mfccs; m++) { Value delta = after_1[m] - before_1[m] + 2.0 * (after_2[m] - before_2[m]); delta_frame[m] = delta / 10.0; } } } return delta_mfccs_; } const std::vector<ValueList>& FeatureExtractor::DoubleDeltaMFCCs() { if (double_delta_mfccs_.empty()) { const std::vector<ValueList>& delta_mfccs = DeltaMFCCs(); // resize buffers const std::size_t number_of_mfccs = delta_mfccs[0].size(); ValueList empty_values(number_of_mfccs, 0); double_delta_mfccs_.resize(delta_mfccs.size(), empty_values); // calculate deltas based on 2 neighbouring frames in either direction for (int i = 2, last_frame = frames_ - 2; i < last_frame; i++) { const ValueList& before_2 = delta_mfccs[i - 2]; const ValueList& before_1 = delta_mfccs[i - 1]; const ValueList& after_1 = delta_mfccs[i + 1]; const ValueList& after_2 = delta_mfccs[i + 2]; ValueList& double_delta_frame = double_delta_mfccs_[i]; for (int m = 0; m < number_of_mfccs; m++) { Value delta = after_1[m] - before_1[m] + 2.0 * (after_2[m] - before_2[m]); double_delta_frame[m] = delta / 10.0; } } } return double_delta_mfccs_; } const std::vector<ValueList>& FeatureExtractor::SpectralFlux() { if (spectral_flux_.empty()) { // get magnitude spectrum const std::vector<ValueList>& spectrum = MagnitudeSpectrum(); // get magnitude spectrum for each frame std::size_t frames = spectrum.size(); std::size_t bins = spectrum[0].size(); spectral_flux_.resize(frames); // first frame always zeroed spectral_flux_[0].assign(bins, 0); for (int i = 1; i < frames; i++) { const ValueList& spectrum_frame = spectrum[i]; const ValueList& previous_spectrum_frame = spectrum[i - 1]; ValueList& flux_frame = spectral_flux_[i]; flux_frame.resize(bins); // get the difference in magnitudes for each bin for (int bin = 0; bin < bins; bin++) { flux_frame[bin] = spectrum_frame[bin] - previous_spectrum_frame[bin]; } //std::cout << i << ": " << stats::Mean(flux_frame) << '\n'; } } return spectral_flux_; } } // dsp namespace