Mercurial > hg > gpsynth
diff src/feature_extractor.cpp @ 0:add35537fdbb tip
Initial import
author | irh <ian.r.hobson@gmail.com> |
---|---|
date | Thu, 25 Aug 2011 11:05:55 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/feature_extractor.cpp Thu Aug 25 11:05:55 2011 +0100 @@ -0,0 +1,412 @@ +// Copyright 2011, Ian Hobson. +// +// This file is part of gpsynth. +// +// gpsynth is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// gpsynth is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with gpsynth in the file COPYING. +// If not, see http://www.gnu.org/licenses/. + +#include "feature_extractor.hpp" + +#include "statistics.hpp" +#include "std_ex.hpp" +#include "window_makers.hpp" + +#include "sndfile.hh" + +#include "boost/bind.hpp" +#include "boost/filesystem.hpp" + +#include <algorithm> +#include <cmath> +#include <iostream> +#include <stdexcept> +#include <sstream> + +namespace bf = boost::filesystem; + +namespace dsp { + +FeatureExtractor::FeatureExtractor(const std::string& file_path /* = "" */, + int window_size /* = 2048 */, + int hop_size /* = 512 */) +: window_size_(-1), // initialized by SetFrameSize + hop_size_(hop_size), + bin_frequencies_(window_size), + spectrum_analyzer_(window_size), + mfcc_analyzer_(window_size, 40, 13, 30, 10000) +{ + SetWindowSize(window_size); + if (!file_path.empty()) { + LoadFile(file_path); + } +} + +void FeatureExtractor::ClearCaches() { + spectrum_.clear(); + magnitude_spectrum_.clear(); + log_magnitude_spectrum_.clear(); + spectral_centroid_.clear(); + spectral_spread_.clear(); + spectral_flux_.clear(); + pitch_.clear(); + energy_.clear(); + mfccs_.clear(); + delta_mfccs_.clear(); + double_delta_mfccs_.clear(); +} + +void FeatureExtractor::SetWindowSize(int window_size) { + if (window_size_ != window_size) { + window_size_ = window_size; + frame_.resize(window_size); + // generate new window + window_.resize(window_size); + dsp::HammingWindow(window_size, window_.begin()); + ClearCaches(); + } +} + +void FeatureExtractor::SetHopSize(int hop_size) { + if (hop_size_ != hop_size) { + hop_size_ = hop_size; + ClearCaches(); + } +} + +void FeatureExtractor::LoadFile(const std::string& file_path) { + SndfileHandle file(file_path); + if (file.error()) { + std::stringstream message; + message << "FeatureExtractor::ExtractFeaturesFromFile: " + << "Error reading from file '" << file_path << "' " + << '(' << file.error() << ')'; + throw std::runtime_error(message.str()); + } + sf_count_t frames = file.frames(); + int channels = file.channels(); + file_data_.resize(frames * channels); + // read the audio data in to the buffer + file.readf(&file_data_[0], frames); + // only supporting mono at the moment, remove extra channels + if (channels > 1) { + for (int i = 0; i < frames; i++) { + file_data_[i] = file_data_[i * channels]; + } + file_data_.resize(frames); + } + // success, clear caches + ClearCaches(); + // update file info + file_path_ = file_path; + sample_rate_ = file.samplerate(); + frames_ = static_cast<int>(file_data_.size()) / hop_size_; + if (frames_ == 0) { + frames_++; + } + // bin info + bin_size_ = sample_rate_ / static_cast<Value>(window_size_); + std::generate(bin_frequencies_.begin(), bin_frequencies_.end(), + stdx::Stepper<Value>(bin_size_)); +} + +Value FeatureExtractor::Duration() const { + return file_data_.size() / static_cast<Value>(sample_rate_); +} + +const std::vector<ComplexValueList>& FeatureExtractor::Spectrum() { + if (spectrum_.empty()) { + const int frame_size = window_size_; + const int half_frame_size = frame_size / 2; + const int hop_size = hop_size_; + spectrum_.resize(frames_); + ValueList::const_iterator data_begin = file_data_.begin(); + ValueList::const_iterator data_end = file_data_.end(); + for (int i = 0; i < frames_; i++) { + //std::cout << "frame:" << i << '\n'; + // find the frame bounds + ValueList::const_iterator read_start = data_begin + (i * hop_size); + ValueList::const_iterator read_end = read_start + frame_size; + if (read_end > data_end) { + read_end = data_end; + } + // extract the frame and apply window + std::transform(read_start, read_end, window_.begin(), + frame_.begin(), std::multiplies<Value>()); + // on the last frame we'll need to fill the rest of the frame with zeros + std::size_t data_read = read_end - read_start; + if (data_read < frame_size) { + std::fill(frame_.begin() + data_read, frame_.end(), 0); + } + // get the spectrum of the current frame + std::copy(frame_.begin(), frame_.end(), spectrum_analyzer_.Input()); + spectrum_analyzer_.Execute(); + spectrum_[i].assign(spectrum_analyzer_.Output(), + spectrum_analyzer_.Output() + half_frame_size); + } + } + return spectrum_; +} + +const std::vector<ValueList>& FeatureExtractor::MagnitudeSpectrum() { + if (magnitude_spectrum_.empty()) { + // get complex spectrum + const std::vector<ComplexValueList>& spectrum = Spectrum(); + // get magnitude spectrum for each frame + std::size_t frames = spectrum.size(); + magnitude_spectrum_.resize(frames); + for (int i = 0; i < frames; i++) { + // take the absolute value of each complex bin + magnitude_spectrum_[i].resize(spectrum[i].size()); + std::transform(spectrum[i].begin(), spectrum[i].end(), + magnitude_spectrum_[i].begin(), std::abs<Value>); + } + } + return magnitude_spectrum_; +} + +const std::vector<ValueList>& FeatureExtractor::LogMagnitudeSpectrum() { + if (log_magnitude_spectrum_.empty()) { + // get magnitude spectrum + const std::vector<ValueList>& magnitude_spectrum = MagnitudeSpectrum(); + // get magnitude spectrum for each frame + std::size_t frames = magnitude_spectrum.size(); + log_magnitude_spectrum_.resize(frames); + for (int i = 0; i < frames; i++) { + log_magnitude_spectrum_[i].resize(magnitude_spectrum[i].size()); + // copy magnitudes to log magnitude buffer, avoiding zeros + std::replace_copy(magnitude_spectrum[i].begin(), + magnitude_spectrum[i].end(), + log_magnitude_spectrum_[i].begin(), + 0.0, + std::numeric_limits<Value>::min()); + // take the log of each magnitude (ugly casting for cmath function...) + std::transform(log_magnitude_spectrum_[i].begin(), + log_magnitude_spectrum_[i].end(), + log_magnitude_spectrum_[i].begin(), + static_cast<double(*)(double)>(std::log10)); + } + } + return log_magnitude_spectrum_; +} + +const ValueList& FeatureExtractor::SpectralCentroid() { + if (spectral_centroid_.empty()) { + MagnitudeSpectrum(); + std::size_t frames = magnitude_spectrum_.size(); + spectral_centroid_.resize(frames); + std::size_t bins = magnitude_spectrum_[0].size(); + ValueList weighted_frequencies(bins); + for (int i = 0; i < frames; i++) { + const ValueList& magnitudes = magnitude_spectrum_[i]; + // get bin frequencies weighted by bin magnitudes + std::transform(magnitudes.begin(), + magnitudes.end(), + bin_frequencies_.begin(), + weighted_frequencies.begin(), + std::multiplies<Value>()); + // divide mean of weighted frequencies by mean of bin magnitudes + Value magnitude = stats::Sum(magnitudes); + if (magnitude > 0) { + spectral_centroid_[i] = stats::Sum(weighted_frequencies) / magnitude; + } else { + spectral_centroid_[i] = 0; + } + } + } + return spectral_centroid_; +} + + +const ValueList& FeatureExtractor::SpectralSpread() { + if (spectral_spread_.empty()) { + SpectralCentroid(); + std::size_t frames = magnitude_spectrum_.size(); + spectral_spread_.resize(frames); + std::size_t bins = magnitude_spectrum_[0].size(); + ValueList weighted_difference(bins); + for (int i = 0; i < frames; i++) { + Value centroid = spectral_centroid_[i]; + const ValueList& magnitudes = magnitude_spectrum_[i]; + for (int bin = 0; bin < bins; bin++) { + Value difference = bin_frequencies_[bin] - centroid; + weighted_difference[bin] = difference * difference * magnitudes[bin]; + } + Value magnitude = stats::Sum(magnitudes); + if (magnitude > 0) { + Value variance = stats::Sum(weighted_difference) / magnitude; + spectral_spread_[i] = std::sqrt(variance); + } else { + spectral_spread_[i] = 0; + } + } + } + return spectral_spread_; +} + +const ValueList& FeatureExtractor::Pitch() { + if (pitch_.empty()) { + MagnitudeSpectrum(); + pitch_.resize(frames_); + std::deque<Value> recent_pitches; + const int moving_average_size = 8; + for (int i = 0; i < frames_; i++) { + const ValueList& mags = magnitude_spectrum_[i]; + // find the peak bin number + std::ptrdiff_t peak_bin = std::distance(mags.begin(), + std::max_element(mags.begin(), + mags.end())); + // interpolate peak frequency + double offset; + double centre = mags[peak_bin]; + if (centre == 0) { + pitch_[i] = 0; + } else { + if (peak_bin == 0) { + // parabolic interpolation of first 2 bin magnitudes + double above = mags[peak_bin + 1]; + offset = above / (2 * (2 * centre - above)); + } else if (peak_bin == mags.size() - 1) { + // parabolic interpolation of last 2 bin magnitudes + double below = mags[peak_bin - 1]; + offset = (-below) / (2 * (2 * centre - below)); + } else { + double below = mags[peak_bin - 1]; + double above = mags[peak_bin + 1]; + // gaussian interpolation + offset = std::log(above / below) + / (2 * std::log((centre * centre) / (above * below))); + } + Value frame_pitch = bin_size_ * (peak_bin + offset); + if (recent_pitches.size() == moving_average_size) { + recent_pitches.pop_front(); + } + recent_pitches.push_back(frame_pitch); + pitch_[i] = stats::Mean(recent_pitches); + } + } + } + return pitch_; +} + +const ValueList& FeatureExtractor::Energy() { + if (energy_.empty()) { + const int frame_size = window_size_; + const int hop_size = hop_size_; + energy_.resize(frames_); + ValueList::const_iterator data_begin = file_data_.begin(); + ValueList::const_iterator data_end = file_data_.end(); + for (int i = 0; i < frames_; i++) { + // find the frame bounds + ValueList::const_iterator read_start = data_begin + (i * hop_size); + ValueList::const_iterator read_end = read_start + frame_size; + if (read_end > data_end) { + read_end = data_end; + } + Value energy(0); + std::ptrdiff_t samples = std::distance(read_start, read_end); + while (read_start != read_end) { + energy += std::pow(*read_start++, 2); + } + energy_[i] = energy / samples; + } + } + return energy_; +} + +const std::vector<ValueList>& FeatureExtractor::MFCCs() { + if (mfccs_.empty()) { + const std::vector<ValueList>& magnitude_spectrum = MagnitudeSpectrum(); + mfcc_analyzer_.SetSampleRate(sample_rate_); + mfccs_.resize(frames_); + for (int i = 0; i < frames_; i++) { + mfccs_[i] = mfcc_analyzer_.ExtractMFCCS(magnitude_spectrum[i]); + } + } + return mfccs_; +} + +const std::vector<ValueList>& FeatureExtractor::DeltaMFCCs() { + if (delta_mfccs_.empty()) { + const std::vector<ValueList>& mfccs = MFCCs(); + // resize buffers + const std::size_t number_of_mfccs = mfccs[0].size(); + ValueList empty_values(number_of_mfccs, 0); + delta_mfccs_.resize(mfccs.size(), empty_values); + // calculate deltas based on 2 neighbouring frames in either direction + for (int i = 2, last_frame = frames_ - 2; i < last_frame; i++) { + const ValueList& before_2 = mfccs[i - 2]; + const ValueList& before_1 = mfccs[i - 1]; + const ValueList& after_1 = mfccs[i + 1]; + const ValueList& after_2 = mfccs[i + 2]; + ValueList& delta_frame = delta_mfccs_[i]; + for (int m = 0; m < number_of_mfccs; m++) { + Value delta = after_1[m] - before_1[m] + + 2.0 * (after_2[m] - before_2[m]); + delta_frame[m] = delta / 10.0; + } + } + } + return delta_mfccs_; +} + +const std::vector<ValueList>& FeatureExtractor::DoubleDeltaMFCCs() { + if (double_delta_mfccs_.empty()) { + const std::vector<ValueList>& delta_mfccs = DeltaMFCCs(); + // resize buffers + const std::size_t number_of_mfccs = delta_mfccs[0].size(); + ValueList empty_values(number_of_mfccs, 0); + double_delta_mfccs_.resize(delta_mfccs.size(), empty_values); + // calculate deltas based on 2 neighbouring frames in either direction + for (int i = 2, last_frame = frames_ - 2; i < last_frame; i++) { + const ValueList& before_2 = delta_mfccs[i - 2]; + const ValueList& before_1 = delta_mfccs[i - 1]; + const ValueList& after_1 = delta_mfccs[i + 1]; + const ValueList& after_2 = delta_mfccs[i + 2]; + ValueList& double_delta_frame = double_delta_mfccs_[i]; + for (int m = 0; m < number_of_mfccs; m++) { + Value delta = after_1[m] - before_1[m] + + 2.0 * (after_2[m] - before_2[m]); + double_delta_frame[m] = delta / 10.0; + } + } + } + return double_delta_mfccs_; +} + +const std::vector<ValueList>& FeatureExtractor::SpectralFlux() { + if (spectral_flux_.empty()) { + // get magnitude spectrum + const std::vector<ValueList>& spectrum = MagnitudeSpectrum(); + // get magnitude spectrum for each frame + std::size_t frames = spectrum.size(); + std::size_t bins = spectrum[0].size(); + spectral_flux_.resize(frames); + // first frame always zeroed + spectral_flux_[0].assign(bins, 0); + for (int i = 1; i < frames; i++) { + const ValueList& spectrum_frame = spectrum[i]; + const ValueList& previous_spectrum_frame = spectrum[i - 1]; + ValueList& flux_frame = spectral_flux_[i]; + flux_frame.resize(bins); + // get the difference in magnitudes for each bin + for (int bin = 0; bin < bins; bin++) { + flux_frame[bin] = spectrum_frame[bin] - previous_spectrum_frame[bin]; + } + //std::cout << i << ": " << stats::Mean(flux_frame) << '\n'; + } + } + return spectral_flux_; +} + +} // dsp namespace