Mercurial > hg > vamp-libxtract-plugins
view plugins/XTractPlugin.cpp @ 40:08d9660e57e8 tip
Or better, this
author | Chris Cannam |
---|---|
date | Thu, 16 May 2024 10:17:33 +0100 |
parents | 11b10bf3147a |
children |
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

/*
    Vamp feature extraction plugins using Jamie Bullock's
    libxtract audio feature extraction library.

    Centre for Digital Music, Queen Mary, University of London.
    This file copyright 2006-2012 Queen Mary, University of London.

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
    published by the Free Software Foundation; either version 2 of the
    License, or (at your option) any later version.  See the file
    COPYING included with this distribution for more information.
*/

#include "XTractPlugin.h"

#include <cassert>
#include <cstdio>
#include <math.h>
#include <stdio.h>

using std::cerr;
using std::endl;
using std::string;

// Descriptor table shared by all plugin instances; created by the first
// instance constructed and released by the last one destroyed.
xtract_function_descriptor_t *
XTractPlugin::m_xtDescriptors = 0;

// Number of live XTractPlugin instances referring to m_xtDescriptors.
int
XTractPlugin::m_xtDescRefCount = 0;

// Construct a plugin computing the libxtract feature with index xtFeature.
// All tunable parameters start at the same defaults that
// getParameterDescriptors() advertises.
XTractPlugin::XTractPlugin(unsigned int xtFeature, float inputSampleRate) :
    Plugin(inputSampleRate),
    m_xtFeature(xtFeature),
    m_channels(0),
    m_stepSize(0),
    m_blockSize(0),
    m_resultBuffer(0),
    m_peakThreshold(10),
    m_rolloffThreshold(90),
    m_harmonicThreshold(.1),
    m_minFreq(80),
    m_maxFreq(18000),
    m_coeffCount(40),
    m_highestCoef(20),
    m_lowestCoef(0),
    m_mfccFilters(0),
    m_mfccStyle((int)XTRACT_EQUAL_GAIN),
    m_spectrumType((int)XTRACT_MAGNITUDE_SPECTRUM),
    m_dc(0),
    m_normalise(0),
    m_barkBandLimits(0),
    m_outputBinCount(0),
    m_initialised(false)
{
    if (m_xtDescRefCount++ == 0) {
        m_xtDescriptors =
            (xtract_function_descriptor_t *)xtract_make_descriptors();
    }
}

// Release per-instance buffers and, when this is the last live instance,
// the shared descriptor table.
//
// NOTE(review): the filter bank is freed using the current m_coeffCount,
// which is only correct if the "bands" parameter has not been changed
// since initialise() allocated it -- confirm hosts honour that.
XTractPlugin::~XTractPlugin()
{
    if (m_mfccFilters) {
        for (size_t i = 0; i < m_coeffCount; ++i) {
            delete[] m_mfccFilters[i];
        }
        delete[] m_mfccFilters;
    }
    if (m_barkBandLimits) {
        delete[] m_barkBandLimits;
    }
    if (m_resultBuffer) {
        delete[] m_resultBuffer;
    }

    if (--m_xtDescRefCount == 0) {
        xtract_free_descriptors(m_xtDescriptors);
        // Do not leave a dangling pointer behind; the next instance to be
        // constructed rebuilds the table.
        m_xtDescriptors = 0;
    }
}

// Plugin identifier: the libxtract algorithm name.
string
XTractPlugin::getIdentifier() const
{
    return xtDescriptor()->algo.name;
}

// Plugin display name, taken from the libxtract descriptor.
string
XTractPlugin::getName() const
{
    return xtDescriptor()->algo.p_name;
}

// Plugin description, taken from the libxtract descriptor.
string
XTractPlugin::getDescription() const
{
    return xtDescriptor()->algo.p_desc;
}

string
XTractPlugin::getMaker() const
{
    return "libxtract by Jamie Bullock (plugin by Chris Cannam)";
}

int
XTractPlugin::getPluginVersion() const
{
    return 4;
}

// Build the copyright string, appending the method's author (and year,
// when non-zero) from the libxtract descriptor if an author is given.
string
XTractPlugin::getCopyright() const
{
    string text = "Copyright 2006-2012 Jamie Bullock, plugin Copyright 2006-2012 Queen Mary, University of London. ";

    string method = "";

    method += xtDescriptor()->algo.author;

    if (method != "") {

        int year = xtDescriptor()->algo.year;
        if (year != 0) {
            // " (" + up to 11 characters of int + ")" + NUL needs 15 bytes;
            // use a bounded write so an unexpected year cannot overflow.
            char yearstr[16];
            snprintf(yearstr, sizeof(yearstr), " (%d)", year);
            method += yearstr;
        }

        text += "Method from " + method + ". ";
    }

    text += "Distributed under the GNU General Public License";
    return text;
}

// Features whose libxtract input format is raw audio samples take
// time-domain input; every other feature takes frequency-domain input.
XTractPlugin::InputDomain
XTractPlugin::getInputDomain() const
{
    if (xtDescriptor()->data.format == XTRACT_AUDIO_SAMPLES) {
        return TimeDomain;
    }
    return FrequencyDomain;
}

// Set once the libxtract FFT plans have been initialised by any instance.
bool XTractPlugin::m_anyInitialised = false;

// Prepare the plugin for processing.
//
// Returns false if the channel count is outside
// [getMinChannelCount(), getMaxChannelCount()], if blockSize differs from
// getPreferredBlockSize(), or if xtract_init_mfcc reports an error.
// On success the output bin count and output descriptors are set up.
bool
XTractPlugin::initialise(size_t channels, size_t stepSize, size_t blockSize)
{
    int donor = *(xtDescriptor()->argv.donor),
        data_format = xtDescriptor()->data.format;

    if (channels < getMinChannelCount() ||
        channels > getMaxChannelCount()) return false;

    if (blockSize != getPreferredBlockSize()) {
        cerr << "XTractPlugin::initialise: ERROR: "
             << "Only the standard block size of " << getPreferredBlockSize()
             << " is supported (owing to global FFT initialisation requirements)" << endl;
        return false;
    }

    m_channels = channels;
    m_stepSize = stepSize;
    m_blockSize = blockSize;

    if (!m_anyInitialised) {
        m_anyInitialised = true;
        // initialise libxtract
        xtract_init_fft(m_blockSize, XTRACT_SPECTRUM);
        xtract_init_fft(m_blockSize, XTRACT_AUTOCORRELATION_FFT);
        xtract_init_fft(m_blockSize, XTRACT_DCT);
        xtract_init_fft(m_blockSize, XTRACT_MFCC);
    }

    if (donor == XTRACT_INIT_MFCC) {

        // One filter of m_blockSize coefficients per Mel band.
        m_mfccFilters = new double *[m_coeffCount];
        for (size_t i = 0; i < m_coeffCount; ++i) {
            m_mfccFilters[i] = new double[m_blockSize];
        }

        int error = (int)xtract_init_mfcc(m_blockSize, m_inputSampleRate/2,
                                          m_mfccStyle, m_minFreq, m_maxFreq,
                                          m_coeffCount, m_mfccFilters);
        if (error != XTRACT_SUCCESS) {
            cerr << "XTractPlugin::initialise: ERROR: "
                 << "xtract_init_mfcc returned error code " << error << endl;
            return false;
        }

    } else if (donor == XTRACT_BARK_COEFFICIENTS ||
               donor == XTRACT_INIT_BARK ||
               data_format == XTRACT_BARK_COEFFS) {

        m_barkBandLimits = new int[XTRACT_BARK_BANDS];

        /*int error = *(int)*/xtract_init_bark(m_blockSize, m_inputSampleRate,
                                              m_barkBandLimits);
//        if (error != SUCCESS) {
//            cerr << "XTractPlugin::initialise: ERROR: "
//                 << "xtract_init_bark returned error code " << error << endl;
//            return false;
//        }
    }

    // Number of values each processed block yields for this feature.
    switch (m_xtFeature) {
    case XTRACT_SPECTRUM:
        m_outputBinCount = m_blockSize / 2 + (m_dc ? 1 : 0);
        break;
    case XTRACT_HARMONIC_SPECTRUM:
    case XTRACT_PEAK_SPECTRUM:
        m_outputBinCount = m_blockSize / 2;
        break;
    case XTRACT_DCT:
    case XTRACT_AUTOCORRELATION_FFT:
    case XTRACT_AUTOCORRELATION:
    case XTRACT_AMDF:
    case XTRACT_ASDF:
        m_outputBinCount = m_blockSize;
        break;
    case XTRACT_MFCC:
        m_outputBinCount = (m_highestCoef - m_lowestCoef)+1;
        break;
    case XTRACT_BARK_COEFFICIENTS:
        m_outputBinCount = XTRACT_BARK_BANDS;
        break;
    default:
        m_outputBinCount = 1;
        break;
    }

    // Rebuild the descriptors now that the bin count is known.
    m_outputDescriptors.clear();
    setupOutputDescriptors();

    m_initialised = true;

    return true;
}

// Nothing to reset: no state is carried between process() calls.
void
XTractPlugin::reset()
{
}

size_t
XTractPlugin::getMinChannelCount() const
{
    return 1;
}

size_t
XTractPlugin::getMaxChannelCount() const
{
    return 1;
}

// Frequency-domain features step by a whole block; time-domain features
// step by half a block.
size_t
XTractPlugin::getPreferredStepSize() const
{
    if (getInputDomain() == FrequencyDomain) {
        return getPreferredBlockSize();
    } else {
        return getPreferredBlockSize() / 2;
    }
}

// The only block size initialise() accepts.
size_t
XTractPlugin::getPreferredBlockSize() const
{
    return 1024;
}

// Describe the parameters relevant to the selected feature: MFCC and
// spectrum settings for those features, plus the peak / rolloff /
// harmonic thresholds for the features that need them.
XTractPlugin::ParameterList
XTractPlugin::getParameterDescriptors() const
{
    ParameterList list;
    ParameterDescriptor desc;

    if (m_xtFeature == XTRACT_MFCC) {

        desc.identifier = "minfreq";
        desc.name = "Minimum Frequency";
        desc.minValue = 0;
        desc.maxValue = m_inputSampleRate / 2;
        desc.defaultValue = 80;
        desc.isQuantized = false;
        desc.unit = "Hz";
        list.push_back(desc);

        desc.identifier = "maxfreq";
        desc.name = "Maximum Frequency";
        desc.defaultValue = 18000;
        // NOTE(review): this clamp is against 0.875 * sample rate while
        // maxValue is sample rate / 2, so the default can still exceed
        // maxValue at low sample rates -- confirm the intended factor.
        if (desc.defaultValue > m_inputSampleRate * 0.875) {
            desc.defaultValue = m_inputSampleRate * 0.875;
        }
        list.push_back(desc);

        desc.identifier = "bands";
        desc.name = "# Mel Frequency Bands";
        desc.minValue = 10;
        desc.maxValue = 80;
        desc.defaultValue = 40;
        desc.unit = "";
        desc.isQuantized = true;
        desc.quantizeStep = 1;
        list.push_back(desc);

        desc.identifier = "lowestcoef";
        desc.name = "Lowest Coefficient Returned";
        desc.minValue = 0;
        desc.maxValue = 80;
        desc.defaultValue = 0;
        desc.unit = "";
        desc.isQuantized = true;
        desc.quantizeStep = 1;
        list.push_back(desc);

        desc.identifier = "highestcoef";
        desc.name = "Highest Coefficient Returned";
        desc.minValue = 0;
        desc.maxValue = 80;
        desc.defaultValue = 20;
        desc.unit = "";
        desc.isQuantized = true;
        desc.quantizeStep = 1;
        list.push_back(desc);

        desc.identifier = "style";
        desc.name = "MFCC Type";
        desc.minValue = 0;
        desc.maxValue = 1;
        desc.defaultValue = 0;
        desc.valueNames.push_back("Equal Gain");
        desc.valueNames.push_back("Equal Area");
        list.push_back(desc);
    }

    if (m_xtFeature == XTRACT_SPECTRUM) {

        desc.identifier = "spectrumtype";
        desc.name = "Type";
        desc.minValue = 0;
        desc.maxValue = 3;
        desc.defaultValue = int(XTRACT_MAGNITUDE_SPECTRUM);
        desc.isQuantized = true;
        desc.quantizeStep = 1;
        desc.valueNames.push_back("Magnitude Spectrum");
        desc.valueNames.push_back("Log Magnitude Spectrum");
        desc.valueNames.push_back("Power Spectrum");
        desc.valueNames.push_back("Log Power Spectrum");
        list.push_back(desc);

        desc.identifier = "dc";
        desc.name = "Include DC";
        desc.maxValue = 1;
        desc.defaultValue = 0;
        desc.valueNames.clear();
        list.push_back(desc);

        // Same range and default as "dc"; only the identifier and name
        // differ.
        desc.identifier = "normalise";
        desc.name = "Normalise";
        list.push_back(desc);
    }

    if (needPeakThreshold()) {

        desc.identifier = "peak-threshold";
        desc.name = "Peak Threshold";
        desc.minValue = 0;
        desc.maxValue = 100;
        desc.defaultValue = 10; /* Threshold as % of maximum peak found */
        desc.isQuantized = false;
        desc.valueNames.clear();
        desc.unit = "%";
        list.push_back(desc);
    }

    if (needRolloffThreshold()) {

        desc.identifier = "rolloff-threshold";
        desc.name = "Rolloff Threshold";
        desc.minValue = 0;
        desc.maxValue = 100;
        desc.defaultValue = 90; /* Freq below which 90% of energy is */
        desc.isQuantized = false;
        desc.valueNames.clear();
        desc.unit = "%";
        list.push_back(desc);
    }

    if (needHarmonicThreshold()) {

        desc.identifier = "harmonic-threshold";
        desc.name = "Harmonic Threshold";
        desc.minValue = 0;
        desc.maxValue = 1.0;
        desc.defaultValue = .1; /* Distance from nearest harmonic number */
        desc.isQuantized = false;
        desc.valueNames.clear();
        desc.unit = "";
        list.push_back(desc);
    }

    return list;
}

// Return the current value of the named parameter, or 0 when the name is
// not recognised for the selected feature.
float
XTractPlugin::getParameter(string param) const
{
    if (m_xtFeature == XTRACT_MFCC) {
        if (param == "minfreq") return m_minFreq;
        if (param == "maxfreq") return m_maxFreq;
        if (param == "bands") return m_coeffCount;
        if (param == "lowestcoef") return m_lowestCoef;
        if (param == "highestcoef") return m_highestCoef;
        if (param == "style") return m_mfccStyle;
    }

    if (m_xtFeature == XTRACT_SPECTRUM) {
        if (param == "spectrumtype") return m_spectrumType;
        if (param == "dc") return m_dc;
        if (param == "normalise") return m_normalise;
    }

    if (param == "peak-threshold") return m_peakThreshold;
    if (param == "rolloff-threshold") return m_rolloffThreshold;
    if (param == "harmonic-threshold") return m_harmonicThreshold;

    return 0.f;
}

// Set the named parameter. Integer-valued parameters are obtained as
// int(value + .1). The coefficient range is kept inside the band count
// and kept ordered (lowest <= highest). Unrecognised names are ignored.
void
XTractPlugin::setParameter(string param, float value)
{
    if (m_xtFeature == XTRACT_MFCC) {
        if (param == "minfreq") m_minFreq = value;
        else if (param == "maxfreq") m_maxFreq = value;
        else if (param == "bands") m_coeffCount = int(value + .1);
        else if (param == "lowestcoef"){
            m_lowestCoef = int(value + .1);
            if(m_lowestCoef >= m_coeffCount) m_lowestCoef = m_coeffCount - 1;
            if(m_lowestCoef > m_highestCoef) m_lowestCoef = m_highestCoef;
        }
        else if (param == "highestcoef"){
            m_highestCoef = int(value + .1);
            if(m_highestCoef >= m_coeffCount) m_highestCoef = m_coeffCount - 1;
            if(m_highestCoef < m_lowestCoef) m_highestCoef = m_lowestCoef;
        }
        else if (param == "style") m_mfccStyle = int(value + .1);
    }

    if (m_xtFeature == XTRACT_SPECTRUM) {
        if (param == "spectrumtype") m_spectrumType = int(value + .1);
        if (param == "dc") m_dc = int(value + .1);
        if (param == "normalise") m_normalise = int(value + .1);
    }

    if (param == "peak-threshold") m_peakThreshold = value;
    if (param == "rolloff-threshold") m_rolloffThreshold = value;
    if (param == "harmonic-threshold") m_harmonicThreshold = value;
}

// Return the output descriptors, building them first if none exist yet.
XTractPlugin::OutputList
XTractPlugin::getOutputDescriptors() const
{
    if (m_outputDescriptors.empty()) {
        setupOutputDescriptors();
    }
    return m_outputDescriptors;
}

// Append the single output descriptor for the selected feature to
// m_outputDescriptors. Scalar features get a unit from the libxtract
// descriptor; spectral vector features are relabelled as peak amplitudes
// with half the bin count.
void
XTractPlugin::setupOutputDescriptors() const
{
    OutputDescriptor d;
    const xtract_function_descriptor_t *xtFd = xtDescriptor();
    d.identifier = getIdentifier();
    d.name = getName();
    d.description = getDescription();
    d.unit = "";
    d.hasFixedBinCount = true;
    d.binCount = m_outputBinCount;
    d.hasKnownExtents = false;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::OneSamplePerStep;

    if (xtFd->is_scalar){
        switch(xtFd->result.scalar.unit){
        case XTRACT_HERTZ:    d.unit = "Hz"; break;
        case XTRACT_DBFS:     d.unit = "dB"; break;
        default:              d.unit = ""; break;
        }
    }
    else {
        if (xtFd->result.vector.format == XTRACT_SPECTRAL){

            d.binCount /= 2;
            d.identifier = "amplitudes";
            d.name = "Peak Amplitudes";
            d.description = "";
        }
    }

    m_outputDescriptors.push_back(d);
}

// True when the feature is the peak spectrum itself, consumes spectral
// peaks, or needs the harmonic threshold.
bool
XTractPlugin::needPeakThreshold() const
{
    const xtract_function_descriptor_t *xtFd = xtDescriptor();

    return (m_xtFeature == XTRACT_PEAK_SPECTRUM ||
            xtFd->data.format == XTRACT_SPECTRAL_PEAKS ||
            xtFd->data.format == XTRACT_SPECTRAL_PEAKS_MAGNITUDES ||
            needHarmonicThreshold());
}

// True when the feature is the harmonic spectrum itself, is noisiness,
// or consumes harmonic frequencies or magnitudes.
bool
XTractPlugin::needHarmonicThreshold() const
{
    const xtract_function_descriptor_t *xtFd = xtDescriptor();

    return (m_xtFeature == XTRACT_HARMONIC_SPECTRUM ||
            xtFd->data.format == XTRACT_SPECTRAL_HARMONICS_FREQUENCIES ||
            m_xtFeature == XTRACT_NOISINESS ||
            xtFd->data.format == XTRACT_SPECTRAL_HARMONICS_MAGNITUDES);
}

// True only for the rolloff feature.
bool
XTractPlugin::needRolloffThreshold() const
{
    return (m_xtFeature == XTRACT_ROLLOFF);
}

// Process one block from the first input channel and return the feature
// values for it.
//
// The input is converted to doubles; for non-audio input formats it is
// rearranged into magnitudes followed by bin frequencies. Prerequisite
// quantities requested by the feature's descriptor (mean, variance,
// peaks, f0, ...) are computed first and handed to the main libxtract
// call through argv. A feature containing any NaN or infinite value is
// dropped. The timestamp argument is not used.
XTractPlugin::FeatureSet
XTractPlugin::process(const float *const *inputBuffers,
                      Vamp::RealTime timestamp)
{
    if (m_outputDescriptors.empty()) {
        setupOutputDescriptors();
    }

    int rbs =
        // Add 2 here to accommodate extra data for spectrum with DC
        2 + (m_outputBinCount > m_blockSize ? m_outputBinCount : m_blockSize);
    if (!m_resultBuffer) {
        m_resultBuffer = new double[rbs];
    }

    int i;

    for (i = 0; i < rbs; ++i) m_resultBuffer[i] = 0.f;

    int N = m_blockSize, M = N >> 1;

    const double *data = 0;

    xtract_function_descriptor_t *xtFd = xtDescriptor();

    // A Vamp host supplies blockSize + 2 floats (blockSize/2 + 1 complex
    // bins) for frequency-domain input and blockSize floats for
    // time-domain input. The spectral branch below reads the Nyquist bin
    // at index N, so the working copy must include those two extra values.
    const int inputCount =
        (xtFd->data.format == XTRACT_AUDIO_SAMPLES) ? N : N + 2;

    double *input_d = new double[inputCount];

    for (int j = 0; j < inputCount; ++j) {
        input_d[j] = inputBuffers[0][j];
    }

    double *fft_temp = 0, *data_temp = 0;
    void *argv = 0;
    bool isSpectral = false;

    FeatureSet fs;

    switch (xtFd->data.format) {
    case XTRACT_AUDIO_SAMPLES:
        data = input_d;
        break;
    case XTRACT_SPECTRAL:
    default: {
        // All the rest are derived from the spectrum
        // Need same format as would be output by xtract_spectrum
        double q = m_inputSampleRate / N;
        fft_temp = new double[N];
        for (int n = 1; n < N/2; ++n) {
            fft_temp[n] = sqrt(input_d[n*2] * input_d[n*2] +
                               input_d[n*2+1] * input_d[n*2+1]) / N;
            fft_temp[N-n] = (N/2 - n) * q;
        }
        fft_temp[0] = fabs(input_d[0]) / N;
        fft_temp[N/2] = fabs(input_d[N]) / N;
        data = &fft_temp[0];
        isSpectral = true;
        break;
    }
    }

    assert(m_outputBinCount > 0);

    double *result = m_resultBuffer;

    // Zero-initialised so no libxtract call ever sees indeterminate values.
    double argf[XTRACT_MAXARGS] = { 0 };
    argv = &argf[0];
    argf[0] = 0.f; // handy for some, e.g. lowest_value which has a threshold

    double mean = 0.f, variance = 0.f, sd = 0.f,
        npartials = 0.f, nharmonics = 0.f;

    // NOTE(review): needNumPartials and needNumHarmonics are never set in
    // this function, so npartials and nharmonics stay 0 -- confirm.
    bool needSD = false, needVariance = false, needMean = false,
        needPeaks = false, needBarkCoefficients = false,
        needHarmonics = false, needF0 = false, needSFM = false,
        needMax = false, needNumPartials = false, needNumHarmonics = false;

    int donor;

    // Work out which prerequisite values the feature's arguments need.
    i = xtFd->argc;

    while(i--){
        if (m_xtFeature == XTRACT_BARK_COEFFICIENTS) {
            /* "BARK_COEFFICIENTS is special because argc = BARK_BANDS" */
            break;
        }
        donor = xtFd->argv.donor[i];
        switch(donor){
        case XTRACT_STANDARD_DEVIATION:
        case XTRACT_SPECTRAL_STANDARD_DEVIATION:
            needSD = 1;
            break;
        case XTRACT_VARIANCE:
        case XTRACT_SPECTRAL_VARIANCE:
            needVariance = 1;
            break;
        case XTRACT_MEAN:
        case XTRACT_SPECTRAL_MEAN:
            needMean = 1;
            break;
        case XTRACT_F0:
        case XTRACT_FAILSAFE_F0:
            needF0 = 1;
            break;
        case XTRACT_FLATNESS:
            needSFM = 1;
            // fall through -- behaviour kept as found.
            // NOTE(review): confirm the missing break is intentional.
        case XTRACT_HIGHEST_VALUE:
            needMax = 1;
            break;
        }
    }

    if(needHarmonicThreshold() && m_xtFeature != XTRACT_HARMONIC_SPECTRUM)
        needHarmonics = needF0 = 1;

    if(needPeakThreshold() && m_xtFeature != XTRACT_PEAK_SPECTRUM)
        needPeaks = 1;

    if(xtFd->data.format == XTRACT_BARK_COEFFS &&
       m_xtFeature != XTRACT_BARK_COEFFICIENTS){
        needBarkCoefficients = 1;
    }

    if (needMean) {
        if(isSpectral)
            xtract_spectral_mean(data, N, 0, result);
        else
            xtract_mean(data, M, 0, result);
        mean = *result;
        *result = 0.f;
    }

    if (needVariance || needSD) {
        argf[0] = mean;
        if(isSpectral)
            xtract_spectral_variance(data, N, argv, result);
        else
            xtract_variance(data, M, argv, result);
        variance = *result;
        *result = 0.f;
    }

    if (needSD) {
        argf[0] = variance;
        if(isSpectral)
            xtract_spectral_standard_deviation(data, N, argv, result);
        else
            xtract_standard_deviation(data, M, argv, result);
        sd = *result;
        *result = 0.f;
    }

    if (needMax) {
        xtract_highest_value(data, M, argv, result);
        argf[1] = *result;
        *result = 0.f;
    }

    if (needSD) {
        argf[0] = mean;
        argf[1] = sd;
    } else if (needVariance) {
        argf[0] = variance;
    } else if (needMean) {
        argf[0] = mean;
    }

    // data should be now correct for all except:
    //
    // XTRACT_SPECTRAL_CENTROID -- N/2 magnitude peaks and N/2 frequencies
    // TONALITY -- SFM
    // TRISTIMULUS_1/2/3 -- harmonic spectrum
    // ODD_EVEN_RATIO -- harmonic spectrum
    // LOUDNESS -- Bark coefficients
    // XTRACT_HARMONIC_SPECTRUM -- peak spectrum

    // argv should be now correct for all except:
    //
    // XTRACT_ROLLOFF -- (sr/N), threshold (%)
    // XTRACT_PEAK_SPECTRUM -- (sr / N), peak threshold (%)
    // XTRACT_HARMONIC_SPECTRUM -- f0, harmonic threshold
    // XTRACT_F0 -- samplerate
    // XTRACT_MFCC -- Mel filter coefficients
    // XTRACT_BARK_COEFFICIENTS -- Bark band limits
    // XTRACT_NOISINESS -- npartials, nharmonics.
    // XTRACT_SPECTRUM -- q, spectrum type, dc, normalise

    data_temp = new double[N];

    if (m_xtFeature == XTRACT_ROLLOFF ||
        m_xtFeature == XTRACT_PEAK_SPECTRUM || needPeaks) {
        argf[0] = m_inputSampleRate / N;
        if(m_xtFeature == XTRACT_ROLLOFF)
            argf[1] = m_rolloffThreshold;
        else
            argf[1] = m_peakThreshold;
        argv = &argf[0];
    }

    if (m_xtFeature == XTRACT_SPECTRUM) {
        argf[0] = 0; // xtract_spectrum will calculate this for us
        argf[1] = m_spectrumType;
        argf[2] = m_dc;
        argf[3] = m_normalise;
        argv = &argf[0];
    }

    if (needPeaks) {
        //We only read in the magnitudes (M)
        /*int rv = */ xtract_peak_spectrum(data, M, argv, result);
        for (int n = 0; n < N; ++n) {
            data_temp[n] = result[n];
            result[n] = 0.f;
        }
        // rv not trustworthy
//        if (rv != SUCCESS) {
//            cerr << "ERROR: XTractPlugin::process: xtract_peaks failed (error code = " << rv << ")" << endl;
//            goto done;
//        }
    }

    if (needNumPartials) {
        xtract_nonzero_count(data_temp, M, NULL, &npartials);
    }

    if (needF0 || m_xtFeature == XTRACT_FAILSAFE_F0 ||
        m_xtFeature == XTRACT_F0) {
        argf[0] = m_inputSampleRate;
        argv = &argf[0];
    }

    if (needF0) {
        // Pass the sample rate as a genuine double, like every other
        // libxtract argument in this function. The constructor receives
        // the rate as a float, so the address of m_inputSampleRate is not
        // assumed to point at a double.
        double sampleRate = m_inputSampleRate;
        xtract_failsafe_f0(&input_d[0], N, &sampleRate, result);
        argf[0] = *result;
        argv = &argf[0];
    }

    if (needSFM) {
        xtract_flatness(data, N >> 1, 0, &argf[0]);
        argv = &argf[0];
    }

    if (needHarmonics || m_xtFeature == XTRACT_HARMONIC_SPECTRUM){
        argf[1] = m_harmonicThreshold;
    }

    if (needHarmonics){
        xtract_harmonic_spectrum(data_temp, N, argv, result);
        for (int n = 0; n < N; ++n) {
            data_temp[n] = result[n];
            result[n] = 0.f;
        }
    }

    if (needNumHarmonics) {
        xtract_nonzero_count(data_temp, M, NULL, &nharmonics);
    }

    if (m_xtFeature == XTRACT_NOISINESS) {
        argf[0] = nharmonics;
        argf[1] = npartials;
        argv = &argf[0];
    }

    if (needBarkCoefficients || m_xtFeature == XTRACT_BARK_COEFFICIENTS) {
        argv = &m_barkBandLimits[0];
    }

    xtract_mel_filter mfccFilterBank;
    if (m_xtFeature == XTRACT_MFCC) {
        mfccFilterBank.n_filters = m_coeffCount;
        mfccFilterBank.filters = m_mfccFilters;
        argv = &mfccFilterBank;
    }

    if (needBarkCoefficients) {

        /*int rv = */ xtract_bark_coefficients(data, 0, argv, data_temp);
//        if (rv != SUCCESS) {
//            cerr << "ERROR: XTractPlugin::process: xtract_bark_coefficients failed (error code = " << rv << ")" << endl;
//            goto done;
//        }
        data = &data_temp[0];
        argv = 0;
    }

    if (xtFd->data.format == XTRACT_SPECTRAL_HARMONICS_FREQUENCIES) {

        N = M;
        data = &data_temp[N];

    } else if (xtFd->data.format == XTRACT_SPECTRAL_HARMONICS_MAGNITUDES) {

        N = M;
        data = &data_temp[0];
    }

    // If we only want spectral magnitudes, use first half of the input array
    else if(xtFd->data.format == XTRACT_SPECTRAL_MAGNITUDES ||
            xtFd->data.format == XTRACT_SPECTRAL_PEAKS_MAGNITUDES ||
            xtFd->data.format == XTRACT_ARBITRARY_SERIES) {
        N = M;
    }

    else if(xtFd->data.format == XTRACT_BARK_COEFFS) {

        N = XTRACT_BARK_BANDS - 1; /* Because our SR is 44100 (< 54000)*/
    }

    if (needPeaks && !needHarmonics) {

        data = &data_temp[0];
    }

    // now the main result
    xtract[m_xtFeature](data, N, argv, result);

//haveResult:
//    {
    int index = 0;

    for (size_t output = 0; output < m_outputDescriptors.size(); ++output) {

        Feature feature;
        feature.hasTimestamp = false;
        bool good = true;

        for (size_t n = 0; n < m_outputDescriptors[output].binCount; ++n) {
            // m_lowestCoef offsets the read position; it is only changed
            // from its default of 0 for the MFCC feature.
            double value = m_resultBuffer[index + m_lowestCoef];
            if (isnan(value) || isinf(value)) {
                // Drop the whole feature, but still skip past the rest
                // of its bins.
                good = false;
                index += (m_outputDescriptors[output].binCount - n);
                break;
            }
            feature.values.push_back(value);
            ++index;
        }

        if (good) fs[output].push_back(feature);
    }
//    }

//done:
    delete[] fft_temp;
    delete[] data_temp;
    delete[] input_d;

//    cerr << "XTractPlugin::process returning" << endl;

    return fs;
}

// No trailing features: everything is emitted from process().
XTractPlugin::FeatureSet
XTractPlugin::getRemainingFeatures()
{
    return FeatureSet();
}