# HG changeset patch # User Chris Cannam # Date 1399460132 -3600 # Node ID df05f855f63b458344bb3fddefe1d93dd64abc66 # Parent e282930cfca77e244ecd8535d3b748340809aabf# Parent f7e3c782d7584fca777d9e0abd22381a3f33dcf6 Merge from branch bqvec-openmp diff -r e282930cfca7 -r df05f855f63b .hgsub --- a/.hgsub Tue May 06 18:55:11 2014 +0100 +++ b/.hgsub Wed May 07 11:55:32 2014 +0100 @@ -1,1 +1,2 @@ constant-q-cpp = https://code.soundsoftware.ac.uk/hg/constant-q-cpp +bqvec = https://bitbucket.org/breakfastquay/bqvec diff -r e282930cfca7 -r df05f855f63b .hgsubstate --- a/.hgsubstate Tue May 06 18:55:11 2014 +0100 +++ b/.hgsubstate Wed May 07 11:55:32 2014 +0100 @@ -1,1 +1,2 @@ +f11c3c331979bbec59041e4a76401b50c0469a7d bqvec 7d97986563a0b3104a5c3f88190350ae55528738 constant-q-cpp diff -r e282930cfca7 -r df05f855f63b Makefile.inc --- a/Makefile.inc Tue May 06 18:55:11 2014 +0100 +++ b/Makefile.inc Wed May 07 11:55:32 2014 +0100 @@ -5,6 +5,7 @@ VAMPSDK_DIR ?= ../vamp-plugin-sdk CQ_DIR ?= constant-q-cpp/cpp-qm-dsp +BQVEC_DIR ?= bqvec/src PLUGIN_EXT ?= .so @@ -12,7 +13,7 @@ CC ?= gcc CFLAGS := $(CFLAGS) -CXXFLAGS := -I. -I$(VAMPSDK_DIR) -I$(QMDSP_DIR) $(CXXFLAGS) +CXXFLAGS := -I. -I$(VAMPSDK_DIR) -I$(QMDSP_DIR) -I$(BQVEC_DIR) $(CXXFLAGS) LDFLAGS := $(LDFLAGS) PLUGIN_LDFLAGS := $(LDFLAGS) $(PLUGIN_LDFLAGS) @@ -25,8 +26,11 @@ CQ_HEADERS := $(CQ_DIR)/CQKernel.h $(CQ_DIR)/ConstantQ.h $(CQ_DIR)/CQInterpolated.h CQ_SOURCES := $(CQ_DIR)/CQKernel.cpp $(CQ_DIR)/ConstantQ.cpp $(CQ_DIR)/CQInterpolated.cpp -HEADERS := $(VAMP_HEADERS) $(CQ_HEADERS) -SOURCES := $(VAMP_SOURCES) $(CQ_SOURCES) +BQVEC_HEADERS := $(BQVEC_DIR)/Allocators.h $(BQVEC_DIR)/Restrict.h $(BQVEC_DIR)/VectorOps.h +BQVEC_SOURCES := $(BQVEC_DIR)/Allocators.cpp + +HEADERS := $(VAMP_HEADERS) $(CQ_HEADERS) $(BQVEC_HEADERS) +SOURCES := $(VAMP_SOURCES) $(CQ_SOURCES) $(BQVEC_SOURCES) OBJECTS := $(SOURCES:.cpp=.o) LIBS := $(QMDSP_DIR)/libqm-dsp.a $(VAMPSDK_DIR)/libvamp-sdk.a -lpthread @@ -63,6 +67,10 @@ constant-q-cpp/cpp-qm-dsp/CQInterpolated.o: constant-q-cpp/cpp-qm-dsp/CQInterpolated.h constant-q-cpp/cpp-qm-dsp/CQInterpolated.o: constant-q-cpp/cpp-qm-dsp/ConstantQ.h constant-q-cpp/cpp-qm-dsp/CQInterpolated.o: constant-q-cpp/cpp-qm-dsp/CQKernel.h +bqvec/src/Allocators.o: bqvec/src/Allocators.h bqvec/src/VectorOps.h +bqvec/src/Allocators.o: bqvec/src/Restrict.h constant-q-cpp/cpp-qm-dsp/ConstantQ.o: constant-q-cpp/cpp-qm-dsp/CQKernel.h constant-q-cpp/cpp-qm-dsp/CQInterpolated.o: constant-q-cpp/cpp-qm-dsp/ConstantQ.h constant-q-cpp/cpp-qm-dsp/CQInterpolated.o: constant-q-cpp/cpp-qm-dsp/CQKernel.h +bqvec/src/Allocators.o: bqvec/src/VectorOps.h bqvec/src/Restrict.h +bqvec/src/VectorOps.o: bqvec/src/Restrict.h diff -r e282930cfca7 -r df05f855f63b Makefile.linux --- a/Makefile.linux Tue May 06 18:55:11 2014 +0100 +++ b/Makefile.linux Wed May 07 11:55:32 2014 +0100 @@ -1,11 +1,12 @@ -CFLAGS := -Wall -O3 -ffast-math -msse -mfpmath=sse -ftree-vectorize -fPIC -I../vamp-plugin-sdk/ +CFLAGS := -Wall -O3 -fopenmp -ffast-math -msse -mfpmath=sse -ftree-vectorize -fPIC -I../vamp-plugin-sdk/ + #CFLAGS := -g -fPIC -I../vamp-plugin-sdk CXXFLAGS := $(CFLAGS) VAMPSDK_DIR := ../vamp-plugin-sdk -PLUGIN_LDFLAGS := -shared -Wl,--version-script=vamp-plugin.map +PLUGIN_LDFLAGS := -lgomp -shared -Wl,--version-script=vamp-plugin.map PLUGIN_EXT := .so diff -r e282930cfca7 -r df05f855f63b data/include/templates.h --- a/data/include/templates.h Tue May 06 18:55:11 2014 +0100 +++ b/data/include/templates.h Wed May 07 11:55:32 2014 +0100 @@ -15,7 +15,7 @@ const char *name; int lowest; int highest; - float data[SILVET_TEMPLATE_NOTE_COUNT][SILVET_TEMPLATE_SIZE]; + double data[SILVET_TEMPLATE_NOTE_COUNT][SILVET_TEMPLATE_SIZE]; } silvet_template_t; static int silvet_templates_lowest_note = 15; diff -r e282930cfca7 -r df05f855f63b src/EM.cpp --- a/src/EM.cpp Tue May 06 18:55:11 2014 +0100 +++ b/src/EM.cpp Wed May 07 11:55:32 2014 +0100 @@ -22,57 +22,68 @@ #include -#include +#include "VectorOps.h" +#include "Allocators.h" using std::vector; using std::cerr; using std::endl; +using namespace breakfastquay; + static double epsilon = 1e-16; EM::EM(bool useShifts) : - m_useShifts(useShifts), m_noteCount(SILVET_TEMPLATE_NOTE_COUNT), m_shiftCount(useShifts ? SILVET_TEMPLATE_MAX_SHIFT * 2 + 1 : 1), m_binCount(SILVET_TEMPLATE_HEIGHT), - m_instrumentCount(SILVET_TEMPLATE_COUNT), + m_sourceCount(SILVET_TEMPLATE_COUNT), m_pitchSparsity(1.1), - m_sourceSparsity(1.3) + m_sourceSparsity(1.3), + m_lowestPitch(silvet_templates_lowest_note), + m_highestPitch(silvet_templates_highest_note) { - m_lowestPitch = silvet_templates_lowest_note; - m_highestPitch = silvet_templates_highest_note; - - m_pitches = V(m_noteCount); + m_pitches = allocate(m_noteCount); + m_updatePitches = allocate(m_noteCount); for (int n = 0; n < m_noteCount; ++n) { m_pitches[n] = drand48(); } - m_shifts = Grid(m_shiftCount); - for (int f = 0; f < m_shiftCount; ++f) { - m_shifts[f] = V(m_noteCount); - for (int n = 0; n < m_noteCount; ++n) { - if (m_useShifts) { + if (useShifts) { + m_shifts = allocate_channels(m_shiftCount, m_noteCount); + m_updateShifts = allocate_channels(m_shiftCount, m_noteCount); + for (int f = 0; f < m_shiftCount; ++f) { + for (int n = 0; n < m_noteCount; ++n) { m_shifts[f][n] = drand48(); - } else { - m_shifts[f][n] = 1.0; } } + } else { + m_shifts = 0; + m_updateShifts = 0; } - m_sources = Grid(m_instrumentCount); - for (int i = 0; i < m_instrumentCount; ++i) { - m_sources[i] = V(m_noteCount); + m_sources = allocate_channels(m_sourceCount, m_noteCount); + m_updateSources = allocate_channels(m_sourceCount, m_noteCount); + for (int i = 0; i < m_sourceCount; ++i) { for (int n = 0; n < m_noteCount; ++n) { m_sources[i][n] = (inRange(i, n) ? 1.0 : 0.0); } } - m_estimate = V(m_binCount); - m_q = V(m_binCount); + m_estimate = allocate(m_binCount); + m_q = allocate(m_binCount); } EM::~EM() { + deallocate(m_q); + deallocate(m_estimate); + deallocate_channels(m_sources, m_sourceCount); + deallocate_channels(m_updateSources, m_sourceCount); + deallocate_channels(m_shifts, m_shiftCount); + deallocate_channels(m_updateShifts, m_shiftCount); + deallocate(m_pitches); + deallocate(m_updatePitches); } void @@ -91,47 +102,45 @@ } void -EM::normaliseColumn(V &column) +EM::normaliseColumn(double *column, int size) { - double sum = 0.0; - for (int i = 0; i < (int)column.size(); ++i) { - sum += column[i]; - } - for (int i = 0; i < (int)column.size(); ++i) { - column[i] /= sum; - } + double sum = v_sum(column, size); + v_scale(column, 1.0 / sum, size); } void -EM::normaliseGrid(Grid &grid) +EM::normaliseGrid(double **grid, int size1, int size2) { - V denominators(grid[0].size()); + double *denominators = allocate_and_zero(size2); - for (int i = 0; i < (int)grid.size(); ++i) { - for (int j = 0; j < (int)grid[i].size(); ++j) { + for (int i = 0; i < size1; ++i) { + for (int j = 0; j < size2; ++j) { denominators[j] += grid[i][j]; } } - for (int i = 0; i < (int)grid.size(); ++i) { - for (int j = 0; j < (int)grid[i].size(); ++j) { - grid[i][j] /= denominators[j]; - } + for (int i = 0; i < size1; ++i) { + v_divide(grid[i], denominators, size2); } + + deallocate(denominators); } void -EM::iterate(V column) +EM::iterate(const double *column) { - normaliseColumn(column); - expectation(column); - maximisation(column); + double *norm = allocate(m_binCount); + v_copy(norm, column, m_binCount); + normaliseColumn(norm, m_binCount); + expectation(norm); + maximisation(norm); + deallocate(norm); } -const float * +const double * EM::templateFor(int instrument, int note, int shift) { - if (m_useShifts) { + if (m_shifts) { return silvet_templates[instrument].data[note] + shift; } else { return silvet_templates[instrument].data[note] + @@ -140,24 +149,21 @@ } void -EM::expectation(const V &column) +EM::expectation(const double *column) { // cerr << "."; - for (int i = 0; i < m_binCount; ++i) { - m_estimate[i] = epsilon; - } + v_set(m_estimate, epsilon, m_binCount); - for (int i = 0; i < m_instrumentCount; ++i) { + for (int i = 0; i < m_sourceCount; ++i) { for (int n = 0; n < m_noteCount; ++n) { + const double pitch = m_pitches[n]; + const double source = m_sources[i][n]; for (int f = 0; f < m_shiftCount; ++f) { - const float *w = templateFor(i, n, f); - double pitch = m_pitches[n]; - double source = m_sources[i][n]; - double shift = m_shifts[f][n]; - for (int j = 0; j < m_binCount; ++j) { - m_estimate[j] += w[j] * pitch * source * shift; - } + const double *w = templateFor(i, n, f); + const double shift = m_shifts ? m_shifts[f][n] : 1.0; + const double factor = pitch * source * shift; + v_add_with_gain(m_estimate, w, factor, m_binCount); } } } @@ -168,77 +174,83 @@ } void -EM::maximisation(const V &column) +EM::maximisation(const double *column) { - V newPitches = m_pitches; + v_set(m_updatePitches, epsilon, m_noteCount); + + for (int i = 0; i < m_sourceCount; ++i) { + v_set(m_updateSources[i], epsilon, m_noteCount); + } + + if (m_shifts) { + for (int i = 0; i < m_shiftCount; ++i) { + v_set(m_updateShifts[i], epsilon, m_noteCount); + } + } + + double *contributions = allocate(m_binCount); for (int n = 0; n < m_noteCount; ++n) { - newPitches[n] = epsilon; - if (n >= m_lowestPitch && n <= m_highestPitch) { - for (int i = 0; i < m_instrumentCount; ++i) { - for (int f = 0; f < m_shiftCount; ++f) { - const float *w = templateFor(i, n, f); - double pitch = m_pitches[n]; - double source = m_sources[i][n]; - double shift = m_shifts[f][n]; - for (int j = 0; j < m_binCount; ++j) { - newPitches[n] += w[j] * m_q[j] * pitch * source * shift; + + const double pitch = m_pitches[n]; + + for (int f = 0; f < m_shiftCount; ++f) { + + const double shift = m_shifts ? m_shifts[f][n] : 1.0; + + for (int i = 0; i < m_sourceCount; ++i) { + + const double source = m_sources[i][n]; + const double factor = pitch * source * shift; + const double *w = templateFor(i, n, f); + + v_copy(contributions, w, m_binCount); + v_multiply(contributions, m_q, m_binCount); + + double total = factor * v_sum(contributions, m_binCount); + + if (n >= m_lowestPitch && n <= m_highestPitch) { + + m_updatePitches[n] += total; + + if (inRange(i, n)) { + m_updateSources[i][n] += total; } } + + if (m_shifts) { + m_updateShifts[f][n] += total; + } } } - if (m_pitchSparsity != 1.0) { - newPitches[n] = pow(newPitches[n], m_pitchSparsity); + } + + if (m_pitchSparsity != 1.0) { + for (int n = 0; n < m_noteCount; ++n) { + m_updatePitches[n] = + pow(m_updatePitches[n], m_pitchSparsity); } } - normaliseColumn(newPitches); - Grid newShifts = m_shifts; - - if (m_useShifts) { - for (int f = 0; f < m_shiftCount; ++f) { - for (int n = 0; n < m_noteCount; ++n) { - newShifts[f][n] = epsilon; - for (int i = 0; i < m_instrumentCount; ++i) { - const float *w = templateFor(i, n, f); - double pitch = m_pitches[n]; - double source = m_sources[i][n]; - double shift = m_shifts[f][n]; - for (int j = 0; j < m_binCount; ++j) { - newShifts[f][n] += w[j] * m_q[j] * pitch * source * shift; - } - } - } - } - normaliseGrid(newShifts); - } - - Grid newSources = m_sources; - - for (int i = 0; i < m_instrumentCount; ++i) { + if (m_sourceSparsity != 1.0) { for (int n = 0; n < m_noteCount; ++n) { - newSources[i][n] = epsilon; - if (inRange(i, n)) { - for (int f = 0; f < m_shiftCount; ++f) { - const float *w = templateFor(i, n, f); - double pitch = m_pitches[n]; - double source = m_sources[i][n]; - double shift = m_shifts[f][n]; - for (int j = 0; j < m_binCount; ++j) { - newSources[i][n] += w[j] * m_q[j] * pitch * source * shift; - } - } - } - if (m_sourceSparsity != 1.0) { - newSources[i][n] = pow(newSources[i][n], m_sourceSparsity); + for (int i = 0; i < m_sourceCount; ++i) { + m_updateSources[i][n] = + pow(m_updateSources[i][n], m_sourceSparsity); } } } - normaliseGrid(newSources); - m_pitches = newPitches; - m_shifts = newShifts; - m_sources = newSources; + normaliseColumn(m_updatePitches, m_noteCount); + std::swap(m_pitches, m_updatePitches); + + normaliseGrid(m_updateSources, m_sourceCount, m_noteCount); + std::swap(m_sources, m_updateSources); + + if (m_shifts) { + normaliseGrid(m_updateShifts, m_shiftCount, m_noteCount); + std::swap(m_shifts, m_updateShifts); + } } diff -r e282930cfca7 -r df05f855f63b src/EM.h --- a/src/EM.h Tue May 06 18:55:11 2014 +0100 +++ b/src/EM.h Wed May 07 11:55:32 2014 +0100 @@ -24,48 +24,71 @@ EM(bool useShifts); ~EM(); - void iterate(std::vector column); + int getBinCount() const { return m_binCount; } + int getNoteCount() const { return m_noteCount; } + int getSourceCount() const { return m_sourceCount; } - const std::vector &getEstimate() const { + /** + * Carry out one iteration using the given column as input. The + * column must have getBinCount() values. + */ + void iterate(const double *column); + + /** + * Return the estimated distribution after the current iteration. + * Like the input, this will have getBinCount() values. + */ + const double *getEstimate() const { return m_estimate; } - const std::vector &getPitchDistribution() const { + + /** + * Return the pitch distribution for the current estimate. The + * returned array has getNoteCount() values. + */ + const double *getPitchDistribution() const { return m_pitches; } - const std::vector > &getSources() const { + + /** + * Return the source distribution for the current estimate. The + * returned pointer refers to getSourceCount() arrays of + * getNoteCount() values. + */ + const double *const *getSources() const { return m_sources; } private: - typedef std::vector V; - typedef std::vector > Grid; + double *m_pitches; + double **m_shifts; + double **m_sources; - bool m_useShifts; + double *m_updatePitches; + double **m_updateShifts; + double **m_updateSources; - V m_pitches; - Grid m_shifts; - Grid m_sources; + double *m_estimate; + double *m_q; + + const int m_noteCount; + const int m_shiftCount; // 1 + 2 * max template shift + const int m_binCount; + const int m_sourceCount; + + const double m_pitchSparsity; + const double m_sourceSparsity; - V m_estimate; - V m_q; - - int m_noteCount; - int m_shiftCount; // 1 + 2 * max template shift - int m_binCount; - int m_instrumentCount; - - double m_pitchSparsity; - double m_sourceSparsity; + const int m_lowestPitch; + const int m_highestPitch; - int m_lowestPitch; - int m_highestPitch; + void normaliseColumn(double *column, int size); + void normaliseGrid(double **grid, int size1, int size2); - void normaliseColumn(V &column); - void normaliseGrid(Grid &grid); - void expectation(const V &column); - void maximisation(const V &column); + void expectation(const double *column); // size is m_binCount + void maximisation(const double *column); // size is m_binCount - const float *templateFor(int instrument, int note, int shift); + const double *templateFor(int instrument, int note, int shift); void rangeFor(int instrument, int &minPitch, int &maxPitch); bool inRange(int instrument, int pitch); }; diff -r e282930cfca7 -r df05f855f63b src/Silvet.cpp --- a/src/Silvet.cpp Tue May 06 18:55:11 2014 +0100 +++ b/src/Silvet.cpp Wed May 07 11:55:32 2014 +0100 @@ -41,7 +41,7 @@ Plugin(inputSampleRate), m_resampler(0), m_cq(0), - m_hqMode(false) + m_hqMode(true) { } @@ -138,7 +138,7 @@ desc.description = "Determines the tradeoff of processing speed against transcription quality"; desc.minValue = 0; desc.maxValue = 1; - desc.defaultValue = 0; + desc.defaultValue = 1; desc.isQuantized = true; desc.quantizeStep = 1; desc.valueNames.push_back("Draft (faster)"); @@ -397,6 +397,8 @@ FeatureSet fs; + if (filtered.empty()) return fs; + for (int i = 0; i < (int)filtered.size(); ++i) { Feature f; for (int j = 0; j < processingHeight; ++j) { @@ -409,33 +411,40 @@ int iterations = 12; + Grid pitchMatrix(width, vector(processingNotes)); + +#pragma omp parallel for for (int i = 0; i < width; ++i) { double sum = 0.0; for (int j = 0; j < processingHeight; ++j) { - sum += filtered[i][j]; + sum += filtered.at(i).at(j); } if (sum < 1e-5) continue; EM em(m_hqMode); + for (int j = 0; j < iterations; ++j) { - em.iterate(filtered[i]); + em.iterate(filtered.at(i).data()); } - - vector pitches = em.getPitchDistribution(); + + const double *pitches = em.getPitchDistribution(); for (int j = 0; j < processingNotes; ++j) { - pitches[j] *= sum; + pitchMatrix[i][j] = pitches[j] * sum; } + } + for (int i = 0; i < width; ++i) { + Feature f; for (int j = 0; j < processingNotes; ++j) { - f.values.push_back(float(pitches[j])); + f.values.push_back(float(pitchMatrix[i][j])); } fs[m_pitchOutputNo].push_back(f); - FeatureList noteFeatures = postProcess(pitches); + FeatureList noteFeatures = postProcess(pitchMatrix[i]); for (FeatureList::const_iterator fi = noteFeatures.begin(); fi != noteFeatures.end(); ++fi) { diff -r e282930cfca7 -r df05f855f63b testdata/timing/results.txt --- a/testdata/timing/results.txt Tue May 06 18:55:11 2014 +0100 +++ b/testdata/timing/results.txt Wed May 07 11:55:32 2014 +0100 @@ -91,7 +91,6 @@ conclusion: supports the previous test - OPENMP: commit:62b7be1226d5, as commit:ce64d11ef336 but with OpenMP parallel @@ -152,6 +151,16 @@ user 1m28.697s sys 0m0.197s +commit:91bb029a847a, as commit:a6e136aaa202 but with the series of +calculations reordered to match that in the recent bqvec code +commit:b2f0967cb8d1. Just testing whether it is the replacement of +std::vector or the reordering of vector operations that was saving the +time in bqvec branch. + +real 2m52.922s +user 2m52.480s +sys 0m0.263s + BQVEC: @@ -209,3 +218,34 @@ real 0m44.650s user 2m19.997s sys 0m0.343s + +commit:c4eae816bdb3, as commit:ac750e222ad3 but with some logic to +make using the shifts optional (though on by default). Performance +*should* be unchanged here. + +real 0m43.979s +user 2m19.297s +sys 0m0.360s + +commit:b2f0967cb8d1, as commit:c4eae816bdb3 but storing the templates +as float arrays and then pulling them out into individual +one-per-shift-factor double arrays each of which is explicitly +allocated with the proper alignment. Uses more memory, and the code is +ugly, but gets aligned starts for slightly more of the vector ops. + +real 0m50.856s +user 2m44.937s +sys 0m0.463s + +commit:6890dea115c3, as commit:c4eae816bdb3 with a loop factored out + +real 0m40.565s +user 2m3.883s +sys 0m0.307s + +commit:230920148ee5, as commit:6890dea115c3 with a simpler openmp loop + +real 0m39.761s +user 2m3.093s +sys 0m0.347s + diff -r e282930cfca7 -r df05f855f63b yeti/scratch/generateTemplatesC.yeti --- a/yeti/scratch/generateTemplatesC.yeti Tue May 06 18:55:11 2014 +0100 +++ b/yeti/scratch/generateTemplatesC.yeti Wed May 07 11:55:32 2014 +0100 @@ -106,7 +106,7 @@ " const char *name;", " int lowest;", " int highest;", - " float data[SILVET_TEMPLATE_NOTE_COUNT][SILVET_TEMPLATE_SIZE];", + " double data[SILVET_TEMPLATE_NOTE_COUNT][SILVET_TEMPLATE_SIZE];", "} silvet_template_t;", "", "static int silvet_templates_lowest_note = \(overallLowest);",