changeset 127:df05f855f63b

Merge from branch bqvec-openmp
author Chris Cannam
date Wed, 07 May 2014 11:55:32 +0100
parents e282930cfca7 (current diff) f7e3c782d758 (diff)
children 39071e131651
files
diffstat 10 files changed, 249 insertions(+), 154 deletions(-) [+]
line wrap: on
line diff
--- a/.hgsub	Tue May 06 18:55:11 2014 +0100
+++ b/.hgsub	Wed May 07 11:55:32 2014 +0100
@@ -1,1 +1,2 @@
 constant-q-cpp = https://code.soundsoftware.ac.uk/hg/constant-q-cpp
+bqvec = https://bitbucket.org/breakfastquay/bqvec
--- a/.hgsubstate	Tue May 06 18:55:11 2014 +0100
+++ b/.hgsubstate	Wed May 07 11:55:32 2014 +0100
@@ -1,1 +1,2 @@
+f11c3c331979bbec59041e4a76401b50c0469a7d bqvec
 7d97986563a0b3104a5c3f88190350ae55528738 constant-q-cpp
--- a/Makefile.inc	Tue May 06 18:55:11 2014 +0100
+++ b/Makefile.inc	Wed May 07 11:55:32 2014 +0100
@@ -5,6 +5,7 @@
 VAMPSDK_DIR  ?= ../vamp-plugin-sdk
 
 CQ_DIR	     ?= constant-q-cpp/cpp-qm-dsp
+BQVEC_DIR    ?= bqvec/src
 
 PLUGIN_EXT	?= .so
 
@@ -12,7 +13,7 @@
 CC	?= gcc
 
 CFLAGS := $(CFLAGS) 
-CXXFLAGS := -I. -I$(VAMPSDK_DIR) -I$(QMDSP_DIR) $(CXXFLAGS)
+CXXFLAGS := -I. -I$(VAMPSDK_DIR) -I$(QMDSP_DIR) -I$(BQVEC_DIR) $(CXXFLAGS)
 
 LDFLAGS := $(LDFLAGS) 
 PLUGIN_LDFLAGS := $(LDFLAGS) $(PLUGIN_LDFLAGS)
@@ -25,8 +26,11 @@
 CQ_HEADERS   := $(CQ_DIR)/CQKernel.h $(CQ_DIR)/ConstantQ.h $(CQ_DIR)/CQInterpolated.h
 CQ_SOURCES   := $(CQ_DIR)/CQKernel.cpp $(CQ_DIR)/ConstantQ.cpp $(CQ_DIR)/CQInterpolated.cpp
 
-HEADERS	     := $(VAMP_HEADERS) $(CQ_HEADERS)
-SOURCES	     := $(VAMP_SOURCES) $(CQ_SOURCES)
+BQVEC_HEADERS	:= $(BQVEC_DIR)/Allocators.h $(BQVEC_DIR)/Restrict.h $(BQVEC_DIR)/VectorOps.h
+BQVEC_SOURCES	:= $(BQVEC_DIR)/Allocators.cpp
+
+HEADERS	     := $(VAMP_HEADERS) $(CQ_HEADERS) $(BQVEC_HEADERS)
+SOURCES	     := $(VAMP_SOURCES) $(CQ_SOURCES) $(BQVEC_SOURCES)
 OBJECTS	     := $(SOURCES:.cpp=.o)
 
 LIBS	:= $(QMDSP_DIR)/libqm-dsp.a $(VAMPSDK_DIR)/libvamp-sdk.a -lpthread
@@ -63,6 +67,10 @@
 constant-q-cpp/cpp-qm-dsp/CQInterpolated.o: constant-q-cpp/cpp-qm-dsp/CQInterpolated.h
 constant-q-cpp/cpp-qm-dsp/CQInterpolated.o: constant-q-cpp/cpp-qm-dsp/ConstantQ.h
 constant-q-cpp/cpp-qm-dsp/CQInterpolated.o: constant-q-cpp/cpp-qm-dsp/CQKernel.h
+bqvec/src/Allocators.o: bqvec/src/Allocators.h bqvec/src/VectorOps.h
+bqvec/src/Allocators.o: bqvec/src/Restrict.h
 constant-q-cpp/cpp-qm-dsp/ConstantQ.o: constant-q-cpp/cpp-qm-dsp/CQKernel.h
 constant-q-cpp/cpp-qm-dsp/CQInterpolated.o: constant-q-cpp/cpp-qm-dsp/ConstantQ.h
 constant-q-cpp/cpp-qm-dsp/CQInterpolated.o: constant-q-cpp/cpp-qm-dsp/CQKernel.h
+bqvec/src/Allocators.o: bqvec/src/VectorOps.h bqvec/src/Restrict.h
+bqvec/src/VectorOps.o: bqvec/src/Restrict.h
--- a/Makefile.linux	Tue May 06 18:55:11 2014 +0100
+++ b/Makefile.linux	Wed May 07 11:55:32 2014 +0100
@@ -1,11 +1,12 @@
 
-CFLAGS := -Wall -O3 -ffast-math -msse -mfpmath=sse -ftree-vectorize -fPIC -I../vamp-plugin-sdk/
+CFLAGS := -Wall -O3 -fopenmp -ffast-math -msse -mfpmath=sse -ftree-vectorize -fPIC -I../vamp-plugin-sdk/
+
 #CFLAGS := -g -fPIC -I../vamp-plugin-sdk
 
 CXXFLAGS := $(CFLAGS)
 
 VAMPSDK_DIR := ../vamp-plugin-sdk
-PLUGIN_LDFLAGS := -shared -Wl,--version-script=vamp-plugin.map
+PLUGIN_LDFLAGS := -lgomp -shared -Wl,--version-script=vamp-plugin.map
 
 PLUGIN_EXT := .so
 
--- a/data/include/templates.h	Tue May 06 18:55:11 2014 +0100
+++ b/data/include/templates.h	Wed May 07 11:55:32 2014 +0100
@@ -15,7 +15,7 @@
     const char *name;
     int lowest;
     int highest;
-    float data[SILVET_TEMPLATE_NOTE_COUNT][SILVET_TEMPLATE_SIZE];
+    double data[SILVET_TEMPLATE_NOTE_COUNT][SILVET_TEMPLATE_SIZE];
 } silvet_template_t;
 
 static int silvet_templates_lowest_note = 15;
--- a/src/EM.cpp	Tue May 06 18:55:11 2014 +0100
+++ b/src/EM.cpp	Wed May 07 11:55:32 2014 +0100
@@ -22,57 +22,68 @@
 
 #include <iostream>
 
-#include <vector>
+#include "VectorOps.h"
+#include "Allocators.h"
 
 using std::vector;
 using std::cerr;
 using std::endl;
 
+using namespace breakfastquay;
+
 static double epsilon = 1e-16;
 
 EM::EM(bool useShifts) :
-    m_useShifts(useShifts),
     m_noteCount(SILVET_TEMPLATE_NOTE_COUNT),
     m_shiftCount(useShifts ? SILVET_TEMPLATE_MAX_SHIFT * 2 + 1 : 1),
     m_binCount(SILVET_TEMPLATE_HEIGHT),
-    m_instrumentCount(SILVET_TEMPLATE_COUNT),
+    m_sourceCount(SILVET_TEMPLATE_COUNT),
     m_pitchSparsity(1.1),
-    m_sourceSparsity(1.3)
+    m_sourceSparsity(1.3),
+    m_lowestPitch(silvet_templates_lowest_note),
+    m_highestPitch(silvet_templates_highest_note)
 {
-    m_lowestPitch = silvet_templates_lowest_note;
-    m_highestPitch = silvet_templates_highest_note;
-
-    m_pitches = V(m_noteCount);
+    m_pitches = allocate<double>(m_noteCount);
+    m_updatePitches = allocate<double>(m_noteCount);
     for (int n = 0; n < m_noteCount; ++n) {
         m_pitches[n] = drand48();
     }
 
-    m_shifts = Grid(m_shiftCount);
-    for (int f = 0; f < m_shiftCount; ++f) {
-        m_shifts[f] = V(m_noteCount);
-        for (int n = 0; n < m_noteCount; ++n) {
-            if (m_useShifts) {
+    if (useShifts) {
+        m_shifts = allocate_channels<double>(m_shiftCount, m_noteCount);
+        m_updateShifts = allocate_channels<double>(m_shiftCount, m_noteCount);
+        for (int f = 0; f < m_shiftCount; ++f) {
+            for (int n = 0; n < m_noteCount; ++n) {
                 m_shifts[f][n] = drand48();
-            } else {
-                m_shifts[f][n] = 1.0;
             }
         }
+    } else {
+        m_shifts = 0;
+        m_updateShifts = 0;
     }
     
-    m_sources = Grid(m_instrumentCount);
-        for (int i = 0; i < m_instrumentCount; ++i) {
-        m_sources[i] = V(m_noteCount);
+    m_sources = allocate_channels<double>(m_sourceCount, m_noteCount);
+    m_updateSources = allocate_channels<double>(m_sourceCount, m_noteCount);
+    for (int i = 0; i < m_sourceCount; ++i) {
         for (int n = 0; n < m_noteCount; ++n) {
             m_sources[i][n] = (inRange(i, n) ? 1.0 : 0.0);
         }
     }
 
-    m_estimate = V(m_binCount);
-    m_q = V(m_binCount);
+    m_estimate = allocate<double>(m_binCount);
+    m_q = allocate<double>(m_binCount);
 }
 
 EM::~EM()
 {
+    deallocate(m_q);
+    deallocate(m_estimate);
+    deallocate_channels(m_sources, m_sourceCount);
+    deallocate_channels(m_updateSources, m_sourceCount);
+    deallocate_channels(m_shifts, m_shiftCount);
+    deallocate_channels(m_updateShifts, m_shiftCount);
+    deallocate(m_pitches);
+    deallocate(m_updatePitches);
 }
 
 void
@@ -91,47 +102,45 @@
 }
 
 void
-EM::normaliseColumn(V &column)
+EM::normaliseColumn(double *column, int size)
 {
-    double sum = 0.0;
-    for (int i = 0; i < (int)column.size(); ++i) {
-        sum += column[i];
-    }
-    for (int i = 0; i < (int)column.size(); ++i) {
-        column[i] /= sum;
-    }
+    double sum = v_sum(column, size);
+    v_scale(column, 1.0 / sum, size);
 }
 
 void
-EM::normaliseGrid(Grid &grid)
+EM::normaliseGrid(double **grid, int size1, int size2)
 {
-    V denominators(grid[0].size());
+    double *denominators = allocate_and_zero<double>(size2);
 
-    for (int i = 0; i < (int)grid.size(); ++i) {
-        for (int j = 0; j < (int)grid[i].size(); ++j) {
+    for (int i = 0; i < size1; ++i) {
+        for (int j = 0; j < size2; ++j) {
             denominators[j] += grid[i][j];
         }
     }
 
-    for (int i = 0; i < (int)grid.size(); ++i) {
-        for (int j = 0; j < (int)grid[i].size(); ++j) {
-            grid[i][j] /= denominators[j];
-        }
+    for (int i = 0; i < size1; ++i) {
+        v_divide(grid[i], denominators, size2);
     }
+
+    deallocate(denominators);
 }
 
 void
-EM::iterate(V column)
+EM::iterate(const double *column)
 {
-    normaliseColumn(column);
-    expectation(column);
-    maximisation(column);
+    double *norm = allocate<double>(m_binCount);
+    v_copy(norm, column, m_binCount);
+    normaliseColumn(norm, m_binCount);
+    expectation(norm);
+    maximisation(norm);
+    deallocate(norm);
 }
 
-const float *
+const double *
 EM::templateFor(int instrument, int note, int shift)
 {
-    if (m_useShifts) {
+    if (m_shifts) {
         return silvet_templates[instrument].data[note] + shift;
     } else {
         return silvet_templates[instrument].data[note] + 
@@ -140,24 +149,21 @@
 }
 
 void
-EM::expectation(const V &column)
+EM::expectation(const double *column)
 {
 //    cerr << ".";
 
-    for (int i = 0; i < m_binCount; ++i) {
-        m_estimate[i] = epsilon;
-    }
+    v_set(m_estimate, epsilon, m_binCount);
 
-    for (int i = 0; i < m_instrumentCount; ++i) {
+    for (int i = 0; i < m_sourceCount; ++i) {
         for (int n = 0; n < m_noteCount; ++n) {
+            const double pitch = m_pitches[n];
+            const double source = m_sources[i][n];
             for (int f = 0; f < m_shiftCount; ++f) {
-                const float *w = templateFor(i, n, f);
-                double pitch = m_pitches[n];
-                double source = m_sources[i][n];
-                double shift = m_shifts[f][n];
-                for (int j = 0; j < m_binCount; ++j) {
-                    m_estimate[j] += w[j] * pitch * source * shift;
-                }
+                const double *w = templateFor(i, n, f);
+                const double shift = m_shifts ? m_shifts[f][n] : 1.0;
+                const double factor = pitch * source * shift;
+                v_add_with_gain(m_estimate, w, factor, m_binCount);
             }
         }
     }
@@ -168,77 +174,83 @@
 }
 
 void
-EM::maximisation(const V &column)
+EM::maximisation(const double *column)
 {
-    V newPitches = m_pitches;
+    v_set(m_updatePitches, epsilon, m_noteCount);
+
+    for (int i = 0; i < m_sourceCount; ++i) {
+        v_set(m_updateSources[i], epsilon, m_noteCount);
+    }
+
+    if (m_shifts) {
+        for (int i = 0; i < m_shiftCount; ++i) {
+            v_set(m_updateShifts[i], epsilon, m_noteCount);
+        }
+    }
+
+    double *contributions = allocate<double>(m_binCount);
 
     for (int n = 0; n < m_noteCount; ++n) {
-        newPitches[n] = epsilon;
-        if (n >= m_lowestPitch && n <= m_highestPitch) {
-            for (int i = 0; i < m_instrumentCount; ++i) {
-                for (int f = 0; f < m_shiftCount; ++f) {
-                    const float *w = templateFor(i, n, f);
-                    double pitch = m_pitches[n];
-                    double source = m_sources[i][n];
-                    double shift = m_shifts[f][n];
-                    for (int j = 0; j < m_binCount; ++j) {
-                        newPitches[n] += w[j] * m_q[j] * pitch * source * shift;
+
+        const double pitch = m_pitches[n];
+
+        for (int f = 0; f < m_shiftCount; ++f) {
+
+            const double shift = m_shifts ? m_shifts[f][n] : 1.0;
+
+            for (int i = 0; i < m_sourceCount; ++i) {
+
+                const double source = m_sources[i][n];
+                const double factor = pitch * source * shift;
+                const double *w = templateFor(i, n, f);
+
+                v_copy(contributions, w, m_binCount);
+                v_multiply(contributions, m_q, m_binCount);
+
+                double total = factor * v_sum(contributions, m_binCount);
+
+                if (n >= m_lowestPitch && n <= m_highestPitch) {
+
+                    m_updatePitches[n] += total;
+
+                    if (inRange(i, n)) {
+                        m_updateSources[i][n] += total;
                     }
                 }
+
+                if (m_shifts) {
+                    m_updateShifts[f][n] += total;
+                }
             }
         }
-        if (m_pitchSparsity != 1.0) {
-            newPitches[n] = pow(newPitches[n], m_pitchSparsity);
+    }
+
+    if (m_pitchSparsity != 1.0) {
+        for (int n = 0; n < m_noteCount; ++n) {
+            m_updatePitches[n] = 
+                pow(m_updatePitches[n], m_pitchSparsity);
         }
     }
-    normaliseColumn(newPitches);
 
-    Grid newShifts = m_shifts;
-
-    if (m_useShifts) {
-        for (int f = 0; f < m_shiftCount; ++f) {
-            for (int n = 0; n < m_noteCount; ++n) {
-                newShifts[f][n] = epsilon;
-                for (int i = 0; i < m_instrumentCount; ++i) {
-                    const float *w = templateFor(i, n, f);
-                    double pitch = m_pitches[n];
-                    double source = m_sources[i][n];
-                    double shift = m_shifts[f][n];
-                    for (int j = 0; j < m_binCount; ++j) {
-                        newShifts[f][n] += w[j] * m_q[j] * pitch * source * shift;
-                    }
-                }
-            }
-        }
-        normaliseGrid(newShifts);
-    }
-
-    Grid newSources = m_sources;
-
-    for (int i = 0; i < m_instrumentCount; ++i) {
+    if (m_sourceSparsity != 1.0) {
         for (int n = 0; n < m_noteCount; ++n) {
-            newSources[i][n] = epsilon;
-            if (inRange(i, n)) {
-                for (int f = 0; f < m_shiftCount; ++f) {
-                    const float *w = templateFor(i, n, f);
-                    double pitch = m_pitches[n];
-                    double source = m_sources[i][n];
-                    double shift = m_shifts[f][n];
-                    for (int j = 0; j < m_binCount; ++j) {
-                        newSources[i][n] += w[j] * m_q[j] * pitch * source * shift;
-                    }
-                }
-            }
-            if (m_sourceSparsity != 1.0) {
-                newSources[i][n] = pow(newSources[i][n], m_sourceSparsity);
+            for (int i = 0; i < m_sourceCount; ++i) {
+                m_updateSources[i][n] =
+                    pow(m_updateSources[i][n], m_sourceSparsity);
             }
         }
     }
-    normaliseGrid(newSources);
 
-    m_pitches = newPitches;
-    m_shifts = newShifts;
-    m_sources = newSources;
+    normaliseColumn(m_updatePitches, m_noteCount);
+    std::swap(m_pitches, m_updatePitches);
+
+    normaliseGrid(m_updateSources, m_sourceCount, m_noteCount);
+    std::swap(m_sources, m_updateSources);
+
+    if (m_shifts) {
+        normaliseGrid(m_updateShifts, m_shiftCount, m_noteCount);
+        std::swap(m_shifts, m_updateShifts);
+    }
 }
 
 
--- a/src/EM.h	Tue May 06 18:55:11 2014 +0100
+++ b/src/EM.h	Wed May 07 11:55:32 2014 +0100
@@ -24,48 +24,71 @@
     EM(bool useShifts);
     ~EM();
 
-    void iterate(std::vector<double> column);
+    int getBinCount() const { return m_binCount; }
+    int getNoteCount() const { return m_noteCount; }
+    int getSourceCount() const { return m_sourceCount; }
 
-    const std::vector<double> &getEstimate() const { 
+    /**
+     * Carry out one iteration using the given column as input. The
+     * column must have getBinCount() values.
+     */
+    void iterate(const double *column);
+
+    /**
+     * Return the estimated distribution after the current iteration.
+     * Like the input, this will have getBinCount() values.
+     */
+    const double *getEstimate() const {
 	return m_estimate;
     }
-    const std::vector<double> &getPitchDistribution() const {
+
+    /**
+     * Return the pitch distribution for the current estimate.  The
+     * returned array has getNoteCount() values.
+     */
+    const double *getPitchDistribution() const {
 	return m_pitches;
     }
-    const std::vector<std::vector<double> > &getSources() const {
+    
+    /** 
+     * Return the source distribution for the current estimate. The
+     * returned pointer refers to getSourceCount() arrays of
+     * getNoteCount() values.
+     */
+    const double *const *getSources() const {
 	return m_sources; 
     }
 
 private:
-    typedef std::vector<double> V;
-    typedef std::vector<std::vector<double> > Grid;
+    double *m_pitches;
+    double **m_shifts;
+    double **m_sources;
 
-    bool m_useShifts;
+    double *m_updatePitches;
+    double **m_updateShifts;
+    double **m_updateSources;
 
-    V m_pitches;
-    Grid m_shifts;
-    Grid m_sources;
+    double *m_estimate;
+    double *m_q;
+    
+    const int m_noteCount;
+    const int m_shiftCount; // 1 + 2 * max template shift
+    const int m_binCount;
+    const int m_sourceCount;
+    
+    const double m_pitchSparsity;
+    const double m_sourceSparsity;
 
-    V m_estimate;
-    V m_q;
-    
-    int m_noteCount;
-    int m_shiftCount; // 1 + 2 * max template shift
-    int m_binCount;
-    int m_instrumentCount;
-    
-    double m_pitchSparsity;
-    double m_sourceSparsity;
+    const int m_lowestPitch;
+    const int m_highestPitch;
 
-    int m_lowestPitch;
-    int m_highestPitch;
+    void normaliseColumn(double *column, int size);
+    void normaliseGrid(double **grid, int size1, int size2);
 
-    void normaliseColumn(V &column);
-    void normaliseGrid(Grid &grid);
-    void expectation(const V &column);
-    void maximisation(const V &column);
+    void expectation(const double *column); // size is m_binCount
+    void maximisation(const double *column); // size is m_binCount
 
-    const float *templateFor(int instrument, int note, int shift);
+    const double *templateFor(int instrument, int note, int shift);
     void rangeFor(int instrument, int &minPitch, int &maxPitch);
     bool inRange(int instrument, int pitch);
 };
--- a/src/Silvet.cpp	Tue May 06 18:55:11 2014 +0100
+++ b/src/Silvet.cpp	Wed May 07 11:55:32 2014 +0100
@@ -41,7 +41,7 @@
     Plugin(inputSampleRate),
     m_resampler(0),
     m_cq(0),
-    m_hqMode(false)
+    m_hqMode(true)
 {
 }
 
@@ -138,7 +138,7 @@
     desc.description = "Determines the tradeoff of processing speed against transcription quality";
     desc.minValue = 0;
     desc.maxValue = 1;
-    desc.defaultValue = 0;
+    desc.defaultValue = 1;
     desc.isQuantized = true;
     desc.quantizeStep = 1;
     desc.valueNames.push_back("Draft (faster)");
@@ -397,6 +397,8 @@
 
     FeatureSet fs;
 
+    if (filtered.empty()) return fs;
+
     for (int i = 0; i < (int)filtered.size(); ++i) {
         Feature f;
         for (int j = 0; j < processingHeight; ++j) {
@@ -409,33 +411,40 @@
 
     int iterations = 12;
 
+    Grid pitchMatrix(width, vector<double>(processingNotes));
+
+#pragma omp parallel for
     for (int i = 0; i < width; ++i) {
 
         double sum = 0.0;
         for (int j = 0; j < processingHeight; ++j) {
-            sum += filtered[i][j];
+            sum += filtered.at(i).at(j);
         }
 
         if (sum < 1e-5) continue;
 
         EM em(m_hqMode);
+
         for (int j = 0; j < iterations; ++j) {
-            em.iterate(filtered[i]);
+            em.iterate(filtered.at(i).data());
         }
-
-        vector<double> pitches = em.getPitchDistribution();
+        
+        const double *pitches = em.getPitchDistribution();
         
         for (int j = 0; j < processingNotes; ++j) {
-            pitches[j] *= sum;
+            pitchMatrix[i][j] = pitches[j] * sum;
         }
+    }
 
+    for (int i = 0; i < width; ++i) {
+        
         Feature f;
         for (int j = 0; j < processingNotes; ++j) {
-            f.values.push_back(float(pitches[j]));
+            f.values.push_back(float(pitchMatrix[i][j]));
         }
         fs[m_pitchOutputNo].push_back(f);
 
-        FeatureList noteFeatures = postProcess(pitches);
+        FeatureList noteFeatures = postProcess(pitchMatrix[i]);
 
         for (FeatureList::const_iterator fi = noteFeatures.begin();
              fi != noteFeatures.end(); ++fi) {
--- a/testdata/timing/results.txt	Tue May 06 18:55:11 2014 +0100
+++ b/testdata/timing/results.txt	Wed May 07 11:55:32 2014 +0100
@@ -91,7 +91,6 @@
 conclusion: supports the previous test
 
 
-
 OPENMP:
 
 commit:62b7be1226d5, as commit:ce64d11ef336 but with OpenMP parallel
@@ -152,6 +151,16 @@
 user	1m28.697s
 sys	0m0.197s
 
+commit:91bb029a847a, as commit:a6e136aaa202 but with the series of
+calculations reordered to match that in the recent bqvec code
+commit:b2f0967cb8d1. Just testing whether it is the replacement of
+std::vector or the reordering of vector operations that was saving the
+time in bqvec branch.
+
+real	2m52.922s
+user	2m52.480s
+sys	0m0.263s
+
 
 BQVEC:
 
@@ -209,3 +218,34 @@
 real	0m44.650s
 user	2m19.997s
 sys	0m0.343s
+
+commit:c4eae816bdb3, as commit:ac750e222ad3 but with some logic to
+make using the shifts optional (though on by default). Performance
+*should* be unchanged here.
+
+real	0m43.979s
+user	2m19.297s
+sys	0m0.360s
+
+commit:b2f0967cb8d1, as commit:c4eae816bdb3 but storing the templates
+as float arrays and then pulling them out into individual
+one-per-shift-factor double arrays each of which is explicitly
+allocated with the proper alignment. Uses more memory, and the code is
+ugly, but gets aligned starts for slightly more of the vector ops.
+
+real	0m50.856s
+user	2m44.937s
+sys	0m0.463s
+
+commit:6890dea115c3, as commit:c4eae816bdb3 with a loop factored out
+
+real	0m40.565s
+user	2m3.883s
+sys	0m0.307s
+
+commit:230920148ee5, as commit:6890dea115c3 with a simpler openmp loop
+
+real	0m39.761s
+user	2m3.093s
+sys	0m0.347s
+
--- a/yeti/scratch/generateTemplatesC.yeti	Tue May 06 18:55:11 2014 +0100
+++ b/yeti/scratch/generateTemplatesC.yeti	Wed May 07 11:55:32 2014 +0100
@@ -106,7 +106,7 @@
         "    const char *name;",
         "    int lowest;",
         "    int highest;",
-        "    float data[SILVET_TEMPLATE_NOTE_COUNT][SILVET_TEMPLATE_SIZE];",
+        "    double data[SILVET_TEMPLATE_NOTE_COUNT][SILVET_TEMPLATE_SIZE];",
         "} silvet_template_t;",
         "",
         "static int silvet_templates_lowest_note = \(overallLowest);",