view src/EM.cpp @ 167:416b555df3b2 finetune

More on returning fine tuning (but we're treating different shifts of the same pitch as different notes at the moment which is not right)
author Chris Cannam
date Tue, 20 May 2014 17:49:07 +0100
parents 629c9525b815
children 237d41a0f69d
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

/*
  Silvet

  A Vamp plugin for note transcription.
  Centre for Digital Music, Queen Mary University of London.
    
  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 2 of the
  License, or (at your option) any later version.  See the file
  COPYING included with this distribution for more information.
*/

#include "EM.h"

#include <cstdlib>
#include <cmath>

#include <iostream>

#include "VectorOps.h"
#include "Allocators.h"
#include "Instruments.h"

using std::vector;
using std::cerr;
using std::endl;

using namespace breakfastquay;

static float epsilon = 1e-10;

EM::EM(const InstrumentPack *pack, bool useShifts) :
    m_pack(pack),
    m_noteCount(pack->templateNoteCount),
    m_shiftCount(useShifts ? pack->templateMaxShift * 2 + 1 : 1),
    m_binCount(pack->templateHeight),
    m_sourceCount(pack->templates.size()),
    m_pitchSparsity(1.1),
    //!!! note: slightly less source sparsity might help; also
    //!!! consider a modest shift sparsity e.g. 1.1
    m_sourceSparsity(1.3),
    m_lowestPitch(pack->lowestNote),
    m_highestPitch(pack->highestNote)
{
    m_pitches = allocate<float>(m_noteCount);
    m_updatePitches = allocate<float>(m_noteCount);
    for (int n = 0; n < m_noteCount; ++n) {
        m_pitches[n] = drand48();
    }

    if (useShifts) {
        m_shifts = allocate_channels<float>(m_shiftCount, m_noteCount);
        m_updateShifts = allocate_channels<float>(m_shiftCount, m_noteCount);
        for (int f = 0; f < m_shiftCount; ++f) {
            for (int n = 0; n < m_noteCount; ++n) {
                m_shifts[f][n] = drand48();
            }
        }
    } else {
        m_shifts = 0;
        m_updateShifts = 0;
    }
    
    m_sources = allocate_channels<float>(m_sourceCount, m_noteCount);
    m_updateSources = allocate_channels<float>(m_sourceCount, m_noteCount);
    for (int i = 0; i < m_sourceCount; ++i) {
        for (int n = 0; n < m_noteCount; ++n) {
            m_sources[i][n] = (inRange(i, n) ? 1.0 : 0.0);
        }
    }

    m_estimate = allocate<float>(m_binCount);
    m_q = allocate<float>(m_binCount);
}

EM::~EM()
{
    deallocate(m_q);
    deallocate(m_estimate);
    deallocate_channels(m_sources, m_sourceCount);
    deallocate_channels(m_updateSources, m_sourceCount);
    deallocate_channels(m_shifts, m_shiftCount);
    deallocate_channels(m_updateShifts, m_shiftCount);
    deallocate(m_pitches);
    deallocate(m_updatePitches);
}

void
EM::rangeFor(int instrument, int &minPitch, int &maxPitch)
{
    minPitch = m_pack->templates[instrument].lowestNote;
    maxPitch = m_pack->templates[instrument].highestNote;
}

bool
EM::inRange(int instrument, int pitch)
{
    int minPitch, maxPitch;
    rangeFor(instrument, minPitch, maxPitch);
    return (pitch >= minPitch && pitch <= maxPitch);
}

void
EM::normaliseColumn(float *column, int size)
{
    float sum = v_sum(column, size);
    v_scale(column, 1.0 / sum, size);
}

void
EM::normaliseGrid(float **grid, int size1, int size2)
{
    float *denominators = allocate_and_zero<float>(size2);

    for (int i = 0; i < size1; ++i) {
        for (int j = 0; j < size2; ++j) {
            denominators[j] += grid[i][j];
        }
    }

    for (int i = 0; i < size1; ++i) {
        v_divide(grid[i], denominators, size2);
    }

    deallocate(denominators);
}

void
EM::iterate(const double *column)
{
    float *norm = allocate<float>(m_binCount);
    v_convert(norm, column, m_binCount);
    normaliseColumn(norm, m_binCount);
    expectation(norm);
    maximisation(norm);
    deallocate(norm);
}

const float *
EM::templateFor(int instrument, int note, int shift)
{
    const float *base = m_pack->templates.at(instrument).data.at(note).data();
    if (m_shifts) {
        return base + shift;
    } else {
        return base + m_pack->templateMaxShift;
    }
}

void
EM::expectation(const float *column)
{
//    cerr << ".";

    v_set(m_estimate, epsilon, m_binCount);

    for (int f = 0; f < m_shiftCount; ++f) {

        const float *shiftIn = m_shifts ? m_shifts[f] : 0;

        for (int i = 0; i < m_sourceCount; ++i) {

            const float *sourceIn = m_sources[i];

            int lowest, highest;
            rangeFor(i, lowest, highest);

            for (int n = lowest; n <= highest; ++n) {

                const float source = sourceIn[n];
                const float shift = shiftIn ? shiftIn[n] : 1.0;
                const float pitch = m_pitches[n];

                const float factor = pitch * source * shift;
                const float *w = templateFor(i, n, f);

                v_add_with_gain(m_estimate, w, factor, m_binCount);
            }
        }
    }

    for (int i = 0; i < m_binCount; ++i) {
        m_q[i] = column[i] / m_estimate[i];
    }

/*
    double l2norm = 0.0;
    
    for (int i = 0; i < m_binCount; ++i) {
        l2norm += (column[i] - m_estimate[i]) * (column[i] - m_estimate[i]);
    }

    l2norm = sqrt(l2norm);
    cerr << "l2norm = " << l2norm << endl;
*/
}

void
EM::maximisation(const float *column)
{
    v_set(m_updatePitches, epsilon, m_noteCount);

    for (int i = 0; i < m_sourceCount; ++i) {
        v_set(m_updateSources[i], epsilon, m_noteCount);
    }

    if (m_shifts) {
        for (int i = 0; i < m_shiftCount; ++i) {
            v_set(m_updateShifts[i], epsilon, m_noteCount);
        }
    }

    float *contributions = allocate<float>(m_binCount);

    for (int f = 0; f < m_shiftCount; ++f) {

        const float *shiftIn = m_shifts ? m_shifts[f] : 0;
        float *shiftOut = m_shifts ? m_updateShifts[f] : 0;

        for (int i = 0; i < m_sourceCount; ++i) {

            const float *sourceIn = m_sources[i];
            float *sourceOut = m_updateSources[i];

            int lowest, highest;
            rangeFor(i, lowest, highest);

            for (int n = lowest; n <= highest; ++n) {

                const float shift = shiftIn ? shiftIn[n] : 1.0;
                const float source = sourceIn[n];
                const float pitch = m_pitches[n];

                const float factor = pitch * source * shift;
                const float *w = templateFor(i, n, f);

                v_copy(contributions, w, m_binCount);
                v_multiply(contributions, m_q, m_binCount);

                float total = factor * v_sum(contributions, m_binCount);

                m_updatePitches[n] += total;
                sourceOut[n] += total;

                if (shiftOut) {
                    shiftOut[n] += total;
                }
            }
        }
    }

    if (m_pitchSparsity != 1.0) {
        for (int n = 0; n < m_noteCount; ++n) {
            m_updatePitches[n] = 
                pow(m_updatePitches[n], m_pitchSparsity);
        }
    }

    if (m_sourceSparsity != 1.0) {
        for (int i = 0; i < m_sourceCount; ++i) {
            for (int n = 0; n < m_noteCount; ++n) {
                m_updateSources[i][n] =
                    pow(m_updateSources[i][n], m_sourceSparsity);
            }
        }
    }

    normaliseColumn(m_updatePitches, m_noteCount);
    std::swap(m_pitches, m_updatePitches);

    normaliseGrid(m_updateSources, m_sourceCount, m_noteCount);
    std::swap(m_sources, m_updateSources);

    if (m_shifts) {
        normaliseGrid(m_updateShifts, m_shiftCount, m_noteCount);
        std::swap(m_shifts, m_updateShifts);
    }
}