view transform/FeatureExtractionPluginTransform.cpp @ 117:c30728d5625c sv1-v0.9rc1

* Make vertical scale alignment modes work in note layer as well as time-value layer, and several significant fixes to it * Make it possible to draw notes properly on the note layer * Show units (and frequencies etc in note layer's case) in the time-value and note layer description boxes * Minor fix to item edit dialog layout * Some minor menu rearrangement * Comment out a lot of debug output * Add SV website and reference URLs to Help menu, and add code to (attempt to) open them in the user's preferred browser
author Chris Cannam
date Fri, 12 May 2006 14:40:43 +0000
parents 90ade4fa63be
children 4170b21773cf
line wrap: on
line source

/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

/*
    Sonic Visualiser
    An audio file viewer and annotation editor.
    Centre for Digital Music, Queen Mary, University of London.
    This file copyright 2006 Chris Cannam.
    
    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
    published by the Free Software Foundation; either version 2 of the
    License, or (at your option) any later version.  See the file
    COPYING included with this distribution for more information.
*/

#include "FeatureExtractionPluginTransform.h"

#include "plugin/FeatureExtractionPluginFactory.h"
#include "plugin/PluginXml.h"
#include "vamp-sdk/Plugin.h"

#include "base/Model.h"
#include "base/Window.h"
#include "model/SparseOneDimensionalModel.h"
#include "model/SparseTimeValueModel.h"
#include "model/DenseThreeDimensionalModel.h"
#include "model/DenseTimeValueModel.h"
#include "model/NoteModel.h"

#include <fftw3.h>

#include <iostream>

FeatureExtractionPluginTransform::FeatureExtractionPluginTransform(Model *inputModel,
								   QString pluginId,
                                                                   int channel,
                                                                   QString configurationXml,
								   QString outputName) :
    Transform(inputModel),
    m_plugin(0),
    m_channel(channel),
    m_stepSize(0),
    m_blockSize(0),
    m_descriptor(0),
    m_outputFeatureNo(0)
{
//    std::cerr << "FeatureExtractionPluginTransform::FeatureExtractionPluginTransform: plugin " << pluginId.toStdString() << ", outputName " << outputName.toStdString() << std::endl;

    FeatureExtractionPluginFactory *factory =
	FeatureExtractionPluginFactory::instanceFor(pluginId);

    if (!factory) {
	std::cerr << "FeatureExtractionPluginTransform: No factory available for plugin id \""
		  << pluginId.toStdString() << "\"" << std::endl;
	return;
    }

    m_plugin = factory->instantiatePlugin(pluginId, m_input->getSampleRate());

    if (!m_plugin) {
	std::cerr << "FeatureExtractionPluginTransform: Failed to instantiate plugin \""
		  << pluginId.toStdString() << "\"" << std::endl;
	return;
    }

    if (configurationXml != "") {
        PluginXml(m_plugin).setParametersFromXml(configurationXml);
    }

    m_blockSize = m_plugin->getPreferredBlockSize();
    m_stepSize = m_plugin->getPreferredStepSize();

    if (m_blockSize == 0) m_blockSize = 1024; //!!! todo: ask user
    if (m_stepSize == 0) m_stepSize = m_blockSize; //!!! likewise

    DenseTimeValueModel *input = getInput();
    if (!input) return;

    size_t channelCount = input->getChannelCount();
    if (m_plugin->getMaxChannelCount() < channelCount) {
	channelCount = 1;
    }
    if (m_plugin->getMinChannelCount() > channelCount) {
	std::cerr << "FeatureExtractionPluginTransform:: "
		  << "Can't provide enough channels to plugin (plugin min "
		  << m_plugin->getMinChannelCount() << ", max "
		  << m_plugin->getMaxChannelCount() << ", input model has "
		  << input->getChannelCount() << ")" << std::endl;
	return;
    }

    if (!m_plugin->initialise(channelCount, m_stepSize, m_blockSize)) {
        std::cerr << "FeatureExtractionPluginTransform: Plugin "
                  << m_plugin->getName() << " failed to initialise!" << std::endl;
        return;
    }

    //!!! cope with plugins that request non-power-of-2 block sizes in
    // the frequency domain!

    Vamp::Plugin::OutputList outputs = m_plugin->getOutputDescriptors();

    if (outputs.empty()) {
	std::cerr << "FeatureExtractionPluginTransform: Plugin \""
		  << pluginId.toStdString() << "\" has no outputs" << std::endl;
	return;
    }
    
    for (size_t i = 0; i < outputs.size(); ++i) {
	if (outputName == "" || outputs[i].name == outputName.toStdString()) {
	    m_outputFeatureNo = i;
	    m_descriptor = new Vamp::Plugin::OutputDescriptor
		(outputs[i]);
	    break;
	}
    }

    if (!m_descriptor) {
	std::cerr << "FeatureExtractionPluginTransform: Plugin \""
		  << pluginId.toStdString() << "\" has no output named \""
		  << outputName.toStdString() << "\"" << std::endl;
	return;
    }

//    std::cerr << "FeatureExtractionPluginTransform: output sample type "
//	      << m_descriptor->sampleType << std::endl;

    int binCount = 1;
    float minValue = 0.0, maxValue = 0.0;
    
    if (m_descriptor->hasFixedBinCount) {
	binCount = m_descriptor->binCount;
    }

//    std::cerr << "FeatureExtractionPluginTransform: output bin count "
//	      << binCount << std::endl;

    if (binCount > 0 && m_descriptor->hasKnownExtents) {
	minValue = m_descriptor->minValue;
	maxValue = m_descriptor->maxValue;
    }

    size_t modelRate = m_input->getSampleRate();
    size_t modelResolution = 1;
    
    switch (m_descriptor->sampleType) {

    case Vamp::Plugin::OutputDescriptor::VariableSampleRate:
	if (m_descriptor->sampleRate != 0.0) {
	    modelResolution = size_t(modelRate / m_descriptor->sampleRate + 0.001);
	}
	break;

    case Vamp::Plugin::OutputDescriptor::OneSamplePerStep:
	modelResolution = m_stepSize;
	break;

    case Vamp::Plugin::OutputDescriptor::FixedSampleRate:
	modelRate = size_t(m_descriptor->sampleRate + 0.001);
	break;
    }

    if (binCount == 0) {

	m_output = new SparseOneDimensionalModel(modelRate, modelResolution,
						 false);

    } else if (binCount == 1) {

        SparseTimeValueModel *model = new SparseTimeValueModel
            (modelRate, modelResolution, minValue, maxValue, false);
        model->setScaleUnits(outputs[m_outputFeatureNo].unit.c_str());

        m_output = model;

    } else if (m_descriptor->sampleType ==
	       Vamp::Plugin::OutputDescriptor::VariableSampleRate) {

        // We don't have a sparse 3D model, so interpret this as a
        // note model.  There's nothing to define which values to use
        // as which parameters of the note -- for the moment let's
        // treat the first as pitch, second as duration in frames,
        // third (if present) as velocity. (Our note model doesn't
        // yet store velocity.)
        //!!! todo: ask the user!
	
        NoteModel *model = new NoteModel
            (modelRate, modelResolution, minValue, maxValue, false);
        model->setScaleUnits(outputs[m_outputFeatureNo].unit.c_str());

        m_output = model;

    } else {
	
	m_output = new DenseThreeDimensionalModel(modelRate, modelResolution,
						  binCount, false);

	if (!m_descriptor->binNames.empty()) {
	    std::vector<QString> names;
	    for (size_t i = 0; i < m_descriptor->binNames.size(); ++i) {
		names.push_back(m_descriptor->binNames[i].c_str());
	    }
	    (dynamic_cast<DenseThreeDimensionalModel *>(m_output))
		->setBinNames(names);
	}
    }
}

FeatureExtractionPluginTransform::~FeatureExtractionPluginTransform()
{
    delete m_plugin;
    delete m_descriptor;
}

DenseTimeValueModel *
FeatureExtractionPluginTransform::getInput()
{
    DenseTimeValueModel *dtvm =
	dynamic_cast<DenseTimeValueModel *>(getInputModel());
    if (!dtvm) {
	std::cerr << "FeatureExtractionPluginTransform::getInput: WARNING: Input model is not conformable to DenseTimeValueModel" << std::endl;
    }
    return dtvm;
}

void
FeatureExtractionPluginTransform::run()
{
    DenseTimeValueModel *input = getInput();
    if (!input) return;

    if (!m_output) return;

    size_t sampleRate = m_input->getSampleRate();

    size_t channelCount = input->getChannelCount();
    if (m_plugin->getMaxChannelCount() < channelCount) {
	channelCount = 1;
    }

    float **buffers = new float*[channelCount];
    for (size_t ch = 0; ch < channelCount; ++ch) {
	buffers[ch] = new float[m_blockSize];
    }

    double *fftInput = 0;
    fftw_complex *fftOutput = 0;
    fftw_plan fftPlan = 0;
    Window<double> windower(HanningWindow, m_blockSize);

    if (m_plugin->getInputDomain() == Vamp::Plugin::FrequencyDomain) {

        fftInput = (double *)fftw_malloc(m_blockSize * sizeof(double));
        fftOutput = (fftw_complex *)fftw_malloc(m_blockSize * sizeof(fftw_complex));
        fftPlan = fftw_plan_dft_r2c_1d(m_blockSize, fftInput, fftOutput,
                                       FFTW_ESTIMATE);
        if (!fftPlan) {
            std::cerr << "ERROR: FeatureExtractionPluginTransform::run(): fftw_plan failed! Results will be garbage" << std::endl;
        }
    }

    long startFrame = m_input->getStartFrame();
    long   endFrame = m_input->getEndFrame();
    long blockFrame = startFrame;

    long prevCompletion = 0;

    while (1) {

        if (fftPlan) {
            if (blockFrame - int(m_blockSize)/2 > endFrame) break;
        } else {
            if (blockFrame >= endFrame) break;
        }

//	std::cerr << "FeatureExtractionPluginTransform::run: blockFrame "
//		  << blockFrame << std::endl;

	long completion =
	    (((blockFrame - startFrame) / m_stepSize) * 99) /
	    (   (endFrame - startFrame) / m_stepSize);

	// channelCount is either m_input->channelCount or 1

        for (size_t ch = 0; ch < channelCount; ++ch) {
            if (fftPlan) {
                getFrames(ch, channelCount, 
                          blockFrame - m_blockSize/2, m_blockSize, buffers[ch]);
            } else {
                getFrames(ch, channelCount, 
                           blockFrame, m_blockSize, buffers[ch]);
            }                
        }
        
        if (fftPlan) {
            for (size_t ch = 0; ch < channelCount; ++ch) {
                for (size_t i = 0; i < m_blockSize; ++i) {
                    fftInput[i] = buffers[ch][i];
                }
                windower.cut(fftInput);
                for (size_t i = 0; i < m_blockSize/2; ++i) {
                    double temp = fftInput[i];
                    fftInput[i] = fftInput[i + m_blockSize/2];
                    fftInput[i + m_blockSize/2] = temp;
                }
                fftw_execute(fftPlan);
                for (size_t i = 0; i < m_blockSize/2; ++i) {
                    buffers[ch][i*2] = fftOutput[i][0];
                    buffers[ch][i*2 + 1] = fftOutput[i][1];
                }
            }
        }

	Vamp::Plugin::FeatureSet features = m_plugin->process
	    (buffers, Vamp::RealTime::frame2RealTime(blockFrame, sampleRate));

	for (size_t fi = 0; fi < features[m_outputFeatureNo].size(); ++fi) {
	    Vamp::Plugin::Feature feature =
		features[m_outputFeatureNo][fi];
	    addFeature(blockFrame, feature);
	}

	if (blockFrame == startFrame || completion > prevCompletion) {
	    setCompletion(completion);
	    prevCompletion = completion;
	}

	blockFrame += m_stepSize;
    }

    if (fftPlan) {
        fftw_destroy_plan(fftPlan);
        fftw_free(fftInput);
        fftw_free(fftOutput);
    }

    Vamp::Plugin::FeatureSet features = m_plugin->getRemainingFeatures();

    for (size_t fi = 0; fi < features[m_outputFeatureNo].size(); ++fi) {
	Vamp::Plugin::Feature feature =
	    features[m_outputFeatureNo][fi];
	addFeature(blockFrame, feature);
    }

    setCompletion(100);
}

void
FeatureExtractionPluginTransform::getFrames(int channel, int channelCount,
                                            long startFrame, long size,
                                            float *buffer)
{
    long offset = 0;

    if (startFrame < 0) {
        for (int i = 0; i < size && startFrame + i < 0; ++i) {
            buffer[i] = 0.0f;
        }
        offset = -startFrame;
        size -= offset;
        if (size <= 0) return;
        startFrame = 0;
    }

    long got = getInput()->getValues
        ((channelCount == 1 ? m_channel : channel),
         startFrame, startFrame + size, buffer + offset);

    while (got < size) {
        buffer[offset + got] = 0.0;
        ++got;
    }

    if (m_channel == -1 && channelCount == 1 &&
        getInput()->getChannelCount() > 1) {
        // use mean instead of sum, as plugin input
        int cc = getInput()->getChannelCount();
        for (long i = 0; i < size; ++i) {
            buffer[i] /= cc;
        }
    }
}

void
FeatureExtractionPluginTransform::addFeature(size_t blockFrame,
					     const Vamp::Plugin::Feature &feature)
{
    size_t inputRate = m_input->getSampleRate();

//    std::cerr << "FeatureExtractionPluginTransform::addFeature("
//	      << blockFrame << ")" << std::endl;

    int binCount = 1;
    if (m_descriptor->hasFixedBinCount) {
	binCount = m_descriptor->binCount;
    }

    size_t frame = blockFrame;

    if (m_descriptor->sampleType ==
	Vamp::Plugin::OutputDescriptor::VariableSampleRate) {

	if (!feature.hasTimestamp) {
	    std::cerr
		<< "WARNING: FeatureExtractionPluginTransform::addFeature: "
		<< "Feature has variable sample rate but no timestamp!"
		<< std::endl;
	    return;
	} else {
	    frame = Vamp::RealTime::realTime2Frame(feature.timestamp, inputRate);
	}

    } else if (m_descriptor->sampleType ==
	       Vamp::Plugin::OutputDescriptor::FixedSampleRate) {

	if (feature.hasTimestamp) {
	    //!!! warning: sampleRate may be non-integral
	    frame = Vamp::RealTime::realTime2Frame(feature.timestamp,
                                                   m_descriptor->sampleRate);
	} else {
	    frame = m_output->getEndFrame() + 1;
	}
    }
	
    if (binCount == 0) {

	SparseOneDimensionalModel *model = getOutput<SparseOneDimensionalModel>();
	if (!model) return;
	model->addPoint(SparseOneDimensionalModel::Point(frame, feature.label.c_str()));
	
    } else if (binCount == 1) {

	float value = 0.0;
	if (feature.values.size() > 0) value = feature.values[0];

	SparseTimeValueModel *model = getOutput<SparseTimeValueModel>();
	if (!model) return;
	model->addPoint(SparseTimeValueModel::Point(frame, value, feature.label.c_str()));

    } else if (m_descriptor->sampleType == 
	       Vamp::Plugin::OutputDescriptor::VariableSampleRate) {

        float pitch = 0.0;
        if (feature.values.size() > 0) pitch = feature.values[0];

        float duration = 1;
        if (feature.values.size() > 1) duration = feature.values[1];
        
        float velocity = 100;
        if (feature.values.size() > 2) velocity = feature.values[2];

        NoteModel *model = getOutput<NoteModel>();
        if (!model) return;

        model->addPoint(NoteModel::Point(frame, pitch, duration, feature.label.c_str()));
	
    } else {
	
	DenseThreeDimensionalModel::BinValueSet values = feature.values;
	
	DenseThreeDimensionalModel *model = getOutput<DenseThreeDimensionalModel>();
	if (!model) return;

	model->setBinValues(frame, values);
    }
}

void
FeatureExtractionPluginTransform::setCompletion(int completion)
{
    int binCount = 1;
    if (m_descriptor->hasFixedBinCount) {
	binCount = m_descriptor->binCount;
    }

    if (binCount == 0) {

	SparseOneDimensionalModel *model = getOutput<SparseOneDimensionalModel>();
	if (!model) return;
	model->setCompletion(completion);

    } else if (binCount == 1) {

	SparseTimeValueModel *model = getOutput<SparseTimeValueModel>();
	if (!model) return;
	model->setCompletion(completion);

    } else if (m_descriptor->sampleType ==
	       Vamp::Plugin::OutputDescriptor::VariableSampleRate) {

	NoteModel *model = getOutput<NoteModel>();
	if (!model) return;
	model->setCompletion(completion);

    } else {

	DenseThreeDimensionalModel *model = getOutput<DenseThreeDimensionalModel>();
	if (!model) return;
	model->setCompletion(completion);
    }
}