view common/Matcher.cpp @ 34:271cbaf6e8d9

* First bits of works matching
author Chris Cannam
date Fri, 26 Mar 2010 13:53:31 +0000
parents 84d6acb6b3ba
children 732fb6b754fb
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "Matcher.h"
#include "Objects.h"
#include "EditDistance.h"

#include <QMultiMap>

#include <iostream>

using namespace Dataquay;

namespace ClassicalData {

// Construct a quick typing matcher; m_composers is initialised from
// the supplied list of composers, which match() later iterates over.
ComposerTypingQuickMatcher::ComposerTypingQuickMatcher(QList<Composer *> cl)
    : m_composers(cl)
{ }

// Score each composer against the given text using
// Composer::matchTypingQuick, and return guesses for those composers
// whose score is not below the threshold.
//
// text       -- the text to match against each composer
// maxResults -- if greater than zero, the maximum number of guesses
//               to return; if zero or negative, no limit is applied
// threshold  -- composers scoring below this value are discarded
//
// Guesses are returned in the iteration order of GuessSet.
GuessList
ComposerTypingQuickMatcher::match(QString text, int maxResults,
                                  float threshold) const
{
    GuessSet matches;

    foreach (Composer *c, m_composers) {
        float value = c->matchTypingQuick(text);
        if (value < threshold) continue;
        matches.insert(Guess(value, c));
    }

    GuessList results;
    int n = 0;
    for (GuessSet::const_iterator i = matches.begin();
         i != matches.end(); ++i) {
        // Test the cap before appending, so that we never return more
        // than maxResults entries (testing after the append let one
        // extra guess through)
        if (maxResults > 0 && n >= maxResults) break;
        results.push_back(*i);
        ++n;
    }

    return results;
}

// Construct a thorough typing matcher; m_composers is initialised from
// the supplied list of composers, which match() later iterates over.
ComposerTypingThoroughMatcher::ComposerTypingThoroughMatcher(QList<Composer *> cl)
    : m_composers(cl)
{ }

// Score each composer against the given text using
// Composer::matchTyping, and return guesses for those composers whose
// score is not below the threshold.
//
// text       -- the text to match against each composer
// maxResults -- if greater than zero, the maximum number of guesses
//               to return; if zero or negative, no limit is applied
// threshold  -- composers scoring below this value are discarded
//
// Guesses are returned in the iteration order of GuessSet.
GuessList
ComposerTypingThoroughMatcher::match(QString text, int maxResults,
                                     float threshold) const
{
    GuessSet matches;

    foreach (Composer *c, m_composers) {
        float value = c->matchTyping(text);
        if (value < threshold) continue;
        matches.insert(Guess(value, c));
    }

    GuessList results;
    int n = 0;
    for (GuessSet::const_iterator i = matches.begin();
         i != matches.end(); ++i) {
        // Test the cap before appending, so that we never return more
        // than maxResults entries (testing after the append let one
        // extra guess through)
        if (maxResults > 0 && n >= maxResults) break;
        results.push_back(*i);
        ++n;
    }

    return results;
}

// Construct a full-text matcher; m_composers is initialised from the
// supplied list of composers, which match() later iterates over.
ComposerFullTextMatcher::ComposerFullTextMatcher(QList<Composer *> cl)
    : m_composers(cl)
{ }

// Score each composer against the given text using
// Composer::matchFuzzyName, and return guesses for those composers
// whose score is not below the threshold.
//
// text       -- the text to match against each composer
// maxResults -- if greater than zero, the maximum number of guesses
//               to return; if zero or negative, no limit is applied
// threshold  -- composers scoring below this value are discarded
//
// Guesses are returned in the iteration order of GuessSet.
GuessList
ComposerFullTextMatcher::match(QString text, int maxResults,
                               float threshold) const
{
    GuessSet matches;

    foreach (Composer *c, m_composers) {
        float value = c->matchFuzzyName(text);
        if (value < threshold) continue;
//        std::cerr << "Liking: " << c->name().toStdString() << " (" << value << ")" << std::endl;
        matches.insert(Guess(value, c));
    }

    GuessList results;
    int n = 0;
    for (GuessSet::const_iterator i = matches.begin();
         i != matches.end(); ++i) {
        // Test the cap before appending, so that we never return more
        // than maxResults entries (testing after the append let one
        // extra guess through)
        if (maxResults > 0 && n >= maxResults) break;
//        std::cerr << "Pushing: " << i->entity()->name().toStdString() << std::endl;
        results.push_back(*i);
        ++n;
    }

    return results;
}

// Construct a catalogue-number matcher; m_works is initialised from
// the supplied list of works, which match() later iterates over.
WorkCatalogueMatcher::WorkCatalogueMatcher(QList<Work *> wl)
    : m_works(wl)
{ }

// Match works against catalogue or opus number texts found in the
// given text.
//
// Catalogue number texts are pulled out of the text with
// Work::extractCatalogueNumberTexts; if there are none, an empty list
// is returned straight away.  Otherwise each one is compared against
// every work's catalogue string and against an "Op N" / "Op N no M"
// string built from the work's opus and number.  Every such hit is
// scored 1.0.
//
// If the result cap has not been reached after that, the text is also
// tested for a "<words> no <digits>" pattern at its start (such as
// "Symphony no 8"), and works whose names start with the same pattern
// are added with a score of 0.8.
//
// text       -- the text to search for catalogue/opus/number hints
// maxResults -- if greater than zero, collection stops once this many
//               matches have been gathered
// threshold  -- not consulted by this matcher
//
// Guesses are returned in the iteration order of GuessSet.  Progress
// is logged to std::cerr.
GuessList
WorkCatalogueMatcher::match(QString text, int maxResults,
                            float threshold) const
{
    GuessList results;
    GuessSet found;

    QStringList catTexts = Work::extractCatalogueNumberTexts(text);
    if (catTexts.empty()) return results;

    foreach (QString cat, catTexts) {

        std::cerr << "testing cat \"" << cat.toStdString() << "\" against "
                  << m_works.size() << " works" << std::endl;

        foreach (Work *work, m_works) {

            if (maxResults > 0 && found.size() >= maxResults) break;

            // First preference: the work's own catalogue string
            QString catalogue = work->catalogue();
            if (catalogue != "" &&
                !Work::compareCatalogueNumberTexts(catalogue, cat)) {
                std::cerr << "We like: " << work->name().toStdString() << " ("
                          << catalogue.toStdString() << ")" << std::endl;
                // all catalogue matches score equal here
                found.insert(Guess(1.f, work));
                continue;
            }

            // Otherwise try an opus text synthesised from the work's
            // opus and (if it has one) number
            QString opus = work->opus();
            QString number = work->number();
            if (opus == "") continue;

            QString optext;
            if (number == "") {
                optext = QString("Op %1").arg(opus);
            } else {
                optext = QString("Op %1 no %2").arg(opus).arg(number);
            }

            if (!Work::compareCatalogueNumberTexts(optext, cat)) {
                std::cerr << "We like: " << work->name().toStdString() << " ("
                          << optext.toStdString() << ")" << std::endl;
                found.insert(Guess(1.f, work));
            }
        }
    }

    if (maxResults == 0 || found.size() < maxResults) {

        // There is still room, so for slightly lower marks look for
        // something like "Symphony no 8" at the start of the title

        QRegExp numberRe("^([^\\d]+)\\s+[Nn][a-z]*\\.?\\s*(\\d+)");

        if (numberRe.indexIn(text) >= 0) {

            // Strip everything but word characters and whitespace from
            // the leading tag before embedding it in a new pattern
            QString tag = numberRe.cap(1);
            tag.replace(QRegExp("[^\\w\\s]+"), "");
            QString number = numberRe.cap(2);

            QString matcherReStr =
                QString("^%1\\s+[Nn][a-z]*\\.?\\s*%2\\b").arg(tag).arg(number);
            QRegExp matcherRe(matcherReStr, Qt::CaseInsensitive);
            std::cerr << "matcherRe: " << matcherReStr.toStdString() << std::endl;

            // (A weaker variant that matched on the initial letters of
            // the tag only, scoring 0.2, was tried here and dropped
            // because it did not work well.)

            foreach (Work *work, m_works) {

                if (maxResults > 0 && found.size() >= maxResults) break;

                QString name = work->name();
                if (matcherRe.indexIn(name) < 0) continue;

                std::cerr << "We quite like: " << name.toStdString() << std::endl;
                found.insert(Guess(0.8f, work));
            }
        }
    }

    int count = 0;
    for (GuessSet::const_iterator it = found.begin();
         it != found.end(); ++it) {
        results.push_back(*it);
        if (maxResults <= 0) continue;
        if (++count > maxResults) break;
    }

    return results;
}

// Construct a title matcher; m_works is initialised from the supplied
// list of works, which match() later iterates over.
WorkTitleMatcher::WorkTitleMatcher(QList<Work *> wl)
    : m_works(wl)
{ }

// Match works by a quoted name appearing in the given text.
//
// If the text contains a quoted string of at least four characters,
// each work's name is searched for a quoted string as well, and the
// two are compared with EditDistance.  A work whose quoted name is
// within the permitted distance (a quarter of the length of the quoted
// text, rounded down, exclusive) is given a score of 0.7.
//
// text       -- the text to search for a quoted name
// maxResults -- if greater than zero, collection stops once this many
//               matches have been gathered
// threshold  -- not consulted by this matcher
//
// Guesses are returned in the iteration order of GuessSet.  Progress
// is logged to std::cerr.
GuessList
WorkTitleMatcher::match(QString text, int maxResults,
                        float threshold) const
{
    GuessList results;
    GuessSet matches;

    QString quoted;

    // The assertions outside the quote marks must be \B (not a word
    // boundary), not \b: a quote mark is itself a non-word character,
    // so \b there would require a letter or digit directly against the
    // outside of the quote.  That rejects the normal case of a quote
    // preceded by a space or the start of the string, and accepts
    // apostrophes in the middle of words instead.
    QRegExp quoteRe("\\B[\"']([^\"]+)[\"']\\B");
    int qthresh = 0;
    
    if (quoteRe.indexIn(text) >= 0) {
        quoted = quoteRe.cap(1);
        // Very short quoted strings are ignored
        if (quoted.length() < 4) quoted = "";
        qthresh = quoted.length() / 4;
    }

    std::cerr << "text = " << text.toStdString() << ", quoted = "
              << quoted.toStdString() << std::endl;

    EditDistance ed;

    foreach (Work *w, m_works) {
        if (maxResults > 0 && matches.size() >= maxResults) {
            break;
        }

        float score = 0.f;
        QString name = w->name();

        if (quoted != "") {
            if (quoteRe.indexIn(name) >= 0) {
                QString q = quoteRe.cap(1);
                int dist = ed.calculate(quoted, q, qthresh);
                if (dist < qthresh) {
                    std::cerr << "quoted name match: " << q.toStdString() << std::endl;
                    score += 0.7f;
                }
            }
        }

        if (score > 0.f) {
            matches.insert(Guess(score, w));
        }
    }
    
    int n = 0;
    for (GuessSet::const_iterator i = matches.begin();
         i != matches.end(); ++i) {
        // Test the cap before appending, so that we never return more
        // than maxResults entries
        if (maxResults > 0 && n >= maxResults) break;
        results.push_back(*i);
        ++n;
    }

    return results;
}

}