// Source: common/Matcher.cpp @ 40:40e3f0049c00 (repository web view)
//
// Changeset: Track guessing fixes
// Author:    Chris Cannam
// Date:      Tue, 06 Apr 2010 17:36:27 +0100
// Parents:   a8ab8c08a668
// Children:  (none listed)
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "Matcher.h"
#include "Objects.h"
#include "EditDistance.h"

#include <QMultiMap>

#include <iostream>

using namespace Dataquay;

namespace ClassicalData {

// Build a quick typing matcher over the given composers. The list of
// Composer pointers is kept in m_composers and consulted by match().
ComposerTypingQuickMatcher::ComposerTypingQuickMatcher(QList<Composer *> cl)
    : m_composers(cl)
{
}

// Score every composer against the given text with
// Composer::matchTypingQuick and return those that reach the threshold.
//
//   text       - text to match against each composer
//   maxResults - maximum number of guesses to return; zero or a
//                negative value means no limit
//   threshold  - composers scoring below this are discarded
//
// Guesses are returned in GuessSet iteration order (presumably
// best-first -- confirm against the ordering defined for Guess).
GuessList
ComposerTypingQuickMatcher::match(QString text, int maxResults,
                                  float threshold) const
{
    GuessSet matches;

    foreach (Composer *c, m_composers) {
        float value = c->matchTypingQuick(text);
        if (value < threshold) continue;
        matches.insert(Guess(value, c));
    }

    GuessList results;

    // Test the limit before appending, so that at most maxResults
    // guesses are returned (the check used to run after the append and
    // let one extra guess through).
    int n = 0;
    for (GuessSet::const_iterator i = matches.begin();
         i != matches.end(); ++i) {
        if (maxResults > 0 && n >= maxResults) break;
        results.push_back(*i);
        ++n;
    }

    return results;
}

// Build a thorough typing matcher over the given composers. The list of
// Composer pointers is kept in m_composers and consulted by match().
ComposerTypingThoroughMatcher::ComposerTypingThoroughMatcher(QList<Composer *> cl)
    : m_composers(cl)
{
}

// Score every composer against the given text with
// Composer::matchTyping and return those that reach the threshold.
//
//   text       - text to match against each composer
//   maxResults - maximum number of guesses to return; zero or a
//                negative value means no limit
//   threshold  - composers scoring below this are discarded
//
// Guesses are returned in GuessSet iteration order (presumably
// best-first -- confirm against the ordering defined for Guess).
GuessList
ComposerTypingThoroughMatcher::match(QString text, int maxResults,
                                     float threshold) const
{
    GuessSet matches;

    foreach (Composer *c, m_composers) {
        float value = c->matchTyping(text);
        if (value < threshold) continue;
        matches.insert(Guess(value, c));
    }

    GuessList results;

    // Test the limit before appending, so that at most maxResults
    // guesses are returned (the check used to run after the append and
    // let one extra guess through).
    int n = 0;
    for (GuessSet::const_iterator i = matches.begin();
         i != matches.end(); ++i) {
        if (maxResults > 0 && n >= maxResults) break;
        results.push_back(*i);
        ++n;
    }

    return results;
}

// Build a full-text matcher over the given composers. The list of
// Composer pointers is kept in m_composers and consulted by match().
ComposerFullTextMatcher::ComposerFullTextMatcher(QList<Composer *> cl)
    : m_composers(cl)
{
}

// Score every composer against the given text with
// Composer::matchFuzzyName and return those that reach the threshold.
//
//   text       - text to match against each composer
//   maxResults - maximum number of guesses to return; zero or a
//                negative value means no limit
//   threshold  - composers scoring below this are discarded
//
// Guesses are returned in GuessSet iteration order (presumably
// best-first -- confirm against the ordering defined for Guess).
GuessList
ComposerFullTextMatcher::match(QString text, int maxResults,
                               float threshold) const
{
    GuessSet matches;

    foreach (Composer *c, m_composers) {
        float value = c->matchFuzzyName(text);
        if (value < threshold) continue;
        matches.insert(Guess(value, c));
    }

    GuessList results;

    // Test the limit before appending, so that at most maxResults
    // guesses are returned (the check used to run after the append and
    // let one extra guess through).
    int n = 0;
    for (GuessSet::const_iterator i = matches.begin();
         i != matches.end(); ++i) {
        if (maxResults > 0 && n >= maxResults) break;
        results.push_back(*i);
        ++n;
    }

    return results;
}

// Build a catalogue-number matcher over the given works. The list of
// Work pointers is kept in m_works and consulted by match().
WorkCatalogueMatcher::WorkCatalogueMatcher(QList<Work *> wl)
    : m_works(wl)
{
}

// Guess which works a piece of text refers to, from catalogue or opus
// numbers and from "<tag> no <n>" style titles.
//
// Stage 1: each catalogue-number text that
// Work::extractCatalogueNumberTexts finds in the input is compared with
// every work's catalogue() and with "Op <opus>" / "Op <opus> no <number>"
// built from its opus() and number(). Every hit scores 1.0. If no
// catalogue-number texts are found, an empty list is returned at once.
//
// Stage 2 (only while the result limit has not been reached): a tag and
// number such as "Symphony" / "8" are taken from the start of the text,
// or from after a colon, slash or dash, and every work whose name starts
// with that tag and number scores 0.8.
//
//   text       - text to examine
//   maxResults - maximum number of guesses to return; zero or a
//                negative value means no limit
//   threshold  - not consulted by this matcher
//
// Guesses are returned in GuessSet iteration order (presumably
// best-first -- confirm against the ordering defined for Guess).
GuessList
WorkCatalogueMatcher::match(QString text, int maxResults,
                            float threshold) const
{
    (void)threshold;

    GuessList results;
    GuessSet matches;

    QStringList cats = Work::extractCatalogueNumberTexts(text);
    if (cats.empty()) return results;

    // A single definition of "limited", so that every check below
    // treats zero and negative maxResults alike as "no limit".
    const bool limited = (maxResults > 0);

    foreach (QString cat, cats) {

        // Once the limit is reached there is nothing further to gain
        // from testing the remaining catalogue texts.
        if (limited && static_cast<int>(matches.size()) >= maxResults) {
            break;
        }

        std::cerr << "testing cat \"" << cat.toStdString() << "\" against "
                  << m_works.size() << " works" << std::endl;

        foreach (Work *w, m_works) {

            if (limited && static_cast<int>(matches.size()) >= maxResults) {
                break;
            }

            QString catalogue = w->catalogue();
            if (catalogue != "" &&
                !Work::compareCatalogueNumberTexts(catalogue, cat)) {
                std::cerr << "We like: " << w->name().toStdString() << " ("
                          << catalogue.toStdString() << ")" << std::endl;
                // all catalogue matches score equal here
                matches.insert(Guess(1.f, w));
                continue;
            }

            QString opus = w->opus();
            if (opus == "") continue;

            // No catalogue hit: try the opus (and number, if the work
            // has one) written out as a catalogue-style text.
            QString number = w->number();
            QString optext;
            if (number != "") {
                optext = QString("Op %1 no %2").arg(opus).arg(number);
            } else {
                optext = QString("Op %1").arg(opus);
            }

            if (!Work::compareCatalogueNumberTexts(optext, cat)) {
                std::cerr << "We like: " << w->name().toStdString() << " ("
                          << optext.toStdString() << ")" << std::endl;
                matches.insert(Guess(1.f, w));
            }
        }
    }

    if (!limited || static_cast<int>(matches.size()) < maxResults) {

        // Now, for slightly lower marks, test for strings like
        // "Symphony no 8" at the start of the title, or after a
        // colon, slash or dash (e.g. "Brahms: Symphony no 4")

        QRegExp numberRe1("^([^\\d]+)\\s+[Nn][a-z]*\\.?\\s*(\\d+)");
        QRegExp numberRe2("[/:-]\\s*(\\w[^\\d]+)\\s+[Nn][a-z]*\\.?\\s*(\\d+)");
        QString tag, number;

        if (numberRe1.indexIn(text) >= 0) {
            tag = numberRe1.cap(1);
            number = numberRe1.cap(2);
        } else if (numberRe2.indexIn(text) >= 0) {
            tag = numberRe2.cap(1);
            number = numberRe2.cap(2);
        }

        if (tag != "") {

            std::cerr << "tag = \"" << tag.toStdString() << "\", number = \""
                      << number.toStdString() << "\"" << std::endl;

            // Strip everything but word characters and whitespace, so
            // the tag can be spliced into a regular expression as-is.
            tag.replace(QRegExp("[^\\w\\s]+"), "");
            QString matcherReStr =
                QString("^%1\\s+[Nn][a-z]*\\.?\\s*%2\\b").arg(tag).arg(number);
            QRegExp matcherRe(matcherReStr, Qt::CaseInsensitive);
            std::cerr << "matcherRe: " << matcherReStr.toStdString() << std::endl;

            // (A weaker matcher using only the initial letter of each
            // word of the tag, scoring 0.2, was tried here and dropped
            // because it did not work well.)

            foreach (Work *w, m_works) {
                if (limited && static_cast<int>(matches.size()) >= maxResults) {
                    break;
                }
                QString name = w->name();
                if (matcherRe.indexIn(name) >= 0) {
                    std::cerr << "We quite like: " << name.toStdString() << std::endl;
                    matches.insert(Guess(0.8f, w));
                }
            }
        }
    }

    // Test the limit before appending, so that at most maxResults
    // guesses are ever returned.
    int n = 0;
    for (GuessSet::const_iterator i = matches.begin();
         i != matches.end(); ++i) {
        if (limited && n >= maxResults) break;
        results.push_back(*i);
        ++n;
    }

    return results;
}

// Build a title matcher over the given works. The list of Work pointers
// is kept in m_works and consulted by match().
WorkTitleMatcher::WorkTitleMatcher(QList<Work *> wl)
    : m_works(wl)
{
}

// Guess which works a piece of text (e.g. a track title) refers to by
// comparing it with each work's name and aliases.
//
// Preparation:
//  - any digits at the very start of the text are removed;
//  - a phrase in single or double quotes is picked out if present and
//    kept when it is at least four characters long, with an edit
//    distance allowance of a quarter of its length;
//  - the text is split into words, each lower-cased and passed through
//    Composer::reduceName; reduced words are kept if they are longer
//    than three characters or begin with a digit.
//
// Scoring, per candidate name of each work (best name wins):
//  - if the name also contains a quoted phrase within the allowed edit
//    distance of the one in the text, 0.7 minus 0.1 per edit is added;
//  - each kept word of the text that occurs among the reduced words of
//    the name adds 0.1; each that does not adds a penalty of 0.101,
//    with the total penalty capped at 0.25.
// A work is reported when its best score is above zero.
//
//   text       - text to examine
//   maxResults - maximum number of guesses to return; zero or a
//                negative value means no limit
//   threshold  - not consulted by this matcher
//
// Guesses are returned in GuessSet iteration order (presumably
// best-first -- confirm against the ordering defined for Guess).
GuessList
WorkTitleMatcher::match(QString text, int maxResults,
                        float threshold) const
{
    (void)threshold;

    GuessList results;
    GuessSet matches;

    const bool limited = (maxResults > 0);

    // Throw away any initial numbers (likely to be track index)
    text.replace(QRegExp("^[0-9]+"), "");

    QString quoted;
    // Capture 2 is the phrase between the quotes; captures 1 and 3 are
    // only the surrounding context.
    QRegExp quoteRe("(^|\\s)[\"']([^\"]+)[\"']($|[^\\w])");
    int qthresh = 0;

    if (quoteRe.indexIn(text) >= 0) {
        quoted = quoteRe.cap(2);
        if (quoted.length() < 4) quoted = "";
        qthresh = quoted.length() / 4;
    }

    std::cerr << "text = " << text.toStdString() << ", quoted = "
              << quoted.toStdString() << std::endl;

    // Word separator, shared by the text and by every candidate name.
    const QRegExp nonWordRe("[^\\w]+");

    QStringList components = text.split(nonWordRe, QString::SkipEmptyParts);
    QStringList reduced;
    foreach (QString c, components) {
        QString r = Composer::reduceName(c.toLower());
        if (r != "") {
            if (r.length() > 3 || r[0].isDigit()) {
                reduced.push_back(r);
            }
        }
    }

    std::cerr << "reduced = " << reduced.join(" ").toStdString() << std::endl;

    EditDistance ed;

    foreach (Work *w, m_works) {

        // NOTE(review): this stops scanning as soon as maxResults works
        // have scored above zero, so later works with higher scores are
        // never considered -- confirm that this is intended.
        if (limited && static_cast<int>(matches.size()) >= maxResults) {
            break;
        }

        float highScore = 0.f;

        QSet<QString> names = w->aliases();
        names.insert(w->name());

        foreach (QString name, names) {

            float pro = 0.f;
            float con = 0.f;

            if (quoted != "") {
                if (quoteRe.indexIn(name) >= 0) {
                    // The quoted phrase is capture 2 (capture 1 is just
                    // the whitespace or start-of-string before the
                    // opening quote).
                    QString q = quoteRe.cap(2);
                    int dist = ed.calculate(quoted, q, qthresh);
                    if (dist < qthresh) {
                        std::cerr << "quoted name match: " << q.toStdString() << std::endl;
                        pro += 0.7f - 0.1f * dist;
                        // A new best from the quoted phrase alone is
                        // recorded and this name is done; otherwise the
                        // word scores below are added on top.
                        if (pro - con > highScore) {
                            highScore = pro - con;
                            continue;
                        }
                    }
                }
            }

            QStringList wcomp = name.split(nonWordRe, QString::SkipEmptyParts);
            QSet<QString> wr;
            foreach (QString wc, wcomp) {
                wr.insert(Composer::reduceName(wc.toLower()));
            }
            foreach (QString rc, reduced) {
                if (wr.contains(rc)) {
                    std::cerr << "component match: " << rc.toStdString() << std::endl;
                    pro += 0.1;
                } else {
                    // Slightly more than a hit is worth, capped so that
                    // long texts are not penalised without bound.
                    con += 0.101;
                    if (con > 0.25) con = 0.25;
                }
            }

            if (pro - con > highScore) highScore = pro - con;
        }

        if (highScore > 0.f) {
            std::cerr << "for " << w->name().toStdString() << " highScore = " << highScore << std::endl;
            matches.insert(Guess(highScore, w));
        }
    }

    // Test the limit before appending, so that at most maxResults
    // guesses are ever returned.
    int n = 0;
    for (GuessSet::const_iterator i = matches.begin();
         i != matches.end(); ++i) {
        if (limited && n >= maxResults) break;
        results.push_back(*i);
        ++n;
    }

    return results;
}

}