Mercurial > hg > classical
view common/Matcher.cpp @ 40:40e3f0049c00
* Track guessing fixes
author | Chris Cannam |
---|---|
date | Tue, 06 Apr 2010 17:36:27 +0100 |
parents | a8ab8c08a668 |
children |
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "Matcher.h"

#include "Objects.h"
#include "EditDistance.h"

#include <QMultiMap>

#include <iostream>

using namespace Dataquay;

namespace ClassicalData {

namespace {

// Return true once a positive maxResults limit has been reached by
// the number of guesses collected so far.  A limit of zero or less
// is treated as "no limit", matching the "maxResults > 0" tests used
// throughout this file.
bool
limitReached(const GuessSet &matches, int maxResults)
{
    return maxResults > 0 &&
        static_cast<int>(matches.size()) >= maxResults;
}

// Copy guesses out of matches, in the set's own iteration order,
// stopping after maxResults entries when maxResults is positive.
// (The ordering is whatever GuessSet defines; it is declared outside
// this file.)
GuessList
collectResults(const GuessSet &matches, int maxResults)
{
    GuessList results;
    int n = 0;
    for (GuessSet::const_iterator i = matches.begin();
         i != matches.end(); ++i) {
        // Test before pushing, so that we never return more than
        // maxResults entries
        if (maxResults > 0 && n >= maxResults) break;
        results.push_back(*i);
        ++n;
    }
    return results;
}

}

ComposerTypingQuickMatcher::ComposerTypingQuickMatcher(QList<Composer *> cl) :
    m_composers(cl)
{
}

// Score each composer against text using Composer::matchTypingQuick,
// discard those scoring below threshold, and return at most
// maxResults guesses (all of them if maxResults is not positive).
GuessList
ComposerTypingQuickMatcher::match(QString text, int maxResults,
                                  float threshold) const
{
    GuessSet matches;

    foreach (Composer *c, m_composers) {
        float value = c->matchTypingQuick(text);
        if (value < threshold) continue;
        matches.insert(Guess(value, c));
    }

    return collectResults(matches, maxResults);
}

ComposerTypingThoroughMatcher::ComposerTypingThoroughMatcher(QList<Composer *> cl) :
    m_composers(cl)
{
}

// As ComposerTypingQuickMatcher::match, but scoring with
// Composer::matchTyping.
GuessList
ComposerTypingThoroughMatcher::match(QString text, int maxResults,
                                     float threshold) const
{
    GuessSet matches;

    foreach (Composer *c, m_composers) {
        float value = c->matchTyping(text);
        if (value < threshold) continue;
        matches.insert(Guess(value, c));
    }

    return collectResults(matches, maxResults);
}

ComposerFullTextMatcher::ComposerFullTextMatcher(QList<Composer *> cl) :
    m_composers(cl)
{
}

// As ComposerTypingQuickMatcher::match, but scoring with
// Composer::matchFuzzyName.
GuessList
ComposerFullTextMatcher::match(QString text, int maxResults,
                               float threshold) const
{
    GuessSet matches;

    foreach (Composer *c, m_composers) {
        float value = c->matchFuzzyName(text);
        if (value < threshold) continue;
//        std::cerr << "Liking: " << c->name().toStdString() << " (" << value << ")" << std::endl;
        matches.insert(Guess(value, c));
    }

    return collectResults(matches, maxResults);
}

WorkCatalogueMatcher::WorkCatalogueMatcher(QList<Work *> wl) :
    m_works(wl)
{
}

// Match works by catalogue or opus number found in text.
//
// Works whose catalogue text, or "Op N" / "Op N no M" text, compares
// equal to a catalogue number extracted from text score 1.0.  If the
// result limit has not then been reached, works whose name starts
// with the same "<tag> no <number>" phrase as text score 0.8.
//
// At most maxResults guesses are returned (all of them if maxResults
// is not positive).  The threshold argument is not consulted here.
GuessList
WorkCatalogueMatcher::match(QString text, int maxResults,
                            float threshold) const
{
    GuessSet matches;

    QStringList cats = Work::extractCatalogueNumberTexts(text);

    // NOTE(review): returning here also skips the "Symphony no 8"
    // style test further down when no catalogue text was found --
    // confirm that this is intended
    if (cats.empty()) return GuessList();

    foreach (QString cat, cats) {

        std::cerr << "testing cat \"" << cat.toStdString()
                  << "\" against " << m_works.size() << " works"
                  << std::endl;

        foreach (Work *w, m_works) {

            if (limitReached(matches, maxResults)) {
                break;
            }

            // compareCatalogueNumberTexts is treated throughout as
            // returning zero/false for texts that match

            QString catalogue = w->catalogue();
            if (catalogue != "") {
                if (!Work::compareCatalogueNumberTexts(catalogue, cat)) {
                    std::cerr << "We like: " << w->name().toStdString()
                              << " (" << catalogue.toStdString() << ")"
                              << std::endl;
                    // all catalogue matches score equal here
                    matches.insert(Guess(1.f, w));
                    continue;
                }
            }

            QString opus = w->opus();
            QString number = w->number();
            QString optext;

            if (opus != "") {
                if (number != "") {
                    optext = QString("Op %1 no %2").arg(opus).arg(number);
                } else {
                    optext = QString("Op %1").arg(opus);
                }
                if (!Work::compareCatalogueNumberTexts(optext, cat)) {
                    std::cerr << "We like: " << w->name().toStdString()
                              << " (" << optext.toStdString() << ")"
                              << std::endl;
                    matches.insert(Guess(1.f, w));
                }
            }
        }
    }

    if (!limitReached(matches, maxResults)) {

        // Now, for slightly lower marks, test for strings like
        // "Symphony no 8" at the start of the title, or after a
        // colon, slash or dash (e.g. "Brahms: Symphony no 4")

        QRegExp numberRe1("^([^\\d]+)\\s+[Nn][a-z]*\\.?\\s*(\\d+)");
        QRegExp numberRe2("[/:-]\\s*(\\w[^\\d]+)\\s+[Nn][a-z]*\\.?\\s*(\\d+)");

        QString tag, number;
        if (numberRe1.indexIn(text) >= 0) {
            tag = numberRe1.cap(1);
            number = numberRe1.cap(2);
        } else if (numberRe2.indexIn(text) >= 0) {
            tag = numberRe2.cap(1);
            number = numberRe2.cap(2);
        }

        if (tag != "") {

            std::cerr << "tag = \"" << tag.toStdString()
                      << "\", number = \"" << number.toStdString()
                      << "\"" << std::endl;

            // Strip everything but word characters and whitespace
            // from the tag before it is spliced into the pattern
            // below, so that it cannot contribute any regular
            // expression syntax of its own
            tag.replace(QRegExp("[^\\w\\s]+"), "");

            QString matcherReStr =
                QString("^%1\\s+[Nn][a-z]*\\.?\\s*%2\\b").arg(tag).arg(number);
            QRegExp matcherRe(matcherReStr, Qt::CaseInsensitive);
            std::cerr << "matcherRe: " << matcherReStr.toStdString()
                      << std::endl;

            // initials only
/* nah, doesn't work well
            QString weakTag;
            QRegExp initialRe("\\b(\\w)\\w*\\b");
            int ix = 0;
            while ((ix = initialRe.indexIn(tag, ix)) >= 0) {
                if (ix > 0) weakTag += "\\s+";
                weakTag += initialRe.cap(1) + "\\w*";
                ++ix;
            }

            QString weakMatcherReStr = QString("^%1\\s+[Nn][a-z]*\\.?\\s*%2\\b").arg(weakTag).arg(number);
            QRegExp weakMatcherRe(weakMatcherReStr, Qt::CaseInsensitive);
            std::cerr << "weakMatcherRe: " << weakMatcherReStr.toStdString() << std::endl;
*/

            foreach (Work *w, m_works) {

                if (limitReached(matches, maxResults)) {
                    break;
                }

                QString name = w->name();

                if (matcherRe.indexIn(name) >= 0) {
                    std::cerr << "We quite like: " << name.toStdString()
                              << std::endl;
                    matches.insert(Guess(0.8f, w));
                }
/*
                else if (weakMatcherRe.indexIn(name) >= 0) {
                    std::cerr << "We sorta like: " << name.toStdString() << std::endl;
                    matches.insert(Guess(0.2f, w));
                }
*/
            }
        }
    }

    return collectResults(matches, maxResults);
}

WorkTitleMatcher::WorkTitleMatcher(QList<Work *> wl) :
    m_works(wl)
{
}

// Match works by title.
//
// Each work is scored against text using its name and aliases, and
// the best score among them is kept:
//
//  - if text contains a quoted phrase of at least four characters
//    and the name also contains a quoted phrase, the two phrases are
//    compared by edit distance; a close enough pair scores
//    0.7 - 0.1 * distance
//
//  - otherwise the (reduced) words of text are looked up among the
//    (reduced) words of the name: each one found adds 0.1, each one
//    missing subtracts 0.101, with the total penalty capped at 0.25
//
// Works with a positive score become guesses.  Works are visited in
// list order and the scan stops as soon as maxResults guesses have
// been collected (no limit if maxResults is not positive).  The
// threshold argument is not consulted here.
GuessList
WorkTitleMatcher::match(QString text, int maxResults,
                        float threshold) const
{
    GuessSet matches;

    // Throw away any initial numbers (likely to be track index)
    text = text.replace(QRegExp("^[0-9]+"), "");

    // Capture 1 is the text preceding the opening quote; the quoted
    // phrase itself is capture 2
    QRegExp quoteRe("(^|\\s)[\"']([^\"]+)[\"']($|[^\\w])");

    QString quoted;
    int qthresh = 0;

    if (quoteRe.indexIn(text) >= 0) {
        quoted = quoteRe.cap(2);
        if (quoted.length() < 4) quoted = "";
        qthresh = quoted.length() / 4;
    }

    std::cerr << "text = " << text.toStdString() << ", quoted = "
              << quoted.toStdString() << std::endl;

    // Built once here rather than for every name tested below
    const QRegExp splitRe("[^\\w]+");

    QStringList components = text.split(splitRe, QString::SkipEmptyParts);
    QStringList reduced;
    foreach (QString c, components) {
        QString r = Composer::reduceName(c.toLower());
        if (r != "") {
            // Keep only longer words, and anything starting with a
            // digit
            if (r.length() > 3 || r[0].isDigit()) {
                reduced.push_back(r);
            }
        }
    }

    std::cerr << "reduced = " << reduced.join(" ").toStdString()
              << std::endl;

    EditDistance ed;

    foreach (Work *w, m_works) {

        if (limitReached(matches, maxResults)) {
            break;
        }

        float highScore = 0.f;

        QSet<QString> names = w->aliases();
        names.insert(w->name());

        foreach (QString name, names) {

            float pro = 0.f;
            float con = 0.f;

            if (quoted != "") {
                if (quoteRe.indexIn(name) >= 0) {
                    // Compare against the quoted phrase in the name
                    // (capture 2), as was done for the input text
                    QString q = quoteRe.cap(2);
                    int dist = ed.calculate(quoted, q, qthresh);
                    if (dist < qthresh) {
                        std::cerr << "quoted name match: "
                                  << q.toStdString() << std::endl;
                        pro += 0.7f - 0.1f * dist;
                        if (pro - con > highScore) {
                            // Best so far for this work: no need to
                            // go on to the word test for this name
                            highScore = pro - con;
                            continue;
                        }
                    }
                }
            }

            QStringList wcomp = name.split(splitRe, QString::SkipEmptyParts);
            QSet<QString> wr;
            foreach (QString wc, wcomp) {
                wr.insert(Composer::reduceName(wc.toLower()));
            }

            foreach (QString rc, reduced) {
                if (wr.contains(rc)) {
                    std::cerr << "component match: " << rc.toStdString()
                              << std::endl;
                    pro += 0.1;
                } else {
                    con += 0.101;
                    if (con > 0.25) con = 0.25;
                }
            }

            if (pro - con > highScore) highScore = pro - con;
        }

        if (highScore > 0.f) {
            std::cerr << "for " << w->name().toStdString()
                      << " highScore = " << highScore << std::endl;
            matches.insert(Guess(highScore, w));
        }
    }

    return collectResults(matches, maxResults);
}

}