Chris@28: /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ Chris@28: Chris@28: #include "Matcher.h" Chris@28: #include "Objects.h" Chris@34: #include "EditDistance.h" Chris@28: Chris@28: #include Chris@28: Chris@34: #include Chris@34: Chris@28: using namespace Dataquay; Chris@28: Chris@28: namespace ClassicalData { Chris@28: Chris@33: ComposerTypingQuickMatcher::ComposerTypingQuickMatcher(QList cl) : Chris@33: m_composers(cl) Chris@30: { Chris@30: } Chris@30: Chris@30: GuessList Chris@33: ComposerTypingQuickMatcher::match(QString text, int maxResults, Chris@33: float threshold) const Chris@28: { Chris@30: GuessList results; Chris@34: GuessSet matches; Chris@28: Chris@28: foreach (Composer *c, m_composers) { Chris@28: float value = c->matchTypingQuick(text); Chris@33: if (value < threshold) continue; Chris@34: matches.insert(Guess(value, c)); Chris@28: } Chris@28: Chris@28: int n = 0; Chris@34: for (GuessSet::const_iterator i = matches.begin(); Chris@30: i != matches.end(); ++i) { Chris@34: results.push_back(*i); Chris@34: if (maxResults > 0 && ++n > maxResults) break; Chris@28: } Chris@28: Chris@28: return results; Chris@28: } Chris@28: Chris@33: ComposerTypingThoroughMatcher::ComposerTypingThoroughMatcher(QList cl) : Chris@33: m_composers(cl) Chris@30: { Chris@30: } Chris@30: Chris@30: GuessList Chris@33: ComposerTypingThoroughMatcher::match(QString text, int maxResults, Chris@33: float threshold) const Chris@28: { Chris@30: GuessList results; Chris@28: Chris@34: GuessSet matches; Chris@28: foreach (Composer *c, m_composers) { Chris@28: float value = c->matchTyping(text); Chris@33: if (value < threshold) continue; Chris@34: matches.insert(Guess(value, c)); Chris@33: } Chris@33: Chris@33: int n = 0; Chris@34: for (GuessSet::const_iterator i = matches.begin(); Chris@33: i != matches.end(); ++i) { Chris@34: results.push_back(*i); Chris@34: if (maxResults > 0 && ++n > maxResults) break; Chris@33: } Chris@33: Chris@33: return results; Chris@33: } Chris@33: Chris@33: ComposerFullTextMatcher::ComposerFullTextMatcher(QList cl) : Chris@33: m_composers(cl) Chris@33: { Chris@33: } Chris@33: Chris@33: GuessList Chris@33: ComposerFullTextMatcher::match(QString text, int maxResults, Chris@33: float threshold) const Chris@33: { Chris@33: GuessList results; Chris@33: Chris@34: GuessSet matches; Chris@33: foreach (Composer *c, m_composers) { Chris@33: float value = c->matchFuzzyName(text); Chris@33: if (value < threshold) continue; Chris@34: // std::cerr << "Liking: " << c->name().toStdString() << " (" << value << ")" << std::endl; Chris@34: matches.insert(Guess(value, c)); Chris@28: } Chris@28: Chris@28: int n = 0; Chris@34: for (GuessSet::iterator i = matches.begin(); Chris@30: i != matches.end(); ++i) { Chris@34: Guess g = *i; Chris@34: results.push_back(g); Chris@34: // std::cerr << "Pushing: " << g.entity()->name().toStdString() << std::endl; Chris@34: if (maxResults > 0 && ++n > maxResults) break; Chris@34: } Chris@34: Chris@34: return results; Chris@34: } Chris@34: Chris@34: WorkCatalogueMatcher::WorkCatalogueMatcher(QList wl) : Chris@34: m_works(wl) Chris@34: { Chris@34: } Chris@34: Chris@34: GuessList Chris@34: WorkCatalogueMatcher::match(QString text, int maxResults, Chris@34: float threshold) const Chris@34: { Chris@34: GuessList results; Chris@34: GuessSet matches; Chris@34: QStringList cats = Work::extractCatalogueNumberTexts(text); Chris@34: if (cats.empty()) return results; Chris@34: foreach (QString cat, cats) { Chris@34: std::cerr << "testing cat \"" << cat.toStdString() << "\" against " Chris@34: << m_works.size() << " works" << std::endl; Chris@34: foreach (Work *w, m_works) { Chris@34: if (maxResults > 0 && matches.size() >= maxResults) { Chris@34: break; Chris@34: } Chris@34: QString catalogue = w->catalogue(); Chris@34: if (catalogue != "") { Chris@34: if (!Work::compareCatalogueNumberTexts(catalogue, cat)) { Chris@34: std::cerr << "We like: " << w->name().toStdString() << " (" Chris@34: << catalogue.toStdString() << ")" << std::endl; Chris@34: // all catalogue matches score equal here Chris@34: matches.insert(Guess(1.f, w)); Chris@34: continue; Chris@34: } Chris@34: } Chris@34: QString opus = w->opus(); Chris@34: QString number = w->number(); Chris@34: QString optext; Chris@34: if (opus != "") { Chris@34: if (number != "") { Chris@34: optext = QString("Op %1 no %2").arg(opus).arg(number); Chris@34: if (!Work::compareCatalogueNumberTexts(optext, cat)) { Chris@34: std::cerr << "We like: " << w->name().toStdString() << " (" Chris@34: << optext.toStdString() << ")" << std::endl; Chris@34: matches.insert(Guess(1.f, w)); Chris@34: continue; Chris@34: } Chris@34: } else { Chris@34: optext = QString("Op %1").arg(opus); Chris@34: if (!Work::compareCatalogueNumberTexts(optext, cat)) { Chris@34: std::cerr << "We like: " << w->name().toStdString() << " (" Chris@34: << optext.toStdString() << ")" << std::endl; Chris@34: matches.insert(Guess(1.f, w)); Chris@34: continue; Chris@34: } Chris@34: } Chris@34: } Chris@34: } Chris@34: } Chris@34: Chris@34: if (maxResults == 0 || matches.size() < maxResults) { Chris@34: Chris@35: // Now, for slightly lower marks, test for strings like Chris@35: // "Symphony no 8" at the start of the title, or after a Chris@35: // colon, slash or dash (e.g. "Brahms: Symphony no 4") Chris@34: Chris@35: QRegExp numberRe1("^([^\\d]+)\\s+[Nn][a-z]*\\.?\\s*(\\d+)"); Chris@35: QRegExp numberRe2("[/:-]\\s*(\\w[^\\d]+)\\s+[Nn][a-z]*\\.?\\s*(\\d+)"); Chris@35: QString tag, number; Chris@34: Chris@35: if (numberRe1.indexIn(text) >= 0) { Chris@35: tag = numberRe1.cap(1); Chris@35: number = numberRe1.cap(2); Chris@35: } else if (numberRe2.indexIn(text) >= 0) { Chris@35: tag = numberRe2.cap(1); Chris@35: number = numberRe2.cap(2); Chris@35: } Chris@35: Chris@35: if (tag != "") { Chris@34: Chris@35: std::cerr << "tag = \"" << tag.toStdString() << "\", number = \"" Chris@35: << number.toStdString() << "\"" << std::endl; Chris@35: Chris@34: tag.replace(QRegExp("[^\\w\\s]+"), ""); Chris@34: QString matcherReStr = Chris@34: QString("^%1\\s+[Nn][a-z]*\\.?\\s*%2\\b").arg(tag).arg(number); Chris@34: QRegExp matcherRe(matcherReStr, Qt::CaseInsensitive); Chris@34: std::cerr << "matcherRe: " << matcherReStr.toStdString() << std::endl; Chris@34: Chris@34: // initials only Chris@34: /* nah, doesn't work well Chris@34: QString weakTag; Chris@34: QRegExp initialRe("\\b(\\w)\\w*\\b"); Chris@34: int ix = 0; Chris@34: while ((ix = initialRe.indexIn(tag, ix)) >= 0) { Chris@34: if (ix > 0) weakTag += "\\s+"; Chris@34: weakTag += initialRe.cap(1) + "\\w*"; Chris@34: ++ix; Chris@34: } Chris@34: Chris@34: QString weakMatcherReStr = Chris@34: QString("^%1\\s+[Nn][a-z]*\\.?\\s*%2\\b").arg(weakTag).arg(number); Chris@34: QRegExp weakMatcherRe(weakMatcherReStr, Qt::CaseInsensitive); Chris@34: std::cerr << "weakMatcherRe: " << weakMatcherReStr.toStdString() << std::endl; Chris@34: */ Chris@34: foreach (Work *w, m_works) { Chris@34: if (maxResults > 0 && matches.size() >= maxResults) { Chris@34: break; Chris@34: } Chris@34: QString name = w->name(); Chris@34: if (matcherRe.indexIn(name) >= 0) { Chris@34: std::cerr << "We quite like: " << name.toStdString() << std::endl; Chris@34: matches.insert(Guess(0.8f, w)); Chris@34: } Chris@34: /* else if (weakMatcherRe.indexIn(name) >= 0) { Chris@34: std::cerr << "We sorta like: " << name.toStdString() << std::endl; Chris@34: matches.insert(Guess(0.2f, w)); Chris@34: } Chris@34: */ Chris@34: } Chris@34: } Chris@34: } Chris@34: Chris@34: int n = 0; Chris@34: for (GuessSet::const_iterator i = matches.begin(); Chris@34: i != matches.end(); ++i) { Chris@34: results.push_back(*i); Chris@34: if (maxResults > 0 && ++n > maxResults) break; Chris@34: } Chris@34: Chris@34: return results; Chris@34: } Chris@34: Chris@34: WorkTitleMatcher::WorkTitleMatcher(QList wl) : Chris@34: m_works(wl) Chris@34: { Chris@34: } Chris@34: Chris@34: GuessList Chris@34: WorkTitleMatcher::match(QString text, int maxResults, Chris@34: float threshold) const Chris@34: { Chris@34: GuessList results; Chris@34: GuessSet matches; Chris@34: Chris@37: // Throw away any initial numbers (likely to be track index) Chris@37: text = text.replace(QRegExp("^[0-9]+"), ""); Chris@37: Chris@34: QString quoted; Chris@40: QRegExp quoteRe("(^|\\s)[\"']([^\"]+)[\"']($|[^\\w])"); Chris@34: int qthresh = 0; Chris@34: Chris@34: if (quoteRe.indexIn(text) >= 0) { Chris@40: quoted = quoteRe.cap(2); Chris@34: if (quoted.length() < 4) quoted = ""; Chris@34: qthresh = quoted.length() / 4; Chris@34: } Chris@34: Chris@34: std::cerr << "text = " << text.toStdString() << ", quoted = " Chris@34: << quoted.toStdString() << std::endl; Chris@34: Chris@36: QStringList components = Chris@36: text.split(QRegExp("[^\\w]+"), QString::SkipEmptyParts); Chris@36: QStringList reduced; Chris@36: foreach (QString c, components) { Chris@36: QString r = Composer::reduceName(c.toLower()); Chris@36: if (r != "") { Chris@36: if (r.length() > 3 || r[0].isDigit()) { Chris@36: reduced.push_back(r); Chris@36: } Chris@36: } Chris@36: } Chris@36: Chris@40: std::cerr << "reduced = " << reduced.join(" ").toStdString() << std::endl; Chris@40: Chris@34: EditDistance ed; Chris@34: Chris@34: foreach (Work *w, m_works) { Chris@34: if (maxResults > 0 && matches.size() >= maxResults) { Chris@34: break; Chris@34: } Chris@34: Chris@36: float highScore = 0.f; Chris@34: Chris@36: QSet names = w->aliases(); Chris@36: names.insert(w->name()); Chris@36: Chris@36: foreach (QString name, names) { Chris@36: Chris@40: float pro = 0.f; Chris@40: float con = 0.f; Chris@36: Chris@36: if (quoted != "") { Chris@36: if (quoteRe.indexIn(name) >= 0) { Chris@36: QString q = quoteRe.cap(1); Chris@36: int dist = ed.calculate(quoted, q, qthresh); Chris@36: if (dist < qthresh) { Chris@36: std::cerr << "quoted name match: " << q.toStdString() << std::endl; Chris@40: pro += 0.7f - 0.1f * dist; Chris@40: if (pro - con > highScore) { Chris@40: highScore = pro - con; Chris@36: continue; Chris@36: } Chris@36: } Chris@34: } Chris@34: } Chris@34: Chris@36: QStringList wcomp = Chris@36: name.split(QRegExp("[^\\w]+"), QString::SkipEmptyParts); Chris@36: QSet wr; Chris@36: foreach (QString wc, wcomp) { Chris@36: wr.insert(Composer::reduceName(wc.toLower())); Chris@36: } Chris@36: foreach (QString rc, reduced) { Chris@36: if (wr.contains(rc)) { Chris@36: std::cerr << "component match: " << rc.toStdString() << std::endl; Chris@40: pro += 0.1; Chris@36: } else { Chris@40: con += 0.101; Chris@40: if (con > 0.25) con = 0.25; Chris@36: } Chris@36: } Chris@36: Chris@40: if (pro - con > highScore) highScore = pro - con; Chris@36: } Chris@36: Chris@36: if (highScore > 0.f) { Chris@36: std::cerr << "for " << w->name().toStdString() << " highScore = " << highScore << std::endl; Chris@36: matches.insert(Guess(highScore, w)); Chris@34: } Chris@34: } Chris@34: Chris@34: int n = 0; Chris@34: for (GuessSet::const_iterator i = matches.begin(); Chris@34: i != matches.end(); ++i) { Chris@34: results.push_back(*i); Chris@34: if (maxResults > 0 && ++n > maxResults) break; Chris@28: } Chris@28: Chris@28: return results; Chris@28: } Chris@28: Chris@28: } Chris@28: