annotate common/Matcher.cpp @ 53:bcea875d8d2f tip

More build fixes
author Chris Cannam
date Thu, 16 Oct 2014 19:03:51 +0100
parents 40e3f0049c00
children
rev   line source
Chris@28 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@28 2
Chris@28 3 #include "Matcher.h"
Chris@28 4 #include "Objects.h"
Chris@34 5 #include "EditDistance.h"
Chris@28 6
Chris@28 7 #include <QMultiMap>
Chris@28 8
Chris@34 9 #include <iostream>
Chris@34 10
Chris@28 11 using namespace Dataquay;
Chris@28 12
Chris@28 13 namespace ClassicalData {
Chris@28 14
Chris@33 15 ComposerTypingQuickMatcher::ComposerTypingQuickMatcher(QList<Composer *> cl) :
Chris@33 16 m_composers(cl)
Chris@30 17 {
Chris@30 18 }
Chris@30 19
Chris@30 20 GuessList
Chris@33 21 ComposerTypingQuickMatcher::match(QString text, int maxResults,
Chris@33 22 float threshold) const
Chris@28 23 {
Chris@30 24 GuessList results;
Chris@34 25 GuessSet matches;
Chris@28 26
Chris@28 27 foreach (Composer *c, m_composers) {
Chris@28 28 float value = c->matchTypingQuick(text);
Chris@33 29 if (value < threshold) continue;
Chris@34 30 matches.insert(Guess(value, c));
Chris@28 31 }
Chris@28 32
Chris@28 33 int n = 0;
Chris@34 34 for (GuessSet::const_iterator i = matches.begin();
Chris@30 35 i != matches.end(); ++i) {
Chris@34 36 results.push_back(*i);
Chris@34 37 if (maxResults > 0 && ++n > maxResults) break;
Chris@28 38 }
Chris@28 39
Chris@28 40 return results;
Chris@28 41 }
Chris@28 42
Chris@33 43 ComposerTypingThoroughMatcher::ComposerTypingThoroughMatcher(QList<Composer *> cl) :
Chris@33 44 m_composers(cl)
Chris@30 45 {
Chris@30 46 }
Chris@30 47
Chris@30 48 GuessList
Chris@33 49 ComposerTypingThoroughMatcher::match(QString text, int maxResults,
Chris@33 50 float threshold) const
Chris@28 51 {
Chris@30 52 GuessList results;
Chris@28 53
Chris@34 54 GuessSet matches;
Chris@28 55 foreach (Composer *c, m_composers) {
Chris@28 56 float value = c->matchTyping(text);
Chris@33 57 if (value < threshold) continue;
Chris@34 58 matches.insert(Guess(value, c));
Chris@33 59 }
Chris@33 60
Chris@33 61 int n = 0;
Chris@34 62 for (GuessSet::const_iterator i = matches.begin();
Chris@33 63 i != matches.end(); ++i) {
Chris@34 64 results.push_back(*i);
Chris@34 65 if (maxResults > 0 && ++n > maxResults) break;
Chris@33 66 }
Chris@33 67
Chris@33 68 return results;
Chris@33 69 }
Chris@33 70
Chris@33 71 ComposerFullTextMatcher::ComposerFullTextMatcher(QList<Composer *> cl) :
Chris@33 72 m_composers(cl)
Chris@33 73 {
Chris@33 74 }
Chris@33 75
Chris@33 76 GuessList
Chris@33 77 ComposerFullTextMatcher::match(QString text, int maxResults,
Chris@33 78 float threshold) const
Chris@33 79 {
Chris@33 80 GuessList results;
Chris@33 81
Chris@34 82 GuessSet matches;
Chris@33 83 foreach (Composer *c, m_composers) {
Chris@33 84 float value = c->matchFuzzyName(text);
Chris@33 85 if (value < threshold) continue;
Chris@34 86 // std::cerr << "Liking: " << c->name().toStdString() << " (" << value << ")" << std::endl;
Chris@34 87 matches.insert(Guess(value, c));
Chris@28 88 }
Chris@28 89
Chris@28 90 int n = 0;
Chris@34 91 for (GuessSet::iterator i = matches.begin();
Chris@30 92 i != matches.end(); ++i) {
Chris@34 93 Guess g = *i;
Chris@34 94 results.push_back(g);
Chris@34 95 // std::cerr << "Pushing: " << g.entity()->name().toStdString() << std::endl;
Chris@34 96 if (maxResults > 0 && ++n > maxResults) break;
Chris@34 97 }
Chris@34 98
Chris@34 99 return results;
Chris@34 100 }
Chris@34 101
Chris@34 102 WorkCatalogueMatcher::WorkCatalogueMatcher(QList<Work *> wl) :
Chris@34 103 m_works(wl)
Chris@34 104 {
Chris@34 105 }
Chris@34 106
Chris@34 107 GuessList
Chris@34 108 WorkCatalogueMatcher::match(QString text, int maxResults,
Chris@34 109 float threshold) const
Chris@34 110 {
Chris@34 111 GuessList results;
Chris@34 112 GuessSet matches;
Chris@34 113 QStringList cats = Work::extractCatalogueNumberTexts(text);
Chris@34 114 if (cats.empty()) return results;
Chris@34 115 foreach (QString cat, cats) {
Chris@34 116 std::cerr << "testing cat \"" << cat.toStdString() << "\" against "
Chris@34 117 << m_works.size() << " works" << std::endl;
Chris@34 118 foreach (Work *w, m_works) {
Chris@34 119 if (maxResults > 0 && matches.size() >= maxResults) {
Chris@34 120 break;
Chris@34 121 }
Chris@34 122 QString catalogue = w->catalogue();
Chris@34 123 if (catalogue != "") {
Chris@34 124 if (!Work::compareCatalogueNumberTexts(catalogue, cat)) {
Chris@34 125 std::cerr << "We like: " << w->name().toStdString() << " ("
Chris@34 126 << catalogue.toStdString() << ")" << std::endl;
Chris@34 127 // all catalogue matches score equal here
Chris@34 128 matches.insert(Guess(1.f, w));
Chris@34 129 continue;
Chris@34 130 }
Chris@34 131 }
Chris@34 132 QString opus = w->opus();
Chris@34 133 QString number = w->number();
Chris@34 134 QString optext;
Chris@34 135 if (opus != "") {
Chris@34 136 if (number != "") {
Chris@34 137 optext = QString("Op %1 no %2").arg(opus).arg(number);
Chris@34 138 if (!Work::compareCatalogueNumberTexts(optext, cat)) {
Chris@34 139 std::cerr << "We like: " << w->name().toStdString() << " ("
Chris@34 140 << optext.toStdString() << ")" << std::endl;
Chris@34 141 matches.insert(Guess(1.f, w));
Chris@34 142 continue;
Chris@34 143 }
Chris@34 144 } else {
Chris@34 145 optext = QString("Op %1").arg(opus);
Chris@34 146 if (!Work::compareCatalogueNumberTexts(optext, cat)) {
Chris@34 147 std::cerr << "We like: " << w->name().toStdString() << " ("
Chris@34 148 << optext.toStdString() << ")" << std::endl;
Chris@34 149 matches.insert(Guess(1.f, w));
Chris@34 150 continue;
Chris@34 151 }
Chris@34 152 }
Chris@34 153 }
Chris@34 154 }
Chris@34 155 }
Chris@34 156
Chris@34 157 if (maxResults == 0 || matches.size() < maxResults) {
Chris@34 158
Chris@35 159 // Now, for slightly lower marks, test for strings like
Chris@35 160 // "Symphony no 8" at the start of the title, or after a
Chris@35 161 // colon, slash or dash (e.g. "Brahms: Symphony no 4")
Chris@34 162
Chris@35 163 QRegExp numberRe1("^([^\\d]+)\\s+[Nn][a-z]*\\.?\\s*(\\d+)");
Chris@35 164 QRegExp numberRe2("[/:-]\\s*(\\w[^\\d]+)\\s+[Nn][a-z]*\\.?\\s*(\\d+)");
Chris@35 165 QString tag, number;
Chris@34 166
Chris@35 167 if (numberRe1.indexIn(text) >= 0) {
Chris@35 168 tag = numberRe1.cap(1);
Chris@35 169 number = numberRe1.cap(2);
Chris@35 170 } else if (numberRe2.indexIn(text) >= 0) {
Chris@35 171 tag = numberRe2.cap(1);
Chris@35 172 number = numberRe2.cap(2);
Chris@35 173 }
Chris@35 174
Chris@35 175 if (tag != "") {
Chris@34 176
Chris@35 177 std::cerr << "tag = \"" << tag.toStdString() << "\", number = \""
Chris@35 178 << number.toStdString() << "\"" << std::endl;
Chris@35 179
Chris@34 180 tag.replace(QRegExp("[^\\w\\s]+"), "");
Chris@34 181 QString matcherReStr =
Chris@34 182 QString("^%1\\s+[Nn][a-z]*\\.?\\s*%2\\b").arg(tag).arg(number);
Chris@34 183 QRegExp matcherRe(matcherReStr, Qt::CaseInsensitive);
Chris@34 184 std::cerr << "matcherRe: " << matcherReStr.toStdString() << std::endl;
Chris@34 185
Chris@34 186 // initials only
Chris@34 187 /* nah, doesn't work well
Chris@34 188 QString weakTag;
Chris@34 189 QRegExp initialRe("\\b(\\w)\\w*\\b");
Chris@34 190 int ix = 0;
Chris@34 191 while ((ix = initialRe.indexIn(tag, ix)) >= 0) {
Chris@34 192 if (ix > 0) weakTag += "\\s+";
Chris@34 193 weakTag += initialRe.cap(1) + "\\w*";
Chris@34 194 ++ix;
Chris@34 195 }
Chris@34 196
Chris@34 197 QString weakMatcherReStr =
Chris@34 198 QString("^%1\\s+[Nn][a-z]*\\.?\\s*%2\\b").arg(weakTag).arg(number);
Chris@34 199 QRegExp weakMatcherRe(weakMatcherReStr, Qt::CaseInsensitive);
Chris@34 200 std::cerr << "weakMatcherRe: " << weakMatcherReStr.toStdString() << std::endl;
Chris@34 201 */
Chris@34 202 foreach (Work *w, m_works) {
Chris@34 203 if (maxResults > 0 && matches.size() >= maxResults) {
Chris@34 204 break;
Chris@34 205 }
Chris@34 206 QString name = w->name();
Chris@34 207 if (matcherRe.indexIn(name) >= 0) {
Chris@34 208 std::cerr << "We quite like: " << name.toStdString() << std::endl;
Chris@34 209 matches.insert(Guess(0.8f, w));
Chris@34 210 }
Chris@34 211 /* else if (weakMatcherRe.indexIn(name) >= 0) {
Chris@34 212 std::cerr << "We sorta like: " << name.toStdString() << std::endl;
Chris@34 213 matches.insert(Guess(0.2f, w));
Chris@34 214 }
Chris@34 215 */
Chris@34 216 }
Chris@34 217 }
Chris@34 218 }
Chris@34 219
Chris@34 220 int n = 0;
Chris@34 221 for (GuessSet::const_iterator i = matches.begin();
Chris@34 222 i != matches.end(); ++i) {
Chris@34 223 results.push_back(*i);
Chris@34 224 if (maxResults > 0 && ++n > maxResults) break;
Chris@34 225 }
Chris@34 226
Chris@34 227 return results;
Chris@34 228 }
Chris@34 229
Chris@34 230 WorkTitleMatcher::WorkTitleMatcher(QList<Work *> wl) :
Chris@34 231 m_works(wl)
Chris@34 232 {
Chris@34 233 }
Chris@34 234
Chris@34 235 GuessList
Chris@34 236 WorkTitleMatcher::match(QString text, int maxResults,
Chris@34 237 float threshold) const
Chris@34 238 {
Chris@34 239 GuessList results;
Chris@34 240 GuessSet matches;
Chris@34 241
Chris@37 242 // Throw away any initial numbers (likely to be track index)
Chris@37 243 text = text.replace(QRegExp("^[0-9]+"), "");
Chris@37 244
Chris@34 245 QString quoted;
Chris@40 246 QRegExp quoteRe("(^|\\s)[\"']([^\"]+)[\"']($|[^\\w])");
Chris@34 247 int qthresh = 0;
Chris@34 248
Chris@34 249 if (quoteRe.indexIn(text) >= 0) {
Chris@40 250 quoted = quoteRe.cap(2);
Chris@34 251 if (quoted.length() < 4) quoted = "";
Chris@34 252 qthresh = quoted.length() / 4;
Chris@34 253 }
Chris@34 254
Chris@34 255 std::cerr << "text = " << text.toStdString() << ", quoted = "
Chris@34 256 << quoted.toStdString() << std::endl;
Chris@34 257
Chris@36 258 QStringList components =
Chris@36 259 text.split(QRegExp("[^\\w]+"), QString::SkipEmptyParts);
Chris@36 260 QStringList reduced;
Chris@36 261 foreach (QString c, components) {
Chris@36 262 QString r = Composer::reduceName(c.toLower());
Chris@36 263 if (r != "") {
Chris@36 264 if (r.length() > 3 || r[0].isDigit()) {
Chris@36 265 reduced.push_back(r);
Chris@36 266 }
Chris@36 267 }
Chris@36 268 }
Chris@36 269
Chris@40 270 std::cerr << "reduced = " << reduced.join(" ").toStdString() << std::endl;
Chris@40 271
Chris@34 272 EditDistance ed;
Chris@34 273
Chris@34 274 foreach (Work *w, m_works) {
Chris@34 275 if (maxResults > 0 && matches.size() >= maxResults) {
Chris@34 276 break;
Chris@34 277 }
Chris@34 278
Chris@36 279 float highScore = 0.f;
Chris@34 280
Chris@36 281 QSet<QString> names = w->aliases();
Chris@36 282 names.insert(w->name());
Chris@36 283
Chris@36 284 foreach (QString name, names) {
Chris@36 285
Chris@40 286 float pro = 0.f;
Chris@40 287 float con = 0.f;
Chris@36 288
Chris@36 289 if (quoted != "") {
Chris@36 290 if (quoteRe.indexIn(name) >= 0) {
Chris@36 291 QString q = quoteRe.cap(1);
Chris@36 292 int dist = ed.calculate(quoted, q, qthresh);
Chris@36 293 if (dist < qthresh) {
Chris@36 294 std::cerr << "quoted name match: " << q.toStdString() << std::endl;
Chris@40 295 pro += 0.7f - 0.1f * dist;
Chris@40 296 if (pro - con > highScore) {
Chris@40 297 highScore = pro - con;
Chris@36 298 continue;
Chris@36 299 }
Chris@36 300 }
Chris@34 301 }
Chris@34 302 }
Chris@34 303
Chris@36 304 QStringList wcomp =
Chris@36 305 name.split(QRegExp("[^\\w]+"), QString::SkipEmptyParts);
Chris@36 306 QSet<QString> wr;
Chris@36 307 foreach (QString wc, wcomp) {
Chris@36 308 wr.insert(Composer::reduceName(wc.toLower()));
Chris@36 309 }
Chris@36 310 foreach (QString rc, reduced) {
Chris@36 311 if (wr.contains(rc)) {
Chris@36 312 std::cerr << "component match: " << rc.toStdString() << std::endl;
Chris@40 313 pro += 0.1;
Chris@36 314 } else {
Chris@40 315 con += 0.101;
Chris@40 316 if (con > 0.25) con = 0.25;
Chris@36 317 }
Chris@36 318 }
Chris@36 319
Chris@40 320 if (pro - con > highScore) highScore = pro - con;
Chris@36 321 }
Chris@36 322
Chris@36 323 if (highScore > 0.f) {
Chris@36 324 std::cerr << "for " << w->name().toStdString() << " highScore = " << highScore << std::endl;
Chris@36 325 matches.insert(Guess(highScore, w));
Chris@34 326 }
Chris@34 327 }
Chris@34 328
Chris@34 329 int n = 0;
Chris@34 330 for (GuessSet::const_iterator i = matches.begin();
Chris@34 331 i != matches.end(); ++i) {
Chris@34 332 results.push_back(*i);
Chris@34 333 if (maxResults > 0 && ++n > maxResults) break;
Chris@28 334 }
Chris@28 335
Chris@28 336 return results;
Chris@28 337 }
Chris@28 338
Chris@28 339 }
Chris@28 340