Chris@28
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@28
|
2
|
Chris@28
|
3 #include "Matcher.h"
|
Chris@28
|
4 #include "Objects.h"
|
Chris@34
|
5 #include "EditDistance.h"
|
Chris@28
|
6
|
Chris@28
|
7 #include <QMultiMap>
|
Chris@28
|
8
|
Chris@34
|
9 #include <iostream>
|
Chris@34
|
10
|
Chris@28
|
11 using namespace Dataquay;
|
Chris@28
|
12
|
Chris@28
|
13 namespace ClassicalData {
|
Chris@28
|
14
|
Chris@33
|
15 ComposerTypingQuickMatcher::ComposerTypingQuickMatcher(QList<Composer *> cl) :
|
Chris@33
|
16 m_composers(cl)
|
Chris@30
|
17 {
|
Chris@30
|
18 }
|
Chris@30
|
19
|
Chris@30
|
20 GuessList
|
Chris@33
|
21 ComposerTypingQuickMatcher::match(QString text, int maxResults,
|
Chris@33
|
22 float threshold) const
|
Chris@28
|
23 {
|
Chris@30
|
24 GuessList results;
|
Chris@34
|
25 GuessSet matches;
|
Chris@28
|
26
|
Chris@28
|
27 foreach (Composer *c, m_composers) {
|
Chris@28
|
28 float value = c->matchTypingQuick(text);
|
Chris@33
|
29 if (value < threshold) continue;
|
Chris@34
|
30 matches.insert(Guess(value, c));
|
Chris@28
|
31 }
|
Chris@28
|
32
|
Chris@28
|
33 int n = 0;
|
Chris@34
|
34 for (GuessSet::const_iterator i = matches.begin();
|
Chris@30
|
35 i != matches.end(); ++i) {
|
Chris@34
|
36 results.push_back(*i);
|
Chris@34
|
37 if (maxResults > 0 && ++n > maxResults) break;
|
Chris@28
|
38 }
|
Chris@28
|
39
|
Chris@28
|
40 return results;
|
Chris@28
|
41 }
|
Chris@28
|
42
|
Chris@33
|
43 ComposerTypingThoroughMatcher::ComposerTypingThoroughMatcher(QList<Composer *> cl) :
|
Chris@33
|
44 m_composers(cl)
|
Chris@30
|
45 {
|
Chris@30
|
46 }
|
Chris@30
|
47
|
Chris@30
|
48 GuessList
|
Chris@33
|
49 ComposerTypingThoroughMatcher::match(QString text, int maxResults,
|
Chris@33
|
50 float threshold) const
|
Chris@28
|
51 {
|
Chris@30
|
52 GuessList results;
|
Chris@28
|
53
|
Chris@34
|
54 GuessSet matches;
|
Chris@28
|
55 foreach (Composer *c, m_composers) {
|
Chris@28
|
56 float value = c->matchTyping(text);
|
Chris@33
|
57 if (value < threshold) continue;
|
Chris@34
|
58 matches.insert(Guess(value, c));
|
Chris@33
|
59 }
|
Chris@33
|
60
|
Chris@33
|
61 int n = 0;
|
Chris@34
|
62 for (GuessSet::const_iterator i = matches.begin();
|
Chris@33
|
63 i != matches.end(); ++i) {
|
Chris@34
|
64 results.push_back(*i);
|
Chris@34
|
65 if (maxResults > 0 && ++n > maxResults) break;
|
Chris@33
|
66 }
|
Chris@33
|
67
|
Chris@33
|
68 return results;
|
Chris@33
|
69 }
|
Chris@33
|
70
|
Chris@33
|
71 ComposerFullTextMatcher::ComposerFullTextMatcher(QList<Composer *> cl) :
|
Chris@33
|
72 m_composers(cl)
|
Chris@33
|
73 {
|
Chris@33
|
74 }
|
Chris@33
|
75
|
Chris@33
|
76 GuessList
|
Chris@33
|
77 ComposerFullTextMatcher::match(QString text, int maxResults,
|
Chris@33
|
78 float threshold) const
|
Chris@33
|
79 {
|
Chris@33
|
80 GuessList results;
|
Chris@33
|
81
|
Chris@34
|
82 GuessSet matches;
|
Chris@33
|
83 foreach (Composer *c, m_composers) {
|
Chris@33
|
84 float value = c->matchFuzzyName(text);
|
Chris@33
|
85 if (value < threshold) continue;
|
Chris@34
|
86 // std::cerr << "Liking: " << c->name().toStdString() << " (" << value << ")" << std::endl;
|
Chris@34
|
87 matches.insert(Guess(value, c));
|
Chris@28
|
88 }
|
Chris@28
|
89
|
Chris@28
|
90 int n = 0;
|
Chris@34
|
91 for (GuessSet::iterator i = matches.begin();
|
Chris@30
|
92 i != matches.end(); ++i) {
|
Chris@34
|
93 Guess g = *i;
|
Chris@34
|
94 results.push_back(g);
|
Chris@34
|
95 // std::cerr << "Pushing: " << g.entity()->name().toStdString() << std::endl;
|
Chris@34
|
96 if (maxResults > 0 && ++n > maxResults) break;
|
Chris@34
|
97 }
|
Chris@34
|
98
|
Chris@34
|
99 return results;
|
Chris@34
|
100 }
|
Chris@34
|
101
|
Chris@34
|
102 WorkCatalogueMatcher::WorkCatalogueMatcher(QList<Work *> wl) :
|
Chris@34
|
103 m_works(wl)
|
Chris@34
|
104 {
|
Chris@34
|
105 }
|
Chris@34
|
106
|
Chris@34
|
107 GuessList
|
Chris@34
|
108 WorkCatalogueMatcher::match(QString text, int maxResults,
|
Chris@34
|
109 float threshold) const
|
Chris@34
|
110 {
|
Chris@34
|
111 GuessList results;
|
Chris@34
|
112 GuessSet matches;
|
Chris@34
|
113 QStringList cats = Work::extractCatalogueNumberTexts(text);
|
Chris@34
|
114 if (cats.empty()) return results;
|
Chris@34
|
115 foreach (QString cat, cats) {
|
Chris@34
|
116 std::cerr << "testing cat \"" << cat.toStdString() << "\" against "
|
Chris@34
|
117 << m_works.size() << " works" << std::endl;
|
Chris@34
|
118 foreach (Work *w, m_works) {
|
Chris@34
|
119 if (maxResults > 0 && matches.size() >= maxResults) {
|
Chris@34
|
120 break;
|
Chris@34
|
121 }
|
Chris@34
|
122 QString catalogue = w->catalogue();
|
Chris@34
|
123 if (catalogue != "") {
|
Chris@34
|
124 if (!Work::compareCatalogueNumberTexts(catalogue, cat)) {
|
Chris@34
|
125 std::cerr << "We like: " << w->name().toStdString() << " ("
|
Chris@34
|
126 << catalogue.toStdString() << ")" << std::endl;
|
Chris@34
|
127 // all catalogue matches score equal here
|
Chris@34
|
128 matches.insert(Guess(1.f, w));
|
Chris@34
|
129 continue;
|
Chris@34
|
130 }
|
Chris@34
|
131 }
|
Chris@34
|
132 QString opus = w->opus();
|
Chris@34
|
133 QString number = w->number();
|
Chris@34
|
134 QString optext;
|
Chris@34
|
135 if (opus != "") {
|
Chris@34
|
136 if (number != "") {
|
Chris@34
|
137 optext = QString("Op %1 no %2").arg(opus).arg(number);
|
Chris@34
|
138 if (!Work::compareCatalogueNumberTexts(optext, cat)) {
|
Chris@34
|
139 std::cerr << "We like: " << w->name().toStdString() << " ("
|
Chris@34
|
140 << optext.toStdString() << ")" << std::endl;
|
Chris@34
|
141 matches.insert(Guess(1.f, w));
|
Chris@34
|
142 continue;
|
Chris@34
|
143 }
|
Chris@34
|
144 } else {
|
Chris@34
|
145 optext = QString("Op %1").arg(opus);
|
Chris@34
|
146 if (!Work::compareCatalogueNumberTexts(optext, cat)) {
|
Chris@34
|
147 std::cerr << "We like: " << w->name().toStdString() << " ("
|
Chris@34
|
148 << optext.toStdString() << ")" << std::endl;
|
Chris@34
|
149 matches.insert(Guess(1.f, w));
|
Chris@34
|
150 continue;
|
Chris@34
|
151 }
|
Chris@34
|
152 }
|
Chris@34
|
153 }
|
Chris@34
|
154 }
|
Chris@34
|
155 }
|
Chris@34
|
156
|
Chris@34
|
157 if (maxResults == 0 || matches.size() < maxResults) {
|
Chris@34
|
158
|
Chris@35
|
159 // Now, for slightly lower marks, test for strings like
|
Chris@35
|
160 // "Symphony no 8" at the start of the title, or after a
|
Chris@35
|
161 // colon, slash or dash (e.g. "Brahms: Symphony no 4")
|
Chris@34
|
162
|
Chris@35
|
163 QRegExp numberRe1("^([^\\d]+)\\s+[Nn][a-z]*\\.?\\s*(\\d+)");
|
Chris@35
|
164 QRegExp numberRe2("[/:-]\\s*(\\w[^\\d]+)\\s+[Nn][a-z]*\\.?\\s*(\\d+)");
|
Chris@35
|
165 QString tag, number;
|
Chris@34
|
166
|
Chris@35
|
167 if (numberRe1.indexIn(text) >= 0) {
|
Chris@35
|
168 tag = numberRe1.cap(1);
|
Chris@35
|
169 number = numberRe1.cap(2);
|
Chris@35
|
170 } else if (numberRe2.indexIn(text) >= 0) {
|
Chris@35
|
171 tag = numberRe2.cap(1);
|
Chris@35
|
172 number = numberRe2.cap(2);
|
Chris@35
|
173 }
|
Chris@35
|
174
|
Chris@35
|
175 if (tag != "") {
|
Chris@34
|
176
|
Chris@35
|
177 std::cerr << "tag = \"" << tag.toStdString() << "\", number = \""
|
Chris@35
|
178 << number.toStdString() << "\"" << std::endl;
|
Chris@35
|
179
|
Chris@34
|
180 tag.replace(QRegExp("[^\\w\\s]+"), "");
|
Chris@34
|
181 QString matcherReStr =
|
Chris@34
|
182 QString("^%1\\s+[Nn][a-z]*\\.?\\s*%2\\b").arg(tag).arg(number);
|
Chris@34
|
183 QRegExp matcherRe(matcherReStr, Qt::CaseInsensitive);
|
Chris@34
|
184 std::cerr << "matcherRe: " << matcherReStr.toStdString() << std::endl;
|
Chris@34
|
185
|
Chris@34
|
186 // initials only
|
Chris@34
|
187 /* nah, doesn't work well
|
Chris@34
|
188 QString weakTag;
|
Chris@34
|
189 QRegExp initialRe("\\b(\\w)\\w*\\b");
|
Chris@34
|
190 int ix = 0;
|
Chris@34
|
191 while ((ix = initialRe.indexIn(tag, ix)) >= 0) {
|
Chris@34
|
192 if (ix > 0) weakTag += "\\s+";
|
Chris@34
|
193 weakTag += initialRe.cap(1) + "\\w*";
|
Chris@34
|
194 ++ix;
|
Chris@34
|
195 }
|
Chris@34
|
196
|
Chris@34
|
197 QString weakMatcherReStr =
|
Chris@34
|
198 QString("^%1\\s+[Nn][a-z]*\\.?\\s*%2\\b").arg(weakTag).arg(number);
|
Chris@34
|
199 QRegExp weakMatcherRe(weakMatcherReStr, Qt::CaseInsensitive);
|
Chris@34
|
200 std::cerr << "weakMatcherRe: " << weakMatcherReStr.toStdString() << std::endl;
|
Chris@34
|
201 */
|
Chris@34
|
202 foreach (Work *w, m_works) {
|
Chris@34
|
203 if (maxResults > 0 && matches.size() >= maxResults) {
|
Chris@34
|
204 break;
|
Chris@34
|
205 }
|
Chris@34
|
206 QString name = w->name();
|
Chris@34
|
207 if (matcherRe.indexIn(name) >= 0) {
|
Chris@34
|
208 std::cerr << "We quite like: " << name.toStdString() << std::endl;
|
Chris@34
|
209 matches.insert(Guess(0.8f, w));
|
Chris@34
|
210 }
|
Chris@34
|
211 /* else if (weakMatcherRe.indexIn(name) >= 0) {
|
Chris@34
|
212 std::cerr << "We sorta like: " << name.toStdString() << std::endl;
|
Chris@34
|
213 matches.insert(Guess(0.2f, w));
|
Chris@34
|
214 }
|
Chris@34
|
215 */
|
Chris@34
|
216 }
|
Chris@34
|
217 }
|
Chris@34
|
218 }
|
Chris@34
|
219
|
Chris@34
|
220 int n = 0;
|
Chris@34
|
221 for (GuessSet::const_iterator i = matches.begin();
|
Chris@34
|
222 i != matches.end(); ++i) {
|
Chris@34
|
223 results.push_back(*i);
|
Chris@34
|
224 if (maxResults > 0 && ++n > maxResults) break;
|
Chris@34
|
225 }
|
Chris@34
|
226
|
Chris@34
|
227 return results;
|
Chris@34
|
228 }
|
Chris@34
|
229
|
Chris@34
|
230 WorkTitleMatcher::WorkTitleMatcher(QList<Work *> wl) :
|
Chris@34
|
231 m_works(wl)
|
Chris@34
|
232 {
|
Chris@34
|
233 }
|
Chris@34
|
234
|
Chris@34
|
235 GuessList
|
Chris@34
|
236 WorkTitleMatcher::match(QString text, int maxResults,
|
Chris@34
|
237 float threshold) const
|
Chris@34
|
238 {
|
Chris@34
|
239 GuessList results;
|
Chris@34
|
240 GuessSet matches;
|
Chris@34
|
241
|
Chris@37
|
242 // Throw away any initial numbers (likely to be track index)
|
Chris@37
|
243 text = text.replace(QRegExp("^[0-9]+"), "");
|
Chris@37
|
244
|
Chris@34
|
245 QString quoted;
|
Chris@40
|
246 QRegExp quoteRe("(^|\\s)[\"']([^\"]+)[\"']($|[^\\w])");
|
Chris@34
|
247 int qthresh = 0;
|
Chris@34
|
248
|
Chris@34
|
249 if (quoteRe.indexIn(text) >= 0) {
|
Chris@40
|
250 quoted = quoteRe.cap(2);
|
Chris@34
|
251 if (quoted.length() < 4) quoted = "";
|
Chris@34
|
252 qthresh = quoted.length() / 4;
|
Chris@34
|
253 }
|
Chris@34
|
254
|
Chris@34
|
255 std::cerr << "text = " << text.toStdString() << ", quoted = "
|
Chris@34
|
256 << quoted.toStdString() << std::endl;
|
Chris@34
|
257
|
Chris@36
|
258 QStringList components =
|
Chris@36
|
259 text.split(QRegExp("[^\\w]+"), QString::SkipEmptyParts);
|
Chris@36
|
260 QStringList reduced;
|
Chris@36
|
261 foreach (QString c, components) {
|
Chris@36
|
262 QString r = Composer::reduceName(c.toLower());
|
Chris@36
|
263 if (r != "") {
|
Chris@36
|
264 if (r.length() > 3 || r[0].isDigit()) {
|
Chris@36
|
265 reduced.push_back(r);
|
Chris@36
|
266 }
|
Chris@36
|
267 }
|
Chris@36
|
268 }
|
Chris@36
|
269
|
Chris@40
|
270 std::cerr << "reduced = " << reduced.join(" ").toStdString() << std::endl;
|
Chris@40
|
271
|
Chris@34
|
272 EditDistance ed;
|
Chris@34
|
273
|
Chris@34
|
274 foreach (Work *w, m_works) {
|
Chris@34
|
275 if (maxResults > 0 && matches.size() >= maxResults) {
|
Chris@34
|
276 break;
|
Chris@34
|
277 }
|
Chris@34
|
278
|
Chris@36
|
279 float highScore = 0.f;
|
Chris@34
|
280
|
Chris@36
|
281 QSet<QString> names = w->aliases();
|
Chris@36
|
282 names.insert(w->name());
|
Chris@36
|
283
|
Chris@36
|
284 foreach (QString name, names) {
|
Chris@36
|
285
|
Chris@40
|
286 float pro = 0.f;
|
Chris@40
|
287 float con = 0.f;
|
Chris@36
|
288
|
Chris@36
|
289 if (quoted != "") {
|
Chris@36
|
290 if (quoteRe.indexIn(name) >= 0) {
|
Chris@36
|
291 QString q = quoteRe.cap(1);
|
Chris@36
|
292 int dist = ed.calculate(quoted, q, qthresh);
|
Chris@36
|
293 if (dist < qthresh) {
|
Chris@36
|
294 std::cerr << "quoted name match: " << q.toStdString() << std::endl;
|
Chris@40
|
295 pro += 0.7f - 0.1f * dist;
|
Chris@40
|
296 if (pro - con > highScore) {
|
Chris@40
|
297 highScore = pro - con;
|
Chris@36
|
298 continue;
|
Chris@36
|
299 }
|
Chris@36
|
300 }
|
Chris@34
|
301 }
|
Chris@34
|
302 }
|
Chris@34
|
303
|
Chris@36
|
304 QStringList wcomp =
|
Chris@36
|
305 name.split(QRegExp("[^\\w]+"), QString::SkipEmptyParts);
|
Chris@36
|
306 QSet<QString> wr;
|
Chris@36
|
307 foreach (QString wc, wcomp) {
|
Chris@36
|
308 wr.insert(Composer::reduceName(wc.toLower()));
|
Chris@36
|
309 }
|
Chris@36
|
310 foreach (QString rc, reduced) {
|
Chris@36
|
311 if (wr.contains(rc)) {
|
Chris@36
|
312 std::cerr << "component match: " << rc.toStdString() << std::endl;
|
Chris@40
|
313 pro += 0.1;
|
Chris@36
|
314 } else {
|
Chris@40
|
315 con += 0.101;
|
Chris@40
|
316 if (con > 0.25) con = 0.25;
|
Chris@36
|
317 }
|
Chris@36
|
318 }
|
Chris@36
|
319
|
Chris@40
|
320 if (pro - con > highScore) highScore = pro - con;
|
Chris@36
|
321 }
|
Chris@36
|
322
|
Chris@36
|
323 if (highScore > 0.f) {
|
Chris@36
|
324 std::cerr << "for " << w->name().toStdString() << " highScore = " << highScore << std::endl;
|
Chris@36
|
325 matches.insert(Guess(highScore, w));
|
Chris@34
|
326 }
|
Chris@34
|
327 }
|
Chris@34
|
328
|
Chris@34
|
329 int n = 0;
|
Chris@34
|
330 for (GuessSet::const_iterator i = matches.begin();
|
Chris@34
|
331 i != matches.end(); ++i) {
|
Chris@34
|
332 results.push_back(*i);
|
Chris@34
|
333 if (maxResults > 0 && ++n > maxResults) break;
|
Chris@28
|
334 }
|
Chris@28
|
335
|
Chris@28
|
336 return results;
|
Chris@28
|
337 }
|
Chris@28
|
338
|
Chris@28
|
339 }
|
Chris@28
|
340
|