Chris@0
|
1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
|
Chris@0
|
2
|
Chris@0
|
3 #include "Objects.h"
|
Chris@0
|
4
|
Chris@52
|
5 //#include <dataquay/Debug.h>
|
Chris@0
|
6
|
Chris@0
|
7 #include <cstdlib>
|
Chris@0
|
8 #include <iostream>
|
Chris@0
|
9
|
Chris@11
|
10 #include "EditDistance.h"
|
Chris@11
|
11
|
Chris@4
|
12 #include <QHash> // to ensure correct qHash(const QString &) is found
|
Chris@33
|
13 #include <QFile>
|
Chris@33
|
14 #include <QFileInfo>
|
Chris@33
|
15 #include <QCryptographicHash>
|
Chris@4
|
16
|
Chris@0
|
17 namespace ClassicalData {
|
Chris@0
|
18
|
Chris@0
|
19 QMap<QString, Form *> Form::m_map;
|
Chris@0
|
20 QMutex Form::m_mutex;
|
Chris@0
|
21
|
Chris@37
|
22 QString
|
Chris@37
|
23 Composition::getComposerName() const
|
Chris@37
|
24 {
|
Chris@37
|
25 if (m_composer) return m_composer->name();
|
Chris@37
|
26 return m_cname;
|
Chris@37
|
27 }
|
Chris@37
|
28
|
Chris@1
|
29 bool
|
Chris@10
|
30 Composer::matchDates(const Composer *b) const
|
Chris@1
|
31 {
|
Chris@1
|
32 const Composer *a = this;
|
Chris@1
|
33
|
Chris@1
|
34 if (a->birth() && b->birth()) {
|
Chris@1
|
35 int ay = a->birth()->year(), by = b->birth()->year();
|
Chris@1
|
36 if (ay < 1800 || // birth dates before 1700 tend to be vague!
|
Chris@1
|
37 a->birth()->approximate() ||
|
Chris@1
|
38 b->birth()->approximate()) {
|
Chris@1
|
39 if (abs(ay - by) > 25) return false;
|
Chris@1
|
40 } else {
|
Chris@1
|
41 if (abs(ay - by) > 1) {
|
Chris@1
|
42 return false;
|
Chris@1
|
43 }
|
Chris@1
|
44 }
|
Chris@1
|
45 }
|
Chris@1
|
46 if (a->death() && b->death()) {
|
Chris@1
|
47 int ay = a->death()->year(), by = b->death()->year();
|
Chris@1
|
48 if (a->death()->approximate() || b->death()->approximate()) {
|
Chris@1
|
49 if (abs(ay - by) > 10) return false;
|
Chris@1
|
50 } else if (ay < 1700) {
|
Chris@1
|
51 if (abs(ay - by) > 25) return false;
|
Chris@1
|
52 } else if (ay < 1800) {
|
Chris@1
|
53 // cut a bit of slack, but not as much as for birth date
|
Chris@1
|
54 if (abs(ay - by) > 10) return false;
|
Chris@1
|
55 } else {
|
Chris@1
|
56 if (abs(ay - by) > 1) return false;
|
Chris@1
|
57 }
|
Chris@1
|
58 }
|
Chris@1
|
59 return true;
|
Chris@1
|
60 }
|
Chris@1
|
61
|
Chris@11
|
62 void
|
Chris@11
|
63 Composer::cacheNames() const
|
Chris@11
|
64 {
|
Chris@11
|
65 if (m_namesCached) return;
|
Chris@11
|
66
|
Chris@11
|
67 QString n = name();
|
Chris@11
|
68 QStringList pl = n.split(", ");
|
Chris@11
|
69
|
Chris@11
|
70 if (pl.size() == 1) {
|
Chris@11
|
71 QStringList pl2;
|
Chris@11
|
72 pl = n.split(' ');
|
Chris@11
|
73 pl2.push_back(pl[pl.size()-1]);
|
Chris@11
|
74 pl2.push_back("");
|
Chris@11
|
75 for (int i = 0; i+1 < pl.size(); ++i) {
|
Chris@11
|
76 if (i > 0) pl2[1] += " ";
|
Chris@11
|
77 pl2[1] += pl[i];
|
Chris@11
|
78 }
|
Chris@11
|
79 pl = pl2;
|
Chris@11
|
80 }
|
Chris@11
|
81
|
Chris@11
|
82 m_surname = pl[0];
|
Chris@11
|
83
|
Chris@11
|
84 n = "";
|
Chris@11
|
85 for (int i = 1; i < pl.size(); ++i) {
|
Chris@11
|
86 if (i > 1) n += ", ";
|
Chris@11
|
87 n += pl[i];
|
Chris@11
|
88 }
|
Chris@11
|
89
|
Chris@11
|
90 m_forenames = n;
|
Chris@11
|
91
|
Chris@11
|
92 m_surnameElements.clear();
|
Chris@11
|
93 m_connectiveElements.clear();
|
Chris@11
|
94 m_forenameElements.clear();
|
Chris@11
|
95 m_otherElements.clear();
|
Chris@11
|
96 m_reducedSurnameElements.clear();
|
Chris@11
|
97 m_reducedForenameElements.clear();
|
Chris@11
|
98
|
Chris@13
|
99 static QRegExp sre("[\\., -]+");
|
Chris@13
|
100
|
Chris@13
|
101 foreach (QString s, m_surname.split(sre, QString::SkipEmptyParts)) {
|
Chris@11
|
102 if (s[0].isUpper()) {
|
Chris@11
|
103 m_surnameElements.push_back(s.toLower());
|
Chris@11
|
104 m_reducedSurnameElements.push_back(reduceName(s));
|
Chris@12
|
105 } else if (s.length() > 1) {
|
Chris@11
|
106 m_connectiveElements.push_back(s.toLower());
|
Chris@11
|
107 }
|
Chris@11
|
108 }
|
Chris@11
|
109
|
Chris@13
|
110 foreach (QString s, m_forenames.split(sre, QString::SkipEmptyParts)) {
|
Chris@11
|
111 if (s[0].isUpper()) {
|
Chris@11
|
112 m_forenameElements.push_back(s.toLower());
|
Chris@11
|
113 m_reducedForenameElements.push_back(reduceName(s));
|
Chris@12
|
114 } else if (s.length() > 1) {
|
Chris@11
|
115 m_connectiveElements.push_back(s.toLower());
|
Chris@11
|
116 }
|
Chris@11
|
117 }
|
Chris@11
|
118
|
Chris@11
|
119 foreach (QString a, m_aliases) {
|
Chris@13
|
120 foreach (QString ae, a.split(sre, QString::SkipEmptyParts)) {
|
Chris@11
|
121 m_otherElements.push_back(ae.toLower());
|
Chris@11
|
122 }
|
Chris@11
|
123 }
|
Chris@13
|
124
|
Chris@13
|
125 m_namesCached = true;
|
Chris@11
|
126 }
|
Chris@11
|
127
|
Chris@0
|
128 QString
|
Chris@0
|
129 Composer::getSortName(bool caps) const
|
Chris@0
|
130 {
|
Chris@10
|
131 QString surname = getSurname();
|
Chris@10
|
132 QString forenames = getForenames();
|
Chris@10
|
133 if (caps) surname = surname.toUpper();
|
Chris@10
|
134 if (forenames != "") return surname + ", " + forenames;
|
Chris@10
|
135 else return surname;
|
Chris@10
|
136 }
|
Chris@10
|
137
|
Chris@10
|
138 QString
|
Chris@10
|
139 Composer::getSurname() const
|
Chris@10
|
140 {
|
Chris@11
|
141 cacheNames();
|
Chris@11
|
142 return m_surname;
|
Chris@10
|
143 }
|
Chris@10
|
144
|
Chris@10
|
145 QString
|
Chris@10
|
146 Composer::getForenames() const
|
Chris@10
|
147 {
|
Chris@11
|
148 cacheNames();
|
Chris@11
|
149 return m_forenames;
|
Chris@0
|
150 }
|
Chris@0
|
151
|
Chris@0
|
152 QString
|
Chris@0
|
153 Composer::getDisplayDates() const
|
Chris@0
|
154 {
|
Chris@0
|
155 QString s;
|
Chris@0
|
156 if (birth() || death()) {
|
Chris@0
|
157 bool showApprox = false;
|
Chris@0
|
158 if ((birth() && birth()->approximate()) ||
|
Chris@0
|
159 (death() && death()->approximate())) {
|
Chris@0
|
160 showApprox = true;
|
Chris@0
|
161 }
|
Chris@0
|
162 if (birth()) {
|
Chris@0
|
163 if (birth()->place() != "") {
|
Chris@0
|
164 s += birth()->place() + ", ";
|
Chris@0
|
165 }
|
Chris@0
|
166 if (showApprox) {
|
Chris@0
|
167 s += "c. ";
|
Chris@0
|
168 showApprox = false;
|
Chris@0
|
169 }
|
Chris@22
|
170 s += QString("%1").arg(birth()->year().toInt());
|
Chris@0
|
171 }
|
Chris@0
|
172 s += "-";
|
Chris@0
|
173 if (death()) {
|
Chris@0
|
174 if (death()->place() != "") {
|
Chris@0
|
175 s += death()->place() + ", ";
|
Chris@0
|
176 }
|
Chris@0
|
177 if (showApprox) {
|
Chris@0
|
178 s += "c. ";
|
Chris@0
|
179 showApprox = false;
|
Chris@0
|
180 }
|
Chris@22
|
181 s += QString("%1").arg(death()->year().toInt());
|
Chris@0
|
182 }
|
Chris@0
|
183 }
|
Chris@0
|
184
|
Chris@0
|
185 return s;
|
Chris@0
|
186 }
|
Chris@10
|
187
|
Chris@10
|
188 static QString
|
Chris@10
|
189 asciify(QString field)
|
Chris@10
|
190 {
|
Chris@10
|
191 QString ascii;
|
Chris@10
|
192 for (int i = 0; i < field.length(); ++i) {
|
Chris@10
|
193 QString dc = field[i].decomposition();
|
Chris@10
|
194 if (dc != "") ascii += dc[0];
|
Chris@10
|
195 else if (field[i] == QChar(0x00DF)) {
|
Chris@10
|
196 ascii += "ss";
|
Chris@10
|
197 } else {
|
Chris@10
|
198 ascii += field[i];
|
Chris@10
|
199 }
|
Chris@10
|
200 }
|
Chris@10
|
201 ascii.replace(QString::fromUtf8("\342\200\231"), "'"); // apostrophe
|
Chris@10
|
202 ascii.replace(QString::fromUtf8("\342\200\222"), "-");
|
Chris@10
|
203 ascii.replace(QString::fromUtf8("\342\200\223"), "-");
|
Chris@10
|
204 ascii.replace(QString::fromUtf8("\342\200\224"), "-");
|
Chris@10
|
205 ascii.replace(QString::fromUtf8("\342\200\225"), "-");
|
Chris@10
|
206 return ascii;
|
Chris@10
|
207 }
|
Chris@10
|
208
|
Chris@10
|
209 QString
|
Chris@10
|
210 Composer::reduceName(QString name)
|
Chris@10
|
211 {
|
Chris@10
|
212 QString key = asciify(name).toLower()
|
Chris@10
|
213 .replace("'", "")
|
Chris@10
|
214 .replace("x", "ks")
|
Chris@10
|
215 .replace("y", "i")
|
Chris@36
|
216 .replace("ie", "i")
|
Chris@36
|
217 .replace("ei", "i")
|
Chris@36
|
218 .replace("ii", "i")
|
Chris@10
|
219 .replace("k", "c")
|
Chris@10
|
220 .replace("aa", "a")
|
Chris@36
|
221 .replace("a", "e")
|
Chris@36
|
222 .replace("ee", "e")
|
Chris@10
|
223 .replace("v", "f")
|
Chris@36
|
224 .replace("ph", "f")
|
Chris@10
|
225 .replace("ff", "f")
|
Chris@10
|
226 .replace("th", "t")
|
Chris@10
|
227 .replace("tch", "ch")
|
Chris@36
|
228 .replace("ch", "c")
|
Chris@36
|
229 .replace("cc", "c")
|
Chris@10
|
230 .replace("er", "r");
|
Chris@10
|
231 return key;
|
Chris@10
|
232 }
|
Chris@10
|
233
|
Chris@10
|
234 bool
|
Chris@10
|
235 Composer::matchCatalogueName(QString an) const
|
Chris@10
|
236 {
|
Chris@10
|
237 // ew!
|
Chris@10
|
238
|
Chris@10
|
239 QString bn = name();
|
Chris@10
|
240 if (bn == an) return true;
|
Chris@10
|
241 if (aliases().contains(an)) return true;
|
Chris@10
|
242
|
Chris@10
|
243 int aSurnameIndex = 0, bSurnameIndex = 0;
|
Chris@10
|
244 if (an.contains(",")) {
|
Chris@10
|
245 an.replace(",", "");
|
Chris@10
|
246 } else {
|
Chris@10
|
247 aSurnameIndex = -1;
|
Chris@10
|
248 }
|
Chris@10
|
249 if (bn.contains(",")) {
|
Chris@10
|
250 bn.replace(",", "");
|
Chris@10
|
251 } else {
|
Chris@10
|
252 bSurnameIndex = -1;
|
Chris@10
|
253 }
|
Chris@10
|
254 QStringList nl = an.split(QRegExp("[ -]"));
|
Chris@10
|
255 QStringList bnl = reduceName(bn).split(QRegExp("[ -]"));
|
Chris@10
|
256 int matchCount = 0;
|
Chris@10
|
257 QString surnameMatch = "";
|
Chris@10
|
258 if (aSurnameIndex == -1) aSurnameIndex = nl.size()-1;
|
Chris@10
|
259 if (bSurnameIndex == -1) bSurnameIndex = bnl.size()-1;
|
Chris@10
|
260 if (nl[aSurnameIndex][0].isUpper() &&
|
Chris@10
|
261 nl[aSurnameIndex] != "Della" &&
|
Chris@10
|
262 reduceName(nl[aSurnameIndex]) == bnl[bSurnameIndex]) {
|
Chris@10
|
263 surnameMatch = nl[aSurnameIndex];
|
Chris@10
|
264 }
|
Chris@10
|
265 int tested = 0;
|
Chris@10
|
266 foreach (QString elt, nl) {
|
Chris@10
|
267 if (!elt[0].isUpper() || elt == "Della") continue;
|
Chris@10
|
268 QString k = reduceName(elt);
|
Chris@10
|
269 if (bnl.contains(k)) {
|
Chris@10
|
270 ++matchCount;
|
Chris@10
|
271 }
|
Chris@10
|
272 if (++tested == 2 && matchCount == 0) {
|
Chris@10
|
273 return false;
|
Chris@10
|
274 }
|
Chris@10
|
275 }
|
Chris@10
|
276 if (surnameMatch != "") {
|
Chris@52
|
277 // DEBUG << "namesFuzzyMatch: note: surnameMatch = " << surnameMatch << endl;
|
Chris@10
|
278 if (matchCount > 1) {
|
Chris@10
|
279 return true;
|
Chris@10
|
280 } else {
|
Chris@52
|
281 // DEBUG << "(but not enough else matched)" << endl;
|
Chris@10
|
282 return false;
|
Chris@10
|
283 }
|
Chris@10
|
284 }
|
Chris@10
|
285 return false;
|
Chris@10
|
286 }
|
Chris@10
|
287
|
Chris@14
|
288 float
|
Chris@10
|
289 Composer::matchFuzzyName(QString n) const
|
Chris@10
|
290 {
|
Chris@13
|
291 int fameBonus = m_pages.size();
|
Chris@13
|
292 if (n == name()) return 100 + fameBonus;
|
Chris@13
|
293 static QRegExp sre("[\\., -]+");
|
Chris@13
|
294 return matchFuzzyName(n.toLower().split(sre, QString::SkipEmptyParts));
|
Chris@13
|
295 }
|
Chris@13
|
296
|
Chris@15
|
297 static int
|
Chris@15
|
298 calculateThresholdedDistance(EditDistance &ed, const QString &user,
|
Chris@15
|
299 const QString &machine)
|
Chris@15
|
300 {
|
Chris@15
|
301 int threshold = machine.length()/3;
|
Chris@15
|
302 int dist;
|
Chris@15
|
303 if (threshold == 0) dist = (user == machine ? 0 : -1);
|
Chris@15
|
304 else {
|
Chris@15
|
305 dist = ed.calculate(user, machine, threshold);
|
Chris@15
|
306 if (dist > threshold) dist = -1;
|
Chris@15
|
307 }
|
Chris@15
|
308 return dist;
|
Chris@15
|
309 }
|
Chris@15
|
310
|
Chris@14
|
311 float
|
Chris@13
|
312 Composer::matchFuzzyName(QStringList elements) const
|
Chris@13
|
313 {
|
Chris@14
|
314 if (elements.empty()) return 0;
|
Chris@14
|
315
|
Chris@11
|
316 cacheNames();
|
Chris@11
|
317 int fameBonus = m_pages.size();
|
Chris@10
|
318
|
Chris@28
|
319 EditDistance ed(EditDistance::RestrictedTransposition);
|
Chris@10
|
320
|
Chris@10
|
321 int score = 0;
|
Chris@15
|
322 bool haveSurname = false;
|
Chris@15
|
323
|
Chris@15
|
324 // We aim to scale the eventual result such that a score of 1.0 or
|
Chris@15
|
325 // more indicates near-certainty that this is a correct match
|
Chris@15
|
326 // (i.e. that it is properly matched -- not that it is the only
|
Chris@15
|
327 // possible match). To achieve this score, we need to have
|
Chris@15
|
328 // matched with reasonable confidence every element in the passed
|
Chris@15
|
329 // elements list, and to have matched at least one of them to a
|
Chris@15
|
330 // part of our surname.
|
Chris@15
|
331
|
Chris@15
|
332 int matched = 0;
|
Chris@15
|
333 int unmatched = 0;
|
Chris@10
|
334
|
Chris@11
|
335 foreach (QString elt, elements) {
|
Chris@10
|
336
|
Chris@11
|
337 bool accept = false;
|
Chris@11
|
338
|
Chris@11
|
339 if (elt.length() == 1) {
|
Chris@15
|
340 // An initial: search forenames only, ignoring
|
Chris@15
|
341 // connectives. The score contribution here is low, but
|
Chris@15
|
342 // they do not count to matched which means the score can
|
Chris@15
|
343 // only enhance whatever happens elsewhere. They can
|
Chris@15
|
344 // however seriously damage our score if unmatched, which
|
Chris@15
|
345 // is as it should be.
|
Chris@11
|
346 foreach (QString s, m_forenameElements) {
|
Chris@11
|
347 if (s[0] == elt[0]) {
|
Chris@15
|
348 score += 2;
|
Chris@11
|
349 accept = true;
|
Chris@10
|
350 break;
|
Chris@10
|
351 }
|
Chris@10
|
352 }
|
Chris@11
|
353 if (!accept) {
|
Chris@15
|
354 foreach (QString s, m_connectiveElements) {
|
Chris@15
|
355 if (s[0] == elt[0]) {
|
Chris@15
|
356 score += 1;
|
Chris@15
|
357 accept = true;
|
Chris@15
|
358 break;
|
Chris@15
|
359 }
|
Chris@15
|
360 }
|
Chris@10
|
361 }
|
Chris@15
|
362 if (!accept) {
|
Chris@15
|
363 foreach (QString s, m_surnameElements) {
|
Chris@15
|
364 if (s[0] == elt[0]) {
|
Chris@15
|
365 // no score, but don't call it unmatched
|
Chris@15
|
366 accept = true;
|
Chris@15
|
367 break;
|
Chris@15
|
368 }
|
Chris@15
|
369 }
|
Chris@15
|
370 }
|
Chris@15
|
371 if (!accept) ++unmatched;
|
Chris@10
|
372 continue;
|
Chris@10
|
373 }
|
Chris@11
|
374
|
Chris@11
|
375 foreach (QString s, m_surnameElements) {
|
Chris@15
|
376 int dist = calculateThresholdedDistance(ed, elt, s);
|
Chris@15
|
377 if (dist >= 0) {
|
Chris@15
|
378 score += 22 - dist*2;
|
Chris@15
|
379 if (elt[0] != s[0]) score -= 10;
|
Chris@15
|
380 accept = true;
|
Chris@13
|
381 // std::cerr << "[surname: " << s.toStdString() << "]" << std::endl;
|
Chris@10
|
382 break;
|
Chris@10
|
383 }
|
Chris@10
|
384 }
|
Chris@15
|
385 if (accept) {
|
Chris@15
|
386 haveSurname = true;
|
Chris@15
|
387 ++matched;
|
Chris@15
|
388 continue;
|
Chris@15
|
389 }
|
Chris@10
|
390
|
Chris@11
|
391 foreach (QString s, m_forenameElements) {
|
Chris@15
|
392 int dist = calculateThresholdedDistance(ed, elt, s);
|
Chris@15
|
393 if (dist >= 0) {
|
Chris@15
|
394 score += 22 - dist*2;
|
Chris@15
|
395 if (elt[0] != s[0]) score -= 10;
|
Chris@15
|
396 accept = true;
|
Chris@13
|
397 // std::cerr << "[forename: " << s.toStdString() << "]" << std::endl;
|
Chris@10
|
398 break;
|
Chris@10
|
399 }
|
Chris@10
|
400 }
|
Chris@15
|
401 if (accept) {
|
Chris@15
|
402 ++matched;
|
Chris@15
|
403 continue;
|
Chris@15
|
404 }
|
Chris@10
|
405
|
Chris@11
|
406 foreach (QString s, m_connectiveElements) {
|
Chris@15
|
407 // treated much like initials
|
Chris@15
|
408 int dist = calculateThresholdedDistance(ed, elt, s);
|
Chris@15
|
409 if (dist == 0) {
|
Chris@15
|
410 score += 2;
|
Chris@15
|
411 accept = true;
|
Chris@15
|
412 } else if (dist == 1) {
|
Chris@15
|
413 score += 1;
|
Chris@15
|
414 accept = true;
|
Chris@15
|
415 }
|
Chris@11
|
416 if (accept) {
|
Chris@13
|
417 // std::cerr << "[connective: " << s.toStdString() << "]" << std::endl;
|
Chris@10
|
418 break;
|
Chris@10
|
419 }
|
Chris@10
|
420 }
|
Chris@15
|
421 if (accept) {
|
Chris@11
|
422 continue;
|
Chris@11
|
423 }
|
Chris@11
|
424
|
Chris@15
|
425 QString reduced = reduceName(elt);
|
Chris@15
|
426
|
Chris@16
|
427 //!!! these don't seem to match often...
|
Chris@16
|
428
|
Chris@15
|
429 if (m_reducedSurnameElements.contains(reduced)) {
|
Chris@15
|
430 score += 10;
|
Chris@15
|
431 haveSurname = true;
|
Chris@15
|
432 ++matched;
|
Chris@15
|
433 std::cerr << "[reduced surname: " << elt.toStdString() << "]" << std::endl;
|
Chris@15
|
434 continue;
|
Chris@15
|
435 }
|
Chris@15
|
436
|
Chris@15
|
437 if (m_reducedForenameElements.contains(reduced)) {
|
Chris@11
|
438 score += 7;
|
Chris@15
|
439 ++matched;
|
Chris@15
|
440 std::cerr << "[reduced forename: " << elt.toStdString() << "]" << std::endl;
|
Chris@11
|
441 continue;
|
Chris@11
|
442 }
|
Chris@11
|
443
|
Chris@11
|
444 foreach (QString s, m_otherElements) {
|
Chris@15
|
445 int dist = calculateThresholdedDistance(ed, elt, s);
|
Chris@15
|
446 if (dist >= 0) {
|
Chris@15
|
447 score += 22 - dist*2;
|
Chris@15
|
448 if (elt[0] != s[0]) score -= 10;
|
Chris@15
|
449 accept = true;
|
Chris@13
|
450 // std::cerr << "[other: " << s.toStdString() << "]" << std::endl;
|
Chris@10
|
451 break;
|
Chris@10
|
452 }
|
Chris@10
|
453 }
|
Chris@15
|
454 if (accept) {
|
Chris@15
|
455 ++matched;
|
Chris@15
|
456 continue;
|
Chris@15
|
457 }
|
Chris@10
|
458
|
Chris@15
|
459 ++unmatched;
|
Chris@11
|
460 }
|
Chris@15
|
461
|
Chris@15
|
462 // if (fameBonus > 0) std::cerr << "[fame: " << fameBonus << "]" << std::endl;
|
Chris@15
|
463 score += fameBonus;
|
Chris@10
|
464
|
Chris@15
|
465 if (matched == 0) {
|
Chris@15
|
466 if (unmatched == 0) {
|
Chris@15
|
467 return float(score) / 20.f;
|
Chris@15
|
468 } else {
|
Chris@15
|
469 return 0;
|
Chris@15
|
470 }
|
Chris@11
|
471 }
|
Chris@15
|
472
|
Chris@15
|
473 float fscore = score;
|
Chris@15
|
474 float divisor = (matched + unmatched) * 20;
|
Chris@15
|
475
|
Chris@15
|
476 if (!haveSurname) fscore /= 2;
|
Chris@15
|
477 if (unmatched > 0) fscore /= 1.5;
|
Chris@15
|
478
|
Chris@15
|
479 fscore /= divisor;
|
Chris@15
|
480
|
Chris@15
|
481 if (matched > 0) {
|
Chris@15
|
482 // std::cerr << "[score " << score << " with divisor " << divisor << " for " << name().toStdString() << " adjusted to " << fscore << "]" << std::endl;
|
Chris@15
|
483 }
|
Chris@15
|
484
|
Chris@15
|
485 return fscore;
|
Chris@10
|
486 }
|
Chris@0
|
487
|
Chris@16
|
488 float
|
Chris@19
|
489 Composer::matchTyping(QString t) const
|
Chris@16
|
490 {
|
Chris@28
|
491 return doMatchTyping(t, false);
|
Chris@28
|
492 }
|
Chris@28
|
493
|
Chris@28
|
494 float
|
Chris@28
|
495 Composer::matchTypingQuick(QString t) const
|
Chris@28
|
496 {
|
Chris@28
|
497 return doMatchTyping(t, true);
|
Chris@28
|
498 }
|
Chris@28
|
499
|
Chris@28
|
500 float
|
Chris@28
|
501 Composer::doMatchTyping(QString t, bool quick) const
|
Chris@28
|
502 {
|
Chris@19
|
503 if (t == "") return 0;
|
Chris@16
|
504
|
Chris@16
|
505 cacheNames();
|
Chris@28
|
506 float fameBonus = m_pages.size() / 400.f;
|
Chris@16
|
507
|
Chris@28
|
508 QString n = name().toLower();
|
Chris@28
|
509 t = t.toLower();
|
Chris@16
|
510
|
Chris@19
|
511 if (n == t) return 1.f + fameBonus;
|
Chris@19
|
512 if (n.startsWith(t)) return 0.8f + fameBonus;
|
Chris@28
|
513
|
Chris@28
|
514 QSet<QString> sl;
|
Chris@28
|
515 QSet<QString> nl;
|
Chris@28
|
516 foreach (QString s, m_surnameElements) {
|
Chris@28
|
517 sl.insert(s.toLower());
|
Chris@28
|
518 nl.insert(s.toLower());
|
Chris@28
|
519 }
|
Chris@28
|
520 foreach (QString s, m_forenameElements) {
|
Chris@28
|
521 nl.insert(s.toLower());
|
Chris@28
|
522 }
|
Chris@28
|
523 if (!quick) {
|
Chris@28
|
524 foreach (QString s, m_otherElements) {
|
Chris@28
|
525 nl.insert(s.toLower());
|
Chris@28
|
526 }
|
Chris@28
|
527 foreach (QString s, m_connectiveElements) {
|
Chris@28
|
528 nl.insert(s.toLower());
|
Chris@28
|
529 }
|
Chris@28
|
530 }
|
Chris@28
|
531
|
Chris@28
|
532 static QRegExp sre("[\\., -]+");
|
Chris@28
|
533 QStringList tl = t.split(sre, QString::SkipEmptyParts);
|
Chris@19
|
534
|
Chris@16
|
535 float score = 0.f;
|
Chris@16
|
536
|
Chris@19
|
537 if (nl.empty() || tl.empty()) return 0.f;
|
Chris@19
|
538
|
Chris@19
|
539 int unmatched = 0;
|
Chris@28
|
540
|
Chris@19
|
541 for (int i = 0; i < tl.size(); ++i) {
|
Chris@28
|
542
|
Chris@28
|
543 QString tel = tl[i];
|
Chris@28
|
544 float component = 0.f;
|
Chris@28
|
545 float max = 0.f;
|
Chris@28
|
546
|
Chris@28
|
547 for (QSet<QString>::const_iterator ni = nl.begin();
|
Chris@28
|
548 ni != nl.end(); ++ni) {
|
Chris@28
|
549
|
Chris@28
|
550 QString nel = ni->toLower();
|
Chris@28
|
551
|
Chris@28
|
552 if (tel == nel) {
|
Chris@28
|
553 if (tel.length() > 1) {
|
Chris@28
|
554 component = 0.2;
|
Chris@19
|
555 } else {
|
Chris@28
|
556 component = 0.1;
|
Chris@19
|
557 }
|
Chris@28
|
558 if (sl.contains(nel)) component *= 1.5;
|
Chris@28
|
559 goto calculated;
|
Chris@19
|
560 }
|
Chris@28
|
561
|
Chris@28
|
562 if (nel.startsWith(tel)) {
|
Chris@28
|
563 component = 0.1;
|
Chris@28
|
564 if (sl.contains(nel)) component *= 1.5;
|
Chris@28
|
565 goto calculated;
|
Chris@28
|
566 }
|
Chris@28
|
567
|
Chris@28
|
568 if (!quick) {
|
Chris@29
|
569 if (tel.length() > 3) {
|
Chris@28
|
570 EditDistance ed(EditDistance::RestrictedTransposition);
|
Chris@28
|
571 int dist = calculateThresholdedDistance
|
Chris@29
|
572 (ed, nel.left(tel.length()), tel);
|
Chris@28
|
573 if (dist >= 0) {
|
Chris@28
|
574 component = 0.08 - dist * 0.01;
|
Chris@28
|
575 if (sl.contains(nel)) component *= 1.5;
|
Chris@28
|
576 }
|
Chris@28
|
577 }
|
Chris@28
|
578 if (component > 0.f) goto calculated;
|
Chris@28
|
579 }
|
Chris@28
|
580
|
Chris@28
|
581 if (nel.startsWith(tel[0])) {
|
Chris@28
|
582 component += 0.02;
|
Chris@28
|
583 }
|
Chris@28
|
584
|
Chris@28
|
585 calculated:
|
Chris@28
|
586 if (component > max) max = component;
|
Chris@16
|
587 }
|
Chris@28
|
588
|
Chris@28
|
589 score += max;
|
Chris@16
|
590 }
|
Chris@16
|
591
|
Chris@28
|
592 if (!quick) {
|
Chris@28
|
593 if (t.contains(" ")) {
|
Chris@28
|
594 float fuzzyScore = matchFuzzyName(t);
|
Chris@28
|
595 if (fuzzyScore >= 0.4f) {
|
Chris@28
|
596 score += fuzzyScore / 3.f;
|
Chris@28
|
597 }
|
Chris@19
|
598 }
|
Chris@19
|
599 }
|
Chris@19
|
600
|
Chris@16
|
601 if (score > 0.f) score += fameBonus;
|
Chris@16
|
602 return score;
|
Chris@16
|
603 }
|
Chris@16
|
604
|
Chris@24
|
605 void
|
Chris@24
|
606 Composer::mergeFrom(Composer *c)
|
Chris@24
|
607 {
|
Chris@24
|
608 QSet<QString> allNames = c->aliases();
|
Chris@25
|
609 allNames.insert(c->name());
|
Chris@24
|
610
|
Chris@24
|
611 foreach (QString n, allNames) {
|
Chris@24
|
612 if (n != m_name && !m_aliases.contains(n)) {
|
Chris@24
|
613 m_aliases.insert(n);
|
Chris@24
|
614 m_namesCached = false;
|
Chris@24
|
615 }
|
Chris@24
|
616 }
|
Chris@24
|
617
|
Chris@24
|
618 if (!m_birth) {
|
Chris@31
|
619 if (c->birth()) {
|
Chris@31
|
620 m_birth = new Birth(*c->birth());
|
Chris@31
|
621 emit birthChanged(m_birth);
|
Chris@31
|
622 }
|
Chris@24
|
623 }
|
Chris@24
|
624
|
Chris@24
|
625 if (!m_death) {
|
Chris@31
|
626 if (c->death()) {
|
Chris@31
|
627 m_death = new Death(*c->death());
|
Chris@31
|
628 emit deathChanged(m_death);
|
Chris@31
|
629 }
|
Chris@24
|
630 }
|
Chris@24
|
631
|
Chris@24
|
632 if (c->gender() != "") {
|
Chris@24
|
633 if (m_gender == "") {
|
Chris@24
|
634 m_gender = c->gender();
|
Chris@31
|
635 emit genderChanged(m_gender);
|
Chris@24
|
636 } else if (c->gender() != m_gender) {
|
Chris@24
|
637 std::cerr << "WARNING: Composer::mergeFrom: Gender mismatch! Composer " << c->name().toStdString() << " has gender " << c->gender().toStdString() << ", but target composer " << m_name.toStdString() << " has gender " << m_gender.toStdString() << std::endl;
|
Chris@24
|
638 }
|
Chris@24
|
639 }
|
Chris@24
|
640
|
Chris@24
|
641 m_nationality.unite(c->nationality());
|
Chris@24
|
642 m_geonameURIs.unite(c->geonameURIs());
|
Chris@24
|
643 m_otherURIs.unite(c->otherURIs());
|
Chris@26
|
644
|
Chris@26
|
645 foreach (Document *d, c->pages()) {
|
Chris@38
|
646 /*
|
Chris@26
|
647 Document *dd = new Document;
|
Chris@26
|
648 dd->setUri(d->uri());
|
Chris@38
|
649 dd->setSiteName(d->siteName());
|
Chris@26
|
650 dd->setTopic(this);
|
Chris@26
|
651 m_pages.insert(dd);
|
Chris@38
|
652 */
|
Chris@38
|
653 d->setTopic(this);
|
Chris@38
|
654 m_pages.insert(d);
|
Chris@26
|
655 }
|
Chris@24
|
656
|
Chris@24
|
657 if (m_period == "") m_period = c->period();
|
Chris@24
|
658 if (m_remarks == "") m_remarks = c->remarks();
|
Chris@31
|
659
|
Chris@31
|
660 emit nationalityChanged(m_nationality);
|
Chris@31
|
661 emit geonameURIsChanged(m_geonameURIs);
|
Chris@31
|
662 emit otherURIsChanged(m_otherURIs);
|
Chris@31
|
663 emit pagesChanged(m_pages);
|
Chris@31
|
664 emit periodChanged(m_period);
|
Chris@31
|
665 emit remarksChanged(m_remarks);
|
Chris@31
|
666 emit aliasesChanged(m_aliases);
|
Chris@24
|
667 }
|
Chris@24
|
668
|
Chris@37
|
669 QString
|
Chris@37
|
670 Work::getComposerName() const
|
Chris@37
|
671 {
|
Chris@37
|
672 Composer *c = getComposer();
|
Chris@37
|
673 if (c) return c->name();
|
Chris@37
|
674 else return "";
|
Chris@37
|
675 }
|
Chris@37
|
676
|
Chris@0
|
677 static int
|
Chris@0
|
678 compare(QString a, QString b)
|
Chris@0
|
679 {
|
Chris@0
|
680 if (a < b) {
|
Chris@0
|
681 return -1;
|
Chris@0
|
682 } else if (a > b) {
|
Chris@0
|
683 return 1;
|
Chris@0
|
684 } else {
|
Chris@0
|
685 return 0;
|
Chris@0
|
686 }
|
Chris@0
|
687 }
|
Chris@0
|
688
|
Chris@10
|
689 int
|
Chris@10
|
690 Work::compareCatalogueNumberTexts(QString a, QString b)
|
Chris@0
|
691 {
|
Chris@0
|
692 // std::cout << "compare " << a.toStdString()
|
Chris@34
|
693 // << " :: " << b.toStdString() << std::endl;
|
Chris@0
|
694
|
Chris@0
|
695 if (a == b) return 0;
|
Chris@0
|
696
|
Chris@0
|
697 if (!a[0].isDigit()) {
|
Chris@34
|
698 a.replace(QRegExp("^[^\\d]+"), "");
|
Chris@34
|
699 }
|
Chris@34
|
700
|
Chris@34
|
701 if (!b[0].isDigit()) {
|
Chris@34
|
702 b.replace(QRegExp("^[^\\d]+"), "");
|
Chris@34
|
703 }
|
Chris@34
|
704
|
Chris@34
|
705 QStringList al = a.split(QRegExp("\\b[^\\d]*"), QString::SkipEmptyParts);
|
Chris@34
|
706 QStringList bl = b.split(QRegExp("\\b[^\\d]*"), QString::SkipEmptyParts);
|
Chris@34
|
707 if (al.size() != bl.size()) return int(al.size()) - int(bl.size());
|
Chris@34
|
708
|
Chris@34
|
709 /* if (al.size() < 2 || bl.size() < 2 || al.size() != bl.size()) {
|
Chris@34
|
710 if (a < b) return -1;
|
Chris@34
|
711 else if (a > b) return 1;
|
Chris@34
|
712 else return 0;
|
Chris@34
|
713 }
|
Chris@34
|
714 */
|
Chris@34
|
715 for (int i = 0; i < al.size(); ++i) {
|
Chris@34
|
716 if (al[i] != bl[i]) {
|
Chris@34
|
717 // use atoi instead of toInt() because we want it to succeed even
|
Chris@34
|
718 // if the text is not only an integer (e.g. 35a)
|
Chris@34
|
719 int aoi = atoi(al[i].toLocal8Bit().data());
|
Chris@34
|
720 int boi = atoi(bl[i].toLocal8Bit().data());
|
Chris@34
|
721 if (aoi != boi) return aoi - boi;
|
Chris@34
|
722 else return compare(al[i], bl[i]);
|
Chris@0
|
723 }
|
Chris@0
|
724 }
|
Chris@34
|
725 return 0;
|
Chris@34
|
726 }
|
Chris@0
|
727
|
Chris@34
|
728 QStringList
|
Chris@34
|
729 Work::extractCatalogueNumberTexts(QString text)
|
Chris@34
|
730 {
|
Chris@34
|
731 //!!! test this
|
Chris@34
|
732 QStringList results;
|
Chris@34
|
733 std::cerr << "Work::extractCatalogueNumberTexts(" << text.toStdString() << ")" << std::endl;
|
Chris@0
|
734
|
Chris@34
|
735 // Note we explicitly exclude "catalogue identifiers" beginning
|
Chris@34
|
736 // with N, because we don't want to treat e.g. "Symphony No. 8"
|
Chris@34
|
737 // as catalogue number 8. What a fine hack.
|
Chris@34
|
738
|
Chris@37
|
739 QRegExp catre("\\b([Oo]pu?s?|[A-MP-Z]+)\\.?[\\s_]*(\\d+\\w*)(\\s+[Nn]([OoRrBb]?|umber)(\\.\\s*|\\s+)(\\d+\\w*))?\\b");
|
Chris@34
|
740 int ix = 0;
|
Chris@34
|
741 while ((ix = catre.indexIn(text, ix+1)) >= 0) {
|
Chris@34
|
742 std::cerr << "extractCatalogueNumberTexts: found match \"" << catre.cap(0).toStdString() << "\"" << std::endl;
|
Chris@37
|
743 QString cat = catre.cap(0);
|
Chris@37
|
744 // ensure space before digit
|
Chris@37
|
745 for (int i = 0; i+1 < cat.length(); ++i) {
|
Chris@37
|
746 if (!cat[i].isDigit() && !cat[i].isSpace() && cat[i+1].isDigit()) {
|
Chris@37
|
747 QString spaced = cat.left(i+1) + " " + cat.right(cat.length()-i-1);
|
Chris@37
|
748 std::cerr << "spaced out from " << cat.toStdString() << " to "
|
Chris@37
|
749 << spaced.toStdString() << std::endl;
|
Chris@37
|
750 cat = spaced;
|
Chris@37
|
751 break;
|
Chris@37
|
752 }
|
Chris@37
|
753 }
|
Chris@37
|
754 results.push_back(cat);
|
Chris@34
|
755 }
|
Chris@34
|
756 return results;
|
Chris@0
|
757 }
|
Chris@0
|
758
|
Chris@0
|
759 bool
|
Chris@0
|
760 Work::Ordering::operator()(Work *a, Work *b)
|
Chris@0
|
761 {
|
Chris@0
|
762 if (!a) {
|
Chris@0
|
763 if (!b) return false;
|
Chris@0
|
764 else return true;
|
Chris@0
|
765 } else {
|
Chris@0
|
766 if (!b) {
|
Chris@0
|
767 return false;
|
Chris@0
|
768 }
|
Chris@0
|
769 }
|
Chris@0
|
770 /*
|
Chris@0
|
771 QString ao = a->catalogue();
|
Chris@0
|
772 if (ao == "") ao = a->opus();
|
Chris@0
|
773
|
Chris@0
|
774 QString bo = b->catalogue();
|
Chris@0
|
775 if (bo == "") bo = b->opus();
|
Chris@0
|
776
|
Chris@0
|
777 std::cout << "ao " << ao.toStdString() << ", bo " << bo.toStdString() << std::endl;
|
Chris@0
|
778 */
|
Chris@0
|
779 int c = 0;
|
Chris@0
|
780 if (a->catalogue() != "" && b->catalogue() != "") {
|
Chris@10
|
781 c = compareCatalogueNumberTexts(a->catalogue(), b->catalogue());
|
Chris@0
|
782 }
|
Chris@0
|
783 if (c == 0 && a->opus() != "" && b->opus() != "") {
|
Chris@10
|
784 c = compareCatalogueNumberTexts(a->opus(), b->opus());
|
Chris@0
|
785 }
|
Chris@0
|
786 if (c == 0 && a->partOf() == b->partOf() &&
|
Chris@0
|
787 a->number() != "" && b->number() != "") {
|
Chris@10
|
788 c = compareCatalogueNumberTexts(a->number(), b->number());
|
Chris@0
|
789 }
|
Chris@0
|
790
|
Chris@0
|
791 bool rv = false;
|
Chris@0
|
792
|
Chris@0
|
793 if (c == 0) {
|
Chris@0
|
794 if (a->name() == b->name()) rv = (a < b);
|
Chris@0
|
795 else rv = (a->name() < b->name());
|
Chris@0
|
796 } else {
|
Chris@0
|
797 rv = (c < 0);
|
Chris@0
|
798 }
|
Chris@0
|
799
|
Chris@0
|
800 // std::cout << "result = " << rv << std::endl;
|
Chris@0
|
801 return rv;
|
Chris@0
|
802 }
|
Chris@0
|
803
|
Chris@37
|
804 QString
|
Chris@37
|
805 Work::getDisplayName() const
|
Chris@37
|
806 {
|
Chris@37
|
807 QString suffix;
|
Chris@37
|
808
|
Chris@37
|
809 if (catalogue() != "") {
|
Chris@37
|
810 suffix = catalogue();
|
Chris@37
|
811 } else if (opus() != "") {
|
Chris@37
|
812 suffix = QString("Op. %1").arg(opus());
|
Chris@37
|
813 }
|
Chris@37
|
814 if (suffix != "" && number() != "") {
|
Chris@37
|
815 suffix = QString("%1 no. %2").arg(suffix).arg(number());
|
Chris@37
|
816 }
|
Chris@37
|
817 if (suffix != "") {
|
Chris@37
|
818 if (name() != "") {
|
Chris@37
|
819 return QString("%1, %2").arg(name()).arg(suffix);
|
Chris@37
|
820 } else {
|
Chris@37
|
821 return suffix;
|
Chris@37
|
822 }
|
Chris@37
|
823 } else {
|
Chris@37
|
824 return name();
|
Chris@37
|
825 }
|
Chris@37
|
826 }
|
Chris@37
|
827
|
Chris@45
|
828 AudioFile::AudioFile(QObject *parent) :
|
Chris@45
|
829 QObject(parent)
|
Chris@43
|
830 {
|
Chris@43
|
831 }
|
Chris@43
|
832
|
Chris@45
|
833 AudioFile::AudioFile(FileSource source, QObject *parent) :
|
Chris@45
|
834 QObject(parent)
|
Chris@33
|
835 {
|
Chris@33
|
836 if (source.isAvailable()) {
|
Chris@33
|
837 QFile f(source.getLocalFilename());
|
Chris@33
|
838 f.open(QIODevice::ReadOnly);
|
Chris@45
|
839 //!!! stream this!
|
Chris@33
|
840 QByteArray ba = f.readAll();
|
Chris@52
|
841 m_hash = QString::fromLatin1
|
Chris@33
|
842 (QCryptographicHash::hash(ba, QCryptographicHash::Sha1).toHex());
|
Chris@33
|
843 }
|
Chris@33
|
844 QString location = source.getLocation();
|
Chris@33
|
845 if (source.isRemote()) {
|
Chris@33
|
846 m_uri = Dataquay::Uri(location);
|
Chris@33
|
847 } else {
|
Chris@33
|
848 if (location.contains("://")) {
|
Chris@33
|
849 m_uri = Dataquay::Uri(location);
|
Chris@33
|
850 } else if (location.startsWith('/')) {
|
Chris@33
|
851 m_uri = Dataquay::Uri("file://" + location);
|
Chris@33
|
852 } else {
|
Chris@33
|
853 m_uri = Dataquay::Uri("file://" + QFileInfo(location).canonicalFilePath());
|
Chris@33
|
854 }
|
Chris@33
|
855 }
|
Chris@45
|
856
|
Chris@45
|
857 std::cerr << "AudioFile::AudioFile: hash = " << m_hash.toStdString()
|
Chris@33
|
858 << ", uri = " << m_uri.toString().toStdString() << std::endl;
|
Chris@33
|
859 }
|
Chris@33
|
860
|
Chris@48
|
861 AudioFile::~AudioFile()
|
Chris@48
|
862 {
|
Chris@48
|
863 foreach (AudioFileTag *t, m_tags) delete t;
|
Chris@48
|
864 }
|
Chris@48
|
865
|
Chris@48
|
866 void
|
Chris@48
|
867 AudioFile::setTags(QSet<AudioFileTag *> tt)
|
Chris@48
|
868 {
|
Chris@48
|
869 foreach (AudioFileTag *t, m_tags) {
|
Chris@48
|
870 if (!tt.contains(t)) delete t;
|
Chris@48
|
871 }
|
Chris@48
|
872 m_tags = tt;
|
Chris@48
|
873 }
|
Chris@48
|
874
|
Chris@48
|
875 void
|
Chris@48
|
876 AudioFile::addTag(AudioFileTag *t)
|
Chris@48
|
877 {
|
Chris@48
|
878 m_tags.insert(t);
|
Chris@48
|
879 }
|
Chris@0
|
880
|
Chris@0
|
881 }
|
Chris@0
|
882
|