comparison import/Import.cpp @ 10:d35e5d769c87 classical-rdf

* some experiments with composer name matching
author Chris Cannam
date Wed, 17 Feb 2010 19:26:48 +0000
parents df999875c53b
children c8ef23d3888c
comparison
equal deleted inserted replaced
9:9e2b203254ab 10:d35e5d769c87
139 QString nr = n; 139 QString nr = n;
140 nr.replace("(I)", "I"); 140 nr.replace("(I)", "I");
141 nr.replace("(II)", "II"); 141 nr.replace("(II)", "II");
142 nr.replace("(III)", "III"); 142 nr.replace("(III)", "III");
143 c->addAlias(nr); 143 c->addAlias(nr);
144 }
145
// Reduce a name to a crude lookup key so that differently-spelled
// variants of the same name can collide on the same key.
//
// The name is lower-cased, apostrophes are removed, and then a fixed
// chain of substring replacements is applied in the order written
// (each replacement operates on the result of the previous one):
// x->ks, y->i, k->c, ch->c, cc->c, v->f, ff->f, th->t, tch->ch, er->r.
// Because k->c follows x->ks, an original "x" ends up as "cs".
//
// NOTE(review): the "tch" replacement runs after "ch" has already been
// rewritten to "c", so it looks like it can hardly ever match -- confirm
// whether it was meant to come earlier in the chain.
//
// Returns the resulting key string.
146 QString makeNameKey(QString name)
147 {
148 QString key = name.toLower()
149 .replace("'", "")
150 .replace("x", "ks")
151 .replace("y", "i")
152 .replace("k", "c")
153 .replace("ch", "c")
154 .replace("cc", "c")
155 .replace("v", "f")
156 .replace("ff", "f")
157 .replace("th", "t")
158 .replace("tch", "ch")
159 .replace("er", "r");
160 // DEBUG << "makeNameKey(" << name << "): " << key << endl;
161 return key;
162 }
163
// Heuristically decide whether the name string `an` refers to the
// composer `b`.
//
// Returns true immediately if `an` equals b's name or is one of b's
// aliases. Otherwise both names are split into parts on spaces and
// hyphens and compared through makeNameKey(); the result is true only
// if the surname parts match and more than one part of `an` in total
// is found among the parts of b's name.
164 bool namesFuzzyMatch(QString an, Composer *b)
165 {
166 // ew!
167
168 QString bn = b->name();
// Cheap exact checks first.
169 if (bn == an) return true;
170 if (b->aliases().contains(an)) return true;
// Locate the surname: if the name contains a comma, the first part
// (index 0) is taken as the surname and the comma is dropped
// (presumably "Surname, Forenames" form -- confirm); otherwise -1 is
// used as a marker meaning "last part", resolved once the part lists
// exist.
171 int aSurnameIndex = 0, bSurnameIndex = 0;
172 if (an.contains(",")) {
173 an.replace(",", "");
174 } else {
175 aSurnameIndex = -1;
176 }
177 if (bn.contains(",")) {
178 bn.replace(",", "");
179 } else {
180 bSurnameIndex = -1;
181 }
// `an` is split in its original case (the isUpper() tests below rely
// on that); b's name is converted to key form before splitting, so
// bnl holds keys rather than raw name parts.
182 QStringList nl = an.split(QRegExp("[ -]"));
183 QStringList bnl = makeNameKey(bn).split(QRegExp("[ -]"));
184 int matchCount = 0;
185 QString surnameMatch = "";
186 if (aSurnameIndex == -1) aSurnameIndex = nl.size()-1;
187 if (bSurnameIndex == -1) bSurnameIndex = bnl.size()-1;
// Record a surname match only if a's surname part starts with an
// upper-case letter, is not the word "Della", and its key equals b's
// surname part.
// NOTE(review): [0] is applied without checking that the part is
// non-empty; an empty `an` or consecutive separators may yield empty
// parts from split() -- confirm this cannot happen for real input.
188 if (nl[aSurnameIndex][0].isUpper() &&
189 nl[aSurnameIndex] != "Della" &&
190 makeNameKey(nl[aSurnameIndex]) == bnl[bSurnameIndex]) {
191 surnameMatch = nl[aSurnameIndex];
192 }
// Count how many capitalised parts of `an` (again skipping "Della")
// appear, in key form, anywhere among b's parts. Parts that do not
// start with an upper-case letter are ignored and not counted as
// tested.
193 int tested = 0;
194 foreach (QString elt, nl) {
195 if (!elt[0].isUpper() || elt == "Della") continue;
196 QString k = makeNameKey(elt);
197 if (bnl.contains(k)) {
198 ++matchCount;
199 }
// Give up early: two parts tested and neither matched.
200 if (++tested == 2 && matchCount == 0) {
201 return false;
202 }
203 }
// A surname match alone is not enough: the matched surname part is
// itself counted by the loop above, so matchCount > 1 requires at
// least one further part of `an` to match as well.
204 if (surnameMatch != "") {
205 DEBUG << "namesFuzzyMatch: note: surnameMatch = " << surnameMatch << endl;
206 if (matchCount > 1) {
207 return true;
208 } else {
209 DEBUG << "(but not enough else matched)" << endl;
210 return false;
211 }
212 }
// No surname match: never treated as a fuzzy match.
213 return false;
214 } 144 }
215 145
216 bool 146 bool
217 hasBetterName(Composer *c, Composer *other) 147 hasBetterName(Composer *c, Composer *other)
218 { 148 {
262 } 192 }
263 193
264 QSet<Composer *> matches; 194 QSet<Composer *> matches;
265 195
266 foreach (QString candidateName, allNames) { 196 foreach (QString candidateName, allNames) {
267 QString key = makeNameKey(candidateName); 197 QString key = Composer::reduceName(candidateName);
268 if (composers.contains(key)) { 198 if (composers.contains(key)) {
269 foreach (Composer *candidate, composers[key]) { 199 foreach (Composer *candidate, composers[key]) {
270 if (candidateName == dates) { 200 if (candidateName == dates) {
271 if (c->name() == candidate->name()) { 201 if (c->name() == candidate->name()) {
272 DEBUG << "mergeComposer: Exact name match for " << c->name() << " with date(s) " << dates << endl; 202 DEBUG << "mergeComposer: Exact name match for " << c->name() << " with date(s) " << dates << endl;
273 } else if (!namesFuzzyMatch(c->name(), candidate) && 203 } else if (!candidate->matchCatalogueName(c->name()) &&
274 !namesFuzzyMatch(candidate->name(), c)) { 204 !c->matchCatalogueName(candidate->name())) {
275 DEBUG << "mergeComposer: Names differ for " << c->name() << " and " << candidate->name() << " (having matched date(s) " << dates << ")" << endl; 205 DEBUG << "mergeComposer: Names differ for " << c->name() << " and " << candidate->name() << " (having matched date(s) " << dates << ")" << endl;
276 continue; 206 continue;
277 } else { 207 } else {
278 DEBUG << "mergeComposer: Note: Fuzzy name match for " << c->name() << " and " << candidate->name() << " with date(s) " << dates << endl; 208 DEBUG << "mergeComposer: Note: Fuzzy name match for " << c->name() << " and " << candidate->name() << " with date(s) " << dates << endl;
279 } 209 }
280 } else { 210 } else {
281 if (!c->datesMatch(candidate)) { 211 if (!c->matchDates(candidate)) {
282 DEBUG << "mergeComposer: Dates differ for " << c->name() << " and " << candidate->name() << endl; 212 DEBUG << "mergeComposer: Dates differ for " << c->name() << " and " << candidate->name() << endl;
283 continue; 213 continue;
284 } 214 }
285 } 215 }
286 matches.insert(candidate); 216 matches.insert(candidate);
295 DEBUG << "Composer has no dates, laboriously searching for all names" << endl; 225 DEBUG << "Composer has no dates, laboriously searching for all names" << endl;
296 // laboriously look for fuzzy match across _all_ composers 226 // laboriously look for fuzzy match across _all_ composers
297 for (ComposerMap::iterator i = composers.begin(); 227 for (ComposerMap::iterator i = composers.begin();
298 i != composers.end(); ++i) { 228 i != composers.end(); ++i) {
299 foreach (Composer *candidate, *i) { 229 foreach (Composer *candidate, *i) {
300 if (namesFuzzyMatch(c->name(), candidate)) { 230 if (candidate->matchCatalogueName(c->name())) {
301 DEBUG << "mergeComposer: Found fuzzy match for undated composer " << c->name() << " as " << candidate->name() << ", daringly merging" << endl; 231 DEBUG << "mergeComposer: Found fuzzy match for undated composer " << c->name() << " as " << candidate->name() << ", daringly merging" << endl;
302 matches.insert(candidate); 232 matches.insert(candidate);
303 break; 233 break;
304 } 234 }
305 } 235 }
307 } 237 }
308 } 238 }
309 239
310 if (matches.empty()) { 240 if (matches.empty()) {
311 foreach (QString candidateName, allNames) { 241 foreach (QString candidateName, allNames) {
312 QString key = makeNameKey(candidateName); 242 QString key = Composer::reduceName(candidateName);
313 composers[key].insert(c); 243 composers[key].insert(c);
314 DEBUG << "added for alias or date " << candidateName << endl; 244 DEBUG << "added for alias or date " << candidateName << endl;
315 } 245 }
316 return; 246 return;
317 } 247 }
329 other->addAlias(other->name()); 259 other->addAlias(other->name());
330 other->setName(c->name()); 260 other->setName(c->name());
331 } else { 261 } else {
332 other->addAlias(c->name()); 262 other->addAlias(c->name());
333 } 263 }
334 composers[makeNameKey(c->name())].insert(other); 264 composers[Composer::reduceName(c->name())].insert(other);
335 DEBUG << "linking from alias " << c->name() << endl; 265 DEBUG << "linking from alias " << c->name() << endl;
336 266
337 foreach (QString alias, c->aliases()) { 267 foreach (QString alias, c->aliases()) {
338 if (alias != other->name() && 268 if (alias != other->name() &&
339 !other->aliases().contains(alias)) { 269 !other->aliases().contains(alias)) {
340 other->addAlias(alias); 270 other->addAlias(alias);
341 composers[makeNameKey(alias)].insert(other); 271 composers[Composer::reduceName(alias)].insert(other);
342 DEBUG << "linking from alias " << alias << endl; 272 DEBUG << "linking from alias " << alias << endl;
343 } 273 }
344 } 274 }
345 275
346 foreach (Document *d, c->pages()) { 276 foreach (Document *d, c->pages()) {
640 foreach (Work *w, works) { 570 foreach (Work *w, works) {
641 Composition *cn = w->composition(); 571 Composition *cn = w->composition();
642 if (!cn) continue; 572 if (!cn) continue;
643 if (!cn->composer()) { 573 if (!cn->composer()) {
644 QString cname = cn->composerName(); 574 QString cname = cn->composerName();
645 QString key = makeNameKey(cname); 575 QString key = Composer::reduceName(cname);
646 if (cname != "") { 576 if (cname != "") {
647 if (!composers.contains(key)) { 577 if (!composers.contains(key)) {
648 DEBUG << "Failed to assign Composition to composer: no composer matches name " << cname << endl; 578 DEBUG << "Failed to assign Composition to composer: no composer matches name " << cname << endl;
649 } else { 579 } else {
650 QSet<Composer *> cs = composers[key]; 580 QSet<Composer *> cs = composers[key];