Chris@0: /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ Chris@0: Chris@0: #include "ImportClassicalComposersOrg.h" Chris@0: Chris@0: #include Chris@0: Chris@0: #include Chris@0: #include Chris@0: #include Chris@0: #include Chris@0: #include Chris@0: Chris@0: #include Chris@0: Chris@0: using namespace Dataquay; Chris@0: Chris@0: namespace ClassicalData { Chris@0: Chris@0: void Chris@0: ClassicalComposersOrgImporter::setSource(QUrl source) Chris@0: { Chris@0: DEBUG << "ClassicalComposersOrgImporter::setSource: " << source << endl; Chris@0: import(source); Chris@0: } Chris@0: Chris@0: typedef QMap NameMap; Chris@0: Chris@0: void Chris@0: parseNames(QString field, NameMap &names, int score = 0) Chris@0: { Chris@0: QString a(field), b(field); Chris@0: Chris@0: int mp; Chris@0: QRegExp re; Chris@0: Chris@0: /* classical-composers.org uses quite a few (not always Chris@0: * consistent) ways to indicate alternatives in composer Chris@0: * names. Not all of them are distinguishable. Chris@0: * Examples: Chris@0: * Chris@0: * Pipe used to separate sorted surname from alternative for whole: Chris@0: * Hardin | Moondog, Louis Thomas Chris@0: * -> "Louis Thomas Hardin", "Moondog" Chris@0: * Barron | Charlotte May Wind, Bebe Chris@0: * -> "Bebe Barron", "Charlotte May Wind" Chris@0: * Chris@0: * Pipe used to separate alternatives for surname only (seems Chris@0: * slightly more common than the previous one; if there is only Chris@0: * one word between the pipe and a following comma, I'd be Chris@0: * inclined to assume this case, Moondog notwithstanding): Chris@0: * Mendelssohn | Hensel, Fanny Cécile Chris@0: * -> "Fanny Cécile Mendelssohn", "Fanny Cécile Hensel" Chris@0: * Erskine, 6th Earl of Kellie | Kelly, Thomas Alexander Chris@0: * -> "Thomas Alexander Erskine, 6th Earl of Kellie", Chris@0: * "Thomas Alexander Kelly" Chris@0: * Chris@0: * Round brackets used to indicate one or more alternatives for Chris@0: * prior word; slash for alternation: Chris@0: * Edelmann, Jean-Frédéric (Johann-Friedrich) Chris@0: * -> "Jean-Frédéric Edelmann", "Johann-Friedrich Edelmann" Chris@0: * Eberwein, Max (Traugott Maximilian) Chris@0: * -> "Max Eberwein", "Traugott Maximilian Eberwein" Chris@0: * Mahaut | Mahault | Mahoti | Mahout, Antoine (Anton/Antonio) Chris@0: * -> "Antoine Mahaut", "Antoine Mahault", "Antoine Mahoti", Chris@0: * "Antoine Mahout", "Anton Mahaut", "Anton Mahault", Chris@0: * "Anton Mahoti", "Anton Mahout", "Antonio Mahaut", Chris@0: * "Antonio Mahault", "Antonio Mahoti", "Antonio Mahout" Chris@0: * Chris@0: * Round brackets used to indicate alternative to prior Chris@0: * names, with some meaning left implicit: Chris@0: * Kaan | Kaan-Albest, Jindrich z Albestu (Heinrich) Chris@0: * -> "Jindrich z Albestu Kaan", "Heinrich Kaan-Albest", Chris@0: * perhaps "Heinrich Kaan" (but not "Jindrich z Albestu Chris@0: * Kaan-Albest") Chris@0: * Chris@0: * Round brackets used to augment rather than Chris@0: * alternate. Probably can't identify this reliably, though Chris@0: * round brackets used somewhere other than at end of line Chris@0: * are relatively likely to be this form (?): Chris@0: * Linley (the elder), Thomas Chris@0: * -> "Thomas Linley", "Thomas Linley the elder" Chris@0: * Keys | Keyes, Ivor (Christopher Banfield) Chris@0: * -> "Ivor Keys", "Ivor Keyes", "Ivor Christopher Banfield Keys", Chris@0: * "Ivor Christopher Banfield Keyes" Chris@0: * Chris@0: * Square brackets used to indicate alternative for all Chris@0: * forenames: Chris@0: * Moller | Möller, John Christopher [Johann Christoph] Chris@0: * -> "John Christopher Moller", "John Christopher Möller", Chris@0: * "Johann Christoph Moller", "Johann Christoph Möller" Chris@0: * Chris@0: * Complicated examples: Chris@0: * Mayr | Mayer, (Johann) Simon [Giovanni Simone] Chris@0: * -> "Simon Mayr", "Simon Mayer", "Johann Simon Mayr", Chris@0: * "Johann Simon Mayer", "Giovanni Simone Mayr", Chris@0: * "Geovanni Simone Mayer" (but not "Johann Giovanni Simone Mayr") Chris@0: * Frauenlob | Heinrich von Meissen Chris@0: * -> "Heinrich Frauenlob", "Heinrich von Meissen", or Chris@0: * perhaps "Frauenlob" (but not "Heinrich von Meissen Frauenlob") Chris@0: */ Chris@0: Chris@0: // DEBUG << "parseNames: field = " << field << ", names contains " << names.size() << " item(s)" << endl; Chris@0: Chris@0: // round brackets used for augmentation right at the start Chris@0: re = QRegExp("\\(([^\\)]+)\\) "); Chris@0: if ((mp = re.indexIn(field)) >= 0) { Chris@0: int ml = re.matchedLength(); Chris@0: QString c(re.cap(1)); Chris@0: a.replace(mp, ml, ""); Chris@0: b.replace(mp, ml, QString("%1 ").arg(c)); Chris@0: parseNames(a, names, score); Chris@0: parseNames(b, names, score+1); Chris@0: return; Chris@0: } Chris@0: Chris@0: // round brackets used for augmentation directly after the comma Chris@0: re = QRegExp(", \\(([^\\)]+)\\)"); Chris@0: if ((mp = re.indexIn(field)) >= 0) { Chris@0: int ml = re.matchedLength(); Chris@0: QString c(re.cap(1)); Chris@0: a.replace(mp, ml, ","); Chris@0: b.replace(mp, ml, QString(", %1").arg(c)); Chris@0: parseNames(a, names, score); Chris@0: parseNames(b, names, score+1); Chris@0: return; Chris@0: } Chris@0: Chris@0: // round brackets used for augmentation directly before the comma Chris@0: re = QRegExp(" \\(([^\\)]+)\\),"); Chris@0: if ((mp = re.indexIn(field)) >= 0) { Chris@0: int ml = re.matchedLength(); Chris@0: QString c(re.cap(1)); Chris@0: a.replace(mp, ml, ","); Chris@0: b.replace(mp, ml, QString(" %1,").arg(c)); Chris@0: parseNames(a, names, score); Chris@0: parseNames(b, names, score+1); Chris@0: return; Chris@0: } Chris@0: Chris@0: // round brackets for alternation of single name, anywhere Chris@0: re = QRegExp("([^\\[\\(, |]+) \\(([^\\)]+)\\)"); Chris@0: if ((mp = re.indexIn(field)) >= 0) { Chris@0: int ml = re.matchedLength(); Chris@0: QString c(re.cap(1)); Chris@0: QString d(re.cap(2)); Chris@0: a.replace(mp, ml, c); Chris@0: b.replace(mp, ml, d); Chris@0: parseNames(a, names, score); Chris@0: parseNames(b, names, score+1); Chris@0: return; Chris@0: } Chris@0: Chris@0: // square brackets for alternation of a series of names, at end or after pipe Chris@0: re = QRegExp("([,|]) ([^\\[|,]+) \\[([^\\]]+)\\]"); Chris@0: if ((mp = re.indexIn(field)) >= 0) { Chris@0: int ml = re.matchedLength(); Chris@0: QString p(re.cap(1)); Chris@0: QString c(re.cap(2)); Chris@0: QString d(re.cap(3)); Chris@0: a.replace(mp, ml, QString("%1 %2").arg(p).arg(c)); Chris@0: b.replace(mp, ml, QString("%1 %2").arg(p).arg(d)); Chris@0: parseNames(a, names, score); Chris@0: parseNames(b, names, score+1); Chris@0: return; Chris@0: } Chris@0: Chris@0: // square brackets for alternation of a series of names, at start Chris@0: re = QRegExp("^([^\\[,]+) \\[([^\\]]+)\\]"); Chris@0: if ((mp = re.indexIn(field)) >= 0) { Chris@0: int ml = re.matchedLength(); Chris@0: QString c(re.cap(1)); Chris@0: QString d(re.cap(2)); Chris@0: a.replace(mp, ml, c); Chris@0: b.replace(mp, ml, d); Chris@0: parseNames(a, names, score); Chris@0: parseNames(b, names, score+1); Chris@0: return; Chris@0: } Chris@0: Chris@0: // slash for alternation of word Chris@0: re = QRegExp("([^ ,|]+)/([^ ,|]+)"); Chris@0: if ((mp = re.indexIn(field)) >= 0) { Chris@0: int ml = re.matchedLength(); Chris@0: QString c(re.cap(1)); Chris@0: QString d(re.cap(2)); Chris@0: a.replace(mp, ml, c); Chris@0: b.replace(mp, ml, d); Chris@0: parseNames(a, names, score); Chris@0: parseNames(b, names, score+1); Chris@0: return; Chris@0: } Chris@0: Chris@0: // pipe for alternation of surname Chris@0: re = QRegExp("^(.*) \\| ([^|, ]+),"); Chris@0: if ((mp = re.indexIn(field)) >= 0) { Chris@0: int ml = re.matchedLength(); Chris@0: QString c(re.cap(1)); Chris@0: QString d(re.cap(2)); Chris@0: a.replace(mp, ml, c + ","); Chris@0: b.replace(mp, ml, d + ","); Chris@0: parseNames(a, names, score); Chris@0: parseNames(b, names, score+1); Chris@0: return; Chris@0: } Chris@0: Chris@0: // pipe for alternation of whole (before comma) Chris@0: re = QRegExp("^(.*) \\| ([^|,]+),"); Chris@0: if ((mp = re.indexIn(field)) >= 0) { Chris@0: int ml = re.matchedLength(); Chris@0: QString c(re.cap(1)); Chris@0: QString d(re.cap(2)); Chris@0: a.replace(mp, ml, c + ","); Chris@0: b = d; Chris@0: parseNames(a, names, score); Chris@0: parseNames(b, names, score+1); Chris@0: return; Chris@0: } Chris@0: Chris@0: // pipe for alternation of whole (at end) Chris@0: re = QRegExp("^(.*) \\| ([^|,]+)$"); Chris@0: if ((mp = re.indexIn(field)) >= 0) { Chris@0: int ml = re.matchedLength(); Chris@0: QString c(re.cap(1)); Chris@0: QString d(re.cap(2)); Chris@0: a.replace(mp, ml, c); Chris@0: b.replace(mp, ml, d); Chris@0: parseNames(a, names, score); Chris@0: parseNames(b, names, score+1); Chris@0: return; Chris@0: } Chris@0: Chris@0: // comma Chris@0: re = QRegExp("^(.+), ([^,]+)$"); Chris@0: if ((mp = re.indexIn(field)) >= 0) { Chris@0: QString c(re.cap(1)); Chris@0: QString d(re.cap(2)); Chris@0: parseNames(d + " " + c, names, score+1); Chris@0: // fall through to add Chris@0: } Chris@0: Chris@4: field.replace("(", ""); Chris@4: field.replace(")", ""); Chris@4: Chris@0: names[field] = score; Chris@0: } Chris@0: Chris@0: void Chris@0: ClassicalComposersOrgImporter::import(QUrl source) Chris@0: { Chris@0: int i = 0; Chris@0: Chris@0: //!!! for now Chris@0: QString filename = source.toLocalFile(); Chris@0: Chris@0: Chris@0: QFile file(filename); Chris@0: if (!file.open(QFile::ReadOnly | QFile::Text)) { Chris@0: throw std::exception(); Chris@0: } Chris@0: Chris@0: QTextStream stream(&file); Chris@0: stream.setCodec("UTF-8"); Chris@0: QString all = stream.readAll(); Chris@0: Chris@0: all.replace(QRegExp("^.*
"), ""); Chris@0: Chris@0: QRegExp matcher Chris@1: (QString::fromUtf8("
  • ([^<]+)(([^<]*))? \\((\\*?)([0-9]+)([^0-9)]*)([0-9]+)?\\)\\s*([^\\s])?
  • ")); Chris@0: Chris@0: int pos = 0, count = 0; Chris@0: while ((pos = matcher.indexIn(all, pos)) != -1) { Chris@0: Chris@0: pos += matcher.matchedLength(); Chris@0: ++count; Chris@0: Chris@0: QString page = matcher.cap(1); Chris@0: QString name = matcher.cap(2); Chris@1: QString star = matcher.cap(5); Chris@0: QString birth = matcher.cap(6); Chris@1: QString dagger = matcher.cap(7); Chris@1: QString death = matcher.cap(8); Chris@1: QString female = matcher.cap(9); Chris@0: Chris@0: DEBUG << "Item " << count Chris@0: << ": page = " << page Chris@0: << ", name = " << name Chris@0: << ", birth = " << birth Chris@0: << ", death = " << death Chris@0: << ", female = " << female; Chris@0: Chris@0: QString namefield = name.trimmed(); Chris@0: NameMap names; Chris@0: Chris@4: if (namefield.contains("P.D.Q.")) { // lose this joke Chris@4: continue; Chris@4: } Chris@4: Chris@0: parseNames(namefield, names); Chris@0: Chris@0: i = 0; Chris@0: QString preferred; Chris@0: foreach (QString n, names.keys()) { Chris@0: if (preferred == "" || names[n] == 0) preferred = n; Chris@0: DEBUG << "Name " << i << ": " << n << " score " << names[n] << endl; Chris@0: ++i; Chris@0: } Chris@0: Chris@0: if (names.empty()) { Chris@0: DEBUG << "No name!" << endl; Chris@0: continue; Chris@0: } Chris@0: Chris@0: Composer *composer = new Composer(); Chris@0: composer->setName(preferred); Chris@0: foreach (QString n, names.keys()) { Chris@0: if (n != preferred) composer->addAlias(n); Chris@0: } Chris@0: Chris@0: if (page != "") { Chris@0: Document *d = new Document; Chris@18: d->setUri(Uri("http://www.classical-composers.org" + page)); Chris@0: d->setTopic(composer); Chris@0: d->setSiteName("Classical Composers Database"); Chris@0: composer->addPage(d); Chris@0: } Chris@1: Chris@1: if (birth != "" && death == "") { Chris@1: if (star == "" && dagger != QString::fromUtf8("\342\200\240")) { Chris@1: DEBUG << "Unexpected \"dagger\" character" << dagger << endl; Chris@1: birth = ""; Chris@1: } Chris@1: if (star == "" && dagger == "") { Chris@1: DEBUG << "Only one date in date range (" << birth << "), but no star or dagger -- ignoring" << endl; Chris@1: birth = ""; Chris@1: } else if (star != "" && dagger != "") { Chris@1: DEBUG << "Date range features both star and dagger -- ignoring" << endl; Chris@1: birth = ""; Chris@1: } else if (dagger != "") { Chris@1: DEBUG << "dagger found: setting death to " << birth << endl; Chris@1: death = birth; Chris@1: birth = ""; Chris@1: } Chris@1: } Chris@1: Chris@0: if (birth != "") { Chris@0: Birth *e = new Birth(birth.toInt()); Chris@0: composer->setBirth(e); Chris@0: } Chris@0: if (death != "") { Chris@0: composer->setDeath(new Death(death.toInt())); Chris@0: } Chris@0: if (female != "") { Chris@0: composer->setGender("female"); Chris@20: } else { Chris@20: composer->setGender("male"); Chris@20: } Chris@0: Chris@0: m_objects.push_back(composer); Chris@0: } Chris@0: Chris@0: DEBUG << "Found " << count << " things" << endl; Chris@0: Chris@0: } Chris@0: Chris@0: Chris@0: } Chris@0: