Mercurial > hg > classical
view import/ImportClassicalComposersOrg.cpp @ 53:bcea875d8d2f tip
More build fixes
author | Chris Cannam |
---|---|
date | Thu, 16 Oct 2014 19:03:51 +0100 |
parents | c4cb65c436ef |
children |
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

#include "ImportClassicalComposersOrg.h"

#include <dataquay/Debug.h>

#include <QFile>
#include <QFileInfo>
#include <QTextStream>
#include <QRegExp>
#include <QVariant>

#include <exception>

using namespace Dataquay;

namespace ClassicalData {

/**
 * Set the source location and immediately import from it.  Any
 * exception thrown by import() propagates to the caller.
 */
void
ClassicalComposersOrgImporter::setSource(QUrl source)
{
    DEBUG << "ClassicalComposersOrgImporter::setSource: " << source << endl;
    import(source);
}

/** Map from a name variant to its score (0 for the primary reading). */
typedef QMap<QString, int> NameMap;

/**
 * Expand a composer name field, written in the notation described
 * below, into the individual name variants it stands for, and record
 * each in \a names together with a score.
 *
 * The function handles one piece of notation per call: the first
 * matching rule (in the order they appear below) splits the field
 * into two simpler fields, which are parsed recursively.  The first
 * alternative keeps the current \a score, the second gets score + 1.
 * A field with no notation left is stored as-is (with any stray
 * round brackets removed); if it has the form "Surname, Forenames",
 * the reordered form "Forenames Surname" is stored as well, with
 * score + 1.
 *
 * If the same variant is produced more than once, the score written
 * last replaces any earlier one.
 *
 * \param field  The (remaining) name field to expand.
 * \param names  Receives the variants; existing entries are kept
 *               unless overwritten by an identical key.
 * \param score  Score to assign to the primary reading of \a field.
 */
void
parseNames(QString field, NameMap &names, int score = 0)
{
    QString a(field), b(field);

    int mp;
    QRegExp re;

    /* classical-composers.org uses quite a few (not always
     * consistent) ways to indicate alternatives in composer
     * names.  Not all of them are distinguishable.
     * Examples:
     *
     * Pipe used to separate sorted surname from alternative for whole:
     *   Hardin | Moondog, Louis Thomas
     *     -> "Louis Thomas Hardin", "Moondog"
     *   Barron | Charlotte May Wind, Bebe
     *     -> "Bebe Barron", "Charlotte May Wind"
     *
     * Pipe used to separate alternatives for surname only (seems
     * slightly more common than the previous one; if there is only
     * one word between the pipe and a following comma, I'd be
     * inclined to assume this case, Moondog notwithstanding):
     *   Mendelssohn | Hensel, Fanny Cécile
     *     -> "Fanny Cécile Mendelssohn", "Fanny Cécile Hensel"
     *   Erskine, 6th Earl of Kellie | Kelly, Thomas Alexander
     *     -> "Thomas Alexander Erskine, 6th Earl of Kellie",
     *        "Thomas Alexander Kelly"
     *
     * Round brackets used to indicate one or more alternatives for
     * prior word; slash for alternation:
     *   Edelmann, Jean-Frédéric (Johann-Friedrich)
     *     -> "Jean-Frédéric Edelmann", "Johann-Friedrich Edelmann"
     *   Eberwein, Max (Traugott Maximilian)
     *     -> "Max Eberwein", "Traugott Maximilian Eberwein"
     *   Mahaut | Mahault | Mahoti | Mahout, Antoine (Anton/Antonio)
     *     -> "Antoine Mahaut", "Antoine Mahault", "Antoine Mahoti",
     *        "Antoine Mahout", "Anton Mahaut", "Anton Mahault",
     *        "Anton Mahoti", "Anton Mahout", "Antonio Mahaut",
     *        "Antonio Mahault", "Antonio Mahoti", "Antonio Mahout"
     *
     * Round brackets used to indicate alternative to prior
     * names, with some meaning left implicit:
     *   Kaan | Kaan-Albest, Jindrich z Albestu (Heinrich)
     *     -> "Jindrich z Albestu Kaan", "Heinrich Kaan-Albest",
     *        perhaps "Heinrich Kaan" (but not "Jindrich z Albestu
     *        Kaan-Albest")
     *
     * Round brackets used to augment rather than
     * alternate.  Probably can't identify this reliably, though
     * round brackets used somewhere other than at end of line
     * are relatively likely to be this form (?):
     *   Linley (the elder), Thomas
     *     -> "Thomas Linley", "Thomas Linley the elder"
     *   Keys | Keyes, Ivor (Christopher Banfield)
     *     -> "Ivor Keys", "Ivor Keyes", "Ivor Christopher Banfield Keys",
     *        "Ivor Christopher Banfield Keyes"
     *
     * Square brackets used to indicate alternative for all
     * forenames:
     *   Moller | Möller, John Christopher [Johann Christoph]
     *     -> "John Christopher Moller", "John Christopher Möller",
     *        "Johann Christoph Moller", "Johann Christoph Möller"
     *
     * Complicated examples:
     *   Mayr | Mayer, (Johann) Simon [Giovanni Simone]
     *     -> "Simon Mayr", "Simon Mayer", "Johann Simon Mayr",
     *        "Johann Simon Mayer", "Giovanni Simone Mayr",
     *        "Giovanni Simone Mayer" (but not "Johann Giovanni Simone Mayr")
     *   Frauenlob | Heinrich von Meissen
     *     -> "Heinrich Frauenlob", "Heinrich von Meissen", or
     *        perhaps "Frauenlob" (but not "Heinrich von Meissen Frauenlob")
     */

//    DEBUG << "parseNames: field = " << field << ", names contains " << names.size() << " item(s)" << endl;

    // round brackets used for augmentation right at the start
    re = QRegExp("\\(([^\\)]+)\\) ");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString c(re.cap(1));
        a.replace(mp, ml, "");
        b.replace(mp, ml, QString("%1 ").arg(c));
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // round brackets used for augmentation directly after the comma
    re = QRegExp(", \\(([^\\)]+)\\)");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString c(re.cap(1));
        a.replace(mp, ml, ",");
        b.replace(mp, ml, QString(", %1").arg(c));
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // round brackets used for augmentation directly before the comma
    re = QRegExp(" \\(([^\\)]+)\\),");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString c(re.cap(1));
        a.replace(mp, ml, ",");
        b.replace(mp, ml, QString(" %1,").arg(c));
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // round brackets for alternation of single name, anywhere
    re = QRegExp("([^\\[\\(, |]+) \\(([^\\)]+)\\)");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString c(re.cap(1));
        QString d(re.cap(2));
        a.replace(mp, ml, c);
        b.replace(mp, ml, d);
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // square brackets for alternation of a series of names, at end
    // or after pipe
    re = QRegExp("([,|]) ([^\\[|,]+) \\[([^\\]]+)\\]");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString p(re.cap(1));
        QString c(re.cap(2));
        QString d(re.cap(3));
        a.replace(mp, ml, QString("%1 %2").arg(p).arg(c));
        b.replace(mp, ml, QString("%1 %2").arg(p).arg(d));
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // square brackets for alternation of a series of names, at start
    re = QRegExp("^([^\\[,]+) \\[([^\\]]+)\\]");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString c(re.cap(1));
        QString d(re.cap(2));
        a.replace(mp, ml, c);
        b.replace(mp, ml, d);
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // slash for alternation of word
    re = QRegExp("([^ ,|]+)/([^ ,|]+)");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString c(re.cap(1));
        QString d(re.cap(2));
        a.replace(mp, ml, c);
        b.replace(mp, ml, d);
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // pipe for alternation of surname
    re = QRegExp("^(.*) \\| ([^|, ]+),");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString c(re.cap(1));
        QString d(re.cap(2));
        a.replace(mp, ml, c + ",");
        b.replace(mp, ml, d + ",");
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // pipe for alternation of whole (before comma)
    re = QRegExp("^(.*) \\| ([^|,]+),");
    if ((mp = re.indexIn(field)) >= 0) {
        QString c(re.cap(1));
        QString d(re.cap(2));
        a.replace(mp, re.matchedLength(), c + ",");
        // the alternative stands for the whole name, so it replaces
        // the entire field rather than just the matched part
        b = d;
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // pipe for alternation of whole (at end)
    re = QRegExp("^(.*) \\| ([^|,]+)$");
    if ((mp = re.indexIn(field)) >= 0) {
        int ml = re.matchedLength();
        QString c(re.cap(1));
        QString d(re.cap(2));
        a.replace(mp, ml, c);
        b.replace(mp, ml, d);
        parseNames(a, names, score);
        parseNames(b, names, score+1);
        return;
    }

    // comma: also record the "Forenames Surname" ordering
    re = QRegExp("^(.+), ([^,]+)$");
    if ((mp = re.indexIn(field)) >= 0) {
        QString c(re.cap(1));
        QString d(re.cap(2));
        parseNames(d + " " + c, names, score+1);
        // fall through to add
    }

    // no notation left: drop any unmatched brackets and store
    field.replace("(", "");
    field.replace(")", "");

    names[field] = score;
}

/**
 * Read the composer list page found at the local file named by
 * \a source, and append one Composer to m_objects for each list item
 * recognised in it.
 *
 * For each item, the name field is expanded with parseNames(); one
 * variant becomes the composer's name and the rest become aliases.
 * The page link (if any), birth and death years, and gender are set
 * from the remaining captured fields.  Items whose name contains
 * "P.D.Q." are skipped.
 *
 * \throws std::exception if the file cannot be opened for reading.
 */
void
ClassicalComposersOrgImporter::import(QUrl source)
{
    //!!! for now
    QString filename = source.toLocalFile();

    QFile file(filename);
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
        DEBUG << "ClassicalComposersOrgImporter::import: Failed to open file "
              << filename << " for reading" << endl;
        throw std::exception();
    }

    QTextStream stream(&file);
    stream.setCodec("UTF-8");
    QString all = stream.readAll();

    // discard everything up to and including the opening of the main div
    all.replace(QRegExp("^.*<div id=\"main\">"), "");

    // Captures: 1 = page link, 2 = name field, 5 = star, 6 = first
    // year, 7 = text between the years, 8 = second year, 9 = trailing
    // character used as the "female" marker
    QRegExp matcher
        (QString::fromUtf8("<li><a href=\"([^\"]+)\">([^<]+)(<small>([^<]*)</small>)?</a> \\((\\*?)([0-9]+)([^0-9)]*)([0-9]+)?\\)\\s*([^\\s])?</li>"));

    int pos = 0, count = 0;
    while ((pos = matcher.indexIn(all, pos)) != -1) {

        pos += matcher.matchedLength();
        ++count;

        QString page = matcher.cap(1);
        QString name = matcher.cap(2);
        QString star = matcher.cap(5);
        QString birth = matcher.cap(6);
        QString dagger = matcher.cap(7);
        QString death = matcher.cap(8);
        QString female = matcher.cap(9);

        DEBUG << "Item " << count
              << ": page = " << page
              << ", name = " << name
              << ", birth = " << birth
              << ", death = " << death
              << ", female = " << female << endl;

        QString namefield = name.trimmed();
        NameMap names;

        if (namefield.contains("P.D.Q.")) {
            // lose this joke
            continue;
        }

        parseNames(namefield, names);

        // Take the first variant as a default, but let any variant
        // with score 0 override it
        int i = 0;
        QString preferred;
        foreach (QString n, names.keys()) {
            if (preferred == "" || names[n] == 0) preferred = n;
            DEBUG << "Name " << i << ": " << n << " score " << names[n] << endl;
            ++i;
        }

        if (names.empty()) {
            DEBUG << "No name!" << endl;
            continue;
        }

        Composer *composer = new Composer();
        composer->setName(preferred);
        foreach (QString n, names.keys()) {
            if (n != preferred) composer->addAlias(n);
        }

        if (page != "") {
            Document *d = new Document;
            d->setUri(Uri("http://www.classical-composers.org" + page));
            d->setTopic(composer);
            d->setSiteName("Classical Composers Database");
            composer->addPage(d);
        }

        // Only one year present: decide from the star and dagger
        // markers whether it is a birth or a death year.  Exactly one
        // branch applies, so each entry produces at most one message.
        if (birth != "" && death == "") {
            if (star != "") {
                if (dagger != "") {
                    DEBUG << "Date range features both star and dagger -- ignoring" << endl;
                    birth = "";
                }
                // else: star only, so the year stays as the birth year
            } else if (dagger == "") {
                DEBUG << "Only one date in date range (" << birth << "), but no star or dagger -- ignoring" << endl;
                birth = "";
            } else if (dagger != QString::fromUtf8("\342\200\240")) {
                DEBUG << "Unexpected \"dagger\" character" << dagger << endl;
                birth = "";
            } else {
                DEBUG << "dagger found: setting death to " << birth << endl;
                death = birth;
                birth = "";
            }
        }

        if (birth != "") {
            Birth *e = new Birth(birth.toInt());
            composer->setBirth(e);
        }

        if (death != "") {
            composer->setDeath(new Death(death.toInt()));
        }

        if (female != "") {
            composer->setGender("female");
        } else {
            composer->setGender("male");
        }

        m_objects.push_back(composer);
    }

    DEBUG << "Found " << count << " things" << endl;
}

}