annotate import/ImportWikipediaComposers.cpp @ 53:bcea875d8d2f tip

More build fixes
author Chris Cannam
date Thu, 16 Oct 2014 19:03:51 +0100
parents c8ef23d3888c
children
rev   line source
Chris@0 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@0 2
Chris@0 3 #include "ImportWikipediaComposers.h"
Chris@0 4
Chris@0 5 #include <dataquay/Debug.h>
Chris@0 6
Chris@0 7 #include <QFile>
Chris@0 8 #include <QFileInfo>
Chris@0 9 #include <QTextStream>
Chris@0 10 #include <QRegExp>
Chris@0 11 #include <QVariant>
Chris@0 12
Chris@0 13 #include <exception>
Chris@0 14
Chris@0 15 using namespace Dataquay;
Chris@0 16
Chris@0 17 namespace ClassicalData {
Chris@0 18
Chris@0 19 void
Chris@0 20 WikipediaComposersImporter::setSource(QUrl source)
Chris@0 21 {
Chris@0 22 DEBUG << "WikipediaComposersImporter::setSource: " << source << endl;
Chris@0 23 import(source);
Chris@0 24 }
Chris@0 25
Chris@0 26 Composer *
Chris@0 27 addComposer(QString namefield, QString birthfield, QString deathfield,
Chris@0 28 QString datesfield, QString nationalityfield, QString worksfield,
Chris@0 29 QString summaryfield)
Chris@0 30 {
Chris@0 31 namefield = namefield.trimmed();
Chris@0 32 birthfield = birthfield.trimmed();
Chris@0 33 deathfield = deathfield.trimmed();
Chris@0 34 datesfield = datesfield.trimmed();
Chris@0 35 nationalityfield = nationalityfield.trimmed();
Chris@0 36 worksfield = worksfield.trimmed();
Chris@0 37 summaryfield = summaryfield.trimmed();
Chris@0 38
Chris@0 39 Composer *composer = new Composer();
Chris@0 40
Chris@0 41 QString name = namefield;
Chris@0 42 name.replace("[[", "");
Chris@0 43 name.replace("]]", "");
Chris@0 44 QString pagename = name;
Chris@0 45
Chris@0 46 if (name.contains('|')) {
Chris@0 47 QStringList bits = name.split('|');
Chris@0 48 pagename = bits[0];
Chris@0 49 name = bits[1];
Chris@0 50 }
Chris@0 51
Chris@0 52 composer->setName(name);
Chris@0 53
Chris@0 54 pagename.replace(" ", "_");
Chris@0 55 QUrl url;
Chris@0 56 url.setScheme("http");
Chris@0 57 url.setHost("en.wikipedia.org");
Chris@0 58
Chris@0 59 url.setPath("/wiki/" + QUrl::toPercentEncoding(pagename));
Chris@0 60 Document *d = new Document;
Chris@18 61 d->setUri(Uri(url));
Chris@0 62 d->setSiteName("Wikipedia");
Chris@0 63 d->setTopic(composer);
Chris@0 64 composer->addPage(d);
Chris@0 65
Chris@0 66 if (datesfield.contains("fl.")) datesfield = ""; // "flourished", meaningless for us at the moment
Chris@0 67
Chris@0 68 bool approx = (datesfield.contains("c.") || datesfield.contains("?")
Chris@0 69 || datesfield.contains("before") || datesfield.contains("after"));
Chris@0 70
Chris@0 71 if (datesfield != "") {
Chris@0 72 DEBUG << "dates for " << name << ": " << datesfield << endl;
Chris@0 73 datesfield.replace("(", "");
Chris@0 74 datesfield.replace(")", "");
Chris@0 75 datesfield.replace(" ", "");
Chris@0 76 datesfield.replace(QString::fromUtf8("\342\200\222"), "-");
Chris@0 77 datesfield.replace(QString::fromUtf8("\342\200\223"), "-");
Chris@0 78 datesfield.replace(QString::fromUtf8("\342\200\224"), "-");
Chris@0 79 datesfield.replace(QString::fromUtf8("\342\200\225"), "-");
Chris@0 80 datesfield.replace("--", "-");
Chris@0 81 DEBUG << "dates for " << name << ": " << datesfield << endl;
Chris@0 82
Chris@0 83 QRegExp birthRe1("([0-9][0-9][0-9][0-9])(/[0-9]+|to[0-9]+)?-");
Chris@0 84 QRegExp birthRe2("b\\.(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])(/[0-9]+|to[0-9]+)?");
Chris@0 85
Chris@0 86 if (birthRe1.indexIn(datesfield) >= 0) birthfield = birthRe1.cap(1);
Chris@0 87 else if (birthRe2.indexIn(datesfield) >= 0) birthfield = birthRe2.cap(2);
Chris@0 88
Chris@0 89 QRegExp deathRe1("-(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])");
Chris@0 90 QRegExp deathRe2("d\\.(c\\.|\\?|[a-z]+)?([0-9][0-9][0-9][0-9])");
Chris@0 91
Chris@0 92 if (deathRe1.indexIn(datesfield) >= 0) deathfield = deathRe1.cap(2);
Chris@0 93 else if (deathRe2.indexIn(datesfield) >= 0) deathfield = deathRe2.cap(2);
Chris@0 94
Chris@0 95 // datesfield.replace(QRegExp("[^0-9]+"), "-");
Chris@0 96 /*
Chris@0 97 QStringList list = datesfield.split('-');
Chris@0 98 if (!list.empty()) {
Chris@0 99 birthfield = list[0];
Chris@0 100 if (list.size() > 1) {
Chris@0 101 deathfield = list[1];
Chris@0 102 }
Chris@0 103 }
Chris@0 104 */
Chris@0 105 DEBUG << " -> dates normalised to " << birthfield << " to " << deathfield << " approx =" << approx << endl;
Chris@0 106 }
Chris@0 107 if (birthfield != "") {
Chris@0 108 Birth *e = new Birth(birthfield.toInt());
Chris@0 109 e->setApproximate(approx);
Chris@0 110 composer->setBirth(e);
Chris@0 111 }
Chris@0 112 if (deathfield != "") {
Chris@0 113 Death *e = new Death(deathfield.toInt());
Chris@0 114 e->setApproximate(approx);
Chris@0 115 composer->setDeath(e);
Chris@0 116 }
Chris@0 117 if (nationalityfield != "") {
Chris@4 118 composer->addNationality(nationalityfield);
Chris@0 119 }
Chris@0 120 if (summaryfield != "") {
Chris@0 121 summaryfield.replace(QRegExp("^[Cc]omposer, *"), "");
Chris@0 122 summaryfield[0] = summaryfield[0].toUpper();
Chris@0 123 summaryfield.replace(QRegExp("\\[\\[[^]\\|]+\\|"), "[[");
Chris@0 124 summaryfield.replace("[[", "");
Chris@0 125 summaryfield.replace("]]", "");
Chris@0 126 summaryfield.replace("''", "\"");
Chris@0 127 summaryfield.replace("&quot;", "'");
Chris@0 128 summaryfield.replace(QRegExp("^\\[[^]]*\\]$"), "");
Chris@0 129 summaryfield.replace("[", "");
Chris@0 130 summaryfield.replace("]", "");
Chris@0 131 composer->setRemarks(summaryfield);
Chris@0 132 }
Chris@0 133
Chris@0 134 return composer;
Chris@0 135 }
Chris@0 136
Chris@0 137 void
Chris@0 138 WikipediaComposersImporter::import(QUrl source)
Chris@0 139 {
Chris@0 140 //!!! for now
Chris@0 141 QString filename = source.toLocalFile();
Chris@0 142
Chris@0 143 QFile file(filename);
Chris@0 144 if (!file.open(QFile::ReadOnly | QFile::Text)) {
Chris@0 145 throw std::exception();
Chris@0 146 }
Chris@0 147
Chris@0 148 QTextStream stream(&file);
Chris@0 149 stream.setCodec("UTF-8");
Chris@0 150
Chris@0 151 QString period;
Chris@0 152 DEBUG << "source = " << source.toString() << endl;
Chris@0 153 QRegExp pmatcher1("List_of_([0-9][^_-]+[_-][^_-]+)_");
Chris@0 154 QRegExp pmatcher2("List_of_([^_-]+)[_-]era_");
Chris@0 155 QRegExp pmatcher3("([^_-]+)_composers");
Chris@0 156 if (pmatcher1.indexIn(source.toString()) >= 0) period = pmatcher1.cap(1);
Chris@0 157 else if (pmatcher2.indexIn(source.toString()) >= 0) period = pmatcher2.cap(1);
Chris@0 158 else if (pmatcher3.indexIn(source.toString()) >= 0) period = pmatcher3.cap(1);
Chris@0 159 DEBUG << "period = "<< period << endl;
Chris@0 160
Chris@0 161 int count = 0;
Chris@0 162
Chris@0 163 // table form A (used of e.g. Romantic transitional composers)
Chris@0 164 // | Name || birth || death || nationality || summary || flags
Chris@0 165 // note: 5x ||
Chris@0 166 QRegExp matcher1("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\|([^|]*[0-9]) *\\|\\|([^|]*[0-9]) *\\|\\|([^|]*)\\|\\|(.*)\\|\\|");
Chris@0 167
Chris@0 168 // table form B (used of e.g. 20th-century composers)
Chris@0 169 // | Name || birth-[death] || nationality || notable works || remarks
Chris@0 170 // Note name may contain a single | if in double-square brackets, hence 2a
Chris@0 171 // note: 4x ||
Chris@0 172 QRegExp matcher2("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\| *([0-9]+) *[^0-9] *([0-9]*) *\\|\\|([^0-9|]*)\\|\\|(.*)\\|\\|(.*)");
Chris@0 173 // just in case the final column has been omitted completely (as happens).
Chris@0 174 // this must be matched after matcher2
Chris@0 175 QRegExp matcher2a("^\\| *\\[\\[([^]]+)\\]\\] *\\|\\| *([0-9]+) *[^0-9] *([0-9]*) *\\|\\|([^0-9|]*)\\|\\|(.*)");
Chris@0 176
Chris@0 177 // list form
Chris@0 178 // * [[Name]] [alias?] (stuff about dates)[,] notes
Chris@0 179 QRegExp matcher3("^\\* *\\[\\[([^\\]]+)\\]\\],? *([^\\(]*) *\\(([^\\)]+)\\)(,?) *(.*)");
Chris@0 180
Chris@0 181 while (!stream.atEnd()) {
Chris@0 182 QString line = stream.readLine();
Chris@0 183
Chris@0 184 Composer *o = 0;
Chris@0 185
Chris@0 186 if (matcher1.indexIn(line) >= 0) {
Chris@0 187
Chris@0 188 o = addComposer(matcher1.cap(1), matcher1.cap(2), matcher1.cap(3),
Chris@0 189 "", matcher1.cap(4), "", matcher1.cap(5));
Chris@0 190
Chris@0 191 } else if (matcher2.indexIn(line) >= 0) {
Chris@0 192
Chris@0 193 o = addComposer(matcher2.cap(1), matcher2.cap(2), matcher2.cap(3), "",
Chris@0 194 matcher2.cap(4), matcher2.cap(5), "");
Chris@0 195
Chris@0 196 } else if (matcher2a.indexIn(line) >= 0) {
Chris@0 197
Chris@0 198 o = addComposer(matcher2a.cap(1), matcher2a.cap(2), matcher2a.cap(3), "",
Chris@0 199 matcher2a.cap(4), "", "");
Chris@0 200
Chris@0 201 } else if (matcher3.indexIn(line) >= 0) {
Chris@0 202
Chris@0 203 o = addComposer(matcher3.cap(1), "", "", matcher3.cap(3),
Chris@0 204 "", "", matcher3.cap(5));
Chris@0 205
Chris@0 206 } else if (line.startsWith("* ") || line.startsWith("| ") ||
Chris@0 207 line.startsWith("*[") || line.startsWith("|[")) {
Chris@0 208 DEBUG << "Failed to match promising line: " << line << endl;
Chris@0 209 }
Chris@0 210
Chris@0 211 if (o) {
Chris@0 212 if (period != "") o->setPeriod(period);
Chris@0 213 m_objects.push_back(o);
Chris@0 214 ++count;
Chris@0 215 }
Chris@0 216
Chris@0 217 }
Chris@0 218
Chris@0 219 DEBUG << "Found " << count << " things" << endl;
Chris@0 220 }
Chris@0 221
Chris@0 222
Chris@0 223 }
Chris@0 224
Chris@0 225