annotate import/ImportWikipediaWorksK.cpp @ 53:bcea875d8d2f tip

More build fixes
author Chris Cannam
date Thu, 16 Oct 2014 19:03:51 +0100
parents c8ef23d3888c
children
rev   line source
Chris@0 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@0 2
Chris@0 3 #include "ImportWikipediaWorksK.h"
Chris@0 4
Chris@0 5 #include <dataquay/Debug.h>
Chris@0 6
Chris@0 7 #include <QFile>
Chris@0 8 #include <QFileInfo>
Chris@0 9 #include <QTextStream>
Chris@0 10 #include <QRegExp>
Chris@0 11 #include <QVariant>
Chris@0 12
Chris@0 13 #include <exception>
Chris@0 14
Chris@0 15 using namespace Dataquay;
Chris@0 16
Chris@0 17 namespace ClassicalData {
Chris@0 18
Chris@0 19 void
Chris@0 20 WikipediaWorksKImporter::setSource(QUrl source)
Chris@0 21 {
Chris@0 22 DEBUG << "WikipediaWorksKImporter::setSource: " << source << endl;
Chris@0 23 import(source);
Chris@0 24 }
Chris@0 25
Chris@0 26 static QString
Chris@0 27 sanitise(QString field, QString &linkText)
Chris@0 28 {
Chris@0 29 int mp;
Chris@0 30
Chris@0 31 field.replace(QString::fromUtf8("\342\200\222"), "-");
Chris@0 32 field.replace(QString::fromUtf8("\342\200\223"), "-");
Chris@0 33 field.replace(QString::fromUtf8("\342\200\224"), "-");
Chris@0 34 field.replace(QString::fromUtf8("\342\200\225"), "-");
Chris@0 35
Chris@0 36 QRegExp link2("^([^A-Za-z]*)\\[\\[([^\\]\\|]+)\\|([^\\]]+)\\]\\]");
Chris@0 37 if ((mp = link2.indexIn(field)) >= 0) {
Chris@0 38 if (linkText == "") linkText = link2.cap(2);
Chris@0 39 field.replace(mp, link2.matchedLength(), link2.cap(1) + link2.cap(3));
Chris@0 40 return sanitise(field, linkText);
Chris@0 41 }
Chris@0 42
Chris@0 43 QRegExp link1("^([^A-Za-z]*)\\[\\[([^\\]]+)\\]\\]");
Chris@0 44 if ((mp = link1.indexIn(field)) >= 0) {
Chris@0 45 if (linkText == "") linkText = link1.cap(2);
Chris@0 46 field.replace(mp, link1.matchedLength(), link1.cap(1) + link1.cap(2));
Chris@0 47 return sanitise(field, linkText);
Chris@0 48 }
Chris@0 49
Chris@0 50 field = field.trimmed();
Chris@0 51
Chris@0 52 field.replace("[", "");
Chris@0 53 field.replace("]", "");
Chris@0 54 field.replace(QRegExp("\\{+[^\\}]*\\}+ *"), "");
Chris@0 55 field.replace("''", "\"");
Chris@0 56 field.replace("&quot;", "\"");
Chris@0 57 field.replace(QRegExp("&lt;[^&]*&gt;"), "");
Chris@0 58 field.replace(QRegExp("^\\**"), "");
Chris@0 59
Chris@0 60 while (field.endsWith(".") || field.endsWith(",")) {
Chris@0 61 field = field.left(field.length()-1);
Chris@0 62 }
Chris@0 63
Chris@0 64 if (field.startsWith("(") && field.endsWith(")")) {
Chris@0 65 DEBUG << "before: " << field;
Chris@0 66 field = field.mid(1, field.length()-2);
Chris@0 67 DEBUG << "after: " << field;
Chris@0 68 }
Chris@0 69 field.replace(QRegExp("^\\**"), "");
Chris@0 70 if (field == ")" || field == "(") {
Chris@0 71 field = "";
Chris@0 72 }
Chris@0 73
Chris@0 74 field.replace(" - ,", ",");
Chris@0 75
Chris@0 76 return field;
Chris@0 77 }
Chris@0 78
Chris@0 79 static QString
Chris@0 80 extractYear(QString datefield)
Chris@0 81 {
Chris@0 82 QRegExp re("[0-9]{4}");
Chris@0 83 if (re.indexIn(datefield) >= 0) {
Chris@0 84 return re.cap(0);
Chris@0 85 }
Chris@0 86 return "";
Chris@0 87 }
Chris@0 88
Chris@0 89 static QString
Chris@0 90 extractKey(QString titlefield)
Chris@0 91 {
Chris@0 92 QRegExp re("in ([A-H]([ -][a-z]+)? (major|minor))");
Chris@0 93 if (re.indexIn(titlefield) >= 0) {
Chris@0 94 return re.cap(1);
Chris@0 95 }
Chris@0 96 return "";
Chris@0 97 }
Chris@0 98
Chris@0 99 static Work *
Chris@0 100 makeWork(QString composerName, QString opfield, QString kfield,
Chris@0 101 QString numfield, QString titlefield, QString datefield,
Chris@0 102 QString placefield, QString remarksfield, Work *main)
Chris@0 103 {
Chris@0 104 QString linkText;
Chris@0 105
Chris@0 106 Work *w = new Work;
Chris@0 107
Chris@0 108 QString op = sanitise(opfield, linkText);
Chris@0 109 if (op != "") {
Chris@0 110 op.replace("Opus ", "");
Chris@0 111 op.replace("Op. ", "");
Chris@0 112 op.replace("Op ", "");
Chris@0 113 w->setOpus(op);
Chris@0 114 }
Chris@0 115
Chris@0 116 QString k = sanitise(kfield, linkText);
Chris@0 117 if (k != "") {
Chris@0 118 k.replace("K. ", "K ");
Chris@0 119 w->setCatalogue(k);
Chris@0 120 }
Chris@0 121
Chris@0 122 QString num = sanitise(numfield, linkText);
Chris@0 123 if (num != "") {
Chris@0 124 num.replace("No. ", "");
Chris@0 125 num.replace("No ", "");
Chris@0 126 w->setNumber(num);
Chris@0 127 }
Chris@0 128
Chris@0 129 QString key = extractKey(titlefield);
Chris@0 130 if (key != "") {
Chris@0 131 w->setKey(key);
Chris@0 132 }
Chris@0 133
Chris@0 134 QString title = sanitise(titlefield, linkText);
Chris@0 135 if (linkText != "") {
Chris@0 136 linkText.replace(" ", "_");
Chris@0 137 QUrl url;
Chris@0 138 url.setScheme("http");
Chris@0 139 url.setHost("en.wikipedia.org");
Chris@0 140 url.setPath("/wiki/" + QUrl::toPercentEncoding(linkText));
Chris@0 141 Document *d = new Document;
Chris@18 142 d->setUri(Uri(url));
Chris@0 143 d->setSiteName("Wikipedia");
Chris@0 144 d->setTopic(w);
Chris@0 145 w->addPage(d);
Chris@0 146 }
Chris@0 147
Chris@0 148 QRegExp explicationRE("^(\"[^-]+\") - (.*)$");
Chris@0 149 int pos;
Chris@0 150 if ((pos = explicationRE.indexIn(title)) >= 0) {
Chris@0 151 w->addAlias(explicationRE.cap(2));
Chris@0 152 title = explicationRE.cap(1);
Chris@0 153 }
Chris@0 154
Chris@0 155 if (remarksfield == "") {
Chris@0 156 QRegExp remarksRE("^(\"[^-]+\") (for .*)$");
Chris@0 157 if ((pos = remarksRE.indexIn(title)) >= 0) {
Chris@0 158 remarksfield = remarksRE.cap(2);
Chris@0 159 title = remarksRE.cap(1);
Chris@0 160 }
Chris@0 161 }
Chris@0 162
Chris@0 163 if (remarksfield == "") {
Chris@0 164 QRegExp remarksRE("^(\"[^-]+\"), (.*)$");
Chris@0 165 if ((pos = remarksRE.indexIn(title)) >= 0) {
Chris@0 166 remarksfield = remarksRE.cap(2);
Chris@0 167 title = remarksRE.cap(1);
Chris@0 168 }
Chris@0 169 }
Chris@0 170
Chris@0 171 w->setName(title);
Chris@0 172
Chris@0 173 QString remarks = sanitise(remarksfield, linkText);
Chris@0 174 if (remarks != "") {
Chris@0 175 w->setRemarks(remarks);
Chris@0 176 }
Chris@0 177
Chris@0 178 QString year = extractYear(datefield);
Chris@0 179 QString place = sanitise(placefield, linkText);
Chris@0 180
Chris@0 181 DEBUG << "title = " << title << endl;
Chris@0 182
Chris@0 183 if (main) {
Chris@0 184 main->addPart(w);
Chris@0 185 w->setPartOf(main);
Chris@0 186 w->setComposition(main->composition());
Chris@0 187 main->composition()->addWork(w);
Chris@0 188 }
Chris@0 189
Chris@0 190 if (!main || !main->composition() ||
Chris@0 191 (year != "" && (main->composition()->year() != year.toInt()))) {
Chris@0 192 Composition *c = new Composition;
Chris@0 193 c->setComposerName(composerName);
Chris@0 194 c->addWork(w);
Chris@0 195 c->setYear(year.toInt());
Chris@0 196 c->setPlace(place);
Chris@0 197 w->setComposition(c);
Chris@0 198 }
Chris@0 199
Chris@0 200 return w;
Chris@0 201 }
Chris@0 202
Chris@0 203
Chris@0 204 void
Chris@0 205 WikipediaWorksKImporter::import(QUrl source)
Chris@0 206 {
Chris@0 207 //!!! for now
Chris@0 208 QString filename = source.toLocalFile();
Chris@0 209
Chris@0 210 QFile file(filename);
Chris@0 211 if (!file.open(QFile::ReadOnly | QFile::Text)) {
Chris@0 212 throw std::exception();
Chris@0 213 }
Chris@0 214
Chris@0 215 QTextStream stream(&file);
Chris@0 216 stream.setCodec("UTF-8");
Chris@0 217
Chris@0 218 QString composerName;
Chris@0 219 if (filename.contains("K%C3%B6chel")) {
Chris@0 220 composerName = "Wolfgang Amadeus Mozart";
Chris@0 221 } else {
Chris@0 222 QRegExp byby("by_(.*)_by");
Chris@0 223 if (byby.indexIn(filename) >= 0) {
Chris@0 224 composerName = byby.cap(1).replace('_', ' ');
Chris@0 225 } else {
Chris@0 226 QRegExp by("by_(.*)");
Chris@0 227 if (by.indexIn(filename) >= 0) {
Chris@0 228 composerName = by.cap(1).replace('_', ' ');
Chris@0 229 }
Chris@0 230 }
Chris@0 231 }
Chris@0 232 composerName = QUrl::fromPercentEncoding(composerName.toLocal8Bit());
Chris@0 233
Chris@0 234 DEBUG << "composerName = " << composerName << endl;
Chris@0 235
Chris@0 236 // K numbers in tabular form (as found in "Köchel Catalogue" WP page)
Chris@0 237 QRegExp matcherK("\\|- *\n\\|[^\n]*\n\\|\\{\\{[^\\[]*\\[\\[(K\\.? *[0-9][0-9a-z]*)[^\n]*\n\\|([^\n]*)\n\\|([^\n]*)\n\\|([^\n]*)\n");
Chris@0 238
Chris@0 239 QString all = stream.readAll();
Chris@0 240
Chris@0 241 DEBUG << "Read " << all.length() << " chars" << endl;
Chris@0 242
Chris@0 243 all.replace(QRegExp("^.*<page>"), "");
Chris@0 244
Chris@0 245 int pos = 0, count = 0;
Chris@0 246
Chris@0 247 while ((pos = matcherK.indexIn(all, pos)) != -1) {
Chris@0 248
Chris@0 249 all.replace(pos, matcherK.matchedLength(), "");
Chris@0 250 ++count;
Chris@0 251
Chris@0 252 QString kfield = matcherK.cap(1);
Chris@0 253 QString titlefield = matcherK.cap(2);
Chris@0 254 QString datefield = matcherK.cap(3);
Chris@0 255 QString placefield = matcherK.cap(4);
Chris@0 256
Chris@0 257 m_objects.push_back
Chris@0 258 (makeWork(composerName, "", kfield, "",
Chris@0 259 titlefield, datefield, placefield, "", 0));
Chris@0 260 }
Chris@0 261
Chris@0 262 DEBUG << "Left over: " << all << endl;
Chris@0 263
Chris@0 264 DEBUG << "Found " << count << " things" << endl;
Chris@0 265 }
Chris@0 266
Chris@0 267
Chris@0 268 }
Chris@0 269
Chris@0 270