Chris@0: /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ Chris@0: Chris@0: #include "ImportClassicalDotNet.h" Chris@0: Chris@0: #include Chris@0: Chris@0: #include Chris@0: #include Chris@0: #include Chris@0: #include Chris@0: #include Chris@0: Chris@0: #include Chris@0: Chris@0: using namespace Dataquay; Chris@0: Chris@0: namespace ClassicalData { Chris@0: Chris@0: void Chris@0: ClassicalDotNetImporter::setSource(QUrl source) Chris@0: { Chris@0: DEBUG << "ClassicalDotNetImporter::setSource: " << source << endl; Chris@0: import(source); Chris@0: } Chris@0: Chris@0: void Chris@0: parseNames(QString field, QStringList &names) Chris@0: { Chris@0: field.replace("Ä", QChar(0x00C4)); // LATIN CAPITAL LETTER A WITH DIAERESIS Chris@0: field.replace("ł", QChar(0x0142)); // LATIN SMALL LETTER L WITH STROKE Chris@0: field.replace("Ř", QChar(0x0158)); // LATIN CAPITAL LETTER R WITH CARON Chris@0: Chris@0: field.replace("á", QChar(0x00E1)); Chris@0: field.replace("Á", QChar(0x00C1)); Chris@0: field.replace("ç", QChar(0x00E7)); Chris@0: field.replace("é", QChar(0x00E9)); Chris@0: field.replace("É", QChar(0x00C9)); Chris@0: field.replace("È", QChar(0x00C8)); Chris@0: field.replace("Ë", QChar(0x00CB)); Chris@0: field.replace("í", QChar(0x00ED)); Chris@0: field.replace("Ï", QChar(0x00CF)); Chris@0: field.replace("Ñ", QChar(0x00D1)); Chris@0: field.replace("Ó", QChar(0x00D3)); Chris@0: field.replace("Ô", QChar(0x00D4)); Chris@0: field.replace("ò", QChar(0x00F2)); Chris@0: field.replace("ö", QChar(0x00F6)); Chris@0: field.replace("Ÿ", QChar(0x0178)); Chris@0: Chris@0: if (field.contains(QRegExp("&[^ ]+;"))) { Chris@0: DEBUG << "Failed to handle entity in " << field << endl; Chris@0: } Chris@0: Chris@0: // all-caps -> titlecase Chris@0: QRegExp re("[A-Z][^ ,]*[A-Z][^,]+"); Chris@0: int mp = re.indexIn(field); Chris@0: if (mp >= 0) { Chris@0: int ml = re.matchedLength(); Chris@0: bool initial = true; Chris@0: for (int i = 0; i < ml; ++i) { Chris@0: if (initial) { Chris@0: initial = false; Chris@0: continue; Chris@0: } Chris@0: if (field[mp + i].isUpper()) { Chris@0: field[mp + i] = field[mp + i].toLower(); Chris@0: } else if (field[mp + i].isSpace()) { Chris@0: initial = true; Chris@0: } Chris@0: } Chris@0: } Chris@0: Chris@0: field = field.trimmed(); Chris@0: names.push_back(field); Chris@0: Chris@0: // comma Chris@0: re = QRegExp("^([^,]+), ([^,]+)$"); Chris@0: if ((mp = re.indexIn(field)) >= 0) { Chris@0: QString c(re.cap(1)); Chris@0: QString d(re.cap(2)); Chris@0: names.push_back(d + " " + c); Chris@0: return; Chris@0: } Chris@0: } Chris@0: Chris@0: void Chris@0: ClassicalDotNetImporter::import(QUrl source) Chris@0: { Chris@0: //!!! for now Chris@0: QString filename = source.toLocalFile(); Chris@0: Chris@0: QFile file(filename); Chris@0: if (!file.open(QFile::ReadOnly | QFile::Text)) { Chris@0: throw std::exception(); Chris@0: } Chris@0: Chris@0: QTextStream stream(&file); Chris@0: stream.setCodec("UTF-8"); Chris@0: QString all = stream.readAll(); Chris@0: Chris@0: all.replace(QRegExp("^.*
"), ""); Chris@0: Chris@0: QRegExp matcher Chris@0: ("
  • ([^<]+)
  • "); Chris@0: Chris@0: int pos = 0, count = 0; Chris@0: while ((pos = matcher.indexIn(all, pos)) != -1) { Chris@0: pos += matcher.matchedLength(); Chris@0: ++count; Chris@0: Chris@0: DEBUG << "Item " << count Chris@0: << ": page = " << matcher.cap(1) Chris@0: << ", name = " << matcher.cap(2); Chris@0: Chris@0: QString namefield = matcher.cap(2); Chris@0: QStringList names; Chris@0: Chris@0: parseNames(namefield, names); Chris@0: if (names.empty()) { Chris@0: DEBUG << "No name!" << endl; Chris@0: continue; Chris@0: } Chris@0: Chris@0: if (names[0].contains(" Collections")) { Chris@0: continue; Chris@0: } Chris@0: Chris@0: Composer *composer = new Composer(); Chris@0: composer->setName(names[0]); Chris@0: for (int i = 1; i < names.size(); ++i) { Chris@0: composer->addAlias(names[i]); Chris@0: } Chris@0: Chris@0: if (matcher.cap(1) != "") { Chris@0: QString url = matcher.cap(1); Chris@0: url.replace(QRegExp("^\\.\\./"), "/music/"); Chris@0: Document *d = new Document; Chris@18: d->setUri(Uri("http://www.classical.net" + url)); Chris@0: d->setTopic(composer); Chris@0: d->setSiteName("Classical Net"); Chris@0: composer->addPage(d); Chris@0: } Chris@0: Chris@0: m_objects.push_back(composer); Chris@0: } Chris@0: Chris@0: Chris@0: DEBUG << "Found " << count << " things" << endl; Chris@0: } Chris@0: Chris@0: Chris@0: } Chris@0: Chris@0: