To check out this repository please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

Statistics Download as Zip
| Branch: | Revision:

root / import / ImportClassicalDotNet.cpp

History | View | Annotate | Download (3.77 KB)

1
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
2

    
3
#include "ImportClassicalDotNet.h"
4

    
5
#include <dataquay/Debug.h>
6

    
7
#include <QFile>
8
#include <QFileInfo>
9
#include <QTextStream>
10
#include <QRegExp>
11
#include <QVariant>
12

    
13
#include <exception>
14

    
15
using namespace Dataquay;
16

    
17
namespace ClassicalData {
18

    
19
void
20
ClassicalDotNetImporter::setSource(QUrl source)
21
{
22
    DEBUG << "ClassicalDotNetImporter::setSource: " << source << endl;
23
    import(source);
24
}
25

    
26
void
27
parseNames(QString field, QStringList &names)
28
{
29
    field.replace("&#196;", QChar(0x00C4)); // LATIN CAPITAL LETTER A WITH DIAERESIS
30
    field.replace("&#322;", QChar(0x0142)); // LATIN SMALL LETTER L WITH STROKE
31
    field.replace("&#344;", QChar(0x0158)); // LATIN CAPITAL LETTER R WITH CARON
32

    
33
    field.replace("&aacute;", QChar(0x00E1));
34
    field.replace("&Aacute;", QChar(0x00C1));
35
    field.replace("&ccedil;", QChar(0x00E7));
36
    field.replace("&eacute;", QChar(0x00E9));
37
    field.replace("&Eacute;", QChar(0x00C9));
38
    field.replace("&Egrave;", QChar(0x00C8));
39
    field.replace("&Euml;", QChar(0x00CB));
40
    field.replace("&iacute;", QChar(0x00ED));
41
    field.replace("&Iuml;", QChar(0x00CF));
42
    field.replace("&Ntilde;", QChar(0x00D1));
43
    field.replace("&Oacute;", QChar(0x00D3));
44
    field.replace("&Ocirc;", QChar(0x00D4));
45
    field.replace("&ograve;", QChar(0x00F2));
46
    field.replace("&ouml;", QChar(0x00F6));
47
    field.replace("&Yuml;", QChar(0x0178));
48

    
49
    if (field.contains(QRegExp("&[^ ]+;"))) {
50
        DEBUG << "Failed to handle entity in " << field << endl;
51
    }
52

    
53
    // all-caps -> titlecase
54
    QRegExp re("[A-Z][^ ,]*[A-Z][^,]+");
55
    int mp = re.indexIn(field);
56
    if (mp >= 0) {
57
        int ml = re.matchedLength();
58
        bool initial = true;
59
        for (int i = 0; i < ml; ++i) {
60
            if (initial) {
61
                initial = false;
62
                continue;
63
            }
64
            if (field[mp + i].isUpper()) {
65
                field[mp + i] = field[mp + i].toLower();
66
            } else if (field[mp + i].isSpace()) {
67
                initial = true;
68
            }
69
        }
70
    }
71

    
72
    field = field.trimmed();
73
    names.push_back(field);
74

    
75
    // comma
76
    re = QRegExp("^([^,]+), ([^,]+)$");
77
    if ((mp = re.indexIn(field)) >= 0) {
78
        QString c(re.cap(1));
79
        QString d(re.cap(2));
80
        names.push_back(d + " " + c);
81
        return;
82
    }
83
}
84

    
85
void
86
ClassicalDotNetImporter::import(QUrl source)
87
{
88
    //!!! for now
89
    QString filename = source.toLocalFile();
90

    
91
    QFile file(filename);
92
    if (!file.open(QFile::ReadOnly | QFile::Text)) {
93
        throw std::exception();
94
    }
95

    
96
    QTextStream stream(&file);
97
    stream.setCodec("UTF-8");
98
    QString all = stream.readAll();
99
    
100
    all.replace(QRegExp("^.*<div id=\"center\">"), "");
101
    
102
    QRegExp matcher
103
        ("<li><a href=\"([^\"]+)\">([^<]+)</a></li>");
104
    
105
    int pos = 0, count = 0;
106
    while ((pos = matcher.indexIn(all, pos)) != -1) {
107
        pos += matcher.matchedLength();
108
        ++count;
109

    
110
        DEBUG << "Item " << count
111
              << ": page = " << matcher.cap(1)
112
              << ", name = " << matcher.cap(2);
113

    
114
        QString namefield = matcher.cap(2);
115
        QStringList names;
116

    
117
        parseNames(namefield, names);
118
        if (names.empty()) {
119
            DEBUG << "No name!" << endl;
120
            continue;
121
        }
122

    
123
        if (names[0].contains(" Collections")) {
124
            continue;
125
        }
126

    
127
        Composer *composer = new Composer();
128
        composer->setName(names[0]);
129
        for (int i = 1; i < names.size(); ++i) {
130
            composer->addAlias(names[i]);
131
        }
132
        
133
        if (matcher.cap(1) != "") {
134
            QString url = matcher.cap(1);
135
            url.replace(QRegExp("^\\.\\./"), "/music/");
136
            Document *d = new Document;
137
            d->setUri(Uri("http://www.classical.net" + url));
138
            d->setTopic(composer);
139
            d->setSiteName("Classical Net");
140
            composer->addPage(d);
141
        }
142
        
143
        m_objects.push_back(composer);
144
    }
145

    
146
    
147
    DEBUG << "Found " << count << " things" << endl;
148
}
149

    
150

    
151
}
152

    
153