To check out this repository, please run hg clone on the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

Statistics Download as Zip
| Branch: | Revision:

root / common / Matcher.cpp

History | View | Annotate | Download (10.4 KB)

1
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
2

    
3
#include "Matcher.h"
4
#include "Objects.h"
5
#include "EditDistance.h"
6

    
7
#include <QMultiMap>
8

    
9
#include <iostream>
10

    
11
using namespace Dataquay;
12

    
13
namespace ClassicalData {
14

    
15
/**
 * Construct a matcher over the given composers.
 *
 * @param cl  Composers that match() will score; the list is copied
 *            into m_composers.
 */
ComposerTypingQuickMatcher::ComposerTypingQuickMatcher(QList<Composer *> cl)
    : m_composers(cl)
{
}
19

    
20
/**
 * Score every composer against the given text with
 * Composer::matchTypingQuick() and return the accepted guesses.
 *
 * @param text        Text to score each composer against.
 * @param maxResults  Maximum number of guesses to return; a value of
 *                    zero or less means "no limit".
 * @param threshold   Minimum score; composers scoring below this are
 *                    discarded.
 * @return The accepted guesses, in GuessSet iteration order
 *         (presumably best match first -- confirm against the
 *         GuessSet declaration).
 */
GuessList
ComposerTypingQuickMatcher::match(QString text, int maxResults,
                                  float threshold) const
{
    GuessSet matches;

    foreach (Composer *c, m_composers) {
        float value = c->matchTypingQuick(text);
        if (value < threshold) continue;
        matches.insert(Guess(value, c));
    }

    GuessList results;
    int n = 0;

    for (GuessSet::const_iterator i = matches.begin();
         i != matches.end(); ++i) {
        // Test the cap before appending, so that no more than
        // maxResults guesses are ever returned.
        if (maxResults > 0 && n >= maxResults) break;
        results.push_back(*i);
        ++n;
    }

    return results;
}
42

    
43
/**
 * Construct a matcher over the given composers.
 *
 * @param cl  Composers that match() will score; the list is copied
 *            into m_composers.
 */
ComposerTypingThoroughMatcher::ComposerTypingThoroughMatcher(QList<Composer *> cl)
    : m_composers(cl)
{
}
47

    
48
/**
 * Score every composer against the given text with
 * Composer::matchTyping() and return the accepted guesses.
 *
 * @param text        Text to score each composer against.
 * @param maxResults  Maximum number of guesses to return; a value of
 *                    zero or less means "no limit".
 * @param threshold   Minimum score; composers scoring below this are
 *                    discarded.
 * @return The accepted guesses, in GuessSet iteration order
 *         (presumably best match first -- confirm against the
 *         GuessSet declaration).
 */
GuessList
ComposerTypingThoroughMatcher::match(QString text, int maxResults,
                                     float threshold) const
{
    GuessSet matches;

    foreach (Composer *c, m_composers) {
        float value = c->matchTyping(text);
        if (value < threshold) continue;
        matches.insert(Guess(value, c));
    }

    GuessList results;
    int n = 0;

    for (GuessSet::const_iterator i = matches.begin();
         i != matches.end(); ++i) {
        // Test the cap before appending, so that no more than
        // maxResults guesses are ever returned.
        if (maxResults > 0 && n >= maxResults) break;
        results.push_back(*i);
        ++n;
    }

    return results;
}
70

    
71
/**
 * Construct a matcher over the given composers.
 *
 * @param cl  Composers that match() will score; the list is copied
 *            into m_composers.
 */
ComposerFullTextMatcher::ComposerFullTextMatcher(QList<Composer *> cl)
    : m_composers(cl)
{
}
75

    
76
/**
 * Score every composer against the given text with
 * Composer::matchFuzzyName() and return the accepted guesses.
 *
 * @param text        Text to score each composer against.
 * @param maxResults  Maximum number of guesses to return; a value of
 *                    zero or less means "no limit".
 * @param threshold   Minimum score; composers scoring below this are
 *                    discarded.
 * @return The accepted guesses, in GuessSet iteration order
 *         (presumably best match first -- confirm against the
 *         GuessSet declaration).
 */
GuessList
ComposerFullTextMatcher::match(QString text, int maxResults,
                               float threshold) const
{
    GuessSet matches;

    foreach (Composer *c, m_composers) {
        float value = c->matchFuzzyName(text);
        if (value < threshold) continue;
//        std::cerr << "Liking: " << c->name().toStdString() << " (" << value << ")" << std::endl;
        matches.insert(Guess(value, c));
    }

    GuessList results;
    int n = 0;

    for (GuessSet::const_iterator i = matches.begin();
         i != matches.end(); ++i) {
        // Test the cap before appending, so that no more than
        // maxResults guesses are ever returned.
        if (maxResults > 0 && n >= maxResults) break;
        results.push_back(*i);
//        std::cerr << "Pushing: " << i->entity()->name().toStdString() << std::endl;
        ++n;
    }

    return results;
}
101

    
102
/**
 * Construct a matcher over the given works.
 *
 * @param wl  Works that match() will search; the list is copied into
 *            m_works.
 */
WorkCatalogueMatcher::WorkCatalogueMatcher(QList<Work *> wl)
    : m_works(wl)
{
}
106

    
107
/**
 * Match works by catalogue or opus number mentioned in the text, then
 * (if there is still room) by a "<title> no <N>" phrase in the text.
 *
 * Stage one extracts catalogue-number texts from the input with
 * Work::extractCatalogueNumberTexts() and compares each against every
 * work's catalogue string and its "Op <opus>[ no <number>]" text;
 * each hit scores 1.0.  If no catalogue-number text is found at all,
 * an empty list is returned immediately.  Stage two looks for a
 * phrase such as "Symphony no 8" in the input and gives 0.8 to works
 * whose name starts with that phrase.
 *
 * Progress is logged to std::cerr.
 *
 * @param text        Free text (e.g. a track title) to search.
 * @param maxResults  Maximum number of guesses to collect; a value of
 *                    zero or less means "no limit".
 * @param threshold   Not consulted by this matcher.
 * @return The collected guesses, in GuessSet iteration order.
 */
GuessList
WorkCatalogueMatcher::match(QString text, int maxResults,
                            float threshold) const
{
    GuessList results;
    GuessSet matches;

    // Any non-positive maxResults means "unlimited"; use one flag so
    // that every stage below agrees on that.
    const bool limited = (maxResults > 0);

    QStringList cats = Work::extractCatalogueNumberTexts(text);
    if (cats.empty()) return results;

    foreach (QString cat, cats) {

        std::cerr << "testing cat \"" << cat.toStdString() << "\" against "
                  << m_works.size() << " works" << std::endl;

        foreach (Work *w, m_works) {

            if (limited && static_cast<int>(matches.size()) >= maxResults) {
                break;
            }

            // compareCatalogueNumberTexts() yielding zero/false is
            // treated as "these refer to the same catalogue entry".
            QString catalogue = w->catalogue();
            if (catalogue != "" &&
                !Work::compareCatalogueNumberTexts(catalogue, cat)) {
                std::cerr << "We like: " << w->name().toStdString() << " ("
                          << catalogue.toStdString() << ")" << std::endl;
                // all catalogue matches score equal here
                matches.insert(Guess(1.f, w));
                continue;
            }

            QString opus = w->opus();
            if (opus == "") continue;

            // Build "Op N no M" when the work has a number within the
            // opus, otherwise just "Op N".
            QString number = w->number();
            QString optext;
            if (number != "") {
                optext = QString("Op %1 no %2").arg(opus).arg(number);
            } else {
                optext = QString("Op %1").arg(opus);
            }

            if (!Work::compareCatalogueNumberTexts(optext, cat)) {
                std::cerr << "We like: " << w->name().toStdString() << " ("
                          << optext.toStdString() << ")" << std::endl;
                matches.insert(Guess(1.f, w));
            }
        }
    }

    if (!limited || static_cast<int>(matches.size()) < maxResults) {

        // Now, for slightly lower marks, test for strings like
        // "Symphony no 8" at the start of the title, or after a
        // colon, slash or dash (e.g. "Brahms: Symphony no 4")

        QRegExp numberRe1("^([^\\d]+)\\s+[Nn][a-z]*\\.?\\s*(\\d+)");
        QRegExp numberRe2("[/:-]\\s*(\\w[^\\d]+)\\s+[Nn][a-z]*\\.?\\s*(\\d+)");
        QString tag, number;

        if (numberRe1.indexIn(text) >= 0) {
            tag = numberRe1.cap(1);
            number = numberRe1.cap(2);
        } else if (numberRe2.indexIn(text) >= 0) {
            tag = numberRe2.cap(1);
            number = numberRe2.cap(2);
        }

        if (tag != "") {

            std::cerr << "tag = \"" << tag.toStdString() << "\", number = \""
                      << number.toStdString() << "\"" << std::endl;

            // Strip everything but word characters and whitespace
            // before the tag is spliced into a regular expression.
            tag.replace(QRegExp("[^\\w\\s]+"), "");
            QString matcherReStr =
                QString("^%1\\s+[Nn][a-z]*\\.?\\s*%2\\b").arg(tag).arg(number);
            QRegExp matcherRe(matcherReStr, Qt::CaseInsensitive);
            std::cerr << "matcherRe: " << matcherReStr.toStdString() << std::endl;

            // initials only
/* nah, doesn't work well
            QString weakTag;
            QRegExp initialRe("\\b(\\w)\\w*\\b");
            int ix = 0;
            while ((ix = initialRe.indexIn(tag, ix)) >= 0) {
                if (ix > 0) weakTag += "\\s+";
                weakTag += initialRe.cap(1) + "\\w*";
                ++ix;
            }

            QString weakMatcherReStr =
                QString("^%1\\s+[Nn][a-z]*\\.?\\s*%2\\b").arg(weakTag).arg(number);
            QRegExp weakMatcherRe(weakMatcherReStr, Qt::CaseInsensitive);
            std::cerr << "weakMatcherRe: " << weakMatcherReStr.toStdString() << std::endl;
*/
            foreach (Work *w, m_works) {
                if (limited && static_cast<int>(matches.size()) >= maxResults) {
                    break;
                }
                QString name = w->name();
                if (matcherRe.indexIn(name) >= 0) {
                    std::cerr << "We quite like: " << name.toStdString() << std::endl;
                    matches.insert(Guess(0.8f, w));
                }
/* else if (weakMatcherRe.indexIn(name) >= 0) {
                    std::cerr << "We sorta like: " << name.toStdString() << std::endl;
                    matches.insert(Guess(0.2f, w));
                }
*/
            }
        }
    }

    int n = 0;
    for (GuessSet::const_iterator i = matches.begin();
         i != matches.end(); ++i) {
        // Test the cap before appending, so that no more than
        // maxResults guesses are ever returned.
        if (limited && n >= maxResults) break;
        results.push_back(*i);
        ++n;
    }

    return results;
}
229

    
230
/**
 * Construct a matcher over the given works.
 *
 * @param wl  Works that match() will search; the list is copied into
 *            m_works.
 */
WorkTitleMatcher::WorkTitleMatcher(QList<Work *> wl)
    : m_works(wl)
{
}
234

    
235
/**
 * Match works by title, comparing the text against each work's name
 * and aliases.
 *
 * Two kinds of evidence are scored for every name of every work:
 *  - if the text contains a quoted phrase of at least four
 *    characters and the name also contains a quoted phrase, the two
 *    phrases are compared by edit distance; a close enough pair adds
 *    0.7 minus 0.1 per edit;
 *  - each reduced word of the text (longer than three characters, or
 *    starting with a digit) adds 0.1 if it occurs among the name's
 *    reduced words, and otherwise counts roughly 0.1 against the
 *    name, with the total penalty capped at 0.25.
 * The best (pro - con) over all of a work's names becomes that
 * work's score; works scoring above zero are kept.
 *
 * Progress is logged to std::cerr.
 *
 * @param text        Free text (e.g. a track title) to match.
 * @param maxResults  Maximum number of guesses to collect; a value of
 *                    zero or less means "no limit".  Scanning stops
 *                    as soon as this many works have been accepted.
 * @param threshold   Not consulted by this matcher.
 * @return The collected guesses, in GuessSet iteration order.
 */
GuessList
WorkTitleMatcher::match(QString text, int maxResults,
                        float threshold) const
{
    GuessList results;
    GuessSet matches;

    // Throw away any initial numbers (likely to be track index)
    text = text.replace(QRegExp("^[0-9]+"), "");

    // Capture group 2 of quoteRe is the text between the quote marks;
    // group 1 is only the start-of-string / whitespace before them.
    QString quoted;
    QRegExp quoteRe("(^|\\s)[\"']([^\"]+)[\"']($|[^\\w])");
    int qthresh = 0;

    if (quoteRe.indexIn(text) >= 0) {
        quoted = quoteRe.cap(2);
        // Very short quoted phrases are ignored altogether.
        if (quoted.length() < 4) quoted = "";
        // Edit-distance bound: one edit per four characters.
        qthresh = quoted.length() / 4;
    }

    std::cerr << "text = " << text.toStdString() << ", quoted = "
              << quoted.toStdString() << std::endl;

    QStringList components =
        text.split(QRegExp("[^\\w]+"), QString::SkipEmptyParts);
    QStringList reduced;
    foreach (QString c, components) {
        QString r = Composer::reduceName(c.toLower());
        if (r != "") {
            // Keep only longer words and anything numeric.
            if (r.length() > 3 || r[0].isDigit()) {
                reduced.push_back(r);
            }
        }
    }

    std::cerr << "reduced = " << reduced.join(" ").toStdString() << std::endl;

    EditDistance ed;

    foreach (Work *w, m_works) {
        if (maxResults > 0 && matches.size() >= maxResults) {
            break;
        }

        float highScore = 0.f;

        QSet<QString> names = w->aliases();
        names.insert(w->name());

        foreach (QString name, names) {

            float pro = 0.f;
            float con = 0.f;

            if (quoted != "") {
                if (quoteRe.indexIn(name) >= 0) {
                    // Compare against the quoted phrase itself
                    // (group 2), not the text preceding the quote.
                    QString q = quoteRe.cap(2);
                    int dist = ed.calculate(quoted, q, qthresh);
                    if (dist < qthresh) {
                        std::cerr << "quoted name match: " << q.toStdString() << std::endl;
                        pro += 0.7f - 0.1f * dist;
                        if (pro - con > highScore) {
                            // A quoted match that beats the best so
                            // far is taken as-is, without the
                            // per-word scoring below.
                            highScore = pro - con;
                            continue;
                        }
                    }
                }
            }

            QStringList wcomp =
                name.split(QRegExp("[^\\w]+"), QString::SkipEmptyParts);
            QSet<QString> wr;
            foreach (QString wc, wcomp) {
                wr.insert(Composer::reduceName(wc.toLower()));
            }
            foreach (QString rc, reduced) {
                if (wr.contains(rc)) {
                    std::cerr << "component match: " << rc.toStdString() << std::endl;
                    pro += 0.1;
                } else {
                    // Penalty is marginally larger than the reward
                    // for a hit, and is capped.
                    con += 0.101;
                    if (con > 0.25) con = 0.25;
                }
            }

            if (pro - con > highScore) highScore = pro - con;
        }

        if (highScore > 0.f) {
            std::cerr << "for " << w->name().toStdString() << " highScore = " << highScore << std::endl;
            matches.insert(Guess(highScore, w));
        }
    }

    int n = 0;
    for (GuessSet::const_iterator i = matches.begin();
         i != matches.end(); ++i) {
        // Test the cap before appending, so that no more than
        // maxResults guesses are ever returned.
        if (maxResults > 0 && n >= maxResults) break;
        results.push_back(*i);
        ++n;
    }

    return results;
}
338

    
339
}
340