Mercurial > hg > dml-open-backendtools
comparison collection_analysis/chord_sequence_mining/spmf.py @ 0:e34cf1b6fe09 tip
commit
author | Daniel Wolff |
---|---|
date | Sat, 20 Feb 2016 18:14:24 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e34cf1b6fe09 |
---|---|
1 # Part of DML (Digital Music Laboratory) | |
2 # Copyright 2014-2015 Daniel Wolff, City University | |
3 | |
4 # This program is free software; you can redistribute it and/or | |
5 # modify it under the terms of the GNU General Public License | |
6 # as published by the Free Software Foundation; either version 2 | |
7 # of the License, or (at your option) any later version. | |
8 # | |
9 # This program is distributed in the hope that it will be useful, | |
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 # GNU General Public License for more details. | |
13 # | |
14 # You should have received a copy of the GNU General Public | |
15 # License along with this library; if not, write to the Free Software | |
16 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
17 | |
18 #!/usr/bin/python | |
19 # -*- coding: utf-8 -*- | |
20 # | |
21 # This is a data conversion wrapper for the spmf toolkit. | |
22 # The toolkit has been released under GPL3 at www.philippe-fournier-viger.com/spmf | |
23 | |
24 __author__="Daniel Wolff" | |
25 | |
26 import chord2function as c2f | |
27 import csv | |
28 import re | |
29 | |
30 # takes a dictionary of chords for one or multiple files | |
31 # in the form of dict[clipid] = [ (time,key,mode,fun,typ,bfun) ] | |
32 # and converts it into spmf | |
def folder2spmf(folderin='D:/mirg/Chord_Analysis20141216/', fileout='D:/mirg/Chord_Analysis20141216/Beethoven.spmf'):
    """Write the chord sequences of all files in a folder as an spmf file.

    The chords are obtained from c2f.folder2functions(folderin), which is
    expected to return dict[clipid] = [(time, key, mode, fun, typ, bfun)].
    Each track becomes one line in `fileout`: every chord is encoded with
    c2f.fun2num and followed by the item separator ' -1 ', and the line is
    closed with '-2'.

    A companion csv dictionary is written to `fileout + '.dic'`. It has a
    header row and then one row per track (track, key, mode, sequence
    length), in the same order as the lines of the spmf file.

    @param folderin  folder handed to c2f.folder2functions
    @param fileout   path of the spmf file to create
    """
    import sys

    # get chords for all files
    output = c2f.folder2functions(folderin)

    # open log; the csv module wants a binary file on Python 2 and a text
    # file with newline translation switched off on Python 3
    logfile = fileout + '.dic'
    if sys.version_info[0] < 3:
        csvfile = open(logfile, 'wb')
    else:
        csvfile = open(logfile, 'w', newline='')

    # both files are closed even if encoding a chord fails half way
    with csvfile, open(fileout, 'w') as fspmf:
        w = csv.writer(csvfile)
        w.writerow(["track", "key", "mode", "sequence length"])

        for track, trackdata in output.items():
            # key and mode of the last chord end up in the dictionary row;
            # start from empty values so a track without chords neither
            # crashes nor inherits the values of the previous track
            key = mode = ''
            items = []
            for (time, key, mode, fun, typ, bfun) in trackdata:
                chord = c2f.fun2num(fun, typ, bfun, mode)
                # -1 is the separator of items or itemsets
                items.append(str(chord) + ' -1 ')

            # the sequence is closed with -2
            fspmf.write(''.join(items) + '-2\n')
            w.writerow([track, str(key), str(mode), str(len(trackdata))])
62 | |
63 # read an spmf file | |
64 # def parsespmf(filein = 'D:/mirg/Chord_Analysis20141216/Beethoven.txt'): | |
65 | |
66 # string sourcefile path to the source spmf file with chords from records | |
67 # string patternfile path to the pattern spmf file | |
68 # matches each of the patterns in patternfile | |
69 # to the chord sequences in sourcefile | |
def match(sourcefile='D:/mirg/Chord_Analysis20141216/Beethoven.spmf', sourcedict='D:/mirg/Chord_Analysis20141216/Beethoven.spmf.dic', patternfile='D:/mirg/Chord_Analysis20141216/Beethoven_70.txt', outpath='D:/mirg/Chord_Analysis20141216/'):
    """Match every frequent pattern against every chord sequence.

    For each track listed in `sourcedict` the next line of `sourcefile` is
    read as that track's chord sequence and tested against all patterns of
    `patternfile` with openPatternInSequence. The indices of the matching
    patterns are then written to one csv file per track by
    writeAllPatternsForClips.

    @param sourcefile   path to the source spmf file with chords from records
    @param sourcedict   path to the track dictionary created by folder2spmf
    @param patternfile  path to the pattern spmf file
    @param outpath      folder receiving the per-track result files
    """
    # ---
    # we here assume that there are more files than patterns,
    # as display of patterns is somehow limited
    # therefore parallelisation will be 1 pattern/multiple files
    # per instance
    # ---

    # read all patterns; a line looks like this:
    # 1120401 -1 1120101 -1 #SUP: 916
    # patterns are kept as lists of strings, so any item representation works
    patterns = []
    with open(patternfile, 'r') as f:
        for line in f:
            if not line.strip():
                # tolerate empty lines, e.g. at the end of the file
                continue
            pattern, _ = readPattern(line)
            patterns.append(pattern)

    # ---
    # now for the input sequences
    # ---
    # first: read track dictionary and get the input sequence names
    # NOTE: tracks are paired with the lines of sourcefile by position, so
    # this relies on getClipDict yielding the tracks in file order
    tracks = getClipDict(sourcedict)

    tracks_patterns = dict()

    # iterate over all tracks - to be parallelised
    with open(sourcefile, 'r') as source:
        for track in tracks:
            sequence = readSequence(next(source))
            print(track)
            for p, pattern in enumerate(patterns):
                # match open pattern
                if openPatternInSequence(sequence, pattern):
                    tracks_patterns.setdefault(track, []).append(p)

    # write clip index to files
    writeAllPatternsForClips(outpath, tracks_patterns)
137 | |
138 # writes results to disk per key | |
def writeAllPatternsForClips(path='D:/mirg/Chord_Analysis20141216/', tracks_patterns=None):
    """Write the matched pattern indices of every track to its own csv file.

    For each entry of `tracks_patterns` a file `<path>/<name>_patterns.csv`
    is created (or overwritten) holding a single csv row with the entry's
    contents.

    @param path             folder the files are written to
    @param tracks_patterns  dict mapping track name to a list of pattern
                            indices; None or an empty dict writes nothing
    """
    import sys

    if not tracks_patterns:
        return

    for name, contents in tracks_patterns.items():
        filename = path + '/' + name + '_patterns.csv'

        # the csv module wants a binary file on Python 2 and a text file
        # with newline translation switched off on Python 3
        if sys.version_info[0] < 3:
            csvfile = open(filename, 'wb')
        else:
            csvfile = open(filename, 'w', newline='')

        # the file is closed even if writing the row fails
        with csvfile:
            csv.writer(csvfile).writerow(contents)
151 | |
152 | |
153 # @param line: reads a line in the spmf output file with frequent patterns | |
154 # returns list of strings "pattern" and int "support" | |
def readPattern(line):
    """Parse one line of the spmf output file with frequent patterns.

    A line looks like this:
        1120401 -1 1120101 -1 #SUP: 916

    @param line  one line of the pattern file, with or without line ending
    @return tuple (pattern, support): pattern is the list of item strings in
            front of the '#SUP:' marker, support the integer following it
            (anything after that integer is ignored)
    @raise ValueError if the marker or the support count is missing or the
            count is not an integer
    """
    marker = '#SUP:'

    # locate support
    suploc = line.find(marker)
    if suploc < 0:
        raise ValueError('no %s marker in pattern line: %r' % (marker, line))

    # take the first token after the marker instead of slicing off the last
    # character: the final line of a file may lack the trailing newline
    fields = line[suploc + len(marker):].split()
    if not fields:
        raise ValueError('no support count in pattern line: %r' % (line,))
    support = int(fields[0])

    # extract pattern; every item is followed by ' -1 ', so the last element
    # of the split is the empty remainder and is dropped
    pattern = line[:suploc].split(' -1 ')[:-1]
    return (pattern, support)
163 | |
164 # @param line: a line of the spmf input file holding one chord sequence | |
165 # returns the sequence as a list of item strings | |
def readSequence(line):
    """Split one line of the spmf input file into its chord items.

    @param line  a sequence line: items each followed by ' -1 ', then '-2'
    @return list of item strings found in front of the '-2' end marker
    """
    # position of the end-of-sequence marker
    end = line.find('-2')

    # cut at the marker and break the rest up at the item separators
    items = line[:end].split(' -1 ')

    # what follows the last separator is not an item
    items.pop()
    return items
173 | |
174 # finds open pattern in sequences | |
175 # @param [string] sequence input sequence | |
176 # @param [string] pattern pattern to be found | |
def openPatternInSequence(sequence, pattern):
    """Find an open pattern in a sequence.

    The pattern matches when all of its items occur in the sequence in the
    given order; other items may lie in between.

    @param [string] sequence  input sequence
    @param [string] pattern   pattern to be found
    @return 1 if the pattern was found, 0 otherwise; an empty pattern is
            always found
    """
    if not pattern:
        return 1

    # index of the next pattern item that still has to be found
    patidx = 0
    for item in sequence:
        if item == pattern[patidx]:
            patidx += 1

            # did we complete the pattern? only once ALL items were seen
            if patidx == len(pattern):
                # could also return the start index
                return 1

    # finished the sequence before finishing pattern
    return 0
189 | |
190 # finds closed pattern in sequences | |
191 # @param [string] sequence input sequence | |
192 # @param [string] pattern pattern to be found | |
def closedPatternInSequence(sequence, pattern):
    """Find a closed pattern in a sequence.

    The pattern matches when its items occur in the sequence as one
    contiguous run. Items are compared by their string form.

    @param [string] sequence  input sequence
    @param [string] pattern   pattern to be found
    @return True if the pattern was found, False otherwise; an empty
            pattern is always found
    """
    seq = [str(item) for item in sequence]
    pat = [str(item) for item in pattern]
    width = len(pat)

    # compare whole items window by window; gluing the items into one string
    # would let ['12', '3'] match inside ['1', '23']
    return any(seq[start:start + width] == pat
               for start in range(len(seq) - width + 1))
196 | |
197 # reads all track names from the dictionary created by folder2spmf | |
198 # @param sourcedict path to dictionary | |
def getClipDict(sourcedict):
    """Read all tracks from the dictionary created by folder2spmf.

    @param sourcedict  path to the csv dictionary; its first row is the
                       legend, each further row is track, key, mode,
                       sequence length
    @return ordered dict mapping track -> (key, mode, seqlen), all strings,
            in the order of the rows in the file
    @raise ValueError if a non-empty row does not have exactly four columns
    """
    from collections import OrderedDict

    # keep the file order: the n-th track belongs to the n-th line of the
    # spmf file, and a plain dict does not preserve order on Python 2
    tracks = OrderedDict()

    with open(sourcedict, 'rt') as f:
        reader = csv.reader(f)

        # skip first row that contains legend (an empty file has none)
        next(reader, None)

        # get following rows
        for row in reader:
            if not row:
                # blank line, e.g. from newline translation on writing
                continue
            track, key, mode, seqlen = row
            tracks[track] = (key, mode, seqlen)

    return tracks
215 | |
216 | |
217 # run spmf afterwards with java -jar spmf.jar run CM-SPADE Beethoven.spmf output.txt 50% 3 | |
# script entry point: only the matching step runs, with the default paths
if __name__ == "__main__":
    # enable to (re)create the spmf file and its .dic dictionary first
    #folder2spmf()
    match()