Mercurial > hg > dml-open-cliopatria
comparison dml-cla/python/chord_seq_spmf_helper.py @ 0:718306e29690 tip
commiting public release
author | Daniel Wolff |
---|---|
date | Tue, 09 Feb 2016 21:05:06 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:718306e29690 |
---|---|
1 #!/usr/bin/python | |
2 # Part of DML (Digital Music Laboratory) | |
3 # Copyright 2014-2015 Daniel Wolff, City University | |
4 | |
5 # This program is free software; you can redistribute it and/or | |
6 # modify it under the terms of the GNU General Public License | |
7 # as published by the Free Software Foundation; either version 2 | |
8 # of the License, or (at your option) any later version. | |
9 # | |
10 # This program is distributed in the hope that it will be useful, | |
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 # GNU General Public License for more details. | |
14 # | |
15 # You should have received a copy of the GNU General Public | |
16 # License along with this library; if not, write to the Free Software | |
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
18 | |
19 # -*- coding: utf-8 -*- | |
20 # | |
21 # This is a data conversion wrapper for the spmf toolkit | |
22 __author__="Daniel Wolff" | |
23 | |
24 import chord_seq_key_relative as c2f | |
25 import csv | |
26 import re | |
27 import tempfile | |
28 import subprocess | |
29 import os | |
30 import platform | |
31 from aggregate import * | |
32 from csvutils import * | |
33 | |
34 # command for threading | |
35 import subprocess, threading | |
36 import signal | |
37 | |
# Upper bound on the number of lines spmf2table reads from a pattern file.
max_lines = 10 ** 7
40 | |
41 | |
class Command(object):
    """Run an external command and capture its combined stdout/stderr.

    The command's output is collected by a worker thread so that the
    caller can give up after a timeout and terminate the process.
    """

    def __init__(self, cmd):
        # cmd: argument list handed to subprocess.Popen (shell=False)
        self.cmd = cmd
        self.process = None
        # Fallback message; replaced by the captured output or by a
        # timeout / start-up failure message.
        self.text = 'SPMF terminated unexpectedly'

    def run(self, timeout):
        """Execute the command and wait for it to finish.

        :param timeout: seconds to wait before terminating the process;
                        a value <= 0 waits indefinitely
        :returns: tuple ``(returncode, text)``. ``text`` is the captured
                  output, or a message if the process was terminated
                  after the timeout or could not be started (in which
                  case ``returncode`` is ``None``).
        """
        on_windows = 'Win' in platform.system()

        popen_args = dict(stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          shell=False)
        if not on_windows:
            # Start the child in its own session so that os.killpg below
            # signals the whole process group.
            popen_args['preexec_fn'] = os.setsid

        # The process is created in the calling thread so that it always
        # exists when the timeout handling below needs to signal it.
        try:
            self.process = subprocess.Popen(self.cmd, **popen_args)
        except OSError as err:
            self.text = 'SPMF could not be started: ' + str(err)
            return (None, self.text)

        def target():
            print_status('Thread started')
            # communicate() drains the pipe and waits for the process
            self.text = self.process.communicate()[0]
            print_status('Thread finished')

        thread = threading.Thread(target=target)
        thread.start()

        timed_out = False
        if timeout > 0:
            thread.join(timeout)
            if thread.is_alive():
                print_status('Terminating process')
                timed_out = True
                try:
                    if on_windows:
                        self.process.kill()
                    else:
                        os.killpg(self.process.pid, signal.SIGTERM)
                except OSError:
                    # The process exited on its own between the liveness
                    # check and the signal; nothing left to terminate.
                    pass

        # Always wait for the worker so returncode and text are final.
        thread.join()

        if timed_out:
            # Set after the worker finished, otherwise the worker would
            # overwrite this message with the partial output.
            self.text = 'Terminating SPMF after ' + str(timeout) + ' seconds'

        return (self.process.returncode, self.text)
82 | |
83 | |
def spmf(file, method="CM-SPADE", params=["70%", "3"], timeout=10):
    """Run an SPMF algorithm on an input file and return the output path.

    The executed command has the form
    ``java -Xmx1g -jar spmf.jar run <method> <file> <outfile> <params...>``,
    e.g. ``java -jar spmf.jar run CM-SPADE Beethoven.spmf output.txt 50% 3``.
    ``spmf.jar`` is given as a relative path.

    :param file: path of the SPMF input file
    :param method: name of the SPMF algorithm
    :param params: algorithm parameters appended to the command line;
                   1st is usually the minimal support of a sequence,
                   2nd the minimal length of a sequence (only read here,
                   never modified)
    :param timeout: seconds after which SPMF is terminated
    :returns: path of the output file. The path is returned even when
              SPMF failed or timed out; in that case an error is
              reported via print_status and the file may be empty.
    """
    # mkstemp creates the file itself, which avoids the race inherent in
    # the deprecated mktemp (a name that another process could claim).
    handle, outfile = tempfile.mkstemp()
    os.close(handle)

    command = ["java", "-Xmx1g", "-jar", "spmf.jar", "run",
               method, file, outfile]
    command.extend(params)

    retcode, text = Command(command).run(timeout=timeout)

    if retcode != 0:
        print_status("Terminated with errors" + text)
    return outfile
109 | |
110 | |
def relchords2spmf(input):
    """Write key-relative chord sequences to a temporary SPMF input file.

    :param input: dictionary of the form
                  ``dict[clipid] = [(time, key, mode, fun, typ, bfun), ...]``
    :returns: the (already closed) NamedTemporaryFile object; the file
              is created with delete=False, so it stays on disk and can
              be located through the object's ``name`` attribute.

    Each track becomes one line: every chord is converted with
    c2f.fun2num and followed by `` -1 `` (the separator of items or
    itemsets); the sequence is closed with ``-2``.
    """
    # The with-block guarantees the handle is closed even if the chord
    # conversion raises part-way through.
    with tempfile.NamedTemporaryFile(delete=False) as fspmf:
        for trackdata in input.values():
            items = [str(c2f.fun2num(fun, typ, bfun, mode)) + ' -1 '
                     for (_time, _key, mode, fun, typ, bfun) in trackdata]
            fspmf.write(''.join(items) + '-2\n')

    return fspmf
138 | |
139 | |
140 ## takes a dictionary of chords for one or multiple files | |
141 ## in the form of dict[clipid] = [ (time,key,mode,fun,typ,bfun) ] | |
142 ## and converts it into spmf | |
143 #def folder2spmf(folderin = 'D:/mirg/Chord_Analysis20141216/', fileout = 'D:/mirg/Chord_Analysis20141216/Beethoven.spmf'): | |
144 # | |
145 # # get chords for all files | |
146 # output = c2f.folder2functions(folderin) | |
147 # | |
148 # # open log | |
149 # logfile = fileout + '.dic' | |
150 # csvfile = open(logfile, "w+b") #opens the file for updating | |
151 # w = csv.writer(csvfile) | |
152 # w.writerow(["track","key","mode","sequence length"]) | |
153 # | |
154 # # open spmf file | |
155 # fspmf = open(fileout,'w') | |
156 # # --- | |
157 # # this is writing the spmf format | |
158 # for track,trackdata in output.iteritems(): | |
159 # # write chord sequence as one line in spmf file | |
160 # for (time,key,mode,fun,typ,bfun) in trackdata: | |
161 # chord = c2f.fun2num(fun,typ,bfun,mode) | |
162 # | |
163 # # -1 is the separator of items or itemsets | |
164 # fspmf.write(str(chord) + ' -1 ') | |
165 # | |
166 # # the sequence is closed with -2 | |
167 # fspmf.write('-2\n') | |
168 # w.writerow([track, str(key), str(mode),str(len(trackdata))]) | |
169 # | |
170 # fspmf.close() | |
171 # csvfile.close() | |
172 | |
173 # read an spmf file | |
174 # def parsespmf(filein = 'D:/mirg/Chord_Analysis20141216/Beethoven.txt'): | |
175 | |
def match(sourcefile = 'D:/mirg/Chord_Analysis20141216/Beethoven.spmf',sourcedict = 'D:/mirg/Chord_Analysis20141216/Beethoven.spmf.dic', patternfile = 'D:/mirg/Chord_Analysis20141216/Beethoven_70.txt'):
    """Match every pattern in patternfile against the sequences in sourcefile.

    :param sourcefile: path to the source spmf file with chord sequences
                       from records (one sequence per line)
    :param sourcedict: path to the track dictionary belonging to sourcefile
    :param patternfile: path to the spmf output file with patterns

    For each track the indices of all matching patterns are collected
    and written to disk with writeAllPatternsForClips.

    We assume that there are more files than patterns, as display of
    patterns is somehow limited; a parallelisation would therefore be
    one pattern against multiple files per instance.
    """
    # spmf2table returns (patterns, supports); only the patterns are
    # matched here.
    patterns, _supports = spmf2table(patternfile)

    # read track dictionary to get the input sequence names
    tracks = getClipDict(sourcedict)

    patterns_tracks = dict()
    tracks_patterns = dict()

    # NOTE(review): the n-th track yielded by the dictionary is paired
    # with the n-th line of sourcefile, so this is only correct if
    # getClipDict yields tracks in file order - confirm.
    with open(sourcefile, 'r') as source:
        # iterate over all tracks - to be parallelised
        for track in tracks:
            sequence = readSequence(next(source))
            print(track)
            for p, pattern in enumerate(patterns):
                # match open pattern
                if openPatternInSequence(sequence, pattern):
                    patterns_tracks.setdefault(p, []).append(track)
                    tracks_patterns.setdefault(track, []).append(p)

    # write clip index to files
    writeAllPatternsForClips('D:/mirg/Chord_Analysis20141216/',tracks_patterns)
225 | |
def writeAllPatternsForClips(path = 'D:/mirg/Chord_Analysis20141216/',tracks_patterns = dict()):
    """Write one ``<name>_patterns.csv`` file per entry of tracks_patterns.

    :param path: directory the files are written to
    :param tracks_patterns: dictionary mapping a name to the list that
                            is written as a single csv row (only read
                            here, never modified)
    """
    for name, contents in tracks_patterns.items():
        # The with-block closes the file even if writing the row fails.
        with open(path + '/' + name + '_patterns.csv', "w+b") as csvfile:
            # compress pattern data ?
            # e.g. 2 columns from-to for the long series of atomic increments
            csv.writer(csvfile).writerow(contents)
239 | |
def spmf2table(patternfile):
    """Read the output file of spmf into a table.

    :param patternfile: path to the spmf output file; a line looks like
                        ``1120401 -1 1120101 -1 #SUP: 916``
    :returns: tuple ``(patterns, supports)`` of two parallel lists as
              produced line by line by readPattern. Patterns are kept
              as strings, so any item representation works.

    At most max_lines lines are read; if the file is longer, a status
    message is printed and the remaining lines are ignored.
    """
    patterns = []
    supports = []

    with open(patternfile, 'r') as f:
        for linecnt, line in enumerate(f):
            # Check before parsing so that exactly max_lines lines are
            # read and the message only appears if more lines exist.
            if linecnt >= max_lines:
                print_status('Not reading more than ' + str(max_lines) + ' lines :(')
                break

            pattern, support = readPattern(line)
            patterns.append(pattern)
            supports.append(support)

    return patterns, supports
275 | |
def readPattern(line):
    """Parse one line of an spmf output file with frequent patterns.

    :param line: a line such as ``1120401 -1 1120101 -1 #SUP: 916``,
                 with or without a trailing newline
    :returns: tuple ``(pattern, support)``: ``pattern`` is the list of
              item strings, ``support`` the integer following ``#SUP:``
              or -1 if the line is considered broken.
    :raises ValueError: if the token following ``#SUP:`` is not an integer
    """
    marker = '#SUP:'
    suploc = line.find(marker)

    support = -1
    # A marker at position <= 6 (or a missing one) is treated as a
    # broken line.
    # NOTE(review): this threshold presumably relies on the 7-digit
    # chord codes of this project; confirm before reusing elsewhere.
    if suploc > 6:
        # Take the first token only: robust against a missing trailing
        # newline and against further fields after the support value.
        tokens = line[suploc + len(marker):].split()
        if tokens:
            support = int(tokens[0])

    # Without a marker the whole line is the pattern part; slicing with
    # the -1 returned by find() would silently drop the last character.
    if suploc >= 0:
        body = line[:suploc]
    else:
        body = line

    # every item is followed by ' -1 ', so the last split element is
    # the remainder after the final separator and is discarded
    pattern = body.split(' -1 ')[:-1]
    return (pattern, support)
291 | |
def readSequence(line):
    """Parse one line of an spmf input file into a chord sequence.

    :param line: a line such as ``1120401 -1 1120101 -1 -2``; items are
                 each followed by `` -1 `` and the sequence is closed
                 with ``-2``
    :returns: list of item strings
    """
    # locate the end-of-sequence marker
    endloc = line.find('-2')

    # Without a marker use the whole line; slicing with the -1 returned
    # by find() would drop the last character and lose the final item.
    if endloc >= 0:
        body = line[:endloc]
    else:
        body = line

    # the last split element is what follows the final separator
    return body.split(' -1 ')[:-1]
301 | |
def openPatternInSequence(sequence,pattern):
    """Find an open pattern in a sequence.

    The pattern matches if all of its items occur in the sequence in
    the same order; other items may lie in between.

    :param sequence: input sequence (list of items)
    :param pattern: pattern to be found (list of items)
    :returns: 1 if the complete pattern was found, 0 otherwise.
              An empty pattern never matches.
    """
    if not pattern:
        return 0

    patidx = 0
    for item in sequence:
        if item == pattern[patidx]:
            patidx += 1

            # The pattern is complete only once its last item has been
            # matched as well.
            if patidx == len(pattern):
                # could also return the start index
                return 1

    # finished the sequence before finishing the pattern
    return 0
317 | |
def closedPatternInSequence(sequence,pattern):
    """Find a closed pattern in a sequence.

    The pattern matches if its items occur in the sequence as one
    contiguous run. Items are compared by their string representation.

    :param sequence: input sequence (list of items)
    :param pattern: pattern to be found (list of items)
    :returns: True if the pattern occurs contiguously, False otherwise.
              An empty pattern always matches.
    """
    needle = [str(item) for item in pattern]
    haystack = [str(item) for item in sequence]
    width = len(needle)

    # Compare item-wise windows instead of concatenated strings, which
    # would also match across item boundaries (e.g. '12','3' in '1','23').
    # alternatively use KnuthMorrisPratt
    return any(haystack[start:start + width] == needle
               for start in range(len(haystack) - width + 1))
324 | |
def getClipDict(sourcedict):
    """Read all track names from a track dictionary csv file.

    :param sourcedict: path to the dictionary; the first row is a
                       legend, every following row has the four columns
                       track, key, mode, sequence length
    :returns: ordered dictionary ``tracks[track] = (key, mode, seqlen)``
              with all values as strings; iteration follows the row
              order of the file. An empty file gives an empty dictionary.
    """
    # Local import: keeps this function self-contained.
    from collections import OrderedDict

    # Row order is preserved so that callers can pair the n-th track
    # with the n-th line of the corresponding spmf file.
    tracks = OrderedDict()

    with open(sourcedict, 'rt') as f:
        reader = csv.reader(f)

        # skip first row that contains the legend (if there is one)
        next(reader, None)

        # get following rows
        for row in reader:
            if not row:
                # blank line in the file
                continue
            (track, key, mode, seqlen) = row
            tracks[track] = (key, mode, seqlen)

    return tracks
343 | |
344 | |
# Script entry point. SPMF itself is run afterwards, e.g. with
#   java -jar spmf.jar run CM-SPADE Beethoven.spmf output.txt 50% 3
if __name__ == "__main__":
    # The conversion and matching steps are currently disabled:
    #folder2spmf()
    #match()
    print("huhu")