comparison dml-cla/python/chord_seq_spmf_helper.py @ 0:718306e29690 tip

committing public release
author Daniel Wolff
date Tue, 09 Feb 2016 21:05:06 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:718306e29690
1 #!/usr/bin/python
2 # Part of DML (Digital Music Laboratory)
3 # Copyright 2014-2015 Daniel Wolff, City University
4
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public
16 # License along with this library; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
19 # -*- coding: utf-8 -*-
20 #
21 # This is a data conversion wrapper for the spmf toolkit
22 __author__="Daniel Wolff"
23
24 import chord_seq_key_relative as c2f
25 import csv
26 import re
27 import tempfile
28 import subprocess
29 import os
30 import platform
31 from aggregate import *
32 from csvutils import *
33
34 # command for threading
35 import subprocess, threading
36 import signal
37
# Upper bound on the number of pattern lines read from one SPMF output file
max_lines = 10 ** 7
40
41
class Command(object):
    """Run an external command in a worker thread, with an optional timeout.

    The command's stdout and stderr are captured together.  When the timeout
    expires the process is killed (on non-Windows systems the whole process
    group is signalled).
    """

    def __init__(self, cmd):
        """Remember the command to run.

        cmd: argument list passed to subprocess.Popen (shell=False).
        """
        # argument list for subprocess.Popen
        self.cmd = cmd
        # Popen handle; stays None until the worker thread started the process
        self.process = None
        # captured output of the process, or a status message
        self.text = 'SPMF terminated unexpectedly'

    def run(self, timeout):
        """Execute the command and wait for it to finish.

        timeout: number of seconds after which the process is killed;
                 a value <= 0 waits without limit.

        Returns a tuple (returncode, text).  returncode is None when the
        process could not be started at all.  text is the captured output,
        or a status message when the process was killed or never started.
        """
        def target():
            print_status('Thread started')
            popen_args = dict(stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT,
                              shell=False)
            if 'Win' not in platform.system():
                # start a new session so that os.killpg() in _terminate()
                # can signal the process group led by this process
                popen_args['preexec_fn'] = os.setsid

            try:
                self.process = subprocess.Popen(self.cmd, **popen_args)
            except OSError as err:
                # e.g. executable not found: keep process None, report why
                self.text = 'SPMF could not be started: ' + str(err)
            else:
                # communicate() drains the pipe and waits for the exit code
                output = self.process.communicate()[0]
                if not isinstance(output, str):
                    # Python 3 returns bytes; callers concatenate with str
                    output = output.decode('utf-8', 'replace')
                self.text = output

            print_status('Thread finished')

        thread = threading.Thread(target=target)
        thread.start()

        killed = False
        # wait until timeout if specified
        if timeout > 0:
            thread.join(timeout)
            if thread.is_alive():
                print_status('Terminating process')
                killed = self._terminate()
        thread.join()

        if killed:
            # set after the final join so the worker cannot overwrite it
            self.text = 'Terminating SPMF after ' + str(timeout) + ' seconds'

        returncode = None if self.process is None else self.process.returncode
        return (returncode, self.text)

    def _terminate(self):
        """Kill the running process; return True if a kill was issued."""
        proc = self.process
        if proc is None:
            # the worker has not created the process yet, nothing to kill
            return False
        try:
            if 'Win' in platform.system():
                proc.kill()
            else:
                os.killpg(proc.pid, signal.SIGTERM)
        except OSError:
            # the process exited between the liveness check and the kill
            return False
        return True
82
83
84 # runs the spmf java with method and parameters as specified
85 # 1st parameter: usually minimal support of sequence
86 # 2nd parameter: minimal length of sequence
87 # run spmf with java -jar spmf.jar run CM-SPADE Beethoven.spmf output.txt 50% 3
def spmf(file, method="CM-SPADE", params=["70%", "3"], timeout=10):
    """Run the SPMF java toolkit on an input file and return the output path.

    The call corresponds to
        java -Xmx1g -jar spmf.jar run <method> <file> <outfile> <params...>
    so spmf.jar is looked up relative to the current working directory.

    file:    path of the input file handed to SPMF
    method:  name of the SPMF algorithm
    params:  extra algorithm parameters appended to the command line
             (only read here, never modified); according to the notes above
             this function, the first is usually the minimal support and
             the second the minimal length of a sequence
    timeout: seconds after which SPMF is killed, see Command.run()

    Returns the path of the output file.  The path is returned even when
    SPMF failed or was killed; in that case the file may be empty.
    """
    # mkstemp() creates the file itself, unlike mktemp() which only returns
    # a name that another process could grab before SPMF writes to it
    fd, outfile = tempfile.mkstemp()
    os.close(fd)

    command = ["java", "-Xmx1g", "-jar", "spmf.jar", "run",
               method, file, outfile]
    command.extend(params)

    retcode, text = Command(command).run(timeout=timeout)

    if retcode != 0:
        print_status("Terminated with errors" + str(text))
    return outfile
109
110
111 # takes a dictionary of chords for one or multiple files
112 # in the form of dict[clipid] = [ (time,key,mode,fun,typ,bfun) ]
113 # and converts it into spmf
114 #
115 # output: tempfile of spmf output
def relchords2spmf(input):
    """Write chord sequences to a temporary file in SPMF input format.

    input: dictionary of the form
           dict[clipid] = [ (time,key,mode,fun,typ,bfun), ... ]

    Every dictionary entry becomes one line.  Each chord is converted with
    c2f.fun2num(fun,typ,bfun,mode) and followed by the item separator
    ' -1 '; the line is closed with '-2'.

    Returns the (already closed) temporary file object; its path is
    available as the .name attribute.  The file is not deleted on close.
    """
    fspmf = tempfile.NamedTemporaryFile(delete=False)

    # the with-block closes the file even if a conversion fails half way
    with fspmf:
        for trackdata in input.values():
            # -1 is the separator of items or itemsets
            items = [str(c2f.fun2num(fun, typ, bfun, mode)) + ' -1 '
                     for (_time, _key, mode, fun, typ, bfun) in trackdata]

            # the sequence is closed with -2
            line = ''.join(items) + '-2\n'

            # the temp file is opened in binary mode, so write bytes
            fspmf.write(line.encode('ascii'))

    return fspmf
138
139
140 ## takes a dictionary of chords for one or multiple files
141 ## in the form of dict[clipid] = [ (time,key,mode,fun,typ,bfun) ]
142 ## and converts it into spmf
143 #def folder2spmf(folderin = 'D:/mirg/Chord_Analysis20141216/', fileout = 'D:/mirg/Chord_Analysis20141216/Beethoven.spmf'):
144 #
145 # # get chords for all files
146 # output = c2f.folder2functions(folderin)
147 #
148 # # open log
149 # logfile = fileout + '.dic'
150 # csvfile = open(logfile, "w+b") #opens the file for updating
151 # w = csv.writer(csvfile)
152 # w.writerow(["track","key","mode","sequence length"])
153 #
154 # # open spmf file
155 # fspmf = open(fileout,'w')
156 # # ---
157 # # this is writing the spmf format
158 # for track,trackdata in output.iteritems():
159 # # write chord sequence as one line in spmf file
160 # for (time,key,mode,fun,typ,bfun) in trackdata:
161 # chord = c2f.fun2num(fun,typ,bfun,mode)
162 #
163 # # -1 is the separator of items or itemsets
164 # fspmf.write(str(chord) + ' -1 ')
165 #
166 # # the sequence is closed with -2
167 # fspmf.write('-2\n')
168 # w.writerow([track, str(key), str(mode),str(len(trackdata))])
169 #
170 # fspmf.close()
171 # csvfile.close()
172
173 # read an spmf file
174 # def parsespmf(filein = 'D:/mirg/Chord_Analysis20141216/Beethoven.txt'):
175
176 # string sourcefile path to the source spmf file with chords from records
177 # string patternfile path to the pattern spmf file
178 # matches each of the patterns in patternfile
179 # to the chord sequences in sourcefile
def match(sourcefile = 'D:/mirg/Chord_Analysis20141216/Beethoven.spmf',sourcedict = 'D:/mirg/Chord_Analysis20141216/Beethoven.spmf.dic', patternfile = 'D:/mirg/Chord_Analysis20141216/Beethoven_70.txt', outpath = 'D:/mirg/Chord_Analysis20141216/'):
    """Match every pattern in patternfile against the sequences in sourcefile.

    sourcefile:  path to the source spmf file with chords from records,
                 one sequence per line
    sourcedict:  path to the track dictionary (csv) belonging to sourcefile
    patternfile: path to the pattern spmf file
    outpath:     folder that receives one '<track>_patterns.csv' per track
                 with at least one matching pattern

    For each track, the indices of all patterns found as open patterns
    (see openPatternInSequence) are written via writeAllPatternsForClips.

    NOTE(review): line i of sourcefile is paired with the i-th track that
    getClipDict() yields; this is only correct if that iteration order
    equals the row order of sourcedict - confirm.
    """
    # ---
    # we here assume that there are more files than patterns,
    # as display of patterns is somehow limited
    # therefore parallelisation will be 1 pattern/multiple files
    # per instance
    # ---

    # spmf2table returns both the patterns and their supports
    patterns, _supports = spmf2table(patternfile)

    # read track dictionary and get the input sequence names
    tracks = getClipDict(sourcedict)

    tracks_patterns = dict()

    # read the input sequences
    with open(sourcefile, 'r') as source:
        # iterate over all tracks - to be parallelised
        for track in tracks:
            line = next(source, None)
            if line is None:
                # fewer sequences than tracks: nothing left to match
                break

            sequence = readSequence(line)
            print(track)
            for p, pattern in enumerate(patterns):
                # an empty pattern carries no information, skip it
                if not pattern:
                    continue
                # match open pattern
                if openPatternInSequence(sequence, pattern):
                    tracks_patterns.setdefault(track, []).append(p)

    # write clip index to files
    writeAllPatternsForClips(outpath, tracks_patterns)
225
226 # writes results to disk per key
def writeAllPatternsForClips(path = 'D:/mirg/Chord_Analysis20141216/',tracks_patterns = dict()):
    """Write one csv file per track containing its pattern indices.

    path:            output folder
    tracks_patterns: dict[track name] = list of values to write;
                     only read here, never modified

    For every entry a file '<path>/<name>_patterns.csv' is created
    (or overwritten) that holds the entry's list as a single csv row.
    """
    import sys

    for name, contents in tracks_patterns.items():
        filename = os.path.join(path, name + '_patterns.csv')

        # the csv module wants a binary file on Python 2 and a text file
        # without newline translation on Python 3
        if sys.version_info[0] >= 3:
            csvfile = open(filename, 'w', newline='')
        else:
            csvfile = open(filename, 'w+b')

        # the with-block closes the file even if writing fails
        with csvfile:
            csv.writer(csvfile).writerow(contents)
239
240 # reads output of spmf to table
def spmf2table(patternfile):
    """Read the frequent patterns written by SPMF into two parallel lists.

    patternfile: path to an SPMF output file; a line looks like this:
                 1120401 -1 1120101 -1 #SUP: 916

    Returns (patterns, supports): patterns[i] is a list of item strings and
    supports[i] the matching support as returned by readPattern().
    At most max_lines patterns are read; blank lines are skipped.
    """
    patterns = []
    supports = []

    with open(patternfile, 'r') as f:
        for line in f:
            # check before parsing, so exactly max_lines patterns are kept
            if len(patterns) >= max_lines:
                print_status('Not reading more than ' + str(max_lines) + ' lines :(')
                break

            # a blank line would only yield an empty pattern
            if not line.strip():
                continue

            # items are kept as strings, so any representation works
            pattern, support = readPattern(line)
            patterns.append(pattern)
            supports.append(support)

    return patterns, supports
275
276 # @param line: reads a line in the spmf output file with frequent patterns
277 # returns list of strings "pattern" and int "support"
def readPattern(line):
    """Parse one line of an SPMF output file with frequent patterns.

    line: a line such as '1120401 -1 1120101 -1 #SUP: 916'

    Returns (pattern, support): pattern is the list of item strings found
    before the '#SUP:' marker, support the integer following the marker.
    support is -1 when the marker is missing or not followed by a number.
    """
    marker = '#SUP:'

    # locate support
    suploc = line.find(marker)

    if suploc < 0:
        # broken line without support: use everything but the line ending
        body = line.rstrip('\r\n')
        support = -1
    else:
        body = line[:suploc]
        # take the first token after the marker; this works with or without
        # a trailing newline and ignores anything that follows the number
        fields = line[suploc + len(marker):].split()
        try:
            support = int(fields[0])
        except (IndexError, ValueError):
            support = -1

    # extract pattern; every item is followed by ' -1 ', so the last
    # element of the split is the (empty) remainder and is dropped
    pattern = body.split(' -1 ')[:-1]
    return (pattern, support)
291
292 # @param line: reads a line in the spmf input file with chord sequence
293 # returns the chord sequence as a list of strings
def readSequence(line):
    """Parse one line of an SPMF input file into a list of item strings.

    line: a line such as '1120401 -1 1120101 -1 -2'

    Returns the items found before the closing '-2'.  If the terminator is
    missing, the whole line (without its line ending) is parsed instead.
    """
    # locate the end-of-sequence marker
    endloc = line.find('-2')

    if endloc >= 0:
        body = line[:endloc]
    else:
        # find() returned -1; slicing with it would silently drop the
        # last character of the line
        body = line.rstrip('\r\n')

    # every item is followed by ' -1 ', so the last element of the split
    # is the (empty) remainder and is dropped
    return body.split(' -1 ')[:-1]
301
302 # finds open pattern in sequences
303 # @param [string] sequence input sequence
304 # @param [string] pattern pattern to be found
def openPatternInSequence(sequence,pattern):
    """Test whether pattern occurs in sequence as an open pattern.

    An open pattern matches when all its items appear in the sequence in
    the same order; other items may lie in between.

    sequence: list of items of the input sequence
    pattern:  list of items to be found

    Returns 1 if the complete pattern was found, 0 otherwise.
    An empty pattern is reported as not found (0).
    """
    if len(pattern) == 0:
        return 0

    # one shared iterator: the search for each pattern item continues
    # where the previous item was found
    remaining = iter(sequence)
    for wanted in pattern:
        if not any(item == wanted for item in remaining):
            # finished the sequence before finishing pattern
            return 0

    # every item, including the last one, was matched
    return 1
317
318 # finds closed pattern in sequences
319 # @param [string] sequence input sequence
320 # @param [string] pattern pattern to be found
def closedPatternInSequence(sequence,pattern):
    """Test whether pattern occurs in sequence as a closed pattern.

    A closed pattern matches when its items appear in the sequence directly
    one after the other.  Items are compared by their string form.

    sequence: list of items of the input sequence
    pattern:  list of items to be found

    Returns True if the pattern was found, False otherwise.
    An empty pattern is always found.
    """
    wanted = [str(item) for item in pattern]
    items = [str(item) for item in sequence]
    width = len(wanted)

    # compare whole items window by window; joining the items to one string
    # would let '12','3' match inside '1','23'
    return any(items[start:start + width] == wanted
               for start in range(len(items) - width + 1))
324
325 # reads all track names from the dictionary created by folder2spmf
326 # @param sourcedict path to dictionary
def getClipDict(sourcedict):
    """Read the track dictionary (csv) into a dictionary.

    sourcedict: path to a csv file whose first row is a legend and whose
                following rows have the four columns track, key, mode and
                sequence length

    Returns an ordered dictionary dict[track] = (key, mode, seqlen) that
    keeps the row order of the file; all values are strings as read.
    Blank rows are skipped.
    """
    from collections import OrderedDict

    tracks = OrderedDict()

    # the with-block closes the file even if a row is malformed
    with open(sourcedict, 'rt') as f:
        reader = csv.reader(f)

        # skip first row that contains legend (nothing to skip if empty)
        next(reader, None)

        # get following rows
        for row in reader:
            if not row:
                continue
            track, key, mode, seqlen = row
            tracks[track] = (key, mode, seqlen)

    return tracks
343
344
345 # run spmf afterwards with java -jar spmf.jar run CM-SPADE Beethoven.spmf output.txt 50% 3
if __name__ == "__main__":
    # script entry point; the actual conversion and matching calls are
    # currently disabled
    #folder2spmf()
    #match()
    # parenthesised form prints the same on Python 2 and Python 3
    print("huhu")