annotate dml-cla/python/chord_seq_spmf_helper.py @ 0:718306e29690 tip

commiting public release
author Daniel Wolff
date Tue, 09 Feb 2016 21:05:06 +0100
parents
children
rev   line source
Daniel@0 1 #!/usr/bin/python
Daniel@0 2 # Part of DML (Digital Music Laboratory)
Daniel@0 3 # Copyright 2014-2015 Daniel Wolff, City University
Daniel@0 4
Daniel@0 5 # This program is free software; you can redistribute it and/or
Daniel@0 6 # modify it under the terms of the GNU General Public License
Daniel@0 7 # as published by the Free Software Foundation; either version 2
Daniel@0 8 # of the License, or (at your option) any later version.
Daniel@0 9 #
Daniel@0 10 # This program is distributed in the hope that it will be useful,
Daniel@0 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
Daniel@0 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Daniel@0 13 # GNU General Public License for more details.
Daniel@0 14 #
Daniel@0 15 # You should have received a copy of the GNU General Public
Daniel@0 16 # License along with this library; if not, write to the Free Software
Daniel@0 17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Daniel@0 18
Daniel@0 19 # -*- coding: utf-8 -*-
Daniel@0 20 #
Daniel@0 21 # This is a data conversion wrapper for the spmf toolkit
Daniel@0 22 __author__="Daniel Wolff"
Daniel@0 23
Daniel@0 24 import chord_seq_key_relative as c2f
Daniel@0 25 import csv
Daniel@0 26 import re
Daniel@0 27 import tempfile
Daniel@0 28 import subprocess
Daniel@0 29 import os
Daniel@0 30 import platform
Daniel@0 31 from aggregate import *
Daniel@0 32 from csvutils import *
Daniel@0 33
Daniel@0 34 # command for threading
Daniel@0 35 import subprocess, threading
Daniel@0 36 import signal
Daniel@0 37
# limit for sequences read: spmf2table checks its line counter against this
# value and stops reading the pattern file when the limit is hit
max_lines = 10000000
Daniel@0 40
Daniel@0 41
class Command(object):
    """Runs an external command in a worker thread with an optional timeout.

    Attributes:
        cmd      argument list handed to subprocess.Popen
        process  the Popen object once the worker has created it, else None
        text     combined stdout/stderr of the command, or a status message
    """

    def __init__(self, cmd):
        self.cmd = cmd
        self.process = None
        # reported when the worker never gets as far as reading any output
        self.text = 'SPMF terminated unexpectedly'

    def run(self, timeout):
        """Start the command and wait for it to finish.

        @param timeout: seconds to wait before the process is terminated;
                        a value <= 0 waits without limit
        @return tuple (returncode, text); returncode is None when the
                process could not be started at all
        """
        on_windows = 'Win' in platform.system()

        def target():
            print_status('Thread started')
            if on_windows:
                self.process = subprocess.Popen(self.cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False)
            else:
                # os.setsid puts the child into its own session / process
                # group, so that killpg below reaches the whole group
                self.process = subprocess.Popen(self.cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False, preexec_fn=os.setsid)

            self.text = self.process.stdout.read()
            self.process.communicate()

            print_status('Thread finished')

        thread = threading.Thread(target=target)
        thread.start()

        timed_out = False

        # wait until timeout if specified
        if timeout > 0:
            thread.join(timeout)
            if thread.is_alive():
                timed_out = True
                print_status('Terminating process')
                # the worker may not have created the process yet
                if self.process is not None:
                    try:
                        if on_windows:
                            self.process.kill()
                        else:
                            os.killpg(self.process.pid, signal.SIGTERM)
                    except OSError:
                        # the process exited between the liveness check
                        # and the kill: nothing left to terminate
                        pass

        # always let the worker finish before the results are read
        thread.join()

        if timed_out:
            # set only after the worker is done: the worker stores the
            # captured output in self.text and would otherwise overwrite
            # this message
            self.text = 'Terminating SPMF after ' + str(timeout) + ' seconds'

        # self.process stays None when Popen itself failed in the worker
        retcode = None if self.process is None else self.process.returncode
        return (retcode, self.text)
Daniel@0 82
Daniel@0 83
Daniel@0 84 # runs the spmf java with method and parameters as specified
Daniel@0 85 # 1st parameter: usually minimal support of sequence
Daniel@0 86 # 2nd parameter: minimal length of sequence
Daniel@0 87 # run spmf with java -jar spmf.jar run CM-SPADE Beethoven.spmf output.txt 50% 3
def spmf(file, method="CM-SPADE", params=("70%", "3"), timeout=10):
    """Runs the spmf java toolkit on an spmf input file.

    The command line built here is
        java -Xmx1g -jar spmf.jar run <method> <file> <outfile> <params...>

    @param file: path of the spmf input file
    @param method: name of the spmf algorithm to run
    @param params: further command line parameters of the algorithm, e.g.
                   minimal support and minimal sequence length
    @param timeout: seconds handed to Command.run before spmf is terminated
    @return path of the output file; it is returned whether or not spmf
            finished successfully
    """
    # create output file name; mkstemp creates the file itself, whereas the
    # deprecated mktemp only returned a name that another process could
    # claim before spmf wrote to it
    handle, outfile = tempfile.mkstemp()
    os.close(handle)

    command = ["java", "-Xmx1g", "-jar", "spmf.jar", "run", method, file, outfile]
    command.extend(params)

    proc = Command(command)
    retcode, text = proc.run(timeout=timeout)

    if retcode != 0:
        print_status("Terminated with errors" + text)

    return outfile
Daniel@0 109
Daniel@0 110
Daniel@0 111 # takes a dictionary of chords for one or multiple files
Daniel@0 112 # in the form of dict[clipid] = [ (time,key,mode,fun,typ,bfun) ]
Daniel@0 113 # and converts it into spmf
Daniel@0 114 #
Daniel@0 115 # output: tempfile of spmf output
def relchords2spmf(input):
    """Writes key-relative chord sequences to a temporary file in spmf format.

    Every track becomes one line: each chord is encoded with c2f.fun2num and
    followed by the item separator ' -1 ', and the line is closed with '-2'.

    @param input: dictionary of the form
                  dict[clipid] = [ (time,key,mode,fun,typ,bfun) ]
    @return the closed temporary file object (created with delete=False);
            its path is available as .name
    """
    # the with-block closes the file even if encoding a chord raises
    with tempfile.NamedTemporaryFile(delete=False) as fspmf:
        for trackdata in input.values():
            # -1 is the separator of items or itemsets
            items = [str(c2f.fun2num(fun, typ, bfun, mode)) + ' -1 '
                     for (time, key, mode, fun, typ, bfun) in trackdata]

            # the sequence is closed with -2
            fspmf.write(''.join(items) + '-2\n')

    return fspmf
Daniel@0 138
Daniel@0 139
Daniel@0 140 ## takes a dictionary of chords for one or multiple files
Daniel@0 141 ## in the form of dict[clipid] = [ (time,key,mode,fun,typ,bfun) ]
Daniel@0 142 ## and converts it into spmf
Daniel@0 143 #def folder2spmf(folderin = 'D:/mirg/Chord_Analysis20141216/', fileout = 'D:/mirg/Chord_Analysis20141216/Beethoven.spmf'):
Daniel@0 144 #
Daniel@0 145 # # get chords for all files
Daniel@0 146 # output = c2f.folder2functions(folderin)
Daniel@0 147 #
Daniel@0 148 # # open log
Daniel@0 149 # logfile = fileout + '.dic'
Daniel@0 150 # csvfile = open(logfile, "w+b") #opens the file for updating
Daniel@0 151 # w = csv.writer(csvfile)
Daniel@0 152 # w.writerow(["track","key","mode","sequence length"])
Daniel@0 153 #
Daniel@0 154 # # open spmf file
Daniel@0 155 # fspmf = open(fileout,'w')
Daniel@0 156 # # ---
Daniel@0 157 # # this is writing the spmf format
Daniel@0 158 # for track,trackdata in output.iteritems():
Daniel@0 159 # # write chord sequence as one line in spmf file
Daniel@0 160 # for (time,key,mode,fun,typ,bfun) in trackdata:
Daniel@0 161 # chord = c2f.fun2num(fun,typ,bfun,mode)
Daniel@0 162 #
Daniel@0 163 # # -1 is the spearator of items or itemsets
Daniel@0 164 # fspmf.write(str(chord) + ' -1 ')
Daniel@0 165 #
Daniel@0 166 # # the sequence is closed with -2
Daniel@0 167 # fspmf.write('-2\n')
Daniel@0 168 # w.writerow([track, str(key), str(mode),str(len(trackdata))])
Daniel@0 169 #
Daniel@0 170 # fspmf.close()
Daniel@0 171 # csvfile.close()
Daniel@0 172
Daniel@0 173 # read an spmf file
Daniel@0 174 # def parsespmf(filein = 'D:/mirg/Chord_Analysis20141216/Beethoven.txt'):
Daniel@0 175
Daniel@0 176 # string sourcefile path to the source spmf file with chords from records
Daniel@0 177 # string patternfile path to the pattern spmf file
Daniel@0 178 # matches each of the patterns in patternfile
Daniel@0 179 # to the chord sequences in sourcefile
def match(sourcefile='D:/mirg/Chord_Analysis20141216/Beethoven.spmf',
          sourcedict='D:/mirg/Chord_Analysis20141216/Beethoven.spmf.dic',
          patternfile='D:/mirg/Chord_Analysis20141216/Beethoven_70.txt',
          outpath='D:/mirg/Chord_Analysis20141216/'):
    """Matches each pattern in patternfile to the chord sequences in sourcefile.

    For every track the indices of all patterns that occur in its sequence
    (as open patterns) are collected and written to one csv file per track
    via writeAllPatternsForClips.

    @param sourcefile: path to the source spmf file with chords from records
    @param sourcedict: path to the track dictionary belonging to sourcefile
    @param patternfile: path to the pattern spmf file
    @param outpath: folder the per-track csv files are written to

    Raises StopIteration if sourcefile has fewer lines than sourcedict has
    tracks.
    """
    # spmf2table returns (patterns, supports); only the patterns are matched
    patterns, _supports = spmf2table(patternfile)

    # read track dictionary and get the input sequence names
    # NOTE(review): tracks are paired with the lines of sourcefile purely by
    # position, so this relies on getClipDict yielding the tracks in file
    # order - confirm
    tracks = getClipDict(sourcedict)

    tracks_patterns = dict()

    # read the input sequences
    with open(sourcefile, 'r') as source:
        # iterate over all tracks - to be parallelised
        for track in tracks:
            sequence = readSequence(next(source))
            print(track)
            for p, pattern in enumerate(patterns):
                # match open or closed pattern
                if openPatternInSequence(sequence, pattern):
                    tracks_patterns.setdefault(track, []).append(p)

    # write clip index to files
    writeAllPatternsForClips(outpath, tracks_patterns)
Daniel@0 225
Daniel@0 226 # writes results to disk per key
def writeAllPatternsForClips(path='D:/mirg/Chord_Analysis20141216/', tracks_patterns=None):
    """Writes one csv file per track containing the track's pattern indices.

    The file for a track is <path>/<name>_patterns.csv and holds a single
    row with the values stored for that track.

    @param path: output folder
    @param tracks_patterns: dict[name] = list of values written as one row;
                            None is treated as an empty dictionary
    """
    if tracks_patterns is None:
        tracks_patterns = {}

    for name, contents in tracks_patterns.items():
        # the with-block closes the file even if writing the row fails
        with open(path + '/' + name + '_patterns.csv', "w+b") as csvfile:
            csv.writer(csvfile).writerow(contents)
Daniel@0 239
Daniel@0 240 # reads output of spmf to table
def spmf2table(patternfile):
    """Reads the output of spmf into a table.

    A line of the file looks like this:
        1120401 -1 1120101 -1 #SUP: 916
    Items are kept as strings, so any item representation works.

    @param patternfile: path to the spmf output file
    @return tuple (patterns, supports): patterns is a list with one list of
            item strings per line, supports the matching list of support
            counts as returned by readPattern
    """
    patterns = []
    supports = []

    with open(patternfile, 'r') as f:
        for linecnt, line in enumerate(f):
            # test before parsing, so that at most max_lines lines are used
            # and the warning only appears when something is left out
            if linecnt >= max_lines:
                print_status('Not reading more than ' + str(max_lines) + ' lines :(')
                break

            pattern, support = readPattern(line)
            patterns.append(pattern)
            supports.append(support)

    return patterns, supports
Daniel@0 275
Daniel@0 276 # @param line: reads a line in the spmf output file with frequent patterns
Daniel@0 277 # returns list of strings "pattern" and int "support"
def readPattern(line):
    """Parses one line of an spmf output file with frequent patterns.

    Example line: '1120401 -1 1120101 -1 #SUP: 916'

    @param line: line of the spmf output file
    @return tuple (pattern, support): pattern is the list of item strings,
            support the integer after '#SUP:' or -1 if the line carries no
            support value
    """
    marker = '#SUP:'
    suploc = line.find(marker)

    if suploc >= 0:
        # take the first token after the marker instead of cutting a fixed
        # number of characters: the last line of a file may come without a
        # trailing newline
        tokens = line[suploc + len(marker):].split()
        support = int(tokens[0]) if tokens else -1
        body = line[:suploc]
    else:
        # broken line without support: the whole line is the pattern
        support = -1
        body = line

    # every item is followed by ' -1 ', so the last split element is the
    # remainder after the final separator and is dropped
    pattern = body.split(' -1 ')[:-1]
    return (pattern, support)
Daniel@0 291
Daniel@0 292 # @param line: reads a line in the spmf input file with chord sequence
Daniel@0 293 # returns list of strings "pattern" and int "support"
def readSequence(line):
    """Parses one line of an spmf input file with a chord sequence.

    Example line: '1120401 -1 1120101 -1 -2'

    @param line: line of the spmf input file
    @return list of item strings of the sequence
    """
    # locate the end-of-sequence marker
    endloc = line.find('-2')

    # without a marker the whole line is used; slicing with the -1 returned
    # by find would silently cut the last character instead
    body = line if endloc < 0 else line[:endloc]

    # every item is followed by ' -1 ', so the last split element is the
    # remainder after the final separator and is dropped
    return body.split(' -1 ')[:-1]
Daniel@0 301
Daniel@0 302 # finds open pattern in sequences
Daniel@0 303 # @param [string] sequence input sequence
Daniel@0 304 # @param [string] pattern pattern to be found
def openPatternInSequence(sequence, pattern):
    """Finds an open pattern in a sequence.

    The pattern matches if its items occur in the sequence in the same
    order; other items may lie in between.

    @param [string] sequence input sequence
    @param [string] pattern pattern to be found
    @return 1 if the pattern is contained in the sequence, 0 otherwise;
            an empty pattern is contained in every sequence
    """
    patlen = len(pattern)
    if patlen == 0:
        return 1

    patidx = 0
    for item in sequence:
        if item == pattern[patidx]:
            patidx += 1

            # the pattern is complete only once its last item was matched
            if patidx == patlen:
                # could also return the start index
                return 1

    # finished the sequence before finishing pattern
    return 0
Daniel@0 317
Daniel@0 318 # finds closed pattern in sequences
Daniel@0 319 # @param [string] sequence input sequence
Daniel@0 320 # @param [string] pattern pattern to be found
def closedPatternInSequence(sequence, pattern):
    """Finds a closed pattern in a sequence.

    The pattern matches if its items occur as one contiguous run of items
    in the sequence. Items are compared by their string representation.

    @param [string] sequence input sequence
    @param [string] pattern pattern to be found
    @return True if the pattern is contained in the sequence, else False;
            an empty pattern is contained in every sequence
    """
    needle = [str(item) for item in pattern]
    haystack = [str(item) for item in sequence]
    width = len(needle)

    # compare whole items window by window: joining the items into one
    # string would also match across item boundaries ('12','3' in '1','23')
    return any(haystack[start:start + width] == needle
               for start in range(len(haystack) - width + 1))
Daniel@0 324
Daniel@0 325 # reads all track names from the dictionary created by folder2spmf
Daniel@0 326 # @param sourcedict path to dictionary
def getClipDict(sourcedict):
    """Reads all track names from a track dictionary csv file.

    The first row is skipped as legend; every following row must have the
    four columns track, key, mode, sequence length.

    @param sourcedict path to dictionary
    @return ordered dictionary dict[track] = (key, mode, seqlen) with all
            values as strings, in the order of the rows in the file
    """
    from collections import OrderedDict

    # keep the row order: row i of the dictionary belongs to line i of the
    # spmf file, and a plain dict does not guarantee that order on python 2
    tracks = OrderedDict()

    with open(sourcedict, 'rt') as f:
        reader = csv.reader(f)

        # skip first row that contains legend (tolerating an empty file)
        next(reader, None)

        # get following rows
        for (track, key, mode, seqlen) in reader:
            tracks[track] = (key, mode, seqlen)

    return tracks
Daniel@0 343
Daniel@0 344
Daniel@0 345 # run spmf afterwards with java -jar spmf.jar run CM-SPADE Beethoven.spmf output.txt 50% 3
if __name__ == "__main__":
    # script entry point: the conversion and matching steps are disabled,
    # only a sign of life is printed
    #folder2spmf()
    #match()
    print("huhu")