diff dml-cla/python/chord_seq_spmf_helper.py @ 0:718306e29690 tip

commiting public release
author Daniel Wolff
date Tue, 09 Feb 2016 21:05:06 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dml-cla/python/chord_seq_spmf_helper.py	Tue Feb 09 21:05:06 2016 +0100
@@ -0,0 +1,349 @@
+#!/usr/bin/python
+# Part of DML (Digital Music Laboratory)
+# Copyright 2014-2015 Daniel Wolff, City University
+ 
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+# -*- coding: utf-8 -*-
+#
+# This is a data conversion wrapper for the spmf toolkit
+__author__="Daniel Wolff"
+
+import chord_seq_key_relative as c2f
+import csv
+import re
+import tempfile
+import subprocess
+import os
+import platform
+from aggregate import *
+from csvutils import *
+
+# command for threading
+import subprocess, threading
+import signal
+
+# Upper bound on the number of pattern lines spmf2table() reads from an
+# SPMF output file before it stops and reports that the limit was hit.
+max_lines = 10000000
+
+
+class Command(object):
+    """Run an external command in a worker thread with an optional timeout.
+
+    stdout and stderr of the command are captured together and handed back
+    by run().
+    """
+
+    def __init__(self, cmd):
+        """Store the argument list ``cmd`` that is passed to subprocess.Popen."""
+        self.cmd = cmd
+        self.process = None
+        # reported if the worker thread dies before any output was captured
+        self.text = 'SPMF terminated unexpectedly'
+
+    def run(self, timeout):
+        """Execute the command and wait until it finishes or times out.
+
+        @param timeout: seconds to wait before the process is terminated;
+            a value <= 0 waits without limit
+        @return: tuple (returncode, text). returncode is None if the process
+            could not be started. text is the captured output, or a message
+            describing the start failure / the timeout.
+        """
+        is_windows = 'Win' in platform.system()
+
+        def target():
+            print_status('Thread started')
+            popen_args = dict(stdout=subprocess.PIPE,
+                              stderr=subprocess.STDOUT, shell=False)
+            if not is_windows:
+                # start a new session so that killpg() below addresses the
+                # process group of the started command
+                popen_args['preexec_fn'] = os.setsid
+            try:
+                self.process = subprocess.Popen(self.cmd, **popen_args)
+            except OSError as err:
+                # e.g. executable not found: keep self.process as None
+                self.text = 'SPMF could not be started: ' + str(err)
+            else:
+                # communicate() drains the pipe and waits for the exit code
+                self.text = self.process.communicate()[0]
+            print_status('Thread finished')
+
+        thread = threading.Thread(target=target)
+        thread.start()
+
+        timed_out = False
+        if timeout > 0:
+            # wait until timeout
+            thread.join(timeout)
+            if thread.is_alive():
+                timed_out = True
+                print_status('Terminating process')
+                # Popen may not have returned yet; wait until there is a
+                # process to signal (or the worker gave up)
+                while thread.is_alive() and self.process is None:
+                    thread.join(0.05)
+                if thread.is_alive():
+                    try:
+                        if is_windows:
+                            self.process.kill()
+                        else:
+                            os.killpg(self.process.pid, signal.SIGTERM)
+                    except OSError:
+                        # the process exited on its own in the meantime
+                        pass
+                # always collect the worker so that returncode is set
+                thread.join()
+        else:
+            thread.join()
+
+        if timed_out and self.process is not None:
+            # set after the join: the worker would otherwise overwrite it
+            self.text = 'Terminating SPMF after ' + str(timeout) + ' seconds'
+
+        returncode = None if self.process is None else self.process.returncode
+        return (returncode, self.text)
+
+
+# runs the spmf java with method and parameters as specified
+# 1st parameter: usually minimal support of sequence
+# 2nd parameter: minimal length of sequence
+# run spmf with java -jar spmf.jar run CM-SPADE Beethoven.spmf output.txt 50% 3
+def spmf(file, method="CM-SPADE", params=None, timeout=10):
+    """Call ``java -jar spmf.jar run <method> <file> <outfile> <params...>``.
+
+    @param file: path of the spmf input file
+    @param method: name of the SPMF algorithm
+    @param params: list of algorithm parameters as strings,
+        defaults to ["70%", "3"]
+    @param timeout: seconds after which the java process is terminated
+    @return: path of the output file. It is returned even if SPMF failed
+        or timed out; in that case it may be empty.
+    """
+    if params is None:
+        params = ["70%", "3"]
+
+    # create output file name; mkstemp() creates the file itself, which
+    # avoids the name race of the deprecated mktemp()
+    handle, outfile = tempfile.mkstemp()
+    os.close(handle)
+
+    command = ["java", "-Xmx1g", "-jar", "spmf.jar", "run",
+               method, file, outfile]
+    command.extend(params)
+
+    #print_status('CWD:' + os.getcwd())
+    #print_status('Calling SPMF: ' + ' '.join(command))
+
+    retcode, text = Command(command).run(timeout=timeout)
+
+    if retcode != 0:
+        print_status("Terminated with errors" + str(text))
+    return outfile
+    
+
+# takes a dictionary of chords for one or multiple files
+# in the form of dict[clipid] = [ (time,key,mode,fun,typ,bfun) ]
+# and converts it into spmf
+#
+# output: tempfile of spmf output
+def relchords2spmf(input):
+    """Write one spmf sequence line per track into a new temporary file.
+
+    @param input: dict[clipid] = [ (time,key,mode,fun,typ,bfun) ]
+    @return: the (already closed) NamedTemporaryFile object; it was created
+        with delete=False, so the file stays on disk
+    """
+    # choose random filename for spmf location
+    fspmf = tempfile.NamedTemporaryFile(delete=False)
+
+    try:
+        for track, trackdata in input.items():
+            # every chord is followed by -1, the separator of items or
+            # itemsets
+            items = [str(c2f.fun2num(fun, typ, bfun, mode)) + ' -1 '
+                     for (time, key, mode, fun, typ, bfun) in trackdata]
+
+            # chord sequence as one line, the sequence is closed with -2
+            fspmf.write(''.join(items) + '-2\n')
+    finally:
+        # release the handle even if the chord conversion fails
+        fspmf.close()
+
+    return fspmf
+
+
+## reads the chords for all files in a folder (via c2f.folder2functions),
+## writes them into one spmf file and logs track, key, mode and
+## sequence length to a '.dic' csv file next to it
+#def folder2spmf(folderin = 'D:/mirg/Chord_Analysis20141216/', fileout = 'D:/mirg/Chord_Analysis20141216/Beethoven.spmf'):
+#
+#    # get chords for all files                    
+#    output  = c2f.folder2functions(folderin)
+#
+#    # open log
+#    logfile = fileout + '.dic'
+#    csvfile = open(logfile, "w+b") #opens the file for updating
+#    w = csv.writer(csvfile)
+#    w.writerow(["track","key","mode","sequence length"])
+#    
+#    # open spmf file
+#    fspmf = open(fileout,'w')
+#    # ---
+#    # this is writing the spmf format
+#    for track,trackdata in output.iteritems():
+#        # write chord sequence as one line in spmf file
+#        for (time,key,mode,fun,typ,bfun) in trackdata:
+#            chord = c2f.fun2num(fun,typ,bfun,mode)
+#            
+#            # -1 is the separator of items or itemsets
+#            fspmf.write(str(chord) + ' -1 ') 
+#            
+#        # the sequence is closed with -2
+#        fspmf.write('-2\n') 
+#        w.writerow([track, str(key), str(mode),str(len(trackdata))])
+#            
+#    fspmf.close()
+#    csvfile.close()
+
+# read an spmf file
+# def parsespmf(filein = 'D:/mirg/Chord_Analysis20141216/Beethoven.txt'):
+
+# string sourcefile path to the source spmf file with chords from records
+# string patternfile path to the pattern spmf file
+# matches each of the patterns in patternfile
+#  to the chord sequences in sourcefile
+def match(sourcefile = 'D:/mirg/Chord_Analysis20141216/Beethoven.spmf',
+          sourcedict = 'D:/mirg/Chord_Analysis20141216/Beethoven.spmf.dic',
+          patternfile = 'D:/mirg/Chord_Analysis20141216/Beethoven_70.txt',
+          outpath = 'D:/mirg/Chord_Analysis20141216/'):
+    """Find, for every track, the patterns that occur as open patterns.
+
+    @param sourcefile: spmf file with one chord sequence per line
+    @param sourcedict: csv dictionary with a legend row followed by one row
+        per track; the track name is read from the first column
+    @param patternfile: spmf output file with the patterns
+    @param outpath: folder the per-track csv files are written to
+    @return: tuple (tracks_patterns, patterns_tracks), mapping track name ->
+        list of pattern indices and pattern index -> list of track names
+    """
+    # ---
+    # we here assume that there are more files than patterns,
+    # as display of patterns is somehow limited
+    # therefore parallelisation will be 1 pattern/multiple files
+    # per instance
+    # ---
+
+    # spmf2table returns (patterns, supports); only the patterns are matched
+    patterns, _ = spmf2table(patternfile)
+
+    # ---
+    # now for the input sequences
+    # ---
+    # first: read the track names in file order. The n-th sequence line is
+    # paired with the n-th track row, so the order must not get lost in a
+    # dict.
+    with open(sourcedict, 'rt') as dictfile:
+        reader = csv.reader(dictfile)
+        next(reader, None)  # skip first row that contains legend
+        track_names = [row[0] for row in reader if row]
+
+    patterns_tracks = dict()
+    tracks_patterns = dict()
+
+    # iterate over all tracks - to be parallelised
+    with open(sourcefile, 'r') as source:
+        for track in track_names:
+            line = source.readline()
+            if not line:
+                # fewer sequences than tracks: nothing left to match
+                break
+            sequence = readSequence(line)
+            print(track)
+            for p, pattern in enumerate(patterns):
+                # match open pattern
+                if openPatternInSequence(sequence, pattern):
+                    patterns_tracks.setdefault(p, []).append(track)
+                    tracks_patterns.setdefault(track, []).append(p)
+
+    # write clip index to files
+    writeAllPatternsForClips(outpath, tracks_patterns)
+    return tracks_patterns, patterns_tracks
+
+# writes results to disk per key
+def writeAllPatternsForClips(path = 'D:/mirg/Chord_Analysis20141216/', tracks_patterns = None):
+    """Write one '<name>_patterns.csv' file per entry of tracks_patterns.
+
+    @param path: folder the files are created in
+    @param tracks_patterns: dict name -> list of values; each list is
+        written as a single csv row. None is treated as an empty dict.
+    """
+    if tracks_patterns is None:
+        tracks_patterns = {}
+
+    for name, contents in tracks_patterns.items():
+        # compress pattern data ?
+        # e.g. 2 columns from-to for the long series of atomic increments
+
+        # the with block closes the new file even if writing the row fails
+        with open(path + '/' + name + '_patterns.csv', "w+b") as csvfile:
+            csv.writer(csvfile).writerow(contents)
+
+# reads output of spmf to table
+def spmf2table(patternfile):
+    """Read the patterns and their supports from an spmf output file.
+
+    A line looks like this:
+        1120401 -1 1120101 -1 #SUP: 916
+
+    @param patternfile: path of the spmf output file
+    @return: tuple (patterns, supports) of two parallel lists; a pattern is
+        a list of item strings, so any item representation works
+    """
+    patterns = []
+    supports = []
+
+    # read all patterns
+    with open(patternfile, 'r') as f:
+        for line in f:
+            # stop before exceeding the limit, not one line after it
+            if len(patterns) >= max_lines:
+                print_status('Not reading more than ' + str(max_lines) + ' lines :(')
+                break
+
+            # blank lines carry no pattern
+            if not line.strip():
+                continue
+
+            pattern, support = readPattern(line)
+            patterns.append(pattern)
+            supports.append(support)
+
+    return patterns, supports
+
+# @param line: reads a line in the spmf output file with frequent patterns
+# returns list of strings "pattern" and int "support"
+def readPattern(line):
+    """Split an spmf output line into its pattern and its support.
+
+    @param line: e.g. '1120401 -1 1120101 -1 #SUP: 916'
+    @return: tuple (pattern, support); pattern is the list of item strings,
+        support is -1 if no support could be located
+    """
+    marker = '#SUP:'
+
+    # locate support
+    suploc = line.find(marker)
+
+    # test whether we have a broken file
+    # NOTE(review): the threshold 6 presumably relies on 7-digit chord
+    # codes in front of the marker - confirm
+    if suploc > 6:
+        # int() ignores surrounding whitespace, so this also works for a
+        # last line without trailing newline
+        support = int(line[suploc + len(marker):])
+    else:
+        support = -1
+
+    # extract pattern; without marker the whole line is the pattern part
+    body = line[:suploc] if suploc >= 0 else line
+    pattern = body.split(' -1 ')[:-1]
+    return (pattern, support)
+   
+# @param line: reads a line in the spmf input file with chord sequence
+# returns the chord sequence as list of strings
+def readSequence(line):
+    """Split an spmf input line into its list of item strings.
+
+    @param line: e.g. '1120401 -1 1120101 -1 -2'
+    @return: list of the items in front of the closing -2
+    """
+    # locate the end of the sequence
+    endloc = line.find('-2')
+
+    # without terminator the whole line is the sequence
+    body = line[:endloc] if endloc >= 0 else line
+
+    # every item is followed by ' -1 ', so the last split part is no item
+    return body.split(' -1 ')[:-1]
+
+# finds open pattern in sequences
+# @param [string] sequence input sequence
+# @param [string] pattern pattern to be found
+def openPatternInSequence(sequence,pattern):
+    """Test whether pattern occurs in sequence in order, gaps allowed.
+
+    @return: 1 if all items of pattern were found in this order, else 0;
+        an empty pattern is never reported as found
+    """
+    if not pattern:
+        return 0
+
+    patidx = 0
+    for item in sequence:
+        if item == pattern[patidx]:
+            patidx += 1
+
+            # did we complete the pattern? only after the last item matched
+            if patidx == len(pattern):
+                # could also return the start index
+                return 1
+    # finished the sequence before finishing pattern
+    return 0
+            
+# finds closed pattern in sequences
+# @param [string] sequence input sequence
+# @param [string] pattern pattern to be found
+def closedPatternInSequence(sequence,pattern):
+    """Test whether pattern occurs in sequence as a contiguous run of items.
+
+    Items are compared by their string form. The comparison is done item
+    by item, so a match can not start or end in the middle of an item.
+
+    @return: True if found (an empty pattern is always found), else False
+    """
+    pat = [str(item) for item in pattern]
+    seq = [str(item) for item in sequence]
+    width = len(pat)
+
+    # compare the pattern with every window of the same length
+    return any(seq[start:start + width] == pat
+               for start in range(len(seq) - width + 1))
+            
+# reads all track names from the dictionary created by folder2spmf
+# @param sourcedict path to dictionary
+def getClipDict(sourcedict):
+    """Read the csv track dictionary.
+
+    @param sourcedict: path to a csv file with a legend row followed by
+        rows of (track, key, mode, sequence length)
+    @return: dict track -> (key, mode, seqlen), all values as strings,
+        iterating in the order of the rows in the file
+    @raise ValueError: if a non-empty row does not have four columns
+    """
+    # local import: this function is the only user
+    from collections import OrderedDict
+
+    # keep the file order of the tracks
+    tracks = OrderedDict()
+
+    with open(sourcedict, 'rt') as f:
+        reader = csv.reader(f)
+
+        # skip first row that contains legend (if there is any row at all)
+        next(reader, None)
+
+        # get following rows
+        for row in reader:
+            # blank lines show up as empty rows
+            if not row:
+                continue
+            (track, key, mode, seqlen) = row
+            tracks[track] = (key, mode, seqlen)
+
+    return tracks
+    
+            
+# run spmf afterwards with java -jar spmf.jar run CM-SPADE Beethoven.spmf output.txt 50% 3
+if __name__ == "__main__":
+    # the conversion and matching steps are switched off here:
+    #folder2spmf()
+    #match()
+    # running the module as a script only prints a greeting
+    print("huhu")