view dml-cla/python/chord_seq_spmf_helper.py @ 0:718306e29690 tip

commiting public release
author Daniel Wolff
date Tue, 09 Feb 2016 21:05:06 +0100
parents
children
line wrap: on
line source
#!/usr/bin/python
# Part of DML (Digital Music Laboratory)
# Copyright 2014-2015 Daniel Wolff, City University
 
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

# -*- coding: utf-8 -*-
#
# This is a data conversion wrapper for the spmf toolkit
__author__="Daniel Wolff"

import chord_seq_key_relative as c2f
import csv
import re
import tempfile
import subprocess
import os
import platform
from aggregate import *
from csvutils import *

# command for threading
import subprocess, threading
import signal

# upper bound on the number of pattern lines that spmf2table reads from
# an SPMF output file
max_lines = 10000000


class Command(object):
    """Runs an external command in a worker thread with an optional timeout.

    Attributes:
        cmd: argument list handed to ``subprocess.Popen``.
        process: the ``Popen`` object once the command was started,
            ``None`` before that or if it could not be started.
        text: combined stdout/stderr of the finished command, or a status
            message if it timed out or could not be started.
    """

    def __init__(self, cmd):
        self.cmd = cmd
        self.process = None
        self.text = 'SPMF terminated unexpectedly'

    def run(self, timeout):
        """Executes the command and waits until it has ended.

        Args:
            timeout: seconds to wait before the command is killed;
                ``None`` or a value <= 0 waits indefinitely.

        Returns:
            Tuple ``(returncode, text)``. ``returncode`` is ``None`` if the
            process could not be started.
        """
        on_windows = 'Win' in platform.system()

        # written by the worker only; read here only after it was joined,
        # so self.text is never assigned from two threads
        outcome = {}

        def target():
            print_status('Thread started')
            popen_args = dict(stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT, shell=False)
            if not on_windows:
                # own session / process group, so that a timeout can signal
                # the command together with the children it spawned
                popen_args['preexec_fn'] = os.setsid
            try:
                self.process = subprocess.Popen(self.cmd, **popen_args)
            except OSError as err:
                # e.g. executable not found
                outcome['error'] = str(err)
                return

            # communicate() drains the pipe and waits for the process,
            # which also sets process.returncode
            outcome['output'] = self.process.communicate()[0]
            print_status('Thread finished')

        thread = threading.Thread(target=target)
        thread.start()

        timed_out = False
        if timeout and timeout > 0:
            thread.join(timeout)
            if thread.is_alive():
                print_status('Terminating process')
                process = self.process
                if process is not None:
                    timed_out = True
                    try:
                        if on_windows:
                            process.kill()
                        else:
                            os.killpg(process.pid, signal.SIGTERM)
                    except OSError:
                        # the process ended by itself in the meantime
                        pass

        # always wait for the worker (on every platform), so that the
        # return code is available when we return
        thread.join()

        if timed_out:
            self.text = 'Terminating SPMF after ' + str(timeout) + ' seconds'
        elif 'output' in outcome:
            self.text = outcome['output']
        elif 'error' in outcome:
            self.text = 'SPMF terminated unexpectedly: ' + outcome['error']

        returncode = None if self.process is None else self.process.returncode
        return (returncode, self.text)


def spmf(file, method="CM-SPADE", params=None, timeout=10):
    """Runs the spmf java toolkit with the given method and parameters.

    The resulting call corresponds to e.g.
        java -Xmx1g -jar spmf.jar run CM-SPADE Beethoven.spmf output.txt 70% 3
    with spmf.jar given as a relative path.

    Args:
        file: path of the spmf input file.
        method: name of the spmf algorithm to run.
        params: list of algorithm parameters appended to the command line,
            defaults to ["70%", "3"]. The 1st parameter is usually the
            minimal support of a sequence, the 2nd its minimal length.
        timeout: timeout in seconds passed on to Command.run.

    Returns:
        Path of the output file. The file is created (empty) before spmf is
        started and the path is returned even if spmf failed; in that case
        a status message is printed.
    """
    if params is None:
        params = ["70%", "3"]

    # create output file name; mkstemp() creates the file atomically,
    # whereas mktemp() only returns a name that somebody else may claim
    # before spmf writes to it
    handle, outfile = tempfile.mkstemp()
    os.close(handle)

    command = ["java", "-Xmx1g", "-jar", "spmf.jar", "run",
               method, file, outfile]
    command.extend(params)

    retcode, text = Command(command).run(timeout=timeout)

    if retcode != 0:
        print_status("Terminated with errors" + text)
    return outfile
    

def relchords2spmf(input):
    """Converts key-relative chord sequences into a temporary spmf file.

    Args:
        input: dictionary of chords for one or multiple files in the form
            dict[clipid] = [(time, key, mode, fun, typ, bfun)]

    Returns:
        The (already closed) NamedTemporaryFile object the sequences were
        written to; the file is not deleted on close.
    """
    # choose random filename for spmf location; the with-block closes the
    # file even if the conversion of a chord fails
    with tempfile.NamedTemporaryFile(delete=False) as fspmf:
        for track, trackdata in input.items():
            # one chord sequence per line; -1 is the separator of items
            # or itemsets
            items = [str(c2f.fun2num(fun, typ, bfun, mode)) + ' -1 '
                     for (time, key, mode, fun, typ, bfun) in trackdata]

            # the sequence is closed with -2
            fspmf.write(''.join(items) + '-2\n')

    return fspmf


## takes a dictionary of chords for one or multiple files 
## in the form of dict[clipid] = [ (time,key,mode,fun,typ,bfun) ]
## and converts it into spmf
#def folder2spmf(folderin = 'D:/mirg/Chord_Analysis20141216/', fileout = 'D:/mirg/Chord_Analysis20141216/Beethoven.spmf'):
#
#    # get chords for all files                    
#    output  = c2f.folder2functions(folderin)
#
#    # open log
#    logfile = fileout + '.dic'
#    csvfile = open(logfile, "w+b") #opens the file for updating
#    w = csv.writer(csvfile)
#    w.writerow(["track","key","mode","sequence length"])
#    
#    # open spmf file
#    fspmf = open(fileout,'w')
#    # ---
#    # this is writing the spmf format
#    for track,trackdata in output.iteritems():
#        # write chord sequence as one line in spmf file
#        for (time,key,mode,fun,typ,bfun) in trackdata:
#            chord = c2f.fun2num(fun,typ,bfun,mode)
#            
#            # -1 is the spearator of items or itemsets
#            fspmf.write(str(chord) + ' -1 ') 
#            
#        # the sequence is closed with -2
#        fspmf.write('-2\n') 
#        w.writerow([track, str(key), str(mode),str(len(trackdata))])
#            
#    fspmf.close()
#    csvfile.close()

# read an spmf file
# def parsespmf(filein = 'D:/mirg/Chord_Analysis20141216/Beethoven.txt'):

def match(sourcefile = 'D:/mirg/Chord_Analysis20141216/Beethoven.spmf',sourcedict = 'D:/mirg/Chord_Analysis20141216/Beethoven.spmf.dic', patternfile = 'D:/mirg/Chord_Analysis20141216/Beethoven_70.txt', outpath = 'D:/mirg/Chord_Analysis20141216/'):
    """Matches each pattern in patternfile to the chord sequences in sourcefile.

    Args:
        sourcefile: path to the source spmf file with chords from records.
        sourcedict: path to the track dictionary belonging to sourcefile.
        patternfile: path to the pattern spmf file.
        outpath: folder the per-track pattern lists are written to.

    For every track a file with the indices of the matching patterns is
    written via writeAllPatternsForClips.

    NOTE: every track is paired with the next line of sourcefile, so the
    result is only correct if the tracks are iterated in the same order as
    the sequences were written to sourcefile.
    """
    # ---
    # we here assume that there are more files than patterns,
    # as display of patterns is somehow limited
    # therefore parallelisation will be 1 pattern/multiple files
    # per instance
    # ---

    # spmf2table returns the patterns together with their supports
    patterns, supports = spmf2table(patternfile)

    # ---
    # now for the input sequences
    # ---
    # first: read track dictionary and get the input sequence names
    tracks = getClipDict(sourcedict)

    tracks_patterns = dict()

    # read the input sequences
    with open(sourcefile, 'r') as source:
        # iterate over all tracks - to be parallelised
        for track in tracks:
            sequence = readSequence(next(source))
            print(track)
            for p, pattern in enumerate(patterns):
                # match open or closed pattern
                if openPatternInSequence(sequence, pattern):
                    tracks_patterns.setdefault(track, []).append(p)

    # write clip index to files
    writeAllPatternsForClips(outpath, tracks_patterns)

def writeAllPatternsForClips(path = 'D:/mirg/Chord_Analysis20141216/',tracks_patterns = None):
    """Writes the pattern list of every clip to disk, one csv file per key.

    Args:
        path: folder the files are written to.
        tracks_patterns: dictionary mapping a name to the list of values
            written as a single csv row into <path>/<name>_patterns.csv;
            nothing is written if omitted.
    """
    if tracks_patterns is None:
        tracks_patterns = dict()

    for name, contents in tracks_patterns.items():
        # create new file, opened for updating; the with-block closes it
        # even if writing the row fails
        with open(path + '/' + name + '_patterns.csv', "w+b") as csvfile:
            # compress pattern data ?
            # e.g. 2 columns from-to for the long series of atomic increments
            csv.writer(csvfile).writerow(contents)

def spmf2table(patternfile):
    """Reads the frequent patterns in an spmf output file into a table.

    A line of the file looks like this:
        1120401 -1 1120101 -1 #SUP: 916

    At most max_lines lines are read; if the file is longer, a status
    message is printed and the remaining lines are ignored.

    Args:
        patternfile: path to the spmf output file.

    Returns:
        Tuple (patterns, supports) of two lists of equal length, holding
        for every line read the pattern and the support as returned by
        readPattern.
    """
    patterns = []
    supports = []

    with open(patternfile, 'r') as f:
        for linecnt, line in enumerate(f):
            # linecnt lines have been read so far
            if linecnt >= max_lines:
                print_status('Not reading more than ' + str(max_lines) + ' lines :(')
                break

            # patterns are kept as lists of strings,
            # so any item representation works
            pattern, support = readPattern(line)
            patterns.append(pattern)
            supports.append(support)

    return patterns,supports

def readPattern(line):
    """Parses one line of an spmf output file with frequent patterns.

    Example: '1120401 -1 1120101 -1 #SUP: 916'
    yields (['1120401', '1120101'], 916).

    Args:
        line: line of the spmf output file, with or without line ending.

    Returns:
        Tuple (pattern, support): pattern is the list of item strings that
        are each followed by the ' -1 ' separator, support is the integer
        after '#SUP:' or -1 if the line is broken, i.e. the marker or the
        number is missing.
    """
    marker = '#SUP:'

    # locate support
    suploc = line.find(marker)

    if suploc < 0:
        # broken line without support: use the whole line as pattern part
        body = line.rstrip('\r\n')
        support = -1
    else:
        body = line[:suploc]
        # take the first field after the marker, so that neither a missing
        # line ending nor trailing text cuts or corrupts the number
        fields = line[suploc + len(marker):].split()
        try:
            support = int(fields[0])
        except (IndexError, ValueError):
            support = -1

    # extract pattern; the last element after the final separator is dropped
    pattern = body.split(' -1 ')[:-1]
    return (pattern,support)
   
def readSequence(line):
    """Parses one line of an spmf input file with a chord sequence.

    Example: '1120401 -1 1120101 -1 -2' yields ['1120401', '1120101'].

    Args:
        line: line of the spmf input file.

    Returns:
        List of the item strings that are each followed by the ' -1 '
        separator, up to the first occurrence of the sequence end '-2'.
    """
    # cut at the sequence end; partition keeps the whole line if the
    # terminator is missing instead of silently dropping a character
    body = line.partition('-2')[0]

    # extract items; the last element after the final separator is dropped
    return body.split(' -1 ')[:-1]

def openPatternInSequence(sequence,pattern):
    """Finds an open pattern in a sequence.

    The pattern is found if all its items occur in the sequence in the
    same order; other items may lie in between.

    Args:
        sequence: list of strings, the input sequence.
        pattern: list of strings, the pattern to be found.

    Returns:
        1 if the complete pattern was found, 0 otherwise. An empty pattern
        is never found.
    """
    if not pattern:
        return 0

    patidx = 0
    for item in sequence:
        if item == pattern[patidx]:
            patidx += 1

            # did we complete the pattern? all items have to be matched
            if patidx == len(pattern):
                # could also return the start index
                return 1

    # finished the sequence before finishing pattern
    return 0
            
def closedPatternInSequence(sequence,pattern):
    """Finds a closed pattern in a sequence.

    The pattern is found if its items occur in the sequence directly one
    after another. Items are compared by their string representation.

    Args:
        sequence: list of items, the input sequence.
        pattern: list of items, the pattern to be found.

    Returns:
        True if the pattern is a contiguous part of the sequence
        (an empty pattern always is), False otherwise.
    """
    wanted = [str(item) for item in pattern]
    items = [str(item) for item in sequence]
    width = len(wanted)

    # compare item by item: joining the items to one string would also
    # match across item boundaries, e.g. ['23'] in ['12', '34']
    return any(items[start:start + width] == wanted
               for start in range(len(items) - width + 1))
            
def getClipDict(sourcedict):
    """Reads all track names from a track dictionary in csv format.

    The first row is skipped as legend; every following row has to consist
    of the four columns track, key, mode and sequence length. Empty rows
    are ignored.

    Args:
        sourcedict: path to the dictionary.

    Returns:
        Dictionary tracks[track] = (key, mode, seqlen) with all values as
        strings. The entries keep the order of the rows in the file.
    """
    # local import, so that the block does not depend on the file header
    from collections import OrderedDict

    # keep the row order: a plain dict may iterate in a different order
    tracks = OrderedDict()

    with open(sourcedict, 'rt') as f:
        reader = csv.reader(f)

        # skip first row that contains legend (if there is one at all)
        next(reader, None)

        # get following rows
        for row in reader:
            if not row:
                continue
            track, key, mode, seqlen = row
            tracks[track] = (key, mode, seqlen)

    return tracks
    
            
# script entry point; spmf itself is run afterwards, e.g. with
# java -jar spmf.jar run CM-SPADE Beethoven.spmf output.txt 50% 3
if __name__ == "__main__":
    # disabled conversion / matching steps:
    #folder2spmf()
    #match()
    print("huhu")