Daniel@0: # Part of DML (Digital Music Laboratory)
Daniel@0: # Copyright 2014-2015 Daniel Wolff, City University
Daniel@0:  
Daniel@0: # This program is free software; you can redistribute it and/or
Daniel@0: # modify it under the terms of the GNU General Public License
Daniel@0: # as published by the Free Software Foundation; either version 2
Daniel@0: # of the License, or (at your option) any later version.
Daniel@0: # 
Daniel@0: # This program is distributed in the hope that it will be useful,
Daniel@0: # but WITHOUT ANY WARRANTY; without even the implied warranty of
Daniel@0: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Daniel@0: # GNU General Public License for more details.
Daniel@0: # 
Daniel@0: # You should have received a copy of the GNU General Public
Daniel@0: # License along with this library; if not, write to the Free Software
Daniel@0: # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
Daniel@0: 
Daniel@0: #!/usr/bin/python
Daniel@0: # -*- coding: utf-8 -*-
Daniel@0: 
Daniel@0: # creates a histogram from given input files or folder
Daniel@0: 
Daniel@0: __author__="Daniel Wolff"
Daniel@0: __date__ ="$11-Feb-2015 18:18:47$"
Daniel@0: 
Daniel@0: import sys
Daniel@0: import os
Daniel@0: import csv
Daniel@0: import numpy
Daniel@0: import csv2json as c2j
Daniel@0: import re
Daniel@0: 
Daniel@0: 
# global feature extensions
#ext = tuple([".n3",".csv",".mid"])
ext = (".csv",)

# Matches a run of digits optionally followed by ".digits" groups, e.g. "12"
# or "12.5". Raw string so the backslashes reach the regex engine untouched;
# the dot is escaped so it matches only a literal decimal point (unescaped it
# matched any character, so "1x5" was accepted as one number).
floater = re.compile(r"((\d+)(\.\d+)*)")
# reads in any csv and returns a list of structure
# time(float), data1, data2 ....dataN
def read_vamp_csv(filein = '', datapos = 0):
    """Read a comma-separated file into a list of rows.

    filein:  path of the csv file to read.
    datapos: index of the data column of interest, counted after the first
             column. Rows with fewer than datapos + 2 fields are skipped.

    Returns a list of lists: the first field of each kept row converted to
    float, followed by the remaining fields as the strings found in the file.
    Prints how many rows were skipped for being too short.

    Raises ValueError if the first field of a kept row is not a number.
    """
    # The csv module wants a binary file object on Python 2 but a text file
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvfile = open(filein, 'rb')
    else:
        csvfile = open(filein, 'r', newline='')

    output = []
    badcount = 0
    with csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if len(row) >= datapos + 2:
                output.append([float(row[0])] + row[1:])
            else:
                badcount += 1
    # single-argument parenthesised form prints identically on Python 2 and 3
    print("Ignored " + str(badcount) + " short rows")
    return output
Daniel@0: 
#calculates the histogram
def histogram(data, datapos = 1, nbins = -1):
    """Build a histogram over one column of data.

    data:    list of rows; the value used from each row is row[datapos + 1].
    datapos: index of the data column, counted after the first column.
    nbins:   -1 treats the column as categorical and counts each distinct
             value; any other value treats it as numerical and bins it
             with numpy.histogram.

    Returns a dict {"count": [...], "index": [...]}.
    Categorical: "index" holds the distinct values and "count" their
    frequencies, in matching order (both empty for empty data).
    Numerical: "count" holds the per-bin counts and "index" the bin edges
    returned by numpy.histogram.
    """
    # symbols or numerical input?
    if nbins != -1:

        #convert to numpy data
        ddata = string2numpy(data,datapos)

        # NOTE: numpy is asked for nbins-1 bins, so "index" has nbins edges
        count,index = numpy.histogram(ddata,nbins-1)
        count = count.tolist()
        index = index.tolist()

    # here for strings
    else:
        # build histogram on strings
        histo = dict()
        for row in data:
            key = row[datapos+1]
            histo[key] = histo.get(key, 0) + 1
        # Read the result once, after counting: doing this inside the loop
        # left both names undefined for empty data and rebuilt the lists on
        # every row. list() keeps plain lists on Python 3 as well.
        index = list(histo.keys())
        count = list(histo.values())

    # return histogram
    return {"count":count, "index":index}
Daniel@0:     
# summary statistics for a numerical data column
def numstats(data,datapos):
    """Return average, median and standard deviation of one data column.

    data:    list of rows, as accepted by string2numpy.
    datapos: index of the data column, counted after the first column.

    Returns a dict with the keys "average", "median" and "std", each a
    plain Python number obtained from the numpy result via tolist().
    """
    # numerical view of the selected column
    values = string2numpy(data, datapos)

    return {
        "average": numpy.average(values).tolist(),
        "median": numpy.median(values).tolist(),
        "std": numpy.std(values).tolist(),
    }
Daniel@0: 
def featurefilesinpath(path):
    """List all feature files found below a directory.

    Walks the tree rooted at path and collects every file whose name ends
    with one of the suffixes in the module-level ext tuple.

    Returns the matching paths in os.walk order, with backslashes replaced
    by forward slashes.
    """
    return [
        os.path.join(root, name).replace('\\','/')
        for (root, _subdirs, names) in os.walk(path)
        for name in names
        if name.endswith(ext)
    ]
Daniel@0: 
# convert to numpy
def string2numpy(data,datapos):
    """Extract one data column as a numpy float array.

    data:    list of rows; the value used from each row is row[datapos + 1].
    datapos: index of the data column, counted after the first column.

    First tries to convert the whole table to floats in one go. If that
    fails (non-numeric cells, rows of different length, empty data), only
    the requested column is converted cell by cell, and a trailing verbatim
    unit such as "Hz" is cut off before conversion.

    Returns a 1-d numpy array of dtype float.
    Raises ValueError if a cell holds no number before its first letter.
    """
    try:
        ddata = numpy.array(data, dtype=float)[:, datapos+1]
    except (ValueError, TypeError, IndexError):
        # ValueError/TypeError: cells that are not numbers or ragged rows;
        # IndexError: the table did not come out two-dimensional (e.g. empty)
        edata = []
        for row in data:
            # take only the specified column datapos+1
            cell = row[datapos+1]
            try:
                # plain numbers, including forms with letters like "1e-5"
                edata.append(float(cell))
                continue
            except (ValueError, TypeError):
                pass
            # account for verbatim units
            m = re.search("[a-zA-Z]", cell)
            if m is not None:
                # keep everything before the first letter; the slice must end
                # at m.start(), otherwise "440Hz" would lose its last digit
                edata.append(cell[:m.start()].strip())
            else:
                edata.append(cell)
        ddata = numpy.array(edata,dtype=float)
    return ddata
Daniel@0: 
# main entry point
# Reads datapos and nbins from the command line, gathers the csv files named
# directly or found below the given directories, loads all rows into memory
# and hands the resulting histogram (and, for numerical data, the summary
# statistics) to csv2json.data2json.
if __name__ == "__main__":
    print("Usage: vampstats datapos nbins file1/dir1 file2/dir2 ....")
    print("datapos: column of data after timecode to process")
    print("nbins: -1 for categorical data, otherwise number of bins for histogram")

    # without the two numeric arguments there is nothing to do beyond the
    # usage text printed above
    if len(sys.argv) < 3:
        sys.exit(1)

    datapos = int(sys.argv[1])
    nbins = int(sys.argv[2])

    # check and collate files
    files = []
    for path in sys.argv[3:]:
        if os.path.isdir(path):
            files.extend(featurefilesinpath(path))
        elif os.path.isfile(path):
            # append, not extend: extend would add the path one character
            # at a time
            files.append(path)
    print("Number of files now loading: " + str(len(files)))

    # we collate all data first and then count.
    # @todo: read all files and create dictionary first for large tasks
    data = []
    for filename in files:
        print(filename)
        data.extend(read_vamp_csv(filename, datapos))

    # NOTE: sys.getsizeof is shallow; this is the size of the outer list only
    print("Total data size in memory: " + str(sys.getsizeof(data)))

    # now get the histogram for all data
    histo = histogram(data,datapos,nbins)
    print(histo)
    print("Please input a description for the histogram analysis features")
    c2j.data2json(histo)

    # further numerical analysis if this is not categorical data
    if nbins != -1:
        ns = numstats(data,datapos)
        print(ns)
        print("Please input a description for the general statistics features")
        c2j.data2json(ns)
Daniel@0:         
Daniel@0: