# Part of DML (Digital Music Laboratory)
# Copyright 2014-2015 Daniel Wolff, City University

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

#!/usr/bin/python
# -*- coding: utf-8 -*-

# creates a histogram from given input files or folder

__author__="Daniel Wolff"
__date__ ="$11-Feb-2015 18:18:47$"

import sys
import os
import csv
import numpy
import re


# global feature extensions
#ext = tuple([".n3",".csv",".mid"])
ext = tuple([".csv"])

# NOTE: not referenced anywhere in this file. The "." is unescaped, so it
# matches any character, not only a decimal point.
floater = re.compile(r"((\d+)(.\d+)*)")

# the first alphabetic character of a cell marks the start of a verbatim unit
_unit = re.compile("[a-zA-Z]")


def _opencsv(filein):
    """Open *filein* in the mode the csv module expects.

    Python 2's csv module wants a binary file object, Python 3's wants a
    text file opened with newline=''.
    """
    if sys.version_info[0] < 3:
        return open(filein, 'rb')
    return open(filein, 'r', newline='')


# reads in any csv and returns a list of structure
# time(float), data1, data2 ....data2
def read_vamp_csv(filein = '', datapos = 0):
    """Read a csv file into a list of rows.

    filein:  path of the csv file
    datapos: column of data after the timecode that will be processed;
             rows with fewer than datapos + 2 cells are skipped

    Returns a list of rows [time(float), data1, data2, ...] where only the
    first cell is converted to float and the others stay strings.
    Prints the number of skipped rows.
    Raises ValueError if the first cell of an accepted row is not a number.
    """
    output = []
    badcount = 0
    with _opencsv(filein) as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if len(row) >= datapos + 2:
                output.append([float(row[0])] + row[1:])
            else:
                badcount += 1
    print("Ignored " + str(badcount) + " short rows")
    return output


#calculates the histogram
def histogram(data, datapos = 1, nbins = -1):
    """Build a histogram over column datapos + 1 of *data*.

    data:    list of rows as returned by read_vamp_csv
    datapos: column of data after the timecode to process
    nbins:   -1 counts the distinct cell values (categorical data);
             any other value treats the column as numbers and calls
             numpy.histogram with nbins - 1 bins, so that "index" holds
             nbins bin edges and "count" holds nbins - 1 counts

    Returns {"count": list, "index": list}.
    """
    # symbols or numerical input?
    if nbins != -1:
        #convert to numpy data
        ddata = string2numpy(data, datapos)

        count, index = numpy.histogram(ddata, nbins-1)
        count = count.tolist()
        index = index.tolist()

    # here for strings
    else:
        # build histogram on strings
        histo = dict()
        for row in data:
            key = row[datapos+1]
            histo[key] = histo.get(key, 0) + 1
        # list() keeps the result a plain list on Python 3 as well
        index = list(histo.keys())
        count = list(histo.values())

    # return histogram
    return {"count": count, "index": index}


#calculates statistics for numerical input
def numstats(data, datapos):
    """Return average, median and standard deviation of column datapos + 1.

    Returns {"average": float, "median": float, "std": float}.
    """
    #convert to numpy data
    ddata = string2numpy(data, datapos)

    avg = numpy.average(ddata).tolist()
    med = numpy.median(ddata).tolist()
    std = numpy.std(ddata).tolist()

    # return data
    return {"average": avg, "median": med, "std": std}


def featurefilesinpath(path):
    """Recursively list all files below *path* whose name ends with one of
    the extensions in the module-level `ext`.

    Returned paths use "/" as separator.
    """
    files = []
    for (dirpath, _dirnames, filenames) in os.walk(path):
        for filename in filenames:
            if filename.endswith(ext):
                source = os.path.join(dirpath, filename).replace('\\', '/')
                files.append(source)
    return files


def _tofloat(value):
    """Convert one cell to float, dropping a trailing verbatim unit.

    A plain float() is tried first so that values such as "1e-3" or "nan"
    are not mistaken for a number followed by a unit.
    Raises ValueError if no number can be extracted.
    """
    try:
        return float(value)
    except ValueError:
        m = _unit.search(value)
        if m is None:
            raise
        # cut exactly at the unit; float() ignores surrounding whitespace
        return float(value[:m.start()])


# convert to numpy
def string2numpy(data, datapos):
    """Return column datapos + 1 of *data* as a numpy float array.

    The whole table is converted at once when every cell is numeric.
    Otherwise only the requested column is converted cell by cell and
    verbatim units (e.g. "12.5 Hz" or "12.5Hz") are stripped.
    """
    try:
        ddata = numpy.array(data, dtype=float)[:, datapos+1]
    except (ValueError, TypeError, IndexError):
        # non-numeric cells, ragged rows or empty input:
        # take only the specified column datapos+1
        edata = [_tofloat(row[datapos+1]) for row in data]
        ddata = numpy.array(edata, dtype=float)
    return ddata


def collatefiles(paths):
    """Expand a list of file and directory paths into a list of files.

    Directories are searched with featurefilesinpath, existing files are
    taken as they are, anything else is ignored.
    """
    files = []
    for path in paths:
        if os.path.isdir(path):
            files.extend(featurefilesinpath(path))
        elif os.path.isfile(path):
            # append, not extend: extend would add every character of the
            # path as a separate entry
            files.append(path)
    return files


def main(argv):
    """Command line entry point; *argv* has the layout of sys.argv.

    Returns the process exit status.
    """
    # imported here so that the analysis functions above can be used
    # without the json exporter being available
    import csv2json as c2j

    print("Usage: vampstats datapos nbins file1/dir1 file2/dir2 ....")
    print("datapos: column of data after timecode to process")
    print("nbins: -1 for categorical data, otherwise number of bins for histogram")

    if len(argv) < 3:
        # datapos and nbins are required
        return 1

    datapos = int(argv[1])
    nbins = int(argv[2])

    # check and collate files
    files = collatefiles(argv[3:])
    print("Number of files now loading: " + str(len(files)))

    # we collate all data first and then count.
    # @todo: read all files and create dictionary first for large tasks
    data = []
    for filename in files:
        print(filename)
        data.extend(read_vamp_csv(filename, datapos))

    # sys.getsizeof is shallow: this is the size of the outer list only
    print("Total data size in memory: " + str(sys.getsizeof(data)))

    # now get the histogram for all data
    histo = histogram(data, datapos, nbins)
    print(histo)
    print("Please input a description for the histogram analysis features")
    c2j.data2json(histo)

    # further numerical analysis if this is not categorical data
    if nbins != -1:
        ns = numstats(data, datapos)
        print(ns)
        print("Please input a description for the general statistics features")
        c2j.data2json(ns)

    return 0


# main entry point
if __name__ == "__main__":
    sys.exit(main(sys.argv))