Mercurial > hg > dml-open-backendtools
view collection_analysis/tools/vampstats.py @ 0:e34cf1b6fe09 tip
commit
author | Daniel Wolff |
---|---|
date | Sat, 20 Feb 2016 18:14:24 +0100 |
parents | |
children |
line wrap: on
line source
# Part of DML (Digital Music Laboratory)
# Copyright 2014-2015 Daniel Wolff, City University

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

#!/usr/bin/python
# -*- coding: utf-8 -*-

# creates a histogram from given input files or folder

__author__="Daniel Wolff"
__date__ ="$11-Feb-2015 18:18:47$"

import sys
import os
import csv
import re

import numpy

# NOTE: csv2json (imported as c2j) is only needed by the command line entry
# point, so it is imported inside main(). This keeps the analysis functions
# importable without that project module.

# global feature extensions
#ext = tuple([".n3",".csv",".mid"])
ext = tuple([".csv"])

# Not used inside this module; kept for importers.
# NOTE(review): the "." is unescaped and therefore matches any character -
# confirm whether "\." was intended before relying on this pattern.
floater = re.compile(r"((\d+)(.\d+)*)")

# Leading numeric literal (optional sign, decimals, exponent) of a string
# such as "440Hz", "-3.5 dB" or "1e-05 s".
_LEADING_NUMBER = re.compile(r"\s*[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")


def _open_csv(path):
    """Open *path* the way the running interpreter's csv module expects."""
    if sys.version_info[0] < 3:
        # Python 2: csv.reader wants a byte stream
        return open(path, 'rb')
    # Python 3: csv.reader wants text with newline translation disabled
    return open(path, 'r', newline='')


def _to_float(value):
    """Convert *value* to float, tolerating a trailing verbatim unit.

    The plain float() conversion is tried first so that values such as
    "1e-05" or "nan" are not mistaken for "number followed by unit".

    Raises:
        ValueError: if *value* does not start with a number.
    """
    try:
        return float(value)
    except ValueError:
        pass
    match = _LEADING_NUMBER.match(value)
    if match is None:
        raise ValueError("no leading number in " + repr(value))
    return float(match.group(0))


# reads in any csv and returns a list of structure
# time(float), data1, data2 ....data2
def read_vamp_csv(filein = '', datapos = 0):
    """Read a comma separated file into a list of rows.

    Args:
        filein: path of the csv file.
        datapos: index of the data column counted after the time column;
            rows with fewer than ``datapos + 2`` fields are skipped.

    Returns:
        A list of rows; the first field of each row is converted to float,
        the remaining fields are kept as the strings read from the file.

    Raises:
        ValueError: if the first field of an accepted row is not a number.
    """
    output = []
    badcount = 0
    with _open_csv(filein) as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if len(row) >= datapos + 2:
                output.append([float(row[0])] + row[1:])
            else:
                badcount += 1
    print("Ignored " + str(badcount) + " short rows")
    return output


#calculates the histogram
def histogram(data, datapos = 1, nbins = -1):
    """Build a histogram over column ``datapos + 1`` of *data*.

    Args:
        data: list of rows as returned by read_vamp_csv.
        datapos: index of the data column counted after the time column.
        nbins: -1 counts the distinct values of the column (categorical);
            any other value triggers a numerical histogram.

    Returns:
        ``{"count": [...], "index": [...]}``. For categorical input,
        ``index`` holds the distinct values and ``count`` their frequencies
        in matching order. For numerical input, ``index`` holds the bin
        edges and ``count`` the number of samples per bin.
    """
    # symbols or numerical input?
    if nbins != -1:
        #convert to numpy data
        ddata = string2numpy(data, datapos)
        # NOTE: numpy is asked for nbins-1 bins, so "index" has nbins edges
        # and "count" has nbins-1 entries.
        count, index = numpy.histogram(ddata, nbins - 1)
        count = count.tolist()
        index = index.tolist()
    # here for strings
    else:
        # build histogram on strings
        histo = dict()
        for row in data:
            histo[row[datapos + 1]] = histo.get(row[datapos + 1], 0) + 1
        # list() keeps the result a pair of real lists on Python 3 as well
        index = list(histo.keys())
        count = list(histo.values())

    # return histogram
    return {"count": count, "index": index}


#calculates statistics for numerical input
def numstats(data, datapos):
    """Return average, median and standard deviation of a data column.

    Args:
        data: list of rows as returned by read_vamp_csv.
        datapos: index of the data column counted after the time column.

    Returns:
        ``{"average": float, "median": float, "std": float}``.
    """
    #convert to numpy data
    ddata = string2numpy(data, datapos)

    avg = numpy.average(ddata).tolist()
    med = numpy.median(ddata).tolist()
    std = numpy.std(ddata).tolist()

    # return data
    return {"average": avg, "median": med, "std": std}


def featurefilesinpath(path):
    """Recursively list all files below *path* ending in one of ``ext``.

    Returns:
        A list of file paths with backslashes replaced by forward slashes.
    """
    # ---
    # we traverse the file structure
    # and list files to copy
    # ---
    files = []
    for (dirpath, dirnames, filenames) in os.walk(path):
        for filename in filenames:
            # we copy all requested files and the transform files as well!
            if filename.endswith(ext):
                source = os.path.join(dirpath, filename).replace('\\', '/')
                files.append(source)
    return files


# convert to numpy
def string2numpy(data, datapos):
    """Extract column ``datapos + 1`` of *data* as a float numpy array.

    The whole table is converted in one step when possible. If that fails
    (non-numeric entries, rows of different length, empty input) only the
    requested column is converted value by value, and a trailing verbatim
    unit such as "Hz" is dropped.

    Raises:
        ValueError: if a value of the column does not start with a number.
        IndexError: if a row has no column ``datapos + 1``.
    """
    try:
        ddata = numpy.array(data, dtype=float)[:, datapos + 1]
    except (ValueError, TypeError, IndexError):
        # take only the specified column datapos+1,
        # accounting for verbatim units
        edata = [_to_float(row[datapos + 1]) for row in data]
        ddata = numpy.array(edata, dtype=float)
    return ddata


def collect_files(paths):
    """Expand command line paths into a flat list of input files.

    Directories are searched recursively for feature files, existing files
    are taken as they are, and anything else is ignored.
    """
    files = []
    for path in paths:
        if os.path.isdir(path):
            files.extend(featurefilesinpath(path))
        elif os.path.isfile(path):
            # append, not extend: extend() would add every character of
            # the path string as a separate "file"
            files.append(path)
    return files


def main(argv=None):
    """Command line entry point: vampstats datapos nbins file1/dir1 ...

    Args:
        argv: full argument vector including the program name;
            defaults to ``sys.argv``.
    """
    # imported first so that a missing csv2json is reported before any output
    import csv2json as c2j

    if argv is None:
        argv = sys.argv

    print("Usage: vampstats datapos nbins file1/dir1 file2/dir2 ....")
    print("datapos: column of data after timecode to process")
    print("nbins: -1 for categorical data, otherwise number of bins for histogram")

    if len(argv) < 3:
        sys.exit("vampstats: missing datapos and/or nbins argument")

    datapos = int(argv[1])
    nbins = int(argv[2])

    # check and collate files
    files = collect_files(argv[3:])

    print("Number of files now loading: " + str(len(files)))

    # we collate all data first and then count.
    # @todo: read all files and create dictionary first for large tasks
    data = []
    for path in files:
        print(path)
        data.extend(read_vamp_csv(path, datapos))

    # sys.getsizeof is shallow: this is the size of the outer list only
    print("Total data size in memory: " + str(sys.getsizeof(data)))

    # now get the histogram for all data
    histo = histogram(data, datapos, nbins)
    print(histo)
    print("Please input a description for the histogram analysis features")
    c2j.data2json(histo)

    # further numerical analysis if this is not categorical data
    if nbins != -1:
        ns = numstats(data, datapos)
        print(ns)
        print("Please input a description for the general statistics features")
        c2j.data2json(ns)


# main entry point
if __name__ == "__main__":
    main()