dml-open-backendtools: collection_analysis/tools/vampstats.py annotate

annotate collection_analysis/tools/vampstats.py @ 0:e34cf1b6fe09 tip

commit

author	Daniel Wolff
date	Sat, 20 Feb 2016 18:14:24 +0100
parents
children

rev	line source
Daniel@0	1 # Part of DML (Digital Music Laboratory)
Daniel@0	2 # Copyright 2014-2015 Daniel Wolff, City University
Daniel@0	3
Daniel@0	4 # This program is free software; you can redistribute it and/or
Daniel@0	5 # modify it under the terms of the GNU General Public License
Daniel@0	6 # as published by the Free Software Foundation; either version 2
Daniel@0	7 # of the License, or (at your option) any later version.
Daniel@0	8 #
Daniel@0	9 # This program is distributed in the hope that it will be useful,
Daniel@0	10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
Daniel@0	11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Daniel@0	12 # GNU General Public License for more details.
Daniel@0	13 #
Daniel@0	14 # You should have received a copy of the GNU General Public
Daniel@0	15 # License along with this library; if not, write to the Free Software
Daniel@0	16 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Daniel@0	17
Daniel@0	18 #!/usr/bin/python
Daniel@0	19 # -- coding: utf-8 --
Daniel@0	20
Daniel@0	21 # creates a histogram from given input files or folder
Daniel@0	22
Daniel@0	23 __author__="Daniel Wolff"
Daniel@0	24 __date__ ="$11-Feb-2015 18:18:47$"
Daniel@0	25
Daniel@0	26 import sys
Daniel@0	27 import os
Daniel@0	28 import csv
Daniel@0	29 import numpy
Daniel@0	30 import csv2json as c2j
Daniel@0	31 import re
Daniel@0	32
Daniel@0	33
# global feature extensions
#ext = tuple([".n3",".csv",".mid"])
# File-name suffixes accepted as feature files; kept as a tuple so it can be
# handed directly to str.endswith().
ext = (".csv",)

# Compiled pattern: a run of digits followed by any number of
# "<one character><digits>" groups.  Written as a raw string so that "\d"
# is not treated as an (invalid) string escape.
# NOTE(review): the "." is unescaped and therefore matches ANY character,
# not only a decimal point -- confirm whether "\." was intended.  The pattern
# text is deliberately left unchanged here.
floater = re.compile(r"((\d+)(.\d+)*)")
def read_vamp_csv(filein='', datapos=0):
    """Read a comma-separated feature file into a list of rows.

    Each returned row has the form ``[time, data1, data2, ...]`` where
    ``time`` is the first CSV field converted to ``float`` and the remaining
    fields are kept as the strings produced by the csv reader.

    Rows with fewer than ``datapos + 2`` fields (i.e. rows that do not reach
    the requested data column) are skipped; the number of skipped rows is
    printed once the file has been read.

    :param filein: path of the CSV file to read
    :param datapos: zero-based index of the data column after the time column
    :return: list of rows as described above
    :raises ValueError: if the first field of an accepted row is not a number
    """
    output = []
    badcount = 0

    # The csv module expects a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; pick the mode matching the interpreter.
    if sys.version_info[0] < 3:
        csvfile = open(filein, 'rb')
    else:
        csvfile = open(filein, 'r', newline='')

    with csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if len(row) >= datapos + 2:
                output.append([float(row[0])] + row[1:])
            else:
                badcount += 1

    # Single-argument call form prints identically on Python 2 and 3.
    print("Ignored " + str(badcount) + " short rows")
    return output
Daniel@0	53
def histogram(data, datapos=1, nbins=-1):
    """Build a histogram over one data column of ``data``.

    ``data`` is a list of rows ``[time, data1, data2, ...]``; the column used
    is ``row[datapos + 1]``.

    * ``nbins == -1``: the column is treated as categorical.  Every distinct
      value becomes one entry of ``"index"`` with its number of occurrences
      at the same position in ``"count"``.
    * otherwise: the column is converted with ``string2numpy`` and passed to
      ``numpy.histogram``; ``"count"`` holds the bin counts and ``"index"``
      the bin edges.

    :param data: list of rows
    :param datapos: zero-based index of the data column after the time column
    :param nbins: -1 for categorical data, otherwise the histogram size
    :return: dict with the keys ``"count"`` and ``"index"``, both plain lists
    """
    # symbols or numerical input?
    if nbins != -1:
        # convert to numpy data
        ddata = string2numpy(data, datapos)

        # NOTE(review): nbins - 1 bins are requested, so "index" ends up with
        # nbins edges and "count" with nbins - 1 entries -- confirm that this
        # is the intended meaning of nbins.
        count, index = numpy.histogram(ddata, nbins - 1)
        count = count.tolist()
        index = index.tolist()
    else:
        # build histogram on strings
        histo = dict()
        for row in data:
            label = row[datapos + 1]
            histo[label] = histo.get(label, 0) + 1
        # Materialise real lists: on Python 3 keys()/values() are views,
        # which callers cannot index or serialise like lists.
        index = list(histo.keys())
        count = list(histo.values())

    # return histogram
    return {"count": count, "index": index}
Daniel@0	78
def numstats(data, datapos):
    """Calculate summary statistics for one numerical data column.

    The column ``row[datapos + 1]`` of every row in ``data`` is converted to
    a numpy float array via ``string2numpy``.

    :param data: list of rows ``[time, data1, data2, ...]``
    :param datapos: zero-based index of the data column after the time column
    :return: dict with the keys ``"average"``, ``"median"`` and ``"std"``;
        each value is converted with ``.tolist()`` so the dict contains no
        numpy objects
    """
    # convert to numpy data
    ddata = string2numpy(data, datapos)

    avg = numpy.average(ddata).tolist()
    med = numpy.median(ddata).tolist()
    std = numpy.std(ddata).tolist()

    # return data
    return {"average": avg, "median": med, "std": std}
Daniel@0	91
def featurefilesinpath(path, extensions=None):
    """Recursively list the feature files below ``path``.

    The directory tree is traversed with ``os.walk`` and every file whose
    name ends with one of the accepted suffixes is collected.  Backslashes in
    the resulting paths are replaced by forward slashes.

    :param path: root directory to search
    :param extensions: suffix string, or tuple/list of suffix strings, to
        accept; defaults to the module-level ``ext`` when omitted
    :return: list of matching file paths, in the order ``os.walk`` yields them
    """
    if extensions is None:
        extensions = ext
    elif isinstance(extensions, list):
        # str.endswith() only accepts a string or a tuple of strings
        extensions = tuple(extensions)

    files = []
    for (dirpath, dirnames, filenames) in os.walk(path):
        for filename in filenames:
            if filename.endswith(extensions):
                source = os.path.join(dirpath, filename).replace('\\', '/')
                files.append(source)
    return files
Daniel@0	105
def string2numpy(data, datapos):
    """Convert one data column of ``data`` into a numpy float array.

    The column used is ``row[datapos + 1]``.  First the whole table is
    converted in a single step; if that fails (non-numeric cells, rows of
    different length, empty input) the column is converted cell by cell.
    In the cell-by-cell path a value followed by a verbatim unit, such as
    ``"440 Hz"`` or ``"440Hz"``, is reduced to its leading number.

    :param data: list of rows ``[time, data1, data2, ...]``
    :param datapos: zero-based index of the data column after the time column
    :return: one-dimensional numpy array of dtype float
    :raises ValueError: if a cell is neither a number nor a number followed
        by a unit
    :raises IndexError: if a row does not reach the requested column
    """
    column = datapos + 1

    # Fast path: every cell of every row is numeric and rows are rectangular.
    try:
        return numpy.array(data, dtype=float)[:, column]
    except (ValueError, TypeError, IndexError):
        pass

    # Slow path: convert only the requested column, one cell at a time.
    # A leading number (optionally in scientific notation) that is followed,
    # after optional whitespace, by a letter is taken as "value + unit".
    number_with_unit = (r"\s*([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"
                        r"\s*[a-zA-Z]")
    values = []
    for row in data:
        cell = row[column]
        try:
            values.append(float(cell))
        except (ValueError, TypeError):
            # account for verbatim units
            m = re.match(number_with_unit, cell)
            if m is None:
                raise
            values.append(float(m.group(1)))
    return numpy.array(values, dtype=float)
Daniel@0	123
# main entry point
#
# Command line: vampstats datapos nbins file1/dir1 file2/dir2 ....
# Collects all rows from the given files (directories are searched
# recursively for feature files), prints a histogram of the selected column
# and hands it to csv2json; for numerical data (nbins != -1) the general
# statistics are processed the same way.
if __name__ == "__main__":
    # Single-argument print(...) calls behave identically on Python 2 and 3.
    print("Usage: vampstats datapos nbins file1/dir1 file2/dir2 ....")
    print("datapos: column of data after timecode to process")
    print("nbins: -1 for categorical data, otherwise number of bins for histogram")

    # datapos and nbins are mandatory; stop after the usage text instead of
    # failing with an IndexError below.
    if len(sys.argv) < 3:
        sys.exit(1)

    datapos = int(sys.argv[1])
    nbins = int(sys.argv[2])

    # check and collate files
    files = []
    for path in sys.argv[3:]:
        if os.path.isdir(path):
            files.extend(featurefilesinpath(path))
        elif os.path.isfile(path):
            # append, not extend: extending with a string would add the path
            # one character at a time
            files.append(path)
        else:
            sys.stderr.write("Skipping " + path + ": not a file or directory\n")
    print("Number of files now loading: " + str(len(files)))

    # we collate all data first and then count.
    # @todo: read all files and create dictionary first for large tasks
    data = []
    for filename in files:
        print(filename)
        data.extend(read_vamp_csv(filename, datapos))

    # sys.getsizeof is shallow: this is the size of the outer list only
    print("Total data size in memory: " + str(sys.getsizeof(data)))

    # now get the histogram for all data
    histo = histogram(data, datapos, nbins)
    print(histo)
    print("Please input a description for the histogram analysis features")
    # NOTE(review): presumably data2json asks for the description and writes
    # the JSON output -- confirm in csv2json
    c2j.data2json(histo)

    # further numerical analysis if this is not categorical data
    if nbins != -1:
        ns = numstats(data, datapos)
        print(ns)
        print("Please input a description for the general statistics features")
        c2j.data2json(ns)
Daniel@0	164
Daniel@0	165
Daniel@0	166

Mercurial > hg > dml-open-backendtools

annotate collection_analysis/tools/vampstats.py @ 0:e34cf1b6fe09 tip