comparison collection_analysis/tools/vampstats.py @ 0:e34cf1b6fe09 tip

commit
author Daniel Wolff
date Sat, 20 Feb 2016 18:14:24 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e34cf1b6fe09
1 # Part of DML (Digital Music Laboratory)
2 # Copyright 2014-2015 Daniel Wolff, City University
3
4 # This program is free software; you can redistribute it and/or
5 # modify it under the terms of the GNU General Public License
6 # as published by the Free Software Foundation; either version 2
7 # of the License, or (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public
15 # License along with this library; if not, write to the Free Software
16 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
18 #!/usr/bin/python
19 # -*- coding: utf-8 -*-
20
21 # creates a histogram from given input files or folder
22
23 __author__="Daniel Wolff"
24 __date__ ="$11-Feb-2015 18:18:47$"
25
26 import sys
27 import os
28 import csv
29 import numpy
30 import csv2json as c2j
31 import re
32
33
# File extensions that mark a file as a feature file when directories are
# scanned. Other candidates considered in the past: ".n3", ".mid".
ext = (".csv",)

# Matches a run of digits optionally followed by any number of
# "<one character><digits>" groups, e.g. "12", "12.5" or "1.2.3".
# Written as a raw string so that "\d" is not an invalid string escape.
# NOTE(review): the "." is unescaped and therefore matches ANY character
# ("12x5" matches in full) -- presumably a literal dot was intended; confirm
# against the users of this pattern before tightening it.
floater = re.compile(r"((\d+)(.\d+)*)")
def read_vamp_csv(filein = '', datapos = 0):
    """Read a comma-separated feature file into a list of rows.

    Every returned row has the structure ``[time, data1, data2, ...]``:
    the first column is converted to ``float``, all further columns are
    kept as the strings read from the file.

    Rows with fewer than ``datapos + 2`` columns (rows that do not reach
    the requested data column) are skipped. The number of skipped rows is
    printed once the file has been read.

    filein:  path of the CSV file (',' delimiter, '"' quote character)
    datapos: zero-based index of the data column after the time column

    Raises ValueError if the first column of a kept row is not a number.
    """
    output = []
    badcount = 0

    # The csv module expects a binary file object on Python 2, but a text
    # file opened with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        csvfile = open(filein, 'r', newline='')
    else:
        csvfile = open(filein, 'rb')

    with csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if len(row) >= datapos + 2:
                output.append([float(row[0])] + row[1:])
            else:
                badcount += 1

    print("Ignored " + str(badcount) + " short rows")
    return output
53
def histogram(data, datapos = 1, nbins = -1):
    """Calculate a histogram over one data column.

    data:    list of rows; the value of interest sits at ``row[datapos + 1]``
             (index 0 of each row is the time stamp)
    datapos: zero-based index of the data column after the time column
    nbins:   -1 treats the column as categorical (one bin per distinct
             value); any other value requests a numerical histogram

    Returns a dict ``{"count": [...], "index": [...]}`` whose values are
    plain lists in both modes. For categorical data ``index`` holds the
    distinct values and ``count`` the matching frequencies. For numerical
    data ``index`` holds the bin edges and ``count`` the bin populations.
    """
    # symbols or numerical input?
    if nbins != -1:
        # convert to numpy data
        values = string2numpy(data, datapos)

        # NOTE: nbins - 1 bins are requested, so "index" receives nbins bin
        # edges while "count" has nbins - 1 entries.
        count, index = numpy.histogram(values, nbins - 1)
        count = count.tolist()
        index = index.tolist()
    else:
        # build histogram on strings
        histo = dict()
        for row in data:
            label = row[datapos + 1]
            histo[label] = histo.get(label, 0) + 1

        # Materialise as lists: on Python 3 keys()/values() are views, which
        # are neither lists nor JSON-serialisable. Both are taken from the
        # same unmodified dict, so their orders correspond.
        index = list(histo.keys())
        count = list(histo.values())

    # return histogram
    return {"count": count, "index": index}
78
def numstats(data,datapos):
    """Calculate basic statistics for a numerical data column.

    The column ``datapos + 1`` of ``data`` is converted with
    ``string2numpy``. Returns a dict with the keys "average", "median" and
    "std", each converted to a plain Python value via ``tolist()``.
    """
    values = string2numpy(data, datapos)

    return {
        "average": numpy.average(values).tolist(),
        "median": numpy.median(values).tolist(),
        "std": numpy.std(values).tolist(),
    }
91
def featurefilesinpath(path):
    """List all feature files found below ``path``.

    The directory tree is walked with ``os.walk``; every file whose name
    ends with one of the extensions in the module-level ``ext`` tuple is
    returned as a path with backslashes replaced by forward slashes.
    """
    return [
        os.path.join(folder, name).replace('\\', '/')
        for folder, _subfolders, names in os.walk(path)
        for name in names
        # str.endswith accepts the whole tuple of extensions at once
        if name.endswith(ext)
    ]
105
def string2numpy(data,datapos):
    """Return column ``datapos + 1`` of ``data`` as a numpy float array.

    data:    list of rows; index 0 of each row is the time stamp
    datapos: zero-based index of the data column after the time column

    If the whole table converts to floats, the column is sliced from it
    directly. Otherwise each cell of the column is converted on its own:
    a cell that ``float()`` accepts is used as is, and a cell carrying a
    verbatim unit (e.g. "440 Hz" or "440Hz") is cut at its first letter
    before conversion.

    Raises ValueError for a cell with no leading number, and IndexError
    for a row that does not reach the requested column.
    """
    column = datapos + 1
    try:
        # fast path: every cell of every row is numeric
        return numpy.array(data, dtype=float)[:, column]
    except (ValueError, TypeError, IndexError):
        # non-numeric cells, ragged rows or empty input: convert the
        # requested column cell by cell below
        pass

    values = []
    for row in data:
        cell = row[column]
        try:
            # plain numbers, including "1e-05" and "nan", need no stripping
            values.append(float(cell))
        except ValueError:
            # account for verbatim units: keep everything before the first
            # letter; float() tolerates the surrounding whitespace
            m = re.search("[a-zA-Z]", cell)
            if m is None:
                raise
            values.append(float(cell[:m.start()]))
    return numpy.array(values, dtype=float)
123
# main entry point
if __name__ == "__main__":
    print("Usage: vampstats datapos nbins file1/dir1 file2/dir2 ....")
    print("datapos: column of data after timecode to process")
    print("nbins: -1 for categorical data, otherwise number of bins for histogram")

    # datapos and nbins are mandatory; stop after the usage text instead of
    # failing with an IndexError on sys.argv
    if len(sys.argv) < 3:
        sys.exit(1)

    datapos = int(sys.argv[1])
    nbins = int(sys.argv[2])

    # check and collate files; paths that are neither a directory nor a
    # regular file are silently skipped
    files = []
    for path in sys.argv[3:]:
        if os.path.isdir(path):
            files.extend(featurefilesinpath(path))
        elif os.path.isfile(path):
            # append, not extend: extending a list with a string would add
            # every single character of the path as a separate entry
            files.append(path)
    print("Number of files now loading: " + str(len(files)))

    # we collate all data first and then count.
    # @todo: read all files and create dictionary first for large tasks
    data = []
    for filename in files:
        print(filename)
        data.extend(read_vamp_csv(filename, datapos))

    # NOTE: sys.getsizeof is shallow -- it reports the size of the outer
    # list object only, not of the rows it references
    print("Total data size in memory: " + str(sys.getsizeof(data)))

    # now get the histogram for all data
    histo = histogram(data, datapos, nbins)
    print(histo)
    print("Please input a description for the histogram analysis features")
    c2j.data2json(histo)

    # further numerical analysis if this is not categorical data
    if nbins != -1:
        ns = numstats(data, datapos)
        print(ns)
        print("Please input a description for the general statistics features")
        c2j.data2json(ns)
164
165
166