Daniel@0
|
1 # Part of DML (Digital Music Laboratory)
|
Daniel@0
|
2 # Copyright 2014-2015 Daniel Wolff, City University
|
Daniel@0
|
3
|
Daniel@0
|
4 # This program is free software; you can redistribute it and/or
|
Daniel@0
|
5 # modify it under the terms of the GNU General Public License
|
Daniel@0
|
6 # as published by the Free Software Foundation; either version 2
|
Daniel@0
|
7 # of the License, or (at your option) any later version.
|
Daniel@0
|
8 #
|
Daniel@0
|
9 # This program is distributed in the hope that it will be useful,
|
Daniel@0
|
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
Daniel@0
|
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
Daniel@0
|
12 # GNU General Public License for more details.
|
Daniel@0
|
13 #
|
Daniel@0
|
14 # You should have received a copy of the GNU General Public
|
Daniel@0
|
15 # License along with this library; if not, write to the Free Software
|
Daniel@0
|
16 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
Daniel@0
|
17
|
Daniel@0
|
18 #!/usr/bin/python
|
Daniel@0
|
19 # -*- coding: utf-8 -*-
|
Daniel@0
|
20
|
Daniel@0
|
21 # creates a histogram from given input files or folder
|
Daniel@0
|
22
|
Daniel@0
|
23 __author__="Daniel Wolff"
|
Daniel@0
|
24 __date__ ="$11-Feb-2015 18:18:47$"
|
Daniel@0
|
25
|
Daniel@0
|
26 import sys
|
Daniel@0
|
27 import os
|
Daniel@0
|
28 import csv
|
Daniel@0
|
29 import numpy
|
Daniel@0
|
30 import csv2json as c2j
|
Daniel@0
|
31 import re
|
Daniel@0
|
32
|
Daniel@0
|
33
|
Daniel@0
|
# global feature extensions: only files ending in one of these are collected.
# Other candidates used by related scripts: ".n3", ".mid"
ext = (".csv",)

# Matches an unsigned decimal number such as "12" or "12.5".
# Raw string so that "\d" is not treated as a string escape, and an escaped
# dot so that only a literal "." separates the integer and fractional parts.
floater = re.compile(r"((\d+)(\.\d+)*)")
|
Daniel@0
|
def read_vamp_csv(filein = '', datapos = 0):
    """Read a comma-separated file into a list of rows.

    Each returned row has the structure
        [time (float), data1, data2, ..., dataN]
    where the first field is converted to float and all remaining fields
    are kept as the strings delivered by the csv reader.

    Parameters:
        filein:  path of the csv file to read.
        datapos: index of the data column (counted after the time column)
                 that the caller intends to process. Rows with fewer than
                 datapos + 2 fields are skipped.

    Returns:
        list of rows as described above.

    Raises:
        ValueError if the first field of an accepted row is not a number.

    The number of skipped rows is printed to stdout.
    """
    output = []
    badcount = 0

    # The Python 2 csv module expects a binary file object, whereas the
    # Python 3 one needs a text file opened with newline translation off.
    if sys.version_info[0] < 3:
        csvfile = open(filein, 'rb')
    else:
        csvfile = open(filein, 'r', newline='')

    with csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            # time column + at least datapos + 1 data columns required
            if len(row) >= datapos + 2:
                output.append([float(row[0])] + row[1:])
            else:
                badcount += 1

    # single parenthesised argument: valid on Python 2 and Python 3
    print("Ignored " + str(badcount) + " short rows")
    return output
|
Daniel@0
|
53
|
Daniel@0
|
def histogram(data, datapos = 1, nbins = -1):
    """Calculate a histogram over one data column.

    Parameters:
        data:    list of rows; the processed value is row[datapos + 1],
                 i.e. datapos counts the columns after the leading time
                 column.
        datapos: index of the data column to process.
        nbins:   -1 treats the column as categorical (symbols); any other
                 value treats it as numerical.

    Returns:
        dict with two lists, "count" and "index".
        Categorical: "index" holds the distinct values and "count" the
        number of occurrences of the value at the same list position.
        Numerical: numpy.histogram is called with nbins - 1 bins, so
        "index" holds the nbins bin edges and "count" the nbins - 1 bin
        counts.
    """
    # symbols or numerical input?
    if nbins != -1:
        # convert to numpy data
        ddata = string2numpy(data, datapos)

        count, index = numpy.histogram(ddata, nbins-1)
        count = count.tolist()
        index = index.tolist()

    # here for strings
    else:
        # build histogram on strings
        histo = dict()
        for row in data:
            label = row[datapos+1]
            histo[label] = histo.get(label, 0) + 1
        # list(): on Python 3 keys()/values() are views, but callers get
        # plain lists from the numerical branch and should get them here too
        index = list(histo.keys())
        count = list(histo.values())

    # return histogram
    return {"count": count, "index": index}
|
Daniel@0
|
78
|
Daniel@0
|
def numstats(data,datapos):
    """Calculate summary statistics for one numerical data column.

    The column row[datapos + 1] of every row in data is converted to a
    numpy float array by string2numpy before the statistics are taken.

    Returns:
        dict with the keys "average", "median" and "std", each holding
        the plain Python value produced by numpy.average, numpy.median
        and numpy.std (called with their default arguments).
    """
    # numerical view of the requested column
    values = string2numpy(data, datapos)

    stats = {}
    # .tolist() turns the numpy scalars into plain Python numbers
    stats["average"] = numpy.average(values).tolist()
    stats["median"] = numpy.median(values).tolist()
    stats["std"] = numpy.std(values).tolist()
    return stats
|
Daniel@0
|
91
|
Daniel@0
|
def featurefilesinpath(path, extensions=None):
    """List all feature files below a directory.

    The directory tree rooted at path is traversed with os.walk and every
    file whose name ends with one of the given extensions is collected.

    Parameters:
        path:       root directory to search.
        extensions: a string, or a tuple/list of strings, with the accepted
                    file name endings. Defaults to the module-level ext.

    Returns:
        list of file paths (root joined with the relative location), with
        backslashes replaced by forward slashes.
    """
    if extensions is None:
        extensions = ext
    elif isinstance(extensions, list):
        # str.endswith accepts a string or a tuple, but not a list
        extensions = tuple(extensions)

    files = []
    for (dirpath, dirnames, filenames) in os.walk(path):
        for filename in filenames:
            if filename.endswith(extensions):
                source = os.path.join(dirpath, filename).replace('\\', '/')
                files.append(source)
    return files
|
Daniel@0
|
105
|
Daniel@0
|
def string2numpy(data,datapos):
    """Convert one data column to a numpy float array.

    Parameters:
        data:    list of rows; the converted value is row[datapos + 1].
        datapos: index of the data column after the leading time column.

    Returns:
        one-dimensional numpy array of dtype float with one entry per row.

    Values that are not plain numbers are assumed to carry a verbatim
    unit (e.g. "440Hz" or "440 Hz"); everything from the first letter
    onwards is dropped before the conversion.

    Raises:
        ValueError if a value has no numerical part in front of its first
        letter or cannot be converted at all.
        IndexError if a row has no column datapos + 1.
    """
    # take only the specified column datapos+1
    column = [row[datapos+1] for row in data]

    # fast path: the whole column is already numerical
    try:
        return numpy.array(column, dtype=float)
    except (ValueError, TypeError):
        pass

    edata = []
    for cell in column:
        try:
            # plain numbers first, so that "1e-5" or "nan" are not cut at
            # their letters
            value = float(cell)
        except ValueError:
            # account for verbatim units
            m = re.search("[a-zA-Z]", cell)
            if m is None:
                raise
            # keep everything before the first letter; float() tolerates
            # the blank that may separate number and unit
            value = float(cell[:m.start()])
        edata.append(value)
    return numpy.array(edata, dtype=float)
|
Daniel@0
|
123
|
Daniel@0
|
# main entry point
if __name__ == "__main__":
    # single parenthesised arguments keep print valid on Python 2 and 3
    print("Usage: vampstats datapos nbins file1/dir1 file2/dir2 ....")
    print("datapos: column of data after timecode to process")
    print("nbins: -1 for categorical data, otherwise number of bins for histogram")

    # datapos and nbins are mandatory: stop after the usage text instead of
    # failing with an IndexError on sys.argv
    if len(sys.argv) < 3:
        sys.exit(1)

    datapos = int(sys.argv[1])
    nbins = int(sys.argv[2])

    # check and collate files
    files = []
    for path in sys.argv[3:]:
        if os.path.isdir(path):
            files.extend(featurefilesinpath(path))
        elif os.path.isfile(path):
            # append, not extend: extending a list with a string adds the
            # path one character at a time
            files.append(path)
    print("Number of files now loading: " + str(len(files)))

    # we collate all data first and then count.
    # @todo: read all files and create dictionary first for large tasks
    data = []
    for filename in files:
        print(filename)
        data.extend(read_vamp_csv(filename, datapos))

    # NOTE: sys.getsizeof is shallow, this is the size of the outer list only
    print("Total data size in memory: " + str(sys.getsizeof(data)))

    # now get the histogram for all data
    histo = histogram(data, datapos, nbins)
    print(histo)
    print("Please input a description for the histogram analysis features")
    c2j.data2json(histo)

    # further numerical analysis if this is not categorical data
    if nbins != -1:
        ns = numstats(data, datapos)
        print(ns)
        print("Please input a description for the general statistics features")
        c2j.data2json(ns)
|
Daniel@0
|
164
|
Daniel@0
|
165
|
Daniel@0
|
166
|