comparison collection_analysis/tools/vampstats_pitch_weighted.py @ 0:e34cf1b6fe09 tip

commit
author Daniel Wolff
date Sat, 20 Feb 2016 18:14:24 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e34cf1b6fe09
1 # Part of DML (Digital Music Laboratory)
2 # Copyright 2014-2015 Daniel Wolff, City University
3
4 # This program is free software; you can redistribute it and/or
5 # modify it under the terms of the GNU General Public License
6 # as published by the Free Software Foundation; either version 2
7 # of the License, or (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public
15 # License along with this library; if not, write to the Free Software
16 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
18 #!/usr/bin/python
19 # -*- coding: utf-8 -*-
20
21 # creates a histogram from given input files or folder
22
23 __author__="Daniel Wolff, Dan"
24 __date__ ="$11-Feb-2015 18:18:47$"
25
26 import sys
27 import os
28 import csv
29 import numpy
30 import csv2json as c2j
31 import re
32
33
# Global feature file extensions: str.endswith() accepts this tuple directly,
# so a file matches when its name ends with any of the listed suffixes.
#ext = tuple([".n3",".csv",".mid"])
ext = (".csv",)

# Matches a leading unsigned decimal number such as "12" or "12.5";
# group(1) is the whole number.
# Raw string so that "\d" reaches the regex engine untouched, and the dot is
# escaped so it only matches a literal decimal point (unescaped, it matched
# ANY character, e.g. the "x" in "12x5").
floater = re.compile(r"((\d+)(\.\d+)*)")
# Reads any Vamp CSV file and returns a list of rows with the structure
# [time (float), data1, data2, ..., dataN]
def read_vamp_csv(filein = '', datapos = 0):
    """Read a comma-separated Vamp feature file into a list of rows.

    Each returned row is ``[time, data1, data2, ...]``: the first CSV field
    converted to ``float``, followed by the remaining fields exactly as the
    csv module produced them (strings).

    Parameters:
        filein: path of the CSV file to read.
        datapos: index of the data column (counted after the time column)
            that has to be present. Rows with fewer than ``datapos + 2``
            fields (including blank lines) are skipped and counted.

    Returns:
        The list of accepted rows. The number of skipped rows is printed.

    Raises:
        ValueError: if the first field of an accepted row is not a number.
        IOError / OSError: if the file cannot be opened.
    """
    # The csv module needs a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; mode 'rb' on Python 3 makes csv.reader
    # fail because it is handed bytes instead of str.
    if sys.version_info[0] >= 3:
        csvfile = open(filein, 'r', newline='')
    else:
        csvfile = open(filein, 'rb')

    output = []
    badcount = 0
    with csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if len(row) >= datapos + 2:
                output.append([float(row[0])] + row[1:])
            else:
                badcount += 1

    # single parenthesised argument: prints identically on Python 2 and 3
    print("Ignored " + str(badcount) + " short rows")
    return output
53
54 #calculates the histogram
def histogram(data, datapos = 1, nbins = -1):
    """Build a histogram over one data column of ``data``.

    Parameters:
        data: list of rows ``[time, data1, data2, ...]``.
        datapos: data column to analyse; the value is taken from
            ``row[datapos + 1]`` (position 0 of each row is the time).
        nbins: ``-1`` treats the column as categorical symbols and counts
            occurrences of each distinct value. Any other value treats the
            column as numbers and computes a weighted numpy histogram.

    Returns:
        ``{"count": [...], "index": [...]}`` with plain Python lists.
        Categorical: ``index`` holds the distinct values and ``count`` the
        matching occurrence counts.
        Numerical: ``index`` holds the ``nbins`` bin edges and ``count`` the
        ``nbins - 1`` weighted bin totals.
    """
    # symbols or numerical input?
    if nbins != -1:
        # convert the analysed column to numpy data
        ddata = string2numpy(data, datapos)

        # time weights are read from data position 2 (row element 3)
        tw_data = string2numpy(data, 2)

        # loudness weights are read from data position 3 (row element 4)
        lw_data = string2numpy(data, 3)

        # every sample is weighted by time weight * loudness weight;
        # nbins - 1 bins produce nbins bin edges in "index"
        count, index = numpy.histogram(
            ddata, nbins - 1, weights=numpy.multiply(tw_data, lw_data))
        count = count.tolist()
        index = index.tolist()

    # here for strings
    else:
        # build histogram on strings
        histo = dict()
        for row in data:
            key = row[datapos + 1]
            histo[key] = histo.get(key, 0) + 1
        # Materialise as lists: on Python 3 keys()/values() are dict views,
        # whereas the numerical branch above returns real lists.
        index = list(histo.keys())
        count = list(histo.values())

    # return histogram
    return {"count": count, "index": index}
84
85 #calculates statistics for numerical input
def numstats(data,datapos):
    """Summarise one numerical data column.

    The column ``datapos`` of ``data`` is converted with ``string2numpy`` and
    reduced to its average, median and standard deviation (numpy defaults).

    Returns:
        ``{"average": ..., "median": ..., "std": ...}`` with plain Python
        numbers (each numpy result is converted through ``tolist()``).
    """
    values = string2numpy(data, datapos)

    summary = {}
    summary["average"] = numpy.average(values).tolist()
    summary["median"] = numpy.median(values).tolist()
    summary["std"] = numpy.std(values).tolist()
    return summary
97
def featurefilesinpath(path):
    """Recursively collect the feature files found below ``path``.

    Walks the directory tree with ``os.walk`` and keeps every file whose name
    ends with one of the suffixes in the module-level ``ext`` tuple.

    Returns:
        List of file paths (directory joined with file name) in which every
        backslash has been replaced by a forward slash.
    """
    found = []
    for folder, _subdirs, names in os.walk(path):
        for name in names:
            # skip anything that is not a requested feature file
            if not name.endswith(ext):
                continue
            found.append(os.path.join(folder, name).replace('\\', '/'))
    return found
111
112 # convert to numpy
# A number (optional sign, fraction and exponent) that is followed by a unit
# or label starting with a letter, e.g. "440Hz" or "220.5 Hz".
_NUMBER_WITH_SUFFIX = re.compile(
    r"\s*([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)\s*[a-zA-Z]")


def _leading_number(value):
    """Convert one cell to float, tolerating a trailing unit such as "Hz"."""
    try:
        # plain numbers, including "1e-3" or "nan", need no special handling
        return float(value)
    except ValueError:
        m = _NUMBER_WITH_SUFFIX.match(value)
        if m is None:
            # not a number followed by a label: keep reporting the ValueError
            raise
        return float(m.group(1))


def string2numpy(data,datapos):
    """Return data column ``datapos`` of ``data`` as a float numpy array.

    Parameters:
        data: list of rows ``[time, data1, data2, ...]``; cells may be
            numbers or strings.
        datapos: data column to extract; the value is taken from
            ``row[datapos + 1]`` (position 0 of each row is the time).

    Returns:
        1-D ``numpy`` float array with one entry per row.

    Raises:
        ValueError: if a cell of the requested column is neither a number
            nor a number followed by a label starting with a letter.
        IndexError: if a row is too short to contain the requested column.
    """
    try:
        # fast path: every cell of every row converts to float
        return numpy.array(data, dtype=float)[:, datapos+1]
    except (ValueError, IndexError, TypeError):
        # Non-numeric cells, ragged rows or empty input: parse only the
        # requested column, row by row. The number is cut exactly where it
        # ends, so "440Hz" yields 440.0 (slicing up to one character before
        # the first letter used to turn it into 44.0).
        return numpy.array(
            [_leading_number(row[datapos+1]) for row in data], dtype=float)
129
130 # main entry point
if __name__ == "__main__":
    # Command line driver: collect the feature files, load all rows into
    # memory, then print and export the histogram (and, for numerical data,
    # the summary statistics) through csv2json.
    print("Usage: vampstats datapos nbins file1/dir1 file2/dir2 ....")
    print("datapos: column of data after timecode to process")
    print("nbins: -1 for categorical data, otherwise number of bins for histogram")

    # datapos and nbins are mandatory: stop after the usage text instead of
    # failing with an IndexError on sys.argv
    if len(sys.argv) < 3:
        sys.exit(1)

    datapos = int(sys.argv[1])
    nbins = int(sys.argv[2])

    # check and collate files; paths that are neither an existing directory
    # nor an existing file are ignored
    files = []
    for path in sys.argv[3:]:
        if os.path.isdir(path):
            files.extend(featurefilesinpath(path))
        elif os.path.isfile(path):
            # append, not extend: extend() iterates the string and would add
            # every single character of the path as a separate "file"
            files.append(path)
    print("Number of files now loading: " + str(len(files)))

    # we collate all data first and then count.
    # @todo: read all files and create dictionary first for large tasks
    data = []
    for filename in files:
        print(filename)
        data.extend(read_vamp_csv(filename, datapos))

    # NOTE: sys.getsizeof is shallow; this is the size of the outer list
    # object only, not of the rows it references
    print("Total data size in memory: " + str(sys.getsizeof(data)))

    # now get the histogram for all data
    histo = histogram(data, datapos, nbins)
    print(histo)
    print("Please input a description for the histogram analysis features")
    c2j.data2json(histo)

    # further numerical analysis if this is not categorical data
    if nbins != -1:
        ns = numstats(data, datapos)
        print(ns)
        print("Please input a description for the general statistics features")
        c2j.data2json(ns)
170
171
172