Mercurial > hg > dml-open-backendtools
comparison collection_analysis/tools/vampstats.py @ 0:e34cf1b6fe09 tip
commit
| author | Daniel Wolff |
|---|---|
| date | Sat, 20 Feb 2016 18:14:24 +0100 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e34cf1b6fe09 |
|---|---|
| 1 # Part of DML (Digital Music Laboratory) | |
| 2 # Copyright 2014-2015 Daniel Wolff, City University | |
| 3 | |
| 4 # This program is free software; you can redistribute it and/or | |
| 5 # modify it under the terms of the GNU General Public License | |
| 6 # as published by the Free Software Foundation; either version 2 | |
| 7 # of the License, or (at your option) any later version. | |
| 8 # | |
| 9 # This program is distributed in the hope that it will be useful, | |
| 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 12 # GNU General Public License for more details. | |
| 13 # | |
| 14 # You should have received a copy of the GNU General Public | |
| 15 # License along with this library; if not, write to the Free Software | |
| 16 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
| 17 | |
| 18 #!/usr/bin/python | |
| 19 # -*- coding: utf-8 -*- | |
| 20 | |
| 21 # creates a histogram from given input files or folder | |
| 22 | |
| 23 __author__="Daniel Wolff" | |
| 24 __date__ ="$11-Feb-2015 18:18:47$" | |
| 25 | |
| 26 import sys | |
| 27 import os | |
| 28 import csv | |
| 29 import numpy | |
| 30 import csv2json as c2j | |
| 31 import re | |
| 32 | |
| 33 | |
# Global feature-file extensions: only files whose names end with one of
# these suffixes are collected by featurefilesinpath().
# ext = tuple([".n3", ".csv", ".mid"])
ext = tuple([".csv"])

# Matches a run of digits optionally followed by further digit groups, each
# preceded by ONE arbitrary character (the "." is unescaped, exactly as in
# the original pattern). Written as a raw string so that "\d" is not an
# invalid string escape sequence.
# NOTE(review): this looks like it was meant to match decimal numbers, in
# which case the dot should be escaped -- confirm before changing it.
floater = re.compile(r"((\d+)(.\d+)*)")
| 39 # reads in any csv and returns a list of structure | |
| 40 # time(float), data1, data2 ....data2 | |
def read_vamp_csv(filein='', datapos=0):
    """Read a comma-separated feature file into a list of rows.

    Every kept row has the form [time(float), data1, data2, ...]: the first
    column is converted to float, all remaining columns are kept as strings.
    Rows with fewer than datapos + 2 columns (too short to contain the
    requested data column) are dropped, and the number of dropped rows is
    printed once the file has been read.

    filein  -- path of the csv file to read
    datapos -- zero-based index of the data column after the time column
    returns -- list of rows as described above
    raises  -- ValueError if the first column of a kept row is not numeric
    """
    # The csv module expects a binary file on Python 2, but a text file
    # opened with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        csvfile = open(filein, 'r', newline='')
    else:
        csvfile = open(filein, 'rb')

    output = []
    badcount = 0
    min_columns = datapos + 2
    with csvfile:
        for row in csv.reader(csvfile, delimiter=',', quotechar='"'):
            if len(row) >= min_columns:
                output.append([float(row[0])] + row[1:])
            else:
                badcount += 1
    print("Ignored " + str(badcount) + " short rows")
    return output
| 53 | |
| 54 #calculates the histogram | |
def histogram(data, datapos=1, nbins=-1):
    """Calculate a histogram over one data column of the given rows.

    data    -- list of rows of the form [time, data1, data2, ...]
    datapos -- zero-based index of the data column after the time column
    nbins   -- -1 treats the column as categorical (symbols are counted);
               any other value treats it as numerical
    returns -- dict {"count": [...], "index": [...]}, both plain lists.
               Categorical: "index" holds the distinct symbols and "count"
               their number of occurrences, in matching order.
               Numerical: numpy.histogram is called with nbins - 1 bins, so
               "count" holds nbins - 1 values and "index" the nbins bin
               edges.
    """
    # NOTE(review): passing nbins - 1 to numpy.histogram makes nbins the
    # number of bin EDGES rather than bins -- confirm this is intended.
    if nbins != -1:
        # numerical input: convert the column and let numpy do the binning
        ddata = string2numpy(data, datapos)
        count, index = numpy.histogram(ddata, nbins - 1)
        return {"count": count.tolist(), "index": index.tolist()}

    # categorical input: count how often every distinct symbol occurs
    histo = dict()
    for row in data:
        symbol = row[datapos + 1]
        histo[symbol] = histo.get(symbol, 0) + 1

    # list() keeps the result a pair of real lists on Python 3 as well,
    # where keys()/values() only return views
    return {"count": list(histo.values()), "index": list(histo.keys())}
| 78 | |
| 79 #calculates statistics for numerical input | |
def numstats(data, datapos):
    """Calculate basic statistics for one numerical data column.

    data    -- list of rows of the form [time, data1, data2, ...]
    datapos -- zero-based index of the data column after the time column
    returns -- dict with the keys "average", "median" and "std", each
               converted from its numpy result with tolist()
    """
    # pull the requested column out as a numpy float array
    values = string2numpy(data, datapos)

    return {
        "average": numpy.average(values).tolist(),
        "median": numpy.median(values).tolist(),
        "std": numpy.std(values).tolist(),
    }
| 91 | |
def featurefilesinpath(path, extensions=None):
    """Recursively list the feature files found below a directory.

    path       -- directory to traverse (including all sub-directories)
    extensions -- file name suffix, or iterable of suffixes, to accept;
                  defaults to the module-level `ext` setting
    returns    -- list of matching file paths, in os.walk order, with
                  backslashes replaced by forward slashes
    """
    if extensions is None:
        extensions = ext
    # str.endswith accepts a single string or a tuple, but no other iterable
    if isinstance(extensions, str):
        suffixes = extensions
    else:
        suffixes = tuple(extensions)

    files = []
    for dirpath, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            if filename.endswith(suffixes):
                source = os.path.join(dirpath, filename).replace('\\', '/')
                files.append(source)
    return files
| 105 | |
| 106 # convert to numpy | |
def string2numpy(data, datapos):
    """Extract one data column of the given rows as a numpy float array.

    data    -- list of rows of the form [time, data1, data2, ...]
    datapos -- zero-based index of the data column after the time column
    returns -- one-dimensional numpy array of floats
    raises  -- ValueError if a cell holds no parsable number,
               IndexError if a row does not have the requested column

    Cells may carry a verbatim unit after the number ("440 Hz", "440Hz");
    everything from the first letter onwards is then ignored. Cells that
    float() already understands (e.g. "1.5e-05") are taken as they are.
    """
    column = datapos + 1
    try:
        # fast path: the whole table is numeric and rectangular
        return numpy.array(data, dtype=float)[:, column]
    except (ValueError, TypeError, IndexError):
        # some other cell is not numeric, or the rows are ragged / empty:
        # fall back to converting only the requested column cell by cell
        pass

    values = []
    for row in data:
        cell = row[column]
        try:
            values.append(float(cell))
        except (ValueError, TypeError):
            # account for verbatim units: keep only the text in front of
            # the first letter, whether or not a space separates the two
            m = re.search("[a-zA-Z]", cell)
            if m is None:
                raise
            values.append(float(cell[:m.start()].strip()))
    return numpy.array(values, dtype=float)
| 123 | |
| 124 # main entry point | |
# main entry point
if __name__ == "__main__":
    print("Usage: vampstats datapos nbins file1/dir1 file2/dir2 ....")
    print("datapos: column of data after timecode to process")
    print("nbins: -1 for categorical data, otherwise number of bins for histogram")

    # without both numeric options there is nothing to do; stop after the
    # usage text instead of failing with an IndexError
    if len(sys.argv) < 3:
        sys.exit(2)

    datapos = int(sys.argv[1])
    nbins = int(sys.argv[2])

    # check and collate files: directories are searched recursively,
    # plain files are taken as given, anything else is silently skipped
    files = []
    for path in sys.argv[3:]:
        if os.path.isdir(path):
            files.extend(featurefilesinpath(path))
        elif os.path.isfile(path):
            # append, not extend: extending a list with a string would add
            # the path one character at a time
            files.append(path)
    print("Number of files now loading: " + str(len(files)))

    # we collate all data first and then count.
    # @todo: read all files and create dictionary first for large tasks
    data = []
    for filename in files:
        print(filename)
        data.extend(read_vamp_csv(filename, datapos))

    # sys.getsizeof is shallow: this is the size of the outer list object
    # only, not of the rows it references
    print("Total data size in memory: " + str(sys.getsizeof(data)))

    # now get the histogram for all data
    histo = histogram(data, datapos, nbins)
    print(histo)
    print("Please input a description for the histogram analysis features")
    c2j.data2json(histo)

    # further numerical analysis if this is not categorical data
    if nbins != -1:
        ns = numstats(data, datapos)
        print(ns)
        print("Please input a description for the general statistics features")
        c2j.data2json(ns)
| 164 | |
| 165 | |
| 166 |
