dml-open-backendtools: collection_analysis/tools/vampstats.py comparison

comparison collection_analysis/tools/vampstats.py @ 0:e34cf1b6fe09 tip

commit

author	Daniel Wolff
date	Sat, 20 Feb 2016 18:14:24 +0100
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:e34cf1b6fe09
+# Part of DML (Digital Music Laboratory)
+# Copyright 2014-2015 Daniel Wolff, City University
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# creates a histogram from given input files or folder
+__author__="Daniel Wolff"
+__date__ ="$11-Feb-2015 18:18:47$"
+import sys
+import os
+import csv
+import numpy
+import csv2json as c2j
+import re
# Global feature-file extensions: only files whose name ends with one of
# these suffixes are collected by featurefilesinpath().
# ext = tuple([".n3", ".csv", ".mid"])
ext = (".csv",)

# Matches an unsigned decimal number such as "12" or "12.5".
# Raw string with an escaped dot: the previous pattern used a bare ".",
# which accepted any character in front of the fractional digits ("12a5").
floater = re.compile(r"((\d+)(\.\d+)*)")
+# reads in any csv and returns a list of structure
+# time(float), data1, data2 ....data2
def read_vamp_csv(filein='', datapos=0):
    """Read a comma-separated feature file into a list of rows.

    Every accepted row is returned as ``[time, field1, field2, ...]`` where
    ``time`` is the first column converted to ``float`` and all remaining
    fields are kept as the strings delivered by the csv reader.

    Rows with fewer than ``datapos + 2`` columns do not reach the requested
    data column; they are skipped and their number is printed.

    :param filein: path of the CSV file to read
    :param datapos: zero-based index of the data column after the time column
    :return: list of rows as described above
    :raises ValueError: if the first column of an accepted row is not a number
    """
    # The csv module expects a binary file on Python 2, but a text file
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvfile = open(filein, 'rb')
    else:
        csvfile = open(filein, 'r', newline='')

    output = []
    badcount = 0
    with csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if len(row) >= datapos + 2:
                output.append([float(row[0])] + row[1:])
            else:
                badcount += 1

    # Single-argument print() behaves identically on Python 2 and 3.
    print("Ignored " + str(badcount) + " short rows")
    return output
+#calculates the histogram
def histogram(data, datapos=1, nbins=-1):
    """Compute a histogram over one data column of ``data``.

    The column used is ``datapos + 1`` of each row (column 0 is skipped).

    * ``nbins == -1``: the values are treated as categorical symbols and
      the occurrences of each distinct value are counted.
    * otherwise: the column is converted to numbers with ``string2numpy``
      and binned with ``numpy.histogram``.

    :param data: list of rows, e.g. as returned by ``read_vamp_csv``
    :param datapos: zero-based index of the data column after column 0
    :param nbins: -1 for categorical data, otherwise the bin parameter
    :return: dict with the keys ``"count"`` and ``"index"``, both plain lists
    """
    if nbins != -1:
        # numerical input: convert to numpy data first
        ddata = string2numpy(data, datapos)
        # NOTE(review): numpy.histogram is asked for nbins - 1 bins, so
        # "index" holds nbins bin edges and "count" nbins - 1 values.
        # Kept as is - confirm this is the intended meaning of nbins.
        count, index = numpy.histogram(ddata, nbins - 1)
        return {"count": count.tolist(), "index": index.tolist()}

    # categorical input: count the occurrences of every distinct string
    histo = dict()
    for row in data:
        key = row[datapos + 1]
        histo[key] = histo.get(key, 0) + 1

    # Materialise real lists: on Python 3 keys()/values() are views, which
    # are neither indexable nor serialisable like the numeric branch output.
    index = list(histo.keys())
    count = [histo[key] for key in index]
    return {"count": count, "index": index}
+#calculates statistics for  numerical input
def numstats(data, datapos):
    """Return basic statistics of one numerical data column.

    The column is converted with ``string2numpy(data, datapos)``; the
    result maps ``"average"``, ``"median"`` and ``"std"`` to the values
    computed by the numpy functions of the same purpose, converted with
    ``.tolist()``.

    :param data: list of rows, e.g. as returned by ``read_vamp_csv``
    :param datapos: zero-based index of the data column after column 0
    :return: dict with the keys ``"average"``, ``"median"`` and ``"std"``
    """
    values = string2numpy(data, datapos)

    reducers = (
        ("average", numpy.average),
        ("median", numpy.median),
        ("std", numpy.std),
    )
    summary = {}
    for label, reducer in reducers:
        summary[label] = reducer(values).tolist()
    return summary
def featurefilesinpath(path):
    """List all feature files found below ``path``.

    The directory tree is traversed with ``os.walk``; every file whose name
    ends with one of the module-level ``ext`` suffixes is reported with its
    joined path, backslashes replaced by forward slashes.

    :param path: root directory of the traversal
    :return: list of file paths in traversal order
    """
    found = []
    for folder, _subfolders, names in os.walk(path):
        found.extend(
            os.path.join(folder, name).replace('\\', '/')
            for name in names
            if name.endswith(ext)
        )
    return found
+# convert to numpy
# Leading numeric part of a cell such as "12.5Hz", "-3 dB" or "1e-5 s".
_NUMERIC_PREFIX = re.compile(r"\s*[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")


def _cell2float(cell):
    """Convert one cell to float, ignoring a trailing verbatim unit."""
    try:
        return float(cell)
    except ValueError:
        prefix = _NUMERIC_PREFIX.match(cell)
        if prefix is None:
            # nothing numeric at the start of the cell: keep the error
            raise
        return float(prefix.group(0))


def string2numpy(data, datapos):
    """Extract column ``datapos + 1`` of ``data`` as a numpy float array.

    The whole table is first converted in one step. If that fails (for
    instance because a cell carries a verbatim unit such as ``"440 Hz"``,
    or because the table is empty), the requested column is converted cell
    by cell, keeping only the leading numeric part of each cell.

    The previous implementation cut the cell one character *before* the
    first letter, which dropped the last digit whenever the unit followed
    the number without a space ("12.5Hz" became 12.0).

    :param data: list of rows; column 0 is skipped
    :param datapos: zero-based index of the data column after column 0
    :return: one-dimensional numpy array of dtype float
    :raises ValueError: if a cell does not start with a number
    :raises IndexError: if a row has no column ``datapos + 1``
    """
    column = datapos + 1
    try:
        return numpy.array(data, dtype=float)[:, column]
    except (ValueError, TypeError, IndexError):
        # Non-numeric cells, ragged rows or an empty table: fall back to
        # converting only the requested column, one cell at a time.
        pass

    return numpy.array([_cell2float(row[column]) for row in data],
                       dtype=float)
+# main entry point
# main entry point
if __name__ == "__main__":
    # Single-argument print() behaves identically on Python 2 and 3.
    print("Usage: vampstats datapos nbins file1/dir1 file2/dir2 ....")
    print("datapos: column of data after timecode to process")
    print("nbins: -1 for categorical data, otherwise number of bins for histogram")

    # datapos and nbins are mandatory; stop after the usage text instead of
    # failing with an IndexError traceback.
    if len(sys.argv) < 3:
        sys.exit(1)

    datapos = int(sys.argv[1])
    nbins = int(sys.argv[2])

    # check and collate files
    files = []
    for path in sys.argv[3:]:
        if os.path.isdir(path):
            files.extend(featurefilesinpath(path))
        elif os.path.isfile(path):
            # append, not extend: extend() would add every single character
            # of the path string as a separate "file".
            files.append(path)
    print("Number of files now loading: " + str(len(files)))

    # we collate all data first and then count.
    # @todo: read all files and create dictionary first for large tasks
    data = []
    for filename in files:
        print(filename)
        data.extend(read_vamp_csv(filename, datapos))

    # sys.getsizeof is shallow: it reports the size of the outer list only.
    print("Total data size in memory: " + str(sys.getsizeof(data)))

    # now get the histogram for all data
    histo = histogram(data, datapos, nbins)
    print(histo)
    print("Please input a description for the histogram analysis features")
    c2j.data2json(histo)

    # further numerical analysis if this is not categorical data
    if nbins != -1:
        ns = numstats(data, datapos)
        print(ns)
        print("Please input a description for the general statistics features")
        c2j.data2json(ns)

Mercurial > hg > dml-open-backendtools

comparison collection_analysis/tools/vampstats.py @ 0:e34cf1b6fe09 tip