Mercurial > hg > dml-open-backendtools
diff collection_analysis/tools/vampstats_pitch_weighted.py @ 0:e34cf1b6fe09 tip
commit
author | Daniel Wolff |
---|---|
date | Sat, 20 Feb 2016 18:14:24 +0100 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Part of DML (Digital Music Laboratory)
# Copyright 2014-2015 Daniel Wolff, City University

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

# Creates a (duration- and loudness-weighted) histogram from the VAMP
# feature CSV files given as input files or folders on the command line.

__author__ = "Daniel Wolff, Dan"
__date__ = "$11-Feb-2015 18:18:47$"

import sys
import os
import csv
import re

import numpy


# global feature extensions accepted by featurefilesinpath()
#ext = tuple([".n3", ".csv", ".mid"])
ext = tuple([".csv"])

# Leading decimal number, e.g. "440.25" in "440.25Hz".
# Fixed: the original pattern used an unescaped "." (any character) and
# was not a raw string.
floater = re.compile(r"((\d+)(\.\d+)*)")


def read_vamp_csv(filein='', datapos=0):
    """Read a VAMP-style CSV file.

    :param filein: path of the CSV file to read
    :param datapos: index of the data column of interest (counted after
        the timecode); rows too short to contain it are skipped
    :return: list of rows of the form [time(float), data1, data2, ...]
    """
    output = []
    badcount = 0
    with open(filein, 'r') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            # a usable row has a timecode plus at least datapos + 1 columns
            if len(row) >= datapos + 2:
                output.append([float(row[0])] + row[1:])
            else:
                badcount += 1
    print("Ignored " + str(badcount) + " short rows")
    return output


def histogram(data, datapos=1, nbins=-1):
    """Build a histogram over one data column.

    For numerical data (nbins != -1) a weighted histogram with nbins - 1
    bins is computed; each value is weighted by the product of the
    columns at data positions 2 and 3 (duration and loudness weights).
    For categorical data (nbins == -1) the symbols are counted directly.

    :param data: list of rows as returned by read_vamp_csv()
    :param datapos: data column to analyse (counted after the timecode)
    :param nbins: number of bin edges, or -1 for categorical counting
    :return: dict {"count": [...], "index": [...]}
    """
    # symbols or numerical input?
    if not nbins == -1:
        # convert the value column to a numpy float array
        ddata = string2numpy(data, datapos)

        # get time weights
        tw_data = string2numpy(data, 2)

        # get loudness weights
        lw_data = string2numpy(data, 3)

        count, index = numpy.histogram(
            ddata, nbins - 1, weights=numpy.multiply(tw_data, lw_data))
        count = count.tolist()
        index = index.tolist()

    # here for strings
    else:
        # build histogram on strings
        histo = dict()
        for row in data:
            histo[row[datapos + 1]] = histo.get(row[datapos + 1], 0) + 1
        # list(...) keeps this working on both Python 2 and 3
        index = list(histo.keys())
        count = list(histo.values())

    # return histogram
    return {"count": count, "index": index}


def numstats(data, datapos):
    """Return average, median and standard deviation of one data column.

    :param data: list of rows as returned by read_vamp_csv()
    :param datapos: data column to analyse (counted after the timecode)
    :return: dict {"average": float, "median": float, "std": float}
    """
    # convert to numpy data
    ddata = string2numpy(data, datapos)

    avg = numpy.average(ddata).tolist()
    med = numpy.median(ddata).tolist()
    std = numpy.std(ddata).tolist()

    # return data
    return {"average": avg, "median": med, "std": std}


def featurefilesinpath(path):
    """Recursively list all feature files below *path*.

    Only files whose name ends with one of the extensions in the global
    ``ext`` tuple are returned; paths are normalised to forward slashes.
    """
    files = []
    for (dirpath, dirnames, filenames) in os.walk(path):
        # renamed loop variable: the original shadowed the builtin `file`
        for filename in filenames:
            if filename.endswith(ext):
                source = os.path.join(dirpath, filename).replace('\\', '/')
                files.append(source)
    return files


def string2numpy(data, datapos):
    """Convert column datapos + 1 of *data* to a numpy float array.

    First tries a direct float conversion of the whole table; if any cell
    is not numeric, falls back to stripping trailing alphabetic unit
    suffixes from the requested column (e.g. "440.25Hz" -> 440.25).
    """
    try:
        ddata = numpy.array(data, dtype=float)[:, datapos + 1]
    except ValueError:
        # narrow except: the original bare `except:` swallowed everything
        edata = []
        for row in data:
            m = re.search("[a-zA-Z]", row[datapos + 1])
            if m is not None:
                # cut the value off where the unit suffix starts.
                # Fixed off-by-one: the original sliced to m.start() - 1,
                # which dropped one digit too many (and kept almost the
                # whole string when the match began at position 0).
                edata.append(row[datapos + 1][:m.start()])
            else:
                # take only the specified column datapos + 1
                edata.append(row[datapos + 1])
        ddata = numpy.array(edata, dtype=float)
    return ddata


# main entry point
if __name__ == "__main__":
    # project-local helper; imported lazily so the functions above stay
    # importable without it
    import csv2json as c2j

    # fixed: the original indexed sys.argv without checking and crashed
    # with IndexError when called without arguments
    if len(sys.argv) < 4:
        print("Usage: vampstats datapos nbins file1/dir1 file2/dir2 ....")
        print("datapos: column of data after timecode to process")
        print("nbins: -1 for categorical data, otherwise number of bins for histogram")
        sys.exit(1)

    datapos = int(sys.argv[1])
    nbins = int(sys.argv[2])

    # check and collate files
    files = []
    for path in sys.argv[3:]:
        if os.path.isdir(path):
            files.extend(featurefilesinpath(path))
        elif os.path.isfile(path):
            # fixed: the original used files.extend(path), which appended
            # the path string one character at a time
            files.append(path)
    print("Number of files now loading: " + str(len(files)))

    # we collate all data first and then count.
    # @todo: read all files and create dictionary first for large tasks
    data = []
    for infile in files:
        print(infile)
        data.extend(read_vamp_csv(infile, datapos))

    # NOTE: sys.getsizeof is shallow — this reports the list object only
    print("Total data size in memory: " + str(sys.getsizeof(data)))

    # now get the histogram for all data
    histo = histogram(data, datapos, nbins)
    print(histo)
    print("Please input a description for the histogram analysis features")
    c2j.data2json(histo)

    # further numerical analysis if this is not categorical data
    if not nbins == -1:
        ns = numstats(data, datapos)
        print(ns)
        print("Please input a description for the general statistics features")
        c2j.data2json(ns)