diff collection_analysis/tools/vampstats_pitch_weighted.py @ 0:e34cf1b6fe09 tip

commit
author Daniel Wolff
date Sat, 20 Feb 2016 18:14:24 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/collection_analysis/tools/vampstats_pitch_weighted.py	Sat Feb 20 18:14:24 2016 +0100
@@ -0,0 +1,172 @@
+# Part of DML (Digital Music Laboratory)
+# Copyright 2014-2015 Daniel Wolff, City University
+ 
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# creates a histogram, weighted by the time and loudness columns, and basic
+# statistics from the csv feature files or folders given on the command line
+__author__="Daniel Wolff, Dan"
+__date__ ="$11-Feb-2015 18:18:47$"
+
+import sys
+import os
+import csv
+import numpy
+import csv2json as c2j
+import re
+
+
+# global feature extensions (a tuple, so it can be passed to str.endswith)
+ext = (".csv",)    # formerly also considered: ".n3", ".mid"
+
+# leading decimal number; raw string, and "\." so the dot is a literal dot
+floater = re.compile(r"((\d+)(\.\d+)*)")
+# Reads a Vamp csv file; returns one list per row: [time(float), data1, ...]
+# with the data fields left as strings. Rows with fewer than datapos + 2
+# fields are skipped and their number is printed.
+def read_vamp_csv(filein = '', datapos = 0):
+    output, badcount = [], 0
+    # the csv module wants a binary file on Python 2, a text file on Python 3
+    with open(filein, 'rb' if str is bytes else 'r') as csvfile:
+        for row in csv.reader(csvfile, delimiter=',', quotechar='"'):
+            if len(row) >= datapos + 2:
+                output.append([float(row[0])] + row[1:])
+            else:
+                badcount += 1
+    print("Ignored " + str(badcount) + " short rows")
+    return output
+
+# Calculates the histogram of data column `datapos` over all rows.
+# data: rows as produced by read_vamp_csv ([time, data1, data2, ...])
+# nbins == -1: categorical data, counts how often each distinct value occurs
+# otherwise:   numeric data, nbins - 1 equal-width bins (index holds the
+#              nbins bin edges); every row is weighted by the product of
+#              its time weight (datapos 2) and loudness weight (datapos 3)
+# returns {"count": [...], "index": [...]}
+def histogram(data, datapos = 1, nbins = -1):
+
+    if nbins == -1:
+        # build histogram on strings; the result lists are created once
+        # after the loop, so empty input gives empty lists instead of an
+        # UnboundLocalError and keys()/values() are not redone per row
+        histo = dict()
+        for row in data:
+            histo[row[datapos+1]] = histo.get(row[datapos+1], 0) + 1
+        index = list(histo.keys())
+        count = list(histo.values())
+    else:
+        # numerical input: convert value and weight columns to numpy
+        ddata = string2numpy(data,datapos)
+        tw_data = string2numpy(data,2)
+        lw_data = string2numpy(data,3)
+        weights = numpy.multiply(tw_data,lw_data)
+
+        count,index = numpy.histogram(ddata,nbins-1, weights=weights)
+        count = count.tolist()
+        index = index.tolist()
+
+    return {"count":count, "index":index}
+    
+# Computes unweighted summary statistics of one numeric data column.
+# data: rows as returned by read_vamp_csv; datapos: 0-based column after time
+# returns {"average": float, "median": float, "std": float}
+def numstats(data,datapos):
+    values = string2numpy(data,datapos)
+
+    # .tolist() turns the numpy scalars into plain Python floats
+    stats = {}
+    stats["average"] = numpy.average(values).tolist()
+    stats["median"] = numpy.median(values).tolist()
+    stats["std"] = numpy.std(values).tolist()
+    return stats
+
+# Recursively collects the feature files below `path`.
+# A file qualifies when its name ends with one of the extensions in the
+# global `ext`. Returns the list of their paths, with every backslash
+# replaced by a forward slash.
+def featurefilesinpath(path):
+    found = []
+    for dirpath, _dirnames, filenames in os.walk(path):
+        matching = [name for name in filenames if name.endswith(ext)]
+        # join with the directory, then swap backslashes for forward slashes
+        found.extend(
+            os.path.join(dirpath, name).replace('\\','/') for name in matching
+        )
+    return found
+
+# Converts data column `datapos` (row[datapos+1]) of all rows to a float array.
+# If the table as a whole is not numeric, each value is parsed on its own; a
+# value float() rejects is cut before its first letter ("220.5Hz" -> 220.5).
+def string2numpy(data,datapos):
+    try:
+        ddata = numpy.array(data, dtype=float)[:, datapos+1]
+    except (ValueError, TypeError, IndexError):
+        edata = []
+        for row in data:
+            text = row[datapos+1]
+            try:
+                edata.append(float(text))
+            except ValueError:
+                m = re.search("[a-zA-Z]", text)
+                edata.append(float(text[:m.start()] if m else text))
+        ddata = numpy.array(edata,dtype=float)
+    return ddata
+
+# main entry point
+if __name__ == "__main__":
+    print("Usage: vampstats datapos nbins file1/dir1 file2/dir2 ....")
+    print("datapos: column of data after timecode to process")
+    print("nbins: -1 for categorical data, otherwise number of bins for histogram")
+    if len(sys.argv) < 4:    # datapos, nbins and at least one path needed
+        sys.exit(1)
+    datapos = int(sys.argv[1])
+    nbins = int(sys.argv[2])
+
+    # check and collate files
+    files = []
+    for path in sys.argv[3:]:
+        if os.path.isdir(path):
+            files.extend(featurefilesinpath(path))
+        elif os.path.isfile(path):
+            files.append(path)    # extend() would add it char by char
+    print("Number of files now loading: " + str(len(files)))
+
+    # we collate all data first and then count.
+    # @todo: read all files and create dictionary first for large tasks
+    data = []
+    for featurefile in files:
+        print(featurefile)
+        data.extend(read_vamp_csv(featurefile, datapos))
+
+    print("Total data size in memory: " + str(sys.getsizeof(data)))  # shallow: outer list only
+
+    # now get the histogram for all data
+    histo = histogram(data,datapos,nbins)
+    print(histo)
+    print("Please input a description for the histogram analysis features")
+    c2j.data2json(histo)
+
+    # further numerical analysis if this is not categorical data
+    if nbins != -1:
+        ns = numstats(data,datapos)
+        print(ns)
+        print("Please input a description for the general statistics features")
+        c2j.data2json(ns)
+        
+        
+