view collection_analysis/tools/vampstats.py @ 0:e34cf1b6fe09 tip

commit
author Daniel Wolff
date Sat, 20 Feb 2016 18:14:24 +0100
parents
children
line wrap: on
line source
# Part of DML (Digital Music Laboratory)
# Copyright 2014-2015 Daniel Wolff, City University
 
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

#!/usr/bin/python
# -*- coding: utf-8 -*-

# creates a histogram from given input files or folder

__author__="Daniel Wolff"
__date__ ="$11-Feb-2015 18:18:47$"

import sys
import os
import csv
import numpy
import csv2json as c2j
import re


# Feature-file extensions that are picked up when a directory is scanned.
# Kept as a tuple because str.endswith() accepts a tuple of alternatives.
# Further candidates that are currently switched off: ".n3", ".mid"
ext = (".csv",)

# Matches a run of digits followed by any number of "<char><digits>" groups.
# Written as a raw string so that "\d" is not treated as a (deprecated)
# string escape; the compiled pattern is unchanged.
# NOTE(review): the "." is unescaped and therefore matches any character,
# not only a decimal point -- confirm whether "\." was intended before
# relying on this pattern.
floater = re.compile(r"((\d+)(.\d+)*)")
def read_vamp_csv(filein = '', datapos = 0):
    """Read a comma-separated feature file into a list of rows.

    filein  -- path of the csv file to read
    datapos -- index of the data column (counted after the time code) that
               will be processed later; rows that are too short to contain
               that column, i.e. with fewer than datapos + 2 fields, are
               skipped

    Returns a list of rows of the form [time(float), data1, data2, ...]:
    the first field is converted to float, all other fields stay strings.
    The number of skipped rows is printed.

    Raises ValueError if the first field of an accepted row is not a number.
    """
    # The csv module expects a binary file object on Python 2 and a text
    # file opened with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        csvfile = open(filein, 'r', newline='')
    else:
        csvfile = open(filein, 'rb')

    output = []
    badcount = 0
    with csvfile:
        for row in csv.reader(csvfile, delimiter=',', quotechar='"'):
            if len(row) >= datapos + 2:
                output.append([float(row[0])] + row[1:])
            else:
                badcount += 1
    # single-argument print(...) behaves the same on Python 2 and 3
    print("Ignored " + str(badcount) + " short rows")
    return output

def histogram(data, datapos=1, nbins=-1):
    """Count how often the values of one data column occur.

    data    -- list of rows of the form [time, data1, data2, ...]
    datapos -- index of the data column after the time code; the values
               are taken from row[datapos + 1]
    nbins   -- -1 treats the column as categorical and counts every
               distinct value; any other value bins the column numerically

    Returns {"count": [...], "index": [...]} with plain Python lists.
    Categorical: "index" holds the distinct values and "count" the matching
    number of occurrences (same order). Numerical: the two lists are the
    result of numpy.histogram for nbins - 1 equal-width bins, so "count"
    has nbins - 1 entries and "index" holds the nbins bin edges.
    """
    if nbins != -1:
        # numerical input: convert the column and let numpy do the binning
        values = string2numpy(data, datapos)
        count, index = numpy.histogram(values, nbins - 1)
        return {"count": count.tolist(), "index": index.tolist()}

    # categorical input: count the occurrences of each symbol
    column = datapos + 1
    occurrences = dict()
    for row in data:
        occurrences[row[column]] = occurrences.get(row[column], 0) + 1

    # The result lists are built once, after the loop: empty input yields
    # empty lists instead of an UnboundLocalError, and the lists are not
    # rebuilt for every row.
    return {"count": list(occurrences.values()),
            "index": list(occurrences.keys())}
    
def numstats(data,datapos):
    """Summarise one numerical data column.

    data    -- list of rows of the form [time, data1, data2, ...]
    datapos -- index of the data column after the time code

    Returns {"average": ..., "median": ..., "std": ...} computed with numpy
    over the column that string2numpy extracts.
    """
    column = string2numpy(data, datapos)

    reducers = (
        ("average", numpy.average),
        ("median", numpy.median),
        ("std", numpy.std),
    )
    summary = {}
    for label, reduce_column in reducers:
        # .tolist() turns the numpy scalar into a plain Python number
        summary[label] = reduce_column(column).tolist()
    return summary

def featurefilesinpath(path):
    """Recursively list all feature files below path.

    Walks the directory tree rooted at path and returns the paths of all
    files whose name ends with one of the extensions in the module-level
    tuple ext. Backslashes in the returned paths are replaced by forward
    slashes.
    """
    found = []
    for root, _subdirs, names in os.walk(path):
        found.extend(
            os.path.join(root, name).replace('\\', '/')
            for name in names
            if name.endswith(ext)
        )
    return found

# A number (optional sign, decimals and exponent) that is directly followed
# by optional whitespace and the first letter of a verbatim unit,
# e.g. "440Hz", "220 Hz" or "1e-3 s". Group 1 is the number itself.
_NUMBER_WITH_UNIT = re.compile(
    r"\s*([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)\s*[a-zA-Z]")


def _cell2float(cell):
    """Convert a single cell to float, dropping a trailing verbatim unit."""
    try:
        # plain numbers, including exponent notation, need no special care
        return float(cell)
    except ValueError:
        unit_match = _NUMBER_WITH_UNIT.match(cell)
    if unit_match is None:
        raise ValueError("could not convert %r to float" % (cell,))
    return float(unit_match.group(1))


def string2numpy(data,datapos):
    """Extract one data column as a numpy float array.

    data    -- list of rows of the form [time, data1, data2, ...]
    datapos -- index of the data column after the time code; the values
               are taken from row[datapos + 1]

    The whole table is converted in one go when every cell is numeric.
    Otherwise only the requested column is converted cell by cell, and a
    unit that follows the number ("440Hz", "220 Hz") is cut off.

    Returns a one-dimensional numpy array of dtype float (empty for empty
    input). Raises ValueError if a cell of the column does not start with
    a number.
    """
    try:
        return numpy.array(data, dtype=float)[:, datapos + 1]
    except (ValueError, TypeError, IndexError):
        # ValueError/TypeError: some cell of the table is not numeric or
        # the rows differ in length; IndexError: the table did not come out
        # two-dimensional (e.g. no rows at all). Fall back to the column.
        pass

    return numpy.array([_cell2float(row[datapos + 1]) for row in data],
                       dtype=float)

# main entry point
# Collects the feature files named on the command line, computes a histogram
# over one data column and, for numerical data, general statistics; both
# results are handed to csv2json.data2json.
if __name__ == "__main__":
    print("Usage: vampstats datapos nbins file1/dir1 file2/dir2 ....")
    print("datapos: column of data after timecode to process")
    print("nbins: -1 for categorical data, otherwise number of bins for histogram")

    # datapos, nbins and at least one path are required; stop after the
    # usage text instead of failing with an IndexError further down.
    if len(sys.argv) < 4:
        sys.exit(1)

    datapos = int(sys.argv[1])
    nbins = int(sys.argv[2])

    # check and collate files; paths that are neither a directory nor a
    # file are skipped
    files = []
    for path in sys.argv[3:]:
        if os.path.isdir(path):
            files.extend(featurefilesinpath(path))
        elif os.path.isfile(path):
            # append, not extend: extending a list with a string would add
            # every single character of the path as a separate "file"
            files.append(path)
    print("Number of files now loading: " + str(len(files)))

    # we collate all data first and then count.
    # @todo: read all files and create dictionary first for large tasks
    data = []
    for filename in files:
        print(filename)
        data.extend(read_vamp_csv(filename, datapos))

    # sys.getsizeof is shallow: this is the size of the outer list only
    print("Total data size in memory: " + str(sys.getsizeof(data)))

    # now get the histogram for all data
    histo = histogram(data, datapos, nbins)
    print(histo)
    print("Please input a description for the histogram analysis features")
    c2j.data2json(histo)

    # further numerical analysis if this is not categorical data
    if nbins != -1:
        ns = numstats(data, datapos)
        print(ns)
        print("Please input a description for the general statistics features")
        c2j.data2json(ns)