view pyspark/csvParser.py @ 0:e34cf1b6fe09 tip

commit
author Daniel Wolff
date Sat, 20 Feb 2016 18:14:24 +0100
parents
children
line wrap: on
line source
# Part of DML (Digital Music Laboratory)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

# -*- coding: utf-8 -*-
__author__="Daniel Wolff"

import codecs
import warnings
import numpy
import csv
from n3Parser import uri2path


# Reads a csv file into a table (a list of rows).
# The first column, containing "time", is converted to float; the rest are left as strings.
# Data formats are for example:
# for silvet pitch output: ['time','duration','pitch','velocity','label']
# for qm_vamp_key_standard output: ['time','keynr','label']
# for qm_vamp_key_standard_tonic output: ['time','keynr','label']
#
# The data can be nicely traversed, e.g. for silvet pitch output:
#   for time, duration, pitch, velocity, label in get_array_from_csv(input_f_file): ...
def get_array_from_csv(input_f_file):
    """Read a csv file into a list of rows whose first entry is a float.

    The column count is taken from the first non-empty row. Every row with
    at least that many fields is kept as ``[float(row[0])] + row[1:]``;
    shorter rows (including blank lines) are skipped and counted, and a
    single warning reports how many were ignored.

    Args:
        input_f_file: location of the csv file; it is passed through
            ``uri2path`` before being opened.

    Returns:
        A list of lists: the first element of each inner list is a float,
        the remaining elements are the unmodified strings from the file.

    Raises:
        ValueError: if the first field of a kept row cannot be converted
            to float.
    """
    # local import: only needed to pick the right file mode for csv.reader
    import sys

    output = []
    badcount = 0

    # number of columns, fixed by the first non-empty row
    ncols = 0

    path = uri2path(input_f_file)
    if sys.version_info[0] >= 3:
        # Python 3: csv.reader needs text; newline='' lets it handle line ends
        csvfile = open(path, 'r', newline='')
    else:
        # Python 2: csv.reader expects a file opened in binary mode
        csvfile = open(path, 'rb')

    with csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            # blank lines yield an empty row; count them as incomplete
            # instead of failing on row[0]
            if not row:
                badcount += 1
                continue

            if ncols == 0:
                ncols = len(row)

            if len(row) >= ncols:
                # we assume format time , ...
                output.append([float(row[0])] + row[1:])
            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output





# Converts csv input to a list of dictionaries (one per row) with entries named as in "columtype".
#
# The first value (time) is assumed to be float.
# For silvet pitch output call
#    get_dict_from_csv(input_f_file, columtype = ['time','duration','pitch','velocity','label'])
# For qm_vamp_key_standard output call
#    get_dict_from_csv(input_f_file, columtype = ['time','keynr','label'])
# For qm_vamp_key_standard_tonic output call
#    get_dict_from_csv(input_f_file, columtype = ['time','keynr','label'])
def get_dict_from_csv(input_f_file, columtype=None):
    """Read a csv file into a list of dictionaries, one per row.

    The column count is taken from the first non-empty row. If fewer names
    than columns are given, a warning is issued and the missing names are
    generated as ``'data<k>'`` where ``k`` is the 1-based column position.
    Rows whose field count differs from the column count (including blank
    lines) are skipped and counted, and a single warning reports how many
    were ignored.

    Args:
        input_f_file: location of the csv file; it is passed through
            ``uri2path`` before being opened.
        columtype: list of column names used as dictionary keys. Defaults
            to ``['time']``. The list passed in is never modified.

    Returns:
        A list of dicts mapping column name to value. The value of the
        first column is a float; all other values are the unmodified
        strings from the file.

    Raises:
        ValueError: if the first field of a kept row cannot be converted
            to float.
        IndexError: if more column names are supplied than the file has
            columns.
    """
    # local import: only needed to pick the right file mode for csv.reader
    import sys

    # Work on a private copy: extending the argument (or a shared default
    # list) would leak generated names into later calls and into the
    # caller's own list.
    columns = ['time'] if columtype is None else list(columtype)

    output = []
    badcount = 0

    # number of columns, fixed by the first non-empty row
    ncols = 0

    path = uri2path(input_f_file)
    if sys.version_info[0] >= 3:
        # Python 3: csv.reader needs text; newline='' lets it handle line ends
        csvfile = open(path, 'r', newline='')
    else:
        # Python 2: csv.reader expects a file opened in binary mode
        csvfile = open(path, 'rb')

    with csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            # blank lines yield an empty row; count them as incomplete
            # instead of failing on row[0]
            if not row:
                badcount += 1
                continue

            # initialise the column names from the first usable row
            if ncols == 0:
                ncols = len(row)

                # get number of descriptors, and append if left empty
                ncoldescr = len(columns)
                if ncoldescr < ncols:
                    warnings.warn("Column types missing")
                    columns.extend(['data' + str(i) for i in range(ncoldescr + 1, ncols + 1)])

            if len(row) == ncols:
                # parse the csv data into dict
                rowdict = dict()
                for i, col in enumerate(columns):
                    # first value (time) is transformed to float
                    if i == 0:
                        rowdict[col] = float(row[i])
                    else:
                        rowdict[col] = row[i]

                # append dictionary to output
                output.append(rowdict)
            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output