view annotation_scripts/annotationkit_create_annotation_protocol.py @ 5:b523456082ca tip
Update path to dataset and reflect modified chunk naming convention.
| author | peterf |
| --- | --- |
| date | Mon, 01 Feb 2016 21:35:27 +0000 |
| parents | f079d2de4aa2 |
| children | |
line source
#!/usr/bin/python
#
# annotationkit_create_annotation_protocol.py:
# Prepare annotation protocol CSV from list of audio files
#
# Read list of files from standard input and write CSV to standard output
# See annotationkit_create_annotation_protocol_wrapper.sh for usage example
#
# Author: Peter Foster
# (c) 2015 Peter Foster
#
import fileinput
from scikits.audiolab import Sndfile
from pandas import DataFrame
import numpy as np
import sys

#Maximum number of chunks to sample from each file
nChunksPerFile = np.inf
#Duration of each chunk in seconds
chunkDuration = 4
#Expected sample rate
sampleRate = 48000

AudioChunks = []
np.random.seed(4756)

for audioFile in fileinput.input():
    audioFile = audioFile.strip()
    sf = Sndfile(audioFile, "r")
    if sf.samplerate != sampleRate:
        raise ValueError("wanted sample rate %g - got %g." % (sampleRate, sf.samplerate))
    nChunksInFile = int(sf.nframes / (sf.samplerate * chunkDuration))
    #Sample without replacement random chunks from file
    sampledChunks = np.random.choice(nChunksInFile, min(nChunksInFile, nChunksPerFile), replace=False)
    for chunk in sampledChunks:
        frameStart = chunk * chunkDuration * sf.samplerate
        AudioChunks.append((audioFile, chunk, frameStart))
    sf.close()

#Create DataFrame
AudioChunks = DataFrame(AudioChunks)
AudioChunks.columns = ['audiofile', 'chunk', 'framestart']

sys.stderr.write("Processed " + str(len(AudioChunks)) + " chunks in total, corresponding to "
                 + str(len(AudioChunks) * chunkDuration / float(60)) + " minutes of audio.\n")

#Write to CSV (stdout)
AudioChunks.to_csv(sys.stdout)
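The header comments note that the script reads a list of audio files from standard input and writes the protocol CSV to standard output, with annotationkit_create_annotation_protocol_wrapper.sh as the intended entry point. As a minimal sketch only, the following shows one way the script could be driven and its output inspected from Python; the file list wav_files.txt, the python2 interpreter name, and the script path relative to the repository root are assumptions, not part of the repository, and the wrapper shell script remains the documented way to run it.

# Usage sketch (assumptions: "wav_files.txt" lists one audio file path per line,
# and a Python 2 interpreter with scikits.audiolab, pandas and numpy is
# available as "python2").
import io
import subprocess

import pandas as pd

with open("wav_files.txt", "rb") as file_list:
    # The script consumes file paths on stdin and emits the protocol CSV on stdout.
    result = subprocess.run(
        ["python2", "annotation_scripts/annotationkit_create_annotation_protocol.py"],
        stdin=file_list,
        stdout=subprocess.PIPE,
        check=True,
    )

# The CSV carries a row index plus the columns 'audiofile', 'chunk', 'framestart',
# mirroring the DataFrame column names assigned above; 'framestart' is the sample
# index at which each 4-second chunk begins.
protocol = pd.read_csv(io.BytesIO(result.stdout), index_col=0)
print(protocol.head())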