diff python/simscene.py @ 35:5d19c2254677

added simscene.py with the accompanying input files to generate acoustic scenes using python
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Thu, 05 Oct 2017 14:53:15 +0100
parents
children a0eb120940b1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/python/simscene.py	Thu Oct 05 14:53:15 2017 +0100
@@ -0,0 +1,867 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# For licensing please see: LICENSE
+# Copyright (c) Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
+
+# Argparse
+import argparse
+
+# Logging
+import logging
+
+# Pandas
+import pandas as pd
+
+# Numpy
+import numpy as np
+import sys
+
+# Glob
+import glob
+import random
+
+# Librosa
+import librosa
+import librosa.display
+import librosa.output
+
+# Matplotlib
+from matplotlib import rc
+# rc('text', usetex=True)
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from cycler import cycler
+
+# Tabulate
+from tabulate import tabulate
+
+def _N(t, sr=44100):
+    """
+    Helper function: Converts time to samples
+    """
+    return int(t*sr)
+
+def compute_energy(x):
+    """
+    Root-mean-square (RMS) level of the signal x.
+    """
+    return np.sqrt(np.mean(x**2))
+
+# def compute_energy_profile(x, w=1000):
+#     # Resize/Window signal
+#     #x = np.resize(x, (w,int(np.ceil(float(len(x)/w)))))
+#     x = np.array([[ii+jj for jj in range(w)] for ii in range(len(x)-w)])    
+#     return np.sqrt(np.mean(x**2, 1))
+
+def render_pattern(fname, input_path, sr=44100):
+    pattern = read_pattern_file(fname)
+
+    start_times_samples = []
+    end_times_samples = []
+    durations_samples = []
+    wav_files = []
+    
+    for n in range(len(pattern)):
+        # Try loading the file,
+        sampleid = pattern['sampleid'].loc[n]
+        candidates = glob.glob('{}/event/{}*wav'.format(input_path,sampleid))
+        chosen_fname = random.sample(candidates, 1)[0]
+
+        logging.debug('Loading {}'.format(chosen_fname))
+        
+        # For each sound in the pattern file, place it starting at start_time plus a random
+        # offset with mean 0 and standard deviation time_offset_stdev. An event cannot
+        # start earlier than time 0. If end_time is defined (not nan), cut the event at
+        # end_time.
+        wav, SR = librosa.load(chosen_fname, sr=sr)
+
+        # Read and assign an amplitude
+        amplitude_mean = float(pattern['amplitude'].loc[n])
+        amplitude_stddev = float(pattern['amplitude_stdev'].loc[n])
+        amplitude = amplitude_mean + np.random.randn()*amplitude_stddev
+        wav *= amplitude
+        
+        # Apply the random time offset described above, treating a missing
+        # stdev as 0 so events without one keep their nominal start time.
+        time_offset_stdev = float(pattern['time_offset_stdev'].loc[n])
+        if np.isnan(time_offset_stdev):
+            time_offset_stdev = 0.0
+        start_time = max(float(pattern['start_time'].loc[n]) + np.random.randn()*time_offset_stdev, 0)
+        start_time_samples = int(start_time*SR)
+
+        fade_in_time = float(pattern['fade_in_time'].loc[n])
+        fade_out_time = float(pattern['fade_out_time'].loc[n])
+        end_time = float(pattern['end_time'].loc[n])
+
+        # If end_time is not defined (-1 or just empty)
+        # then just derive it from the length of the sample 
+        if np.isnan(end_time) or float(end_time) == -1:
+            duration_samples = len(wav)            
+            end_time_samples = start_time_samples + duration_samples
+        elif end_time - start_time > len(wav)/float(SR):
+            
+            # If given end_time is more than start_time + duration of sample
+            # then pad the file with zeros to reach the desired end time.
+            duration = end_time - start_time
+            duration_samples = int(duration*SR)
+            end_time_samples = start_time_samples + duration_samples
+            wav_arr = np.zeros(duration_samples)
+            wav_arr[:len(wav)] = wav
+            wav = wav_arr
+        else:
+            duration = end_time - start_time
+            duration_samples = int(duration*SR)
+            end_time_samples = start_time_samples + duration_samples
+
+        event_render = fade(wav[:duration_samples], fade_in_time, fade_out_time)
+        
+        start_times_samples.append(start_time_samples)
+        end_times_samples.append(end_time_samples)
+        durations_samples.append(duration_samples)
+        wav_files.append(event_render)
+
+    # The rendered pattern ends at the latest event end (events are not
+    # necessarily ordered by end time in the pattern file).
+    pattern_duration = max(end_times_samples)
+    pattern_arr = np.zeros(pattern_duration)
+
+    for n, s in enumerate(start_times_samples):
+        wav = wav_files[n]
+        # Mix overlapping events instead of overwriting them.
+        pattern_arr[s:s+len(wav)] += wav
+
+    return pattern_arr, sr
+
+def read_events_file(fname):
+    if fname[-3:].lower() == 'xls':
+        df = pd.read_excel(fname)
+    elif fname[-4:].lower() == 'json':
+        df = pd.read_json(fname)    
+    elif fname[-3:].lower() in ['txt']:           
+        with open(fname) as f:
+            s = f.readline()
+            f.seek(0,0)
+            if ',' in s:
+                sep = ','
+            elif '\t' in s:
+                sep = '\t'
+            else:
+                sep = ' '
+            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
+            df = pd.read_csv(f, header=None, sep=sep)                  
+            df.columns = ['label','sampleid','ebr','ebr_stddev','mean_time_between_instances','time_between_instances_stddev','start_time','end_time','fade_in_time','fade_out_time']
+    elif fname[-3:].lower() in ['csv']:
+        df = pd.read_csv(fname)
+
+    logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))                   
+    return df
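+
+# Purely illustrative sketch of an events score in the headerless .txt form
+# parsed above (column order as assigned in read_events_file); the labels and
+# sample ids are hypothetical and not shipped with this changeset:
+#
+#   dog,dog_bark,0,0,-1,0,1.0,-1,0.01,0.01
+#   steps,footsteps,-3,1,2.0,0.5,0.0,10.0,0.01,0.01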
+
+def read_pattern_file(fname):
+    if fname[-3:].lower() == 'xls':
+        df = pd.read_excel(fname)
+    elif fname[-4:].lower() == 'json':
+        df = pd.read_json(fname)    
+    elif fname[-3:].lower() in ['txt']:           
+        with open(fname) as f:
+            s = f.readline()
+            f.seek(0,0)
+            if ',' in s:
+                sep = ','
+            elif '\t' in s:
+                sep = '\t'
+            else:
+                sep = ' '
+            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
+            df = pd.read_csv(f, header=None, sep=sep)        
+            df.columns = ['eventid','start_time','end_time','time_offset_stdev','fade_in_time', 'fade_out_time', 'amplitude', 'amplitude_stdev']
+    elif fname[-3:].lower() in ['csv']:
+        df = pd.read_csv(fname)
+            
+    logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))                   
+    return df
+    
+def read_backgrounds_file(fname):
+    if fname[-3:].lower() == 'xls':
+        df = pd.read_excel(fname)
+    elif fname[-4:].lower() == 'json':
+        df = pd.read_json(fname)    
+    elif fname[-3:].lower() in ['txt']:           
+        with open(fname) as f:
+            s = f.readline()
+            f.seek(0,0)
+            if ',' in s:
+                sep = ','
+            elif '\t' in s:
+                sep = '\t'
+            else:
+                sep = ' '
+            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
+            df = pd.read_csv(f, header=None, sep=sep)        
+            df.columns = ['label','sampleid','snr']
+    elif fname[-3:].lower() in ['csv']:
+        df = pd.read_csv(fname)
+            
+    logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))                   
+    return df
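+
+# Likewise, a hypothetical backgrounds score in headerless .txt form
+# (columns: label, sampleid, snr in dB):
+#
+#   park,park_ambience,0
+#   crowd,crowd_noise,-6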
+
+def read_annotations_file(fname):
+    if fname[-3:].lower() == 'xls':
+        df = pd.read_excel(fname)
+    elif fname[-4:].lower() == 'json':
+        df = pd.read_json(fname)
+    elif fname[-3:].lower() in ['txt', 'csv']:
+                                
+        with open(fname) as f:
+            header = f.readline()
+        
+            s = f.readline()
+            f.seek(0,0)
+            if ',' in s:
+                sep = ','
+            elif '\t' in s:
+                sep = '\t'
+            else:
+                sep = ' '
+            if sep in header:
+                logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
+                df = pd.read_csv(f, header=None, sep=sep)
+                df.columns = ['start', 'stop', 'class']                        
+            else:
+                df = pd.read_csv(f, sep=sep)
+                df.columns = ['start', 'stop', 'class']
+
+    logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))                   
+    return df
+
+def run_demo():
+    print("TODO: Implement run_demo()")
+
+def fade(x, fade_in, fade_out, sr=44100):
+    """
+    Creates a fade-in-fade-out envelope
+    for audio array x.
+    """
+
+    if len(x) == 0:
+        return x
+    
+    fade_in_samples = int(fade_in*sr)
+    fade_out_samples = int(fade_out*sr)
+    
+    outp = np.ones_like(x)
+    for n in range(fade_in_samples):
+        outp[n] = n*1./fade_in_samples
+        
+    for n in range(fade_out_samples):
+        outp[len(outp)-fade_out_samples+n] = 1-1./fade_out_samples*n
+    return outp*x
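+
+# Illustrative usage, not part of the original pipeline: assuming sr=44100,
+# fade(x, 0.01, 0.5) ramps the first 441 samples of x linearly from 0 to 1
+# and the last 22050 samples linearly from 1 down to 0.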
+
+def simscene(input_path,
+             output_path,
+             scene_duration,
+             score_events,
+             score_backgrounds,
+             **kwargs):
+    logging.info('simscene() is not yet fully implemented')
+    SR = 44100 # Samplerate. Should probably not be hardcoded
+    
+    events_df = score_events
+    backgrounds_df = score_backgrounds
+    
+    # Create empty numpy array
+    scene_arr = np.zeros(int(scene_duration*SR))
+
+    # Optional keyword arguments with defaults.
+    append_to_filename = kwargs.get('append_to_filename', None)
+    end_cut = kwargs.get('end_cut', False)
+    figure_verbosity = kwargs.get('figure_verbosity', 0)
+    image_format = kwargs.get('image_format', 'png')
+    # channel_mode is used below when writing separate per-track files,
+    # so it has to be picked up from kwargs here as well.
+    channel_mode = kwargs.get('channel_mode', 'mono')
+    
+    # Stores the starting and ending times of every track for visualization
+    # purposes
+    scene_starting_times = []
+    scene_ending_times = []
+
+    # List of tracks
+    track_list = []
+    background_energies = []
+    
+    for n in range(len(backgrounds_df)):
+        # Get label of background
+        label = str(backgrounds_df['label'].loc[n])
+
+        # First check whether there are any pattern candidates;
+        # pattern files take priority over plain audio files.
+        candidates = []
+        for pattern_format in ['xls', 'json', 'txt', 'csv']:
+            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, backgrounds_df['sampleid'].loc[n], pattern_format))
+
+        if len(candidates) == 0:
+            # If no patterns are found, search for normal audio files
+            candidates = glob.glob('{}/background/{}*.wav'.format(input_path, backgrounds_df['sampleid'].loc[n]))           
+            chosen_fname = random.sample(candidates, 1)[0]
+            wav, sr = librosa.load(chosen_fname, sr=SR)
+        else:
+            chosen_fname = random.sample(candidates, 1)[0]
+            wav, sr = render_pattern(chosen_fname, input_path)
+            
+        duration = len(wav)/float(SR)
+        target_snr_db = float(backgrounds_df['snr'].loc[n])
+        target_snr = 10**(target_snr_db/20.0)
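+        # For example, a score value of snr = -6 dB yields
+        # target_snr = 10**(-6/20.) ~= 0.5, i.e. roughly half amplitude.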
+        
+        energy = compute_energy(wav)
+
+        logging.debug('{}:energy:{}'.format(label,energy))
+        
+        
+        if n == 0:
+            # For the first background track there is nothing to compare
+            # against, so the snr value is used directly as an amplitude
+            # scaling factor (i.e. to attenuate or boost the track).
+            amplitude_factor = target_snr
+            wav *= amplitude_factor
+        
+        if n > 0:
+            noise_energy = compute_energy(np.sum(np.array(track_list), axis=0))
+            logging.info('{}:noise_energy:{}'.format(label,noise_energy))
+
+            old_snr = energy/noise_energy
+            old_snr_db = 20*np.log10(old_snr)
+            logging.info('{}:old_snr:{}'.format(label,old_snr_db))
+            
+            amplitude_factor = target_snr/old_snr
+            
+
+            wav *= amplitude_factor
+            new_energy = compute_energy(wav)
+            new_snr = new_energy/noise_energy
+            new_snr_db = 20. * np.log10(new_snr)
+            logging.info('{}:new_snr:{}'.format(label,new_snr_db))
+            
+        
+        # Track array
+        track_arr = np.zeros(int(scene_duration*SR))
+        start_times = [0.0]
+        end_times = [start_times[-1]+len(wav)/float(SR)]
+    
+
+        # Start with the first time in the list
+        new_start_time = start_times[-1]
+        new_end_time = end_times[-1]
+
+        while new_start_time < scene_duration:
+            offset = duration
+            new_start_time += offset
+            new_end_time += offset
+
+            start_times.append(new_start_time)
+            end_times.append(new_end_time)
+
+        for n,t in enumerate(start_times):
+            # We need to be careful with the limits here
+            # since numpy will just ignore indexing that 
+            # exceeds
+
+            # Fading times in case we need to join many
+            # consecutive samples together.
+            # if n == 0:
+            #     # Little fade-out, fade-in to smoothly repeat the
+            #     # background.
+            #     fade_in_time = 0.0
+            #     fade_out_time = 0.01
+            # elif n > 0 and n < len(start_times) - 1:
+            #     fade_in_time = 0.01
+            #     fade_out_time = 0.01
+            # else:
+            #     fade_in_time = 0.01
+            #     fade_out_time = 0.0
+            begin = min(_N(t), len(track_arr))
+            end = min(len(track_arr), _N(t)+len(wav))
+
+            # Part of the wav to store
+            # part = fade(wav[:end-begin],fade_in_time,fade_out_time)
+            part = wav[:end-begin]
+
+
+            track_arr[begin:end] += part
+
+        track_list.append(track_arr)
+        scene_arr[:len(track_arr)] += track_arr
+
+        if channel_mode == 'separate':
+            librosa.output.write_wav('{}/{}_background_track.wav'.format(output_path, label), track_arr, SR)
+
+        F = librosa.stft(track_arr, 1024)
+        energy_prof = librosa.feature.rmse(S=F)
+        background_energies.append(energy_prof)
+            
+        if figure_verbosity > 0:
+            plt.figure()
+            plt.subplot(3, 1, 1)
+            plt.title('`{}\' background waveform and spectrogram'.format(label))
+            librosa.display.waveplot(track_arr,sr=SR)
+
+            # Plot spectrogram 
+            Fdb = librosa.amplitude_to_db(F)
+            plt.subplot(3, 1, 2)
+            librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
+
+            # Plot energy profile
+            plt.subplot(3, 1, 3)
+            time = np.linspace(0,  len(track_arr)/SR, len(energy_prof.T))
+            plt.semilogy(time, energy_prof.T)
+            plt.xlim([0, len(track_arr)/SR])
+            plt.ylabel('energy (rms)')
+            
+            
+            # Tidy up and save to file
+            plt.tight_layout()
+            if append_to_filename:                
+                plt.savefig('{}/background_{}_{}.{}'.format(output_path, label, append_to_filename, image_format), dpi=300)
+            else:
+                plt.savefig('{}/background_{}.{}'.format(output_path, label, image_format), dpi=300)                
+
+    # Compute total energy of background
+    if len(backgrounds_df) > 0:
+        background_arr = np.sum(track_list, 0)
+        B = librosa.stft(background_arr, 1024)
+        background_energy = librosa.feature.rmse(S=B).flatten()
+    else:
+        background_energy = 0.0
+    
+    for n in range(len(events_df)):
+        # Get label of track
+        label = str(events_df['label'].loc[n])
+
+        # First check whether there are any pattern candidates;
+        # pattern files take priority over plain audio files.
+        candidates = []
+        for pattern_format in ['xls', 'json', 'txt', 'csv']:
+            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, events_df['sampleid'].loc[n], pattern_format))
+
+        if len(candidates) == 0:
+            # If no patterns are found, search for normal audio files
+            candidates = glob.glob('{}/event/{}*.wav'.format(input_path, events_df['sampleid'].loc[n]))           
+            chosen_fname = random.sample(candidates, 1)[0]
+            wav, sr = librosa.load(chosen_fname, sr=SR)
+        else:
+            chosen_fname = random.sample(candidates, 1)[0]
+            wav, sr = render_pattern(chosen_fname, input_path)        
+        
+                  
+        # Apply a fader envelope
+        fade_in_time = float(events_df['fade_in_time'].loc[n])
+        fade_out_time = float(events_df['fade_out_time'].loc[n])
+        wav = fade(wav, fade_in_time, fade_out_time)
+
+        # Set target EBR
+        target_ebr = 10**(float(events_df['ebr'].loc[n])/20.0 + np.random.randn()*float(events_df['ebr_stddev'].loc[n])/20.0)
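+        # For example, with ebr = 6 dB and ebr_stddev = 0 this gives
+        # target_ebr = 10**(6/20.) ~= 2.0; the stddev term jitters the
+        # target ratio in the dB domain.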
+
+        # Mean time between instances \mu.  
+        mean_time_between_instances = events_df['mean_time_between_instances'].loc[n]
+        track_end_time = events_df['end_time'].loc[n]
+        
+        # Track array
+        track_arr = np.zeros(int(scene_duration*SR))
+        
+        #If \mu is -1, then play the event only once.
+        if mean_time_between_instances == -1:
+            track_arr[_N(events_df['start_time'].loc[n]):_N(events_df['start_time'].loc[n])+len(wav)] += wav
+            start_times = [float(events_df['start_time'].loc[n])]
+            end_times = [float(events_df['end_time'].loc[n])]
+        else:
+            # If 0, then start next sample after this one (set it to the duration of the sample)
+            if mean_time_between_instances == 0:
+                mean_time_between_instances = len(wav)/float(SR)
+                
+            # Store the successive starting and ending times of the events (given e.g. the model)
+            # in the following lists.
+            start_times = [events_df['start_time'].loc[n]]
+            end_times = [start_times[-1]+len(wav)/float(SR)]
+
+            # Start with the first time in the list
+            new_start_time = start_times[-1]
+            new_end_time = end_times[-1]
+
+            # Until the scene is full
+            while new_start_time < track_end_time:
+                offset = float(mean_time_between_instances) +\
+                            float(events_df['time_between_instances_stddev'].loc[n]*np.random.randn())
+                new_start_time += offset
+                new_end_time += offset
+                
+                # Only exception is if we have set the 'end_cut' flag 
+                # and the end time of the event surpasses the end time 
+                # of the track
+                if end_cut and new_end_time > track_end_time:
+                    break
+                else:
+                    start_times.append(new_start_time)
+                    end_times.append(new_end_time)
+
+            for t in start_times:
+                # We need to be careful with the limits here
+                # since numpy will just ignore indexing that 
+                # exceeds the size of the array
+                begin = min(_N(t), len(track_arr))
+                end = min(len(track_arr), _N(t)+len(wav))
+
+                # Part of the wav to store
+                part = wav[:end-begin]
+
+                # If wav file was concatenated, fade out
+                # quickly to avoid clicks
+                if len(part) < len(wav) and len(part) > fade_out_time*SR:
+                    part = fade(part, 0, fade_out_time)
+                    
+                track_arr[begin:end] += part
+
+        # Compute energies
+        F = librosa.stft(track_arr, 1024)
+        energy_prof = librosa.feature.rmse(S=F).flatten()
+
+        # Compute the current ebr and rescale the track *before* it is mixed
+        # into the scene, so the output reflects the requested
+        # event-to-background ratio.
+        if len(backgrounds_df) > 0:
+            ebr_prof = energy_prof/background_energy[:len(energy_prof)].flatten()
+            curr_ebr = np.max(ebr_prof)
+            logging.debug('{}:Target ebr: {}db'.format(label, 20*np.log10(target_ebr)))
+            logging.debug('{}:Current track ebr: {}db'.format(label, 20*np.log10(curr_ebr)))
+
+            # Set correct ebr
+            track_arr = track_arr/curr_ebr*target_ebr
+
+            Fnew = librosa.stft(track_arr, 1024)
+            new_energy_prof = librosa.feature.rmse(S=Fnew).flatten()
+            new_ebr_prof = new_energy_prof/background_energy[:len(energy_prof)].flatten()
+            new_ebr = np.max(new_ebr_prof)
+            logging.debug('{}:New track ebr: {}db'.format(label, 20*np.log10(new_ebr)))
+
+        track_list.append(track_arr)
+        scene_arr[:len(track_arr)] += track_arr
+        
+        
+
+        if channel_mode == 'separate':
+            librosa.output.write_wav('{}/{}_event_track.wav'.format(output_path, label), track_arr/np.max(np.abs(track_arr)), SR)
+
+
+        
+        
+            
+        if figure_verbosity > 0:
+            plt.figure()
+
+            plt.subplot(3,1,1)
+            plt.title('`{}\' event waveform and spectrogram'.format(label))                
+
+            librosa.display.waveplot(track_arr,sr=SR)
+            Fdb = librosa.amplitude_to_db(F)
+            plt.subplot(3, 1, 2)
+            librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
+
+            # Plot energy profile
+            plt.subplot(3, 1, 3)
+            time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T))
+            plt.semilogy(time, energy_prof.T)
+            plt.xlim([0, len(track_arr)/SR])            
+            plt.ylabel('energy (rms)')
+            
+
+            plt.tight_layout()
+            if append_to_filename:                
+                plt.savefig('{}/event_{}_{}.{}'.format(output_path, label, append_to_filename, image_format), dpi=300)
+            else:
+                plt.savefig('{}/event_{}.{}'.format(output_path, label, image_format), dpi=300)      
+
+            
+
+
+        scene_starting_times.append((label, start_times))
+        scene_ending_times.append((label, end_times))
+
+    if figure_verbosity > 0:
+        plt.figure()
+        ax0 = plt.subplot(3,1,1)
+        plt.title('Synthesized Scene')
+        librosa.display.waveplot(scene_arr, sr=SR)
+        F = librosa.stft(scene_arr)
+        Fdb = librosa.amplitude_to_db(F)
+        ax1 = plt.subplot(3,1,2)
+        librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
+        ax2 = plt.subplot(3,1,3)
+        ax2.set_xlim([0,scene_duration])
+
+        # Get labels
+        labels = [s[0] for s in scene_starting_times]
+
+
+        
+
+        # If background is active
+        if len(backgrounds_df) > 0:
+            labels.append('background')
+
+        # Set y axis limit. With a padding of 0.5.
+        ax2.set_ylim([-0.5, len(labels)-0.5])            
+
+        plt.yticks(range(len(labels)), labels)
+
+        for n in range(len(scene_starting_times)):
+            label = scene_starting_times[n][0]
+            start_times = scene_starting_times[n][1]
+            end_times = scene_ending_times[n][1]
+            color = ['r', 'g', 'y'][n % 3]
+
+            for m in range(len(start_times)):
+                plt.hlines(y=float(n), xmin=start_times[m], xmax=end_times[m], alpha=0.5, color=color, linewidth=4)
+                if figure_verbosity > 2:
+                    ax0.axvline(start_times[m], color=color, alpha=0.1)
+                    ax0.axvline(end_times[m], color=color, alpha=0.1)
+                    ax0.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
+                    ax1.axvline(start_times[m], color=color, alpha=0.1)
+                    ax1.axvline(end_times[m], color=color, alpha=0.1)
+                    ax1.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
+                    ax2.axvline(start_times[m], color=color, alpha=0.1)
+                    ax2.axvline(end_times[m], color=color, alpha=0.1)
+                    ax2.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
+
+        if len(backgrounds_df) > 0:
+            plt.axhline(y=len(scene_starting_times), alpha=0.5, color='k', linewidth=4)
+
+        plt.tight_layout()
+
+        if append_to_filename:                
+            plt.savefig('{}/scene_{}.{}'.format(output_path, append_to_filename, image_format), dpi=300)
+        else:
+            plt.savefig('{}/scene.{}'.format(output_path, image_format), dpi=300)
+            
+    if figure_verbosity > 1:
+        plt.show()
+
+    # Replace nans (i.e. because of division-by-zero) of the scene with zeros.
+    scene_arr = np.nan_to_num(scene_arr)
+        
+    if channel_mode == 'mono':
+        if append_to_filename:
+            librosa.output.write_wav('{}/scene_{}.wav'.format(output_path, append_to_filename), scene_arr, SR)
+        else:
+            librosa.output.write_wav('{}/scene.wav'.format(output_path), scene_arr, SR)
+        
+
+    return scene_arr
+    
+    
+             
+def not_implemented():
+    print("TODO: not implemented")
+    
+if __name__=="__main__":
+    """
+    Main function, parses options and calls the simscene generation function
+    or a demo. The options given are almost identical to Lagrange et al.'s 
+    simscene.
+    """
+    argparser = argparse.ArgumentParser(
+            description="SimScene.py acoustic scene generator",
+    )
+    argparser.add_argument(
+        'input_path',
+        type=str,
+        help="Path of a directory containing wave files for sound backgrounds (in the `background' sub-directory) or events (in `event')"
+    )
+    argparser.add_argument(
+        'output_path',
+        type=str,
+        help="The directory the generated scenes and annotations will reside."
+    )    
+    argparser.add_argument(
+        'scene_duration',
+        type=float,
+        help="Duration of scene in seconds",
+    )
+    scene_duration = None
+    
+    argparser.add_argument(
+        '-e', '--score-events',
+        type=str,
+        help="Score events file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
+    )
+    score_events = None
+    
+    argparser.add_argument(
+        '-b', '--score-backgrounds',
+        type=str,
+        help="Score backgrounds file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
+    )
+    score_backgrounds = None
+
+    argparser.add_argument(
+        '--tag',
+        type=str,
+        help="Append _TAG_XXX to filenames, where XXX is an increment."
+    )
+    tag = None
+
+    argparser.add_argument(
+        '-N',
+        type=int,
+        help="Generate N instances of the scene. If not specified only generate a single instance. Note that if N > 1, then the verbosity must be less or equal to 1"
+    )
+    generate_n = 1
+
+    argparser.add_argument(
+        '-t', '--time-mode',
+        type=str,
+        help="Mode of spacing between events. `generate': values must be set for each track in the score files. `abstract': values are computed from an abstract representation of an existing acoustic scene. `replicate': values are replicated from an existing acousting scene. (NOT IMPLEMENTED)",
+        choices=['generate', 'abstract', 'replicate']
+    )
+    time_mode = 'generate'
+    
+    argparser.add_argument(
+        '-R', '--ebr-mode',
+        type=str,
+        help="Mode for Event to Background power level ratio. `generate': values must be set for each track in the score files. `abstract': values are computed from an abstract representation of an existing acoustic scene. `replicate': values are replicated from an existing acousting scene. (NOT IMPLEMENTED)",
+        choices=['generate', 'abstract', 'replicate']
+    )
+    ebr_mode = 'generate'
+    
+    argparser.add_argument(
+        '-A', '--annotation-file',
+        type=str,
+        help="If -R or -t are selected, source the times or EBRs from ANNOTATION_FILE. ANNOTATION_FILE must be a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file. (NOT IMPLEMENTED)"
+    )
+    annotation_file = None
+    
+    argparser.add_argument(
+        '-a', '--audio-file',
+        type=str,
+        help="If -R or -t are selected, source the times or EBRs from AUDIO_FILE. AUDIO_FILE must be a 44100 Hz .wav file. (NOT IMPLEMENTED)"
+    )
+    audio_file = None
+    
+    argparser.add_argument(
+        '-v', '--figure-verbosity', action='count',
+        help="Increase figure verbosity. (Default) 0 - Don't save or display figures, 1 - Save pictures but do not display them, 2 - Save and display figures, 3 - Add shades over the events in the final plot"
+    )
+    figure_verbosity = 0
+
+    argparser.add_argument(
+        '-x', '--image-format',
+        help="Image format for the figures",
+        choices=['png', 'jpg', 'pdf']
+    )
+    image_format = 'png'    
+    
+    argparser.add_argument(
+        '-C', '--channel-mode',
+        type=str,
+        help="number of audio channels contained in file. (Default) 'mono' - 1 channel (mono), 'separate' - Same as 'classes', each channel is saved in a separate .wav file.",
+        choices=['mono', 'separate']
+    )
+    channel_mode = 'mono'
+    
+    # argparser.add_argument(
+    #     '-m', '--min-space',
+    #     type=float,
+    #     help="Minimum space allowed between successive events (seconds). If -1, then allow overlapping between events."
+    # )
+    min_space = -1
+    
+    argparser.add_argument(
+        '-c', '--end-cut',
+        action='store_true',
+        help="If the last sample ends after the scene ends then: if enabled, cut the sample to duration, else remove the sample."
+    )
+    end_cut = None
+    
+    logging.basicConfig(level=logging.DEBUG)
+    
+    args = argparser.parse_args()
+    if args.input_path:
+        input_path = args.input_path
+        logging.debug("Using `{}' as input path".format(input_path))
+    if args.output_path:
+        output_path = args.output_path
+        logging.debug("Saving to `{}'".format(output_path))
+    if args.scene_duration:
+        if not (args.score_backgrounds or args.score_events):
+            print("You must provide one of -e or -b")
+        else:
+            if args.image_format:
+                image_format = args.image_format
+            if args.channel_mode:
+                channel_mode = args.channel_mode
+            if args.ebr_mode:
+                ebr_mode = args.ebr_mode
+                if ebr_mode not in ['generate']:
+                    logging.warning("`{}' not yet implemented for EBR_MODE, using default.".format(ebr_mode))
+                    ebr_mode = 'generate'
+            if args.time_mode:
+                time_mode = args.time_mode
+                if time_mode not in ['generate']:
+                    logging.warning("`{}' not yet implemented for TIME_MODE, using default.".format(time_mode))
+                    time_mode = 'generate'
+            if args.annotation_file:
+                annotation_file = args.annotation_file
+                annotations = read_annotations_file(annotation_file)
+                
+            scene_duration = float(args.scene_duration)
+            
+            if args.score_backgrounds:
+                score_backgrounds = read_backgrounds_file(args.score_backgrounds)
+            else:
+                score_backgrounds = []
+                
+            if args.score_events:
+                score_events = read_events_file(args.score_events)
+            else:
+                score_events = []
+                
+            if args.figure_verbosity:
+                figure_verbosity = args.figure_verbosity
+
+            if args.N:
+                generate_n = args.N
+
+            if args.tag:
+                tag = args.tag
+
+            if generate_n == 1:
+                append_to_filename = None
+                simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
+                         time_mode=time_mode,
+                         ebr_mode=ebr_mode,
+                         channel_mode=channel_mode,
+                         annotation_file=annotation_file,
+                         audio_file=audio_file,
+                         figure_verbosity=figure_verbosity,
+                         min_space=min_space,
+                         end_cut=end_cut,
+                         image_format=image_format,
+                         append_to_filename=append_to_filename)
+            else:
+                for n in range(generate_n):
+                    if tag:
+                        append_to_filename = '{}_{}'.format(tag, n)
+                    else:
+                        append_to_filename = '{}'.format(n)
+
+                    logging.info("Generating scene {}".format(n))
+                        
+                    simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
+                             time_mode=time_mode,
+                             ebr_mode=ebr_mode,
+                             channel_mode=channel_mode,
+                             annotation_file=annotation_file,
+                             audio_file=audio_file,
+                             figure_verbosity=min(figure_verbosity, 1),
+                             min_space=min_space,
+                             end_cut=end_cut,
+                             image_format=image_format,
+                             append_to_filename=append_to_filename)
+