view python/simscene.py @ 44:b7b1672b3c3b

Reading and writing of files is now done by soundfile, since there seems to be a bug when writing .wav files with librosa (mplayer would play them as rubbish). Added soundfile as a requirement.
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Mon, 09 Oct 2017 11:55:03 +0100
parents f30d2066eebb
children 771dde08349a
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# For licensing please see: LICENSE
# Copyright (c) Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
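
"""
simscene.py: generates synthetic acoustic scenes by mixing background and
event samples according to score files that describe when, how often, and
how loudly each sound should appear. The options mirror Lagrange et al.'s
SimScene.
"""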

# Argparse
import argparse

# Logging
import logging

# Pandas
import pandas as pd

# Numpy
import numpy as np

# Glob
import glob
import random

# Librosa
import librosa
import librosa.display

# PySoundfile
import soundfile as sf

# Matplotlib
import matplotlib.pyplot as plt

# Tabulate
from tabulate import tabulate


def _D(t, sr=44100):
    """
    Helper function: Converts time to samples
    """
    return int(t*sr)


def compute_energy(x):
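    """Return the RMS energy of the signal x."""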
    return np.sqrt(np.mean(x**2))


def timedict_to_dataframe(timedict):
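    """Flatten a {filename: [(start_time, end_time), ...]} dict into a DataFrame."""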
    return pd.DataFrame([(key, val[0], val[1]) for key in timedict for val in timedict[key]],
                        columns=('filename', 'start_time', 'end_time'))


def render_pattern(fname, input_path, sr=44100):
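    """
    Render a pattern file into a single audio array.

    Each row of the pattern references either another pattern file or a .wav
    sample under input_path; the sample is scaled, optionally padded, faded
    in/out and placed at its start time. Returns (audio, sample_rate,
    timesdict), where timesdict maps each used filename to a list of
    (start_time, end_time) pairs.
    """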
    pattern = read_pattern_file(fname)

    # Store starting and end times in the format
    # {'filename': (start_time, end_time)}

    timesdict = {}

    start_times_samples = []
    end_times_samples = []
    durations_samples = []
    wav_files = []

    # Start/end times coming from nested patterns ({filename: [(start, end), ...]})
    pattern_timedict = {}

    for n in range(len(pattern)):
        # Try loading the file,
        sampleid = pattern['sampleid'].loc[n]
        candidates = []
        for pattern_format in ['xls', 'json', 'txt', 'csv']:
            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, sampleid, pattern_format))

        if len(candidates) == 0:
            candidates = glob.glob('{}/event/{}*.wav'.format(input_path, sampleid))
            chosen_fname = random.sample(candidates, 1)[0]

            wav, SR = sf.read(chosen_fname)
            # Plain audio sample: there are no nested pattern times to merge
            pattern_timedict = {}
        else:
            chosen_fname = random.sample(candidates, 1)[0]

            logging.debug('Loading {}'.format(chosen_fname))
            wav, SR, pattern_timedict = render_pattern(chosen_fname, input_path)

        # For each sound in the pattern file, place it starting from starttime + an offset
        # with a mean value of 0 and standard deviation of offset_stddev. The first event can
        # not start earlier than time 0. If endtime is defined (not nan), then cut the event at
        # end time.

        # Read and assign an amplitude
        amplitude_mean = float(pattern['amplitude'].loc[n])
        amplitude_stddev = float(pattern['amplitude_stdev'].loc[n])
        amplitude = amplitude_mean + np.random.randn()*amplitude_stddev
        wav *= amplitude
        
        start_time = max(float(pattern['start_time'].loc[n]), 0)
        start_time_samples = int(start_time*SR)

        fade_in_time = float(pattern['fade_in_time'].loc[n])
        fade_out_time = float(pattern['fade_out_time'].loc[n])
        end_time = float(pattern['end_time'].loc[n])

        # If end_time is not defined (-1 or just empty)
        # then just derive it from the length of the sample 
        if np.isnan(end_time) or float(end_time) == -1:
            duration_samples = len(wav)            
            end_time_samples = start_time_samples + duration_samples
        elif end_time - start_time > len(wav)/float(SR):
            
            # If given end_time is more than start_time + duration of sample
            # then pad the file with zeros to reach the desired end time.
            duration = end_time - start_time
            duration_samples = int(duration*SR)
            end_time_samples = start_time_samples + duration_samples

            # Calculate end time in seconds
            end_time = end_time_samples/float(SR)

            wav_arr = np.zeros(duration_samples)
            wav_arr[:len(wav)] = wav
            wav = wav_arr
        else:
            duration = end_time - start_time
            duration_samples = int(duration*SR)
            end_time_samples = start_time_samples + duration_samples

        event_render = fade(wav[:duration_samples], fade_in_time, fade_out_time)
        
        start_times_samples.append(start_time_samples)
        end_times_samples.append(end_time_samples)
        durations_samples.append(duration_samples)
        wav_files.append(event_render)

        if chosen_fname in timesdict:
            timesdict[chosen_fname].append((start_time, end_time))
        else:
            timesdict[chosen_fname] = [(start_time, end_time)]

        for pt in pattern_timedict:
            if pt in timesdict:
                timesdict[pt] += pattern_timedict[pt]
            else:
                timesdict[pt] = pattern_timedict[pt]

    # The pattern lasts until the latest event end time
    pattern_duration = max(end_times_samples)
    pattern_arr = np.zeros(pattern_duration)

    for n, s in enumerate(start_times_samples):
        wav = wav_files[n]
        pattern_arr[s:s+len(wav)] = wav

    return pattern_arr, 44100, timesdict


def read_events_file(fname):
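    """Read an event score from an .xls, .json, .txt, or .csv file into a DataFrame."""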
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)    
    elif fname[-3:].lower() in ['txt']:           
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)                  
            df.columns = ['label',
                          'sampleid',
                          'ebr',
                          'ebr_stddev',
                          'mean_time_between_instances',
                          'time_between_instances_stddev',
                          'start_time',
                          'end_time',
                          'fade_in_time',
                          'fade_out_time']
    elif fname[-3:].lower() == 'csv':
        df = pd.read_csv(fname)

    logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
    return df


def read_pattern_file(fname):
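    """Read a pattern score from an .xls, .json, .txt, or .csv file into a DataFrame."""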
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)    
    elif fname[-3:].lower() in ['txt']:           
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)        
            # Column names must match those accessed in render_pattern()
            df.columns = ['sampleid',
                          'start_time',
                          'end_time',
                          'time_offset_stdev',
                          'fade_in_time',
                          'fade_out_time',
                          'amplitude',
                          'amplitude_stdev']
    elif fname[-3:].lower() == 'csv':
        df = pd.read_csv(fname)

    logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
    return df


def read_backgrounds_file(fname):
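    """Read a background score from an .xls, .json, .txt, or .csv file into a DataFrame."""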
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)    
    elif fname[-3:].lower() in ['txt']:           
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)        
            df.columns = ['label', 'sampleid', 'snr']
    elif fname[-3:].lower() == 'csv':
        df = pd.read_csv(fname)

    logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
    return df


def read_annotations_file(fname):
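    """Read an annotations file with (start, stop, class) rows into a DataFrame."""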
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt', 'csv']:
                                
        with open(fname) as f:
            header = f.readline()
        
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            if sep in header:
                logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
                df = pd.read_csv(f, header=None, sep=sep)
                df.columns = ['start', 'stop', 'class']                        
            else:
                df = pd.read_csv(f, sep=sep)
                df.columns = ['start', 'stop', 'class']

    logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))                   
    return df


def run_demo():
    print("TODO: Implement run_demo()")


def fade(x, fade_in, fade_out, sr=44100):
    """
    Creates a fade-in-fade-out envelope
    for audio array x.
    """

    if len(x) == 0:
        return x
    
    # Clamp the ramp lengths so they never exceed the signal length
    fade_in_samples = min(int(fade_in*sr), len(x))
    fade_out_samples = min(int(fade_out*sr), len(x))
    
    outp = np.ones_like(x)
    for n in range(fade_in_samples):
        outp[n] = n*1./fade_in_samples
        
    for n in range(fade_out_samples):
        outp[len(outp)-fade_out_samples+n] = 1-1./fade_out_samples*n
    return outp*x


def simscene(input_path,
             output_path,
             scene_duration,
             score_events,
             score_backgrounds,
             **kwargs):
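    """
    Synthesize an acoustic scene of scene_duration seconds by mixing the
    background tracks described in score_backgrounds with the event tracks
    described in score_events, using samples found under input_path.
    Writes the mixed scene (or per-track files), optional figures and a CSV
    of event offsets to output_path. Remaining options (channel_mode,
    end_cut, figure_verbosity, image_format, append_to_filename, ...) are
    passed as keyword arguments.
    """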
    logging.warning('simscene() is not yet implemented fully')
    SR = 44100  # Samplerate. Should probably not be hardcoded
    
    events_df = score_events
    backgrounds_df = score_backgrounds

    # Store starting and ending times in the format
    # {'filename': [(start_time, end_time), (start_time, end_time), ...]}
    timedict = {}

    # Create empty numpy array
    scene_arr = np.zeros(int(scene_duration*SR))

    append_to_filename = kwargs.get('append_to_filename', None)
    end_cut = kwargs.get('end_cut', False)
    figure_verbosity = kwargs.get('figure_verbosity', 0)
    image_format = kwargs.get('image_format', 'png')

    # Channel mode determines whether per-track .wav files are written;
    # read it from kwargs so simscene() does not rely on a module-level global.
    channel_mode = kwargs.get('channel_mode', 'mono')
    
    # Stores the starting and ending times of every track for visualization
    # purposes
    scene_starting_times = []
    scene_ending_times = []

    # List of tracks
    track_list = []
    background_energies = []

    for n in range(len(backgrounds_df)):
        # Get label of background
        label = str(backgrounds_df['label'].loc[n])

        # First check if there are any pattern candidates. Give priorities
        # To pattern files.
        candidates = []

        # List of pattern start and end times
        pattern_timedict = []

        for pattern_format in ['xls', 'json', 'txt', 'csv']:
            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path,
                                                               backgrounds_df['sampleid'].loc[n],
                                                               pattern_format))

        if len(candidates) == 0:
            # If no patterns are found, search for normal audio files
            candidates = glob.glob('{}/background/{}*.wav'.format(input_path, backgrounds_df['sampleid'].loc[n]))           
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = sf.read(chosen_fname)
        else:
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr, pattern_timedict = render_pattern(chosen_fname, input_path)

        duration = len(wav)/float(SR)
        target_snr_db = float(backgrounds_df['snr'].loc[n])
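        # Convert the requested SNR from dB to a linear amplitude ratio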
        target_snr = 10**(target_snr_db/20.0)
        
        energy = compute_energy(wav)

        logging.debug('{}:energy:{}'.format(label, energy))

        if n == 0:
            # For the first background track, the requested snr is used
            # directly as a scaling factor (i.e. to attenuate the track).
            amplitude_factor = target_snr
            wav *= amplitude_factor
        
        if n > 0:
            noise_energy = compute_energy(np.sum(np.array(track_list), axis=0))
            logging.info('{}:noise_energy:{}'.format(label, noise_energy))

            old_snr = energy/noise_energy
            old_snr_db = 20*np.log10(old_snr)
            logging.info('{}:old_snr:{}'.format(label, old_snr_db))
            
            amplitude_factor = target_snr/old_snr

            wav *= amplitude_factor
            new_energy = compute_energy(wav)
            new_snr = new_energy/noise_energy
            new_snr_db = 20. * np.log10(new_snr)
            logging.info('{}:new_snr:{}'.format(label, new_snr_db))

        # Track array
        track_arr = np.zeros(int(scene_duration*SR))
        start_times = [0.0]
        end_times = [start_times[-1]+len(wav)/float(SR)]

        # Start with the first time in the list
        new_start_time = start_times[-1]
        new_end_time = end_times[-1]

        if chosen_fname in timedict:
            timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
        else:
            timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]

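        # Tile the background sample end-to-end until the scene duration is covered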
        while new_start_time < scene_duration:
            offset = duration
            new_start_time += offset

            # If already exceeded scene, break
            if new_start_time >= scene_duration:
                break

            new_end_time += offset

            start_times.append(new_start_time)
            end_times.append(new_end_time)

            # Update timesdict noting where each filename starts and stops
            if chosen_fname in timedict:
                timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
            else:
                timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]

            # Also update the times from the patterns
            for pt in pattern_timedict:
                pattern_timedict[pt] = [(s[0] + new_start_time, s[1] + new_start_time) for s in
                                        pattern_timedict[pt]]

                if pt in timedict:
                    timedict[pt] += pattern_timedict[pt]
                else:
                    timedict[pt] = pattern_timedict[pt]

            # And add those to the timedict dictionary

        for t in start_times:
            # We need to be careful with the limits here
            # since numpy will just ignore indexing that 
            # exceeds

            # Fading times in case we need to join many
            # consecutive samples together.
            # if n == 0:
            #     # Little fade-out, fade-in to smoothly repeat the
            #     # background.
            #     fade_in_time = 0.0
            #     fade_out_time = 0.01
            # elif n > 0 and n < len(start_times) - 1:
            #     fade_in_time = 0.01
            #     fade_out_time = 0.01
            # else:
            #     fade_in_time = 0.01
            #     fade_out_time = 0.0
            begin = min(_D(t), len(track_arr))
            end = min(len(track_arr), _D(t) + len(wav))

            # Part of the wav to store
            # part = fade(wav[:end-begin],fade_in_time,fade_out_time)
            part = wav[:end-begin]
            track_arr[begin:end] += part

        track_list.append(track_arr)
        scene_arr[:len(track_arr)] += track_arr

        if channel_mode == 'separate':
            # Write with soundfile rather than librosa (librosa's .wav writer
            # produced unplayable files; see the commit message above)
            sf.write('{}/{}_background_track.wav'.format(output_path, label), track_arr, SR)

        F = librosa.stft(track_arr, 1024)
        energy_prof = librosa.feature.rmse(S=F)
        background_energies.append(energy_prof)
            
        if figure_verbosity > 0:
            plt.figure()
            plt.subplot(3, 1, 1)
            plt.title('`{}\' background waveform and spectrogram'.format(label))
            librosa.display.waveplot(track_arr, sr=SR)

            # Plot spectrogram 
            Fdb = librosa.amplitude_to_db(F)
            plt.subplot(3, 1, 2)
            librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')

            # Plot energy profile
            plt.subplot(3, 1, 3)
            time = np.linspace(0,  len(track_arr)/SR, len(energy_prof.T))
            plt.semilogy(time, energy_prof.T)
            plt.xlim([0, len(track_arr)/SR])
            plt.ylabel('energy (rms)')

            # Tidy up and save to file
            plt.tight_layout()
            if append_to_filename:                
                plt.savefig('{}/background_{}_{}.{}'.format(output_path,
                                                            label,
                                                            append_to_filename,
                                                            image_format),
                            dpi=300)
            else:
                plt.savefig('{}/background_{}.{}'.format(output_path,
                                                         label,
                                                         image_format),
                            dpi=300)

    # Compute total energy of background
    if len(backgrounds_df) > 0:
        background_arr = np.sum(track_list, 0)
        B = librosa.stft(background_arr, 1024)
        background_energy = librosa.feature.rmse(S=B).flatten()
    else:
        background_energy = 0.0

    for n in range(len(events_df)):
        # Get label of track
        label = str(events_df['label'].loc[n])

        # First check if there are any pattern candidates. Give priorities
        # To pattern files.
        candidates = []

        # List of pattern start and end times
        pattern_timedict = []

        for pattern_format in ['xls', 'json', 'txt', 'csv']:
            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path,
                                                               events_df['sampleid'].loc[n],
                                                               pattern_format))

        if len(candidates) == 0:
            # If no patterns are found, search for normal audio files
            candidates = glob.glob('{}/event/{}*.wav'.format(input_path, events_df['sampleid'].loc[n]))           
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = sf.read(chosen_fname)
        else:
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr, pattern_timedict = render_pattern(chosen_fname, input_path)

        logging.debug(chosen_fname)
        # Apply a fader envelope
        fade_in_time = float(events_df['fade_in_time'].loc[n])
        fade_out_time = float(events_df['fade_out_time'].loc[n])
        wav = fade(wav, fade_in_time, fade_out_time)

        # Set target EBR
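        # The EBR and its standard deviation are given in dB; jitter is added
        # in the dB domain and the result converted to a linear amplitude ratio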
        target_ebr = 10**(float(events_df['ebr'].loc[n])/20.0 +
                          np.random.randn()*float(events_df['ebr_stddev'].loc[n])/20.0)

        # Mean time between instances \mu.  
        mean_time_between_instances = events_df['mean_time_between_instances'].loc[n]
        track_end_time = events_df['end_time'].loc[n]
        
        # Track array
        track_arr = np.zeros(int(scene_duration*SR))
        
        # If \mu is -1, then play the event only once.
        if mean_time_between_instances == -1:
            # Clip the write region to the scene length so the addition
            # cannot overrun the track array
            begin = min(_D(events_df['start_time'].loc[n]), len(track_arr))
            end = min(len(track_arr), begin + len(wav))
            track_arr[begin:end] += wav[:end-begin]
            start_times = [float(events_df['start_time'].loc[n])]
            end_times = [float(events_df['end_time'].loc[n])]

            new_start_time = start_times[-1]
            new_end_time = end_times[-1]

            if chosen_fname in timedict:
                timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
            else:
                timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]

            for pt in pattern_timedict:
                pattern_timedict[pt] = [(s[0] + new_start_time, s[1] + new_start_time) for s in
                                        pattern_timedict[pt]]

                if pt in timedict:
                    timedict[pt] += pattern_timedict[pt]
                else:
                    timedict[pt] = pattern_timedict[pt]

        else:
            # If 0, then start next sample after this one (set it to the duration of the sample)
            if mean_time_between_instances == 0:
                mean_time_between_instances = len(wav)/float(SR)
                
            # Store the successive starting and ending times of the events (given e.g. the model)
            # in the following lists.
            start_times = [events_df['start_time'].loc[n]]
            end_times = [start_times[-1]+len(wav)/float(SR)]

            # Start with the first time in the list
            new_start_time = start_times[-1]
            new_end_time = end_times[-1]

            if chosen_fname in timedict:
                timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
            else:
                timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]

            # Keep adding instances until the track's end time is reached
            while new_start_time < track_end_time:
                offset = float(mean_time_between_instances) +\
                            float(events_df['time_between_instances_stddev'].loc[n]*np.random.randn())
                new_start_time += offset

                # If already exceeded scene, break
                if new_start_time >= scene_duration:
                    break

                new_end_time += offset
                
                # Only exception is if we have set the 'end_cut' flag 
                # and the end time of the event surpasses the end time 
                # of the track
                if end_cut and new_end_time > track_end_time:
                    break
                else:
                    start_times.append(new_start_time)
                    end_times.append(new_end_time)

                    if chosen_fname in timedict:
                        timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
                    else:
                        timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]

                    # Also update the times from the patterns
                    for pt in pattern_timedict:
                        pattern_timedict[pt] = [(s[0] + new_start_time, s[1] + new_start_time) for s in
                                                pattern_timedict[pt]]

                        if pt in timedict:
                            timedict[pt] += pattern_timedict[pt]
                        else:
                            timedict[pt] = pattern_timedict[pt]

            for t in start_times:
                # We need to be careful with the limits here
                # since numpy will just ignore indexing that 
                # exceeds the size of the array
                begin = min(_D(t), len(track_arr))
                end = min(len(track_arr), _D(t) + len(wav))

                # Part of the wav to store
                part = wav[:end-begin]

                # If wav file was concatenated, fade out
                # quickly to avoid clicks
                if len(wav) > len(part) > fade_out_time*SR:
                    part = fade(part, 0, fade_out_time)
                    
                track_arr[begin:end] += part

        track_list.append(track_arr)
        scene_arr[:len(track_arr)] += track_arr

        # Compute energies
        F = librosa.stft(track_arr, 1024)
        energy_prof = librosa.feature.rmse(S=F).flatten()

        # Compute current ebr
        
        if len(backgrounds_df) > 0:
            ebr_prof = energy_prof/background_energy[:len(energy_prof)].flatten()
            curr_ebr = np.max(ebr_prof)
            logging.debug('{}:Target ebr: {}db'.format(label,
                                                       20*np.log10(target_ebr)))
            logging.debug('{}:Current track ebr: {}db'.format(label,
                                                              20*np.log10(curr_ebr)))
        
            # Set correct ebr
            track_arr = track_arr/curr_ebr*target_ebr

            Fnew = librosa.stft(track_arr, 1024)
            new_energy_prof = librosa.feature.rmse(S=Fnew).flatten()
            new_ebr_prof = new_energy_prof/background_energy[:len(energy_prof)].flatten()
            new_ebr = np.max(new_ebr_prof)
            logging.debug('{}:New track ebr: {}db'.format(label, 20*np.log10(new_ebr)))

        if channel_mode == 'separate':
            # Peak-normalize (using the absolute maximum) before writing
            sf.write('{}/{}_event_track.wav'.format(output_path, label),
                     track_arr/np.max(np.abs(track_arr)),
                     SR)
            
        if figure_verbosity > 0:
            plt.figure()

            plt.subplot(3, 1, 1)
            plt.title('`{}\' event waveform and spectrogram'.format(label))                

            librosa.display.waveplot(track_arr, sr=SR)
            Fdb = librosa.amplitude_to_db(F)
            plt.subplot(3, 1, 2)
            librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')

            # Plot energy profile
            plt.subplot(3, 1, 3)
            time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T))
            plt.semilogy(time, energy_prof.T)
            plt.xlim([0, len(track_arr)/SR])            
            plt.ylabel('energy (rms)')

            plt.tight_layout()
            if append_to_filename:                
                plt.savefig('{}/event_{}_{}.{}'.format(output_path, label, append_to_filename, image_format), dpi=300)
            else:
                plt.savefig('{}/event_{}.{}'.format(output_path, label, image_format), dpi=300)      

        scene_starting_times.append((label, start_times))
        scene_ending_times.append((label, end_times))

    if figure_verbosity > 0:
        plt.figure()
        ax0 = plt.subplot(3, 1, 1)
        plt.title('Synthesized Scene')
        librosa.display.waveplot(scene_arr, sr=SR)
        F = librosa.stft(scene_arr)
        Fdb = librosa.amplitude_to_db(F)
        ax1 = plt.subplot(3, 1, 2)
        librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
        ax2 = plt.subplot(3, 1, 3)
        ax2.set_xlim([0, scene_duration])

        # Get labels
        labels = [s[0] for s in scene_starting_times]

        # If background is active
        if len(backgrounds_df) > 0:
            labels.append('background')

        # Set y axis limit. With a padding of 0.5.
        ax2.set_ylim([-0.5, len(labels)-0.5])            

        plt.yticks(range(len(labels)), labels)

        for n in range(len(scene_starting_times)):
            start_times = scene_starting_times[n][1]
            end_times = scene_ending_times[n][1]
            color = ['r', 'g', 'y'][n % 3]

            for m in range(len(start_times)):
                plt.hlines(y=float(n), xmin=start_times[m], xmax=end_times[m], alpha=0.5, color=color, linewidth=4)
                if figure_verbosity > 2:
                    ax0.axvline(start_times[m], color=color, alpha=0.1)
                    ax0.axvline(end_times[m], color=color, alpha=0.1)
                    ax0.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
                    ax1.axvline(start_times[m], color=color, alpha=0.1)
                    ax1.axvline(end_times[m], color=color, alpha=0.1)
                    ax1.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
                    ax2.axvline(start_times[m], color=color, alpha=0.1)
                    ax2.axvline(end_times[m], color=color, alpha=0.1)
                    ax2.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)

        if len(backgrounds_df) > 0:
            plt.axhline(y=len(scene_starting_times), alpha=0.5, color='k', linewidth=4)

        plt.tight_layout()

        if append_to_filename:                
            plt.savefig('{}/scene_{}.{}'.format(output_path, append_to_filename, image_format), dpi=300)
        else:
            plt.savefig('{}/scene.{}'.format(output_path, image_format), dpi=300)

    timedict_df = timedict_to_dataframe(timedict)
    logging.debug(timedict_df)

    if append_to_filename:
        timedict_df.to_csv('{}/scene_{}_offsets.csv'.format(output_path, append_to_filename))
    else:
        timedict_df.to_csv('{}/scene_offsets.csv'.format(output_path))

    if figure_verbosity > 1:
        plt.show()

    # Replace nans (i.e. because of division-by-zero) of the scene with zeros.
    scene_arr = np.nan_to_num(scene_arr)
        
    if channel_mode == 'mono':
        if append_to_filename:
            sf.write('{}/scene_{}.wav'.format(output_path, append_to_filename), scene_arr, SR)
        else:
            sf.write('{}/scene.wav'.format(output_path), scene_arr, SR)

    # Print timesdict

    return scene_arr


def not_implemented():
    logging.info("TODO: not implemented")


if __name__ == "__main__":
    """
    Main function, parses options and calls the simscene generation function
    or a demo. The options given are almost identical to Lagrange et al's 
    simscene.
    """
    argparser = argparse.ArgumentParser(
            description="SimScene.py acoustic scene generator",
    )
    argparser.add_argument(
        'input_path',
        type=str,
        help="Path of a directory containing wave files for sound backgrounds"
             "(in the `background' sub-directory) or events (in `event')"
    )

    input_path = '.'

    argparser.add_argument(
        'output_path',
        type=str,
        help="The directory the generated scenes and annotations will reside."
    )

    output_path = '.'

    argparser.add_argument(
        'scene_duration',
        type=float,
        help="Duration of scene in seconds",
    )
    scene_duration = None
    
    argparser.add_argument(
        '-e', '--score-events',
        type=str,
        help="Score events file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
    )
    score_events = None
    
    argparser.add_argument(
        '-b', '--score-backgrounds',
        type=str,
        help="Score backgrounds file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
    )
    score_backgrounds = None

    argparser.add_argument(
        '--tag',
        type=str,
        help="Append _TAG_XXX to filenames, where XXX is an increment."
    )
    tag = None

    argparser.add_argument(
        '-N',
        type=int,
        help="Generate N instances of the scene. If not specified only generate a single instance. Note that if N > 1, "
             "then the verbosity must be less or equal to 1"
    )
    generate_n = 1

    argparser.add_argument(
        '-t', '--time-mode',
        type=str,
        help="Mode of spacing between events. `generate': values must be set for each track in the score files. "
             "`abstract': values are computed from an abstract representation of an existing acoustic scene. "
             "`replicate': values are replicated from an existing acousting scene. (NOT IMPLEMENTED)",
        choices=['generate', 'abstract', 'replicate']
    )
    time_mode = 'generate'
    
    argparser.add_argument(
        '-R', '--ebr-mode',
        type=str,
        help="Mode for Event to Background power level ratio. `generate': values must be set for each track in the "
             "score files. `abstract': values are computed from an abstract representation of an existing acoustic "
             "scene. `replicate': values are replicated from an existing acousting scene. (NOT IMPLEMENTED)",
        choices=['generate', 'abstract', 'replicate']
    )
    ebr_mode = 'generate'
    
    argparser.add_argument(
        '-A', '--annotation-file',
        type=str,
        help="If -R or -t are selected, the times or EBRs are sourced from ANNOTATION_FILE. "
             "ANNOTATION_FILE must be a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file. "
             "(NOT IMPLEMENTED)"
    )
    annotation_file = None
    
    argparser.add_argument(
        '-a', '--audio-file',
        type=str,
        help="If -R or -t are selected, the times or EBRs are sourced "
             "from AUDIO_FILE. AUDIO_FILE must be a 44100Hz .wav file. (NOT IMPLEMENTED)"
    )
    audio_file = None
    
    argparser.add_argument(
        '-v', '--figure-verbosity', action='count',
        help="Increase figure verbosity. (Default) 0 - Don't save or display figures, 1 - Save pictures but do not "
             "display them, 2 - Save and display figures, 3 - Add shades over the events in the final plot"
    )
    figure_verbosity = 0

    argparser.add_argument(
        '-x', '--image-format',
        help="Image format for the figures",
        choices=['png', 'jpg', 'pdf']
    )
    image_format = 'png'    
    
    argparser.add_argument(
        '-C', '--channel-mode',
        type=str,
        help="number of audio channels contained in file. (Default) 'mono' - 1 channel (mono), 'separate' - Same as "
             "'classes', each channel is saved in a separate .wav file.",
        choices=['mono', 'separate']
    )
    channel_mode = 'mono'
    
    argparser.add_argument(
        '-c', '--end-cut',
        action='store_true',
        help="If the last sample ends after the scene ends then: if enabled, cut the sample to duration, "
             "else remove the sample."
    )
    end_cut = None

    argparser.add_argument(
        '-L', '--logging-level',
        type=str,
        help="Set lowest logging level",
        choices=['debug', 'warning', 'info']
    )

    args = argparser.parse_args()

    if args.logging_level:
        if args.logging_level == 'debug':
            logging.basicConfig(level=logging.DEBUG)
        elif args.logging_level == 'info':
            logging.basicConfig(level=logging.INFO)
        elif args.logging_level == 'warning':
            logging.basicConfig(level=logging.WARNING)
    else:
        logging.basicConfig(level=logging.INFO)

    if args.input_path:
        input_path = args.input_path
        logging.debug("Using `{}' as input path".format(input_path))
    if args.output_path:
        output_path = args.output_path
        logging.debug("Saving to `{}'".format(output_path))
    if args.scene_duration:
        if not (args.score_backgrounds or args.score_events):
            print("You must provide one of -e or -b")
        else:
            if args.image_format:
                image_format = args.image_format
            if args.channel_mode:
                channel_mode = args.channel_mode
            if args.ebr_mode:
                ebr_mode = args.ebr_mode
                if ebr_mode not in ['generate']:
                    logging.warning("`{}' not yet implemented for EBR_MODE, using default.".format(ebr_mode))
                    ebr_mode = 'generate'
            if args.time_mode:
                time_mode = args.time_mode
                if time_mode not in ['generate']:
                    logging.warning("`{}' not yet implemented for TIME_MODE, using default.".format(time_mode))
                    time_mode = 'generate'
            if args.annotation_file:
                annotations = read_annotations_file(args.annotation_file)
                
            scene_duration = float(args.scene_duration)
            
            if args.score_backgrounds:
                score_backgrounds = read_backgrounds_file(args.score_backgrounds)
            else:
                score_backgrounds = []
                
            if args.score_events:
                score_events = read_events_file(args.score_events)
            else:
                score_events = []
                
            if args.figure_verbosity:
                figure_verbosity = args.figure_verbosity

            if args.N:
                generate_n = args.N

            if args.tag:
                tag = args.tag

            if generate_n == 1:
                append_to_filename = None
                simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
                         time_mode=time_mode,
                         ebr_mode=ebr_mode,
                         channel_mode=channel_mode,
                         annotation_file=annotation_file,
                         audio_file=audio_file,
                         figure_verbosity=figure_verbosity,
                         end_cut=end_cut,
                         image_format=image_format,
                         append_to_filename=append_to_filename)
            else:
                for n in range(generate_n):
                    if tag:
                        append_to_filename = '{}_{}'.format(tag, n)
                    else:
                        append_to_filename = '{}'.format(n)

                    logging.info("Generating scene {}".format(n))
                        
                    simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
                             time_mode=time_mode,
                             ebr_mode=ebr_mode,
                             channel_mode=channel_mode,
                             annotation_file=annotation_file,
                             audio_file=audio_file,
                             figure_verbosity=min(figure_verbosity, 1),
                             end_cut=end_cut,
                             image_format=image_format,
                             append_to_filename=append_to_filename)