e@35: #!/bin/python
e@35: # -*- coding: utf-8 -*-
e@35: # For licensing please see: LICENSE
e@35: # Copyright (c) Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
e@35: 
e@35: # Argparse
e@35: import argparse
e@35: 
e@35: # Logging
e@35: import logging
e@35: 
e@35: # Pandas
e@35: import pandas as pd
e@35: 
e@35: # Numpy
e@35: import numpy as np
e@35: 
e@35: # Glob
e@35: import glob
e@35: import random
e@35: 
e@35: # Librosa
e@35: import librosa
e@35: import librosa.display
e@44: 
e@44: # PySoundfile
e@44: import soundfile as sf
e@35: 
e@35: # Matplotlib
e@35: import matplotlib.pyplot as plt
e@35: 
e@35: # Tabulate
e@35: from tabulate import tabulate
e@35: 
e@41: 
e@42: def _D(t, sr=44100):
e@35:     """
e@35:     Helper function: Converts time to samples
e@35:     """
e@35:     return int(t*sr)
e@35: 
e@42: 
e@35: def compute_energy(x):
e@35:     return np.sqrt(np.mean(x**2))
e@35: 
e@35: 
e@41: def timedict_to_dataframe(timedict):
e@47:     print(timedict)
e@46:     return pd.DataFrame([(key, val[0], val[1], val[2]) for key in timedict for val in timedict[key]],
e@46:                         columns=('label', 'filename', 'start_time', 'end_time'))
e@41: 
e@47: def timedict_to_txt(timedict):
e@47:     str_  = ""
e@47:     for key in timedict:
e@47:         for val in timedict[key]:
e@47:             str_ += "{}\t{}\t{}\n".format(float(val[1]), float(val[2]), key)
e@47:     str_ += '\n'
e@47:     return str_
e@42: 
e@35: def render_pattern(fname, input_path, sr=44100):
e@35:     pattern = read_pattern_file(fname)
e@35: 
e@41:     # Store starting and end times in the format
e@41:     # {'filename': (start_time, end_time)}
e@41: 
e@41:     timesdict = {}
e@41: 
e@35:     start_times_samples = []
e@35:     end_times_samples = []
e@35:     durations_samples = []
e@35:     wav_files = []
e@41: 
e@41:     pattern_timedict = []
e@41: 
e@35:     for n in range(len(pattern)):
e@35:         # Try loading the file,
e@35:         sampleid = pattern['sampleid'].loc[n]
e@46:         label = pattern['sampleid'].loc[n]
e@38:         candidates = []
e@38:         for pattern_format in ['xls', 'json', 'txt', 'csv']:
e@38:             candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, sampleid, pattern_format))
e@35: 
e@38:         if len(candidates) == 0:
e@42:             candidates = glob.glob('{}/event/{}*wav'.format(input_path, sampleid))
e@38:             chosen_fname = random.sample(candidates, 1)[0]
e@38:             
e@44:             wav, SR = sf.read(chosen_fname)
e@38:         else:
e@38:             chosen_fname = random.sample(candidates, 1)[0]
e@38: 
e@38:             logging.debug('Loading {}'.format(chosen_fname))
e@41:             wav, SR, pattern_timedict = render_pattern(chosen_fname, input_path)
e@42: 
e@35:         # For each sound in the pattern file, place it starting from starttime + an offset
e@35:         # with a mean value of 0 and standard deviation of offset_stddev. The first event can
e@35:         # not start earlier than time 0. If endtime is defined (not nan), then cut the event at
e@35:         # end time.
e@35: 
e@35:         # Read and assign an amplitude
e@35:         amplitude_mean = float(pattern['amplitude'].loc[n])
e@35:         amplitude_stddev = float(pattern['amplitude_stdev'].loc[n])
e@35:         amplitude = amplitude_mean + np.random.randn()*amplitude_stddev
e@35:         wav *= amplitude
e@35:         
e@42:         start_time = max(float(pattern['start_time'].loc[n]), 0)
e@35:         start_time_samples = int(start_time*SR)
e@35: 
e@35:         fade_in_time = float(pattern['fade_in_time'].loc[n])
e@35:         fade_out_time = float(pattern['fade_out_time'].loc[n])
e@35:         end_time = float(pattern['end_time'].loc[n])
e@35: 
e@35:         # If end_time is not defined (-1 or just empty)
e@35:         # then just derive it from the length of the sample 
e@35:         if np.isnan(end_time) or float(end_time) == -1:
e@35:             duration_samples = len(wav)            
e@35:             end_time_samples = start_time_samples + duration_samples
e@45:             end_time = end_time_samples/float(SR)
e@45: 
e@35:         elif end_time - start_time > len(wav)/float(SR):
e@35:             
e@35:             # If given end_time is more than start_time + duration of sample
e@35:             # then pad the file with zeros to reach the desired end time.
e@35:             duration = end_time - start_time
e@35:             duration_samples = int(duration*SR)
e@35:             end_time_samples = start_time_samples + duration_samples
e@41: 
e@41:             # Calculate end time in seconds
e@41:             end_time = end_time_samples/float(SR)
e@41: 
e@35:             wav_arr = np.zeros(duration_samples)
e@35:             wav_arr[:len(wav)] = wav
e@35:             wav = wav_arr
e@35:         else:
e@35:             duration = end_time - start_time
e@35:             duration_samples = int(duration*SR)
e@35:             end_time_samples = start_time_samples + duration_samples
e@35: 
e@35:         event_render = fade(wav[:duration_samples], fade_in_time, fade_out_time)
e@35:         
e@35:         start_times_samples.append(start_time_samples)
e@35:         end_times_samples.append(end_time_samples)
e@35:         durations_samples.append(duration_samples)
e@35:         wav_files.append(event_render)
e@35: 
e@46:         if label in timesdict:
e@46:             timesdict[label].append((chosen_fname,start_time, end_time))
e@41:         else:
e@46:             timesdict[label] = [(chosen_fname,start_time, end_time)]
e@41: 
e@41:         for pt in pattern_timedict:
e@41:             if pt in timesdict:
e@41:                 timesdict[pt] += pattern_timedict[pt]
e@41:             else:
e@41:                 timesdict[pt] = pattern_timedict[pt]
e@41: 
e@35:     pattern_duration = end_time_samples
e@35:     pattern_arr = np.zeros(pattern_duration)
e@35: 
e@35:     for n, s in enumerate(start_times_samples):
e@35:         wav = wav_files[n]
e@35:         pattern_arr[s:s+len(wav)] = wav
e@41: 
e@41:     return pattern_arr, 44100, timesdict
e@41: 
e@35: 
e@35: def read_events_file(fname):
e@35:     if fname[-3:].lower() == 'xls':
e@35:         df = pd.read_excel(fname)
e@35:     elif fname[-4:].lower() == 'json':
e@35:         df = pd.read_json(fname)    
e@35:     elif fname[-3:].lower() in ['txt']:           
e@35:         with open(fname) as f:
e@35:             s = f.readline()
e@42:             f.seek(0, 0)
e@35:             if ',' in s:
e@35:                 sep = ','
e@35:             elif '\t' in s:
e@35:                 sep = '\t'
e@35:             else:
e@35:                 sep = ' '
e@35:             logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
e@35:             df = pd.read_csv(f, header=None, sep=sep)                  
e@42:             df.columns = ['label',
e@42:                           'sampleid',
e@42:                           'ebr',
e@42:                           'ebr_stddev',
e@42:                           'mean_time_between_instances',
e@42:                           'time_between_instances_stddev',
e@42:                           'start_time',
e@42:                           'end_time',
e@42:                           'fade_in_time',
e@42:                           'fade_out_time']
e@35:     elif fname[-3:].lower() in ['csv']:
e@35:         df = pd.read_json(fname)
e@35: 
e@42:     logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
e@35:     return df
e@35: 
e@41: 
e@35: def read_pattern_file(fname):
e@35:     if fname[-3:].lower() == 'xls':
e@35:         df = pd.read_excel(fname)
e@35:     elif fname[-4:].lower() == 'json':
e@35:         df = pd.read_json(fname)    
e@35:     elif fname[-3:].lower() in ['txt']:           
e@35:         with open(fname) as f:
e@35:             s = f.readline()
e@42:             f.seek(0, 0)
e@35:             if ',' in s:
e@35:                 sep = ','
e@35:             elif '\t' in s:
e@35:                 sep = '\t'
e@35:             else:
e@35:                 sep = ' '
e@35:             logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
e@35:             df = pd.read_csv(f, header=None, sep=sep)        
e@42:             df.columns = ['eventid',
e@42:                           'start_time',
e@42:                           'end_time',
e@42:                           'time_offset_stdev',
e@42:                           'fade_in_time',
e@42:                           'fade_out_time',
e@42:                           'amplitude',
e@42:                           'amplitude_stdev']
e@35:     elif fname[-3:].lower() in ['csv']:
e@35:         df = pd.read_json(fname)            
e@35:             
e@42:     logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
e@35:     return df
e@41: 
e@41: 
e@35: def read_backgrounds_file(fname):
e@35:     if fname[-3:].lower() == 'xls':
e@35:         df = pd.read_excel(fname)
e@35:     elif fname[-4:].lower() == 'json':
e@35:         df = pd.read_json(fname)    
e@35:     elif fname[-3:].lower() in ['txt']:           
e@35:         with open(fname) as f:
e@35:             s = f.readline()
e@42:             f.seek(0, 0)
e@35:             if ',' in s:
e@35:                 sep = ','
e@35:             elif '\t' in s:
e@35:                 sep = '\t'
e@35:             else:
e@35:                 sep = ' '
e@35:             logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
e@35:             df = pd.read_csv(f, header=None, sep=sep)        
e@42:             df.columns = ['label', 'sampleid', 'snr']
e@35:     elif fname[-3:].lower() in ['csv']:
e@35:         df = pd.read_json(fname)            
e@35:             
e@42:     logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
e@35:     return df
e@35: 
e@41: 
e@35: def read_annotations_file(fname):
e@35:     if fname[-3:].lower() == 'xls':
e@35:         df = pd.read_excel(fname)
e@35:     elif fname[-4:].lower() == 'json':
e@35:         df = pd.read_json(fname)
e@35:     elif fname[-3:].lower() in ['txt', 'csv']:
e@35:                                 
e@35:         with open(fname) as f:
e@35:             header = f.readline()
e@35:         
e@35:             s = f.readline()
e@42:             f.seek(0, 0)
e@35:             if ',' in s:
e@35:                 sep = ','
e@35:             elif '\t' in s:
e@35:                 sep = '\t'
e@35:             else:
e@35:                 sep = ' '
e@35:             if sep in header:
e@35:                 logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
e@35:                 df = pd.read_csv(f, header=None, sep=sep)
e@35:                 df.columns = ['start', 'stop', 'class']                        
e@35:             else:
e@41:                 df = pd.read_csv(f, sep=sep)
e@35:                 df.columns = ['start', 'stop', 'class']
e@35:                 df = None
e@35: 
e@35:     logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))                   
e@35:     return df
e@35: 
e@41: 
e@35: def run_demo():
e@35:     print("TODO: Implement run_demo()")
e@35: 
e@41: 
e@35: def fade(x, fade_in, fade_out, sr=44100):
e@35:     """
e@35:     Creates a fade-in-fade-out envelope
e@35:     for audio array x.
e@35:     """
e@35: 
e@35:     if len(x) == 0:
e@35:         return x
e@35:     
e@35:     fade_in_samples = int(fade_in*sr)
e@35:     fade_out_samples = int(fade_out*sr)
e@35:     
e@35:     outp = np.ones_like(x)
e@35:     for n in range(fade_in_samples):
e@35:         outp[n] = n*1./fade_in_samples
e@35:         
e@35:     for n in range(fade_out_samples):
e@35:         outp[len(outp)-fade_out_samples+n] = 1-1./fade_out_samples*n
e@35:     return outp*x
e@35: 
e@41: 
e@35: def simscene(input_path,
e@35:              output_path,
e@35:              scene_duration,
e@35:              score_events,
e@35:              score_backgrounds,
e@35:              **kwargs):
e@47:     logging.warning('BER ratios have not yet been verified')
e@42:     SR = 44100  # Samplerate. Should probably not be hardcoded
e@35:     
e@35:     events_df = score_events
e@35:     backgrounds_df = score_backgrounds
e@41: 
e@41:     # Store starting and ending times in the format
e@41:     # {'filename': [(start_time, end_time), (start_time, end_time), ...]}
e@41:     timedict = {}
e@41: 
e@35:     # Create empty numpy array
e@35:     scene_arr = np.zeros(int(scene_duration*SR))
e@35: 
e@35:     if 'append_to_filename' in kwargs:
e@35:         append_to_filename = kwargs['append_to_filename']
e@35:     else:
e@35:         append_to_filename = None
e@35:     
e@35:     if 'end_cut' in kwargs:
e@35:         end_cut = kwargs['end_cut']
e@35:     else:
e@35:         end_cut = False
e@35:         
e@35:     if 'figure_verbosity' in kwargs:
e@35:         figure_verbosity = kwargs['figure_verbosity']
e@35:     else:
e@35:         figure_verbosity = 0
e@35:         
e@35:     if 'image_format' in kwargs:
e@35:         image_format = kwargs['image_format']
e@35:     else:
e@35:         image_format = 'png'
e@47: 
e@47:     if 'annot_format' in kwargs:
e@47:         annot_format = kwargs['annot_format']
e@47:     else:
e@47:         annot_format = 'sed_eval'
e@51: 
e@51:     if 'full_duration' in kwargs:
e@51:         full_duration = True
e@51:     else:
e@51:         full_duration = False
e@35:     
e@35:     # Stores the starting and ending times of every track for visualization
e@35:     # purposes
e@35:     scene_starting_times = []
e@35:     scene_ending_times = []
e@35: 
e@35:     # List of tracks
e@35:     track_list = []
e@35:     background_energies = []
e@41: 
e@35:     for n in range(len(backgrounds_df)):
e@35:         # Get label of background
e@35:         label = str(backgrounds_df['label'].loc[n])
e@35: 
e@35:         # First check if there are any pattern candidates. Give priorities
e@35:         # To pattern files.
e@35:         candidates = []
e@41: 
e@41:         # List of pattern start and end times
e@41:         pattern_timedict = []
e@41: 
e@35:         for pattern_format in ['xls', 'json', 'txt', 'csv']:
e@42:             candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path,
e@42:                                                                backgrounds_df['sampleid'].loc[n],
e@42:                                                                pattern_format))
e@35: 
e@35:         if len(candidates) == 0:
e@35:             # If no patterns are found, search for normal audio files
e@35:             candidates = glob.glob('{}/background/{}*.wav'.format(input_path, backgrounds_df['sampleid'].loc[n]))           
e@35:             chosen_fname = random.sample(candidates, 1)[0]
e@44:             wav, sr = sf.read(chosen_fname)
e@35:         else:
e@35:             chosen_fname = random.sample(candidates, 1)[0]
e@41:             wav, sr, pattern_timedict = render_pattern(chosen_fname, input_path)
e@41: 
e@35:         duration = len(wav)/float(SR)
e@35:         target_snr_db = float(backgrounds_df['snr'].loc[n])
e@35:         target_snr = 10**(target_snr_db/20.0)
e@35:         
e@35:         energy = compute_energy(wav)
e@35: 
e@41:         logging.debug('{}:energy:{}'.format(label, energy))
e@41: 
e@35:         if n == 0:
e@35:             # For the first background track, snr
e@35:             # gives an amount by which it's going to be scaled (i.e. make it more silent)
e@35:             amplitude_factor = target_snr
e@35:             wav *= amplitude_factor
e@35:         
e@35:         if n > 0:
e@35:             noise_energy = compute_energy(np.sum(np.array(track_list), axis=0))
e@41:             logging.info('{}:noise_energy:{}'.format(label, noise_energy))
e@35: 
e@35:             old_snr = energy/noise_energy
e@35:             old_snr_db = 20*np.log10(old_snr)
e@41:             logging.info('{}:old_snr:{}'.format(label, old_snr_db))
e@35:             
e@35:             amplitude_factor = target_snr/old_snr
e@35: 
e@35:             wav *= amplitude_factor
e@35:             new_energy = compute_energy(wav)
e@35:             new_snr = new_energy/noise_energy
e@35:             new_snr_db = 20. * np.log10(new_snr)
e@41:             logging.info('{}:new_snr:{}'.format(label, new_snr_db))
e@41: 
e@35:         # Track array
e@35:         track_arr = np.zeros(int(scene_duration*SR))
e@35:         start_times = [0.0]
e@35:         end_times = [start_times[-1]+len(wav)/float(SR)]
e@35: 
e@35:         # Start with the first time in the list
e@35:         new_start_time = start_times[-1]
e@35:         new_end_time = end_times[-1]
e@35: 
e@46:         if label in timedict:
e@46:             timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time)))
e@41:         else:
e@46:             timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))]
e@41: 
e@35:         while new_start_time < scene_duration:
e@35:             offset = duration
e@35:             new_start_time += offset
e@41: 
e@41:             # If already exceeded scene, break
e@41:             if new_start_time >= scene_duration:
e@41:                 break
e@41: 
e@35:             new_end_time += offset
e@35: 
e@35:             start_times.append(new_start_time)
e@35:             end_times.append(new_end_time)
e@35: 
e@41:             # Update timesdict noting where each filename starts and stops
e@46:             if label in timedict:
e@46:                 timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time)))
e@41:             else:
e@46:                 timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))]
e@41: 
e@41:             # Also update the times from the patterns
e@41:             for pt in pattern_timedict:
e@46:                 pattern_timedict[pt] = [(s0, s[1] + new_start_time, s[2] + new_start_time) for s in
e@41:                                         pattern_timedict[pt]]
e@41: 
e@41:                 if pt in timedict:
e@41:                     timedict[pt] += pattern_timedict[pt]
e@41:                 else:
e@41:                     timedict[pt] = pattern_timedict[pt]
e@41: 
e@41:             # And add those to the timedict dictionary
e@41: 
e@42:         for t in start_times:
e@35:             # We need to be careful with the limits here
e@35:             # since numpy will just ignore indexing that 
e@35:             # exceeds
e@35: 
e@35:             # Fading times in case we need to join many
e@35:             # consecutive samples together.
e@35:             # if n == 0:
e@35:             #     # Little fade-out, fade-in to smoothly repeat the
e@35:             #     # background.
e@35:             #     fade_in_time = 0.0
e@35:             #     fade_out_time = 0.01
e@35:             # elif n > 0 and n < len(start_times) - 1:
e@35:             #     fade_in_time = 0.01
e@35:             #     fade_out_time = 0.01
e@35:             # else:
e@35:             #     fade_in_time = 0.01
e@35:             #     fade_out_time = 0.0
e@42:             begin = min(_D(t), len(track_arr))
e@42:             end = min(len(track_arr), _D(t) + len(wav))
e@35: 
e@35:             # Part of the wav to store
e@35:             # part = fade(wav[:end-begin],fade_in_time,fade_out_time)
e@35:             part = wav[:end-begin]
e@35:             track_arr[begin:end] += part
e@35: 
e@35:         track_list.append(track_arr)
e@35:         scene_arr[:len(track_arr)] += track_arr
e@35: 
e@35:         if channel_mode == 'separate':
e@35:             librosa.output.write_wav('{}/{}_background_track.wav'.format(output_path, label), track_arr, SR)
e@35: 
e@35:         F = librosa.stft(track_arr, 1024)
e@35:         energy_prof = librosa.feature.rmse(S=F)
e@35:         background_energies.append(energy_prof)
e@35:             
e@35:         if figure_verbosity > 0:
e@35:             plt.figure()
e@35:             plt.subplot(3, 1, 1)
e@35:             plt.title('`{}\' background waveform and spectrogram'.format(label))
e@41:             librosa.display.waveplot(track_arr, sr=SR)
e@35: 
e@35:             # Plot spectrogram 
e@35:             Fdb = librosa.amplitude_to_db(F)
e@35:             plt.subplot(3, 1, 2)
e@35:             librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
e@35: 
e@35:             # Plot energy profile
e@35:             plt.subplot(3, 1, 3)
e@35:             time = np.linspace(0,  len(track_arr)/SR, len(energy_prof.T))
e@35:             plt.semilogy(time, energy_prof.T)
e@35:             plt.xlim([0, len(track_arr)/SR])
e@35:             plt.ylabel('energy (rms)')
e@41: 
e@35:             # Tidy up and save to file
e@35:             plt.tight_layout()
e@35:             if append_to_filename:                
e@42:                 plt.savefig('{}/background_{}_{}.{}'.format(output_path,
e@42:                                                             label,
e@42:                                                             append_to_filename,
e@42:                                                             image_format),
e@42:                             dpi=300)
e@35:             else:
e@42:                 plt.savefig('{}/background_{}.{}'.format(output_path,
e@42:                                                          label,
e@42:                                                          image_format),
e@42:                             dpi=300)
e@35: 
e@35:     # Compute total energy of background
e@35:     if len(backgrounds_df) > 0:
e@35:         background_arr = np.sum(track_list, 0)
e@35:         B = librosa.stft(background_arr, 1024)
e@35:         background_energy = librosa.feature.rmse(S=B).flatten()
e@35:     else:
e@35:         background_energy = 0.0
e@41: 
e@35:     for n in range(len(events_df)):
e@35:         # Get label of track
e@35:         label = str(events_df['label'].loc[n])
e@35: 
e@35:         # First check if there are any pattern candidates. Give priorities
e@35:         # To pattern files.
e@35:         candidates = []
e@41: 
e@41:         # List of pattern start and end times
e@41:         pattern_timedict = []
e@41: 
e@35:         for pattern_format in ['xls', 'json', 'txt', 'csv']:
e@42:             candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path,
e@42:                                                                events_df['sampleid'].loc[n],
e@42:                                                                pattern_format))
e@35: 
e@35:         if len(candidates) == 0:
e@35:             # If no patterns are found, search for normal audio files
e@35:             candidates = glob.glob('{}/event/{}*.wav'.format(input_path, events_df['sampleid'].loc[n]))           
e@35:             chosen_fname = random.sample(candidates, 1)[0]
e@44:             wav, sr = sf.read(chosen_fname)
e@35:         else:
e@35:             chosen_fname = random.sample(candidates, 1)[0]
e@41:             wav, sr, pattern_timedict = render_pattern(chosen_fname, input_path)
e@41: 
e@42:         logging.debug(chosen_fname)
e@35:         # Apply a fader envelope
e@35:         fade_in_time = float(events_df['fade_in_time'].loc[n])
e@35:         fade_out_time = float(events_df['fade_out_time'].loc[n])
e@35:         wav = fade(wav, fade_in_time, fade_out_time)
e@35: 
e@35:         # Set target EBR
e@42:         target_ebr = 10**(float(events_df['ebr'].loc[n])/20.0 +
e@42:                           np.random.randn()*float(events_df['ebr_stddev'].loc[n])/20.0)
e@35: 
e@35:         # Mean time between instances \mu.  
e@35:         mean_time_between_instances = events_df['mean_time_between_instances'].loc[n]
e@35:         track_end_time = events_df['end_time'].loc[n]
e@35:         
e@35:         # Track array
e@35:         track_arr = np.zeros(int(scene_duration*SR))
e@35:         
e@42:         # If \mu is -1, then play the event only once.
e@35:         if mean_time_between_instances == -1:
e@42:             track_arr[_D(events_df['start_time'].loc[n]):_D(events_df['start_time'].loc[n]) + len(wav)] += wav
e@35:             start_times = [float(events_df['start_time'].loc[n])]
e@35:             end_times = [float(events_df['end_time'].loc[n])]
e@41: 
e@41:             new_start_time = start_times[-1]
e@41:             new_end_time = end_times[-1]
e@41: 
e@46:             if label in timedict:
e@46:                 timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time)))
e@42:             else:
e@46:                 timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))]
e@42: 
e@41:             for pt in pattern_timedict:
e@46:                 pattern_timedict[pt] = [(s[0], s[1] + new_start_time, s[2] + new_start_time) for s in
e@41:                                         pattern_timedict[pt]]
e@41: 
e@41:                 if pt in timedict:
e@41:                     timedict[pt] += pattern_timedict[pt]
e@41:                 else:
e@41:                     timedict[pt] = pattern_timedict[pt]
e@41: 
e@35:         else:
e@35:             # If 0, then start next sample after this one (set it to the duration of the sample)
e@35:             if mean_time_between_instances == 0:
e@35:                 mean_time_between_instances = len(wav)/float(SR)
e@51: 
e@51:             # If we are using -fd (full_duration) for each event then mean_time_between_instances denotes time AFTER
e@51:             # the end of the previous event.
e@51:             if full_duration and mean_time_between_instances > 0:
e@51:                 mean_time_between_instances += len(wav)/float(SR)
e@35:                 
e@35:             # Store the successive starting and ending times of the events (given e.g. the model)
e@35:             # in the following lists.
e@35:             start_times = [events_df['start_time'].loc[n]]
e@35:             end_times = [start_times[-1]+len(wav)/float(SR)]
e@35: 
e@35:             # Start with the first time in the list
e@35:             new_start_time = start_times[-1]
e@35:             new_end_time = end_times[-1]
e@35: 
e@41:             if chosen_fname in timedict:
e@46:                 timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time)))
e@41:             else:
e@46:                 timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))]
e@41: 
e@35:             # Until the scene is full
e@35:             while new_start_time < track_end_time:
e@35:                 offset = float(mean_time_between_instances) +\
e@35:                             float(events_df['time_between_instances_stddev'].loc[n]*np.random.randn())
e@35:                 new_start_time += offset
e@41: 
e@41:                 # If already exceeded scene, break
e@41:                 if new_start_time >= scene_duration:
e@41:                     break
e@41: 
e@35:                 new_end_time += offset
e@35:                 
e@35:                 # Only exception is if we have set the 'end_cut' flag 
e@35:                 # and the end time of the event surpasses the end time 
e@35:                 # of the track
e@35:                 if end_cut and new_end_time > track_end_time:
e@35:                     break
e@35:                 else:
e@35:                     start_times.append(new_start_time)
e@35:                     end_times.append(new_end_time)
e@35: 
e@46:                     if label in timedict:
e@46:                         timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time)))
e@41:                     else:
e@46:                         timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))]
e@41: 
e@41:                     # Also update the times from the patterns
e@41:                     for pt in pattern_timedict:
e@48:                         pattern_timedict[pt] = [(s[0], s[1] + new_start_time, s[2] + new_start_time) for s in
e@41:                                                 pattern_timedict[pt]]
e@41: 
e@41:                         if pt in timedict:
e@41:                             timedict[pt] += pattern_timedict[pt]
e@41:                         else:
e@41:                             timedict[pt] = pattern_timedict[pt]
e@41: 
e@35:             for t in start_times:
e@35:                 # We need to be careful with the limits here
e@35:                 # since numpy will just ignore indexing that 
e@35:                 # exceeds the size of the array
e@42:                 begin = min(_D(t), len(track_arr))
e@42:                 end = min(len(track_arr), _D(t) + len(wav))
e@35: 
e@35:                 # Part of the wav to store
e@35:                 part = wav[:end-begin]
e@35: 
e@35:                 # If wav file was concatenated, fade out
e@35:                 # quickly to avoid clicks
e@42:                 if len(wav) > len(part) > fade_out_time*SR:
e@35:                     part = fade(part, 0, fade_out_time)
e@35:                     
e@35:                 track_arr[begin:end] += part
e@35: 
e@35:         track_list.append(track_arr)
e@35:         scene_arr[:len(track_arr)] += track_arr
e@35: 
e@35:         # Compute energies
e@35:         F = librosa.stft(track_arr, 1024)
e@35:         energy_prof = librosa.feature.rmse(S=F).flatten()
e@35: 
e@35:         # Compute current ebr
e@35:         
e@35:         if len(backgrounds_df) > 0:
e@35:             ebr_prof = energy_prof/background_energy[:len(energy_prof)].flatten()
e@35:             curr_ebr = np.max(ebr_prof)
e@42:             logging.debug('{}:Target ebr: {}db'.format(label,
e@42:                                                        20*np.log10(target_ebr)))
e@42:             logging.debug('{}:Current track ebr: {}db'.format(label,
e@42:                                                               20*np.log10(curr_ebr)))
e@35:         
e@35:             # Set correct ebr
e@35:             track_arr = track_arr/curr_ebr*target_ebr
e@35: 
e@35:             Fnew = librosa.stft(track_arr, 1024)
e@35:             new_energy_prof = librosa.feature.rmse(S=Fnew).flatten()
e@35:             new_ebr_prof = new_energy_prof/background_energy[:len(energy_prof)].flatten()
e@35:             new_ebr = np.max(new_ebr_prof)
e@42:             logging.debug('{}:New track ebr: {}db'.format(label, 20*np.log10(new_ebr)))
e@35: 
e@35:         if channel_mode == 'separate':
e@44:             sf.write('{}/{}_event_track.wav'.format(output_path, label),
e@42:                                      track_arr/np.max(track_arr),
e@42:                                      SR)
e@35:             
e@35:         if figure_verbosity > 0:
e@35:             plt.figure()
e@35: 
e@42:             plt.subplot(3, 1, 1)
e@35:             plt.title('`{}\' event waveform and spectrogram'.format(label))                
e@35: 
e@42:             librosa.display.waveplot(track_arr, sr=SR)
e@35:             Fdb = librosa.amplitude_to_db(F)
e@35:             plt.subplot(3, 1, 2)
e@35:             librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
e@35: 
e@35:             # Plot energy profile
e@35:             plt.subplot(3, 1, 3)
e@35:             time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T))
e@35:             plt.semilogy(time, energy_prof.T)
e@35:             plt.xlim([0, len(track_arr)/SR])            
e@35:             plt.ylabel('energy (rms)')
e@35: 
e@35:             plt.tight_layout()
e@35:             if append_to_filename:                
e@35:                 plt.savefig('{}/event_{}_{}.{}'.format(output_path, label, append_to_filename, image_format), dpi=300)
e@35:             else:
e@35:                 plt.savefig('{}/event_{}.{}'.format(output_path, label, image_format), dpi=300)      
e@35: 
e@35:         scene_starting_times.append((label, start_times))
e@35:         scene_ending_times.append((label, end_times))
e@35: 
e@35:     if figure_verbosity > 0:
e@35:         plt.figure()
e@42:         ax0 = plt.subplot(3, 1, 1)
e@35:         plt.title('Synthesized Scene')
e@35:         librosa.display.waveplot(scene_arr, sr=SR)
e@35:         F = librosa.stft(scene_arr)
e@35:         Fdb = librosa.amplitude_to_db(F)
e@42:         ax1 = plt.subplot(3, 1, 2)
e@35:         librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
e@42:         ax2 = plt.subplot(3, 1, 3)
e@42:         ax2.set_xlim([0, scene_duration])
e@35: 
e@35:         # Get labels
e@35:         labels = [s[0] for s in scene_starting_times]
e@35: 
e@35:         # If background is active
e@35:         if len(backgrounds_df) > 0:
e@35:             labels.append('background')
e@35: 
e@35:         # Set y axis limit. With a padding of 0.5.
e@35:         ax2.set_ylim([-0.5, len(labels)-0.5])            
e@35: 
e@35:         plt.yticks(range(len(labels)), labels)
e@35: 
e@35:         for n in range(len(scene_starting_times)):
e@35:             start_times = scene_starting_times[n][1]
e@35:             end_times = scene_ending_times[n][1]
e@35:             color = ['r', 'g', 'y'][n % 3]
e@35: 
e@35:             for m in range(len(start_times)):
e@35:                 plt.hlines(y=float(n), xmin=start_times[m], xmax=end_times[m], alpha=0.5, color=color, linewidth=4)
e@35:                 if figure_verbosity > 2:
e@35:                     ax0.axvline(start_times[m], color=color, alpha=0.1)
e@35:                     ax0.axvline(end_times[m], color=color, alpha=0.1)
e@35:                     ax0.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
e@35:                     ax1.axvline(start_times[m], color=color, alpha=0.1)
e@35:                     ax1.axvline(end_times[m], color=color, alpha=0.1)
e@35:                     ax1.axvline(end_times[m], color=color, alpha=0.1)
e@35:                     ax1.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)                
e@35:                     ax2.axvline(start_times[m], color=color, alpha=0.1)
e@35:                     ax2.axvline(end_times[m], color=color, alpha=0.1)
e@35:                     ax2.axvline(end_times[m], color=color, alpha=0.1)
e@35:                     ax2.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
e@35: 
e@35:         if len(backgrounds_df) > 0:
e@35:             plt.axhline(y=len(scene_starting_times), alpha=0.5, color='k', linewidth=4)
e@35: 
e@35:         plt.tight_layout()
e@35: 
e@35:         if append_to_filename:                
e@35:             plt.savefig('{}/scene_{}.{}'.format(output_path, append_to_filename, image_format), dpi=300)
e@35:         else:
e@35:             plt.savefig('{}/scene.{}'.format(output_path, image_format), dpi=300)
e@41: 
e@47:     if annot_format == 'sed_eval':
e@47:         timedict_txt = timedict_to_txt(timedict)
e@47:         logging.debug(timedict_txt)
e@41: 
e@47:         if append_to_filename:
e@47:             with open('{}/scene_{}_offsets.csv'.format(output_path, append_to_filename), 'w') as f:
e@47:                 f.write(timedict_txt)
e@47:         else:
e@47:             with open('{}/scene_offsets.csv'.format(output_path), 'w') as f:
e@47:                 f.write(timedict_txt)
e@47: 
e@47:     elif annot_format == 'pandas':
e@47:         timedict_df = timedict_to_dataframe(timedict)
e@47:         logging.debug(timedict_df)
e@47: 
e@47:         if append_to_filename:
e@47:             timedict_df.to_csv('{}/scene_{}_offsets.csv'.format(output_path, append_to_filename))
e@47:         else:
e@47:             timedict_df.to_csv('{}/scene_offsets.csv'.format(output_path))
e@41: 
e@35:     if figure_verbosity > 1:
e@35:         plt.show()
e@35: 
e@35:     # Replace nans (i.e. because of division-by-zero) of the scene with zeros.
e@35:     scene_arr = np.nan_to_num(scene_arr)
e@35:         
e@35:     if channel_mode == 'mono':
e@35:         if append_to_filename:
e@44:             sf.write('{}/scene_{}.wav'.format(output_path, append_to_filename), scene_arr, SR)
e@35:         else:
e@44:             sf.write('{}/scene.wav'.format(output_path), scene_arr, SR)
e@41: 
e@41:     # Print timesdict
e@35: 
e@35:     return scene_arr
e@41: 
e@41: 
e@35: def not_implemented():
e@41:     logging.info("TODO: not implemented")
e@41: 
e@41: 
e@41: if __name__ == "__main__":
e@35:     """
e@35:     Main function, parses options and calls the simscene generation function
e@35:     or a demo. The options given are almost identical to Lagrange et al's 
e@35:     simscene.
e@35:     """
e@35:     argparser = argparse.ArgumentParser(
e@35:             description="SimScene.py acoustic scene generator",
e@35:     )
e@35:     argparser.add_argument(
e@35:         'input_path',
e@35:         type=str,
e@42:         help="Path of a directory containing wave files for sound backgrounds"
e@42:              "(in the `background' sub-directory) or events (in `event')"
e@35:     )
e@42: 
e@42:     input_path = '.'
e@42: 
e@35:     argparser.add_argument(
e@35:         'output_path',
e@35:         type=str,
e@35:         help="The directory the generated scenes and annotations will reside."
e@42:     )
e@42: 
e@42:     output_path = '.'
e@42: 
e@35:     argparser.add_argument(
e@35:         'scene_duration',
e@35:         type=float,
e@35:         help="Duration of scene in seconds",
e@35:     )
e@35:     scene_duration = None
e@35:     
e@35:     argparser.add_argument(
e@35:         '-e', '--score-events',
e@35:         type=str,
e@35:         help="Score events file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
e@35:     )
e@35:     score_events = None
e@35:     
e@35:     argparser.add_argument(
e@35:         '-b', '--score-backgrounds',
e@35:         type=str,
e@35:         help="Score backgrounds file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
e@35:     )
e@35:     score_backgrounds = None
e@35: 
e@35:     argparser.add_argument(
e@35:         '--tag',
e@35:         type=str,
e@35:         help="Append _TAG_XXX to filenames, where XXX is an increment."
e@35:     )
e@35:     tag = None
e@35: 
e@35:     argparser.add_argument(
e@35:         '-N',
e@35:         type=int,
e@42:         help="Generate N instances of the scene. If not specified only generate a single instance. Note that if N > 1, "
e@42:              "then the verbosity must be less or equal to 1"
e@35:     )
e@35:     generate_n = 1
e@35: 
e@35:     argparser.add_argument(
e@35:         '-t', '--time-mode',
e@35:         type=str,
e@42:         help="Mode of spacing between events. `generate': values must be set for each track in the score files. "
e@42:              "`abstract': values are computed from an abstract representation of an existing acoustic scene. "
e@42:              "`replicate': values are replicated from an existing acousting scene. (NOT IMPLEMENTED)",
e@35:         choices=['generate', 'abstract', 'replicate']
e@35:     )
e@35:     time_mode = 'generate'
e@51: 
e@51:     argparser.add_argument(
e@51:         '-fd', '--full-duration',
e@51:         action='store_true',
e@51:         help="If enabled, times specified in the recipe refer to after the previous file finishes."
e@51:     )
e@51:     full_duration = False
e@51: 
e@35:     argparser.add_argument(
e@35:         '-R', '--ebr-mode',
e@35:         type=str,
e@42:         help="Mode for Event to Background power level ratio. `generate': values must be set for each track in the "
e@42:              "score files. `abstract': values are computed from an abstract representation of an existing acoustic "
e@42:              "scene. `replicate': values are replicated from an existing acousting scene. (NOT IMPLEMENTED)",
e@35:         choices=['generate', 'abstract', 'replicate']
e@35:     )
e@35:     ebr_mode = 'generate'
e@35:     
e@35:     argparser.add_argument(
e@35:         '-A', '--annotation-file',
e@35:         type=float,
e@42:         help="If -R or -m are selected, this provides the source for sourcing the times or EBRs from ANNOTATION_FILE. "
e@42:              "ANNOTATION_FILE must be comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls). "
e@42:              "(NOT IMPLEMENTED)"
e@35:     )
e@35:     annotation_file = None
e@35:     
e@35:     argparser.add_argument(
e@35:         '-a', '--audio-file',
e@35:         type=float,
e@42:         help="If -R or -m are selected, this provides the source for sourcing the times or EBRs "
e@42:              "from AUDIO_FILE. AUDIO_FILE must be a 44100Hz .wav file. (NOT IMPLEMENTED)"
e@35:     )
e@35:     audio_file = None
e@35:     
e@35:     argparser.add_argument(
e@35:         '-v', '--figure-verbosity', action='count',
e@42:         help="Increase figure verbosity. (Default) 0 - Don't save or display figures, 1 - Save pictures but do not "
e@42:              "display them, 2 - Save and display figures, 3 - Add shades over the events in the final plot"
e@35:     )
e@35:     figure_verbosity = 0
e@35: 
e@35:     argparser.add_argument(
e@35:         '-x', '--image-format',
e@35:         help="Image format for the figures",
e@35:         choices=['png', 'jpg', 'pdf']
e@35:     )
e@35:     image_format = 'png'    
e@35:     
e@35:     argparser.add_argument(
e@35:         '-C', '--channel-mode',
e@35:         type=str,
e@42:         help="number of audio channels contained in file. (Default) 'mono' - 1 channel (mono), 'separate' - Same as "
e@42:              "'classes', each channel is saved in a separate .wav file.",
e@35:         choices=['mono', 'separate']
e@35:     )
e@35:     channel_mode = 'mono'
e@35:     
e@35:     argparser.add_argument(
e@35:         '-c', '--end-cut',
e@35:         action='store_true',
e@42:         help="If the last sample ends after the scene ends then: if enabled, cut the sample to duration, "
e@42:              "else remove the sample."
e@35:     )
e@35:     end_cut = None
e@42: 
e@42:     argparser.add_argument(
e@42:         '-L', '--logging-level',
e@42:         type=str,
e@42:         help="Set lowest logging level",
e@42:         choices=['debug', 'warning', 'info']
e@42:     )
e@42: 
e@47:     argparser.add_argument(
e@47:         '--annot-format',
e@47:         type=str,
e@47:         help="Annotation format for generated scenes. Choices are: 'sed_eval' (default) - Format appropriate for "
e@47:              "DCASE 2017 challenge evaluator, 'pandas' - A more detailed format for the form <label, orig_filename, "
e@47:              "start, stop>",
e@47:         choices=['sed_eval', 'pandas']
e@47:     )
e@47: 
e@35:     args = argparser.parse_args()
e@42: 
e@42:     if args.logging_level:
e@42:         if args.logging_level == 'debug':
e@42:             logging.basicConfig(level=logging.DEBUG)
e@42:         elif args.logging_level == 'info':
e@42:             logging.basicConfig(level=logging.INFO)
e@42:         elif args.logging_level == 'warning':
e@42:             logging.basicConfig(level=logging.WARNING)
e@42:     else:
e@42:         logging.basicConfig(level=logging.INFO)
e@42: 
e@35:     if args.input_path:
e@35:         input_path = args.input_path
e@35:         logging.debug("Using `{}' as input path".format(input_path))
e@35:     if args.output_path:
e@35:         output_path = args.output_path
e@35:         logging.debug("Saving to `{}'".format(output_path))
e@51:     if args.full_duration:
e@51:         full_duration = True
e@35:     if args.scene_duration:
e@35:         if not (args.score_backgrounds or args.score_events):
e@35:             print("You must provide one of -e or -b")
e@35:         else:
e@35:             if args.image_format:
e@35:                 image_format = args.image_format
e@35:             if args.channel_mode:
e@35:                 channel_mode = args.channel_mode
e@35:             if args.ebr_mode:
e@35:                 ebr_mode = args.ebr_mode
e@35:                 if ebr_mode not in ['generate']:
e@35:                     logging.warning("`{}' not yet implemented for EBR_MODE, using default.".format(ebr_mode))
e@35:                     ebr_mode = 'generate'
e@35:             if args.time_mode:
e@35:                 time_mode = args.time_mode
e@35:                 if time_mode not in ['generate']:
e@35:                     logging.warning("`{}' not yet implemented for TIME_MODE, using default.".format(time_mode))
e@35:                     time_mode = 'generate'
e@35:             if args.annotation_file:
e@35:                 annotations = read_annotations_file(args.annotation_file)
e@35:                 
e@35:             scene_duration = float(args.scene_duration)
e@35:             
e@35:             if args.score_backgrounds:
e@35:                 score_backgrounds = read_backgrounds_file(args.score_backgrounds)
e@35:             else:
e@35:                 score_backgrounds = []
e@35:                 
e@35:             if args.score_events:
e@35:                 score_events = read_events_file(args.score_events)
e@35:             else:
e@35:                 score_events = []
e@35:                 
e@35:             if args.figure_verbosity:
e@35:                 figure_verbosity = args.figure_verbosity
e@35: 
e@35:             if args.N:
e@35:                 generate_n = args.N
e@35: 
e@35:             if args.tag:
e@35:                 tag = args.tag
e@35: 
e@35:             if generate_n == 1:
e@35:                 append_to_filename = None
e@35:                 simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
e@35:                          time_mode=time_mode,
e@35:                          ebr_mode=ebr_mode,
e@35:                          channel_mode=channel_mode,
e@35:                          annotation_file=annotation_file,
e@35:                          audio_file=audio_file,
e@35:                          figure_verbosity=figure_verbosity,
e@35:                          end_cut=end_cut,
e@35:                          image_format=image_format,
e@51:                          append_to_filename=append_to_filename,
e@51:                          full_duration=full_duration)
e@35:             else:
e@35:                 for n in range(generate_n):
e@35:                     if tag:
e@35:                         append_to_filename = '{}_{}'.format(tag, n)
e@35:                     else:
e@35:                         append_to_filename = '{}'.format(n)
e@35: 
e@35:                     logging.info("Generating scene {}".format(n))
e@35:                         
e@35:                     simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
e@35:                              time_mode=time_mode,
e@35:                              ebr_mode=ebr_mode,
e@35:                              channel_mode=channel_mode,
e@35:                              annotation_file=annotation_file,
e@35:                              audio_file=audio_file,
e@35:                              figure_verbosity=min(figure_verbosity, 1),
e@35:                              end_cut=end_cut,
e@35:                              image_format=image_format,
e@51:                              append_to_filename=append_to_filename,
e@51:                              full_duration=full_duration)