simscene-py: python/simscene.py @ 44:b7b1672b3c3b
Reading and writing of files is now done by soundfile, since there seems to be a bug when writing .wav files with librosa (mplayer would play them back as rubbish). Added soundfile as a requirement.
author   | Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date     | Mon, 09 Oct 2017 11:55:03 +0100
parents  | f30d2066eebb
children | 771dde08349a
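The change described above swaps librosa's .wav writer for PySoundfile. As a quick illustration of the soundfile calls the file below relies on, a read/write round trip looks roughly like this (the file name and test tone are invented for the example and are not part of the repository):

import numpy as np
import soundfile as sf

sr = 44100
tone = 0.1 * np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr)  # 1 s of 440 Hz

sf.write('example_tone.wav', tone, sr)    # writes a standard PCM .wav
data, rate = sf.read('example_tone.wav')  # returns (samples, samplerate)
assert rate == sr and len(data) == len(tone)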
#!/bin/python
# -*- coding: utf-8 -*-
# For licensing please see: LICENSE
# Copyright (c) Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>

# Argparse
import argparse

# Logging
import logging

# Pandas
import pandas as pd

# Numpy
import numpy as np

# Glob
import glob

import random

# Librosa
import librosa
import librosa.display

# PySoundfile
import soundfile as sf

# Matplotlib
import matplotlib.pyplot as plt

# Tabulate
from tabulate import tabulate


def _D(t, sr=44100):
    """ Helper function: Converts time to samples """
    return int(t*sr)


def compute_energy(x):
    return np.sqrt(np.mean(x**2))


def timedict_to_dataframe(timedict):
    return pd.DataFrame([(key, val[0], val[1])
                         for key in timedict
                         for val in timedict[key]],
                        columns=('filename', 'start_time', 'end_time'))


def render_pattern(fname, input_path, sr=44100):
    pattern = read_pattern_file(fname)

    # Store starting and end times in the format
    # {'filename': (start_time, end_time)}
    timesdict = {}

    start_times_samples = []
    end_times_samples = []
    durations_samples = []
    wav_files = []
    pattern_timedict = []

    for n in range(len(pattern)):
        # Try loading the file,
        sampleid = pattern['sampleid'].loc[n]
        candidates = []
        for pattern_format in ['xls', 'json', 'txt', 'csv']:
            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, sampleid, pattern_format))

        if len(candidates) == 0:
            candidates = glob.glob('{}/event/{}*wav'.format(input_path, sampleid))
            chosen_fname = random.sample(candidates, 1)[0]
            wav, SR = sf.read(chosen_fname)
        else:
            chosen_fname = random.sample(candidates, 1)[0]
            logging.debug('Loading {}'.format(chosen_fname))
            wav, SR, pattern_timedict = render_pattern(chosen_fname, input_path)

        # For each sound in the pattern file, place it starting from starttime + an offset
        # with a mean value of 0 and standard deviation of offset_stddev. The first event
        # cannot start earlier than time 0. If endtime is defined (not nan), then cut the
        # event at end time.

        # Read and assign an amplitude
        amplitude_mean = float(pattern['amplitude'].loc[n])
        amplitude_stddev = float(pattern['amplitude_stdev'].loc[n])
        amplitude = amplitude_mean + np.random.randn()*amplitude_stddev
        wav *= amplitude

        start_time = max(float(pattern['start_time'].loc[n]), 0)
        start_time_samples = int(start_time*SR)

        fade_in_time = float(pattern['fade_in_time'].loc[n])
        fade_out_time = float(pattern['fade_out_time'].loc[n])

        end_time = float(pattern['end_time'].loc[n])

        # If end_time is not defined (-1 or just empty)
        # then just derive it from the length of the sample
        if np.isnan(end_time) or float(end_time) == -1:
            duration_samples = len(wav)
            end_time_samples = start_time_samples + duration_samples
        elif end_time - start_time > len(wav)/float(SR):
            # If given end_time is more than start_time + duration of sample
            # then pad the file with zeros to reach the desired end time.
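            # Worked example with made-up numbers: for start_time=1.0, end_time=3.0
            # and a 1.5 s sample at SR=44100, the requested span is 2.0 s, so the
            # sample is copied into a zero array of int(2.0*44100) = 88200 samples
            # and the trailing 0.5 s is left silent.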
            duration = end_time - start_time
            duration_samples = int(duration*SR)
            end_time_samples = start_time_samples + duration_samples

            # Calculate end time in seconds
            end_time = end_time_samples/float(SR)

            wav_arr = np.zeros(duration_samples)
            wav_arr[:len(wav)] = wav
            wav = wav_arr
        else:
            duration = end_time - start_time
            duration_samples = int(duration*SR)
            end_time_samples = start_time_samples + duration_samples

        event_render = fade(wav[:duration_samples], fade_in_time, fade_out_time)

        start_times_samples.append(start_time_samples)
        end_times_samples.append(end_time_samples)
        durations_samples.append(duration_samples)
        wav_files.append(event_render)

        if chosen_fname in timesdict:
            timesdict[chosen_fname].append((start_time, end_time))
        else:
            timesdict[chosen_fname] = [(start_time, end_time)]

        for pt in pattern_timedict:
            if pt in timesdict:
                timesdict[pt] += pattern_timedict[pt]
            else:
                timesdict[pt] = pattern_timedict[pt]

    pattern_duration = end_time_samples
    pattern_arr = np.zeros(pattern_duration)

    for n, s in enumerate(start_times_samples):
        wav = wav_files[n]
        pattern_arr[s:s+len(wav)] = wav

    return pattern_arr, 44100, timesdict


def read_events_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt']:
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)
            df.columns = ['label', 'sampleid', 'ebr', 'ebr_stddev',
                          'mean_time_between_instances',
                          'time_between_instances_stddev',
                          'start_time', 'end_time',
                          'fade_in_time', 'fade_out_time']
    elif fname[-3:].lower() in ['csv']:
        df = pd.read_csv(fname)

    logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
    return df


def read_pattern_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt']:
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)
            df.columns = ['eventid', 'start_time', 'end_time', 'time_offset_stdev',
                          'fade_in_time', 'fade_out_time', 'amplitude', 'amplitude_stdev']
    elif fname[-3:].lower() in ['csv']:
        df = pd.read_csv(fname)

    logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
    return df


def read_backgrounds_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt']:
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)
            df.columns = ['label', 'sampleid', 'snr']
    elif fname[-3:].lower() in ['csv']:
        df = pd.read_csv(fname)

    logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
    return df


def read_annotations_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt', 'csv']:
        with open(fname) as f:
            header = f.readline()
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            if sep in header:
                logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
                df = pd.read_csv(f, header=None, sep=sep)
                df.columns = ['start', 'stop', 'class']
            else:
                df = pd.read_csv(f, sep=sep)
                df.columns = ['start', 'stop', 'class']
    else:
        df = None

    logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
    return df


def run_demo():
    print("TODO: Implement run_demo()")


def fade(x, fade_in, fade_out, sr=44100):
    """ Creates a fade-in-fade-out envelope for audio array x. """
    if len(x) == 0:
        return x

    fade_in_samples = int(fade_in*sr)
    fade_out_samples = int(fade_out*sr)

    outp = np.ones_like(x)

    for n in range(fade_in_samples):
        outp[n] = n*1./fade_in_samples

    for n in range(fade_out_samples):
        outp[len(outp)-fade_out_samples+n] = 1-1./fade_out_samples*n

    return outp*x


def simscene(input_path, output_path, scene_duration, score_events, score_backgrounds, **kwargs):
    logging.warning('simscene() is not yet implemented fully')

    SR = 44100  # Samplerate. Should probably not be hardcoded

    events_df = score_events
    backgrounds_df = score_backgrounds

    # Store starting and ending times in the format
    # {'filename': [(start_time, end_time), (start_time, end_time), ...]}
    timedict = {}

    # Create empty numpy array
    scene_arr = np.zeros(int(scene_duration*SR))

    if 'append_to_filename' in kwargs:
        append_to_filename = kwargs['append_to_filename']
    else:
        append_to_filename = None

    if 'end_cut' in kwargs:
        end_cut = kwargs['end_cut']
    else:
        end_cut = False

    if 'figure_verbosity' in kwargs:
        figure_verbosity = kwargs['figure_verbosity']
    else:
        figure_verbosity = 0

    if 'image_format' in kwargs:
        image_format = kwargs['image_format']
    else:
        image_format = 'png'

    if 'channel_mode' in kwargs:
        channel_mode = kwargs['channel_mode']
    else:
        channel_mode = 'mono'

    # Stores the starting and ending times of every track for visualization
    # purposes
    scene_starting_times = []
    scene_ending_times = []

    # List of tracks
    track_list = []

    background_energies = []

    for n in range(len(backgrounds_df)):
        # Get label of background
        label = str(backgrounds_df['label'].loc[n])

        # First check if there are any pattern candidates.
        # Give priority to pattern files.
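        # Illustrative lookup order (the sampleid 'rain' here is made up): pattern
        # scores such as input_path/pattern/rain*.xls|.json|.txt|.csv are tried
        # first, and only if none match does the code fall back to plain audio at
        # input_path/background/rain*.wav.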
        candidates = []

        # List of pattern start and end times
        pattern_timedict = []

        for pattern_format in ['xls', 'json', 'txt', 'csv']:
            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path,
                                                               backgrounds_df['sampleid'].loc[n],
                                                               pattern_format))

        if len(candidates) == 0:
            # If no patterns are found, search for normal audio files
            candidates = glob.glob('{}/background/{}*.wav'.format(input_path,
                                                                  backgrounds_df['sampleid'].loc[n]))
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = sf.read(chosen_fname)
        else:
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr, pattern_timedict = render_pattern(chosen_fname, input_path)

        duration = len(wav)/float(SR)

        target_snr_db = float(backgrounds_df['snr'].loc[n])
        target_snr = 10**(target_snr_db/20.0)

        energy = compute_energy(wav)
        logging.debug('{}:energy:{}'.format(label, energy))

        if n == 0:
            # For the first background track, snr gives an amount by which
            # it's going to be scaled (i.e. make it more silent)
            amplitude_factor = target_snr
            wav *= amplitude_factor

        if n > 0:
            noise_energy = compute_energy(np.sum(np.array(track_list), axis=0))
            logging.info('{}:noise_energy:{}'.format(label, noise_energy))
            old_snr = energy/noise_energy
            old_snr_db = 20*np.log10(old_snr)
            logging.info('{}:old_snr:{}'.format(label, old_snr_db))
            amplitude_factor = target_snr/old_snr
            wav *= amplitude_factor
            new_energy = compute_energy(wav)
            new_snr = new_energy/noise_energy
            new_snr_db = 20. * np.log10(new_snr)
            logging.info('{}:new_snr:{}'.format(label, new_snr_db))

        # Track array
        track_arr = np.zeros(int(scene_duration*SR))

        start_times = [0.0]
        end_times = [start_times[-1]+len(wav)/float(SR)]

        # Start with the first time in the list
        new_start_time = start_times[-1]
        new_end_time = end_times[-1]

        if chosen_fname in timedict:
            timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
        else:
            timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]

        while new_start_time < scene_duration:
            offset = duration
            new_start_time += offset

            # If already exceeded scene, break
            if new_start_time >= scene_duration:
                break

            new_end_time += offset
            start_times.append(new_start_time)
            end_times.append(new_end_time)

            # Update timesdict noting where each filename starts and stops
            if chosen_fname in timedict:
                timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
            else:
                timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]

            # Also update the times from the patterns
            for pt in pattern_timedict:
                pattern_timedict[pt] = [(s[0] + new_start_time, s[1] + new_start_time)
                                        for s in pattern_timedict[pt]]
                if pt in timedict:
                    timedict[pt] += pattern_timedict[pt]
                else:
                    timedict[pt] = pattern_timedict[pt]

        # And add those to the timedict dictionary
        for t in start_times:
            # We need to be careful with the limits here
            # since numpy will just ignore indexing that
            # exceeds the size of the array

            # Fading times in case we need to join many
            # consecutive samples together.
            # if n == 0:
            #     # Little fade-out, fade-in to smoothly repeat the
            #     # background.
            #     fade_in_time = 0.0
            #     fade_out_time = 0.01
            # elif n > 0 and n < len(start_times) - 1:
            #     fade_in_time = 0.01
            #     fade_out_time = 0.01
            # else:
            #     fade_in_time = 0.01
            #     fade_out_time = 0.0

            begin = min(_D(t), len(track_arr))
            end = min(len(track_arr), _D(t) + len(wav))

            # Part of the wav to store
            # part = fade(wav[:end-begin], fade_in_time, fade_out_time)
            part = wav[:end-begin]

            track_arr[begin:end] += part

        track_list.append(track_arr)
        scene_arr[:len(track_arr)] += track_arr

        if channel_mode == 'separate':
            sf.write('{}/{}_background_track.wav'.format(output_path, label), track_arr, SR)

        F = librosa.stft(track_arr, 1024)
        energy_prof = librosa.feature.rmse(S=F)
        background_energies.append(energy_prof)

        if figure_verbosity > 0:
            plt.figure()
            plt.subplot(3, 1, 1)
            plt.title('`{}\' background waveform and spectrogram'.format(label))
            librosa.display.waveplot(track_arr, sr=SR)

            # Plot spectrogram
            Fdb = librosa.amplitude_to_db(F)
            plt.subplot(3, 1, 2)
            librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')

            # Plot energy profile
            plt.subplot(3, 1, 3)
            time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T))
            plt.semilogy(time, energy_prof.T)
            plt.xlim([0, len(track_arr)/SR])
            plt.ylabel('energy (rms)')

            # Tidy up and save to file
            plt.tight_layout()
            if append_to_filename:
                plt.savefig('{}/background_{}_{}.{}'.format(output_path, label,
                                                            append_to_filename, image_format),
                            dpi=300)
            else:
                plt.savefig('{}/background_{}.{}'.format(output_path, label, image_format),
                            dpi=300)

    # Compute total energy of background
    if len(backgrounds_df) > 0:
        background_arr = np.sum(track_list, 0)
        B = librosa.stft(background_arr, 1024)
        background_energy = librosa.feature.rmse(S=B).flatten()
    else:
        background_energy = 0.0

    for n in range(len(events_df)):
        # Get label of track
        label = str(events_df['label'].loc[n])

        # First check if there are any pattern candidates.
        # Give priority to pattern files.
        candidates = []

        # List of pattern start and end times
        pattern_timedict = []

        for pattern_format in ['xls', 'json', 'txt', 'csv']:
            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path,
                                                               events_df['sampleid'].loc[n],
                                                               pattern_format))

        if len(candidates) == 0:
            # If no patterns are found, search for normal audio files
            candidates = glob.glob('{}/event/{}*.wav'.format(input_path,
                                                             events_df['sampleid'].loc[n]))
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = sf.read(chosen_fname)
        else:
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr, pattern_timedict = render_pattern(chosen_fname, input_path)

        logging.debug(chosen_fname)

        # Apply a fader envelope
        fade_in_time = float(events_df['fade_in_time'].loc[n])
        fade_out_time = float(events_df['fade_out_time'].loc[n])
        wav = fade(wav, fade_in_time, fade_out_time)

        # Set target EBR
        target_ebr = 10**(float(events_df['ebr'].loc[n])/20.0 +
                          np.random.randn()*float(events_df['ebr_stddev'].loc[n])/20.0)

        # Mean time between instances \mu.
        mean_time_between_instances = events_df['mean_time_between_instances'].loc[n]
        track_end_time = events_df['end_time'].loc[n]

        # Track array
        track_arr = np.zeros(int(scene_duration*SR))

        # If \mu is -1, then play the event only once.
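        # Sketch of the timing model handled below (numbers are illustrative only):
        #   mean_time_between_instances == -1 -> place the event once, at start_time
        #   mean_time_between_instances == 0  -> place instances back to back
        #                                        (spacing equals the sample duration)
        #   mean_time_between_instances > 0   -> space successive onsets by
        #       mu + N(0, time_between_instances_stddev) seconds; e.g. mu=2.0 with
        #       stddev=0.5 gives onsets roughly every 2 s (plus or minus jitter) after
        #       start_time, until end_time or scene_duration is reached.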
        if mean_time_between_instances == -1:
            track_arr[_D(events_df['start_time'].loc[n]):_D(events_df['start_time'].loc[n]) + len(wav)] += wav
            start_times = [float(events_df['start_time'].loc[n])]
            end_times = [float(events_df['end_time'].loc[n])]

            new_start_time = start_times[-1]
            new_end_time = end_times[-1]

            if chosen_fname in timedict:
                timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
            else:
                timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]

            for pt in pattern_timedict:
                pattern_timedict[pt] = [(s[0] + new_start_time, s[1] + new_start_time)
                                        for s in pattern_timedict[pt]]
                if pt in timedict:
                    timedict[pt] += pattern_timedict[pt]
                else:
                    timedict[pt] = pattern_timedict[pt]
        else:
            # If 0, then start next sample after this one (set it to the duration of the sample)
            if mean_time_between_instances == 0:
                mean_time_between_instances = len(wav)/float(SR)

            # Store the successive starting and ending times of the events (given e.g. the model)
            # in the following lists.
            start_times = [events_df['start_time'].loc[n]]
            end_times = [start_times[-1]+len(wav)/float(SR)]

            # Start with the first time in the list
            new_start_time = start_times[-1]
            new_end_time = end_times[-1]

            if chosen_fname in timedict:
                timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
            else:
                timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]

            # Until the scene is full
            while new_start_time < track_end_time:
                offset = float(mean_time_between_instances) + \
                    float(events_df['time_between_instances_stddev'].loc[n]*np.random.randn())
                new_start_time += offset

                # If already exceeded scene, break
                if new_start_time >= scene_duration:
                    break

                new_end_time += offset

                # Only exception is if we have set the 'end_cut' flag
                # and the end time of the event surpasses the end time
                # of the track
                if end_cut and new_end_time > track_end_time:
                    break
                else:
                    start_times.append(new_start_time)
                    end_times.append(new_end_time)

                    if chosen_fname in timedict:
                        timedict[chosen_fname].append((new_start_time,
                                                       min(scene_duration, new_end_time)))
                    else:
                        timedict[chosen_fname] = [(new_start_time,
                                                   min(scene_duration, new_end_time))]

                    # Also update the times from the patterns
                    for pt in pattern_timedict:
                        pattern_timedict[pt] = [(s[0] + new_start_time, s[1] + new_start_time)
                                                for s in pattern_timedict[pt]]
                        if pt in timedict:
                            timedict[pt] += pattern_timedict[pt]
                        else:
                            timedict[pt] = pattern_timedict[pt]

            for t in start_times:
                # We need to be careful with the limits here
                # since numpy will just ignore indexing that
                # exceeds the size of the array
                begin = min(_D(t), len(track_arr))
                end = min(len(track_arr), _D(t) + len(wav))

                # Part of the wav to store
                part = wav[:end-begin]

                # If wav file was concatenated, fade out
                # quickly to avoid clicks
                if len(wav) > len(part) > fade_out_time*SR:
                    part = fade(part, 0, fade_out_time)

                track_arr[begin:end] += part

        track_list.append(track_arr)
        scene_arr[:len(track_arr)] += track_arr

        # Compute energies
        F = librosa.stft(track_arr, 1024)
        energy_prof = librosa.feature.rmse(S=F).flatten()

        # Compute current ebr
        if len(backgrounds_df) > 0:
            ebr_prof = energy_prof/background_energy[:len(energy_prof)].flatten()
            curr_ebr = np.max(ebr_prof)

            logging.debug('{}:Target ebr: {}db'.format(label, 20*np.log10(target_ebr)))
            logging.debug('{}:Current track ebr: {}db'.format(label, 20*np.log10(curr_ebr)))

            # Set correct ebr
            track_arr = track_arr/curr_ebr*target_ebr

            Fnew = librosa.stft(track_arr, 1024)
            new_energy_prof = librosa.feature.rmse(S=Fnew).flatten()
            new_ebr_prof = new_energy_prof/background_energy[:len(energy_prof)].flatten()
            new_ebr = np.max(new_ebr_prof)
            logging.debug('{}:New track ebr: {}db'.format(label, 20*np.log10(new_ebr)))

        if channel_mode == 'separate':
            sf.write('{}/{}_event_track.wav'.format(output_path, label),
                     track_arr/np.max(track_arr), SR)

        if figure_verbosity > 0:
            plt.figure()
            plt.subplot(3, 1, 1)
            plt.title('`{}\' event waveform and spectrogram'.format(label))
            librosa.display.waveplot(track_arr, sr=SR)

            Fdb = librosa.amplitude_to_db(F)
            plt.subplot(3, 1, 2)
            librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')

            # Plot energy profile
            plt.subplot(3, 1, 3)
            time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T))
            plt.semilogy(time, energy_prof.T)
            plt.xlim([0, len(track_arr)/SR])
            plt.ylabel('energy (rms)')

            plt.tight_layout()
            if append_to_filename:
                plt.savefig('{}/event_{}_{}.{}'.format(output_path, label,
                                                       append_to_filename, image_format),
                            dpi=300)
            else:
                plt.savefig('{}/event_{}.{}'.format(output_path, label, image_format), dpi=300)

        scene_starting_times.append((label, start_times))
        scene_ending_times.append((label, end_times))

    if figure_verbosity > 0:
        plt.figure()
        ax0 = plt.subplot(3, 1, 1)
        plt.title('Synthesized Scene')
        librosa.display.waveplot(scene_arr, sr=SR)

        F = librosa.stft(scene_arr)
        Fdb = librosa.amplitude_to_db(F)
        ax1 = plt.subplot(3, 1, 2)
        librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')

        ax2 = plt.subplot(3, 1, 3)
        ax2.set_xlim([0, scene_duration])

        # Get labels
        labels = [s[0] for s in scene_starting_times]

        # If background is active
        if len(backgrounds_df) > 0:
            labels.append('background')

        # Set y axis limit. With a padding of 0.5.
        ax2.set_ylim([-0.5, len(labels)-0.5])
        plt.yticks(range(len(labels)), labels)

        for n in range(len(scene_starting_times)):
            start_times = scene_starting_times[n][1]
            end_times = scene_ending_times[n][1]
            color = ['r', 'g', 'y'][n % 3]
            for m in range(len(start_times)):
                plt.hlines(y=float(n), xmin=start_times[m], xmax=end_times[m],
                           alpha=0.5, color=color, linewidth=4)
                if figure_verbosity > 2:
                    ax0.axvline(start_times[m], color=color, alpha=0.1)
                    ax0.axvline(end_times[m], color=color, alpha=0.1)
                    ax0.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
                    ax1.axvline(start_times[m], color=color, alpha=0.1)
                    ax1.axvline(end_times[m], color=color, alpha=0.1)
                    ax1.axvline(end_times[m], color=color, alpha=0.1)
                    ax1.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
                    ax2.axvline(start_times[m], color=color, alpha=0.1)
                    ax2.axvline(end_times[m], color=color, alpha=0.1)
                    ax2.axvline(end_times[m], color=color, alpha=0.1)
                    ax2.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)

        if len(backgrounds_df) > 0:
            plt.axhline(y=len(scene_starting_times), alpha=0.5, color='k', linewidth=4)

        plt.tight_layout()

        if append_to_filename:
            plt.savefig('{}/scene_{}.{}'.format(output_path, append_to_filename, image_format),
                        dpi=300)
        else:
            plt.savefig('{}/scene.{}'.format(output_path, image_format), dpi=300)

    timedict_df = timedict_to_dataframe(timedict)
    logging.debug(timedict_df)

    if append_to_filename:
        timedict_df.to_csv('{}/scene_{}_offsets.csv'.format(output_path, append_to_filename))
    else:
        timedict_df.to_csv('{}/scene_offsets.csv'.format(output_path))

    if figure_verbosity > 1:
        plt.show()

    # Replace nans (i.e. because of division-by-zero) of the scene with zeros.
    scene_arr = np.nan_to_num(scene_arr)

    if channel_mode == 'mono':
        if append_to_filename:
            sf.write('{}/scene_{}.wav'.format(output_path, append_to_filename), scene_arr, SR)
        else:
            sf.write('{}/scene.wav'.format(output_path), scene_arr, SR)

    # Print timesdict
    return scene_arr


def not_implemented():
    logging.info("TODO: not implemented")


if __name__ == "__main__":
    """ Main function, parses options and calls the simscene generation
        function or a demo. The options given are almost identical to
        Lagrange et al's simscene. """

    argparser = argparse.ArgumentParser(
        description="SimScene.py acoustic scene generator",
    )

    argparser.add_argument(
        'input_path',
        type=str,
        help="Path of a directory containing wave files for sound backgrounds "
             "(in the `background' sub-directory) or events (in `event')"
    )
    input_path = '.'

    argparser.add_argument(
        'output_path',
        type=str,
        help="The directory the generated scenes and annotations will reside in."
    )
    output_path = '.'

    argparser.add_argument(
        'scene_duration',
        type=float,
        help="Duration of scene in seconds",
    )
    scene_duration = None

    argparser.add_argument(
        '-e', '--score-events',
        type=str,
        help="Score events file as a comma-separated text file (.csv, .txt), "
             "JSON (.json), or Excel (.xls) file"
    )
    score_events = None

    argparser.add_argument(
        '-b', '--score-backgrounds',
        type=str,
        help="Score backgrounds file as a comma-separated text file (.csv, .txt), "
             "JSON (.json), or Excel (.xls) file"
    )
    score_backgrounds = None

    argparser.add_argument(
        '--tag',
        type=str,
        help="Append _TAG_XXX to filenames, where XXX is an increment."
    )
    tag = None

    argparser.add_argument(
        '-N',
        type=int,
        help="Generate N instances of the scene. If not specified, only generate a single "
             "instance. Note that if N > 1, then the verbosity must be less than or equal to 1"
    )
    generate_n = 1

    argparser.add_argument(
        '-t', '--time-mode',
        type=str,
        help="Mode of spacing between events. `generate': values must be set for each track "
             "in the score files. `abstract': values are computed from an abstract "
             "representation of an existing acoustic scene. `replicate': values are "
             "replicated from an existing acoustic scene. (NOT IMPLEMENTED)",
        choices=['generate', 'abstract', 'replicate']
    )
    time_mode = 'generate'

    argparser.add_argument(
        '-R', '--ebr-mode',
        type=str,
        help="Mode for Event to Background power level ratio. `generate': values must be set "
             "for each track in the score files. `abstract': values are computed from an "
             "abstract representation of an existing acoustic scene. `replicate': values are "
             "replicated from an existing acoustic scene. (NOT IMPLEMENTED)",
        choices=['generate', 'abstract', 'replicate']
    )
    ebr_mode = 'generate'

    argparser.add_argument(
        '-A', '--annotation-file',
        type=str,
        help="If -R or -m are selected, this provides the source for the times or EBRs from "
             "ANNOTATION_FILE. ANNOTATION_FILE must be a comma-separated text file (.csv, .txt), "
             "JSON (.json), or Excel (.xls) file. (NOT IMPLEMENTED)"
    )
    annotation_file = None

    argparser.add_argument(
        '-a', '--audio-file',
        type=str,
        help="If -R or -m are selected, this provides the source for the times or EBRs from "
             "AUDIO_FILE. AUDIO_FILE must be a 44100Hz .wav file. (NOT IMPLEMENTED)"
    )
    audio_file = None
    argparser.add_argument(
        '-v', '--figure-verbosity',
        action='count',
        help="Increase figure verbosity. (Default) 0 - Don't save or display figures, "
             "1 - Save pictures but do not display them, 2 - Save and display figures, "
             "3 - Add shades over the events in the final plot"
    )
    figure_verbosity = 0

    argparser.add_argument(
        '-x', '--image-format',
        help="Image format for the figures",
        choices=['png', 'jpg', 'pdf']
    )
    image_format = 'png'

    argparser.add_argument(
        '-C', '--channel-mode',
        type=str,
        help="number of audio channels contained in file. (Default) 'mono' - 1 channel (mono), "
             "'separate' - Same as 'classes', each channel is saved in a separate .wav file.",
        choices=['mono', 'separate']
    )
    channel_mode = 'mono'

    argparser.add_argument(
        '-c', '--end-cut',
        action='store_true',
        help="If the last sample ends after the scene ends then: if enabled, cut the sample "
             "to duration, else remove the sample."
    )
    end_cut = None

    argparser.add_argument(
        '-L', '--logging-level',
        type=str,
        help="Set lowest logging level",
        choices=['debug', 'warning', 'info']
    )

    args = argparser.parse_args()

    if args.logging_level:
        if args.logging_level == 'debug':
            logging.basicConfig(level=logging.DEBUG)
        elif args.logging_level == 'info':
            logging.basicConfig(level=logging.INFO)
        elif args.logging_level == 'warning':
            logging.basicConfig(level=logging.WARNING)
    else:
        logging.basicConfig(level=logging.INFO)

    if args.input_path:
        input_path = args.input_path
        logging.debug("Using `{}' as input path".format(input_path))

    if args.output_path:
        output_path = args.output_path
        logging.debug("Saving to `{}'".format(output_path))

    if args.scene_duration:
        if not (args.score_backgrounds or args.score_events):
            print("You must provide one of -e or -b")
        else:
            if args.image_format:
                image_format = args.image_format

            if args.channel_mode:
                channel_mode = args.channel_mode

            if args.ebr_mode:
                ebr_mode = args.ebr_mode
                if ebr_mode not in ['generate']:
                    logging.warning("`{}' not yet implemented for EBR_MODE, using default.".format(ebr_mode))
                    ebr_mode = 'generate'

            if args.time_mode:
                time_mode = args.time_mode
                if time_mode not in ['generate']:
                    logging.warning("`{}' not yet implemented for TIME_MODE, using default.".format(time_mode))
                    time_mode = 'generate'

            if args.annotation_file:
                annotations = read_annotations_file(args.annotation_file)

            scene_duration = float(args.scene_duration)

            if args.score_backgrounds:
                score_backgrounds = read_backgrounds_file(args.score_backgrounds)
            else:
                score_backgrounds = []

            if args.score_events:
                score_events = read_events_file(args.score_events)
            else:
                score_events = []

            if args.figure_verbosity:
                figure_verbosity = args.figure_verbosity

            if args.N:
                generate_n = args.N

            if args.tag:
                tag = args.tag

            if generate_n == 1:
                append_to_filename = None
                simscene(input_path, output_path, scene_duration,
                         score_events, score_backgrounds,
                         time_mode=time_mode,
                         ebr_mode=ebr_mode,
                         channel_mode=channel_mode,
                         annotation_file=annotation_file,
                         audio_file=audio_file,
                         figure_verbosity=figure_verbosity,
                         end_cut=end_cut,
                         image_format=image_format,
                         append_to_filename=append_to_filename)
            else:
                for n in range(generate_n):
                    if tag:
                        append_to_filename = '{}_{}'.format(tag, n)
                    else:
                        append_to_filename = '{}'.format(n)
                    logging.info("Generating scene {}".format(n))
                    simscene(input_path, output_path, scene_duration,
                             score_events, score_backgrounds,
                             time_mode=time_mode,
                             ebr_mode=ebr_mode,
                             channel_mode=channel_mode,
                             annotation_file=annotation_file,
                             audio_file=audio_file,
                             figure_verbosity=min(figure_verbosity, 1),
                             end_cut=end_cut,
                             image_format=image_format,
                             append_to_filename=append_to_filename)
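For orientation, the snippet below shows one way to drive simscene() directly from Python rather than through the command line above. Everything in it is illustrative: the directory names, sample ids, and score values are invented, and it assumes wav files matching input/event/dog1*.wav and input/background/street1*.wav exist, that the output directory already exists, and that this file is importable as simscene. A roughly equivalent command-line call would be along the lines of python simscene.py input output 30 -e events.csv -b backgrounds.csv.

import pandas as pd
from simscene import simscene  # assumes this file is on the path as simscene.py

# Column names follow read_events_file() and read_backgrounds_file() above.
events = pd.DataFrame([{
    'label': 'dog', 'sampleid': 'dog1',
    'ebr': 0.0, 'ebr_stddev': 0.0,
    'mean_time_between_instances': 2.0, 'time_between_instances_stddev': 0.5,
    'start_time': 0.0, 'end_time': 30.0,
    'fade_in_time': 0.01, 'fade_out_time': 0.01,
}])
backgrounds = pd.DataFrame([{'label': 'street', 'sampleid': 'street1', 'snr': 0.0}])

# Renders a 30 s scene into output/scene.wav plus output/scene_offsets.csv.
scene = simscene('input', 'output', 30.0, events, backgrounds,
                 channel_mode='mono', figure_verbosity=0,
                 end_cut=False, image_format='png', append_to_filename=None)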