#!/bin/python
# -*- coding: utf-8 -*-
# For licensing please see: LICENSE
# Copyright (c) Emmanouil Theofanis Chourdakis

# Argparse
import argparse

# Logging
import logging

# Pandas
import pandas as pd

# Numpy
import numpy as np
import sys

# Glob
import glob
import random

# Librosa
import librosa
import librosa.display
import librosa.output

# Matplotlib
from matplotlib import rc
# rc('text', usetex=True)
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from cycler import cycler

# Tabulate
from tabulate import tabulate


def _N(t, sr=44100):
    """
    Helper function: Converts time to samples
    """
    return int(t*sr)


def compute_energy(x):
    return np.sqrt(np.mean(x**2))

# def compute_energy_profile(x, w=1000):
#     # Resize/Window signal
#     #x = np.resize(x, (w,int(np.ceil(float(len(x)/w)))))
#     x = np.array([[ii+jj for jj in range(w)] for ii in range(len(x)-w)])
#     return np.sqrt(np.mean(x**2, 1))


def render_pattern(fname, input_path, sr=44100):
    pattern = read_pattern_file(fname)

    start_times_samples = []
    end_times_samples = []
    durations_samples = []
    wav_files = []

    for n in range(len(pattern)):
        # Try loading the file,
        sampleid = pattern['sampleid'].loc[n]
        candidates = glob.glob('{}/event/{}*wav'.format(input_path, sampleid))
        chosen_fname = random.sample(candidates, 1)[0]

        logging.debug('Loading {}'.format(chosen_fname))

        # For each sound in the pattern file, place it starting from starttime + an offset
        # with a mean value of 0 and standard deviation of offset_stddev. The first event can
        # not start earlier than time 0. If endtime is defined (not nan), then cut the event at
        # end time.
        wav, SR = librosa.load(chosen_fname, sr=sr)

        # Read and assign an amplitude
        amplitude_mean = float(pattern['amplitude'].loc[n])
        amplitude_stddev = float(pattern['amplitude_stdev'].loc[n])
        amplitude = amplitude_mean + np.random.randn()*amplitude_stddev
        wav *= amplitude

        start_time = max(float(pattern['start_time'].loc[n]), 0)
        start_time_samples = int(start_time*SR)

        fade_in_time = float(pattern['fade_in_time'].loc[n])
        fade_out_time = float(pattern['fade_out_time'].loc[n])
        end_time = float(pattern['end_time'].loc[n])

        # If end_time is not defined (-1 or just empty)
        # then just derive it from the length of the sample
        if np.isnan(end_time) or float(end_time) == -1:
            duration_samples = len(wav)
            end_time_samples = start_time_samples + duration_samples
        elif end_time - start_time > len(wav)/float(SR):
            # If given end_time is more than start_time + duration of sample
            # then pad the file with zeros to reach the desired end time.
            duration = end_time - start_time
            duration_samples = int(duration*SR)
            end_time_samples = start_time_samples + duration_samples
            wav_arr = np.zeros(duration_samples)
            wav_arr[:len(wav)] = wav
            wav = wav_arr
        else:
            duration = end_time - start_time
            duration_samples = int(duration*SR)
            end_time_samples = start_time_samples + duration_samples

        event_render = fade(wav[:duration_samples], fade_in_time, fade_out_time)

        start_times_samples.append(start_time_samples)
        end_times_samples.append(end_time_samples)
        durations_samples.append(duration_samples)
        wav_files.append(event_render)

    # Use the furthest end time so that earlier events which end later than
    # the last one still fit into the rendered pattern.
    pattern_duration = max(end_times_samples)
    pattern_arr = np.zeros(pattern_duration)

    for n, s in enumerate(start_times_samples):
        wav = wav_files[n]
        pattern_arr[s:s+len(wav)] = wav

    return pattern_arr, sr


def read_events_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt']:
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)
            df.columns = ['label', 'sampleid', 'ebr', 'ebr_stddev', 'mean_time_between_instances', 'time_between_instances_stddev', 'start_time', 'end_time', 'fade_in_time', 'fade_out_time']
    elif fname[-3:].lower() in ['csv']:
        df = pd.read_csv(fname)

    logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
    return df


def read_pattern_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt']:
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)
            # Column names must match the ones accessed in render_pattern()
            df.columns = ['sampleid', 'start_time', 'end_time', 'time_offset_stdev', 'fade_in_time', 'fade_out_time', 'amplitude', 'amplitude_stdev']
    elif fname[-3:].lower() in ['csv']:
        df = pd.read_csv(fname)

    logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
    return df


def read_backgrounds_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt']:
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)
            df.columns = ['label', 'sampleid', 'snr']
    elif fname[-3:].lower() in ['csv']:
        df = pd.read_csv(fname)

    logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
    return df


def read_annotations_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt', 'csv']:
        with open(fname) as f:
            header = f.readline()
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            if sep in header:
                logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
                df = pd.read_csv(f, header=None, sep=sep)
                df.columns = ['start', 'stop', 'class']
            else:
                df = pd.read_csv(f, sep=sep)
                df.columns = ['start', 'stop', 'class']

    logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
    return df


def run_demo():
    print("TODO: Implement run_demo()")


def fade(x, fade_in, fade_out, sr=44100):
    """
    Creates a fade-in-fade-out envelope
    for audio array x.
    """

    if len(x) == 0:
        return x

    fade_in_samples = int(fade_in*sr)
    fade_out_samples = int(fade_out*sr)

    outp = np.ones_like(x)
    for n in range(fade_in_samples):
        outp[n] = n*1./fade_in_samples

    for n in range(fade_out_samples):
        outp[len(outp)-fade_out_samples+n] = 1-1./fade_out_samples*n
    return outp*x


def simscene(input_path,
             output_path,
             scene_duration,
             score_events,
             score_backgrounds,
             **kwargs):
    logging.info('simscene() is not yet implemented fully')
    SR = 44100  # Samplerate. Should probably not be hardcoded

    events_df = score_events
    backgrounds_df = score_backgrounds

    # Create empty numpy array
    scene_arr = np.zeros(int(scene_duration*SR))

    if 'append_to_filename' in kwargs:
        append_to_filename = kwargs['append_to_filename']
    else:
        append_to_filename = None

    if 'end_cut' in kwargs:
        end_cut = kwargs['end_cut']
    else:
        end_cut = False

    if 'figure_verbosity' in kwargs:
        figure_verbosity = kwargs['figure_verbosity']
    else:
        figure_verbosity = 0

    if 'image_format' in kwargs:
        image_format = kwargs['image_format']
    else:
        image_format = 'png'

    if 'channel_mode' in kwargs:
        channel_mode = kwargs['channel_mode']
    else:
        channel_mode = 'mono'

    # Stores the starting and ending times of every track for visualization
    # purposes
    scene_starting_times = []
    scene_ending_times = []

    # List of tracks
    track_list = []
    background_energies = []

    for n in range(len(backgrounds_df)):
        # Get label of background
        label = str(backgrounds_df['label'].loc[n])

        # First check if there are any pattern candidates. Give priority
        # to pattern files.
        candidates = []
        for pattern_format in ['xls', 'json', 'txt', 'csv']:
            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, backgrounds_df['sampleid'].loc[n], pattern_format))

        if len(candidates) == 0:
            # If no patterns are found, search for normal audio files
            candidates = glob.glob('{}/background/{}*.wav'.format(input_path, backgrounds_df['sampleid'].loc[n]))
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = librosa.load(chosen_fname, sr=SR)
        else:
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = render_pattern(chosen_fname, input_path)

        duration = len(wav)/float(SR)
        target_snr_db = float(backgrounds_df['snr'].loc[n])
        target_snr = 10**(target_snr_db/20.0)

        energy = compute_energy(wav)

        logging.debug('{}:energy:{}'.format(label, energy))

        if n == 0:
            # For the first background track, snr
            # gives an amount by which it's going to be scaled (i.e. make it more silent)
            amplitude_factor = target_snr
            wav *= amplitude_factor

        if n > 0:
            noise_energy = compute_energy(np.sum(np.array(track_list), axis=0))
            logging.info('{}:noise_energy:{}'.format(label, noise_energy))

            old_snr = energy/noise_energy
            old_snr_db = 20*np.log10(old_snr)
            logging.info('{}:old_snr:{}'.format(label, old_snr_db))

            amplitude_factor = target_snr/old_snr

            wav *= amplitude_factor
            new_energy = compute_energy(wav)
            new_snr = new_energy/noise_energy
            new_snr_db = 20. * np.log10(new_snr)
            logging.info('{}:new_snr:{}'.format(label, new_snr_db))

        # Track array
        track_arr = np.zeros(int(scene_duration*SR))
        start_times = [0.0]
        end_times = [start_times[-1]+len(wav)/float(SR)]

        # Start with the first time in the list
        new_start_time = start_times[-1]
        new_end_time = end_times[-1]

        while new_start_time < scene_duration:
            offset = duration
            new_start_time += offset
            new_end_time += offset

            start_times.append(new_start_time)
            end_times.append(new_end_time)

        for n, t in enumerate(start_times):
            # We need to be careful with the limits here
            # since numpy will just ignore indexing that
            # exceeds

            # Fading times in case we need to join many
            # consecutive samples together.
            # if n == 0:
            #     # Little fade-out, fade-in to smoothly repeat the
            #     # background.
            #     fade_in_time = 0.0
            #     fade_out_time = 0.01
            # elif n > 0 and n < len(start_times) - 1:
            #     fade_in_time = 0.01
            #     fade_out_time = 0.01
            # else:
            #     fade_in_time = 0.01
            #     fade_out_time = 0.0
            begin = min(_N(t), len(track_arr))
            end = min(len(track_arr), _N(t)+len(wav))

            # Part of the wav to store
            # part = fade(wav[:end-begin],fade_in_time,fade_out_time)
            part = wav[:end-begin]

            track_arr[begin:end] += part

        track_list.append(track_arr)
        scene_arr[:len(track_arr)] += track_arr

        if channel_mode == 'separate':
            librosa.output.write_wav('{}/{}_background_track.wav'.format(output_path, label), track_arr, SR)

        F = librosa.stft(track_arr, 1024)
        energy_prof = librosa.feature.rmse(S=F)
        background_energies.append(energy_prof)

        if figure_verbosity > 0:
            plt.figure()
            plt.subplot(3, 1, 1)
            plt.title('`{}\' background waveform and spectrogram'.format(label))
            librosa.display.waveplot(track_arr, sr=SR)

            # Plot spectrogram
            Fdb = librosa.amplitude_to_db(F)
            plt.subplot(3, 1, 2)
            librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')

            # Plot energy profile
            plt.subplot(3, 1, 3)
            time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T))
            plt.semilogy(time, energy_prof.T)
            plt.xlim([0, len(track_arr)/SR])
            plt.ylabel('energy (rms)')

            # Tidy up and save to file
            plt.tight_layout()
            if append_to_filename:
                plt.savefig('{}/background_{}_{}.{}'.format(output_path, label, append_to_filename, image_format), dpi=300)
            else:
                plt.savefig('{}/background_{}.{}'.format(output_path, label, image_format), dpi=300)

    # Compute total energy of background
    if len(backgrounds_df) > 0:
        background_arr = np.sum(track_list, 0)
        B = librosa.stft(background_arr, 1024)
        background_energy = librosa.feature.rmse(S=B).flatten()
    else:
        background_energy = 0.0

    for n in range(len(events_df)):
        # Get label of track
        label = str(events_df['label'].loc[n])

        # First check if there are any pattern candidates. Give priority
        # to pattern files.
        candidates = []
        for pattern_format in ['xls', 'json', 'txt', 'csv']:
            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, events_df['sampleid'].loc[n], pattern_format))

        if len(candidates) == 0:
            # If no patterns are found, search for normal audio files
            candidates = glob.glob('{}/event/{}*.wav'.format(input_path, events_df['sampleid'].loc[n]))
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = librosa.load(chosen_fname, sr=SR)
        else:
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = render_pattern(chosen_fname, input_path)

        # Apply a fader envelope
        fade_in_time = float(events_df['fade_in_time'].loc[n])
        fade_out_time = float(events_df['fade_out_time'].loc[n])
        wav = fade(wav, fade_in_time, fade_out_time)

        # Set target EBR
        target_ebr = 10**(float(events_df['ebr'].loc[n])/20.0 + np.random.randn()*float(events_df['ebr_stddev'].loc[n])/20.0)

        # Mean time between instances \mu.
        mean_time_between_instances = events_df['mean_time_between_instances'].loc[n]
        track_end_time = events_df['end_time'].loc[n]

        # Track array
        track_arr = np.zeros(int(scene_duration*SR))

        # If \mu is -1, then play the event only once.
        if mean_time_between_instances == -1:
            track_arr[_N(events_df['start_time'].loc[n]):_N(events_df['start_time'].loc[n])+len(wav)] += wav
            start_times = [float(events_df['start_time'].loc[n])]
            end_times = [float(events_df['end_time'].loc[n])]
        else:
            # If 0, then start next sample after this one (set it to the duration of the sample)
            if mean_time_between_instances == 0:
                mean_time_between_instances = len(wav)/float(SR)

            # Store the successive starting and ending times of the events (given e.g. the model)
            # in the following lists.
            start_times = [events_df['start_time'].loc[n]]
            end_times = [start_times[-1]+len(wav)/float(SR)]

            # Start with the first time in the list
            new_start_time = start_times[-1]
            new_end_time = end_times[-1]

            # Until the scene is full
            while new_start_time < track_end_time:
                offset = float(mean_time_between_instances) +\
                    float(events_df['time_between_instances_stddev'].loc[n]*np.random.randn())
                new_start_time += offset
                new_end_time += offset

                # Only exception is if we have set the 'end_cut' flag
                # and the end time of the event surpasses the end time
                # of the track
                if end_cut and new_end_time > track_end_time:
                    break
                else:
                    start_times.append(new_start_time)
                    end_times.append(new_end_time)

            for t in start_times:
                # We need to be careful with the limits here
                # since numpy will just ignore indexing that
                # exceeds the size of the array
                begin = min(_N(t), len(track_arr))
                end = min(len(track_arr), _N(t)+len(wav))

                # Part of the wav to store
                part = wav[:end-begin]

                # If wav file was concatenated, fade out
                # quickly to avoid clicks
                if len(part) < len(wav) and len(part) > fade_out_time*SR:
                    part = fade(part, 0, fade_out_time)

                track_arr[begin:end] += part

        # Compute energies
        F = librosa.stft(track_arr, 1024)
        energy_prof = librosa.feature.rmse(S=F).flatten()

        # Compute current ebr
        if len(backgrounds_df) > 0:
            ebr_prof = energy_prof/background_energy[:len(energy_prof)].flatten()
            curr_ebr = np.max(ebr_prof)
            logging.debug('{}:Target ebr: {}db'.format(label, 20*np.log10(target_ebr)))
            logging.debug('{}:Current track ebr: {}db'.format(label, 20*np.log10(curr_ebr)))

            # Set correct ebr
            track_arr = track_arr/curr_ebr*target_ebr

            Fnew = librosa.stft(track_arr, 1024)
            new_energy_prof = librosa.feature.rmse(S=Fnew).flatten()
            new_ebr_prof = new_energy_prof/background_energy[:len(energy_prof)].flatten()
            new_ebr = np.max(new_ebr_prof)
            logging.debug('{}:New track ebr: {}db'.format(label, 20*np.log10(new_ebr)))

        # Add the track to the scene only after the EBR adjustment above,
        # so that the final mix uses the rescaled track.
        track_list.append(track_arr)
        scene_arr[:len(track_arr)] += track_arr

        if channel_mode == 'separate':
            librosa.output.write_wav('{}/{}_event_track.wav'.format(output_path, label), track_arr/np.max(track_arr), SR)

        if figure_verbosity > 0:
            plt.figure()

            plt.subplot(3, 1, 1)
            plt.title('`{}\' event waveform and spectrogram'.format(label))
            librosa.display.waveplot(track_arr, sr=SR)
            Fdb = librosa.amplitude_to_db(F)
            plt.subplot(3, 1, 2)
            librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')

            # Plot energy profile
            plt.subplot(3, 1, 3)
            time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T))
            plt.semilogy(time, energy_prof.T)
            plt.xlim([0, len(track_arr)/SR])
            plt.ylabel('energy (rms)')

            plt.tight_layout()
            if append_to_filename:
                plt.savefig('{}/event_{}_{}.{}'.format(output_path, label, append_to_filename, image_format), dpi=300)
            else:
                plt.savefig('{}/event_{}.{}'.format(output_path, label, image_format), dpi=300)

        scene_starting_times.append((label, start_times))
        scene_ending_times.append((label, end_times))

    if figure_verbosity > 0:
        plt.figure()
        ax0 = plt.subplot(3, 1, 1)
        plt.title('Synthesized Scene')
        librosa.display.waveplot(scene_arr, sr=SR)
        F = librosa.stft(scene_arr)
        Fdb = librosa.amplitude_to_db(F)
        ax1 = plt.subplot(3, 1, 2)
        librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
        ax2 = plt.subplot(3, 1, 3)
        ax2.set_xlim([0, scene_duration])

        # Get labels
        labels = [s[0] for s in scene_starting_times]

        # If background is active
        if len(backgrounds_df) > 0:
            labels.append('background')

        # Set y axis limit. With a padding of 0.5.
        ax2.set_ylim([-0.5, len(labels)-0.5])

        plt.yticks(range(len(labels)), labels)

        for n in range(len(scene_starting_times)):
            label = scene_starting_times[n][0]
            start_times = scene_starting_times[n][1]
            end_times = scene_ending_times[n][1]
            color = ['r', 'g', 'y'][n % 3]

            for m in range(len(start_times)):
                plt.hlines(y=float(n), xmin=start_times[m], xmax=end_times[m], alpha=0.5, color=color, linewidth=4)
                if figure_verbosity > 2:
                    ax0.axvline(start_times[m], color=color, alpha=0.1)
                    ax0.axvline(end_times[m], color=color, alpha=0.1)
                    ax0.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
                    ax1.axvline(start_times[m], color=color, alpha=0.1)
                    ax1.axvline(end_times[m], color=color, alpha=0.1)
                    ax1.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
                    ax2.axvline(start_times[m], color=color, alpha=0.1)
                    ax2.axvline(end_times[m], color=color, alpha=0.1)
                    ax2.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)

        if len(backgrounds_df) > 0:
            plt.axhline(y=len(scene_starting_times), alpha=0.5, color='k', linewidth=4)

        plt.tight_layout()

        if append_to_filename:
            plt.savefig('{}/scene_{}.{}'.format(output_path, append_to_filename, image_format), dpi=300)
        else:
            plt.savefig('{}/scene.{}'.format(output_path, image_format), dpi=300)

    if figure_verbosity > 1:
        plt.show()

    # Replace nans (i.e. because of division-by-zero) of the scene with zeros.
    scene_arr = np.nan_to_num(scene_arr)

    if channel_mode == 'mono':
        if append_to_filename:
            librosa.output.write_wav('{}/scene_{}.wav'.format(output_path, append_to_filename), scene_arr, SR)
        else:
            librosa.output.write_wav('{}/scene.wav'.format(output_path), scene_arr, SR)

    return scene_arr


def not_implemented():
    print("TODO: not implemented")


if __name__ == "__main__":
    """
    Main function, parses options and calls the simscene generation function
    or a demo. The options given are almost identical to Lagrange et al's
    simscene.
    """
    argparser = argparse.ArgumentParser(
        description="SimScene.py acoustic scene generator",
    )
    argparser.add_argument(
        'input_path',
        type=str,
        help="Path of a directory containing wave files for sound backgrounds (in the `background' sub-directory) or events (in the `event' sub-directory)"
    )
    argparser.add_argument(
        'output_path',
        type=str,
        help="The directory in which the generated scenes and annotations will reside."
    )
    argparser.add_argument(
        'scene_duration',
        type=float,
        help="Duration of the scene in seconds",
    )
    scene_duration = None

    argparser.add_argument(
        '-e', '--score-events',
        type=str,
        help="Score events file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
    )
    score_events = None

    argparser.add_argument(
        '-b', '--score-backgrounds',
        type=str,
        help="Score backgrounds file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
    )
    score_backgrounds = None

    argparser.add_argument(
        '--tag',
        type=str,
        help="Append _TAG_XXX to filenames, where XXX is an increment."
    )
    tag = None

    argparser.add_argument(
        '-N',
        type=int,
        help="Generate N instances of the scene. If not specified, only generate a single instance. Note that if N > 1, the figure verbosity must be less than or equal to 1."
    )
    generate_n = 1

    argparser.add_argument(
        '-t', '--time-mode',
        type=str,
        help="Mode of spacing between events. `generate': values must be set for each track in the score files. `abstract': values are computed from an abstract representation of an existing acoustic scene. `replicate': values are replicated from an existing acoustic scene. (NOT IMPLEMENTED)",
        choices=['generate', 'abstract', 'replicate']
    )
    time_mode = 'generate'

    argparser.add_argument(
        '-R', '--ebr-mode',
        type=str,
        help="Mode for Event to Background power level ratio. `generate': values must be set for each track in the score files. `abstract': values are computed from an abstract representation of an existing acoustic scene. `replicate': values are replicated from an existing acoustic scene. (NOT IMPLEMENTED)",
        choices=['generate', 'abstract', 'replicate']
    )
    ebr_mode = 'generate'

    argparser.add_argument(
        '-A', '--annotation-file',
        type=str,
        help="If -R or -m are selected, source the times or EBRs from ANNOTATION_FILE. ANNOTATION_FILE must be a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file. (NOT IMPLEMENTED)"
    )
    annotation_file = None

    argparser.add_argument(
        '-a', '--audio-file',
        type=str,
        help="If -R or -m are selected, source the times or EBRs from AUDIO_FILE. AUDIO_FILE must be a 44100Hz .wav file. (NOT IMPLEMENTED)"
    )
    audio_file = None

    argparser.add_argument(
        '-v', '--figure-verbosity', action='count',
        help="Increase figure verbosity. (Default) 0 - Don't save or display figures, 1 - Save figures but do not display them, 2 - Save and display figures, 3 - Additionally shade the event regions in the final plot"
    )
    figure_verbosity = 0

    argparser.add_argument(
        '-x', '--image-format',
        help="Image format for the figures",
        choices=['png', 'jpg', 'pdf']
    )
    image_format = 'png'

    argparser.add_argument(
        '-C', '--channel-mode',
        type=str,
        help="Output channel mode. (Default) 'mono' - the whole scene is mixed down to a single mono .wav file, 'separate' - each background and event track is written to its own .wav file instead.",
        choices=['mono', 'separate']
    )
    channel_mode = 'mono'

    # argparser.add_argument(
    #     '-m', '--min-space',
    #     type=float,
    #     help="Minimum space allowed between successive events (seconds). If -1, then allow overlapping between events."
    # )
    min_space = -1

    argparser.add_argument(
        '-c', '--end-cut',
        action='store_true',
        help="If the last sample ends after the scene ends then: if enabled, cut the sample to duration, else remove the sample."
    )
    end_cut = None

    logging.basicConfig(level=logging.DEBUG)

    args = argparser.parse_args()
    if args.input_path:
        input_path = args.input_path
        logging.debug("Using `{}' as input path".format(input_path))
    if args.output_path:
        output_path = args.output_path
        logging.debug("Saving to `{}'".format(output_path))
    if args.scene_duration:
        if not (args.score_backgrounds or args.score_events):
            print("You must provide one of -e or -b")
        else:
            if args.image_format:
                image_format = args.image_format
            if args.channel_mode:
                channel_mode = args.channel_mode
            if args.ebr_mode:
                ebr_mode = args.ebr_mode
                if ebr_mode not in ['generate']:
                    logging.warning("`{}' not yet implemented for EBR_MODE, using default.".format(ebr_mode))
                    ebr_mode = 'generate'
            if args.time_mode:
                time_mode = args.time_mode
                if time_mode not in ['generate']:
                    logging.warning("`{}' not yet implemented for TIME_MODE, using default.".format(time_mode))
                    time_mode = 'generate'
            if args.annotation_file:
                annotations = read_annotations_file(args.annotation_file)

            scene_duration = float(args.scene_duration)

            if args.score_backgrounds:
                score_backgrounds = read_backgrounds_file(args.score_backgrounds)
            else:
                score_backgrounds = []

            if args.score_events:
                score_events = read_events_file(args.score_events)
            else:
                score_events = []

            if args.figure_verbosity:
                figure_verbosity = args.figure_verbosity

            # Honour the -c/--end-cut flag
            if args.end_cut:
                end_cut = args.end_cut

            if args.N:
                generate_n = args.N

            if args.tag:
                tag = args.tag

            if generate_n == 1:
                append_to_filename = None
                simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
                         time_mode=time_mode,
                         ebr_mode=ebr_mode,
                         channel_mode=channel_mode,
                         annotation_file=annotation_file,
                         audio_file=audio_file,
                         figure_verbosity=figure_verbosity,
                         min_space=min_space,
                         end_cut=end_cut,
                         image_format=image_format,
                         append_to_filename=append_to_filename)
            else:
                for n in range(generate_n):
                    if tag:
                        append_to_filename = '{}_{}'.format(tag, n)
                    else:
                        append_to_filename = '{}'.format(n)

                    logging.info("Generating scene {}".format(n))

                    simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
                             time_mode=time_mode,
                             ebr_mode=ebr_mode,
                             channel_mode=channel_mode,
                             annotation_file=annotation_file,
                             audio_file=audio_file,
                             figure_verbosity=min(figure_verbosity, 1),
                             min_space=min_space,
                             end_cut=end_cut,
                             image_format=image_format,
                             append_to_filename=append_to_filename)
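# Illustrative usage (a sketch only; the file names, sample ids, and values
# below are hypothetical and merely show the shapes of input the script expects):
#
#   python simscene.py sounds/ output/ 30 -e events.csv -b backgrounds.csv -v -N 3 --tag demo
#
# events.csv lists one event track per row, with the columns consumed by
# read_events_file(), e.g.:
#
#   label,sampleid,ebr,ebr_stddev,mean_time_between_instances,time_between_instances_stddev,start_time,end_time,fade_in_time,fade_out_time
#   dog,dogbark,0,0,4,0.5,0,30,0.01,0.01
#
# backgrounds.csv lists one background track per row, with the columns consumed
# by read_backgrounds_file(), e.g.:
#
#   label,sampleid,snr
#   street,city_ambience,0
#
# The matching audio would then live in sounds/event/dogbark*.wav and
# sounds/background/city_ambience*.wav (or in sounds/pattern/ for pattern files).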