e@35: #!/bin/python e@35: # -*- coding: utf-8 -*- e@35: # For licensing please see: LICENSE e@35: # Copyright (c) Emmanouil Theofanis Chourdakis e@35: e@35: # Argparse e@35: import argparse e@35: e@35: # Logging e@35: import logging e@35: e@35: # Pandas e@35: import pandas as pd e@35: e@35: # Numpy e@35: import numpy as np e@35: e@35: # Glob e@35: import glob e@35: import random e@35: e@35: # Librosa e@35: import librosa e@35: import librosa.display e@44: e@44: # PySoundfile e@44: import soundfile as sf e@35: e@35: # Matplotlib e@35: import matplotlib.pyplot as plt e@35: e@35: # Tabulate e@35: from tabulate import tabulate e@35: e@41: e@42: def _D(t, sr=44100): e@35: """ e@35: Helper function: Converts time to samples e@35: """ e@35: return int(t*sr) e@35: e@42: e@35: def compute_energy(x): e@35: return np.sqrt(np.mean(x**2)) e@35: e@35: e@41: def timedict_to_dataframe(timedict): e@47: print(timedict) e@46: return pd.DataFrame([(key, val[0], val[1], val[2]) for key in timedict for val in timedict[key]], e@46: columns=('label', 'filename', 'start_time', 'end_time')) e@41: e@47: def timedict_to_txt(timedict): e@47: str_ = "" e@47: for key in timedict: e@47: for val in timedict[key]: e@47: str_ += "{}\t{}\t{}\n".format(float(val[1]), float(val[2]), key) e@47: str_ += '\n' e@47: return str_ e@42: e@35: def render_pattern(fname, input_path, sr=44100): e@35: pattern = read_pattern_file(fname) e@35: e@41: # Store starting and end times in the format e@41: # {'filename': (start_time, end_time)} e@41: e@41: timesdict = {} e@41: e@35: start_times_samples = [] e@35: end_times_samples = [] e@35: durations_samples = [] e@35: wav_files = [] e@41: e@41: pattern_timedict = [] e@41: e@35: for n in range(len(pattern)): e@35: # Try loading the file, e@35: sampleid = pattern['sampleid'].loc[n] e@46: label = pattern['sampleid'].loc[n] e@38: candidates = [] e@38: for pattern_format in ['xls', 'json', 'txt', 'csv']: e@38: candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, sampleid, pattern_format)) e@35: e@38: if len(candidates) == 0: e@42: candidates = glob.glob('{}/event/{}*wav'.format(input_path, sampleid)) e@38: chosen_fname = random.sample(candidates, 1)[0] e@38: e@44: wav, SR = sf.read(chosen_fname) e@38: else: e@38: chosen_fname = random.sample(candidates, 1)[0] e@38: e@38: logging.debug('Loading {}'.format(chosen_fname)) e@41: wav, SR, pattern_timedict = render_pattern(chosen_fname, input_path) e@42: e@35: # For each sound in the pattern file, place it starting from starttime + an offset e@35: # with a mean value of 0 and standard deviation of offset_stddev. The first event can e@35: # not start earlier than time 0. If endtime is defined (not nan), then cut the event at e@35: # end time. e@35: e@35: # Read and assign an amplitude e@35: amplitude_mean = float(pattern['amplitude'].loc[n]) e@35: amplitude_stddev = float(pattern['amplitude_stdev'].loc[n]) e@35: amplitude = amplitude_mean + np.random.randn()*amplitude_stddev e@35: wav *= amplitude e@35: e@42: start_time = max(float(pattern['start_time'].loc[n]), 0) e@35: start_time_samples = int(start_time*SR) e@35: e@35: fade_in_time = float(pattern['fade_in_time'].loc[n]) e@35: fade_out_time = float(pattern['fade_out_time'].loc[n]) e@35: end_time = float(pattern['end_time'].loc[n]) e@35: e@35: # If end_time is not defined (-1 or just empty) e@35: # then just derive it from the length of the sample e@35: if np.isnan(end_time) or float(end_time) == -1: e@35: duration_samples = len(wav) e@35: end_time_samples = start_time_samples + duration_samples e@45: end_time = end_time_samples/float(SR) e@45: e@35: elif end_time - start_time > len(wav)/float(SR): e@35: e@35: # If given end_time is more than start_time + duration of sample e@35: # then pad the file with zeros to reach the desired end time. e@35: duration = end_time - start_time e@35: duration_samples = int(duration*SR) e@35: end_time_samples = start_time_samples + duration_samples e@41: e@41: # Calculate end time in seconds e@41: end_time = end_time_samples/float(SR) e@41: e@35: wav_arr = np.zeros(duration_samples) e@35: wav_arr[:len(wav)] = wav e@35: wav = wav_arr e@35: else: e@35: duration = end_time - start_time e@35: duration_samples = int(duration*SR) e@35: end_time_samples = start_time_samples + duration_samples e@35: e@35: event_render = fade(wav[:duration_samples], fade_in_time, fade_out_time) e@35: e@35: start_times_samples.append(start_time_samples) e@35: end_times_samples.append(end_time_samples) e@35: durations_samples.append(duration_samples) e@35: wav_files.append(event_render) e@35: e@46: if label in timesdict: e@46: timesdict[label].append((chosen_fname,start_time, end_time)) e@41: else: e@46: timesdict[label] = [(chosen_fname,start_time, end_time)] e@41: e@41: for pt in pattern_timedict: e@41: if pt in timesdict: e@41: timesdict[pt] += pattern_timedict[pt] e@41: else: e@41: timesdict[pt] = pattern_timedict[pt] e@41: e@35: pattern_duration = end_time_samples e@35: pattern_arr = np.zeros(pattern_duration) e@35: e@35: for n, s in enumerate(start_times_samples): e@35: wav = wav_files[n] e@35: pattern_arr[s:s+len(wav)] = wav e@41: e@41: return pattern_arr, 44100, timesdict e@41: e@35: e@35: def read_events_file(fname): e@35: if fname[-3:].lower() == 'xls': e@35: df = pd.read_excel(fname) e@35: elif fname[-4:].lower() == 'json': e@35: df = pd.read_json(fname) e@35: elif fname[-3:].lower() in ['txt']: e@35: with open(fname) as f: e@35: s = f.readline() e@42: f.seek(0, 0) e@35: if ',' in s: e@35: sep = ',' e@35: elif '\t' in s: e@35: sep = '\t' e@35: else: e@35: sep = ' ' e@35: logging.warning('Probably no header or malformed .csv. Will try to parse it raw.') e@35: df = pd.read_csv(f, header=None, sep=sep) e@42: df.columns = ['label', e@42: 'sampleid', e@42: 'ebr', e@42: 'ebr_stddev', e@42: 'mean_time_between_instances', e@42: 'time_between_instances_stddev', e@42: 'start_time', e@42: 'end_time', e@42: 'fade_in_time', e@42: 'fade_out_time'] e@35: elif fname[-3:].lower() in ['csv']: e@35: df = pd.read_json(fname) e@35: e@42: logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql')) e@35: return df e@35: e@41: e@35: def read_pattern_file(fname): e@35: if fname[-3:].lower() == 'xls': e@35: df = pd.read_excel(fname) e@35: elif fname[-4:].lower() == 'json': e@35: df = pd.read_json(fname) e@35: elif fname[-3:].lower() in ['txt']: e@35: with open(fname) as f: e@35: s = f.readline() e@42: f.seek(0, 0) e@35: if ',' in s: e@35: sep = ',' e@35: elif '\t' in s: e@35: sep = '\t' e@35: else: e@35: sep = ' ' e@35: logging.warning('Probably no header or malformed .csv. Will try to parse it raw.') e@35: df = pd.read_csv(f, header=None, sep=sep) e@42: df.columns = ['eventid', e@42: 'start_time', e@42: 'end_time', e@42: 'time_offset_stdev', e@42: 'fade_in_time', e@42: 'fade_out_time', e@42: 'amplitude', e@42: 'amplitude_stdev'] e@35: elif fname[-3:].lower() in ['csv']: e@35: df = pd.read_json(fname) e@35: e@42: logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql')) e@35: return df e@41: e@41: e@35: def read_backgrounds_file(fname): e@35: if fname[-3:].lower() == 'xls': e@35: df = pd.read_excel(fname) e@35: elif fname[-4:].lower() == 'json': e@35: df = pd.read_json(fname) e@35: elif fname[-3:].lower() in ['txt']: e@35: with open(fname) as f: e@35: s = f.readline() e@42: f.seek(0, 0) e@35: if ',' in s: e@35: sep = ',' e@35: elif '\t' in s: e@35: sep = '\t' e@35: else: e@35: sep = ' ' e@35: logging.warning('Probably no header or malformed .csv. Will try to parse it raw.') e@35: df = pd.read_csv(f, header=None, sep=sep) e@42: df.columns = ['label', 'sampleid', 'snr'] e@35: elif fname[-3:].lower() in ['csv']: e@35: df = pd.read_json(fname) e@35: e@42: logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql')) e@35: return df e@35: e@41: e@35: def read_annotations_file(fname): e@35: if fname[-3:].lower() == 'xls': e@35: df = pd.read_excel(fname) e@35: elif fname[-4:].lower() == 'json': e@35: df = pd.read_json(fname) e@35: elif fname[-3:].lower() in ['txt', 'csv']: e@35: e@35: with open(fname) as f: e@35: header = f.readline() e@35: e@35: s = f.readline() e@42: f.seek(0, 0) e@35: if ',' in s: e@35: sep = ',' e@35: elif '\t' in s: e@35: sep = '\t' e@35: else: e@35: sep = ' ' e@35: if sep in header: e@35: logging.warning('Probably no header or malformed .csv. Will try to parse it raw.') e@35: df = pd.read_csv(f, header=None, sep=sep) e@35: df.columns = ['start', 'stop', 'class'] e@35: else: e@41: df = pd.read_csv(f, sep=sep) e@35: df.columns = ['start', 'stop', 'class'] e@35: df = None e@35: e@35: logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql')) e@35: return df e@35: e@41: e@35: def run_demo(): e@35: print("TODO: Implement run_demo()") e@35: e@41: e@35: def fade(x, fade_in, fade_out, sr=44100): e@35: """ e@35: Creates a fade-in-fade-out envelope e@35: for audio array x. e@35: """ e@35: e@35: if len(x) == 0: e@35: return x e@35: e@35: fade_in_samples = int(fade_in*sr) e@35: fade_out_samples = int(fade_out*sr) e@35: e@35: outp = np.ones_like(x) e@35: for n in range(fade_in_samples): e@35: outp[n] = n*1./fade_in_samples e@35: e@35: for n in range(fade_out_samples): e@35: outp[len(outp)-fade_out_samples+n] = 1-1./fade_out_samples*n e@35: return outp*x e@35: e@41: e@35: def simscene(input_path, e@35: output_path, e@35: scene_duration, e@35: score_events, e@35: score_backgrounds, e@35: **kwargs): e@47: logging.warning('BER ratios have not yet been verified') e@42: SR = 44100 # Samplerate. Should probably not be hardcoded e@35: e@35: events_df = score_events e@35: backgrounds_df = score_backgrounds e@41: e@41: # Store starting and ending times in the format e@41: # {'filename': [(start_time, end_time), (start_time, end_time), ...]} e@41: timedict = {} e@41: e@35: # Create empty numpy array e@35: scene_arr = np.zeros(int(scene_duration*SR)) e@35: e@35: if 'append_to_filename' in kwargs: e@35: append_to_filename = kwargs['append_to_filename'] e@35: else: e@35: append_to_filename = None e@35: e@35: if 'end_cut' in kwargs: e@35: end_cut = kwargs['end_cut'] e@35: else: e@35: end_cut = False e@35: e@35: if 'figure_verbosity' in kwargs: e@35: figure_verbosity = kwargs['figure_verbosity'] e@35: else: e@35: figure_verbosity = 0 e@35: e@35: if 'image_format' in kwargs: e@35: image_format = kwargs['image_format'] e@35: else: e@35: image_format = 'png' e@47: e@47: if 'annot_format' in kwargs: e@47: annot_format = kwargs['annot_format'] e@47: else: e@47: annot_format = 'sed_eval' e@51: e@51: if 'full_duration' in kwargs: e@51: full_duration = True e@51: else: e@51: full_duration = False e@35: e@35: # Stores the starting and ending times of every track for visualization e@35: # purposes e@35: scene_starting_times = [] e@35: scene_ending_times = [] e@35: e@35: # List of tracks e@35: track_list = [] e@35: background_energies = [] e@41: e@35: for n in range(len(backgrounds_df)): e@35: # Get label of background e@35: label = str(backgrounds_df['label'].loc[n]) e@35: e@35: # First check if there are any pattern candidates. Give priorities e@35: # To pattern files. e@35: candidates = [] e@41: e@41: # List of pattern start and end times e@41: pattern_timedict = [] e@41: e@35: for pattern_format in ['xls', 'json', 'txt', 'csv']: e@42: candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, e@42: backgrounds_df['sampleid'].loc[n], e@42: pattern_format)) e@35: e@35: if len(candidates) == 0: e@35: # If no patterns are found, search for normal audio files e@35: candidates = glob.glob('{}/background/{}*.wav'.format(input_path, backgrounds_df['sampleid'].loc[n])) e@35: chosen_fname = random.sample(candidates, 1)[0] e@44: wav, sr = sf.read(chosen_fname) e@35: else: e@35: chosen_fname = random.sample(candidates, 1)[0] e@41: wav, sr, pattern_timedict = render_pattern(chosen_fname, input_path) e@41: e@35: duration = len(wav)/float(SR) e@35: target_snr_db = float(backgrounds_df['snr'].loc[n]) e@35: target_snr = 10**(target_snr_db/20.0) e@35: e@35: energy = compute_energy(wav) e@35: e@41: logging.debug('{}:energy:{}'.format(label, energy)) e@41: e@35: if n == 0: e@35: # For the first background track, snr e@35: # gives an amount by which it's going to be scaled (i.e. make it more silent) e@35: amplitude_factor = target_snr e@35: wav *= amplitude_factor e@35: e@35: if n > 0: e@35: noise_energy = compute_energy(np.sum(np.array(track_list), axis=0)) e@41: logging.info('{}:noise_energy:{}'.format(label, noise_energy)) e@35: e@35: old_snr = energy/noise_energy e@35: old_snr_db = 20*np.log10(old_snr) e@41: logging.info('{}:old_snr:{}'.format(label, old_snr_db)) e@35: e@35: amplitude_factor = target_snr/old_snr e@35: e@35: wav *= amplitude_factor e@35: new_energy = compute_energy(wav) e@35: new_snr = new_energy/noise_energy e@35: new_snr_db = 20. * np.log10(new_snr) e@41: logging.info('{}:new_snr:{}'.format(label, new_snr_db)) e@41: e@35: # Track array e@35: track_arr = np.zeros(int(scene_duration*SR)) e@35: start_times = [0.0] e@35: end_times = [start_times[-1]+len(wav)/float(SR)] e@35: e@35: # Start with the first time in the list e@35: new_start_time = start_times[-1] e@35: new_end_time = end_times[-1] e@35: e@46: if label in timedict: e@46: timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time))) e@41: else: e@46: timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))] e@41: e@35: while new_start_time < scene_duration: e@35: offset = duration e@35: new_start_time += offset e@41: e@41: # If already exceeded scene, break e@41: if new_start_time >= scene_duration: e@41: break e@41: e@35: new_end_time += offset e@35: e@35: start_times.append(new_start_time) e@35: end_times.append(new_end_time) e@35: e@41: # Update timesdict noting where each filename starts and stops e@46: if label in timedict: e@46: timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time))) e@41: else: e@46: timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))] e@41: e@41: # Also update the times from the patterns e@41: for pt in pattern_timedict: e@46: pattern_timedict[pt] = [(s0, s[1] + new_start_time, s[2] + new_start_time) for s in e@41: pattern_timedict[pt]] e@41: e@41: if pt in timedict: e@41: timedict[pt] += pattern_timedict[pt] e@41: else: e@41: timedict[pt] = pattern_timedict[pt] e@41: e@41: # And add those to the timedict dictionary e@41: e@42: for t in start_times: e@35: # We need to be careful with the limits here e@35: # since numpy will just ignore indexing that e@35: # exceeds e@35: e@35: # Fading times in case we need to join many e@35: # consecutive samples together. e@35: # if n == 0: e@35: # # Little fade-out, fade-in to smoothly repeat the e@35: # # background. e@35: # fade_in_time = 0.0 e@35: # fade_out_time = 0.01 e@35: # elif n > 0 and n < len(start_times) - 1: e@35: # fade_in_time = 0.01 e@35: # fade_out_time = 0.01 e@35: # else: e@35: # fade_in_time = 0.01 e@35: # fade_out_time = 0.0 e@42: begin = min(_D(t), len(track_arr)) e@42: end = min(len(track_arr), _D(t) + len(wav)) e@35: e@35: # Part of the wav to store e@35: # part = fade(wav[:end-begin],fade_in_time,fade_out_time) e@35: part = wav[:end-begin] e@35: track_arr[begin:end] += part e@35: e@35: track_list.append(track_arr) e@35: scene_arr[:len(track_arr)] += track_arr e@35: e@35: if channel_mode == 'separate': e@35: librosa.output.write_wav('{}/{}_background_track.wav'.format(output_path, label), track_arr, SR) e@35: e@35: F = librosa.stft(track_arr, 1024) e@35: energy_prof = librosa.feature.rmse(S=F) e@35: background_energies.append(energy_prof) e@35: e@35: if figure_verbosity > 0: e@35: plt.figure() e@35: plt.subplot(3, 1, 1) e@35: plt.title('`{}\' background waveform and spectrogram'.format(label)) e@41: librosa.display.waveplot(track_arr, sr=SR) e@35: e@35: # Plot spectrogram e@35: Fdb = librosa.amplitude_to_db(F) e@35: plt.subplot(3, 1, 2) e@35: librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz') e@35: e@35: # Plot energy profile e@35: plt.subplot(3, 1, 3) e@35: time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T)) e@35: plt.semilogy(time, energy_prof.T) e@35: plt.xlim([0, len(track_arr)/SR]) e@35: plt.ylabel('energy (rms)') e@41: e@35: # Tidy up and save to file e@35: plt.tight_layout() e@35: if append_to_filename: e@42: plt.savefig('{}/background_{}_{}.{}'.format(output_path, e@42: label, e@42: append_to_filename, e@42: image_format), e@42: dpi=300) e@35: else: e@42: plt.savefig('{}/background_{}.{}'.format(output_path, e@42: label, e@42: image_format), e@42: dpi=300) e@35: e@35: # Compute total energy of background e@35: if len(backgrounds_df) > 0: e@35: background_arr = np.sum(track_list, 0) e@35: B = librosa.stft(background_arr, 1024) e@35: background_energy = librosa.feature.rmse(S=B).flatten() e@35: else: e@35: background_energy = 0.0 e@41: e@35: for n in range(len(events_df)): e@35: # Get label of track e@35: label = str(events_df['label'].loc[n]) e@35: e@35: # First check if there are any pattern candidates. Give priorities e@35: # To pattern files. e@35: candidates = [] e@41: e@41: # List of pattern start and end times e@41: pattern_timedict = [] e@41: e@35: for pattern_format in ['xls', 'json', 'txt', 'csv']: e@42: candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, e@42: events_df['sampleid'].loc[n], e@42: pattern_format)) e@35: e@35: if len(candidates) == 0: e@35: # If no patterns are found, search for normal audio files e@35: candidates = glob.glob('{}/event/{}*.wav'.format(input_path, events_df['sampleid'].loc[n])) e@35: chosen_fname = random.sample(candidates, 1)[0] e@44: wav, sr = sf.read(chosen_fname) e@35: else: e@35: chosen_fname = random.sample(candidates, 1)[0] e@41: wav, sr, pattern_timedict = render_pattern(chosen_fname, input_path) e@41: e@42: logging.debug(chosen_fname) e@35: # Apply a fader envelope e@35: fade_in_time = float(events_df['fade_in_time'].loc[n]) e@35: fade_out_time = float(events_df['fade_out_time'].loc[n]) e@35: wav = fade(wav, fade_in_time, fade_out_time) e@35: e@35: # Set target EBR e@42: target_ebr = 10**(float(events_df['ebr'].loc[n])/20.0 + e@42: np.random.randn()*float(events_df['ebr_stddev'].loc[n])/20.0) e@35: e@35: # Mean time between instances \mu. e@35: mean_time_between_instances = events_df['mean_time_between_instances'].loc[n] e@35: track_end_time = events_df['end_time'].loc[n] e@35: e@35: # Track array e@35: track_arr = np.zeros(int(scene_duration*SR)) e@35: e@42: # If \mu is -1, then play the event only once. e@35: if mean_time_between_instances == -1: e@42: track_arr[_D(events_df['start_time'].loc[n]):_D(events_df['start_time'].loc[n]) + len(wav)] += wav e@35: start_times = [float(events_df['start_time'].loc[n])] e@35: end_times = [float(events_df['end_time'].loc[n])] e@41: e@41: new_start_time = start_times[-1] e@41: new_end_time = end_times[-1] e@41: e@46: if label in timedict: e@46: timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time))) e@42: else: e@46: timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))] e@42: e@41: for pt in pattern_timedict: e@46: pattern_timedict[pt] = [(s[0], s[1] + new_start_time, s[2] + new_start_time) for s in e@41: pattern_timedict[pt]] e@41: e@41: if pt in timedict: e@41: timedict[pt] += pattern_timedict[pt] e@41: else: e@41: timedict[pt] = pattern_timedict[pt] e@41: e@35: else: e@35: # If 0, then start next sample after this one (set it to the duration of the sample) e@35: if mean_time_between_instances == 0: e@35: mean_time_between_instances = len(wav)/float(SR) e@51: e@51: # If we are using -fd (full_duration) for each event then mean_time_between_instances denotes time AFTER e@51: # the end of the previous event. e@51: if full_duration and mean_time_between_instances > 0: e@51: mean_time_between_instances += len(wav)/float(SR) e@35: e@35: # Store the successive starting and ending times of the events (given e.g. the model) e@35: # in the following lists. e@35: start_times = [events_df['start_time'].loc[n]] e@35: end_times = [start_times[-1]+len(wav)/float(SR)] e@35: e@35: # Start with the first time in the list e@35: new_start_time = start_times[-1] e@35: new_end_time = end_times[-1] e@35: e@41: if chosen_fname in timedict: e@46: timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time))) e@41: else: e@46: timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))] e@41: e@35: # Until the scene is full e@35: while new_start_time < track_end_time: e@35: offset = float(mean_time_between_instances) +\ e@35: float(events_df['time_between_instances_stddev'].loc[n]*np.random.randn()) e@35: new_start_time += offset e@41: e@41: # If already exceeded scene, break e@41: if new_start_time >= scene_duration: e@41: break e@41: e@35: new_end_time += offset e@35: e@35: # Only exception is if we have set the 'end_cut' flag e@35: # and the end time of the event surpasses the end time e@35: # of the track e@35: if end_cut and new_end_time > track_end_time: e@35: break e@35: else: e@35: start_times.append(new_start_time) e@35: end_times.append(new_end_time) e@35: e@46: if label in timedict: e@46: timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time))) e@41: else: e@46: timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))] e@41: e@41: # Also update the times from the patterns e@41: for pt in pattern_timedict: e@48: pattern_timedict[pt] = [(s[0], s[1] + new_start_time, s[2] + new_start_time) for s in e@41: pattern_timedict[pt]] e@41: e@41: if pt in timedict: e@41: timedict[pt] += pattern_timedict[pt] e@41: else: e@41: timedict[pt] = pattern_timedict[pt] e@41: e@35: for t in start_times: e@35: # We need to be careful with the limits here e@35: # since numpy will just ignore indexing that e@35: # exceeds the size of the array e@42: begin = min(_D(t), len(track_arr)) e@42: end = min(len(track_arr), _D(t) + len(wav)) e@35: e@35: # Part of the wav to store e@35: part = wav[:end-begin] e@35: e@35: # If wav file was concatenated, fade out e@35: # quickly to avoid clicks e@42: if len(wav) > len(part) > fade_out_time*SR: e@35: part = fade(part, 0, fade_out_time) e@35: e@35: track_arr[begin:end] += part e@35: e@35: track_list.append(track_arr) e@35: scene_arr[:len(track_arr)] += track_arr e@35: e@35: # Compute energies e@35: F = librosa.stft(track_arr, 1024) e@35: energy_prof = librosa.feature.rmse(S=F).flatten() e@35: e@35: # Compute current ebr e@35: e@35: if len(backgrounds_df) > 0: e@35: ebr_prof = energy_prof/background_energy[:len(energy_prof)].flatten() e@35: curr_ebr = np.max(ebr_prof) e@42: logging.debug('{}:Target ebr: {}db'.format(label, e@42: 20*np.log10(target_ebr))) e@42: logging.debug('{}:Current track ebr: {}db'.format(label, e@42: 20*np.log10(curr_ebr))) e@35: e@35: # Set correct ebr e@35: track_arr = track_arr/curr_ebr*target_ebr e@35: e@35: Fnew = librosa.stft(track_arr, 1024) e@35: new_energy_prof = librosa.feature.rmse(S=Fnew).flatten() e@35: new_ebr_prof = new_energy_prof/background_energy[:len(energy_prof)].flatten() e@35: new_ebr = np.max(new_ebr_prof) e@42: logging.debug('{}:New track ebr: {}db'.format(label, 20*np.log10(new_ebr))) e@35: e@35: if channel_mode == 'separate': e@44: sf.write('{}/{}_event_track.wav'.format(output_path, label), e@42: track_arr/np.max(track_arr), e@42: SR) e@35: e@35: if figure_verbosity > 0: e@35: plt.figure() e@35: e@42: plt.subplot(3, 1, 1) e@35: plt.title('`{}\' event waveform and spectrogram'.format(label)) e@35: e@42: librosa.display.waveplot(track_arr, sr=SR) e@35: Fdb = librosa.amplitude_to_db(F) e@35: plt.subplot(3, 1, 2) e@35: librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz') e@35: e@35: # Plot energy profile e@35: plt.subplot(3, 1, 3) e@35: time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T)) e@35: plt.semilogy(time, energy_prof.T) e@35: plt.xlim([0, len(track_arr)/SR]) e@35: plt.ylabel('energy (rms)') e@35: e@35: plt.tight_layout() e@35: if append_to_filename: e@35: plt.savefig('{}/event_{}_{}.{}'.format(output_path, label, append_to_filename, image_format), dpi=300) e@35: else: e@35: plt.savefig('{}/event_{}.{}'.format(output_path, label, image_format), dpi=300) e@35: e@35: scene_starting_times.append((label, start_times)) e@35: scene_ending_times.append((label, end_times)) e@35: e@35: if figure_verbosity > 0: e@35: plt.figure() e@42: ax0 = plt.subplot(3, 1, 1) e@35: plt.title('Synthesized Scene') e@35: librosa.display.waveplot(scene_arr, sr=SR) e@35: F = librosa.stft(scene_arr) e@35: Fdb = librosa.amplitude_to_db(F) e@42: ax1 = plt.subplot(3, 1, 2) e@35: librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz') e@42: ax2 = plt.subplot(3, 1, 3) e@42: ax2.set_xlim([0, scene_duration]) e@35: e@35: # Get labels e@35: labels = [s[0] for s in scene_starting_times] e@35: e@35: # If background is active e@35: if len(backgrounds_df) > 0: e@35: labels.append('background') e@35: e@35: # Set y axis limit. With a padding of 0.5. e@35: ax2.set_ylim([-0.5, len(labels)-0.5]) e@35: e@35: plt.yticks(range(len(labels)), labels) e@35: e@35: for n in range(len(scene_starting_times)): e@35: start_times = scene_starting_times[n][1] e@35: end_times = scene_ending_times[n][1] e@35: color = ['r', 'g', 'y'][n % 3] e@35: e@35: for m in range(len(start_times)): e@35: plt.hlines(y=float(n), xmin=start_times[m], xmax=end_times[m], alpha=0.5, color=color, linewidth=4) e@35: if figure_verbosity > 2: e@35: ax0.axvline(start_times[m], color=color, alpha=0.1) e@35: ax0.axvline(end_times[m], color=color, alpha=0.1) e@35: ax0.axvspan(start_times[m], end_times[m], color=color, alpha=0.1) e@35: ax1.axvline(start_times[m], color=color, alpha=0.1) e@35: ax1.axvline(end_times[m], color=color, alpha=0.1) e@35: ax1.axvline(end_times[m], color=color, alpha=0.1) e@35: ax1.axvspan(start_times[m], end_times[m], color=color, alpha=0.1) e@35: ax2.axvline(start_times[m], color=color, alpha=0.1) e@35: ax2.axvline(end_times[m], color=color, alpha=0.1) e@35: ax2.axvline(end_times[m], color=color, alpha=0.1) e@35: ax2.axvspan(start_times[m], end_times[m], color=color, alpha=0.1) e@35: e@35: if len(backgrounds_df) > 0: e@35: plt.axhline(y=len(scene_starting_times), alpha=0.5, color='k', linewidth=4) e@35: e@35: plt.tight_layout() e@35: e@35: if append_to_filename: e@35: plt.savefig('{}/scene_{}.{}'.format(output_path, append_to_filename, image_format), dpi=300) e@35: else: e@35: plt.savefig('{}/scene.{}'.format(output_path, image_format), dpi=300) e@41: e@47: if annot_format == 'sed_eval': e@47: timedict_txt = timedict_to_txt(timedict) e@47: logging.debug(timedict_txt) e@41: e@47: if append_to_filename: e@47: with open('{}/scene_{}_offsets.csv'.format(output_path, append_to_filename), 'w') as f: e@47: f.write(timedict_txt) e@47: else: e@47: with open('{}/scene_offsets.csv'.format(output_path), 'w') as f: e@47: f.write(timedict_txt) e@47: e@47: elif annot_format == 'pandas': e@47: timedict_df = timedict_to_dataframe(timedict) e@47: logging.debug(timedict_df) e@47: e@47: if append_to_filename: e@47: timedict_df.to_csv('{}/scene_{}_offsets.csv'.format(output_path, append_to_filename)) e@47: else: e@47: timedict_df.to_csv('{}/scene_offsets.csv'.format(output_path)) e@41: e@35: if figure_verbosity > 1: e@35: plt.show() e@35: e@35: # Replace nans (i.e. because of division-by-zero) of the scene with zeros. e@35: scene_arr = np.nan_to_num(scene_arr) e@35: e@35: if channel_mode == 'mono': e@35: if append_to_filename: e@44: sf.write('{}/scene_{}.wav'.format(output_path, append_to_filename), scene_arr, SR) e@35: else: e@44: sf.write('{}/scene.wav'.format(output_path), scene_arr, SR) e@41: e@41: # Print timesdict e@35: e@35: return scene_arr e@41: e@41: e@35: def not_implemented(): e@41: logging.info("TODO: not implemented") e@41: e@41: e@41: if __name__ == "__main__": e@35: """ e@35: Main function, parses options and calls the simscene generation function e@35: or a demo. The options given are almost identical to Lagrange et al's e@35: simscene. e@35: """ e@35: argparser = argparse.ArgumentParser( e@35: description="SimScene.py acoustic scene generator", e@35: ) e@35: argparser.add_argument( e@35: 'input_path', e@35: type=str, e@42: help="Path of a directory containing wave files for sound backgrounds" e@42: "(in the `background' sub-directory) or events (in `event')" e@35: ) e@42: e@42: input_path = '.' e@42: e@35: argparser.add_argument( e@35: 'output_path', e@35: type=str, e@35: help="The directory the generated scenes and annotations will reside." e@42: ) e@42: e@42: output_path = '.' e@42: e@35: argparser.add_argument( e@35: 'scene_duration', e@35: type=float, e@35: help="Duration of scene in seconds", e@35: ) e@35: scene_duration = None e@35: e@35: argparser.add_argument( e@35: '-e', '--score-events', e@35: type=str, e@35: help="Score events file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file" e@35: ) e@35: score_events = None e@35: e@35: argparser.add_argument( e@35: '-b', '--score-backgrounds', e@35: type=str, e@35: help="Score backgrounds file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file" e@35: ) e@35: score_backgrounds = None e@35: e@35: argparser.add_argument( e@35: '--tag', e@35: type=str, e@35: help="Append _TAG_XXX to filenames, where XXX is an increment." e@35: ) e@35: tag = None e@35: e@35: argparser.add_argument( e@35: '-N', e@35: type=int, e@42: help="Generate N instances of the scene. If not specified only generate a single instance. Note that if N > 1, " e@42: "then the verbosity must be less or equal to 1" e@35: ) e@35: generate_n = 1 e@35: e@35: argparser.add_argument( e@35: '-t', '--time-mode', e@35: type=str, e@42: help="Mode of spacing between events. `generate': values must be set for each track in the score files. " e@42: "`abstract': values are computed from an abstract representation of an existing acoustic scene. " e@42: "`replicate': values are replicated from an existing acousting scene. (NOT IMPLEMENTED)", e@35: choices=['generate', 'abstract', 'replicate'] e@35: ) e@35: time_mode = 'generate' e@51: e@51: argparser.add_argument( e@51: '-fd', '--full-duration', e@51: action='store_true', e@51: help="If enabled, times specified in the recipe refer to after the previous file finishes." e@51: ) e@51: full_duration = False e@51: e@35: argparser.add_argument( e@35: '-R', '--ebr-mode', e@35: type=str, e@42: help="Mode for Event to Background power level ratio. `generate': values must be set for each track in the " e@42: "score files. `abstract': values are computed from an abstract representation of an existing acoustic " e@42: "scene. `replicate': values are replicated from an existing acousting scene. (NOT IMPLEMENTED)", e@35: choices=['generate', 'abstract', 'replicate'] e@35: ) e@35: ebr_mode = 'generate' e@35: e@35: argparser.add_argument( e@35: '-A', '--annotation-file', e@35: type=float, e@42: help="If -R or -m are selected, this provides the source for sourcing the times or EBRs from ANNOTATION_FILE. " e@42: "ANNOTATION_FILE must be comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls). " e@42: "(NOT IMPLEMENTED)" e@35: ) e@35: annotation_file = None e@35: e@35: argparser.add_argument( e@35: '-a', '--audio-file', e@35: type=float, e@42: help="If -R or -m are selected, this provides the source for sourcing the times or EBRs " e@42: "from AUDIO_FILE. AUDIO_FILE must be a 44100Hz .wav file. (NOT IMPLEMENTED)" e@35: ) e@35: audio_file = None e@35: e@35: argparser.add_argument( e@35: '-v', '--figure-verbosity', action='count', e@42: help="Increase figure verbosity. (Default) 0 - Don't save or display figures, 1 - Save pictures but do not " e@42: "display them, 2 - Save and display figures, 3 - Add shades over the events in the final plot" e@35: ) e@35: figure_verbosity = 0 e@35: e@35: argparser.add_argument( e@35: '-x', '--image-format', e@35: help="Image format for the figures", e@35: choices=['png', 'jpg', 'pdf'] e@35: ) e@35: image_format = 'png' e@35: e@35: argparser.add_argument( e@35: '-C', '--channel-mode', e@35: type=str, e@42: help="number of audio channels contained in file. (Default) 'mono' - 1 channel (mono), 'separate' - Same as " e@42: "'classes', each channel is saved in a separate .wav file.", e@35: choices=['mono', 'separate'] e@35: ) e@35: channel_mode = 'mono' e@35: e@35: argparser.add_argument( e@35: '-c', '--end-cut', e@35: action='store_true', e@42: help="If the last sample ends after the scene ends then: if enabled, cut the sample to duration, " e@42: "else remove the sample." e@35: ) e@35: end_cut = None e@42: e@42: argparser.add_argument( e@42: '-L', '--logging-level', e@42: type=str, e@42: help="Set lowest logging level", e@42: choices=['debug', 'warning', 'info'] e@42: ) e@42: e@47: argparser.add_argument( e@47: '--annot-format', e@47: type=str, e@47: help="Annotation format for generated scenes. Choices are: 'sed_eval' (default) - Format appropriate for " e@47: "DCASE 2017 challenge evaluator, 'pandas' - A more detailed format for the form ", e@47: choices=['sed_eval', 'pandas'] e@47: ) e@47: e@35: args = argparser.parse_args() e@42: e@42: if args.logging_level: e@42: if args.logging_level == 'debug': e@42: logging.basicConfig(level=logging.DEBUG) e@42: elif args.logging_level == 'info': e@42: logging.basicConfig(level=logging.INFO) e@42: elif args.logging_level == 'warning': e@42: logging.basicConfig(level=logging.WARNING) e@42: else: e@42: logging.basicConfig(level=logging.INFO) e@42: e@35: if args.input_path: e@35: input_path = args.input_path e@35: logging.debug("Using `{}' as input path".format(input_path)) e@35: if args.output_path: e@35: output_path = args.output_path e@35: logging.debug("Saving to `{}'".format(output_path)) e@51: if args.full_duration: e@51: full_duration = True e@35: if args.scene_duration: e@35: if not (args.score_backgrounds or args.score_events): e@35: print("You must provide one of -e or -b") e@35: else: e@35: if args.image_format: e@35: image_format = args.image_format e@35: if args.channel_mode: e@35: channel_mode = args.channel_mode e@35: if args.ebr_mode: e@35: ebr_mode = args.ebr_mode e@35: if ebr_mode not in ['generate']: e@35: logging.warning("`{}' not yet implemented for EBR_MODE, using default.".format(ebr_mode)) e@35: ebr_mode = 'generate' e@35: if args.time_mode: e@35: time_mode = args.time_mode e@35: if time_mode not in ['generate']: e@35: logging.warning("`{}' not yet implemented for TIME_MODE, using default.".format(time_mode)) e@35: time_mode = 'generate' e@35: if args.annotation_file: e@35: annotations = read_annotations_file(args.annotation_file) e@35: e@35: scene_duration = float(args.scene_duration) e@35: e@35: if args.score_backgrounds: e@35: score_backgrounds = read_backgrounds_file(args.score_backgrounds) e@35: else: e@35: score_backgrounds = [] e@35: e@35: if args.score_events: e@35: score_events = read_events_file(args.score_events) e@35: else: e@35: score_events = [] e@35: e@35: if args.figure_verbosity: e@35: figure_verbosity = args.figure_verbosity e@35: e@35: if args.N: e@35: generate_n = args.N e@35: e@35: if args.tag: e@35: tag = args.tag e@35: e@35: if generate_n == 1: e@35: append_to_filename = None e@35: simscene(input_path, output_path, scene_duration, score_events, score_backgrounds, e@35: time_mode=time_mode, e@35: ebr_mode=ebr_mode, e@35: channel_mode=channel_mode, e@35: annotation_file=annotation_file, e@35: audio_file=audio_file, e@35: figure_verbosity=figure_verbosity, e@35: end_cut=end_cut, e@35: image_format=image_format, e@51: append_to_filename=append_to_filename, e@51: full_duration=full_duration) e@35: else: e@35: for n in range(generate_n): e@35: if tag: e@35: append_to_filename = '{}_{}'.format(tag, n) e@35: else: e@35: append_to_filename = '{}'.format(n) e@35: e@35: logging.info("Generating scene {}".format(n)) e@35: e@35: simscene(input_path, output_path, scene_duration, score_events, score_backgrounds, e@35: time_mode=time_mode, e@35: ebr_mode=ebr_mode, e@35: channel_mode=channel_mode, e@35: annotation_file=annotation_file, e@35: audio_file=audio_file, e@35: figure_verbosity=min(figure_verbosity, 1), e@35: end_cut=end_cut, e@35: image_format=image_format, e@51: append_to_filename=append_to_filename, e@51: full_duration=full_duration)