changeset 41:d97f5b9ac6a9
changed to conform more to pep8 (still needs work); made it generate a .csv at the end listing the individual wav files and the start and end times of each occurrence
author      Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date        Sat, 07 Oct 2017 15:22:02 +0100
parents     01106e64c5aa
children    f30d2066eebb
files       python/simscene.py
diffstat    1 files changed, 159 insertions(+), 49 deletions(-)
--- a/python/simscene.py	Thu Oct 05 19:42:37 2017 +0100
+++ b/python/simscene.py	Sat Oct 07 15:22:02 2017 +0100
@@ -35,6 +35,8 @@
 # Tabulate
 from tabulate import tabulate
+
+
 def _N(t, sr=44100):
     """
     Helper function: Converts time to samples
@@ -50,14 +52,25 @@
 #    x = np.array([[ii+jj for jj in range(w)] for ii in range(len(x)-w)])
 #    return np.sqrt(np.mean(x**2, 1))
 
+def timedict_to_dataframe(timedict):
+    return pd.DataFrame([(key, val[0], val[1]) for key in timedict for val in timedict[key]],
+                        columns=('filename', 'start_time', 'end_time'))
+
+
 def render_pattern(fname, input_path, sr=44100):
     pattern = read_pattern_file(fname)
 
+    # Store starting and end times in the format
+    # {'filename': (start_time, end_time)}
+
+    timesdict = {}
+
     start_times_samples = []
     end_times_samples = []
     durations_samples = []
     wav_files = []
-    
+
+    pattern_timedict = []
+
     for n in range(len(pattern)):
         # Try loading the file,
         sampleid = pattern['sampleid'].loc[n]
@@ -74,7 +87,7 @@
             chosen_fname = random.sample(candidates, 1)[0]
             logging.debug('Loading {}'.format(chosen_fname))
 
-            wav, SR = render_pattern(chosen_fname, input_path)
+            wav, SR, pattern_timedict = render_pattern(chosen_fname, input_path)
 
         # For each sound in the pattern file, place it starting from starttime + an offset
@@ -107,6 +120,10 @@
             duration = end_time - start_time
             duration_samples = int(duration*SR)
             end_time_samples = start_time_samples + duration_samples
+
+            # Calculate end time in seconds
+            end_time = end_time_samples/float(SR)
+
             wav_arr = np.zeros(duration_samples)
             wav_arr[:len(wav)] = wav
             wav = wav_arr
@@ -122,14 +139,26 @@
         durations_samples.append(duration_samples)
         wav_files.append(event_render)
 
+        if chosen_fname in timesdict:
+            timesdict[chosen_fname].append((start_time, end_time))
+        else:
+            timesdict[chosen_fname] = [(start_time, end_time)]
+
+        for pt in pattern_timedict:
+            if pt in timesdict:
+                timesdict[pt] += pattern_timedict[pt]
+            else:
+                timesdict[pt] = pattern_timedict[pt]
+
     pattern_duration = end_time_samples
     pattern_arr = np.zeros(pattern_duration)
     for n, s in enumerate(start_times_samples):
         wav = wav_files[n]
         pattern_arr[s:s+len(wav)] = wav
-
-    return pattern_arr, 44100
+
+    return pattern_arr, 44100, timesdict
+
 
 def read_events_file(fname):
     if fname[-3:].lower() == 'xls':
@@ -155,6 +184,7 @@
     logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
     return df
 
+
 def read_pattern_file(fname):
     if fname[-3:].lower() == 'xls':
         df = pd.read_excel(fname)
@@ -178,7 +208,8 @@
     logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
     return df
-    
+
+
 def read_backgrounds_file(fname):
     if fname[-3:].lower() == 'xls':
         df = pd.read_excel(fname)
@@ -203,6 +234,7 @@
     logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
     return df
 
+
 def read_annotations_file(fname):
     if fname[-3:].lower() == 'xls':
         df = pd.read_excel(fname)
@@ -226,16 +258,18 @@
             df = pd.read_csv(f, header=None, sep=sep)
             df.columns = ['start', 'stop', 'class']
         else:
+            df = pd.read_csv(f, sep=sep)
             df.columns = ['start', 'stop', 'class']
-            df = pd.read_csv(f, sep=sep)
     df = None
     logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
     return df
 
+
 def run_demo():
     print("TODO: Implement run_demo()")
 
+
 def fade(x, fade_in, fade_out, sr=44100):
     """
     Creates a fade-in-fade-out envelope
@@ -256,6 +290,7 @@
         outp[len(outp)-fade_out_samples+n] = 1-1./fade_out_samples*n
     return outp*x
 
+
 def simscene(input_path,
              output_path,
             scene_duration,
@@ -267,7 +302,11 @@
 
     events_df = score_events
     backgrounds_df = score_backgrounds
-    
+
+    # Store starting and ending times in the format
+    # {'filename': [(start_time, end_time), (start_time, end_time), ...]}
+    timedict = {}
+
     # Create empty numpy array
     scene_arr = np.zeros(int(scene_duration*SR))
 
@@ -299,7 +338,7 @@
     # List of tracks
     track_list = []
     background_energies = []
-    
+
     for n in range(len(backgrounds_df)):
         # Get label of background
         label = str(backgrounds_df['label'].loc[n])
@@ -307,6 +346,10 @@
         # First check if there are any pattern candidates. Give priorities
         # To pattern files.
         candidates = []
+
+        # List of pattern start and end times
+        pattern_timedict = []
+
         for pattern_format in ['xls', 'json', 'txt', 'csv']:
             candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path,
                 backgrounds_df['sampleid'].loc[n], pattern_format))
@@ -317,7 +360,8 @@
             wav, sr = librosa.load(chosen_fname, sr=SR)
         else:
             chosen_fname = random.sample(candidates, 1)[0]
-            wav, sr = render_pattern(chosen_fname, input_path)
+            wav, sr, pattern_timedict = render_pattern(chosen_fname, input_path)
+
         duration = len(wav)/float(SR)
 
         target_snr_db = float(backgrounds_df['snr'].loc[n])
@@ -325,9 +369,8 @@
         energy = compute_energy(wav)
 
-        logging.debug('{}:energy:{}'.format(label,energy))
-
-
+        logging.debug('{}:energy:{}'.format(label, energy))
+
         if n == 0:
             # For the first background track, snr
             # gives an amount by which it's going to be scaled (i.e. make it more silent)
@@ -336,40 +379,67 @@
 
         if n > 0:
             noise_energy = compute_energy(np.sum(np.array(track_list), axis=0))
-            logging.info('{}:noise_energy:{}'.format(label,noise_energy))
+            logging.info('{}:noise_energy:{}'.format(label, noise_energy))
             old_snr = energy/noise_energy
             old_snr_db = 20*np.log10(old_snr)
-            logging.info('{}:old_snr:{}'.format(label,old_snr_db))
+            logging.info('{}:old_snr:{}'.format(label, old_snr_db))
             amplitude_factor = target_snr/old_snr
-
             wav *= amplitude_factor
             new_energy = compute_energy(wav)
             new_snr = new_energy/noise_energy
             new_snr_db = 20.*np.log10(new_snr)
-            logging.info('{}:new_snr:{}'.format(label,new_snr_db))
-
-
+            logging.info('{}:new_snr:{}'.format(label, new_snr_db))
+
         # Track array
         track_arr = np.zeros(int(scene_duration*SR))
 
         start_times = [0.0]
         end_times = [start_times[-1]+len(wav)/float(SR)]
-
         # Start with the first time in the list
         new_start_time = start_times[-1]
         new_end_time = end_times[-1]
 
+        if chosen_fname in timedict:
+            timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
+        else:
+            timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]
+
         while new_start_time < scene_duration:
             offset = duration
             new_start_time += offset
+
+            # If already exceeded scene, break
+            if new_start_time >= scene_duration:
+                break
+
             new_end_time += offset
             start_times.append(new_start_time)
             end_times.append(new_end_time)
 
+            # Update timesdict noting where each filename starts and stops
+            if chosen_fname in timedict:
+                timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
+            else:
+                timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]
+
+            # Also update the times from the patterns
+            for pt in pattern_timedict:
+                pattern_timedict[pt] = [(s[0] + new_start_time, s[1] + new_start_time) for s in
+                                        pattern_timedict[pt]]
+
+                if pt in timedict:
+                    timedict[pt] += pattern_timedict[pt]
+                else:
+                    timedict[pt] = pattern_timedict[pt]
+
+        # And add those to the timedict dictionary
+
         for n,t in enumerate(start_times):
             # We need to be careful with the limits here
             # since numpy will just ignore indexing that
@@ -394,8 +464,6 @@
             # Part of the wav to store
             # part = fade(wav[:end-begin],fade_in_time,fade_out_time)
             part = wav[:end-begin]
-
-
             track_arr[begin:end] += part
 
         track_list.append(track_arr)
@@ -412,7 +480,7 @@
             plt.figure()
             plt.subplot(3, 1, 1)
             plt.title('`{}\' background waveform and spectrogram'.format(label))
-            librosa.display.waveplot(track_arr,sr=SR)
+            librosa.display.waveplot(track_arr, sr=SR)
 
             # Plot spectrogram
             Fdb = librosa.amplitude_to_db(F)
@@ -425,8 +493,7 @@
             plt.semilogy(time, energy_prof.T)
             plt.xlim([0, len(track_arr)/SR])
             plt.ylabel('energy (rms)')
-
-
+
             # Tidy up and save to file
             plt.tight_layout()
             if append_to_filename:
@@ -441,7 +508,9 @@
         background_energy = librosa.feature.rmse(S=B).flatten()
     else:
         background_energy = 0.0
-
+
+
     for n in range(len(events_df)):
         # Get label of track
         label = str(events_df['label'].loc[n])
@@ -449,6 +518,10 @@
         # First check if there are any pattern candidates. Give priorities
         # To pattern files.
         candidates = []
+
+        # List of pattern start and end times
+        pattern_timedict = []
+
         for pattern_format in ['xls', 'json', 'txt', 'csv']:
             candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path,
                 events_df['sampleid'].loc[n], pattern_format))
@@ -459,8 +532,9 @@
             wav, sr = librosa.load(chosen_fname, sr=SR)
         else:
             chosen_fname = random.sample(candidates, 1)[0]
-            wav, sr = render_pattern(chosen_fname, input_path)
-
+            logging.info('rendering pattern')
+            wav, sr, pattern_timedict = render_pattern(chosen_fname, input_path)
+
         # Apply a fader envelope
         fade_in_time = float(events_df['fade_in_time'].loc[n])
@@ -482,6 +556,19 @@
             track_arr[_N(events_df['start_time'].loc[n]):_N(events_df['start_time'].loc[n])+len(wav)] += wav
             start_times = [float(events_df['start_time'].loc[n])]
             end_times = [float(events_df['end_time'].loc[n])]
+
+            new_start_time = start_times[-1]
+            new_end_time = end_times[-1]
+
+            for pt in pattern_timedict:
+                pattern_timedict[pt] = [(s[0] + new_start_time, s[1] + new_start_time) for s in
+                                        pattern_timedict[pt]]
+
+                if pt in timedict:
+                    timedict[pt] += pattern_timedict[pt]
+                else:
+                    timedict[pt] = pattern_timedict[pt]
+
         else:
             # If 0, then start next sample after this one (set it to the duration of the sample)
             if mean_time_between_instances == 0:
@@ -496,11 +583,21 @@
             new_start_time = start_times[-1]
             new_end_time = end_times[-1]
 
+            if chosen_fname in timedict:
+                timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
+            else:
+                timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]
+
             # Until the scene is full
             while new_start_time < track_end_time:
                 offset = float(mean_time_between_instances) +\
                          float(events_df['time_between_instances_stddev'].loc[n]*np.random.randn())
                 new_start_time += offset
+
+                # If already exceeded scene, break
+                if new_start_time >= scene_duration:
+                    break
+
                 new_end_time += offset
 
                 # Only exception is if we have set the 'end_cut' flag
@@ -512,6 +609,21 @@
                 start_times.append(new_start_time)
                 end_times.append(new_end_time)
 
+                if chosen_fname in timedict:
+                    timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
+                else:
+                    timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]
+
+                # Also update the times from the patterns
+                for pt in pattern_timedict:
+                    pattern_timedict[pt] = [(s[0] + new_start_time, s[1] + new_start_time) for s in
+                                            pattern_timedict[pt]]
+
+                    if pt in timedict:
+                        timedict[pt] += pattern_timedict[pt]
+                    else:
+                        timedict[pt] = pattern_timedict[pt]
+
             for t in start_times:
                 # We need to be careful with the limits here
                 # since numpy will just ignore indexing that
@@ -552,15 +664,9 @@
             new_ebr_prof = new_energy_prof/background_energy[:len(energy_prof)].flatten()
             new_ebr = np.max(new_ebr_prof)
             logging.debug('{}:New track ebr: {}db'.format(label,20*np.log10(new_ebr)))
-
-
         if channel_mode == 'separate':
             librosa.output.write_wav('{}/{}_event_track.wav'.format(output_path, label),
                                      track_arr/np.max(track_arr), SR)
-
-
-
-
         if figure_verbosity > 0:
             plt.figure()
 
@@ -579,7 +685,6 @@
             plt.semilogy(time, energy_prof.T)
             plt.xlim([0, len(track_arr)/SR])
             plt.ylabel('energy (rms)')
-
             plt.tight_layout()
 
             if append_to_filename:
@@ -587,9 +692,6 @@
             else:
                 plt.savefig('{}/event_{}.{}'.format(output_path, label, image_format), dpi=300)
-
-
-
         scene_starting_times.append((label, start_times))
         scene_ending_times.append((label, end_times))
 
@@ -608,9 +710,6 @@
     # Get labels
     labels = [s[0] for s in scene_starting_times]
-
-
-
     # If background is active
     if len(backgrounds_df) > 0:
         labels.append('background')
@@ -646,11 +745,21 @@
 
     plt.tight_layout()
+
+
     if append_to_filename:
         plt.savefig('{}/scene_{}.{}'.format(output_path, append_to_filename, image_format), dpi=300)
     else:
         plt.savefig('{}/scene.{}'.format(output_path, image_format), dpi=300)
-
+
+    timedict_df = timedict_to_dataframe(timedict)
+    logging.debug(timedict_df)
+
+    if append_to_filename:
+        timedict_df.to_csv('{}/scene_{}_offsets.csv'.format(output_path, append_to_filename))
+    else:
+        timedict_df.to_csv('{}/scene_offsets.csv'.format(output_path))
+
     if figure_verbosity > 1:
         plt.show()
@@ -662,16 +771,17 @@
         librosa.output.write_wav('{}/scene_{}.wav'.format(output_path, append_to_filename), scene_arr, SR)
     else:
         librosa.output.write_wav('{}/scene.wav'.format(output_path), scene_arr, SR)
-
+
+    # Print timesdict
     return scene_arr
-
-
-
+
+
 def not_implemented():
-    print("TODO: not implemented")
-
-if __name__=="__main__":
+    logging.info("TODO: not implemented")
+
+
+if __name__ == "__main__":
     """
     Main function, parses options and calls the simscene generation function
    or a demo. The options given are almost identical to Lagrange et al's
@@ -790,7 +900,7 @@
                          )
     end_cut = None
 
-    logging.basicConfig(level=logging.DEBUG)
+    logging.basicConfig(level=logging.INFO)
 
     args = argparser.parse_args()
     if args.input_path:
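
For reference, the CSV mentioned in the commit message is produced by the new timedict_to_dataframe() helper: simscene() accumulates timedict, a mapping from each source wav file to the list of (start_time, end_time) pairs at which it was placed, flattens it to one row per occurrence, and writes it out with to_csv(). A minimal standalone sketch of that path follows; the file names and times are hypothetical, made up here purely for illustration:

import pandas as pd

# Same helper as added in the changeset above: flattens
# {'filename': [(start, end), ...]} into one row per occurrence.
def timedict_to_dataframe(timedict):
    return pd.DataFrame([(key, val[0], val[1]) for key in timedict for val in timedict[key]],
                        columns=('filename', 'start_time', 'end_time'))

# Hypothetical contents, as simscene() would accumulate them while placing sounds
timedict = {
    'event/dog_bark.wav': [(0.0, 1.2), (5.5, 6.7)],
    'background/rain.wav': [(0.0, 30.0)],
}

timedict_df = timedict_to_dataframe(timedict)
timedict_df.to_csv('scene_offsets.csv')  # same call the changeset adds at the end of simscene()

Each row of scene_offsets.csv then names one wav occurrence with its start and end time in seconds, which is the per-file timing listing the commit message describes.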