Mercurial > hg > simscene-py
changeset 10:8637c974b4bc
fixed several bugs, added energy plots; set up background SNRs and EBRs (hopefully) correctly
author | Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk> |
---|---|
date | Tue, 03 Oct 2017 19:08:28 +0100 |
parents | 53ee437b5ba3 |
children | cdf2eb89843a |
files | simscene.py |
diffstat | 1 files changed, 207 insertions(+), 21 deletions(-) [+] |
line wrap: on
line diff
--- a/simscene.py Tue Oct 03 15:18:08 2017 +0100 +++ b/simscene.py Tue Oct 03 19:08:28 2017 +0100 @@ -41,6 +41,16 @@ """ return int(t*sr) +def compute_energy(x): + return np.sqrt(np.mean(x**2)) + +# def compute_energy_profile(x, w=1000): +# # Resize/Window signal +# #x = np.resize(x, (w,int(np.ceil(float(len(x)/w))))) +# x = np.array([[ii+jj for jj in range(w)] for ii in range(len(x)-w)]) +# return np.sqrt(np.mean(x**2, 1)) + + def read_events_file(fname): if fname[-3:].lower() == 'xls': df = pd.read_excel(fname) @@ -153,10 +163,11 @@ score_events, score_backgrounds, **kwargs): - logging.info('simscene() is not yet implemented') + logging.info('simscene() is not yet implemented fully') SR = 44100 # Samplerate. Should probably not be hardcoded events_df = score_events + backgrounds_df = score_backgrounds # Create empty numpy array scene_arr = np.zeros(int(scene_duration*SR)) @@ -180,33 +191,145 @@ # purposes scene_starting_times = [] scene_ending_times = [] + + # List of tracks + track_list = [] + background_energies = [] + + for n in range(len(backgrounds_df)): + # Get label of background + label = str(backgrounds_df['label'].loc[n]) + + candidates = glob.glob('{}/background/{}*.wav'.format(input_path, backgrounds_df['sampleid'].loc[n])) + chosen_fname = random.sample(candidates, 1)[0] + wav, sr = librosa.load(chosen_fname, sr=SR) + duration = len(wav)/float(SR) + target_snr = float(backgrounds_df['snr'].loc[n]) + energy = compute_energy(wav) + + logging.debug('{}:energy:{}'.format(label,energy)) + + + if n == 0: + # For the first background track, snr + # gives an amount by which it's going to be scaled (i.e. 
make it more silent) + amplitude_factor = target_snr + wav *= amplitude_factor + + if n > 0: + noise_energy = compute_energy(np.sum(np.array(track_list), axis=0)) + logging.info('{}:noise_energy:{}'.format(label,noise_energy)) + + old_snr = energy/noise_energy + logging.info('{}:old_snr:{}'.format(label,old_snr)) + + amplitude_factor = target_snr/old_snr + + + wav *= amplitude_factor + new_energy = compute_energy(wav) + new_snr = new_energy/noise_energy + logging.info('{}:new_snr:{}'.format(label,new_snr)) + + + # Track array + track_arr = np.zeros(int(scene_duration*SR)) + start_times = [0.0] + end_times = [start_times[-1]+len(wav)/float(SR)] + + + # Start with the first time in the list + new_start_time = start_times[-1] + new_end_time = end_times[-1] + + while new_start_time < scene_duration: + offset = duration + new_start_time += offset + new_end_time += offset + + start_times.append(new_start_time) + end_times.append(new_end_time) + + for t in start_times: + # We need to be careful with the limits here + # since numpy will just ignore indexing that + # exceeds + + begin = min(_N(t), len(track_arr)) + end = min(len(track_arr), _N(t)+len(wav)) + + # Part of the wav to stire + part = wav[:end-begin] + + track_arr[begin:end] += part + + track_list.append(track_arr) + scene_arr[:len(track_arr)] += track_arr + + if channel_mode == 'separate': + librosa.output.write_wav('{}/{}_background_track.wav'.format(output_path, label), track_arr, SR) + + F = librosa.stft(track_arr, 1024) + energy_prof = librosa.feature.rmse(S=F) + background_energies.append(energy_prof) + + if figure_verbosity > 0: + plt.figure() + plt.subplot(3, 1, 1) + plt.title('`{}\' background waveform and spectrogram'.format(label)) + librosa.display.waveplot(track_arr,sr=SR) + + # Plot spectrogram + Fdb = librosa.amplitude_to_db(F) + plt.subplot(3, 1, 2) + librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz') + + # Plot energy profile + plt.subplot(3, 1, 3) + time = np.linspace(0, 
len(track_arr)/SR, len(energy_prof.T)) + plt.semilogy(time, energy_prof.T) + plt.xlim([0, len(track_arr)/SR]) + plt.ylabel('energy (rms)') + + + # Tidy up and save to file + plt.tight_layout() + plt.savefig('{}/background_{}.{}'.format(output_path, label, image_format), dpi=300) + + # Compute total energy of background + background_arr = np.sum(track_list, 0) + B = librosa.stft(background_arr, 1024) + background_energy = librosa.feature.rmse(S=B).flatten() for n in range(len(events_df)): # Get label of track label = str(events_df['label'].loc[n]) candidates = glob.glob('{}/event/{}*.wav'.format(input_path, events_df['sampleid'].loc[n])) - chosen_fname = random.sample(candidates,1)[0] + chosen_fname = random.sample(candidates, 1)[0] wav, sr = librosa.load(chosen_fname, sr=SR) assert sr == SR, "Sample rate of individual tracks must be 44100Hz (Failed: `{}' with sample rate: {} )".format(chosen_fname, sr) + # Apply a fader envelope fade_in_time = float(events_df['fade_in_time'].loc[n]) fade_out_time = float(events_df['fade_out_time'].loc[n]) wav = fade(wav, fade_in_time, fade_out_time) - + + # Set target EBR + target_ebr = 10**(float(events_df['ebr'].loc[n])/20.0 + np.random.randn()*float(events_df['ebr_stddev'].loc[n])/20.0) + # Mean time between instances \mu. mean_time_between_instances = events_df['mean_time_between_instances'].loc[n] track_end_time = events_df['end_time'].loc[n] # Track array - track_arr = np.zeros(int(scene_duration*SR)) + track_arr = np.zeros(int(scene_duration*SR)) #If \mu is -1, then play the event only once. 
if mean_time_between_instances == -1: track_arr[_N(events_df['start_time'].loc[n]):_N(events_df['start_time'].loc[n])+len(wav)] += wav else: - # If 0, then start next sample after this one (set it to the duration of the sample) if mean_time_between_instances == 0: mean_time_between_instances = len(wav)/float(SR) @@ -237,7 +360,6 @@ end_times.append(new_end_time) for t in start_times: - # We need to be careful with the limits here # since numpy will just ignore indexing that # exceeds the size of the array @@ -253,23 +375,65 @@ part = fade(part, 0, fade_out_time) track_arr[begin:end] += part - + + track_list.append(track_arr) scene_arr[:len(track_arr)] += track_arr + # Compute energies + F = librosa.stft(track_arr, 1024) + energy_prof = librosa.feature.rmse(S=F).flatten() + + + # Compute current ebr + +# logging.debug(energy_prof.shape) +# logging.debug(background_energy.shape) + ebr_prof = energy_prof/background_energy[:len(energy_prof)].flatten() + curr_ebr = np.max(ebr_prof) + logging.debug('{}:Target ebr: {}db'.format(label,20*np.log10(target_ebr))) + logging.debug('{}:Current track ebr: {}db'.format(label,20*np.log10(curr_ebr))) + + # Set correct ebr + track_arr = track_arr/curr_ebr*target_ebr + + Fnew = librosa.stft(track_arr, 1024) + new_energy_prof = librosa.feature.rmse(S=Fnew).flatten() + new_ebr_prof = new_energy_prof/background_energy[:len(energy_prof)].flatten() + new_ebr = np.max(new_ebr_prof) + logging.debug('{}:New track ebr: {}db'.format(label,20*np.log10(new_ebr))) + + + + if channel_mode == 'separate': + librosa.output.write_wav('{}/{}_event_track.wav'.format(output_path, label), track_arr, SR) + + + + + if figure_verbosity > 0: plt.figure() - plt.subplot(2,1,1) - plt.title('`{}\' waveform and spectrogram'.format(label)) + plt.subplot(3,1,1) + plt.title('`{}\' event waveform and spectrogram'.format(label)) - visible_track = track_arr[int(start_times[0]*SR):int(end_times[-1]*SR)] - librosa.display.waveplot(visible_track,sr=SR) - F = 
librosa.stft(visible_track) + librosa.display.waveplot(track_arr,sr=SR) Fdb = librosa.amplitude_to_db(F) - plt.subplot(2,1,2) + plt.subplot(3, 1, 2) librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz') + + # Plot energy profile + plt.subplot(3, 1, 3) + time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T)) + plt.semilogy(time, energy_prof.T) + plt.xlim([0, len(track_arr)/SR]) + plt.ylabel('energy (rms)') + + plt.tight_layout() - plt.savefig('{}/{}.{}'.format(output_path, label, image_format),dpi=300) + plt.savefig('{}/event_{}.{}'.format(output_path, label, image_format), dpi=300) + + scene_starting_times.append((label, start_times)) @@ -286,12 +450,21 @@ librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz') ax2 = plt.subplot(3,1,3) ax2.set_xlim([0,scene_duration]) - ax2.set_ylim([-0.5, len(scene_starting_times)-0.5]) # Get labels labels = [s[0] for s in scene_starting_times] - plt.yticks(range(len(scene_starting_times)), labels) + + + + # If background is active + if len(backgrounds_df) > 0: + labels.append('background') + + # Set y axis limit. With a padding of 0.5. 
+ ax2.set_ylim([-0.5, len(labels)-0.5]) + + plt.yticks(range(len(labels)), labels) for n in range(len(scene_starting_times)): label = scene_starting_times[n][0] @@ -312,7 +485,11 @@ ax2.axvline(start_times[m], color=color, alpha=0.1) ax2.axvline(end_times[m], color=color, alpha=0.1) ax2.axvline(end_times[m], color=color, alpha=0.1) - ax2.axvspan(start_times[m], end_times[m], color=color, alpha=0.1) + ax2.axvspan(start_times[m], end_times[m], color=color, alpha=0.1) + + if len(backgrounds_df) > 0: + plt.axhline(y=len(scene_starting_times), alpha=0.5, color='k', linewidth=4) + plt.tight_layout() plt.savefig('{}/full-scene.{}'.format(output_path, image_format),dpi=300) if figure_verbosity > 1: @@ -320,7 +497,10 @@ if channel_mode == 'mono': librosa.output.write_wav('{}/full-scene.wav'.format(output_path), scene_arr, SR) - + + if channel_mode == 'classes': + scene_wav = np.array(track_list) + librosa.output.write_wav('{}/classes-scene.wav'.format(output_path), scene_wav, SR) return scene_arr @@ -402,7 +582,7 @@ '-v', '--figure-verbosity', action='count', help="Increase figure verbosity. (Default) 0 - Don't save or display figures, 1 - Save pictures but do not display them, 2 - Save and display figures, 3 - Add shades over the events in the final plot" ) - figure_verbosity = None + figure_verbosity = 0 argparser.add_argument( '-x', '--image-format', @@ -414,8 +594,8 @@ argparser.add_argument( '-C', '--channel-mode', type=str, - help="number of audio channels contained in file. (Default) 'mono' - 1 channel (mono), 'classes' - As many channels as sound classes (events+textures), 'separate' - Same as 'classes', each channel is saved in a separate .wav file.", - choices=['mono', 'classes', 'separate'] + help="number of audio channels contained in file. 
(Default) 'mono' - 1 channel (mono), 'separate' - Same as 'classes', each channel is saved in a separate .wav file.", + choices=['mono', 'separate'] ) channel_mode = 'mono' @@ -467,8 +647,14 @@ if args.score_backgrounds: score_backgrounds = read_backgrounds_file(args.score_backgrounds) + else: + score_backgrounds = [] + if args.score_events: score_events = read_events_file(args.score_events) + else: + score_events = [] + if args.figure_verbosity: figure_verbosity = args.figure_verbosity