#!/bin/python
# -*- coding: utf-8 -*-
# For licensing please see: LICENSE
# Copyright (c) Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>

# Argparse
import argparse

# Logging
import logging

# Pandas
import pandas as pd

# Numpy
import numpy as np
import sys

# Glob
import glob
import random

# Librosa
import librosa
import librosa.display
import librosa.output

# Matplotlib
from matplotlib import rc
# rc('text', usetex=True)
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from cycler import cycler

# Tabulate
from tabulate import tabulate

def _N(t, sr=44100):
    """
    Helper function: converts a time in seconds to a number of samples.
    """
    return int(t*sr)

def compute_energy(x):
    """Root-mean-square energy of the signal x."""
    return np.sqrt(np.mean(x**2))

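# Illustrative values (sketch only, not executed): _N(0.5) == 22050 samples at the
# default 44.1 kHz rate, and the RMS energy of a unit-amplitude sine is ~1/sqrt(2):
#
#   t = np.arange(_N(1.0)) / 44100.0
#   compute_energy(np.sin(2 * np.pi * 440 * t))   # ~= 0.707
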
# def compute_energy_profile(x, w=1000):
#     # Resize/Window signal
#     #x = np.resize(x, (w,int(np.ceil(float(len(x)/w)))))
#     x = np.array([[ii+jj for jj in range(w)] for ii in range(len(x)-w)])
#     return np.sqrt(np.mean(x**2, 1))

def render_pattern(fname, input_path, sr=44100):
    pattern = read_pattern_file(fname)

    start_times_samples = []
    end_times_samples = []
    durations_samples = []
    wav_files = []

    for n in range(len(pattern)):
        # Try loading the file.
        sampleid = pattern['sampleid'].loc[n]
        candidates = glob.glob('{}/event/{}*wav'.format(input_path, sampleid))
        chosen_fname = random.sample(candidates, 1)[0]

        logging.debug('Loading {}'.format(chosen_fname))

        # For each sound in the pattern file, place it starting from start_time plus an
        # offset with a mean value of 0 and a standard deviation of offset_stddev. The
        # first event cannot start earlier than time 0. If end_time is defined (not nan),
        # cut the event at end_time.
        wav, SR = librosa.load(chosen_fname, sr=sr)

        # Read and assign an amplitude
        amplitude_mean = float(pattern['amplitude'].loc[n])
        amplitude_stddev = float(pattern['amplitude_stdev'].loc[n])
        amplitude = amplitude_mean + np.random.randn()*amplitude_stddev
        wav *= amplitude

        start_time = max(float(pattern['start_time'].loc[n]), 0)
        start_time_samples = int(start_time*SR)

        fade_in_time = float(pattern['fade_in_time'].loc[n])
        fade_out_time = float(pattern['fade_out_time'].loc[n])
        end_time = float(pattern['end_time'].loc[n])

        # If end_time is not defined (-1 or just empty)
        # then just derive it from the length of the sample
        if np.isnan(end_time) or float(end_time) == -1:
            duration_samples = len(wav)
            end_time_samples = start_time_samples + duration_samples
        elif end_time - start_time > len(wav)/float(SR):
            # If the given end_time is later than start_time + duration of the sample,
            # pad the file with zeros to reach the desired end time.
            duration = end_time - start_time
            duration_samples = int(duration*SR)
            end_time_samples = start_time_samples + duration_samples
            wav_arr = np.zeros(duration_samples)
            wav_arr[:len(wav)] = wav
            wav = wav_arr
        else:
            duration = end_time - start_time
            duration_samples = int(duration*SR)
            end_time_samples = start_time_samples + duration_samples

        event_render = fade(wav[:duration_samples], fade_in_time, fade_out_time)

        start_times_samples.append(start_time_samples)
        end_times_samples.append(end_time_samples)
        durations_samples.append(duration_samples)
        wav_files.append(event_render)

    pattern_duration = end_time_samples
    pattern_arr = np.zeros(pattern_duration)

    for n, s in enumerate(start_times_samples):
        wav = wav_files[n]
        pattern_arr[s:s+len(wav)] = wav

    return pattern_arr, 44100

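# A hypothetical pattern score row (values invented for illustration), using the
# columns that read_pattern_file() below assigns to raw score files: sampleid,
# start_time, end_time, time_offset_stdev, fade_in_time, fade_out_time, amplitude,
# amplitude_stdev.
#
#   door_knock,0.0,-1,0.0,0.01,0.01,1.0,0.0
#
# This places one 'door_knock' sample at t=0 s, kept at its natural length
# (end_time == -1) and at unit amplitude.
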
def read_events_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt']:
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)
            df.columns = ['label', 'sampleid', 'ebr', 'ebr_stddev',
                          'mean_time_between_instances', 'time_between_instances_stddev',
                          'start_time', 'end_time', 'fade_in_time', 'fade_out_time']
    elif fname[-3:].lower() in ['csv']:
        df = pd.read_csv(fname)

    logging.info('Using input:\n' + tabulate(df, headers='keys', tablefmt='psql'))
    return df

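# A hypothetical raw event score row (values invented for illustration) for the
# columns assigned above: label, sampleid, ebr (dB), ebr_stddev,
# mean_time_between_instances, time_between_instances_stddev, start_time, end_time,
# fade_in_time, fade_out_time.
#
#   dog,dog_bark,0,0,2.0,0.5,0.0,30.0,0.01,0.01
#
# This repeats a 'dog_bark' sample roughly every 2 s (s.d. 0.5 s) from t=0 s to
# t=30 s at a 0 dB event-to-background ratio.
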
def read_pattern_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt']:
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)
            # Note: render_pattern() looks samples up via the 'sampleid' column.
            df.columns = ['sampleid', 'start_time', 'end_time', 'time_offset_stdev',
                          'fade_in_time', 'fade_out_time', 'amplitude', 'amplitude_stdev']
    elif fname[-3:].lower() in ['csv']:
        df = pd.read_csv(fname)

    logging.info('Using input:\n' + tabulate(df, headers='keys', tablefmt='psql'))
    return df

def read_backgrounds_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt']:
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)
            df.columns = ['label', 'sampleid', 'snr']
    elif fname[-3:].lower() in ['csv']:
        df = pd.read_csv(fname)

    logging.info('Using input:\n' + tabulate(df, headers='keys', tablefmt='psql'))
    return df

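# A hypothetical background score row (the sample id is invented for illustration),
# using the columns assigned above ('label', 'sampleid', 'snr'):
#
#   park,city_park_ambience,0
#
# 'snr' is in dB: simscene() below scales the first background by 10**(snr/20) and
# scales every further background relative to the energy of the tracks before it.
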
def read_annotations_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt', 'csv']:
        with open(fname) as f:
            header = f.readline()

            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            if sep in header:
                logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
                df = pd.read_csv(f, header=None, sep=sep)
                df.columns = ['start', 'stop', 'class']
            else:
                df = pd.read_csv(f, sep=sep)
                df.columns = ['start', 'stop', 'class']

    logging.info('Using input:\n' + tabulate(df, headers='keys', tablefmt='psql'))
    return df

def run_demo():
    print("TODO: Implement run_demo()")

def fade(x, fade_in, fade_out, sr=44100):
    """
    Applies a fade-in/fade-out envelope to the audio array x
    and returns the enveloped signal.
    """

    if len(x) == 0:
        return x

    fade_in_samples = int(fade_in*sr)
    fade_out_samples = int(fade_out*sr)

    outp = np.ones_like(x)
    for n in range(fade_in_samples):
        outp[n] = n*1./fade_in_samples

    for n in range(fade_out_samples):
        outp[len(outp)-fade_out_samples+n] = 1 - 1./fade_out_samples*n
    return outp*x
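
# Illustrative sketch (not executed): a 10 ms fade-in and fade-out on one second of
# ones at 44.1 kHz ramps the first/last 441 samples and leaves the middle untouched:
#
#   y = fade(np.ones(44100), 0.01, 0.01)
#   y[0], y[22050], y[-1]   # -> 0.0, 1.0, ~0.002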

def simscene(input_path,
             output_path,
             scene_duration,
             score_events,
             score_backgrounds,
             **kwargs):
    logging.info('simscene() is not yet implemented fully')
    SR = 44100  # Samplerate. Should probably not be hardcoded

    events_df = score_events
    backgrounds_df = score_backgrounds

    # Create empty numpy array
    scene_arr = np.zeros(int(scene_duration*SR))

    if 'append_to_filename' in kwargs:
        append_to_filename = kwargs['append_to_filename']
    else:
        append_to_filename = None

    if 'end_cut' in kwargs:
        end_cut = kwargs['end_cut']
    else:
        end_cut = False

    if 'figure_verbosity' in kwargs:
        figure_verbosity = kwargs['figure_verbosity']
    else:
        figure_verbosity = 0

    if 'image_format' in kwargs:
        image_format = kwargs['image_format']
    else:
        image_format = 'png'

    # Output channel mode: 'mono' writes a single scene file, 'separate' writes
    # each track to its own file.
    if 'channel_mode' in kwargs:
        channel_mode = kwargs['channel_mode']
    else:
        channel_mode = 'mono'

    # Stores the starting and ending times of every track for visualization
    # purposes
    scene_starting_times = []
    scene_ending_times = []

    # List of tracks
    track_list = []
    background_energies = []

    for n in range(len(backgrounds_df)):
        # Get label of background
        label = str(backgrounds_df['label'].loc[n])

        # First check if there are any pattern candidates. Give priority
        # to pattern files.
        candidates = []
        for pattern_format in ['xls', 'json', 'txt', 'csv']:
            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, backgrounds_df['sampleid'].loc[n], pattern_format))

        if len(candidates) == 0:
            # If no patterns are found, search for normal audio files
            candidates = glob.glob('{}/background/{}*.wav'.format(input_path, backgrounds_df['sampleid'].loc[n]))
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = librosa.load(chosen_fname, sr=SR)
        else:
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = render_pattern(chosen_fname, input_path)

        duration = len(wav)/float(SR)
        target_snr_db = float(backgrounds_df['snr'].loc[n])
        target_snr = 10**(target_snr_db/20.0)

        energy = compute_energy(wav)

        logging.debug('{}:energy:{}'.format(label, energy))

        if n == 0:
            # For the first background track, snr gives the amount by which it is
            # scaled (i.e. how much quieter it becomes).
            amplitude_factor = target_snr
            wav *= amplitude_factor

        if n > 0:
            noise_energy = compute_energy(np.sum(np.array(track_list), axis=0))
            logging.info('{}:noise_energy:{}'.format(label, noise_energy))

            old_snr = energy/noise_energy
            old_snr_db = 20*np.log10(old_snr)
            logging.info('{}:old_snr:{}'.format(label, old_snr_db))

            amplitude_factor = target_snr/old_snr

            wav *= amplitude_factor
            new_energy = compute_energy(wav)
            new_snr = new_energy/noise_energy
            new_snr_db = 20. * np.log10(new_snr)
            logging.info('{}:new_snr:{}'.format(label, new_snr_db))

        # Track array
        track_arr = np.zeros(int(scene_duration*SR))
        start_times = [0.0]
        end_times = [start_times[-1]+len(wav)/float(SR)]

        # Start with the first time in the list
        new_start_time = start_times[-1]
        new_end_time = end_times[-1]

        while new_start_time < scene_duration:
            offset = duration
            new_start_time += offset
            new_end_time += offset

            start_times.append(new_start_time)
            end_times.append(new_end_time)

        for n, t in enumerate(start_times):
            # We need to be careful with the limits here since numpy
            # will just ignore indexing that exceeds the array size.

            # Fading times in case we need to join many
            # consecutive samples together.
            # if n == 0:
            #     # Little fade-out, fade-in to smoothly repeat the
            #     # background.
            #     fade_in_time = 0.0
            #     fade_out_time = 0.01
            # elif n > 0 and n < len(start_times) - 1:
            #     fade_in_time = 0.01
            #     fade_out_time = 0.01
            # else:
            #     fade_in_time = 0.01
            #     fade_out_time = 0.0
            begin = min(_N(t), len(track_arr))
            end = min(len(track_arr), _N(t)+len(wav))

            # Part of the wav to store
            # part = fade(wav[:end-begin],fade_in_time,fade_out_time)
            part = wav[:end-begin]

            track_arr[begin:end] += part

        track_list.append(track_arr)
        scene_arr[:len(track_arr)] += track_arr

        if channel_mode == 'separate':
            librosa.output.write_wav('{}/{}_background_track.wav'.format(output_path, label), track_arr, SR)

        F = librosa.stft(track_arr, 1024)
        energy_prof = librosa.feature.rmse(S=F)
        background_energies.append(energy_prof)

        if figure_verbosity > 0:
            plt.figure()
            plt.subplot(3, 1, 1)
            plt.title('`{}\' background waveform and spectrogram'.format(label))
            librosa.display.waveplot(track_arr, sr=SR)

            # Plot spectrogram
            Fdb = librosa.amplitude_to_db(F)
            plt.subplot(3, 1, 2)
            librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')

            # Plot energy profile
            plt.subplot(3, 1, 3)
            time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T))
            plt.semilogy(time, energy_prof.T)
            plt.xlim([0, len(track_arr)/SR])
            plt.ylabel('energy (rms)')

            # Tidy up and save to file
            plt.tight_layout()
            if append_to_filename:
                plt.savefig('{}/background_{}_{}.{}'.format(output_path, label, append_to_filename, image_format), dpi=300)
            else:
                plt.savefig('{}/background_{}.{}'.format(output_path, label, image_format), dpi=300)

    # Compute total energy of background
    if len(backgrounds_df) > 0:
        background_arr = np.sum(track_list, 0)
        B = librosa.stft(background_arr, 1024)
        background_energy = librosa.feature.rmse(S=B).flatten()
    else:
        background_energy = 0.0

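    # Note on the dB conversions used above and below: the 'snr' and 'ebr' values from
    # the score files are given in dB and turned into linear amplitude factors as
    # 10**(dB/20); e.g. +6 dB is roughly a factor of 2 and -20 dB a factor of 0.1.
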
    for n in range(len(events_df)):
        # Get label of track
        label = str(events_df['label'].loc[n])

        # First check if there are any pattern candidates. Give priority
        # to pattern files.
        candidates = []
        for pattern_format in ['xls', 'json', 'txt', 'csv']:
            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, events_df['sampleid'].loc[n], pattern_format))

        if len(candidates) == 0:
            # If no patterns are found, search for normal audio files
            candidates = glob.glob('{}/event/{}*.wav'.format(input_path, events_df['sampleid'].loc[n]))
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = librosa.load(chosen_fname, sr=SR)
        else:
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = render_pattern(chosen_fname, input_path)

        # Apply a fader envelope
        fade_in_time = float(events_df['fade_in_time'].loc[n])
        fade_out_time = float(events_df['fade_out_time'].loc[n])
        wav = fade(wav, fade_in_time, fade_out_time)

        # Set target EBR (event-to-background ratio)
        target_ebr = 10**(float(events_df['ebr'].loc[n])/20.0 + np.random.randn()*float(events_df['ebr_stddev'].loc[n])/20.0)

        # Mean time between instances \mu.
        mean_time_between_instances = events_df['mean_time_between_instances'].loc[n]
        track_end_time = events_df['end_time'].loc[n]

        # Track array
        track_arr = np.zeros(int(scene_duration*SR))

        # If \mu is -1, then play the event only once.
        if mean_time_between_instances == -1:
            track_arr[_N(events_df['start_time'].loc[n]):_N(events_df['start_time'].loc[n])+len(wav)] += wav
            start_times = [float(events_df['start_time'].loc[n])]
            end_times = [float(events_df['end_time'].loc[n])]
        else:
            # If 0, then start the next instance right after this one
            # (set it to the duration of the sample).
            if mean_time_between_instances == 0:
                mean_time_between_instances = len(wav)/float(SR)

            # Store the successive starting and ending times of the events (given e.g. the model)
            # in the following lists.
            start_times = [events_df['start_time'].loc[n]]
            end_times = [start_times[-1]+len(wav)/float(SR)]

            # Start with the first time in the list
            new_start_time = start_times[-1]
            new_end_time = end_times[-1]

            # Until the scene is full
            while new_start_time < track_end_time:
                offset = float(mean_time_between_instances) +\
                    float(events_df['time_between_instances_stddev'].loc[n]*np.random.randn())
                new_start_time += offset
                new_end_time += offset

                # Only exception is if we have set the 'end_cut' flag
                # and the end time of the event surpasses the end time
                # of the track
                if end_cut and new_end_time > track_end_time:
                    break
                else:
                    start_times.append(new_start_time)
                    end_times.append(new_end_time)

            for t in start_times:
                # We need to be careful with the limits here
                # since numpy will just ignore indexing that
                # exceeds the size of the array
                begin = min(_N(t), len(track_arr))
                end = min(len(track_arr), _N(t)+len(wav))

                # Part of the wav to store
                part = wav[:end-begin]

                # If the wav file was cut short, fade out
                # quickly to avoid clicks
                if len(part) < len(wav) and len(part) > fade_out_time*SR:
                    part = fade(part, 0, fade_out_time)

                track_arr[begin:end] += part

        track_list.append(track_arr)
        scene_arr[:len(track_arr)] += track_arr

        # Compute energies
        F = librosa.stft(track_arr, 1024)
        energy_prof = librosa.feature.rmse(S=F).flatten()

        # Compute current ebr
        if len(backgrounds_df) > 0:
            ebr_prof = energy_prof/background_energy[:len(energy_prof)].flatten()
            curr_ebr = np.max(ebr_prof)
            logging.debug('{}:Target ebr: {}db'.format(label, 20*np.log10(target_ebr)))
            logging.debug('{}:Current track ebr: {}db'.format(label, 20*np.log10(curr_ebr)))

            # Set correct ebr
            track_arr = track_arr/curr_ebr*target_ebr

            Fnew = librosa.stft(track_arr, 1024)
            new_energy_prof = librosa.feature.rmse(S=Fnew).flatten()
            new_ebr_prof = new_energy_prof/background_energy[:len(energy_prof)].flatten()
            new_ebr = np.max(new_ebr_prof)
            logging.debug('{}:New track ebr: {}db'.format(label, 20*np.log10(new_ebr)))

        if channel_mode == 'separate':
            librosa.output.write_wav('{}/{}_event_track.wav'.format(output_path, label), track_arr/np.max(track_arr), SR)

        if figure_verbosity > 0:
            plt.figure()

            plt.subplot(3, 1, 1)
            plt.title('`{}\' event waveform and spectrogram'.format(label))

            librosa.display.waveplot(track_arr, sr=SR)
            Fdb = librosa.amplitude_to_db(F)
            plt.subplot(3, 1, 2)
            librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')

            # Plot energy profile
            plt.subplot(3, 1, 3)
            time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T))
            plt.semilogy(time, energy_prof.T)
            plt.xlim([0, len(track_arr)/SR])
            plt.ylabel('energy (rms)')

            plt.tight_layout()
            if append_to_filename:
                plt.savefig('{}/event_{}_{}.{}'.format(output_path, label, append_to_filename, image_format), dpi=300)
            else:
                plt.savefig('{}/event_{}.{}'.format(output_path, label, image_format), dpi=300)

        scene_starting_times.append((label, start_times))
        scene_ending_times.append((label, end_times))

    if figure_verbosity > 0:
        plt.figure()
        ax0 = plt.subplot(3, 1, 1)
        plt.title('Synthesized Scene')
        librosa.display.waveplot(scene_arr, sr=SR)
        F = librosa.stft(scene_arr)
        Fdb = librosa.amplitude_to_db(F)
        ax1 = plt.subplot(3, 1, 2)
        librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
        ax2 = plt.subplot(3, 1, 3)
        ax2.set_xlim([0, scene_duration])

        # Get labels
        labels = [s[0] for s in scene_starting_times]

        # If background is active
        if len(backgrounds_df) > 0:
            labels.append('background')

        # Set y axis limit, with a padding of 0.5.
        ax2.set_ylim([-0.5, len(labels)-0.5])

        plt.yticks(range(len(labels)), labels)

        for n in range(len(scene_starting_times)):
            label = scene_starting_times[n][0]
            start_times = scene_starting_times[n][1]
            end_times = scene_ending_times[n][1]
            color = ['r', 'g', 'y'][n % 3]

            for m in range(len(start_times)):
                plt.hlines(y=float(n), xmin=start_times[m], xmax=end_times[m], alpha=0.5, color=color, linewidth=4)
                if figure_verbosity > 2:
                    ax0.axvline(start_times[m], color=color, alpha=0.1)
                    ax0.axvline(end_times[m], color=color, alpha=0.1)
                    ax0.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
                    ax1.axvline(start_times[m], color=color, alpha=0.1)
                    ax1.axvline(end_times[m], color=color, alpha=0.1)
                    ax1.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
                    ax2.axvline(start_times[m], color=color, alpha=0.1)
                    ax2.axvline(end_times[m], color=color, alpha=0.1)
                    ax2.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)

        if len(backgrounds_df) > 0:
            plt.axhline(y=len(scene_starting_times), alpha=0.5, color='k', linewidth=4)

        plt.tight_layout()

        if append_to_filename:
            plt.savefig('{}/scene_{}.{}'.format(output_path, append_to_filename, image_format), dpi=300)
        else:
            plt.savefig('{}/scene.{}'.format(output_path, image_format), dpi=300)

    if figure_verbosity > 1:
        plt.show()

    # Replace nans (i.e. because of division-by-zero) of the scene with zeros.
    scene_arr = np.nan_to_num(scene_arr)

    if channel_mode == 'mono':
        if append_to_filename:
            librosa.output.write_wav('{}/scene_{}.wav'.format(output_path, append_to_filename), scene_arr, SR)
        else:
            librosa.output.write_wav('{}/scene.wav'.format(output_path), scene_arr, SR)

    return scene_arr

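# Illustrative sketch (not executed) of driving simscene() from Python rather than
# the command line below; the paths and score files are hypothetical:
#
#   events = read_events_file('scores/events.xls')
#   backgrounds = read_backgrounds_file('scores/backgrounds.xls')
#   scene = simscene('input', 'output', 30.0, events, backgrounds,
#                    channel_mode='mono', figure_verbosity=0,
#                    end_cut=True, image_format='png')
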
def not_implemented():
    print("TODO: not implemented")

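# Example command line (file names are hypothetical; replace simscene.py with the
# name of this script):
#
#   python simscene.py input/ output/ 30 -e scores/events.xls -b scores/backgrounds.xls -v -x png
#
# This renders a 30-second scene, saves the figures as .png, and writes the scene
# audio into output/.
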
if __name__ == "__main__":
    """
    Main entry point: parses options and calls the simscene generation function
    or a demo. The options given are almost identical to those of Lagrange et al.'s
    SimScene.
    """
    argparser = argparse.ArgumentParser(
        description="SimScene.py acoustic scene generator",
    )
    argparser.add_argument(
        'input_path',
        type=str,
        help="Path of a directory containing wave files for sound backgrounds (in the `background' sub-directory) or events (in `event')"
    )
    argparser.add_argument(
        'output_path',
        type=str,
        help="The directory in which the generated scenes and annotations will reside."
    )
    argparser.add_argument(
        'scene_duration',
        type=float,
        help="Duration of scene in seconds",
    )
    scene_duration = None

    argparser.add_argument(
        '-e', '--score-events',
        type=str,
        help="Score events file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
    )
    score_events = None

    argparser.add_argument(
        '-b', '--score-backgrounds',
        type=str,
        help="Score backgrounds file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
    )
    score_backgrounds = None

    argparser.add_argument(
        '--tag',
        type=str,
        help="Append _TAG_XXX to filenames, where XXX is an increment."
    )
    tag = None

    argparser.add_argument(
        '-N',
        type=int,
        help="Generate N instances of the scene. If not specified, only generate a single instance. Note that if N > 1, then the verbosity must be less than or equal to 1."
    )
    generate_n = 1

    argparser.add_argument(
        '-t', '--time-mode',
        type=str,
        help="Mode of spacing between events. `generate': values must be set for each track in the score files. `abstract': values are computed from an abstract representation of an existing acoustic scene. `replicate': values are replicated from an existing acoustic scene. (NOT IMPLEMENTED)",
        choices=['generate', 'abstract', 'replicate']
    )
    time_mode = 'generate'

    argparser.add_argument(
        '-R', '--ebr-mode',
        type=str,
        help="Mode for Event to Background power level ratio. `generate': values must be set for each track in the score files. `abstract': values are computed from an abstract representation of an existing acoustic scene. `replicate': values are replicated from an existing acoustic scene. (NOT IMPLEMENTED)",
        choices=['generate', 'abstract', 'replicate']
    )
    ebr_mode = 'generate'

    argparser.add_argument(
        '-A', '--annotation-file',
        type=str,
        help="If -R or -m are selected, this provides the source of the times or EBRs. ANNOTATION_FILE must be a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file. (NOT IMPLEMENTED)"
    )
    annotation_file = None

    argparser.add_argument(
        '-a', '--audio-file',
        type=str,
        help="If -R or -m are selected, this provides the source of the times or EBRs. AUDIO_FILE must be a 44100Hz .wav file. (NOT IMPLEMENTED)"
    )
    audio_file = None

    argparser.add_argument(
        '-v', '--figure-verbosity', action='count',
        help="Increase figure verbosity. (Default) 0 - Don't save or display figures, 1 - Save figures but do not display them, 2 - Save and display figures, 3 - Add shades over the events in the final plot"
    )
    figure_verbosity = 0

    argparser.add_argument(
        '-x', '--image-format',
        help="Image format for the figures",
        choices=['png', 'jpg', 'pdf']
    )
    image_format = 'png'

    argparser.add_argument(
        '-C', '--channel-mode',
        type=str,
        help="Output channel mode. (Default) 'mono' - write the whole scene as a single mono .wav file, 'separate' - write each track to its own .wav file.",
        choices=['mono', 'separate']
    )
    channel_mode = 'mono'

    # argparser.add_argument(
    #     '-m', '--min-space',
    #     type=float,
    #     help="Minimum space allowed between successive events (seconds). If -1, then allow overlapping between events."
    # )
    min_space = -1

    argparser.add_argument(
        '-c', '--end-cut',
        action='store_true',
        help="If the last sample ends after the scene ends then: if enabled, cut the sample to duration, else remove the sample."
    )
    end_cut = None

    logging.basicConfig(level=logging.DEBUG)

    args = argparser.parse_args()
    if args.input_path:
        input_path = args.input_path
        logging.debug("Using `{}' as input path".format(input_path))
    if args.output_path:
        output_path = args.output_path
        logging.debug("Saving to `{}'".format(output_path))
    if args.scene_duration:
        if not (args.score_backgrounds or args.score_events):
            print("You must provide one of -e or -b")
        else:
            if args.image_format:
                image_format = args.image_format
            if args.channel_mode:
                channel_mode = args.channel_mode
            if args.ebr_mode:
                ebr_mode = args.ebr_mode
                if ebr_mode not in ['generate']:
                    logging.warning("`{}' not yet implemented for EBR_MODE, using default.".format(ebr_mode))
                    ebr_mode = 'generate'
            if args.time_mode:
                time_mode = args.time_mode
                if time_mode not in ['generate']:
                    logging.warning("`{}' not yet implemented for TIME_MODE, using default.".format(time_mode))
                    time_mode = 'generate'
            if args.annotation_file:
                annotations = read_annotations_file(args.annotation_file)

            scene_duration = float(args.scene_duration)

            if args.score_backgrounds:
                score_backgrounds = read_backgrounds_file(args.score_backgrounds)
            else:
                score_backgrounds = []

            if args.score_events:
                score_events = read_events_file(args.score_events)
            else:
                score_events = []

            if args.figure_verbosity:
                figure_verbosity = args.figure_verbosity

            if args.N:
                generate_n = args.N

            if args.tag:
                tag = args.tag

            if generate_n == 1:
                append_to_filename = None
                simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
                         time_mode=time_mode,
                         ebr_mode=ebr_mode,
                         channel_mode=channel_mode,
                         annotation_file=annotation_file,
                         audio_file=audio_file,
                         figure_verbosity=figure_verbosity,
                         min_space=min_space,
                         end_cut=end_cut,
                         image_format=image_format,
                         append_to_filename=append_to_filename)
            else:
                for n in range(generate_n):
                    if tag:
                        append_to_filename = '{}_{}'.format(tag, n)
                    else:
                        append_to_filename = '{}'.format(n)

                    logging.info("Generating scene {}".format(n))

                    simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
                             time_mode=time_mode,
                             ebr_mode=ebr_mode,
                             channel_mode=channel_mode,
                             annotation_file=annotation_file,
                             audio_file=audio_file,
                             figure_verbosity=min(figure_verbosity, 1),
                             min_space=min_space,
                             end_cut=end_cut,
                             image_format=image_format,
                             append_to_filename=append_to_filename)