annotate python/simscene.py @ 51:ebf92ed7d680 tip master

Added -fd (--full-duration) argument.
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Sun, 30 Sep 2018 13:21:49 +0100
parents b11264117ddb
children
rev   line source
e@35 1 #!/usr/bin/env python
e@35 2 # -*- coding: utf-8 -*-
e@35 3 # For licensing please see: LICENSE
e@35 4 # Copyright (c) Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
e@35 5
e@35 6 # Argparse
e@35 7 import argparse
e@35 8
e@35 9 # Logging
e@35 10 import logging
e@35 11
e@35 12 # Pandas
e@35 13 import pandas as pd
e@35 14
e@35 15 # Numpy
e@35 16 import numpy as np
e@35 17
e@35 18 # Glob
e@35 19 import glob
e@35 20 import random
e@35 21
e@35 22 # Librosa
e@35 23 import librosa
e@35 24 import librosa.display
e@44 25
e@44 26 # PySoundfile
e@44 27 import soundfile as sf
e@35 28
e@35 29 # Matplotlib
e@35 30 import matplotlib.pyplot as plt
e@35 31
e@35 32 # Tabulate
e@35 33 from tabulate import tabulate
e@35 34
e@41 35
e@42 36 def _D(t, sr=44100):
e@35 37 """
e@35 38 Helper function: Converts time to samples
e@35 39 """
e@35 40 return int(t*sr)
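# For illustration: at the default rate, _D(1.5) == int(1.5*44100) == 66150 samples.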
e@35 41
e@42 42
e@35 43 def compute_energy(x):
e@35 44 return np.sqrt(np.mean(x**2))
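# (Root-mean-square energy: e.g. a unit-amplitude sine wave gives roughly 0.707, silence gives 0.)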
e@35 45
e@35 46
e@41 47 def timedict_to_dataframe(timedict):
e@47 48 logging.debug(timedict)
e@46 49 return pd.DataFrame([(key, val[0], val[1], val[2]) for key in timedict for val in timedict[key]],
e@46 50 columns=('label', 'filename', 'start_time', 'end_time'))
e@41 51
e@47 52 def timedict_to_txt(timedict):
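"""
Serialize timedict ({label: [(filename, start, end), ...]}) into tab-separated
text with one "<start>\t<end>\t<label>" line per event, the plain-text format
used for sed_eval-style evaluation.
"""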
e@47 53 str_ = ""
e@47 54 for key in timedict:
e@47 55 for val in timedict[key]:
e@47 56 str_ += "{}\t{}\t{}\n".format(float(val[1]), float(val[2]), key)
e@47 57 str_ += '\n'
e@47 58 return str_
e@42 59
e@35 60 def render_pattern(fname, input_path, sr=44100):
e@35 61 pattern = read_pattern_file(fname)
e@35 62
e@41 63 # Store starting and end times in the format
e@41 64 # {'label': [(filename, start_time, end_time), ...]}
e@41 65
e@41 66 timesdict = {}
e@41 67
e@35 68 start_times_samples = []
e@35 69 end_times_samples = []
e@35 70 durations_samples = []
e@35 71 wav_files = []
e@41 72
e@41 73 pattern_timedict = []
e@41 74
e@35 75 for n in range(len(pattern)):
e@35 76 # Try loading the file,
e@35 77 sampleid = pattern['sampleid'].loc[n]
e@46 78 label = sampleid
e@38 79 candidates = []
e@38 80 for pattern_format in ['xls', 'json', 'txt', 'csv']:
e@38 81 candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, sampleid, pattern_format))
e@35 82
e@38 83 if len(candidates) == 0:
e@42 84 candidates = glob.glob('{}/event/{}*.wav'.format(input_path, sampleid))
e@38 85 chosen_fname = random.sample(candidates, 1)[0]
e@38 86
e@44 87 wav, SR = sf.read(chosen_fname)
e@38 88 else:
e@38 89 chosen_fname = random.sample(candidates, 1)[0]
e@38 90
e@38 91 logging.debug('Loading {}'.format(chosen_fname))
e@41 92 wav, SR, pattern_timedict = render_pattern(chosen_fname, input_path)
e@42 93
e@35 94 # For each sound in the pattern file, place it starting at start_time plus an offset
e@35 95 # with a mean of 0 and a standard deviation of time_offset_stdev. The first event cannot
e@35 96 # start earlier than time 0. If end_time is defined (not nan), the event is cut at
e@35 97 # end_time.
e@35 98
e@35 99 # Read and assign an amplitude
e@35 100 amplitude_mean = float(pattern['amplitude'].loc[n])
e@35 101 amplitude_stddev = float(pattern['amplitude_stdev'].loc[n])
e@35 102 amplitude = amplitude_mean + np.random.randn()*amplitude_stddev
e@35 103 wav *= amplitude
e@35 104
e@42 105 start_time = max(float(pattern['start_time'].loc[n]), 0)
e@35 106 start_time_samples = int(start_time*SR)
e@35 107
e@35 108 fade_in_time = float(pattern['fade_in_time'].loc[n])
e@35 109 fade_out_time = float(pattern['fade_out_time'].loc[n])
e@35 110 end_time = float(pattern['end_time'].loc[n])
e@35 111
e@35 112 # If end_time is not defined (-1 or just empty)
e@35 113 # then just derive it from the length of the sample
e@35 114 if np.isnan(end_time) or float(end_time) == -1:
e@35 115 duration_samples = len(wav)
e@35 116 end_time_samples = start_time_samples + duration_samples
e@45 117 end_time = end_time_samples/float(SR)
e@45 118
e@35 119 elif end_time - start_time > len(wav)/float(SR):
e@35 120
e@35 121 # If given end_time is more than start_time + duration of sample
e@35 122 # then pad the file with zeros to reach the desired end time.
e@35 123 duration = end_time - start_time
e@35 124 duration_samples = int(duration*SR)
e@35 125 end_time_samples = start_time_samples + duration_samples
e@41 126
e@41 127 # Calculate end time in seconds
e@41 128 end_time = end_time_samples/float(SR)
e@41 129
e@35 130 wav_arr = np.zeros(duration_samples)
e@35 131 wav_arr[:len(wav)] = wav
e@35 132 wav = wav_arr
e@35 133 else:
e@35 134 duration = end_time - start_time
e@35 135 duration_samples = int(duration*SR)
e@35 136 end_time_samples = start_time_samples + duration_samples
e@35 137
e@35 138 event_render = fade(wav[:duration_samples], fade_in_time, fade_out_time)
e@35 139
e@35 140 start_times_samples.append(start_time_samples)
e@35 141 end_times_samples.append(end_time_samples)
e@35 142 durations_samples.append(duration_samples)
e@35 143 wav_files.append(event_render)
e@35 144
e@46 145 if label in timesdict:
e@46 146 timesdict[label].append((chosen_fname,start_time, end_time))
e@41 147 else:
e@46 148 timesdict[label] = [(chosen_fname,start_time, end_time)]
e@41 149
e@41 150 for pt in pattern_timedict:
e@41 151 if pt in timesdict:
e@41 152 timesdict[pt] += pattern_timedict[pt]
e@41 153 else:
e@41 154 timesdict[pt] = pattern_timedict[pt]
e@41 155
e@35 156 pattern_duration = max(end_times_samples)  # the pattern ends with the latest-ending event
e@35 157 pattern_arr = np.zeros(pattern_duration)
e@35 158
e@35 159 for n, s in enumerate(start_times_samples):
e@35 160 wav = wav_files[n]
e@35 161 pattern_arr[s:s+len(wav)] = wav
e@41 162
e@41 163 return pattern_arr, 44100, timesdict
e@41 164
e@35 165
e@35 166 def read_events_file(fname):
e@35 167 if fname[-3:].lower() == 'xls':
e@35 168 df = pd.read_excel(fname)
e@35 169 elif fname[-4:].lower() == 'json':
e@35 170 df = pd.read_json(fname)
e@35 171 elif fname[-3:].lower() in ['txt']:
e@35 172 with open(fname) as f:
e@35 173 s = f.readline()
e@42 174 f.seek(0, 0)
e@35 175 if ',' in s:
e@35 176 sep = ','
e@35 177 elif '\t' in s:
e@35 178 sep = '\t'
e@35 179 else:
e@35 180 sep = ' '
e@35 181 logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
e@35 182 df = pd.read_csv(f, header=None, sep=sep)
e@42 183 df.columns = ['label',
e@42 184 'sampleid',
e@42 185 'ebr',
e@42 186 'ebr_stddev',
e@42 187 'mean_time_between_instances',
e@42 188 'time_between_instances_stddev',
e@42 189 'start_time',
e@42 190 'end_time',
e@42 191 'fade_in_time',
e@42 192 'fade_out_time']
e@35 193 elif fname[-3:].lower() in ['csv']:
e@35 194 df = pd.read_csv(fname)
e@35 195
e@42 196 logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
e@35 197 return df
e@35 198
e@41 199
e@35 200 def read_pattern_file(fname):
e@35 201 if fname[-3:].lower() == 'xls':
e@35 202 df = pd.read_excel(fname)
e@35 203 elif fname[-4:].lower() == 'json':
e@35 204 df = pd.read_json(fname)
e@35 205 elif fname[-3:].lower() in ['txt']:
e@35 206 with open(fname) as f:
e@35 207 s = f.readline()
e@42 208 f.seek(0, 0)
e@35 209 if ',' in s:
e@35 210 sep = ','
e@35 211 elif '\t' in s:
e@35 212 sep = '\t'
e@35 213 else:
e@35 214 sep = ' '
e@35 215 logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
e@35 216 df = pd.read_csv(f, header=None, sep=sep)
e@42 217 df.columns = ['eventid',
e@42 218 'start_time',
e@42 219 'end_time',
e@42 220 'time_offset_stdev',
e@42 221 'fade_in_time',
e@42 222 'fade_out_time',
e@42 223 'amplitude',
e@42 224 'amplitude_stdev']
e@35 225 elif fname[-3:].lower() in ['csv']:
e@35 226 df = pd.read_csv(fname)
e@35 227
e@42 228 logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
e@35 229 return df
e@41 230
e@41 231
e@35 232 def read_backgrounds_file(fname):
e@35 233 if fname[-3:].lower() == 'xls':
e@35 234 df = pd.read_excel(fname)
e@35 235 elif fname[-4:].lower() == 'json':
e@35 236 df = pd.read_json(fname)
e@35 237 elif fname[-3:].lower() in ['txt']:
e@35 238 with open(fname) as f:
e@35 239 s = f.readline()
e@42 240 f.seek(0, 0)
e@35 241 if ',' in s:
e@35 242 sep = ','
e@35 243 elif '\t' in s:
e@35 244 sep = '\t'
e@35 245 else:
e@35 246 sep = ' '
e@35 247 logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
e@35 248 df = pd.read_csv(f, header=None, sep=sep)
e@42 249 df.columns = ['label', 'sampleid', 'snr']
e@35 250 elif fname[-3:].lower() in ['csv']:
e@35 251 df = pd.read_csv(fname)
e@35 252
e@42 253 logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
e@35 254 return df
e@35 255
e@41 256
e@35 257 def read_annotations_file(fname):
e@35 258 if fname[-3:].lower() == 'xls':
e@35 259 df = pd.read_excel(fname)
e@35 260 elif fname[-4:].lower() == 'json':
e@35 261 df = pd.read_json(fname)
e@35 262 elif fname[-3:].lower() in ['txt', 'csv']:
e@35 263
e@35 264 with open(fname) as f:
e@35 265 header = f.readline()
e@35 266
e@35 267 s = f.readline()
e@42 268 f.seek(0, 0)
e@35 269 if ',' in s:
e@35 270 sep = ','
e@35 271 elif '\t' in s:
e@35 272 sep = '\t'
e@35 273 else:
e@35 274 sep = ' '
e@35 275 if sep in header:
e@35 276 logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
e@35 277 df = pd.read_csv(f, header=None, sep=sep)
e@35 278 df.columns = ['start', 'stop', 'class']
e@35 279 else:
e@41 280 df = pd.read_csv(f, sep=sep)
e@35 281 df.columns = ['start', 'stop', 'class']
e@35 282 df = None
e@35 283
e@35 284 logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
e@35 285 return df
e@35 286
e@41 287
e@35 288 def run_demo():
e@35 289 print("TODO: Implement run_demo()")
e@35 290
e@41 291
e@35 292 def fade(x, fade_in, fade_out, sr=44100):
e@35 293 """
e@35 294 Creates a fade-in-fade-out envelope
e@35 295 for audio array x.
e@35 296 """
e@35 297
e@35 298 if len(x) == 0:
e@35 299 return x
e@35 300
e@35 301 fade_in_samples = int(fade_in*sr)
e@35 302 fade_out_samples = int(fade_out*sr)
e@35 303
e@35 304 outp = np.ones_like(x)
e@35 305 for n in range(fade_in_samples):
e@35 306 outp[n] = n*1./fade_in_samples
e@35 307
e@35 308 for n in range(fade_out_samples):
e@35 309 outp[len(outp)-fade_out_samples+n] = 1-1./fade_out_samples*n
e@35 310 return outp*x
e@35 311
e@41 312
e@35 313 def simscene(input_path,
e@35 314 output_path,
e@35 315 scene_duration,
e@35 316 score_events,
e@35 317 score_backgrounds,
e@35 318 **kwargs):
e@47 319 logging.warning('EBR ratios have not yet been verified')
e@42 320 SR = 44100 # Samplerate. Should probably not be hardcoded
e@35 321
e@35 322 events_df = score_events
e@35 323 backgrounds_df = score_backgrounds
e@41 324
e@41 325 # Store starting and ending times in the format
e@41 326 # {'label': [(filename, start_time, end_time), (filename, start_time, end_time), ...]}
e@41 327 timedict = {}
e@41 328
e@35 329 # Create empty numpy array
e@35 330 scene_arr = np.zeros(int(scene_duration*SR))
e@35 331
e@35 332 if 'append_to_filename' in kwargs:
e@35 333 append_to_filename = kwargs['append_to_filename']
e@35 334 else:
e@35 335 append_to_filename = None
e@35 336
e@35 337 if 'end_cut' in kwargs:
e@35 338 end_cut = kwargs['end_cut']
e@35 339 else:
e@35 340 end_cut = False
e@35 341
e@35 342 if 'figure_verbosity' in kwargs:
e@35 343 figure_verbosity = kwargs['figure_verbosity']
e@35 344 else:
e@35 345 figure_verbosity = 0
e@35 346
e@35 347 if 'image_format' in kwargs:
e@35 348 image_format = kwargs['image_format']
e@35 349 else:
e@35 350 image_format = 'png'
e@47 351
e@47 352 if 'annot_format' in kwargs:
e@47 353 annot_format = kwargs['annot_format']
e@47 354 else:
e@47 355 annot_format = 'sed_eval'
e@51 356
e@51 357 if 'full_duration' in kwargs:
e@51 358 full_duration = kwargs['full_duration']
e@51 359 else:
e@51 360 full_duration = False
e@35 361
e@35 362 # Stores the starting and ending times of every track for visualization
e@35 363 # purposes
e@35 364 scene_starting_times = []
e@35 365 scene_ending_times = []
e@35 366
e@35 367 # List of tracks
e@35 368 track_list = []
e@35 369 background_energies = []
e@41 370
e@35 371 for n in range(len(backgrounds_df)):
e@35 372 # Get label of background
e@35 373 label = str(backgrounds_df['label'].loc[n])
e@35 374
e@35 375 # First check if there are any pattern candidates. Give priority
e@35 376 # to pattern files.
e@35 377 candidates = []
e@41 378
e@41 379 # List of pattern start and end times
e@41 380 pattern_timedict = []
e@41 381
e@35 382 for pattern_format in ['xls', 'json', 'txt', 'csv']:
e@42 383 candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path,
e@42 384 backgrounds_df['sampleid'].loc[n],
e@42 385 pattern_format))
e@35 386
e@35 387 if len(candidates) == 0:
e@35 388 # If no patterns are found, search for normal audio files
e@35 389 candidates = glob.glob('{}/background/{}*.wav'.format(input_path, backgrounds_df['sampleid'].loc[n]))
e@35 390 chosen_fname = random.sample(candidates, 1)[0]
e@44 391 wav, sr = sf.read(chosen_fname)
e@35 392 else:
e@35 393 chosen_fname = random.sample(candidates, 1)[0]
e@41 394 wav, sr, pattern_timedict = render_pattern(chosen_fname, input_path)
e@41 395
e@35 396 duration = len(wav)/float(SR)
e@35 397 target_snr_db = float(backgrounds_df['snr'].loc[n])
e@35 398 target_snr = 10**(target_snr_db/20.0)
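# dB to linear amplitude: e.g. an 'snr' of -6 dB gives a factor of about 0.5 (10**(-6/20.0)),
# while 0 dB leaves the level unchanged.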
e@35 399
e@35 400 energy = compute_energy(wav)
e@35 401
e@41 402 logging.debug('{}:energy:{}'.format(label, energy))
e@41 403
e@35 404 if n == 0:
e@35 405 # For the first background track, snr
e@35 406 # gives the factor by which it is scaled (i.e. attenuated)
e@35 407 amplitude_factor = target_snr
e@35 408 wav *= amplitude_factor
e@35 409
e@35 410 if n > 0:
e@35 411 noise_energy = compute_energy(np.sum(np.array(track_list), axis=0))
e@41 412 logging.info('{}:noise_energy:{}'.format(label, noise_energy))
e@35 413
e@35 414 old_snr = energy/noise_energy
e@35 415 old_snr_db = 20*np.log10(old_snr)
e@41 416 logging.info('{}:old_snr:{}'.format(label, old_snr_db))
e@35 417
e@35 418 amplitude_factor = target_snr/old_snr
e@35 419
e@35 420 wav *= amplitude_factor
e@35 421 new_energy = compute_energy(wav)
e@35 422 new_snr = new_energy/noise_energy
e@35 423 new_snr_db = 20. * np.log10(new_snr)
e@41 424 logging.info('{}:new_snr:{}'.format(label, new_snr_db))
e@41 425
e@35 426 # Track array
e@35 427 track_arr = np.zeros(int(scene_duration*SR))
e@35 428 start_times = [0.0]
e@35 429 end_times = [start_times[-1]+len(wav)/float(SR)]
e@35 430
e@35 431 # Start with the first time in the list
e@35 432 new_start_time = start_times[-1]
e@35 433 new_end_time = end_times[-1]
e@35 434
e@46 435 if label in timedict:
e@46 436 timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time)))
e@41 437 else:
e@46 438 timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))]
e@41 439
e@35 440 while new_start_time < scene_duration:
e@35 441 offset = duration
e@35 442 new_start_time += offset
e@41 443
e@41 444 # If already exceeded scene, break
e@41 445 if new_start_time >= scene_duration:
e@41 446 break
e@41 447
e@35 448 new_end_time += offset
e@35 449
e@35 450 start_times.append(new_start_time)
e@35 451 end_times.append(new_end_time)
e@35 452
e@41 453 # Update timedict, noting the filename, start and stop time of each occurrence
e@46 454 if label in timedict:
e@46 455 timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time)))
e@41 456 else:
e@46 457 timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))]
e@41 458
e@41 459 # Also update the times from the patterns
e@41 460 for pt in pattern_timedict:
e@46 461 pattern_timedict[pt] = [(s[0], s[1] + new_start_time, s[2] + new_start_time) for s in
e@41 462 pattern_timedict[pt]]
e@41 463
e@41 464 if pt in timedict:
e@41 465 timedict[pt] += pattern_timedict[pt]
e@41 466 else:
e@41 467 timedict[pt] = pattern_timedict[pt]
e@41 468
e@41 469 # And add those to the timedict dictionary
e@41 470
e@42 471 for t in start_times:
e@35 472 # We need to be careful with the limits here
e@35 473 # since numpy will just ignore indexing that
e@35 474 # exceeds the size of the array
e@35 475
e@35 476 # Fading times in case we need to join many
e@35 477 # consecutive samples together.
e@35 478 # if n == 0:
e@35 479 # # Little fade-out, fade-in to smoothly repeat the
e@35 480 # # background.
e@35 481 # fade_in_time = 0.0
e@35 482 # fade_out_time = 0.01
e@35 483 # elif n > 0 and n < len(start_times) - 1:
e@35 484 # fade_in_time = 0.01
e@35 485 # fade_out_time = 0.01
e@35 486 # else:
e@35 487 # fade_in_time = 0.01
e@35 488 # fade_out_time = 0.0
e@42 489 begin = min(_D(t), len(track_arr))
e@42 490 end = min(len(track_arr), _D(t) + len(wav))
e@35 491
e@35 492 # Part of the wav to store
e@35 493 # part = fade(wav[:end-begin],fade_in_time,fade_out_time)
e@35 494 part = wav[:end-begin]
e@35 495 track_arr[begin:end] += part
e@35 496
e@35 497 track_list.append(track_arr)
e@35 498 scene_arr[:len(track_arr)] += track_arr
e@35 499
e@35 500 if channel_mode == 'separate':
e@35 501 sf.write('{}/{}_background_track.wav'.format(output_path, label), track_arr, SR)
e@35 502
e@35 503 F = librosa.stft(track_arr, 1024)
e@35 504 energy_prof = librosa.feature.rmse(S=F)
e@35 505 background_energies.append(energy_prof)
e@35 506
e@35 507 if figure_verbosity > 0:
e@35 508 plt.figure()
e@35 509 plt.subplot(3, 1, 1)
e@35 510 plt.title('`{}\' background waveform and spectrogram'.format(label))
e@41 511 librosa.display.waveplot(track_arr, sr=SR)
e@35 512
e@35 513 # Plot spectrogram
e@35 514 Fdb = librosa.amplitude_to_db(F)
e@35 515 plt.subplot(3, 1, 2)
e@35 516 librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
e@35 517
e@35 518 # Plot energy profile
e@35 519 plt.subplot(3, 1, 3)
e@35 520 time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T))
e@35 521 plt.semilogy(time, energy_prof.T)
e@35 522 plt.xlim([0, len(track_arr)/SR])
e@35 523 plt.ylabel('energy (rms)')
e@41 524
e@35 525 # Tidy up and save to file
e@35 526 plt.tight_layout()
e@35 527 if append_to_filename:
e@42 528 plt.savefig('{}/background_{}_{}.{}'.format(output_path,
e@42 529 label,
e@42 530 append_to_filename,
e@42 531 image_format),
e@42 532 dpi=300)
e@35 533 else:
e@42 534 plt.savefig('{}/background_{}.{}'.format(output_path,
e@42 535 label,
e@42 536 image_format),
e@42 537 dpi=300)
e@35 538
e@35 539 # Compute total energy of background
e@35 540 if len(backgrounds_df) > 0:
e@35 541 background_arr = np.sum(track_list, 0)
e@35 542 B = librosa.stft(background_arr, 1024)
e@35 543 background_energy = librosa.feature.rmse(S=B).flatten()
e@35 544 else:
e@35 545 background_energy = 0.0
e@41 546
e@35 547 for n in range(len(events_df)):
e@35 548 # Get label of track
e@35 549 label = str(events_df['label'].loc[n])
e@35 550
e@35 551 # First check if there are any pattern candidates. Give priority
e@35 552 # to pattern files.
e@35 553 candidates = []
e@41 554
e@41 555 # List of pattern start and end times
e@41 556 pattern_timedict = []
e@41 557
e@35 558 for pattern_format in ['xls', 'json', 'txt', 'csv']:
e@42 559 candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path,
e@42 560 events_df['sampleid'].loc[n],
e@42 561 pattern_format))
e@35 562
e@35 563 if len(candidates) == 0:
e@35 564 # If no patterns are found, search for normal audio files
e@35 565 candidates = glob.glob('{}/event/{}*.wav'.format(input_path, events_df['sampleid'].loc[n]))
e@35 566 chosen_fname = random.sample(candidates, 1)[0]
e@44 567 wav, sr = sf.read(chosen_fname)
e@35 568 else:
e@35 569 chosen_fname = random.sample(candidates, 1)[0]
e@41 570 wav, sr, pattern_timedict = render_pattern(chosen_fname, input_path)
e@41 571
e@42 572 logging.debug(chosen_fname)
e@35 573 # Apply a fader envelope
e@35 574 fade_in_time = float(events_df['fade_in_time'].loc[n])
e@35 575 fade_out_time = float(events_df['fade_out_time'].loc[n])
e@35 576 wav = fade(wav, fade_in_time, fade_out_time)
e@35 577
e@35 578 # Set target EBR
e@42 579 target_ebr = 10**(float(events_df['ebr'].loc[n])/20.0 +
e@42 580 np.random.randn()*float(events_df['ebr_stddev'].loc[n])/20.0)
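# 'ebr' and 'ebr_stddev' are given in dB; the random jitter is applied in the dB domain
# before converting to a linear amplitude factor (e.g. ebr=0, ebr_stddev=0 gives target_ebr=1).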
e@35 581
e@35 582 # Mean time between instances \mu.
e@35 583 mean_time_between_instances = events_df['mean_time_between_instances'].loc[n]
e@35 584 track_end_time = events_df['end_time'].loc[n]
e@35 585
e@35 586 # Track array
e@35 587 track_arr = np.zeros(int(scene_duration*SR))
e@35 588
e@42 589 # If \mu is -1, then play the event only once.
e@35 590 if mean_time_between_instances == -1:
e@42 591 track_arr[_D(events_df['start_time'].loc[n]):_D(events_df['start_time'].loc[n]) + len(wav)] += wav
e@35 592 start_times = [float(events_df['start_time'].loc[n])]
e@35 593 end_times = [float(events_df['end_time'].loc[n])]
e@41 594
e@41 595 new_start_time = start_times[-1]
e@41 596 new_end_time = end_times[-1]
e@41 597
e@46 598 if label in timedict:
e@46 599 timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time)))
e@42 600 else:
e@46 601 timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))]
e@42 602
e@41 603 for pt in pattern_timedict:
e@46 604 pattern_timedict[pt] = [(s[0], s[1] + new_start_time, s[2] + new_start_time) for s in
e@41 605 pattern_timedict[pt]]
e@41 606
e@41 607 if pt in timedict:
e@41 608 timedict[pt] += pattern_timedict[pt]
e@41 609 else:
e@41 610 timedict[pt] = pattern_timedict[pt]
e@41 611
e@35 612 else:
e@35 613 # If 0, then start next sample after this one (set it to the duration of the sample)
e@35 614 if mean_time_between_instances == 0:
e@35 615 mean_time_between_instances = len(wav)/float(SR)
e@51 616
e@51 617 # If -fd (full_duration) is used, then for each event mean_time_between_instances denotes the time AFTER
e@51 618 # the end of the previous event.
e@51 619 if full_duration and mean_time_between_instances > 0:
e@51 620 mean_time_between_instances += len(wav)/float(SR)
e@35 621
e@35 622 # Store the successive starting and ending times of the events (given e.g. the model)
e@35 623 # in the following lists.
e@35 624 start_times = [events_df['start_time'].loc[n]]
e@35 625 end_times = [start_times[-1]+len(wav)/float(SR)]
e@35 626
e@35 627 # Start with the first time in the list
e@35 628 new_start_time = start_times[-1]
e@35 629 new_end_time = end_times[-1]
e@35 630
e@41 631 if label in timedict:
e@46 632 timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time)))
e@41 633 else:
e@46 634 timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))]
e@41 635
e@35 636 # Until the scene is full
e@35 637 while new_start_time < track_end_time:
e@35 638 offset = float(mean_time_between_instances) +\
e@35 639 float(events_df['time_between_instances_stddev'].loc[n]*np.random.randn())
e@35 640 new_start_time += offset
e@41 641
e@41 642 # If already exceeded scene, break
e@41 643 if new_start_time >= scene_duration:
e@41 644 break
e@41 645
e@35 646 new_end_time += offset
e@35 647
e@35 648 # Only exception is if we have set the 'end_cut' flag
e@35 649 # and the end time of the event surpasses the end time
e@35 650 # of the track
e@35 651 if end_cut and new_end_time > track_end_time:
e@35 652 break
e@35 653 else:
e@35 654 start_times.append(new_start_time)
e@35 655 end_times.append(new_end_time)
e@35 656
e@46 657 if label in timedict:
e@46 658 timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time)))
e@41 659 else:
e@46 660 timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))]
e@41 661
e@41 662 # Also update the times from the patterns
e@41 663 for pt in pattern_timedict:
e@48 664 pattern_timedict[pt] = [(s[0], s[1] + new_start_time, s[2] + new_start_time) for s in
e@41 665 pattern_timedict[pt]]
e@41 666
e@41 667 if pt in timedict:
e@41 668 timedict[pt] += pattern_timedict[pt]
e@41 669 else:
e@41 670 timedict[pt] = pattern_timedict[pt]
e@41 671
e@35 672 for t in start_times:
e@35 673 # We need to be careful with the limits here
e@35 674 # since numpy will just ignore indexing that
e@35 675 # exceeds the size of the array
e@42 676 begin = min(_D(t), len(track_arr))
e@42 677 end = min(len(track_arr), _D(t) + len(wav))
e@35 678
e@35 679 # Part of the wav to store
e@35 680 part = wav[:end-begin]
e@35 681
e@35 682 # If wav file was concatenated, fade out
e@35 683 # quickly to avoid clicks
e@42 684 if len(wav) > len(part) > fade_out_time*SR:
e@35 685 part = fade(part, 0, fade_out_time)
e@35 686
e@35 687 track_arr[begin:end] += part
e@35 688
e@35 689 track_list.append(track_arr)
e@35 690 scene_arr[:len(track_arr)] += track_arr
e@35 691
e@35 692 # Compute energies
e@35 693 F = librosa.stft(track_arr, 1024)
e@35 694 energy_prof = librosa.feature.rmse(S=F).flatten()
e@35 695
e@35 696 # Compute current ebr
e@35 697
e@35 698 if len(backgrounds_df) > 0:
e@35 699 ebr_prof = energy_prof/background_energy[:len(energy_prof)].flatten()
e@35 700 curr_ebr = np.max(ebr_prof)
e@42 701 logging.debug('{}:Target ebr: {}db'.format(label,
e@42 702 20*np.log10(target_ebr)))
e@42 703 logging.debug('{}:Current track ebr: {}db'.format(label,
e@42 704 20*np.log10(curr_ebr)))
e@35 705
e@35 706 # Set correct ebr
e@35 707 track_arr = track_arr/curr_ebr*target_ebr
e@35 708
e@35 709 Fnew = librosa.stft(track_arr, 1024)
e@35 710 new_energy_prof = librosa.feature.rmse(S=Fnew).flatten()
e@35 711 new_ebr_prof = new_energy_prof/background_energy[:len(energy_prof)].flatten()
e@35 712 new_ebr = np.max(new_ebr_prof)
e@42 713 logging.debug('{}:New track ebr: {}db'.format(label, 20*np.log10(new_ebr)))
e@35 714
e@35 715 if channel_mode == 'separate':
e@44 716 sf.write('{}/{}_event_track.wav'.format(output_path, label),
e@42 717 track_arr/np.max(track_arr),
e@42 718 SR)
e@35 719
e@35 720 if figure_verbosity > 0:
e@35 721 plt.figure()
e@35 722
e@42 723 plt.subplot(3, 1, 1)
e@35 724 plt.title('`{}\' event waveform and spectrogram'.format(label))
e@35 725
e@42 726 librosa.display.waveplot(track_arr, sr=SR)
e@35 727 Fdb = librosa.amplitude_to_db(F)
e@35 728 plt.subplot(3, 1, 2)
e@35 729 librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
e@35 730
e@35 731 # Plot energy profile
e@35 732 plt.subplot(3, 1, 3)
e@35 733 time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T))
e@35 734 plt.semilogy(time, energy_prof.T)
e@35 735 plt.xlim([0, len(track_arr)/SR])
e@35 736 plt.ylabel('energy (rms)')
e@35 737
e@35 738 plt.tight_layout()
e@35 739 if append_to_filename:
e@35 740 plt.savefig('{}/event_{}_{}.{}'.format(output_path, label, append_to_filename, image_format), dpi=300)
e@35 741 else:
e@35 742 plt.savefig('{}/event_{}.{}'.format(output_path, label, image_format), dpi=300)
e@35 743
e@35 744 scene_starting_times.append((label, start_times))
e@35 745 scene_ending_times.append((label, end_times))
e@35 746
e@35 747 if figure_verbosity > 0:
e@35 748 plt.figure()
e@42 749 ax0 = plt.subplot(3, 1, 1)
e@35 750 plt.title('Synthesized Scene')
e@35 751 librosa.display.waveplot(scene_arr, sr=SR)
e@35 752 F = librosa.stft(scene_arr)
e@35 753 Fdb = librosa.amplitude_to_db(F)
e@42 754 ax1 = plt.subplot(3, 1, 2)
e@35 755 librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
e@42 756 ax2 = plt.subplot(3, 1, 3)
e@42 757 ax2.set_xlim([0, scene_duration])
e@35 758
e@35 759 # Get labels
e@35 760 labels = [s[0] for s in scene_starting_times]
e@35 761
e@35 762 # If background is active
e@35 763 if len(backgrounds_df) > 0:
e@35 764 labels.append('background')
e@35 765
e@35 766 # Set y axis limit. With a padding of 0.5.
e@35 767 ax2.set_ylim([-0.5, len(labels)-0.5])
e@35 768
e@35 769 plt.yticks(range(len(labels)), labels)
e@35 770
e@35 771 for n in range(len(scene_starting_times)):
e@35 772 start_times = scene_starting_times[n][1]
e@35 773 end_times = scene_ending_times[n][1]
e@35 774 color = ['r', 'g', 'y'][n % 3]
e@35 775
e@35 776 for m in range(len(start_times)):
e@35 777 plt.hlines(y=float(n), xmin=start_times[m], xmax=end_times[m], alpha=0.5, color=color, linewidth=4)
e@35 778 if figure_verbosity > 2:
e@35 779 ax0.axvline(start_times[m], color=color, alpha=0.1)
e@35 780 ax0.axvline(end_times[m], color=color, alpha=0.1)
e@35 781 ax0.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
e@35 782 ax1.axvline(start_times[m], color=color, alpha=0.1)
e@35 783 ax1.axvline(end_times[m], color=color, alpha=0.1)
e@35 785 ax1.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
e@35 786 ax2.axvline(start_times[m], color=color, alpha=0.1)
e@35 787 ax2.axvline(end_times[m], color=color, alpha=0.1)
e@35 789 ax2.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
e@35 790
e@35 791 if len(backgrounds_df) > 0:
e@35 792 plt.axhline(y=len(scene_starting_times), alpha=0.5, color='k', linewidth=4)
e@35 793
e@35 794 plt.tight_layout()
e@35 795
e@35 796 if append_to_filename:
e@35 797 plt.savefig('{}/scene_{}.{}'.format(output_path, append_to_filename, image_format), dpi=300)
e@35 798 else:
e@35 799 plt.savefig('{}/scene.{}'.format(output_path, image_format), dpi=300)
e@41 800
e@47 801 if annot_format == 'sed_eval':
e@47 802 timedict_txt = timedict_to_txt(timedict)
e@47 803 logging.debug(timedict_txt)
e@41 804
e@47 805 if append_to_filename:
e@47 806 with open('{}/scene_{}_offsets.csv'.format(output_path, append_to_filename), 'w') as f:
e@47 807 f.write(timedict_txt)
e@47 808 else:
e@47 809 with open('{}/scene_offsets.csv'.format(output_path), 'w') as f:
e@47 810 f.write(timedict_txt)
e@47 811
e@47 812 elif annot_format == 'pandas':
e@47 813 timedict_df = timedict_to_dataframe(timedict)
e@47 814 logging.debug(timedict_df)
e@47 815
e@47 816 if append_to_filename:
e@47 817 timedict_df.to_csv('{}/scene_{}_offsets.csv'.format(output_path, append_to_filename))
e@47 818 else:
e@47 819 timedict_df.to_csv('{}/scene_offsets.csv'.format(output_path))
e@41 820
e@35 821 if figure_verbosity > 1:
e@35 822 plt.show()
e@35 823
e@35 824 # Replace nans (i.e. because of division-by-zero) of the scene with zeros.
e@35 825 scene_arr = np.nan_to_num(scene_arr)
e@35 826
e@35 827 if channel_mode == 'mono':
e@35 828 if append_to_filename:
e@44 829 sf.write('{}/scene_{}.wav'.format(output_path, append_to_filename), scene_arr, SR)
e@35 830 else:
e@44 831 sf.write('{}/scene.wav'.format(output_path), scene_arr, SR)
e@41 832
e@41 833 # Print timesdict
e@35 834
e@35 835 return scene_arr
e@41 836
e@41 837
e@35 838 def not_implemented():
e@41 839 logging.info("TODO: not implemented")
e@41 840
e@41 841
e@41 842 if __name__ == "__main__":
e@35 843 """
e@35 844 Main function, parses options and calls the simscene generation function
e@35 845 or a demo. The options given are almost identical to Lagrange et al.'s
e@35 846 simscene.
e@35 847 """
e@35 848 argparser = argparse.ArgumentParser(
e@35 849 description="SimScene.py acoustic scene generator",
e@35 850 )
e@35 851 argparser.add_argument(
e@35 852 'input_path',
e@35 853 type=str,
e@42 854 help="Path of a directory containing wave files for sound backgrounds"
e@42 855 "(in the `background' sub-directory) or events (in `event')"
e@35 856 )
e@42 857
e@42 858 input_path = '.'
e@42 859
e@35 860 argparser.add_argument(
e@35 861 'output_path',
e@35 862 type=str,
e@35 863 help="The directory the generated scenes and annotations will reside."
e@42 864 )
e@42 865
e@42 866 output_path = '.'
e@42 867
e@35 868 argparser.add_argument(
e@35 869 'scene_duration',
e@35 870 type=float,
e@35 871 help="Duration of scene in seconds",
e@35 872 )
e@35 873 scene_duration = None
e@35 874
e@35 875 argparser.add_argument(
e@35 876 '-e', '--score-events',
e@35 877 type=str,
e@35 878 help="Score events file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
e@35 879 )
e@35 880 score_events = None
e@35 881
e@35 882 argparser.add_argument(
e@35 883 '-b', '--score-backgrounds',
e@35 884 type=str,
e@35 885 help="Score backgrounds file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
e@35 886 )
e@35 887 score_backgrounds = None
e@35 888
e@35 889 argparser.add_argument(
e@35 890 '--tag',
e@35 891 type=str,
e@35 892 help="Append _TAG_XXX to filenames, where XXX is an increment."
e@35 893 )
e@35 894 tag = None
e@35 895
e@35 896 argparser.add_argument(
e@35 897 '-N',
e@35 898 type=int,
e@42 899 help="Generate N instances of the scene. If not specified only generate a single instance. Note that if N > 1, "
e@42 900 "then the verbosity must be less or equal to 1"
e@35 901 )
e@35 902 generate_n = 1
e@35 903
e@35 904 argparser.add_argument(
e@35 905 '-t', '--time-mode',
e@35 906 type=str,
e@42 907 help="Mode of spacing between events. `generate': values must be set for each track in the score files. "
e@42 908 "`abstract': values are computed from an abstract representation of an existing acoustic scene. "
e@42 909 "`replicate': values are replicated from an existing acousting scene. (NOT IMPLEMENTED)",
e@35 910 choices=['generate', 'abstract', 'replicate']
e@35 911 )
e@35 912 time_mode = 'generate'
e@51 913
e@51 914 argparser.add_argument(
e@51 915 '-fd', '--full-duration',
e@51 916 action='store_true',
e@51 917 help="If enabled, times specified in the recipe refer to after the previous file finishes."
e@51 918 )
e@51 919 full_duration = False
e@51 920
e@35 921 argparser.add_argument(
e@35 922 '-R', '--ebr-mode',
e@35 923 type=str,
e@42 924 help="Mode for Event to Background power level ratio. `generate': values must be set for each track in the "
e@42 925 "score files. `abstract': values are computed from an abstract representation of an existing acoustic "
e@42 926 "scene. `replicate': values are replicated from an existing acousting scene. (NOT IMPLEMENTED)",
e@35 927 choices=['generate', 'abstract', 'replicate']
e@35 928 )
e@35 929 ebr_mode = 'generate'
e@35 930
e@35 931 argparser.add_argument(
e@35 932 '-A', '--annotation-file',
e@35 933 type=str,
e@42 934 help="If -R or -t are used, this provides ANNOTATION_FILE as the source of the times or EBRs. "
e@42 935 "ANNOTATION_FILE must be a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file. "
e@42 936 "(NOT IMPLEMENTED)"
e@35 937 )
e@35 938 annotation_file = None
e@35 939
e@35 940 argparser.add_argument(
e@35 941 '-a', '--audio-file',
e@35 942 type=str,
e@42 943 help="If -R or -t are used, this provides AUDIO_FILE as the source of the times or EBRs. "
e@42 944 "AUDIO_FILE must be a 44100 Hz .wav file. (NOT IMPLEMENTED)"
e@35 945 )
e@35 946 audio_file = None
e@35 947
e@35 948 argparser.add_argument(
e@35 949 '-v', '--figure-verbosity', action='count',
e@42 950 help="Increase figure verbosity. (Default) 0 - Don't save or display figures, 1 - Save pictures but do not "
e@42 951 "display them, 2 - Save and display figures, 3 - Add shades over the events in the final plot"
e@35 952 )
e@35 953 figure_verbosity = 0
e@35 954
e@35 955 argparser.add_argument(
e@35 956 '-x', '--image-format',
e@35 957 help="Image format for the figures",
e@35 958 choices=['png', 'jpg', 'pdf']
e@35 959 )
e@35 960 image_format = 'png'
e@35 961
e@35 962 argparser.add_argument(
e@35 963 '-C', '--channel-mode',
e@35 964 type=str,
e@42 965 help="number of audio channels contained in file. (Default) 'mono' - 1 channel (mono), 'separate' - Same as "
e@42 966 "'classes', each channel is saved in a separate .wav file.",
e@35 967 choices=['mono', 'separate']
e@35 968 )
e@35 969 channel_mode = 'mono'
e@35 970
e@35 971 argparser.add_argument(
e@35 972 '-c', '--end-cut',
e@35 973 action='store_true',
e@42 974 help="If the last sample ends after the scene ends then: if enabled, cut the sample to duration, "
e@42 975 "else remove the sample."
e@35 976 )
e@35 977 end_cut = None
e@42 978
e@42 979 argparser.add_argument(
e@42 980 '-L', '--logging-level',
e@42 981 type=str,
e@42 982 help="Set lowest logging level",
e@42 983 choices=['debug', 'warning', 'info']
e@42 984 )
e@42 985
e@47 986 argparser.add_argument(
e@47 987 '--annot-format',
e@47 988 type=str,
e@47 989 help="Annotation format for generated scenes. Choices are: 'sed_eval' (default) - Format appropriate for "
e@47 990 "DCASE 2017 challenge evaluator, 'pandas' - A more detailed format for the form <label, orig_filename, "
e@47 991 "start, stop>",
e@47 992 choices=['sed_eval', 'pandas']
e@47 993 )
e@47 994
e@35 995 args = argparser.parse_args()
e@42 996
e@42 997 if args.logging_level:
e@42 998 if args.logging_level == 'debug':
e@42 999 logging.basicConfig(level=logging.DEBUG)
e@42 1000 elif args.logging_level == 'info':
e@42 1001 logging.basicConfig(level=logging.INFO)
e@42 1002 elif args.logging_level == 'warning':
e@42 1003 logging.basicConfig(level=logging.WARNING)
e@42 1004 else:
e@42 1005 logging.basicConfig(level=logging.INFO)
e@42 1006
e@35 1007 if args.input_path:
e@35 1008 input_path = args.input_path
e@35 1009 logging.debug("Using `{}' as input path".format(input_path))
e@35 1010 if args.output_path:
e@35 1011 output_path = args.output_path
e@35 1012 logging.debug("Saving to `{}'".format(output_path))
e@51 1013 if args.full_duration:
e@51 1014 full_duration = True
e@35 1015 if args.scene_duration:
e@35 1016 if not (args.score_backgrounds or args.score_events):
e@35 1017 print("You must provide one of -e or -b")
e@35 1018 else:
e@35 1019 if args.image_format:
e@35 1020 image_format = args.image_format
e@35 1021 if args.channel_mode:
e@35 1022 channel_mode = args.channel_mode
e@35 1023 if args.ebr_mode:
e@35 1024 ebr_mode = args.ebr_mode
e@35 1025 if ebr_mode not in ['generate']:
e@35 1026 logging.warning("`{}' not yet implemented for EBR_MODE, using default.".format(ebr_mode))
e@35 1027 ebr_mode = 'generate'
e@35 1028 if args.time_mode:
e@35 1029 time_mode = args.time_mode
e@35 1030 if time_mode not in ['generate']:
e@35 1031 logging.warning("`{}' not yet implemented for TIME_MODE, using default.".format(time_mode))
e@35 1032 time_mode = 'generate'
e@35 1033 if args.annotation_file:
e@35 1034 annotations = read_annotations_file(args.annotation_file)
e@35 1035
e@35 1036 scene_duration = float(args.scene_duration)
e@35 1037
e@35 1038 if args.score_backgrounds:
e@35 1039 score_backgrounds = read_backgrounds_file(args.score_backgrounds)
e@35 1040 else:
e@35 1041 score_backgrounds = []
e@35 1042
e@35 1043 if args.score_events:
e@35 1044 score_events = read_events_file(args.score_events)
e@35 1045 else:
e@35 1046 score_events = []
e@35 1047
e@35 1048 if args.figure_verbosity:
e@35 1049 figure_verbosity = args.figure_verbosity
e@35 1050
e@35 1051 if args.N:
e@35 1052 generate_n = args.N
e@35 1053
e@35 1054 if args.tag:
e@35 1055 tag = args.tag
e@35 1056
e@35 1057 if generate_n == 1:
e@35 1058 append_to_filename = None
e@35 1059 simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
e@35 1060 time_mode=time_mode,
e@35 1061 ebr_mode=ebr_mode,
e@35 1062 channel_mode=channel_mode,
e@35 1063 annotation_file=annotation_file,
e@35 1064 audio_file=audio_file,
e@35 1065 figure_verbosity=figure_verbosity,
e@35 1066 end_cut=end_cut,
e@35 1067 image_format=image_format,
e@51 1068 append_to_filename=append_to_filename,
e@51 1069 full_duration=full_duration)
e@35 1070 else:
e@35 1071 for n in range(generate_n):
e@35 1072 if tag:
e@35 1073 append_to_filename = '{}_{}'.format(tag, n)
e@35 1074 else:
e@35 1075 append_to_filename = '{}'.format(n)
e@35 1076
e@35 1077 logging.info("Generating scene {}".format(n))
e@35 1078
e@35 1079 simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
e@35 1080 time_mode=time_mode,
e@35 1081 ebr_mode=ebr_mode,
e@35 1082 channel_mode=channel_mode,
e@35 1083 annotation_file=annotation_file,
e@35 1084 audio_file=audio_file,
e@35 1085 figure_verbosity=min(figure_verbosity, 1),
e@35 1086 end_cut=end_cut,
e@35 1087 image_format=image_format,
e@51 1088 append_to_filename=append_to_filename,
e@51 1089 full_duration=full_duration)