comparison: python/simscene.py @ 35:5d19c2254677

added simscene.py with the accompanying input files to generate acoustic scenes using python

author      Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date        Thu, 05 Oct 2017 14:53:15 +0100
parents
children    a0eb120940b1
comparing revisions 34:39399de892ef and 35:5d19c2254677
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# For licensing please see: LICENSE
# Copyright (c) Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
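
"""
SimScene.py acoustic scene generator.

Generates an acoustic scene of a given duration by mixing background and
event tracks described in "score" files (.txt/.csv, .json, or .xls), using
audio samples found under the `background', `event', and `pattern'
sub-directories of the input path. The command-line options are almost
identical to those of Lagrange et al.'s SimScene.
"""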

# Argparse
import argparse

# Logging
import logging

# Pandas
import pandas as pd

# Numpy
import numpy as np
import sys

# Glob
import glob
import random

# Librosa
import librosa
import librosa.display
import librosa.output

# Matplotlib
from matplotlib import rc
# rc('text', usetex=True)
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from cycler import cycler

# Tabulate
from tabulate import tabulate

def _N(t, sr=44100):
    """
    Helper function: converts time in seconds to samples,
    e.g. _N(1.5) == 66150 at the default 44100 Hz sample rate.
    """
    return int(t*sr)

def compute_energy(x):
    """Root-mean-square (RMS) energy of signal x."""
    return np.sqrt(np.mean(x**2))

# def compute_energy_profile(x, w=1000):
#     # Resize/Window signal
#     #x = np.resize(x, (w,int(np.ceil(float(len(x)/w)))))
#     x = np.array([[ii+jj for jj in range(w)] for ii in range(len(x)-w)])
#     return np.sqrt(np.mean(x**2, 1))

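# A `pattern' file is itself a small score: each row names an event sample
# (looked up by its sampleid prefix under INPUT_PATH/event/), a start/end time,
# fade times, and an amplitude (mean and standard deviation). render_pattern()
# renders such a file into a single audio array, which simscene() then uses in
# place of a plain .wav background or event sample.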
def render_pattern(fname, input_path, sr=44100):
    pattern = read_pattern_file(fname)

    start_times_samples = []
    end_times_samples = []
    durations_samples = []
    wav_files = []

    for n in range(len(pattern)):
        # Try loading the file.
        sampleid = pattern['sampleid'].loc[n]
        candidates = glob.glob('{}/event/{}*wav'.format(input_path, sampleid))
        chosen_fname = random.sample(candidates, 1)[0]

        logging.debug('Loading {}'.format(chosen_fname))

        # For each sound in the pattern file, place it starting from start_time plus an
        # offset with a mean value of 0 and a standard deviation of offset_stddev. The
        # first event cannot start earlier than time 0. If end_time is defined (not nan),
        # then cut the event at end_time.
        wav, SR = librosa.load(chosen_fname, sr=sr)

        # Read and assign an amplitude
        amplitude_mean = float(pattern['amplitude'].loc[n])
        amplitude_stddev = float(pattern['amplitude_stdev'].loc[n])
        amplitude = amplitude_mean + np.random.randn()*amplitude_stddev
        wav *= amplitude

        start_time = max(float(pattern['start_time'].loc[n]), 0)
        start_time_samples = int(start_time*SR)

        fade_in_time = float(pattern['fade_in_time'].loc[n])
        fade_out_time = float(pattern['fade_out_time'].loc[n])
        end_time = float(pattern['end_time'].loc[n])

        # If end_time is not defined (-1 or just empty)
        # then just derive it from the length of the sample
        if np.isnan(end_time) or float(end_time) == -1:
            duration_samples = len(wav)
            end_time_samples = start_time_samples + duration_samples
        elif end_time - start_time > len(wav)/float(SR):
            # If the given end_time is more than start_time + duration of the sample
            # then pad the file with zeros to reach the desired end time.
            duration = end_time - start_time
            duration_samples = int(duration*SR)
            end_time_samples = start_time_samples + duration_samples
            wav_arr = np.zeros(duration_samples)
            wav_arr[:len(wav)] = wav
            wav = wav_arr
        else:
            duration = end_time - start_time
            duration_samples = int(duration*SR)
            end_time_samples = start_time_samples + duration_samples

        event_render = fade(wav[:duration_samples], fade_in_time, fade_out_time)

        start_times_samples.append(start_time_samples)
        end_times_samples.append(end_time_samples)
        durations_samples.append(duration_samples)
        wav_files.append(event_render)

    # The rendered pattern is as long as the latest-ending event (the rows are
    # not necessarily sorted by end time).
    pattern_duration = max(end_times_samples)
    pattern_arr = np.zeros(pattern_duration)

    for n, s in enumerate(start_times_samples):
        wav = wav_files[n]
        pattern_arr[s:s+len(wav)] = wav

    return pattern_arr, sr

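# The read_*_file() helpers below accept .xls, .json, .csv or .txt score files.
# For headerless delimited text files the column order is assumed to be the one
# hard-coded in each function; times are in seconds, snr and ebr values in dB.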
def read_events_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt']:
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)
            df.columns = ['label', 'sampleid', 'ebr', 'ebr_stddev', 'mean_time_between_instances',
                          'time_between_instances_stddev', 'start_time', 'end_time',
                          'fade_in_time', 'fade_out_time']
    elif fname[-3:].lower() in ['csv']:
        df = pd.read_csv(fname)

    logging.info('Using input:\n' + tabulate(df, headers='keys', tablefmt='psql'))
    return df

def read_pattern_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt']:
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)
            # render_pattern() looks samples up by 'sampleid'.
            df.columns = ['sampleid', 'start_time', 'end_time', 'time_offset_stdev',
                          'fade_in_time', 'fade_out_time', 'amplitude', 'amplitude_stdev']
    elif fname[-3:].lower() in ['csv']:
        df = pd.read_csv(fname)

    logging.info('Using input:\n' + tabulate(df, headers='keys', tablefmt='psql'))
    return df

def read_backgrounds_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt']:
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)
            df.columns = ['label', 'sampleid', 'snr']
    elif fname[-3:].lower() in ['csv']:
        df = pd.read_csv(fname)

    logging.info('Using input:\n' + tabulate(df, headers='keys', tablefmt='psql'))
    return df

def read_annotations_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt', 'csv']:
        with open(fname) as f:
            header = f.readline()
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            if sep in header:
                logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
                df = pd.read_csv(f, header=None, sep=sep)
                df.columns = ['start', 'stop', 'class']
            else:
                df = pd.read_csv(f, sep=sep)
                df.columns = ['start', 'stop', 'class']

    logging.info('Using input:\n' + tabulate(df, headers='keys', tablefmt='psql'))
    return df

def run_demo():
    print("TODO: Implement run_demo()")

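# fade() applies linear ramps at the start and end of a signal. At the default
# 44100 Hz sample rate, e.g. fade(x, 0.01, 0.05) ramps the first 441 samples up
# from 0 and the last 2205 samples down towards 0.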
def fade(x, fade_in, fade_out, sr=44100):
    """
    Creates a fade-in-fade-out envelope
    for audio array x.
    """

    if len(x) == 0:
        return x

    # Clamp the ramp lengths to the signal length so that very short
    # signals do not cause out-of-range indexing.
    fade_in_samples = min(int(fade_in*sr), len(x))
    fade_out_samples = min(int(fade_out*sr), len(x))

    outp = np.ones_like(x)
    for n in range(fade_in_samples):
        outp[n] = n*1./fade_in_samples

    for n in range(fade_out_samples):
        outp[len(outp)-fade_out_samples+n] = 1 - 1./fade_out_samples*n
    return outp*x

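# simscene() builds the scene in two passes:
#   1. Background tracks: each background sample (or rendered pattern) is tiled
#      back to back for the whole scene duration. The first background is scaled
#      directly by its linear snr; every later background is scaled so that its
#      RMS energy relative to the already-mixed tracks matches the requested snr.
#   2. Event tracks: each event is placed at start_time and repeated with a
#      spacing drawn from mean_time_between_instances (plus Gaussian jitter)
#      until end_time, then rescaled so that the peak of its frame-wise RMS
#      relative to the background matches the requested ebr.
# Per-track figures and .wav files are optionally written to output_path,
# depending on figure_verbosity and channel_mode.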
def simscene(input_path,
             output_path,
             scene_duration,
             score_events,
             score_backgrounds,
             **kwargs):
    logging.info('simscene() is not yet implemented fully')
    SR = 44100  # Samplerate. Should probably not be hardcoded

    events_df = score_events
    backgrounds_df = score_backgrounds

    # Create empty numpy array
    scene_arr = np.zeros(int(scene_duration*SR))

    if 'append_to_filename' in kwargs:
        append_to_filename = kwargs['append_to_filename']
    else:
        append_to_filename = None

    if 'end_cut' in kwargs:
        end_cut = kwargs['end_cut']
    else:
        end_cut = False

    if 'figure_verbosity' in kwargs:
        figure_verbosity = kwargs['figure_verbosity']
    else:
        figure_verbosity = 0

    if 'image_format' in kwargs:
        image_format = kwargs['image_format']
    else:
        image_format = 'png'

    # Read channel_mode from kwargs like the other options (default: mono).
    if 'channel_mode' in kwargs:
        channel_mode = kwargs['channel_mode']
    else:
        channel_mode = 'mono'

    # Stores the starting and ending times of every track for visualization
    # purposes
    scene_starting_times = []
    scene_ending_times = []

    # List of tracks
    track_list = []
    background_energies = []

    for n in range(len(backgrounds_df)):
        # Get label of background
        label = str(backgrounds_df['label'].loc[n])

        # First check if there are any pattern candidates. Give priority
        # to pattern files.
        candidates = []
        for pattern_format in ['xls', 'json', 'txt', 'csv']:
            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, backgrounds_df['sampleid'].loc[n], pattern_format))

        if len(candidates) == 0:
            # If no patterns are found, search for normal audio files
            candidates = glob.glob('{}/background/{}*.wav'.format(input_path, backgrounds_df['sampleid'].loc[n]))
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = librosa.load(chosen_fname, sr=SR)
        else:
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = render_pattern(chosen_fname, input_path)

        duration = len(wav)/float(SR)
        target_snr_db = float(backgrounds_df['snr'].loc[n])
        target_snr = 10**(target_snr_db/20.0)

        energy = compute_energy(wav)

        logging.debug('{}:energy:{}'.format(label, energy))

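        # snr in the score file is given in dB; 10**(dB/20) above converts it to
        # a linear amplitude ratio (roughly 0.5 for -6 dB, roughly 2 for +6 dB),
        # which is used as the scaling target below.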
        if n == 0:
            # For the first background track, snr gives an amount by which it is
            # going to be scaled (i.e. to make it quieter).
            amplitude_factor = target_snr
            wav *= amplitude_factor

        if n > 0:
            noise_energy = compute_energy(np.sum(np.array(track_list), axis=0))
            logging.info('{}:noise_energy:{}'.format(label, noise_energy))

            old_snr = energy/noise_energy
            old_snr_db = 20*np.log10(old_snr)
            logging.info('{}:old_snr:{}'.format(label, old_snr_db))

            amplitude_factor = target_snr/old_snr

            wav *= amplitude_factor
            new_energy = compute_energy(wav)
            new_snr = new_energy/noise_energy
            new_snr_db = 20. * np.log10(new_snr)
            logging.info('{}:new_snr:{}'.format(label, new_snr_db))

        # Track array
        track_arr = np.zeros(int(scene_duration*SR))
        start_times = [0.0]
        end_times = [start_times[-1]+len(wav)/float(SR)]

        # Start with the first time in the list
        new_start_time = start_times[-1]
        new_end_time = end_times[-1]

        # Tile the background sample back to back until the scene is full.
        while new_start_time < scene_duration:
            offset = duration
            new_start_time += offset
            new_end_time += offset

            start_times.append(new_start_time)
            end_times.append(new_end_time)

        for n, t in enumerate(start_times):
            # We need to be careful with the limits here
            # since numpy will just ignore indexing that
            # exceeds the size of the array.

            # Fading times in case we need to join many
            # consecutive samples together.
            # if n == 0:
            #     # Little fade-out, fade-in to smoothly repeat the
            #     # background.
            #     fade_in_time = 0.0
            #     fade_out_time = 0.01
            # elif n > 0 and n < len(start_times) - 1:
            #     fade_in_time = 0.01
            #     fade_out_time = 0.01
            # else:
            #     fade_in_time = 0.01
            #     fade_out_time = 0.0
            begin = min(_N(t), len(track_arr))
            end = min(len(track_arr), _N(t)+len(wav))

            # Part of the wav to store
            # part = fade(wav[:end-begin], fade_in_time, fade_out_time)
            part = wav[:end-begin]

            track_arr[begin:end] += part

        track_list.append(track_arr)
        scene_arr[:len(track_arr)] += track_arr

        if channel_mode == 'separate':
            librosa.output.write_wav('{}/{}_background_track.wav'.format(output_path, label), track_arr, SR)

        F = librosa.stft(track_arr, 1024)
        energy_prof = librosa.feature.rmse(S=F)
        background_energies.append(energy_prof)

        if figure_verbosity > 0:
            plt.figure()
            plt.subplot(3, 1, 1)
            plt.title('`{}\' background waveform and spectrogram'.format(label))
            librosa.display.waveplot(track_arr, sr=SR)

            # Plot spectrogram
            Fdb = librosa.amplitude_to_db(F)
            plt.subplot(3, 1, 2)
            librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')

            # Plot energy profile
            plt.subplot(3, 1, 3)
            time = np.linspace(0, len(track_arr)/float(SR), len(energy_prof.T))
            plt.semilogy(time, energy_prof.T)
            plt.xlim([0, len(track_arr)/float(SR)])
            plt.ylabel('energy (rms)')

            # Tidy up and save to file
            plt.tight_layout()
            if append_to_filename:
                plt.savefig('{}/background_{}_{}.{}'.format(output_path, label, append_to_filename, image_format), dpi=300)
            else:
                plt.savefig('{}/background_{}.{}'.format(output_path, label, image_format), dpi=300)

    # Compute the total energy of the background
    if len(backgrounds_df) > 0:
        background_arr = np.sum(track_list, 0)
        B = librosa.stft(background_arr, 1024)
        background_energy = librosa.feature.rmse(S=B).flatten()
    else:
        background_energy = 0.0

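    # Events are levelled by their event-to-background ratio (ebr): each event
    # track is rescaled so that the maximum of its frame-wise RMS divided by the
    # background's frame-wise RMS equals the target ebr from the score file
    # (again dB converted to linear, so an ebr of 6 dB makes the event peak at
    # roughly twice the background RMS in its loudest frame).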
    for n in range(len(events_df)):
        # Get label of track
        label = str(events_df['label'].loc[n])

        # First check if there are any pattern candidates. Give priority
        # to pattern files.
        candidates = []
        for pattern_format in ['xls', 'json', 'txt', 'csv']:
            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, events_df['sampleid'].loc[n], pattern_format))

        if len(candidates) == 0:
            # If no patterns are found, search for normal audio files
            candidates = glob.glob('{}/event/{}*.wav'.format(input_path, events_df['sampleid'].loc[n]))
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = librosa.load(chosen_fname, sr=SR)
        else:
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = render_pattern(chosen_fname, input_path)

        # Apply a fader envelope
        fade_in_time = float(events_df['fade_in_time'].loc[n])
        fade_out_time = float(events_df['fade_out_time'].loc[n])
        wav = fade(wav, fade_in_time, fade_out_time)

        # Set target EBR (dB in the score file, converted to a linear ratio)
        target_ebr = 10**(float(events_df['ebr'].loc[n])/20.0 + np.random.randn()*float(events_df['ebr_stddev'].loc[n])/20.0)

        # Mean time between instances \mu.
        mean_time_between_instances = events_df['mean_time_between_instances'].loc[n]
        track_end_time = events_df['end_time'].loc[n]

        # Track array
        track_arr = np.zeros(int(scene_duration*SR))

        # If \mu is -1, then play the event only once.
        if mean_time_between_instances == -1:
            track_arr[_N(events_df['start_time'].loc[n]):_N(events_df['start_time'].loc[n])+len(wav)] += wav
            start_times = [float(events_df['start_time'].loc[n])]
            end_times = [float(events_df['end_time'].loc[n])]
        else:
            # If 0, then start the next sample right after this one (set it to the duration of the sample)
            if mean_time_between_instances == 0:
                mean_time_between_instances = len(wav)/float(SR)

            # Store the successive starting and ending times of the events (given e.g. the model)
            # in the following lists.
            start_times = [events_df['start_time'].loc[n]]
            end_times = [start_times[-1]+len(wav)/float(SR)]

            # Start with the first time in the list
            new_start_time = start_times[-1]
            new_end_time = end_times[-1]

            # Until the track is full
            while new_start_time < track_end_time:
                offset = float(mean_time_between_instances) +\
                    float(events_df['time_between_instances_stddev'].loc[n]*np.random.randn())
                new_start_time += offset
                new_end_time += offset

                # The only exception is if we have set the 'end_cut' flag
                # and the end time of the event surpasses the end time
                # of the track
                if end_cut and new_end_time > track_end_time:
                    break
                else:
                    start_times.append(new_start_time)
                    end_times.append(new_end_time)

            for t in start_times:
                # We need to be careful with the limits here
                # since numpy will just ignore indexing that
                # exceeds the size of the array
                begin = min(_N(t), len(track_arr))
                end = min(len(track_arr), _N(t)+len(wav))

                # Part of the wav to store
                part = wav[:end-begin]

                # If the wav file was truncated, fade out
                # quickly to avoid clicks
                if len(part) < len(wav) and len(part) > fade_out_time*SR:
                    part = fade(part, 0, fade_out_time)

                track_arr[begin:end] += part

        # Compute energies
        F = librosa.stft(track_arr, 1024)
        energy_prof = librosa.feature.rmse(S=F).flatten()

        # Compute the current ebr and rescale the track to the target ebr
        if len(backgrounds_df) > 0:
            ebr_prof = energy_prof/background_energy[:len(energy_prof)].flatten()
            curr_ebr = np.max(ebr_prof)
            logging.debug('{}:Target ebr: {}db'.format(label, 20*np.log10(target_ebr)))
            logging.debug('{}:Current track ebr: {}db'.format(label, 20*np.log10(curr_ebr)))

            # Set correct ebr
            track_arr = track_arr/curr_ebr*target_ebr

            Fnew = librosa.stft(track_arr, 1024)
            new_energy_prof = librosa.feature.rmse(S=Fnew).flatten()
            new_ebr_prof = new_energy_prof/background_energy[:len(energy_prof)].flatten()
            new_ebr = np.max(new_ebr_prof)
            logging.debug('{}:New track ebr: {}db'.format(label, 20*np.log10(new_ebr)))

        # Mix the (ebr-adjusted) track into the scene
        track_list.append(track_arr)
        scene_arr[:len(track_arr)] += track_arr

        if channel_mode == 'separate':
            librosa.output.write_wav('{}/{}_event_track.wav'.format(output_path, label), track_arr/np.max(track_arr), SR)

        if figure_verbosity > 0:
            plt.figure()

            plt.subplot(3, 1, 1)
            plt.title('`{}\' event waveform and spectrogram'.format(label))

            librosa.display.waveplot(track_arr, sr=SR)
            Fdb = librosa.amplitude_to_db(F)
            plt.subplot(3, 1, 2)
            librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')

            # Plot energy profile
            plt.subplot(3, 1, 3)
            time = np.linspace(0, len(track_arr)/float(SR), len(energy_prof.T))
            plt.semilogy(time, energy_prof.T)
            plt.xlim([0, len(track_arr)/float(SR)])
            plt.ylabel('energy (rms)')

            plt.tight_layout()
            if append_to_filename:
                plt.savefig('{}/event_{}_{}.{}'.format(output_path, label, append_to_filename, image_format), dpi=300)
            else:
                plt.savefig('{}/event_{}.{}'.format(output_path, label, image_format), dpi=300)

        scene_starting_times.append((label, start_times))
        scene_ending_times.append((label, end_times))

    if figure_verbosity > 0:
        plt.figure()
        ax0 = plt.subplot(3, 1, 1)
        plt.title('Synthesized Scene')
        librosa.display.waveplot(scene_arr, sr=SR)
        F = librosa.stft(scene_arr)
        Fdb = librosa.amplitude_to_db(F)
        ax1 = plt.subplot(3, 1, 2)
        librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
        ax2 = plt.subplot(3, 1, 3)
        ax2.set_xlim([0, scene_duration])

        # Get labels
        labels = [s[0] for s in scene_starting_times]

        # If background is active
        if len(backgrounds_df) > 0:
            labels.append('background')

        # Set the y axis limit, with a padding of 0.5.
        ax2.set_ylim([-0.5, len(labels)-0.5])

        plt.yticks(range(len(labels)), labels)

        for n in range(len(scene_starting_times)):
            label = scene_starting_times[n][0]
            start_times = scene_starting_times[n][1]
            end_times = scene_ending_times[n][1]
            color = ['r', 'g', 'y'][n % 3]

            for m in range(len(start_times)):
                plt.hlines(y=float(n), xmin=start_times[m], xmax=end_times[m], alpha=0.5, color=color, linewidth=4)
                if figure_verbosity > 2:
                    ax0.axvline(start_times[m], color=color, alpha=0.1)
                    ax0.axvline(end_times[m], color=color, alpha=0.1)
                    ax0.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
                    ax1.axvline(start_times[m], color=color, alpha=0.1)
                    ax1.axvline(end_times[m], color=color, alpha=0.1)
                    ax1.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
                    ax2.axvline(start_times[m], color=color, alpha=0.1)
                    ax2.axvline(end_times[m], color=color, alpha=0.1)
                    ax2.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)

        if len(backgrounds_df) > 0:
            plt.axhline(y=len(scene_starting_times), alpha=0.5, color='k', linewidth=4)

        plt.tight_layout()

        if append_to_filename:
            plt.savefig('{}/scene_{}.{}'.format(output_path, append_to_filename, image_format), dpi=300)
        else:
            plt.savefig('{}/scene.{}'.format(output_path, image_format), dpi=300)

        if figure_verbosity > 1:
            plt.show()

    # Replace nans (e.g. because of division by zero) in the scene with zeros.
    scene_arr = np.nan_to_num(scene_arr)

    if channel_mode == 'mono':
        if append_to_filename:
            librosa.output.write_wav('{}/scene_{}.wav'.format(output_path, append_to_filename), scene_arr, SR)
        else:
            librosa.output.write_wav('{}/scene.wav'.format(output_path), scene_arr, SR)

    return scene_arr


def not_implemented():
    print("TODO: not implemented")

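# Example invocation (paths and file names below are illustrative):
#   python simscene.py ./input ./output 30 -e ./input/score_events.csv \
#       -b ./input/score_backgrounds.csv -vv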
if __name__ == "__main__":
    """
    Main entry point: parses options and calls the simscene generation function
    or a demo. The options given are almost identical to those of Lagrange et
    al.'s SimScene.
    """
    argparser = argparse.ArgumentParser(
        description="SimScene.py acoustic scene generator",
    )
    argparser.add_argument(
        'input_path',
        type=str,
        help="Path of a directory containing wave files for sound backgrounds (in the `background' sub-directory) or events (in the `event' sub-directory)"
    )
    argparser.add_argument(
        'output_path',
        type=str,
        help="The directory in which the generated scenes and annotations will reside."
    )
    argparser.add_argument(
        'scene_duration',
        type=float,
        help="Duration of the scene in seconds",
    )
    scene_duration = None

    argparser.add_argument(
        '-e', '--score-events',
        type=str,
        help="Score events file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
    )
    score_events = None

    argparser.add_argument(
        '-b', '--score-backgrounds',
        type=str,
        help="Score backgrounds file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
    )
    score_backgrounds = None

    argparser.add_argument(
        '--tag',
        type=str,
        help="Append _TAG_XXX to filenames, where XXX is an increment."
    )
    tag = None

    argparser.add_argument(
        '-N',
        type=int,
        help="Generate N instances of the scene. If not specified, only generate a single instance. Note that if N > 1, the figure verbosity must be less than or equal to 1."
    )
    generate_n = 1

    argparser.add_argument(
        '-t', '--time-mode',
        type=str,
        help="Mode of spacing between events. `generate': values must be set for each track in the score files. `abstract': values are computed from an abstract representation of an existing acoustic scene. `replicate': values are replicated from an existing acoustic scene. (NOT IMPLEMENTED)",
        choices=['generate', 'abstract', 'replicate']
    )
    time_mode = 'generate'

    argparser.add_argument(
        '-R', '--ebr-mode',
        type=str,
        help="Mode for the Event to Background power level ratio (EBR). `generate': values must be set for each track in the score files. `abstract': values are computed from an abstract representation of an existing acoustic scene. `replicate': values are replicated from an existing acoustic scene. (NOT IMPLEMENTED)",
        choices=['generate', 'abstract', 'replicate']
    )
    ebr_mode = 'generate'

    argparser.add_argument(
        '-A', '--annotation-file',
        type=str,
        help="If -R or -t are selected, source the times or EBRs from ANNOTATION_FILE. ANNOTATION_FILE must be a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file. (NOT IMPLEMENTED)"
    )
    annotation_file = None

    argparser.add_argument(
        '-a', '--audio-file',
        type=str,
        help="If -R or -t are selected, source the times or EBRs from AUDIO_FILE. AUDIO_FILE must be a 44100 Hz .wav file. (NOT IMPLEMENTED)"
    )
    audio_file = None

    argparser.add_argument(
        '-v', '--figure-verbosity', action='count',
        help="Increase figure verbosity. (Default) 0 - Don't save or display figures, 1 - Save figures but do not display them, 2 - Save and display figures, 3 - Add shades over the events in the final plot"
    )
    figure_verbosity = 0

    argparser.add_argument(
        '-x', '--image-format',
        help="Image format for the figures",
        choices=['png', 'jpg', 'pdf']
    )
    image_format = 'png'

    argparser.add_argument(
        '-C', '--channel-mode',
        type=str,
        help="How the audio is written to file. (Default) 'mono' - the whole scene is mixed down to a single mono file, 'separate' - each background and event track is saved in a separate .wav file instead.",
        choices=['mono', 'separate']
    )
    channel_mode = 'mono'

    # argparser.add_argument(
    #     '-m', '--min-space',
    #     type=float,
    #     help="Minimum space allowed between successive events (seconds). If -1, then allow overlapping between events."
    # )
    min_space = -1

    argparser.add_argument(
        '-c', '--end-cut',
        action='store_true',
        help="If the last event instance would end after its track ends: if enabled, drop that instance, else keep it (it is cut off at the end of the scene)."
    )
    end_cut = None

    logging.basicConfig(level=logging.DEBUG)

    args = argparser.parse_args()
    if args.input_path:
        input_path = args.input_path
        logging.debug("Using `{}' as input path".format(input_path))
    if args.output_path:
        output_path = args.output_path
        logging.debug("Saving to `{}'".format(output_path))
    if args.scene_duration:
        if not (args.score_backgrounds or args.score_events):
            print("You must provide one of -e or -b")
        else:
            if args.image_format:
                image_format = args.image_format
            if args.channel_mode:
                channel_mode = args.channel_mode
            if args.ebr_mode:
                ebr_mode = args.ebr_mode
                if ebr_mode not in ['generate']:
                    logging.warning("`{}' not yet implemented for EBR_MODE, using default.".format(ebr_mode))
                    ebr_mode = 'generate'
            if args.time_mode:
                time_mode = args.time_mode
                if time_mode not in ['generate']:
                    logging.warning("`{}' not yet implemented for TIME_MODE, using default.".format(time_mode))
                    time_mode = 'generate'
            if args.annotation_file:
                annotations = read_annotations_file(args.annotation_file)

            scene_duration = float(args.scene_duration)

            if args.score_backgrounds:
                score_backgrounds = read_backgrounds_file(args.score_backgrounds)
            else:
                score_backgrounds = []

            if args.score_events:
                score_events = read_events_file(args.score_events)
            else:
                score_events = []

            if args.figure_verbosity:
                figure_verbosity = args.figure_verbosity

            if args.N:
                generate_n = args.N

            if args.tag:
                tag = args.tag

            if generate_n == 1:
                append_to_filename = None
                simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
                         time_mode=time_mode,
                         ebr_mode=ebr_mode,
                         channel_mode=channel_mode,
                         annotation_file=annotation_file,
                         audio_file=audio_file,
                         figure_verbosity=figure_verbosity,
                         min_space=min_space,
                         end_cut=end_cut,
                         image_format=image_format,
                         append_to_filename=append_to_filename)
            else:
                for n in range(generate_n):
                    if tag:
                        append_to_filename = '{}_{}'.format(tag, n)
                    else:
                        append_to_filename = '{}'.format(n)

                    logging.info("Generating scene {}".format(n))

                    simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
                             time_mode=time_mode,
                             ebr_mode=ebr_mode,
                             channel_mode=channel_mode,
                             annotation_file=annotation_file,
                             audio_file=audio_file,
                             figure_verbosity=min(figure_verbosity, 1),
                             min_space=min_space,
                             end_cut=end_cut,
                             image_format=image_format,
                             append_to_filename=append_to_filename)
867 |