comparison python/simscene.py @ 35:5d19c2254677

added simscene.py with the accompanying input files to generate acoustic scenes using python
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Thu, 05 Oct 2017 14:53:15 +0100
parents
children a0eb120940b1
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # For licensing please see: LICENSE
4 # Copyright (c) Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
5
6 # Argparse
7 import argparse
8
9 # Logging
10 import logging
11
12 # Pandas
13 import pandas as pd
14
15 # Numpy
16 import numpy as np
17 import sys
18
19 # Glob
20 import glob
21 import random
22
23 # Librosa
24 import librosa
25 import librosa.display
26 import librosa.output
27
28 # Matplotlib
29 from matplotlib import rc
30 # rc('text', usetex=True)
31 import matplotlib.pyplot as plt
32 import matplotlib.patches as patches
33 from cycler import cycler
34
35 # Tabulate
36 from tabulate import tabulate
37
38 def _N(t, sr=44100):
39 """
40 Helper function: Converts time to samples
41 """
42 return int(t*sr)
43
44 def compute_energy(x):
45 return np.sqrt(np.mean(x**2))
46
47 # def compute_energy_profile(x, w=1000):
48 # # Resize/Window signal
49 # #x = np.resize(x, (w,int(np.ceil(float(len(x)/w)))))
50 # x = np.array([[ii+jj for jj in range(w)] for ii in range(len(x)-w)])
51 # return np.sqrt(np.mean(x**2, 1))
52
53 def render_pattern(fname, input_path, sr=44100):
54 pattern = read_pattern_file(fname)
55
56 start_times_samples = []
57 end_times_samples = []
58 durations_samples = []
59 wav_files = []
60
61 for n in range(len(pattern)):
62 # Try loading the file,
63 sampleid = pattern['sampleid'].loc[n]
64 candidates = glob.glob('{}/event/{}*.wav'.format(input_path,sampleid))
65 chosen_fname = random.sample(candidates, 1)[0]
66
67 logging.debug('Loading {}'.format(chosen_fname))
68
69 # For each sound in the pattern file, place it starting from starttime + an offset
70 # with a mean value of 0 and standard deviation of offset_stddev. The first event can
71 # not start earlier than time 0. If endtime is defined (not nan), then cut the event at
72 # end time.
73 wav, SR = librosa.load(chosen_fname, sr=sr)
74
75 # Read and assign an amplitude
76 amplitude_mean = float(pattern['amplitude'].loc[n])
77 amplitude_stddev = float(pattern['amplitude_stdev'].loc[n])
78 amplitude = amplitude_mean + np.random.randn()*amplitude_stddev
79 wav *= amplitude
80
81 start_time = max(float(pattern['start_time'].loc[n]),0)
82 start_time_samples = int(start_time*SR)
83
84 fade_in_time = float(pattern['fade_in_time'].loc[n])
85 fade_out_time = float(pattern['fade_out_time'].loc[n])
86 end_time = float(pattern['end_time'].loc[n])
87
88 # If end_time is not defined (-1 or just empty)
89 # then just derive it from the length of the sample
90 if np.isnan(end_time) or float(end_time) == -1:
91 duration_samples = len(wav)
92 end_time_samples = start_time_samples + duration_samples
93 elif end_time - start_time > len(wav)/float(SR):
94
95 # If given end_time is more than start_time + duration of sample
96 # then pad the file with zeros to reach the desired end time.
97 duration = end_time - start_time
98 duration_samples = int(duration*SR)
99 end_time_samples = start_time_samples + duration_samples
100 wav_arr = np.zeros(duration_samples)
101 wav_arr[:len(wav)] = wav
102 wav = wav_arr
103 else:
104 duration = end_time - start_time
105 duration_samples = int(duration*SR)
106 end_time_samples = start_time_samples + duration_samples
107
108 event_render = fade(wav[:duration_samples], fade_in_time, fade_out_time)
109
110 start_times_samples.append(start_time_samples)
111 end_times_samples.append(end_time_samples)
112 durations_samples.append(duration_samples)
113 wav_files.append(event_render)
114
115 pattern_duration = max(end_times_samples)  # the latest-ending event defines the pattern length
116 pattern_arr = np.zeros(pattern_duration)
117
118 for n, s in enumerate(start_times_samples):
119 wav = wav_files[n]
120 pattern_arr[s:s+len(wav)] = wav
121
122 return pattern_arr, sr
123
124 def read_events_file(fname):
125 if fname[-3:].lower() == 'xls':
126 df = pd.read_excel(fname)
127 elif fname[-4:].lower() == 'json':
128 df = pd.read_json(fname)
129 elif fname[-3:].lower() in ['txt']:
130 with open(fname) as f:
131 s = f.readline()
132 f.seek(0,0)
133 if ',' in s:
134 sep = ','
135 elif '\t' in s:
136 sep = '\t'
137 else:
138 sep = ' '
139 logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
140 df = pd.read_csv(f, header=None, sep=sep)
141 df.columns = ['label','sampleid','ebr','ebr_stddev','mean_time_between_instances','time_between_instances_stddev','start_time','end_time','fade_in_time','fade_out_time']
142 elif fname[-3:].lower() in ['csv']:
143 df = pd.read_csv(fname)
144
145 logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
146 return df
147
148 def read_pattern_file(fname):
149 if fname[-3:].lower() == 'xls':
150 df = pd.read_excel(fname)
151 elif fname[-4:].lower() == 'json':
152 df = pd.read_json(fname)
153 elif fname[-3:].lower() in ['txt']:
154 with open(fname) as f:
155 s = f.readline()
156 f.seek(0,0)
157 if ',' in s:
158 sep = ','
159 elif '\t' in s:
160 sep = '\t'
161 else:
162 sep = ' '
163 logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
164 df = pd.read_csv(f, header=None, sep=sep)
165 df.columns = ['eventid','start_time','end_time','time_offset_stdev','fade_in_time', 'fade_out_time', 'amplitude', 'amplitude_stdev']
166 elif fname[-3:].lower() in ['csv']:
167 df = pd.read_csv(fname)
168
169 logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
170 return df
171
172 def read_backgrounds_file(fname):
173 if fname[-3:].lower() == 'xls':
174 df = pd.read_excel(fname)
175 elif fname[-4:].lower() == 'json':
176 df = pd.read_json(fname)
177 elif fname[-3:].lower() in ['txt']:
178 with open(fname) as f:
179 s = f.readline()
180 f.seek(0,0)
181 if ',' in s:
182 sep = ','
183 elif '\t' in s:
184 sep = '\t'
185 else:
186 sep = ' '
187 logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
188 df = pd.read_csv(f, header=None, sep=sep)
189 df.columns = ['label','sampleid','snr']
190 elif fname[-3:].lower() in ['csv']:
191 df = pd.read_csv(fname)
192
193 logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
194 return df
195
196 def read_annotations_file(fname):
197 if fname[-3:].lower() == 'xls':
198 df = pd.read_excel(fname)
199 elif fname[-4:].lower() == 'json':
200 df = pd.read_json(fname)
201 elif fname[-3:].lower() in ['txt', 'csv']:
202
203 with open(fname) as f:
204 header = f.readline()
205
206 s = f.readline()
207 f.seek(0,0)
208 if ',' in s:
209 sep = ','
210 elif '\t' in s:
211 sep = '\t'
212 else:
213 sep = ' '
214 if sep in header:
215 logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
216 df = pd.read_csv(f, header=None, sep=sep)
217 df.columns = ['start', 'stop', 'class']
218 else:
219 df = pd.read_csv(f, sep=sep)
220 df.columns = ['start', 'stop', 'class']
222
223 logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
224 return df
225
226 def run_demo():
227 print("TODO: Implement run_demo()")
228
229 def fade(x, fade_in, fade_out, sr=44100):
230 """
231 Creates a fade-in-fade-out envelope
232 for audio array x.
233 """
234
235 if len(x) == 0:
236 return x
237
238 fade_in_samples = int(fade_in*sr)
239 fade_out_samples = int(fade_out*sr)
240
241 outp = np.ones_like(x)
242 for n in range(fade_in_samples):
243 outp[n] = n*1./fade_in_samples
244
245 for n in range(fade_out_samples):
246 outp[len(outp)-fade_out_samples+n] = 1-1./fade_out_samples*n
247 return outp*x
248
249 def simscene(input_path,
250 output_path,
251 scene_duration,
252 score_events,
253 score_backgrounds,
254 **kwargs):
255 logging.info('simscene() is not yet fully implemented')
256 SR = 44100 # Samplerate. Should probably not be hardcoded
257
258 events_df = score_events
259 backgrounds_df = score_backgrounds
260
261 # Create empty numpy array
262 scene_arr = np.zeros(int(scene_duration*SR))
263
264 if 'append_to_filename' in kwargs:
265 append_to_filename = kwargs['append_to_filename']
266 else:
267 append_to_filename = None
268
269 if 'end_cut' in kwargs:
270 end_cut = kwargs['end_cut']
271 else:
272 end_cut = False
273
274 if 'figure_verbosity' in kwargs:
275 figure_verbosity = kwargs['figure_verbosity']
276 else:
277 figure_verbosity = 0
278
279 if 'image_format' in kwargs:
280 image_format = kwargs['image_format']
281 else:
282 image_format = 'png'
283
284 # Stores the starting and ending times of every track for visualization
285 # purposes
286 scene_starting_times = []
287 scene_ending_times = []
288
289 # List of tracks
290 track_list = []
291 background_energies = []
292
293 for n in range(len(backgrounds_df)):
294 # Get label of background
295 label = str(backgrounds_df['label'].loc[n])
296
297 # First check if there are any pattern candidates. Give priority
298 # to pattern files.
299 candidates = []
300 for pattern_format in ['xls', 'json', 'txt', 'csv']:
301 candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, backgrounds_df['sampleid'].loc[n], pattern_format))
302
303 if len(candidates) == 0:
304 # If no patterns are found, search for normal audio files
305 candidates = glob.glob('{}/background/{}*.wav'.format(input_path, backgrounds_df['sampleid'].loc[n]))
306 chosen_fname = random.sample(candidates, 1)[0]
307 wav, sr = librosa.load(chosen_fname, sr=SR)
308 else:
309 chosen_fname = random.sample(candidates, 1)[0]
310 wav, sr = render_pattern(chosen_fname, input_path)
311
312 duration = len(wav)/float(SR)
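# The `snr' column is given in dB; it is converted to a linear amplitude ratio below.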
313 target_snr_db = float(backgrounds_df['snr'].loc[n])
314 target_snr = 10**(target_snr_db/20.0)
315
316 energy = compute_energy(wav)
317
318 logging.debug('{}:energy:{}'.format(label,energy))
319
320
321 if n == 0:
322 # For the first background track, snr is simply a linear gain
323 # that scales the track (values below 1 make it quieter).
324 amplitude_factor = target_snr
325 wav *= amplitude_factor
326
327 if n > 0:
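# For subsequent background tracks, snr is the target ratio between this
# track's energy and the energy of the background tracks mixed so far, so the
# track is rescaled from its measured ratio (old_snr) to the target ratio.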
328 noise_energy = compute_energy(np.sum(np.array(track_list), axis=0))
329 logging.info('{}:noise_energy:{}'.format(label,noise_energy))
330
331 old_snr = energy/noise_energy
332 old_snr_db = 20*np.log10(old_snr)
333 logging.info('{}:old_snr:{}'.format(label,old_snr_db))
334
335 amplitude_factor = target_snr/old_snr
336
337
338 wav *= amplitude_factor
339 new_energy = compute_energy(wav)
340 new_snr = new_energy/noise_energy
341 new_snr_db = 20. * np.log10(new_snr)
342 logging.info('{}:new_snr:{}'.format(label,new_snr_db))
343
344
345 # Track array
346 track_arr = np.zeros(int(scene_duration*SR))
347 start_times = [0.0]
348 end_times = [start_times[-1]+len(wav)/float(SR)]
349
350
351 # Start with the first time in the list
352 new_start_time = start_times[-1]
353 new_end_time = end_times[-1]
354
355 while new_start_time < scene_duration:
356 offset = duration
357 new_start_time += offset
358 new_end_time += offset
359
360 start_times.append(new_start_time)
361 end_times.append(new_end_time)
362
363 for n,t in enumerate(start_times):
364 # We need to be careful with the limits here
365 # since numpy will just ignore indexing that
366 # exceeds the size of the array.
367
368 # Fading times in case we need to join many
369 # consecutive samples together.
370 # if n == 0:
371 # # Little fade-out, fade-in to smoothly repeat the
372 # # background.
373 # fade_in_time = 0.0
374 # fade_out_time = 0.01
375 # elif n > 0 and n < len(start_times) - 1:
376 # fade_in_time = 0.01
377 # fade_out_time = 0.01
378 # else:
379 # fade_in_time = 0.01
380 # fade_out_time = 0.0
381 begin = min(_N(t), len(track_arr))
382 end = min(len(track_arr), _N(t)+len(wav))
383
384 # Part of the wav to store
385 # part = fade(wav[:end-begin],fade_in_time,fade_out_time)
386 part = wav[:end-begin]
387
388
389 track_arr[begin:end] += part
390
391 track_list.append(track_arr)
392 scene_arr[:len(track_arr)] += track_arr
393
394 if kwargs.get('channel_mode', 'mono') == 'separate':  # channel_mode arrives via kwargs
395 librosa.output.write_wav('{}/{}_background_track.wav'.format(output_path, label), track_arr, SR)
396
397 F = librosa.stft(track_arr, 1024)
398 energy_prof = librosa.feature.rmse(S=F)
399 background_energies.append(energy_prof)
400
401 if figure_verbosity > 0:
402 plt.figure()
403 plt.subplot(3, 1, 1)
404 plt.title('`{}\' background waveform and spectrogram'.format(label))
405 librosa.display.waveplot(track_arr,sr=SR)
406
407 # Plot spectrogram
408 Fdb = librosa.amplitude_to_db(F)
409 plt.subplot(3, 1, 2)
410 librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
411
412 # Plot energy profile
413 plt.subplot(3, 1, 3)
414 time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T))
415 plt.semilogy(time, energy_prof.T)
416 plt.xlim([0, len(track_arr)/SR])
417 plt.ylabel('energy (rms)')
418
419
420 # Tidy up and save to file
421 plt.tight_layout()
422 if append_to_filename:
423 plt.savefig('{}/background_{}_{}.{}'.format(output_path, label, append_to_filename, image_format), dpi=300)
424 else:
425 plt.savefig('{}/background_{}.{}'.format(output_path, label, image_format), dpi=300)
426
427 # Compute total energy of background
428 if len(backgrounds_df) > 0:
429 background_arr = np.sum(track_list, 0)
430 B = librosa.stft(background_arr, 1024)
431 background_energy = librosa.feature.rmse(S=B).flatten()
432 else:
433 background_energy = 0.0
434
435 for n in range(len(events_df)):
436 # Get label of track
437 label = str(events_df['label'].loc[n])
438
439 # First check if there are any pattern candidates. Give priority
440 # to pattern files.
441 candidates = []
442 for pattern_format in ['xls', 'json', 'txt', 'csv']:
443 candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, events_df['sampleid'].loc[n], pattern_format))
444
445 if len(candidates) == 0:
446 # If no patterns are found, search for normal audio files
447 candidates = glob.glob('{}/event/{}*.wav'.format(input_path, events_df['sampleid'].loc[n]))
448 chosen_fname = random.sample(candidates, 1)[0]
449 wav, sr = librosa.load(chosen_fname, sr=SR)
450 else:
451 chosen_fname = random.sample(candidates, 1)[0]
452 wav, sr = render_pattern(chosen_fname, input_path)
453
454
455 # Apply a fader envelope
456 fade_in_time = float(events_df['fade_in_time'].loc[n])
457 fade_out_time = float(events_df['fade_out_time'].loc[n])
458 wav = fade(wav, fade_in_time, fade_out_time)
459
460 # Set target EBR
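# ebr and ebr_stddev are given in dB; the random perturbation is drawn in the
# dB domain and the sum is converted to a linear event-to-background ratio.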
461 target_ebr = 10**(float(events_df['ebr'].loc[n])/20.0 + np.random.randn()*float(events_df['ebr_stddev'].loc[n])/20.0)
462
463 # Mean time between instances \mu.
464 mean_time_between_instances = events_df['mean_time_between_instances'].loc[n]
465 track_end_time = events_df['end_time'].loc[n]
466
467 # Track array
468 track_arr = np.zeros(int(scene_duration*SR))
469
470 #If \mu is -1, then play the event only once.
471 if mean_time_between_instances == -1:
472 track_arr[_N(events_df['start_time'].loc[n]):_N(events_df['start_time'].loc[n])+len(wav)] += wav
473 start_times = [float(events_df['start_time'].loc[n])]
474 end_times = [float(events_df['end_time'].loc[n])]
475 else:
476 # If 0, then start next sample after this one (set it to the duration of the sample)
477 if mean_time_between_instances == 0:
478 mean_time_between_instances = len(wav)/float(SR)
479
480 # Store the successive starting and ending times of the events (given e.g. the model)
481 # in the following lists.
482 start_times = [events_df['start_time'].loc[n]]
483 end_times = [start_times[-1]+len(wav)/float(SR)]
484
485 # Start with the first time in the list
486 new_start_time = start_times[-1]
487 new_end_time = end_times[-1]
488
489 # Until the scene is full
490 while new_start_time < track_end_time:
491 offset = float(mean_time_between_instances) +\
492 float(events_df['time_between_instances_stddev'].loc[n]*np.random.randn())
493 new_start_time += offset
494 new_end_time += offset
495
496 # Only exception is if we have set the 'end_cut' flag
497 # and the end time of the event surpasses the end time
498 # of the track
499 if end_cut and new_end_time > track_end_time:
500 break
501 else:
502 start_times.append(new_start_time)
503 end_times.append(new_end_time)
504
505 for t in start_times:
506 # We need to be careful with the limits here
507 # since numpy will just ignore indexing that
508 # exceeds the size of the array
509 begin = min(_N(t), len(track_arr))
510 end = min(len(track_arr), _N(t)+len(wav))
511
512 # Part of the wav to store
513 part = wav[:end-begin]
514
515 # If wav file was concatenated, fade out
516 # quickly to avoid clicks
517 if len(part) < len(wav) and len(part) > fade_out_time*SR:
518 part = fade(part, 0, fade_out_time)
519
520 track_arr[begin:end] += part
521
522 track_list.append(track_arr)
523 scene_arr[:len(track_arr)] += track_arr
524
525 # Compute energies
526 F = librosa.stft(track_arr, 1024)
527 energy_prof = librosa.feature.rmse(S=F).flatten()
528
529 # Compute current ebr
530
531 if len(backgrounds_df) > 0:
532 ebr_prof = energy_prof/background_energy[:len(energy_prof)].flatten()
533 curr_ebr = np.max(ebr_prof)
534 logging.debug('{}:Target ebr: {}db'.format(label,20*np.log10(target_ebr)))
535 logging.debug('{}:Current track ebr: {}db'.format(label,20*np.log10(curr_ebr)))
536
537 # Set correct ebr
538 track_arr = track_arr/curr_ebr*target_ebr
539
540 Fnew = librosa.stft(track_arr, 1024)
541 new_energy_prof = librosa.feature.rmse(S=Fnew).flatten()
542 new_ebr_prof = new_energy_prof/background_energy[:len(energy_prof)].flatten()
543 new_ebr = np.max(new_ebr_prof)
544 logging.debug('{}:New track ebr: {}db'.format(label,20*np.log10(new_ebr)))
545
546
547
548 if kwargs.get('channel_mode', 'mono') == 'separate':
549 librosa.output.write_wav('{}/{}_event_track.wav'.format(output_path, label), track_arr/np.max(track_arr), SR)
550
551
552
553
554
555 if figure_verbosity > 0:
556 plt.figure()
557
558 plt.subplot(3,1,1)
559 plt.title('`{}\' event waveform and spectrogram'.format(label))
560
561 librosa.display.waveplot(track_arr,sr=SR)
562 Fdb = librosa.amplitude_to_db(F)
563 plt.subplot(3, 1, 2)
564 librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
565
566 # Plot energy profile
567 plt.subplot(3, 1, 3)
568 time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T))
569 plt.semilogy(time, energy_prof.T)
570 plt.xlim([0, len(track_arr)/SR])
571 plt.ylabel('energy (rms)')
572
573
574 plt.tight_layout()
575 if append_to_filename:
576 plt.savefig('{}/event_{}_{}.{}'.format(output_path, label, append_to_filename, image_format), dpi=300)
577 else:
578 plt.savefig('{}/event_{}.{}'.format(output_path, label, image_format), dpi=300)
579
580
581
582
583 scene_starting_times.append((label, start_times))
584 scene_ending_times.append((label, end_times))
585
586 if figure_verbosity > 0:
587 plt.figure()
588 ax0 = plt.subplot(3,1,1)
589 plt.title('Synthesized Scene')
590 librosa.display.waveplot(scene_arr, sr=SR)
591 F = librosa.stft(scene_arr)
592 Fdb = librosa.amplitude_to_db(F)
593 ax1 = plt.subplot(3,1,2)
594 librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
595 ax2 = plt.subplot(3,1,3)
596 ax2.set_xlim([0,scene_duration])
597
598 # Get labels
599 labels = [s[0] for s in scene_starting_times]
600
601
602
603
604 # If background is active
605 if len(backgrounds_df) > 0:
606 labels.append('background')
607
608 # Set y axis limit. With a padding of 0.5.
609 ax2.set_ylim([-0.5, len(labels)-0.5])
610
611 plt.yticks(range(len(labels)), labels)
612
613 for n in range(len(scene_starting_times)):
614 label = scene_starting_times[n][0]
615 start_times = scene_starting_times[n][1]
616 end_times = scene_ending_times[n][1]
617 color = ['r', 'g', 'y'][n % 3]
618
619 for m in range(len(start_times)):
620 plt.hlines(y=float(n), xmin=start_times[m], xmax=end_times[m], alpha=0.5, color=color, linewidth=4)
621 if figure_verbosity > 2:
622 ax0.axvline(start_times[m], color=color, alpha=0.1)
623 ax0.axvline(end_times[m], color=color, alpha=0.1)
624 ax0.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
625 ax1.axvline(start_times[m], color=color, alpha=0.1)
626 ax1.axvline(end_times[m], color=color, alpha=0.1)
628 ax1.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
629 ax2.axvline(start_times[m], color=color, alpha=0.1)
630 ax2.axvline(end_times[m], color=color, alpha=0.1)
632 ax2.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
633
634 if len(backgrounds_df) > 0:
635 plt.axhline(y=len(scene_starting_times), alpha=0.5, color='k', linewidth=4)
636
637 plt.tight_layout()
638
639 if append_to_filename:
640 plt.savefig('{}/scene_{}.{}'.format(output_path, append_to_filename, image_format), dpi=300)
641 else:
642 plt.savefig('{}/scene.{}'.format(output_path, image_format), dpi=300)
643
644 if figure_verbosity > 1:
645 plt.show()
646
647 # Replace nans (i.e. because of division-by-zero) of the scene with zeros.
648 scene_arr = np.nan_to_num(scene_arr)
649
650 if kwargs.get('channel_mode', 'mono') == 'mono':
651 if append_to_filename:
652 librosa.output.write_wav('{}/scene_{}.wav'.format(output_path, append_to_filename), scene_arr, SR)
653 else:
654 librosa.output.write_wav('{}/scene.wav'.format(output_path), scene_arr, SR)
655
656
657 return scene_arr
658
659
660
661 def not_implemented():
662 print("TODO: not implemented")
663
664 if __name__=="__main__":
665 """
666 Main function, parses options and calls the simscene generation function
667 or a demo. The options given are almost identical to Lagrange et al's
668 simscene.
669 """
670 argparser = argparse.ArgumentParser(
671 description="SimScene.py acoustic scene generator",
672 )
673 argparser.add_argument(
674 'input_path',
675 type=str,
676 help="Path of a directory containing wave files for sound backgrounds (in the `background' sub-directory) or events (in `event')"
677 )
678 argparser.add_argument(
679 'output_path',
680 type=str,
681 help="The directory in which the generated scenes and annotations will reside."
682 )
683 argparser.add_argument(
684 'scene_duration',
685 type=float,
686 help="Duration of scene in seconds",
687 )
688 scene_duration = None
689
690 argparser.add_argument(
691 '-e', '--score-events',
692 type=str,
693 help="Score events file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
694 )
695 score_events = None
696
697 argparser.add_argument(
698 '-b', '--score-backgrounds',
699 type=str,
700 help="Score backgrounds file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
701 )
702 score_backgrounds = None
703
704 argparser.add_argument(
705 '--tag',
706 type=str,
707 help="Append _TAG_XXX to filenames, where XXX is an increment."
708 )
709 tag = None
710
711 argparser.add_argument(
712 '-N',
713 type=int,
714 help="Generate N instances of the scene. If not specified, only a single instance is generated. Note that if N > 1, the figure verbosity is capped at 1"
715 )
716 generate_n = 1
717
718 argparser.add_argument(
719 '-t', '--time-mode',
720 type=str,
721 help="Mode of spacing between events. `generate': values must be set for each track in the score files. `abstract': values are computed from an abstract representation of an existing acoustic scene. `replicate': values are replicated from an existing acoustic scene. (NOT IMPLEMENTED)",
722 choices=['generate', 'abstract', 'replicate']
723 )
724 time_mode = 'generate'
725
726 argparser.add_argument(
727 '-R', '--ebr-mode',
728 type=str,
729 help="Mode for Event to Background power level ratio. `generate': values must be set for each track in the score files. `abstract': values are computed from an abstract representation of an existing acoustic scene. `replicate': values are replicated from an existing acoustic scene. (NOT IMPLEMENTED)",
730 choices=['generate', 'abstract', 'replicate']
731 )
732 ebr_mode = 'generate'
733
734 argparser.add_argument(
735 '-A', '--annotation-file',
736 type=str,
737 help="If -R or -m are selected, the times or EBRs are sourced from ANNOTATION_FILE. ANNOTATION_FILE must be a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file. (NOT IMPLEMENTED)"
738 )
739 annotation_file = None
740
741 argparser.add_argument(
742 '-a', '--audio-file',
743 type=str,
744 help="If -R or -m are selected, the times or EBRs are sourced from AUDIO_FILE. AUDIO_FILE must be a 44100Hz .wav file. (NOT IMPLEMENTED)"
745 )
746 audio_file = None
747
748 argparser.add_argument(
749 '-v', '--figure-verbosity', action='count',
750 help="Increase figure verbosity. (Default) 0 - Don't save or display figures, 1 - Save pictures but do not display them, 2 - Save and display figures, 3 - Add shades over the events in the final plot"
751 )
752 figure_verbosity = 0
753
754 argparser.add_argument(
755 '-x', '--image-format',
756 help="Image format for the figures",
757 choices=['png', 'jpg', 'pdf']
758 )
759 image_format = 'png'
760
761 argparser.add_argument(
762 '-C', '--channel-mode',
763 type=str,
764 help="How the output is written. (Default) 'mono' - write the mixed scene to a single mono .wav file, 'separate' - write each background and event track to a separate .wav file.",
765 choices=['mono', 'separate']
766 )
767 channel_mode = 'mono'
768
769 # argparser.add_argument(
770 # '-m', '--min-space',
771 # type=float,
772 # help="Minimum space allowed between successive events (seconds). If -1, then allow overlapping between events."
773 # )
774 min_space = -1
775
776 argparser.add_argument(
777 '-c', '--end-cut',
778 action='store_true',
779 help="If the last sample ends after the scene ends then: if enabled, cut the sample to duration, else remove the sample."
780 )
781 end_cut = None
782
783 logging.basicConfig(level=logging.DEBUG)
784
785 args = argparser.parse_args()
786 if args.input_path:
787 input_path = args.input_path
788 logging.debug("Using `{}' as input path".format(input_path))
789 if args.output_path:
790 output_path = args.output_path
791 logging.debug("Saving to `{}'".format(output_path))
792 if args.scene_duration:
793 if not (args.score_backgrounds or args.score_events):
794 print("You must provide one of -e or -b")
795 else:
796 if args.image_format:
797 image_format = args.image_format
798 if args.channel_mode:
799 channel_mode = args.channel_mode
800 if args.ebr_mode:
801 ebr_mode = args.ebr_mode
802 if ebr_mode not in ['generate']:
803 logging.warning("`{}' not yet implemented for EBR_MODE, using default.".format(ebr_mode))
804 ebr_mode = 'generate'
805 if args.time_mode:
806 time_mode = args.time_mode
807 if time_mode not in ['generate']:
808 logging.warning("`{}' not yet implemented for TIME_MODE, using default.".format(time_mode))
809 time_mode = 'generate'
810 if args.annotation_file:
811 annotations = read_annotations_file(args.annotation_file)
812
813 scene_duration = float(args.scene_duration)
814
815 if args.score_backgrounds:
816 score_backgrounds = read_backgrounds_file(args.score_backgrounds)
817 else:
818 score_backgrounds = []
819
820 if args.score_events:
821 score_events = read_events_file(args.score_events)
822 else:
823 score_events = []
824
825 if args.figure_verbosity:
826 figure_verbosity = args.figure_verbosity
827
828 if args.N:
829 generate_n = args.N
830
831 if args.tag:
832 tag = args.tag
833
834 if generate_n == 1:
835 append_to_filename = None
836 simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
837 time_mode=time_mode,
838 ebr_mode=ebr_mode,
839 channel_mode=channel_mode,
840 annotation_file=annotation_file,
841 audio_file=audio_file,
842 figure_verbosity=figure_verbosity,
843 min_space=min_space,
844 end_cut=end_cut,
845 image_format=image_format,
846 append_to_filename=append_to_filename)
847 else:
848 for n in range(generate_n):
849 if tag:
850 append_to_filename = '{}_{}'.format(tag, n)
851 else:
852 append_to_filename = '{}'.format(n)
853
854 logging.info("Generating scene {}".format(n))
855
856 simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
857 time_mode=time_mode,
858 ebr_mode=ebr_mode,
859 channel_mode=channel_mode,
860 annotation_file=annotation_file,
861 audio_file=audio_file,
862 figure_verbosity=min(figure_verbosity, 1),
863 min_space=min_space,
864 end_cut=end_cut,
865 image_format=image_format,
866 append_to_filename=append_to_filename)
867
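For reference, a minimal sketch of driving the generator programmatically rather than from the command line; the score file paths, the input/output directories, and the 30-second duration below are placeholders, and the input directory is assumed to contain the `background', `event' (and optionally `pattern') sub-directories described above:

    from simscene import read_events_file, read_backgrounds_file, simscene

    # Score files may be .xls, .json, .txt or .csv (placeholder paths).
    events = read_events_file('scores/events.csv')
    backgrounds = read_backgrounds_file('scores/backgrounds.csv')

    # Generate a 30-second scene; with channel_mode='mono' the mixed scene.wav
    # (and, with figure_verbosity=1, the figures) are written to 'output'.
    scene = simscene('input', 'output', 30.0, events, backgrounds,
                     channel_mode='mono', figure_verbosity=1,
                     end_cut=True, image_format='png',
                     append_to_filename=None)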