#!/usr/bin/env python
# -*- coding: utf-8 -*-
# For licensing please see: LICENSE
# Copyright (c) Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
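"""
simscene.py: synthesizes acoustic scenes by mixing background and event
samples according to score (recipe) files, following the interface of
Lagrange et al.'s SimScene.
"""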

# Argparse
import argparse

# Logging
import logging

# Pandas
import pandas as pd

# Numpy
import numpy as np

# Glob
import glob
import random

# Librosa
import librosa
import librosa.display

# PySoundfile
import soundfile as sf

# Matplotlib
import matplotlib.pyplot as plt

# Tabulate
from tabulate import tabulate


def _D(t, sr=44100):
    """
    Helper function: Converts time to samples
    """
    return int(t*sr)


def compute_energy(x):
    # Root-mean-square (RMS) energy of signal x
    return np.sqrt(np.mean(x**2))


def timedict_to_dataframe(timedict):
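    """
    Convert a timedict of the form {label: [(filename, start_time, end_time), ...]}
    into a pandas DataFrame with columns (label, filename, start_time, end_time).
    """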
    logging.debug(timedict)
    return pd.DataFrame([(key, val[0], val[1], val[2]) for key in timedict for val in timedict[key]],
                        columns=('label', 'filename', 'start_time', 'end_time'))

def timedict_to_txt(timedict):
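    """
    Serialize a timedict into tab-separated "<start_time>\t<end_time>\t<label>"
    lines, as used for the sed_eval annotation format.
    """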
    str_ = ""
    for key in timedict:
        for val in timedict[key]:
            str_ += "{}\t{}\t{}\n".format(float(val[1]), float(val[2]), key)
    str_ += '\n'
    return str_

def render_pattern(fname, input_path, sr=44100):
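    """
    Render a pattern score file into audio.

    Each row of the pattern file references a sample (or, recursively, another
    pattern) that is placed on a timeline with its own amplitude, fades and
    start/end times.

    Returns a tuple (pattern_arr, sample_rate, timesdict), where timesdict maps
    each label to a list of (filename, start_time, end_time) tuples.
    """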
    pattern = read_pattern_file(fname)

    # Store starting and end times in the format
    # {'filename': (start_time, end_time)}

    timesdict = {}

    start_times_samples = []
    end_times_samples = []
    durations_samples = []
    wav_files = []

    pattern_timedict = []

    for n in range(len(pattern)):
        # Try loading the file,
        sampleid = pattern['sampleid'].loc[n]
        label = sampleid  # pattern entries are labelled by their sample id
        candidates = []
        for pattern_format in ['xls', 'json', 'txt', 'csv']:
            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, sampleid, pattern_format))

        if len(candidates) == 0:
            candidates = glob.glob('{}/event/{}*.wav'.format(input_path, sampleid))
            chosen_fname = random.sample(candidates, 1)[0]

            wav, SR = sf.read(chosen_fname)
        else:
            chosen_fname = random.sample(candidates, 1)[0]

            logging.debug('Loading {}'.format(chosen_fname))
            wav, SR, pattern_timedict = render_pattern(chosen_fname, input_path)

        # For each sound in the pattern file, place it starting from starttime + an offset
        # with a mean value of 0 and standard deviation of offset_stddev. The first event can
        # not start earlier than time 0. If endtime is defined (not nan), then cut the event at
        # end time.

        # Read and assign an amplitude
        amplitude_mean = float(pattern['amplitude'].loc[n])
        amplitude_stddev = float(pattern['amplitude_stdev'].loc[n])
        amplitude = amplitude_mean + np.random.randn()*amplitude_stddev
        wav *= amplitude

        start_time = max(float(pattern['start_time'].loc[n]), 0)
        start_time_samples = int(start_time*SR)

        fade_in_time = float(pattern['fade_in_time'].loc[n])
        fade_out_time = float(pattern['fade_out_time'].loc[n])
        end_time = float(pattern['end_time'].loc[n])

        # If end_time is not defined (-1 or just empty)
        # then just derive it from the length of the sample
        if np.isnan(end_time) or float(end_time) == -1:
            duration_samples = len(wav)
            end_time_samples = start_time_samples + duration_samples
            end_time = end_time_samples/float(SR)

        elif end_time - start_time > len(wav)/float(SR):

            # If given end_time is more than start_time + duration of sample
            # then pad the file with zeros to reach the desired end time.
            duration = end_time - start_time
            duration_samples = int(duration*SR)
            end_time_samples = start_time_samples + duration_samples

            # Calculate end time in seconds
            end_time = end_time_samples/float(SR)

            wav_arr = np.zeros(duration_samples)
            wav_arr[:len(wav)] = wav
            wav = wav_arr
        else:
            duration = end_time - start_time
            duration_samples = int(duration*SR)
            end_time_samples = start_time_samples + duration_samples

        event_render = fade(wav[:duration_samples], fade_in_time, fade_out_time)

        start_times_samples.append(start_time_samples)
        end_times_samples.append(end_time_samples)
        durations_samples.append(duration_samples)
        wav_files.append(event_render)

        if label in timesdict:
            timesdict[label].append((chosen_fname, start_time, end_time))
        else:
            timesdict[label] = [(chosen_fname, start_time, end_time)]

        for pt in pattern_timedict:
            if pt in timesdict:
                timesdict[pt] += pattern_timedict[pt]
            else:
                timesdict[pt] = pattern_timedict[pt]

    # The pattern lasts until the latest event ends
    pattern_duration = max(end_times_samples)
    pattern_arr = np.zeros(pattern_duration)

    for n, s in enumerate(start_times_samples):
        wav = wav_files[n]
        pattern_arr[s:s+len(wav)] = wav

    return pattern_arr, 44100, timesdict


def read_events_file(fname):
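    """
    Read an event score file (.xls, .json, .txt or .csv) into a DataFrame with
    columns: label, sampleid, ebr, ebr_stddev, mean_time_between_instances,
    time_between_instances_stddev, start_time, end_time, fade_in_time,
    fade_out_time.
    """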
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt']:
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)
            df.columns = ['label',
                          'sampleid',
                          'ebr',
                          'ebr_stddev',
                          'mean_time_between_instances',
                          'time_between_instances_stddev',
                          'start_time',
                          'end_time',
                          'fade_in_time',
                          'fade_out_time']
    elif fname[-3:].lower() in ['csv']:
        df = pd.read_csv(fname)

    logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
    return df


def read_pattern_file(fname):
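    """
    Read a pattern score file (.xls, .json, .txt or .csv) into a DataFrame
    with columns: eventid, start_time, end_time, time_offset_stdev,
    fade_in_time, fade_out_time, amplitude, amplitude_stdev.
    """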
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt']:
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)
            df.columns = ['eventid',
                          'start_time',
                          'end_time',
                          'time_offset_stdev',
                          'fade_in_time',
                          'fade_out_time',
                          'amplitude',
                          'amplitude_stdev']
    elif fname[-3:].lower() in ['csv']:
        df = pd.read_csv(fname)

    logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
    return df


def read_backgrounds_file(fname):
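    """
    Read a background score file (.xls, .json, .txt or .csv) into a DataFrame
    with columns: label, sampleid, snr.
    """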
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt']:
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)
            df.columns = ['label', 'sampleid', 'snr']
    elif fname[-3:].lower() in ['csv']:
        df = pd.read_csv(fname)

    logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
    return df


def read_annotations_file(fname):
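    """
    Read an annotation file (.xls, .json, .txt or .csv) into a DataFrame with
    columns: start, stop, class.
    """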
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt', 'csv']:

        with open(fname) as f:
            header = f.readline()

            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            if sep in header:
                logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
                df = pd.read_csv(f, header=None, sep=sep)
                df.columns = ['start', 'stop', 'class']
            else:
                df = pd.read_csv(f, sep=sep)
                df.columns = ['start', 'stop', 'class']

    logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
    return df


def run_demo():
    print("TODO: Implement run_demo()")


def fade(x, fade_in, fade_out, sr=44100):
    """
    Creates a fade-in-fade-out envelope
    for audio array x.
    """

    if len(x) == 0:
        return x

    fade_in_samples = int(fade_in*sr)
    fade_out_samples = int(fade_out*sr)

    outp = np.ones_like(x)
    for n in range(fade_in_samples):
        outp[n] = n*1./fade_in_samples

    for n in range(fade_out_samples):
        outp[len(outp)-fade_out_samples+n] = 1-1./fade_out_samples*n
    return outp*x


def simscene(input_path,
             output_path,
             scene_duration,
             score_events,
             score_backgrounds,
             **kwargs):
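    """
    Synthesize an acoustic scene of scene_duration seconds from the given
    event and background score DataFrames, writing audio, figures and
    annotations to output_path depending on the keyword arguments.
    Returns the mixed scene as a numpy array.

    Recognized keyword arguments include append_to_filename, end_cut,
    figure_verbosity, image_format, annot_format, full_duration and
    channel_mode.
    """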
    logging.warning('EBR ratios have not yet been verified')
    SR = 44100  # Samplerate. Should probably not be hardcoded

    events_df = score_events
    backgrounds_df = score_backgrounds

    # Store starting and ending times in the format
    # {'filename': [(start_time, end_time), (start_time, end_time), ...]}
    timedict = {}

    # Create empty numpy array
    scene_arr = np.zeros(int(scene_duration*SR))

    if 'append_to_filename' in kwargs:
        append_to_filename = kwargs['append_to_filename']
    else:
        append_to_filename = None

    if 'end_cut' in kwargs:
        end_cut = kwargs['end_cut']
    else:
        end_cut = False

    if 'figure_verbosity' in kwargs:
        figure_verbosity = kwargs['figure_verbosity']
    else:
        figure_verbosity = 0

    if 'image_format' in kwargs:
        image_format = kwargs['image_format']
    else:
        image_format = 'png'

    if 'annot_format' in kwargs:
        annot_format = kwargs['annot_format']
    else:
        annot_format = 'sed_eval'

    if 'full_duration' in kwargs:
        full_duration = kwargs['full_duration']
    else:
        full_duration = False

    if 'channel_mode' in kwargs:
        channel_mode = kwargs['channel_mode']
    else:
        channel_mode = 'mono'

    # Stores the starting and ending times of every track for visualization
    # purposes
    scene_starting_times = []
    scene_ending_times = []

    # List of tracks
    track_list = []
    background_energies = []

    for n in range(len(backgrounds_df)):
        # Get label of background
        label = str(backgrounds_df['label'].loc[n])

        # First check if there are any pattern candidates. Give priority
        # to pattern files.
        candidates = []

        # List of pattern start and end times
        pattern_timedict = []

        for pattern_format in ['xls', 'json', 'txt', 'csv']:
            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path,
                                                               backgrounds_df['sampleid'].loc[n],
                                                               pattern_format))

        if len(candidates) == 0:
            # If no patterns are found, search for normal audio files
            candidates = glob.glob('{}/background/{}*.wav'.format(input_path, backgrounds_df['sampleid'].loc[n]))
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = sf.read(chosen_fname)
        else:
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr, pattern_timedict = render_pattern(chosen_fname, input_path)

        duration = len(wav)/float(SR)
        target_snr_db = float(backgrounds_df['snr'].loc[n])
        target_snr = 10**(target_snr_db/20.0)

        energy = compute_energy(wav)

        logging.debug('{}:energy:{}'.format(label, energy))

        if n == 0:
            # For the first background track, snr
            # gives an amount by which it's going to be scaled (i.e. make it more silent)
            amplitude_factor = target_snr
            wav *= amplitude_factor

        if n > 0:
            noise_energy = compute_energy(np.sum(np.array(track_list), axis=0))
            logging.info('{}:noise_energy:{}'.format(label, noise_energy))

            old_snr = energy/noise_energy
            old_snr_db = 20*np.log10(old_snr)
            logging.info('{}:old_snr:{}'.format(label, old_snr_db))

            amplitude_factor = target_snr/old_snr

            wav *= amplitude_factor
            new_energy = compute_energy(wav)
            new_snr = new_energy/noise_energy
            new_snr_db = 20. * np.log10(new_snr)
            logging.info('{}:new_snr:{}'.format(label, new_snr_db))

        # Track array
        track_arr = np.zeros(int(scene_duration*SR))
        start_times = [0.0]
        end_times = [start_times[-1]+len(wav)/float(SR)]

        # Start with the first time in the list
        new_start_time = start_times[-1]
        new_end_time = end_times[-1]

        if label in timedict:
            timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time)))
        else:
            timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))]

        while new_start_time < scene_duration:
            offset = duration
            new_start_time += offset

            # If already exceeded scene, break
            if new_start_time >= scene_duration:
                break

            new_end_time += offset

            start_times.append(new_start_time)
            end_times.append(new_end_time)

            # Update timedict noting where each filename starts and stops
            if label in timedict:
                timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time)))
            else:
                timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))]

            # Also update the times from the patterns
            for pt in pattern_timedict:
                pattern_timedict[pt] = [(s[0], s[1] + new_start_time, s[2] + new_start_time) for s in
                                        pattern_timedict[pt]]

                if pt in timedict:
                    timedict[pt] += pattern_timedict[pt]
                else:
                    timedict[pt] = pattern_timedict[pt]

        # And add those to the timedict dictionary

        for t in start_times:
            # We need to be careful with the limits here
            # since numpy will just ignore indexing that
            # exceeds the size of the array

            # Fading times in case we need to join many
            # consecutive samples together.
            # if n == 0:
            #     # Little fade-out, fade-in to smoothly repeat the
            #     # background.
            #     fade_in_time = 0.0
            #     fade_out_time = 0.01
            # elif n > 0 and n < len(start_times) - 1:
            #     fade_in_time = 0.01
            #     fade_out_time = 0.01
            # else:
            #     fade_in_time = 0.01
            #     fade_out_time = 0.0
            begin = min(_D(t), len(track_arr))
            end = min(len(track_arr), _D(t) + len(wav))

            # Part of the wav to store
            # part = fade(wav[:end-begin], fade_in_time, fade_out_time)
            part = wav[:end-begin]
            track_arr[begin:end] += part

        track_list.append(track_arr)
        scene_arr[:len(track_arr)] += track_arr

        if channel_mode == 'separate':
            sf.write('{}/{}_background_track.wav'.format(output_path, label), track_arr, SR)

        F = librosa.stft(track_arr, 1024)
        energy_prof = librosa.feature.rmse(S=F)
        background_energies.append(energy_prof)

        if figure_verbosity > 0:
            plt.figure()
            plt.subplot(3, 1, 1)
            plt.title('`{}\' background waveform and spectrogram'.format(label))
            librosa.display.waveplot(track_arr, sr=SR)

            # Plot spectrogram
            Fdb = librosa.amplitude_to_db(F)
            plt.subplot(3, 1, 2)
            librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')

            # Plot energy profile
            plt.subplot(3, 1, 3)
            time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T))
            plt.semilogy(time, energy_prof.T)
            plt.xlim([0, len(track_arr)/SR])
            plt.ylabel('energy (rms)')

            # Tidy up and save to file
            plt.tight_layout()
            if append_to_filename:
                plt.savefig('{}/background_{}_{}.{}'.format(output_path,
                                                            label,
                                                            append_to_filename,
                                                            image_format),
                            dpi=300)
            else:
                plt.savefig('{}/background_{}.{}'.format(output_path,
                                                         label,
                                                         image_format),
                            dpi=300)

    # Compute total energy of background
    if len(backgrounds_df) > 0:
        background_arr = np.sum(track_list, 0)
        B = librosa.stft(background_arr, 1024)
        background_energy = librosa.feature.rmse(S=B).flatten()
    else:
        background_energy = 0.0

    for n in range(len(events_df)):
        # Get label of track
        label = str(events_df['label'].loc[n])

        # First check if there are any pattern candidates. Give priority
        # to pattern files.
        candidates = []

        # List of pattern start and end times
        pattern_timedict = []

        for pattern_format in ['xls', 'json', 'txt', 'csv']:
            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path,
                                                               events_df['sampleid'].loc[n],
                                                               pattern_format))

        if len(candidates) == 0:
            # If no patterns are found, search for normal audio files
            candidates = glob.glob('{}/event/{}*.wav'.format(input_path, events_df['sampleid'].loc[n]))
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = sf.read(chosen_fname)
        else:
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr, pattern_timedict = render_pattern(chosen_fname, input_path)

        logging.debug(chosen_fname)
        # Apply a fader envelope
        fade_in_time = float(events_df['fade_in_time'].loc[n])
        fade_out_time = float(events_df['fade_out_time'].loc[n])
        wav = fade(wav, fade_in_time, fade_out_time)

        # Set target EBR
        target_ebr = 10**(float(events_df['ebr'].loc[n])/20.0 +
                          np.random.randn()*float(events_df['ebr_stddev'].loc[n])/20.0)

        # Mean time between instances \mu.
        mean_time_between_instances = events_df['mean_time_between_instances'].loc[n]
        track_end_time = events_df['end_time'].loc[n]

        # Track array
        track_arr = np.zeros(int(scene_duration*SR))

        # If \mu is -1, then play the event only once.
        if mean_time_between_instances == -1:
            # Clamp to the scene length so that events extending past the end
            # of the scene do not overflow the track array.
            begin = min(_D(events_df['start_time'].loc[n]), len(track_arr))
            end = min(len(track_arr), begin + len(wav))
            track_arr[begin:end] += wav[:end-begin]
            start_times = [float(events_df['start_time'].loc[n])]
            end_times = [float(events_df['end_time'].loc[n])]

            new_start_time = start_times[-1]
            new_end_time = end_times[-1]

            if label in timedict:
                timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time)))
            else:
                timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))]

            for pt in pattern_timedict:
                pattern_timedict[pt] = [(s[0], s[1] + new_start_time, s[2] + new_start_time) for s in
                                        pattern_timedict[pt]]

                if pt in timedict:
                    timedict[pt] += pattern_timedict[pt]
                else:
                    timedict[pt] = pattern_timedict[pt]

        else:
            # If 0, then start next sample after this one (set it to the duration of the sample)
            if mean_time_between_instances == 0:
                mean_time_between_instances = len(wav)/float(SR)

            # If we are using -fd (full_duration) for each event then mean_time_between_instances denotes time AFTER
            # the end of the previous event.
            if full_duration and mean_time_between_instances > 0:
                mean_time_between_instances += len(wav)/float(SR)

            # Store the successive starting and ending times of the events (given e.g. the model)
            # in the following lists.
            start_times = [events_df['start_time'].loc[n]]
            end_times = [start_times[-1]+len(wav)/float(SR)]

            # Start with the first time in the list
            new_start_time = start_times[-1]
            new_end_time = end_times[-1]

            if label in timedict:
                timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time)))
            else:
                timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))]

            # Until the scene is full
            while new_start_time < track_end_time:
                offset = float(mean_time_between_instances) +\
                         float(events_df['time_between_instances_stddev'].loc[n]*np.random.randn())
                new_start_time += offset

                # If already exceeded scene, break
                if new_start_time >= scene_duration:
                    break

                new_end_time += offset

                # Only exception is if we have set the 'end_cut' flag
                # and the end time of the event surpasses the end time
                # of the track
                if end_cut and new_end_time > track_end_time:
                    break
                else:
                    start_times.append(new_start_time)
                    end_times.append(new_end_time)

                    if label in timedict:
                        timedict[label].append((chosen_fname, new_start_time, min(scene_duration, new_end_time)))
                    else:
                        timedict[label] = [(chosen_fname, new_start_time, min(scene_duration, new_end_time))]

                    # Also update the times from the patterns
                    for pt in pattern_timedict:
                        pattern_timedict[pt] = [(s[0], s[1] + new_start_time, s[2] + new_start_time) for s in
                                                pattern_timedict[pt]]

                        if pt in timedict:
                            timedict[pt] += pattern_timedict[pt]
                        else:
                            timedict[pt] = pattern_timedict[pt]

            for t in start_times:
                # We need to be careful with the limits here
                # since numpy will just ignore indexing that
                # exceeds the size of the array
                begin = min(_D(t), len(track_arr))
                end = min(len(track_arr), _D(t) + len(wav))

                # Part of the wav to store
                part = wav[:end-begin]

                # If wav file was concatenated, fade out
                # quickly to avoid clicks
                if len(wav) > len(part) > fade_out_time*SR:
                    part = fade(part, 0, fade_out_time)

                track_arr[begin:end] += part

        track_list.append(track_arr)
        scene_arr[:len(track_arr)] += track_arr

        # Compute energies
        F = librosa.stft(track_arr, 1024)
        energy_prof = librosa.feature.rmse(S=F).flatten()

        # Compute current ebr

        if len(backgrounds_df) > 0:
            ebr_prof = energy_prof/background_energy[:len(energy_prof)].flatten()
            curr_ebr = np.max(ebr_prof)
            logging.debug('{}:Target ebr: {}db'.format(label,
                                                       20*np.log10(target_ebr)))
            logging.debug('{}:Current track ebr: {}db'.format(label,
                                                              20*np.log10(curr_ebr)))

            # Set correct ebr
            track_arr = track_arr/curr_ebr*target_ebr

            Fnew = librosa.stft(track_arr, 1024)
            new_energy_prof = librosa.feature.rmse(S=Fnew).flatten()
            new_ebr_prof = new_energy_prof/background_energy[:len(energy_prof)].flatten()
            new_ebr = np.max(new_ebr_prof)
            logging.debug('{}:New track ebr: {}db'.format(label, 20*np.log10(new_ebr)))

        if channel_mode == 'separate':
            sf.write('{}/{}_event_track.wav'.format(output_path, label),
                     track_arr/np.max(track_arr),
                     SR)

        if figure_verbosity > 0:
            plt.figure()

            plt.subplot(3, 1, 1)
            plt.title('`{}\' event waveform and spectrogram'.format(label))

            librosa.display.waveplot(track_arr, sr=SR)
            Fdb = librosa.amplitude_to_db(F)
            plt.subplot(3, 1, 2)
            librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')

            # Plot energy profile
            plt.subplot(3, 1, 3)
            time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T))
            plt.semilogy(time, energy_prof.T)
            plt.xlim([0, len(track_arr)/SR])
            plt.ylabel('energy (rms)')

            plt.tight_layout()
            if append_to_filename:
                plt.savefig('{}/event_{}_{}.{}'.format(output_path, label, append_to_filename, image_format), dpi=300)
            else:
                plt.savefig('{}/event_{}.{}'.format(output_path, label, image_format), dpi=300)

        scene_starting_times.append((label, start_times))
        scene_ending_times.append((label, end_times))

    if figure_verbosity > 0:
        plt.figure()
        ax0 = plt.subplot(3, 1, 1)
        plt.title('Synthesized Scene')
        librosa.display.waveplot(scene_arr, sr=SR)
        F = librosa.stft(scene_arr)
        Fdb = librosa.amplitude_to_db(F)
        ax1 = plt.subplot(3, 1, 2)
        librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
        ax2 = plt.subplot(3, 1, 3)
        ax2.set_xlim([0, scene_duration])

        # Get labels
        labels = [s[0] for s in scene_starting_times]

        # If background is active
        if len(backgrounds_df) > 0:
            labels.append('background')

        # Set y axis limit. With a padding of 0.5.
        ax2.set_ylim([-0.5, len(labels)-0.5])

        plt.yticks(range(len(labels)), labels)

        for n in range(len(scene_starting_times)):
            start_times = scene_starting_times[n][1]
            end_times = scene_ending_times[n][1]
            color = ['r', 'g', 'y'][n % 3]

            for m in range(len(start_times)):
                plt.hlines(y=float(n), xmin=start_times[m], xmax=end_times[m], alpha=0.5, color=color, linewidth=4)
                if figure_verbosity > 2:
                    ax0.axvline(start_times[m], color=color, alpha=0.1)
                    ax0.axvline(end_times[m], color=color, alpha=0.1)
                    ax0.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
                    ax1.axvline(start_times[m], color=color, alpha=0.1)
                    ax1.axvline(end_times[m], color=color, alpha=0.1)
                    ax1.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
                    ax2.axvline(start_times[m], color=color, alpha=0.1)
                    ax2.axvline(end_times[m], color=color, alpha=0.1)
                    ax2.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)

        if len(backgrounds_df) > 0:
            plt.axhline(y=len(scene_starting_times), alpha=0.5, color='k', linewidth=4)

        plt.tight_layout()

        if append_to_filename:
            plt.savefig('{}/scene_{}.{}'.format(output_path, append_to_filename, image_format), dpi=300)
        else:
            plt.savefig('{}/scene.{}'.format(output_path, image_format), dpi=300)

    if annot_format == 'sed_eval':
        timedict_txt = timedict_to_txt(timedict)
        logging.debug(timedict_txt)

        if append_to_filename:
            with open('{}/scene_{}_offsets.csv'.format(output_path, append_to_filename), 'w') as f:
                f.write(timedict_txt)
        else:
            with open('{}/scene_offsets.csv'.format(output_path), 'w') as f:
                f.write(timedict_txt)

    elif annot_format == 'pandas':
        timedict_df = timedict_to_dataframe(timedict)
        logging.debug(timedict_df)

        if append_to_filename:
            timedict_df.to_csv('{}/scene_{}_offsets.csv'.format(output_path, append_to_filename))
        else:
            timedict_df.to_csv('{}/scene_offsets.csv'.format(output_path))

    if figure_verbosity > 1:
        plt.show()

    # Replace nans (i.e. because of division-by-zero) of the scene with zeros.
    scene_arr = np.nan_to_num(scene_arr)

    if channel_mode == 'mono':
        if append_to_filename:
            sf.write('{}/scene_{}.wav'.format(output_path, append_to_filename), scene_arr, SR)
        else:
            sf.write('{}/scene.wav'.format(output_path), scene_arr, SR)

    return scene_arr


def not_implemented():
    logging.info("TODO: not implemented")


if __name__ == "__main__":
    """
    Main entry point: parses options and calls the simscene generation function
    or a demo. The options given are almost identical to Lagrange et al.'s
    simscene.
    """
    argparser = argparse.ArgumentParser(
        description="SimScene.py acoustic scene generator",
    )
    argparser.add_argument(
        'input_path',
        type=str,
        help="Path of a directory containing wave files for sound backgrounds "
             "(in the `background' sub-directory) or events (in `event')"
    )

    input_path = '.'

    argparser.add_argument(
        'output_path',
        type=str,
        help="The directory where the generated scenes and annotations will reside."
    )

    output_path = '.'

    argparser.add_argument(
        'scene_duration',
        type=float,
        help="Duration of scene in seconds",
    )
    scene_duration = None

    argparser.add_argument(
        '-e', '--score-events',
        type=str,
        help="Score events file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
    )
    score_events = None

    argparser.add_argument(
        '-b', '--score-backgrounds',
        type=str,
        help="Score backgrounds file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
    )
    score_backgrounds = None

    argparser.add_argument(
        '--tag',
        type=str,
        help="Append _TAG_XXX to filenames, where XXX is an increment."
    )
    tag = None

    argparser.add_argument(
        '-N',
        type=int,
        help="Generate N instances of the scene. If not specified, only generate a single instance. Note that if "
             "N > 1, then the verbosity must be less than or equal to 1"
    )
    generate_n = 1

    argparser.add_argument(
        '-t', '--time-mode',
        type=str,
        help="Mode of spacing between events. `generate': values must be set for each track in the score files. "
             "`abstract': values are computed from an abstract representation of an existing acoustic scene. "
             "`replicate': values are replicated from an existing acoustic scene. (NOT IMPLEMENTED)",
        choices=['generate', 'abstract', 'replicate']
    )
    time_mode = 'generate'

    argparser.add_argument(
        '-fd', '--full-duration',
        action='store_true',
        help="If enabled, times specified in the recipe refer to the time after the previous file finishes."
    )
    full_duration = False

    argparser.add_argument(
        '-R', '--ebr-mode',
        type=str,
        help="Mode for Event to Background power level ratio. `generate': values must be set for each track in the "
             "score files. `abstract': values are computed from an abstract representation of an existing acoustic "
             "scene. `replicate': values are replicated from an existing acoustic scene. (NOT IMPLEMENTED)",
        choices=['generate', 'abstract', 'replicate']
    )
    ebr_mode = 'generate'

    argparser.add_argument(
        '-A', '--annotation-file',
        type=str,
        help="If -R or -t are selected, this provides the source for the times or EBRs from ANNOTATION_FILE. "
             "ANNOTATION_FILE must be a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file. "
             "(NOT IMPLEMENTED)"
    )
    annotation_file = None

    argparser.add_argument(
        '-a', '--audio-file',
        type=str,
        help="If -R or -t are selected, this provides the source for the times or EBRs "
             "from AUDIO_FILE. AUDIO_FILE must be a 44100Hz .wav file. (NOT IMPLEMENTED)"
    )
    audio_file = None

    argparser.add_argument(
        '-v', '--figure-verbosity', action='count',
        help="Increase figure verbosity. (Default) 0 - Don't save or display figures, 1 - Save figures but do not "
             "display them, 2 - Save and display figures, 3 - Add shades over the events in the final plot"
    )
    figure_verbosity = 0

    argparser.add_argument(
        '-x', '--image-format',
        help="Image format for the figures",
        choices=['png', 'jpg', 'pdf']
    )
    image_format = 'png'

    argparser.add_argument(
        '-C', '--channel-mode',
        type=str,
        help="How the generated audio is written. (Default) 'mono' - write the mixed scene to a single mono .wav "
             "file, 'separate' - write each background and event track to its own .wav file.",
        choices=['mono', 'separate']
    )
    channel_mode = 'mono'

    argparser.add_argument(
        '-c', '--end-cut',
        action='store_true',
        help="If the last sample ends after the scene ends then: if enabled, cut the sample to duration, "
             "else remove the sample."
    )
    end_cut = None

    argparser.add_argument(
        '-L', '--logging-level',
        type=str,
        help="Set lowest logging level",
        choices=['debug', 'warning', 'info']
    )

    argparser.add_argument(
        '--annot-format',
        type=str,
        help="Annotation format for generated scenes. Choices are: 'sed_eval' (default) - Format appropriate for "
             "the DCASE 2017 challenge evaluator, 'pandas' - A more detailed format of the form <label, "
             "orig_filename, start, stop>",
        choices=['sed_eval', 'pandas']
    )
    annot_format = 'sed_eval'

    args = argparser.parse_args()

    if args.logging_level:
        if args.logging_level == 'debug':
            logging.basicConfig(level=logging.DEBUG)
        elif args.logging_level == 'info':
            logging.basicConfig(level=logging.INFO)
        elif args.logging_level == 'warning':
            logging.basicConfig(level=logging.WARNING)
    else:
        logging.basicConfig(level=logging.INFO)

    if args.input_path:
        input_path = args.input_path
        logging.debug("Using `{}' as input path".format(input_path))
    if args.output_path:
        output_path = args.output_path
        logging.debug("Saving to `{}'".format(output_path))
    if args.full_duration:
        full_duration = True
    if args.scene_duration:
        if not (args.score_backgrounds or args.score_events):
            print("You must provide one of -e or -b")
        else:
            if args.image_format:
                image_format = args.image_format
            if args.channel_mode:
                channel_mode = args.channel_mode
            if args.annot_format:
                annot_format = args.annot_format
            if args.ebr_mode:
                ebr_mode = args.ebr_mode
                if ebr_mode not in ['generate']:
                    logging.warning("`{}' not yet implemented for EBR_MODE, using default.".format(ebr_mode))
                    ebr_mode = 'generate'
            if args.time_mode:
                time_mode = args.time_mode
                if time_mode not in ['generate']:
                    logging.warning("`{}' not yet implemented for TIME_MODE, using default.".format(time_mode))
                    time_mode = 'generate'
            if args.annotation_file:
                annotations = read_annotations_file(args.annotation_file)

            scene_duration = float(args.scene_duration)

            if args.score_backgrounds:
                score_backgrounds = read_backgrounds_file(args.score_backgrounds)
            else:
                score_backgrounds = []

            if args.score_events:
                score_events = read_events_file(args.score_events)
            else:
                score_events = []

            if args.figure_verbosity:
                figure_verbosity = args.figure_verbosity

            if args.N:
                generate_n = args.N

            if args.tag:
                tag = args.tag

            if generate_n == 1:
                append_to_filename = None
                simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
                         time_mode=time_mode,
                         ebr_mode=ebr_mode,
                         channel_mode=channel_mode,
                         annotation_file=annotation_file,
                         audio_file=audio_file,
                         figure_verbosity=figure_verbosity,
                         end_cut=end_cut,
                         image_format=image_format,
                         append_to_filename=append_to_filename,
                         annot_format=annot_format,
                         full_duration=full_duration)
            else:
                for n in range(generate_n):
                    if tag:
                        append_to_filename = '{}_{}'.format(tag, n)
                    else:
                        append_to_filename = '{}'.format(n)

                    logging.info("Generating scene {}".format(n))

                    simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
                             time_mode=time_mode,
                             ebr_mode=ebr_mode,
                             channel_mode=channel_mode,
                             annotation_file=annotation_file,
                             audio_file=audio_file,
                             figure_verbosity=min(figure_verbosity, 1),
                             end_cut=end_cut,
                             image_format=image_format,
                             append_to_filename=append_to_filename,
                             annot_format=annot_format,
                             full_duration=full_duration)