#!/bin/python
# -*- coding: utf-8 -*-
# For licensing please see: LICENSE
# Copyright (c) Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>

# Argparse
import argparse

# Logging
import logging

# Pandas
import pandas as pd

# Numpy
import numpy as np
import sys

# Glob
import glob
import random

# Librosa
import librosa
import librosa.display
import librosa.output

# Matplotlib
from matplotlib import rc
# rc('text', usetex=True)
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from cycler import cycler

# Tabulate
from tabulate import tabulate

def _N(t, sr=44100):
    """
    Helper function: converts a time in seconds to a number of samples.
    """
    return int(t*sr)

def compute_energy(x):
    """Root-mean-square energy of the signal x."""
    return np.sqrt(np.mean(x**2))

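# Illustrative values (sketch only, not executed): _N(0.5) == 22050 samples at the
# default 44.1 kHz rate, and the RMS energy of a unit-amplitude sine is ~1/sqrt(2):
#
#   t = np.arange(_N(1.0)) / 44100.0
#   compute_energy(np.sin(2 * np.pi * 440 * t))   # ~= 0.707
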
# def compute_energy_profile(x, w=1000):
#     # Resize/Window signal
#     #x = np.resize(x, (w,int(np.ceil(float(len(x)/w)))))
#     x = np.array([[ii+jj for jj in range(w)] for ii in range(len(x)-w)])
#     return np.sqrt(np.mean(x**2, 1))

def render_pattern(fname, input_path, sr=44100):
    pattern = read_pattern_file(fname)

    start_times_samples = []
    end_times_samples = []
    durations_samples = []
    wav_files = []

    for n in range(len(pattern)):
        # Try loading the file.
        sampleid = pattern['sampleid'].loc[n]
        candidates = glob.glob('{}/event/{}*wav'.format(input_path, sampleid))
        chosen_fname = random.sample(candidates, 1)[0]

        logging.debug('Loading {}'.format(chosen_fname))

        # For each sound in the pattern file, place it starting from start_time plus an
        # offset with a mean value of 0 and a standard deviation of offset_stddev. The
        # first event cannot start earlier than time 0. If end_time is defined (not nan),
        # cut the event at end_time.
        wav, SR = librosa.load(chosen_fname, sr=sr)

        # Read and assign an amplitude
        amplitude_mean = float(pattern['amplitude'].loc[n])
        amplitude_stddev = float(pattern['amplitude_stdev'].loc[n])
        amplitude = amplitude_mean + np.random.randn()*amplitude_stddev
        wav *= amplitude

        start_time = max(float(pattern['start_time'].loc[n]), 0)
        start_time_samples = int(start_time*SR)

        fade_in_time = float(pattern['fade_in_time'].loc[n])
        fade_out_time = float(pattern['fade_out_time'].loc[n])
        end_time = float(pattern['end_time'].loc[n])

        # If end_time is not defined (-1 or just empty)
        # then just derive it from the length of the sample
        if np.isnan(end_time) or float(end_time) == -1:
            duration_samples = len(wav)
            end_time_samples = start_time_samples + duration_samples
        elif end_time - start_time > len(wav)/float(SR):
            # If the given end_time is later than start_time + duration of the sample,
            # pad the file with zeros to reach the desired end time.
            duration = end_time - start_time
            duration_samples = int(duration*SR)
            end_time_samples = start_time_samples + duration_samples
            wav_arr = np.zeros(duration_samples)
            wav_arr[:len(wav)] = wav
            wav = wav_arr
        else:
            duration = end_time - start_time
            duration_samples = int(duration*SR)
            end_time_samples = start_time_samples + duration_samples

        event_render = fade(wav[:duration_samples], fade_in_time, fade_out_time)

        start_times_samples.append(start_time_samples)
        end_times_samples.append(end_time_samples)
        durations_samples.append(duration_samples)
        wav_files.append(event_render)

    pattern_duration = end_time_samples
    pattern_arr = np.zeros(pattern_duration)

    for n, s in enumerate(start_times_samples):
        wav = wav_files[n]
        pattern_arr[s:s+len(wav)] = wav

    return pattern_arr, 44100

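# A hypothetical pattern score row (values invented for illustration), using the
# columns that read_pattern_file() below assigns to raw score files: sampleid,
# start_time, end_time, time_offset_stdev, fade_in_time, fade_out_time, amplitude,
# amplitude_stdev.
#
#   door_knock,0.0,-1,0.0,0.01,0.01,1.0,0.0
#
# This places one 'door_knock' sample at t=0 s, kept at its natural length
# (end_time == -1) and at unit amplitude.
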
def read_events_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt']:
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)
            df.columns = ['label', 'sampleid', 'ebr', 'ebr_stddev',
                          'mean_time_between_instances', 'time_between_instances_stddev',
                          'start_time', 'end_time', 'fade_in_time', 'fade_out_time']
    elif fname[-3:].lower() in ['csv']:
        df = pd.read_csv(fname)

    logging.info('Using input:\n' + tabulate(df, headers='keys', tablefmt='psql'))
    return df

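# A hypothetical raw event score row (values invented for illustration) for the
# columns assigned above: label, sampleid, ebr (dB), ebr_stddev,
# mean_time_between_instances, time_between_instances_stddev, start_time, end_time,
# fade_in_time, fade_out_time.
#
#   dog,dog_bark,0,0,2.0,0.5,0.0,30.0,0.01,0.01
#
# This repeats a 'dog_bark' sample roughly every 2 s (s.d. 0.5 s) from t=0 s to
# t=30 s at a 0 dB event-to-background ratio.
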
def read_pattern_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt']:
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)
            # Note: render_pattern() looks samples up via the 'sampleid' column.
            df.columns = ['sampleid', 'start_time', 'end_time', 'time_offset_stdev',
                          'fade_in_time', 'fade_out_time', 'amplitude', 'amplitude_stdev']
    elif fname[-3:].lower() in ['csv']:
        df = pd.read_csv(fname)

    logging.info('Using input:\n' + tabulate(df, headers='keys', tablefmt='psql'))
    return df

def read_backgrounds_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt']:
        with open(fname) as f:
            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
            df = pd.read_csv(f, header=None, sep=sep)
            df.columns = ['label', 'sampleid', 'snr']
    elif fname[-3:].lower() in ['csv']:
        df = pd.read_csv(fname)

    logging.info('Using input:\n' + tabulate(df, headers='keys', tablefmt='psql'))
    return df

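# A hypothetical background score row (the sample id is invented for illustration),
# using the columns assigned above ('label', 'sampleid', 'snr'):
#
#   park,city_park_ambience,0
#
# 'snr' is in dB: simscene() below scales the first background by 10**(snr/20) and
# scales every further background relative to the energy of the tracks before it.
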
def read_annotations_file(fname):
    if fname[-3:].lower() == 'xls':
        df = pd.read_excel(fname)
    elif fname[-4:].lower() == 'json':
        df = pd.read_json(fname)
    elif fname[-3:].lower() in ['txt', 'csv']:
        with open(fname) as f:
            header = f.readline()

            s = f.readline()
            f.seek(0, 0)
            if ',' in s:
                sep = ','
            elif '\t' in s:
                sep = '\t'
            else:
                sep = ' '
            if sep in header:
                logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
                df = pd.read_csv(f, header=None, sep=sep)
                df.columns = ['start', 'stop', 'class']
            else:
                df = pd.read_csv(f, sep=sep)
                df.columns = ['start', 'stop', 'class']

    logging.info('Using input:\n' + tabulate(df, headers='keys', tablefmt='psql'))
    return df

def run_demo():
    print("TODO: Implement run_demo()")

def fade(x, fade_in, fade_out, sr=44100):
    """
    Applies a fade-in/fade-out envelope to the audio array x
    and returns the enveloped signal.
    """

    if len(x) == 0:
        return x

    fade_in_samples = int(fade_in*sr)
    fade_out_samples = int(fade_out*sr)

    outp = np.ones_like(x)
    for n in range(fade_in_samples):
        outp[n] = n*1./fade_in_samples

    for n in range(fade_out_samples):
        outp[len(outp)-fade_out_samples+n] = 1 - 1./fade_out_samples*n
    return outp*x
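
# Illustrative sketch (not executed): a 10 ms fade-in and fade-out on one second of
# ones at 44.1 kHz ramps the first/last 441 samples and leaves the middle untouched:
#
#   y = fade(np.ones(44100), 0.01, 0.01)
#   y[0], y[22050], y[-1]   # -> 0.0, 1.0, ~0.002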

def simscene(input_path,
             output_path,
             scene_duration,
             score_events,
             score_backgrounds,
             **kwargs):
    logging.info('simscene() is not yet implemented fully')
    SR = 44100  # Samplerate. Should probably not be hardcoded

    events_df = score_events
    backgrounds_df = score_backgrounds

    # Create empty numpy array
    scene_arr = np.zeros(int(scene_duration*SR))

    if 'append_to_filename' in kwargs:
        append_to_filename = kwargs['append_to_filename']
    else:
        append_to_filename = None

    if 'end_cut' in kwargs:
        end_cut = kwargs['end_cut']
    else:
        end_cut = False

    if 'figure_verbosity' in kwargs:
        figure_verbosity = kwargs['figure_verbosity']
    else:
        figure_verbosity = 0

    if 'image_format' in kwargs:
        image_format = kwargs['image_format']
    else:
        image_format = 'png'

    # Output channel mode: 'mono' writes a single scene file, 'separate' writes
    # each track to its own file.
    if 'channel_mode' in kwargs:
        channel_mode = kwargs['channel_mode']
    else:
        channel_mode = 'mono'

    # Stores the starting and ending times of every track for visualization
    # purposes
    scene_starting_times = []
    scene_ending_times = []

    # List of tracks
    track_list = []
    background_energies = []

    for n in range(len(backgrounds_df)):
        # Get label of background
        label = str(backgrounds_df['label'].loc[n])

        # First check if there are any pattern candidates. Give priority
        # to pattern files.
        candidates = []
        for pattern_format in ['xls', 'json', 'txt', 'csv']:
            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, backgrounds_df['sampleid'].loc[n], pattern_format))

        if len(candidates) == 0:
            # If no patterns are found, search for normal audio files
            candidates = glob.glob('{}/background/{}*.wav'.format(input_path, backgrounds_df['sampleid'].loc[n]))
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = librosa.load(chosen_fname, sr=SR)
        else:
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = render_pattern(chosen_fname, input_path)

        duration = len(wav)/float(SR)
        target_snr_db = float(backgrounds_df['snr'].loc[n])
        target_snr = 10**(target_snr_db/20.0)

        energy = compute_energy(wav)

        logging.debug('{}:energy:{}'.format(label, energy))

        if n == 0:
            # For the first background track, snr gives the amount by which it is
            # scaled (i.e. how much quieter it becomes).
            amplitude_factor = target_snr
            wav *= amplitude_factor

        if n > 0:
            noise_energy = compute_energy(np.sum(np.array(track_list), axis=0))
            logging.info('{}:noise_energy:{}'.format(label, noise_energy))

            old_snr = energy/noise_energy
            old_snr_db = 20*np.log10(old_snr)
            logging.info('{}:old_snr:{}'.format(label, old_snr_db))

            amplitude_factor = target_snr/old_snr

            wav *= amplitude_factor
            new_energy = compute_energy(wav)
            new_snr = new_energy/noise_energy
            new_snr_db = 20. * np.log10(new_snr)
            logging.info('{}:new_snr:{}'.format(label, new_snr_db))

        # Track array
        track_arr = np.zeros(int(scene_duration*SR))
        start_times = [0.0]
        end_times = [start_times[-1]+len(wav)/float(SR)]

        # Start with the first time in the list
        new_start_time = start_times[-1]
        new_end_time = end_times[-1]

        while new_start_time < scene_duration:
            offset = duration
            new_start_time += offset
            new_end_time += offset

            start_times.append(new_start_time)
            end_times.append(new_end_time)

        for n, t in enumerate(start_times):
            # We need to be careful with the limits here since numpy
            # will just ignore indexing that exceeds the array size.

            # Fading times in case we need to join many
            # consecutive samples together.
            # if n == 0:
            #     # Little fade-out, fade-in to smoothly repeat the
            #     # background.
            #     fade_in_time = 0.0
            #     fade_out_time = 0.01
            # elif n > 0 and n < len(start_times) - 1:
            #     fade_in_time = 0.01
            #     fade_out_time = 0.01
            # else:
            #     fade_in_time = 0.01
            #     fade_out_time = 0.0
            begin = min(_N(t), len(track_arr))
            end = min(len(track_arr), _N(t)+len(wav))

            # Part of the wav to store
            # part = fade(wav[:end-begin],fade_in_time,fade_out_time)
            part = wav[:end-begin]

            track_arr[begin:end] += part

        track_list.append(track_arr)
        scene_arr[:len(track_arr)] += track_arr

        if channel_mode == 'separate':
            librosa.output.write_wav('{}/{}_background_track.wav'.format(output_path, label), track_arr, SR)

        F = librosa.stft(track_arr, 1024)
        energy_prof = librosa.feature.rmse(S=F)
        background_energies.append(energy_prof)

        if figure_verbosity > 0:
            plt.figure()
            plt.subplot(3, 1, 1)
            plt.title('`{}\' background waveform and spectrogram'.format(label))
            librosa.display.waveplot(track_arr, sr=SR)

            # Plot spectrogram
            Fdb = librosa.amplitude_to_db(F)
            plt.subplot(3, 1, 2)
            librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')

            # Plot energy profile
            plt.subplot(3, 1, 3)
            time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T))
            plt.semilogy(time, energy_prof.T)
            plt.xlim([0, len(track_arr)/SR])
            plt.ylabel('energy (rms)')

            # Tidy up and save to file
            plt.tight_layout()
            if append_to_filename:
                plt.savefig('{}/background_{}_{}.{}'.format(output_path, label, append_to_filename, image_format), dpi=300)
            else:
                plt.savefig('{}/background_{}.{}'.format(output_path, label, image_format), dpi=300)

    # Compute total energy of background
    if len(backgrounds_df) > 0:
        background_arr = np.sum(track_list, 0)
        B = librosa.stft(background_arr, 1024)
        background_energy = librosa.feature.rmse(S=B).flatten()
    else:
        background_energy = 0.0

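    # Note on the dB conversions used above and below: the 'snr' and 'ebr' values from
    # the score files are given in dB and turned into linear amplitude factors as
    # 10**(dB/20); e.g. +6 dB is roughly a factor of 2 and -20 dB a factor of 0.1.
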
    for n in range(len(events_df)):
        # Get label of track
        label = str(events_df['label'].loc[n])

        # First check if there are any pattern candidates. Give priority
        # to pattern files.
        candidates = []
        for pattern_format in ['xls', 'json', 'txt', 'csv']:
            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, events_df['sampleid'].loc[n], pattern_format))

        if len(candidates) == 0:
            # If no patterns are found, search for normal audio files
            candidates = glob.glob('{}/event/{}*.wav'.format(input_path, events_df['sampleid'].loc[n]))
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = librosa.load(chosen_fname, sr=SR)
        else:
            chosen_fname = random.sample(candidates, 1)[0]
            wav, sr = render_pattern(chosen_fname, input_path)

        # Apply a fader envelope
        fade_in_time = float(events_df['fade_in_time'].loc[n])
        fade_out_time = float(events_df['fade_out_time'].loc[n])
        wav = fade(wav, fade_in_time, fade_out_time)

        # Set target EBR (event-to-background ratio)
        target_ebr = 10**(float(events_df['ebr'].loc[n])/20.0 + np.random.randn()*float(events_df['ebr_stddev'].loc[n])/20.0)

        # Mean time between instances \mu.
        mean_time_between_instances = events_df['mean_time_between_instances'].loc[n]
        track_end_time = events_df['end_time'].loc[n]

        # Track array
        track_arr = np.zeros(int(scene_duration*SR))

        # If \mu is -1, then play the event only once.
        if mean_time_between_instances == -1:
            track_arr[_N(events_df['start_time'].loc[n]):_N(events_df['start_time'].loc[n])+len(wav)] += wav
            start_times = [float(events_df['start_time'].loc[n])]
            end_times = [float(events_df['end_time'].loc[n])]
        else:
            # If 0, then start the next instance right after this one
            # (set it to the duration of the sample).
            if mean_time_between_instances == 0:
                mean_time_between_instances = len(wav)/float(SR)

            # Store the successive starting and ending times of the events (given e.g. the model)
            # in the following lists.
            start_times = [events_df['start_time'].loc[n]]
            end_times = [start_times[-1]+len(wav)/float(SR)]

            # Start with the first time in the list
            new_start_time = start_times[-1]
            new_end_time = end_times[-1]

            # Until the scene is full
            while new_start_time < track_end_time:
                offset = float(mean_time_between_instances) +\
                    float(events_df['time_between_instances_stddev'].loc[n]*np.random.randn())
                new_start_time += offset
                new_end_time += offset

                # Only exception is if we have set the 'end_cut' flag
                # and the end time of the event surpasses the end time
                # of the track
                if end_cut and new_end_time > track_end_time:
                    break
                else:
                    start_times.append(new_start_time)
                    end_times.append(new_end_time)

            for t in start_times:
                # We need to be careful with the limits here
                # since numpy will just ignore indexing that
                # exceeds the size of the array
                begin = min(_N(t), len(track_arr))
                end = min(len(track_arr), _N(t)+len(wav))

                # Part of the wav to store
                part = wav[:end-begin]

                # If the wav file was cut short, fade out
                # quickly to avoid clicks
                if len(part) < len(wav) and len(part) > fade_out_time*SR:
                    part = fade(part, 0, fade_out_time)

                track_arr[begin:end] += part

        track_list.append(track_arr)
        scene_arr[:len(track_arr)] += track_arr

        # Compute energies
        F = librosa.stft(track_arr, 1024)
        energy_prof = librosa.feature.rmse(S=F).flatten()

        # Compute current ebr
        if len(backgrounds_df) > 0:
            ebr_prof = energy_prof/background_energy[:len(energy_prof)].flatten()
            curr_ebr = np.max(ebr_prof)
            logging.debug('{}:Target ebr: {}db'.format(label, 20*np.log10(target_ebr)))
            logging.debug('{}:Current track ebr: {}db'.format(label, 20*np.log10(curr_ebr)))

            # Set correct ebr
            track_arr = track_arr/curr_ebr*target_ebr

            Fnew = librosa.stft(track_arr, 1024)
            new_energy_prof = librosa.feature.rmse(S=Fnew).flatten()
            new_ebr_prof = new_energy_prof/background_energy[:len(energy_prof)].flatten()
            new_ebr = np.max(new_ebr_prof)
            logging.debug('{}:New track ebr: {}db'.format(label, 20*np.log10(new_ebr)))

        if channel_mode == 'separate':
            librosa.output.write_wav('{}/{}_event_track.wav'.format(output_path, label), track_arr/np.max(track_arr), SR)

        if figure_verbosity > 0:
            plt.figure()

            plt.subplot(3, 1, 1)
            plt.title('`{}\' event waveform and spectrogram'.format(label))

            librosa.display.waveplot(track_arr, sr=SR)
            Fdb = librosa.amplitude_to_db(F)
            plt.subplot(3, 1, 2)
            librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')

            # Plot energy profile
            plt.subplot(3, 1, 3)
            time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T))
            plt.semilogy(time, energy_prof.T)
            plt.xlim([0, len(track_arr)/SR])
            plt.ylabel('energy (rms)')

            plt.tight_layout()
            if append_to_filename:
                plt.savefig('{}/event_{}_{}.{}'.format(output_path, label, append_to_filename, image_format), dpi=300)
            else:
                plt.savefig('{}/event_{}.{}'.format(output_path, label, image_format), dpi=300)

        scene_starting_times.append((label, start_times))
        scene_ending_times.append((label, end_times))

    if figure_verbosity > 0:
        plt.figure()
        ax0 = plt.subplot(3, 1, 1)
        plt.title('Synthesized Scene')
        librosa.display.waveplot(scene_arr, sr=SR)
        F = librosa.stft(scene_arr)
        Fdb = librosa.amplitude_to_db(F)
        ax1 = plt.subplot(3, 1, 2)
        librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
        ax2 = plt.subplot(3, 1, 3)
        ax2.set_xlim([0, scene_duration])

        # Get labels
        labels = [s[0] for s in scene_starting_times]

        # If background is active
        if len(backgrounds_df) > 0:
            labels.append('background')

        # Set y axis limit, with a padding of 0.5.
        ax2.set_ylim([-0.5, len(labels)-0.5])

        plt.yticks(range(len(labels)), labels)

        for n in range(len(scene_starting_times)):
            label = scene_starting_times[n][0]
            start_times = scene_starting_times[n][1]
            end_times = scene_ending_times[n][1]
            color = ['r', 'g', 'y'][n % 3]

            for m in range(len(start_times)):
                plt.hlines(y=float(n), xmin=start_times[m], xmax=end_times[m], alpha=0.5, color=color, linewidth=4)
                if figure_verbosity > 2:
                    ax0.axvline(start_times[m], color=color, alpha=0.1)
                    ax0.axvline(end_times[m], color=color, alpha=0.1)
                    ax0.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
                    ax1.axvline(start_times[m], color=color, alpha=0.1)
                    ax1.axvline(end_times[m], color=color, alpha=0.1)
                    ax1.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)
                    ax2.axvline(start_times[m], color=color, alpha=0.1)
                    ax2.axvline(end_times[m], color=color, alpha=0.1)
                    ax2.axvspan(start_times[m], end_times[m], color=color, alpha=0.1)

        if len(backgrounds_df) > 0:
            plt.axhline(y=len(scene_starting_times), alpha=0.5, color='k', linewidth=4)

        plt.tight_layout()

        if append_to_filename:
            plt.savefig('{}/scene_{}.{}'.format(output_path, append_to_filename, image_format), dpi=300)
        else:
            plt.savefig('{}/scene.{}'.format(output_path, image_format), dpi=300)

    if figure_verbosity > 1:
        plt.show()

    # Replace nans (i.e. because of division-by-zero) of the scene with zeros.
    scene_arr = np.nan_to_num(scene_arr)

    if channel_mode == 'mono':
        if append_to_filename:
            librosa.output.write_wav('{}/scene_{}.wav'.format(output_path, append_to_filename), scene_arr, SR)
        else:
            librosa.output.write_wav('{}/scene.wav'.format(output_path), scene_arr, SR)

    return scene_arr

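# Illustrative sketch (not executed) of driving simscene() from Python rather than
# the command line below; the paths and score files are hypothetical:
#
#   events = read_events_file('scores/events.xls')
#   backgrounds = read_backgrounds_file('scores/backgrounds.xls')
#   scene = simscene('input', 'output', 30.0, events, backgrounds,
#                    channel_mode='mono', figure_verbosity=0,
#                    end_cut=True, image_format='png')
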
def not_implemented():
    print("TODO: not implemented")

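# Example command line (file names are hypothetical; replace simscene.py with the
# name of this script):
#
#   python simscene.py input/ output/ 30 -e scores/events.xls -b scores/backgrounds.xls -v -x png
#
# This renders a 30-second scene, saves the figures as .png, and writes the scene
# audio into output/.
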
if __name__ == "__main__":
    """
    Main entry point: parses options and calls the simscene generation function
    or a demo. The options given are almost identical to those of Lagrange et al.'s
    SimScene.
    """
    argparser = argparse.ArgumentParser(
        description="SimScene.py acoustic scene generator",
    )
    argparser.add_argument(
        'input_path',
        type=str,
        help="Path of a directory containing wave files for sound backgrounds (in the `background' sub-directory) or events (in `event')"
    )
    argparser.add_argument(
        'output_path',
        type=str,
        help="The directory in which the generated scenes and annotations will reside."
    )
    argparser.add_argument(
        'scene_duration',
        type=float,
        help="Duration of scene in seconds",
    )
    scene_duration = None

    argparser.add_argument(
        '-e', '--score-events',
        type=str,
        help="Score events file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
    )
    score_events = None

    argparser.add_argument(
        '-b', '--score-backgrounds',
        type=str,
        help="Score backgrounds file as a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file"
    )
    score_backgrounds = None

    argparser.add_argument(
        '--tag',
        type=str,
        help="Append _TAG_XXX to filenames, where XXX is an increment."
    )
    tag = None

    argparser.add_argument(
        '-N',
        type=int,
        help="Generate N instances of the scene. If not specified, only generate a single instance. Note that if N > 1, then the verbosity must be less than or equal to 1."
    )
    generate_n = 1

    argparser.add_argument(
        '-t', '--time-mode',
        type=str,
        help="Mode of spacing between events. `generate': values must be set for each track in the score files. `abstract': values are computed from an abstract representation of an existing acoustic scene. `replicate': values are replicated from an existing acoustic scene. (NOT IMPLEMENTED)",
        choices=['generate', 'abstract', 'replicate']
    )
    time_mode = 'generate'

    argparser.add_argument(
        '-R', '--ebr-mode',
        type=str,
        help="Mode for Event to Background power level ratio. `generate': values must be set for each track in the score files. `abstract': values are computed from an abstract representation of an existing acoustic scene. `replicate': values are replicated from an existing acoustic scene. (NOT IMPLEMENTED)",
        choices=['generate', 'abstract', 'replicate']
    )
    ebr_mode = 'generate'

    argparser.add_argument(
        '-A', '--annotation-file',
        type=str,
        help="If -R or -m are selected, this provides the source of the times or EBRs. ANNOTATION_FILE must be a comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls) file. (NOT IMPLEMENTED)"
    )
    annotation_file = None

    argparser.add_argument(
        '-a', '--audio-file',
        type=str,
        help="If -R or -m are selected, this provides the source of the times or EBRs. AUDIO_FILE must be a 44100Hz .wav file. (NOT IMPLEMENTED)"
    )
    audio_file = None

    argparser.add_argument(
        '-v', '--figure-verbosity', action='count',
        help="Increase figure verbosity. (Default) 0 - Don't save or display figures, 1 - Save figures but do not display them, 2 - Save and display figures, 3 - Add shades over the events in the final plot"
    )
    figure_verbosity = 0

    argparser.add_argument(
        '-x', '--image-format',
        help="Image format for the figures",
        choices=['png', 'jpg', 'pdf']
    )
    image_format = 'png'

    argparser.add_argument(
        '-C', '--channel-mode',
        type=str,
        help="Output channel mode. (Default) 'mono' - write the whole scene as a single mono .wav file, 'separate' - write each track to its own .wav file.",
        choices=['mono', 'separate']
    )
    channel_mode = 'mono'

    # argparser.add_argument(
    #     '-m', '--min-space',
    #     type=float,
    #     help="Minimum space allowed between successive events (seconds). If -1, then allow overlapping between events."
    # )
    min_space = -1

    argparser.add_argument(
        '-c', '--end-cut',
        action='store_true',
        help="If the last sample ends after the scene ends then: if enabled, cut the sample to duration, else remove the sample."
    )
    end_cut = None

    logging.basicConfig(level=logging.DEBUG)

    args = argparser.parse_args()
    if args.input_path:
        input_path = args.input_path
        logging.debug("Using `{}' as input path".format(input_path))
    if args.output_path:
        output_path = args.output_path
        logging.debug("Saving to `{}'".format(output_path))
    if args.scene_duration:
        if not (args.score_backgrounds or args.score_events):
            print("You must provide one of -e or -b")
        else:
            if args.image_format:
                image_format = args.image_format
            if args.channel_mode:
                channel_mode = args.channel_mode
            if args.ebr_mode:
                ebr_mode = args.ebr_mode
                if ebr_mode not in ['generate']:
                    logging.warning("`{}' not yet implemented for EBR_MODE, using default.".format(ebr_mode))
                    ebr_mode = 'generate'
            if args.time_mode:
                time_mode = args.time_mode
                if time_mode not in ['generate']:
                    logging.warning("`{}' not yet implemented for TIME_MODE, using default.".format(time_mode))
                    time_mode = 'generate'
            if args.annotation_file:
                annotations = read_annotations_file(args.annotation_file)

            scene_duration = float(args.scene_duration)

            if args.score_backgrounds:
                score_backgrounds = read_backgrounds_file(args.score_backgrounds)
            else:
                score_backgrounds = []

            if args.score_events:
                score_events = read_events_file(args.score_events)
            else:
                score_events = []

            if args.figure_verbosity:
                figure_verbosity = args.figure_verbosity

            if args.N:
                generate_n = args.N

            if args.tag:
                tag = args.tag

            if generate_n == 1:
                append_to_filename = None
                simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
                         time_mode=time_mode,
                         ebr_mode=ebr_mode,
                         channel_mode=channel_mode,
                         annotation_file=annotation_file,
                         audio_file=audio_file,
                         figure_verbosity=figure_verbosity,
                         min_space=min_space,
                         end_cut=end_cut,
                         image_format=image_format,
                         append_to_filename=append_to_filename)
            else:
                for n in range(generate_n):
                    if tag:
                        append_to_filename = '{}_{}'.format(tag, n)
                    else:
                        append_to_filename = '{}'.format(n)

                    logging.info("Generating scene {}".format(n))

                    simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
                             time_mode=time_mode,
                             ebr_mode=ebr_mode,
                             channel_mode=channel_mode,
                             annotation_file=annotation_file,
                             audio_file=audio_file,
                             figure_verbosity=min(figure_verbosity, 1),
                             min_space=min_space,
                             end_cut=end_cut,
                             image_format=image_format,
                             append_to_filename=append_to_filename)