changeset 41:d97f5b9ac6a9
changed to conform more to pep8 (still needs work); made it generate a .csv at the end listing the individual wav files and the start and end times of each occurrence
author      Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date        Sat, 07 Oct 2017 15:22:02 +0100
parents     01106e64c5aa
children    f30d2066eebb
files       python/simscene.py
diffstat    1 files changed, 159 insertions(+), 49 deletions(-)
--- a/python/simscene.py	Thu Oct 05 19:42:37 2017 +0100
+++ b/python/simscene.py	Sat Oct 07 15:22:02 2017 +0100
@@ -35,6 +35,8 @@
 # Tabulate
 from tabulate import tabulate
+
+
 def _N(t, sr=44100):
     """
     Helper function: Converts time to samples
@@ -50,14 +52,25 @@
 #    x = np.array([[ii+jj for jj in range(w)] for ii in range(len(x)-w)])
 #    return np.sqrt(np.mean(x**2, 1))
 
+def timedict_to_dataframe(timedict):
+    return pd.DataFrame([(key, val[0], val[1]) for key in timedict for val in timedict[key]],
+                        columns=('filename', 'start_time', 'end_time'))
+
+
 def render_pattern(fname, input_path, sr=44100):
     pattern = read_pattern_file(fname)
 
+    # Store starting and end times in the format
+    # {'filename': (start_time, end_time)}
+
+    timesdict = {}
+
     start_times_samples = []
     end_times_samples = []
     durations_samples = []
     wav_files = []
-    
+
+    pattern_timedict = []
+
     for n in range(len(pattern)):
         # Try loading the file,
         sampleid = pattern['sampleid'].loc[n]
@@ -74,7 +87,7 @@
             chosen_fname = random.sample(candidates, 1)[0]
             logging.debug('Loading {}'.format(chosen_fname))
 
-            wav, SR = render_pattern(chosen_fname, input_path)
+            wav, SR, pattern_timedict = render_pattern(chosen_fname, input_path)
 
         # For each sound in the pattern file, place it starting from starttime + an offset
@@ -107,6 +120,10 @@
             duration = end_time - start_time
             duration_samples = int(duration*SR)
             end_time_samples = start_time_samples + duration_samples
+
+            # Calculate end time in seconds
+            end_time = end_time_samples/float(SR)
+
             wav_arr = np.zeros(duration_samples)
             wav_arr[:len(wav)] = wav
             wav = wav_arr
@@ -122,14 +139,26 @@
         durations_samples.append(duration_samples)
         wav_files.append(event_render)
 
+        if chosen_fname in timesdict:
+            timesdict[chosen_fname].append((start_time, end_time))
+        else:
+            timesdict[chosen_fname] = [(start_time, end_time)]
+
+        for pt in pattern_timedict:
+            if pt in timesdict:
+                timesdict[pt] += pattern_timedict[pt]
+            else:
+                timesdict[pt] = pattern_timedict[pt]
+
     pattern_duration = end_time_samples
     pattern_arr = np.zeros(pattern_duration)
     for n, s in enumerate(start_times_samples):
         wav = wav_files[n]
         pattern_arr[s:s+len(wav)] = wav
-
-    return pattern_arr, 44100
+
+    return pattern_arr, 44100, timesdict
+
 
 def read_events_file(fname):
     if fname[-3:].lower() == 'xls':
@@ -155,6 +184,7 @@
     logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
     return df
 
+
 def read_pattern_file(fname):
     if fname[-3:].lower() == 'xls':
         df = pd.read_excel(fname)
@@ -178,7 +208,8 @@
     logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
     return df
-    
+
+
 def read_backgrounds_file(fname):
     if fname[-3:].lower() == 'xls':
         df = pd.read_excel(fname)
@@ -203,6 +234,7 @@
     logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
     return df
 
+
 def read_annotations_file(fname):
     if fname[-3:].lower() == 'xls':
         df = pd.read_excel(fname)
@@ -226,16 +258,18 @@
             df = pd.read_csv(f, header=None, sep=sep)
             df.columns = ['start', 'stop', 'class']
         else:
+            df = pd.read_csv(f, sep=sep)
             df.columns = ['start', 'stop', 'class']
-            df = pd.read_csv(f, sep=sep)
     df = None
     logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
     return df
 
+
 def run_demo():
     print("TODO: Implement run_demo()")
 
+
 def fade(x, fade_in, fade_out, sr=44100):
     """
     Creates a fade-in-fade-out envelope
@@ -256,6 +290,7 @@
         outp[len(outp)-fade_out_samples+n] = 1-1./fade_out_samples*n
     return outp*x
 
+
 def simscene(input_path,
              output_path,
             scene_duration,
@@ -267,7 +302,11 @@
 
     events_df = score_events
     backgrounds_df = score_backgrounds
-    
+
+    # Store starting and ending times in the format
+    # {'filename': [(start_time, end_time), (start_time, end_time), ...]}
+    timedict = {}
+
     # Create empty numpy array
     scene_arr = np.zeros(int(scene_duration*SR))
 
@@ -299,7 +338,7 @@
     # List of tracks
     track_list = []
     background_energies = []
-    
+
     for n in range(len(backgrounds_df)):
         # Get label of background
         label = str(backgrounds_df['label'].loc[n])
@@ -307,6 +346,10 @@
         # First check if there are any pattern candidates. Give priorities
         # To pattern files.
         candidates = []
+
+        # List of pattern start and end times
+        pattern_timedict = []
+
         for pattern_format in ['xls', 'json', 'txt', 'csv']:
             candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path,
                 backgrounds_df['sampleid'].loc[n], pattern_format))
@@ -317,7 +360,8 @@
             wav, sr = librosa.load(chosen_fname, sr=SR)
         else:
             chosen_fname = random.sample(candidates, 1)[0]
-            wav, sr = render_pattern(chosen_fname, input_path)
+            wav, sr, pattern_timedict = render_pattern(chosen_fname, input_path)
+
         duration = len(wav)/float(SR)
 
         target_snr_db = float(backgrounds_df['snr'].loc[n])
@@ -325,9 +369,8 @@
         energy = compute_energy(wav)
 
-        logging.debug('{}:energy:{}'.format(label,energy))
-
-
+        logging.debug('{}:energy:{}'.format(label, energy))
+
         if n == 0:
             # For the first background track, snr
             # gives an amount by which it's going to be scaled (i.e. make it more silent)
@@ -336,40 +379,67 @@
 
         if n > 0:
             noise_energy = compute_energy(np.sum(np.array(track_list), axis=0))
-            logging.info('{}:noise_energy:{}'.format(label,noise_energy))
+            logging.info('{}:noise_energy:{}'.format(label, noise_energy))
             old_snr = energy/noise_energy
             old_snr_db = 20*np.log10(old_snr)
-            logging.info('{}:old_snr:{}'.format(label,old_snr_db))
+            logging.info('{}:old_snr:{}'.format(label, old_snr_db))
             amplitude_factor = target_snr/old_snr
-
             wav *= amplitude_factor
             new_energy = compute_energy(wav)
             new_snr = new_energy/noise_energy
             new_snr_db = 20.*np.log10(new_snr)
-            logging.info('{}:new_snr:{}'.format(label,new_snr_db))
-
-
+            logging.info('{}:new_snr:{}'.format(label, new_snr_db))
+
         # Track array
         track_arr = np.zeros(int(scene_duration*SR))
 
         start_times = [0.0]
         end_times = [start_times[-1]+len(wav)/float(SR)]
-
         # Start with the first time in the list
         new_start_time = start_times[-1]
         new_end_time = end_times[-1]
 
+        if chosen_fname in timedict:
+            timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
+        else:
+            timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]
+
         while new_start_time < scene_duration:
             offset = duration
             new_start_time += offset
+
+            # If already exceeded scene, break
+            if new_start_time >= scene_duration:
+                break
+
             new_end_time += offset
             start_times.append(new_start_time)
             end_times.append(new_end_time)
 
+            # Update timesdict noting where each filename starts and stops
+            if chosen_fname in timedict:
+                timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
+            else:
+                timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]
+
+            # Also update the times from the patterns
+            for pt in pattern_timedict:
+                pattern_timedict[pt] = [(s[0] + new_start_time, s[1] + new_start_time) for s in
+                                        pattern_timedict[pt]]
+
+                if pt in timedict:
+                    timedict[pt] += pattern_timedict[pt]
+                else:
+                    timedict[pt] = pattern_timedict[pt]
+
+        # And add those to the timedict dictionary
+
         for n,t in enumerate(start_times):
             # We need to be careful with the limits here
             # since numpy will just ignore indexing that
@@ -394,8 +464,6 @@
             # Part of the wav to store
             # part = fade(wav[:end-begin],fade_in_time,fade_out_time)
             part = wav[:end-begin]
-
-
             track_arr[begin:end] += part
 
         track_list.append(track_arr)
@@ -412,7 +480,7 @@
             plt.figure()
             plt.subplot(3, 1, 1)
             plt.title('`{}\' background waveform and spectrogram'.format(label))
-            librosa.display.waveplot(track_arr,sr=SR)
+            librosa.display.waveplot(track_arr, sr=SR)
 
             # Plot spectrogram
             Fdb = librosa.amplitude_to_db(F)
@@ -425,8 +493,7 @@
             plt.semilogy(time, energy_prof.T)
             plt.xlim([0, len(track_arr)/SR])
             plt.ylabel('energy (rms)')
-
-
+
             # Tidy up and save to file
             plt.tight_layout()
             if append_to_filename:
@@ -441,7 +508,9 @@
         background_energy = librosa.feature.rmse(S=B).flatten()
     else:
         background_energy = 0.0
-
+
+
     for n in range(len(events_df)):
         # Get label of track
         label = str(events_df['label'].loc[n])
@@ -449,6 +518,10 @@
         # First check if there are any pattern candidates. Give priorities
         # To pattern files.
         candidates = []
+
+        # List of pattern start and end times
+        pattern_timedict = []
+
         for pattern_format in ['xls', 'json', 'txt', 'csv']:
             candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path,
                 events_df['sampleid'].loc[n], pattern_format))
@@ -459,8 +532,9 @@
             wav, sr = librosa.load(chosen_fname, sr=SR)
         else:
             chosen_fname = random.sample(candidates, 1)[0]
-            wav, sr = render_pattern(chosen_fname, input_path)
-
+            logging.info('rendering pattern')
+            wav, sr, pattern_timedict = render_pattern(chosen_fname, input_path)
+
         # Apply a fader envelope
         fade_in_time = float(events_df['fade_in_time'].loc[n])
@@ -482,6 +556,19 @@
             track_arr[_N(events_df['start_time'].loc[n]):_N(events_df['start_time'].loc[n])+len(wav)] += wav
             start_times = [float(events_df['start_time'].loc[n])]
             end_times = [float(events_df['end_time'].loc[n])]
+
+            new_start_time = start_times[-1]
+            new_end_time = end_times[-1]
+
+            for pt in pattern_timedict:
+                pattern_timedict[pt] = [(s[0] + new_start_time, s[1] + new_start_time) for s in
+                                        pattern_timedict[pt]]
+
+                if pt in timedict:
+                    timedict[pt] += pattern_timedict[pt]
+                else:
+                    timedict[pt] = pattern_timedict[pt]
+
         else:
             # If 0, then start next sample after this one (set it to the duration of the sample)
             if mean_time_between_instances == 0:
@@ -496,11 +583,21 @@
             new_start_time = start_times[-1]
             new_end_time = end_times[-1]
 
+            if chosen_fname in timedict:
+                timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
+            else:
+                timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]
+
             # Until the scene is full
             while new_start_time < track_end_time:
                 offset = float(mean_time_between_instances) +\
                          float(events_df['time_between_instances_stddev'].loc[n]*np.random.randn())
                 new_start_time += offset
+
+                # If already exceeded scene, break
+                if new_start_time >= scene_duration:
+                    break
+
                 new_end_time += offset
 
                 # Only exception is if we have set the 'end_cut' flag
@@ -512,6 +609,21 @@
                 start_times.append(new_start_time)
                 end_times.append(new_end_time)
 
+                if chosen_fname in timedict:
+                    timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
+                else:
+                    timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]
+
+                # Also update the times from the patterns
+                for pt in pattern_timedict:
+                    pattern_timedict[pt] = [(s[0] + new_start_time, s[1] + new_start_time) for s in
+                                            pattern_timedict[pt]]
+
+                    if pt in timedict:
+                        timedict[pt] += pattern_timedict[pt]
+                    else:
+                        timedict[pt] = pattern_timedict[pt]
+
             for t in start_times:
                 # We need to be careful with the limits here
                 # since numpy will just ignore indexing that
@@ -552,15 +664,9 @@
             new_ebr_prof = new_energy_prof/background_energy[:len(energy_prof)].flatten()
             new_ebr = np.max(new_ebr_prof)
             logging.debug('{}:New track ebr: {}db'.format(label,20*np.log10(new_ebr)))
-
-
         if channel_mode == 'separate':
             librosa.output.write_wav('{}/{}_event_track.wav'.format(output_path, label),
                                      track_arr/np.max(track_arr), SR)
-
-
-
-
         if figure_verbosity > 0:
             plt.figure()
 
@@ -579,7 +685,6 @@
             plt.semilogy(time, energy_prof.T)
             plt.xlim([0, len(track_arr)/SR])
             plt.ylabel('energy (rms)')
-
             plt.tight_layout()
 
             if append_to_filename:
@@ -587,9 +692,6 @@
             else:
                 plt.savefig('{}/event_{}.{}'.format(output_path, label, image_format), dpi=300)
-
-
-
         scene_starting_times.append((label, start_times))
         scene_ending_times.append((label, end_times))
 
@@ -608,9 +710,6 @@
     # Get labels
     labels = [s[0] for s in scene_starting_times]
-
-
-
     # If background is active
     if len(backgrounds_df) > 0:
         labels.append('background')
@@ -646,11 +745,21 @@
 
     plt.tight_layout()
+
+
     if append_to_filename:
         plt.savefig('{}/scene_{}.{}'.format(output_path, append_to_filename, image_format), dpi=300)
     else:
         plt.savefig('{}/scene.{}'.format(output_path, image_format), dpi=300)
-
+
+    timedict_df = timedict_to_dataframe(timedict)
+    logging.debug(timedict_df)
+
+    if append_to_filename:
+        timedict_df.to_csv('{}/scene_{}_offsets.csv'.format(output_path, append_to_filename))
+    else:
+        timedict_df.to_csv('{}/scene_offsets.csv'.format(output_path))
+
     if figure_verbosity > 1:
         plt.show()
@@ -662,16 +771,17 @@
         librosa.output.write_wav('{}/scene_{}.wav'.format(output_path, append_to_filename), scene_arr, SR)
     else:
         librosa.output.write_wav('{}/scene.wav'.format(output_path), scene_arr, SR)
-
+
+    # Print timesdict
     return scene_arr
-
-
-
+
+
 def not_implemented():
-    print("TODO: not implemented")
-
-if __name__=="__main__":
+    logging.info("TODO: not implemented")
+
+
+if __name__ == "__main__":
     """
     Main function, parses options and calls the simscene generation function
    or a demo. The options given are almost identical to Lagrange et al's
@@ -790,7 +900,7 @@
                          )
     end_cut = None
 
-    logging.basicConfig(level=logging.DEBUG)
+    logging.basicConfig(level=logging.INFO)
 
     args = argparser.parse_args()
     if args.input_path:
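
For reference, the CSV mentioned in the commit message is produced by the new timedict_to_dataframe() helper: simscene() accumulates timedict, a mapping from each source wav file to the list of (start_time, end_time) pairs at which it was placed, flattens it to one row per occurrence, and writes it out with to_csv(). A minimal standalone sketch of that path follows; the file names and times are hypothetical, made up here purely for illustration:

import pandas as pd

# Same helper as added in the changeset above: flattens
# {'filename': [(start, end), ...]} into one row per occurrence.
def timedict_to_dataframe(timedict):
    return pd.DataFrame([(key, val[0], val[1]) for key in timedict for val in timedict[key]],
                        columns=('filename', 'start_time', 'end_time'))

# Hypothetical contents, as simscene() would accumulate them while placing sounds
timedict = {
    'event/dog_bark.wav': [(0.0, 1.2), (5.5, 6.7)],
    'background/rain.wav': [(0.0, 30.0)],
}

timedict_df = timedict_to_dataframe(timedict)
timedict_df.to_csv('scene_offsets.csv')  # same call the changeset adds at the end of simscene()

Each row of scene_offsets.csv then names one wav occurrence with its start and end time in seconds, which is the per-file timing listing the commit message describes.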