changeset 6:f5edaa5ca167

Added simscene() implementation; events only: loads events, plots, and saves a scene. Piano roll not working yet
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Mon, 02 Oct 2017 19:21:31 +0100
parents 42f189846ba8
children f90eba90a78f
files requirements.txt simscene.py
diffstat 2 files changed, 247 insertions(+), 5 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/requirements.txt	Mon Oct 02 19:21:31 2017 +0100
@@ -0,0 +1,5 @@
+pandas
+numpy
+librosa
+matplotlib
+tabulate
--- a/simscene.py	Mon Oct 02 15:54:26 2017 +0100
+++ b/simscene.py	Mon Oct 02 19:21:31 2017 +0100
@@ -3,13 +3,42 @@
 # For licensing please see: LICENSE
 # Copyright (c) Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
 
+# Argparse
 import argparse
+
+# Logging
 import logging
+
+# Pandas
 import pandas as pd
+
+# Numpy
+import numpy as np
 import sys
 
+# Glob
+import glob
+import random
+
+# Librosa
+import librosa
+import librosa.display
+import librosa.output
+
+# Matplotlib
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+
+# Tabulate
 from tabulate import tabulate
 
+def _N(t, sr=44100):
+    """
+    Helper function: converts a time in seconds to a sample count at sample rate sr.
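+    e.g. _N(1.5) == 66150 at the default rate of 44100 Hz.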
+    """
+    return int(t*sr)
+
 def read_events_file(fname):
     if fname[-3:].lower() == 'xls':
         df = pd.read_excel(fname)
@@ -62,8 +90,7 @@
                 df = pd.read_csv(f, sep=sep)
                 df = None
             df.columns = ['label','sampleid','snr']
-
-
+
     logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))                   
     return df
 
@@ -99,7 +126,26 @@
 
 def run_demo():
     print("TODO: Implement run_demo()")
+
+def fade(x, fade_in, fade_out, sr=44100):
+    """
+    Creates a fade-in-fade-out envelope
+    for audio array x.
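+    The envelope rises linearly from 0 to 1 over fade_in seconds and
+    falls linearly back to 0 over the final fade_out seconds.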
+    """
     
+    fade_in_samples = int(fade_in*sr)
+    fade_out_samples = int(fade_out*sr)
+    
+    outp = np.ones_like(x)
+    for n in range(fade_in_samples):
+        outp[n] = n*1./fade_in_samples
+        
+    for n in range(fade_out_samples):
+        outp[len(outp)-fade_out_samples+n] = 1-1./fade_out_samples*n
+    return outp*x
+
 def simscene(input_path,
              output_path,
              scene_duration,
@@ -107,7 +151,183 @@
              score_backgrounds,
              **kwargs):
-    logging.info('simscene() is not yet implemented')
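+    """
+    Render an audio scene from an event score (events only for now).
+
+    For each row of score_events: load a sample matching its sampleid
+    from <input_path>/event/, apply a fade envelope, lay the instances
+    out according to the inter-onset model, and mix the resulting track
+    into a mono scene scene_duration seconds long.
+    """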
+    SR = 44100  # Sample rate in Hz; should probably be a parameter rather than hardcoded
+    
+    events_df = score_events
+    
+    # Create empty numpy array
+    scene_arr = np.zeros(int(scene_duration*SR))
+    
+    # Optional keyword arguments and their defaults ('or' also maps an
+    # explicit None, as passed by the CLI wrapper, to the default)
+    end_cut = kwargs.get('end_cut', False)
+    figure_verbosity = kwargs.get('figure_verbosity') or 0
+    image_format = kwargs.get('image_format') or 'png'
+    channel_mode = kwargs.get('channel_mode') or 'mono'
+    
+    # Stores the starting and ending times of every track for visualization
+    # purposes
+    scene_starting_times = []
+    scene_ending_times = []
+    
+    
+    for n in range(len(events_df)):
+        # Get label of track
+        label = str(events_df['label'].loc[n])
+        
+        candidates = glob.glob('{}/event/{}*.wav'.format(input_path, events_df['sampleid'].loc[n]))
+        # librosa.load() resamples to 22050 Hz by default; request the
+        # scene sample rate explicitly instead of overwriting SR
+        wav, _ = librosa.load(random.choice(candidates), sr=SR)
+
+        # Apply a fade envelope
+        fade_in_time = float(events_df['fade_in_time'].loc[n])
+        fade_out_time = float(events_df['fade_out_time'].loc[n])
+        wav = fade(wav, fade_in_time, fade_out_time, sr=SR)
+        
+        # Mean time between instances \mu.  
+        mean_time_between_instances = events_df['mean_time_between_instances'].loc[n]
+        track_end_time = events_df['end_time'].loc[n]
+        
+        # Track array
+        track_arr = np.zeros(int(scene_duration*SR))        
+        
+        # If \mu is -1, play the event exactly once at its start time.
+        if mean_time_between_instances == -1:
+            # Record the single instance so the plotting code below
+            # can use start_times/end_times for both branches
+            start_times = [events_df['start_time'].loc[n]]
+            end_times = [start_times[0] + len(wav)/float(SR)]
+            begin = min(_N(start_times[0]), len(track_arr))
+            end = min(len(track_arr), begin + len(wav))
+            track_arr[begin:end] += wav[:end-begin]
+        else:
+        
+            # If \mu is 0, instances play back to back: set \mu to the sample's duration
+            if mean_time_between_instances == 0:
+                mean_time_between_instances = len(wav)/float(SR)
+                
+            # Store the successive start and end times of the event
+            # instances generated by the model in the following lists.
+            start_times = [events_df['start_time'].loc[n]]
+            # Each instance lasts one sample-duration
+            end_times = [start_times[0] + len(wav)/float(SR)]
 
+            # Start with the first time in the list
+            new_start_time = start_times[-1]
+            new_end_time = end_times[-1]
+
+            # Keep generating instances until the track is full
+            while new_start_time < track_end_time:
+                # Draw the next inter-onset interval from N(\mu, \sigma);
+                # clamp at zero so time always moves forward
+                offset = max(0., float(mean_time_between_instances) +
+                             float(events_df['time_between_instances_stddev'].loc[n])*np.random.randn())
+                new_start_time += offset
+                new_end_time += offset
+                
+                # The only exception: if the 'end_cut' flag is set and
+                # the event would end past the end time of the track,
+                # stop generating instances
+                if end_cut and new_end_time > track_end_time:
+                    break
+                else:
+                    start_times.append(new_start_time)
+                    end_times.append(new_end_time)
+
+            for t in start_times:
+                
+                # We need to be careful with the limits here:
+                # numpy silently clamps slice indices that exceed
+                # the size of the array
+                begin = min(_N(t), len(track_arr))
+                end = min(len(track_arr), _N(t)+len(wav))
+
+                # Part of the wav to store
+                part = wav[:end-begin]
+
+                # If the sample was truncated at the track boundary,
+                # fade it out quickly to avoid clicks
+                if len(part) < len(wav) and len(part) > fade_out_time*SR:
+                    part = fade(part, 0, fade_out_time)
+                    
+                track_arr[begin:end] += part
+                
+        # Mix this track into the scene; this must run for both the
+        # single-event and the repeating branch
+        scene_arr += track_arr
+
+        if figure_verbosity > 0:
+            plt.figure()
+
+            plt.subplot(2,1,1)
+            plt.title('`{}\' waveform and spectrogram'.format(label))                
+
+            visible_track = track_arr[_N(start_times[0]):_N(end_times[-1])]
+            librosa.display.waveplot(visible_track, sr=SR)
+            F = librosa.stft(visible_track)
+            Fdb = librosa.amplitude_to_db(np.abs(F))
+            plt.subplot(2,1,2)
+            librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
+            plt.savefig('{}/{}.{}'.format(output_path, label, image_format))
+
+            
+        scene_starting_times.append((label, start_times))
+        scene_ending_times.append((label, end_times))
+                
+    if figure_verbosity > 0:
+        plt.figure()
+        plt.subplot(3,1,1)
+        plt.title('Waveform and spectrogram for the full track')
+        librosa.display.waveplot(scene_arr, sr=SR)
+        F = librosa.stft(scene_arr)
+        Fdb = librosa.amplitude_to_db(np.abs(F))
+        plt.subplot(3,1,2)
+        librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
+        ax = plt.subplot(3,1,3)
+        
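+        # Piano-roll-style timeline: one row per event track, one
+        # rectangle per event instance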
+        for n in range(len(scene_starting_times)):
+            label = scene_starting_times[n][0]
+            start_times = scene_starting_times[n][1]
+            end_times = scene_ending_times[n][1]
+            for m in range(len(start_times)):
+                ax.add_patch(
+                    patches.Rectangle(
+                        (start_times[m], float(n)),
+                        end_times[m]-start_times[m], 0.2
+                    )
+                )
+
+        # add_patch() does not update matplotlib's autoscale limits,
+        # so set them explicitly or the piano roll comes out empty
+        ax.set_xlim(0, scene_duration)
+        ax.set_ylim(0, len(scene_starting_times))
+        
+        plt.savefig('{}/full-scene.{}'.format(output_path, image_format))     
+    if figure_verbosity > 1:
+        plt.show()
+
+    if channel_mode == 'mono':
+        librosa.output.write_wav('{}/full-scene.wav'.format(output_path), scene_arr, SR)
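+    # TODO: implement the 'classes' and 'separate' channel modes that
+    # the command-line interface already advertises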
+        
+    return scene_arr
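+
+# A minimal usage sketch (hypothetical paths; assumes an events score in
+# the CSV/XLS format handled by read_events_file()):
+#     events = read_events_file('scores/events.csv')
+#     scene = simscene('sounds', 'out', 30.0, events, None,
+#                      figure_verbosity=1, channel_mode='mono')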
     
     
              
@@ -189,6 +382,13 @@
         help="Increase figure verbosity. (Default) 0 - Don't save or display figures, 1 - Save pictures but do not display them, 2 - Save and display figures"
     )
     figure_verbosity = None
+
+    argparser.add_argument(
+        '-x', '--image-format',
+        help="Image format for the figures",
+        choices=['png', 'jpg', 'pdf']
+    )
+    image_format = 'png'    
     
     argparser.add_argument(
         '-C', '--channel-mode',
@@ -196,7 +396,7 @@
         help="number of audio channels contained in file. (Default) 'mono' - 1 channel (mono), 'classes' - As many channels as sound classes (events+textures), 'separate' - Same as 'classes', each channel is saved in a separate .wav file.",
         choices=['mono', 'classes', 'separate']
     )
-    channel_mode = None
+    channel_mode = 'mono'
     
     argparser.add_argument(
         '-m', '--min-space',
@@ -225,6 +425,10 @@
         if not (args.score_backgrounds or args.score_events):
             print("You must provide one of -e or -b")
         else:
+            if args.image_format:
+                image_format = args.image_format
+            if args.channel_mode:
+                channel_mode = args.channel_mode
             if args.ebr_mode:
                 ebr_mode = args.ebr_mode
                 if ebr_mode not in ['generate']:
@@ -244,6 +448,8 @@
                 score_backgrounds = read_backgrounds_file(args.score_backgrounds)
             if args.score_events:
                 score_events = read_events_file(args.score_events)
+            if args.figure_verbosity:
+                figure_verbosity = args.figure_verbosity
 
             simscene(input_path, output_path, scene_duration, score_events, score_backgrounds,
                      time_mode=time_mode,
@@ -253,4 +459,5 @@
                      audio_file=audio_file,
                      figure_verbosity=figure_verbosity,
                      min_space=min_space,
-                     end_cut=end_cut)
+                     end_cut=end_cut,
+                     image_format=image_format)