changeset 42:f30d2066eebb
Fixed a bug with the patterns not appearing in the resulting .xls file. More pep8 fixes.
| author | Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk> |
| --- | --- |
| date | Sun, 08 Oct 2017 15:22:21 +0100 |
| parents | d97f5b9ac6a9 |
| children | a8e3d63653b6 |
| files | python/simscene.py |
| diffstat | 1 files changed, 127 insertions(+), 85 deletions(-) |
--- a/python/simscene.py	Sat Oct 07 15:22:02 2017 +0100
+++ b/python/simscene.py	Sun Oct 08 15:22:21 2017 +0100
@@ -14,7 +14,6 @@
 # Numpy
 import numpy as np
-import sys

 # Glob
 import glob

@@ -26,36 +25,28 @@
 import librosa.output

 # Matplotlib
-from matplotlib import rc
-# rc('text', usetex=True)
 import matplotlib.pyplot as plt
-import matplotlib.patches as patches
-from cycler import cycler

 # Tabulate
 from tabulate import tabulate

-
-def _N(t, sr=44100):
+def _D(t, sr=44100):
     """ Helper function: Converts time to samples """
     return int(t*sr)

+
 def compute_energy(x):
     return np.sqrt(np.mean(x**2))

-# def compute_energy_profile(x, w=1000):
-#     # Resize/Window signal
-#     #x = np.resize(x, (w,int(np.ceil(float(len(x)/w)))))
-#     x = np.array([[ii+jj for jj in range(w)] for ii in range(len(x)-w)])
-#     return np.sqrt(np.mean(x**2, 1))

 def timedict_to_dataframe(timedict):
     return pd.DataFrame([(key, val[0], val[1]) for key in timedict for val in timedict[key]],
                         columns=('filename', 'start_time', 'end_time'))

+
 def render_pattern(fname, input_path, sr=44100):
     pattern = read_pattern_file(fname)

@@ -79,7 +70,7 @@
             candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, sampleid, pattern_format))

         if len(candidates) == 0:
-            candidates = glob.glob('{}/event/{}*wav'.format(input_path,sampleid))
+            candidates = glob.glob('{}/event/{}*wav'.format(input_path, sampleid))

             chosen_fname = random.sample(candidates, 1)[0]
             wav, SR = librosa.load(chosen_fname, sr=sr)
@@ -88,8 +79,7 @@
             logging.debug('Loading {}'.format(chosen_fname))
             wav, SR, pattern_timedict = render_pattern(chosen_fname, input_path)
-
-
+
         # For each sound in the pattern file, place it starting from starttime + an offset
         # with a mean value of 0 and standard deviation of offset_stddev. The first event can
         # not start earlier than time 0. If endtime is defined (not nan), then cut the event at
@@ -101,7 +91,7 @@
         amplitude = amplitude_mean + np.random.randn()*amplitude_stddev
         wav *= amplitude

-        start_time = max(float(pattern['start_time'].loc[n]),0)
+        start_time = max(float(pattern['start_time'].loc[n]), 0)
         start_time_samples = int(start_time*SR)

         fade_in_time = float(pattern['fade_in_time'].loc[n])
@@ -168,7 +158,7 @@
     elif fname[-3:].lower() in ['txt']:
         with open(fname) as f:
             s = f.readline()
-            f.seek(0,0)
+            f.seek(0, 0)
             if ',' in s:
                 sep = ','
             elif '\t' in s:
@@ -177,11 +167,20 @@
                 sep = ' '
                 logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
             df = pd.read_csv(f, header=None, sep=sep)
-            df.columns = ['label','sampleid','ebr','ebr_stddev','mean_time_between_instances','time_between_instances_stddev','start_time','end_time','fade_in_time','fade_out_time']
+            df.columns = ['label',
+                          'sampleid',
+                          'ebr',
+                          'ebr_stddev',
+                          'mean_time_between_instances',
+                          'time_between_instances_stddev',
+                          'start_time',
+                          'end_time',
+                          'fade_in_time',
+                          'fade_out_time']
     elif fname[-3:].lower() in ['csv']:
         df = pd.read_json(fname)

-    logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
+    logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))

     return df
@@ -193,7 +192,7 @@
     elif fname[-3:].lower() in ['txt']:
         with open(fname) as f:
             s = f.readline()
-            f.seek(0,0)
+            f.seek(0, 0)
             if ',' in s:
                 sep = ','
             elif '\t' in s:
@@ -202,11 +201,18 @@
                 sep = ' '
                 logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
             df = pd.read_csv(f, header=None, sep=sep)
-            df.columns = ['eventid','start_time','end_time','time_offset_stdev','fade_in_time', 'fade_out_time', 'amplitude', 'amplitude_stdev']
+            df.columns = ['eventid',
+                          'start_time',
+                          'end_time',
+                          'time_offset_stdev',
+                          'fade_in_time',
+                          'fade_out_time',
+                          'amplitude',
+                          'amplitude_stdev']
     elif fname[-3:].lower() in ['csv']:
         df = pd.read_json(fname)

-    logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
+    logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))

     return df
@@ -218,7 +224,7 @@
     elif fname[-3:].lower() in ['txt']:
         with open(fname) as f:
             s = f.readline()
-            f.seek(0,0)
+            f.seek(0, 0)
             if ',' in s:
                 sep = ','
             elif '\t' in s:
@@ -227,11 +233,11 @@
                 sep = ' '
                 logging.warning('Probably no header or malformed .csv. Will try to parse it raw.')
             df = pd.read_csv(f, header=None, sep=sep)
-            df.columns = ['label','sampleid','snr']
+            df.columns = ['label', 'sampleid', 'snr']
     elif fname[-3:].lower() in ['csv']:
         df = pd.read_json(fname)

-    logging.info('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))
+    logging.debug('Using input:\n'+tabulate(df, headers='keys', tablefmt='psql'))

     return df
@@ -246,7 +252,7 @@
             header = f.readline()
             s = f.readline()
-            f.seek(0,0)
+            f.seek(0, 0)
             if ',' in s:
                 sep = ','
             elif '\t' in s:
@@ -297,8 +303,8 @@
              score_events,
              score_backgrounds,
              **kwargs):
-    logging.info('simscene() is not yet implemented fully')
-    SR = 44100 # Samplerate. Should probably not be hardcoded
+    logging.warning('simscene() is not yet implemented fully')
+    SR = 44100  # Samplerate. Should probably not be hardcoded

     events_df = score_events
     backgrounds_df = score_backgrounds
@@ -351,7 +357,9 @@
         pattern_timedict = []

         for pattern_format in ['xls', 'json', 'txt', 'csv']:
-            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, backgrounds_df['sampleid'].loc[n], pattern_format))
+            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path,
+                                                               backgrounds_df['sampleid'].loc[n],
+                                                               pattern_format))

         if len(candidates) == 0:
             # If no patterns are found, search for normal audio files
@@ -362,7 +370,6 @@
             chosen_fname = random.sample(candidates, 1)[0]
             wav, sr, pattern_timedict = render_pattern(chosen_fname, input_path)
-
         duration = len(wav)/float(SR)

         target_snr_db = float(backgrounds_df['snr'].loc[n])
         target_snr = 10**(target_snr_db/20.0)
@@ -406,7 +413,6 @@
             timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
         else:
             timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]
-
         # while new_start_time < scene_duration:

         offset = duration
@@ -429,7 +435,7 @@

             # Also update the times from the patterns
             for pt in pattern_timedict:
-                pattern_timedict[pt] = [(s[0] + new_start_time, s[1] + new + new_end_time) for s in
+                pattern_timedict[pt] = [(s[0] + new_start_time, s[1] + new_start_time) for s in
                                         pattern_timedict[pt]]

                 if pt in timedict:
@@ -439,8 +445,7 @@

         # And add those to the timedict dictionary
-
-        for n,t in enumerate(start_times):
+        for t in start_times:
             # We need to be careful with the limits here
             # since numpy will just ignore indexing that
             # exceeds
@@ -458,8 +463,8 @@
             # else:
             #     fade_in_time = 0.01
             #     fade_out_time = 0.0
-            begin = min(_N(t), len(track_arr))
-            end = min(len(track_arr), _N(t)+len(wav))
+            begin = min(_D(t), len(track_arr))
+            end = min(len(track_arr), _D(t) + len(wav))

             # Part of the wav to store
             # part = fade(wav[:end-begin],fade_in_time,fade_out_time)
@@ -497,9 +502,16 @@
             # Tidy up and save to file
             plt.tight_layout()
             if append_to_filename:
-                plt.savefig('{}/background_{}_{}.{}'.format(output_path, label, append_to_filename, image_format), dpi=300)
+                plt.savefig('{}/background_{}_{}.{}'.format(output_path,
+                                                            label,
+                                                            append_to_filename,
+                                                            image_format),
+                            dpi=300)
             else:
-                plt.savefig('{}/background_{}.{}'.format(output_path, label, image_format), dpi=300)
+                plt.savefig('{}/background_{}.{}'.format(output_path,
+                                                         label,
+                                                         image_format),
+                            dpi=300)

     # Compute total energy of background
     if len(backgrounds_df) > 0:
@@ -509,8 +521,6 @@
     else:
         background_energy = 0.0

-
-
     for n in range(len(events_df)):
         # Get label of track
         label = str(events_df['label'].loc[n])
@@ -523,7 +533,9 @@
         pattern_timedict = []

         for pattern_format in ['xls', 'json', 'txt', 'csv']:
-            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path, events_df['sampleid'].loc[n], pattern_format))
+            candidates += glob.glob('{}/pattern/{}*.{}'.format(input_path,
+                                                               events_df['sampleid'].loc[n],
+                                                               pattern_format))

         if len(candidates) == 0:
             # If no patterns are found, search for normal audio files
@@ -532,17 +544,17 @@
             wav, sr = librosa.load(chosen_fname, sr=SR)
         else:
             chosen_fname = random.sample(candidates, 1)[0]
-            logging.info('rendering pattern')
             wav, sr, pattern_timedict = render_pattern(chosen_fname, input_path)
-
+        logging.debug(chosen_fname)

         # Apply a fader envelope
         fade_in_time = float(events_df['fade_in_time'].loc[n])
         fade_out_time = float(events_df['fade_out_time'].loc[n])
         wav = fade(wav, fade_in_time, fade_out_time)

         # Set target EBR
-        target_ebr = 10**(float(events_df['ebr'].loc[n])/20.0 + np.random.randn()*float(events_df['ebr_stddev'].loc[n])/20.0)
+        target_ebr = 10**(float(events_df['ebr'].loc[n])/20.0 +
+                          np.random.randn()*float(events_df['ebr_stddev'].loc[n])/20.0)

         # Mean time between instances \mu.
         mean_time_between_instances = events_df['mean_time_between_instances'].loc[n]
@@ -551,15 +563,20 @@
         # Track array
         track_arr = np.zeros(int(scene_duration*SR))

-        #If \mu is -1, then play the event only once.
+        # If \mu is -1, then play the event only once.
         if mean_time_between_instances == -1:
-            track_arr[_N(events_df['start_time'].loc[n]):_N(events_df['start_time'].loc[n])+len(wav)] += wav
+            track_arr[_D(events_df['start_time'].loc[n]):_D(events_df['start_time'].loc[n]) + len(wav)] += wav
             start_times = [float(events_df['start_time'].loc[n])]
             end_times = [float(events_df['end_time'].loc[n])]
             new_start_time = start_times[-1]
             new_end_time = end_times[-1]

+            if chosen_fname in timedict:
+                timedict[chosen_fname].append((new_start_time, min(scene_duration, new_end_time)))
+            else:
+                timedict[chosen_fname] = [(new_start_time, min(scene_duration, new_end_time))]
+
             for pt in pattern_timedict:
                 pattern_timedict[pt] = [(s[0] + new_start_time, s[1] + new_start_time) for s in
                                         pattern_timedict[pt]]
@@ -628,15 +645,15 @@
             # We need to be careful with the limits here
             # since numpy will just ignore indexing that
             # exceeds the size of the array
-            begin = min(_N(t), len(track_arr))
-            end = min(len(track_arr), _N(t)+len(wav))
+            begin = min(_D(t), len(track_arr))
+            end = min(len(track_arr), _D(t) + len(wav))

             # Part of the wav to store
             part = wav[:end-begin]

             # If wav file was concatenated, fade out
             # quickly to avoid clicks
-            if len(part) < len(wav) and len(part) > fade_out_time*SR:
+            if len(wav) > len(part) > fade_out_time*SR:
                 part = fade(part, 0, fade_out_time)

             track_arr[begin:end] += part
@@ -653,8 +670,10 @@
         if len(backgrounds_df) > 0:
             ebr_prof = energy_prof/background_energy[:len(energy_prof)].flatten()
             curr_ebr = np.max(ebr_prof)
-            logging.debug('{}:Target ebr: {}db'.format(label,20*np.log10(target_ebr)))
-            logging.debug('{}:Current track ebr: {}db'.format(label,20*np.log10(curr_ebr)))
+            logging.debug('{}:Target ebr: {}db'.format(label,
+                                                       20*np.log10(target_ebr)))
+            logging.debug('{}:Current track ebr: {}db'.format(label,
+                                                              20*np.log10(curr_ebr)))

             # Set correct ebr
             track_arr = track_arr/curr_ebr*target_ebr
@@ -663,18 +682,20 @@
             new_energy_prof = librosa.feature.rmse(S=Fnew).flatten()
             new_ebr_prof = new_energy_prof/background_energy[:len(energy_prof)].flatten()
             new_ebr = np.max(new_ebr_prof)
-            logging.debug('{}:New track ebr: {}db'.format(label,20*np.log10(new_ebr)))
+            logging.debug('{}:New track ebr: {}db'.format(label, 20*np.log10(new_ebr)))

         if channel_mode == 'separate':
-            librosa.output.write_wav('{}/{}_event_track.wav'.format(output_path, label), track_arr/np.max(track_arr), SR)
+            librosa.output.write_wav('{}/{}_event_track.wav'.format(output_path, label),
+                                     track_arr/np.max(track_arr),
+                                     SR)

         if figure_verbosity > 0:
             plt.figure()
-            plt.subplot(3,1,1)
+            plt.subplot(3, 1, 1)
             plt.title('`{}\' event waveform and spectrogram'.format(label))
-            librosa.display.waveplot(track_arr,sr=SR)
+            librosa.display.waveplot(track_arr, sr=SR)
             Fdb = librosa.amplitude_to_db(F)
             plt.subplot(3, 1, 2)
             librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
@@ -697,15 +718,15 @@
     if figure_verbosity > 0:
         plt.figure()
-        ax0 = plt.subplot(3,1,1)
+        ax0 = plt.subplot(3, 1, 1)
         plt.title('Synthesized Scene')
         librosa.display.waveplot(scene_arr, sr=SR)
         F = librosa.stft(scene_arr)
         Fdb = librosa.amplitude_to_db(F)
-        ax1 = plt.subplot(3,1,2)
+        ax1 = plt.subplot(3, 1, 2)
         librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz')
-        ax2 = plt.subplot(3,1,3)
-        ax2.set_xlim([0,scene_duration])
+        ax2 = plt.subplot(3, 1, 3)
+        ax2.set_xlim([0, scene_duration])

         # Get labels
         labels = [s[0] for s in scene_starting_times]
@@ -720,7 +741,6 @@
         plt.yticks(range(len(labels)), labels)

         for n in range(len(scene_starting_times)):
-            label = scene_starting_times[n][0]
             start_times = scene_starting_times[n][1]
             end_times = scene_ending_times[n][1]
             color = ['r', 'g', 'y'][n % 3]
@@ -745,8 +765,6 @@

         plt.tight_layout()

-
-
         if append_to_filename:
             plt.savefig('{}/scene_{}.{}'.format(output_path, append_to_filename, image_format), dpi=300)
         else:
@@ -793,13 +811,20 @@
     argparser.add_argument(
         'input_path',
         type=str,
-        help="Path of a directory containing wave files for sound backgrounds (in the `background' sub-directory) or events (in `event')"
+        help="Path of a directory containing wave files for sound backgrounds"
+             "(in the `background' sub-directory) or events (in `event')"
     )
+
+    input_path = '.'
+
     argparser.add_argument(
         'output_path',
         type=str,
         help="The directory the generated scenes and annotations will reside."
-        )
+    )
+
+    output_path = '.'
+
     argparser.add_argument(
         'scene_duration',
         type=float,
@@ -831,14 +856,17 @@
     argparser.add_argument(
         '-N',
         type=int,
-        help="Generate N instances of the scene. If not specified only generate a single instance. Note that if N > 1, then the verbosity must be less or equal to 1"
+        help="Generate N instances of the scene. If not specified only generate a single instance. Note that if N > 1, "
+             "then the verbosity must be less or equal to 1"
    )
     generate_n = 1

     argparser.add_argument(
         '-t', '--time-mode',
         type=str,
-        help="Mode of spacing between events. `generate': values must be set for each track in the score files. `abstract': values are computed from an abstract representation of an existing acoustic scene. `replicate': values are replicated from an existing acousting scene. (NOT IMPLEMENTED)",
+        help="Mode of spacing between events. `generate': values must be set for each track in the score files. "
+             "`abstract': values are computed from an abstract representation of an existing acoustic scene. "
+             "`replicate': values are replicated from an existing acousting scene. (NOT IMPLEMENTED)",
         choices=['generate', 'abstract', 'replicate']
     )
     time_mode = 'generate'
@@ -846,7 +874,9 @@
     argparser.add_argument(
         '-R', '--ebr-mode',
         type=str,
-        help="Mode for Event to Background power level ratio. `generate': values must be set for each track in the score files. `abstract': values are computed from an abstract representation of an existing acoustic scene. `replicate': values are replicated from an existing acousting scene. (NOT IMPLEMENTED)",
+        help="Mode for Event to Background power level ratio. `generate': values must be set for each track in the "
+             "score files. `abstract': values are computed from an abstract representation of an existing acoustic "
+             "scene. `replicate': values are replicated from an existing acousting scene. (NOT IMPLEMENTED)",
         choices=['generate', 'abstract', 'replicate']
     )
     ebr_mode = 'generate'
@@ -854,20 +884,24 @@
     argparser.add_argument(
         '-A', '--annotation-file',
         type=float,
-        help="If -R or -m are selected, this provides the source for sourcing the times or EBRs from ANNOTATION_FILE. ANNOTATION_FILE must be comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls). (NOT IMPLEMENTED)"
+        help="If -R or -m are selected, this provides the source for sourcing the times or EBRs from ANNOTATION_FILE. "
+             "ANNOTATION_FILE must be comma-separated text file (.csv, .txt), JSON (.json), or Excel (.xls). "
+             "(NOT IMPLEMENTED)"
     )
     annotation_file = None

     argparser.add_argument(
         '-a', '--audio-file',
         type=float,
-        help="If -R or -m are selected, this provides the source for sourcing the times or EBRs from AUDIO_FILE. AUDIO_FILE must be a 44100Hz .wav file. (NOT IMPLEMENTED)"
+        help="If -R or -m are selected, this provides the source for sourcing the times or EBRs "
+             "from AUDIO_FILE. AUDIO_FILE must be a 44100Hz .wav file. (NOT IMPLEMENTED)"
     )
     audio_file = None

     argparser.add_argument(
         '-v', '--figure-verbosity',
         action='count',
-        help="Increase figure verbosity. (Default) 0 - Don't save or display figures, 1 - Save pictures but do not display them, 2 - Save and display figures, 3 - Add shades over the events in the final plot"
+        help="Increase figure verbosity. (Default) 0 - Don't save or display figures, 1 - Save pictures but do not "
+             "display them, 2 - Save and display figures, 3 - Add shades over the events in the final plot"
     )
     figure_verbosity = 0
@@ -881,28 +915,39 @@
     argparser.add_argument(
         '-C', '--channel-mode',
         type=str,
-        help="number of audio channels contained in file. (Default) 'mono' - 1 channel (mono), 'separate' - Same as 'classes', each channel is saved in a separate .wav file.",
+        help="number of audio channels contained in file. (Default) 'mono' - 1 channel (mono), 'separate' - Same as "
+             "'classes', each channel is saved in a separate .wav file.",
         choices=['mono', 'separate']
     )
     channel_mode = 'mono'

-    # argparser.add_argument(
-    #     '-m', '--min-space',
-    #     type=float,
-    #     help="Minimum space allowed between successive events (seconds). If -1, then allow overlapping between events."
-    # )
-    min_space = -1
-
     argparser.add_argument(
         '-c', '--end-cut',
         action='store_true',
-        help="If the last sample ends after the scene ends then: if enabled, cut the sample to duration, else remove the sample."
+        help="If the last sample ends after the scene ends then: if enabled, cut the sample to duration, "
+             "else remove the sample."
     )
     end_cut = None
-
-
-    logging.basicConfig(level=logging.INFO)
-
+
+    argparser.add_argument(
+        '-L', '--logging-level',
+        type=str,
+        help="Set lowest logging level",
+        choices=['debug', 'warning', 'info']
+    )
+    args = argparser.parse_args()
+
+    if args.logging_level:
+        if args.logging_level == 'debug':
+            logging.basicConfig(level=logging.DEBUG)
+        elif args.logging_level == 'info':
+            logging.basicConfig(level=logging.INFO)
+        elif args.logging_level == 'warning':
+            logging.basicConfig(level=logging.WARNING)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
     if args.input_path:
         input_path = args.input_path
         logging.debug("Using `{}' as input path".format(input_path))
@@ -960,7 +1005,6 @@
                  annotation_file=annotation_file,
                  audio_file=audio_file,
                  figure_verbosity=figure_verbosity,
-                 min_space=min_space,
                  end_cut=end_cut,
                  image_format=image_format,
                  append_to_filename=append_to_filename)
@@ -980,8 +1024,6 @@
                      annotation_file=annotation_file,
                      audio_file=audio_file,
                      figure_verbosity=min(figure_verbosity, 1),
-                     min_space=min_space,
                      end_cut=end_cut,
                      image_format=image_format,
                      append_to_filename=append_to_filename)
-
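
For context, a minimal sketch (not part of the changeset) of the annotation bookkeeping this fix restores: the placed instance is recorded in timedict, and the pattern's internal event times are shifted by the instance's start time rather than by new_end_time as the removed line did. timedict_to_dataframe is copied from the diff above; the file names, the times, and the setdefault calls standing in for the changeset's explicit if/else are illustrative assumptions.

# Illustrative sketch only -- not code from the changeset.
import pandas as pd

def timedict_to_dataframe(timedict):
    # Same helper as in python/simscene.py: one row per placed occurrence.
    return pd.DataFrame([(key, val[0], val[1]) for key in timedict for val in timedict[key]],
                        columns=('filename', 'start_time', 'end_time'))

scene_duration = 30.0
new_start_time, new_end_time = 4.0, 9.5  # hypothetical placement of one pattern instance
pattern_timedict = {'pattern/birds.wav': [(0.0, 1.2), (2.5, 3.1)]}  # times inside the pattern

timedict = {}
# Record the instance itself (the changeset adds this for the mu == -1 branch),
# clipped to the scene duration.
timedict.setdefault('event/birds.wav', []).append((new_start_time, min(scene_duration, new_end_time)))

# Shift the pattern's internal events by the instance start time (the corrected expression).
for pt in pattern_timedict:
    pattern_timedict[pt] = [(s[0] + new_start_time, s[1] + new_start_time)
                            for s in pattern_timedict[pt]]
    timedict.setdefault(pt, []).extend(pattern_timedict[pt])

print(timedict_to_dataframe(timedict))

In the changeset itself, rows like these flow into the exported annotation table (the .xls the commit message refers to), which is presumably why pattern occurrences were missing before the fix.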