Mercurial > hg > simscene-py
changeset 10:8637c974b4bc
fixed several bugs, added energy plots; set up background SNRs and EBRs (hopefully) correctly
author | Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk> |
---|---|
date | Tue, 03 Oct 2017 19:08:28 +0100 |
parents | 53ee437b5ba3 |
children | cdf2eb89843a |
files | simscene.py |
diffstat | 1 files changed, 207 insertions(+), 21 deletions(-) [+] |
line wrap: on
line diff
--- a/simscene.py Tue Oct 03 15:18:08 2017 +0100 +++ b/simscene.py Tue Oct 03 19:08:28 2017 +0100 @@ -41,6 +41,16 @@ """ return int(t*sr) +def compute_energy(x): + return np.sqrt(np.mean(x**2)) + +# def compute_energy_profile(x, w=1000): +# # Resize/Window signal +# #x = np.resize(x, (w,int(np.ceil(float(len(x)/w))))) +# x = np.array([[ii+jj for jj in range(w)] for ii in range(len(x)-w)]) +# return np.sqrt(np.mean(x**2, 1)) + + def read_events_file(fname): if fname[-3:].lower() == 'xls': df = pd.read_excel(fname) @@ -153,10 +163,11 @@ score_events, score_backgrounds, **kwargs): - logging.info('simscene() is not yet implemented') + logging.info('simscene() is not yet implemented fully') SR = 44100 # Samplerate. Should probably not be hardcoded events_df = score_events + backgrounds_df = score_backgrounds # Create empty numpy array scene_arr = np.zeros(int(scene_duration*SR)) @@ -180,33 +191,145 @@ # purposes scene_starting_times = [] scene_ending_times = [] + + # List of tracks + track_list = [] + background_energies = [] + + for n in range(len(backgrounds_df)): + # Get label of background + label = str(backgrounds_df['label'].loc[n]) + + candidates = glob.glob('{}/background/{}*.wav'.format(input_path, backgrounds_df['sampleid'].loc[n])) + chosen_fname = random.sample(candidates, 1)[0] + wav, sr = librosa.load(chosen_fname, sr=SR) + duration = len(wav)/float(SR) + target_snr = float(backgrounds_df['snr'].loc[n]) + energy = compute_energy(wav) + + logging.debug('{}:energy:{}'.format(label,energy)) + + + if n == 0: + # For the first background track, snr + # gives an amount by which it's going to be scaled (i.e. 
make it more silent) + amplitude_factor = target_snr + wav *= amplitude_factor + + if n > 0: + noise_energy = compute_energy(np.sum(np.array(track_list), axis=0)) + logging.info('{}:noise_energy:{}'.format(label,noise_energy)) + + old_snr = energy/noise_energy + logging.info('{}:old_snr:{}'.format(label,old_snr)) + + amplitude_factor = target_snr/old_snr + + + wav *= amplitude_factor + new_energy = compute_energy(wav) + new_snr = new_energy/noise_energy + logging.info('{}:new_snr:{}'.format(label,new_snr)) + + + # Track array + track_arr = np.zeros(int(scene_duration*SR)) + start_times = [0.0] + end_times = [start_times[-1]+len(wav)/float(SR)] + + + # Start with the first time in the list + new_start_time = start_times[-1] + new_end_time = end_times[-1] + + while new_start_time < scene_duration: + offset = duration + new_start_time += offset + new_end_time += offset + + start_times.append(new_start_time) + end_times.append(new_end_time) + + for t in start_times: + # We need to be careful with the limits here + # since numpy will just ignore indexing that + # exceeds + + begin = min(_N(t), len(track_arr)) + end = min(len(track_arr), _N(t)+len(wav)) + + # Part of the wav to stire + part = wav[:end-begin] + + track_arr[begin:end] += part + + track_list.append(track_arr) + scene_arr[:len(track_arr)] += track_arr + + if channel_mode == 'separate': + librosa.output.write_wav('{}/{}_background_track.wav'.format(output_path, label), track_arr, SR) + + F = librosa.stft(track_arr, 1024) + energy_prof = librosa.feature.rmse(S=F) + background_energies.append(energy_prof) + + if figure_verbosity > 0: + plt.figure() + plt.subplot(3, 1, 1) + plt.title('`{}\' background waveform and spectrogram'.format(label)) + librosa.display.waveplot(track_arr,sr=SR) + + # Plot spectrogram + Fdb = librosa.amplitude_to_db(F) + plt.subplot(3, 1, 2) + librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz') + + # Plot energy profile + plt.subplot(3, 1, 3) + time = np.linspace(0, 
len(track_arr)/SR, len(energy_prof.T)) + plt.semilogy(time, energy_prof.T) + plt.xlim([0, len(track_arr)/SR]) + plt.ylabel('energy (rms)') + + + # Tidy up and save to file + plt.tight_layout() + plt.savefig('{}/background_{}.{}'.format(output_path, label, image_format), dpi=300) + + # Compute total energy of background + background_arr = np.sum(track_list, 0) + B = librosa.stft(background_arr, 1024) + background_energy = librosa.feature.rmse(S=B).flatten() for n in range(len(events_df)): # Get label of track label = str(events_df['label'].loc[n]) candidates = glob.glob('{}/event/{}*.wav'.format(input_path, events_df['sampleid'].loc[n])) - chosen_fname = random.sample(candidates,1)[0] + chosen_fname = random.sample(candidates, 1)[0] wav, sr = librosa.load(chosen_fname, sr=SR) assert sr == SR, "Sample rate of individual tracks must be 44100Hz (Failed: `{}' with sample rate: {} )".format(chosen_fname, sr) + # Apply a fader envelope fade_in_time = float(events_df['fade_in_time'].loc[n]) fade_out_time = float(events_df['fade_out_time'].loc[n]) wav = fade(wav, fade_in_time, fade_out_time) - + + # Set target EBR + target_ebr = 10**(float(events_df['ebr'].loc[n])/20.0 + np.random.randn()*float(events_df['ebr_stddev'].loc[n])/20.0) + # Mean time between instances \mu. mean_time_between_instances = events_df['mean_time_between_instances'].loc[n] track_end_time = events_df['end_time'].loc[n] # Track array - track_arr = np.zeros(int(scene_duration*SR)) + track_arr = np.zeros(int(scene_duration*SR)) #If \mu is -1, then play the event only once. 
if mean_time_between_instances == -1: track_arr[_N(events_df['start_time'].loc[n]):_N(events_df['start_time'].loc[n])+len(wav)] += wav else: - # If 0, then start next sample after this one (set it to the duration of the sample) if mean_time_between_instances == 0: mean_time_between_instances = len(wav)/float(SR) @@ -237,7 +360,6 @@ end_times.append(new_end_time) for t in start_times: - # We need to be careful with the limits here # since numpy will just ignore indexing that # exceeds the size of the array @@ -253,23 +375,65 @@ part = fade(part, 0, fade_out_time) track_arr[begin:end] += part - + + track_list.append(track_arr) scene_arr[:len(track_arr)] += track_arr + # Compute energies + F = librosa.stft(track_arr, 1024) + energy_prof = librosa.feature.rmse(S=F).flatten() + + + # Compute current ebr + +# logging.debug(energy_prof.shape) +# logging.debug(background_energy.shape) + ebr_prof = energy_prof/background_energy[:len(energy_prof)].flatten() + curr_ebr = np.max(ebr_prof) + logging.debug('{}:Target ebr: {}db'.format(label,20*np.log10(target_ebr))) + logging.debug('{}:Current track ebr: {}db'.format(label,20*np.log10(curr_ebr))) + + # Set correct ebr + track_arr = track_arr/curr_ebr*target_ebr + + Fnew = librosa.stft(track_arr, 1024) + new_energy_prof = librosa.feature.rmse(S=Fnew).flatten() + new_ebr_prof = new_energy_prof/background_energy[:len(energy_prof)].flatten() + new_ebr = np.max(new_ebr_prof) + logging.debug('{}:New track ebr: {}db'.format(label,20*np.log10(new_ebr))) + + + + if channel_mode == 'separate': + librosa.output.write_wav('{}/{}_event_track.wav'.format(output_path, label), track_arr, SR) + + + + + if figure_verbosity > 0: plt.figure() - plt.subplot(2,1,1) - plt.title('`{}\' waveform and spectrogram'.format(label)) + plt.subplot(3,1,1) + plt.title('`{}\' event waveform and spectrogram'.format(label)) - visible_track = track_arr[int(start_times[0]*SR):int(end_times[-1]*SR)] - librosa.display.waveplot(visible_track,sr=SR) - F = 
librosa.stft(visible_track) + librosa.display.waveplot(track_arr,sr=SR) Fdb = librosa.amplitude_to_db(F) - plt.subplot(2,1,2) + plt.subplot(3, 1, 2) librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz') + + # Plot energy profile + plt.subplot(3, 1, 3) + time = np.linspace(0, len(track_arr)/SR, len(energy_prof.T)) + plt.semilogy(time, energy_prof.T) + plt.xlim([0, len(track_arr)/SR]) + plt.ylabel('energy (rms)') + + plt.tight_layout() - plt.savefig('{}/{}.{}'.format(output_path, label, image_format),dpi=300) + plt.savefig('{}/event_{}.{}'.format(output_path, label, image_format), dpi=300) + + scene_starting_times.append((label, start_times)) @@ -286,12 +450,21 @@ librosa.display.specshow(Fdb, sr=SR, x_axis='time', y_axis='hz') ax2 = plt.subplot(3,1,3) ax2.set_xlim([0,scene_duration]) - ax2.set_ylim([-0.5, len(scene_starting_times)-0.5]) # Get labels labels = [s[0] for s in scene_starting_times] - plt.yticks(range(len(scene_starting_times)), labels) + + + + # If background is active + if len(backgrounds_df) > 0: + labels.append('background') + + # Set y axis limit. With a padding of 0.5. 
+ ax2.set_ylim([-0.5, len(labels)-0.5]) + + plt.yticks(range(len(labels)), labels) for n in range(len(scene_starting_times)): label = scene_starting_times[n][0] @@ -312,7 +485,11 @@ ax2.axvline(start_times[m], color=color, alpha=0.1) ax2.axvline(end_times[m], color=color, alpha=0.1) ax2.axvline(end_times[m], color=color, alpha=0.1) - ax2.axvspan(start_times[m], end_times[m], color=color, alpha=0.1) + ax2.axvspan(start_times[m], end_times[m], color=color, alpha=0.1) + + if len(backgrounds_df) > 0: + plt.axhline(y=len(scene_starting_times), alpha=0.5, color='k', linewidth=4) + plt.tight_layout() plt.savefig('{}/full-scene.{}'.format(output_path, image_format),dpi=300) if figure_verbosity > 1: @@ -320,7 +497,10 @@ if channel_mode == 'mono': librosa.output.write_wav('{}/full-scene.wav'.format(output_path), scene_arr, SR) - + + if channel_mode == 'classes': + scene_wav = np.array(track_list) + librosa.output.write_wav('{}/classes-scene.wav'.format(output_path), scene_wav, SR) return scene_arr @@ -402,7 +582,7 @@ '-v', '--figure-verbosity', action='count', help="Increase figure verbosity. (Default) 0 - Don't save or display figures, 1 - Save pictures but do not display them, 2 - Save and display figures, 3 - Add shades over the events in the final plot" ) - figure_verbosity = None + figure_verbosity = 0 argparser.add_argument( '-x', '--image-format', @@ -414,8 +594,8 @@ argparser.add_argument( '-C', '--channel-mode', type=str, - help="number of audio channels contained in file. (Default) 'mono' - 1 channel (mono), 'classes' - As many channels as sound classes (events+textures), 'separate' - Same as 'classes', each channel is saved in a separate .wav file.", - choices=['mono', 'classes', 'separate'] + help="number of audio channels contained in file. 
(Default) 'mono' - 1 channel (mono), 'separate' - Same as 'classes', each channel is saved in a separate .wav file.", + choices=['mono', 'separate'] ) channel_mode = 'mono' @@ -467,8 +647,14 @@ if args.score_backgrounds: score_backgrounds = read_backgrounds_file(args.score_backgrounds) + else: + score_backgrounds = [] + if args.score_events: score_events = read_events_file(args.score_events) + else: + score_events = [] + if args.figure_verbosity: figure_verbosity = args.figure_verbosity