# HG changeset patch # User Dan Stowell # Date 1452252647 0 # Node ID 73317239d6d16aace402cb6f322ae7602964b4fb autoencoder-specgram first checkin diff -r 000000000000 -r 73317239d6d1 .gitignore --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.gitignore Fri Jan 08 11:30:47 2016 +0000 @@ -0,0 +1,3 @@ +pdf +*.pyc +*~ diff -r 000000000000 -r 73317239d6d1 LICENSE --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/LICENSE Fri Jan 08 11:30:47 2016 +0000 @@ -0,0 +1,22 @@ +autoencoder-specgram + +Copyright (c) 2016 Dan Stowell + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff -r 000000000000 -r 73317239d6d1 README.md --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.md Fri Jan 08 11:30:47 2016 +0000 @@ -0,0 +1,38 @@ + +Spectrogram auto-encoder +(c) Dan Stowell 2016. + + +A simple example of an autoencoder set up for spectrograms, with two convolutional layers - thought of as one "encoding" layer and one "decoding" layer. + +It's meant to be a fairly minimal example of doing this in Theano, using the Lasagne framework to make things easier. + +By default it simply makes a training set from different chunks of the same single spectrogram (from the supplied wave file). This is not a good training set! + +Notable (potentially unusual) things about this implementation: + * Data is not pre-whitened, instead we use a custom layer (NormalisationLayer) to normalise the mean-and-variance of the data for us. This is because I want the spectrogram to be normalised when it is input but not normalised when it is output. + * It's a convolutional net but only along the time axis; along the frequency axis it's fully-connected. + * There's no maxpooling/downsampling. + + +SYSTEM REQUIREMENTS +=================== + +* Python +* Theano (NOTE: please check the Lasagne page for preferred Theano version) +* Lasagne https://github.com/Lasagne/Lasagne +* Matplotlib +* scikits.audiolab + +Tested on Ubuntu 14.04 with Python 2.7. + +USAGE +===== + + python autoencoder-specgram.py + +It creates a "pdf" folder and puts plots in there (multi-page PDFs) as it goes along. +There's a "progress" pdf which gets repeatedly overwritten - you should see the output quality gradually getting better. + +Look in userconfig.py for configuration options. + diff -r 000000000000 -r 73317239d6d1 autoencoder-specgram.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/autoencoder-specgram.py Fri Jan 08 11:30:47 2016 +0000 @@ -0,0 +1,222 @@ + +# Spectrogram auto-encoder +# Dan Stowell 2016. +# +# Unusual things about this implementation: +# * Data is not pre-whitened, instead we use a custom layer (NormalisationLayer) to normalise the mean-and-variance of the data for us. This is because I want the spectrogram to be normalised when it is input but not normalised when it is output. +# * It's a convolutional net but only along the time axis; along the frequency axis it's fully-connected. + +import numpy as np + +import theano +import theano.tensor as T +import lasagne +#import downhill +from lasagne.nonlinearities import rectify, leaky_rectify, very_leaky_rectify +from numpy import float32 + +import matplotlib +#matplotlib.use('PDF') # http://www.astrobetter.com/plotting-to-a-file-in-python/ +import matplotlib.pyplot as plt +import matplotlib.cm as cm +from matplotlib.backends.backend_pdf import PdfPages +plt.rcParams.update({'font.size': 6}) + +from userconfig import * +import util +from layers_custom import * + +################################################################################################################### +# create Theano variables for input minibatch +input_var = T.tensor4('X') +# note that in general, the main data tensors will have these axes: +# - minibatchsize +# - numchannels (always 1 for us, since spectrograms) +# - numfilts (or specbinnum for input) +# - numtimebins + +if example_is_audio: + # load our example audio file as a specgram + examplegram = util.standard_specgram((util.load_soundfile(examplewavpath, 0))) + print("examplegram is of shape %s" % str(np.shape(examplegram))) + +################################################################################################################### +# here we define our "semi-convolutional" autoencoder +# NOTE: lasagne assumes pooling is on the TRAILING axis of the tensor, so we always use time as the trailing axis + +def make_custom_convlayer(network, in_num_chans, out_num_chans): + "Applies our special padding and reshaping to do 1D convolution on 2D data" + network = lasagne.layers.PadLayer(network, width=(featframe_len-1)/2, batch_ndim=3) # NOTE: the "batch_ndim" is used to stop batch dims being padded, but here ALSO to skip first data dim + print("shape after pad layer: %s" % str(network.output_shape)) + network = lasagne.layers.Conv2DLayer(network, out_num_chans, (in_num_chans, featframe_len), stride=(1,1), pad=0, nonlinearity=very_leaky_rectify, W=lasagne.init.Orthogonal()) # we pad "manually" in order to do it in one dimension only + filters = network.W + network = lasagne.layers.ReshapeLayer(network, ([0], [2], [1], [3])) # reinterpret channels as rows + print("shape after conv layer: %s" % str(network.output_shape)) + return network, filters + +network = lasagne.layers.InputLayer((None, 1, specbinnum, numtimebins), input_var) +print("shape after input layer: %s" % str(network.output_shape)) +# +# normalisation layer +# -- note that we deliberately normalise the input but do not undo that at the output. +# -- note that the normalisation params are not set by the training procedure, they need to be set before training begins. +network = NormalisationLayer(network, specbinnum) +normlayer = network # we need to remember this one so we can set its parameters +# +network, filters_enc = make_custom_convlayer(network, in_num_chans=specbinnum, out_num_chans=numfilters) +# +########################################### +########################################### +########################################### +# NOTE: in this example we have no maxpooling or other downsampling. +# That's a notable absence compared to standard architectures. +# For the present code, with 1 encoding layer and 1 decoding layer, +# you might expect that to happen about here. +########################################### +########################################### +########################################### + +latents = network # we remember the "latents" at the midpoint of the net, since we'll want to inspect them, and maybe regularise them too + +network, filters_dec = make_custom_convlayer(network, in_num_chans=numfilters, out_num_chans=specbinnum) + +network = lasagne.layers.NonlinearityLayer(network, nonlinearity=rectify) # finally a standard rectify since nonneg (specgram) target + +################################################################################################################### +# define simple L2 loss function with a mild touch of regularisation +prediction = lasagne.layers.get_output(network) +loss = lasagne.objectives.squared_error(prediction, input_var) +loss = loss.mean() + 1e-4 * lasagne.regularization.regularize_network_params(network, lasagne.regularization.l2) + +################################################################################################################### + +plot_probedata_data = None +def plot_probedata(outpostfix, plottitle=None): + """Visualises the network behaviour. + NOTE: currently accesses globals. Should really be passed in the network, filters etc""" + global plot_probedata_data + + if plottitle==None: + plottitle = outpostfix + + if np.shape(plot_probedata_data)==(): + if example_is_audio: + plot_probedata_data = np.array([[examplegram[:, examplegram_startindex:examplegram_startindex+numtimebins]]], float32) + else: + plot_probedata_data = np.zeros((minibatchsize, 1, specbinnum, numtimebins), dtype=float32) + for _ in range(5): + plot_probedata_data[:, :, np.random.randint(specbinnum), np.random.randint(numtimebins)] = 1 + + test_prediction = lasagne.layers.get_output(network, deterministic=True) + test_latents = lasagne.layers.get_output(latents, deterministic=True) + predict_fn = theano.function([input_var], test_prediction) + latents_fn = theano.function([input_var], test_latents) + prediction = predict_fn(plot_probedata_data) + latentsval = latents_fn(plot_probedata_data) + if False: + print("Probedata has shape %s and meanabs %g" % ( plot_probedata_data.shape, np.mean(np.abs(plot_probedata_data )))) + print("Latents has shape %s and meanabs %g" % (latentsval.shape, np.mean(np.abs(latentsval)))) + print("Prediction has shape %s and meanabs %g" % (prediction.shape, np.mean(np.abs(prediction)))) + print("Ratio %g" % (np.mean(np.abs(prediction)) / np.mean(np.abs(plot_probedata_data)))) + + util.mkdir_p('pdf') + pdf = PdfPages('pdf/autoenc_probe_%s.pdf' % outpostfix) + plt.figure(frameon=False) + # + plt.subplot(3, 1, 1) + plotdata = plot_probedata_data[0,0,:,:] + plt.imshow(plotdata, origin='lower', interpolation='nearest', cmap='RdBu', aspect='auto', vmin=-np.max(np.abs(plotdata)), vmax=np.max(np.abs(plotdata))) + plt.ylabel('Input') + plt.title("%s" % (plottitle)) + # + plt.subplot(3, 1, 2) + plotdata = latentsval[0,0,:,:] + plt.imshow(plotdata, origin='lower', interpolation='nearest', cmap='RdBu', aspect='auto', vmin=-np.max(np.abs(plotdata)), vmax=np.max(np.abs(plotdata))) + plt.ylabel('Latents') + # + plt.subplot(3, 1, 3) + plotdata = prediction[0,0,:,:] + plt.imshow(plotdata, origin='lower', interpolation='nearest', cmap='RdBu', aspect='auto', vmin=-np.max(np.abs(plotdata)), vmax=np.max(np.abs(plotdata))) + plt.ylabel('Output') + # + pdf.savefig() + plt.close() + ## + for filtvar, filtlbl, isenc in [ + (filters_enc, 'encoding', True), + (filters_dec, 'decoding', False), + ]: + plt.figure(frameon=False) + vals = filtvar.get_value() + #print(" %s filters have shape %s" % (filtlbl, vals.shape)) + vlim = np.max(np.abs(vals)) + for whichfilt in range(numfilters): + plt.subplot(3, 8, whichfilt+1) + # NOTE: for encoding/decoding filters, we grab the "slice" of interest from the tensor in different ways: different axes, and flipped. + if isenc: + plotdata = vals[numfilters-(1+whichfilt),0,::-1,::-1] + else: + plotdata = vals[:,0,whichfilt,:] + + plt.imshow(plotdata, origin='lower', interpolation='nearest', cmap='RdBu', aspect='auto', vmin=-vlim, vmax=vlim) + plt.xticks([]) + if whichfilt==0: + plt.title("%s filters (%s)" % (filtlbl, outpostfix)) + else: + plt.yticks([]) + + pdf.savefig() + plt.close() + ## + pdf.close() + +plot_probedata('init') + +################################################################################################################### +if True: + ################################### + # here we set up some training data. this is ALL A BIT SIMPLE - for a proper experiment we'd prepare a full dataset, and it might be too big to be all in memory. + training_data_size=100 + training_data = np.zeros((training_data_size, minibatchsize, 1, specbinnum, numtimebins), dtype=float32) + if example_is_audio: + # manually grab a load of random subsets of the training audio + training_data_size=100 + for which_training_batch in range(training_data_size): + for which_training_datum in range(minibatchsize): + startindex = np.random.randint(examplegram.shape[1]-numtimebins) + training_data[which_training_batch, which_training_datum, :, :, :] = examplegram[:, startindex:startindex+numtimebins] + else: + # make some simple (sparse) data that we can train with + for which_training_batch in range(training_data_size): + for which_training_datum in range(minibatchsize): + for _ in range(5): + training_data[which_training_batch, which_training_datum, :, np.random.randint(specbinnum), np.random.randint(numtimebins)] = 1 + + ################################### + # pre-training setup + + # set the normalisation parameters manually as an estimate from the training data + normlayer.set_normalisation(training_data) + + ################################### + # training + + # compile training function that updates parameters and returns training loss + params = lasagne.layers.get_all_params(network, trainable=True) + updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9) + train_fn = theano.function([input_var], loss, updates=updates) + + # train network + numepochs = 2048 # 3 # 100 # 5000 + for epoch in range(numepochs): + loss = 0 + for input_batch in training_data: + loss += train_fn(input_batch) + if epoch==0 or epoch==numepochs-1 or (2 ** int(np.log2(epoch)) == epoch): + lossreadout = loss / len(training_data) + infostring = "Epoch %d/%d: Loss %g" % (epoch, numepochs, lossreadout) + print(infostring) + plot_probedata('progress', plottitle="progress (%s)" % infostring) + + plot_probedata('trained', plottitle="trained (%d epochs; Loss %g)" % (numepochs, lossreadout)) + diff -r 000000000000 -r 73317239d6d1 layers_custom.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/layers_custom.py Fri Jan 08 11:30:47 2016 +0000 @@ -0,0 +1,51 @@ + +import numpy as np + +import theano +import theano.tensor as T + +from lasagne.layers.base import Layer + +############################################################################################################### + +class NormalisationLayer(Layer): + """ + This layer applies a simple mean-and-std normalisation to input data. + This allows you to "learn" the mean+std from training data and then apply it "live" to any future incoming data. + + NOTE: the parameters are NOT learnt during training, but must be initialised BEFORE training using the set_normalisation() function. + """ + def __init__(self, incoming, numbins, **kwargs): + "numbins is the number of frequency bins in the spectrograms we're going to be normalising" + super(NormalisationLayer, self).__init__(incoming, **kwargs) + self.numbins = numbins + self._output_shape = None + self.initialised = False + # and for the normalisation, per frequency bin - typically, we "sub" the mean and then "mul" by 1/std (I write this as mul rather than div because often more efficient) + self.normn_sub = theano.shared(np.zeros((1, 1, numbins, 1), dtype=theano.config.floatX), borrow=True, name='norm_sub', broadcastable=(1, 1, 0, 1)) + self.normn_mul = theano.shared(np.ones( (1, 1, numbins, 1), dtype=theano.config.floatX), borrow=True, name='norm_mul', broadcastable=(1, 1, 0, 1)) + # here we're defining a theano func that I can use to "manually" normalise some data if needed as a separate thing + inputdata = T.tensor4('inputdata') + self.transform_some_data = theano.function([inputdata], (inputdata - self.normn_sub) * self.normn_mul) + + def get_output_shape_for(self, input_shape): + return input_shape + + def get_output_for(self, inputdata, **kwargs): + #if not self.initialised: + # print("NormalisationLayer must be initalised with normalisation parameters before training") + return (inputdata - self.normn_sub) * self.normn_mul + + def set_normalisation(self, databatches): + numbins = self.numbins + # we first collapse the data batches, essentially into one very long spectrogram... + #print("databatches.shape: %s" % str(databatches.shape)) + data = np.concatenate(np.vstack(np.vstack(databatches)), axis=-1) + #print("data.shape: %s" % str(data.shape)) + + centre = np.mean(data, axis=1) + self.normn_sub.set_value( centre.astype(theano.config.floatX).reshape((1,1,numbins,1)), borrow=True) + self.normn_mul.set_value(1. / data.std( axis=1).reshape((1,1,-1,1)), borrow=True) + + self.initialised = True + diff -r 000000000000 -r 73317239d6d1 renneschiffchaff20130320bout1filt.wav Binary file renneschiffchaff20130320bout1filt.wav has changed diff -r 000000000000 -r 73317239d6d1 userconfig.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/userconfig.py Fri Jan 08 11:30:47 2016 +0000 @@ -0,0 +1,38 @@ + +# Configuration options that you might like to change + +example_is_audio = True # if False, generates simple sparse data for probing; else loads an audio file +examplegram_startindex = 550 # just choosing which bit to plot + +#examplewavpath = "~/birdsong/linhart2015mar/concatall/perfolder/PC1101-rep-day2.wav" +examplewavpath = "509.WAV" +examplewavpath = "renneschiffchaff20130320bout1filt.wav" + +srate = 22050. +wavdownsample = 2 # eg 44 kHz audio, factor of 2, gets loaded as 22 kHz. for no downsampling, set this ratio to 1 + +audioframe_len = 128 +audioframe_stride = 64 + +specbinlow = 10 +specbinnum = 32 + +featframe_len = 9 +featframe_stride = 16 +numfilters = 6 +minibatchsize = 16 +numtimebins = 160 # 128 # 48 # NOTE that this size needs really to be compatible with downsampling (maxpooling) steps if you use them. + + +########################################################### +# Below, we calculate some other things based on the config + +import os +examplewavpath = os.path.expanduser(examplewavpath) + + +hopsize_secs = audioframe_stride / float(srate) +print("Specgram frame hop size: %.3g ms" % (hopsize_secs * 1000)) +specgramlen_secs = hopsize_secs * numtimebins +print("Specgram duration: %.3g s" % specgramlen_secs) + diff -r 000000000000 -r 73317239d6d1 util.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/util.py Fri Jan 08 11:30:47 2016 +0000 @@ -0,0 +1,71 @@ + +# utility functions + +import numpy as np +from numpy import float32 + +import os, errno +from scikits.audiolab import Sndfile +from scikits.audiolab import Format + +from matplotlib.mlab import specgram + +from userconfig import * + +######################################################## + +def standard_specgram(signal): + "Return specgram matrix, made using the audio-layer config" + return np.array(specgram(signal, NFFT=audioframe_len, noverlap=audioframe_len-audioframe_stride, window=np.hamming(audioframe_len))[0][specbinlow:specbinlow+specbinnum,:], dtype=float32) + +def load_soundfile(inwavpath, startpossecs, maxdursecs=None): + """Loads audio data, optionally limiting to a specified start position and duration. + Must be SINGLE-CHANNEL and matching our desired sample-rate.""" + framelen = 4096 + hopspls = framelen + unhopspls = framelen - hopspls + if (framelen % wavdownsample) != 0: raise ValueError("framelen needs to be a multiple of wavdownsample: %i, %i" % (framelen, wavdownsample)) + if (hopspls % wavdownsample) != 0: raise ValueError("hopspls needs to be a multiple of wavdownsample: %i, %i" % (hopspls , wavdownsample)) + if maxdursecs==None: + maxdursecs = 9999 + sf = Sndfile(inwavpath, "r") + splsread = 0 + framesread = 0 + if sf.channels != 1: raise ValueError("Sound file %s has multiple channels (%i) - mono required." % (inwavpath, sf.channels)) + timemax_spls = int(maxdursecs * sf.samplerate) + if sf.samplerate != (srate * wavdownsample): + raise ValueError("Sample rate mismatch: we expect %g, file has %g" % (srate, sf.samplerate)) + if startpossecs > 0: + sf.seek(startpossecs * sf.samplerate) # note: returns IOError if beyond the end + audiodata = np.array([], dtype=np.float32) + while(True): + try: + if splsread==0: + chunk = sf.read_frames(framelen)[::wavdownsample] + splsread += framelen + else: + chunk = np.hstack((chunk[:unhopspls], sf.read_frames(hopspls)[::wavdownsample] )) + splsread += hopspls + framesread += 1 + if framesread % 25000 == 0: + print("Read %i frames" % framesread) + if len(chunk) != (framelen / wavdownsample): + print("Not read sufficient samples - returning") + break + chunk = np.array(chunk, dtype=np.float32) + audiodata = np.hstack((audiodata, chunk)) + if splsread >= timemax_spls: + break + except RuntimeError: + break + sf.close() + return audiodata + +def mkdir_p(path): + try: + os.makedirs(path) + except OSError as exc: # Python >2.5 + if exc.errno == errno.EEXIST and os.path.isdir(path): + pass + else: raise +