Mercurial > hg > autoencoder-specgram

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.gitignore	Fri Jan 08 11:30:47 2016 +0000
@@ -0,0 +1,3 @@
+pdf
+*.pyc
+*~
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/LICENSE	Fri Jan 08 11:30:47 2016 +0000
@@ -0,0 +1,22 @@
+autoencoder-specgram
+
+Copyright (c) 2016 Dan Stowell
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.md	Fri Jan 08 11:30:47 2016 +0000
@@ -0,0 +1,38 @@
+
+Spectrogram auto-encoder
+(c) Dan Stowell 2016.
+
+
+A simple example of an autoencoder set up for spectrograms, with two convolutional layers - thought of as one "encoding" layer and one "decoding" layer.
+
+It's meant to be a fairly minimal example of doing this in Theano, using the Lasagne framework to make things easier.
+
+By default it simply makes a training set from different chunks of the same single spectrogram (from the supplied wave file). This is not a good training set!
+
+Notable (potentially unusual) things about this implementation:
+ * Data is not pre-whitened, instead we use a custom layer (NormalisationLayer) to normalise the mean-and-variance of the data for us. This is because I want the spectrogram to be normalised when it is input but not normalised when it is output.
+ * It's a convolutional net but only along the time axis; along the frequency axis it's fully-connected.
+ * There's no maxpooling/downsampling.
+
+
+SYSTEM REQUIREMENTS
+===================
+
+* Python
+* Theano (NOTE: please check the Lasagne page for preferred Theano version)
+* Lasagne https://github.com/Lasagne/Lasagne
+* Matplotlib
+* scikits.audiolab
+
+Tested on Ubuntu 14.04 with Python 2.7.
+
+USAGE
+=====
+
+          python autoencoder-specgram.py
+
+It creates a "pdf" folder and puts plots in there (multi-page PDFs) as it goes along.
+There's a "progress" pdf which gets repeatedly overwritten - you should see the output quality gradually getting better.
+
+Look in userconfig.py for configuration options.
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/autoencoder-specgram.py	Fri Jan 08 11:30:47 2016 +0000
@@ -0,0 +1,222 @@
+
+# Spectrogram auto-encoder
+# Dan Stowell 2016.
+#
+# Unusual things about this implementation:
+#  * Data is not pre-whitened, instead we use a custom layer (NormalisationLayer) to normalise the mean-and-variance of the data for us. This is because I want the spectrogram to be normalised when it is input but not normalised when it is output.
+#  * It's a convolutional net but only along the time axis; along the frequency axis it's fully-connected.
+
+import numpy as np
+
+import theano
+import theano.tensor as T
+import lasagne
+#import downhill
+from lasagne.nonlinearities import rectify, leaky_rectify, very_leaky_rectify
+from numpy import float32
+
+import matplotlib
+#matplotlib.use('PDF') # http://www.astrobetter.com/plotting-to-a-file-in-python/
+import matplotlib.pyplot as plt
+import matplotlib.cm as cm
+from matplotlib.backends.backend_pdf import PdfPages
+plt.rcParams.update({'font.size': 6})
+
+from userconfig import *
+import util
+from layers_custom import *
+
+###################################################################################################################
+# create Theano variables for input minibatch
+input_var = T.tensor4('X')
+# note that in general, the main data tensors will have these axes:
+#   - minibatchsize
+#   - numchannels (always 1 for us, since spectrograms)
+#   - numfilts (or specbinnum for input)
+#   - numtimebins
+
+if example_is_audio:
+	# load our example audio file as a specgram
+	examplegram = util.standard_specgram((util.load_soundfile(examplewavpath, 0)))
+	print("examplegram is of shape %s" % str(np.shape(examplegram)))
+
+###################################################################################################################
+# here we define our "semi-convolutional" autoencoder
+# NOTE: lasagne assumes pooling is on the TRAILING axis of the tensor, so we always use time as the trailing axis
+
+def make_custom_convlayer(network, in_num_chans, out_num_chans):
+	"Applies our special padding and reshaping to do 1D convolution on 2D data"
+	network = lasagne.layers.PadLayer(network, width=(featframe_len-1)/2, batch_ndim=3) # NOTE: the "batch_ndim" is used to stop batch dims being padded, but here ALSO to skip first data dim
+	print("shape after pad layer: %s" % str(network.output_shape))
+	network = lasagne.layers.Conv2DLayer(network, out_num_chans, (in_num_chans, featframe_len), stride=(1,1), pad=0, nonlinearity=very_leaky_rectify, W=lasagne.init.Orthogonal()) # we pad "manually" in order to do it in one dimension only
+	filters = network.W
+	network = lasagne.layers.ReshapeLayer(network, ([0], [2], [1], [3])) # reinterpret channels as rows
+	print("shape after conv layer: %s" % str(network.output_shape))
+	return network, filters
+
+network = lasagne.layers.InputLayer((None, 1, specbinnum, numtimebins), input_var)
+print("shape after input layer: %s" % str(network.output_shape))
+#
+# normalisation layer
+#  -- note that we deliberately normalise the input but do not undo that at the output.
+#  -- note that the normalisation params are not set by the training procedure, they need to be set before training begins.
+network = NormalisationLayer(network, specbinnum)
+normlayer = network # we need to remember this one so we can set its parameters
+#
+network, filters_enc = make_custom_convlayer(network, in_num_chans=specbinnum, out_num_chans=numfilters)
+#
+###########################################
+###########################################
+###########################################
+# NOTE: in this example we have no maxpooling or other downsampling.
+# That's a notable absence compared to standard architectures.
+# For the present code, with 1 encoding layer and 1 decoding layer,
+# you might expect that to happen about here.
+###########################################
+###########################################
+###########################################
+
+latents = network  # we remember the "latents" at the midpoint of the net, since we'll want to inspect them, and maybe regularise them too
+
+network, filters_dec = make_custom_convlayer(network, in_num_chans=numfilters, out_num_chans=specbinnum)
+
+network = lasagne.layers.NonlinearityLayer(network, nonlinearity=rectify)  # finally a standard rectify since nonneg (specgram) target
+
+###################################################################################################################
+# define simple L2 loss function with a mild touch of regularisation
+prediction = lasagne.layers.get_output(network)
+loss = lasagne.objectives.squared_error(prediction, input_var)
+loss = loss.mean() + 1e-4 * lasagne.regularization.regularize_network_params(network, lasagne.regularization.l2)
+
+###################################################################################################################
+
+plot_probedata_data = None
+def plot_probedata(outpostfix, plottitle=None):
+	"""Visualises the network behaviour.
+	NOTE: currently accesses globals. Should really be passed in the network, filters etc"""
+	global plot_probedata_data
+
+	if plottitle==None:
+		plottitle = outpostfix
+
+	if np.shape(plot_probedata_data)==():
+		if example_is_audio:
+			plot_probedata_data = np.array([[examplegram[:, examplegram_startindex:examplegram_startindex+numtimebins]]], float32)
+		else:
+			plot_probedata_data = np.zeros((minibatchsize, 1, specbinnum, numtimebins), dtype=float32)
+			for _ in range(5):
+				plot_probedata_data[:, :, np.random.randint(specbinnum), np.random.randint(numtimebins)] = 1
+
+	test_prediction = lasagne.layers.get_output(network, deterministic=True)
+	test_latents    = lasagne.layers.get_output(latents, deterministic=True)
+	predict_fn = theano.function([input_var], test_prediction)
+	latents_fn = theano.function([input_var], test_latents)
+	prediction = predict_fn(plot_probedata_data)
+	latentsval = latents_fn(plot_probedata_data)
+	if False:
+		print("Probedata  has shape %s and meanabs %g" % ( plot_probedata_data.shape, np.mean(np.abs(plot_probedata_data ))))
+		print("Latents has shape %s and meanabs %g" % (latentsval.shape, np.mean(np.abs(latentsval))))
+		print("Prediction has shape %s and meanabs %g" % (prediction.shape, np.mean(np.abs(prediction))))
+		print("Ratio %g" % (np.mean(np.abs(prediction)) / np.mean(np.abs(plot_probedata_data))))
+
+	util.mkdir_p('pdf')
+	pdf = PdfPages('pdf/autoenc_probe_%s.pdf' % outpostfix)
+	plt.figure(frameon=False)
+	#
+	plt.subplot(3, 1, 1)
+	plotdata = plot_probedata_data[0,0,:,:]
+	plt.imshow(plotdata, origin='lower', interpolation='nearest', cmap='RdBu', aspect='auto', vmin=-np.max(np.abs(plotdata)), vmax=np.max(np.abs(plotdata)))
+	plt.ylabel('Input')
+	plt.title("%s" % (plottitle))
+	#
+	plt.subplot(3, 1, 2)
+	plotdata = latentsval[0,0,:,:]
+	plt.imshow(plotdata, origin='lower', interpolation='nearest', cmap='RdBu', aspect='auto', vmin=-np.max(np.abs(plotdata)), vmax=np.max(np.abs(plotdata)))
+	plt.ylabel('Latents')
+	#
+	plt.subplot(3, 1, 3)
+	plotdata = prediction[0,0,:,:]
+	plt.imshow(plotdata, origin='lower', interpolation='nearest', cmap='RdBu', aspect='auto', vmin=-np.max(np.abs(plotdata)), vmax=np.max(np.abs(plotdata)))
+	plt.ylabel('Output')
+	#
+	pdf.savefig()
+	plt.close()
+	##
+	for filtvar, filtlbl, isenc in [
+		(filters_enc, 'encoding', True),
+		(filters_dec, 'decoding', False),
+			]:
+		plt.figure(frameon=False)
+		vals = filtvar.get_value()
+		#print("        %s filters have shape %s" % (filtlbl, vals.shape))
+		vlim = np.max(np.abs(vals))
+		for whichfilt in range(numfilters):
+			plt.subplot(3, 8, whichfilt+1)
+			# NOTE: for encoding/decoding filters, we grab the "slice" of interest from the tensor in different ways: different axes, and flipped.
+			if isenc:
+				plotdata = vals[numfilters-(1+whichfilt),0,::-1,::-1]
+			else:
+				plotdata = vals[:,0,whichfilt,:]
+
+			plt.imshow(plotdata, origin='lower', interpolation='nearest', cmap='RdBu', aspect='auto', vmin=-vlim, vmax=vlim)
+			plt.xticks([])
+			if whichfilt==0:
+				plt.title("%s filters (%s)" % (filtlbl, outpostfix))
+			else:
+				plt.yticks([])
+
+		pdf.savefig()
+		plt.close()
+	##
+	pdf.close()
+
+plot_probedata('init')
+
+###################################################################################################################
+if True:
+	###################################
+	# here we set up some training data. this is ALL A BIT SIMPLE - for a proper experiment we'd prepare a full dataset, and it might be too big to be all in memory.
+	training_data_size=100
+	training_data = np.zeros((training_data_size, minibatchsize, 1, specbinnum, numtimebins), dtype=float32)
+	if example_is_audio:
+		# manually grab a load of random subsets of the training audio
+		training_data_size=100
+		for which_training_batch in range(training_data_size):
+			for which_training_datum in range(minibatchsize):
+				startindex = np.random.randint(examplegram.shape[1]-numtimebins)
+				training_data[which_training_batch, which_training_datum, :, :, :] = examplegram[:, startindex:startindex+numtimebins]
+	else:
+		# make some simple (sparse) data that we can train with
+		for which_training_batch in range(training_data_size):
+			for which_training_datum in range(minibatchsize):
+				for _ in range(5):
+					training_data[which_training_batch, which_training_datum, :, np.random.randint(specbinnum), np.random.randint(numtimebins)] = 1
+
+	###################################
+	# pre-training setup
+
+	# set the normalisation parameters manually as an estimate from the training data
+	normlayer.set_normalisation(training_data)
+
+	###################################
+	# training
+
+	# compile training function that updates parameters and returns training loss
+	params = lasagne.layers.get_all_params(network, trainable=True)
+	updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9)
+	train_fn = theano.function([input_var], loss, updates=updates)
+
+	# train network
+	numepochs = 2048 # 3 # 100 # 5000
+	for epoch in range(numepochs):
+		loss = 0
+		for input_batch in training_data:
+			loss += train_fn(input_batch)
+		if epoch==0 or epoch==numepochs-1 or (2 ** int(np.log2(epoch)) == epoch):
+			lossreadout = loss / len(training_data)
+			infostring = "Epoch %d/%d: Loss %g" % (epoch, numepochs, lossreadout)
+			print(infostring)
+			plot_probedata('progress', plottitle="progress (%s)" % infostring)
+
+	plot_probedata('trained', plottitle="trained (%d epochs; Loss %g)" % (numepochs, lossreadout))
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/layers_custom.py	Fri Jan 08 11:30:47 2016 +0000
@@ -0,0 +1,51 @@
+
+import numpy as np
+
+import theano
+import theano.tensor as T
+
+from lasagne.layers.base import Layer
+
+###############################################################################################################
+
+class NormalisationLayer(Layer):
+	"""
+	This layer applies a simple mean-and-std normalisation to input data.
+	This allows you to "learn" the mean+std from training data and then apply it "live" to any future incoming data.
+
+	NOTE: the parameters are NOT learnt during training, but must be initialised BEFORE training using the set_normalisation() function.
+	"""
+	def __init__(self, incoming, numbins, **kwargs):
+		"numbins is the number of frequency bins in the spectrograms we're going to be normalising"
+		super(NormalisationLayer, self).__init__(incoming, **kwargs)
+		self.numbins = numbins
+		self._output_shape = None
+		self.initialised = False
+		# and for the normalisation, per frequency bin - typically, we "sub" the mean and then "mul" by 1/std (I write this as mul rather than div because often more efficient)
+		self.normn_sub = theano.shared(np.zeros((1, 1, numbins,  1), dtype=theano.config.floatX), borrow=True, name='norm_sub', broadcastable=(1, 1, 0, 1))
+		self.normn_mul = theano.shared(np.ones( (1, 1, numbins,  1), dtype=theano.config.floatX), borrow=True, name='norm_mul', broadcastable=(1, 1, 0, 1))
+		# here we're defining a theano func that I can use to "manually" normalise some data if needed as a separate thing
+		inputdata = T.tensor4('inputdata')
+		self.transform_some_data = theano.function([inputdata], (inputdata - self.normn_sub) * self.normn_mul)
+
+	def get_output_shape_for(self, input_shape):
+		return input_shape
+
+	def get_output_for(self, inputdata, **kwargs):
+		#if not self.initialised:
+		#	print("NormalisationLayer must be initalised with normalisation parameters before training")
+		return (inputdata - self.normn_sub) * self.normn_mul
+
+	def set_normalisation(self, databatches):
+		numbins = self.numbins
+		# we first collapse the data batches, essentially into one very long spectrogram...
+		#print("databatches.shape: %s" % str(databatches.shape))
+		data = np.concatenate(np.vstack(np.vstack(databatches)), axis=-1)
+		#print("data.shape: %s" % str(data.shape))
+
+		centre = np.mean(data, axis=1)
+		self.normn_sub.set_value( centre.astype(theano.config.floatX).reshape((1,1,numbins,1)), borrow=True)
+		self.normn_mul.set_value(1. / data.std( axis=1).reshape((1,1,-1,1)), borrow=True)
+
+		self.initialised = True
+
Binary file renneschiffchaff20130320bout1filt.wav has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/userconfig.py	Fri Jan 08 11:30:47 2016 +0000
@@ -0,0 +1,38 @@
+
+# Configuration options that you might like to change
+
+example_is_audio = True   # if False, generates simple sparse data for probing; else loads an audio file
+examplegram_startindex = 550   # just choosing which bit to plot
+
+#examplewavpath = "~/birdsong/linhart2015mar/concatall/perfolder/PC1101-rep-day2.wav"
+examplewavpath = "509.WAV"
+examplewavpath = "renneschiffchaff20130320bout1filt.wav"
+
+srate = 22050.
+wavdownsample = 2  # eg 44 kHz audio, factor of 2, gets loaded as 22 kHz. for no downsampling, set this ratio to 1
+
+audioframe_len    = 128
+audioframe_stride = 64
+
+specbinlow = 10
+specbinnum = 32
+
+featframe_len    = 9
+featframe_stride = 16
+numfilters       = 6
+minibatchsize    = 16
+numtimebins = 160 # 128 # 48 # NOTE that this size needs really to be compatible with downsampling (maxpooling) steps if you use them.
+
+
+###########################################################
+# Below, we calculate some other things based on the config
+
+import os
+examplewavpath = os.path.expanduser(examplewavpath)
+
+
+hopsize_secs = audioframe_stride / float(srate)
+print("Specgram frame hop size: %.3g ms" % (hopsize_secs * 1000))
+specgramlen_secs = hopsize_secs * numtimebins
+print("Specgram duration: %.3g s" % specgramlen_secs)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/util.py	Fri Jan 08 11:30:47 2016 +0000
@@ -0,0 +1,71 @@
+
+# utility functions
+
+import numpy as np
+from numpy import float32
+
+import os, errno
+from scikits.audiolab import Sndfile
+from scikits.audiolab import Format
+
+from matplotlib.mlab import specgram
+
+from userconfig import *
+
+########################################################
+
+def standard_specgram(signal):
+	"Return specgram matrix, made using the audio-layer config"
+	return np.array(specgram(signal, NFFT=audioframe_len, noverlap=audioframe_len-audioframe_stride, window=np.hamming(audioframe_len))[0][specbinlow:specbinlow+specbinnum,:], dtype=float32)
+
+def load_soundfile(inwavpath, startpossecs, maxdursecs=None):
+	"""Loads audio data, optionally limiting to a specified start position and duration.
+	Must be SINGLE-CHANNEL and matching our desired sample-rate."""
+	framelen = 4096
+	hopspls = framelen
+	unhopspls = framelen - hopspls
+	if (framelen % wavdownsample) != 0: raise ValueError("framelen needs to be a multiple of wavdownsample: %i, %i" % (framelen, wavdownsample))
+	if (hopspls  % wavdownsample) != 0: raise ValueError("hopspls  needs to be a multiple of wavdownsample: %i, %i" % (hopspls , wavdownsample))
+	if maxdursecs==None:
+		maxdursecs = 9999
+	sf = Sndfile(inwavpath, "r")
+	splsread = 0
+	framesread = 0
+	if sf.channels != 1:       raise ValueError("Sound file %s has multiple channels (%i) - mono required." % (inwavpath, sf.channels))
+	timemax_spls   = int(maxdursecs * sf.samplerate)
+	if sf.samplerate != (srate * wavdownsample):
+		raise ValueError("Sample rate mismatch: we expect %g, file has %g" % (srate, sf.samplerate))
+	if startpossecs > 0:
+			sf.seek(startpossecs * sf.samplerate) # note: returns IOError if beyond the end
+	audiodata = np.array([], dtype=np.float32)
+	while(True):
+		try:
+			if splsread==0:
+				chunk = sf.read_frames(framelen)[::wavdownsample]
+				splsread += framelen
+			else:
+				chunk = np.hstack((chunk[:unhopspls], sf.read_frames(hopspls)[::wavdownsample] ))
+				splsread += hopspls
+			framesread += 1
+			if framesread % 25000 == 0:
+				print("Read %i frames" % framesread)
+			if len(chunk) != (framelen / wavdownsample):
+				print("Not read sufficient samples - returning")
+				break
+			chunk = np.array(chunk, dtype=np.float32)
+			audiodata = np.hstack((audiodata, chunk))
+			if splsread >= timemax_spls:
+				break
+		except RuntimeError:
+			break
+	sf.close()
+	return audiodata
+
+def mkdir_p(path):
+    try:
+        os.makedirs(path)
+    except OSError as exc: # Python >2.5
+        if exc.errno == errno.EEXIST and os.path.isdir(path):
+            pass
+        else: raise
+