Maria@1: # -*- coding: utf-8 -*- Maria@1: """ Maria@1: Created on Mon Aug 17 13:37:00 2015 Maria@1: Maria@1: @author: mariapanteli Maria@1: """ Maria@1: """Scale transform descriptor from mel spectrogram""" Maria@1: Maria@1: import numpy Maria@1: import librosa Maria@1: import scipy.signal Maria@1: Maria@1: class ScaleTransform: Maria@1: def __init__(self): Maria@1: self.y = None Maria@1: self.sr = None Maria@1: self.nmels = 40 Maria@1: self.melspec = None Maria@1: self.melsr = None Maria@1: self.op = None Maria@1: self.opmellin = None Maria@1: Maria@1: def load_audiofile(self, filename='test.wav', sr=None): Maria@1: """Load audio""" Maria@1: self.y, self.sr = librosa.load(filename, sr=sr) Maria@1: Maria@1: def mel_spectrogram(self, y=None, sr=None): Maria@1: """Get mel spectrogram""" Maria@1: if self.y is None: Maria@1: self.y = y Maria@1: if self.sr is None: Maria@1: self.sr = sr Maria@1: win1 = int(round(0.04 * self.sr)) Maria@1: hop1 = int(round(win1 / 8.)) Maria@1: nfft1 = int(2 ** numpy.ceil(numpy.log2(win1))) Maria@1: D = numpy.abs(librosa.stft(self.y, n_fft=nfft1, hop_length=hop1, win_length=win1, window=scipy.signal.hamming)) ** 2 Maria@1: melspec = librosa.feature.melspectrogram(S=D, sr=self.sr, n_mels=self.nmels, fmax=8000) Maria@1: melsr = self.sr/float(hop1) Maria@1: self.melspec = melspec Maria@1: self.melsr = melsr Maria@1: Maria@1: def post_process_spec(self, melspec=None, medianfilt=True, sqrt=True, diff=True, subtractmean=True, halfwave=True, maxnormal=True): Maria@1: """Some post processing of the mel spectrogram""" Maria@1: if self.melspec is None: Maria@1: self.melspec = melspec Maria@1: if medianfilt: Maria@1: ks = int(0.1 * self.melsr) # 100ms kernel size Maria@1: if ks % 2 == 0: ks += 1 # ks must be odd Maria@1: for i in range(self.nmels): Maria@1: self.melspec[i, :] = scipy.signal.medfilt(self.melspec[i, :], kernel_size=ks) Maria@1: if sqrt: Maria@1: self.melspec = self.melspec ** .5 Maria@1: if diff: Maria@1: # append one frame before diff to keep number of frames the same Maria@1: self.melspec = numpy.concatenate((self.melspec, self.melspec[:, -1, None]), axis=1) Maria@1: self.melspec = numpy.diff(self.melspec, n=1, axis=1) Maria@1: if subtractmean: Maria@1: mean = self.melspec.mean(axis=1) Maria@1: mean.shape = (mean.shape[0], 1) Maria@1: self.melspec = self.melspec - mean Maria@1: if halfwave: Maria@1: self.melspec[numpy.where(self.melspec < 0)] = 0 Maria@1: if maxnormal: Maria@1: self.melspec = self.melspec / self.melspec.max() Maria@1: Maria@1: def onset_patterns(self, melspec=None, melsr=None, center=False): Maria@1: """Get rhythm periodicities by applying stft in each mel band""" Maria@1: if self.melspec is None: Maria@1: self.melspec = melspec Maria@1: if self.melsr is None: Maria@1: self.melsr = melsr Maria@1: win2 = int(round(8 * self.melsr)) Maria@1: hop2 = int(round(0.5 * self.melsr)) Maria@1: nfft2 = int(2**numpy.ceil(numpy.log2(win2))) Maria@1: Maria@1: # some preprocessing for the second frame decomposition Maria@1: melspectemp = self.melspec Maria@1: if melspectemp.shape[1] < nfft2: Maria@1: # if buffer too short pad with zeros to have at least one 8-sec window Maria@1: nzeros = nfft2 - melspectemp.shape[1] Maria@1: melspectemp = numpy.concatenate([numpy.zeros((self.nmels, int(numpy.ceil(nzeros / 2.)))), melspectemp, numpy.zeros((self.nmels,int(numpy.ceil(nzeros / 2.))))], axis=1) Maria@1: temp = numpy.abs(librosa.stft(y=melspectemp[0, :], win_length=win2, hop_length=hop2, n_fft=nfft2, window=scipy.signal.hamming, center=center)) Maria@1: nframes = temp.shape[1] Maria@1: Maria@1: # filter periodicities in the range 30-960 bpm Maria@1: freqresinbpm = float(self.melsr) / float(nfft2/2.)*60. Maria@1: minmag = int(numpy.floor(30. / freqresinbpm)) # min tempo 30bpm Maria@1: maxmag = int(numpy.ceil(960. / freqresinbpm)) # max tempo 960 bpm Maria@1: magsinds = range(minmag, maxmag) # indices of selected stft magnitudes Maria@1: Maria@1: # loop over all mel_bands and get rhythm periodicities (stft magnitudes) Maria@1: nmags = len(magsinds) Maria@1: fft2 = numpy.zeros((self.nmels, nmags, nframes)) Maria@1: for i in range(self.nmels): Maria@1: fftmags = numpy.abs(librosa.stft(y=melspectemp[i, :], win_length=win2, hop_length=hop2, n_fft=nfft2, window=scipy.signal.hamming, center=center)) Maria@1: fftmags = fftmags[magsinds, :] Maria@1: fft2[i, :, :] = fftmags Maria@1: op = fft2 Maria@1: self.op = op Maria@1: Maria@1: Maria@1: def post_process_op(self, median_filt=True): Maria@1: """Some smoothing of the onset patterns""" Maria@1: if median_filt: Maria@1: hop2 = int(round(0.5 * self.melsr)) Maria@1: ssr = self.melsr/float(hop2) Maria@1: ks = int(0.5 * ssr) # 100ms kernel size Maria@1: if ks % 2 == 0: ks += 1 # ks must be odd Maria@1: nmels, nmags, nframes = self.op.shape Maria@1: for i in range(nmels): Maria@1: for j in range(nframes): Maria@1: self.op[i, :, j] = numpy.convolve(self.op[i, :, j], numpy.ones(ks) / ks, mode='same') Maria@1: Maria@1: Maria@1: def mellin_transform(self, op=None): Maria@1: """ Maria@1: Apply mellin transform to remove tempo (scale) information. Maria@1: Code adapted from a MATLAB implementation by Andre Holzapfel. Maria@1: """ Maria@1: if self.op is None: Maria@1: self.op = op Maria@1: nmels, nmags, nframes = self.op.shape Maria@1: nmagsout = 200 Maria@1: u_max = numpy.log(nmags) Maria@1: delta_c = numpy.pi / u_max Maria@1: c_max = nmagsout Maria@1: c = numpy.arange(delta_c, c_max, delta_c) Maria@1: k = range(1, nmags) Maria@1: exponent = 0.5 - c * 1j Maria@1: Maria@1: normMat = 1. / (exponent * numpy.sqrt(2 * numpy.pi)) Maria@1: normMat.shape = (normMat.shape[0], 1) Maria@1: normMat = numpy.repeat(normMat.T, nmels, axis=0) Maria@1: kernelMat = numpy.asarray([numpy.power(ki, exponent) for ki in k]) Maria@1: opmellin = numpy.zeros((nmels, nmagsout, nframes)) Maria@1: for i in range(nframes): Maria@1: self.op[:, -1, i] = 0 Maria@1: deltaMat = - numpy.diff(self.op[:, :, i]) Maria@1: mellin = numpy.abs(numpy.dot(deltaMat, kernelMat) * normMat) Maria@1: opmellin[:, :, i] = mellin[:, :nmagsout] Maria@1: self.opmellin = opmellin Maria@1: Maria@1: Maria@1: def post_process_mellin(self, opmellin=None, normFrame=True, aveBands=False): Maria@1: """Some post processing of the scale transform""" Maria@1: if self.opmellin is None: Maria@1: self.opmellin = opmellin Maria@1: if aveBands: Maria@1: self.opmellin = numpy.mean(self.opmellin, axis=0, keepdims=True) Maria@1: nmels, nmags, nframes = self.opmellin.shape Maria@1: self.opmellin = self.opmellin.reshape((nmels*nmags, nframes)) Maria@1: if normFrame: Maria@1: min_opmellin = numpy.amin(self.opmellin, axis=0, keepdims=True) Maria@1: max_opmellin = numpy.amax(self.opmellin, axis=0, keepdims=True) Maria@1: denom = max_opmellin - min_opmellin Maria@1: denom[denom==0] = 1 # avoid division by 0 if frame is all 0s-silent Maria@1: self.opmellin = (self.opmellin - min_opmellin) / denom Maria@1: Maria@1: Maria@1: def get_scale_transform(self, filename='test.wav'): Maria@1: """Return scale transform for filename""" Maria@1: self.load_audiofile(filename=filename) Maria@1: self.mel_spectrogram() Maria@1: self.post_process_spec() Maria@1: self.onset_patterns() Maria@1: self.post_process_op() Maria@1: self.mellin_transform() Maria@1: self.post_process_mellin(aveBands=True) Maria@1: return self.opmellin Maria@1: Maria@1: Maria@1: if __name__ == '__main__': Maria@1: op = ScaleTransform() Maria@1: op.get_scale_transform()