rhythm-melody-feature-evaluation: changeset 1:c4ef4a02fc19
core functions

author      Maria Panteli
date        Mon, 01 Aug 2016 21:10:31 -0400
parents     d2ffab1a899d
children    6786a861575f
files       audio/dataset_info.rtf evaluate.py extract_features.py results.py util/classifiers.py util/pitch_bihist.py util/scale_transform.py
diffstat    7 files changed, 614 insertions(+), 0 deletions(-)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/audio/dataset_info.rtf Mon Aug 01 21:10:31 2016 -0400
@@ -0,0 +1,40 @@
+{\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170
+{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
+{\colortbl;\red255\green255\blue255;}
+\paperw11900\paperh16840\margl1440\margr1440\vieww10800\viewh8400\viewkind0
+\pard\tx566\tx1133\tx1700\tx2267\tx2834\tx3401\tx3968\tx4535\tx5102\tx5669\tx6236\tx6803\pardirnatural
+
+\f0\fs24 \cf0 A dataset of 3000 synthesised rhythms and 3000 synthesised melodies for the evaluation of audio features. The dataset was created to support the research of [1]. \expnd0\expndtw0\kerning0
+\
+\kerning1\expnd0\expndtw0 \
+You can download the dataset of synthesised rhythms and melodies from [2]. \
+If you use this dataset for research, please cite [1]. \
+\
+For any questions please contact m.x.panteli\{at\}gmail.com.\
+This code is licensed under the terms of the MIT License. \
+Copyright (c) 2016 Maria Panteli.\
+\expnd0\expndtw0\kerning0
+\
+Usage:\
+The dataset contains 3000 rhythms and 3000 melodies. It represents 30 unique rhythms and likewise 30 melodies, each transformed in 100 different ways. \
+\
+The rhythm transformations correspond to changes in Timbre, Global Tempo, Recording Quality and Local Tempo. The melody transformations correspond to changes in Timbre, Global Tempo, Recording Quality and Key. Each transformation is represented by 25 unique values; for example, there are 25 distinct timbres, 25 global tempo shifts, 25 degradations of the recording quality and 25 local tempo or key shifts. \
+\
+The file names are of the form i_j_k.wav where\
+i : transformation, 1 to 4 (Timb, GTemp, RecQ, LTemp/KeyT)\
+j : rhythm/melody family, 1 to 30\
+k : transformation value (1 to 25 shifts/degradations)\
+\
+The 30 rhythms and 30 melodies were selected from a number of western and non-western music styles. There are 6 styles for rhythm and 6 styles for melody, and each style is represented by a total of 5 rhythms or melodies. For example, rhythm families 1 to 5 are from the \'91Afro-American\'92 style and melody families 1 to 5 are from the \'91Dutch folk\'92 style. \
+\
+The dataset consists of both monotimbral and polytimbral rhythms and monophonic and polyphonic melodies. In particular, rhythm/melody families 1 to 20 are monotimbral/monophonic and rhythm/melody families 21 to 30 are polytimbral/polyphonic.\
+\
+See [1] and Metadata.csv for more details. \
+\
+\kerning1\expnd0\expndtw0 [1] M. Panteli and S. Dixon. On the Evaluation of Rhythmic and Melodic Descriptors for Music Similarity. In 
+\i Proceedings of the 17th International Society for Music Information Retrieval Conference
+\i0 , pages 468-474, 2016.\expnd0\expndtw0\kerning0
+\
+\
+\kerning1\expnd0\expndtw0 [2] Rhythms - {\field{\*\fldinst{HYPERLINK "https://archive.org/details/panteli_maria_hotmail_rhythm"}}{\fldrslt https://archive.org/details/panteli_maria_rhythm_dataset}} \
+ Melodies - {\field{\*\fldinst{HYPERLINK "https://archive.org/details/panteli_maria_hotmail_melody"}}{\fldrslt https://archive.org/details/panteli_maria_melody_dataset}}}
\ No newline at end of file
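Note: the i_j_k.wav naming scheme described above is easy to decode programmatically. The sketch below is a minimal illustration, not part of the changeset; parse_name and the TRANSFORMS spellings are hypothetical (the repository itself reads Metadata.csv instead):

import os

# hypothetical helper illustrating the i_j_k.wav naming scheme
TRANSFORMS = {1: 'Timbre', 2: 'Global Tempo', 3: 'Recording Quality', 4: 'Local Tempo / Key'}

def parse_name(path):
    i, j, k = os.path.splitext(os.path.basename(path))[0].split('_')
    return TRANSFORMS[int(i)], int(j), int(k)

print(parse_name('2_13_7.wav'))  # ('Global Tempo', 13, 7)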
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/evaluate.py Mon Aug 01 21:10:31 2016 -0400
@@ -0,0 +1,115 @@
+# -*- coding: utf-8 -*-
+"""Run classification and retrieval experiments.
+
+Created on Fri Feb 12 18:56:28 2016, @author: mariapanteli
+"""
+
+import os
+import sys
+import numpy
+import pandas
+import sklearn.metrics.pairwise as PW
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+
+sys.path.append('util')
+import classifiers as cc
+
+
+def post_process_frames(frames, pca_frames=True, n_pcas=20):
+    """Standardize and PCA data."""
+    frames = StandardScaler().fit_transform(frames.T).T  # standardise n_samples
+    if pca_frames:
+        frames = PCA(n_components=n_pcas).fit_transform(frames)
+    return frames
+
+
+def classification_experiments(features, labels, feat_labels, group_labels, nfolds=5):
+    """ classify rhythms/melodies and average accuracy by label grouping,
+        eg, average accuracy per transformation or transformation value
+    """
+    tlabels, inds = numpy.unique(group_labels, return_index=True)
+    tlabels = tlabels[numpy.argsort(inds)]
+    tlabelinds = [numpy.where(group_labels==tt)[0] for tt in tlabels]
+
+    results_classification = []
+    classifiers = ["KNN", "LDA", "NB", "SVM"]
+    for feat, feat_label in zip(features, feat_labels):
+        for cl in classifiers:
+            if cl == "KNN":
+                accuracies = cc.classifyKNN(feat, labels, kfold=nfolds)
+            elif cl == "LDA":
+                accuracies = cc.classifyLDA(feat, labels, kfold=nfolds)
+            elif cl == "NB":
+                accuracies = cc.classifyNB(feat, labels, kfold=nfolds)
+            elif cl == "SVM":
+                accuracies = cc.classifySVM(feat, labels, kfold=nfolds)
+            group_accuracy = [numpy.nanmean(accuracies[labelinds]) for labelinds in tlabelinds]
+            group_accuracy.append(numpy.mean(accuracies))
+            group_accuracy.append(cl)
+            group_accuracy.append(feat_label)
+            results_classification.append(group_accuracy)
+    return results_classification, tlabels


+def topK_experiments(features, labels, feat_labels, group_labels, K=99):
+    """ query rhythms/melodies and assess recall rate at top K,
+        average accuracy by label grouping, eg, by transformation or transformation value
+    """
+    tlabels, inds = numpy.unique(group_labels, return_index=True)
+    tlabels = tlabels[numpy.argsort(inds)]
+    tlabelinds = [numpy.where(group_labels==tt)[0] for tt in tlabels]
+
+    results_topK = []
+    dist_metrics = ["euclidean", "cosine", "correlation", "mahalanobis"]
+    for feat, feat_label in zip(features, feat_labels):
+        for metric in dist_metrics:
+            D = PW.pairwise_distances(feat, metric=metric)
+            accuracies = numpy.ones((len(labels), 1), dtype=float) * numpy.nan
+            for queryind in range(len(labels)):
+                # true matches share the query's label, excluding the query itself
+                truematchinds = numpy.where(labels == labels[queryind])[0]
+                truematchinds = set(truematchinds) - set([queryind])  # remove queryind
+                sortindex = numpy.argsort(D[queryind, :]).flatten()
+                sortindex = sortindex[1:]  # remove queryind (top of list)
+                topKinds = set(sortindex[:K])
+                correctinds = truematchinds & topKinds
+                wronginds = truematchinds - correctinds
+                accuracies[list(correctinds)] = 1
+                accuracies[list(wronginds)] = 0
+            group_accuracy = [numpy.nanmean(accuracies[labelinds]) for labelinds in tlabelinds]
+            group_accuracy.append(numpy.mean(accuracies[numpy.where(numpy.isnan(accuracies) == False)[0]]))
+            group_accuracy.append(metric)
+            group_accuracy.append(feat_label)
+            results_topK.append(group_accuracy)
+    return results_topK, tlabels


+if __name__ == '__main__':
+    # Load metadata
+    meta = pandas.read_csv(os.path.join('data', 'Metadata.csv'), sep=',')
+    labels = numpy.array(meta["family"].get_values(), dtype=str)
+
+    # Load features and post process
+    st = post_process_frames(pandas.read_csv(os.path.join('data','ST.csv'),header=None).get_values())
+    op = post_process_frames(pandas.read_csv(os.path.join('data','OP.csv'),header=None).get_values())
+    fp = post_process_frames(pandas.read_csv(os.path.join('data','FP.csv'),header=None).get_values())
+    pb = post_process_frames(pandas.read_csv(os.path.join('data','PB.csv'),header=None).get_values())
+    ig = post_process_frames(pandas.read_csv(os.path.join('data','IG.csv'),header=None).get_values())
+    fmt = post_process_frames(pandas.read_csv(os.path.join('data','FMT.csv'),header=None).get_values())
+
+    features = [st, op, fp, pb, ig, fmt]
+    feat_labels = ["ST", "OP", "FP", "PB", "IG", "FMT"]
+    test_classes = ["transformation", "value", "style", "monopoly"]
+
+    write_file = False  # set it to True if you want to write output file
+    for test_class in test_classes:
+        group_labels = meta[test_class].get_values()
+        results_class, tlabels = classification_experiments(features, labels, feat_labels, group_labels)
+        results_topK, tlabels = topK_experiments(features, labels, feat_labels, group_labels)
+        header = numpy.append(tlabels, ['mean accuracy', 'metric', 'feature'])
+        results = numpy.concatenate((header[None, :], numpy.array(results_class), numpy.array(results_topK)))
+
+        if write_file:
+            filename = os.path.join('data','results_' + test_class + '.csv')
+            numpy.savetxt(filename, results, fmt='%s', delimiter=',')
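Note: the retrieval convention in topK_experiments (true matches are the other versions of the query's family, the query itself is excluded, and K defaults to 99 because each family has 100 versions) can be checked on a toy example. The sketch below is self-contained with made-up data, not repository code:

import numpy

labels = numpy.array([0, 0, 0, 1, 1, 1])       # two families, three versions each
feats = numpy.array([[0.0], [0.1], [0.2], [5.0], [5.1], [9.0]])
D = numpy.abs(feats - feats.T)                 # pairwise distances (toy metric)
K = 2                                          # each query has 2 true matches
hits, total = 0, 0
for q in range(len(labels)):
    truematch = set(numpy.where(labels == labels[q])[0]) - set([q])
    ranked = numpy.argsort(D[q])[1:]           # drop the query itself
    hits += len(truematch & set(ranked[:K]))
    total += len(truematch)
print('recall at top %d: %.2f' % (K, hits / float(total)))  # 1.00 here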
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_features.py Mon Aug 01 21:10:31 2016 -0400
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Feb 11 19:51:24 2016
+
+@author: mariapanteli
+"""
+"""Extract scale transform and pitch bihistogram descriptors"""
+
+import os
+import numpy
+import pandas
+import sys
+sys.path.append('util')
+import pitch_bihist as pbh
+import scale_transform as stf


+def extract_pitch_bihist(filenames):
+    """ compute pitch bihistogram feature for each audio file in filenames
+    """
+    melf = []
+    for filename in filenames:
+        print filename
+        pb = pbh.PitchBihist()
+        pbihist = pb.get_pitchbihist(filename=filename)
+        average_frame = numpy.nanmean(pbihist.T, axis=0, keepdims=True)
+        melf.append(average_frame)
+    melf = numpy.concatenate(melf)
+    return melf


+def extract_scale_transform(filenames):
+    """ compute scale transform feature for each audio file in filenames
+    """
+    rhyf = []
+    for filename in filenames:
+        print filename
+        st = stf.ScaleTransform()
+        stransf = st.get_scale_transform(filename=filename)
+        average_frame = numpy.nanmean(stransf.T, axis=0, keepdims=True)
+        rhyf.append(average_frame)
+    rhyf = numpy.concatenate(rhyf)
+    return rhyf


+if __name__ == '__main__':
+    # Load metadata to get audio filenames
+    df = pandas.read_csv(os.path.join('data','Metadata.csv'))
+    filenames = df["filename"].get_values()
+
+    # Specify audio directory and append to audio filenames
+    filenames_mel = [os.path.join('audio','melodies', ff) for ff in filenames]
+    filenames_rhy = [os.path.join('audio','rhythms', ff) for ff in filenames]
+
+    pitch_bihist = extract_pitch_bihist(filenames_mel)
+    scale_transform = extract_scale_transform(filenames_rhy)
+
+    # Save .csv where rows = n_files, columns = n_features
+    write_file = False
+    if write_file:
+        numpy.savetxt(os.path.join('data', 'PB.csv'), pitch_bihist, delimiter=',', fmt='%10.12f')
+        numpy.savetxt(os.path.join('data', 'ST.csv'), scale_transform, delimiter=',', fmt='%10.12f')
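Note: each descriptor is pooled over time before it is written out: the n_features x n_frames matrix returned by the util classes is collapsed to a single average frame per file. A toy illustration of that nanmean call, with made-up numbers:

import numpy

# one file's descriptor: n_features x n_frames, as returned by the util classes
frames = numpy.array([[1.0, 3.0],
                      [2.0, numpy.nan]])
average_frame = numpy.nanmean(frames.T, axis=0, keepdims=True)
print(average_frame)   # [[ 2.  2.]] -- one row per audio file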
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/results.py Mon Aug 01 21:10:31 2016 -0400
@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jul 20 11:30:20 2016
+
+@author: mariapanteli
+"""
+
+import numpy
+import pandas
+import os
+import matplotlib.pyplot as plt
+#%matplotlib inline
+from scipy import stats
+from statsmodels.stats.multicomp import MultiComparison


+# utility functions
+def load_csv_results(testclass="transformation"):
+    datadf = pandas.read_csv(os.path.join('data', 'results_' + testclass + '.csv'), sep=',')
+    return datadf


+def data_frame_to_latex_table(results_df, rhyinds, melinds):
+    results_df = pandas.concat([results_df.iloc[:, -1], results_df.iloc[:, -2], results_df.iloc[:, 0:-2]], axis=1)
+    print results_df.iloc[rhyinds, :].to_latex(index=False)
+    print results_df.iloc[melinds, :].to_latex(index=False)


+def boxplotfigure(data, labels=None, rotate=False, xlabel='', ylabel='', figurename=None):
+    plt.figure()
+    plt.boxplot(data, 1, labels=labels)
+    plt.ylim(0,1)
+    plt.xlabel(xlabel)
+    plt.ylabel(ylabel)
+    if rotate:
+        plt.xticks(plt.xticks()[0], labels, rotation=45)
+    plt.tight_layout()
+    if figurename is not None:
+        plt.savefig(figurename, bbox_inches='tight')
+
+meta = pandas.read_csv(os.path.join('data', 'Metadata.csv'), sep=',')
+rhyfeat = ["ST", "OP", "FP"]
+melfeat = ["PB", "IG", "FMT"]
+transformations = load_csv_results("transformation")
+rhyinds = numpy.concatenate([numpy.where(transformations["feature"]==feat)[0] for feat in rhyfeat])
+melinds = numpy.concatenate([numpy.where(transformations["feature"]==feat)[0] for feat in melfeat])
+
+# print mean accuracy as Latex table
+mean_accuracy_df = transformations.iloc[:, -3:]
+data_frame_to_latex_table(mean_accuracy_df, rhyinds, melinds)
+
+# print accuracy per transformation as Latex table
+data_frame_to_latex_table(load_csv_results("transformation"), rhyinds, melinds)
+
+# print accuracy per transformation value as Latex table
+# warning: this gives a big table 48 x 100
+# data_frame_to_latex_table(load_csv_results("value"), rhyinds, melinds)
+
+# indices of classification accuracies (for boxplot results and statistical tests)
+classmodels = ["KNN", "LDA", "NB", "SVM"]
+classmodelsinds = numpy.concatenate([numpy.where(transformations["metric"]==model)[0] for model in classmodels])
+
+rhyinds = numpy.asarray(list(set(rhyinds) & set(classmodelsinds)))
+melinds = numpy.asarray(list(set(melinds) & set(classmodelsinds)))
+
+# load style data
+styledata = load_csv_results("style").iloc[:, 0:-3]
+styledatarhy = styledata.get_values()[rhyinds, :]
+styledatamel = styledata.get_values()[melinds, :]
+
+# style box plot
+rhystylelabels = ['Afro-American', 'North-Indian', 'African', 'Classical', 'EDM', 'Latin-Brazilian']
+boxplotfigure(styledatarhy, labels=rhystylelabels, rotate=True, xlabel="Music Style", ylabel="Classification Accuracy")
+melstylelabels = ['Dutch Folk', 'Classical (M)', 'Byzantine', 'Pop (M)', 'Classical (P)', 'Pop (P)']
+boxplotfigure(styledatamel, labels=melstylelabels, rotate=True, xlabel="Music Style", ylabel="Classification Accuracy")
+
+# style paired t-test
+DataRhy = numpy.asarray(numpy.reshape(styledatarhy, -1), dtype='float')
+Groups = numpy.reshape(numpy.repeat(numpy.asarray(rhystylelabels)[:, None].T, styledatarhy.shape[0], axis=0), -1)
+modrhy = MultiComparison(DataRhy, Groups)
+DataMel = numpy.asarray(numpy.reshape(styledatamel, -1), dtype='float')
+Groups = numpy.reshape(numpy.repeat(numpy.asarray(melstylelabels)[:, None].T, styledatamel.shape[0], axis=0), -1)
+modmel = MultiComparison(DataMel, Groups)
+print "multiple comparison test for each rhythm style"
+print modrhy.allpairtest(stats.ttest_rel, method='Bonf')[0]
+print "multiple comparison test for each melody style"
+print modmel.allpairtest(stats.ttest_rel, method='Bonf')[0]
+
+# load monophonic/polyphonic data
+monopolydata = numpy.asarray(load_csv_results("monopoly").get_values()[:, 0:-3], dtype=float)
+
+# mono/poly paired t-test
+print "Melody, ttest, mono/poly: "+str(stats.ttest_rel(monopolydata[melinds, 0], monopolydata[melinds, 1]))
+print "Melody, Mean, mono/poly: "+str(numpy.mean(monopolydata[melinds, :], axis=0))
+print "Melody, Std, mono/poly: "+str(numpy.std(monopolydata[melinds, :], axis=0))
+print "Rhythm, ttest, mono/poly: "+str(stats.ttest_rel(monopolydata[rhyinds, 0], monopolydata[rhyinds, 1]))
+print "Rhythm, Mean, mono/poly: "+str(numpy.mean(monopolydata[rhyinds, :], axis=0))
+print "Rhythm, Std, mono/poly: "+str(numpy.std(monopolydata[rhyinds, :], axis=0))
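Note: allpairtest runs one paired t-test per pair of styles and applies a Bonferroni correction to the p-values. A self-contained toy run with synthetic accuracies (all names below are made up, not repository data):

import numpy
from scipy import stats
from statsmodels.stats.multicomp import MultiComparison

numpy.random.seed(0)
# 8 synthetic accuracy values per style, styleC clearly higher
data = numpy.concatenate([numpy.random.normal(m, 0.05, 8) for m in (0.50, 0.55, 0.90)])
groups = numpy.repeat(['styleA', 'styleB', 'styleC'], 8)
mod = MultiComparison(data, groups)
print(mod.allpairtest(stats.ttest_rel, method='Bonf')[0])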
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/util/classifiers.py Mon Aug 01 21:10:31 2016 -0400
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Oct 5 14:59:39 2015
+
+@author: mariapanteli
+"""
+"""Selection of classifiers"""
+
+import numpy
+
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.cross_validation import cross_val_predict
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
+from sklearn import svm
+from sklearn import metrics
+
+def classifyKNN(X, Y, nn=1, kfold=5):
+    model = KNeighborsClassifier(n_neighbors=nn, metric='euclidean')
+    Y_pred = cross_val_predict(model, X, Y, cv=kfold)
+    accuracies = []
+    for i in range(len(Y_pred)):
+        accuracies.append(metrics.accuracy_score([Y[i]], [Y_pred[i]]))
+    return numpy.asarray(accuracies)
+
+def classifySVM(X, Y, kfold=5):
+    model = svm.SVC(kernel='linear')
+    Y_pred = cross_val_predict(model, X, Y, cv=kfold)
+    accuracies = []
+    for i in range(len(Y_pred)):
+        accuracies.append(metrics.accuracy_score([Y[i]], [Y_pred[i]]))
+    return numpy.asarray(accuracies)
+
+def classifyLDA(X, Y, kfold=5):
+    model = LDA(n_components=20)
+    Y_pred = cross_val_predict(model, X, Y, cv=kfold)
+    accuracies = []
+    for i in range(len(Y_pred)):
+        accuracies.append(metrics.accuracy_score([Y[i]], [Y_pred[i]]))
+    return numpy.asarray(accuracies)
+
+def classifyNB(X, Y, kfold=5):
+    model = GaussianNB()
+    Y_pred = cross_val_predict(model, X, Y, cv=kfold)
+    accuracies = []
+    for i in range(len(Y_pred)):
+        accuracies.append(metrics.accuracy_score([Y[i]], [Y_pred[i]]))
+    return numpy.asarray(accuracies)
+
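Note: each classifier returns a per-sample 0/1 accuracy vector built from cross-validated predictions rather than one score per fold; this is what lets evaluate.py average accuracies over arbitrary label groupings. A minimal usage sketch, assuming util/ is on the path and using scikit-learn's bundled iris data:

import sys
sys.path.append('util')
from sklearn.datasets import load_iris
import classifiers as cc

iris = load_iris()
accuracies = cc.classifyKNN(iris.data, iris.target, kfold=5)
print(accuracies.shape)    # (150,): one 0/1 entry per sample
print(accuracies.mean())   # overall cross-validated accuracy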
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/util/pitch_bihist.py Mon Aug 01 21:10:31 2016 -0400
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb 2 22:26:10 2016
+
+@author: mariapanteli
+"""
+"""Pitch bihistogram descriptor from chroma representation"""
+
+import numpy
+import scipy.signal
+
+import smoothiecore as s
+
+class PitchBihist:
+    def __init__(self):
+        self.y = None
+        self.sr = None
+        self.chroma = None
+        self.chromasr = None
+        self.bihist = None
+
+    def bihist_from_chroma(self, filename='test.wav'):
+        self.chroma, self.chromasr = s.get_chroma(filename=filename)
+
+        # second frame decomposition
+        win2 = int(round(8 * self.chromasr))  # default win2 size 8 sec.
+        hop2 = int(round(0.5 * self.chromasr))  # default hop2 size 0.5 sec.
+        n_bins, n_chroma_frames = self.chroma.shape
+        n_frames = max(1, int(1 + numpy.floor((n_chroma_frames-win2) / float(hop2))))
+        bihistframes = numpy.empty((n_bins * n_bins, n_frames))
+
+        # loop over all 8-sec frames
+        for i in range(n_frames):
+            frame = self.chroma[:, (i * hop2):min((i * hop2 + win2), n_chroma_frames)]
+            bihist = self.bihistogram(frame)
+            bihist = numpy.reshape(bihist, -1)
+            bihistframes[:, i] = bihist
+        self.bihist = bihistframes
+
+    def bihistogram(self, spec, winsec=0.5, align=True):
+        win = int(round(winsec * self.chromasr))
+        ker = numpy.concatenate([numpy.zeros((win, 1)), numpy.ones((win+1, 1))], axis=0)
+        spec = spec.T  # transpose to have frames as rows in convolution
+
+        # energy threshold
+        thr = 0.3 * numpy.max(spec)
+        spec[spec < max(thr, 0)] = 0
+
+        # transitions via convolution
+        tra = scipy.signal.convolve2d(spec, ker, mode='same')
+        tra[spec > 0] = 0
+
+        # multiply with original
+        B = numpy.dot(tra.T, spec)
+
+        # normalize
+        mxB = numpy.max(B)
+        mnB = numpy.min(B)
+        if mxB != mnB:  # avoid division by 0
+            B = (B - mnB) / float(mxB-mnB)
+
+        # circularly shift to highest magnitude
+        if align:
+            ref = numpy.argmax(numpy.sum(spec, axis=0))
+            B = numpy.roll(B, -ref, axis=0)
+            B = numpy.roll(B, -ref, axis=1)
+        return B
+
+    def get_pitchbihist(self, filename='test.wav'):
+        self.bihist_from_chroma(filename=filename)
+        return self.bihist


+if __name__ == '__main__':
+    pb = PitchBihist()
+    pb.get_pitchbihist()
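Note: the core of bihistogram can be checked on synthetic chroma without any audio: B[p, q] is large when pitch class p stops sounding within winsec before pitch class q sounds. A standalone sketch replicating the convolution trick above with made-up data (threshold and normalisation steps omitted):

import numpy
import scipy.signal

# synthetic chroma: pitch class 0 sounds for 2 s, then pitch class 7;
# expect a single 0 -> 7 transition
chromasr = 10.0                         # pretend 10 chroma frames per second
spec = numpy.zeros((40, 12))            # frames x pitch classes
spec[:20, 0] = 1.0
spec[20:, 7] = 1.0
win = int(round(0.5 * chromasr))
ker = numpy.concatenate([numpy.zeros((win, 1)), numpy.ones((win + 1, 1))])
tra = scipy.signal.convolve2d(spec, ker, mode='same')  # energy in the recent past
tra[spec > 0] = 0                       # keep only bins that just stopped sounding
B = numpy.dot(tra.T, spec)              # B[p, q]: p recently off while q is on
print(numpy.unravel_index(numpy.argmax(B), B.shape))   # (0, 7)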
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/util/scale_transform.py Mon Aug 01 21:10:31 2016 -0400
@@ -0,0 +1,174 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Aug 17 13:37:00 2015
+
+@author: mariapanteli
+"""
+"""Scale transform descriptor from mel spectrogram"""
+
+import numpy
+import librosa
+import scipy.signal
+
+class ScaleTransform:
+    def __init__(self):
+        self.y = None
+        self.sr = None
+        self.nmels = 40
+        self.melspec = None
+        self.melsr = None
+        self.op = None
+        self.opmellin = None
+
+    def load_audiofile(self, filename='test.wav', sr=None):
+        """Load audio"""
+        self.y, self.sr = librosa.load(filename, sr=sr)
+
+    def mel_spectrogram(self, y=None, sr=None):
+        """Get mel spectrogram"""
+        if self.y is None:
+            self.y = y
+        if self.sr is None:
+            self.sr = sr
+        win1 = int(round(0.04 * self.sr))
+        hop1 = int(round(win1 / 8.))
+        nfft1 = int(2 ** numpy.ceil(numpy.log2(win1)))
+        D = numpy.abs(librosa.stft(self.y, n_fft=nfft1, hop_length=hop1, win_length=win1, window=scipy.signal.hamming)) ** 2
+        melspec = librosa.feature.melspectrogram(S=D, sr=self.sr, n_mels=self.nmels, fmax=8000)
+        melsr = self.sr/float(hop1)
+        self.melspec = melspec
+        self.melsr = melsr
+
+    def post_process_spec(self, melspec=None, medianfilt=True, sqrt=True, diff=True, subtractmean=True, halfwave=True, maxnormal=True):
+        """Some post processing of the mel spectrogram"""
+        if self.melspec is None:
+            self.melspec = melspec
+        if medianfilt:
+            ks = int(0.1 * self.melsr)  # 100ms kernel size
+            if ks % 2 == 0: ks += 1  # ks must be odd
+            for i in range(self.nmels):
+                self.melspec[i, :] = scipy.signal.medfilt(self.melspec[i, :], kernel_size=ks)
+        if sqrt:
+            self.melspec = self.melspec ** .5
+        if diff:
+            # append one frame before diff to keep number of frames the same
+            self.melspec = numpy.concatenate((self.melspec, self.melspec[:, -1, None]), axis=1)
+            self.melspec = numpy.diff(self.melspec, n=1, axis=1)
+        if subtractmean:
+            mean = self.melspec.mean(axis=1)
+            mean.shape = (mean.shape[0], 1)
+            self.melspec = self.melspec - mean
+        if halfwave:
+            self.melspec[numpy.where(self.melspec < 0)] = 0
+        if maxnormal:
+            self.melspec = self.melspec / self.melspec.max()
+
+    def onset_patterns(self, melspec=None, melsr=None, center=False):
+        """Get rhythm periodicities by applying stft in each mel band"""
+        if self.melspec is None:
+            self.melspec = melspec
+        if self.melsr is None:
+            self.melsr = melsr
+        win2 = int(round(8 * self.melsr))
+        hop2 = int(round(0.5 * self.melsr))
+        nfft2 = int(2**numpy.ceil(numpy.log2(win2)))
+
+        # some preprocessing for the second frame decomposition
+        melspectemp = self.melspec
+        if melspectemp.shape[1] < nfft2:
+            # if buffer too short pad with zeros to have at least one 8-sec window
+            nzeros = nfft2 - melspectemp.shape[1]
+            melspectemp = numpy.concatenate([numpy.zeros((self.nmels, int(numpy.ceil(nzeros / 2.)))), melspectemp, numpy.zeros((self.nmels, int(numpy.ceil(nzeros / 2.))))], axis=1)
+        temp = numpy.abs(librosa.stft(y=melspectemp[0, :], win_length=win2, hop_length=hop2, n_fft=nfft2, window=scipy.signal.hamming, center=center))
+        nframes = temp.shape[1]
+
+        # filter periodicities in the range 30-960 bpm
+        freqresinbpm = float(self.melsr) / float(nfft2/2.)*60.
+        minmag = int(numpy.floor(30. / freqresinbpm))  # min tempo 30 bpm
+        maxmag = int(numpy.ceil(960. / freqresinbpm))  # max tempo 960 bpm
+        magsinds = range(minmag, maxmag)  # indices of selected stft magnitudes
+
+        # loop over all mel bands and get rhythm periodicities (stft magnitudes)
+        nmags = len(magsinds)
+        fft2 = numpy.zeros((self.nmels, nmags, nframes))
+        for i in range(self.nmels):
+            fftmags = numpy.abs(librosa.stft(y=melspectemp[i, :], win_length=win2, hop_length=hop2, n_fft=nfft2, window=scipy.signal.hamming, center=center))
+            fftmags = fftmags[magsinds, :]
+            fft2[i, :, :] = fftmags
+        op = fft2
+        self.op = op


+    def post_process_op(self, median_filt=True):
+        """Some smoothing of the onset patterns"""
+        if median_filt:
+            hop2 = int(round(0.5 * self.melsr))
+            ssr = self.melsr/float(hop2)
+            ks = int(0.5 * ssr)  # smoothing kernel size (samples)
+            if ks % 2 == 0: ks += 1  # ks must be odd
+            nmels, nmags, nframes = self.op.shape
+            for i in range(nmels):
+                for j in range(nframes):
+                    self.op[i, :, j] = numpy.convolve(self.op[i, :, j], numpy.ones(ks) / ks, mode='same')


+    def mellin_transform(self, op=None):
+        """
+        Apply Mellin transform to remove tempo (scale) information.
+        Code adapted from a MATLAB implementation by Andre Holzapfel.
+        """
+        if self.op is None:
+            self.op = op
+        nmels, nmags, nframes = self.op.shape
+        nmagsout = 200
+        u_max = numpy.log(nmags)
+        delta_c = numpy.pi / u_max
+        c_max = nmagsout
+        c = numpy.arange(delta_c, c_max, delta_c)
+        k = range(1, nmags)
+        exponent = 0.5 - c * 1j
+
+        normMat = 1. / (exponent * numpy.sqrt(2 * numpy.pi))
+        normMat.shape = (normMat.shape[0], 1)
+        normMat = numpy.repeat(normMat.T, nmels, axis=0)
+        kernelMat = numpy.asarray([numpy.power(ki, exponent) for ki in k])
+        opmellin = numpy.zeros((nmels, nmagsout, nframes))
+        for i in range(nframes):
+            self.op[:, -1, i] = 0
+            deltaMat = - numpy.diff(self.op[:, :, i])
+            mellin = numpy.abs(numpy.dot(deltaMat, kernelMat) * normMat)
+            opmellin[:, :, i] = mellin[:, :nmagsout]
+        self.opmellin = opmellin


+    def post_process_mellin(self, opmellin=None, normFrame=True, aveBands=False):
+        """Some post processing of the scale transform"""
+        if self.opmellin is None:
+            self.opmellin = opmellin
+        if aveBands:
+            self.opmellin = numpy.mean(self.opmellin, axis=0, keepdims=True)
+        nmels, nmags, nframes = self.opmellin.shape
+        self.opmellin = self.opmellin.reshape((nmels*nmags, nframes))
+        if normFrame:
+            min_opmellin = numpy.amin(self.opmellin, axis=0, keepdims=True)
+            max_opmellin = numpy.amax(self.opmellin, axis=0, keepdims=True)
+            denom = max_opmellin - min_opmellin
+            denom[denom==0] = 1  # avoid division by 0 if frame is all 0s-silent
+            self.opmellin = (self.opmellin - min_opmellin) / denom


+    def get_scale_transform(self, filename='test.wav'):
+        """Return scale transform for filename"""
+        self.load_audiofile(filename=filename)
+        self.mel_spectrogram()
+        self.post_process_spec()
+        self.onset_patterns()
+        self.post_process_op()
+        self.mellin_transform()
+        self.post_process_mellin(aveBands=True)
+        return self.opmellin


+if __name__ == '__main__':
+    op = ScaleTransform()
+    op.get_scale_transform()
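Note: the point of the Mellin step is that the magnitude of the scale transform (exponent 0.5 - jc) is invariant to time scaling, i.e. to tempo changes in the onset patterns. A standalone numerical check using the same kernel and normalisation as mellin_transform, on two synthetic periodicity curves that differ only by a factor-2 tempo change (all names are local to this sketch):

import numpy

def mellin_mag(x):
    # discrete scale transform magnitude, mirroring ScaleTransform.mellin_transform
    n = len(x)
    c = numpy.arange(0.1, 20.0, 0.1)
    exponent = 0.5 - 1j * c
    k = numpy.arange(1, n)
    kernel = numpy.power(k[:, None], exponent[None, :])
    x = x.copy()
    x[-1] = 0
    delta = -numpy.diff(x)
    return numpy.abs(numpy.dot(delta, kernel) / (exponent * numpy.sqrt(2 * numpy.pi)))

t = numpy.arange(512, dtype=float)
x1 = numpy.exp(-0.5 * ((t - 100) / 10.) ** 2)   # periodicity peak at bin 100
x2 = numpy.exp(-0.5 * ((t - 200) / 20.) ** 2)   # same peak, tempo halved
print(numpy.corrcoef(mellin_mag(x1), mellin_mag(x2))[0, 1])  # close to 1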