Mercurial > hg > plosone_underreview

--- a/notebooks/explain_components.ipynb	Thu Sep 14 10:16:59 2017 +0100
+++ b/notebooks/explain_components.ipynb	Thu Sep 14 13:07:19 2017 +0100
@@ -32,7 +32,9 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -64,6 +66,7 @@
    "cell_type": "code",
    "execution_count": 14,
    "metadata": {
+    "collapsed": false,
     "scrolled": false
    },
    "outputs": [
@@ -230,7 +233,9 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -273,7 +278,9 @@
   {
    "cell_type": "code",
    "execution_count": 7,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -291,7 +298,9 @@
   {
    "cell_type": "code",
    "execution_count": 8,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "name": "stderr",
@@ -369,7 +378,9 @@
   {
    "cell_type": "code",
    "execution_count": 12,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -425,7 +436,9 @@
   {
    "cell_type": "code",
    "execution_count": 26,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -457,7 +470,9 @@
   {
    "cell_type": "code",
    "execution_count": 31,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -490,7 +505,9 @@
   {
    "cell_type": "code",
    "execution_count": 33,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -533,7 +550,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython2",
-   "version": "2.7.12"
+   "version": "2.7.11"
   }
  },
  "nbformat": 4,
--- a/scripts/load_features.py	Thu Sep 14 10:16:59 2017 +0100
+++ b/scripts/load_features.py	Thu Sep 14 13:07:19 2017 +0100
@@ -34,7 +34,7 @@
         print 'extracting onset patterns and mfccs...'
         songframes = pd.read_csv(melspec_file, engine="c", header=None)
         songframes.iloc[np.where(np.isnan(songframes))] = 0
-	n_stop = np.int(np.ceil(stop_sec * self.framessr))
+        n_stop = np.int(np.ceil(stop_sec * self.framessr))
         songframes = songframes.iloc[0:min(len(songframes), n_stop), :]
         melspec = songframes.get_values().T
         op = self.get_op_from_melspec(melspec, K=2)
@@ -54,7 +54,7 @@
         songframes = pd.read_csv(chroma_file, engine="c", header=None)
         songframes.iloc[np.where(np.isnan(songframes))] = 0
         n_stop = np.int(np.ceil(stop_sec * self.framessr))
-	songframes = songframes.iloc[0:min(len(songframes), n_stop), :]
+        songframes = songframes.iloc[0:min(len(songframes), n_stop), :]
         chroma = songframes.get_values().T
         ch = self.get_ave_chroma(chroma)
         if scale:
@@ -103,7 +103,7 @@
         return music_idx


-    def get_features(self, df, stop_sec=30.0, class_label='Country'):
+    def get_features(self, df, stop_sec=30.0, class_label='Country', precomp_melody=False):
         oplist = []
         mflist = []
         chlist = []
@@ -122,14 +122,16 @@
             try:
                 op, mfcc = self.get_op_mfcc_for_file(df['Melspec'].iloc[i], stop_sec=stop_sec)
                 ch = self.get_chroma_for_file(df['Chroma'].iloc[i], stop_sec=stop_sec)
-                #pb = self.get_pb_from_melodia(df['Melodia'].iloc[i], stop_sec=stop_sec)
-                pb = self.load_precomputed_pb_from_melodia(df['Melodia'].iloc[i], stop_sec=stop_sec)
-                #pb = self.get_contour_feat_from_melodia(df['Melodia'].iloc[i])
+                pb = self.get_pb_for_file(df['Melodia'].iloc[i], precomp_melody=precomp_melody, stop_sec=stop_sec)
+                #if precomp_melody:
+                #    pb = self.load_precomputed_pb_from_melodia(df['Melodia'].iloc[i], stop_sec=stop_sec)
+                #else:
+                #    pb = self.get_pb_from_melodia(df['Melodia'].iloc[i], stop_sec=stop_sec)
             except:
                 continue
             n_stop = np.int(np.ceil(stop_sec * self.framessr2))
-	    print n_stop, len(op), len(mfcc), len(ch), len(pb)
-	    min_n_frames = np.min([n_stop, len(op), len(mfcc), len(ch), len(pb)])  # ideally, features should have the same number of frames
+            print n_stop, len(op), len(mfcc), len(ch), len(pb)
+            min_n_frames = np.min([n_stop, len(op), len(mfcc), len(ch), len(pb)])  # ideally, features should have the same number of frames
             if min_n_frames==0:
                 # no features extracted -> skip this file
                 continue
@@ -148,9 +150,10 @@
     def get_op_from_melspec(self, melspec, K=None):
         op = opm.OPMellin(win2sec=self.win2sec)
         opmellin = op.get_opmellin_from_melspec(melspec=melspec, melsr=self.framessr)
+        opmel = pd.DataFrame(opmellin.T)
         if K is not None:
             opmel =  self.mean_K_bands(opmellin.T, K)
-        opmel = pd.DataFrame(opmel)
+            opmel = pd.DataFrame(opmel)
         return opmel


@@ -169,10 +172,10 @@
         return mfcc


-    def get_ave_chroma(self, chroma, avelocalframes=True, stdlocalframes=True, alignchroma=True):
+    def get_ave_chroma(self, chroma, alignchroma=True, avelocalframes=True, stdlocalframes=True):
         chroma[np.where(np.isnan(chroma))] = 0
         if alignchroma:
-            maxind = np.argmax(np.sum(chroma, axis=1))
+            maxind = np.argmax(np.sum(chroma, axis=1))  # bin with max magnitude across time
             chroma = np.roll(chroma, -maxind, axis=0)
         if avelocalframes:
             chroma = self.average_local_frames(chroma, getstd=stdlocalframes)
@@ -229,13 +232,15 @@
         return newframes


-    def get_pb_from_melodia(self, melodia_file=None, nmfpb=True, scale=True, stop_sec=30.0):
-        pb = []
-        if not os.path.exists(melodia_file):
-            return pb
-        print 'extracting pitch bihist from melodia...'
-        pb = pbi.PitchBihist(win2sec=self.win2sec)
-        pbihist = pb.bihist_from_melodia(filename=melodia_file, stop_sec=stop_sec)
+    def get_pb_for_file(self, melodia_file, precomp_melody=False, nmfpb=True, scale=True, stop_sec=30.0):
+        pbihist = []
+        if precomp_melody:
+            pbihist = self.load_precomp_pb_from_melodia(melodia_file=melodia_file, stop_sec=stop_sec)
+        else:
+            pbihist = self.extract_pb_from_melodia(melodia_file=melodia_file, stop_sec=stop_sec)
+        if len(pbihist) == 0:
+            # no file was found
+            return pbihist
         if nmfpb is True:
             pbihist = self.nmfpitchbihist(pbihist)
         pbihist = pd.DataFrame(pbihist.T)
@@ -245,7 +250,18 @@
         return pbihist


-    def load_precomputed_pb_from_melodia(self, melodia_file=None, nmfpb=True, scale=True, stop_sec=30.0):
+    def extract_pb_from_melodia(self, melodia_file=None, stop_sec=30.0):
+        """Compute the pitch bihistogram for melodia_file; return [] when that path does not exist."""
+        if not os.path.exists(melodia_file):
+            return []
+        print 'extracting pitch bihist from melodia...'
+        extractor = pbi.PitchBihist(win2sec=self.win2sec)
+        # stop_sec is forwarded unchanged to the extractor
+        return extractor.bihist_from_melodia(filename=melodia_file, stop_sec=stop_sec)
+
+
+    def load_precomp_pb_from_melodia(self, melodia_file=None, stop_sec=30.0):
+        """Load the precomputed pitch bihistogram for melodia_file, keeping at most ceil(stop_sec * framessr2) columns."""
         base = os.path.basename(melodia_file)
         root = '/import/c4dm-05/mariap/Melodia-melody-'+str(int(self.win2sec))+'sec/'
         root_BL = '/import/c4dm-04/mariap/FeatureCsvs_BL_old/PB-melodia/'
@@ -254,21 +270,53 @@
             root = root_SM
         else:
             root = root_BL
-	    base = base.split('_')[-1].split('.csv')[0]+'_vamp_mtg-melodia_melodia_melody.csv'
+        base = base.split('_')[-1].split('.csv')[0]+'_vamp_mtg-melodia_melodia_melody.csv'
         print 'load precomputed pitch bihist', root
-        #if self.win2sec == 8:
-        #    pbihist = pd.read_csv(os.path.join(root, base))
-        #else:
-	if 1:
-            pbihist = np.loadtxt(os.path.join(root, base), delimiter=',').T
-            if nmfpb is True:
-                pbihist = self.nmfpitchbihist(pbihist)
-            pbihist = pd.DataFrame(pbihist.T)
-	n_stop = np.int(np.ceil(stop_sec * self.framessr2))
-	pbihist = pbihist.iloc[:np.min([pbihist.shape[0], n_stop]), :]
-        print pbihist.shape
-        if scale:
-            # scale all frames by mean and std of recording
-            pbihist = (pbihist - np.nanmean(pbihist)) / np.nanstd(pbihist)
+        pbihist = np.loadtxt(os.path.join(root, base), delimiter=',').T
+        n_stop = np.int(np.ceil(stop_sec * self.framessr2))
+        pbihist = pbihist[:, :np.min([pbihist.shape[1], n_stop])]  # the slice runs along axis 1, so clamp to shape[1], not shape[0]
         return pbihist
+
+
+    # def get_pb_from_melodia(self, melodia_file=None, nmfpb=True, scale=True, stop_sec=30.0):
+    #     if not os.path.exists(melodia_file):
+    #         return []
+    #     print 'extracting pitch bihist from melodia...'
+    #     pb = pbi.PitchBihist(win2sec=self.win2sec)
+    #     pbihist = pb.bihist_from_melodia(filename=melodia_file, stop_sec=stop_sec)
+    #     if nmfpb is True:
+    #         pbihist = self.nmfpitchbihist(pbihist)
+    #     pbihist = pd.DataFrame(pbihist.T)
+    #     if scale:
+    #         # scale all frames by mean and std of recording
+    #         pbihist = (pbihist - np.nanmean(pbihist)) / np.nanstd(pbihist)
+    #     return pbihist
+
+
+    # def load_precomputed_pb_from_melodia(self, melodia_file=None, nmfpb=True, scale=True, stop_sec=30.0):
+    #     base = os.path.basename(melodia_file)
+    #     root = '/import/c4dm-05/mariap/Melodia-melody-'+str(int(self.win2sec))+'sec/'
+    #     root_BL = '/import/c4dm-04/mariap/FeatureCsvs_BL_old/PB-melodia/'
+    #     root_SM = '/import/c4dm-04/mariap/FeatureCsvs/PB-melodia/'
+    #     if 'SampleAudio' in base:
+    #         root = root_SM
+    #     else:
+    #         root = root_BL
+    #     base = base.split('_')[-1].split('.csv')[0]+'_vamp_mtg-melodia_melodia_melody.csv'
+    #     print 'load precomputed pitch bihist', root
+    #     #if self.win2sec == 8:
+    #     #    pbihist = pd.read_csv(os.path.join(root, base))
+    #     #else:
+    #     if 1:
+    #         pbihist = np.loadtxt(os.path.join(root, base), delimiter=',').T
+    #         if nmfpb is True:
+    #             pbihist = self.nmfpitchbihist(pbihist)
+    #         pbihist = pd.DataFrame(pbihist.T)
+    #     n_stop = np.int(np.ceil(stop_sec * self.framessr2))
+    #     pbihist = pbihist.iloc[:np.min([pbihist.shape[0], n_stop]), :]
+    #     print pbihist.shape
+    #     if scale:
+    #         # scale all frames by mean and std of recording
+    #         pbihist = (pbihist - np.nanmean(pbihist)) / np.nanstd(pbihist)
+    #     return pbihist
--- a/scripts/results.py	Thu Sep 14 10:16:59 2017 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,131 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Tue Jul 12 20:49:48 2016
-
-@author: mariapanteli
-"""
-
-import numpy as np
-import pandas as pd
-import pickle
-from collections import Counter
-from sklearn.cluster import KMeans
-
-import utils
-import utils_spatial
-
-
-def country_outlier_df(counts, labels, out_file=None, normalize=False):
-    if len(counts.keys()) < len(np.unique(labels)):
-        for label in np.unique(labels):
-            if not counts.has_key(label):
-                counts.update({label:0})
-    if normalize is True:
-        counts = normalize_outlier_counts(counts, Counter(labels))
-    df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
-    df.rename(columns={'index':'Country', 0:'Outliers'}, inplace=True)
-    if out_file is not None:
-        df.to_csv(out_file, index=False)
-    return df
-
-
-def normalize_outlier_counts(outlier_counts, country_counts):
-    '''Normalize a dictionary of outlier counts per country by
-        the total number of recordings per country
-    '''
-    for key in outlier_counts.keys():
-        # dictionaries should have the same keys
-        outlier_counts[key] = float(outlier_counts[key]) / float(country_counts[key])
-    return outlier_counts
-
-
-def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
-    threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
-    global_counts = Counter(Y[y_pred])
-    df = country_outlier_df(global_counts, Y, normalize=True)
-    if out_file is not None:
-        df.to_csv(out_file, index=False)
-    return df, threshold, MD
-
-
-def print_most_least_outliers_topN(df, N=10):
-    sort_inds = df['Outliers'].argsort()  # ascending order
-    df_most = df[['Country', 'Outliers']].iloc[sort_inds[::-1][:N]]
-    df_least = df[['Country', 'Outliers']].iloc[sort_inds[:N]]
-    print "most outliers "
-    print df_most
-    print "least outliers "
-    print df_least
-
-
-def load_metadata(Yaudio, metadata_file):
-    df = pd.read_csv(metadata_file)
-    df_audio = pd.DataFrame({'Audio':Yaudio})
-    ddf = pd.merge(df_audio, df, on='Audio', suffixes=['', '_r']) # in the order of Yaudio
-    return ddf
-
-
-def clusters_metadata(df, cl_pred, out_file=None):
-    def get_top_N_counts(labels, N=3):
-        ulab, ucount = np.unique(labels, return_counts=True)
-        inds = np.argsort(ucount)
-        return zip(ulab[inds[-N:]],ucount[inds[-N:]])
-    info = np.array([str(df['Country'].iloc[i]) for i in range(len(df))])
-    styles_description = []
-    uniq_cl = np.unique(cl_pred)
-    for ccl in uniq_cl:
-        inds = np.where(cl_pred==ccl)[0]
-        styles_description.append(get_top_N_counts(info[inds], N=3))
-    df_styles = pd.DataFrame(data=styles_description, index=uniq_cl)
-    print df_styles.to_latex()
-    if out_file is not None:
-        df_styles.to_csv(out_file, index=False)
-
-
-if __name__ == '__main__':
-    # load LDA-transformed frames
-    X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle','rb'))
-    ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv')
-    w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
-    w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
-    X = np.concatenate(X_list, axis=1)
-
-    # global outliers
-    df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
-    print_most_least_outliers_topN(df_global, N=10)
-
-    spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
-    spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))
-    df_local = country_outlier_df(spatial_counts, Y, normalize=True)
-    print_most_least_outliers_topN(df_local, N=10)
-
-    feat = [Xrhy, Xmel, Xmfc, Xchr]
-    feat_labels = ['rhy', 'mel', 'mfc', 'chr']
-    tabs_feat = []
-    for i in range(len(feat)):
-        XX = feat[i]
-        df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999)
-        print_most_least_outliers_topN(df_feat, N=5)
-
-    # how many styles are there
-    #bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
-    bestncl = 13
-
-    # get cluster predictions and metadata for each cluster
-    cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
-    centroids = cluster_model.cluster_centers_
-    cl_pred = cluster_model.predict(X)
-    ddf['Clusters'] = cl_pred
-    clusters_metadata(ddf, cl_pred)
-
-    # how similar are the cultures and which ones seem to be global outliers
-    cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)
-
-    # Moran on Mahalanobis distances
-    data = cluster_freq.get_values()
-    data_countries = cluster_freq.index
-    #threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999)
-    threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999)
-    y = np.sqrt(MD)
-    utils_spatial.print_Moran_outliers(y, w, data_countries)
-    utils_spatial.plot_Moran_scatterplot(y, w, data_countries)
--- a/scripts/results_classification.py	Thu Sep 14 10:16:59 2017 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,87 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Nov 10 15:10:32 2016
-
-@author: mariapanteli
-"""
-import numpy as np
-import pandas as pd
-from sklearn import metrics
-
-import map_and_average
-import util_feature_learning
-
-
-FILENAMES = map_and_average.OUTPUT_FILES
-
-
-def load_data_from_pickle(filename):
-    X_list, Y, Yaudio = pickle.load(open(filename,'rb'))
-    X = np.concatenate(data_list, axis=1)
-    return X, Y, Yaudio
-
-
-def get_train_test_indices():
-    trainset, valset, testset = map_and_average.load_train_val_test_sets()
-    trainaudiolabels, testaudiolabels = trainset[2], testset[2]
-    # train, test indices
-    aa_train = np.unique(trainaudiolabels)
-    aa_test = np.unique(testaudiolabels)
-    traininds = np.array([i for i, item in enumerate(audiolabs) if item in aa_train])
-    testinds = np.array([i for i, item in enumerate(audiolabs) if item in aa_test])
-    return traininds, testinds
-
-
-def get_train_test_sets(X, Y, traininds, testinds):
-    X_train = X[traininds, :]
-    Y_train = Y[traininds]
-    X_test = X[testinds, :]
-    Y_test = Y[testinds]
-    return X_train, Y_train, X_test, Y_test
-
-
-def classify_for_filenames(file_list=FILENAMES):
-    df_results = pd.DataFrame()
-    feat_learner = util_feature_learning.Transformer()
-    for filename in file_list:
-        X, Y, Yaudio = load_data_from_pickle(filename)
-        traininds, testinds = get_train_test_indices()
-        X_train, Y_train, X_test, Y_test = get_train_test_sets(X, Y, traininds, testinds)
-        df_result = feat_learner.classify(X_train, Y_train, X_test, Y_test)
-        df_results = pd.concat([df_results, df_result], axis=0, ignore_index=True)
-    return df_results
-
-
-def plot_CF(CF, labels=None, figurename=None):
-    labels[labels=='United States of America'] = 'United States Amer.'
-    plt.imshow(CF, cmap="Greys")
-    plt.xticks(range(len(labels)), labels, rotation='vertical', fontsize=4)
-    plt.yticks(range(len(labels)), labels, fontsize=4)
-    if figurename is not None:
-        plt.savefig(figurename, bbox_inches='tight')
-
-
-def confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False):
-    feat_learner = util_feature_learning.Transformer()
-    accuracy, predictions = util_feature_learning.classification_accuracy(X_train, Y_train,
-                        X_test, Y_test, model=util_feature_learning.modelLDA)
-    labels = np.unique(Y_test)  # TODO: countries in geographical proximity
-    CF = metrics.confusion_matrix(Y_test, predictions, labels=labels)
-    if saveCF:
-        np.savetxt('data/CFlabels.csv', labels, fmt='%s')
-        np.savetxt('data/CF.csv', CF, fmt='%10.5f')
-    if plots:
-        plot_CF(CF, labels=labels, figurename='data/conf_matrix.pdf')
-    return accuracy, predictions
-
-
-if __name__ == '__main__':
-    df_results = classify_for_filenames(file_list=FILENAMES)
-    max_i = np.argmax(df_results[:, 1])
-    feat_learning_i = max_i % 4  # 4 classifiers for each feature learning method
-    filename = FILENAMES[feat_learning_i]
-    X, Y, Yaudio = load_data_from_pickle(filename)
-    traininds, testinds = get_train_test_indices()
-    X_train, Y_train, X_test, Y_test = get_train_test_sets(X, Y, traininds, testinds)
-    confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=True, plots=True)
-
--- a/tests/test_load_features.py	Thu Sep 14 10:16:59 2017 +0100
+++ b/tests/test_load_features.py	Thu Sep 14 13:07:19 2017 +0100
@@ -7,8 +7,9 @@

 import pytest

+import os
 import numpy as np
-
+import pandas as pd
 import scripts.load_features as load_features

 feat_loader = load_features.FeatureLoader(win2sec=8)
@@ -95,3 +96,164 @@
     aveframes_true = np.array([[0.5, 0], [1.5, 1], [0.1, 0], [0.1, 0]])
     # test only the second frame which contains values 0 or values 1 for all 8-second frame entries
     assert np.array_equal(aveframes[:, 1], aveframes_true[:, 1])
+
+
+def test_get_op_from_melspec_n_frames():
+    """A 10-second random mel spectrogram should give 4 onset-pattern frames."""
+    n_cols = np.int(np.round(10.0 * feat_loader.framessr))
+    np.random.seed(1)
+    noise = np.random.randn(40, n_cols)  # 40 mel bands
+    spec = noise - np.min(noise)  # shift so that no value is negative
+    onset_patterns = feat_loader.get_op_from_melspec(spec)
+    # expect 4 frames for windows not centered and .5 sec hop size
+    # np.round((dur_sec - feat_loader.win2sec) * feat_loader.framessr2)
+    # where dur_sec is the 10.0 used above; frames are the rows of the result
+    assert onset_patterns.shape[0] == 4
+
+
+def test_get_op_from_melspec_n_bins():
+    """Without K, the result should have 40 * 200 columns for 40 mel bands."""
+    n_bands = 40
+    n_cols = np.int(np.round(10.0 * feat_loader.framessr))
+    np.random.seed(1)
+    noise = np.random.randn(n_bands, n_cols)
+    spec = noise - np.min(noise)  # shift so that no value is negative
+    onset_patterns = feat_loader.get_op_from_melspec(spec)
+    assert onset_patterns.shape[1] == n_bands * 200
+
+
+def test_get_op_from_melspec_K_bands():
+    """With K=2, the result should have K * 200 columns."""
+    n_cols = np.int(np.round(10.0 * feat_loader.framessr))
+    np.random.seed(1)
+    noise = np.random.randn(40, n_cols)  # 40 mel bands
+    spec = noise - np.min(noise)  # shift so that no value is negative
+    n_groups = 2
+    onset_patterns = feat_loader.get_op_from_melspec(spec, K=n_groups)
+    # the columns of the returned table are counted as bins
+    assert onset_patterns.shape[1] == n_groups * 200
+
+
+def test_get_mfcc_from_melspec_n_coef():
+    """Without deltas or local averaging, the MFCC output should have 20 columns."""
+    n_cols = np.int(np.round(10.0 * feat_loader.framessr))
+    np.random.seed(1)
+    noise = np.random.randn(40, n_cols)  # 40 mel bands
+    spec = noise - np.min(noise)  # shift so that no value is negative
+    coefs = feat_loader.get_mfcc_from_melspec(spec, deltamfcc=False, avelocalframes=False)
+    assert coefs.shape[1] == 20
+
+
+def test_get_mfcc_from_melspec_n_coef_delta():
+    """With deltas and no local averaging, the MFCC output should have 40 columns."""
+    n_cols = np.int(np.round(10.0 * feat_loader.framessr))
+    np.random.seed(1)
+    noise = np.random.randn(40, n_cols)  # 40 mel bands
+    spec = noise - np.min(noise)  # shift so that no value is negative
+    coefs = feat_loader.get_mfcc_from_melspec(spec, deltamfcc=True, avelocalframes=False)
+    assert coefs.shape[1] == 40
+
+
+def test_get_mfcc_from_melspec_n_frames():
+    """Without local averaging, the MFCC output should have one row per input column."""
+    n_cols = np.int(np.round(10.0 * feat_loader.framessr))
+    np.random.seed(1)
+    noise = np.random.randn(40, n_cols)  # 40 mel bands
+    spec = noise - np.min(noise)  # shift so that no value is negative
+    coefs = feat_loader.get_mfcc_from_melspec(spec, deltamfcc=False, avelocalframes=False)
+    assert coefs.shape[0] == n_cols
+
+
+def test_get_mfcc_from_melspec_n_frames_win2():
+    """With local averaging, the row count should match the win2sec / framessr2 formula."""
+    total_sec = 10.0
+    n_cols = np.int(np.round(total_sec * feat_loader.framessr))
+    np.random.seed(1)
+    noise = np.random.randn(40, n_cols)  # 40 mel bands
+    spec = noise - np.min(noise)  # shift so that no value is negative
+    coefs = feat_loader.get_mfcc_from_melspec(spec, deltamfcc=False, avelocalframes=True)
+    assert coefs.shape[0] == np.round((total_sec - feat_loader.win2sec) * feat_loader.framessr2)
+
+
+def test_get_ave_chroma_align():
+    """After alignment, the bin with the largest sum over time should be bin 0."""
+    n_cols = np.int(np.round(10.0 * feat_loader.framessr))
+    np.random.seed(1)
+    noise = np.random.randn(60, n_cols)  # 60 chroma bins
+    shifted = noise - np.min(noise)  # shift so that no value is negative
+    aligned = feat_loader.get_ave_chroma(shifted, alignchroma=True, avelocalframes=False)
+    bin_totals = np.sum(aligned, axis=0)
+    assert np.argmax(bin_totals) == 0
+
+
+def test_get_ave_chroma_n_frames():
+    """With local averaging, the row count should match the win2sec / framessr2 formula."""
+    total_sec = 10.0
+    n_cols = np.int(np.round(total_sec * feat_loader.framessr))
+    np.random.seed(1)
+    noise = np.random.randn(60, n_cols)  # 60 chroma bins
+    shifted = noise - np.min(noise)  # shift so that no value is negative
+    averaged = feat_loader.get_ave_chroma(shifted, avelocalframes=True, stdlocalframes=False)
+    assert averaged.shape[0] == np.round((total_sec - feat_loader.win2sec) * feat_loader.framessr2)
+
+
+def test_get_ave_chroma_n_bins():
+    """With avelocalframes and stdlocalframes enabled, 60 chroma bins should give 120 columns."""
+    n_cols = np.int(np.round(10.0 * feat_loader.framessr))
+    np.random.seed(1)
+    noise = np.random.randn(60, n_cols)  # 60 chroma bins
+    shifted = noise - np.min(noise)  # shift so that no value is negative
+    averaged = feat_loader.get_ave_chroma(shifted, avelocalframes=True, stdlocalframes=True)
+    assert averaged.shape[1] == 120
+
+
+def test_get_pb_for_file_empty():
+    # an empty path is passed, and an empty result is expected back
+    assert np.array_equal(feat_loader.get_pb_for_file(''), [])
+
+
+def test_get_pb_for_file_n_bins():
+    melodia_csv = 'data/sample_dataset/Melodia/mel_1_2_1.csv'  # sample file; path is relative to the working directory
+    assert feat_loader.get_pb_for_file(melodia_csv, nmfpb=False, scale=False).shape[1] == 3600
+
+
+def test_get_pb_for_file_align():
+    table = feat_loader.get_pb_for_file('data/sample_dataset/Melodia/mel_1_2_1.csv', nmfpb=False, scale=False)
+    values = table.get_values()  # the first 60 columns should outweigh the next 60
+    assert np.sum(values[:, 60:120].ravel()) < np.sum(values[:, :60].ravel())
+
+
+def test_get_pb_for_file_nmf():
+    melodia_csv = 'data/sample_dataset/Melodia/mel_1_2_1.csv'  # sample file; path is relative to the working directory
+    assert feat_loader.get_pb_for_file(melodia_csv, nmfpb=True, scale=False).shape[1] == 240
+
+
+def test_get_features():
+    df = pd.read_csv('data/sample_dataset/metadata.csv').iloc[:1, :]  # first metadata row only
+    os.chdir('data/')
+    try:  # the finally clause restores the working directory even if extraction or the check fails
+        assert len(np.unique(feat_loader.get_features(df)[-1])) == 1
+    finally:
+        os.chdir('..')
+
+
+def test_get_features_n_files():
+    n_files = 3
+    df = pd.read_csv('data/sample_dataset/metadata.csv').iloc[:n_files, :]  # first n_files metadata rows
+    os.chdir('data/')
+    try:  # the finally clause restores the working directory even if extraction or the check fails
+        assert len(np.unique(feat_loader.get_features(df)[-1])) == n_files
+    finally:
+        os.chdir('..')
+
+
+def test_get_features_n_frames():
+    df = pd.read_csv('data/sample_dataset/metadata.csv').iloc[:1, :]  # first metadata row only
+    os.chdir('data/')
+    try:  # the finally clause restores the working directory even if extraction fails
+        data_list = feat_loader.get_features(df)
+    finally:
+        os.chdir('..')
+    n_frames_true = np.round((11.5 - feat_loader.win2sec) * feat_loader.framessr2)  # duration of first file in metadata.csv is > 11 seconds
+    assert len(data_list[0]) == n_frames_true
+
\ No newline at end of file