plosone_underreview: changeset 76:d17833be50ca branch-tests
merged
| | |
|---|---|
| author | Maria Panteli <m.x.panteli@gmail.com> |
| date | Fri, 22 Sep 2017 16:30:36 +0100 |
| parents | 02faad4a996b (diff) cc028157502a (current diff) |
| children | bde45ce0eeab |
| diffstat | 3 files changed, 529 insertions(+), 18 deletions(-) |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/notebooks/results_30_seconds_and_figures.ipynb Fri Sep 22 16:30:36 2017 +0100 @@ -0,0 +1,489 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import pickle \n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import sys\n", + "sys.path.append('../')\n", + "import scripts.outliers as outliers\n", + "import scripts.utils as utils" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING: there are 21 disconnected observations\n", + "Island ids: [3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121]\n", + "Antigua and Barbuda\n", + "Australia\n", + "Cuba\n", + "Fiji\n", + "French Polynesia\n", + "Grenada\n", + "Iceland\n", + "Jamaica\n", + "Japan\n", + "Kiribati\n", + "Malta\n", + "New Zealand\n", + "Philippines\n", + "Puerto Rico\n", + "Republic of Serbia\n", + "Saint Lucia\n", + "Samoa\n", + "Solomon Islands\n", + "South Korea\n", + "The Bahamas\n", + "Trinidad and Tobago\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/mariapanteli/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (0,1,2,4,5,6,7,8,10,11,12,13,14,15,16,17,19,21,22,23,24,25,26,27,29,31,35,38,39,40,41,44,45,48,55,56,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,93,95,96) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", + " data = self._reader.read(nrows)\n" + ] + } + ], + "source": [ + "DATA_FILE = '../data/lda_data_melodia_8_30sec.pickle'\n", + "METADATA_FILE = '/Users/mariapanteli/Documents/QMUL/Code/MyPythonCode/MergeBL-Smith/data/metadata_BLSM_language_all.csv'\n", + "#METADATA_FILE = '../data/metadata.csv'\n", + "\n", + "dataset, ddf, w_dict = outliers.load_data(DATA_FILE, METADATA_FILE)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(8200, 108)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_list, Y, Yaudio = dataset\n", + "X = np.concatenate(X_list, axis=1)\n", + "ddf.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "136 Botswana 0.611111 90 55\n", + "72 Ivory Coast 0.600000 15 9\n", + "95 Chad 0.545455 11 6\n", + "43 Benin 0.538462 26 14\n", + "86 Gambia 0.500000 50 25\n", + "20 Pakistan 0.494505 91 45\n", + "106 Nepal 0.473684 95 45\n", + "78 El Salvador 0.454545 33 15\n", + "64 Mozambique 0.441176 34 15\n", + "135 French Guiana 0.428571 28 12\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "1 Lithuania 0.000000 47 0\n", + "119 Denmark 0.000000 16 0\n", + "27 South Korea 0.000000 11 0\n", + "120 Kazakhstan 0.011364 88 1\n", + "31 Czech Republic 0.024390 41 1\n", + "15 Netherlands 0.029851 67 2\n", + "30 Afghanistan 0.041667 24 1\n", + "105 Sudan 0.044118 68 3\n", + "102 Nicaragua 0.047619 21 1\n", + "0 Canada 0.050000 100 5\n" + ] + } + ], + "source": [ + "# global outliers\n", + "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999, out_file='../data/results/global_outliers.csv')\n", + "outliers.print_most_least_outliers_topN(df_global, N=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "328\n", + "210\n", + "194\n", + "85\n", + "388\n", + "266\n", + "309\n", + "455\n", + "365\n", + "282\n", + "197\n", + "122\n", + "206\n", + "457\n", + "298\n", + "597\n", + "354\n", + "191\n", + "193\n", + "198\n", + "263\n", + "334\n", + "812\n", + "415\n", + "44\n", + "107\n", + "366\n", + "323\n", + "450\n", + "116\n", + "150\n", + "260\n", + "230\n", + "118\n", + "389\n", + "237\n", + "274\n", + "466\n", + "147\n", + "134\n", + "86\n", + "91\n", + "574\n", + "111\n", + "296\n", + "221\n", + "261\n", + "224\n", + "190\n", + "150\n", + "139\n", + "350\n", + "268\n", + "453\n", + "192\n", + "468\n", + "266\n", + "187\n", + "275\n", + "337\n", + "179\n", + "366\n", + "211\n", + "213\n", + "428\n", + "468\n", + "164\n", + "348\n", + "328\n", + "193\n", + "197\n", + "193\n", + "166\n", + "290\n", + "196\n", + "224\n", + "111\n", + "258\n", + "295\n", + "227\n", + "252\n", + "433\n", + "305\n", + "290\n", + "183\n", + "243\n", + "63\n", + "197\n", + "274\n", + "363\n", + "113\n", + "192\n", + "258\n", + "494\n", + "299\n", + "484\n", + "198\n", + "191\n", + "174\n", + "280\n", + "735\n", + "211\n", + "221\n", + "134\n", + "125\n", + "119\n", + "151\n", + "203\n", + "229\n", + "430\n", + "311\n", + "424\n", + "337\n", + "268\n", + "175\n", + "228\n", + "175\n", + "437\n", + 
"284\n", + "129\n", + "366\n", + "222\n", + "66\n", + "498\n", + "400\n", + "430\n", + "187\n", + "470\n", + "298\n", + "231\n", + "272\n", + "261\n", + "239\n", + "154\n", + "22\n", + "426\n", + "332\n", + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "46 China 0.260000 100 26\n", + "67 Brazil 0.240000 100 24\n", + "101 Colombia 0.211111 90 19\n", + "64 Mozambique 0.205882 34 7\n", + "76 Iran 0.188679 53 10\n", + "65 Uganda 0.176471 85 15\n", + "27 Kenya 0.164948 97 16\n", + "126 South Sudan 0.163043 92 15\n", + "24 Azerbaijan 0.153846 13 2\n", + "23 India 0.147368 95 14\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "0 Canada 0 100 0\n", + "95 Portugal 0 100 0\n", + "94 Iraq 0 87 0\n", + "93 Grenada 0 37 0\n", + "90 French Polynesia 0 15 0\n", + "89 Croatia 0 31 0\n", + "88 Morocco 0 40 0\n", + "87 Philippines 0 100 0\n", + "86 Gambia 0 50 0\n", + "85 Sierra Leone 0 100 0\n" + ] + } + ], + "source": [ + "# local outliers\n", + "df_local = outliers.get_local_outliers_df(X, Y, w_dict, out_file='../data/results/spatial_outliers.csv')\n", + "outliers.print_most_least_outliers_topN(df_local, N=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "43 Benin 0.500000 26 13\n", + "136 Botswana 0.488889 90 44\n", + "106 Nepal 0.421053 95 40\n", + "84 Belize 0.418605 43 18\n", + "19 Yemen 0.416667 12 5\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "28 Tajikistan 0 19 0\n", + "119 Denmark 0 16 0\n", + "96 Uruguay 0 31 0\n", + "25 Republic of Serbia 0 16 0\n", + "27 South Korea 0 11 0\n", + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "117 Zimbabwe 0.533333 15 8\n", + "96 Uruguay 0.483871 31 15\n", + "68 Guinea 0.454545 11 5\n", + "63 Senegal 0.390244 41 16\n", + "86 Gambia 0.380000 50 19\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "90 French Polynesia 0.000000 15 0\n", + "37 Rwanda 0.000000 17 0\n", + "119 Denmark 0.000000 16 0\n", + "18 New Zealand 0.000000 34 0\n", + "120 Kazakhstan 0.022727 88 2\n", + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "17 French Guiana 0.678571 28 19\n", + "136 Botswana 0.477778 90 43\n", + "72 Ivory Coast 0.400000 15 6\n", + "23 Azerbaijan 0.384615 13 5\n", + "106 Nepal 0.347368 95 33\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "68 Guinea 0 11 0\n", + "55 Mali 0 17 0\n", + "77 Algeria 0 27 0\n", + "33 Saint Lucia 0 43 0\n", + "31 Czech Republic 0 41 0\n", + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "43 Benin 0.538462 26 14\n", + "20 Pakistan 0.461538 91 42\n", + "86 Gambia 0.360000 50 18\n", + "52 Indonesia 0.350000 100 35\n", + "136 Botswana 0.311111 90 28\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "107 Kiribati 0 17 0\n", + "1 Lithuania 0 47 0\n", + "134 Paraguay 0 23 0\n", + "131 Tunisia 0 39 0\n", + "19 Yemen 0 12 0\n" + ] + } + ], + "source": [ + "# outliers for features\n", + "feat = X_list\n", + "feat_labels = ['rhy', 'mel', 'mfc', 'chr']\n", + "tabs_feat = []\n", + "for i in range(len(feat)):\n", + " XX = feat[i]\n", + " output_csv = '../data/results/global_outliers_'+feat_labels[i]+'.csv'\n", + " df_feat, threshold, MD = outliers.get_outliers_df(XX, Y, chi2thr=0.999, out_file=output_csv)\n", + " 
outliers.print_most_least_outliers_topN(df_feat, N=5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5\n", + "6\n", + "7\n", + "8\n", + "9\n", + "10\n", + "11\n", + "12\n", + "13\n", + "14\n", + "15\n", + "16\n", + "17\n", + "18" + ] + } + ], + "source": [ + "centroids, cl_pred = outliers.get_country_clusters(X, bestncl=None, max_ncl=30)\n", + "ddf['Clusters'] = cl_pred\n", + "ddf.to_csv('../data/results/df_and_clusters.csv', index=False)\n", + "print len(np.unique(cl_pred))\n", + "outliers.print_clusters_metadata(ddf, cl_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}
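The global outlier step in the notebook above calls `outliers.get_outliers_df`, which delegates the actual detection to `utils.get_outliers_Mahal` (not part of this changeset). The function below is a minimal sketch of the chi-squared Mahalanobis criterion that call suggests, assuming a single full-covariance estimate over all recordings; the hypothetical `mahalanobis_outliers` is illustrative, not the project's implementation:

```python
import numpy as np
from scipy import stats


def mahalanobis_outliers(X, chi2thr=0.999):
    """Flag rows of X whose squared Mahalanobis distance exceeds the
    chi-squared quantile at chi2thr (illustrative sketch only)."""
    mean = X.mean(axis=0)
    cov = np.cov(X, rowvar=False)
    cov_inv = np.linalg.pinv(cov)  # pseudo-inverse for numerical stability
    diff = X - mean
    # squared Mahalanobis distance of every row
    MD = np.einsum('ij,jk,ik->i', diff, cov_inv, diff)
    threshold = stats.chi2.ppf(chi2thr, df=X.shape[1])
    y_pred = MD > threshold  # boolean outlier mask
    return threshold, y_pred, MD
```

With `chi2thr=0.999`, a recording is flagged when its squared distance exceeds the 99.9th percentile of the chi-squared distribution with one degree of freedom per feature dimension; `get_outliers_df` then counts flagged recordings per country with `Counter(Y[y_pred])` and normalizes by each country's total number of recordings, which is how the `Outliers` column in the tables above is obtained.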
--- a/notebooks/results_for_30_seconds.ipynb Fri Sep 22 16:29:32 2017 +0100 +++ b/notebooks/results_for_30_seconds.ipynb Fri Sep 22 16:30:36 2017 +0100 @@ -3,7 +3,9 @@ { "cell_type": "code", "execution_count": 36, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -88,7 +90,9 @@ { "cell_type": "code", "execution_count": 37, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -124,7 +128,9 @@ { "cell_type": "code", "execution_count": 38, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -145,7 +151,9 @@ { "cell_type": "code", "execution_count": 14, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -216,6 +224,7 @@ "cell_type": "code", "execution_count": 29, "metadata": { + "collapsed": false, "scrolled": true }, "outputs": [ @@ -270,6 +279,7 @@ "cell_type": "code", "execution_count": 63, "metadata": { + "collapsed": false, "scrolled": true }, "outputs": [ @@ -324,7 +334,9 @@ { "cell_type": "code", "execution_count": 43, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -371,7 +383,9 @@ { "cell_type": "code", "execution_count": 46, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -450,7 +464,9 @@ { "cell_type": "code", "execution_count": 48, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stderr", @@ -662,7 +678,9 @@ { "cell_type": "code", "execution_count": 53, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "name": "stdout", @@ -718,7 +736,9 @@ { "cell_type": "code", "execution_count": 55, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -738,7 +758,9 @@ { "cell_type": "code", "execution_count": 62, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": {
--- a/scripts/outliers.py	Fri Sep 22 16:29:32 2017 +0100
+++ b/scripts/outliers.py	Fri Sep 22 16:30:36 2017 +0100
@@ -15,7 +15,7 @@
 import utils_spatial
 
 
-def country_outlier_df(counts, labels, normalize=False):
+def country_outlier_df(counts, labels, normalize=False, out_file=None):
     if len(counts.keys()) < len(np.unique(labels)):
         for label in np.unique(labels):
             if not counts.has_key(label):
@@ -33,6 +33,8 @@
     df_n_outliers.rename(columns={'index':'Country', 0:'N_Outliers'}, inplace=True)
     df = pd.merge(df, df_n_country, on='Country', how='left')
     df = pd.merge(df, df_n_outliers, on='Country', how='left')
+    if out_file is not None:
+        df.to_csv(out_file, index=False)
     return df
 
 
@@ -50,9 +52,7 @@
 def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
     threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
     global_counts = Counter(Y[y_pred])
-    df = country_outlier_df(global_counts, Y, normalize=True)
-    if out_file is not None:
-        df.to_csv(out_file, index=False)
+    df = country_outlier_df(global_counts, Y, normalize=True, out_file=out_file)
     return df, threshold, MD
 
 
@@ -100,16 +100,16 @@
     return [X_list, Y, Yaudio], ddf, w_dict
 
 
-def get_local_outliers_df(X, Y, w_dict):
+def get_local_outliers_df(X, Y, w_dict, out_file=None):
     spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
     spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))
-    df_local = country_outlier_df(spatial_counts, Y, normalize=True)
+    df_local = country_outlier_df(spatial_counts, Y, normalize=True, out_file=out_file)
    return df_local
 
 
-def get_country_clusters(X, bestncl=None):
+def get_country_clusters(X, bestncl=None, max_ncl=50):
     if bestncl is None:
-        bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
+        bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=max_ncl, metric="cosine")
     # get cluster predictions and metadata for each cluster
     cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
     centroids = cluster_model.cluster_centers_
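When `bestncl` is not given, `get_country_clusters` now bounds its silhouette search with the new `max_ncl` argument (the notebook above passes `max_ncl=30` for its 8200-recording matrix). `utils.best_n_clusters_silhouette` itself is not shown in this changeset; the function below is a minimal sketch of what such a search could look like, assuming k-means with a fixed seed and cosine silhouettes as in the surrounding code, not the project's actual implementation:

```python
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine"):
    """Return the cluster count in [min_ncl, max_ncl] with the highest
    average silhouette score (illustrative sketch only)."""
    best_ncl, best_silh = min_ncl, -1.0
    for ncl in range(min_ncl, max_ncl + 1):
        labels = KMeans(n_clusters=ncl, random_state=50).fit_predict(X)
        silh = silhouette_score(X, labels, metric=metric)
        if silh > best_silh:
            best_ncl, best_silh = ncl, silh
    return best_ncl, best_silh
```

Lowering the cap from 50 to 30 matters for running time: each candidate requires a full k-means fit plus a silhouette computation, and the latter scales with the number of pairwise distances between recordings. The same refactor moves the CSV export into `country_outlier_df` via the new `out_file` argument, so both `get_outliers_df` and `get_local_outliers_df` can write their per-country tables (as the notebook does with `global_outliers.csv` and `spatial_outliers.csv`) without duplicating the `to_csv` call.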