m@8: { m@8: "cells": [ m@8: { m@8: "cell_type": "code", m@11: "execution_count": 1, m@26: "metadata": { m@26: "collapsed": true m@26: }, m@11: "outputs": [], m@8: "source": [ m@8: "import numpy as np\n", m@8: "import pickle\n", m@8: "from scipy.stats import pearsonr\n", m@8: "from scipy.stats import skew\n", m@8: "import sys\n", m@8: "from sklearn.metrics.pairwise import pairwise_distances\n", m@8: "%matplotlib inline\n", m@8: "import matplotlib.pyplot as plt\n", m@8: "\n", m@8: "%load_ext autoreload\n", m@8: "%autoreload 2\n", m@8: "\n", m@8: "sys.path.append('../')\n", Maria@18: "import scripts.outliers as outliers\n", m@8: "import scripts.utils_spatial as utils_spatial" m@8: ] m@8: }, m@8: { m@8: "cell_type": "code", m@12: "execution_count": 2, m@11: "metadata": {}, m@8: "outputs": [ m@8: { m@12: "name": "stderr", m@12: "output_type": "stream", m@12: "text": [ m@12: "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:189: UserWarning: There are 21 disconnected observations\n", m@12: " warnings.warn(\"There are %d disconnected observations\" % ni)\n", m@12: "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:190: UserWarning: Island ids: 3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121\n", m@12: " warnings.warn(\"Island ids: %s\" % ', '.join(str(island) for island in self.islands))\n" m@12: ] m@12: }, m@12: { m@8: "name": "stdout", m@8: "output_type": "stream", m@8: "text": [ m@8: "Antigua and Barbuda\n", m@8: "Australia\n", m@8: "Cuba\n", m@8: "Fiji\n", m@8: "French Polynesia\n", m@8: "Grenada\n", m@8: "Iceland\n", m@8: "Jamaica\n", m@8: "Japan\n", m@8: "Kiribati\n", m@8: "Malta\n", m@8: "New Zealand\n", m@8: "Philippines\n", m@8: "Puerto Rico\n", m@8: "Republic of Serbia\n", m@8: "Saint Lucia\n", m@8: "Samoa\n", m@8: "Solomon Islands\n", m@8: "South Korea\n", m@8: "The Bahamas\n", m@8: "Trinidad and Tobago\n" m@8: ] m@8: } m@8: ], m@8: "source": [ m@8: "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle','rb'))\n", Maria@18: "ddf = outliers.load_metadata(Yaudio, metadata_file='../data/metadata.csv')\n", m@8: "w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)\n", m@8: "w_dict = utils_spatial.from_weights_to_dict(w, data_countries)\n", m@8: "Xrhy, Xmel, Xmfc, Xchr = X_list\n", m@8: "X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)\n", m@8: "\n", m@8: "# global outliers\n", Maria@18: "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)" m@8: ] m@8: }, m@8: { m@8: "cell_type": "code", m@26: "execution_count": null, m@11: "metadata": {}, m@8: "outputs": [ m@8: { m@8: "data": { m@8: "text/plain": [ m@8: "(8200, 380)" m@8: ] m@8: }, m@12: "execution_count": 3, m@8: "metadata": {}, m@8: "output_type": "execute_result" m@8: } m@8: ], m@8: "source": [ m@8: "X.shape" m@8: ] m@8: }, m@8: { m@8: "cell_type": "code", m@26: "execution_count": null, m@26: "metadata": { m@26: "collapsed": true m@26: }, m@8: "outputs": [], m@8: "source": [ m@8: "D = pairwise_distances(X, metric='mahalanobis')" m@8: ] m@8: }, m@8: { m@8: "cell_type": "code", m@26: "execution_count": null, m@12: "metadata": {}, m@26: "outputs": [], m@12: "source": [ m@12: "D.shape" m@12: ] m@12: }, m@12: { m@12: "cell_type": "code", m@26: "execution_count": null, m@11: "metadata": {}, m@26: "outputs": [], m@8: "source": [ m@8: "plt.hist(D.ravel(), bins=100);" m@8: ] m@8: }, m@8: { m@8: "cell_type": "code", m@26: "execution_count": null, m@8: "metadata": { m@8: "collapsed": true m@8: }, m@8: "outputs": [], m@8: "source": [ m@8: "def n_occurrence_from_D(D, k=10, n_items=None):\n", m@8: " if n_items is None:\n", m@8: " n_items = len(D)\n", m@8: " sort_idx = np.argsort(D, axis=1)\n", m@8: " D_k = sort_idx[:, 1:(k+1)] # nearest neighbour is the item itself\n", m@8: " N_k = np.bincount(D_k.astype(int).ravel(), minlength=n_items)\n", m@8: " return N_k" m@8: ] m@8: }, m@8: { m@8: "cell_type": "code", m@26: "execution_count": null, m@11: "metadata": {}, m@26: "outputs": [], m@8: "source": [ m@8: "N_k = n_occurrence_from_D(D, k=100)\n", m@8: "print skew(N_k)\n", m@26: "plt.figure()\n", m@26: "plt.hist(N_k, bins=100);\n", m@26: "plt.figure()\n", m@26: "plt.plot(np.sort(N_k))" m@8: ] m@8: }, m@8: { m@8: "cell_type": "code", m@12: "execution_count": 11, m@26: "metadata": { m@26: "collapsed": true m@26: }, m@8: "outputs": [], m@8: "source": [ m@12: "#sort_idx = np.argsort(D, axis=1)\n", m@12: "k = 10\n", m@12: "D_k = sort_idx[:, 1:(k+1)]" m@12: ] m@12: }, m@12: { m@12: "cell_type": "code", m@12: "execution_count": 12, m@12: "metadata": {}, m@12: "outputs": [ m@12: { m@12: "data": { m@12: "text/plain": [ m@12: "array([[4650, 2942, 3520, ..., 1318, 6678, 6056],\n", m@12: " [1933, 6143, 6757, ..., 7269, 4321, 1563],\n", m@12: " [3170, 2549, 4860, ..., 6678, 7414, 6056],\n", m@12: " ..., \n", m@12: " [6016, 2243, 1616, ..., 7627, 2018, 515],\n", m@12: " [7027, 4860, 6346, ..., 997, 3892, 1846],\n", m@12: " [5119, 1563, 4035, ..., 3486, 7617, 3854]])" m@12: ] m@12: }, m@12: "execution_count": 12, m@12: "metadata": {}, m@12: "output_type": "execute_result" m@12: } m@12: ], m@12: "source": [ m@12: "D_k" m@8: ] m@8: }, m@8: { m@8: "cell_type": "code", m@8: "execution_count": null, m@8: "metadata": { m@8: "collapsed": true m@8: }, m@8: "outputs": [], m@8: "source": [] m@8: } m@8: ], m@8: "metadata": { m@8: "kernelspec": { m@8: "display_name": "Python 2", m@8: "language": "python", m@8: "name": "python2" m@8: }, m@8: "language_info": { m@8: "codemirror_mode": { m@8: "name": "ipython", m@8: "version": 2 m@8: }, m@8: "file_extension": ".py", m@8: "mimetype": "text/x-python", m@8: "name": "python", m@8: "nbconvert_exporter": "python", m@8: "pygments_lexer": "ipython2", m@8: "version": "2.7.12" m@8: } m@8: }, m@8: "nbformat": 4, m@11: "nbformat_minor": 1 m@8: }