plosone_underreview: comparison notebooks/sensitivity_experiment.ipynb @ 42:90f8a2ea6f6f branch-tests
notebook results and load_features minor edits
author | mpanteli <m.x.panteli@gmail.com> |
---|---|
date | Fri, 15 Sep 2017 16:17:17 +0100 |
parents | e4736064d282 |
children | 081ff4ea7da7 |
41:57f53b0d1eaa | 42:90f8a2ea6f6f |
---|---|
1 { | 1 { |
2 "cells": [ | 2 "cells": [ |
3 { | 3 { |
4 "cell_type": "code", | 4 "cell_type": "code", |
5 "execution_count": 58, | 5 "execution_count": 15, |
6 "metadata": {}, | 6 "metadata": {}, |
7 "outputs": [ | 7 "outputs": [ |
8 { | 8 { |
9 "name": "stdout", | 9 "name": "stdout", |
10 "output_type": "stream", | 10 "output_type": "stream", |
14 ] | 14 ] |
15 } | 15 } |
16 ], | 16 ], |
17 "source": [ | 17 "source": [ |
18 "import numpy as np\n", | 18 "import numpy as np\n", |
19 "import pandas as pd\n", | |
19 "\n", | 20 "\n", |
20 "%matplotlib inline\n", | 21 "%matplotlib inline\n", |
21 "import matplotlib.pyplot as plt\n", | 22 "import matplotlib.pyplot as plt\n", |
22 "\n", | 23 "\n", |
23 "%load_ext autoreload\n", | 24 "%load_ext autoreload\n", |
25 "\n", | 26 "\n", |
26 "import sys\n", | 27 "import sys\n", |
27 "sys.path.append('../')\n", | 28 "sys.path.append('../')\n", |
28 "import scripts.load_dataset as load_dataset\n", | 29 "import scripts.load_dataset as load_dataset\n", |
29 "import scripts.map_and_average as mapper\n", | 30 "import scripts.map_and_average as mapper\n", |
30 "import scripts.classification\n", | 31 "import scripts.classification as classification\n", |
31 "import scripts.outliers as outliers" | 32 "import scripts.outliers as outliers" |
32 ] | 33 ] |
33 }, | 34 }, |
34 { | 35 { |
35 "cell_type": "code", | 36 "cell_type": "code", |
36 "execution_count": 46, | 37 "execution_count": 2, |
37 "metadata": {}, | 38 "metadata": { |
39 "collapsed": true | |
40 }, | |
38 "outputs": [], | 41 "outputs": [], |
39 "source": [ | 42 "source": [ |
40 "OUTPUT_FILES = load_dataset.OUTPUT_FILES\n", | 43 "OUTPUT_FILES = load_dataset.OUTPUT_FILES\n", |
41 "n_iters = 10\n", | 44 "n_iters = 10" |
42 "df = load_dataset.sample_dataset(csv_file=load_dataset.METADATA_FILE)" | |
43 ] | 45 ] |
44 }, | 46 }, |
45 { | 47 { |
46 "cell_type": "code", | 48 "cell_type": "code", |
47 "execution_count": 5, | 49 "execution_count": 5, |
57 "metadata": {}, | 59 "metadata": {}, |
58 "output_type": "execute_result" | 60 "output_type": "execute_result" |
59 } | 61 } |
60 ], | 62 ], |
61 "source": [ | 63 "source": [ |
64 "df = load_dataset.sample_dataset(csv_file=load_dataset.METADATA_FILE)\n", | |
62 "df.shape" | 65 "df.shape" |
63 ] | 66 ] |
64 }, | 67 }, |
65 { | 68 { |
66 "cell_type": "code", | 69 "cell_type": "code", |
4610 " load_dataset.features_for_train_test_sets(df, write_output=True)" | 4613 " load_dataset.features_for_train_test_sets(df, write_output=True)" |
4611 ] | 4614 ] |
4612 }, | 4615 }, |
4613 { | 4616 { |
4614 "cell_type": "code", | 4617 "cell_type": "code", |
4615 "execution_count": null, | 4618 "execution_count": 3, |
4616 "metadata": { | 4619 "metadata": {}, |
4617 "collapsed": true | 4620 "outputs": [ |
4618 }, | 4621 { |
4619 "outputs": [], | 4622 "name": "stdout", |
4623 "output_type": "stream", | |
4624 "text": [ | |
4625 "iteration 0\n", | |
4626 "mapping...\n", | |
4627 "/import/c4dm-04/mariap/train_data_melodia_8_0.pickle\n", | |
4628 "(203219, 840) (68100, 840) (67143, 840)\n", | |
4629 "mapping rhy\n", | |
4630 "training with PCA transform...\n", | |
4631 "variance explained 1.0\n", | |
4632 "140 400\n", | |
4633 "training with PCA transform...\n", | |
4634 "variance explained 0.990203912455\n", | |
4635 "training with LDA transform...\n" | |
4636 ] | |
4637 }, | |
4638 { | |
4639 "name": "stderr", | |
4640 "output_type": "stream", | |
4641 "text": [ | |
4642 "/homes/mp305/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.py:526: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", | |
4643 " y = column_or_1d(y, warn=True)\n", | |
4644 "/homes/mp305/anaconda/lib/python2.7/site-packages/sklearn/discriminant_analysis.py:455: UserWarning: The priors do not sum to 1. Renormalizing\n", | |
4645 " UserWarning)\n" | |
4646 ] | |
4647 }, | |
4648 { | |
4649 "name": "stdout", | |
4650 "output_type": "stream", | |
4651 "text": [ | |
4652 "variance explained 1.0\n", | |
4653 "transform test data...\n", | |
4654 "mapping mel\n", | |
4655 "training with PCA transform...\n", | |
4656 "variance explained 1.0\n", | |
4657 "214 240\n", | |
4658 "training with PCA transform...\n", | |
4659 "variance explained 0.990094273777\n", | |
4660 "training with LDA transform...\n", | |
4661 "variance explained 1.0\n", | |
4662 "transform test data...\n", | |
4663 "mapping mfc\n", | |
4664 "training with PCA transform...\n", | |
4665 "variance explained 1.0\n", | |
4666 "39 80\n", | |
4667 "training with PCA transform...\n", | |
4668 "variance explained 0.9914399357\n", | |
4669 "training with LDA transform...\n", | |
4670 "variance explained 0.941390777379\n", | |
4671 "transform test data...\n", | |
4672 "mapping chr\n", | |
4673 "training with PCA transform...\n", | |
4674 "variance explained 1.0\n", | |
4675 "70 120\n", | |
4676 "training with PCA transform...\n", | |
4677 "variance explained 0.990511935176\n", | |
4678 "training with LDA transform...\n", | |
4679 "variance explained 0.953613938607\n", | |
4680 "transform test data...\n" | |
4681 ] | |
4682 }, | |
4683 { | |
4684 "ename": "ValueError", | |
4685 "evalue": "all the input array dimensions except for the concatenation axis must match exactly", | |
4686 "output_type": "error", | |
4687 "traceback": [ | |
4688 "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
4689 "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", | |
4690 "\u001b[0;32m<ipython-input-3-971892d5bd8d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 6\u001b[0m output_file in OUTPUT_FILES]\n\u001b[1;32m 7\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mldadata_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mYaudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlda_map_and_average_frames\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin_variance\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.99\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcatenate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mldadata_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;31m# classification and confusion\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
4691 "\u001b[0;31mValueError\u001b[0m: all the input array dimensions except for the concatenation axis must match exactly" | |
4692 ] | |
4693 } | |
4694 ], | |
4620 "source": [ | 4695 "source": [ |
4621 "for n in range(n_iters):\n", | 4696 "for n in range(n_iters):\n", |
4622 " print \"iteration %d\" % n\n", | 4697 " print \"iteration %d\" % n\n", |
4623 " \n", | 4698 " \n", |
4624 " print \"mapping...\"\n", | 4699 " print \"mapping...\"\n", |
4625 " mapper.INPUT_FILES = [output_file.split('.pickle')[0]+'_'+str(n)+'.pickle' for \n", | 4700 " mapper.INPUT_FILES = [output_file.split('.pickle')[0]+'_'+str(n)+'.pickle' for \n", |
4626 " output_file in OUTPUT_FILES]\n", | 4701 " output_file in OUTPUT_FILES]\n", |
4627 " _, _, ldadata_list, _, _, Y, Yaudio = mapper.lda_map_and_average_frames(min_variance=0.99)\n", | 4702 " _, _, ldadata_list, _, _, Y, Yaudio = mapper.lda_map_and_average_frames(min_variance=0.99)\n", |
4628 " X = np.concatenate(ldadata_list)\n", | 4703 " X = np.concatenate(ldadata_list, axis=1)\n", |
4629 " \n", | 4704 " \n", |
4630 " # classification and confusion\n", | 4705 " # classification and confusion\n", |
4631 " print \"classifying...\"\n", | 4706 " print \"classifying...\"\n", |
4632 " traininds, testinds = classification.get_train_test_indices()\n", | 4707 " traininds, testinds = classification.get_train_test_indices()\n", |
4633 " X_train, Y_train, X_test, Y_test = classification.get_train_test_sets(X, Y, traininds, testinds)\n", | 4708 " X_train, Y_train, X_test, Y_test = classification.get_train_test_sets(X, Y, traininds, testinds)\n", |
4634 " accuracy, _ = classification.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n", | 4709 " accuracy, _ = classification.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n", |
4635 " print accuracy\n", | 4710 " print accuracy\n", |
4636 " \n", | 4711 " \n", |
4637 " # outliers\n", | 4712 " # outliers\n", |
4638 " print \"detecting outliers...\"\n", | 4713 " print \"detecting outliers...\"\n", |
4639 " ddf = outliers.load_metadata(Yaudio, metadata_file=load_dataset.METADATA_FILE)\n", | 4714 " #ddf = outliers.load_metadata(Yaudio, metadata_file=load_dataset.METADATA_FILE)\n", |
4640 " df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)\n", | 4715 " df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n", |
4641 " print_most_least_outliers_topN(df_global, N=10)\n", | 4716 " outliers.print_most_least_outliers_topN(df_global, N=10)\n", |
4642 " \n", | 4717 " \n", |
4643 " # write output\n", | 4718 " # write output\n", |
4644 " print \"writing file\"\n", | 4719 " print \"writing file\"\n", |
4645 " df_global.to_csv('../data/outliers_'+str(n)+'.csv', index=False)" | 4720 " df_global.to_csv('../data/outliers_'+str(n)+'.csv', index=False)" |
4646 ] | 4721 ] |
4722 }, | |
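The ValueError in the cell above comes from calling np.concatenate(ldadata_list) with the default axis=0: each entry of ldadata_list has one row per recording but a feature-specific number of LDA dimensions (rhy, mel, mfc, chr), so the arrays only line up along axis=1, as in the corrected loop body. A minimal shape-arithmetic sketch; the per-feature column counts below are illustrative stand-ins chosen to sum to the 381 columns reported in a later cell, not the real LDA output sizes:

```python
import numpy as np

# Hypothetical per-feature LDA outputs: same number of rows (recordings),
# different numbers of columns (LDA dimensions per feature).
n_recordings = 8089
ldadata_list = [np.zeros((n_recordings, d)) for d in (84, 168, 26, 103)]

# axis=0 would fail: the column counts differ across the four arrays.
# axis=1 stacks the per-feature dimensions side by side.
X = np.concatenate(ldadata_list, axis=1)
print(X.shape)  # (8089, 381)
```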
4723 { | |
4724 "cell_type": "code", | |
4725 "execution_count": 4, | |
4726 "metadata": { | |
4727 "collapsed": true | |
4728 }, | |
4729 "outputs": [], | |
4730 "source": [ | |
4731 "X = np.concatenate(ldadata_list, axis=1)" | |
4732 ] | |
4733 }, | |
4734 { | |
4735 "cell_type": "code", | |
4736 "execution_count": 5, | |
4737 "metadata": {}, | |
4738 "outputs": [ | |
4739 { | |
4740 "data": { | |
4741 "text/plain": [ | |
4742 "(8089, 381)" | |
4743 ] | |
4744 }, | |
4745 "execution_count": 5, | |
4746 "metadata": {}, | |
4747 "output_type": "execute_result" | |
4748 } | |
4749 ], | |
4750 "source": [ | |
4751 "X.shape" | |
4752 ] | |
4753 }, | |
4754 { | |
4755 "cell_type": "code", | |
4756 "execution_count": 10, | |
4757 "metadata": {}, | |
4758 "outputs": [ | |
4759 { | |
4760 "name": "stdout", | |
4761 "output_type": "stream", | |
4762 "text": [ | |
4763 "0.176354062249\n" | |
4764 ] | |
4765 }, | |
4766 { | |
4767 "name": "stderr", | |
4768 "output_type": "stream", | |
4769 "text": [ | |
4770 "/homes/mp305/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n", | |
4771 " 'precision', 'predicted', average, warn_for)\n" | |
4772 ] | |
4773 } | |
4774 ], | |
4775 "source": [ | |
4776 "#traininds, testinds = classification.get_train_test_indices()\n", | |
4777 "traininds = np.arange(5000)\n", | |
4778 "testinds = np.arange(len(X)-1600, len(X))\n", | |
4779 "X_train, Y_train, X_test, Y_test = classification.get_train_test_sets(X, Y, traininds, testinds)\n", | |
4780 "accuracy, _ = classification.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n", | |
4781 "print accuracy" | |
4782 ] | |
4783 }, | |
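The 0.18 accuracy printed above comes from an ad-hoc contiguous split (first 5000 rows for training, last 1600 for testing); if recordings are ordered by country, such a split can leave whole countries out of one side, which is what the UndefinedMetricWarning about labels with no predicted samples hints at. The project's get_train_test_indices presumably avoids this; purely as an illustration, a stratified split with scikit-learn looks like the sketch below (X_demo and Y_demo are toy stand-ins):

```python
import numpy as np
from sklearn.model_selection import train_test_split

# Toy stand-ins shaped like the notebook's X (features) and Y (country labels).
X_demo = np.random.rand(100, 381)
Y_demo = np.repeat(np.arange(10), 10)  # 10 recordings per "country"

# Stratified split: every class keeps roughly the same proportion in
# train and test, so no country is absent from either side.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_demo, Y_demo, test_size=0.2, stratify=Y_demo, random_state=0)
```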
4784 { | |
4785 "cell_type": "code", | |
4786 "execution_count": 13, | |
4787 "metadata": {}, | |
4788 "outputs": [ | |
4789 { | |
4790 "name": "stdout", | |
4791 "output_type": "stream", | |
4792 "text": [ | |
4793 "detecting outliers...\n", | |
4794 "most outliers \n", | |
4795 " Country Outliers\n", | |
4796 "136 Botswana 0.590909\n", | |
4797 "71 Ivory Coast 0.571429\n", | |
4798 "86 Gambia 0.541667\n", | |
4799 "43 Benin 0.538462\n", | |
4800 "62 Fiji 0.466667\n", | |
4801 "20 Pakistan 0.461538\n", | |
4802 "65 Uganda 0.437500\n", | |
4803 "14 Liberia 0.425000\n", | |
4804 "78 El Salvador 0.424242\n", | |
4805 "51 Western Sahara 0.421687\n", | |
4806 "least outliers \n", | |
4807 " Country Outliers\n", | |
4808 "119 Denmark 0.000000\n", | |
4809 "30 Afghanistan 0.000000\n", | |
4810 "113 Iceland 0.000000\n", | |
4811 "28 Tajikistan 0.000000\n", | |
4812 "74 Czech Republic 0.000000\n", | |
4813 "27 South Korea 0.000000\n", | |
4814 "1 Lithuania 0.000000\n", | |
4815 "15 Netherlands 0.014925\n", | |
4816 "121 Poland 0.040000\n", | |
4817 "134 Paraguay 0.043478\n" | |
4818 ] | |
4819 } | |
4820 ], | |
4821 "source": [ | |
4822 "print \"detecting outliers...\"\n", | |
4823 "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n", | |
4824 "outliers.print_most_least_outliers_topN(df_global, N=10)" | |
4825 ] | |
4826 }, | |
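get_outliers_df lives in scripts/outliers.py and is not shown here. Assuming it follows the usual recipe suggested by its chi2thr=0.999 argument and its returned threshold and MD values, i.e. flagging recordings whose squared Mahalanobis distance from the global mean exceeds a chi-square quantile, the core computation would look roughly like this sketch (mahalanobis_outliers is a hypothetical name, not the project's API):

```python
import numpy as np
from scipy.stats import chi2

def mahalanobis_outliers(X, chi2thr=0.999):
    # Squared Mahalanobis distance of each row of X from the sample mean.
    mu = X.mean(axis=0)
    cov = np.cov(X, rowvar=False)
    inv_cov = np.linalg.pinv(cov)  # pinv in case the covariance is singular
    diff = X - mu
    MD = np.einsum('ij,jk,ik->i', diff, inv_cov, diff)
    # Under a Gaussian model, squared distances follow a chi-square
    # distribution with one degree of freedom per feature dimension.
    threshold = chi2.ppf(chi2thr, df=X.shape[1])
    return MD, threshold, MD > threshold
```

The per-country percentages in df_global would then be the fraction of each country's recordings flagged this way.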
4827 { | |
4828 "cell_type": "markdown", | |
4829 "metadata": {}, | |
4830 "source": [ | |
4831 "## correlation of outlier results" | |
4832 ] | |
4833 }, | |
4834 { | |
4835 "cell_type": "markdown", | |
4836 "metadata": {}, | |
4837 "source": [ | |
4838 "Let's use Kendal correlation to compare the ranked list of countries sorted by most to least outliers.\n", | |
4839 "<br> First load the ranked list of outlier countries.\n", | |
4840 "<br> Sort by outlier percentage in descending order." | |
4841 ] | |
4842 }, | |
4843 { | |
4844 "cell_type": "code", | |
4845 "execution_count": null, | |
4846 "metadata": { | |
4847 "collapsed": true | |
4848 }, | |
4849 "outputs": [], | |
4850 "source": [ | |
4851 "ranked_countries = pd.DataFrame()\n", | |
4852 "ranked_outliers = pd.DataFrame()\n", | |
4853 "for n in range(n_iters):\n", | |
4854 " df_global = pd.read_csv('../data/outliers_'+str(n)+'.csv')\n", | |
4855 " df_global = df_global.sort_values('Outliers', axis=0, ascending=False, inplace=True)\n", | |
4856 " ranked_countries = pd.concat([ranked_countries, df_global['Country']], axis=1)\n", | |
4857 " ranked_outliers = pd.concat([ranked_outliers, df_global['Outliers']], axis=1)" | |
4858 ] | |
4859 }, | |
4860 { | |
4861 "cell_type": "markdown", | |
4862 "metadata": {}, | |
4863 "source": [ | |
4864 "Remove countries with 0% outliers as these are in random (probably alphabetical) order." | |
4865 ] | |
4866 }, | |
4867 { | |
4868 "cell_type": "code", | |
4869 "execution_count": null, | |
4870 "metadata": { | |
4871 "collapsed": true | |
4872 }, | |
4873 "outputs": [], | |
4874 "source": [ | |
4875 "zero_idx = np.where(np.sum(ranked_outliers, axis=1)==0)[0]\n", | |
4876 "first_zero_idx = np.min(zero_idx)\n", | |
4877 "ranked_countries = ranked_countries.iloc[:first_zero_idx, :]\n", | |
4878 "ranked_outliers = ranked_outliers.iloc[:first_zero_idx, :]\n", | |
4879 "\n", | |
4880 "print ranked_countries.head()\n", | |
4881 "print ranked_outliers.head()" | |
4882 ] | |
4883 }, | |
4884 { | |
4885 "cell_type": "markdown", | |
4886 "metadata": {}, | |
4887 "source": [ | |
4888 "And now kendalltau correlation" | |
4889 ] | |
4890 }, | |
4891 { | |
4892 "cell_type": "code", | |
4893 "execution_count": 33, | |
4894 "metadata": {}, | |
4895 "outputs": [ | |
4896 { | |
4897 "name": "stdout", | |
4898 "output_type": "stream", | |
4899 "text": [ | |
4900 "KendalltauResult(correlation=0.99999999999999989, pvalue=2.5428927239036995e-67)\n" | |
4901 ] | |
4902 } | |
4903 ], | |
4904 "source": [ | |
4905 "from scipy.stats import kendalltau\n", | |
4906 "for i in range(len(ranked_countries)-1):\n", | |
4907 " for j in range(i+1, len(ranked_countries)):\n", | |
4908 " print kendalltau(ranked_countries.iloc[:, i], ranked_countries.iloc[:, j])" | |
4909 ] | |
4910 }, | |
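kendalltau accepts the country-name columns directly: it ranks each sequence and counts concordant versus discordant pairs, so two identical orderings give tau = 1, as printed above. A toy check with made-up rankings (rank_b is a hypothetical variant that swaps one adjacent pair of rank_a):

```python
from scipy.stats import kendalltau

# Two hypothetical rankings of the same five countries.
rank_a = ['Botswana', 'Ivory Coast', 'Gambia', 'Benin', 'Fiji']
rank_b = ['Botswana', 'Gambia', 'Ivory Coast', 'Benin', 'Fiji']

tau, p = kendalltau(rank_a, rank_b)
print(tau)  # 1 discordant pair out of 10 -> tau = 0.8
```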
4911 { | |
4912 "cell_type": "code", | |
4913 "execution_count": 34, | |
4914 "metadata": {}, | |
4915 "outputs": [ | |
4916 { | |
4917 "data": { | |
4918 "text/plain": [ | |
4919 "SpearmanrResult(correlation=1.0, pvalue=0.0)" | |
4920 ] | |
4921 }, | |
4922 "execution_count": 34, | |
4923 "metadata": {}, | |
4924 "output_type": "execute_result" | |
4925 } | |
4926 ], | |
4927 "source": [ | |
4928 "from scipy.stats import spearmanr\n", | |
4929 "spearmanr(ranked_countries)" | |
4930 ] | |
4931 }, | |
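Unlike the pairwise kendalltau loop, spearmanr takes the whole DataFrame at once: each column is treated as one variable, and with more than two columns it returns a full correlation matrix rather than the single coefficient shown above. A small numeric example:

```python
import numpy as np
from scipy.stats import spearmanr

# Three rankings of four items: the first two agree exactly,
# the third swaps the top two items.
ranks = np.array([[1, 1, 2],
                  [2, 2, 1],
                  [3, 3, 3],
                  [4, 4, 4]])
rho, p = spearmanr(ranks)
print(rho)  # 3x3 matrix; rho[0, 1] == 1.0, rho[0, 2] == 0.8
```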
4932 { | |
4933 "cell_type": "code", | |
4934 "execution_count": null, | |
4935 "metadata": { | |
4936 "collapsed": true | |
4937 }, | |
4938 "outputs": [], | |
4939 "source": [] | |
4647 } | 4940 } |
4648 ], | 4941 ], |
4649 "metadata": { | 4942 "metadata": { |
4650 "kernelspec": { | 4943 "kernelspec": { |
4651 "display_name": "Python 2", | 4944 "display_name": "Python 2", |