comparison notebooks/sensitivity_experiment.ipynb @ 42:90f8a2ea6f6f branch-tests

notebook results and load_features minor edits
author mpanteli <m.x.panteli@gmail.com>
date Fri, 15 Sep 2017 16:17:17 +0100
parents e4736064d282
children 081ff4ea7da7
comparison
equal deleted inserted replaced
41:57f53b0d1eaa 42:90f8a2ea6f6f
1 { 1 {
2 "cells": [ 2 "cells": [
3 { 3 {
4 "cell_type": "code", 4 "cell_type": "code",
5 "execution_count": 58, 5 "execution_count": 15,
6 "metadata": {}, 6 "metadata": {},
7 "outputs": [ 7 "outputs": [
8 { 8 {
9 "name": "stdout", 9 "name": "stdout",
10 "output_type": "stream", 10 "output_type": "stream",
14 ] 14 ]
15 } 15 }
16 ], 16 ],
17 "source": [ 17 "source": [
18 "import numpy as np\n", 18 "import numpy as np\n",
19 "import pandas as pd\n",
19 "\n", 20 "\n",
20 "%matplotlib inline\n", 21 "%matplotlib inline\n",
21 "import matplotlib.pyplot as plt\n", 22 "import matplotlib.pyplot as plt\n",
22 "\n", 23 "\n",
23 "%load_ext autoreload\n", 24 "%load_ext autoreload\n",
25 "\n", 26 "\n",
26 "import sys\n", 27 "import sys\n",
27 "sys.path.append('../')\n", 28 "sys.path.append('../')\n",
28 "import scripts.load_dataset as load_dataset\n", 29 "import scripts.load_dataset as load_dataset\n",
29 "import scripts.map_and_average as mapper\n", 30 "import scripts.map_and_average as mapper\n",
30 "import scripts.classification\n", 31 "import scripts.classification as classification\n",
31 "import scripts.outliers as outliers" 32 "import scripts.outliers as outliers"
32 ] 33 ]
33 }, 34 },
34 { 35 {
35 "cell_type": "code", 36 "cell_type": "code",
36 "execution_count": 46, 37 "execution_count": 2,
37 "metadata": {}, 38 "metadata": {
39 "collapsed": true
40 },
38 "outputs": [], 41 "outputs": [],
39 "source": [ 42 "source": [
40 "OUTPUT_FILES = load_dataset.OUTPUT_FILES\n", 43 "OUTPUT_FILES = load_dataset.OUTPUT_FILES\n",
41 "n_iters = 10\n", 44 "n_iters = 10"
42 "df = load_dataset.sample_dataset(csv_file=load_dataset.METADATA_FILE)"
43 ] 45 ]
44 }, 46 },
45 { 47 {
46 "cell_type": "code", 48 "cell_type": "code",
47 "execution_count": 5, 49 "execution_count": 5,
57 "metadata": {}, 59 "metadata": {},
58 "output_type": "execute_result" 60 "output_type": "execute_result"
59 } 61 }
60 ], 62 ],
61 "source": [ 63 "source": [
64 "df = load_dataset.sample_dataset(csv_file=load_dataset.METADATA_FILE)\n",
62 "df.shape" 65 "df.shape"
63 ] 66 ]
64 }, 67 },
65 { 68 {
66 "cell_type": "code", 69 "cell_type": "code",
4610 " load_dataset.features_for_train_test_sets(df, write_output=True)" 4613 " load_dataset.features_for_train_test_sets(df, write_output=True)"
4611 ] 4614 ]
4612 }, 4615 },
4613 { 4616 {
4614 "cell_type": "code", 4617 "cell_type": "code",
4615 "execution_count": null, 4618 "execution_count": 3,
4616 "metadata": { 4619 "metadata": {},
4617 "collapsed": true 4620 "outputs": [
4618 }, 4621 {
4619 "outputs": [], 4622 "name": "stdout",
4623 "output_type": "stream",
4624 "text": [
4625 "iteration 0\n",
4626 "mapping...\n",
4627 "/import/c4dm-04/mariap/train_data_melodia_8_0.pickle\n",
4628 "(203219, 840) (68100, 840) (67143, 840)\n",
4629 "mapping rhy\n",
4630 "training with PCA transform...\n",
4631 "variance explained 1.0\n",
4632 "140 400\n",
4633 "training with PCA transform...\n",
4634 "variance explained 0.990203912455\n",
4635 "training with LDA transform...\n"
4636 ]
4637 },
4638 {
4639 "name": "stderr",
4640 "output_type": "stream",
4641 "text": [
4642 "/homes/mp305/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.py:526: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
4643 " y = column_or_1d(y, warn=True)\n",
4644 "/homes/mp305/anaconda/lib/python2.7/site-packages/sklearn/discriminant_analysis.py:455: UserWarning: The priors do not sum to 1. Renormalizing\n",
4645 " UserWarning)\n"
4646 ]
4647 },
4648 {
4649 "name": "stdout",
4650 "output_type": "stream",
4651 "text": [
4652 "variance explained 1.0\n",
4653 "transform test data...\n",
4654 "mapping mel\n",
4655 "training with PCA transform...\n",
4656 "variance explained 1.0\n",
4657 "214 240\n",
4658 "training with PCA transform...\n",
4659 "variance explained 0.990094273777\n",
4660 "training with LDA transform...\n",
4661 "variance explained 1.0\n",
4662 "transform test data...\n",
4663 "mapping mfc\n",
4664 "training with PCA transform...\n",
4665 "variance explained 1.0\n",
4666 "39 80\n",
4667 "training with PCA transform...\n",
4668 "variance explained 0.9914399357\n",
4669 "training with LDA transform...\n",
4670 "variance explained 0.941390777379\n",
4671 "transform test data...\n",
4672 "mapping chr\n",
4673 "training with PCA transform...\n",
4674 "variance explained 1.0\n",
4675 "70 120\n",
4676 "training with PCA transform...\n",
4677 "variance explained 0.990511935176\n",
4678 "training with LDA transform...\n",
4679 "variance explained 0.953613938607\n",
4680 "transform test data...\n"
4681 ]
4682 },
4683 {
4684 "ename": "ValueError",
4685 "evalue": "all the input array dimensions except for the concatenation axis must match exactly",
4686 "output_type": "error",
4687 "traceback": [
4688 "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
4689 "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
4690 "\u001b[0;32m<ipython-input-3-971892d5bd8d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 6\u001b[0m output_file in OUTPUT_FILES]\n\u001b[1;32m 7\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mldadata_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mYaudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlda_map_and_average_frames\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin_variance\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.99\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcatenate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mldadata_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;31m# classification and confusion\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
4691 "\u001b[0;31mValueError\u001b[0m: all the input array dimensions except for the concatenation axis must match exactly"
4692 ]
4693 }
4694 ],
4620 "source": [ 4695 "source": [
4621 "for n in range(n_iters):\n", 4696 "for n in range(n_iters):\n",
4622 " print \"iteration %d\" % n\n", 4697 " print \"iteration %d\" % n\n",
4623 " \n", 4698 " \n",
4624 " print \"mapping...\"\n", 4699 " print \"mapping...\"\n",
4625 " mapper.INPUT_FILES = [output_file.split('.pickle')[0]+'_'+str(n)+'.pickle' for \n", 4700 " mapper.INPUT_FILES = [output_file.split('.pickle')[0]+'_'+str(n)+'.pickle' for \n",
4626 " output_file in OUTPUT_FILES]\n", 4701 " output_file in OUTPUT_FILES]\n",
4627 " _, _, ldadata_list, _, _, Y, Yaudio = mapper.lda_map_and_average_frames(min_variance=0.99)\n", 4702 " _, _, ldadata_list, _, _, Y, Yaudio = mapper.lda_map_and_average_frames(min_variance=0.99)\n",
4628 " X = np.concatenate(ldadata_list)\n", 4703 " X = np.concatenate(ldadata_list, axis=1)\n",
4629 " \n", 4704 " \n",
4630 " # classification and confusion\n", 4705 " # classification and confusion\n",
4631 " print \"classifying...\"\n", 4706 " print \"classifying...\"\n",
4632 " traininds, testinds = classification.get_train_test_indices()\n", 4707 " traininds, testinds = classification.get_train_test_indices()\n",
4633 " X_train, Y_train, X_test, Y_test = classification.get_train_test_sets(X, Y, traininds, testinds)\n", 4708 " X_train, Y_train, X_test, Y_test = classification.get_train_test_sets(X, Y, traininds, testinds)\n",
4634 " accuracy, _ = classification.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n", 4709 " accuracy, _ = classification.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n",
4635 " print accuracy\n", 4710 " print accuracy\n",
4636 " \n", 4711 " \n",
4637 " # outliers\n", 4712 " # outliers\n",
4638 " print \"detecting outliers...\"\n", 4713 " print \"detecting outliers...\"\n",
4639 " ddf = outliers.load_metadata(Yaudio, metadata_file=load_dataset.METADATA_FILE)\n", 4714 " #ddf = outliers.load_metadata(Yaudio, metadata_file=load_dataset.METADATA_FILE)\n",
4640 " df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)\n", 4715 " df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n",
4641 " print_most_least_outliers_topN(df_global, N=10)\n", 4716 " outliers.print_most_least_outliers_topN(df_global, N=10)\n",
4642 " \n", 4717 " \n",
4643 " # write output\n", 4718 " # write output\n",
4644 " print \"writing file\"\n", 4719 " print \"writing file\"\n",
4645 " df_global.to_csv('../data/outliers_'+str(n)+'.csv', index=False)" 4720 " df_global.to_csv('../data/outliers_'+str(n)+'.csv', index=False)"
4646 ] 4721 ]
4722 },
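Why the fix in this cell works: lda_map_and_average_frames returns one LDA-transformed block per feature type (rhy, mel, mfc, chr), all with the same number of rows (one per recording) but different column counts, so the default axis=0 concatenation raises the ValueError shown above; axis=1 stacks the blocks side by side into one feature matrix. A minimal sketch with made-up block widths (only the total is grounded: the notebook's actual result below is X.shape == (8089, 381)):

    import numpy as np
    # hypothetical per-feature LDA outputs: same row count, different widths
    ldadata_list = [np.zeros((8089, 140)), np.zeros((8089, 132)),
                    np.zeros((8089, 39)), np.zeros((8089, 70))]
    # np.concatenate(ldadata_list)            # ValueError: axis-0 shapes differ
    X = np.concatenate(ldadata_list, axis=1)  # feature blocks side by side
    print X.shape                             # (8089, 381)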
4723 {
4724 "cell_type": "code",
4725 "execution_count": 4,
4726 "metadata": {
4727 "collapsed": true
4728 },
4729 "outputs": [],
4730 "source": [
4731 "X = np.concatenate(ldadata_list, axis=1)"
4732 ]
4733 },
4734 {
4735 "cell_type": "code",
4736 "execution_count": 5,
4737 "metadata": {},
4738 "outputs": [
4739 {
4740 "data": {
4741 "text/plain": [
4742 "(8089, 381)"
4743 ]
4744 },
4745 "execution_count": 5,
4746 "metadata": {},
4747 "output_type": "execute_result"
4748 }
4749 ],
4750 "source": [
4751 "X.shape"
4752 ]
4753 },
4754 {
4755 "cell_type": "code",
4756 "execution_count": 10,
4757 "metadata": {},
4758 "outputs": [
4759 {
4760 "name": "stdout",
4761 "output_type": "stream",
4762 "text": [
4763 "0.176354062249\n"
4764 ]
4765 },
4766 {
4767 "name": "stderr",
4768 "output_type": "stream",
4769 "text": [
4770 "/homes/mp305/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n",
4771 " 'precision', 'predicted', average, warn_for)\n"
4772 ]
4773 }
4774 ],
4775 "source": [
4776 "#traininds, testinds = classification.get_train_test_indices()\n",
4777 "traininds = np.arange(5000)\n",
4778 "testinds = np.arange(len(X)-1600, len(X))\n",
4779 "X_train, Y_train, X_test, Y_test = classification.get_train_test_sets(X, Y, traininds, testinds)\n",
4780 "accuracy, _ = classification.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n",
4781 "print accuracy"
4782 ]
4783 },
4784 {
4785 "cell_type": "code",
4786 "execution_count": 13,
4787 "metadata": {},
4788 "outputs": [
4789 {
4790 "name": "stdout",
4791 "output_type": "stream",
4792 "text": [
4793 "detecting outliers...\n",
4794 "most outliers \n",
4795 " Country Outliers\n",
4796 "136 Botswana 0.590909\n",
4797 "71 Ivory Coast 0.571429\n",
4798 "86 Gambia 0.541667\n",
4799 "43 Benin 0.538462\n",
4800 "62 Fiji 0.466667\n",
4801 "20 Pakistan 0.461538\n",
4802 "65 Uganda 0.437500\n",
4803 "14 Liberia 0.425000\n",
4804 "78 El Salvador 0.424242\n",
4805 "51 Western Sahara 0.421687\n",
4806 "least outliers \n",
4807 " Country Outliers\n",
4808 "119 Denmark 0.000000\n",
4809 "30 Afghanistan 0.000000\n",
4810 "113 Iceland 0.000000\n",
4811 "28 Tajikistan 0.000000\n",
4812 "74 Czech Republic 0.000000\n",
4813 "27 South Korea 0.000000\n",
4814 "1 Lithuania 0.000000\n",
4815 "15 Netherlands 0.014925\n",
4816 "121 Poland 0.040000\n",
4817 "134 Paraguay 0.043478\n"
4818 ]
4819 }
4820 ],
4821 "source": [
4822 "print \"detecting outliers...\"\n",
4823 "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n",
4824 "outliers.print_most_least_outliers_topN(df_global, N=10)"
4825 ]
4826 },
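The diff does not show the internals of outliers.get_outliers_df, but the chi2thr=0.999 argument and the (df_global, threshold, MD) return values are consistent with the standard construction: squared Mahalanobis distances of recordings from the global mean, thresholded at the 0.999 quantile of a chi-square distribution with one degree of freedom per feature dimension. A re-implementation sketch under that assumption, not the project's actual code:

    import numpy as np
    from scipy.stats import chi2

    def mahalanobis_outliers(X, chi2thr=0.999):
        # squared distance of each row from the mean, scaled by the covariance
        mu = X.mean(axis=0)
        cov_inv = np.linalg.pinv(np.cov(X, rowvar=False))
        diff = X - mu
        MD = np.einsum('ij,jk,ik->i', diff, cov_inv, diff)
        # under a Gaussian model, squared MD ~ chi2 with d degrees of freedom
        threshold = chi2.ppf(chi2thr, df=X.shape[1])
        return MD, threshold, MD > threshold

The per-country 'Outliers' fractions in df_global would then be the share of each country's recordings whose distance exceeds the threshold.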
4827 {
4828 "cell_type": "markdown",
4829 "metadata": {},
4830 "source": [
4831 "## correlation of outlier results"
4832 ]
4833 },
4834 {
4835 "cell_type": "markdown",
4836 "metadata": {},
4837 "source": [
4838 "Let's use Kendal correlation to compare the ranked list of countries sorted by most to least outliers.\n",
4839 "<br> First load the ranked list of outlier countries.\n",
4840 "<br> Sort by outlier percentage in descending order."
4841 ]
4842 },
4843 {
4844 "cell_type": "code",
4845 "execution_count": null,
4846 "metadata": {
4847 "collapsed": true
4848 },
4849 "outputs": [],
4850 "source": [
4851 "ranked_countries = pd.DataFrame()\n",
4852 "ranked_outliers = pd.DataFrame()\n",
4853 "for n in range(n_iters):\n",
4854 " df_global = pd.read_csv('../data/outliers_'+str(n)+'.csv')\n",
4855 " df_global = df_global.sort_values('Outliers', axis=0, ascending=False, inplace=True)\n",
4856 " ranked_countries = pd.concat([ranked_countries, df_global['Country']], axis=1)\n",
4857 " ranked_outliers = pd.concat([ranked_outliers, df_global['Outliers']], axis=1)"
4858 ]
4859 },
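A note on the reset_index(drop=True) in the cell above: pd.concat(..., axis=1) aligns rows by index label, not by position, so concatenating a sorted frame that still carries its pre-sort index would silently undo the ranking (and the original inplace=True variant returned None, so the cell could not run at all). A toy illustration of the alignment pitfall, with hypothetical values:

    import pandas as pd
    a = pd.Series(['x', 'y'], index=[0, 1])
    b = pd.Series(['y', 'x'], index=[1, 0])  # "sorted": index now shuffled
    print pd.concat([a, b], axis=1)          # aligned on index: sort undone
    print pd.concat([a, b.reset_index(drop=True)], axis=1)  # keeps b's order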
4860 {
4861 "cell_type": "markdown",
4862 "metadata": {},
4863 "source": [
4864 "Remove countries with 0% outliers as these are in random (probably alphabetical) order."
4865 ]
4866 },
4867 {
4868 "cell_type": "code",
4869 "execution_count": null,
4870 "metadata": {
4871 "collapsed": true
4872 },
4873 "outputs": [],
4874 "source": [
4875 "zero_idx = np.where(np.sum(ranked_outliers, axis=1)==0)[0]\n",
4876 "first_zero_idx = np.min(zero_idx)\n",
4877 "ranked_countries = ranked_countries.iloc[:first_zero_idx, :]\n",
4878 "ranked_outliers = ranked_outliers.iloc[:first_zero_idx, :]\n",
4879 "\n",
4880 "print ranked_countries.head()\n",
4881 "print ranked_outliers.head()"
4882 ]
4883 },
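One edge case worth guarding in the cell above: np.min raises on an empty array, so the cell fails if no row sums to zero across all iterations. A guarded variant (assumption: keep all rows when there is no zero-outlier tail):

    zero_idx = np.where(np.sum(ranked_outliers, axis=1) == 0)[0]
    first_zero_idx = np.min(zero_idx) if len(zero_idx) > 0 else len(ranked_outliers)
    ranked_countries = ranked_countries.iloc[:first_zero_idx, :]
    ranked_outliers = ranked_outliers.iloc[:first_zero_idx, :]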
4884 {
4885 "cell_type": "markdown",
4886 "metadata": {},
4887 "source": [
4888 "And now kendalltau correlation"
4889 ]
4890 },
4891 {
4892 "cell_type": "code",
4893 "execution_count": 33,
4894 "metadata": {},
4895 "outputs": [
4896 {
4897 "name": "stdout",
4898 "output_type": "stream",
4899 "text": [
4900 "KendalltauResult(correlation=0.99999999999999989, pvalue=2.5428927239036995e-67)\n"
4901 ]
4902 }
4903 ],
4904 "source": [
4905 "from scipy.stats import kendalltau\n",
4906 "for i in range(len(ranked_countries)-1):\n",
4907 " for j in range(i+1, len(ranked_countries)):\n",
4908 " print kendalltau(ranked_countries.iloc[:, i], ranked_countries.iloc[:, j])"
4909 ]
4910 },
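Passing country-name columns straight to kendalltau relies on scipy being willing to order strings, which evidently worked in the scipy version used here but is fragile. A more explicit variant (hypothetical, assuming the truncated lists all contain the same set of countries) maps each name to its position in the first ranking and correlates integer ranks instead:

    from scipy.stats import kendalltau
    base = {c: r for r, c in enumerate(ranked_countries.iloc[:, 0])}
    for j in range(1, ranked_countries.shape[1]):
        ranks_j = [base[c] for c in ranked_countries.iloc[:, j]]
        print kendalltau(range(len(ranks_j)), ranks_j)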
4911 {
4912 "cell_type": "code",
4913 "execution_count": 34,
4914 "metadata": {},
4915 "outputs": [
4916 {
4917 "data": {
4918 "text/plain": [
4919 "SpearmanrResult(correlation=1.0, pvalue=0.0)"
4920 ]
4921 },
4922 "execution_count": 34,
4923 "metadata": {},
4924 "output_type": "execute_result"
4925 }
4926 ],
4927 "source": [
4928 "from scipy.stats import spearmanr\n",
4929 "spearmanr(ranked_countries)"
4930 ]
4931 },
4932 {
4933 "cell_type": "code",
4934 "execution_count": null,
4935 "metadata": {
4936 "collapsed": true
4937 },
4938 "outputs": [],
4939 "source": []
4647 } 4940 }
4648 ], 4941 ],
4649 "metadata": { 4942 "metadata": {
4650 "kernelspec": { 4943 "kernelspec": {
4651 "display_name": "Python 2", 4944 "display_name": "Python 2",