Mercurial > hg > plosone_underreview
diff notebooks/sensitivity_experiment.ipynb @ 55:98cd5317e504 branch-tests
updated notebooks
author | mpanteli <m.x.panteli@gmail.com> |
---|---|
date | Tue, 19 Sep 2017 21:27:09 +0100 |
parents | d3de9ac0d545 |
children | 444041185ba9 a6606b255ad7 |
line wrap: on
line diff
--- a/notebooks/sensitivity_experiment.ipynb Tue Sep 19 18:41:14 2017 +0100 +++ b/notebooks/sensitivity_experiment.ipynb Tue Sep 19 21:27:09 2017 +0100 @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "collapsed": true }, @@ -4546,36 +4546,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "iteration 0\n", - "mapping...\n", - "/import/c4dm-04/mariap/train_data_melodia_8_0.pickle\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m<ipython-input-7-f093c6f2c550>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m mapper.OUTPUT_FILES = [output_file.split('.pickle')[0]+'_'+str(n)+'.pickle' for \n\u001b[1;32m 8\u001b[0m output_file in MAPPER_OUTPUT_FILES]\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mldadata_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mYaudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlda_map_and_average_frames\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin_variance\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.99\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0mmapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mldadata_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mYaudio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/homes/mp305/code/pythoncode/plosone_underreview/scripts/map_and_average.pyc\u001b[0m in \u001b[0;36mlda_map_and_average_frames\u001b[0;34m(dataset, n_components, min_variance)\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mlda_map_and_average_frames\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_components\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmin_variance\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 150\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdataset\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 151\u001b[0;31m \u001b[0mtrainset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtestset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_train_val_test_sets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 152\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 153\u001b[0m \u001b[0mtrainset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtestset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdataset\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/homes/mp305/code/pythoncode/plosone_underreview/scripts/map_and_average.pyc\u001b[0m in \u001b[0;36mload_train_val_test_sets\u001b[0;34m()\u001b[0m\n\u001b[1;32m 69\u001b[0m '''\n\u001b[1;32m 70\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0mINPUT_FILES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 71\u001b[0;31m \u001b[0mtrainset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_data_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mINPUT_FILES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 72\u001b[0m \u001b[0mvalset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_data_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mINPUT_FILES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[0mtestset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_data_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mINPUT_FILES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/homes/mp305/code/pythoncode/plosone_underreview/scripts/map_and_average.pyc\u001b[0m in \u001b[0;36mload_data_from_pickle\u001b[0;34m(pickle_file)\u001b[0m\n\u001b[1;32m 57\u001b[0m '''\n\u001b[1;32m 58\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpickle_file\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 59\u001b[0;31m \u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maudiolabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 60\u001b[0m \u001b[0;31m# remove 'unknown' and 'unidentified' country\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maudiolabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mremove_inds\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maudiolabels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/homes/mp305/anaconda/lib/python2.7/pickle.pyc\u001b[0m in \u001b[0;36mload\u001b[0;34m(file)\u001b[0m\n\u001b[1;32m 1382\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1383\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1384\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mUnpickler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1385\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1386\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/homes/mp305/anaconda/lib/python2.7/pickle.pyc\u001b[0m in \u001b[0;36mload\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 862\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 864\u001b[0;31m \u001b[0mdispatch\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 865\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0m_Stop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstopinst\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 866\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mstopinst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/homes/mp305/anaconda/lib/python2.7/pickle.pyc\u001b[0m in \u001b[0;36mload_string\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 966\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 967\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 968\u001b[0;31m \u001b[0mrep\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 969\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mq\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m\"\\\"'\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# double or single quote\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 970\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrep\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], + "outputs": [], "source": [ "MAPPER_OUTPUT_FILES = mapper.OUTPUT_FILES\n", "for n in range(n_iters):\n", @@ -4599,37 +4572,238 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "iteration 0\n" - ] - }, - { - "ename": "IOError", - "evalue": "[Errno 2] No such file or directory: '/import/c4dm-04/mariap/nmf_data_melodia_8_0.pickle'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mIOError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m<ipython-input-5-eb8ccb858c3f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mmapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mOUTPUT_FILES\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCLASS_INPUT_FILES\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mmapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mINPUT_FILES\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mOUTPUT_FILES\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mldadata_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mYaudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclassification\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_data_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mCLASS_INPUT_FILES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcatenate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mldadata_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;31m# classification and confusion\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/homes/mp305/code/pythoncode/plosone_underreview/scripts/classification.pyc\u001b[0m in \u001b[0;36mload_data_from_pickle\u001b[0;34m(filename)\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload_data_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m \u001b[0mX_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mYaudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 21\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcatenate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mYaudio\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mIOError\u001b[0m: [Errno 2] No such file or directory: '/import/c4dm-04/mariap/nmf_data_melodia_8_0.pickle'" + "iteration 0\n", + "classifying...\n", + "/import/c4dm-04/mariap/train_data_melodia_8_0.pickle\n", + "0.17294625462\n", + "detecting outliers...\n", + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "136 Botswana 0.590909 88 52\n", + "31 Ivory Coast 0.571429 14 8\n", + "86 Gambia 0.541667 48 26\n", + "42 Benin 0.538462 26 14\n", + "102 Fiji 0.466667 15 7\n", + "20 Pakistan 0.461538 91 42\n", + "64 Uganda 0.437500 80 35\n", + "14 Liberia 0.425000 40 17\n", + "78 El Salvador 0.424242 33 14\n", + "50 Western Sahara 0.421687 83 35\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "1 Lithuania 0.000000 47 0\n", + "30 Afghanistan 0.000000 24 0\n", + "28 Tajikistan 0.000000 19 0\n", + "27 South Korea 0.000000 11 0\n", + "113 Iceland 0.000000 14 0\n", + "119 Denmark 0.000000 16 0\n", + "74 Czech Republic 0.000000 41 0\n", + "15 Netherlands 0.014925 67 1\n", + "121 Poland 0.040000 100 4\n", + "134 Paraguay 0.043478 23 1\n", + "writing file\n", + "iteration 1\n", + "classifying...\n", + "/import/c4dm-04/mariap/train_data_melodia_8_1.pickle\n", + "0.149811300704\n", + "detecting outliers...\n", + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "60 Chad 0.545455 11 6\n", + "62 Fiji 0.533333 15 8\n", + "86 Gambia 0.520833 48 25\n", + "21 Pakistan 0.500000 88 44\n", + "43 Benin 0.500000 26 13\n", + "32 Ivory Coast 0.500000 14 7\n", + "136 Botswana 0.488095 84 41\n", + "78 El Salvador 0.484848 33 16\n", + "106 Nepal 0.436782 87 38\n", + "135 French Guiana 0.428571 28 12\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "1 Lithuania 0.000000 47 0\n", + "113 Iceland 0.000000 14 0\n", + "119 Denmark 0.000000 16 0\n", + "74 Czech Republic 0.000000 41 0\n", + "28 South Korea 0.000000 11 0\n", + "16 Netherlands 0.029851 67 2\n", + "31 Afghanistan 0.041667 24 1\n", + "134 Paraguay 0.043478 23 1\n", + "105 Sudan 0.045455 66 3\n", + "120 Kazakhstan 0.045455 88 4\n", + "writing file\n", + "iteration 2\n", + "classifying...\n", + "/import/c4dm-04/mariap/train_data_melodia_8_2.pickle\n", + "0.178052269426\n", + "detecting outliers...\n", + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "136 Botswana 0.615385 78 48\n", + "86 Gambia 0.520833 48 25\n", + "72 Ivory Coast 0.500000 14 7\n", + "62 Fiji 0.466667 15 7\n", + "43 Benin 0.461538 26 12\n", + "20 Pakistan 0.451613 93 42\n", + "17 French Guiana 0.428571 28 12\n", + "14 Liberia 0.425000 40 17\n", + "78 El Salvador 0.424242 33 14\n", + "51 Western Sahara 0.414634 82 34\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "119 Denmark 0.000000 16 0\n", + "113 Iceland 0.000000 14 0\n", + "27 South Korea 0.000000 11 0\n", + "1 Lithuania 0.000000 47 0\n", + "31 Czech Republic 0.024390 41 1\n", + "15 Netherlands 0.029851 67 2\n", + "30 Afghanistan 0.041667 24 1\n", + "105 Sudan 0.045455 66 3\n", + "120 Kazakhstan 0.045455 88 4\n", + "100 Antigua and Barbuda 0.047619 42 2\n", + "writing file\n", + "iteration 3\n", + "classifying...\n", + "/import/c4dm-04/mariap/train_data_melodia_8_3.pickle\n", + "0.177243715126\n", + "detecting outliers...\n", + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "136 Botswana 0.617284 81 50\n", + "31 Ivory Coast 0.571429 14 8\n", + "86 Gambia 0.541667 48 26\n", + "43 Benin 0.538462 26 14\n", + "62 Fiji 0.533333 15 8\n", + "20 Pakistan 0.468750 96 45\n", + "51 Western Sahara 0.439024 82 36\n", + "14 Liberia 0.425000 40 17\n", + "78 El Salvador 0.424242 33 14\n", + "106 Nepal 0.416667 96 40\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "113 Iceland 0.000000 14 0\n", + "30 Afghanistan 0.000000 24 0\n", + "119 Denmark 0.000000 16 0\n", + "134 Paraguay 0.000000 23 0\n", + "27 South Korea 0.000000 11 0\n", + "1 Lithuania 0.000000 47 0\n", + "100 Antigua and Barbuda 0.023810 42 1\n", + "74 Czech Republic 0.024390 41 1\n", + "15 Netherlands 0.029851 67 2\n", + "105 Sudan 0.045455 66 3\n", + "writing file\n", + "iteration 4\n", + "classifying...\n", + "/import/c4dm-04/mariap/train_data_melodia_8_4.pickle\n", + "0.186733308352\n", + "detecting outliers...\n", + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "60 Chad 0.727273 11 8\n", + "136 Botswana 0.630952 84 53\n", + "72 Ivory Coast 0.571429 14 8\n", + "62 Fiji 0.533333 15 8\n", + "86 Gambia 0.520833 48 25\n", + "43 Benin 0.500000 26 13\n", + "20 Pakistan 0.468085 94 44\n", + "135 French Guiana 0.464286 28 13\n", + "64 Mozambique 0.441176 34 15\n", + "51 Western Sahara 0.439024 82 36\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "1 Lithuania 0.000000 47 0\n", + "27 South Korea 0.000000 11 0\n", + "113 Iceland 0.000000 14 0\n", + "119 Denmark 0.000000 16 0\n", + "15 Netherlands 0.014925 67 1\n", + "31 Czech Republic 0.024390 41 1\n", + "112 Israel 0.030000 100 3\n", + "30 Afghanistan 0.041667 24 1\n", + "134 Paraguay 0.043478 23 1\n", + "105 Sudan 0.045455 66 3\n", + "writing file\n", + "iteration 5\n", + "classifying...\n", + "/import/c4dm-04/mariap/train_data_melodia_8_5.pickle\n", + "0.163125082162\n", + "detecting outliers...\n", + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "136 Botswana 0.607143 84 51\n", + "72 Ivory Coast 0.571429 14 8\n", + "21 Pakistan 0.553191 94 52\n", + "95 Chad 0.545455 11 6\n", + "63 Fiji 0.533333 15 8\n", + "86 Gambia 0.520833 48 25\n", + "44 Benin 0.500000 26 13\n", + "78 El Salvador 0.454545 33 15\n", + "117 Zimbabwe 0.428571 14 6\n", + "66 Uganda 0.418605 86 36\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "119 Denmark 0.000000 16 0\n", + "1 Lithuania 0.000000 47 0\n", + "28 South Korea 0.000000 11 0\n", + "113 Iceland 0.000000 14 0\n", + "32 Czech Republic 0.024390 41 1\n", + "16 Netherlands 0.029851 67 2\n", + "31 Afghanistan 0.041667 24 1\n", + "134 Paraguay 0.043478 23 1\n", + "120 Kazakhstan 0.045455 88 4\n", + "105 Sudan 0.045455 66 3\n", + "writing file\n", + "iteration 6\n", + "classifying...\n", + "/import/c4dm-04/mariap/train_data_melodia_8_6.pickle\n", + "0.179816192246\n", + "detecting outliers...\n", + "most outliers \n", + " Country Outliers N_Country N_Outliers\n", + "136 Botswana 0.574468 94 54\n", + "32 Ivory Coast 0.571429 14 8\n", + "86 Gambia 0.520833 48 25\n", + "21 Pakistan 0.516854 89 46\n", + "62 Fiji 0.466667 15 7\n", + "43 Benin 0.461538 26 12\n", + "95 Chad 0.454545 11 5\n", + "78 El Salvador 0.454545 33 15\n", + "51 Western Sahara 0.439024 82 36\n", + "63 Senegal 0.405405 37 15\n", + "least outliers \n", + " Country Outliers N_Country N_Outliers\n", + "1 Lithuania 0.000000 47 0\n", + "119 Denmark 0.000000 16 0\n", + "28 South Korea 0.000000 11 0\n", + "113 Iceland 0.000000 14 0\n", + "16 Netherlands 0.014925 67 1\n", + "74 Czech Republic 0.024390 41 1\n", + "13 Germany 0.040000 100 4\n", + "31 Afghanistan 0.041667 24 1\n", + "105 Sudan 0.045455 66 3\n", + "120 Kazakhstan 0.045455 88 4\n", + "writing file\n" ] } ], "source": [ + "n_iters = 7\n", + "OUTPUT_FILES = load_dataset.OUTPUT_FILES\n", + "MAPPER_OUTPUT_FILES = mapper.OUTPUT_FILES\n", "for n in range(n_iters):\n", " print \"iteration %d\" % n\n", " CLASS_INPUT_FILES = [output_file.split('.pickle')[0]+'_'+str(n)+'.pickle' for \n", - " output_file in mapper.OUTPUT_FILES]\n", - " mapper.INPUT_FILES = OUTPUT_FILES\n", - " ldadata_list, Y, Yaudio = classification.load_data_from_pickle(CLASS_INPUT_FILES[2])\n", - " X = np.concatenate(ldadata_list, axis=1)\n", + " output_file in MAPPER_OUTPUT_FILES]\n", + " mapper.INPUT_FILES = [output_file.split('.pickle')[0]+'_'+str(n)+'.pickle' for \n", + " output_file in OUTPUT_FILES]\n", + " X, Y, Yaudio = classification.load_data_from_pickle(CLASS_INPUT_FILES[0])\n", + " #X = np.concatenate(ldadata_list, axis=1)\n", " # classification and confusion\n", " print \"classifying...\"\n", " traininds, testinds = classification.get_train_test_indices(Yaudio)\n", @@ -4649,110 +4823,6 @@ ] }, { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "X = np.concatenate(ldadata_list, axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(8089, 381)" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.176354062249\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/homes/mp305/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n", - " 'precision', 'predicted', average, warn_for)\n" - ] - } - ], - "source": [ - "#traininds, testinds = classification.get_train_test_indices()\n", - "traininds = np.arange(5000)\n", - "testinds = np.arange(len(X)-1600, len(X))\n", - "X_train, Y_train, X_test, Y_test = classification.get_train_test_sets(X, Y, traininds, testinds)\n", - "accuracy, _ = classification.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n", - "print accuracy" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "detecting outliers...\n", - "most outliers \n", - " Country Outliers\n", - "136 Botswana 0.590909\n", - "71 Ivory Coast 0.571429\n", - "86 Gambia 0.541667\n", - "43 Benin 0.538462\n", - "62 Fiji 0.466667\n", - "20 Pakistan 0.461538\n", - "65 Uganda 0.437500\n", - "14 Liberia 0.425000\n", - "78 El Salvador 0.424242\n", - "51 Western Sahara 0.421687\n", - "least outliers \n", - " Country Outliers\n", - "119 Denmark 0.000000\n", - "30 Afghanistan 0.000000\n", - "113 Iceland 0.000000\n", - "28 Tajikistan 0.000000\n", - "74 Czech Republic 0.000000\n", - "27 South Korea 0.000000\n", - "1 Lithuania 0.000000\n", - "15 Netherlands 0.014925\n", - "121 Poland 0.040000\n", - "134 Paraguay 0.043478\n" - ] - } - ], - "source": [ - "print \"detecting outliers...\"\n", - "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n", - "outliers.print_most_least_outliers_topN(df_global, N=10)" - ] - }, - { "cell_type": "markdown", "metadata": {}, "source": [