Mercurial > hg > plosone_underreview
comparison notebooks/sensitivity_experiment.ipynb @ 55:98cd5317e504 branch-tests
updated notebooks
author | mpanteli <m.x.panteli@gmail.com> |
---|---|
date | Tue, 19 Sep 2017 21:27:09 +0100 |
parents | d3de9ac0d545 |
children | 444041185ba9 a6606b255ad7 |
comparison
equal
deleted
inserted
replaced
54:dbcd5b2a4efa | 55:98cd5317e504 |
---|---|
32 "import scripts.outliers as outliers" | 32 "import scripts.outliers as outliers" |
33 ] | 33 ] |
34 }, | 34 }, |
35 { | 35 { |
36 "cell_type": "code", | 36 "cell_type": "code", |
37 "execution_count": 3, | 37 "execution_count": 2, |
38 "metadata": { | 38 "metadata": { |
39 "collapsed": true | 39 "collapsed": true |
40 }, | 40 }, |
41 "outputs": [], | 41 "outputs": [], |
42 "source": [ | 42 "source": [ |
4544 "## Map frames and write output for the lda transformed frames" | 4544 "## Map frames and write output for the lda transformed frames" |
4545 ] | 4545 ] |
4546 }, | 4546 }, |
4547 { | 4547 { |
4548 "cell_type": "code", | 4548 "cell_type": "code", |
4549 "execution_count": 7, | 4549 "execution_count": null, |
4550 "metadata": {}, | 4550 "metadata": {}, |
4551 "outputs": [ | 4551 "outputs": [], |
4552 { | |
4553 "name": "stdout", | |
4554 "output_type": "stream", | |
4555 "text": [ | |
4556 "iteration 0\n", | |
4557 "mapping...\n", | |
4558 "/import/c4dm-04/mariap/train_data_melodia_8_0.pickle\n" | |
4559 ] | |
4560 }, | |
4561 { | |
4562 "ename": "KeyboardInterrupt", | |
4563 "evalue": "", | |
4564 "output_type": "error", | |
4565 "traceback": [ | |
4566 "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
4567 "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", | |
4568 "\u001b[0;32m<ipython-input-7-f093c6f2c550>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m mapper.OUTPUT_FILES = [output_file.split('.pickle')[0]+'_'+str(n)+'.pickle' for \n\u001b[1;32m 8\u001b[0m output_file in MAPPER_OUTPUT_FILES]\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mldadata_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mYaudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlda_map_and_average_frames\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin_variance\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.99\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0mmapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mldadata_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mYaudio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
4569 "\u001b[0;32m/homes/mp305/code/pythoncode/plosone_underreview/scripts/map_and_average.pyc\u001b[0m in \u001b[0;36mlda_map_and_average_frames\u001b[0;34m(dataset, n_components, min_variance)\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mlda_map_and_average_frames\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_components\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmin_variance\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 150\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdataset\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 151\u001b[0;31m \u001b[0mtrainset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtestset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_train_val_test_sets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 152\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 153\u001b[0m \u001b[0mtrainset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtestset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdataset\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
4570 "\u001b[0;32m/homes/mp305/code/pythoncode/plosone_underreview/scripts/map_and_average.pyc\u001b[0m in \u001b[0;36mload_train_val_test_sets\u001b[0;34m()\u001b[0m\n\u001b[1;32m 69\u001b[0m '''\n\u001b[1;32m 70\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0mINPUT_FILES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 71\u001b[0;31m \u001b[0mtrainset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_data_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mINPUT_FILES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 72\u001b[0m \u001b[0mvalset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_data_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mINPUT_FILES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[0mtestset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_data_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mINPUT_FILES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
4571 "\u001b[0;32m/homes/mp305/code/pythoncode/plosone_underreview/scripts/map_and_average.pyc\u001b[0m in \u001b[0;36mload_data_from_pickle\u001b[0;34m(pickle_file)\u001b[0m\n\u001b[1;32m 57\u001b[0m '''\n\u001b[1;32m 58\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpickle_file\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 59\u001b[0;31m \u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maudiolabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 60\u001b[0m \u001b[0;31m# remove 'unknown' and 'unidentified' country\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maudiolabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mremove_inds\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maudiolabels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
4572 "\u001b[0;32m/homes/mp305/anaconda/lib/python2.7/pickle.pyc\u001b[0m in \u001b[0;36mload\u001b[0;34m(file)\u001b[0m\n\u001b[1;32m 1382\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1383\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1384\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mUnpickler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1385\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1386\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
4573 "\u001b[0;32m/homes/mp305/anaconda/lib/python2.7/pickle.pyc\u001b[0m in \u001b[0;36mload\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 862\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 864\u001b[0;31m \u001b[0mdispatch\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 865\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0m_Stop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstopinst\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 866\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mstopinst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
4574 "\u001b[0;32m/homes/mp305/anaconda/lib/python2.7/pickle.pyc\u001b[0m in \u001b[0;36mload_string\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 966\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 967\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 968\u001b[0;31m \u001b[0mrep\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 969\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mq\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m\"\\\"'\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# double or single quote\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 970\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrep\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
4575 "\u001b[0;31mKeyboardInterrupt\u001b[0m: " | |
4576 ] | |
4577 } | |
4578 ], | |
4579 "source": [ | 4552 "source": [ |
4580 "MAPPER_OUTPUT_FILES = mapper.OUTPUT_FILES\n", | 4553 "MAPPER_OUTPUT_FILES = mapper.OUTPUT_FILES\n", |
4581 "for n in range(n_iters):\n", | 4554 "for n in range(n_iters):\n", |
4582 " print \"iteration %d\" % n\n", | 4555 " print \"iteration %d\" % n\n", |
4583 " \n", | 4556 " \n", |
4597 "## Classification only - assuming mapper files are exported " | 4570 "## Classification only - assuming mapper files are exported " |
4598 ] | 4571 ] |
4599 }, | 4572 }, |
4600 { | 4573 { |
4601 "cell_type": "code", | 4574 "cell_type": "code", |
4602 "execution_count": 5, | 4575 "execution_count": 19, |
4603 "metadata": {}, | 4576 "metadata": {}, |
4604 "outputs": [ | 4577 "outputs": [ |
4605 { | 4578 { |
4606 "name": "stdout", | 4579 "name": "stdout", |
4607 "output_type": "stream", | 4580 "output_type": "stream", |
4608 "text": [ | 4581 "text": [ |
4609 "iteration 0\n" | 4582 "iteration 0\n", |
4610 ] | 4583 "classifying...\n", |
4611 }, | 4584 "/import/c4dm-04/mariap/train_data_melodia_8_0.pickle\n", |
4612 { | 4585 "0.17294625462\n", |
4613 "ename": "IOError", | 4586 "detecting outliers...\n", |
4614 "evalue": "[Errno 2] No such file or directory: '/import/c4dm-04/mariap/nmf_data_melodia_8_0.pickle'", | 4587 "most outliers \n", |
4615 "output_type": "error", | 4588 " Country Outliers N_Country N_Outliers\n", |
4616 "traceback": [ | 4589 "136 Botswana 0.590909 88 52\n", |
4617 "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | 4590 "31 Ivory Coast 0.571429 14 8\n", |
4618 "\u001b[0;31mIOError\u001b[0m Traceback (most recent call last)", | 4591 "86 Gambia 0.541667 48 26\n", |
4619 "\u001b[0;32m<ipython-input-5-eb8ccb858c3f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mmapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mOUTPUT_FILES\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCLASS_INPUT_FILES\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mmapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mINPUT_FILES\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mOUTPUT_FILES\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mldadata_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mYaudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclassification\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_data_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mCLASS_INPUT_FILES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcatenate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mldadata_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;31m# classification and confusion\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | 4592 "42 Benin 0.538462 26 14\n", |
4620 "\u001b[0;32m/homes/mp305/code/pythoncode/plosone_underreview/scripts/classification.pyc\u001b[0m in \u001b[0;36mload_data_from_pickle\u001b[0;34m(filename)\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload_data_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m \u001b[0mX_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mYaudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 21\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcatenate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mYaudio\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | 4593 "102 Fiji 0.466667 15 7\n", |
4621 "\u001b[0;31mIOError\u001b[0m: [Errno 2] No such file or directory: '/import/c4dm-04/mariap/nmf_data_melodia_8_0.pickle'" | 4594 "20 Pakistan 0.461538 91 42\n", |
4595 "64 Uganda 0.437500 80 35\n", | |
4596 "14 Liberia 0.425000 40 17\n", | |
4597 "78 El Salvador 0.424242 33 14\n", | |
4598 "50 Western Sahara 0.421687 83 35\n", | |
4599 "least outliers \n", | |
4600 " Country Outliers N_Country N_Outliers\n", | |
4601 "1 Lithuania 0.000000 47 0\n", | |
4602 "30 Afghanistan 0.000000 24 0\n", | |
4603 "28 Tajikistan 0.000000 19 0\n", | |
4604 "27 South Korea 0.000000 11 0\n", | |
4605 "113 Iceland 0.000000 14 0\n", | |
4606 "119 Denmark 0.000000 16 0\n", | |
4607 "74 Czech Republic 0.000000 41 0\n", | |
4608 "15 Netherlands 0.014925 67 1\n", | |
4609 "121 Poland 0.040000 100 4\n", | |
4610 "134 Paraguay 0.043478 23 1\n", | |
4611 "writing file\n", | |
4612 "iteration 1\n", | |
4613 "classifying...\n", | |
4614 "/import/c4dm-04/mariap/train_data_melodia_8_1.pickle\n", | |
4615 "0.149811300704\n", | |
4616 "detecting outliers...\n", | |
4617 "most outliers \n", | |
4618 " Country Outliers N_Country N_Outliers\n", | |
4619 "60 Chad 0.545455 11 6\n", | |
4620 "62 Fiji 0.533333 15 8\n", | |
4621 "86 Gambia 0.520833 48 25\n", | |
4622 "21 Pakistan 0.500000 88 44\n", | |
4623 "43 Benin 0.500000 26 13\n", | |
4624 "32 Ivory Coast 0.500000 14 7\n", | |
4625 "136 Botswana 0.488095 84 41\n", | |
4626 "78 El Salvador 0.484848 33 16\n", | |
4627 "106 Nepal 0.436782 87 38\n", | |
4628 "135 French Guiana 0.428571 28 12\n", | |
4629 "least outliers \n", | |
4630 " Country Outliers N_Country N_Outliers\n", | |
4631 "1 Lithuania 0.000000 47 0\n", | |
4632 "113 Iceland 0.000000 14 0\n", | |
4633 "119 Denmark 0.000000 16 0\n", | |
4634 "74 Czech Republic 0.000000 41 0\n", | |
4635 "28 South Korea 0.000000 11 0\n", | |
4636 "16 Netherlands 0.029851 67 2\n", | |
4637 "31 Afghanistan 0.041667 24 1\n", | |
4638 "134 Paraguay 0.043478 23 1\n", | |
4639 "105 Sudan 0.045455 66 3\n", | |
4640 "120 Kazakhstan 0.045455 88 4\n", | |
4641 "writing file\n", | |
4642 "iteration 2\n", | |
4643 "classifying...\n", | |
4644 "/import/c4dm-04/mariap/train_data_melodia_8_2.pickle\n", | |
4645 "0.178052269426\n", | |
4646 "detecting outliers...\n", | |
4647 "most outliers \n", | |
4648 " Country Outliers N_Country N_Outliers\n", | |
4649 "136 Botswana 0.615385 78 48\n", | |
4650 "86 Gambia 0.520833 48 25\n", | |
4651 "72 Ivory Coast 0.500000 14 7\n", | |
4652 "62 Fiji 0.466667 15 7\n", | |
4653 "43 Benin 0.461538 26 12\n", | |
4654 "20 Pakistan 0.451613 93 42\n", | |
4655 "17 French Guiana 0.428571 28 12\n", | |
4656 "14 Liberia 0.425000 40 17\n", | |
4657 "78 El Salvador 0.424242 33 14\n", | |
4658 "51 Western Sahara 0.414634 82 34\n", | |
4659 "least outliers \n", | |
4660 " Country Outliers N_Country N_Outliers\n", | |
4661 "119 Denmark 0.000000 16 0\n", | |
4662 "113 Iceland 0.000000 14 0\n", | |
4663 "27 South Korea 0.000000 11 0\n", | |
4664 "1 Lithuania 0.000000 47 0\n", | |
4665 "31 Czech Republic 0.024390 41 1\n", | |
4666 "15 Netherlands 0.029851 67 2\n", | |
4667 "30 Afghanistan 0.041667 24 1\n", | |
4668 "105 Sudan 0.045455 66 3\n", | |
4669 "120 Kazakhstan 0.045455 88 4\n", | |
4670 "100 Antigua and Barbuda 0.047619 42 2\n", | |
4671 "writing file\n", | |
4672 "iteration 3\n", | |
4673 "classifying...\n", | |
4674 "/import/c4dm-04/mariap/train_data_melodia_8_3.pickle\n", | |
4675 "0.177243715126\n", | |
4676 "detecting outliers...\n", | |
4677 "most outliers \n", | |
4678 " Country Outliers N_Country N_Outliers\n", | |
4679 "136 Botswana 0.617284 81 50\n", | |
4680 "31 Ivory Coast 0.571429 14 8\n", | |
4681 "86 Gambia 0.541667 48 26\n", | |
4682 "43 Benin 0.538462 26 14\n", | |
4683 "62 Fiji 0.533333 15 8\n", | |
4684 "20 Pakistan 0.468750 96 45\n", | |
4685 "51 Western Sahara 0.439024 82 36\n", | |
4686 "14 Liberia 0.425000 40 17\n", | |
4687 "78 El Salvador 0.424242 33 14\n", | |
4688 "106 Nepal 0.416667 96 40\n", | |
4689 "least outliers \n", | |
4690 " Country Outliers N_Country N_Outliers\n", | |
4691 "113 Iceland 0.000000 14 0\n", | |
4692 "30 Afghanistan 0.000000 24 0\n", | |
4693 "119 Denmark 0.000000 16 0\n", | |
4694 "134 Paraguay 0.000000 23 0\n", | |
4695 "27 South Korea 0.000000 11 0\n", | |
4696 "1 Lithuania 0.000000 47 0\n", | |
4697 "100 Antigua and Barbuda 0.023810 42 1\n", | |
4698 "74 Czech Republic 0.024390 41 1\n", | |
4699 "15 Netherlands 0.029851 67 2\n", | |
4700 "105 Sudan 0.045455 66 3\n", | |
4701 "writing file\n", | |
4702 "iteration 4\n", | |
4703 "classifying...\n", | |
4704 "/import/c4dm-04/mariap/train_data_melodia_8_4.pickle\n", | |
4705 "0.186733308352\n", | |
4706 "detecting outliers...\n", | |
4707 "most outliers \n", | |
4708 " Country Outliers N_Country N_Outliers\n", | |
4709 "60 Chad 0.727273 11 8\n", | |
4710 "136 Botswana 0.630952 84 53\n", | |
4711 "72 Ivory Coast 0.571429 14 8\n", | |
4712 "62 Fiji 0.533333 15 8\n", | |
4713 "86 Gambia 0.520833 48 25\n", | |
4714 "43 Benin 0.500000 26 13\n", | |
4715 "20 Pakistan 0.468085 94 44\n", | |
4716 "135 French Guiana 0.464286 28 13\n", | |
4717 "64 Mozambique 0.441176 34 15\n", | |
4718 "51 Western Sahara 0.439024 82 36\n", | |
4719 "least outliers \n", | |
4720 " Country Outliers N_Country N_Outliers\n", | |
4721 "1 Lithuania 0.000000 47 0\n", | |
4722 "27 South Korea 0.000000 11 0\n", | |
4723 "113 Iceland 0.000000 14 0\n", | |
4724 "119 Denmark 0.000000 16 0\n", | |
4725 "15 Netherlands 0.014925 67 1\n", | |
4726 "31 Czech Republic 0.024390 41 1\n", | |
4727 "112 Israel 0.030000 100 3\n", | |
4728 "30 Afghanistan 0.041667 24 1\n", | |
4729 "134 Paraguay 0.043478 23 1\n", | |
4730 "105 Sudan 0.045455 66 3\n", | |
4731 "writing file\n", | |
4732 "iteration 5\n", | |
4733 "classifying...\n", | |
4734 "/import/c4dm-04/mariap/train_data_melodia_8_5.pickle\n", | |
4735 "0.163125082162\n", | |
4736 "detecting outliers...\n", | |
4737 "most outliers \n", | |
4738 " Country Outliers N_Country N_Outliers\n", | |
4739 "136 Botswana 0.607143 84 51\n", | |
4740 "72 Ivory Coast 0.571429 14 8\n", | |
4741 "21 Pakistan 0.553191 94 52\n", | |
4742 "95 Chad 0.545455 11 6\n", | |
4743 "63 Fiji 0.533333 15 8\n", | |
4744 "86 Gambia 0.520833 48 25\n", | |
4745 "44 Benin 0.500000 26 13\n", | |
4746 "78 El Salvador 0.454545 33 15\n", | |
4747 "117 Zimbabwe 0.428571 14 6\n", | |
4748 "66 Uganda 0.418605 86 36\n", | |
4749 "least outliers \n", | |
4750 " Country Outliers N_Country N_Outliers\n", | |
4751 "119 Denmark 0.000000 16 0\n", | |
4752 "1 Lithuania 0.000000 47 0\n", | |
4753 "28 South Korea 0.000000 11 0\n", | |
4754 "113 Iceland 0.000000 14 0\n", | |
4755 "32 Czech Republic 0.024390 41 1\n", | |
4756 "16 Netherlands 0.029851 67 2\n", | |
4757 "31 Afghanistan 0.041667 24 1\n", | |
4758 "134 Paraguay 0.043478 23 1\n", | |
4759 "120 Kazakhstan 0.045455 88 4\n", | |
4760 "105 Sudan 0.045455 66 3\n", | |
4761 "writing file\n", | |
4762 "iteration 6\n", | |
4763 "classifying...\n", | |
4764 "/import/c4dm-04/mariap/train_data_melodia_8_6.pickle\n", | |
4765 "0.179816192246\n", | |
4766 "detecting outliers...\n", | |
4767 "most outliers \n", | |
4768 " Country Outliers N_Country N_Outliers\n", | |
4769 "136 Botswana 0.574468 94 54\n", | |
4770 "32 Ivory Coast 0.571429 14 8\n", | |
4771 "86 Gambia 0.520833 48 25\n", | |
4772 "21 Pakistan 0.516854 89 46\n", | |
4773 "62 Fiji 0.466667 15 7\n", | |
4774 "43 Benin 0.461538 26 12\n", | |
4775 "95 Chad 0.454545 11 5\n", | |
4776 "78 El Salvador 0.454545 33 15\n", | |
4777 "51 Western Sahara 0.439024 82 36\n", | |
4778 "63 Senegal 0.405405 37 15\n", | |
4779 "least outliers \n", | |
4780 " Country Outliers N_Country N_Outliers\n", | |
4781 "1 Lithuania 0.000000 47 0\n", | |
4782 "119 Denmark 0.000000 16 0\n", | |
4783 "28 South Korea 0.000000 11 0\n", | |
4784 "113 Iceland 0.000000 14 0\n", | |
4785 "16 Netherlands 0.014925 67 1\n", | |
4786 "74 Czech Republic 0.024390 41 1\n", | |
4787 "13 Germany 0.040000 100 4\n", | |
4788 "31 Afghanistan 0.041667 24 1\n", | |
4789 "105 Sudan 0.045455 66 3\n", | |
4790 "120 Kazakhstan 0.045455 88 4\n", | |
4791 "writing file\n" | |
4622 ] | 4792 ] |
4623 } | 4793 } |
4624 ], | 4794 ], |
4625 "source": [ | 4795 "source": [ |
4796 "n_iters = 7\n", | |
4797 "OUTPUT_FILES = load_dataset.OUTPUT_FILES\n", | |
4798 "MAPPER_OUTPUT_FILES = mapper.OUTPUT_FILES\n", | |
4626 "for n in range(n_iters):\n", | 4799 "for n in range(n_iters):\n", |
4627 " print \"iteration %d\" % n\n", | 4800 " print \"iteration %d\" % n\n", |
4628 " CLASS_INPUT_FILES = [output_file.split('.pickle')[0]+'_'+str(n)+'.pickle' for \n", | 4801 " CLASS_INPUT_FILES = [output_file.split('.pickle')[0]+'_'+str(n)+'.pickle' for \n", |
4629 " output_file in mapper.OUTPUT_FILES]\n", | 4802 " output_file in MAPPER_OUTPUT_FILES]\n", |
4630 " mapper.INPUT_FILES = OUTPUT_FILES\n", | 4803 " mapper.INPUT_FILES = [output_file.split('.pickle')[0]+'_'+str(n)+'.pickle' for \n", |
4631 " ldadata_list, Y, Yaudio = classification.load_data_from_pickle(CLASS_INPUT_FILES[2])\n", | 4804 " output_file in OUTPUT_FILES]\n", |
4632 " X = np.concatenate(ldadata_list, axis=1)\n", | 4805 " X, Y, Yaudio = classification.load_data_from_pickle(CLASS_INPUT_FILES[0])\n", |
4806 " #X = np.concatenate(ldadata_list, axis=1)\n", | |
4633 " # classification and confusion\n", | 4807 " # classification and confusion\n", |
4634 " print \"classifying...\"\n", | 4808 " print \"classifying...\"\n", |
4635 " traininds, testinds = classification.get_train_test_indices(Yaudio)\n", | 4809 " traininds, testinds = classification.get_train_test_indices(Yaudio)\n", |
4636 " X_train, Y_train, X_test, Y_test = classification.get_train_test_sets(X, Y, traininds, testinds)\n", | 4810 " X_train, Y_train, X_test, Y_test = classification.get_train_test_sets(X, Y, traininds, testinds)\n", |
4637 " accuracy, _ = classification.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n", | 4811 " accuracy, _ = classification.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n", |
4644 " outliers.print_most_least_outliers_topN(df_global, N=10)\n", | 4818 " outliers.print_most_least_outliers_topN(df_global, N=10)\n", |
4645 " \n", | 4819 " \n", |
4646 " # write output\n", | 4820 " # write output\n", |
4647 " print \"writing file\"\n", | 4821 " print \"writing file\"\n", |
4648 " df_global.to_csv('../data/outliers_'+str(n)+'.csv', index=False)" | 4822 " df_global.to_csv('../data/outliers_'+str(n)+'.csv', index=False)" |
4649 ] | |
4650 }, | |
4651 { | |
4652 "cell_type": "code", | |
4653 "execution_count": 4, | |
4654 "metadata": { | |
4655 "collapsed": true | |
4656 }, | |
4657 "outputs": [], | |
4658 "source": [ | |
4659 "X = np.concatenate(ldadata_list, axis=1)" | |
4660 ] | |
4661 }, | |
4662 { | |
4663 "cell_type": "code", | |
4664 "execution_count": 5, | |
4665 "metadata": {}, | |
4666 "outputs": [ | |
4667 { | |
4668 "data": { | |
4669 "text/plain": [ | |
4670 "(8089, 381)" | |
4671 ] | |
4672 }, | |
4673 "execution_count": 5, | |
4674 "metadata": {}, | |
4675 "output_type": "execute_result" | |
4676 } | |
4677 ], | |
4678 "source": [ | |
4679 "X.shape" | |
4680 ] | |
4681 }, | |
4682 { | |
4683 "cell_type": "code", | |
4684 "execution_count": 10, | |
4685 "metadata": {}, | |
4686 "outputs": [ | |
4687 { | |
4688 "name": "stdout", | |
4689 "output_type": "stream", | |
4690 "text": [ | |
4691 "0.176354062249\n" | |
4692 ] | |
4693 }, | |
4694 { | |
4695 "name": "stderr", | |
4696 "output_type": "stream", | |
4697 "text": [ | |
4698 "/homes/mp305/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n", | |
4699 " 'precision', 'predicted', average, warn_for)\n" | |
4700 ] | |
4701 } | |
4702 ], | |
4703 "source": [ | |
4704 "#traininds, testinds = classification.get_train_test_indices()\n", | |
4705 "traininds = np.arange(5000)\n", | |
4706 "testinds = np.arange(len(X)-1600, len(X))\n", | |
4707 "X_train, Y_train, X_test, Y_test = classification.get_train_test_sets(X, Y, traininds, testinds)\n", | |
4708 "accuracy, _ = classification.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n", | |
4709 "print accuracy" | |
4710 ] | |
4711 }, | |
4712 { | |
4713 "cell_type": "code", | |
4714 "execution_count": 13, | |
4715 "metadata": {}, | |
4716 "outputs": [ | |
4717 { | |
4718 "name": "stdout", | |
4719 "output_type": "stream", | |
4720 "text": [ | |
4721 "detecting outliers...\n", | |
4722 "most outliers \n", | |
4723 " Country Outliers\n", | |
4724 "136 Botswana 0.590909\n", | |
4725 "71 Ivory Coast 0.571429\n", | |
4726 "86 Gambia 0.541667\n", | |
4727 "43 Benin 0.538462\n", | |
4728 "62 Fiji 0.466667\n", | |
4729 "20 Pakistan 0.461538\n", | |
4730 "65 Uganda 0.437500\n", | |
4731 "14 Liberia 0.425000\n", | |
4732 "78 El Salvador 0.424242\n", | |
4733 "51 Western Sahara 0.421687\n", | |
4734 "least outliers \n", | |
4735 " Country Outliers\n", | |
4736 "119 Denmark 0.000000\n", | |
4737 "30 Afghanistan 0.000000\n", | |
4738 "113 Iceland 0.000000\n", | |
4739 "28 Tajikistan 0.000000\n", | |
4740 "74 Czech Republic 0.000000\n", | |
4741 "27 South Korea 0.000000\n", | |
4742 "1 Lithuania 0.000000\n", | |
4743 "15 Netherlands 0.014925\n", | |
4744 "121 Poland 0.040000\n", | |
4745 "134 Paraguay 0.043478\n" | |
4746 ] | |
4747 } | |
4748 ], | |
4749 "source": [ | |
4750 "print \"detecting outliers...\"\n", | |
4751 "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n", | |
4752 "outliers.print_most_least_outliers_topN(df_global, N=10)" | |
4753 ] | 4823 ] |
4754 }, | 4824 }, |
4755 { | 4825 { |
4756 "cell_type": "markdown", | 4826 "cell_type": "markdown", |
4757 "metadata": {}, | 4827 "metadata": {}, |