comparison notebooks/sensitivity_experiment.ipynb @ 55:98cd5317e504 branch-tests

updated notebooks
author mpanteli <m.x.panteli@gmail.com>
date Tue, 19 Sep 2017 21:27:09 +0100
parents d3de9ac0d545
children 444041185ba9 a6606b255ad7
comparison
equal deleted inserted replaced
54:dbcd5b2a4efa 55:98cd5317e504
32 "import scripts.outliers as outliers" 32 "import scripts.outliers as outliers"
33 ] 33 ]
34 }, 34 },
35 { 35 {
36 "cell_type": "code", 36 "cell_type": "code",
37 "execution_count": 3, 37 "execution_count": 2,
38 "metadata": { 38 "metadata": {
39 "collapsed": true 39 "collapsed": true
40 }, 40 },
41 "outputs": [], 41 "outputs": [],
42 "source": [ 42 "source": [
4544 "## Map frames and write output for the lda transformed frames" 4544 "## Map frames and write output for the lda transformed frames"
4545 ] 4545 ]
4546 }, 4546 },
4547 { 4547 {
4548 "cell_type": "code", 4548 "cell_type": "code",
4549 "execution_count": 7, 4549 "execution_count": null,
4550 "metadata": {}, 4550 "metadata": {},
4551 "outputs": [ 4551 "outputs": [],
4552 {
4553 "name": "stdout",
4554 "output_type": "stream",
4555 "text": [
4556 "iteration 0\n",
4557 "mapping...\n",
4558 "/import/c4dm-04/mariap/train_data_melodia_8_0.pickle\n"
4559 ]
4560 },
4561 {
4562 "ename": "KeyboardInterrupt",
4563 "evalue": "",
4564 "output_type": "error",
4565 "traceback": [
4566 "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
4567 "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
4568 "\u001b[0;32m<ipython-input-7-f093c6f2c550>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m mapper.OUTPUT_FILES = [output_file.split('.pickle')[0]+'_'+str(n)+'.pickle' for \n\u001b[1;32m 8\u001b[0m output_file in MAPPER_OUTPUT_FILES]\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mldadata_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mYaudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlda_map_and_average_frames\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin_variance\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.99\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0mmapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mldadata_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mYaudio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
4569 "\u001b[0;32m/homes/mp305/code/pythoncode/plosone_underreview/scripts/map_and_average.pyc\u001b[0m in \u001b[0;36mlda_map_and_average_frames\u001b[0;34m(dataset, n_components, min_variance)\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mlda_map_and_average_frames\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_components\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmin_variance\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 150\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdataset\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 151\u001b[0;31m \u001b[0mtrainset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtestset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_train_val_test_sets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 152\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 153\u001b[0m \u001b[0mtrainset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtestset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdataset\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
4570 "\u001b[0;32m/homes/mp305/code/pythoncode/plosone_underreview/scripts/map_and_average.pyc\u001b[0m in \u001b[0;36mload_train_val_test_sets\u001b[0;34m()\u001b[0m\n\u001b[1;32m 69\u001b[0m '''\n\u001b[1;32m 70\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0mINPUT_FILES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 71\u001b[0;31m \u001b[0mtrainset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_data_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mINPUT_FILES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 72\u001b[0m \u001b[0mvalset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_data_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mINPUT_FILES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[0mtestset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_data_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mINPUT_FILES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
4571 "\u001b[0;32m/homes/mp305/code/pythoncode/plosone_underreview/scripts/map_and_average.pyc\u001b[0m in \u001b[0;36mload_data_from_pickle\u001b[0;34m(pickle_file)\u001b[0m\n\u001b[1;32m 57\u001b[0m '''\n\u001b[1;32m 58\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpickle_file\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 59\u001b[0;31m \u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maudiolabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 60\u001b[0m \u001b[0;31m# remove 'unknown' and 'unidentified' country\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maudiolabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mremove_inds\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maudiolabels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
4572 "\u001b[0;32m/homes/mp305/anaconda/lib/python2.7/pickle.pyc\u001b[0m in \u001b[0;36mload\u001b[0;34m(file)\u001b[0m\n\u001b[1;32m 1382\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1383\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1384\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mUnpickler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1385\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1386\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
4573 "\u001b[0;32m/homes/mp305/anaconda/lib/python2.7/pickle.pyc\u001b[0m in \u001b[0;36mload\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 862\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 864\u001b[0;31m \u001b[0mdispatch\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 865\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0m_Stop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstopinst\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 866\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mstopinst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
4574 "\u001b[0;32m/homes/mp305/anaconda/lib/python2.7/pickle.pyc\u001b[0m in \u001b[0;36mload_string\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 966\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 967\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 968\u001b[0;31m \u001b[0mrep\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 969\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mq\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m\"\\\"'\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# double or single quote\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 970\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrep\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
4575 "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
4576 ]
4577 }
4578 ],
4579 "source": [ 4552 "source": [
4580 "MAPPER_OUTPUT_FILES = mapper.OUTPUT_FILES\n", 4553 "MAPPER_OUTPUT_FILES = mapper.OUTPUT_FILES\n",
4581 "for n in range(n_iters):\n", 4554 "for n in range(n_iters):\n",
4582 " print \"iteration %d\" % n\n", 4555 " print \"iteration %d\" % n\n",
4583 " \n", 4556 " \n",
4597 "## Classification only - assuming mapper files are exported " 4570 "## Classification only - assuming mapper files are exported "
4598 ] 4571 ]
4599 }, 4572 },
4600 { 4573 {
4601 "cell_type": "code", 4574 "cell_type": "code",
4602 "execution_count": 5, 4575 "execution_count": 19,
4603 "metadata": {}, 4576 "metadata": {},
4604 "outputs": [ 4577 "outputs": [
4605 { 4578 {
4606 "name": "stdout", 4579 "name": "stdout",
4607 "output_type": "stream", 4580 "output_type": "stream",
4608 "text": [ 4581 "text": [
4609 "iteration 0\n" 4582 "iteration 0\n",
4610 ] 4583 "classifying...\n",
4611 }, 4584 "/import/c4dm-04/mariap/train_data_melodia_8_0.pickle\n",
4612 { 4585 "0.17294625462\n",
4613 "ename": "IOError", 4586 "detecting outliers...\n",
4614 "evalue": "[Errno 2] No such file or directory: '/import/c4dm-04/mariap/nmf_data_melodia_8_0.pickle'", 4587 "most outliers \n",
4615 "output_type": "error", 4588 " Country Outliers N_Country N_Outliers\n",
4616 "traceback": [ 4589 "136 Botswana 0.590909 88 52\n",
4617 "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 4590 "31 Ivory Coast 0.571429 14 8\n",
4618 "\u001b[0;31mIOError\u001b[0m Traceback (most recent call last)", 4591 "86 Gambia 0.541667 48 26\n",
4619 "\u001b[0;32m<ipython-input-5-eb8ccb858c3f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mmapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mOUTPUT_FILES\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCLASS_INPUT_FILES\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mmapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mINPUT_FILES\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mOUTPUT_FILES\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mldadata_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mYaudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclassification\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_data_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mCLASS_INPUT_FILES\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcatenate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mldadata_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;31m# classification and confusion\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 4592 "42 Benin 0.538462 26 14\n",
4620 "\u001b[0;32m/homes/mp305/code/pythoncode/plosone_underreview/scripts/classification.pyc\u001b[0m in \u001b[0;36mload_data_from_pickle\u001b[0;34m(filename)\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload_data_from_pickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m \u001b[0mX_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mYaudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 21\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcatenate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mYaudio\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 4593 "102 Fiji 0.466667 15 7\n",
4621 "\u001b[0;31mIOError\u001b[0m: [Errno 2] No such file or directory: '/import/c4dm-04/mariap/nmf_data_melodia_8_0.pickle'" 4594 "20 Pakistan 0.461538 91 42\n",
4595 "64 Uganda 0.437500 80 35\n",
4596 "14 Liberia 0.425000 40 17\n",
4597 "78 El Salvador 0.424242 33 14\n",
4598 "50 Western Sahara 0.421687 83 35\n",
4599 "least outliers \n",
4600 " Country Outliers N_Country N_Outliers\n",
4601 "1 Lithuania 0.000000 47 0\n",
4602 "30 Afghanistan 0.000000 24 0\n",
4603 "28 Tajikistan 0.000000 19 0\n",
4604 "27 South Korea 0.000000 11 0\n",
4605 "113 Iceland 0.000000 14 0\n",
4606 "119 Denmark 0.000000 16 0\n",
4607 "74 Czech Republic 0.000000 41 0\n",
4608 "15 Netherlands 0.014925 67 1\n",
4609 "121 Poland 0.040000 100 4\n",
4610 "134 Paraguay 0.043478 23 1\n",
4611 "writing file\n",
4612 "iteration 1\n",
4613 "classifying...\n",
4614 "/import/c4dm-04/mariap/train_data_melodia_8_1.pickle\n",
4615 "0.149811300704\n",
4616 "detecting outliers...\n",
4617 "most outliers \n",
4618 " Country Outliers N_Country N_Outliers\n",
4619 "60 Chad 0.545455 11 6\n",
4620 "62 Fiji 0.533333 15 8\n",
4621 "86 Gambia 0.520833 48 25\n",
4622 "21 Pakistan 0.500000 88 44\n",
4623 "43 Benin 0.500000 26 13\n",
4624 "32 Ivory Coast 0.500000 14 7\n",
4625 "136 Botswana 0.488095 84 41\n",
4626 "78 El Salvador 0.484848 33 16\n",
4627 "106 Nepal 0.436782 87 38\n",
4628 "135 French Guiana 0.428571 28 12\n",
4629 "least outliers \n",
4630 " Country Outliers N_Country N_Outliers\n",
4631 "1 Lithuania 0.000000 47 0\n",
4632 "113 Iceland 0.000000 14 0\n",
4633 "119 Denmark 0.000000 16 0\n",
4634 "74 Czech Republic 0.000000 41 0\n",
4635 "28 South Korea 0.000000 11 0\n",
4636 "16 Netherlands 0.029851 67 2\n",
4637 "31 Afghanistan 0.041667 24 1\n",
4638 "134 Paraguay 0.043478 23 1\n",
4639 "105 Sudan 0.045455 66 3\n",
4640 "120 Kazakhstan 0.045455 88 4\n",
4641 "writing file\n",
4642 "iteration 2\n",
4643 "classifying...\n",
4644 "/import/c4dm-04/mariap/train_data_melodia_8_2.pickle\n",
4645 "0.178052269426\n",
4646 "detecting outliers...\n",
4647 "most outliers \n",
4648 " Country Outliers N_Country N_Outliers\n",
4649 "136 Botswana 0.615385 78 48\n",
4650 "86 Gambia 0.520833 48 25\n",
4651 "72 Ivory Coast 0.500000 14 7\n",
4652 "62 Fiji 0.466667 15 7\n",
4653 "43 Benin 0.461538 26 12\n",
4654 "20 Pakistan 0.451613 93 42\n",
4655 "17 French Guiana 0.428571 28 12\n",
4656 "14 Liberia 0.425000 40 17\n",
4657 "78 El Salvador 0.424242 33 14\n",
4658 "51 Western Sahara 0.414634 82 34\n",
4659 "least outliers \n",
4660 " Country Outliers N_Country N_Outliers\n",
4661 "119 Denmark 0.000000 16 0\n",
4662 "113 Iceland 0.000000 14 0\n",
4663 "27 South Korea 0.000000 11 0\n",
4664 "1 Lithuania 0.000000 47 0\n",
4665 "31 Czech Republic 0.024390 41 1\n",
4666 "15 Netherlands 0.029851 67 2\n",
4667 "30 Afghanistan 0.041667 24 1\n",
4668 "105 Sudan 0.045455 66 3\n",
4669 "120 Kazakhstan 0.045455 88 4\n",
4670 "100 Antigua and Barbuda 0.047619 42 2\n",
4671 "writing file\n",
4672 "iteration 3\n",
4673 "classifying...\n",
4674 "/import/c4dm-04/mariap/train_data_melodia_8_3.pickle\n",
4675 "0.177243715126\n",
4676 "detecting outliers...\n",
4677 "most outliers \n",
4678 " Country Outliers N_Country N_Outliers\n",
4679 "136 Botswana 0.617284 81 50\n",
4680 "31 Ivory Coast 0.571429 14 8\n",
4681 "86 Gambia 0.541667 48 26\n",
4682 "43 Benin 0.538462 26 14\n",
4683 "62 Fiji 0.533333 15 8\n",
4684 "20 Pakistan 0.468750 96 45\n",
4685 "51 Western Sahara 0.439024 82 36\n",
4686 "14 Liberia 0.425000 40 17\n",
4687 "78 El Salvador 0.424242 33 14\n",
4688 "106 Nepal 0.416667 96 40\n",
4689 "least outliers \n",
4690 " Country Outliers N_Country N_Outliers\n",
4691 "113 Iceland 0.000000 14 0\n",
4692 "30 Afghanistan 0.000000 24 0\n",
4693 "119 Denmark 0.000000 16 0\n",
4694 "134 Paraguay 0.000000 23 0\n",
4695 "27 South Korea 0.000000 11 0\n",
4696 "1 Lithuania 0.000000 47 0\n",
4697 "100 Antigua and Barbuda 0.023810 42 1\n",
4698 "74 Czech Republic 0.024390 41 1\n",
4699 "15 Netherlands 0.029851 67 2\n",
4700 "105 Sudan 0.045455 66 3\n",
4701 "writing file\n",
4702 "iteration 4\n",
4703 "classifying...\n",
4704 "/import/c4dm-04/mariap/train_data_melodia_8_4.pickle\n",
4705 "0.186733308352\n",
4706 "detecting outliers...\n",
4707 "most outliers \n",
4708 " Country Outliers N_Country N_Outliers\n",
4709 "60 Chad 0.727273 11 8\n",
4710 "136 Botswana 0.630952 84 53\n",
4711 "72 Ivory Coast 0.571429 14 8\n",
4712 "62 Fiji 0.533333 15 8\n",
4713 "86 Gambia 0.520833 48 25\n",
4714 "43 Benin 0.500000 26 13\n",
4715 "20 Pakistan 0.468085 94 44\n",
4716 "135 French Guiana 0.464286 28 13\n",
4717 "64 Mozambique 0.441176 34 15\n",
4718 "51 Western Sahara 0.439024 82 36\n",
4719 "least outliers \n",
4720 " Country Outliers N_Country N_Outliers\n",
4721 "1 Lithuania 0.000000 47 0\n",
4722 "27 South Korea 0.000000 11 0\n",
4723 "113 Iceland 0.000000 14 0\n",
4724 "119 Denmark 0.000000 16 0\n",
4725 "15 Netherlands 0.014925 67 1\n",
4726 "31 Czech Republic 0.024390 41 1\n",
4727 "112 Israel 0.030000 100 3\n",
4728 "30 Afghanistan 0.041667 24 1\n",
4729 "134 Paraguay 0.043478 23 1\n",
4730 "105 Sudan 0.045455 66 3\n",
4731 "writing file\n",
4732 "iteration 5\n",
4733 "classifying...\n",
4734 "/import/c4dm-04/mariap/train_data_melodia_8_5.pickle\n",
4735 "0.163125082162\n",
4736 "detecting outliers...\n",
4737 "most outliers \n",
4738 " Country Outliers N_Country N_Outliers\n",
4739 "136 Botswana 0.607143 84 51\n",
4740 "72 Ivory Coast 0.571429 14 8\n",
4741 "21 Pakistan 0.553191 94 52\n",
4742 "95 Chad 0.545455 11 6\n",
4743 "63 Fiji 0.533333 15 8\n",
4744 "86 Gambia 0.520833 48 25\n",
4745 "44 Benin 0.500000 26 13\n",
4746 "78 El Salvador 0.454545 33 15\n",
4747 "117 Zimbabwe 0.428571 14 6\n",
4748 "66 Uganda 0.418605 86 36\n",
4749 "least outliers \n",
4750 " Country Outliers N_Country N_Outliers\n",
4751 "119 Denmark 0.000000 16 0\n",
4752 "1 Lithuania 0.000000 47 0\n",
4753 "28 South Korea 0.000000 11 0\n",
4754 "113 Iceland 0.000000 14 0\n",
4755 "32 Czech Republic 0.024390 41 1\n",
4756 "16 Netherlands 0.029851 67 2\n",
4757 "31 Afghanistan 0.041667 24 1\n",
4758 "134 Paraguay 0.043478 23 1\n",
4759 "120 Kazakhstan 0.045455 88 4\n",
4760 "105 Sudan 0.045455 66 3\n",
4761 "writing file\n",
4762 "iteration 6\n",
4763 "classifying...\n",
4764 "/import/c4dm-04/mariap/train_data_melodia_8_6.pickle\n",
4765 "0.179816192246\n",
4766 "detecting outliers...\n",
4767 "most outliers \n",
4768 " Country Outliers N_Country N_Outliers\n",
4769 "136 Botswana 0.574468 94 54\n",
4770 "32 Ivory Coast 0.571429 14 8\n",
4771 "86 Gambia 0.520833 48 25\n",
4772 "21 Pakistan 0.516854 89 46\n",
4773 "62 Fiji 0.466667 15 7\n",
4774 "43 Benin 0.461538 26 12\n",
4775 "95 Chad 0.454545 11 5\n",
4776 "78 El Salvador 0.454545 33 15\n",
4777 "51 Western Sahara 0.439024 82 36\n",
4778 "63 Senegal 0.405405 37 15\n",
4779 "least outliers \n",
4780 " Country Outliers N_Country N_Outliers\n",
4781 "1 Lithuania 0.000000 47 0\n",
4782 "119 Denmark 0.000000 16 0\n",
4783 "28 South Korea 0.000000 11 0\n",
4784 "113 Iceland 0.000000 14 0\n",
4785 "16 Netherlands 0.014925 67 1\n",
4786 "74 Czech Republic 0.024390 41 1\n",
4787 "13 Germany 0.040000 100 4\n",
4788 "31 Afghanistan 0.041667 24 1\n",
4789 "105 Sudan 0.045455 66 3\n",
4790 "120 Kazakhstan 0.045455 88 4\n",
4791 "writing file\n"
4622 ] 4792 ]
4623 } 4793 }
4624 ], 4794 ],
4625 "source": [ 4795 "source": [
4796 "n_iters = 7\n",
4797 "OUTPUT_FILES = load_dataset.OUTPUT_FILES\n",
4798 "MAPPER_OUTPUT_FILES = mapper.OUTPUT_FILES\n",
4626 "for n in range(n_iters):\n", 4799 "for n in range(n_iters):\n",
4627 " print \"iteration %d\" % n\n", 4800 " print \"iteration %d\" % n\n",
4628 " CLASS_INPUT_FILES = [output_file.split('.pickle')[0]+'_'+str(n)+'.pickle' for \n", 4801 " CLASS_INPUT_FILES = [output_file.split('.pickle')[0]+'_'+str(n)+'.pickle' for \n",
4629 " output_file in mapper.OUTPUT_FILES]\n", 4802 " output_file in MAPPER_OUTPUT_FILES]\n",
4630 " mapper.INPUT_FILES = OUTPUT_FILES\n", 4803 " mapper.INPUT_FILES = [output_file.split('.pickle')[0]+'_'+str(n)+'.pickle' for \n",
4631 " ldadata_list, Y, Yaudio = classification.load_data_from_pickle(CLASS_INPUT_FILES[2])\n", 4804 " output_file in OUTPUT_FILES]\n",
4632 " X = np.concatenate(ldadata_list, axis=1)\n", 4805 " X, Y, Yaudio = classification.load_data_from_pickle(CLASS_INPUT_FILES[0])\n",
4806 " #X = np.concatenate(ldadata_list, axis=1)\n",
4633 " # classification and confusion\n", 4807 " # classification and confusion\n",
4634 " print \"classifying...\"\n", 4808 " print \"classifying...\"\n",
4635 " traininds, testinds = classification.get_train_test_indices(Yaudio)\n", 4809 " traininds, testinds = classification.get_train_test_indices(Yaudio)\n",
4636 " X_train, Y_train, X_test, Y_test = classification.get_train_test_sets(X, Y, traininds, testinds)\n", 4810 " X_train, Y_train, X_test, Y_test = classification.get_train_test_sets(X, Y, traininds, testinds)\n",
4637 " accuracy, _ = classification.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n", 4811 " accuracy, _ = classification.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n",
4644 " outliers.print_most_least_outliers_topN(df_global, N=10)\n", 4818 " outliers.print_most_least_outliers_topN(df_global, N=10)\n",
4645 " \n", 4819 " \n",
4646 " # write output\n", 4820 " # write output\n",
4647 " print \"writing file\"\n", 4821 " print \"writing file\"\n",
4648 " df_global.to_csv('../data/outliers_'+str(n)+'.csv', index=False)" 4822 " df_global.to_csv('../data/outliers_'+str(n)+'.csv', index=False)"
4649 ]
4650 },
4651 {
4652 "cell_type": "code",
4653 "execution_count": 4,
4654 "metadata": {
4655 "collapsed": true
4656 },
4657 "outputs": [],
4658 "source": [
4659 "X = np.concatenate(ldadata_list, axis=1)"
4660 ]
4661 },
4662 {
4663 "cell_type": "code",
4664 "execution_count": 5,
4665 "metadata": {},
4666 "outputs": [
4667 {
4668 "data": {
4669 "text/plain": [
4670 "(8089, 381)"
4671 ]
4672 },
4673 "execution_count": 5,
4674 "metadata": {},
4675 "output_type": "execute_result"
4676 }
4677 ],
4678 "source": [
4679 "X.shape"
4680 ]
4681 },
4682 {
4683 "cell_type": "code",
4684 "execution_count": 10,
4685 "metadata": {},
4686 "outputs": [
4687 {
4688 "name": "stdout",
4689 "output_type": "stream",
4690 "text": [
4691 "0.176354062249\n"
4692 ]
4693 },
4694 {
4695 "name": "stderr",
4696 "output_type": "stream",
4697 "text": [
4698 "/homes/mp305/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n",
4699 " 'precision', 'predicted', average, warn_for)\n"
4700 ]
4701 }
4702 ],
4703 "source": [
4704 "#traininds, testinds = classification.get_train_test_indices()\n",
4705 "traininds = np.arange(5000)\n",
4706 "testinds = np.arange(len(X)-1600, len(X))\n",
4707 "X_train, Y_train, X_test, Y_test = classification.get_train_test_sets(X, Y, traininds, testinds)\n",
4708 "accuracy, _ = classification.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n",
4709 "print accuracy"
4710 ]
4711 },
4712 {
4713 "cell_type": "code",
4714 "execution_count": 13,
4715 "metadata": {},
4716 "outputs": [
4717 {
4718 "name": "stdout",
4719 "output_type": "stream",
4720 "text": [
4721 "detecting outliers...\n",
4722 "most outliers \n",
4723 " Country Outliers\n",
4724 "136 Botswana 0.590909\n",
4725 "71 Ivory Coast 0.571429\n",
4726 "86 Gambia 0.541667\n",
4727 "43 Benin 0.538462\n",
4728 "62 Fiji 0.466667\n",
4729 "20 Pakistan 0.461538\n",
4730 "65 Uganda 0.437500\n",
4731 "14 Liberia 0.425000\n",
4732 "78 El Salvador 0.424242\n",
4733 "51 Western Sahara 0.421687\n",
4734 "least outliers \n",
4735 " Country Outliers\n",
4736 "119 Denmark 0.000000\n",
4737 "30 Afghanistan 0.000000\n",
4738 "113 Iceland 0.000000\n",
4739 "28 Tajikistan 0.000000\n",
4740 "74 Czech Republic 0.000000\n",
4741 "27 South Korea 0.000000\n",
4742 "1 Lithuania 0.000000\n",
4743 "15 Netherlands 0.014925\n",
4744 "121 Poland 0.040000\n",
4745 "134 Paraguay 0.043478\n"
4746 ]
4747 }
4748 ],
4749 "source": [
4750 "print \"detecting outliers...\"\n",
4751 "df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n",
4752 "outliers.print_most_least_outliers_topN(df_global, N=10)"
4753 ] 4823 ]
4754 }, 4824 },
4755 { 4825 {
4756 "cell_type": "markdown", 4826 "cell_type": "markdown",
4757 "metadata": {}, 4827 "metadata": {},