{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import sys\n",
    "sys.path.append('../')\n",
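    "# helper modules from this repo's scripts/ package\n",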
    "import scripts.load_dataset as load_dataset\n",
    "import scripts.map_and_average as mapper\n",
    "import scripts.classification as classification\n",
    "import scripts.outliers as outliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
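    "# keep the default output paths; each iteration below derives its own copies\n",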
    "OUTPUT_FILES = load_dataset.OUTPUT_FILES\n",
    "n_iters = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
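    "# draw a sample of the dataset, then write train/test features once per run,\n",
    "# with a distinct '_<n>.pickle' suffix for each of the n_iters runs\n",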
    "df = load_dataset.sample_dataset(csv_file=load_dataset.METADATA_FILE)\n",
    "for n in range(n_iters):\n",
    "    print \"iteration %d\" % n\n",
    "    load_dataset.OUTPUT_FILES = [output_file.split('.pickle')[0]+'_'+str(n)+'.pickle' for \n",
    "                                 output_file in OUTPUT_FILES]\n",
    "    load_dataset.features_for_train_test_sets(df, write_output=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
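    "# per iteration: LDA mapping, classification accuracy, and outlier detection\n",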
    "for n in range(n_iters):\n",
    "    print \"iteration %d\" % n\n",
    "    \n",
    "    print \"mapping...\"\n",
    "    mapper.INPUT_FILES = [output_file.split('.pickle')[0]+'_'+str(n)+'.pickle' for \n",
    "                          output_file in OUTPUT_FILES]\n",
    "    _, _, ldadata_list, _, _, Y, Yaudio = mapper.lda_map_and_average_frames(min_variance=0.99)\n",
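    "    # stack the LDA-mapped feature arrays into a single matrix X\n",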
    "    X = np.concatenate(ldadata_list)\n",
    "    \n",
    "    # classification and confusion\n",
    "    print \"classifying...\"\n",
    "    traininds, testinds = classification.get_train_test_indices()\n",
    "    X_train, Y_train, X_test, Y_test = classification.get_train_test_sets(X, Y, traininds, testinds)\n",
    "    accuracy, _ = classification.confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False)\n",
    "    print accuracy\n",
    "    \n",
    "    # outliers\n",
    "    print \"detecting outliers...\"\n",
    "    ddf = outliers.load_metadata(Yaudio, metadata_file=load_dataset.METADATA_FILE)\n",
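    "    # threshold the distances (MD, presumably Mahalanobis) at the chi-squared 0.999 quantile\n",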
    "    df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)\n",
    "    outliers.print_most_least_outliers_topN(df_global, N=10)\n",
    "    \n",
    "    # write output\n",
    "    print \"writing file\"\n",
    "    df_global.to_csv('../data/outliers_'+str(n)+'.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}