changeset 32:4bdcab1e821c

tidy up directory
author DaveM
date Wed, 15 Mar 2017 11:33:55 +0000
parents 55813e99c6cf
children 74d123779d3b
files _code/Gensim LDA tutorial.ipynb _code/Hierarchical Clustering.ipynb code/Gensim LDA tutorial.ipynb code/Hierarchical Clustering.ipynb code/RUNME.sh code/aglomCluster.m code/depthCheck.m code/rfFeatureSelection.m code/runme.m code/testparseData.m code/traceLinkageToBinary.m code/traverseDownOneStep.m code/treeLinkFeatures.m phase2/aglomCluster.m phase2/dataWithFeatures12345.txt phase2/depthCheck.m phase2/rfFeatureSelection.m phase2/runme.m phase2/testparseData.m phase2/traceLinkageToBinary.m phase2/traverseDownOneStep.m phase2/treeLinkFeatures.m
diffstat 22 files changed, 650 insertions(+), 693 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/_code/Gensim LDA tutorial.ipynb	Wed Mar 15 11:33:55 2017 +0000
@@ -0,0 +1,111 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1.10.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "import six\n",
+    "print six.__version__\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import gensim\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import logging\n",
+    "import gensim\n",
+    "import bz2\n",
+    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "ename": "IOError",
+     "evalue": "[Errno 2] No such file or directory: 'wiki_en_wordids.txt'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mIOError\u001b[0m                                   Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-4-54645b7b2c38>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# load id->word mapping (the dictionary), one of the results of step 2 above\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mid2word\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpora\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDictionary\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_from_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'wiki_en_wordids.txt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0;31m# load corpus iterator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mmm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpora\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMmCorpus\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'wiki_en_tfidf.mm'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;31m# mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/Library/Python/2.7/site-packages/gensim/corpora/dictionary.pyc\u001b[0m in \u001b[0;36mload_from_text\u001b[0;34m(fname)\u001b[0m\n\u001b[1;32m    342\u001b[0m         \"\"\"\n\u001b[1;32m    343\u001b[0m         \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDictionary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 344\u001b[0;31m         \u001b[0;32mwith\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msmart_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    345\u001b[0m             \u001b[0;32mfor\u001b[0m \u001b[0mlineno\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    346\u001b[0m                 \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_unicode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/Library/Python/2.7/site-packages/smart_open/smart_open_lib.pyc\u001b[0m in \u001b[0;36msmart_open\u001b[0;34m(uri, mode, **kw)\u001b[0m\n\u001b[1;32m    125\u001b[0m             \u001b[0;31m# local files -- both read & write supported\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    126\u001b[0m             \u001b[0;31m# compression, if any, is determined by the filename extension (.gz, .bz2)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 127\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mfile_smart_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparsed_uri\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muri_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    128\u001b[0m         \u001b[0;32melif\u001b[0m \u001b[0mparsed_uri\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscheme\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\"s3\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"s3n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    129\u001b[0m             \u001b[0;31m# Get an S3 host. It is required for sigv4 operations.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/Library/Python/2.7/site-packages/smart_open/smart_open_lib.pyc\u001b[0m in \u001b[0;36mfile_smart_open\u001b[0;34m(fname, mode)\u001b[0m\n\u001b[1;32m    556\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mmake_closing\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGzipFile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    557\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 558\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    559\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    560\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mIOError\u001b[0m: [Errno 2] No such file or directory: 'wiki_en_wordids.txt'"
+     ]
+    }
+   ],
+   "source": [
+    "# load id->word mapping (the dictionary), one of the results of step 2 above\n",
+    "id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')\n",
+    "# load corpus iterator\n",
+    "mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')\n",
+    "# mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output\n",
+    "\n",
+    "print(mm)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/_code/Hierarchical Clustering.ipynb	Wed Mar 15 11:33:55 2017 +0000
@@ -0,0 +1,207 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from matplotlib import pyplot as plt\n",
+    "from scipy.cluster.hierarchy import dendrogram, linkage, cophenet\n",
+    "from scipy.spatial.distance import pdist\n",
+    "import sklearn \n",
+    "import numpy as np\n",
+    "import csv\n",
+    "\n",
+    "dataFolder = '../data/'\n",
+    "keyFile = 'AdobeNormalised'\n",
+    "datapath = dataFolder + keyFile"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "X = np.genfromtxt(datapath+'.csv', delimiter = ',', skip_header = 1)\n",
+    "filenames = np.loadtxt(datapath+'_filenames.csv', dtype = str)\n",
+    "labels = np.loadtxt(datapath+'_labels.csv', dtype = str)\n",
+    "features = np.loadtxt(datapath+'_features.csv', dtype = str)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "agglo = cluster.FeatureAgglomeration()\n",
+    "agglo.fit(X)\n",
+    "X_reduced = agglo.transform(X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "Z = linkage(X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[  8.51810000e-01   4.00000000e-06   2.46000000e-04 ...,   2.10260000e-02\n",
+      "    1.98220000e-02   1.04000000e-04]\n",
+      " [  9.52275000e-01   7.00000000e-06   1.82600000e-03 ...,   1.79490000e-02\n",
+      "    1.09020000e-02   7.20000000e-05]\n",
+      " [  1.92200000e-03   1.00000000e-06   1.39000000e-04 ...,   2.35900000e-02\n",
+      "    6.93800000e-03   2.61000000e-04]\n",
+      " ..., \n",
+      " [  9.96346000e-01   3.37000000e-04   1.23600000e-03 ...,   5.24103000e-01\n",
+      "    3.36967000e-01   5.39000000e-04]\n",
+      " [  9.99990000e-01   1.00000000e-06   0.00000000e+00 ...,   0.00000000e+00\n",
+      "    0.00000000e+00   0.00000000e+00]\n",
+      " [  9.96624000e-01   6.97000000e-04   2.59300000e-03 ...,   5.24615000e-01\n",
+      "    3.34985000e-01   5.45000000e-04]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print X"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(8977, 1536)\n"
+     ]
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'nu_0': 0, 'kappa_0': 0, 'lambda_0': 0, 'mu_0': 0}\n"
+     ]
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import pyBHC as bhc\n",
+    "from pyBHC import dists\n",
+    "\n",
+    "mu_init = []\n",
+    "sigma_init = []\n",
+    "S_init = []\n",
+    "cd = dists.NormalFixedCovar(mu_0=mu_init,sigma_0=sigma_init, S=S_init)\n",
+    "\n",
+    "# temp = cd.log_marginal_likelihood(X)\n",
+    "d = bhc.rbhc(X, cd)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
--- a/code/Gensim LDA tutorial.ipynb	Wed Mar 15 11:26:24 2017 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,111 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1.10.0\n"
-     ]
-    }
-   ],
-   "source": [
-    "import six\n",
-    "print six.__version__\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "import gensim\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "import logging\n",
-    "import gensim\n",
-    "import bz2\n",
-    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "ename": "IOError",
-     "evalue": "[Errno 2] No such file or directory: 'wiki_en_wordids.txt'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mIOError\u001b[0m                                   Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-4-54645b7b2c38>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# load id->word mapping (the dictionary), one of the results of step 2 above\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mid2word\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpora\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDictionary\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_from_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'wiki_en_wordids.txt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0;31m# load corpus iterator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mmm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpora\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMmCorpus\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'wiki_en_tfidf.mm'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;31m# mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/Library/Python/2.7/site-packages/gensim/corpora/dictionary.pyc\u001b[0m in \u001b[0;36mload_from_text\u001b[0;34m(fname)\u001b[0m\n\u001b[1;32m    342\u001b[0m         \"\"\"\n\u001b[1;32m    343\u001b[0m         \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDictionary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 344\u001b[0;31m         \u001b[0;32mwith\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msmart_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    345\u001b[0m             \u001b[0;32mfor\u001b[0m \u001b[0mlineno\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    346\u001b[0m                 \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_unicode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/Library/Python/2.7/site-packages/smart_open/smart_open_lib.pyc\u001b[0m in \u001b[0;36msmart_open\u001b[0;34m(uri, mode, **kw)\u001b[0m\n\u001b[1;32m    125\u001b[0m             \u001b[0;31m# local files -- both read & write supported\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    126\u001b[0m             \u001b[0;31m# compression, if any, is determined by the filename extension (.gz, .bz2)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 127\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mfile_smart_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparsed_uri\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muri_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    128\u001b[0m         \u001b[0;32melif\u001b[0m \u001b[0mparsed_uri\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscheme\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\"s3\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"s3n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    129\u001b[0m             \u001b[0;31m# Get an S3 host. It is required for sigv4 operations.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/Library/Python/2.7/site-packages/smart_open/smart_open_lib.pyc\u001b[0m in \u001b[0;36mfile_smart_open\u001b[0;34m(fname, mode)\u001b[0m\n\u001b[1;32m    556\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mmake_closing\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGzipFile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    557\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 558\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    559\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    560\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mIOError\u001b[0m: [Errno 2] No such file or directory: 'wiki_en_wordids.txt'"
-     ]
-    }
-   ],
-   "source": [
-    "# load id->word mapping (the dictionary), one of the results of step 2 above\n",
-    "id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')\n",
-    "# load corpus iterator\n",
-    "mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')\n",
-    "# mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output\n",
-    "\n",
-    "print(mm)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 2",
-   "language": "python",
-   "name": "python2"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 2
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.10"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
--- a/code/Hierarchical Clustering.ipynb	Wed Mar 15 11:26:24 2017 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,207 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "from matplotlib import pyplot as plt\n",
-    "from scipy.cluster.hierarchy import dendrogram, linkage, cophenet\n",
-    "from scipy.spatial.distance import pdist\n",
-    "import sklearn \n",
-    "import numpy as np\n",
-    "import csv\n",
-    "\n",
-    "dataFolder = '../data/'\n",
-    "keyFile = 'AdobeNormalised'\n",
-    "datapath = dataFolder + keyFile"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "X = np.genfromtxt(datapath+'.csv', delimiter = ',', skip_header = 1)\n",
-    "filenames = np.loadtxt(datapath+'_filenames.csv', dtype = str)\n",
-    "labels = np.loadtxt(datapath+'_labels.csv', dtype = str)\n",
-    "features = np.loadtxt(datapath+'_features.csv', dtype = str)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "agglo = cluster.FeatureAgglomeration()\n",
-    "agglo.fit(X)\n",
-    "X_reduced = agglo.transform(X)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "Z = linkage(X)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[[  8.51810000e-01   4.00000000e-06   2.46000000e-04 ...,   2.10260000e-02\n",
-      "    1.98220000e-02   1.04000000e-04]\n",
-      " [  9.52275000e-01   7.00000000e-06   1.82600000e-03 ...,   1.79490000e-02\n",
-      "    1.09020000e-02   7.20000000e-05]\n",
-      " [  1.92200000e-03   1.00000000e-06   1.39000000e-04 ...,   2.35900000e-02\n",
-      "    6.93800000e-03   2.61000000e-04]\n",
-      " ..., \n",
-      " [  9.96346000e-01   3.37000000e-04   1.23600000e-03 ...,   5.24103000e-01\n",
-      "    3.36967000e-01   5.39000000e-04]\n",
-      " [  9.99990000e-01   1.00000000e-06   0.00000000e+00 ...,   0.00000000e+00\n",
-      "    0.00000000e+00   0.00000000e+00]\n",
-      " [  9.96624000e-01   6.97000000e-04   2.59300000e-03 ...,   5.24615000e-01\n",
-      "    3.34985000e-01   5.45000000e-04]]\n"
-     ]
-    }
-   ],
-   "source": [
-    "print X"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(8977, 1536)\n"
-     ]
-    }
-   ],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 42,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'nu_0': 0, 'kappa_0': 0, 'lambda_0': 0, 'mu_0': 0}\n"
-     ]
-    }
-   ],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "import pyBHC as bhc\n",
-    "from pyBHC import dists\n",
-    "\n",
-    "mu_init = []\n",
-    "sigma_init = []\n",
-    "S_init = []\n",
-    "cd = dists.NormalFixedCovar(mu_0=mu_init,sigma_0=sigma_init, S=S_init)\n",
-    "\n",
-    "# temp = cd.log_marginal_likelihood(X)\n",
-    "d = bhc.rbhc(X, cd)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 2",
-   "language": "python",
-   "name": "python2"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 2
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.10"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code/RUNME.sh	Wed Mar 15 11:33:55 2017 +0000
@@ -0,0 +1,1 @@
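+# launches runme.m headless: 'screen -d -m' starts a detached session, '-L' logs output to screenlog.0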
+screen -d -L -m matlab -nodisplay -nojvm -r runme
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code/aglomCluster.m	Wed Mar 15 11:33:55 2017 +0000
@@ -0,0 +1,62 @@
+function linkList = aglomCluster(data, clusterMethod, distanceMetric, numClusters)
+%% aglomCluster(data, clusterMethod, distanceMetric, numClusters)
+% This function performs agglomerative clustering on a given data set,
+% allowing the interpretation of hierarchical data, and plotting a
+% dendrogram.
+%
+% data is a matrix in which each row is an observation and each column is
+% a feature.
+% clusterMethod:
+%     * 'average'     Unweighted average distance (UPGMA)
+%     * 'centroid'	Centroid distance (UPGMC), appropriate for Euclidean
+%     distances only
+%     * 'complete'	Furthest distance
+%     * 'median'      Weighted center of mass distance (WPGMC), appropriate
+%     for Euclidean distances only
+%     * 'single'      Shortest distance
+%     * 'ward'        Inner squared distance (minimum variance algorithm),
+%     appropriate for Euclidean distances only (default)
+%     * 'weighted'	Weighted average distance (WPGMA)
+% distanceMetric
+%     * 'euclidean' Euclidean distance (default).
+%     * 'seuclidean' Standardized Euclidean distance. Each coordinate
+%     difference between rows in X is scaled by dividing by the
+%     corresponding element of the standard deviation S=nanstd(X). To
+%     specify another value for S, use D=pdist(X,'seuclidean',S).
+%     * 'cityblock' City block metric.
+%     * 'minkowski' Minkowski distance. The default exponent is 2. To
+%     specify a different exponent, use D = pdist(X,'minkowski',P), where P
+%     is a scalar positive value of the exponent.
+%     * 'chebychev' Chebychev distance (maximum coordinate difference).
+%     * 'mahalanobis'	Mahalanobis distance, using the sample covariance
+%     of X as computed by nancov. To compute the distance with a different
+%     covariance, use D = pdist(X,'mahalanobis',C), where the matrix C is
+%     symmetric and positive definite.
+%     * 'cosine' One minus the cosine of the included angle between points
+%     (treated as vectors).
+%     * 'correlation' One minus the sample correlation between points
+%     (treated as sequences of values).
+%     * 'spearman' One minus the sample Spearman's rank correlation between
+%     observations (treated as sequences of values).
+%     * 'hamming' Hamming distance, which is the percentage of coordinates
+%     that differ.
+%     * 'jaccard' One minus the Jaccard coefficient, which is the
+%     percentage of nonzero coordinates that differ.
+% numClusters is the number of final clusters shown in the dendrogram;
+% if 0 (default), it is inferred from the data
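+%
+% Example (illustrative; X is any numeric observations-by-features matrix):
+%   X = rand(100,4);
+%   Z = aglomCluster(X, 'ward', 'euclidean');
+%   dendrogram(Z);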
+
+if(nargin<2)
+    clusterMethod = 'ward';
+end
+if(nargin<3)
+    distanceMetric = 'euclidean';
+end
+if (nargin<4)
+    numClusters = 0;
+end
+
+distMap = pdist(data, distanceMetric);
+linkList = linkage(distMap, clusterMethod);
+% [~,T] = dendrogram(linkList,numClusters);
+
+
+end
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code/depthCheck.m	Wed Mar 15 11:33:55 2017 +0000
@@ -0,0 +1,36 @@
+function linkList = depthCheck(linkList)
+%% linkList = depthCheck(linkList)
+% depthCheck will extend a linkList, created by the linkage algorithm, and
+% append an extra column on the end which indicates the depth of the
+% linkage, so the top level is 1, and each following level is the number of
+% links needed to reach the top level - which could be considered the
+% number of rules that exist.
+% 
+% The other method for measuring depth would be
+% to look at the value of the linkage distance - thresholding and grouping
+% the linkage distances could be beneficial for some analysis.
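+%
+% Example (illustrative):
+%   Z = linkage(pdist(rand(6,2)));
+%   Zd = depthCheck(Z);   % Zd(:,end) holds the depth of each merge row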
+
+listSize = size(linkList,1)+1;
+
+linkList = cat(2,linkList, zeros(size(linkList,1),1));
+currentRow = size(linkList,1);
+r = [0;0];
+% depth = 1;
+
+linkList(currentRow,end) = 1;
+% depth = depth + 1;
+%%
+while (~isempty(currentRow))
+    row = currentRow(1);
+    for i = 1:2
+        r(i) = linkList(row,i);
+        if(r(i) > listSize)
+            r(i) = linkList(row,i) - listSize;
+            linkList(r(i),end) = linkList(currentRow(1),end)+1;
+            currentRow = [currentRow; r(i)];
+        end
+    end
+    currentRow = currentRow(2:end);
+end
+end
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code/rfFeatureSelection.m	Wed Mar 15 11:33:55 2017 +0000
@@ -0,0 +1,67 @@
+function featureVector = rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector)
+%% rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector)
+%
+% Uses random forests to perform feature selection for a given data set.
+% data has size (x,y), where x is the number of observations (one per
+% label) and y the number of features.
+% labels is the set of labels for the data
+% numFeatures is the dimension of the output vector (default 5)
+% iterMethod is the method by which the features are cut down
+%       * 'onePass' will simply select the top (numFeatures) features and
+%       report them 
+%       * 'cutX' will iteratively cut the bottom X percent of
+%       features out, and perform random forest feature selection on the
+%       new set, until the desired number of features has been returned
+%       * 'featureDeltaErr' will cut down the number of features based on
+%       the number of features that negatively impact the results, as given
+%       by the OOBPermutedVarDeltaError
+% featureVector is a list of the features to use, for recursive purposes.
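+%
+% Example (illustrative; requires the Statistics Toolbox's TreeBagger):
+%   fv = rfFeatureSelection(data, labels, 5, 'cut20');   % drop the worst 20% of features per pass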
+
+if(nargin < 2)
+    error('must pass data and labels into function');
+end
+if(length(labels) ~= size(data,1))
+    error('labels and data do not match up');
+end
+if(nargin < 3)
+    numFeatures = 5;
+end
+if(nargin < 4)
+    iterMethod = 'onePass';
+end
+if(nargin < 5)
+    numTrees = 200;
+end
+if(nargin < 6)
+    featureVector = 1:size(data,2);
+end
+
+
+if(length(featureVector) > numFeatures)
+    options = statset('UseParallel', true);
+    b = TreeBagger(numTrees, data(:,featureVector), labels,'OOBVarImp','On',...
+        'SampleWithReplacement', 'Off','FBoot', 0.632,'Options', options);
+    [FI,I] = sort(b.OOBPermutedVarDeltaError,'descend'); 
+    featureVector = featureVector(I);
+
+    if(strcmp(iterMethod,'onePass'))
+        featureVector = featureVector(1:numFeatures);
+    elseif(strcmp(iterMethod(1:3),'cut'))
+        cutPercentage = str2double(iterMethod(4:end));
+        cutSize = max(floor(length(featureVector)*cutPercentage/100),1);
+        if(length(featureVector) - cutSize < numFeatures)
+            cutSize = length(featureVector) - numFeatures;
+        end
+        featureVector = featureVector(1:end-cutSize);
+        featureVector = rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector);
+    elseif(strcmp(iterMethod,'featureDeltaErr'))
+        cutSize = sum(FI<0);
+        if(length(featureVector) - cutSize < numFeatures)
+            cutSize = length(featureVector) - numFeatures;
+        end
+        featureVector = featureVector(1:end-cutSize);
+        featureVector = rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector);
+    end
+end
+end
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code/runme.m	Wed Mar 15 11:33:55 2017 +0000
@@ -0,0 +1,13 @@
+
+%%
+% load('testData.mat');
+% [linkList, featureList]= treeLinkFeatures(data,5);
+% save('testResults.mat','linkList','featureList');
+
+%%
+load('adobeDataNorm.mat')
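+% adobeDataNorm.mat is expected to supply AdobeNormalised (the data matrix) and featureNames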
+[linkList, featureList]= treeLinkFeatures(AdobeNormalised,5,featureNames);
+save('adobeResults.mat','linkList','featureList');
+% exit
+
+%%
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code/testparseData.m	Wed Mar 15 11:33:55 2017 +0000
@@ -0,0 +1,33 @@
+% An attempt to make sense of the treeLinkFeatures output data in a
+% meaningful way, and to understand why so many of the selected feature
+% lists come out the same
+
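+% tl flags rows whose top-5 selected features are exactly [1 2 3 4 5],
+% i.e. splits where the ranked feature list is the identity ordering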
+tl = [];
+for i = 1:length(featureList)
+    t = zeros(5,1);
+    for ii = 1:5
+        t(ii) = (featureList{i}(ii) == ii);
+    end
+    tl = [tl; (sum(t)==5)];
+end
+
+%%
+compareList = linkList(find(tl),1:2);
+
+for i = 1:length(compareList)
+    try
+        t1 = T(mod(compareList(i,1),length(featureList)+1));
+        t2 = T(mod(compareList(i,2),length(featureList)+1));
+        if(t1 == t2)
+            fprintf('Line %d matches\n',i);
+        else
+            fprintf('Line %d FAILS\n', i);
+        end
+    catch
+        % to catch: Attempted to access T(0); index must be a positive integer or logical.
+        fprintf('Line %d CRASH **************\n',i);
+    end
+    %%% THIS DOESN'T WORK - Attempted to access T(0); index must be a positive integer or logical.
+
+
+end
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code/traceLinkageToBinary.m	Wed Mar 15 11:33:55 2017 +0000
@@ -0,0 +1,30 @@
+function classList = traceLinkageToBinary(linkList, rowIndex)
+%% classList = traceLinkageToBinary(linkList, rowIndex)
+% This function accepts a linkList and a rowIndex, and performs a transform
+% to provide a classification list for all the data points in the original
+% list. From a row index, points that fall under the column-1 child (lower
+% number) are labelled with that child's node index, and points under the
+% column-2 child (higher number) with that child's node index. Any data not
+% included in that branch of the hierarchy is given a class of 0.
+% linkList - the input result from linkage
+% rowIndex - the row on which to split the data
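+%
+% Example (illustrative; Z is a linkage matrix):
+%   cls = traceLinkageToBinary(Z, 3);   % label points by the two children of merge row 3; 0 = outside that branch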
+
+listSize = size(linkList,1)+1;
+c(1) = linkList(rowIndex,1);
+c(2) = linkList(rowIndex,2);
+for i = 1:2
+    if (c(i) > listSize)
+        c(i) = c(i) - listSize;
+    end
+end
+
+leafList1 = traverseDownOneStep(linkList,[],c(1));
+leafList2 = traverseDownOneStep(linkList,[],c(2));
+
+classList = zeros(listSize,1);
+classList(leafList1) = c(1);
+classList(leafList2) = c(2);
+
+
+end
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code/traverseDownOneStep.m	Wed Mar 15 11:33:55 2017 +0000
@@ -0,0 +1,32 @@
+function leaf = traverseDownOneStep(linkList,leaf,row)
+
+%% leaf = traverseDownOneStep(linkList,leaf,row)
+% Recursive function which, given a linkList, will search a given row; if
+% the row is a leaf, it appends the leaf to the end of the leaf list;
+% otherwise, it recursively calls itself to identify the leaves of the
+% two branches it has discovered
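+%
+% Example (illustrative; Z is a linkage matrix):
+%   leaves = traverseDownOneStep(Z, [], 4);   % all leaf indices under merge row 4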
+
+listSize = size(linkList,1)+1;
+if(row > listSize)
+    row = row-listSize;
+end
+
+if (row == listSize)
+    leaf = row;
+else
+    leaf1 = linkList(row,1);
+    leaf2 = linkList(row,2);
+
+    if(leaf1 > listSize)
+        leaf = traverseDownOneStep(linkList,leaf,leaf1);
+    else
+        leaf = cat(1,leaf,leaf1);
+    end
+
+    if(leaf2 > listSize)
+        leaf = traverseDownOneStep(linkList,leaf,leaf2);
+    else
+        leaf = cat(1,leaf,leaf2);
+    end
+end
+end
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code/treeLinkFeatures.m	Wed Mar 15 11:33:55 2017 +0000
@@ -0,0 +1,58 @@
+function [linkList, featureList]= treeLinkFeatures(data, depthThresh, featureNames)
+%% [linkList, featureList]= treeLinkFeatures(data, depthThresh, featureNames)
+% Given a dataset, a hierarchical clustering of the data is produced, and
+% the tree is then traversed so that, for each split in the data, a set of
+% ranked features is produced that can be used to separate the given
+% dataset at that point.
+% data is the nxm matrix of content, n is the number of samples and m is
+% the number of features.
+% depthThresh is a list of the range of tree depths to traverse from the
+% aglomerative clustering tree. A single value of depthThresh, will assume
+% 1:depthThresh. For analysis of a single layer of the tree, pass a list of
+% two values, both of which are the layer to be analysed.
+% featureNames is the list of feature names, so that grown trees have
+% suitable labels; if omitted, feature numbers are used instead.
+% featureList corresponds to the rows in linkList: column 1 holds the 5
+% most relevant features, column 2 the depth, and column 3 a
+% classification decision tree for the split - perhaps this should be in
+% the form of a struct instead?
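+%
+% Example (illustrative; requires the other functions in this directory):
+%   [Z, fl] = treeLinkFeatures(data, 3);   % analyse splits at tree depths 1-3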
+
+
+
+if(nargin < 3)
+    % default names are feature numbers as strings (fitctree's PredictorNames expects names, not numerics)
+    featureNames = arrayfun(@num2str, 1:size(data,2), 'UniformOutput', false);
+end
+if(nargin < 2)
+    depthThresh = 999;
+end
+
+if (length(depthThresh) == 1)
+    depthThresh = 1:depthThresh;
+end
+    
+linkList = aglomCluster(data);
+linkList = depthCheck(linkList);
+listSize = size(data,1);
+
+% linkList(:,4) = 0;
+featureList = cell(listSize-1,3);
+currentRow = [2*listSize-1];
+
+%%
+while (~isempty(currentRow))
+    if(currentRow(1) > listSize)
+        row = currentRow(1) - listSize;
+%         rD = linkList(row,4);
+        if any(linkList(row,4)==depthThresh)
+            classList = traceLinkageToBinary(linkList, row);
+            featureList{row,1} = rfFeatureSelection(data(classList>0,:), classList(classList>0));
+            featureList{row,2} = linkList(row,4);
+            featureList{row,3} = fitctree(data(classList>0,featureList{row,1}),classList(classList>0),'PredictorNames',featureNames(featureList{row,1}));
+        end
+        currentRow = [currentRow; linkList(row,1); linkList(row,2)];
+    end
+    currentRow = currentRow(2:end);
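+    % checkpoint: saves the entire workspace on every pass through the loop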
+    save('partialResults.mat');
+end
+
+end
\ No newline at end of file
--- a/phase2/aglomCluster.m	Wed Mar 15 11:26:24 2017 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,62 +0,0 @@
-function linkList = aglomCluster(data, clusterMethod, distanceMetric, numClusters)
-%% aglomCluster(data, clusterMethod, distanceMetric, numClusters)
-% This function performs aglomerative clustering on a given data set,
-% allowing the interpretation of a hierarchical data, and plotting a
-% dendrogram.
-%
-% data in the format of of each row is an observation and each column is a
-% feature vector clusterMethod;
-%     * 'average'     Unweighted average distance (UPGMA)
-%     * 'centroid'	Centroid distance (UPGMC), appropriate for Euclidean
-%     distances only
-%     * 'complete'	Furthest distance
-%     * 'median'      Weighted center of mass distance (WPGMC),appropriate
-%     for Euclidean distances only
-%     * 'single'      Shortest distance
-%     * 'ward'        Inner squared distance (minimum variance algorithm),
-%     appropriate for Euclidean distances only (default)
-%     * 'weighted'	Weighted average distance (WPGMA)
-% distanceMetric
-%     * 'euclidean' Euclidean distance (default).
-%     * 'seuclidean' Standardized Euclidean distance. Each coordinate
-%     difference between rows in X is scaled by dividing by the
-%     corresponding element of the standard deviation S=nanstd(X). To
-%     specify another value for S, use D=pdist(X,'seuclidean',S).
-%     * 'cityblock' City block metric.
-%     * 'minkowski' Minkowski distance. The default exponent is 2. To
-%     specify a different exponent, use D = pdist(X,'minkowski',P), where P
-%     is a scalar positive value of the exponent.
-%     * 'chebychev' Chebychev distance (maximum coordinate difference).
-%     * 'mahalanobis'	Mahalanobis distance, using the sample covariance
-%     of X as computed by nancov. To compute the distance with a different
-%     covariance, use D = pdist(X,'mahalanobis',C), where the matrix C is
-%     symmetric and positive definite.
-%     * 'cosine' One minus the cosine of the included angle between points
-%     (treated as vectors).
-%     * 'correlation' One minus the sample correlation between points
-%     (treated as sequences of values).
-%     * 'spearman' One minus the sample Spearman's rank correlation between
-%     observations (treated as sequences of values).
-%     * 'hamming' Hamming distance, which is the percentage of coordinates
-%     that differ.
-%     * 'jaccard' One minus the Jaccard coefficient, which is the
-%     percentage of nonzero coordinates that differ.
-% numClusters is the number of final clusters produced by the dendrogram,
-% if 0 (default), then will infer from data
-
-if(nargin<2)
-    clusterMethod = 'ward';
-end
-if(nargin<3)
-    distanceMetric = 'euclidean';
-end
-if (nargin<4)
-    numClusters = 0;
-end
-
-distMap = pdist(data, distanceMetric);
-linkList = linkage(distMap, clusterMethod);
-% [~,T] = dendrogram(linkList,numClusters);
-
-
-end
\ No newline at end of file
--- a/phase2/dataWithFeatures12345.txt	Wed Mar 15 11:26:24 2017 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,44 +0,0 @@
-    9.0000   49.0000    3.2879
-   15.0000   17.0000    3.2923
-    2.0000   12.0000    3.3344
-    3.0000  119.0000    3.4003
-    6.0000   32.0000    3.4004
-   27.0000   37.0000    3.4039
-   56.0000   62.0000    3.4140
-   16.0000   99.0000    3.4140
-   53.0000  100.0000    3.4310
-   13.0000   71.0000    3.4326
-   25.0000   67.0000    3.4425
-  105.0000  120.0000    3.4498
-   85.0000   88.0000    3.4524
-   31.0000   43.0000    3.4858
-   24.0000   41.0000    3.4884
-   18.0000   28.0000    3.4903
-   11.0000   14.0000    3.5708
-   35.0000   64.0000    3.5966
-  125.0000  161.0000    3.6400
-   36.0000  164.0000    3.6450
-   42.0000  126.0000    3.7072
-   20.0000   65.0000    3.7497
-    7.0000  158.0000    3.7764
-    4.0000   55.0000    3.7873
-  123.0000  137.0000    3.8042
-  149.0000  153.0000    3.8448
-   10.0000  163.0000    3.8986
-   34.0000  135.0000    3.9031
-  154.0000  174.0000    3.9159
-   73.0000  159.0000    3.9186
-  134.0000  139.0000    3.9261
-  136.0000  168.0000    3.9283
-   63.0000  147.0000    3.9340
-  148.0000  173.0000    3.9901
-   58.0000  142.0000    3.9974
-  140.0000  155.0000    4.0098
-   38.0000  138.0000    4.0161
-  143.0000  170.0000    4.0167
-  167.0000  172.0000    4.0602
-  169.0000  179.0000    4.2069
-  157.0000  184.0000    4.2088
-  129.0000  160.0000    4.2787
-  152.0000  176.0000    4.3593
-  121.0000  195.0000    4.3706
\ No newline at end of file
--- a/phase2/depthCheck.m	Wed Mar 15 11:26:24 2017 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,36 +0,0 @@
-function linkList = depthCheck(linkList)
-%% linkList = depthCheck(linkList)
-% depthCheck will extend a linkList, created by the linkages algorithm, and
-% append an extra column on the end which indicated the depth of the
-% linkage, so the top level is 1, and each following level is the number of
-% links needed to get to the top level - which could be considered the
-% number of rules that exist. 
-% 
-% The other method for measuring depth would be
-% to look at the value of the linkage distance - thresholding and grouping
-% the linkage distances could be beneficial for some analysis.
-
-listSize = size(linkList,1)+1;
-
-linkList = cat(2,linkList, zeros(size(linkList,1),1));
-currentRow = size(linkList,1);
-r = [0;0];
-% depth = 1;
-
-linkList(currentRow,end) = 1;
-% depth = depth + 1;
-%%
-while (~isempty(currentRow))
-    row = currentRow(1);
-    for i = 1:2
-        r(i) = linkList(row,i);
-        if(r(i) > listSize)
-            r(i) = linkList(row,i) - listSize;
-            linkList(r(i),end) = linkList(currentRow(1),end)+1;
-            currentRow = [currentRow; r(i)];
-        end
-    end
-    currentRow = currentRow(2:end);
-end
-end
-
--- a/phase2/rfFeatureSelection.m	Wed Mar 15 11:26:24 2017 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,67 +0,0 @@
-function featureVector = rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector)
-%% rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector)
-%
-% using random forests to perform feature selection for a given data set
-% data has size (x,y), where x is the number of labels and y, the number of
-% features. 
-% labels is the set of labels for the data
-% numFeatures is the dimension of the output vector (default 5)
-% iterMethod is the method for which the features are cut down
-%       * 'onePass' will simply select the top (numFeatures) features and
-%       report them 
-%       * 'cutX' will iteratively cut the bottom X percent of
-%       features out, and perform random forest feature selection on the
-%       new set, until the desired number of features has been returned
-%       * 'featureDeltaErr' will cut down the number of features based on
-%       the number of features that negatively impact the results, as given
-%       by the OOBPermutedVarDeltaError
-% featureVector is a list of the features to use, for recursive purposes.
-
-if(length(labels) ~= size(data,1))
-    error('labels and data do not match up');
-end
-
-if(nargin < 2)
-    error('must pass data and labels into function')
-end
-if(nargin < 3)
-    numFeatures = 5;
-end
-if(nargin < 4)
-    iterMethod = 'onePass';
-end
-if(nargin < 5)
-    numTrees = 200;
-end
-if(nargin < 5)
-    featureVector = 1:size(data,2);
-end
-
-
-if(length(featureVector) > numFeatures)
-    options = statset('UseParallel', true);
-    b = TreeBagger(numTrees, data(:,featureVector), labels,'OOBVarImp','On',...
-        'SampleWithReplacement', 'Off','FBoot', 0.632,'Options', options);
-    [FI,I] = sort(b.OOBPermutedVarDeltaError,'descend'); 
-    featureVector = featureVector(I);
-
-    if(strcmp(iterMethod,'onePass'))
-        featureVector = featureVector(1:numFeatures);
-    elseif(strcmp(iterMethod(1:3),'cut'))
-        cutPercentage = str2double(iterMethod(4:end));
-        cutSize = max(floor(length(featureVector)*cutPercentage/100),1);
-        if(length(featureVector) - cutSize < numFeatures)
-            cutSize = length(featureVector) - numFeatures;
-        end
-        featureVector = featureVector(1:end-cutSize);
-        featureVector = rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector);
-    elseif(strcmp(iterMethod,'featureDeltaErr'))
-        cutSize = sum(FI<0);
-        if(length(featureVector) - cutSize < numFeatures)
-            cutSize = length(featureVector) - numFeatures;
-        end
-        featureVector = featureVector(1:end-cutSize);
-        featureVector = rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector);
-    end
-end
-end
\ No newline at end of file
--- a/phase2/runme.m	Wed Mar 15 11:26:24 2017 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,13 +0,0 @@
-
-%%
-% load('testData.mat');
-% [linkList, featureList]= treeLinkFeatures(data,5);
-% save('testResults.mat','linkList','featureList');
-
-%%
-load('adobeDataNorm.mat')
-[linkList, featureList]= treeLinkFeatures(AdobeNormalised,5,featureNames);
-save('adobeResults.mat','linkList','featureList');
-% exit
-
-%%
--- a/phase2/testparseData.m	Wed Mar 15 11:26:24 2017 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,33 +0,0 @@
-% An attempt to make sense of the treeLinkFeatures output data in a
-% meaningful way, and to understand why so man 
-
-tl = [];
-for i = 1:length(featureList)
-    t = zeros(5,1);
-    for ii = 1:5
-        t(ii) = (featureList{i}(ii) == ii);
-    end
-    tl = [tl; (sum(t)==5)];
-end
-
-%%
-compareList = linkList(find(tl),1:2);
-
-for i = 1:length(compareList)
-    try
-        t1 = T(mod(compareList(i,1),length(featureList)+1));
-        t2 = T(mod(compareList(i,2),length(featureList)+1));
-        if(t1 == t2)
-            fprintf('Line %d matches\n',i);
-        else
-            fprintf('Line %d FAILS\n', i);
-        end
-    catch
-       %TO CATCH- Attempted to access T(0); index must be a positive integer or logical.
-    	fprintf('Line %d CRASH **************\n',i);
-
-    end    
-    %%% THIS DOESNT WORK - Attempted to access T(0); index must be a positive integer or logical.
-
-
-end
\ No newline at end of file
--- a/phase2/traceLinkageToBinary.m	Wed Mar 15 11:26:24 2017 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,30 +0,0 @@
-function classList = traceLinkageToBinary(linkList, rowIndex)
-%% class = traceLinkageToBinary(linkList, rowIndex)
-% This function accepts a linkList and a rowIndex, and performs a transform
-% to provide a classification list for all the data points in the original
-% list. From a row index, if the data falls under column 1 (lower number)
-% then it is given a class of 1, if it falls under column 2 (higher number)
-% then it is given a class of 2. Any data not included in that branch of
-% the hierarchy is given a class of 0
-% linkList - the input result from linkages
-% rowIndex - the row on which to split the data
-
-listSize = size(linkList,1)+1;
-c(1) = linkList(rowIndex,1);
-c(2) = linkList(rowIndex,2);
-for i = 1:2
-    if (c(i) > listSize)
-        c(i) = c(i) - listSize;
-    end
-end
-
-leafList1 = traverseDownOneStep(linkList,[],c(1));
-leafList2 = traverseDownOneStep(linkList,[],c(2));
-
-classList = zeros(listSize,1);
-classList(leafList1) = c(1);
-classList(leafList2) = c(2);
-
-
-end
-
--- a/phase2/traverseDownOneStep.m	Wed Mar 15 11:26:24 2017 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,32 +0,0 @@
-function leaf = traverseDownOneStep(linkList,leaf,row)
-
-%% leaf = traverseDownOneStep(linkList,leaf,row)
-% Recursive function which given a linkList, will search a given row, and
-% if the row is a leaf, it will append the leaf to the end of the leaf
-% list, otherwise, it will recursively call the function to identify the
-% two leaves for the branches it has discovered
-
-listSize = size(linkList,1)+1;
-if(row > listSize)
-    row = row-listSize;
-end
-
-if (row == listSize)
-    leaf = row;
-else
-    leaf1 = linkList(row,1);
-    leaf2 = linkList(row,2);
-
-    if(leaf1 > listSize)
-        leaf = traverseDownOneStep(linkList,leaf,leaf1);
-    else
-        leaf = cat(1,leaf,leaf1);
-    end
-
-    if(leaf2 > listSize)
-        leaf = traverseDownOneStep(linkList,leaf,leaf2);
-    else
-        leaf = cat(1,leaf,leaf2);
-    end
-end
-end
\ No newline at end of file
--- a/phase2/treeLinkFeatures.m	Wed Mar 15 11:26:24 2017 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,58 +0,0 @@
-function [linkList, featureList]= treeLinkFeatures(data, depthThresh, featureNames)
-%% [linkList, featureList]= treeLinkFeatures(data, depthThresh, featureNames)
-% given a dataset, a hierarchical cluster of the data is produced, and then
-% the data is traversed, such that, for each split in the data, a set of
-% features are produced, which are the ranked features that can be used to
-% separate the given dataset at that point.
-% data is the nxm matrix of content, n is the number of samples and m is
-% the number of features.
-% depthThresh is a list of the range of tree depths to traverse from the
-% aglomerative clustering tree. A single value of depthThresh, will assume
-% 1:depthThresh. For analysis of a single layer of the tree, pass a list of
-% two values, both of which are the layer to be analysed.
-% feature names is the list of features, so that grown trees have suitable
-% names. No feature names will result in the feature number being returned.
-% featureList corresponds to the rows in linkList, with the form column 1
-% is the 5 most relevant features, column 2 is the depth and column 3 is a
-% decision classification tree for the decision - perhaps this should be in
-% the form of a struct instead?
-
-
-
-if(nargin < 3)
-    featureNames = 1:size(data,2);
-end
-if(nargin < 2)
-    depthThresh = 999;
-end
-
-if (length(depthThresh) == 1)
-    depthThresh = 1:depthThresh;
-end
-    
-linkList = aglomCluster(data);
-linkList = depthCheck(linkList);
-listSize = size(data,1);
-
-% linkList(:,4) = 0;
-featureList = cell(listSize-1,3);
-currentRow = [2*listSize-1];
-
-%%
-while (~isempty(currentRow))
-    if(currentRow(1) > listSize)
-        row = currentRow(1) - listSize
-%         rD = linkList(row,4);
-        if any(linkList(row,4)==depthThresh)
-            classList = traceLinkageToBinary(linkList, row);
-            featureList{row,1} = rfFeatureSelection(data(classList>0,:), classList(classList>0));
-            featureList{row,2} = linkList(row,4);
-            featureList{row,3} = fitctree(data(classList>0,featureList{row,1}),classList(classList>0),'PredictorNames',featureNames(featureList{row,1}));
-        end
-        currentRow = [currentRow; linkList(row,1); linkList(row,2)];
-    end
-    currentRow = currentRow(2:end);
-    save('partialResults.mat');
-end
-
-end
\ No newline at end of file