Mercurial > hg > sfx-subgrouping
changeset 32:4bdcab1e821c
tidy up directory
author | DaveM |
---|---|
date | Wed, 15 Mar 2017 11:33:55 +0000 |
parents | 55813e99c6cf |
children | 74d123779d3b |
files | _code/Gensim LDA tutorial.ipynb _code/Hierarchical Clustering.ipynb code/Gensim LDA tutorial.ipynb code/Hierarchical Clustering.ipynb code/RUNME.sh code/aglomCluster.m code/depthCheck.m code/rfFeatureSelection.m code/runme.m code/testparseData.m code/traceLinkageToBinary.m code/traverseDownOneStep.m code/treeLinkFeatures.m phase2/aglomCluster.m phase2/dataWithFeatures12345.txt phase2/depthCheck.m phase2/rfFeatureSelection.m phase2/runme.m phase2/testparseData.m phase2/traceLinkageToBinary.m phase2/traverseDownOneStep.m phase2/treeLinkFeatures.m |
diffstat | 22 files changed, 650 insertions(+), 693 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/_code/Gensim LDA tutorial.ipynb Wed Mar 15 11:33:55 2017 +0000 @@ -0,0 +1,111 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.10.0\n" + ] + } + ], + "source": [ + "import six\n", + "print six.__version__\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import gensim\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import logging\n", + "import gensim\n", + "import bz2\n", + "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "IOError", + "evalue": "[Errno 2] No such file or directory: 'wiki_en_wordids.txt'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIOError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-4-54645b7b2c38>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# load id->word mapping (the dictionary), one of the results of step 2 above\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mid2word\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpora\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDictionary\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_from_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'wiki_en_wordids.txt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;31m# load corpus 
iterator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mmm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpora\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMmCorpus\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'wiki_en_tfidf.mm'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m# mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Library/Python/2.7/site-packages/gensim/corpora/dictionary.pyc\u001b[0m in \u001b[0;36mload_from_text\u001b[0;34m(fname)\u001b[0m\n\u001b[1;32m 342\u001b[0m \"\"\"\n\u001b[1;32m 343\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDictionary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 344\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msmart_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 345\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mlineno\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 346\u001b[0m \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_unicode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Library/Python/2.7/site-packages/smart_open/smart_open_lib.pyc\u001b[0m in \u001b[0;36msmart_open\u001b[0;34m(uri, mode, **kw)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0;31m# local files -- both 
read & write supported\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0;31m# compression, if any, is determined by the filename extension (.gz, .bz2)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 127\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfile_smart_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparsed_uri\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muri_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 128\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mparsed_uri\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscheme\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\"s3\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"s3n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[0;31m# Get an S3 host. It is required for sigv4 operations.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Library/Python/2.7/site-packages/smart_open/smart_open_lib.pyc\u001b[0m in \u001b[0;36mfile_smart_open\u001b[0;34m(fname, mode)\u001b[0m\n\u001b[1;32m 556\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmake_closing\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGzipFile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 557\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 558\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 559\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 560\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mIOError\u001b[0m: [Errno 2] No such file or directory: 'wiki_en_wordids.txt'" + 
] + } + ], + "source": [ + "# load id->word mapping (the dictionary), one of the results of step 2 above\n", + "id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')\n", + "# load corpus iterator\n", + "mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')\n", + "# mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output\n", + "\n", + "print(mm)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/_code/Hierarchical Clustering.ipynb Wed Mar 15 11:33:55 2017 +0000 @@ -0,0 +1,207 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from matplotlib import pyplot as plt\n", + "from scipy.cluster.hierarchy import dendrogram, linkage, cophenet\n", + "from scipy.spatial.distance import pdist\n", + "import sklearn \n", + "import numpy as np\n", + "import csv\n", + "\n", + "dataFolder = '../data/'\n", + "keyFile = 'AdobeNormalised'\n", + "datapath = dataFolder + keyFile" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "X = np.genfromtxt(datapath+'.csv', delimiter = ',', skip_header = 1)\n", + "filenames = np.loadtxt(datapath+'_filenames.csv', dtype = str)\n", + "labels = np.loadtxt(datapath+'_labels.csv', dtype = str)\n", + "features = np.loadtxt(datapath+'_features.csv', dtype = str)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "agglo = cluster.FeatureAgglomeration()\n", + "agglo.fit(X)\n", + "X_reduced = agglo.transform(X)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "Z = linkage(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 8.51810000e-01 4.00000000e-06 2.46000000e-04 ..., 2.10260000e-02\n", + " 1.98220000e-02 1.04000000e-04]\n", + " [ 9.52275000e-01 7.00000000e-06 1.82600000e-03 ..., 1.79490000e-02\n", + " 1.09020000e-02 7.20000000e-05]\n", + " [ 1.92200000e-03 1.00000000e-06 1.39000000e-04 ..., 2.35900000e-02\n", + " 6.93800000e-03 2.61000000e-04]\n", + " ..., \n", + " [ 9.96346000e-01 3.37000000e-04 
1.23600000e-03 ..., 5.24103000e-01\n", + " 3.36967000e-01 5.39000000e-04]\n", + " [ 9.99990000e-01 1.00000000e-06 0.00000000e+00 ..., 0.00000000e+00\n", + " 0.00000000e+00 0.00000000e+00]\n", + " [ 9.96624000e-01 6.97000000e-04 2.59300000e-03 ..., 5.24615000e-01\n", + " 3.34985000e-01 5.45000000e-04]]\n" + ] + } + ], + "source": [ + "print X" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(8977, 1536)\n" + ] + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'nu_0': 0, 'kappa_0': 0, 'lambda_0': 0, 'mu_0': 0}\n" + ] + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pyBHC as bhc\n", + "from pyBHC import dists\n", + "\n", + "mu_init = []\n", + "sigma_init = []\n", + "S_init = []\n", + "cd = dists.NormalFixedCovar(mu_0=mu_init,sigma_0=sigma_init, S=S_init)\n", + "\n", + "# temp = cd.log_marginal_likelihood(X)\n", + "d = bhc.rbhc(X, cd)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
"version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}
--- a/code/Gensim LDA tutorial.ipynb Wed Mar 15 11:26:24 2017 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,111 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1.10.0\n" - ] - } - ], - "source": [ - "import six\n", - "print six.__version__\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import gensim\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import logging\n", - "import gensim\n", - "import bz2\n", - "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "ename": "IOError", - "evalue": "[Errno 2] No such file or directory: 'wiki_en_wordids.txt'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mIOError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m<ipython-input-4-54645b7b2c38>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# load id->word mapping (the dictionary), one of the results of step 2 above\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mid2word\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpora\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDictionary\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_from_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'wiki_en_wordids.txt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;31m# load corpus 
iterator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mmm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpora\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMmCorpus\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'wiki_en_tfidf.mm'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m# mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/Library/Python/2.7/site-packages/gensim/corpora/dictionary.pyc\u001b[0m in \u001b[0;36mload_from_text\u001b[0;34m(fname)\u001b[0m\n\u001b[1;32m 342\u001b[0m \"\"\"\n\u001b[1;32m 343\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDictionary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 344\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msmart_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 345\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mlineno\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 346\u001b[0m \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_unicode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/Library/Python/2.7/site-packages/smart_open/smart_open_lib.pyc\u001b[0m in \u001b[0;36msmart_open\u001b[0;34m(uri, mode, **kw)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0;31m# local files -- both 
read & write supported\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0;31m# compression, if any, is determined by the filename extension (.gz, .bz2)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 127\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfile_smart_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparsed_uri\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muri_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 128\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mparsed_uri\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscheme\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\"s3\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"s3n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[0;31m# Get an S3 host. It is required for sigv4 operations.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/Library/Python/2.7/site-packages/smart_open/smart_open_lib.pyc\u001b[0m in \u001b[0;36mfile_smart_open\u001b[0;34m(fname, mode)\u001b[0m\n\u001b[1;32m 556\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmake_closing\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGzipFile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 557\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 558\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 559\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 560\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mIOError\u001b[0m: [Errno 2] No such file or directory: 'wiki_en_wordids.txt'" - 
] - } - ], - "source": [ - "# load id->word mapping (the dictionary), one of the results of step 2 above\n", - "id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')\n", - "# load corpus iterator\n", - "mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')\n", - "# mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output\n", - "\n", - "print(mm)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.10" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -}
--- a/code/Hierarchical Clustering.ipynb Wed Mar 15 11:26:24 2017 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,207 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "from matplotlib import pyplot as plt\n", - "from scipy.cluster.hierarchy import dendrogram, linkage, cophenet\n", - "from scipy.spatial.distance import pdist\n", - "import sklearn \n", - "import numpy as np\n", - "import csv\n", - "\n", - "dataFolder = '../data/'\n", - "keyFile = 'AdobeNormalised'\n", - "datapath = dataFolder + keyFile" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "X = np.genfromtxt(datapath+'.csv', delimiter = ',', skip_header = 1)\n", - "filenames = np.loadtxt(datapath+'_filenames.csv', dtype = str)\n", - "labels = np.loadtxt(datapath+'_labels.csv', dtype = str)\n", - "features = np.loadtxt(datapath+'_features.csv', dtype = str)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "agglo = cluster.FeatureAgglomeration()\n", - "agglo.fit(X)\n", - "X_reduced = agglo.transform(X)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "Z = linkage(X)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[ 8.51810000e-01 4.00000000e-06 2.46000000e-04 ..., 2.10260000e-02\n", - " 1.98220000e-02 1.04000000e-04]\n", - " [ 9.52275000e-01 7.00000000e-06 1.82600000e-03 ..., 1.79490000e-02\n", - " 1.09020000e-02 7.20000000e-05]\n", - " [ 1.92200000e-03 1.00000000e-06 1.39000000e-04 ..., 2.35900000e-02\n", - " 6.93800000e-03 2.61000000e-04]\n", - " ..., \n", - " [ 9.96346000e-01 3.37000000e-04 
1.23600000e-03 ..., 5.24103000e-01\n", - " 3.36967000e-01 5.39000000e-04]\n", - " [ 9.99990000e-01 1.00000000e-06 0.00000000e+00 ..., 0.00000000e+00\n", - " 0.00000000e+00 0.00000000e+00]\n", - " [ 9.96624000e-01 6.97000000e-04 2.59300000e-03 ..., 5.24615000e-01\n", - " 3.34985000e-01 5.45000000e-04]]\n" - ] - } - ], - "source": [ - "print X" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(8977, 1536)\n" - ] - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'nu_0': 0, 'kappa_0': 0, 'lambda_0': 0, 'mu_0': 0}\n" - ] - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import pyBHC as bhc\n", - "from pyBHC import dists\n", - "\n", - "mu_init = []\n", - "sigma_init = []\n", - "S_init = []\n", - "cd = dists.NormalFixedCovar(mu_0=mu_init,sigma_0=sigma_init, S=S_init)\n", - "\n", - "# temp = cd.log_marginal_likelihood(X)\n", - "d = bhc.rbhc(X, cd)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - 
"version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.10" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code/RUNME.sh Wed Mar 15 11:33:55 2017 +0000 @@ -0,0 +1,1 @@ +screen -d -L -m matlab -nodisplay -nojvm -r runme
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code/aglomCluster.m Wed Mar 15 11:33:55 2017 +0000 @@ -0,0 +1,62 @@ +function linkList = aglomCluster(data, clusterMethod, distanceMetric, numClusters) +%% aglomCluster(data, clusterMethod, distanceMetric, numClusters) +% This function performs aglomerative clustering on a given data set, +% allowing the interpretation of a hierarchical data, and plotting a +% dendrogram. +% +% data in the format of of each row is an observation and each column is a +% feature vector clusterMethod; +% * 'average' Unweighted average distance (UPGMA) +% * 'centroid' Centroid distance (UPGMC), appropriate for Euclidean +% distances only +% * 'complete' Furthest distance +% * 'median' Weighted center of mass distance (WPGMC),appropriate +% for Euclidean distances only +% * 'single' Shortest distance +% * 'ward' Inner squared distance (minimum variance algorithm), +% appropriate for Euclidean distances only (default) +% * 'weighted' Weighted average distance (WPGMA) +% distanceMetric +% * 'euclidean' Euclidean distance (default). +% * 'seuclidean' Standardized Euclidean distance. Each coordinate +% difference between rows in X is scaled by dividing by the +% corresponding element of the standard deviation S=nanstd(X). To +% specify another value for S, use D=pdist(X,'seuclidean',S). +% * 'cityblock' City block metric. +% * 'minkowski' Minkowski distance. The default exponent is 2. To +% specify a different exponent, use D = pdist(X,'minkowski',P), where P +% is a scalar positive value of the exponent. +% * 'chebychev' Chebychev distance (maximum coordinate difference). +% * 'mahalanobis' Mahalanobis distance, using the sample covariance +% of X as computed by nancov. To compute the distance with a different +% covariance, use D = pdist(X,'mahalanobis',C), where the matrix C is +% symmetric and positive definite. +% * 'cosine' One minus the cosine of the included angle between points +% (treated as vectors). 
+% * 'correlation' One minus the sample correlation between points +% (treated as sequences of values). +% * 'spearman' One minus the sample Spearman's rank correlation between +% observations (treated as sequences of values). +% * 'hamming' Hamming distance, which is the percentage of coordinates +% that differ. +% * 'jaccard' One minus the Jaccard coefficient, which is the +% percentage of nonzero coordinates that differ. +% numClusters is the number of final clusters produced by the dendrogram, +% if 0 (default), then will infer from data + +if(nargin<2) + clusterMethod = 'ward'; +end +if(nargin<3) + distanceMetric = 'euclidean'; +end +if (nargin<4) + numClusters = 0; +end + +distMap = pdist(data, distanceMetric); +linkList = linkage(distMap, clusterMethod); +% [~,T] = dendrogram(linkList,numClusters); + + +end \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code/depthCheck.m Wed Mar 15 11:33:55 2017 +0000 @@ -0,0 +1,36 @@ +function linkList = depthCheck(linkList) +%% linkList = depthCheck(linkList) +% depthCheck will extend a linkList, created by the linkages algorithm, and +% append an extra column on the end which indicated the depth of the +% linkage, so the top level is 1, and each following level is the number of +% links needed to get to the top level - which could be considered the +% number of rules that exist. +% +% The other method for measuring depth would be +% to look at the value of the linkage distance - thresholding and grouping +% the linkage distances could be beneficial for some analysis. + +listSize = size(linkList,1)+1; + +linkList = cat(2,linkList, zeros(size(linkList,1),1)); +currentRow = size(linkList,1); +r = [0;0]; +% depth = 1; + +linkList(currentRow,end) = 1; +% depth = depth + 1; +%% +while (~isempty(currentRow)) + row = currentRow(1); + for i = 1:2 + r(i) = linkList(row,i); + if(r(i) > listSize) + r(i) = linkList(row,i) - listSize; + linkList(r(i),end) = linkList(currentRow(1),end)+1; + currentRow = [currentRow; r(i)]; + end + end + currentRow = currentRow(2:end); +end +end +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code/rfFeatureSelection.m Wed Mar 15 11:33:55 2017 +0000 @@ -0,0 +1,67 @@ +function featureVector = rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector) +%% rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector) +% +% using random forests to perform feature selection for a given data set +% data has size (x,y), where x is the number of labels and y, the number of +% features. +% labels is the set of labels for the data +% numFeatures is the dimension of the output vector (default 5) +% iterMethod is the method for which the features are cut down +% * 'onePass' will simply select the top (numFeatures) features and +% report them +% * 'cutX' will iteratively cut the bottom X percent of +% features out, and perform random forest feature selection on the +% new set, until the desired number of features has been returned +% * 'featureDeltaErr' will cut down the number of features based on +% the number of features that negatively impact the results, as given +% by the OOBPermutedVarDeltaError +% featureVector is a list of the features to use, for recursive purposes. + +if(length(labels) ~= size(data,1)) + error('labels and data do not match up'); +end + +if(nargin < 2) + error('must pass data and labels into function') +end +if(nargin < 3) + numFeatures = 5; +end +if(nargin < 4) + iterMethod = 'onePass'; +end +if(nargin < 5) + numTrees = 200; +end +if(nargin < 5) + featureVector = 1:size(data,2); +end + + +if(length(featureVector) > numFeatures) + options = statset('UseParallel', true); + b = TreeBagger(numTrees, data(:,featureVector), labels,'OOBVarImp','On',... 
+ 'SampleWithReplacement', 'Off','FBoot', 0.632,'Options', options); + [FI,I] = sort(b.OOBPermutedVarDeltaError,'descend'); + featureVector = featureVector(I); + + if(strcmp(iterMethod,'onePass')) + featureVector = featureVector(1:numFeatures); + elseif(strcmp(iterMethod(1:3),'cut')) + cutPercentage = str2double(iterMethod(4:end)); + cutSize = max(floor(length(featureVector)*cutPercentage/100),1); + if(length(featureVector) - cutSize < numFeatures) + cutSize = length(featureVector) - numFeatures; + end + featureVector = featureVector(1:end-cutSize); + featureVector = rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector); + elseif(strcmp(iterMethod,'featureDeltaErr')) + cutSize = sum(FI<0); + if(length(featureVector) - cutSize < numFeatures) + cutSize = length(featureVector) - numFeatures; + end + featureVector = featureVector(1:end-cutSize); + featureVector = rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector); + end +end +end \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code/runme.m Wed Mar 15 11:33:55 2017 +0000 @@ -0,0 +1,13 @@ + +%% +% load('testData.mat'); +% [linkList, featureList]= treeLinkFeatures(data,5); +% save('testResults.mat','linkList','featureList'); + +%% +load('adobeDataNorm.mat') +[linkList, featureList]= treeLinkFeatures(AdobeNormalised,5,featureNames); +save('adobeResults.mat','linkList','featureList'); +% exit + +%%
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code/testparseData.m Wed Mar 15 11:33:55 2017 +0000 @@ -0,0 +1,33 @@ +% An attempt to make sense of the treeLinkFeatures output data in a +% meaningful way, and to understand why so man + +tl = []; +for i = 1:length(featureList) + t = zeros(5,1); + for ii = 1:5 + t(ii) = (featureList{i}(ii) == ii); + end + tl = [tl; (sum(t)==5)]; +end + +%% +compareList = linkList(find(tl),1:2); + +for i = 1:length(compareList) + try + t1 = T(mod(compareList(i,1),length(featureList)+1)); + t2 = T(mod(compareList(i,2),length(featureList)+1)); + if(t1 == t2) + fprintf('Line %d matches\n',i); + else + fprintf('Line %d FAILS\n', i); + end + catch + %TO CATCH- Attempted to access T(0); index must be a positive integer or logical. + fprintf('Line %d CRASH **************\n',i); + + end + %%% THIS DOESNT WORK - Attempted to access T(0); index must be a positive integer or logical. + + +end \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code/traceLinkageToBinary.m Wed Mar 15 11:33:55 2017 +0000 @@ -0,0 +1,30 @@ +function classList = traceLinkageToBinary(linkList, rowIndex) +%% class = traceLinkageToBinary(linkList, rowIndex) +% This function accepts a linkList and a rowIndex, and performs a transform +% to provide a classification list for all the data points in the original +% list. From a row index, if the data falls under column 1 (lower number) +% then it is given a class of 1, if it falls under column 2 (higher number) +% then it is given a class of 2. Any data not included in that branch of +% the hierarchy is given a class of 0 +% linkList - the input result from linkages +% rowIndex - the row on which to split the data + +listSize = size(linkList,1)+1; +c(1) = linkList(rowIndex,1); +c(2) = linkList(rowIndex,2); +for i = 1:2 + if (c(i) > listSize) + c(i) = c(i) - listSize; + end +end + +leafList1 = traverseDownOneStep(linkList,[],c(1)); +leafList2 = traverseDownOneStep(linkList,[],c(2)); + +classList = zeros(listSize,1); +classList(leafList1) = c(1); +classList(leafList2) = c(2); + + +end +
function leaf = traverseDownOneStep(linkList,leaf,row)
%% leaf = traverseDownOneStep(linkList,leaf,row)
% Recursively walk the linkage tree from a given row, accumulating leaf
% (original-observation) indices onto the end of `leaf`. A child id greater
% than the number of observations denotes an internal node and is followed
% recursively; otherwise the child is appended as a leaf.
%   linkList - linkage matrix (output of linkage())
%   leaf     - accumulator of leaf indices found so far (pass [] initially)
%   row      - linkList row to expand; a node id > listSize is also accepted
%              and converted to its row

nObs = size(linkList,1) + 1;   % number of original observations

% Accept a raw node id as well as a row index.
if row > nObs
    row = row - nObs;
end

% NOTE(review): quirk preserved from the original — when row == nObs the
% accumulated list is discarded and replaced by that single index. This
% branch is unreachable for node ids (they convert to rows <= nObs-1) and
% only fires on a direct call with row == nObs; confirm it is intended.
if row == nObs
    leaf = row;
    return;
end

children = linkList(row,1:2);
for k = 1:2
    child = children(k);
    if child > nObs
        leaf = traverseDownOneStep(linkList, leaf, child);
    else
        leaf = [leaf; child]; %#ok<AGROW> % small lists; growth is fine here
    end
end
end
function [linkList, featureList] = treeLinkFeatures(data, depthThresh, featureNames)
%% [linkList, featureList] = treeLinkFeatures(data, depthThresh, featureNames)
% Build a hierarchical cluster of the data, then traverse it so that, for
% each split, a set of ranked features is produced which separates the
% dataset at that point.
%   data        - n-by-m matrix; n samples, m features.
%   depthThresh - range of tree depths to traverse from the agglomerative
%                 clustering tree. A scalar d is treated as 1:d. To analyse
%                 a single layer, pass [layer layer].
%   featureNames- names for the features so grown trees are labelled; when
%                 omitted, the feature numbers (as strings) are used.
%   featureList - rows correspond to linkList rows: column 1 is the 5 most
%                 relevant features, column 2 the depth, column 3 a fitted
%                 classification tree for the split.

if (nargin < 3)
    % fitctree's 'PredictorNames' expects names; the original numeric
    % default (1:m) would be rejected there, so default to number strings.
    featureNames = arrayfun(@num2str, 1:size(data,2), 'UniformOutput', false);
end
if (nargin < 2)
    depthThresh = 999;   % effectively "all depths"
end
if (length(depthThresh) == 1)
    depthThresh = 1:depthThresh;
end

linkList = aglomCluster(data);
linkList = depthCheck(linkList);     % appends a depth column (column 4)
listSize = size(data,1);

featureList = cell(listSize-1,3);
currentRow = 2*listSize - 1;         % start the traversal at the root node

%%
while (~isempty(currentRow))
    if (currentRow(1) > listSize)
        % Semicolon added: the original echoed this to the console per node.
        row = currentRow(1) - listSize;
        if any(linkList(row,4) == depthThresh)
            classList = traceLinkageToBinary(linkList, row);
            inBranch = classList > 0;   % observations under this split
            featureList{row,1} = rfFeatureSelection(data(inBranch,:), classList(inBranch));
            featureList{row,2} = linkList(row,4);
            featureList{row,3} = fitctree(data(inBranch, featureList{row,1}), ...
                classList(inBranch), 'PredictorNames', featureNames(featureList{row,1}));
        end
        % Enqueue both children; leaf ids are filtered by the guard above.
        currentRow = [currentRow; linkList(row,1); linkList(row,2)];
    end
    currentRow = currentRow(2:end);
    % Checkpoint the whole workspace each iteration (slow but crash-safe;
    % kept from the original).
    save('partialResults.mat');
end

end
function linkList = aglomCluster(data, clusterMethod, distanceMetric, numClusters)
%% aglomCluster(data, clusterMethod, distanceMetric, numClusters)
% Agglomerative (hierarchical) clustering of a dataset, returning the
% linkage matrix for interpretation or dendrogram plotting.
%
%   data           - each row an observation, each column a feature.
%   clusterMethod  - linkage method: 'average' (UPGMA), 'centroid' (UPGMC,
%                    Euclidean only), 'complete', 'median' (WPGMC, Euclidean
%                    only), 'single', 'ward' (default, Euclidean only),
%                    'weighted' (WPGMA).
%   distanceMetric - pdist metric: 'euclidean' (default), 'seuclidean',
%                    'cityblock', 'minkowski', 'chebychev', 'mahalanobis',
%                    'cosine', 'correlation', 'spearman', 'hamming',
%                    'jaccard'. See pdist documentation for details.
%   numClusters    - number of final dendrogram clusters; 0 (default) infers
%                    from the data. NOTE(review): currently unused — the
%                    dendrogram call that consumed it is commented out.

if nargin < 2, clusterMethod  = 'ward';      end
if nargin < 3, distanceMetric = 'euclidean'; end
if nargin < 4, numClusters    = 0;           end %#ok<NASGU> % kept for interface

% Pairwise distances feed straight into the linkage computation.
linkList = linkage(pdist(data, distanceMetric), clusterMethod);
% [~,T] = dendrogram(linkList,numClusters);

end
--- a/phase2/dataWithFeatures12345.txt Wed Mar 15 11:26:24 2017 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ - 9.0000 49.0000 3.2879 - 15.0000 17.0000 3.2923 - 2.0000 12.0000 3.3344 - 3.0000 119.0000 3.4003 - 6.0000 32.0000 3.4004 - 27.0000 37.0000 3.4039 - 56.0000 62.0000 3.4140 - 16.0000 99.0000 3.4140 - 53.0000 100.0000 3.4310 - 13.0000 71.0000 3.4326 - 25.0000 67.0000 3.4425 - 105.0000 120.0000 3.4498 - 85.0000 88.0000 3.4524 - 31.0000 43.0000 3.4858 - 24.0000 41.0000 3.4884 - 18.0000 28.0000 3.4903 - 11.0000 14.0000 3.5708 - 35.0000 64.0000 3.5966 - 125.0000 161.0000 3.6400 - 36.0000 164.0000 3.6450 - 42.0000 126.0000 3.7072 - 20.0000 65.0000 3.7497 - 7.0000 158.0000 3.7764 - 4.0000 55.0000 3.7873 - 123.0000 137.0000 3.8042 - 149.0000 153.0000 3.8448 - 10.0000 163.0000 3.8986 - 34.0000 135.0000 3.9031 - 154.0000 174.0000 3.9159 - 73.0000 159.0000 3.9186 - 134.0000 139.0000 3.9261 - 136.0000 168.0000 3.9283 - 63.0000 147.0000 3.9340 - 148.0000 173.0000 3.9901 - 58.0000 142.0000 3.9974 - 140.0000 155.0000 4.0098 - 38.0000 138.0000 4.0161 - 143.0000 170.0000 4.0167 - 167.0000 172.0000 4.0602 - 169.0000 179.0000 4.2069 - 157.0000 184.0000 4.2088 - 129.0000 160.0000 4.2787 - 152.0000 176.0000 4.3593 - 121.0000 195.0000 4.3706 \ No newline at end of file
function linkList = depthCheck(linkList)
%% linkList = depthCheck(linkList)
% Extend a linkage matrix with an extra column holding each merge's depth:
% the top (root) merge is depth 1, and every other merge is one more than
% its parent — i.e. the number of links needed to reach the root, which can
% be read as the number of rules above it.
%
% An alternative depth measure would be to threshold and group the linkage
% distances themselves, which could suit some analyses better.

numMerges = size(linkList,1);
nLeaves = numMerges + 1;          % node ids above nLeaves are internal nodes

linkList = [linkList, zeros(numMerges,1)];   % new depth column
linkList(numMerges,end) = 1;                 % root merge sits at depth 1

% Breadth-first walk from the root, stamping each child merge's depth.
queue = numMerges;
while ~isempty(queue)
    parent = queue(1);
    queue(1) = [];
    for child = linkList(parent,1:2)
        if child > nLeaves
            childRow = child - nLeaves;      % convert node id to merge row
            linkList(childRow,end) = linkList(parent,end) + 1;
            queue(end+1) = childRow; %#ok<AGROW>
        end
    end
end
end
function featureVector = rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector)
%% rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector)
% Random-forest feature selection for a labelled dataset.
%   data          - (x,y): x observations, y features.
%   labels        - label per observation.
%   numFeatures   - size of the output vector (default 5).
%   iterMethod    - how features are cut down:
%       * 'onePass'         take the top numFeatures features directly.
%       * 'cutX'            iteratively drop the bottom X percent of
%                           features and re-run, until numFeatures remain.
%       * 'featureDeltaErr' iteratively drop features with negative
%                           OOBPermutedVarDeltaError (those that hurt).
%   numTrees      - trees per TreeBagger ensemble (default 200).
%   featureVector - feature indices to consider (default all); used by the
%                   recursive calls.
% Returns the selected feature indices, best first.

% Argument-presence checks first: the original compared length(labels)
% before verifying labels existed, producing the wrong error for nargin < 2.
if (nargin < 2)
    error('must pass data and labels into function')
end
if (length(labels) ~= size(data,1))
    error('labels and data do not match up');
end
if (nargin < 3)
    numFeatures = 5;
end
if (nargin < 4)
    iterMethod = 'onePass';
end
if (nargin < 5)
    numTrees = 200;
end
% BUG FIX: this guard was 'nargin < 5' in the original (duplicated above),
% so a 5-argument call left featureVector undefined and crashed below.
if (nargin < 6)
    featureVector = 1:size(data,2);
end

if (length(featureVector) > numFeatures)
    options = statset('UseParallel', true);
    % Out-of-bag permuted importance ranks the candidate features.
    b = TreeBagger(numTrees, data(:,featureVector), labels, 'OOBVarImp', 'On', ...
        'SampleWithReplacement', 'Off', 'FBoot', 0.632, 'Options', options);
    [FI,I] = sort(b.OOBPermutedVarDeltaError, 'descend');
    featureVector = featureVector(I);

    if (strcmp(iterMethod,'onePass'))
        featureVector = featureVector(1:numFeatures);
    elseif (strcmp(iterMethod(1:3),'cut'))
        % 'cutX': drop the bottom X percent (at least one feature), but
        % never drop below numFeatures, then recurse on the survivors.
        cutPercentage = str2double(iterMethod(4:end));
        cutSize = max(floor(length(featureVector)*cutPercentage/100), 1);
        if (length(featureVector) - cutSize < numFeatures)
            cutSize = length(featureVector) - numFeatures;
        end
        featureVector = featureVector(1:end-cutSize);
        featureVector = rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector);
    elseif (strcmp(iterMethod,'featureDeltaErr'))
        % Drop every feature whose permuted-importance delta is negative,
        % capped so at least numFeatures survive, then recurse.
        cutSize = sum(FI < 0);
        if (length(featureVector) - cutSize < numFeatures)
            cutSize = length(featureVector) - numFeatures;
        end
        featureVector = featureVector(1:end-cutSize);
        featureVector = rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector);
    end
end
end
% runme - driver script: run treeLinkFeatures over a dataset and save results.

%% Test-data run (disabled)
% load('testData.mat');
% [linkList, featureList]= treeLinkFeatures(data,5);
% save('testResults.mat','linkList','featureList');

%% Adobe dataset run
% Expects adobeDataNorm.mat to provide AdobeNormalised (n-by-m data matrix)
% and featureNames (feature labels) — TODO confirm contents against the .mat.
load('adobeDataNorm.mat')
% Depth threshold 5: analyse splits at tree depths 1..5.
[linkList, featureList]= treeLinkFeatures(AdobeNormalised,5,featureNames);
save('adobeResults.mat','linkList','featureList');
% exit

%%
--- a/phase2/testparseData.m Wed Mar 15 11:26:24 2017 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -% An attempt to make sense of the treeLinkFeatures output data in a -% meaningful way, and to understand why so man - -tl = []; -for i = 1:length(featureList) - t = zeros(5,1); - for ii = 1:5 - t(ii) = (featureList{i}(ii) == ii); - end - tl = [tl; (sum(t)==5)]; -end - -%% -compareList = linkList(find(tl),1:2); - -for i = 1:length(compareList) - try - t1 = T(mod(compareList(i,1),length(featureList)+1)); - t2 = T(mod(compareList(i,2),length(featureList)+1)); - if(t1 == t2) - fprintf('Line %d matches\n',i); - else - fprintf('Line %d FAILS\n', i); - end - catch - %TO CATCH- Attempted to access T(0); index must be a positive integer or logical. - fprintf('Line %d CRASH **************\n',i); - - end - %%% THIS DOESNT WORK - Attempted to access T(0); index must be a positive integer or logical. - - -end \ No newline at end of file
--- a/phase2/traceLinkageToBinary.m Wed Mar 15 11:26:24 2017 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ -function classList = traceLinkageToBinary(linkList, rowIndex) -%% class = traceLinkageToBinary(linkList, rowIndex) -% This function accepts a linkList and a rowIndex, and performs a transform -% to provide a classification list for all the data points in the original -% list. From a row index, if the data falls under column 1 (lower number) -% then it is given a class of 1, if it falls under column 2 (higher number) -% then it is given a class of 2. Any data not included in that branch of -% the hierarchy is given a class of 0 -% linkList - the input result from linkages -% rowIndex - the row on which to split the data - -listSize = size(linkList,1)+1; -c(1) = linkList(rowIndex,1); -c(2) = linkList(rowIndex,2); -for i = 1:2 - if (c(i) > listSize) - c(i) = c(i) - listSize; - end -end - -leafList1 = traverseDownOneStep(linkList,[],c(1)); -leafList2 = traverseDownOneStep(linkList,[],c(2)); - -classList = zeros(listSize,1); -classList(leafList1) = c(1); -classList(leafList2) = c(2); - - -end -
--- a/phase2/traverseDownOneStep.m Wed Mar 15 11:26:24 2017 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ -function leaf = traverseDownOneStep(linkList,leaf,row) - -%% leaf = traverseDownOneStep(linkList,leaf,row) -% Recursive function which given a linkList, will search a given row, and -% if the row is a leaf, it will append the leaf to the end of the leaf -% list, otherwise, it will recursively call the function to identify the -% two leaves for the branches it has discovered - -listSize = size(linkList,1)+1; -if(row > listSize) - row = row-listSize; -end - -if (row == listSize) - leaf = row; -else - leaf1 = linkList(row,1); - leaf2 = linkList(row,2); - - if(leaf1 > listSize) - leaf = traverseDownOneStep(linkList,leaf,leaf1); - else - leaf = cat(1,leaf,leaf1); - end - - if(leaf2 > listSize) - leaf = traverseDownOneStep(linkList,leaf,leaf2); - else - leaf = cat(1,leaf,leaf2); - end -end -end \ No newline at end of file
--- a/phase2/treeLinkFeatures.m Wed Mar 15 11:26:24 2017 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ -function [linkList, featureList]= treeLinkFeatures(data, depthThresh, featureNames) -%% [linkList, featureList]= treeLinkFeatures(data, depthThresh, featureNames) -% given a dataset, a hierarchical cluster of the data is produced, and then -% the data is traversed, such that, for each split in the data, a set of -% features are produced, which are the ranked features that can be used to -% separate the given dataset at that point. -% data is the nxm matrix of content, n is the number of samples and m is -% the number of features. -% depthThresh is a list of the range of tree depths to traverse from the -% aglomerative clustering tree. A single value of depthThresh, will assume -% 1:depthThresh. For analysis of a single layer of the tree, pass a list of -% two values, both of which are the layer to be analysed. -% feature names is the list of features, so that grown trees have suitable -% names. No feature names will result in the feature number being returned. -% featureList corresponds to the rows in linkList, with the form column 1 -% is the 5 most relevant features, column 2 is the depth and column 3 is a -% decision classification tree for the decision - perhaps this should be in -% the form of a struct instead? 
- - - -if(nargin < 3) - featureNames = 1:size(data,2); -end -if(nargin < 2) - depthThresh = 999; -end - -if (length(depthThresh) == 1) - depthThresh = 1:depthThresh; -end - -linkList = aglomCluster(data); -linkList = depthCheck(linkList); -listSize = size(data,1); - -% linkList(:,4) = 0; -featureList = cell(listSize-1,3); -currentRow = [2*listSize-1]; - -%% -while (~isempty(currentRow)) - if(currentRow(1) > listSize) - row = currentRow(1) - listSize -% rD = linkList(row,4); - if any(linkList(row,4)==depthThresh) - classList = traceLinkageToBinary(linkList, row); - featureList{row,1} = rfFeatureSelection(data(classList>0,:), classList(classList>0)); - featureList{row,2} = linkList(row,4); - featureList{row,3} = fitctree(data(classList>0,featureList{row,1}),classList(classList>0),'PredictorNames',featureNames(featureList{row,1})); - end - currentRow = [currentRow; linkList(row,1); linkList(row,2)]; - end - currentRow = currentRow(2:end); - save('partialResults.mat'); -end - -end \ No newline at end of file