changeset 1:995546d09284

add gensim notebook and matlab scripts
author DaveM
date Tue, 24 Jan 2017 17:44:45 +0000
parents 7d69c0d6f4c9
children 985cd163ba54
files code/Gensim LDA tutorial.ipynb; code/Hierarchical Clustering.ipynb; other/evalResults.m
diffstat 3 files changed, 404 insertions(+), 3 deletions(-) [+]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code/Gensim LDA tutorial.ipynb	Tue Jan 24 17:44:45 2017 +0000
@@ -0,0 +1,111 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1.10.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "import six\n",
+    "print six.__version__\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import gensim\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import logging\n",
+    "import gensim\n",
+    "import bz2\n",
+    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "ename": "IOError",
+     "evalue": "[Errno 2] No such file or directory: 'wiki_en_wordids.txt'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mIOError\u001b[0m                                   Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-4-54645b7b2c38>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# load id->word mapping (the dictionary), one of the results of step 2 above\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mid2word\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpora\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDictionary\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_from_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'wiki_en_wordids.txt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0;31m# load corpus iterator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mmm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpora\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMmCorpus\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'wiki_en_tfidf.mm'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;31m# mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/Library/Python/2.7/site-packages/gensim/corpora/dictionary.pyc\u001b[0m in \u001b[0;36mload_from_text\u001b[0;34m(fname)\u001b[0m\n\u001b[1;32m    342\u001b[0m         \"\"\"\n\u001b[1;32m    343\u001b[0m         \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDictionary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 344\u001b[0;31m         \u001b[0;32mwith\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msmart_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    345\u001b[0m             \u001b[0;32mfor\u001b[0m \u001b[0mlineno\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    346\u001b[0m                 \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_unicode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/Library/Python/2.7/site-packages/smart_open/smart_open_lib.pyc\u001b[0m in \u001b[0;36msmart_open\u001b[0;34m(uri, mode, **kw)\u001b[0m\n\u001b[1;32m    125\u001b[0m             \u001b[0;31m# local files -- both read & write supported\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    126\u001b[0m             \u001b[0;31m# compression, if any, is determined by the filename extension (.gz, .bz2)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 127\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mfile_smart_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparsed_uri\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muri_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    128\u001b[0m         \u001b[0;32melif\u001b[0m \u001b[0mparsed_uri\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscheme\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\"s3\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"s3n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    129\u001b[0m             \u001b[0;31m# Get an S3 host. It is required for sigv4 operations.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/Library/Python/2.7/site-packages/smart_open/smart_open_lib.pyc\u001b[0m in \u001b[0;36mfile_smart_open\u001b[0;34m(fname, mode)\u001b[0m\n\u001b[1;32m    556\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mmake_closing\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGzipFile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    557\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 558\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    559\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    560\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mIOError\u001b[0m: [Errno 2] No such file or directory: 'wiki_en_wordids.txt'"
+     ]
+    }
+   ],
+   "source": [
+    "# load id->word mapping (the dictionary), one of the results of step 2 above\n",
+    "id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')\n",
+    "# load corpus iterator\n",
+    "mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')\n",
+    "# mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output\n",
+    "\n",
+    "print(mm)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
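The IOError recorded in the notebook's last executed cell just means the preprocessed Wikipedia artifacts are not on disk. A minimal sketch of the missing steps, following the gensim wiki-LDA tutorial this notebook tracks — the dump filename and num_topics here are illustrative, and some gensim versions write the wordids file bz2-compressed:

    # one-off preprocessing, run from a shell (takes several hours):
    #   python -m gensim.scripts.make_wiki enwiki-latest-pages-articles.xml.bz2 wiki_en
    import gensim

    # load the id->word mapping and TF-IDF corpus produced by make_wiki
    id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')
    mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')

    # online LDA with the tutorial's parameters
    lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=100,
                                          update_every=1, chunksize=10000, passes=1)
    lda.print_topics(20)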
--- a/code/Hierarchical Clustering.ipynb	Mon Jan 16 17:34:29 2017 +0000
+++ b/code/Hierarchical Clustering.ipynb	Tue Jan 24 17:44:45 2017 +0000
@@ -11,6 +11,7 @@
     "from matplotlib import pyplot as plt\n",
     "from scipy.cluster.hierarchy import dendrogram, linkage, cophenet\n",
     "from scipy.spatial.distance import pdist\n",
+    "import sklearn \n",
     "import numpy as np\n",
     "import csv\n",
     "\n",
@@ -41,9 +42,9 @@
    },
    "outputs": [],
    "source": [
-    "print X.shape\n",
-    "print filenames.shape\n",
-    "print features.shape"
+    "agglo = cluster.FeatureAgglomeration()\n",
+    "agglo.fit(X)\n",
+    "X_reduced = agglo.transform(X)"
    ]
   },
   {
@@ -59,6 +60,121 @@
   },
   {
    "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[  8.51810000e-01   4.00000000e-06   2.46000000e-04 ...,   2.10260000e-02\n",
+      "    1.98220000e-02   1.04000000e-04]\n",
+      " [  9.52275000e-01   7.00000000e-06   1.82600000e-03 ...,   1.79490000e-02\n",
+      "    1.09020000e-02   7.20000000e-05]\n",
+      " [  1.92200000e-03   1.00000000e-06   1.39000000e-04 ...,   2.35900000e-02\n",
+      "    6.93800000e-03   2.61000000e-04]\n",
+      " ..., \n",
+      " [  9.96346000e-01   3.37000000e-04   1.23600000e-03 ...,   5.24103000e-01\n",
+      "    3.36967000e-01   5.39000000e-04]\n",
+      " [  9.99990000e-01   1.00000000e-06   0.00000000e+00 ...,   0.00000000e+00\n",
+      "    0.00000000e+00   0.00000000e+00]\n",
+      " [  9.96624000e-01   6.97000000e-04   2.59300000e-03 ...,   5.24615000e-01\n",
+      "    3.34985000e-01   5.45000000e-04]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print X"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(8977, 1536)\n"
+     ]
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'nu_0': 0, 'kappa_0': 0, 'lambda_0': 0, 'mu_0': 0}\n"
+     ]
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import pyBHC as bhc\n",
+    "from pyBHC import dists\n",
+    "\n",
+    "mu_init = []\n",
+    "sigma_init = []\n",
+    "S_init = []\n",
+    "cd = dists.NormalFixedCovar(mu_0=mu_init,sigma_0=sigma_init, S=S_init)\n",
+    "\n",
+    "# temp = cd.log_marginal_likelihood(X)\n",
+    "d = bhc.rbhc(X, cd)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
    "execution_count": null,
    "metadata": {
     "collapsed": true
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/other/evalResults.m	Tue Jan 24 17:44:45 2017 +0000
@@ -0,0 +1,174 @@
+
+load('Adobe.mat')
+load('Results1Percent.mat')
+%%
+datamap = featuredata(end).IdxVar;
+reduceData = Data(:,datamap);
+reduceLabels = Labels(datamap);
+%%
+reduceFeatures = FeatureNames(datamap);
+
+%%
+load('Results1Percent.mat')
+
+%%
+reduceFeatures = featuredata(1).FeatureNamesRanked;
+
+dataToUseSize = 500;
+dataToUse = randi(size(reduceData,1), 1, dataToUseSize);   % random row sample, with replacement
+
+dMap = pdist(reduceData(dataToUse,:));
+clusterMethod = 'ward';
+% 'average'     Unweighted average distance (UPGMA)
+% 'centroid'	Centroid distance (UPGMC), appropriate for Euclidean distances only
+% 'complete'	Furthest distance
+% 'median'      Weighted center of mass distance (WPGMC), appropriate for Euclidean distances only
+% 'single'      Shortest distance
+% 'ward'        Inner squared distance (minimum variance algorithm), appropriate for Euclidean distances only
+% 'weighted'	Weighted average distance (WPGMA)
+
+dl = linkage(dMap, clusterMethod);
+dendrogram(dl)
+% figure; imagesc(squareform(dMap_sp))
+% title('euclidian self similarity');
+
+%%
+incon_sp = inconsistent(dl)
+
+
+%%
+% Use all data
+
+dMapAll = pdist(reduceData);
+clusterMethod = 'ward';
+% (linkage method options listed above)
+
+dl_all = linkage(dMapAll, clusterMethod);
+% [~,T] = dendrogram(dl_all,0)
+
+%%
+% print filelist for each cluster
+
+numClusters = 25;
+fnames = cell(1,numClusters);
+[~,T] = dendrogram(dl_all,numClusters);
+for i = 1:numClusters
+    numFiles = sum(T==i);          % files in cluster i (not used below)
+    fnames{i} = Filenames(T==i);   % logical indexing; find() not needed
+end
+
+%%
+% makeCSV for Weka
+% format 
+
+feats = reduceData;
+
+% csvOut = mat2cell(feats,ones(size(feats,1),1), ones(size(feats,2),1))
+csvOut = num2cell(feats);
+csvOut = [csvOut, num2cell(T)];
+% size(csvOut)
+% size([FeatureNames(datamap)', {'Class'}])
+csvOut = [[FeatureNames(datamap)', {'Class'}]; csvOut];
+
+%%
+% fnames to CSV -- write into a fresh cell array so the Weka table above
+% is not clobbered, then print one comma-separated row per cluster
+
+fnameOut = cell(numClusters, max(cellfun(@numel, fnames)));
+for i = 1:numClusters
+    for ii = 1:numel(fnames{i})
+        fnameOut(i,ii) = fnames{i}(ii);
+    end
+end
+
+fid = fopen('junk.csv','w');
+for i = 1:numClusters
+    fprintf(fid, '%s, ', fnameOut{i,:});
+    fprintf(fid, '\n');
+end
+fclose(fid);
+% dlmwrite('test.csv', csvOut, '-append') ;
+
+%%
+% cluster the full-data linkage by inconsistency cutoff
+T = cluster(dl_all,'cutoff',1.3);
+figure; plot(T);
+
+
+
+%%
+
+
+T = cluster(dl_all,'maxclust',2);
+plot(T)
+%%
+T = cluster(dl_all,'maxclust',3);
+plot(T)
+%%
+% sweep maxclust from 4 to 9, plotting each assignment
+for k = 4:9
+    T = cluster(dl_all,'maxclust',k);
+    plot(T)
+end
+%%
+T = cluster(dl_all,'maxclust',10);
+plot(T)
+%%
+T = cluster(dl_all,'maxclust',100);
+plot(T)
+%%
+median(T)
+
+T = cluster(dl_all,'maxclust',1000);
+median(T)
+plot(T)
+csvwrite('dataOutput',reduceData);
+
+
+% dMap_euc = pdist(reduceData);
+% dMap_cos = pdist(reduceData,'cos');
+% dMap_cos = pdist(reduceData,'cosine');
+% dl_euc = linkage(dMap_euc);
+% dl_cos = linkage(dMap_cos);
+% % dl_sp
+% dl_sp(10,:)
+% dl_sp(1:10,:)
+% sprintf('%f', dl_sp(1:10,:))
+% dl_sp(1:10,:)
+% format short g
+% dl_sp(1:10,:)
+% plot(dl_sp(:))
+% plot(dl_sp(:,3))
+% incon_sp = inconsistent(dl_sp)
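The core of evalResults.m maps directly onto scipy, which the Hierarchical Clustering notebook already imports. A hedged sketch of the pdist/linkage/dendrogram/cluster pipeline on synthetic stand-in data — 500 rows mirrors dataToUseSize, and MATLAB's cluster(dl,'maxclust',k) corresponds to scipy's fcluster with criterion='maxclust':

    import numpy as np
    from matplotlib import pyplot as plt
    from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, inconsistent
    from scipy.spatial.distance import pdist

    reduceData = np.random.rand(500, 20)   # stand-in for the real reduced data

    dMap = pdist(reduceData)               # condensed Euclidean distance matrix
    dl = linkage(dMap, method='ward')      # Ward linkage, Euclidean only
    dendrogram(dl)
    incon = inconsistent(dl)               # per-link inconsistency, as in the script

    T = fcluster(dl, 25, criterion='maxclust')   # ~ cluster(dl_all,'maxclust',25)
    plt.figure(); plt.plot(T); plt.show()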