sfx-subgrouping: changeset 1:995546d09284
add gensim notebook and matlab scripts
author | DaveM
---|---
date | Tue, 24 Jan 2017 17:44:45 +0000
parents | 7d69c0d6f4c9
children | 985cd163ba54
files | code/Gensim LDA tutorial.ipynb, code/Hierarchical Clustering.ipynb, other/evalResults.m
diffstat | 3 files changed, 404 insertions(+), 3 deletions(-)
```
--- /dev/null                          Thu Jan 01 00:00:00 1970 +0000
+++ b/code/Gensim LDA tutorial.ipynb   Tue Jan 24 17:44:45 2017 +0000
@@ -0,0 +1,111 @@
```

New file: a Python 2 notebook (nbformat 4, kernel "Python 2", Python 2.7.10) with four code cells and one empty trailing cell.

In [1]:

```python
import six
print six.__version__
```

Output: `1.10.0`

In [2]:

```python
import gensim
```

In [3]:

```python
import logging
import gensim
import bz2
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
```

In [4]:

```python
# load id->word mapping (the dictionary), one of the results of step 2 above
id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')
# load corpus iterator
mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')
# mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2'))  # use this if you compressed the TFIDF output

print(mm)
```

Output (error):

```
---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-4-54645b7b2c38> in <module>()
      1 # load id->word mapping (the dictionary), one of the results of step 2 above
----> 2 id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')
      3 # load corpus iterator
      4 mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')
      5 # mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output

/Library/Python/2.7/site-packages/gensim/corpora/dictionary.pyc in load_from_text(fname)
    342         """
    343         result = Dictionary()
--> 344         with utils.smart_open(fname) as f:
    345             for lineno, line in enumerate(f):
    346                 line = utils.to_unicode(line)

/Library/Python/2.7/site-packages/smart_open/smart_open_lib.pyc in smart_open(uri, mode, **kw)
    125         # local files -- both read & write supported
    126         # compression, if any, is determined by the filename extension (.gz, .bz2)
--> 127         return file_smart_open(parsed_uri.uri_path, mode)
    128     elif parsed_uri.scheme in ("s3", "s3n"):
    129         # Get an S3 host. It is required for sigv4 operations.

/Library/Python/2.7/site-packages/smart_open/smart_open_lib.pyc in file_smart_open(fname, mode)
    556         return make_closing(GzipFile)(fname, mode)
    557
--> 558     return open(fname, mode)
    559
    560

IOError: [Errno 2] No such file or directory: 'wiki_en_wordids.txt'
```
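The failure in cell 4 is just a missing input: `wiki_en_wordids.txt` and `wiki_en_tfidf.mm` are the outputs of gensim's Wikipedia preprocessing step ("step 2" in the cell's comment), which this changeset does not include; they are typically generated once from the shell with `python -m gensim.scripts.make_wiki <dump.xml.bz2> wiki_en` (exact output filenames vary across gensim versions). Below is a minimal sketch of where the tutorial goes once those files exist; the 100-topic LDA settings are the example values from gensim's "Experiments on the English Wikipedia" walkthrough, not anything fixed by this notebook:

```python
import gensim

# Assumes the wiki preprocessing step has already written these files
# into the working directory (they are NOT part of this changeset).
id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')
mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')
print(mm)

# Online LDA over the TF-IDF corpus; num_topics=100 and chunksize=10000
# are the gensim tutorial's example settings, not values taken from
# this notebook.
lda = gensim.models.ldamodel.LdaModel(
    corpus=mm, id2word=id2word, num_topics=100,
    update_every=1, chunksize=10000, passes=1)

# Show the top words of a few topics.
for topic in lda.show_topics(num_topics=5):
    print(topic)
```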
```
--- a/code/Hierarchical Clustering.ipynb   Mon Jan 16 17:34:29 2017 +0000
+++ b/code/Hierarchical Clustering.ipynb   Tue Jan 24 17:44:45 2017 +0000
```

Three hunks, in terms of the notebook's cell sources.

`@@ -11,6 +11,7 @@` adds scikit-learn to the imports cell:

```diff
 from matplotlib import pyplot as plt
 from scipy.cluster.hierarchy import dendrogram, linkage, cophenet
 from scipy.spatial.distance import pdist
+import sklearn
 import numpy as np
 import csv
```

`@@ -41,9 +42,9 @@` replaces the shape-printing cell with feature agglomeration:

```diff
-print X.shape
-print filenames.shape
-print features.shape
+agglo = cluster.FeatureAgglomeration()
+agglo.fit(X)
+X_reduced = agglo.transform(X)
```

`@@ -59,6 +60,121 @@` appends new cells at the end of the notebook: a `print X` cell (In [18]) whose output is

```
[[  8.51810000e-01   4.00000000e-06   2.46000000e-04 ...,   2.10260000e-02
    1.98220000e-02   1.04000000e-04]
 [  9.52275000e-01   7.00000000e-06   1.82600000e-03 ...,   1.79490000e-02
    1.09020000e-02   7.20000000e-05]
 [  1.92200000e-03   1.00000000e-06   1.39000000e-04 ...,   2.35900000e-02
    6.93800000e-03   2.61000000e-04]
 ...,
 [  9.96346000e-01   3.37000000e-04   1.23600000e-03 ...,   5.24103000e-01
    3.36967000e-01   5.39000000e-04]
 [  9.99990000e-01   1.00000000e-06   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  9.96624000e-01   6.97000000e-04   2.59300000e-03 ...,   5.24615000e-01
    3.34985000e-01   5.45000000e-04]]
```

two cells whose sources were cleared but whose stale outputs survive, `(8977, 1536)` (In [29]) and `{'nu_0': 0, 'kappa_0': 0, 'lambda_0': 0, 'mu_0': 0}` (In [42]); a Bayesian hierarchical clustering cell:

```python
import pyBHC as bhc
from pyBHC import dists

mu_init = []
sigma_init = []
S_init = []
cd = dists.NormalFixedCovar(mu_0=mu_init, sigma_0=sigma_init, S=S_init)

# temp = cd.log_marginal_likelihood(X)
d = bhc.rbhc(X, cd)
```

and a handful of empty placeholder cells.
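One caveat in the second hunk: the changeset adds only `import sklearn`, but the new cell calls `cluster.FeatureAgglomeration()`, which needs the `cluster` name itself, so as committed it would raise a NameError unless `from sklearn import cluster` appears elsewhere in the notebook. A self-contained sketch of the same reduction, using random stand-in data since the notebook's `X` (loaded earlier in the file) is not part of this diff; `n_clusters=8` is an illustrative choice, whereas the committed call uses scikit-learn's default of 2:

```python
import numpy as np
from sklearn import cluster  # the import the committed cell actually needs

# Stand-in for the notebook's X (its real matrix appears to be 8977 x 1536);
# random data here just to make the sketch runnable on its own.
X = np.random.rand(200, 64)

# Ward feature agglomeration: hierarchically merges the 64 columns into
# n_clusters pooled features (mean of each feature cluster by default).
agglo = cluster.FeatureAgglomeration(n_clusters=8)
agglo.fit(X)
X_reduced = agglo.transform(X)

print(X.shape, '->', X_reduced.shape)  # (200, 64) -> (200, 8)
print(agglo.labels_[:10])              # cluster id assigned to each original feature
```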
```
--- /dev/null              Thu Jan 01 00:00:00 1970 +0000
+++ b/other/evalResults.m   Tue Jan 24 17:44:45 2017 +0000
@@ -0,0 +1,174 @@
```

New file: a MATLAB cell-mode exploration script.

```matlab
load('Adobe.mat')
load('Results1Percent.mat')
%%
datamap = featuredata(end).IdxVar;
reduceData = Data(:,datamap);
reduceLabels = Labels(datamap);
%%
reduceFeatures = FeatureNames(datamap);

%%
load('Results1Percent.mat')

%%
reduceFeatures = featuredata(1).FeatureNamesRanked;

dataToUseSize = 500;
dataToUse = ceil(rand(dataToUseSize,1)*size(reduceData,1))';

dMap = pdist(reduceData(dataToUse,:));
clusterMethod = 'ward';
% 'average'   Unweighted average distance (UPGMA)
% 'centroid'  Centroid distance (UPGMC), appropriate for Euclidean distances only
% 'complete'  Furthest distance
% 'median'    Weighted center of mass distance (WPGMC), appropriate for Euclidean distances only
% 'single'    Shortest distance
% 'ward'      Inner squared distance (minimum variance algorithm), appropriate for Euclidean distances only
% 'weighted'  Weighted average distance (WPGMA)

dl = linkage(dMap, clusterMethod);
dendrogram(dl)
% figure; imagesc(squareform(dMap_sp))
% title('euclidean self similarity');

%%
incon_sp = inconsistent(dl)

%%
% Use all data

dMapAll = pdist(reduceData);
clusterMethod = 'ward';
% (same linkage method options as above)

dl_all = linkage(dMapAll, clusterMethod);
% [~,T] = dendrogram(dl_all,0)

%%
% print file list for each cluster

numClusters = 25;
fnames = cell(1,numClusters);
[~,T] = dendrogram(dl_all,numClusters);
for i = 1:numClusters
    numFiles = sum(T==i);
    fnames{i} = Filenames(find(T==i));
end

%%
% make CSV for Weka

feats = reduceData;

% csvOut = mat2cell(feats,ones(size(feats,1),1), ones(size(feats,2),1))
csvOut = num2cell(feats);
csvOut = [csvOut, num2cell(T)];
% size(csvOut)
% size([FeatureNames(datamap)', {'Class'}])
csvOut = [[FeatureNames(datamap)', {'Class'}]; csvOut];

%%
% fnames to CSV

maxLen = size(fnames,2);

for i = 1:maxLen
    depth = size(fnames{i},1);
    for ii = 1:depth
        csvOut(i,ii) = fnames{i}(ii);
    end
end

printString = '';
for i = 1:maxLen
    printString = [printString ' %s, '];
end

fid = fopen('junk.csv','w');
fprintf(fid,[printString '\n'],csvOut{1:end,:});
% fprintf(fid,'%f, %f, %f\n',c{2:end,:})
fclose(fid);
% dlmwrite('test.csv', csvOut, '-append');

%%
% NOTE: dl_sp is not defined anywhere in this script; these cells
% presumably ran against an earlier '_sp' variant of the linkage
% (dl or dl_all in this version).
T = cluster(dl_sp,'cutoff',1.3);
figure; plot(T);

%%
T = cluster(dl_sp,'maxclust',2);
plot(T)
%%
T = cluster(dl_sp,'maxclust',3);
plot(T)
%%
T = cluster(dl_sp,'maxclust',4);
plot(T)
T = cluster(dl_sp,'maxclust',5);
plot(T)
T = cluster(dl_sp,'maxclust',6);
plot(T)
T = cluster(dl_sp,'maxclust',7);
plot(T)
T = cluster(dl_sp,'maxclust',8);
plot(T)
T = cluster(dl_sp,'maxclust',9);
plot(T)
%%
T = cluster(dl_sp,'maxclust',10);
plot(T)
%%
T = cluster(dl_sp,'maxclust',100);
plot(T)
%%
median(T)

T = cluster(dl_sp,'maxclust',1000);
median(T)

plot(T)
csvwrite('dataOutput',reduceData);

% dMap_euc = pdist(reduceData);
% dMap_cos = pdist(reduceData,'cos');
% dMap_cos = pdist(reduceData,'cosine');
% dl_euc = linkage(dMap_euc);
% dl_cos = linkage(dMap_cos);
% % dl_sp
% dl_sp(10,:)
% dl_sp(1:10,:)
% sprintf('%f', dl_sp(1:10,:))
% dl_sp(1:10,:)
% format short g
% dl_sp(1:10,:)
% plot(dl_sp(:))
% plot(dl_sp(:,3))
% incon_sp = inconsistent(dl_sp)
```
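For cross-checking the MATLAB pipeline above (subsample, pairwise distances, Ward linkage, dendrogram, flat clusters), here is a rough Python/scipy equivalent on random stand-in data, since `Adobe.mat` and `Results1Percent.mat` are not in the repository; scipy's `linkage(..., method='ward')` and `fcluster(..., criterion='maxclust')` correspond to MATLAB's `linkage(..., 'ward')` and `cluster(..., 'maxclust', k)`:

```python
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, inconsistent

# Stand-in for reduceData; the real matrix comes from the .mat files.
rng = np.random.RandomState(0)
reduce_data = rng.rand(1000, 40)

# Random subsample of 500 rows, like dataToUse in the MATLAB script
# (sampled without replacement here, unlike the script's ceil(rand*N)).
idx = rng.choice(reduce_data.shape[0], size=500, replace=False)
d_map = pdist(reduce_data[idx])

# Ward linkage, matching clusterMethod = 'ward'.
dl = linkage(d_map, method='ward')
print(inconsistent(dl)[:5])           # cf. incon_sp = inconsistent(dl)

# Flat clusters: mirrors cluster(dl_sp, 'maxclust', 25).
labels = fcluster(dl, t=25, criterion='maxclust')
print(np.bincount(labels)[1:])        # files per cluster, cf. sum(T==i)

# Truncated dendrogram, cf. dendrogram(dl_all, numClusters).
dendrogram(dl, truncate_mode='lastp', p=25)
plt.show()
```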