Mercurial > hg > sfx-subgrouping
diff code/Gensim LDA tutorial.ipynb @ 1:995546d09284
add gensim notebook and matlab scripts
author | DaveM |
---|---|
date | Tue, 24 Jan 2017 17:44:45 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code/Gensim LDA tutorial.ipynb Tue Jan 24 17:44:45 2017 +0000 @@ -0,0 +1,111 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.10.0\n" + ] + } + ], + "source": [ + "import six\n", + "print six.__version__\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import gensim\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import logging\n", + "import gensim\n", + "import bz2\n", + "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "IOError", + "evalue": "[Errno 2] No such file or directory: 'wiki_en_wordids.txt'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIOError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-4-54645b7b2c38>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# load id->word mapping (the dictionary), one of the results of step 2 above\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mid2word\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpora\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDictionary\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_from_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'wiki_en_wordids.txt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;31m# load corpus iterator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mmm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpora\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMmCorpus\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'wiki_en_tfidf.mm'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m# mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Library/Python/2.7/site-packages/gensim/corpora/dictionary.pyc\u001b[0m in \u001b[0;36mload_from_text\u001b[0;34m(fname)\u001b[0m\n\u001b[1;32m 342\u001b[0m \"\"\"\n\u001b[1;32m 343\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDictionary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 344\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msmart_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 345\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mlineno\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 346\u001b[0m \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_unicode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Library/Python/2.7/site-packages/smart_open/smart_open_lib.pyc\u001b[0m in \u001b[0;36msmart_open\u001b[0;34m(uri, mode, **kw)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0;31m# local files -- both read & write supported\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0;31m# compression, if any, is determined by the filename extension (.gz, .bz2)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 127\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfile_smart_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparsed_uri\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muri_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 128\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mparsed_uri\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscheme\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\"s3\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"s3n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[0;31m# Get an S3 host. It is required for sigv4 operations.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Library/Python/2.7/site-packages/smart_open/smart_open_lib.pyc\u001b[0m in \u001b[0;36mfile_smart_open\u001b[0;34m(fname, mode)\u001b[0m\n\u001b[1;32m 556\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmake_closing\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGzipFile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 557\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 558\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 559\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 560\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mIOError\u001b[0m: [Errno 2] No such file or directory: 'wiki_en_wordids.txt'" + ] + } + ], + "source": [ + "# load id->word mapping (the dictionary), one of the results of step 2 above\n", + "id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')\n", + "# load corpus iterator\n", + "mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')\n", + "# mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output\n", + "\n", + "print(mm)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}