annotate _code/Gensim LDA tutorial.ipynb @ 37:d9a9a6b93026 tip

Add README
author DaveM
date Sat, 01 Apr 2017 17:03:14 +0100
parents 4bdcab1e821c
children
rev   line source
DaveM@1 1 {
DaveM@1 2 "cells": [
DaveM@1 3 {
DaveM@1 4 "cell_type": "code",
DaveM@1 5 "execution_count": 1,
DaveM@1 6 "metadata": {
DaveM@1 7 "collapsed": false
DaveM@1 8 },
DaveM@1 9 "outputs": [
DaveM@1 10 {
DaveM@1 11 "name": "stdout",
DaveM@1 12 "output_type": "stream",
DaveM@1 13 "text": [
DaveM@1 14 "1.10.0\n"
DaveM@1 15 ]
DaveM@1 16 }
DaveM@1 17 ],
DaveM@1 18 "source": [
DaveM@1 19 "import six\n",
DaveM@1 20 "print(six.__version__)  # call form: same output on Python 2, also valid on Python 3\n"
DaveM@1 21 ]
DaveM@1 22 },
DaveM@1 23 {
DaveM@1 24 "cell_type": "code",
DaveM@1 25 "execution_count": 2,
DaveM@1 26 "metadata": {
DaveM@1 27 "collapsed": false
DaveM@1 28 },
DaveM@1 29 "outputs": [],
DaveM@1 30 "source": [
DaveM@1 31 "import gensim\n"
DaveM@1 32 ]
DaveM@1 33 },
DaveM@1 34 {
DaveM@1 35 "cell_type": "code",
DaveM@1 36 "execution_count": 3,
DaveM@1 37 "metadata": {
DaveM@1 38 "collapsed": false
DaveM@1 39 },
DaveM@1 40 "outputs": [],
DaveM@1 41 "source": [
DaveM@1 42 "import bz2\n",
DaveM@1 43 "import logging\n",
DaveM@1 44 "import gensim\n",
DaveM@1 45 "logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')  # timestamped records, INFO and above"
DaveM@1 46 ]
DaveM@1 47 },
DaveM@1 48 {
DaveM@1 49 "cell_type": "code",
DaveM@1 50 "execution_count": 4,
DaveM@1 51 "metadata": {
DaveM@1 52 "collapsed": false
DaveM@1 53 },
DaveM@1 54 "outputs": [
DaveM@1 55 {
DaveM@1 56 "ename": "IOError",
DaveM@1 57 "evalue": "[Errno 2] No such file or directory: 'wiki_en_wordids.txt'",
DaveM@1 58 "output_type": "error",
DaveM@1 59 "traceback": [
DaveM@1 60 "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
DaveM@1 61 "\u001b[0;31mIOError\u001b[0m Traceback (most recent call last)",
DaveM@1 62 "\u001b[0;32m<ipython-input-4-54645b7b2c38>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# load id->word mapping (the dictionary), one of the results of step 2 above\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mid2word\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpora\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDictionary\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_from_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'wiki_en_wordids.txt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;31m# load corpus iterator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mmm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpora\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMmCorpus\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'wiki_en_tfidf.mm'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m# mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
DaveM@1 63 "\u001b[0;32m/Library/Python/2.7/site-packages/gensim/corpora/dictionary.pyc\u001b[0m in \u001b[0;36mload_from_text\u001b[0;34m(fname)\u001b[0m\n\u001b[1;32m 342\u001b[0m \"\"\"\n\u001b[1;32m 343\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDictionary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 344\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msmart_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 345\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mlineno\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 346\u001b[0m \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_unicode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
DaveM@1 64 "\u001b[0;32m/Library/Python/2.7/site-packages/smart_open/smart_open_lib.pyc\u001b[0m in \u001b[0;36msmart_open\u001b[0;34m(uri, mode, **kw)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0;31m# local files -- both read & write supported\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0;31m# compression, if any, is determined by the filename extension (.gz, .bz2)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 127\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfile_smart_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparsed_uri\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muri_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 128\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mparsed_uri\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscheme\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\"s3\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"s3n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[0;31m# Get an S3 host. It is required for sigv4 operations.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
DaveM@1 65 "\u001b[0;32m/Library/Python/2.7/site-packages/smart_open/smart_open_lib.pyc\u001b[0m in \u001b[0;36mfile_smart_open\u001b[0;34m(fname, mode)\u001b[0m\n\u001b[1;32m 556\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmake_closing\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mGzipFile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 557\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 558\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 559\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 560\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
DaveM@1 66 "\u001b[0;31mIOError\u001b[0m: [Errno 2] No such file or directory: 'wiki_en_wordids.txt'"
DaveM@1 67 ]
DaveM@1 68 }
DaveM@1 69 ],
DaveM@1 70 "source": [
DaveM@1 71 "# load id->word mapping (the dictionary); 'wiki_en_wordids.txt' must already exist in the working directory, otherwise this raises IOError\n",
DaveM@1 72 "id2word = gensim.corpora.Dictionary.load_from_text('wiki_en_wordids.txt')\n",
DaveM@1 73 "# load corpus iterator (presumably the TF-IDF corpus written by gensim's Wikipedia preprocessing -- confirm)\n",
DaveM@1 74 "mm = gensim.corpora.MmCorpus('wiki_en_tfidf.mm')\n",
DaveM@1 75 "# mm = gensim.corpora.MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')) # use this if you compressed the TFIDF output\n",
DaveM@1 76 "\n",
DaveM@1 77 "print(mm)"
DaveM@1 78 ]
DaveM@1 79 },
DaveM@1 80 {
DaveM@1 81 "cell_type": "code",
DaveM@1 82 "execution_count": null,
DaveM@1 83 "metadata": {
DaveM@1 84 "collapsed": true
DaveM@1 85 },
DaveM@1 86 "outputs": [],
DaveM@1 87 "source": []
DaveM@1 88 }
DaveM@1 89 ],
DaveM@1 90 "metadata": {
DaveM@1 91 "kernelspec": {
DaveM@1 92 "display_name": "Python 2",
DaveM@1 93 "language": "python",
DaveM@1 94 "name": "python2"
DaveM@1 95 },
DaveM@1 96 "language_info": {
DaveM@1 97 "codemirror_mode": {
DaveM@1 98 "name": "ipython",
DaveM@1 99 "version": 2
DaveM@1 100 },
DaveM@1 101 "file_extension": ".py",
DaveM@1 102 "mimetype": "text/x-python",
DaveM@1 103 "name": "python",
DaveM@1 104 "nbconvert_exporter": "python",
DaveM@1 105 "pygments_lexer": "ipython2",
DaveM@1 106 "version": "2.7.10"
DaveM@1 107 }
DaveM@1 108 },
DaveM@1 109 "nbformat": 4,
DaveM@1 110 "nbformat_minor": 0
DaveM@1 111 }