diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e3f3f6b9c..85b10008b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ Changes - In hdpmodel and dtmmodel - NOT BACKWARDS COMPATIBLE! * Added random_state parameter to LdaState initializer and check_random_state() (@droudy, #113) +* Implemented LsiModel.docs_processed attribute +* `n_similarity()` raises `ValueError` if an empty list is passed to it in word2vec, doc2vec (@droudy, #761) 0.13.1, 2016-06-22 diff --git a/README.md b/README.md index deb2757521..f8b4a1c03c 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,7 @@ Adopters | Cisco Security | | [cisco.com](http://www.cisco.com/c/en/us/products/security/index.html)| Large-scale fraud detection | 12K Research | | [12k.co](https://12k.co/)| Document similarity analysis on media articles | National Institutes of Health | | [github/NIHOPA](https://github.com/NIHOPA/pipeline_word2vec)| Processing grants and publications with word2vec +| Codeq LLC | | [codeq.com](https://codeq.com)| Document classification with word2vec ------- diff --git a/docs/notebooks/similarity_metrics.ipynb b/docs/notebooks/distance_metrics.ipynb similarity index 74% rename from docs/notebooks/similarity_metrics.ipynb rename to docs/notebooks/distance_metrics.ipynb index 1840ab3bcc..99d0071b68 100644 --- a/docs/notebooks/similarity_metrics.ipynb +++ b/docs/notebooks/distance_metrics.ipynb @@ -27,16 +27,7 @@ "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:gensim.models.word2vec:Slow version of gensim.models.word2vec is being used\n", - "WARNING:gensim.models.doc2vec:Slow version of gensim.models.doc2vec is being used\n" - ] - } - ], + "outputs": [], "source": [ "from gensim.corpora import Dictionary\n", "from gensim.models import ldamodel\n", @@ -77,13 +68,6 @@ "collapsed": false }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n" - ] - }, { "data": { "text/plain": [ @@ -165,7 +149,7 @@ { "data": { "text/plain": [ - "0.51251199778753564" + "0.51251199778753576" ] }, "execution_count": 5, @@ -187,7 +171,7 @@ { "data": { "text/plain": [ - "0.2340730527221043" + "0.23407305272210427" ] }, "execution_count": 6, @@ -263,7 +247,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `bank` document is a combination of both water and finance related terms - but as bank in this context is likely to belong to the finance topic, the distance values are less between the finance and bank bows." + "*NOTE!*\n", + "\n", + "KL is not a Distance Metric in the mathematical sense, and hence is not symmetrical. \n", + "This means that `kullback_leibler(lda_bow_finance, lda_bow_bank)` is not equal to `kullback_leibler(lda_bow_bank, lda_bow_finance)`. " ] }, { @@ -276,7 +263,7 @@ { "data": { "text/plain": [ - "[(0, 0.44146764073708356), (1, 0.55853235926291644)]" + "0.24780412" ] }, "execution_count": 9, @@ -284,6 +271,38 @@ "output_type": "execute_result" } ], + "source": [ + "# As you can see, the values are not equal. 
We'll get more into the details of this later on in the notebook.\n", + "kullback_leibler(lda_bow_bank, lda_bow_finance)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In our previous examples we saw that there were lower distance values between bank and finance than for bank and water, even if it wasn't by a huge margin. What does this mean?\n", + "\n", + "The `bank` document is a combination of both water and finance related terms - but as bank in this context is likely to belong to the finance topic, the distance values are less between the finance and bank bows." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(0, 0.44146764073708339), (1, 0.55853235926291656)]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# just to confirm our suspicion that the bank bow is more to do with finance:\n", "\n", @@ -320,7 +339,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": { "collapsed": false }, @@ -331,7 +350,7 @@ "0.8571428571428572" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -342,7 +361,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": { "collapsed": false }, @@ -353,7 +372,7 @@ "0.8333333333333334" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -364,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": { "collapsed": false }, @@ -375,7 +394,7 @@ "0.0" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -416,7 +435,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": { "collapsed": false }, @@ -436,7 +455,7 @@ " (9, 0.04)]" ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -478,7 +497,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": { "collapsed": false }, @@ -489,7 +508,7 @@ "0.36453028040240248" ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -522,7 +541,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": { "collapsed": false }, @@ -533,7 +552,7 @@ "inf" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -554,7 +573,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": { "collapsed": false }, @@ -565,7 +584,7 @@ "0.19781515" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -591,6 +610,209 @@ "So, just remember, if you intend to use KL as a metric to measure similarity or distance between two distributions, avoid zeros by returning the ENTIRE distribution. Since it's unlikely any probability distribution will ever have absolute zeros for any feature/word, returning all the values like we did will make you good to go." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## So - what exactly are Distance Metrics? 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Having seen the practical usages of these measures (i.e, to find similarity), let's learn a little about what exactly Distance Measures and Metrics are. \n", + "\n", + "I mentioned in the previous section that KL was not a distance metric. There are 4 conditons for for a distance measure to be a matric:\n", + "\n", + "1.\td(x,y) >= 0\n", + "2. d(x,y) = 0 <=> x = y\n", + "3. d(x,y) = d(y,x)\n", + "4. d(x,z) <= d(x,y) + d(y,z)\n", + "\n", + "That is: it must be non-negative; if x and y are the same, distance must be zero; it must be symmetric; and it must obey the triangle inequality law. \n", + "\n", + "Simple enough, right? \n", + "Let's test these out for our measures." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.22491784692602151" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# normal Hellinger\n", + "hellinger(water_distribution, finance_distribution)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.22491784692602151" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# we swap finance and water distributions and get the same value. It is indeed symmetric!\n", + "hellinger(finance_distribution, water_distribution)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# if we pass the same values, it is zero.\n", + "hellinger(water_distribution, water_distribution)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.23407305272210427" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# for triangle inequality let's use LDA document distributions\n", + "hellinger(lda_bow_finance, lda_bow_bank)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.79979376323008911" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Triangle inequality works too!\n", + "hellinger(lda_bow_finance, lda_bow_water) + hellinger(lda_bow_water, lda_bow_bank)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So Hellinger is indeed a metric. Let's check out KL. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.2149342" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kullback_leibler(finance_distribution, water_distribution)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.19781515" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kullback_leibler(water_distribution, finance_distribution)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We immediately notice that when we swap the values they aren't equal! One of the four conditions not fitting is enough for it to not be a metric. \n", + "\n", + "However, just because it is not a metric, (strictly in the mathematical sense) does not mean that it is not useful to figure out the distance between two probability distributions. KL Divergence is widely used for this purpose, and is probably the most 'famous' distance measure in fields like Information Theory.\n", + "\n", + "For a nice review of the mathematical differences between Hellinger and KL, [this](http://stats.stackexchange.com/questions/130432/differences-between-bhattacharyya-distance-and-kl-divergence) link does a very good job. " + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -625,7 +847,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.10" + "version": "2.7.11" } }, "nbformat": 4, diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index b7437f95e3..2f459769f9 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -492,6 +492,8 @@ def n_similarity(self, ds1, ds2): index or string tag. (TODO: Accept vectors of out-of-training-set docs, as if from inference.) 
""" + if not ds1 or not ds2: + raise ValueError("Can't compute similarity with an empty list") v1 = [self[doc] for doc in ds1] v2 = [self[doc] for doc in ds2] return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0))) diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 26f249d79a..12a3c17d18 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -361,6 +361,7 @@ def add_documents(self, corpus, chunksize=None, decay=None): num_terms=self.num_terms, chunksize=chunksize, extra_dims=self.extra_samples, power_iters=self.power_iters) self.projection.merge(update, decay=decay) + self.docs_processed += len(corpus) if hasattr(corpus, '__len__') else 0 else: # the one-pass algo doc_no = 0 @@ -395,6 +396,7 @@ def add_documents(self, corpus, chunksize=None, decay=None): if self.dispatcher: logger.info("reached the end of input; now waiting for all remaining jobs to finish") self.projection = self.dispatcher.getstate() + self.docs_processed += doc_no # logger.info("top topics after adding %i documents" % doc_no) # self.print_debug(10) else: @@ -403,6 +405,7 @@ def add_documents(self, corpus, chunksize=None, decay=None): update = Projection(self.num_terms, self.num_topics, corpus.tocsc(), extra_dims=self.extra_samples, power_iters=self.power_iters) self.projection.merge(update, decay=decay) logger.info("processed sparse job of %i documents", corpus.shape[1]) + self.docs_processed += corpus.shape[1] def __str__(self): return "LsiModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % ( diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index f6728bfd8d..6715dff6ac 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1511,6 +1511,8 @@ def n_similarity(self, ws1, ws2): True """ + if not ws1 or not ws2: + raise ValueError("Can't compute similarity with an empty list") v1 = [self[word] for word in ws1] v2 = [self[word] for word in ws2] return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0))) @@ -1564,7 +1566,7 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, u """ Compute accuracy of the model. `questions` is a filename where lines are 4-tuples of words, split into sections by ": SECTION NAME" lines. - See https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt for an example. + See questions-words.txt in https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip for an example. The accuracy is reported (=printed to log and returned as a list) for each section separately, plus there's one aggregate summary at the end. 
diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py
index a55a605391..35fe3f4df7 100644
--- a/gensim/models/wrappers/dtmmodel.py
+++ b/gensim/models/wrappers/dtmmodel.py
@@ -173,9 +173,9 @@ def convert_input(self, corpus, time_slices):
         corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus)

         with utils.smart_open(self.ftimeslices(), 'wb') as fout:
-            fout.write(six.u(str(len(self.time_slices)) + "\n"))
+            fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n"))
             for sl in time_slices:
-                fout.write(six.u(str(sl) + "\n"))
+                fout.write(utils.to_utf8(str(sl) + "\n"))

     def train(self, corpus, time_slices, mode, model):
         """
diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index b5c3b1db29..abe65c0e3c 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -16,14 +16,17 @@

 import numpy

-from gensim.utils import to_unicode, smart_extension
+from gensim.utils import to_unicode
 from gensim.interfaces import TransformedCorpus
 from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus,
                             ucicorpus, malletcorpus, textcorpus, indexedcorpus)

 # needed because sample data files are located in the same folder
 module_path = os.path.dirname(__file__)
-datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
+
+
+def datapath(fname):
+    return os.path.join(module_path, 'test_data', fname)


 def testfile():
@@ -180,7 +183,7 @@ def test_indexing(self):
         self.assertEqual(len(docs), len(corpus))
         self.assertEqual(len(docs), len(corpus[:]))
         self.assertEqual(len(docs[::2]), len(corpus[::2]))
-        
+
         def _get_slice(corpus, slice_):
             # assertRaises for python 2.6 takes a callable
             return corpus[slice_]
@@ -200,9 +203,9 @@ def _get_slice(corpus, slice_):
         # corpus does, and throws an error otherwise
         if hasattr(corpus, 'index') and corpus.index is not None:
             corpus_ = TransformedCorpus(DummyTransformer(), corpus)
-            self.assertEqual(corpus_[0][0][1], docs[0][0][1]+1)
+            self.assertEqual(corpus_[0][0][1], docs[0][0][1] + 1)
             self.assertRaises(ValueError, _get_slice, corpus_, set([1]))
-            transformed_docs = [val+1 for i, d in enumerate(docs) for _, val in d if i in [1, 3, 4]]
+            transformed_docs = [val + 1 for i, d in enumerate(docs) for _, val in d if i in [1, 3, 4]]
             self.assertEquals(transformed_docs, list(v for doc in corpus_[[1, 3, 4]] for _, v in doc))
             self.assertEqual(3, len(corpus_[[1, 3, 4]]))
         else:
@@ -214,12 +217,19 @@ def _get_slice(corpus, slice_):
 class TestMmCorpus(CorpusTestCase):
     def setUp(self):
         self.corpus_class = mmcorpus.MmCorpus
+        self.corpus = self.corpus_class(datapath('testcorpus.mm'))
         self.file_extension = '.mm'

     def test_serialize_compressed(self):
         # MmCorpus needs file write with seek => doesn't support compressed output (only input)
         pass

+    def test_load(self):
+        self.assertEqual(self.corpus.num_docs, 9)
+        self.assertEqual(self.corpus.num_terms, 12)
+        self.assertEqual(self.corpus.num_nnz, 28)
+        self.assertEqual(tuple(self.corpus.index), (97, 121, 169, 201, 225, 249, 258, 276, 303))
+

 class TestSvmLightCorpus(CorpusTestCase):
     def setUp(self):
diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
index 42264c0b4b..287368bbd7 100644
--- a/gensim/test/test_doc2vec.py
+++ b/gensim/test/test_doc2vec.py
@@ -124,6 +124,13 @@ def test_empty_errors(self):
         # input not empty, but rather completely filtered out
         self.assertRaises(RuntimeError, doc2vec.Doc2Vec, list_corpus, min_count=10000)

+    def test_n_similarity(self):
+        corpus = DocsLeeCorpus()
+        model = doc2vec.Doc2Vec(size=100, 
min_count=2, iter=20) + model.build_vocab(corpus) + model.train(corpus) + self.assertRaises(ValueError, model.n_similarity, ['graph', 'trees'], []) + def test_similarity_unseen_docs(self): """Test similarity of out of training sentences""" rome_str = ['rome', 'italy'] diff --git a/gensim/test/test_lsimodel.py b/gensim/test/test_lsimodel.py index 457725ebbb..26df7c011e 100644 --- a/gensim/test/test_lsimodel.py +++ b/gensim/test/test_lsimodel.py @@ -24,20 +24,23 @@ from gensim import matutils -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder -datapath = lambda fname: os.path.join(module_path, 'test_data', fname) +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder + + +def datapath(fname): + return os.path.join(module_path, 'test_data', fname) # set up vars used in testing ("Deerwester" from the web tutorial) texts = [['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey']] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] @@ -59,16 +62,15 @@ def testTransform(self): # make sure the decomposition is enough accurate u, s, vt = scipy.linalg.svd(matutils.corpus2dense(self.corpus, self.corpus.num_terms), full_matrices=False) - self.assertTrue(numpy.allclose(s[:2], model.projection.s)) # singular values must match + self.assertTrue(numpy.allclose(s[:2], model.projection.s)) # singular values must match # transform one document doc = list(self.corpus)[0] transformed = model[doc] - vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests - expected = numpy.array([-0.6594664, 0.142115444]) # scaled LSI version - # expected = numpy.array([-0.1973928, 0.05591352]) # non-scaled LSI version - self.assertTrue(numpy.allclose(abs(vec), abs(expected))) # transformed entries must be equal up to sign - + vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests + expected = numpy.array([-0.6594664, 0.142115444]) # scaled LSI version + # expected = numpy.array([-0.1973928, 0.05591352]) # non-scaled LSI version + self.assertTrue(numpy.allclose(abs(vec), abs(expected))) # transformed entries must be equal up to sign def testShowTopic(self): topic = self.model.show_topic(1) @@ -77,7 +79,6 @@ def testShowTopic(self): self.assertTrue(isinstance(k, six.string_types)) self.assertTrue(isinstance(v, float)) - def testShowTopics(self): topics = self.model.show_topics(formatted=False) @@ -88,49 +89,47 @@ def testShowTopics(self): self.assertTrue(isinstance(k, six.string_types)) self.assertTrue(isinstance(v, float)) - def testCorpusTransform(self): """Test lsi[corpus] transformation.""" model = self.model got = numpy.vstack(matutils.sparse2full(doc, 2) for doc in model[self.corpus]) expected = numpy.array([ - [ 0.65946639, 0.14211544], - [ 2.02454305, -0.42088759], - [ 1.54655361, 0.32358921], - [ 1.81114125, 0.5890525 ], - [ 0.9336738 , -0.27138939], - [ 0.01274618, 
-0.49016181], - [ 0.04888203, -1.11294699], - [ 0.08063836, -1.56345594], - [ 0.27381003, -1.34694159]]) - self.assertTrue(numpy.allclose(abs(got), abs(expected))) # must equal up to sign - + [0.65946639, 0.14211544], + [2.02454305, -0.42088759], + [1.54655361, 0.32358921], + [1.81114125, 0.5890525 ], + [0.9336738 , -0.27138939], + [0.01274618, -0.49016181], + [0.04888203, -1.11294699], + [0.08063836, -1.56345594], + [0.27381003, -1.34694159]]) + self.assertTrue(numpy.allclose(abs(got), abs(expected))) # must equal up to sign def testOnlineTransform(self): corpus = list(self.corpus) - doc = corpus[0] # use the corpus' first document for testing + doc = corpus[0] # use the corpus' first document for testing # create the transformation model - model2 = lsimodel.LsiModel(corpus=corpus, num_topics=5) # compute everything at once - model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5) # start with no documents, we will add them later + model2 = lsimodel.LsiModel(corpus=corpus, num_topics=5) # compute everything at once + model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5) # start with no documents, we will add them later # train model on a single document model.add_documents([corpus[0]]) # transform the testing document with this partial transformation transformed = model[doc] - vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests - expected = numpy.array([-1.73205078, 0.0, 0.0, 0.0, 0.0]) # scaled LSI version - self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign + vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests + expected = numpy.array([-1.73205078, 0.0, 0.0, 0.0, 0.0]) # scaled LSI version + self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign # train on another 4 documents - model.add_documents(corpus[1:5], chunksize=2) # train on 4 extra docs, in chunks of 2 documents, for the lols + model.add_documents(corpus[1:5], chunksize=2) # train on 4 extra docs, in chunks of 2 documents, for the lols # transform a document with this partial transformation transformed = model[doc] - vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests - expected = numpy.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269]) # scaled LSI version - self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign + vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests + expected = numpy.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269]) # scaled LSI version + self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign # train on the rest of documents model.add_documents(corpus[5:]) @@ -138,8 +137,7 @@ def testOnlineTransform(self): # make sure the final transformation is the same as if we had decomposed the whole corpus at once vec1 = matutils.sparse2full(model[doc], model.num_topics) vec2 = matutils.sparse2full(model2[doc], model2.num_topics) - self.assertTrue(numpy.allclose(abs(vec1), abs(vec2), atol=1e-5)) # the two LSI representations must equal up to sign - + self.assertTrue(numpy.allclose(abs(vec1), abs(vec2), atol=1e-5)) # the two LSI representations must equal up to sign 
def testPersistence(self): fname = testfile() @@ -150,7 +148,7 @@ def testPersistence(self): self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u)) self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s)) tstvec = [] - self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testPersistenceCompressed(self): fname = testfile() + '.gz' @@ -161,7 +159,7 @@ def testPersistenceCompressed(self): self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u)) self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s)) tstvec = [] - self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testLargeMmap(self): fname = testfile() @@ -178,7 +176,7 @@ def testLargeMmap(self): self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u)) self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s)) tstvec = [] - self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector def testLargeMmapCompressed(self): fname = testfile() + '.gz' @@ -194,7 +192,11 @@ def testLargeMmapCompressed(self): # to be mmaped! self.assertRaises(IOError, lsimodel.LsiModel.load, fname, mmap='r') -#endclass TestLsiModel + def testDocsProcessed(self): + self.assertEqual(self.model.docs_processed, 9) + self.assertEqual(self.model.docs_processed, self.corpus.num_docs) + +# endclass TestLsiModel if __name__ == '__main__': diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index bad981db76..30002caf0a 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -349,6 +349,7 @@ def testSimilarities(self): self.assertTrue(model.n_similarity(['graph', 'trees'], ['trees', 'graph'])) self.assertTrue(model.n_similarity(['graph'], ['trees']) == model.similarity('graph', 'trees')) + self.assertRaises(ValueError, model.n_similarity, ['graph', 'trees'], []) def testSimilarBy(self): """Test word2vec similar_by_word and similar_by_vector."""
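As a companion to the notebook's metric-axiom discussion above, here is a small self-contained sketch. It uses plain numpy; the `hellinger` and `kullback_leibler` helpers below are standalone re-implementations for illustration, not gensim's `matutils` versions. It demonstrates that Hellinger satisfies the symmetry condition while KL does not:

    import numpy as np

    def hellinger(p, q):
        # Hellinger distance between two discrete probability distributions
        return np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))

    def kullback_leibler(p, q):
        # KL divergence; assumes q has no zero entries (see the note on zeros above)
        return np.sum(p * np.log(p / q))

    p = np.array([0.1, 0.6, 0.3])
    q = np.array([0.2, 0.2, 0.6])

    assert np.isclose(hellinger(p, q), hellinger(q, p))  # condition 3 (symmetry) holds
    assert not np.isclose(kullback_leibler(p, q), kullback_leibler(q, p))  # KL fails symmetry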