diff --git a/docs/notebooks/topic_coherence_tutorial.ipynb b/docs/notebooks/topic_coherence_tutorial.ipynb new file mode 100644 index 0000000000..b8487b2c30 --- /dev/null +++ b/docs/notebooks/topic_coherence_tutorial.ipynb @@ -0,0 +1,671 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Demonstration of the topic coherence pipeline in Gensim" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will be using the `u_mass` and `c_v` coherence measures for two different LDA models: a \"good\" one and a \"bad\" one. The good LDA model will be trained over 50 iterations and the bad one over just 1 iteration. Hence, in theory, the good LDA model will be able to come up with better, more human-interpretable topics, so its coherence score should be higher (better) than that for the bad LDA model." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import logging\n", + "import pyLDAvis.gensim\n", + "import json\n", + "import warnings\n", + "warnings.filterwarnings('ignore') # Ignore all warnings that arise here, to keep the output clear\n", + "\n", + "from gensim.models.coherencemodel import CoherenceModel\n", + "from gensim.models.ldamodel import LdaModel\n", + "from gensim.corpora.dictionary import Dictionary\n", + "from numpy import array" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set up logging" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "logger = logging.getLogger()\n", + "logger.setLevel(logging.DEBUG)\n", + "logging.debug(\"test\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set up corpus" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As stated in Table 2 of [this](http://www.cs.bham.ac.uk/~pxt/IDA/lsa_ind.pdf) paper, this corpus essentially has two classes of documents: the first five are about human-computer interaction and the other four are about graphs. We will be setting up two LDA models, one with 50 iterations of training and the other with just 1. Hence the one with 50 iterations (the \"better\" model) should be able to capture this underlying pattern of the corpus better than the \"bad\" LDA model. Therefore, in theory, our topic coherence for the good LDA model should be greater than that for the bad LDA model."
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "texts = [['human', 'interface', 'computer'],\n", + " ['survey', 'user', 'computer', 'system', 'response', 'time'],\n", + " ['eps', 'user', 'interface', 'system'],\n", + " ['system', 'human', 'system', 'eps'],\n", + " ['user', 'response', 'time'],\n", + " ['trees'],\n", + " ['graph', 'trees'],\n", + " ['graph', 'minors', 'trees'],\n", + " ['graph', 'minors', 'survey']]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dictionary = Dictionary(texts)\n", + "corpus = [dictionary.doc2bow(text) for text in texts]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set up two topic models" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll be setting up two different LDA topic models: a good one and a bad one. To build a \"good\" topic model, we'll simply train it using more iterations than the bad one. Therefore the `u_mass` coherence should in theory be better for the good model than for the bad one, since the good model produces more \"human-interpretable\" topics." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "goodLdaModel = LdaModel(corpus=corpus, id2word=dictionary, iterations=50, num_topics=2)\n", + "badLdaModel = LdaModel(corpus=corpus, id2word=dictionary, iterations=1, num_topics=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using U_Mass Coherence" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "goodcm = CoherenceModel(model=goodLdaModel, corpus=corpus, dictionary=dictionary, coherence='u_mass')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "badcm = CoherenceModel(model=badLdaModel, corpus=corpus, dictionary=dictionary, coherence='u_mass')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### View the pipeline parameters for one coherence model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Following are the pipeline parameters for `u_mass` coherence. By pipeline parameters, we mean the functions being used to calculate segmentation, probability estimation, confirmation measure and aggregation, as shown in figure 1 of [this](http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf) paper." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CoherenceModel(segmentation=<function s_one_pre at 0x...>, probability estimation=<function p_boolean_document at 0x...>, confirmation measure=<function log_conditional_probability at 0x...>, aggregation=<function arithmetic_mean at 0x...>)\n" + ] + } + ], + "source": [ + "print goodcm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Interpreting the topics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we will see below using LDA visualization, the better model comes up with two topics composed of the following words:\n", + "1. 
goodLdaModel:\n", + " - __Topic 1__: More weight is assigned to words such as \"system\", \"user\", \"eps\" and \"interface\", which captures the first set of documents.\n", + " - __Topic 2__: More weight is assigned to words such as \"graph\", \"trees\" and \"survey\", which captures the topic in the second set of documents.\n", + "2. badLdaModel:\n", + " - __Topic 1__: More weight is assigned to words such as \"system\", \"user\", \"trees\" and \"graph\", which doesn't make the topic clear enough.\n", + " - __Topic 2__: More weight is assigned to words such as \"system\", \"trees\", \"graph\" and \"user\", which is similar to the first topic. Hence neither topic is human-interpretable.\n", + "\n", + "Therefore, the topic coherence for the goodLdaModel should be greater than that for the badLdaModel, since the topics it comes up with are more human-interpretable. We will see this using the `u_mass` and `c_v` topic coherence measures." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Visualize topic models" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pyLDAvis.enable_notebook()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "PreparedData(topic_coordinates= Freq cluster topics x y\n", + "topic \n", + "1 60.467874 1 1 -0.02178 -0.0\n", + "0 39.532126 1 2 0.02178 -0.0, topic_info= Category Freq Term Total loglift logprob\n", + "term \n", + "1 Default 2.000000 graph 2.000000 12.0000 12.0000\n", + "6 Default 2.000000 survey 2.000000 11.0000 11.0000\n", + "3 Default 2.000000 trees 2.000000 10.0000 10.0000\n", + "0 Default 2.000000 minors 2.000000 9.0000 9.0000\n", + "5 Default 2.000000 computer 2.000000 8.0000 8.0000\n", + "4 Default 2.000000 eps 2.000000 7.0000 7.0000\n", + "9 Default 2.000000 time 2.000000 6.0000 6.0000\n", + "11 Default 2.000000 response 2.000000 5.0000 5.0000\n", + "2 Default 3.000000 system 3.000000 4.0000 4.0000\n", + "7 Default 2.000000 user 2.000000 3.0000 3.0000\n", + "8 Default 2.000000 human 2.000000 2.0000 2.0000\n", + "10 Default 2.000000 interface 2.000000 1.0000 1.0000\n", + "4 Topic1 1.754656 eps 2.192159 0.2804 -2.3020\n", + "2 Topic1 2.765990 system 3.630010 0.2312 -1.8468\n", + "7 Topic1 2.132646 user 2.892076 0.1984 -2.1069\n", + "10 Topic1 1.511120 interface 2.155900 0.1477 -2.4514\n", + "8 Topic1 1.448214 human 2.146535 0.1095 -2.4939\n", + "11 Topic1 1.300499 response 2.124542 0.0122 -2.6015\n", + "9 Topic1 1.292999 time 2.123425 0.0070 -2.6073\n", + "3 Topic1 1.420436 trees 2.786037 -0.1706 -2.5133\n", + "5 Topic1 1.064564 computer 2.089414 -0.1713 -2.8017\n", + "0 Topic1 1.037844 minors 2.085436 -0.1948 -2.8271\n", + "6 Topic1 0.818827 survey 2.052828 -0.4160 -3.0641\n", + "1 Topic1 0.987888 graph 2.721637 -0.5104 -2.8764\n", + "1 Topic2 1.733749 graph 2.721637 0.4771 -1.8890\n", + "6 Topic2 1.234000 survey 2.052828 0.4191 -2.2290\n", + "0 Topic2 1.047592 minors 2.085436 0.2396 -2.3927\n", + "5 Topic2 1.024850 computer 2.089414 0.2157 -2.4147\n", + "3 Topic2 1.365602 trees 2.786037 0.2150 -2.1276\n", + "9 Topic2 0.830426 time 2.123425 -0.0108 -2.6251\n", + "11 Topic2 0.824043 response 2.124542 -0.0190 -2.6328\n", + "8 Topic2 0.698320 human 2.146535 -0.1949 -2.7983\n", + "10 Topic2 0.644780 interface 2.155900 -0.2790 -2.8781\n", + "7 Topic2 0.759429 user 2.892076 -0.4091 -2.7144\n", + "2 Topic2 0.864020 system 3.630010 -0.5073 -2.5854\n", + "4 Topic2 0.437504 eps 2.192159 -0.6835 -3.2659, token_table= Topic Freq Term\n", + "term \n", + "5 1 0.478603 computer\n", + "5 2 0.478603 computer\n", + "4 1 0.912342 eps\n", + "1 1 0.367426 graph\n", + "1 2 0.734852 graph\n", + "8 1 0.465867 human\n", + "8 2 0.465867 human\n", + "10 1 0.927687 interface\n", + "10 2 0.463843 interface\n", + "0 1 0.479516 minors\n", + "0 2 0.479516 minors\n", + "11 1 0.470690 response\n", + "11 2 0.470690 response\n", + "6 1 0.487133 survey\n", + "6 2 0.487133 survey\n", + "2 1 0.826444 system\n", + "2 2 0.275481 system\n", + "9 1 0.470937 time\n", + "9 2 0.470937 time\n", + "3 1 0.358933 trees\n", + "3 2 0.358933 trees\n", + "7 1 0.691545 user\n", + "7 2 0.345772 user, R=12, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[2, 1])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pyLDAvis.gensim.prepare(goodLdaModel, corpus, dictionary)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "PreparedData(topic_coordinates= Freq cluster topics x y\n", + "topic \n", + "1 52.514671 1 1 -0.002455 -0.0\n", + "0 47.485329 1 2 0.002455 -0.0, topic_info= Category Freq Term Total loglift logprob\n", + "term \n", + "8 Default 2.000000 human 2.000000 12.0000 12.0000\n", + "4 Default 2.000000 eps 2.000000 11.0000 11.0000\n", + "1 Default 2.000000 graph 2.000000 10.0000 10.0000\n", + "9 Default 2.000000 time 2.000000 9.0000 9.0000\n", + "5 Default 2.000000 computer 2.000000 8.0000 8.0000\n", + "3 Default 2.000000 trees 2.000000 7.0000 7.0000\n", + "6 Default 2.000000 survey 2.000000 6.0000 6.0000\n", + "10 Default 2.000000 interface 2.000000 5.0000 5.0000\n", + "0 Default 2.000000 minors 2.000000 4.0000 4.0000\n", + "2 Default 3.000000 system 3.000000 3.0000 3.0000\n", + "7 Default 2.000000 user 2.000000 2.0000 2.0000\n", + "11 Default 2.000000 response 2.000000 1.0000 1.0000\n", + "9 Topic1 1.315907 time 2.123095 0.1657 -2.4487\n", + "6 Topic1 1.228044 survey 2.122596 0.0969 -2.5178\n", + "0 Topic1 1.189171 minors 2.122376 0.0648 -2.5500\n", + "11 Topic1 1.156021 response 2.122188 0.0366 -2.5782\n", + "2 Topic1 1.926266 system 3.536977 0.0364 -2.0676\n", + "7 Topic1 1.540934 user 2.829581 0.0363 -2.2908\n", + "10 Topic1 1.134199 interface 2.122064 0.0176 -2.5973\n", + "3 Topic1 1.477609 trees 2.829222 -0.0055 -2.3328\n", + "5 Topic1 1.032319 computer 2.121486 -0.0762 -2.6914\n", + "1 Topic1 1.347614 graph 2.828485 -0.0973 -2.4249\n", + "4 Topic1 0.977820 eps 2.121177 -0.1303 -2.7456\n", + "8 Topic1 0.903351 human 2.120755 -0.2093 -2.8249\n", + "8 Topic2 1.217404 human 2.120755 0.1897 -2.4258\n", + "4 Topic2 1.143357 eps 2.121177 0.1267 -2.4886\n", + "1 Topic2 1.480871 graph 2.828485 0.0976 -2.2299\n", + "5 Topic2 1.089167 computer 2.121486 0.0780 -2.5371\n", + "3 Topic2 1.351613 trees 2.829222 0.0060 -2.3212\n", + "10 Topic2 0.987865 interface 2.122064 -0.0198 -2.6348\n", + "7 Topic2 1.288647 user 2.829581 -0.0418 -2.3690\n", + "2 Topic2 1.610711 system 3.536977 -0.0418 -2.1459\n", + "11 Topic2 0.966167 response 2.122188 -0.0421 -2.6570\n", + "0 Topic2 0.933205 minors 2.122376 -0.0769 -2.6917\n", + "6 Topic2 0.894553 survey 2.122596 -0.1193 -2.7340\n", + "9 Topic2 0.807188 time 2.123095 -0.2223 -2.8367, token_table= Topic Freq Term\n", + "term \n", + "5 1 0.471368 computer\n", + "5 2 0.471368 computer\n", + "4 1 0.471436 eps\n", + "4 2 0.471436 eps\n", + "1 1 0.353546 graph\n", + "1 2 0.353546 graph\n", + "8 1 0.471530 human\n", + "8 2 0.471530 human\n", + "10 1 0.471239 interface\n", + "10 2 0.471239 interface\n", + "0 1 0.471170 minors\n", + "0 2 0.471170 minors\n", + "11 1 0.471212 response\n", + "11 2 0.471212 response\n", + "6 1 0.471121 survey\n", + "6 2 0.471121 survey\n", + "2 1 0.565455 system\n", + "2 2 0.565455 system\n", + "9 1 0.471011 time\n", + "9 2 0.471011 time\n", + "3 1 0.353454 trees\n", + "3 2 0.353454 trees\n", + "7 1 0.706818 user\n", + "7 2 0.353409 user, R=12, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[2, 1])" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pyLDAvis.gensim.prepare(badLdaModel, corpus, dictionary)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-14.0842451581\n" + ] + } + ], + "source": [ + "print goodcm.get_coherence()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + 
"metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-14.4434307511\n" + ] + } + ], + "source": [ + "print badcm.get_coherence()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using C_V coherence" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "goodcm = CoherenceModel(model=goodLdaModel, texts=texts, dictionary=dictionary, coherence='c_v')" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "badcm = CoherenceModel(model=badLdaModel, texts=texts, dictionary=dictionary, coherence='c_v')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Pipeline parameters for C_V coherence" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CoherenceModel(segmentation=, probability estimation=, confirmation measure=, aggregation=)\n" + ] + } + ], + "source": [ + "print goodcm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Print coherence values" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.552164532134\n" + ] + } + ], + "source": [ + "print goodcm.get_coherence()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.5269189184\n" + ] + } + ], + "source": [ + "print badcm.get_coherence()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Conclusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Hence as we can see, the `u_mass` and `c_v` coherence for the good LDA model is much more (better) than that for the bad LDA model. This is because, simply, the good LDA model usually comes up with better topics that are more human interpretable. The badLdaModel however fails to decipher between these two topics and comes up with topics which are not clear to a human. The `u_mass` and `c_v` topic coherences capture this wonderfully by giving the interpretability of these topics a number as we can see above. Hence this coherence measure can be used to compare difference topic models based on their human-interpretability." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py index dc028e24fe..21d9fd91c8 100644 --- a/gensim/models/__init__.py +++ b/gensim/models/__init__.py @@ -4,6 +4,7 @@ """ # bring model classes directly into package namespace, to save some typing +from .coherencemodel import CoherenceModel from .hdpmodel import HdpModel from .ldamodel import LdaModel from .lsimodel import LsiModel diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py new file mode 100644 index 0000000000..8bfde8b082 --- /dev/null +++ b/gensim/models/coherencemodel.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2010 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Module for calculating topic coherence in Python. This is an implementation of +the four stage topic coherence pipeline from the paper [1]. +The four stage pipeline is: + +Segmentation -> Probability Estimation -> Confirmation Measure -> Aggregation. + +This pipeline allows the user to, in essence, "make" a coherence measure +of their choice by choosing a method for each of the four stages. + +[1] Michael Roeder, Andreas Both and Alexander Hinneburg. Exploring the space of topic +coherence measures. http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf. +""" + +import logging + +from gensim import interfaces +from gensim.topic_coherence import (segmentation, probability_estimation, + direct_confirmation_measure, indirect_confirmation_measure, + aggregation) +from gensim.corpora import Dictionary +from gensim.matutils import argsort +from gensim.utils import is_corpus +from gensim.models.ldamodel import LdaModel +from gensim.models.wrappers import LdaVowpalWabbit + +logger = logging.getLogger(__name__) + + +class CoherenceModel(interfaces.TransformationABC): + """ + Objects of this class allow for building and maintaining a model for topic + coherence. + + The main methods are: + + 1. constructor, which initializes the four stage pipeline by accepting a coherence measure, + 2. the ``get_coherence()`` method, which returns the topic coherence. + + >>> cm = CoherenceModel(model=tm, corpus=corpus, coherence='u_mass') # tm is the trained topic model + >>> cm.get_coherence() + + Model persistence is achieved via its load/save methods. + """ + def __init__(self, model, texts=None, corpus=None, dictionary=None, coherence='c_v'): + """ + Args: + ---- + model : Pre-trained topic model. + texts : Tokenized texts. Needed for coherence models that use a sliding window based probability estimator. + corpus : Gensim document corpus. + dictionary : Gensim dictionary mapping of ids to words, used to create the corpus. + coherence : Coherence measure to be used. 
Supported values are: + u_mass + c_v + """ + if texts is None and corpus is None: + raise ValueError("One of texts or corpus has to be provided.") + if coherence == 'u_mass': + if is_corpus(corpus)[0]: + if dictionary is None: + # id 0 mapping to itself indicates model.id2word is a placeholder rather than a genuine dictionary + if model.id2word[0] == 0: + raise ValueError("The associated dictionary should be provided with the corpus, or 'id2word' of the topic model " + "should be set as the dictionary.") + else: + self.dictionary = model.id2word + else: + self.dictionary = dictionary + self.corpus = corpus + elif texts is not None: + self.texts = texts + if dictionary is None: + self.dictionary = Dictionary(self.texts) + else: + self.dictionary = dictionary + self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] + else: + raise ValueError("Either 'corpus' with 'dictionary' or 'texts' should be provided for %s coherence." % coherence) + + elif coherence == 'c_v': + if texts is None: + raise ValueError("'texts' should be provided for %s coherence." % coherence) + else: + self.texts = texts + self.dictionary = Dictionary(self.texts) + self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] + + else: + raise ValueError("%s coherence is not currently supported." % coherence) + + self.model = model + self.topics = self._get_topics() + self.coherence = coherence + # Set pipeline parameters: + if self.coherence == 'u_mass': + self.seg = segmentation.s_one_pre + self.prob = probability_estimation.p_boolean_document + self.conf = direct_confirmation_measure.log_conditional_probability + self.aggr = aggregation.arithmetic_mean + + elif self.coherence == 'c_v': + self.seg = segmentation.s_one_set + self.prob = probability_estimation.p_boolean_sliding_window + self.conf = indirect_confirmation_measure.cosine_similarity + self.aggr = aggregation.arithmetic_mean + + def __str__(self): + return "CoherenceModel(segmentation=%s, probability estimation=%s, confirmation measure=%s, aggregation=%s)" % ( + self.seg, self.prob, self.conf, self.aggr) + + def _get_topics(self): + """Internal helper function to return topics from a trained topic model.""" + topics = [] # FIXME : Meant to work for LdaModel, LdaVowpalWabbit right now. Make it work for others. + if isinstance(self.model, LdaModel): + for topic in self.model.state.get_lambda(): + bestn = argsort(topic, topn=10, reverse=True) + topics.append(bestn) + elif isinstance(self.model, LdaVowpalWabbit): + for topic in self.model._get_topics(): + bestn = argsort(topic, topn=10, reverse=True) + topics.append(bestn) + return topics + + def get_coherence(self): + if self.coherence == 'u_mass': + segmented_topics = self.seg(self.topics) + per_topic_postings, num_docs = self.prob(self.corpus, segmented_topics) + confirmed_measures = self.conf(segmented_topics, per_topic_postings, num_docs) + return self.aggr(confirmed_measures) + + elif self.coherence == 'c_v': + segmented_topics = self.seg(self.topics) + per_topic_postings, num_windows = self.prob(texts=self.texts, segmented_topics=segmented_topics, + dictionary=self.dictionary, window_size=2) # FIXME : Change window size to 110 finally. 
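+            # For c_v, the confirmation stage below computes an indirect cosine similarity over context vectors built from NPMI ('nlr') values with gamma=1; the aggregation stage then averages the per-segment similarities.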
+ confirmed_measures = self.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows) + return self.aggr(confirmed_measures) diff --git a/gensim/test/test_direct_confirmation.py b/gensim/test/test_direct_confirmation.py new file mode 100644 index 0000000000..c3c57dd0fd --- /dev/null +++ b/gensim/test/test_direct_confirmation.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2011 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Automated tests for direct confirmation measures in the direct_confirmation_measure module. +""" + +import logging +import unittest + +from gensim.topic_coherence import direct_confirmation_measure + +class TestDirectConfirmationMeasure(unittest.TestCase): + def setUp(self): + # Set up toy example for better understanding and testing + # of this module. See the modules for the mathematical formulas + self.segmentation = [[(1, 2)]] + self.posting_list = {1: set([2, 3, 4]), 2: set([3, 5])} + self.num_docs = 5 + + def testLogConditionalProbability(self): + """Test log_conditional_probability()""" + obtained = direct_confirmation_measure.log_conditional_probability(self.segmentation, self.posting_list, self.num_docs)[0] + # Answer should be ~ ln(1 / 2) = -0.693147181 + expected = -0.693147181 + self.assertAlmostEqual(obtained, expected) + + def testLogRatioMeasure(self): + """Test log_ratio_measure()""" + obtained = direct_confirmation_measure.log_ratio_measure(self.segmentation, self.posting_list, self.num_docs)[0] + # Answer should be ~ ln{(1 / 5) / [(3 / 5) * (2 / 5)]} = -0.182321557 + expected = -0.182321557 + self.assertAlmostEqual(obtained, expected) + + def testNormalizedLogRatioMeasure(self): + """Test normalized_log_ratio_measure()""" + obtained = direct_confirmation_measure.normalized_log_ratio_measure(self.segmentation, self.posting_list, self.num_docs)[0] + # Answer should be ~ -0.182321557 / ln(1 / 5) = 0.113282753 + expected = 0.113282753 + self.assertAlmostEqual(obtained, expected) + +if __name__ == '__main__': + logging.root.setLevel(logging.WARNING) + unittest.main() diff --git a/gensim/test/test_indirect_confirmation.py b/gensim/test/test_indirect_confirmation.py new file mode 100644 index 0000000000..8fca92a34a --- /dev/null +++ b/gensim/test/test_indirect_confirmation.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2011 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Automated tests for indirect confirmation measures in the indirect_confirmation_measure module. +""" + +import logging +import unittest + +from gensim.topic_coherence import indirect_confirmation_measure + +import numpy as np +from numpy import array + +class TestIndirectConfirmation(unittest.TestCase): + def setUp(self): + # Set up toy example for better understanding and testing + # of this module. See the modules for the mathematical formulas + self.topics = [np.array([1, 2])] + # Result from s_one_set segmentation: + self.segmentation = [[(1, array([1, 2])), (2, array([1, 2]))]] + self.posting_list = {1: set([2, 3, 4]), 2: set([3, 5])} + self.gamma = 1 + self.measure = 'nlr' + self.num_docs = 5 + + def testCosineSimilarity(self): + """Test cosine_similarity()""" + obtained = indirect_confirmation_measure.cosine_similarity(self.topics, self.segmentation, + self.posting_list, self.measure, + self.gamma, self.num_docs) + # The steps involved in this calculation are as follows: + # 1. 
Take (1, array([1, 2])). Take w' which is 1. + # 2. Calculate nlr(1, 1), nlr(1, 2). This is our first vector. + # 3. Take w* which is array([1, 2]). + # 4. Calculate nlr(1, 1) + nlr(2, 1) and nlr(1, 2) + nlr(2, 2). This is our second vector. + # 5. Find out the cosine similarity between these two vectors. + # 6. Similarly for the second segmentation. + expected = [0.6230, 0.6230] # To account for EPSILON approximation + self.assertAlmostEqual(obtained[0], expected[0], 4) + self.assertAlmostEqual(obtained[1], expected[1], 4) + +if __name__ == '__main__': + logging.root.setLevel(logging.WARNING) + unittest.main() diff --git a/gensim/test/test_probability_estimation.py b/gensim/test/test_probability_estimation.py new file mode 100644 index 0000000000..596f91f65b --- /dev/null +++ b/gensim/test/test_probability_estimation.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2011 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Automated tests for probability estimation algorithms in the probability_estimation module. +""" + +import logging +import unittest + +from gensim.topic_coherence import probability_estimation +from gensim.corpora.hashdictionary import HashDictionary + +class TestProbabilityEstimation(unittest.TestCase): + def setUp(self): + self.texts = [['human', 'interface', 'computer'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees']] + self.dictionary = HashDictionary(self.texts) + # Following is the mapping: + # {'computer': 10608, + # 'eps': 31049, + # 'graph': 18451, + # 'human': 31002, + # 'interface': 12466, + # 'response': 5232, + # 'system': 5798, + # 'time': 29104, + # 'trees': 23844, + # 'user': 12736} + self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] + # Suppose the segmented topics from s_one_pre are: + self.segmented_topics = [[(5798, 18451), (10608, 18451), (10608, 5798)], [(10608, 18451), (12736, 18451), (12736, 10608)]] + + def testPBooleanDocument(self): + """Test p_boolean_document()""" + # Unique topic ids are 5798, 10608, 12736 and 18451 + obtained, _ = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics) + expected = {18451: set([5]), 12736: set([1, 3]), 5798: set([1, 2]), 10608: set([0])} + self.assertEqual(obtained, expected) + + def testPBooleanSlidingWindow(self): + """Test p_boolean_sliding_window()""" + # Test with window size as 2. window_id is zero indexed. + obtained, _ = probability_estimation.p_boolean_sliding_window(self.texts, self.segmented_topics, self.dictionary, 2) + expected = {10608: set([1]), 12736: set([8, 2, 3]), 18451: set([11]), 5798: set([4, 5, 6, 7])} + self.assertEqual(obtained, expected) + +if __name__ == '__main__': + logging.root.setLevel(logging.WARNING) + unittest.main() diff --git a/gensim/test/test_segmentation.py b/gensim/test/test_segmentation.py new file mode 100644 index 0000000000..d44fce350d --- /dev/null +++ b/gensim/test/test_segmentation.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2011 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Automated tests for segmentation algorithms in the segmentation module. 
+""" + + +import logging +import unittest + +import numpy as np + +from gensim.topic_coherence import segmentation +from numpy import array + + +class TestSegmentation(unittest.TestCase): + def setUp(self): + self.topics = [array([9, 4, 6]), array([9, 10, 7]), array([5, 2, 7])] + + def testSOnePre(self): + """Test s_one_pre segmentation.""" + actual = segmentation.s_one_pre(self.topics) + expected = [[(4, 9), (6, 9), (6, 4)], + [(10, 9), (7, 9), (7, 10)], + [(2, 5), (7, 5), (7, 2)]] + self.assertTrue(np.allclose(actual, expected)) + + def testSOneSet(self): + """Test s_one_set segmentation.""" + actual = segmentation.s_one_set(self.topics) + expected = [[(9, array([9, 4, 6])), (4, array([9, 4, 6])), (6, array([9, 4, 6]))], + [(9, array([9, 10, 7])), (10, array([9, 10, 7])), (7, array([9, 10, 7]))], + [(5, array([5, 2, 7])), (2, array([5, 2, 7])), (7, array([5, 2, 7]))]] + for s_i in range(len(actual)): + for j in range(len(actual[s_i])): + self.assertEqual(actual[s_i][j][0], expected[s_i][j][0]) + self.assertTrue(np.allclose(actual[s_i][j][1], expected[s_i][j][1])) + +if __name__ == '__main__': + logging.root.setLevel(logging.WARNING) + unittest.main() diff --git a/gensim/topic_coherence/__init__.py b/gensim/topic_coherence/__init__.py new file mode 100644 index 0000000000..82a7b92f2e --- /dev/null +++ b/gensim/topic_coherence/__init__.py @@ -0,0 +1,4 @@ +""" +This package contains implementation of the individual components of +the topic coherence pipeline. +""" diff --git a/gensim/topic_coherence/aggregation.py b/gensim/topic_coherence/aggregation.py new file mode 100644 index 0000000000..7c345d8812 --- /dev/null +++ b/gensim/topic_coherence/aggregation.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2013 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +This module contains functions to perform aggregation on a list of values +obtained from the confirmation measure. +""" + +import logging +import numpy as np + +logger = logging.getLogger(__name__) + +def arithmetic_mean(confirmed_measures): + """ + This functoin performs the arithmetic mean aggregation on the output obtained from + the confirmation measure module. + + Args: + ---- + confirmed_measures : list of calculated confirmation measure on each set in the segmented topics. + + Returns: + ------- + mean : Arithmetic mean of all the values contained in confirmation measures. + """ + return np.mean(confirmed_measures) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py new file mode 100644 index 0000000000..eaa1b66841 --- /dev/null +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2013 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +This module contains functions to compute direct confirmation on a pair of words or word subsets. +""" + +import logging +import numpy as np + +logger = logging.getLogger(__name__) + +EPSILON = 1e-12 # Should be small. Value as suggested in paper. + +def log_conditional_probability(segmented_topics, per_topic_postings, num_docs): + """ + This function calculates the log-conditional-probability measure + which is used by coherence measures such as U_mass. 
+ This is defined as: m_lc(S_i) = log[(P(W', W*) + e) / P(W*)] + + Args: + ---- + segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. + per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics. + num_docs : Total number of documents in corresponding corpus. + + Returns: + ------- + m_lc : List of log conditional probability measures on each set in segmented topics. + """ + m_lc = [] + for s_i in segmented_topics: + for w_prime, w_star in s_i: + w_prime_docs = per_topic_postings[w_prime] + w_star_docs = per_topic_postings[w_star] + co_docs = w_prime_docs.intersection(w_star_docs) + m_lc_i = np.log(((len(co_docs) / float(num_docs)) + EPSILON) / (len(w_star_docs) / float(num_docs))) + m_lc.append(m_lc_i) + + return m_lc + +def log_ratio_measure(segmented_topics, per_topic_postings, num_docs): + """ + This function calculates the log-ratio-measure which is used by + coherence measures such as c_v. + This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))] + + Args: + ---- + segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. + per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics. + num_docs : Total number of documents in corpus. Used for calculating probability. + + Returns: + ------- + m_lr : List of log ratio measures on each set in segmented topics. + """ + m_lr = [] + for s_i in segmented_topics: + for w_prime, w_star in s_i: + w_prime_docs = per_topic_postings[w_prime] + w_star_docs = per_topic_postings[w_star] + co_docs = w_prime_docs.intersection(w_star_docs) + numerator = (len(co_docs) / float(num_docs)) + EPSILON + denominator = (len(w_prime_docs) / float(num_docs)) * (len(w_star_docs) / float(num_docs)) + m_lr_i = np.log(numerator / denominator) + m_lr.append(m_lr_i) + + return m_lr + +def normalized_log_ratio_measure(segmented_topics, per_topic_postings, num_docs): + """ + This function calculates the normalized-log-ratio-measure, popularly known as + NPMI, which is used by coherence measures such as c_v. + This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e] + + Args: + ---- + segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. + per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics. + num_docs : Total number of documents in corpus. Used for calculating probability. + + Returns: + ------- + m_nlr : List of normalized log ratio (NPMI) measures on each set in segmented topics. 
+ """ + m_nlr = [] + for s_i in segmented_topics: + for w_prime, w_star in s_i: + numerator = log_ratio_measure([[(w_prime, w_star)]], per_topic_postings, num_docs)[0] + w_prime_docs = per_topic_postings[w_prime] + w_star_docs = per_topic_postings[w_star] + co_docs = w_prime_docs.intersection(w_star_docs) + co_doc_prob = len(co_docs) / float(num_docs) + m_nlr_i = numerator / np.log(co_doc_prob + EPSILON) + m_nlr.append(m_nlr_i) + + return m_nlr diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py new file mode 100644 index 0000000000..e41cb778f1 --- /dev/null +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2013 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +This module contains functions to compute confirmation on a pair of words or word subsets. + +The formula used to compute indirect confirmation measure is: + _ _ +m_sim(m, gamma)(W', W*) = s_sim(V_m,gamma(W'), V_m,gamma(W*)) + +where s_sim can be cosine, dice or jaccard similarity and +_ +V_m,gamma(W') = {sigma(w' belonging to W') m(w_i, w_j) ^ gamma} where j = 1, ...., |W| + +Here 'm' is the direct confirmation measure used. +""" + +import logging +import numpy as np + +from gensim.topic_coherence import direct_confirmation_measure +from gensim.matutils import cossim + +logger = logging.getLogger(__name__) + +def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_docs): + """ + Internal helper function to return context vectors for segmentations. + """ + context_vectors = {} + if isinstance(w_prime, np.ndarray): + for w_j in w: + for w_i in w_prime: + if (w_i, w_j) not in backtrack: + backtrack[(w_i, w_j)] = measure([[(w_i, w_j)]], per_topic_postings, num_docs)[0] + if w_j not in context_vectors: + context_vectors[w_j] = backtrack[(w_i, w_j)] ** gamma + else: + context_vectors[w_j] += backtrack[(w_i, w_j)] ** gamma + else: + for w_j in w: + if (w_prime, w_j) not in backtrack: + backtrack[(w_prime, w_j)] = measure([[(w_prime, w_j)]], per_topic_postings, num_docs)[0] + context_vectors[w_j] = backtrack[(w_prime, w_j)] ** gamma + return (context_vectors, backtrack) + +def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gamma, num_docs): + """ + This function calculates the indirect cosine measure. Given context vectors + _ _ _ _ + u = V(W') and w = V(W*) for the word sets of a pair S_i = (W', W*) indirect + _ _ + cosine measure is computed as the cosine similarity between u and w. + + Args: + ---- + topics : Topics obtained from the trained topic model. + segmented_topics : segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. + per_topic_postings : per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics. + measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). + gamma : Gamma value for computing W', W* vectors. + num_docs : Total number of documents in corresponding corpus. 
+ """ + if measure == 'nlr': + measure = direct_confirmation_measure.normalized_log_ratio_measure + else: + raise ValueError("The direct confirmation measure you entered is not currently supported.") + backtrack = {} + s_cos_sim = [] + for top_words, s_i in zip(topics, segmented_topics): + for w_prime, w_star in s_i: + w_prime_context_vectors, backtrack_i = _make_seg(w_prime, top_words, per_topic_postings, measure, gamma, backtrack, num_docs) + backtrack.update(backtrack_i) + w_star_context_vectors, backtrack_i = _make_seg(w_star, top_words, per_topic_postings, measure, gamma, backtrack, num_docs) + backtrack.update(backtrack_i) + s_cos_sim_i = cossim(w_prime_context_vectors.items(), w_star_context_vectors.items()) + s_cos_sim.append(s_cos_sim_i) + + return s_cos_sim diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py new file mode 100644 index 0000000000..e7e931b9ac --- /dev/null +++ b/gensim/topic_coherence/probability_estimation.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2013 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +This module contains functions to perform segmentation on a list of topics. +""" + +import logging +import numpy as np + +from gensim.corpora import Dictionary + +from itertools import chain, islice + +logger = logging.getLogger(__name__) + +def _ret_top_ids(segmented_topics): + """ + Helper function to return a set of all the unique topic ids in segmented topics. + """ + top_ids = set() # is a set of all the unique ids contained in topics. + for s_i in segmented_topics: + for id in chain.from_iterable(s_i): + if isinstance(id, np.ndarray): + for i in id: + top_ids.add(i) + else: + top_ids.add(id) + return top_ids + +def p_boolean_document(corpus, segmented_topics): + """ + This function performs the boolean document probability estimation. Boolean document estimates the probability + of a single word as the number of documents in which the word occurs divided by the total number of documents. + + Args: + ---- + corpus : The corpus of documents. + segmented_topics : Output from the segmentation of topics. Could be simply topics too. + + Returns: + ------- + per_topic_postings : Boolean document posting list for each unique topic id. + num_docs : Total number of documents in corpus. + """ + top_ids = _ret_top_ids(segmented_topics) + # Perform boolean document now to create document word list. + per_topic_postings = {} + for id in top_ids: + id_list = set() + for n, document in enumerate(corpus): + if id in frozenset(x[0] for x in document): + id_list.add(n) + per_topic_postings[id] = id_list + num_docs = len(corpus) + return (per_topic_postings, num_docs) + +def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): + """ + This function performs the boolean sliding window probability estimation. Boolean sliding window + determines word counts using a sliding window. The window moves over the documents one word token per step. + Each step defines a new virtual document by copying the window content. Boolean document is applied to + these virtual documents to compute word probabilities. + + Args: + ---- + texts : List of string sentences. + segmented_topics : Output from the segmentation of topics. Could be simply topics too. + dictionary : Gensim dictionary mapping of the tokens and ids. + window_size : Size of the sliding window. 110 found out to be the ideal size for large corpora. 
+ + Returns: + ------- + per_topic_postings : Boolean sliding window postings list of all the unique topic ids. + window_id[0] : Total number of windows + """ + top_ids = _ret_top_ids(segmented_topics) + window_id = [0] # Each window assigned a window id. + per_topic_postings = {} + token2id_dict = dictionary.token2id + def add_topic_posting(): + for word in window: + word_id = token2id_dict[word] + if word_id in top_ids: + if word_id in per_topic_postings: + per_topic_postings[word_id].add(window_id[0]) + else: + per_topic_postings[word_id] = set([window_id[0]]) + window_id[0] += 1 + # Apply boolean sliding window to each document in texts. + for document in texts: + it = iter(document) + window = tuple(islice(it, window_size)) + add_topic_posting() + if len(window) < window_size: + pass # FIXME : Handle the case when the document is shorter than window_size + for elem in it: + window = window[1:] + (elem,) + add_topic_posting() + + return (per_topic_postings, window_id[0]) diff --git a/gensim/topic_coherence/segmentation.py b/gensim/topic_coherence/segmentation.py new file mode 100644 index 0000000000..de5be5ea91 --- /dev/null +++ b/gensim/topic_coherence/segmentation.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2013 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +This module contains functions to perform segmentation on a list of topics. +""" + +import logging +import numpy as np + +logger = logging.getLogger(__name__) + +def s_one_pre(topics): + """ + This function performs s_one_pre segmentation on a list of topics. + s_one_pre segmentation is defined as: s_one_pre = {(W', W*) | W' = {w_i}; + W* = {w_j}; w_i, w_j belongs to W; i > j} + Example: + + >>> topics = [np.array([1, 2, 3]), np.array([4, 5, 6])] + >>> s_one_pre(topics) + [[(2, 1), (3, 1), (3, 2)], [(5, 4), (6, 4), (6, 5)]] + + Args: + ---- + topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] + + Returns: + ------- + s_one_pre : list of lists of (W', W*) tuples, one list per topic + """ + s_one_pre = [] + + for top_words in topics: + s_one_pre_t = [] + for w_prime in top_words[1:]: + w_prime_index = np.where(top_words == w_prime)[0] # To get index of w_prime in top_words + for w_star in top_words[:w_prime_index]: + s_one_pre_t.append((w_prime, w_star)) + s_one_pre.append(s_one_pre_t) + + return s_one_pre + +def s_one_set(topics): + """ + This function performs s_one_set segmentation on a list of topics. + s_one_set segmentation is defined as: s_one_set = {(W', W*) | W' = {w_i}; w_i belongs to W; + W* = W} + Example: + >>> topics = [np.array([9, 10, 7])] + >>> s_one_set(topics) + [[(9, array([ 9, 10, 7])), + (10, array([ 9, 10, 7])), + (7, array([ 9, 10, 7]))]] + + Args: + ---- + topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] + + Returns: + ------- + s_one_set : list of lists of (W', W*) tuples, one list per topic. + """ + s_one_set = [] + + for top_words in topics: + s_one_set_t = [] + for w_prime in top_words: + s_one_set_t.append((w_prime, top_words)) + s_one_set.append(s_one_set_t) + + return s_one_set
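For reviewers who want to exercise the new pipeline end to end, here is a minimal sketch (illustrative only, not part of the diff) that runs the four `u_mass` stages by hand on a toy corpus and compares the result against the high-level `CoherenceModel` API added above; the toy corpus and variable names are assumptions made for this example:

```python
# Minimal sketch: compose the four u_mass pipeline stages by hand and compare
# with the high-level CoherenceModel API introduced in this diff.
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.topic_coherence import (segmentation, probability_estimation,
                                    direct_confirmation_measure, aggregation)

texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system', 'response', 'time'],
         ['graph', 'minors', 'trees']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, iterations=50)

# High-level API: the constructor wires up the four stages for 'u_mass'.
cm = CoherenceModel(model=lda, corpus=corpus, dictionary=dictionary, coherence='u_mass')
print cm.get_coherence()

# The same computation, stage by stage (mirrors CoherenceModel.get_coherence):
topics = cm.topics                                  # top-10 word ids per topic
segmented = segmentation.s_one_pre(topics)          # 1. segmentation
postings, num_docs = probability_estimation.p_boolean_document(corpus, segmented)  # 2. probability estimation
measures = direct_confirmation_measure.log_conditional_probability(segmented, postings, num_docs)  # 3. confirmation measure
print aggregation.arithmetic_mean(measures)         # 4. aggregation; equals cm.get_coherence()
```

The stage-by-stage run should print the same value as `cm.get_coherence()`, which is the point of the four-stage design: each stage is an ordinary function that can be swapped independently.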