Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Add topic coherence pipeline to gensim #710

Merged
merged 18 commits into from
Jun 22, 2016
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
532 changes: 532 additions & 0 deletions docs/notebooks/u_mass_tutorial.ipynb

Large diffs are not rendered by default.

30 changes: 30 additions & 0 deletions gensim/aggregation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
This module contains functions to perform aggregation on a list of values
obtained from the confirmation measure.
"""

import logging
import numpy as np

logger = logging.getLogger(__name__)

def arithmetic_mean(confirmed_measures):
    """
    This function performs the arithmetic mean aggregation on the output obtained from
    the confirmation measure module.

    Args:
    ----
    confirmed_measures : list of calculated confirmation measure on each set in the segmented topics.

    Returns:
    -------
    mean : Arithmetic mean of all the values contained in confirmation measures.
           Returned as a numpy scalar; NaN if the input list is empty.
    """
    return np.mean(confirmed_measures)
101 changes: 101 additions & 0 deletions gensim/direct_confirmation_measure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
This module contains functions to compute direct confirmation on a pair of words or word subsets.
"""

import logging
import numpy as np

logger = logging.getLogger(__name__)

EPSILON = 1e-12  # Should be small. Value as suggested in paper.

def log_conditional_probability(segmented_topics, per_topic_postings, num_docs):
    """
    Calculate the log-conditional-probability measure, which is used by
    coherence measures such as U_mass.

    This is defined as: m_lc(S_i) = log[(P(W', W*) + e) / P(W*)]

    Args:
    ----
    segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples.
    per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics.
    num_docs : Total number of documents in corresponding corpus.

    Returns:
    -------
    m_lc : List of log conditional probability measure on each set in segmented topics.
    """
    total_docs = float(num_docs)
    m_lc = []
    for segment in segmented_topics:
        for w_prime, w_star in segment:
            star_docs = per_topic_postings[w_star]
            co_docs = per_topic_postings[w_prime] & star_docs
            joint_prob = len(co_docs) / total_docs
            star_prob = len(star_docs) / total_docs
            m_lc.append(np.log((joint_prob + EPSILON) / star_prob))
    return m_lc

def log_ratio_measure(segmented_topics, per_topic_postings, num_docs):
    """
    Calculate the log-ratio-measure, which is used by coherence measures
    such as c_v.

    This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))]

    Args:
    ----
    segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples.
    per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics.
    num_docs : Total number of documents in corpus. Used for calculating probability.

    Returns:
    -------
    m_lr : List of log ratio measures on each set in segmented topics.
    """
    total_docs = float(num_docs)
    m_lr = []
    for segment in segmented_topics:
        for w_prime, w_star in segment:
            prime_docs = per_topic_postings[w_prime]
            star_docs = per_topic_postings[w_star]
            joint_prob = len(prime_docs & star_docs) / total_docs
            marginal_product = (len(prime_docs) / total_docs) * (len(star_docs) / total_docs)
            m_lr.append(np.log((joint_prob + EPSILON) / marginal_product))
    return m_lr

def normalized_log_ratio_measure(segmented_topics, per_topic_postings, num_docs):
    """
    Calculate the normalized-log-ratio-measure, popularly known as
    NPMI, which is used by coherence measures such as c_v.

    This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e]

    Args:
    ----
    segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples.
    per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics.
    num_docs : Total number of documents in corpus. Used for calculating probability.

    Returns:
    -------
    m_nlr : List of normalized log ratio (NPMI) measures on each set in segmented topics.
    """
    m_nlr = []
    for s_i in segmented_topics:
        for w_prime, w_star in s_i:
            w_prime_docs = per_topic_postings[w_prime]
            w_star_docs = per_topic_postings[w_star]
            co_docs = w_prime_docs.intersection(w_star_docs)
            co_doc_prob = len(co_docs) / float(num_docs)
            # Log-ratio measure computed inline; previously this delegated a
            # one-pair segmentation to log_ratio_measure, which redundantly
            # recomputed the same intersection and probabilities.
            numerator = np.log(
                (co_doc_prob + EPSILON) /
                ((len(w_prime_docs) / float(num_docs)) * (len(w_star_docs) / float(num_docs))))
            # Divide by -log(P(W', W*) + e) per the NPMI definition (and this
            # function's own docstring); the previous code omitted the minus
            # sign, flipping the sign of every returned value.
            m_nlr.append(numerator / -np.log(co_doc_prob + EPSILON))
    return m_nlr
65 changes: 65 additions & 0 deletions gensim/indirect_confirmation_measure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
This module contains functions to compute confirmation on a pair of words or word subsets.
"""

import logging
import numpy as np

from gensim import direct_confirmation_measure
from gensim.matutils import cossim

logger = logging.getLogger(__name__)

def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_docs):
"""
Internal helper function to return context vectors for segmentations.
"""
context_vectors = {}
if isinstance(w_prime, np.ndarray):
for w_j in w:
for w_i in w_prime:
if (w_i, w_j) not in backtrack:
backtrack[(w_i, w_j)] = measure([[(w_i, w_j)]], per_topic_postings, num_docs)[0]
if w_j not in context_vectors:
context_vectors[w_j] = backtrack[(w_i, w_j)] ** gamma
else:
context_vectors[w_j] += backtrack[(w_i, w_j)] ** gamma
else:
for w_j in w:
if (w_prime, w_j) not in backtrack:
backtrack[(w_prime, w_j)] = measure([[(w_prime, w_j)]], per_topic_postings, num_docs)[0]
context_vectors[w_j] = backtrack[(w_prime, w_j)] ** gamma
return (context_vectors, backtrack)

def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gamma, num_docs):  # FIXME : Write documentation for arguments.
    """
    Calculate the indirect cosine measure.

    Given context vectors u = V(W') and w = V(W*) for the word sets of a
    pair S_i = (W', W*), the indirect cosine measure is computed as the
    cosine similarity between u and w.

    Args:
    ----
    measure : String. Supported values are "nlr" (normalized log ratio).
    """
    if measure == 'nlr':
        measure = direct_confirmation_measure.normalized_log_ratio_measure
    backtrack = {}  # memoizes per-pair direct confirmation values across segments
    s_cos_sim = []
    for top_words, segment in zip(topics, segmented_topics):
        for w_prime, w_star in segment:
            u, memo = _make_seg(w_prime, top_words, per_topic_postings, measure, gamma, backtrack, num_docs)
            backtrack.update(memo)
            v, memo = _make_seg(w_star, top_words, per_topic_postings, measure, gamma, backtrack, num_docs)
            backtrack.update(memo)
            s_cos_sim.append(cossim(u.items(), v.items()))
    return s_cos_sim
1 change: 1 addition & 0 deletions gensim/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""

# bring model classes directly into package namespace, to save some typing
from .coherencemodel import CoherenceModel
from .hdpmodel import HdpModel
from .ldamodel import LdaModel
from .lsimodel import LsiModel
Expand Down
121 changes: 121 additions & 0 deletions gensim/models/coherencemodel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Module for calculating topic coherence in python. This is the implementation of
the four stage topic coherence pipeline from http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf.
The four stage pipeline is basically:

Segmentation -> Probability Estimation -> Confirmation Measure -> Aggregation.

Implementation of this pipeline allows for the user to in essence "make" a
coherence measure of his/her choice by choosing a method in each of the pipelines.
"""

import logging

from gensim import interfaces
from gensim import (segmentation, probability_estimation,
direct_confirmation_measure, indirect_confirmation_measure,
aggregation)
from gensim.corpora import Dictionary
from gensim.matutils import argsort

logger = logging.getLogger(__name__)


class CoherenceModel(interfaces.TransformationABC):
    """
    Objects of this class allow for building and maintaining a model for topic
    coherence.

    The main methods are:

    1. constructor, which initializes the four stage pipeline by accepting a coherence measure,
    2. the ``get_coherence()`` method, which returns the topic coherence.

    Model persistency is achieved via its load/save methods.
    """
    def __init__(self, model, texts=None, corpus=None, dictionary=None, coherence='u_mass'):
        """
        Args:
        ----
        model : Pre-trained topic model.
        texts : Tokenized texts; required for the 'c_v' measure, and usable
                instead of ``corpus`` for 'u_mass'.
        corpus : Corpus in bag-of-words format; usable for 'u_mass'.
        dictionary : Mapping of id -> word. If not provided, it is taken from
                     ``model.id2word`` (when corpus is given) or built from ``texts``.
        coherence : Coherence measure to be used. Supported values: 'u_mass', 'c_v'.

        Raises:
        ------
        ValueError : if neither texts nor corpus is given, if the inputs do not
                     match the chosen measure, or if the measure is unsupported.
        """
        if texts is None and corpus is None:
            raise ValueError("One of texts or corpus has to be provided.")
        if coherence == 'u_mass':
            if corpus is not None:
                if dictionary is None:
                    # NOTE(review): id2word[0] == 0 looks like a probe for a
                    # placeholder id2word mapping (no real dictionary attached
                    # to the model) -- confirm against the topic model classes.
                    if model.id2word[0] == 0:
                        raise ValueError("The associated dictionary should be provided with the corpus or 'id2word' for topic model "
                                         "should be set as the dictionary.")
                    else:
                        self.dictionary = model.id2word
                else:
                    self.dictionary = dictionary
                self.corpus = corpus
            elif texts is not None:
                self.texts = texts
                if dictionary is None:
                    self.dictionary = Dictionary(self.texts)
                else:
                    self.dictionary = dictionary
                self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
            else:
                # Unreachable in practice (guarded by the texts/corpus check
                # above); kept as a defensive fallback.
                raise ValueError("Either 'corpus' with 'dictionary' or 'texts' should be provided for %s coherence." % coherence)

        elif coherence == 'c_v':
            if texts is None:
                raise ValueError("'texts' should be provided for %s coherence." % coherence)
            else:
                self.texts = texts
                self.dictionary = Dictionary(self.texts)
                self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]

        else:
            raise ValueError("%s coherence is not currently supported." % coherence)

        self.model = model
        self.topics = self._get_topics()
        self.coherence = coherence
        # Set pipeline parameters (segmentation -> probability estimation ->
        # confirmation measure -> aggregation) for the chosen measure.
        if self.coherence == 'u_mass':
            self.seg = segmentation.s_one_pre
            self.prob = probability_estimation.p_boolean_document
            self.conf = direct_confirmation_measure.log_conditional_probability
            self.aggr = aggregation.arithmetic_mean

        elif self.coherence == 'c_v':
            self.seg = segmentation.s_one_set
            self.prob = probability_estimation.p_boolean_sliding_window
            self.conf = indirect_confirmation_measure.cosine_similarity
            self.aggr = aggregation.arithmetic_mean

    def __str__(self):
        return "CoherenceModel(segmentation=%s, probability estimation=%s, confirmation measure=%s, aggregation=%s)" % (
            self.seg, self.prob, self.conf, self.aggr)

    def _get_topics(self):
        """Internal helper function to return topics from a trained topic model."""
        topics = []  # FIXME : Meant to work for LDAModel right now. Make it work for others.
        for topic in self.model.state.get_lambda():
            bestn = argsort(topic, topn=10, reverse=True)
            topics.append(bestn)
        return topics

    def get_coherence(self):
        """Run the four stage pipeline and return the aggregated coherence value."""
        if self.coherence == 'u_mass':
            segmented_topics = self.seg(self.topics)
            per_topic_postings, num_docs = self.prob(self.corpus, segmented_topics)
            confirmed_measures = self.conf(segmented_topics, per_topic_postings, num_docs)
            return self.aggr(confirmed_measures)

        elif self.coherence == 'c_v':
            segmented_topics = self.seg(self.topics)
            per_topic_postings, num_windows = self.prob(texts=self.texts, segmented_topics=segmented_topics,
                                                        dictionary=self.dictionary, window_size=2)  # FIXME : Change window size to 110 finally.
            confirmed_measures = self.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows)
            return self.aggr(confirmed_measures)
Loading