Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Add topic coherence pipeline to gensim #710

Merged
merged 18 commits into from
Jun 22, 2016
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
532 changes: 532 additions & 0 deletions docs/notebooks/u_mass_tutorial.ipynb

Large diffs are not rendered by default.

30 changes: 30 additions & 0 deletions gensim/aggregation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
This module contains functions to perform aggregation on a list of values
obtained from the confirmation measure.
"""

import logging
import numpy as np

logger = logging.getLogger(__name__)

def arithmetic_mean(confirmed_measures):
    """
    This function performs the arithmetic mean aggregation on the output obtained from
    the confirmation measure module.

    Args:
    ----
    confirmed_measures : list of calculated confirmation measure on each set in the segmented topics.

    Returns:
    -------
    mean : Arithmetic mean of all the values contained in confirmation measures.
           Returned as a numpy scalar; NaN if the input list is empty.
    """
    return np.mean(confirmed_measures)
101 changes: 101 additions & 0 deletions gensim/direct_confirmation_measure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
This module contains functions to compute direct confirmation on a pair of words or word subsets.
"""

import logging
import numpy as np

logger = logging.getLogger(__name__)

EPSILON = 1e-12  # Should be small. Value as suggested in paper.

def log_conditional_probability(segmented_topics, per_topic_postings, num_docs):
    """
    Calculate the log-conditional-probability measure, which is used by
    coherence measures such as U_mass.

    This is defined as: m_lc(S_i) = log[(P(W', W*) + e) / P(W*)]

    Args:
    ----
    segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples.
    per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics.
    num_docs : Total number of documents in corresponding corpus.

    Returns:
    -------
    m_lc : List of log conditional probability measure on each set in segmented topics.
    """
    total_docs = float(num_docs)
    m_lc = []
    for segment in segmented_topics:
        for w_prime, w_star in segment:
            star_docs = per_topic_postings[w_star]
            co_docs = per_topic_postings[w_prime] & star_docs
            joint_prob = len(co_docs) / total_docs
            star_prob = len(star_docs) / total_docs
            m_lc.append(np.log((joint_prob + EPSILON) / star_prob))
    return m_lc

def log_ratio_measure(segmented_topics, per_topic_postings, num_docs):
    """
    Calculate the log-ratio-measure, which is used by coherence measures
    such as c_v.

    This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))]

    Args:
    ----
    segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples.
    per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics.
    num_docs : Total number of documents in corpus. Used for calculating probability.

    Returns:
    -------
    m_lr : List of log ratio measures on each set in segmented topics.
    """
    total_docs = float(num_docs)
    m_lr = []
    for segment in segmented_topics:
        for w_prime, w_star in segment:
            prime_docs = per_topic_postings[w_prime]
            star_docs = per_topic_postings[w_star]
            joint_prob = len(prime_docs & star_docs) / total_docs
            marginal_product = (len(prime_docs) / total_docs) * (len(star_docs) / total_docs)
            m_lr.append(np.log((joint_prob + EPSILON) / marginal_product))
    return m_lr

def normalized_log_ratio_measure(segmented_topics, per_topic_postings, num_docs):
    """
    Calculate the normalized-log-ratio-measure, popularly known as
    NPMI, which is used by coherence measures such as c_v.

    This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e]

    Args:
    ----
    segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples.
    per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics.
    num_docs : Total number of documents in corpus. Used for calculating probability.

    Returns:
    -------
    m_nlr : List of normalized log ratio (NPMI) measures on each set in segmented topics.
    """
    m_nlr = []
    for s_i in segmented_topics:
        for w_prime, w_star in s_i:
            w_prime_docs = per_topic_postings[w_prime]
            w_star_docs = per_topic_postings[w_star]
            co_docs = w_prime_docs.intersection(w_star_docs)
            co_doc_prob = len(co_docs) / float(num_docs)
            # Log-ratio measure computed inline; previously this delegated a
            # one-pair segmentation to log_ratio_measure, which redundantly
            # recomputed the same intersection and probabilities.
            numerator = np.log(
                (co_doc_prob + EPSILON) /
                ((len(w_prime_docs) / float(num_docs)) * (len(w_star_docs) / float(num_docs))))
            # Divide by -log(P(W', W*) + e) per the NPMI definition (and this
            # function's own docstring); the previous code omitted the minus
            # sign, flipping the sign of every returned value.
            m_nlr.append(numerator / -np.log(co_doc_prob + EPSILON))
    return m_nlr
65 changes: 65 additions & 0 deletions gensim/indirect_confirmation_measure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
This module contains functions to compute confirmation on a pair of words or word subsets.
"""

import logging
import numpy as np

from gensim import direct_confirmation_measure
from gensim.matutils import cossim

logger = logging.getLogger(__name__)

def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_docs):
"""
Internal helper function to return context vectors for segmentations.
"""
context_vectors = {}
if isinstance(w_prime, np.ndarray):
for w_j in w:
for w_i in w_prime:
if (w_i, w_j) not in backtrack:
backtrack[(w_i, w_j)] = measure([[(w_i, w_j)]], per_topic_postings, num_docs)[0]
if w_j not in context_vectors:
context_vectors[w_j] = backtrack[(w_i, w_j)] ** gamma
else:
context_vectors[w_j] += backtrack[(w_i, w_j)] ** gamma
else:
for w_j in w:
if (w_prime, w_j) not in backtrack:
backtrack[(w_prime, w_j)] = measure([[(w_prime, w_j)]], per_topic_postings, num_docs)[0]
context_vectors[w_j] = backtrack[(w_prime, w_j)] ** gamma
return (context_vectors, backtrack)

def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gamma, num_docs):  # FIXME : Write documentation for arguments.
    """
    Calculate the indirect cosine measure.

    Given context vectors u = V(W') and w = V(W*) for the word sets of a
    pair S_i = (W', W*), the indirect cosine measure is computed as the
    cosine similarity between u and w.

    Args:
    ----
    measure : String. Supported values are "nlr" (normalized log ratio).
    """
    if measure == 'nlr':
        measure = direct_confirmation_measure.normalized_log_ratio_measure
    backtrack = {}  # memoizes per-pair direct confirmation values across segments
    s_cos_sim = []
    for top_words, segment in zip(topics, segmented_topics):
        for w_prime, w_star in segment:
            u, memo = _make_seg(w_prime, top_words, per_topic_postings, measure, gamma, backtrack, num_docs)
            backtrack.update(memo)
            v, memo = _make_seg(w_star, top_words, per_topic_postings, measure, gamma, backtrack, num_docs)
            backtrack.update(memo)
            s_cos_sim.append(cossim(u.items(), v.items()))
    return s_cos_sim
1 change: 1 addition & 0 deletions gensim/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""

# bring model classes directly into package namespace, to save some typing
from .coherencemodel import CoherenceModel
from .hdpmodel import HdpModel
from .ldamodel import LdaModel
from .lsimodel import LsiModel
Expand Down
121 changes: 121 additions & 0 deletions gensim/models/coherencemodel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Module for calculating topic coherence in python. This is the implementation of
the four stage topic coherence pipeline from http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf.
The four stage pipeline is basically:

Segmentation -> Probability Estimation -> Confirmation Measure -> Aggregation.

Implementation of this pipeline allows for the user to in essence "make" a
coherence measure of his/her choice by choosing a method in each of the pipelines.
"""

import logging

from gensim import interfaces
from gensim import (segmentation, probability_estimation,
direct_confirmation_measure, indirect_confirmation_measure,
aggregation)
from gensim.corpora import Dictionary
from gensim.matutils import argsort

logger = logging.getLogger(__name__)


class CoherenceModel(interfaces.TransformationABC):
    """
    Objects of this class allow for building and maintaining a model for topic
    coherence.

    The main methods are:

    1. constructor, which initializes the four stage pipeline by accepting a coherence measure,
    2. the ``get_coherence()`` method, which returns the topic coherence.

    Model persistency is achieved via its load/save methods.
    """
    def __init__(self, model, texts=None, corpus=None, dictionary=None, coherence='u_mass'):
        """
        Args:
        ----
        model : Pre-trained topic model.
        texts : Tokenized texts; required for the 'c_v' measure, and usable
                instead of ``corpus`` for 'u_mass'.
        corpus : Corpus in bag-of-words format; usable for 'u_mass'.
        dictionary : Mapping of id -> word. If not provided, it is taken from
                     ``model.id2word`` (when corpus is given) or built from ``texts``.
        coherence : Coherence measure to be used. Supported values: 'u_mass', 'c_v'.

        Raises:
        ------
        ValueError : if neither texts nor corpus is given, if the inputs do not
                     match the chosen measure, or if the measure is unsupported.
        """
        if texts is None and corpus is None:
            raise ValueError("One of texts or corpus has to be provided.")
        if coherence == 'u_mass':
            if corpus is not None:
                if dictionary is None:
                    # NOTE(review): id2word[0] == 0 looks like a probe for a
                    # placeholder id2word mapping (no real dictionary attached
                    # to the model) -- confirm against the topic model classes.
                    if model.id2word[0] == 0:
                        raise ValueError("The associated dictionary should be provided with the corpus or 'id2word' for topic model "
                                         "should be set as the dictionary.")
                    else:
                        self.dictionary = model.id2word
                else:
                    self.dictionary = dictionary
                self.corpus = corpus
            elif texts is not None:
                self.texts = texts
                if dictionary is None:
                    self.dictionary = Dictionary(self.texts)
                else:
                    self.dictionary = dictionary
                self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
            else:
                # Unreachable in practice (guarded by the texts/corpus check
                # above); kept as a defensive fallback.
                raise ValueError("Either 'corpus' with 'dictionary' or 'texts' should be provided for %s coherence." % coherence)

        elif coherence == 'c_v':
            if texts is None:
                raise ValueError("'texts' should be provided for %s coherence." % coherence)
            else:
                self.texts = texts
                self.dictionary = Dictionary(self.texts)
                self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]

        else:
            raise ValueError("%s coherence is not currently supported." % coherence)

        self.model = model
        self.topics = self._get_topics()
        self.coherence = coherence
        # Set pipeline parameters (segmentation -> probability estimation ->
        # confirmation measure -> aggregation) for the chosen measure.
        if self.coherence == 'u_mass':
            self.seg = segmentation.s_one_pre
            self.prob = probability_estimation.p_boolean_document
            self.conf = direct_confirmation_measure.log_conditional_probability
            self.aggr = aggregation.arithmetic_mean

        elif self.coherence == 'c_v':
            self.seg = segmentation.s_one_set
            self.prob = probability_estimation.p_boolean_sliding_window
            self.conf = indirect_confirmation_measure.cosine_similarity
            self.aggr = aggregation.arithmetic_mean

    def __str__(self):
        return "CoherenceModel(segmentation=%s, probability estimation=%s, confirmation measure=%s, aggregation=%s)" % (
            self.seg, self.prob, self.conf, self.aggr)

    def _get_topics(self):
        """Internal helper function to return topics from a trained topic model."""
        topics = []  # FIXME : Meant to work for LDAModel right now. Make it work for others.
        for topic in self.model.state.get_lambda():
            bestn = argsort(topic, topn=10, reverse=True)
            topics.append(bestn)
        return topics

    def get_coherence(self):
        """Run the four stage pipeline and return the aggregated coherence value."""
        if self.coherence == 'u_mass':
            segmented_topics = self.seg(self.topics)
            per_topic_postings, num_docs = self.prob(self.corpus, segmented_topics)
            confirmed_measures = self.conf(segmented_topics, per_topic_postings, num_docs)
            return self.aggr(confirmed_measures)

        elif self.coherence == 'c_v':
            segmented_topics = self.seg(self.topics)
            per_topic_postings, num_windows = self.prob(texts=self.texts, segmented_topics=segmented_topics,
                                                        dictionary=self.dictionary, window_size=2)  # FIXME : Change window size to 110 finally.
            confirmed_measures = self.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows)
            return self.aggr(confirmed_measures)
Loading