From 8f7c9ff4c546f84d42c220dcf28543500747c171 Mon Sep 17 00:00:00 2001 From: Yuri Isakov Date: Mon, 19 Feb 2018 17:34:42 +0300 Subject: [PATCH] Fix docstrings for `gensim.interfaces` (#1913) * docstrings for interfaces.py. draft * typos and fixes * fix interfaces[1] * fix interfaces[2] --- gensim/interfaces.py | 352 +++++++++++++++++++++++++++++++------------ 1 file changed, 253 insertions(+), 99 deletions(-) diff --git a/gensim/interfaces.py b/gensim/interfaces.py index 6cc7e8d872..0261c290f9 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -4,11 +4,12 @@ # Copyright (C) 2010 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -This module contains basic interfaces used throughout the whole gensim package. +"""This module contains implementations of basic interfaces used across the whole gensim package. +These interfaces are usable for building corpus, transformation and similarity classes. + +All interfaces are realized as abstract base classes (i.e. some optional functionality is provided in the interface +itself, so that the interfaces should be inherited). -The interfaces are realized as abstract base classes (ie., some optional functionality -is provided in the interface itself, so that the interfaces can be subclassed). """ from __future__ import with_statement @@ -19,42 +20,82 @@ from six.moves import xrange -logger = logging.getLogger('gensim.interfaces') +logger = logging.getLogger(__name__) class CorpusABC(utils.SaveLoad): - """ - Interface (abstract base class) for corpora. A *corpus* is simply an iterable, - where each iteration step yields one document: - - >>> for doc in corpus: - >>> # do something with the doc... - - A document is a sequence of `(fieldId, fieldValue)` 2-tuples: + """Interface for corpus classes from :mod:`gensim.corpora`. 
- >>> for attr_id, attr_value in doc: - >>> # do something with the attribute + Corpus is simply an iterable object, where each iteration step yields one document: - Note that although a default :func:`len` method is provided, it is very inefficient - (performs a linear scan through the corpus to determine its length). Wherever - the corpus size is needed and known in advance (or at least doesn't change so - that it can be cached), the :func:`len` method should be overridden. - - See the :mod:`gensim.corpora.svmlightcorpus` module for an example of a corpus. + >>> from gensim.corpora import MmCorpus # this is inheritor of CorpusABC class + >>> from gensim.test.utils import datapath + >>> + >>> corpus = MmCorpus(datapath("testcorpus.mm")) + >>> for doc in corpus: + ... pass # do something with the doc... + + A document is represented in bag-of-words (BoW) format, i.e. list of (attr_id, attr_value), + like ``[(1, 0.2), (4, 0.6), ...]``. + + >>> from gensim.corpora import MmCorpus # this is inheritor of CorpusABC class + >>> from gensim.test.utils import datapath + >>> + >>> corpus = MmCorpus(datapath("testcorpus.mm")) + >>> doc = next(iter(corpus)) + >>> print(doc) + [(0, 1.0), (1, 1.0), (2, 1.0)] + + Remember that save/load methods save only the corpus class (not the corpus data itself); + for save/load functionality, please use this pattern: + + >>> from gensim.corpora import MmCorpus # this is inheritor of CorpusABC class + >>> from gensim.test.utils import datapath, get_tmpfile + >>> + >>> corpus = MmCorpus(datapath("testcorpus.mm")) + >>> tmp_path = get_tmpfile("temp_corpus.mm") + >>> + >>> MmCorpus.serialize(tmp_path, corpus) # serialize corpus to disk in MmCorpus format + >>> # MmCorpus.save_corpus(tmp_path, corpus) # this variant is also possible, but if serialize is available - call it. + >>> loaded_corpus = MmCorpus(tmp_path) # load corpus through constructor + >>> for (doc_1, doc_2) in zip(corpus, loaded_corpus): + ... 
assert doc_1 == doc_2 # check that corpora are exactly the same + + + See Also + -------- + :mod:`gensim.corpora` + Corpora in different formats - Saving the corpus with the `save` method (inherited from `utils.SaveLoad`) will - only store the *in-memory* (binary, pickled) object representation=the stream - state, and **not** the documents themselves. See the `save_corpus` static method - for serializing the actual stream content. """ def __iter__(self): - """ - Iterate over the corpus, yielding one document at a time. + """Iterate over corpus, **should be overridden in inheritor class**. + + Raises + ------ + NotImplementedError + Since it's an abstract class, this iterator protocol should be overridden in the inherited class. + + """ raise NotImplementedError('cannot instantiate abstract base class') def save(self, *args, **kwargs): + """Save corpus in-memory state. + + Warnings + -------- + This saves only the "state" of the corpus class (not the corpus data itself), + for saving data please use :meth:`~gensim.interfaces.CorpusABC.save_corpus` instead. + + Parameters + ---------- + *args + Variable length argument list. + **kwargs + Arbitrary keyword arguments. + + """ import warnings warnings.warn( "corpus.save() stores only the (tiny) iteration object; " @@ -63,49 +104,65 @@ def save(self, *args, **kwargs): super(CorpusABC, self).save(*args, **kwargs) def __len__(self): - """ - Return the number of documents in the corpus. + """Get size of the corpus (number of documents), **should be overridden in inheritor class**. + + Raises + ------ + NotImplementedError + Since it's an abstract class, this method should be reimplemented later. - This method is just the least common denominator and should really be - overridden when possible. 
""" raise NotImplementedError("must override __len__() before calling len(corpus)") -# logger.warning("performing full corpus scan to determine its length; was this intended?") -# return sum(1 for doc in self) # sum(empty generator) == 0, so this works even for an empty corpus @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): - """ - Save an existing `corpus` to disk. - - Some formats also support saving the dictionary (`feature_id->word` mapping), - which can in this case be provided by the optional `id2word` parameter. - - >>> MmCorpus.save_corpus('file.mm', corpus) - - Some corpora also support an index of where each document begins, so - that the documents on disk can be accessed in O(1) time (see the - `corpora.IndexedCorpus` base class). In this case, `save_corpus` is automatically - called internally by `serialize`, which does `save_corpus` plus saves the index - at the same time, so you want to store the corpus with:: - - >>> MmCorpus.serialize('file.mm', corpus) # stores index as well, allowing random access to individual documents - - Calling `serialize()` is preferred to calling `save_corpus()`. + """Saves given `corpus` to disk, **should be overridden in inheritor class**. + + Some formats support saving the dictionary (`feature_id -> word` mapping), + which can be provided by the optional `id2word` parameter. + + Notes + ----- + Some corpus also support an index of where each document begins, so that the documents on disk + can be accessed in O(1) time (see the :class:`gensim.corpora.indexedcorpus.IndexedCorpus` base class). + In this case, :meth:`~gensim.interfaces.CorpusABC.save_corpus` is automatically called internally by + :func:`serialize`, which does :meth:`~gensim.interfaces.CorpusABC.save_corpus` plus saves the index + at the same time. + + Calling :func:`serialize() is preferred to calling :meth:`~gensim.interfaces.CorpusABC.save_corpus`. + + Parameters + ---------- + fname : str + Path to output file. 
+ corpus : iterable of list of (int, number) + Corpus in BoW format. + id2word : :class:`~gensim.corpora.Dictionary`, optional + Dictionary of corpus. + metadata : bool, optional + If True, will write some meta-information to `fname` too. """ raise NotImplementedError('cannot instantiate abstract base class') - # example code: - logger.info("converting corpus to ??? format: %s", fname) - with utils.smart_open(fname, 'wb') as fout: - for doc in corpus: # iterate over the document stream - fmt = str(doc) # format the document appropriately... - fout.write(utils.to_utf8("%s\n" % fmt)) # serialize the formatted document to disk - class TransformedCorpus(CorpusABC): + """Interface for corpora that support transformations.""" def __init__(self, obj, corpus, chunksize=None, **kwargs): + """ + + Parameters + ---------- + obj : object + Some corpus class from :mod:`gensim.corpora`. + corpus : iterable of list of (int, number) + Corpus in BoW format. + chunksize : int, optional + If provided - more effective processing (by group of documents) will be performed. + kwargs + Arbitrary keyword arguments. + + """ self.obj, self.corpus, self.chunksize = obj, corpus, chunksize # add the new parameters like per_word_topics to base class object of LdaModel for key, value in kwargs.items(): @@ -113,9 +170,20 @@ def __init__(self, obj, corpus, chunksize=None, **kwargs): self.metadata = False def __len__(self): + """Get size of the corpus.""" return len(self.corpus) def __iter__(self): + """Iterate over the corpus. + + If `chunksize` is set, works in "batch-manner" (more efficient). + + Yields + ------ + list of (int, number) + Document in BoW format + + """ if self.chunksize: for chunk in utils.grouper(self.corpus, self.chunksize): for transformed in self.obj.__getitem__(chunk, chunksize=None): @@ -125,6 +193,24 @@ def __iter__(self): yield self.obj[doc] def __getitem__(self, docno): + """Get document from corpus at index `docno`. 
+ + Parameters + ---------- + docno : int + Index of document in corpus. + + Returns + ------- + list of (int, number) + Document in BoW format + + Raises + ------ + RuntimeError + If corpus doesn't support slicing (:meth:`__getitem__` doesn't exist). + + """ if hasattr(self.corpus, '__getitem__'): return self.obj[self.corpus[docno]] else: @@ -132,73 +218,140 @@ class TransformationABC(utils.SaveLoad): - """ - Interface for transformations. A 'transformation' is any object which accepts - a sparse document via the dictionary notation `[]` and returns another sparse - document in its stead:: - - >>> transformed_doc = transformation[doc] + """Transformation interface. - or also:: + A 'transformation' is any object which accepts a document in BoW format via the `__getitem__` (notation `[]`) + and returns another sparse document in its stead: - >>> transformed_corpus = transformation[corpus] + >>> from gensim.models import LsiModel + >>> from gensim.test.utils import common_dictionary, common_corpus + >>> + >>> model = LsiModel(common_corpus, id2word=common_dictionary) + >>> bow_vector = model[common_corpus[0]] # model applied through __getitem__ on document from corpus. + >>> bow_corpus = model[common_corpus] # also, we can apply model on full corpus - See the :mod:`gensim.models.tfidfmodel` module for an example of a transformation. """ def __getitem__(self, vec): - """ - Transform vector from one vector space into another + """Transform a vector or a whole corpus, **should be overridden in inheritor class**. + + Transforms a vector from one vector space into another **or** whole corpus into another. - **or** + Parameters + ---------- + vec : object + Given vector. + + Raises + ------ + NotImplementedError + Since it's an abstract class, this method should be reimplemented later. - Transform a whole corpus into another. 
""" raise NotImplementedError('cannot instantiate abstract base class') def _apply(self, corpus, chunksize=None, **kwargs): - """ - Apply the transformation to a whole corpus (as opposed to a single document) - and return the result as another corpus. + """Apply the transformation to a whole corpus and get the result as another corpus. + + Parameters + ---------- + corpus : iterable of list of (int, number) + Corpus in BoW format. + chunksize : int, optional + If provided - more effective processing (by group of documents) will performed. + kwargs + Arbitrary keyword arguments. + + Returns + ------- + :class:`~gensim.interfaces.TransformedCorpus` + Transformed corpus. + """ return TransformedCorpus(self, corpus, chunksize, **kwargs) class SimilarityABC(utils.SaveLoad): - """ - Abstract interface for similarity searches over a corpus. + """Interface for similarity search over a corpus. + + In all instances, there is a corpus against which we want to perform the similarity search. + For each similarity search, the input is a document and the output are its similarities + to individual corpus documents. + - In all instances, there is a corpus against which we want to perform the - similarity search. + Examples + -------- + >>> from gensim.similarities import MatrixSimilarity + >>> from gensim.test.utils import common_dictionary, common_corpus + >>> + >>> index = MatrixSimilarity(common_corpus) + >>> similarities = index.get_similarities(common_corpus[1]) # get similarities between query and corpus - For each similarity search, the input is a document and the output are its - similarities to individual corpus documents. + Notes + ----- + There is also a convenience wrapper, where iterating over `self` yields similarities of each document in the corpus + against the whole corpus (i.e. the query is each corpus document in turn). - Similarity queries are realized by calling ``self[query_document]``. 
+ See Also + -------- + :mod:`gensim.similarities` + Provides different types of indexes for search. - There is also a convenience wrapper, where iterating over `self` yields - similarities of each document in the corpus against the whole corpus (ie., - the query is each corpus document in turn). """ def __init__(self, corpus): + """Initialization of object, **should be overridden in inheritor class**. + + Parameters + ---------- + corpus : iterable of list of (int, number) + Corpus in BoW format. + + Raises + ------ + NotImplementedError + Since it's an abstract class, this method should be reimplemented later. + + """ raise NotImplementedError("cannot instantiate Abstract Base Class") def get_similarities(self, doc): - # (Sparse)MatrixSimilarity override this method so that they both use the - # same __getitem__ method, defined below + """Get similarity measures of corpus documents to the given `doc`, **should be overridden in inheritor class**. + + Parameters + ---------- + doc : list of (int, number) + Document in BoW format. + + Raises + ------ + NotImplementedError + Since it's an abstract class, this method should be reimplemented later. + + """ raise NotImplementedError("cannot instantiate Abstract Base Class") def __getitem__(self, query): - """Get similarities of document `query` to all documents in the corpus. + """Get access to similarities of document/corpus `query` to all documents in the corpus. - **or** + Uses :meth:`~gensim.interfaces.SimilarityABC.get_similarities` internally. + + + Notes + ----- + Passing a corpus as `query` (instead of a document) can be more efficient, because it will be processed in a batched way. + + Parameters + ---------- + query : {list of (int, int), iterable of list of (int, int)} + Document or corpus in BoW format. + + Returns + ------- + {`scipy.sparse.csr.csr_matrix`, list of (int, float)} + Similarities of the given document or corpus to all documents in the index, depends on `query`. 
- If `query` is a corpus (iterable of documents), return a matrix of similarities - of all query documents vs. all corpus document. Using this type of batch - query is more efficient than computing the similarities one document after - another. """ is_corpus, query = utils.is_corpus(query) if self.normalize: @@ -206,10 +359,7 @@ def __getitem__(self, query): # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix # as well, but in that case assume tricks are happening and don't normalize # anything (self.normalize has no effect). - if matutils.ismatrix(query): - import warnings # noqa:F401 - # warnings.warn("non-gensim input must already come normalized") - else: + if not matutils.ismatrix(query): if is_corpus: query = [matutils.unitvec(v) for v in query] else: @@ -219,7 +369,7 @@ def __getitem__(self, query): if self.num_best is None: return result - # if maintain_sparity is True, result is scipy sparse. Sort, clip the + # if maintain_sparsity is True, result is scipy sparse. Sort, clip the # topn and return as a scipy sparse matrix. if getattr(self, 'maintain_sparsity', False): return matutils.scipy2scipy_clipped(result, self.num_best) @@ -233,9 +383,13 @@ def __getitem__(self, query): return matutils.full2sparse_clipped(result, self.num_best) def __iter__(self): - """ - For each index document, compute cosine similarity against all other - documents in the index and yield the result. + """Iterate over all documents, computing similarity against all other documents in the index. + + Yields + ------ + {`scipy.sparse.csr.csr_matrix`, list of (int, float)} + Similarity of the current document and all documents of the corpus. + """ # turn off query normalization (vectors in the index are assumed to be already normalized) norm = self.normalize