From 860d34e5a60861423d08b47e6b8b6eb9d672c4dd Mon Sep 17 00:00:00 2001 From: mark-todd Date: Fri, 26 Nov 2021 13:40:19 +0000 Subject: [PATCH 01/11] Added new ValueError in place of assertion error for no model data provided in lsi model Added warning to lsi model for initialising a model with no data --- gensim/models/lsimodel.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 6a407e860e..7969f4d254 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -70,7 +70,7 @@ from gensim import interfaces, matutils, utils from gensim.models import basemodel - +import warnings logger = logging.getLogger(__name__) # accuracy defaults for the multi-pass stochastic algo @@ -489,7 +489,8 @@ def add_documents(self, corpus, chunksize=None, decay=None): chunksize = self.chunksize if decay is None: decay = self.decay - + if corpus == []: + warnings.warn('LsiModel.add_documents() called but no documents provided, is this intended?') if not scipy.sparse.issparse(corpus): if not self.onepass: # we are allowed multiple passes over the input => use a faster, randomized two-pass algo @@ -590,7 +591,8 @@ def __getitem__(self, bow, scaled=False, chunksize=512): Latent representation of corpus in BoW format if `bow` is corpus. """ - assert self.projection.u is not None, "decomposition not initialized yet" + if self.projection.u is None: + raise ValueError('No training data provided - LSI model not initialized yet') # if the input vector is in fact a corpus, return a transformed corpus as a result is_corpus, bow = utils.is_corpus(bow) From b1e4fa3299f2fd1c0786ff5cd28098d222fb3450 Mon Sep 17 00:00:00 2001 From: mark-todd <60781787+mark-todd@users.noreply.github.com> Date: Mon, 21 Feb 2022 12:16:15 +0000 Subject: [PATCH 02/11] Update lsimodel.py --- gensim/models/lsimodel.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 7969f4d254..107101fd0a 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -70,7 +70,6 @@ from gensim import interfaces, matutils, utils from gensim.models import basemodel -import warnings logger = logging.getLogger(__name__) # accuracy defaults for the multi-pass stochastic algo @@ -489,8 +488,8 @@ def add_documents(self, corpus, chunksize=None, decay=None): chunksize = self.chunksize if decay is None: decay = self.decay - if corpus == []: - warnings.warn('LsiModel.add_documents() called but no documents provided, is this intended?') + if not corpus: + logger.warning('LsiModel.add_documents() called but no documents provided, is this intended?') if not scipy.sparse.issparse(corpus): if not self.onepass: # we are allowed multiple passes over the input => use a faster, randomized two-pass algo From 5ff6b3f4605d021fc010782adefa8292c0235a29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Mon, 21 Feb 2022 16:59:12 +0100 Subject: [PATCH 03/11] Update gensim/models/lsimodel.py --- gensim/models/lsimodel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 107101fd0a..7f154022dd 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -70,6 +70,7 @@ from gensim import interfaces, matutils, utils from gensim.models import basemodel + logger = logging.getLogger(__name__) # accuracy defaults for the multi-pass stochastic algo From ba9878d0d3396000b0462e946029b97e3747edac Mon Sep 17 00:00:00 2001 From: mark-todd Date: Tue, 22 Feb 2022 11:38:14 +0000 Subject: [PATCH 04/11] Added better empty corpus testing --- gensim/models/lsimodel.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 7f154022dd..76402a0e9e 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -62,6 +62,7 @@ import logging import sys import time +import types import numpy as np import scipy.linalg @@ -482,6 +483,20 @@ def add_documents(self, corpus, chunksize=None, decay=None): If the distributed mode is on, each chunk is sent to a different worker/computer. """ + def is_empty(corpus): + """Is the corpus (an iterable or a scipy.sparse array) empty?""" + if scipy.sparse.issparse(corpus): + return corpus.shape[1] == 0 # by convention, scipy.sparse documents are columns + if isinstance(corpus, types.GeneratorType): + return False # don't try to guess emptiness of generators, may lose elements irretrievably + try: + first_doc = next(iter(corpus)) # list, numpy array etc + return False # first document exists => not empty + except StopIteration: + return True + except Exception: + return False + logger.info("updating model with new documents") # get computation parameters; if not specified, use the ones from constructor @@ -489,7 +504,7 @@ def add_documents(self, corpus, chunksize=None, decay=None): chunksize = self.chunksize if decay is None: decay = self.decay - if not corpus: + if is_empty(corpus): logger.warning('LsiModel.add_documents() called but no documents provided, is this intended?') if not scipy.sparse.issparse(corpus): if not self.onepass: From 0e553126fa08bc0e6b08ca22bcec1af981675ce2 Mon Sep 17 00:00:00 2001 From: mark-todd Date: Fri, 25 Feb 2022 18:06:08 +0000 Subject: [PATCH 05/11] Moved is_empty function to utils --- gensim/models/lsimodel.py | 15 +-------------- gensim/utils.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 76402a0e9e..8027418ecc 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -71,7 +71,7 @@ from gensim import interfaces, matutils, utils from gensim.models import basemodel - +from gensim.utils import is_empty logger = logging.getLogger(__name__) # accuracy defaults for the multi-pass stochastic algo @@ -483,19 +483,6 @@ def add_documents(self, corpus, chunksize=None, decay=None): If the distributed mode is on, each chunk is sent to a different worker/computer. """ - def is_empty(corpus): - """Is the corpus (an iterable or a scipy.sparse array) empty?""" - if scipy.sparse.issparse(corpus): - return corpus.shape[1] == 0 # by convention, scipy.sparse documents are columns - if isinstance(corpus, types.GeneratorType): - return False # don't try to guess emptiness of generators, may lose elements irretrievably - try: - first_doc = next(iter(corpus)) # list, numpy array etc - return False # first document exists => not empty - except StopIteration: - return True - except Exception: - return False logger.info("updating model with new documents") diff --git a/gensim/utils.py b/gensim/utils.py index d4fc6a71dc..d1fd6478bf 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -2084,3 +2084,17 @@ def effective_n_jobs(n_jobs): elif n_jobs < 0: n_jobs = max(multiprocessing.cpu_count() + 1 + n_jobs, 1) return n_jobs + +def is_empty(corpus): + """Is the corpus (an iterable or a scipy.sparse array) empty?""" + if scipy.sparse.issparse(corpus): + return corpus.shape[1] == 0 # by convention, scipy.sparse documents are columns + if isinstance(corpus, types.GeneratorType): + return False # don't try to guess emptiness of generators, may lose elements irretrievably + try: + first_doc = next(iter(corpus)) # list, numpy array etc + return False # first document exists => not empty + except StopIteration: + return True + except Exception: + return False From d9811b90b980b6162db4ae0c9a1b7c08795591d6 Mon Sep 17 00:00:00 2001 From: mark-todd <60781787+mark-todd@users.noreply.github.com> Date: Fri, 25 Feb 2022 19:20:34 +0000 Subject: [PATCH 06/11] Update gensim/models/lsimodel.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added space Co-authored-by: Radim Řehůřek --- gensim/models/lsimodel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 8027418ecc..3265f5116a 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -483,7 +483,6 @@ def add_documents(self, corpus, chunksize=None, decay=None): If the distributed mode is on, each chunk is sent to a different worker/computer. """ - logger.info("updating model with new documents") # get computation parameters; if not specified, use the ones from constructor From dc75e9398e213459fecb97638fef8941f50b1c9b Mon Sep 17 00:00:00 2001 From: mark-todd <60781787+mark-todd@users.noreply.github.com> Date: Fri, 25 Feb 2022 19:20:48 +0000 Subject: [PATCH 07/11] Update gensim/models/lsimodel.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added import space Co-authored-by: Radim Řehůřek --- gensim/models/lsimodel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 3265f5116a..e9601b33b3 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -72,6 +72,7 @@ from gensim import interfaces, matutils, utils from gensim.models import basemodel from gensim.utils import is_empty + logger = logging.getLogger(__name__) # accuracy defaults for the multi-pass stochastic algo From da6d67d2e3566241a66e5fea5e6922a2f6fb80c3 Mon Sep 17 00:00:00 2001 From: mark-todd <60781787+mark-todd@users.noreply.github.com> Date: Fri, 25 Feb 2022 19:21:35 +0000 Subject: [PATCH 08/11] Update gensim/utils.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added space after False Co-authored-by: Radim Řehůřek --- gensim/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/utils.py b/gensim/utils.py index d1fd6478bf..6c658d4870 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -2093,7 +2093,7 @@ def is_empty(corpus): return False # don't try to guess emptiness of generators, may lose elements irretrievably try: first_doc = next(iter(corpus)) # list, numpy array etc - return False # first document exists => not empty + return False # first document exists => not empty except StopIteration: return True except Exception: From 163bd8015657ee20ff06d6df4207e388042712d4 Mon Sep 17 00:00:00 2001 From: mark-todd Date: Fri, 25 Feb 2022 19:28:11 +0000 Subject: [PATCH 09/11] Moved import --- gensim/models/lsimodel.py | 1 - gensim/utils.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index e9601b33b3..8f8c9c511a 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -62,7 +62,6 @@ import logging import sys import time -import types import numpy as np import scipy.linalg diff --git a/gensim/utils.py b/gensim/utils.py index 6c658d4870..3533d3575b 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -30,6 +30,7 @@ from copy import deepcopy from datetime import datetime import platform +import types import numpy as np import scipy.sparse From 4cbd22717b0f8f6b2090609905c0a4f087dfac2e Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 18 Mar 2022 23:16:46 +0900 Subject: [PATCH 10/11] Update utils.py --- gensim/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/utils.py b/gensim/utils.py index 3533d3575b..f200d1547c 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -2093,7 +2093,8 @@ def is_empty(corpus): if isinstance(corpus, types.GeneratorType): return False # don't try to guess emptiness of generators, may lose elements irretrievably try: - first_doc = next(iter(corpus)) # list, numpy array etc + # list, numpy array etc + first_doc = next(iter(corpus)) # noqa: F841 (ignore unused variable) return False # first document exists => not empty except StopIteration: return True From 9741a81a440f81ed9b17ec0ac963866d516ad45d Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Tue, 22 Mar 2022 11:31:32 +0900 Subject: [PATCH 11/11] fix flake8 problem --- gensim/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/utils.py b/gensim/utils.py index f200d1547c..78d64b88e6 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -2086,6 +2086,7 @@ def effective_n_jobs(n_jobs): n_jobs = max(multiprocessing.cpu_count() + 1 + n_jobs, 1) return n_jobs + def is_empty(corpus): """Is the corpus (an iterable or a scipy.sparse array) empty?""" if scipy.sparse.issparse(corpus):