Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added new ValueError in place of assertion error for no model data provided in lsi model #3271

Merged
merged 12 commits into from
Mar 22, 2022
7 changes: 5 additions & 2 deletions gensim/models/lsimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@

from gensim import interfaces, matutils, utils
from gensim.models import basemodel
from gensim.utils import is_empty

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -489,7 +490,8 @@ def add_documents(self, corpus, chunksize=None, decay=None):
chunksize = self.chunksize
if decay is None:
decay = self.decay

if is_empty(corpus):
logger.warning('LsiModel.add_documents() called but no documents provided, is this intended?')
if not scipy.sparse.issparse(corpus):
if not self.onepass:
# we are allowed multiple passes over the input => use a faster, randomized two-pass algo
Expand Down Expand Up @@ -590,7 +592,8 @@ def __getitem__(self, bow, scaled=False, chunksize=512):
Latent representation of corpus in BoW format if `bow` is corpus.

"""
assert self.projection.u is not None, "decomposition not initialized yet"
if self.projection.u is None:
raise ValueError('No training data provided - LSI model not initialized yet')

# if the input vector is in fact a corpus, return a transformed corpus as a result
is_corpus, bow = utils.is_corpus(bow)
Expand Down
17 changes: 17 additions & 0 deletions gensim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from copy import deepcopy
from datetime import datetime
import platform
import types

import numpy as np
import scipy.sparse
Expand Down Expand Up @@ -2084,3 +2085,19 @@ def effective_n_jobs(n_jobs):
elif n_jobs < 0:
n_jobs = max(multiprocessing.cpu_count() + 1 + n_jobs, 1)
return n_jobs


def is_empty(corpus):
    """Is the corpus (an iterable or a scipy.sparse array) empty?

    The check never consumes elements from one-shot iterables: for generators
    and any other object that is its own iterator, emptiness cannot be probed
    without irretrievably losing the first element, so such inputs are
    reported as non-empty.

    Parameters
    ----------
    corpus : iterable or scipy.sparse matrix
        Collection of documents. For scipy.sparse input, documents are columns.

    Returns
    -------
    bool
        True only if `corpus` is known to contain no documents. False if it
        contains at least one document, or if emptiness cannot be determined
        (one-shot iterators, non-iterable objects, iteration errors).

    """
    if scipy.sparse.issparse(corpus):
        return corpus.shape[1] == 0  # by convention, scipy.sparse documents are columns
    if isinstance(corpus, types.GeneratorType):
        return False  # don't try to guess emptiness of generators, may lose elements irretrievably
    try:
        doc_iter = iter(corpus)
        if doc_iter is corpus:
            # `corpus` is its own iterator (e.g. `iter(list)`, `map`, `zip`, an open file):
            # calling next() would consume its first document for good, same as with generators.
            return False
        # re-iterable container (list, numpy array etc): peeking at a fresh iterator is safe
        next(doc_iter)
    except StopIteration:
        return True
    except Exception:
        # Deliberate best-effort: this is only an advisory check, so a non-iterable or
        # failing corpus is reported as "not empty" and left for the caller to trip over.
        return False
    return False  # first document exists => not empty