From b8dce4f64cd577ba21b634041c8c4c90f1a907df Mon Sep 17 00:00:00 2001
From: Hiyorimi
Date: Thu, 10 Oct 2019 03:51:28 +0300
Subject: [PATCH 1/2] Handling for iterables without 0-th element, fixes #2556

---
Reviewer note: this patch was edited by hand after export. In
gensim/test/test_d2vmodel.py the test method was renamed so that unittest
collects it (method names must start with "test"), `__iter__` was added to
the helper iterator, and trailing whitespace was removed from a docstring.
The blob id on that file's `index` line predates these edits.

 gensim/sklearn_api/d2vmodel.py |  2 +-
 gensim/test/test_d2vmodel.py   | 62 ++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 1 deletion(-)
 create mode 100644 gensim/test/test_d2vmodel.py

diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py
index e967c84eb1..ac261037e0 100644
--- a/gensim/sklearn_api/d2vmodel.py
+++ b/gensim/sklearn_api/d2vmodel.py
@@ -159,7 +159,7 @@ def fit(self, X, y=None):
             The trained model.
 
         """
-        if isinstance(X[0], doc2vec.TaggedDocument):
+        if isinstance(list(X)[0], doc2vec.TaggedDocument):
             d2v_sentences = X
         else:
             d2v_sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(X)]
diff --git a/gensim/test/test_d2vmodel.py b/gensim/test/test_d2vmodel.py
new file mode 100644
index 0000000000..44d3d1612c
--- /dev/null
+++ b/gensim/test/test_d2vmodel.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2010 Radim Rehurek
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+Automated tests for checking D2VTransformer class.
+"""
+
+import unittest
+import logging
+from gensim.sklearn_api import D2VTransformer
+from gensim.test.utils import common_texts
+
+
+class IteratorForIterable:
+    """Iterator capable of folding into list."""
+    def __init__(self, iterable):
+        self._data = iterable
+        self._index = 0
+
+    def __iter__(self):
+        # The iterator protocol requires iterators to return themselves here.
+        return self
+
+    def __next__(self):
+        if len(self._data) > self._index:
+            result = self._data[self._index]
+            self._index += 1
+            return result
+        raise StopIteration
+
+
+class IterableWithoutZeroElement:
+    """
+    Iterable, emulating pandas.Series behaviour without 0-th element.
+    Equivalent to calling `series.index += 1`.
+    """
+    def __init__(self, data):
+        self.data = data
+
+    def __getitem__(self, key):
+        if key == 0:
+            raise KeyError("Emulation of absence of item with key 0.")
+        return self.data[key]
+
+    def __iter__(self):
+        return IteratorForIterable(self.data)
+
+
+class TestD2VTransformer(unittest.TestCase):
+    def test_works_with_iterable_not_having_element_with_zero_index(self):
+        """Fitting must not fail when `X[0]` raises KeyError."""
+        a = IterableWithoutZeroElement(common_texts)
+        transformer = D2VTransformer(min_count=1, size=5)
+        transformer.fit(a)
+
+
+if __name__ == '__main__':
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
+    unittest.main()

From f8bc1ae1c49b55b16c79d592149e27f76d44481b Mon Sep 17 00:00:00 2001
From: Hiyorimi
Date: Thu, 10 Oct 2019 04:01:26 +0300
Subject: [PATCH 2/2] Improved accessing the first element for the case of big
 datasets

---
 gensim/sklearn_api/d2vmodel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py
index ac261037e0..fa154a2497 100644
--- a/gensim/sklearn_api/d2vmodel.py
+++ b/gensim/sklearn_api/d2vmodel.py
@@ -159,7 +159,7 @@ def fit(self, X, y=None):
             The trained model.
 
         """
-        if isinstance(list(X)[0], doc2vec.TaggedDocument):
+        if isinstance([i for i in X[:1]][0], doc2vec.TaggedDocument):
             d2v_sentences = X
         else:
             d2v_sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(X)]