You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Python version 3.7.5
gensim version 3.6.0
apache-beam[gcp] 2.20.0
tensorflow==1.14
Problem description
Trying to create tf records using gensim Doc2Vec.
Expected result is to create tf records with the given parameters.
With the DirectRunner, TFRecord creation works when gensim 3.6.0 is used,
but an AttributeError is raised when it is run with gensim 3.8.0 (AttributeError: 'Doc2VecTrainables' object has no attribute 'vectors_lockf').
When running as a Dataflow job, the AttributeError is raised even with gensim 3.6.0.
Steps/code/corpus to reproduce
pretrained_emb = 'glove.6B.100d.txt'
vector_size = 300
window_size = 15
min_count = 1
sampling_threshold = 1e-5
negative_size = 5
train_epoch = 100
dm = 0 #0 = dbow; 1 = dmpv
worker_count = 1 #number of parallel processes
print('max_seq_len which is being passed above Doc2Vec', self.max_seq_len)
self.model = g.Doc2Vec(documents=None,size=vector_size,
window=window_size, min_count=min_count,
sample=sampling_threshold,
workers=worker_count, hs=0,
dm=dm, negative=negative_size,
dbow_words=1, dm_concat=1,
pretrained_emb=pretrained_emb,
iter=100)
print("Loaded Model")
The type of `plot` is string.
embedding_vector = self.model.infer_vector([plot])
An AttributeError is raised when this is run with the Dataflow runner. With the DirectRunner, the error is raised when the gensim version is 3.8.0.
Error log:
I have pasted the entire error log.
textPayload: "Error message from worker: Traceback (most recent call last):
File "apache_beam/runners/common.py", line 950, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 547, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1078, in apache_beam.runners.common._OutputProcessor.process_outputs
File "tfrecord_util/csv2tfrecord_train_valid.py", line 310, in process
x = self.preprocess(x)
File "tfrecord_util/csv2tfrecord_train_valid.py", line 233, in preprocess
embedding_vector = self._embedding(plot)
File "tfrecord_util/csv2tfrecord_train_valid.py", line 300, in _embedding
embedding_vector = self.model.infer_vector([plot])
File "/usr/local/lib/python3.7/site-packages/gensim/models/doc2vec.py", line 915, in infer_vector
learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
File "gensim/models/doc2vec_inner.pyx", line 332, in gensim.models.doc2vec_inner.train_document_dbow
File "gensim/models/doc2vec_inner.pyx", line 254, in gensim.models.doc2vec_inner.init_d2v_config
AttributeError: 'Doc2VecTrainables' object has no attribute 'vectors_lockf'
I hope you understand the issue from the above details. Please let me know if you still need any additional information.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/dataflow_worker/batchworker.py", line 647, in do_work
work_executor.execute()
File "/usr/local/lib/python3.7/site-packages/dataflow_worker/executor.py", line 176, in execute
op.start()
File "dataflow_worker/native_operations.py", line 38, in dataflow_worker.native_operations.NativeReadOperation.start
File "dataflow_worker/native_operations.py", line 39, in dataflow_worker.native_operations.NativeReadOperation.start
File "dataflow_worker/native_operations.py", line 44, in dataflow_worker.native_operations.NativeReadOperation.start
File "dataflow_worker/native_operations.py", line 54, in dataflow_worker.native_operations.NativeReadOperation.start
File "apache_beam/runners/worker/operations.py", line 329, in apache_beam.runners.worker.operations.Operation.output
File "apache_beam/runners/worker/operations.py", line 192, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 682, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 683, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 952, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1013, in apache_beam.runners.common.DoFnRunner._reraise_augmented
File "apache_beam/runners/common.py", line 950, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 547, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1105, in apache_beam.runners.common._OutputProcessor.process_outputs
File "apache_beam/runners/worker/operations.py", line 192, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 682, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 683, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 952, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1028, in apache_beam.runners.common.DoFnRunner._reraise_augmented
File "/usr/local/lib/python3.7/site-packages/future/utils/__init__.py", line 421, in raise_with_traceback
raise exc.with_traceback(traceback)
File "apache_beam/runners/common.py", line 950, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 547, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1078, in apache_beam.runners.common._OutputProcessor.process_outputs
File "tfrecord_util/csv2tfrecord_train_valid.py", line 310, in process
x = self.preprocess(x)
File "tfrecord_util/csv2tfrecord_train_valid.py", line 233, in preprocess
embedding_vector = self._embedding(plot)
File "tfrecord_util/csv2tfrecord_train_valid.py", line 300, in _embedding
embedding_vector = self.model.infer_vector([plot])
File "/usr/local/lib/python3.7/site-packages/gensim/models/doc2vec.py", line 915, in infer_vector
learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
File "gensim/models/doc2vec_inner.pyx", line 332, in gensim.models.doc2vec_inner.train_document_dbow
File "gensim/models/doc2vec_inner.pyx", line 254, in gensim.models.doc2vec_inner.init_d2v_config
AttributeError: 'Doc2VecTrainables' object has no attribute 'vectors_lockf' [while running 'PreprocessData']
The text was updated successfully, but these errors were encountered:
Gensim's Doc2Vec does not support a `pretrained_emb` parameter. I'm surprised that it didn't cause an error, but it may just be a no-op.
You haven't supplied a corpus as either documents (in the instantiation) or via subsequent calls to build_vocab() & train(). So you have an incompletely-initialized model that's never been trained. Many things won't work with such an untrained model.
So: not a bug. (And more generally: there's no supported way to turn a set of word-vectors into a working Doc2Vec model. You need to train a Doc2Vec model with texts.)
Python version 3.7.5
gensim version 3.6.0
apache-beam[gcp] 2.20.0
tensorflow==1.14
Problem description
Trying to create tf records using gensim Doc2Vec.
Expected result is to create tf records with the given parameters.
With the DirectRunner, TFRecord creation works when gensim 3.6.0 is used,
but an AttributeError is raised when it is run with gensim 3.8.0 (AttributeError: 'Doc2VecTrainables' object has no attribute 'vectors_lockf').
When running as a Dataflow job, the AttributeError is raised even with gensim 3.6.0.
Steps/code/corpus to reproduce
pretrained_emb = 'glove.6B.100d.txt'
vector_size = 300
window_size = 15
min_count = 1
sampling_threshold = 1e-5
negative_size = 5
train_epoch = 100
dm = 0 #0 = dbow; 1 = dmpv
worker_count = 1 #number of parallel processes
print('max_seq_len which is being passed above Doc2Vec', self.max_seq_len)
self.model = g.Doc2Vec(documents=None,size=vector_size,
window=window_size, min_count=min_count,
sample=sampling_threshold,
workers=worker_count, hs=0,
dm=dm, negative=negative_size,
dbow_words=1, dm_concat=1,
pretrained_emb=pretrained_emb,
iter=100)
print("Loaded Model")
The type of `plot` is string.
embedding_vector = self.model.infer_vector([plot])
An AttributeError is raised when this is run with the Dataflow runner. With the DirectRunner, the error is raised when the gensim version is 3.8.0.
Error log:
I have pasted the entire error log.
textPayload: "Error message from worker: Traceback (most recent call last):
File "apache_beam/runners/common.py", line 950, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 547, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1078, in apache_beam.runners.common._OutputProcessor.process_outputs
File "tfrecord_util/csv2tfrecord_train_valid.py", line 310, in process
x = self.preprocess(x)
File "tfrecord_util/csv2tfrecord_train_valid.py", line 233, in preprocess
embedding_vector = self._embedding(plot)
File "tfrecord_util/csv2tfrecord_train_valid.py", line 300, in _embedding
embedding_vector = self.model.infer_vector([plot])
File "/usr/local/lib/python3.7/site-packages/gensim/models/doc2vec.py", line 915, in infer_vector
learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
File "gensim/models/doc2vec_inner.pyx", line 332, in gensim.models.doc2vec_inner.train_document_dbow
File "gensim/models/doc2vec_inner.pyx", line 254, in gensim.models.doc2vec_inner.init_d2v_config
AttributeError: 'Doc2VecTrainables' object has no attribute 'vectors_lockf'
I hope you understand the issue from the above details. Please let me know if you still need any additional information.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/dataflow_worker/batchworker.py", line 647, in do_work
work_executor.execute()
File "/usr/local/lib/python3.7/site-packages/dataflow_worker/executor.py", line 176, in execute
op.start()
File "dataflow_worker/native_operations.py", line 38, in dataflow_worker.native_operations.NativeReadOperation.start
File "dataflow_worker/native_operations.py", line 39, in dataflow_worker.native_operations.NativeReadOperation.start
File "dataflow_worker/native_operations.py", line 44, in dataflow_worker.native_operations.NativeReadOperation.start
File "dataflow_worker/native_operations.py", line 54, in dataflow_worker.native_operations.NativeReadOperation.start
File "apache_beam/runners/worker/operations.py", line 329, in apache_beam.runners.worker.operations.Operation.output
File "apache_beam/runners/worker/operations.py", line 192, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 682, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 683, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 952, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1013, in apache_beam.runners.common.DoFnRunner._reraise_augmented
File "apache_beam/runners/common.py", line 950, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 547, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1105, in apache_beam.runners.common._OutputProcessor.process_outputs
File "apache_beam/runners/worker/operations.py", line 192, in apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 682, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/worker/operations.py", line 683, in apache_beam.runners.worker.operations.DoOperation.process
File "apache_beam/runners/common.py", line 952, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 1028, in apache_beam.runners.common.DoFnRunner._reraise_augmented
File "/usr/local/lib/python3.7/site-packages/future/utils/__init__.py", line 421, in raise_with_traceback
raise exc.with_traceback(traceback)
File "apache_beam/runners/common.py", line 950, in apache_beam.runners.common.DoFnRunner.process
File "apache_beam/runners/common.py", line 547, in apache_beam.runners.common.SimpleInvoker.invoke_process
File "apache_beam/runners/common.py", line 1078, in apache_beam.runners.common._OutputProcessor.process_outputs
File "tfrecord_util/csv2tfrecord_train_valid.py", line 310, in process
x = self.preprocess(x)
File "tfrecord_util/csv2tfrecord_train_valid.py", line 233, in preprocess
embedding_vector = self._embedding(plot)
File "tfrecord_util/csv2tfrecord_train_valid.py", line 300, in _embedding
embedding_vector = self.model.infer_vector([plot])
File "/usr/local/lib/python3.7/site-packages/gensim/models/doc2vec.py", line 915, in infer_vector
learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
File "gensim/models/doc2vec_inner.pyx", line 332, in gensim.models.doc2vec_inner.train_document_dbow
File "gensim/models/doc2vec_inner.pyx", line 254, in gensim.models.doc2vec_inner.init_d2v_config
AttributeError: 'Doc2VecTrainables' object has no attribute 'vectors_lockf' [while running 'PreprocessData']
The text was updated successfully, but these errors were encountered: