From 58f91d15d5a6b73fe0feaffe215f9c111b5459a3 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Thu, 7 Mar 2019 16:06:50 +0900 Subject: [PATCH] implement separate functions to load FT embeddings and models (#2376) Introduced two new pure functions to the gensim.models.fasttext module: 1. load_facebook_vectors: loads embeddings from binaries in FB's fastText .bin format 2. load_facebook_model: loads the full model from binaries in FB's fastText .bin format The existing FastText.load_fasttext_format method loads full models only. I've placed a deprecation warning around it. The full_model parameter is gone - it was only introduced in 3.7.1, so it's not too late to just rip it out, IMHO. When releasing 3.7.2, we should include the above in the change log, as it changes the behavior with respect to 3.6.0 Fixes #2372 --- gensim/models/fasttext.py | 230 +++++++++++++++++++++-------------- gensim/test/test_fasttext.py | 44 +++---- 2 files changed, 159 insertions(+), 115 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index f49e893ab7..1764408085 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -171,27 +171,43 @@ .. sourcecode:: pycon >>> cap_path = datapath("crime-and-punishment.bin") - >>> # Partial model: loads quickly, uses less RAM, but cannot continue training - >>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False) - >>> # Full model: loads slowly, consumes RAM, but can continue training (see below) - >>> fb_full = FastText.load_fasttext_format(cap_path, full_model=True) + >>> fb_model = load_facebook_model(cap_path) Once loaded, such models behave identically to those trained from scratch. You may continue training them on new data: .. 
sourcecode:: pycon - >>> 'computer' in fb_full.wv.vocab # New word, currently out of vocab + >>> 'computer' in fb_model.wv.vocab # New word, currently out of vocab False - >>> old_computer = np.copy(fb_full.wv['computer']) # Calculate current vectors - >>> fb_full.build_vocab(new_sentences, update=True) - >>> fb_full.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs) - >>> new_computer = fb_full.wv['computer'] + >>> old_computer = np.copy(fb_model.wv['computer']) # Calculate current vectors + >>> fb_model.build_vocab(new_sentences, update=True) + >>> fb_model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs) + >>> new_computer = fb_model.wv['computer'] >>> np.allclose(old_computer, new_computer, atol=1e-4) # Vector has changed, model has learnt something False - >>> 'computer' in fb_full.wv.vocab # New word is now in the vocabulary + >>> 'computer' in fb_model.wv.vocab # New word is now in the vocabulary True +If you do not intend to continue training the model, consider using the +:func:`gensim.models.fasttext.load_facebook_vectors` function instead. +That function only loads the word embeddings (keyed vectors), consuming much less CPU and RAM: + +.. sourcecode:: pycon + + >>> from gensim.test.utils import datapath + >>> + >>> cap_path = datapath("crime-and-punishment.bin") + >>> wv = load_facebook_vectors(cap_path) + >>> + >>> 'landlord' in wv.vocab # Word is out of vocabulary + False + >>> oov_vector = wv['landlord'] + >>> + >>> 'landlady' in wv.vocab # Word is in the vocabulary + True + >>> iv_vector = wv['landlady'] + Retrieve word-vector for vocab and out-of-vocab word: .. 
sourcecode:: pycon @@ -417,7 +433,7 @@ class FastText(BaseWordEmbeddingsModel): The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save` and :meth:`~gensim.models.fasttext.FastText.load` methods, or loaded from a format compatible with the original - Fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format`. + Fasttext implementation via :func:`~gensim.models.fasttext.load_facebook_model`. Attributes ---------- @@ -885,14 +901,6 @@ def train(self, sentences=None, corpus_file=None, total_examples=None, total_wor >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) """ - cant_train = hasattr(self.trainables, 'syn1neg') and self.trainables.syn1neg is None - if cant_train: - raise ValueError( - 'this model cannot be trained any further, ' - 'if this is a native model, try loading it with ' - 'FastText.load_fasttext_format(path, full_model=True)' - ) - super(FastText, self).train( sentences=sentences, corpus_file=corpus_file, total_examples=total_examples, total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, @@ -943,82 +951,23 @@ def __contains__(self, word): return self.wv.__contains__(word) @classmethod - def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True): - """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files. - - By default, this function loads the full model. A full model allows - continuing training with more data, but also consumes more RAM and - takes longer to load. If you do not need to continue training and only - wish the work with the already-trained embeddings, use `full_model=False` - for faster loading and to save RAM. - - Notes - ------ - Facebook provides both `.vec` and `.bin` files with their modules. - The former contains human-readable vectors. - The latter contains machine-readable vectors along with other model parameters. 
- This function effectively ignores `.vec` output file, since that file is redundant. - It only needs the `.bin` file. - - Parameters - ---------- - model_file : str - Path to the FastText output files. - FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin` - Expected value for this example: `/path/to/model` or `/path/to/model.bin`, - as Gensim requires only `.bin` file to the load entire fastText model. - encoding : str, optional - Specifies the file encoding. - full_model : boolean, optional - If False, skips loading the hidden output matrix. This saves a fair bit - of CPU time and RAM, but **prevents training continuation**. - - Examples - -------- - - Load, infer, continue training: - - .. sourcecode:: pycon - - >>> from gensim.test.utils import datapath - >>> - >>> cap_path = datapath("crime-and-punishment.bin") - >>> fb_full = FastText.load_fasttext_format(cap_path, full_model=True) - >>> - >>> 'landlord' in fb_full.wv.vocab # Word is out of vocabulary - False - >>> oov_term = fb_full.wv['landlord'] - >>> - >>> 'landlady' in fb_full.wv.vocab # Word is in the vocabulary - True - >>> iv_term = fb_full.wv['landlady'] - >>> - >>> new_sent = [['lord', 'of', 'the', 'rings'], ['lord', 'of', 'the', 'flies']] - >>> fb_full.build_vocab(new_sent, update=True) - >>> fb_full.train(sentences=new_sent, total_examples=len(new_sent), epochs=5) - - Load quickly, infer (forego training continuation): - - .. 
sourcecode:: pycon - - >>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False) - >>> - >>> 'landlord' in fb_partial.wv.vocab # Word is out of vocabulary - False - >>> oov_term = fb_partial.wv['landlord'] - >>> - >>> 'landlady' in fb_partial.wv.vocab # Word is in the vocabulary - True - >>> iv_term = fb_partial.wv['landlady'] + @deprecated( + 'use load_facebook_vectors (to use pretrained embeddings) or load_facebook_model ' + '(to continue training with the loaded full model, more RAM) instead' + ) + def load_fasttext_format(cls, model_file, encoding='utf8'): + """Deprecated. - Returns - ------- - gensim.models.fasttext.FastText - The loaded model. + Use :func:`gensim.models.fasttext.load_facebook_model` or + :func:`gensim.models.fasttext.load_facebook_vectors` instead. """ - return _load_fasttext_format(model_file, encoding=encoding, full_model=full_model) + return load_facebook_model(model_file, encoding=encoding) + @deprecated( + 'use load_facebook_vectors (to use pretrained embeddings) or load_facebook_model ' + '(to continue training with the loaded full model, more RAM) instead' + ) def load_binary_data(self, encoding='utf8'): """Load data from a binary file created by Facebook's native FastText. @@ -1229,6 +1178,105 @@ def _pad_ones(m, new_shape): return vstack([m, suffix]) +def load_facebook_model(path, encoding='utf-8'): + """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` output file. + + Notes + ------ + Facebook provides both `.vec` and `.bin` files with their modules. + The former contains human-readable vectors. + The latter contains machine-readable vectors along with other model parameters. + This function effectively ignores `.vec` output file, since that file is redundant. + It only needs the `.bin` file. + + Parameters + ---------- + path : str + Path to the FastText output files. 
+ FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin` + Expected value for this example: `/path/to/model` or `/path/to/model.bin`, + as Gensim requires only `.bin` file to load the entire fastText model. + encoding : str, optional + Specifies the file encoding. + + Examples + -------- + + Load, infer, continue training: + + .. sourcecode:: pycon + + >>> from gensim.test.utils import datapath + >>> + >>> cap_path = datapath("crime-and-punishment.bin") + >>> fb_model = load_facebook_model(cap_path) + >>> + >>> 'landlord' in fb_model.wv.vocab # Word is out of vocabulary + False + >>> oov_term = fb_model.wv['landlord'] + >>> + >>> 'landlady' in fb_model.wv.vocab # Word is in the vocabulary + True + >>> iv_term = fb_model.wv['landlady'] + >>> + >>> new_sent = [['lord', 'of', 'the', 'rings'], ['lord', 'of', 'the', 'flies']] + >>> fb_model.build_vocab(new_sent, update=True) + >>> fb_model.train(sentences=new_sent, total_examples=len(new_sent), epochs=5) + + Returns + ------- + gensim.models.fasttext.FastText + The loaded model. + + """ + return _load_fasttext_format(path, encoding=encoding, full_model=True) + + +def load_facebook_vectors(path, encoding='utf-8'): + """Load word embeddings from a model saved in Facebook's native fasttext `.bin` format. + + Parameters + ---------- + path : str + The location of the model file. + encoding : str, optional + Specifies the file encoding. + + Returns + ------- + gensim.models.keyedvectors.FastTextKeyedVectors + The word embeddings. 
+ + Examples + -------- + + Load and infer: + + >>> from gensim.test.utils import datapath + >>> + >>> cap_path = datapath("crime-and-punishment.bin") + >>> fbkv = load_facebook_vectors(cap_path) + >>> + >>> 'landlord' in fbkv.vocab # Word is out of vocabulary + False + >>> oov_vector = fbkv['landlord'] + >>> + >>> 'landlady' in fbkv.vocab # Word is in the vocabulary + True + >>> iv_vector = fbkv['landlady'] + + See Also + -------- + + :func:`gensim.models.fasttext.load_facebook_model` loads + the full model, not just word embeddings, and enables you to continue + model training. + + """ + model_wrapper = _load_fasttext_format(path, encoding=encoding, full_model=False) + return model_wrapper.wv + + def _load_fasttext_format(model_file, encoding='utf-8', full_model=True): """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files. diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 4a1056a109..0933af8b34 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -23,6 +23,8 @@ import gensim.models._fasttext_bin +import gensim.models.fasttext + try: from pyemd import emd # noqa:F401 PYEMD_EXT = True @@ -59,24 +61,9 @@ def setUp(self): ft_home = os.environ.get('FT_HOME', None) self.ft_path = os.path.join(ft_home, 'fasttext') if ft_home else None self.test_model_file = datapath('lee_fasttext') - self.test_model = FT_gensim.load_fasttext_format(self.test_model_file) + self.test_model = gensim.models.fasttext.load_facebook_model(self.test_model_file) self.test_new_model_file = datapath('lee_fasttext_new') - def test_native_partial_model(self): - """Can we skip loading the NN and still get a working model?""" - model = FT_gensim.load_fasttext_format(self.test_model_file, full_model=False) - - # - # Training continuation should be impossible - # - self.assertIsNone(model.trainables.syn1neg) - self.assertRaises(ValueError, model.train, sentences, - 
total_examples=model.corpus_count, epochs=model.epochs) - - model.wv['green'] - model.wv['foobar'] - model.wv['thisworddoesnotexist'] - def test_training(self): model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1) model.build_vocab(sentences) @@ -205,7 +192,7 @@ def model_sanity(self, model): def test_load_fasttext_format(self): try: - model = FT_gensim.load_fasttext_format(self.test_model_file) + model = gensim.models.fasttext.load_facebook_model(self.test_model_file) except Exception as exc: self.fail('Unable to load FastText model from file %s: %s' % (self.test_model_file, exc)) vocab_size, model_size = 1762, 10 @@ -258,7 +245,7 @@ def test_load_fasttext_format(self): def test_load_fasttext_new_format(self): try: - new_model = FT_gensim.load_fasttext_format(self.test_new_model_file) + new_model = gensim.models.fasttext.load_facebook_model(self.test_new_model_file) except Exception as exc: self.fail('Unable to load FastText model from file %s: %s' % (self.test_new_model_file, exc)) vocab_size, model_size = 1763, 10 @@ -311,10 +298,10 @@ def test_load_fasttext_new_format(self): def test_load_model_supervised(self): with self.assertRaises(NotImplementedError): - FT_gensim.load_fasttext_format(datapath('pang_lee_polarity_fasttext')) + gensim.models.fasttext.load_facebook_model(datapath('pang_lee_polarity_fasttext')) def test_load_model_with_non_ascii_vocab(self): - model = FT_gensim.load_fasttext_format(datapath('non_ascii_fasttext')) + model = gensim.models.fasttext.load_facebook_model(datapath('non_ascii_fasttext')) self.assertTrue(u'který' in model.wv) try: model.wv[u'který'] @@ -322,7 +309,7 @@ def test_load_model_with_non_ascii_vocab(self): self.fail('Unable to access vector for utf8 encoded non-ascii word') def test_load_model_non_utf8_encoding(self): - model = FT_gensim.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852') + model = gensim.models.fasttext.load_facebook_model(datapath('cp852_fasttext'), encoding='cp852') 
self.assertTrue(u'který' in model.wv) try: model.wv[u'který'] @@ -894,7 +881,7 @@ def load_native(): # ./fasttext cbow -input toy-data.txt -output toy-model -bucket 100 -dim 5 # path = datapath('toy-model.bin') - model = FT_gensim.load_fasttext_format(path) + model = gensim.models.fasttext.load_facebook_model(path) return model @@ -1118,11 +1105,20 @@ def test_save_load_native(self): model.save(model_name) def test_load_native_pretrained(self): - model = FT_gensim.load_fasttext_format(datapath('toy-model-pretrained.bin')) + model = gensim.models.fasttext.load_facebook_model(datapath('toy-model-pretrained.bin')) actual = model['monarchist'] expected = np.array([0.76222, 1.0669, 0.7055, -0.090969, -0.53508]) self.assertTrue(np.allclose(expected, actual, atol=10e-4)) + def test_load_native_vectors(self): + cap_path = datapath("crime-and-punishment.bin") + fbkv = gensim.models.fasttext.load_facebook_vectors(cap_path) + self.assertFalse('landlord' in fbkv.vocab) + self.assertTrue('landlady' in fbkv.vocab) + oov_vector = fbkv['landlord'] + iv_vector = fbkv['landlady'] + self.assertFalse(np.allclose(oov_vector, iv_vector)) + def _train_model_with_pretrained_vectors(): """Generate toy-model-pretrained.bin for use in test_load_native_pretrained. @@ -1173,7 +1169,7 @@ def setUp(self): # # ./fasttext skipgram -minCount 0 -bucket 100 -input crime-and-punishment.txt -output crime-and-punishment -dim 5 # noqa: E501 # - self.model = FT_gensim.load_fasttext_format(datapath('crime-and-punishment.bin')) + self.model = gensim.models.fasttext.load_facebook_model(datapath('crime-and-punishment.bin')) with smart_open.smart_open(datapath('crime-and-punishment.vec'), 'r', encoding='utf-8') as fin: self.expected = dict(load_vec(fin))