Skip to content

Commit

Permalink
implement separate functions to load FT embeddings and models (#2376)
Browse files Browse the repository at this point in the history
Introduced two new pure functions to the gensim.models.fasttext module:

1. load_facebook_vectors: loads embeddings from binaries in FB's fastText .bin format
2. load_facebook_model: loads the full model from binaries in FB's fastText .bin format

The existing FastText.load_fasttext_format method loads full models only. I've placed a deprecation warning around it. The full_model parameter is gone - it was only introduced in 3.7.1, so it's not too late to just rip it out, IMHO.

When releasing 3.7.2, we should include the above in the change log, as it changes the behavior with respect to 3.6.0.

Fixes #2372
  • Loading branch information
mpenkov authored Mar 7, 2019
1 parent 92bc7b6 commit 58f91d1
Show file tree
Hide file tree
Showing 2 changed files with 159 additions and 115 deletions.
230 changes: 139 additions & 91 deletions gensim/models/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,27 +171,43 @@
.. sourcecode:: pycon
>>> cap_path = datapath("crime-and-punishment.bin")
>>> # Partial model: loads quickly, uses less RAM, but cannot continue training
>>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False)
>>> # Full model: loads slowly, consumes RAM, but can continue training (see below)
>>> fb_full = FastText.load_fasttext_format(cap_path, full_model=True)
>>> fb_model = load_facebook_model(cap_path)
Once loaded, such models behave identically to those trained from scratch.
You may continue training them on new data:
.. sourcecode:: pycon
>>> 'computer' in fb_full.wv.vocab # New word, currently out of vocab
>>> 'computer' in fb_model.wv.vocab # New word, currently out of vocab
False
>>> old_computer = np.copy(fb_full.wv['computer']) # Calculate current vectors
>>> fb_full.build_vocab(new_sentences, update=True)
>>> fb_full.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)
>>> new_computer = fb_full.wv['computer']
>>> old_computer = np.copy(fb_model.wv['computer']) # Calculate current vectors
>>> fb_model.build_vocab(new_sentences, update=True)
>>> fb_model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)
>>> new_computer = fb_model.wv['computer']
>>> np.allclose(old_computer, new_computer, atol=1e-4) # Vector has changed, model has learnt something
False
>>> 'computer' in fb_full.wv.vocab # New word is now in the vocabulary
>>> 'computer' in fb_model.wv.vocab # New word is now in the vocabulary
True
If you do not intend to continue training the model, consider using the
:func:`gensim.models.fasttext.load_facebook_vectors` function instead.
That function only loads the word embeddings (keyed vectors), consuming much less CPU and RAM:
.. sourcecode:: pycon
>>> from gensim.test.utils import datapath
>>>
>>> cap_path = datapath("crime-and-punishment.bin")
>>> wv = load_facebook_vectors(cap_path)
>>>
>>> 'landlord' in wv.vocab # Word is out of vocabulary
False
>>> oov_vector = wv['landlord']
>>>
>>> 'landlady' in wv.vocab # Word is in the vocabulary
True
>>> iv_vector = wv['landlady']
Retrieve word-vector for vocab and out-of-vocab word:
.. sourcecode:: pycon
Expand Down Expand Up @@ -417,7 +433,7 @@ class FastText(BaseWordEmbeddingsModel):
The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save` and
:meth:`~gensim.models.fasttext.FastText.load` methods, or loaded from a format compatible with the original
Fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format`.
Fasttext implementation via :func:`~gensim.models.fasttext.load_facebook_model`.
Attributes
----------
Expand Down Expand Up @@ -885,14 +901,6 @@ def train(self, sentences=None, corpus_file=None, total_examples=None, total_wor
>>> model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
"""
cant_train = hasattr(self.trainables, 'syn1neg') and self.trainables.syn1neg is None
if cant_train:
raise ValueError(
'this model cannot be trained any further, '
'if this is a native model, try loading it with '
'FastText.load_fasttext_format(path, full_model=True)'
)

super(FastText, self).train(
sentences=sentences, corpus_file=corpus_file, total_examples=total_examples, total_words=total_words,
epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
Expand Down Expand Up @@ -943,82 +951,23 @@ def __contains__(self, word):
return self.wv.__contains__(word)

@classmethod
def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True):
"""Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files.
By default, this function loads the full model. A full model allows
continuing training with more data, but also consumes more RAM and
takes longer to load. If you do not need to continue training and only
wish the work with the already-trained embeddings, use `full_model=False`
for faster loading and to save RAM.
Notes
------
Facebook provides both `.vec` and `.bin` files with their modules.
The former contains human-readable vectors.
The latter contains machine-readable vectors along with other model parameters.
This function effectively ignores `.vec` output file, since that file is redundant.
It only needs the `.bin` file.
Parameters
----------
model_file : str
Path to the FastText output files.
FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin`
Expected value for this example: `/path/to/model` or `/path/to/model.bin`,
as Gensim requires only `.bin` file to the load entire fastText model.
encoding : str, optional
Specifies the file encoding.
full_model : boolean, optional
If False, skips loading the hidden output matrix. This saves a fair bit
of CPU time and RAM, but **prevents training continuation**.
Examples
--------
Load, infer, continue training:
.. sourcecode:: pycon
>>> from gensim.test.utils import datapath
>>>
>>> cap_path = datapath("crime-and-punishment.bin")
>>> fb_full = FastText.load_fasttext_format(cap_path, full_model=True)
>>>
>>> 'landlord' in fb_full.wv.vocab # Word is out of vocabulary
False
>>> oov_term = fb_full.wv['landlord']
>>>
>>> 'landlady' in fb_full.wv.vocab # Word is in the vocabulary
True
>>> iv_term = fb_full.wv['landlady']
>>>
>>> new_sent = [['lord', 'of', 'the', 'rings'], ['lord', 'of', 'the', 'flies']]
>>> fb_full.build_vocab(new_sent, update=True)
>>> fb_full.train(sentences=new_sent, total_examples=len(new_sent), epochs=5)
Load quickly, infer (forego training continuation):
.. sourcecode:: pycon
>>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False)
>>>
>>> 'landlord' in fb_partial.wv.vocab # Word is out of vocabulary
False
>>> oov_term = fb_partial.wv['landlord']
>>>
>>> 'landlady' in fb_partial.wv.vocab # Word is in the vocabulary
True
>>> iv_term = fb_partial.wv['landlady']
@deprecated(
'use load_facebook_vectors (to use pretrained embeddings) or load_facebook_model '
'(to continue training with the loaded full model, more RAM) instead'
)
def load_fasttext_format(cls, model_file, encoding='utf8'):
"""Deprecated.
Returns
-------
gensim.models.fasttext.FastText
The loaded model.
Use :func:`gensim.models.fasttext.load_facebook_model` or
:func:`gensim.models.fasttext.load_facebook_vectors` instead.
"""
return _load_fasttext_format(model_file, encoding=encoding, full_model=full_model)
return load_facebook_model(model_file, encoding=encoding)

@deprecated(
'use load_facebook_vectors (to use pretrained embeddings) or load_facebook_model '
'(to continue training with the loaded full model, more RAM) instead'
)
def load_binary_data(self, encoding='utf8'):
"""Load data from a binary file created by Facebook's native FastText.
Expand Down Expand Up @@ -1229,6 +1178,105 @@ def _pad_ones(m, new_shape):
return vstack([m, suffix])


def load_facebook_model(path, encoding='utf-8'):
"""Load the input-hidden weight matrix from Facebook's native fasttext `.bin` output file.
Notes
------
Facebook provides both `.vec` and `.bin` files with their models.
The former contains human-readable vectors.
The latter contains machine-readable vectors along with other model parameters.
This function effectively ignores `.vec` output file, since that file is redundant.
It only needs the `.bin` file.
Parameters
----------
path : str
Path to the FastText output files.
FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin`
Expected value for this example: `/path/to/model` or `/path/to/model.bin`,
as Gensim requires only the `.bin` file to load the entire fastText model.
encoding : str, optional
Specifies the file encoding.
Examples
--------
Load, infer, continue training:
.. sourcecode:: pycon
>>> from gensim.test.utils import datapath
>>>
>>> cap_path = datapath("crime-and-punishment.bin")
>>> fb_model = load_facebook_model(cap_path)
>>>
>>> 'landlord' in fb_model.wv.vocab # Word is out of vocabulary
False
>>> oov_term = fb_model.wv['landlord']
>>>
>>> 'landlady' in fb_model.wv.vocab # Word is in the vocabulary
True
>>> iv_term = fb_model.wv['landlady']
>>>
>>> new_sent = [['lord', 'of', 'the', 'rings'], ['lord', 'of', 'the', 'flies']]
>>> fb_model.build_vocab(new_sent, update=True)
>>> fb_model.train(sentences=new_sent, total_examples=len(new_sent), epochs=5)
Returns
-------
gensim.models.fasttext.FastText
The loaded model.
"""
return _load_fasttext_format(path, encoding=encoding, full_model=True)


def load_facebook_vectors(path, encoding='utf-8'):
"""Load word embeddings from a model saved in Facebook's native fasttext `.bin` format.
Parameters
----------
path : str
The location of the model file.
encoding : str, optional
Specifies the file encoding.
Returns
-------
gensim.models.keyedvectors.FastTextKeyedVectors
The word embeddings.
Examples
--------
Load and infer:
>>> from gensim.test.utils import datapath
>>>
>>> cap_path = datapath("crime-and-punishment.bin")
>>> fbkv = load_facebook_vectors(cap_path)
>>>
>>> 'landlord' in fbkv.vocab # Word is out of vocabulary
False
>>> oov_vector = fbkv['landlord']
>>>
>>> 'landlady' in fbkv.vocab # Word is in the vocabulary
True
>>> iv_vector = fbkv['landlady']
See Also
--------
:func:`gensim.models.fasttext.load_facebook_model` loads
the full model, not just word embeddings, and enables you to continue
model training.
"""
model_wrapper = _load_fasttext_format(path, encoding=encoding, full_model=False)
return model_wrapper.wv


def _load_fasttext_format(model_file, encoding='utf-8', full_model=True):
"""Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files.
Expand Down
44 changes: 20 additions & 24 deletions gensim/test/test_fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import gensim.models._fasttext_bin


import gensim.models.fasttext

try:
from pyemd import emd # noqa:F401
PYEMD_EXT = True
Expand Down Expand Up @@ -59,24 +61,9 @@ def setUp(self):
ft_home = os.environ.get('FT_HOME', None)
self.ft_path = os.path.join(ft_home, 'fasttext') if ft_home else None
self.test_model_file = datapath('lee_fasttext')
self.test_model = FT_gensim.load_fasttext_format(self.test_model_file)
self.test_model = gensim.models.fasttext.load_facebook_model(self.test_model_file)
self.test_new_model_file = datapath('lee_fasttext_new')

def test_native_partial_model(self):
"""Can we skip loading the NN and still get a working model?"""
model = FT_gensim.load_fasttext_format(self.test_model_file, full_model=False)

#
# Training continuation should be impossible
#
self.assertIsNone(model.trainables.syn1neg)
self.assertRaises(ValueError, model.train, sentences,
total_examples=model.corpus_count, epochs=model.epochs)

model.wv['green']
model.wv['foobar']
model.wv['thisworddoesnotexist']

def test_training(self):
model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
model.build_vocab(sentences)
Expand Down Expand Up @@ -205,7 +192,7 @@ def model_sanity(self, model):

def test_load_fasttext_format(self):
try:
model = FT_gensim.load_fasttext_format(self.test_model_file)
model = gensim.models.fasttext.load_facebook_model(self.test_model_file)
except Exception as exc:
self.fail('Unable to load FastText model from file %s: %s' % (self.test_model_file, exc))
vocab_size, model_size = 1762, 10
Expand Down Expand Up @@ -258,7 +245,7 @@ def test_load_fasttext_format(self):

def test_load_fasttext_new_format(self):
try:
new_model = FT_gensim.load_fasttext_format(self.test_new_model_file)
new_model = gensim.models.fasttext.load_facebook_model(self.test_new_model_file)
except Exception as exc:
self.fail('Unable to load FastText model from file %s: %s' % (self.test_new_model_file, exc))
vocab_size, model_size = 1763, 10
Expand Down Expand Up @@ -311,18 +298,18 @@ def test_load_fasttext_new_format(self):

def test_load_model_supervised(self):
with self.assertRaises(NotImplementedError):
FT_gensim.load_fasttext_format(datapath('pang_lee_polarity_fasttext'))
gensim.models.fasttext.load_facebook_model(datapath('pang_lee_polarity_fasttext'))

def test_load_model_with_non_ascii_vocab(self):
model = FT_gensim.load_fasttext_format(datapath('non_ascii_fasttext'))
model = gensim.models.fasttext.load_facebook_model(datapath('non_ascii_fasttext'))
self.assertTrue(u'který' in model.wv)
try:
model.wv[u'který']
except UnicodeDecodeError:
self.fail('Unable to access vector for utf8 encoded non-ascii word')

def test_load_model_non_utf8_encoding(self):
model = FT_gensim.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852')
model = gensim.models.fasttext.load_facebook_model(datapath('cp852_fasttext'), encoding='cp852')
self.assertTrue(u'který' in model.wv)
try:
model.wv[u'který']
Expand Down Expand Up @@ -894,7 +881,7 @@ def load_native():
# ./fasttext cbow -input toy-data.txt -output toy-model -bucket 100 -dim 5
#
path = datapath('toy-model.bin')
model = FT_gensim.load_fasttext_format(path)
model = gensim.models.fasttext.load_facebook_model(path)
return model


Expand Down Expand Up @@ -1118,11 +1105,20 @@ def test_save_load_native(self):
model.save(model_name)

def test_load_native_pretrained(self):
model = FT_gensim.load_fasttext_format(datapath('toy-model-pretrained.bin'))
model = gensim.models.fasttext.load_facebook_model(datapath('toy-model-pretrained.bin'))
actual = model['monarchist']
expected = np.array([0.76222, 1.0669, 0.7055, -0.090969, -0.53508])
self.assertTrue(np.allclose(expected, actual, atol=10e-4))

def test_load_native_vectors(self):
cap_path = datapath("crime-and-punishment.bin")
fbkv = gensim.models.fasttext.load_facebook_vectors(cap_path)
self.assertFalse('landlord' in fbkv.vocab)
self.assertTrue('landlady' in fbkv.vocab)
oov_vector = fbkv['landlord']
iv_vector = fbkv['landlady']
self.assertFalse(np.allclose(oov_vector, iv_vector))


def _train_model_with_pretrained_vectors():
"""Generate toy-model-pretrained.bin for use in test_load_native_pretrained.
Expand Down Expand Up @@ -1173,7 +1169,7 @@ def setUp(self):
#
# ./fasttext skipgram -minCount 0 -bucket 100 -input crime-and-punishment.txt -output crime-and-punishment -dim 5 # noqa: E501
#
self.model = FT_gensim.load_fasttext_format(datapath('crime-and-punishment.bin'))
self.model = gensim.models.fasttext.load_facebook_model(datapath('crime-and-punishment.bin'))
with smart_open.smart_open(datapath('crime-and-punishment.vec'), 'r', encoding='utf-8') as fin:
self.expected = dict(load_vec(fin))

Expand Down

0 comments on commit 58f91d1

Please sign in to comment.