implement separate functions to load FT embeddings and models (#2376)

Introduced two new pure functions to the gensim.models.fasttext module: 1. load_facebook_vectors: loads embeddings from binaries in FB's fastText .bin format; 2. load_facebook_model: loads the full model from binaries in FB's fastText .bin format. The existing FastText.load_fasttext_format method loads full models only. I've placed a deprecation warning around it. The full_model parameter is gone - it was only introduced in 3.7.1, so it's not too late to just rip it out, IMHO. When releasing 3.7.2, we should include the above in the change log, as it changes the behavior with respect to 3.6.0. Fixes #2372
piskvorky · Mar 7, 2019 · 58f91d1 · 58f91d1
1 parent 92bc7b6
commit 58f91d1
Show file tree

Hide file tree

Showing 2 changed files with 159 additions and 115 deletions.
diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
@@ -171,27 +171,43 @@
 .. sourcecode:: pycon
 
     >>> cap_path = datapath("crime-and-punishment.bin")
-    >>> # Partial model: loads quickly, uses less RAM, but cannot continue training
-    >>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False)
-    >>> # Full model: loads slowly, consumes RAM, but can continue training (see below)
-    >>> fb_full = FastText.load_fasttext_format(cap_path, full_model=True)
+    >>> fb_model = load_facebook_model(cap_path)
 
 Once loaded, such models behave identically to those trained from scratch.
 You may continue training them on new data:
 
 .. sourcecode:: pycon
 
-    >>> 'computer' in fb_full.wv.vocab  # New word, currently out of vocab
+    >>> 'computer' in fb_model.wv.vocab  # New word, currently out of vocab
     False
-    >>> old_computer = np.copy(fb_full.wv['computer'])  # Calculate current vectors
-    >>> fb_full.build_vocab(new_sentences, update=True)
-    >>> fb_full.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)
-    >>> new_computer = fb_full.wv['computer']
+    >>> old_computer = np.copy(fb_model.wv['computer'])  # Calculate current vectors
+    >>> fb_model.build_vocab(new_sentences, update=True)
+    >>> fb_model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)
+    >>> new_computer = fb_model.wv['computer']
     >>> np.allclose(old_computer, new_computer, atol=1e-4)  # Vector has changed, model has learnt something
     False
-    >>> 'computer' in fb_full.wv.vocab  # New word is now in the vocabulary
+    >>> 'computer' in fb_model.wv.vocab  # New word is now in the vocabulary
     True
 
+If you do not intend to continue training the model, consider using the
+:func:`gensim.models.fasttext.load_facebook_vectors` function instead.
+That function only loads the word embeddings (keyed vectors), consuming much less CPU and RAM:
+
+.. sourcecode:: pycon
+
+    >>> from gensim.test.utils import datapath
+    >>>
+    >>> cap_path = datapath("crime-and-punishment.bin")
+    >>> wv = load_facebook_vectors(cap_path)
+    >>>
+    >>> 'landlord' in wv.vocab  # Word is out of vocabulary
+    False
+    >>> oov_vector = wv['landlord']
+    >>>
+    >>> 'landlady' in wv.vocab  # Word is in the vocabulary
+    True
+    >>> iv_vector = wv['landlady']
+
 Retrieve word-vector for vocab and out-of-vocab word:
 
 .. sourcecode:: pycon
@@ -417,7 +433,7 @@ class FastText(BaseWordEmbeddingsModel):
 
     The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save` and
     :meth:`~gensim.models.fasttext.FastText.load` methods, or loaded from a format compatible with the original
-    Fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format`.
+    Fasttext implementation via :func:`~gensim.models.fasttext.load_facebook_model`.
 
     Attributes
     ----------
@@ -885,14 +901,6 @@ def train(self, sentences=None, corpus_file=None, total_examples=None, total_wor
             >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
 
         """
-        cant_train = hasattr(self.trainables, 'syn1neg') and self.trainables.syn1neg is None
-        if cant_train:
-            raise ValueError(
-                'this model cannot be trained any further, '
-                'if this is a native model, try loading it with '
-                'FastText.load_fasttext_format(path, full_model=True)'
-            )
-
         super(FastText, self).train(
             sentences=sentences, corpus_file=corpus_file, total_examples=total_examples, total_words=total_words,
             epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
@@ -943,82 +951,23 @@ def __contains__(self, word):
         return self.wv.__contains__(word)
 
     @classmethod
-    def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True):
-        """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files.
-
-        By default, this function loads the full model.  A full model allows
-        continuing training with more data, but also consumes more RAM and
-        takes longer to load.  If you do not need to continue training and only
-        wish the work with the already-trained embeddings, use `full_model=False`
-        for faster loading and to save RAM.
-
-        Notes
-        ------
-        Facebook provides both `.vec` and `.bin` files with their modules.
-        The former contains human-readable vectors.
-        The latter contains machine-readable vectors along with other model parameters.
-        This function effectively ignores `.vec` output file, since that file is redundant.
-        It only needs the `.bin` file.
-
-        Parameters
-        ----------
-        model_file : str
-            Path to the FastText output files.
-            FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin`
-            Expected value for this example: `/path/to/model` or `/path/to/model.bin`,
-            as Gensim requires only `.bin` file to the load entire fastText model.
-        encoding : str, optional
-            Specifies the file encoding.
-        full_model : boolean, optional
-            If False, skips loading the hidden output matrix. This saves a fair bit
-            of CPU time and RAM, but **prevents training continuation**.
-
-        Examples
-        --------
-
-        Load, infer, continue training:
-
-        .. sourcecode:: pycon
-
-            >>> from gensim.test.utils import datapath
-            >>>
-            >>> cap_path = datapath("crime-and-punishment.bin")
-            >>> fb_full = FastText.load_fasttext_format(cap_path, full_model=True)
-            >>>
-            >>> 'landlord' in fb_full.wv.vocab  # Word is out of vocabulary
-            False
-            >>> oov_term = fb_full.wv['landlord']
-            >>>
-            >>> 'landlady' in fb_full.wv.vocab  # Word is in the vocabulary
-            True
-            >>> iv_term = fb_full.wv['landlady']
-            >>>
-            >>> new_sent = [['lord', 'of', 'the', 'rings'], ['lord', 'of', 'the', 'flies']]
-            >>> fb_full.build_vocab(new_sent, update=True)
-            >>> fb_full.train(sentences=new_sent, total_examples=len(new_sent), epochs=5)
-
-        Load quickly, infer (forego training continuation):
-
-        .. sourcecode:: pycon
-
-            >>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False)
-            >>>
-            >>> 'landlord' in fb_partial.wv.vocab  # Word is out of vocabulary
-            False
-            >>> oov_term = fb_partial.wv['landlord']
-            >>>
-            >>> 'landlady' in fb_partial.wv.vocab  # Word is in the vocabulary
-            True
-            >>> iv_term = fb_partial.wv['landlady']
+    @deprecated(
+        'use load_facebook_vectors (to use pretrained embeddings) or load_facebook_model '
+        '(to continue training with the loaded full model, more RAM) instead'
+    )
+    def load_fasttext_format(cls, model_file, encoding='utf8'):
+        """Deprecated.
 
-        Returns
-        -------
-        gensim.models.fasttext.FastText
-            The loaded model.
+        Use :func:`gensim.models.fasttext.load_facebook_model` or
+        :func:`gensim.models.fasttext.load_facebook_vectors` instead.
 
         """
-        return _load_fasttext_format(model_file, encoding=encoding, full_model=full_model)
+        return load_facebook_model(model_file, encoding=encoding)
 
+    @deprecated(
+        'use load_facebook_vectors (to use pretrained embeddings) or load_facebook_model '
+        '(to continue training with the loaded full model, more RAM) instead'
+    )
     def load_binary_data(self, encoding='utf8'):
         """Load data from a binary file created by Facebook's native FastText.
 
@@ -1229,6 +1178,105 @@ def _pad_ones(m, new_shape):
     return vstack([m, suffix])
 
 
+def load_facebook_model(path, encoding='utf-8'):
+    """Load the full model from Facebook's native fasttext `.bin` output file.
+
+    Notes
+    ------
+    Facebook provides both `.vec` and `.bin` files with their models.
+    The former contains human-readable vectors.
+    The latter contains machine-readable vectors along with other model parameters.
+    This function effectively ignores `.vec` output file, since that file is redundant.
+    It only needs the `.bin` file.
+
+    Parameters
+    ----------
+    path : str
+        Path to the FastText output files.
+        FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin`
+        Expected value for this example: `/path/to/model` or `/path/to/model.bin`,
+        as Gensim requires only the `.bin` file to load the entire fastText model.
+    encoding : str, optional
+        Specifies the file encoding.
+
+    Examples
+    --------
+
+    Load, infer, continue training:
+
+    .. sourcecode:: pycon
+
+        >>> from gensim.test.utils import datapath
+        >>>
+        >>> cap_path = datapath("crime-and-punishment.bin")
+        >>> fb_model = load_facebook_model(cap_path)
+        >>>
+        >>> 'landlord' in fb_model.wv.vocab  # Word is out of vocabulary
+        False
+        >>> oov_term = fb_model.wv['landlord']
+        >>>
+        >>> 'landlady' in fb_model.wv.vocab  # Word is in the vocabulary
+        True
+        >>> iv_term = fb_model.wv['landlady']
+        >>>
+        >>> new_sent = [['lord', 'of', 'the', 'rings'], ['lord', 'of', 'the', 'flies']]
+        >>> fb_model.build_vocab(new_sent, update=True)
+        >>> fb_model.train(sentences=new_sent, total_examples=len(new_sent), epochs=5)
+
+    Returns
+    -------
+    gensim.models.fasttext.FastText
+        The loaded model.
+
+    """
+    return _load_fasttext_format(path, encoding=encoding, full_model=True)
+
+
+def load_facebook_vectors(path, encoding='utf-8'):
+    """Load word embeddings from a model saved in Facebook's native fasttext `.bin` format.
+
+    Parameters
+    ----------
+    path : str
+        The location of the model file.
+    encoding : str, optional
+        Specifies the file encoding.
+
+    Returns
+    -------
+    gensim.models.keyedvectors.FastTextKeyedVectors
+        The word embeddings.
+
+    Examples
+    --------
+
+    Load and infer:
+
+        >>> from gensim.test.utils import datapath
+        >>>
+        >>> cap_path = datapath("crime-and-punishment.bin")
+        >>> fbkv = load_facebook_vectors(cap_path)
+        >>>
+        >>> 'landlord' in fbkv.vocab  # Word is out of vocabulary
+        False
+        >>> oov_vector = fbkv['landlord']
+        >>>
+        >>> 'landlady' in fbkv.vocab  # Word is in the vocabulary
+        True
+        >>> iv_vector = fbkv['landlady']
+
+    See Also
+    --------
+
+    :func:`gensim.models.fasttext.load_facebook_model` loads
+    the full model, not just word embeddings, and enables you to continue
+    model training.
+
+    """
+    model_wrapper = _load_fasttext_format(path, encoding=encoding, full_model=False)
+    return model_wrapper.wv
+
+
 def _load_fasttext_format(model_file, encoding='utf-8', full_model=True):
     """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files.
 

diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
@@ -23,6 +23,8 @@
 import gensim.models._fasttext_bin
 
 
+import gensim.models.fasttext
+
 try:
     from pyemd import emd  # noqa:F401
     PYEMD_EXT = True
@@ -59,24 +61,9 @@ def setUp(self):
         ft_home = os.environ.get('FT_HOME', None)
         self.ft_path = os.path.join(ft_home, 'fasttext') if ft_home else None
         self.test_model_file = datapath('lee_fasttext')
-        self.test_model = FT_gensim.load_fasttext_format(self.test_model_file)
+        self.test_model = gensim.models.fasttext.load_facebook_model(self.test_model_file)
         self.test_new_model_file = datapath('lee_fasttext_new')
 
-    def test_native_partial_model(self):
-        """Can we skip loading the NN and still get a working model?"""
-        model = FT_gensim.load_fasttext_format(self.test_model_file, full_model=False)
-
-        #
-        # Training continuation should be impossible
-        #
-        self.assertIsNone(model.trainables.syn1neg)
-        self.assertRaises(ValueError, model.train, sentences,
-                          total_examples=model.corpus_count, epochs=model.epochs)
-
-        model.wv['green']
-        model.wv['foobar']
-        model.wv['thisworddoesnotexist']
-
     def test_training(self):
         model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
         model.build_vocab(sentences)
@@ -205,7 +192,7 @@ def model_sanity(self, model):
 
     def test_load_fasttext_format(self):
         try:
-            model = FT_gensim.load_fasttext_format(self.test_model_file)
+            model = gensim.models.fasttext.load_facebook_model(self.test_model_file)
         except Exception as exc:
             self.fail('Unable to load FastText model from file %s: %s' % (self.test_model_file, exc))
         vocab_size, model_size = 1762, 10
@@ -258,7 +245,7 @@ def test_load_fasttext_format(self):
 
     def test_load_fasttext_new_format(self):
         try:
-            new_model = FT_gensim.load_fasttext_format(self.test_new_model_file)
+            new_model = gensim.models.fasttext.load_facebook_model(self.test_new_model_file)
         except Exception as exc:
             self.fail('Unable to load FastText model from file %s: %s' % (self.test_new_model_file, exc))
         vocab_size, model_size = 1763, 10
@@ -311,18 +298,18 @@ def test_load_fasttext_new_format(self):
 
     def test_load_model_supervised(self):
         with self.assertRaises(NotImplementedError):
-            FT_gensim.load_fasttext_format(datapath('pang_lee_polarity_fasttext'))
+            gensim.models.fasttext.load_facebook_model(datapath('pang_lee_polarity_fasttext'))
 
     def test_load_model_with_non_ascii_vocab(self):
-        model = FT_gensim.load_fasttext_format(datapath('non_ascii_fasttext'))
+        model = gensim.models.fasttext.load_facebook_model(datapath('non_ascii_fasttext'))
         self.assertTrue(u'který' in model.wv)
         try:
             model.wv[u'který']
         except UnicodeDecodeError:
             self.fail('Unable to access vector for utf8 encoded non-ascii word')
 
     def test_load_model_non_utf8_encoding(self):
-        model = FT_gensim.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852')
+        model = gensim.models.fasttext.load_facebook_model(datapath('cp852_fasttext'), encoding='cp852')
         self.assertTrue(u'který' in model.wv)
         try:
             model.wv[u'který']
@@ -894,7 +881,7 @@ def load_native():
     # ./fasttext cbow -input toy-data.txt -output toy-model -bucket 100 -dim 5
     #
     path = datapath('toy-model.bin')
-    model = FT_gensim.load_fasttext_format(path)
+    model = gensim.models.fasttext.load_facebook_model(path)
     return model
 
 
@@ -1118,11 +1105,20 @@ def test_save_load_native(self):
             model.save(model_name)
 
     def test_load_native_pretrained(self):
-        model = FT_gensim.load_fasttext_format(datapath('toy-model-pretrained.bin'))
+        model = gensim.models.fasttext.load_facebook_model(datapath('toy-model-pretrained.bin'))
         actual = model['monarchist']
         expected = np.array([0.76222, 1.0669, 0.7055, -0.090969, -0.53508])
         self.assertTrue(np.allclose(expected, actual, atol=10e-4))
 
+    def test_load_native_vectors(self):
+        cap_path = datapath("crime-and-punishment.bin")
+        fbkv = gensim.models.fasttext.load_facebook_vectors(cap_path)
+        self.assertFalse('landlord' in fbkv.vocab)
+        self.assertTrue('landlady' in fbkv.vocab)
+        oov_vector = fbkv['landlord']
+        iv_vector = fbkv['landlady']
+        self.assertFalse(np.allclose(oov_vector, iv_vector))
+
 
 def _train_model_with_pretrained_vectors():
     """Generate toy-model-pretrained.bin for use in test_load_native_pretrained.
@@ -1173,7 +1169,7 @@ def setUp(self):
         #
         # ./fasttext skipgram -minCount 0 -bucket 100 -input crime-and-punishment.txt -output crime-and-punishment -dim 5  # noqa: E501
         #
-        self.model = FT_gensim.load_fasttext_format(datapath('crime-and-punishment.bin'))
+        self.model = gensim.models.fasttext.load_facebook_model(datapath('crime-and-punishment.bin'))
         with smart_open.smart_open(datapath('crime-and-punishment.vec'), 'r', encoding='utf-8') as fin:
             self.expected = dict(load_vec(fin))