Skip to content
New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

Add nmslib indexer #2417

Merged
merged 22 commits into from
Jul 7, 2019
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,080 changes: 1,080 additions & 0 deletions docs/notebooks/nmslibtutorial.ipynb

Large diffs are not rendered by default.

225 changes: 225 additions & 0 deletions gensim/similarities/nmslib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2019 Radim Rehurek <me@radimrehurek.com>
# Copyright (C) 2019 Masahiro Kazama <kazama.masa@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Intro
-----

This module integrates Nmslib with :class:`~gensim.models.word2vec.Word2Vec`,
:class:`~gensim.models.doc2vec.Doc2Vec`, :class:`~gensim.models.fasttext.FastText` and
:class:`~gensim.models.keyedvectors.KeyedVectors`.
To use nmslib, instantiate a :class:`~gensim.similarities.nmslib.NmslibIndexer` class
and pass the instance as the indexer parameter to your model's most_similar method
(e.g. :py:func:`~gensim.models.doc2vec.most_similar`).

Example usage
-------------

.. sourcecode:: pycon

>>> from gensim.similarities.nmslib import NmslibIndexer
>>> from gensim.models import Word2Vec
>>>
>>> sentences = [['cute', 'cat', 'say', 'meow'], ['cute', 'dog', 'say', 'woof']]
>>> model = Word2Vec(sentences, min_count=1, seed=1)
>>>
>>> indexer = NmslibIndexer(model)
>>> model.most_similar("cat", topn=2, indexer=indexer)
[('cat', 1.0), ('meow', 0.5595494508743286)]

Load and save example
---------------------

.. sourcecode:: pycon

>>> from gensim.similarities.nmslib import NmslibIndexer
>>> from gensim.models import Word2Vec
>>> from tempfile import mkstemp
>>>
>>> sentences = [['cute', 'cat', 'say', 'meow'], ['cute', 'dog', 'say', 'woof']]
>>> model = Word2Vec(sentences, min_count=1, seed=1, iter=10)
>>>
>>> indexer = NmslibIndexer(model)
>>> _, temp_fn = mkstemp()
>>> indexer.save(temp_fn)
>>>
>>> new_indexer = NmslibIndexer.load(temp_fn)
>>> model.most_similar("cat", topn=2, indexer=new_indexer)
[('cat', 1.0), ('meow', 0.5595494508743286)]

What is Nmslib
--------------

Non-Metric Space Library (NMSLIB) is an efficient cross-platform similarity search library and a toolkit
for evaluation of similarity search methods. The core-library does not have any third-party dependencies.
More information about Nmslib: `github repository <https://github.com/nmslib/nmslib>`_.

Why use Nmslib?
---------------

The current implementation for finding k nearest neighbors in a vector space in gensim has linear complexity
via brute force in the number of indexed documents, although with extremely low constant factors.
The retrieved results are exact, which is overkill in many applications:
approximate results retrieved in sub-linear time may be enough.
Nmslib can find approximate nearest neighbors much faster.
Compared to annoy, nmslib has more parameters to control the build and query time and accuracy.
Nmslib can achieve faster and more accurate nearest neighbors search than annoy.
"""

from smart_open import open
try:
import cPickle as _pickle
except ImportError:
import pickle as _pickle

from gensim.models.doc2vec import Doc2Vec
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText
from gensim.models import KeyedVectors
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors
try:
import nmslib
except ImportError:
raise ImportError(
"Nmslib has not been installed, if you wish to use the nmslib indexer, please run `pip install nmslib`"
)


class NmslibIndexer(object):
    """This class allows to use `Nmslib <https://github.com/nmslib/nmslib>`_ as indexer for `most_similar` method
    from :class:`~gensim.models.word2vec.Word2Vec`, :class:`~gensim.models.doc2vec.Doc2Vec`,
    :class:`~gensim.models.fasttext.FastText` and :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` classes.

    """

    def __init__(self, model=None, index_params=None, query_time_params=None):
        """
        Parameters
        ----------
        model : :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`, optional
            Model, that will be used as source for index.
            If the model is None, index and labels are not initialized.
            In that case please load or init the index and labels by yourself.
        index_params : dict, optional
            Index-time parameters handed to nmslib's `createIndex`.
            Defaults to `{'M': 100, 'indexThreadQty': 1, 'efConstruction': 100, 'post': 0}`.
        query_time_params : dict, optional
            Query-time parameters handed to nmslib's `setQueryTimeParams`.
            Defaults to `{'efSearch': 100}`.

        Raises
        ------
        ValueError
            If `model` is given but is not a Word2Vec, Doc2Vec, FastText or KeyedVectors instance.

        """
        if index_params is None:
            index_params = {'M': 100, 'indexThreadQty': 1, 'efConstruction': 100, 'post': 0}
        if query_time_params is None:
            query_time_params = {'efSearch': 100}

        self.index = None
        self.labels = None
        self.model = model
        self.index_params = index_params
        self.query_time_params = query_time_params

        # Compare against None explicitly: a model object must never be skipped
        # just because it happens to evaluate as falsy.
        if model is not None:
            # Doc2Vec is tested first so that document vectors (not word vectors) get indexed.
            if isinstance(self.model, Doc2Vec):
                self._build_from_doc2vec()
            elif isinstance(self.model, (Word2Vec, FastText)):
                self._build_from_word2vec()
            elif isinstance(self.model, (WordEmbeddingsKeyedVectors, KeyedVectors)):
                self._build_from_keyedvectors()
            else:
                raise ValueError("model must be a Word2Vec, Doc2Vec, FastText or KeyedVectors instance")

    def save(self, fname, protocol=2):
        """Save this NmslibIndexer instance to a file.

        Parameters
        ----------
        fname : str
            Path to the output file,
            will produce 2 files: `fname` - the nmslib index itself and
            `fname`.d - pickled `index_params`, `query_time_params` and `labels`.
        protocol : int, optional
            Protocol for pickle.

        Notes
        -----
        This method saves **only** the index (**the model isn't preserved**).

        """
        fname_dict = fname + '.d'
        self.index.saveIndex(fname)
        d = {'index_params': self.index_params, 'query_time_params': self.query_time_params, 'labels': self.labels}
        with open(fname_dict, 'wb') as fout:
            _pickle.dump(d, fout, protocol=protocol)

    @classmethod
    def load(cls, fname):
        """Load a NmslibIndexer instance from a file

        Parameters
        ----------
        fname : str
            Path to dump with NmslibIndexer (the same `fname` that was passed to :meth:`save`;
            the companion `fname`.d file must exist next to it).

        Returns
        -------
        NmslibIndexer
            Indexer with `index`, `labels`, `index_params` and `query_time_params` restored.
            Its `model` attribute is None, because the model is not part of the dump.

        Warnings
        --------
        The `fname`.d file is unpickled. Unpickling can execute arbitrary code,
        so only load files that come from a trusted source.

        """
        fname_dict = fname + '.d'
        with open(fname_dict, 'rb') as f:
            d = _pickle.load(f)
        index_params = d['index_params']
        query_time_params = d['query_time_params']
        nmslib_instance = cls(index_params=index_params, query_time_params=query_time_params)
        # Same method/space as used when the index was built (these are also nmslib's defaults).
        index = nmslib.init(method='hnsw', space='cosinesimil')
        index.loadIndex(fname)
        # Query-time parameters are not applied by loadIndex, so re-apply the saved ones;
        # otherwise the loaded index would search with nmslib's defaults instead.
        nmslib.setQueryTimeParams(index, query_time_params)
        nmslib_instance.index = index
        nmslib_instance.labels = d['labels']
        return nmslib_instance

    def _build_from_word2vec(self):
        """Build an Nmslib index using word vectors from a Word2Vec model."""

        self.model.init_sims()
        return self._build_from_model(self.model.wv.vectors_norm, self.model.wv.index2word)

    def _build_from_doc2vec(self):
        """Build an Nmslib index using document vectors from a Doc2Vec model."""

        docvecs = self.model.docvecs
        docvecs.init_sims()
        labels = [docvecs.index_to_doctag(i) for i in range(docvecs.count)]
        return self._build_from_model(docvecs.vectors_docs_norm, labels)

    def _build_from_keyedvectors(self):
        """Build an Nmslib index using word vectors from a KeyedVectors model."""

        self.model.init_sims()
        return self._build_from_model(self.model.vectors_norm, self.model.index2word)

    def _build_from_model(self, vectors, labels):
        """Create the nmslib index over `vectors` and remember `labels`.

        Parameters
        ----------
        vectors : numpy.array
            Vectors to index; row `i` is reported by nmslib as id `i`.
        labels : list
            Label for each row of `vectors`, used to translate nmslib ids back to words/doctags.

        """
        # Spelled out explicitly (these are also nmslib's defaults): the default `index_params`
        # are HNSW parameters, and `most_similar` relies on the space being cosine distance.
        index = nmslib.init(method='hnsw', space='cosinesimil')
        index.addDataPointBatch(vectors)

        index.createIndex(self.index_params, print_progress=True)
        nmslib.setQueryTimeParams(index, self.query_time_params)

        self.index = index
        self.labels = labels

    def most_similar(self, vector, num_neighbors):
        """Find the approximate `num_neighbors` most similar items.

        Parameters
        ----------
        vector : numpy.array
            Vector for word/document.
        num_neighbors : int
            Number of most similar items

        Returns
        -------
        list of (str, float)
            List of most similar items in format [(`item`, `cosine_similarity`), ... ]

        """
        # knnQueryBatch expects a 2D batch of queries; we send a batch of one and take its result.
        ids, distances = self.index.knnQueryBatch(vector.reshape(1, -1), k=num_neighbors)[0]

        # The `cosinesimil` space reports cosine *distance* (1 - cosine similarity),
        # so the similarity is recovered as 1 - distance.
        return [(self.labels[id_], 1.0 - distance) for id_, distance in zip(ids, distances)]
152 changes: 152 additions & 0 deletions gensim/test/test_similarities.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,6 +696,158 @@ def testSaveLoad(self):
self.assertEqual(self.index.num_trees, self.index2.num_trees)


class TestWord2VecNmslibIndexer(unittest.TestCase):
    """Exercise NmslibIndexer with word-vector models: Word2Vec, FastText and KeyedVectors."""

    def setUp(self):
        # Skip (rather than fail) the whole case when the optional nmslib dependency is missing.
        try:
            import nmslib  # noqa:F401
        except ImportError:
            raise unittest.SkipTest("Nmslib library is not available")

        from gensim.similarities.nmslib import NmslibIndexer
        self.indexer = NmslibIndexer

    def _checkIndexedModel(self, model):
        """Normalize `model`, index it and run the assertions shared by the full-model tests."""
        model.init_sims()
        built_index = self.indexer(model)
        keyed_vectors = model.wv

        self.assertVectorIsSimilarToItself(keyed_vectors, built_index)
        self.assertApproxNeighborsMatchExact(model, keyed_vectors, built_index)
        self.assertIndexSaved(built_index)
        self.assertLoadedIndexEqual(built_index, model)

    def testWord2Vec(self):
        self._checkIndexedModel(word2vec.Word2Vec(texts, min_count=1))

    def testFastText(self):
        # FastText makes several passes over its corpus, so it needs a re-iterable
        # object rather than a one-shot generator.
        class LeeCorpus(object):
            def __init__(self, path):
                self.path = path

            def __iter__(self):
                with smart_open(self.path, 'r', encoding="latin_1") as handle:
                    for raw_line in handle:
                        yield raw_line.lower().strip().split()

        corpus = LeeCorpus(datapath('lee.cor'))
        self._checkIndexedModel(FastText(corpus))

    def testNmslibIndexingOfKeyedVectors(self):
        vectors_path = datapath('lee_fasttext.vec')
        keyed_vectors = KeyedVectors.load_word2vec_format(vectors_path)
        built_index = self.indexer(keyed_vectors)

        self.assertVectorIsSimilarToItself(keyed_vectors, built_index)
        self.assertApproxNeighborsMatchExact(keyed_vectors, keyed_vectors, built_index)

    def testLoadMissingRaisesError(self):
        blank_indexer = self.indexer()

        with self.assertRaises(IOError):
            blank_indexer.load(fname='test-index')

    def assertVectorIsSimilarToItself(self, wv, index):
        """Assert the nearest neighbour of the first vector is its own label, with similarity ~1."""
        query = wv.vectors_norm[0]
        expected_label = wv.index2word[0]
        nearest = index.most_similar(query, 1)[0]
        found_label, found_similarity = nearest

        self.assertEqual(found_label, expected_label)
        self.assertAlmostEqual(found_similarity, 1.0, places=2)

    def assertApproxNeighborsMatchExact(self, model, wv, index):
        """Assert the indexer's top-5 words equal the top-5 of the exact search, in order."""
        query = wv.vectors_norm[0]
        via_index = model.wv.most_similar([query], topn=5, indexer=index)
        via_brute_force = model.wv.most_similar(positive=[query], topn=5)

        self.assertEqual(
            [hit[0] for hit in via_index],
            [hit[0] for hit in via_brute_force],
        )

    def assertIndexSaved(self, index):
        """Assert that saving produces both the index file and its `.d` companion."""
        target = get_tmpfile('gensim_similarities.tst.pkl')
        index.save(target)
        for produced in (target, target + '.d'):
            self.assertTrue(os.path.exists(produced))

    def assertLoadedIndexEqual(self, index, model):
        """Assert a save/load round trip keeps the labels and both parameter dicts."""
        from gensim.similarities.nmslib import NmslibIndexer

        target = get_tmpfile('gensim_similarities.tst.pkl')
        index.save(target)

        restored = NmslibIndexer.load(target)
        restored.model = model

        for attribute in ('labels', 'index_params', 'query_time_params'):
            self.assertEqual(getattr(index, attribute), getattr(restored, attribute))

class TestDoc2VecNmslibIndexer(unittest.TestCase):
    """Exercise NmslibIndexer with the document vectors of a Doc2Vec model."""

    def setUp(self):
        # Skip (rather than fail) the whole case when the optional nmslib dependency is missing.
        try:
            import nmslib  # noqa:F401
        except ImportError:
            raise unittest.SkipTest("Nmslib library is not available")

        from gensim.similarities.nmslib import NmslibIndexer

        trained = doc2vec.Doc2Vec(sentences, min_count=1)
        trained.init_sims()
        self.model = trained
        self.index = NmslibIndexer(trained)
        # First normalized document vector, reused as the query by the tests below.
        self.vector = trained.docvecs.vectors_docs_norm[0]

    def testDocumentIsSimilarToItself(self):
        nearest = self.index.most_similar(self.vector, 1)[0]
        found_doc, found_similarity = nearest

        self.assertEqual(found_doc, 0)
        self.assertAlmostEqual(found_similarity, 1.0, places=2)

    def testApproxNeighborsMatchExact(self):
        docvecs = self.model.docvecs
        via_index = docvecs.most_similar([self.vector], topn=5, indexer=self.index)
        via_brute_force = docvecs.most_similar(positive=[self.vector], topn=5)

        self.assertEqual(
            [hit[0] for hit in via_index],
            [hit[0] for hit in via_brute_force],
        )

    def testSave(self):
        target = get_tmpfile('gensim_similarities.tst.pkl')
        self.index.save(target)
        # Saving must produce both the index file and its `.d` companion.
        for produced in (target, target + '.d'):
            self.assertTrue(os.path.exists(produced))

    def testLoadNotExist(self):
        from gensim.similarities.nmslib import NmslibIndexer
        self.test_index = NmslibIndexer()

        with self.assertRaises(IOError):
            self.test_index.load(fname='test-index')

    def testSaveLoad(self):
        from gensim.similarities.nmslib import NmslibIndexer

        target = get_tmpfile('gensim_similarities.tst.pkl')
        self.index.save(target)

        self.index2 = NmslibIndexer.load(target)
        self.index2.model = self.model

        # A round trip must keep the labels and both parameter dicts.
        for attribute in ('labels', 'index_params', 'query_time_params'):
            self.assertEqual(getattr(self.index, attribute), getattr(self.index2, attribute))


class TestUniformTermSimilarityIndex(unittest.TestCase):
def setUp(self):
self.documents = [[u"government", u"denied", u"holiday"], [u"holiday", u"slowing", u"hollingworth"]]
Expand Down
5 changes: 4 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,9 +252,12 @@ def finalize_options(self):
linux_testenv.extend([
'tensorflow <= 1.3.0',
'keras >= 2.0.4, <= 2.1.4',
'annoy',
'annoy'
])

if (3, 0) < sys.version_info < (3, 7):
linux_testenv.extend(['nmslib'])

ext_modules = [
Extension('gensim.models.word2vec_inner',
sources=['./gensim/models/word2vec_inner.c'],
Expand Down