Skip to content

Commit

Permalink
test without sklearn api
Browse files Browse the repository at this point in the history
  • Loading branch information
markroxor committed Dec 22, 2017
1 parent ac4b154 commit 0bacc08
Show file tree
Hide file tree
Showing 4 changed files with 165 additions and 168 deletions.
Binary file added gensim/test/test_data/tfidf_model.tst
Binary file not shown.
Binary file added gensim/test/test_data/tfidf_model.tst.bz2
Binary file not shown.
170 changes: 3 additions & 167 deletions gensim/test/test_sklearn_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -973,13 +973,13 @@ def testTransform(self):

def testSetGetParams(self):
# updating only one param
self.model.set_params(smartirs='nnn')
self.model.set_params(normalize=False)
model_params = self.model.get_params()
self.assertEqual(model_params["smartirs"], 'nnn')
self.assertEqual(model_params["normalize"], False)

# verify that the attributes values are also changed for `gensim_model` after fitting
self.model.fit(self.corpus)
self.assertEqual(getattr(self.model.gensim_model, 'smartirs'), 'nnn')
self.assertEqual(getattr(self.model.gensim_model, 'normalize'), False)

def testPipeline(self):
with open(datapath('mini_newsgroup'), 'rb') as f:
Expand All @@ -1000,9 +1000,6 @@ def testPipeline(self):
self.assertGreater(score, 0.40)

def testPersistence(self):
# Test current model persistency.
self.model.set_params(smartirs='ntc')

model_dump = pickle.dumps(self.model)
model_load = pickle.loads(model_dump)

Expand All @@ -1013,171 +1010,10 @@ def testPersistence(self):
original_transformed_doc = self.model.transform(doc)
self.assertEqual(original_transformed_doc, loaded_transformed_doc)

# compare backward model pickle compatibility
with open("test_data/tfidf_model.pkl", "rb") as model_handler:
model_load = pickle.load(model_handler)

loaded_transformed_doc = model_load.transform(doc)

# comparing the original and new models
original_transformed_doc = self.model.transform(doc)
self.assertEqual(original_transformed_doc, loaded_transformed_doc)

def testModelNotFitted(self):
    """Calling `transform` on a wrapper that has not been fitted must raise NotFittedError."""
    unfitted_wrapper = TfIdfTransformer()
    document = corpus[0]
    with self.assertRaises(NotFittedError):
        unfitted_wrapper.transform(document)

def testConsistency(self):
# Test if `ntc` yields the default docs.
docs = [corpus[1], corpus[2]]

self.model.set_params(smartirs='ntc')
self.model.fit(self.corpus)
transformed_docs = self.model.transform(docs)

self.model.set_params(normalize=True)
self.model.fit(self.corpus)
expected_docs = self.model.transform(docs)

self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1]))

# Testing all the variations of `wlocal`
# smartirs=`nnn`
self.model.set_params(smartirs='nnn')
self.model.fit(self.corpus)

transformed_docs = self.model.transform(docs)
expected_docs = [[(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)],
[(5, 6), (9, 3), (10, 3)]
]

self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1]))

# smartirs=`lnn`
self.model.set_params(smartirs='lnn')
self.model.fit(self.corpus)

transformed_docs = self.model.transform(docs)
expected_docs = [[(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)],
[(5, 6.0), (9, 3.0), (10, 3.0)]
]

self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1]))

# smartirs=`ann`
self.model.set_params(smartirs='ann')
self.model.fit(self.corpus)

transformed_docs = self.model.transform(docs)
expected_docs = [
[(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)],
[(5, 3.0), (9, 2.25), (10, 2.25)]
]

self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1]))

# smartirs=`bnn`
self.model.set_params(smartirs='bnn')
self.model.fit(self.corpus)

transformed_docs = self.model.transform(docs)
expected_docs = [
[(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)],
[(5, 3), (9, 3), (10, 3)]
]

self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1]))

# smartirs=`Lnn`
self.model.set_params(smartirs='Lnn')
self.model.fit(self.corpus)

transformed_docs = self.model.transform(docs)
expected_docs = [[(3, 1.4635792826230198),
(4, 1.4635792826230198),
(5, 2.19536892393453),
(6, 1.4635792826230198),
(7, 2.19536892393453),
(8, 1.4635792826230198)],
[(5, 3.627141918134611), (9, 1.8135709590673055), (10, 1.8135709590673055)]
]

self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1]))

# Testing all the variations of `glocal`
# smartirs=`ntn`
self.model.set_params(smartirs='ntn')
self.model.fit(self.corpus)

transformed_docs = self.model.transform(docs)
expected_docs = [[(3, 2.1699250014423126),
(4, 2.1699250014423126),
(5, 1.5849625007211563),
(6, 2.1699250014423126),
(7, 1.5849625007211563),
(8, 2.1699250014423126)],
[(5, 3.1699250014423126), (9, 1.5849625007211563), (10, 1.5849625007211563)]
]

self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1]))

# smartirs=`npn`
self.model.set_params(smartirs='npn')
self.model.fit(self.corpus)

transformed_docs = self.model.transform(docs)
expected_docs = [[(3, 1.8073549220576042),
(4, 1.8073549220576042),
(5, 1.0),
(6, 1.8073549220576042),
(7, 1.0),
(8, 1.8073549220576042)],
[(5, 2.0), (9, 1.0), (10, 1.0)]
]

self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1]))

# Testing all the variations of `normalize`
# smartirs=`nnc`
self.model.set_params(smartirs='nnc')
self.model.fit(self.corpus)

transformed_docs = self.model.transform(docs)
expected_docs = [[(3, 0.34299717028501764),
(4, 0.34299717028501764),
(5, 0.51449575542752646),
(6, 0.34299717028501764),
(7, 0.51449575542752646),
(8, 0.34299717028501764)],
[(5, 0.81649658092772603),
(9, 0.40824829046386302),
(10, 0.40824829046386302)]
]

self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1]))

# Check if wlocal and wglobal are overriden if smartirs is not None
self.model.set_params(wlocal=lambda x: x, wglobal=lambda x, y: x * x, smartirs='nnc')
self.model.fit(self.corpus)

transformed_docs = self.model.transform(docs)

self.model.set_params(wlocal=lambda x: x * x, wglobal=lambda x, y: x, smartirs='nnc')
self.model.fit(self.corpus)
expected_docs = self.model.transform(docs)

self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1]))


class TestHdpTransformer(unittest.TestCase):
def setUp(self):
Expand Down
163 changes: 162 additions & 1 deletion gensim/test/test_tfidfmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,27 @@
from gensim.models import tfidfmodel
from gensim.test.utils import datapath, get_tmpfile, common_dictionary, common_corpus

from gensim.corpora import Dictionary

# Small tokenised fixture corpus: raw token lists -> token/id mapping -> bag-of-words vectors.
# The integer ids assigned by `Dictionary` show up in hard-coded expectations further down,
# so neither the documents nor their order should be changed.
# NOTE(review): 'complier' looks like a typo for 'compiler'; it is fixture data and is kept as is.
texts = [
    ['complier', 'system', 'computer'],
    ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],
    ['graph', 'flow', 'network', 'graph'],
    ['loading', 'computer', 'system'],
    ['user', 'server', 'system'],
    ['tree', 'hamiltonian'],
    ['graph', 'trees'],
    ['computer', 'kernel', 'malfunction', 'computer'],
    ['server', 'system', 'computer'],
]
dictionary = Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))


class TestTfidfModel(unittest.TestCase):
def setUp(self):
    """Load `testcorpus.mm` through MmCorpus and build a TfidfModel on it with `normalize=True`."""
    corpus_path = datapath('testcorpus.mm')
    self.corpus = MmCorpus(corpus_path)
    self.model = tfidfmodel.TfidfModel(self.corpus, normalize=True)

def testTransform(self):
# create the transformation model
Expand Down Expand Up @@ -58,6 +75,13 @@ def testPersistence(self):
tstvec = []
self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector

# Test persistence between old and new model.
model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst'))
self.assertTrue(model3.idfs == model4.idfs)
tstvec = []
self.assertTrue(np.allclose(model3[tstvec], model4[tstvec])) # try projecting an empty vector

def testPersistenceCompressed(self):
fname = get_tmpfile('gensim_models.tst.gz')
model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
Expand All @@ -66,8 +90,145 @@ def testPersistenceCompressed(self):
self.assertTrue(model.idfs == model2.idfs)
tstvec = []
self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector
# endclass TestTfidfModel

# Test persistence between old and new compressed model.
model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2'))
self.assertTrue(model3.idfs == model4.idfs)
tstvec = []
self.assertTrue(np.allclose(model3[tstvec], model4[tstvec])) # try projecting an empty vector

def testConsistency(self):
    """Check each `smartirs` scheme against hard-coded reference weights.

    Every model is fitted on `self.corpus`; the two documents that get transformed are
    taken from the module-level `corpus`. All comparisons go through `np.allclose`.

    The method was previously spelled `TestConsistency`. The default unittest loader only
    collects methods whose names start with `test`, so under that spelling none of the
    assertions below were ever executed.
    """
    docs = [corpus[1], corpus[2]]

    def transform(**model_kwargs):
        # Fit a fresh model with the given options and weight both documents with it.
        model = tfidfmodel.TfidfModel(self.corpus, **model_kwargs)
        return [model[doc] for doc in docs]

    def check(transformed_docs, expected_docs, label):
        # `label` only serves to identify the failing scheme in the assertion message.
        for position, (got, want) in enumerate(zip(transformed_docs, expected_docs)):
            self.assertTrue(
                np.allclose(got, want),
                "%s: transformed document %d differs from the expected weights" % (label, position)
            )

    # `ntc` must yield the same weights as a model built without `smartirs`.
    check(transform(smartirs='ntc'), transform(), "smartirs='ntc' vs. default")

    reference_weights = [
        # Variations of the first letter (`wlocal`).
        ('nnn', [
            [(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)],
            [(5, 6), (9, 3), (10, 3)],
        ]),
        ('lnn', [
            [(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)],
            [(5, 6.0), (9, 3.0), (10, 3.0)],
        ]),
        ('ann', [
            [(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)],
            [(5, 3.0), (9, 2.25), (10, 2.25)],
        ]),
        ('bnn', [
            [(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)],
            [(5, 3), (9, 3), (10, 3)],
        ]),
        ('Lnn', [
            [
                (3, 1.4635792826230198),
                (4, 1.4635792826230198),
                (5, 2.19536892393453),
                (6, 1.4635792826230198),
                (7, 2.19536892393453),
                (8, 1.4635792826230198),
            ],
            [(5, 3.627141918134611), (9, 1.8135709590673055), (10, 1.8135709590673055)],
        ]),
        # Variations of the second letter (`wglobal`).
        ('ntn', [
            [
                (3, 2.1699250014423126),
                (4, 2.1699250014423126),
                (5, 1.5849625007211563),
                (6, 2.1699250014423126),
                (7, 1.5849625007211563),
                (8, 2.1699250014423126),
            ],
            [(5, 3.1699250014423126), (9, 1.5849625007211563), (10, 1.5849625007211563)],
        ]),
        ('npn', [
            [
                (3, 1.8073549220576042),
                (4, 1.8073549220576042),
                (5, 1.0),
                (6, 1.8073549220576042),
                (7, 1.0),
                (8, 1.8073549220576042),
            ],
            [(5, 2.0), (9, 1.0), (10, 1.0)],
        ]),
        # Variations of the third letter (`normalize`).
        ('nnc', [
            [
                (3, 0.34299717028501764),
                (4, 0.34299717028501764),
                (5, 0.51449575542752646),
                (6, 0.34299717028501764),
                (7, 0.51449575542752646),
                (8, 0.34299717028501764),
            ],
            [
                (5, 0.81649658092772603),
                (9, 0.40824829046386302),
                (10, 0.40824829046386302),
            ],
        ]),
    ]
    for scheme, expected_docs in reference_weights:
        check(transform(smartirs=scheme), expected_docs, "smartirs=%r" % scheme)

    # The two models below differ only in their custom `wlocal`/`wglobal` callables and share
    # the same `smartirs`; requiring identical output checks that `smartirs` takes precedence.
    transformed_docs = transform(wlocal=lambda x: x, wglobal=lambda x, y: x * x, smartirs='nnc')
    expected_docs = transform(wlocal=lambda x: x * x, wglobal=lambda x, y: x, smartirs='nnc')
    check(transformed_docs, expected_docs, "custom wlocal/wglobal with smartirs='nnc'")

# Old spelling, kept so that any explicit reference to it still resolves. It does not start
# with `test`, so the unittest loader still collects the check exactly once.
TestConsistency = testConsistency

# endclass TestTfidfModel

if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
Expand Down

0 comments on commit 0bacc08

Please sign in to comment.