Skip to content

Commit

Permalink
Fix symmetrical case for hellinger distance. Fix #1854 (#1860)
Browse files Browse the repository at this point in the history
* fix: fix bugs in hellinger distance computing

This change fixes a bug that caused hellinger(x, y) and hellinger(y, x)
to return different distances.
The cause was that the distance was previously computed over the indices
of only one distribution, whereas we should iterate over all the indices
that appear in either of the two probability distributions.

* rename variable + fix union indices error in python3

* fix and add tests for hellinger distance

* fix the current test for different length BOW inputs
* add a test for symmetrical inputs
  • Loading branch information
caiyulun authored and menshikh-iv committed Jan 30, 2018
1 parent 97bf9bd commit 1f357a7
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 5 deletions.
5 changes: 2 additions & 3 deletions gensim/matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -897,10 +897,9 @@ def hellinger(vec1, vec2):
if isbow(vec1) and isbow(vec2):
# if it is a BoW format, instead of converting to dense we use dictionaries to calculate appropriate distance
vec1, vec2 = dict(vec1), dict(vec2)
if len(vec2) < len(vec1):
vec1, vec2 = vec2, vec1 # swap references so that we iterate over the shorter vector
indices = set(list(vec1.keys()) + list(vec2.keys()))
sim = np.sqrt(
0.5 * sum((np.sqrt(value) - np.sqrt(vec2.get(index, 0.0)))**2 for index, value in iteritems(vec1))
0.5 * sum((np.sqrt(vec1.get(index, 0.0)) - np.sqrt(vec2.get(index, 0.0)))**2 for index in indices)
)
return sim
else:
Expand Down
13 changes: 11 additions & 2 deletions gensim/test/test_similarity_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,21 @@ def test_inputs(self):

def test_distributions(self):

# checking bag of words as inputs
# checking different length bag of words as inputs
vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1)]
result = matutils.hellinger(vec_1, vec_2)
expected = 0.185241936534
expected = 0.484060507634
self.assertAlmostEqual(expected, result)

# checking symmetrical bag of words inputs return same distance
vec_1 = [(2, 0.1), (3, 0.4), (4, 0.1), (5, 0.1), (1, 0.1), (7, 0.2)]
vec_2 = [(1, 0.1), (3, 0.8), (4, 0.1), (8, 0.1), (10, 0.8), (9, 0.1)]
result = matutils.hellinger(vec_1, vec_2)
result_symmetric = matutils.hellinger(vec_2, vec_1)
expected = 0.856921568786
self.assertAlmostEqual(expected, result)
self.assertAlmostEqual(expected, result_symmetric)

# checking ndarray, csr_matrix as inputs
vec_1 = np.array([[1, 0.3], [0, 0.4], [2, 0.3]])
Expand Down

0 comments on commit 1f357a7

Please sign in to comment.