matejak · matejak · Aug 11, 2024 · Sep 5, 2024 · Sep 8, 2024
diff --git a/estimage/statops/compare.py b/estimage/statops/compare.py
@@ -0,0 +1,71 @@
+import numpy as np
+import scipy as sp
+
+
+# \int_{-\inf}^{\inf}l(x)\int_{x}^{\inf}g(t)\, \textup{d}t\, \textup{d}x
+# where l is the function that we think should be lower, and g the greater
+def _integrate(dom, one, two):
+    res = 0
+    for i, x in enumerate(dom):
+        toadd = two[i] * 0.5
+        toadd += sum(two[i + 1:])
+        toadd *= one[i]
+        res += toadd
+    return res
+
+
+def _integrate_estimate_both_degenerate(one, two):
+    if one.expected == two.expected:
+        return 0.5
+    if one.expected < two.expected:
+        return 1
+    return 0
+
+
+def _integrate_estimate_second_degenerate(one, two):
+    threshold = two.expected
+    rv = one._get_rv()
+    return rv.cdf(threshold)
+
+
+def _integrate_estimate_first_degenerate(one, two):
+    threshold = one.expected
+    rv = two._get_rv()
+    return 1 - rv.cdf(threshold)
+
+
+INTEGRATION_PRECISION = 0.001
+
+
+# \int_{-\inf}^{\inf}l(x)\int_{x}^{\inf}g(t)\, \textup{d}t\, \textup{d}x is in fact
+# \int_{-\inf}^{\inf}l(x)\, \textup{d}x - \int_{-\inf}^{\inf}l(x)G(x)\, \textup{d}x =
+# = 1 - \int_{-\inf}^{\inf}l(x)G(x)\, \textup{d}x where
+# G(x) is distribution function of the g randvar
+def _integrate_estimate(one, two):
+    # unlike upper bound case, PDF and CDF are zero below their respective lower bound
+    effective_lower_bound = max(one.source.optimistic, two.source.optimistic)
+    safe_upper_bound = max(one.source.pessimistic, two.source.pessimistic)
+    lower_rv = one._get_rv()
+    upper_rv = two._get_rv()
+    def integrand(x): return lower_rv.pdf(x) * upper_rv.cdf(x)
+    result = sp.integrate.quad(integrand, effective_lower_bound, safe_upper_bound, epsabs=INTEGRATION_PRECISION)
+    return 1 - result[0]
+
+
+def is_lower(dom, one, two):
+    dom = np.array(dom, dtype=float)
+    one = np.array(one, dtype=float)
+    one /= one.sum()
+    two = np.array(two, dtype=float)
+    two /= two.sum()
+    return _integrate(dom, one, two)
+
+
+def estimate_is_lower(one, two):
+    if one.sigma == 0 and two.sigma == 0:
+        return _integrate_estimate_both_degenerate(one, two)
+    if two.sigma == 0:
+        return _integrate_estimate_second_degenerate(one, two)
+    if one.sigma == 0:
+        return _integrate_estimate_first_degenerate(one, two)
+    return _integrate_estimate(one, two)
diff --git a/tests/test_statops_compare.py b/tests/test_statops_compare.py
@@ -0,0 +1,76 @@
+import estimage.statops.compare as tm
+import estimage.data as data
+
+import pytest
+
+
+def montecarlo_is_lower_test(one, two, samples=1000):
+    rvs_one, rvs_two = [est._get_rv().rvs(samples) for est in (one, two)]
+    return sum(rvs_one < rvs_two) / samples
+
+
+def integ_approx(x):
+    return pytest.approx(x, abs=tm.INTEGRATION_PRECISION)
+
+
+def test_compare_trivial():
+    assert tm.is_lower([0], [1], [1]) == 0.5
+    degenerate_estimate = data.Estimate(1, 0)
+    assert tm.estimate_is_lower(degenerate_estimate, degenerate_estimate) == 0.5
+
+
+def test_compare_equal():
+    assert tm.is_lower([0, 1], [1, 1], [1, 1]) == 0.5
+    assert tm.is_lower([0, 1, 2], [1, 1, 1], [1, 1, 1]) == 0.5
+    assert tm.is_lower([0, 1, 2, 3], [1, 1, 1, 1], [1, 1, 1, 1]) == 0.5
+    assert tm.is_lower([0, 1, 2], [0, 1, 0], [1, 1, 1]) == 0.5
+    simple_estimate = data.Estimate.from_triple(1, 0, 2)
+    assert tm.estimate_is_lower(simple_estimate, simple_estimate) == 0.5
+    concrete_estimate = data.Estimate.from_triple(2, 1, 3)
+    assert tm.estimate_is_lower(concrete_estimate, concrete_estimate) == 0.5
+    assert montecarlo_is_lower_test(concrete_estimate, concrete_estimate) == pytest.approx(0.5, abs=0.05)
+
+
+def test_compare_clear():
+    assert tm.is_lower([0, 1], [0, 1], [1, 0]) == 0
+    assert tm.is_lower([0, 1], [1, 0], [0, 1]) == 1
+    assert tm.estimate_is_lower(data.Estimate(1, 0), data.Estimate(0, 0)) == 0
+    assert tm.estimate_is_lower(data.Estimate(0, 0), data.Estimate(1, 0)) == 1
+    assert tm.is_lower([0, 1, 2, 3], [1, 1, 0, 0], [0, 0, 1, 1]) == 1
+    assert tm.is_lower([0, 1, 2, 3], [0, 0, 1, 1], [1, 1, 0, 0]) == 0
+    low_estimate = data.Estimate.from_triple(1, 0, 3)
+    high_estimate = data.Estimate.from_triple(4, 3, 6)
+    assert tm.estimate_is_lower(low_estimate, high_estimate) == integ_approx(1)
+    assert tm.estimate_is_lower(high_estimate, low_estimate) == integ_approx(0)
+
+
+def test_compare_overlapping():
+    low_estimate = data.Estimate.from_triple(1, 0, 3)
+    test_estimate = data.Estimate(1, 0)
+    assert 0 < tm.estimate_is_lower(low_estimate, test_estimate) < 0.5
+    assert 0.5 < tm.estimate_is_lower(test_estimate, low_estimate) < 1
+    almost_low_estimate = data.Estimate.from_triple(1.1, 0.1, 3.1)
+    assert 0.5 < tm.estimate_is_lower(low_estimate, almost_low_estimate) < 0.6
+    low_end_estimate = data.Estimate.from_triple(0.5, 0, 1)
+    assert 0.1 < tm.estimate_is_lower(low_estimate, low_end_estimate) < 0.2
+    rate = tm.estimate_is_lower(low_end_estimate, low_estimate)
+    assert 0.8 < rate < 0.9
+    assert montecarlo_is_lower_test(low_end_estimate, low_estimate) == pytest.approx(rate, abs=0.05)
+
+
+def test_compare_general():
+    one = data.Estimate.from_triple(2, 1, 3)
+    two = data.Estimate.from_triple(1.5, -2, 4)
+    rate = tm.estimate_is_lower(one, two)
+    assert montecarlo_is_lower_test(one, two, 4000) == pytest.approx(rate, abs=0.01)
+    one = data.Estimate.from_triple(2.5, 1, 3)
+    two = data.Estimate.from_triple(1.5, 1, 4)
+    rate = tm.estimate_is_lower(one, two)
+    assert montecarlo_is_lower_test(one, two) == pytest.approx(rate, abs=0.05)
+
+
+def test_compare_weighted():
+    assert 0.99 < tm.is_lower([0, 1, 2, 3], [1, 1, 0, 0], [0, 0.01, 1, 1]) < 1
+
+# TODO: Comparison of estimates
+# TODO: %-complete