set up cov with tests

databricks · lopez- · Jun 30, 2020 · Jul 20, 2020 · Jul 20, 2020 · Jun 30, 2020
commit cd9c1bee8ac533607d35ea6782dd891513fb6485
diff --git a/databricks/koalas/missing/series.py b/databricks/koalas/missing/series.py
@@ -42,7 +42,6 @@ class MissingPandasLikeSeries(object):
     autocorr = _unsupported_function("autocorr")
     between_time = _unsupported_function("between_time")
     combine = _unsupported_function("combine")
-    cov = _unsupported_function("cov")
     droplevel = _unsupported_function("droplevel")
     ewm = _unsupported_function("ewm")
     factorize = _unsupported_function("factorize")

diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py
@@ -4858,6 +4858,54 @@ def mad(self):
 
         return mad
 
+    def cov(self, other: "Series", min_periods: Optional[int] = None) -> float:
+        """
+        Return the covariance between two series.
+
+        Parameters
+        ----------
+        other : Series
+        min_periods : int, optional
+
+        Examples
+        --------
+        >>> s1 = ks.Series([1, 2, 3, 4])
+        >>> s2 = ks.Series([5, 6, 7, 8])
+        >>> s1
+        0    1
+        1    2
+        2    3
+        3    4
+        Name: 0, dtype: int64
+
+        >>> s2
+        0    5
+        1    6
+        2    7
+        3    8
+        Name: 0, dtype: int64
+
+        >>> s1.cov(s2)
+        1.666666...
+        """
+
+        if not isinstance(other, Series):
+            raise ValueError("'other' must be a Series")
+
+        if len(self.index) != len(other.index):
+            raise ValueError("series are not aligned")
+
+        min_periods = 0 if min_periods is None else min_periods
+        if len(self.index) < min_periods or len(self.index) <= 1:
+            return np.nan
+
+        if same_anchor(self, other):
+            # if they have the same anchor, use the more performant Spark native `cov`
+            return self._internal.spark_frame.cov(self.name, other.name)
+        else:
+            # if not on the same anchor, calculate the covariance manually
+            return (self - self.mean()).dot(other - other.mean()) / (len(self.index) - 1)
+
     def unstack(self, level=-1):
         """
         Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.

diff --git a/databricks/koalas/tests/test_ops_on_diff_frames.py b/databricks/koalas/tests/test_ops_on_diff_frames.py
@@ -948,6 +948,32 @@ def test_series_repeat(self):
         else:
             self.assert_eq(kser1.repeat(kser2).sort_index(), pser1.repeat(pser2).sort_index())
 
+    def test_cov(self):
+        kser = ks.Series([90, 91, 85])
+        pser = kser.to_pandas()
+        kser_other = ks.Series([90, 91, 85])
+        pser_other = kser_other.to_pandas()
+
+        self.assert_eq(kser.cov(kser_other), pser.cov(pser_other), almost=True)
+
+        kser = ks.Series([90])
+        pser = kser.to_pandas()
+        kser_other = ks.Series([85])
+        pser_other = kser_other.to_pandas()
+
+        k_isnan = np.isnan(kser.cov(kser_other))
+        p_isnan = np.isnan(pser.cov(pser_other))
+        self.assert_eq(k_isnan, p_isnan)
+
+        kser = ks.Series([90, 91, 85])
+        pser = kser.to_pandas()
+        kser_other = ks.Series([90, 91, 85])
+        pser_other = kser_other.to_pandas()
+
+        k_isnan = np.isnan(kser.cov(kser_other, 4))
+        p_isnan = np.isnan(pser.cov(pser_other, 4))
+        self.assert_eq(k_isnan, p_isnan)
+
 
 class OpsOnDiffFramesDisabledTest(ReusedSQLTestCase, SQLTestUtils):
     @classmethod

diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py
@@ -1787,3 +1787,25 @@ def test_ffill(self):
         kser.ffill(inplace=True)
         pser.ffill(inplace=True)
         self.assert_eq(repr(kser), repr(pser))
+
+    def test_cov(self):
+        kdf = ks.DataFrame({"A": [90, 91, 85], "B": [90, 91, 85]}, columns=["A", "B"])
+        pdf = kdf.to_pandas()
+
+        self.assert_eq(kdf.A.cov(kdf.B), pdf.A.cov(pdf.B), almost=True)
+
+        kdf = ks.DataFrame({"A": [90], "B": [90]}, columns=["A", "B"])
+        pdf = kdf.to_pandas()
+
+        k_cov = kdf.A.cov(kdf.B)
+        p_cov = pdf.A.cov(pdf.B)
+
+        self.assert_eq(np.isnan(k_cov), np.isnan(p_cov))
+
+        kdf = ks.DataFrame({"A": [90, 91, 85], "B": [90, 91, 85]}, columns=["A", "B"])
+        pdf = kdf.to_pandas()
+
+        k_cov = kdf.A.cov(kdf.B, 4)
+        p_cov = pdf.A.cov(pdf.B, 4)
+
+        self.assert_eq(np.isnan(k_cov), np.isnan(p_cov))