@@ -16,13 +16,18 @@
 
 import importlib
 
+import pandas as pd
+import numpy as np
+from pyspark.ml.feature import Bucketizer
+from pyspark.sql import functions as F
 from pandas.core.base import PandasObject
+from pandas.core.dtypes.inference import is_integer
 
 from databricks.koalas.missing import unsupported_function
 from databricks.koalas.config import get_option
 
 
-class TopNPlot:
+class TopNPlotBase:
     def get_top_n(self, data):
         from databricks.koalas import DataFrame, Series
 
@@ -56,7 +61,7 @@ def set_result_text(self, ax):
         )
 
 
-class SampledPlot:
+class SampledPlotBase:
     def get_sampled(self, data):
         from databricks.koalas import DataFrame, Series
 
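For readers skimming the diff: `TopNPlotBase.get_top_n` and `SampledPlotBase.get_sampled` exist to cap how much data is collected to the driver before anything is drawn. A minimal sketch of that pattern follows; the `plotting.max_rows` option name and the function body are assumptions for illustration, not the code in this commit.

from databricks.koalas.config import get_option

def get_top_n_sketch(data):
    # Assumed option name; caps the rows pulled to the driver for plotting.
    max_rows = get_option("plotting.max_rows")
    # head() stays distributed; to_pandas() collects only the capped slice.
    return data.head(max_rows).to_pandas()
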
@@ -89,6 +94,170 @@ def set_result_text(self, ax):
         )
 
 
+class HistogramPlotBase:
+    @staticmethod
+    def prepare_hist_data(data, bins):
+        # TODO: this logic is the same as in KdePlot. Might have to deduplicate it.
+        from databricks.koalas.series import Series
+
+        if isinstance(data, Series):
+            data = data.to_frame()
+
+        numeric_data = data.select_dtypes(
+            include=["byte", "decimal", "integer", "float", "long", "double", np.datetime64]
+        )
+
+        # no empty frames or series allowed
+        if len(numeric_data.columns) == 0:
+            raise TypeError(
+                "Empty {0!r}: no numeric data to plot".format(numeric_data.__class__.__name__)
+            )
+
+        if is_integer(bins):
+            # computes boundaries for the columns
+            bins = HistogramPlotBase.get_bins(data.to_spark(), bins)
+
+        return numeric_data, bins
+
+    @staticmethod
+    def get_bins(sdf, bins):
+        # 'sdf' is a Spark DataFrame that selects all the columns to plot.
+        if len(sdf.columns) > 1:
+            min_col = F.least(*map(F.min, sdf))
+            max_col = F.greatest(*map(F.max, sdf))
+        else:
+            min_col = F.min(sdf.columns[-1])
+            max_col = F.max(sdf.columns[-1])
+        boundaries = sdf.select(min_col, max_col).first()
+
+        # divides the boundaries into bins
+        if boundaries[0] == boundaries[1]:
+            boundaries = (boundaries[0] - 0.5, boundaries[1] + 0.5)
+
+        return np.linspace(boundaries[0], boundaries[1], bins + 1)
+
+    @staticmethod
+    def compute_hist(kdf, bins):
+        # 'kdf' is a Koalas DataFrame; one histogram is computed per column.
+        assert isinstance(bins, (np.ndarray, np.generic))
+
+        sdf = kdf._internal.spark_frame
+        scols = []
+        for label in kdf._internal.column_labels:
+            scols.append(kdf._internal.spark_column_for(label))
+        sdf = sdf.select(*scols)
+
+        # 1. Make the bucket output flat to:
+        #    +----------+--------+
+        #    |__group_id|__bucket|
+        #    +----------+--------+
+        #    |0         |0.0     |
+        #    |0         |0.0     |
+        #    |0         |1.0     |
+        #    |0         |2.0     |
+        #    |0         |3.0     |
+        #    |0         |3.0     |
+        #    |1         |0.0     |
+        #    |1         |1.0     |
+        #    |1         |1.0     |
+        #    |1         |2.0     |
+        #    |1         |1.0     |
+        #    |1         |0.0     |
+        #    +----------+--------+
+        colnames = sdf.columns
+        bucket_names = ["__{}_bucket".format(colname) for colname in colnames]
+
+        output_df = None
+        for group_id, (colname, bucket_name) in enumerate(zip(colnames, bucket_names)):
+            # creates a Bucketizer to get corresponding bin of each value
+            bucketizer = Bucketizer(
+                splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip"
+            )
+
+            bucket_df = bucketizer.transform(sdf)
+
+            if output_df is None:
+                output_df = bucket_df.select(
+                    F.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket")
+                )
+            else:
+                output_df = output_df.union(
+                    bucket_df.select(
+                        F.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket")
+                    )
+                )
+
+        # 2. Calculate the count based on each group and bucket.
+        #    +----------+--------+------+
+        #    |__group_id|__bucket| count|
+        #    +----------+--------+------+
+        #    |0         |0.0     |2     |
+        #    |0         |1.0     |1     |
+        #    |0         |2.0     |1     |
+        #    |0         |3.0     |2     |
+        #    |1         |0.0     |2     |
+        #    |1         |1.0     |3     |
+        #    |1         |2.0     |1     |
+        #    +----------+--------+------+
+        result = (
+            output_df.groupby("__group_id", "__bucket")
+            .agg(F.count("*").alias("count"))
+            .toPandas()
+            .sort_values(by=["__group_id", "__bucket"])
+        )
+
+        # 3. Fill empty bins and calculate based on each group id. From:
+        #    +----------+--------+------+
+        #    |__group_id|__bucket| count|
+        #    +----------+--------+------+
+        #    |0         |0.0     |2     |
+        #    |0         |1.0     |1     |
+        #    |0         |2.0     |1     |
+        #    |0         |3.0     |2     |
+        #    +----------+--------+------+
+        #    +----------+--------+------+
+        #    |__group_id|__bucket| count|
+        #    +----------+--------+------+
+        #    |1         |0.0     |2     |
+        #    |1         |1.0     |3     |
+        #    |1         |2.0     |1     |
+        #    +----------+--------+------+
+        #
+        # to:
+        #    +----------------+
+        #    |__values1_bucket|
+        #    +----------------+
+        #    |2               |
+        #    |1               |
+        #    |1               |
+        #    |2               |
+        #    |0               |
+        #    +----------------+
+        #    +----------------+
+        #    |__values2_bucket|
+        #    +----------------+
+        #    |2               |
+        #    |3               |
+        #    |1               |
+        #    |0               |
+        #    |0               |
+        #    +----------------+
+        output_series = []
+        for i, bucket_name in enumerate(bucket_names):
+            current_bucket_result = result[result["__group_id"] == i]
+            # generates a pandas DF with one row for each bin
+            # we need this as some of the bins may be empty
+            indexes = pd.DataFrame({"__bucket": np.arange(0, len(bins) - 1)})
+            # merges the bins with counts on it and fills remaining ones with zeros
+            pdf = indexes.merge(current_bucket_result, how="left", on=["__bucket"]).fillna(0)[
+                ["count"]
+            ]
+            pdf.columns = [bucket_name]
+            output_series.append(pdf[bucket_name])
+
+        return output_series
+
+
 class KoalasPlotAccessor(PandasObject):
     """
     Series/Frames plotting accessor and method.
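The three numbered steps in `compute_hist` above can be reproduced on a toy single-column frame. The sketch below is an illustration, not the commit's code: the local `SparkSession`, the column name `values1`, and the hand-picked bin edges are assumptions; the API calls (`Bucketizer`, `groupby`/`count`, pandas `merge`/`fillna`) are the same ones the diff uses.

import numpy as np
import pandas as pd
from pyspark.ml.feature import Bucketizer
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
sdf = spark.createDataFrame(
    [(v,) for v in [0.2, 0.8, 1.5, 2.5, 3.2, 3.9]], ["values1"]
)
bins = np.linspace(0.0, 4.0, 5)  # 4 buckets, as get_bins would produce

# Step 1: map each value to its bucket index; invalid rows are skipped.
bucketizer = Bucketizer(
    splits=list(bins), inputCol="values1", outputCol="__values1_bucket", handleInvalid="skip"
)
bucketed = bucketizer.transform(sdf)

# Step 2: count per bucket on the cluster; collect only the small result.
counts = (
    bucketed.groupby("__values1_bucket")
    .agg(F.count("*").alias("count"))
    .toPandas()
    .rename(columns={"__values1_bucket": "__bucket"})
)

# Step 3: left-join against the full bucket range so empty buckets become 0.
indexes = pd.DataFrame({"__bucket": np.arange(0, len(bins) - 1, dtype="float64")})
hist = indexes.merge(counts, how="left", on="__bucket").fillna(0)["count"]
print(hist.tolist())  # expected: [2, 1, 1, 2]

The design choice this illustrates: only per-bucket counts ever cross to the driver, so the driver-side work scales with the number of bins rather than the number of rows.
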
@@ -188,12 +357,12 @@ def __call__(self, kind="line", backend=None, **kwargs):
 
         if plot_backend.__name__ != "databricks.koalas.plot":
             data_preprocessor_map = {
-                "pie": TopNPlot().get_top_n,
-                "bar": TopNPlot().get_top_n,
-                "barh": TopNPlot().get_top_n,
-                "scatter": TopNPlot().get_top_n,
-                "area": SampledPlot().get_sampled,
-                "line": SampledPlot().get_sampled,
+                "pie": TopNPlotBase().get_top_n,
+                "bar": TopNPlotBase().get_top_n,
+                "barh": TopNPlotBase().get_top_n,
+                "scatter": TopNPlotBase().get_top_n,
+                "area": SampledPlotBase().get_sampled,
+                "line": SampledPlotBase().get_sampled,
             }
             if not data_preprocessor_map[kind]:
                 raise NotImplementedError(
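Taken together, the renames make the role of these classes explicit: they are mixin-style bases that preprocess data before any backend draws it, and `KoalasPlotAccessor.__call__` consults the map above when delegating to a non-default backend. A hypothetical usage sketch, with the data and `bins` value assumed for the example:

import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1.0, 1.2, 2.5, 2.7, 3.1, 3.3]})

# Routed through KoalasPlotAccessor.__call__; with a third-party backend,
# "bar" first passes the data through TopNPlotBase().get_top_n.
kdf.plot(kind="bar")

# The histogram path goes through HistogramPlotBase.prepare_hist_data
# and compute_hist shown above.
kdf.a.plot.hist(bins=4)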