From bf99d938d102bc6b14f6b01f609acda77e3c2967 Mon Sep 17 00:00:00 2001
From: Diego Medina
Date: Wed, 1 Jun 2022 20:18:20 -0300
Subject: [PATCH 1/2] fix: Box Plot Chart throws an error when the average
 (AVG) / SUM is being calculated on the Metrics

Coerce object-dtype columns to numeric before aggregating: nanpercentile
relies on isnan, which fails on non-numeric values. Values that cannot be
parsed become NaN (errors="coerce").

The dtype check compares against the builtin `object`; the `np.object`
alias is deprecated since NumPy 1.20 and removed in NumPy 1.24.
---
 superset/utils/pandas_postprocessing/boxplot.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/superset/utils/pandas_postprocessing/boxplot.py b/superset/utils/pandas_postprocessing/boxplot.py
index 4436af9182c0f..1af9de4e2c8c3 100644
--- a/superset/utils/pandas_postprocessing/boxplot.py
+++ b/superset/utils/pandas_postprocessing/boxplot.py
@@ -18,7 +18,7 @@
 
 import numpy as np
 from flask_babel import gettext as _
-from pandas import DataFrame, Series
+from pandas import DataFrame, Series, to_numeric
 
 from superset.exceptions import InvalidPostProcessingError
 from superset.utils.core import PostProcessingBoxplotWhiskerType
@@ -122,4 +122,11 @@ def outliers(series: Series) -> Set[float]:
         for operator_name, operator in operators.items()
         for metric in metrics
     }
+
+    # nanpercentile needs numeric values, otherwise the isnan function
+    # that's used in the underlying function will fail
+    for column in df:
+        if df.dtypes[column] == object:
+            df[column] = to_numeric(df[column], errors="coerce")
+
     return aggregate(df, groupby=groupby, aggregates=aggregates)

From d46bd784396abc4ca89778981da593159659b129 Mon Sep 17 00:00:00 2001
From: Diego Medina
Date: Wed, 1 Jun 2022 21:11:41 -0300
Subject: [PATCH 2/2] add test

Restrict the numeric coercion to the metric columns instead of every
column of the frame, and add a unit test that runs boxplot on a metric
column stored as strings.

The test works on a copy of the shared `names_df` fixture so the string
conversion does not leak into other tests.
---
 .../utils/pandas_postprocessing/boxplot.py    |  2 +-
 .../pandas_postprocessing/test_boxplot.py     | 25 +++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/superset/utils/pandas_postprocessing/boxplot.py b/superset/utils/pandas_postprocessing/boxplot.py
index 1af9de4e2c8c3..40ce9200d358e 100644
--- a/superset/utils/pandas_postprocessing/boxplot.py
+++ b/superset/utils/pandas_postprocessing/boxplot.py
@@ -125,7 +125,7 @@ def outliers(series: Series) -> Set[float]:
 
     # nanpercentile needs numeric values, otherwise the isnan function
     # that's used in the underlying function will fail
-    for column in df:
+    for column in metrics:
         if df.dtypes[column] == object:
             df[column] = to_numeric(df[column], errors="coerce")
 
diff --git a/tests/unit_tests/pandas_postprocessing/test_boxplot.py b/tests/unit_tests/pandas_postprocessing/test_boxplot.py
index 9252b0da78846..27dff0adeb894 100644
--- a/tests/unit_tests/pandas_postprocessing/test_boxplot.py
+++ b/tests/unit_tests/pandas_postprocessing/test_boxplot.py
@@ -124,3 +124,28 @@ def test_boxplot_percentile_incorrect_params():
             metrics=["cars"],
             percentiles=[10, 90, 10],
         )
+
+
+def test_boxplot_type_coercion():
+    df = names_df.copy()
+    df["cars"] = df["cars"].astype(str)
+    df = boxplot(
+        df=df,
+        groupby=["region"],
+        whisker_type=PostProcessingBoxplotWhiskerType.TUKEY,
+        metrics=["cars"],
+    )
+
+    columns = {column for column in df.columns}
+    assert columns == {
+        "cars__mean",
+        "cars__median",
+        "cars__q1",
+        "cars__q3",
+        "cars__max",
+        "cars__min",
+        "cars__count",
+        "cars__outliers",
+        "region",
+    }
+    assert len(df) == 4