diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 592c8ef..87159dd 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -12,10 +12,6 @@ jobs: strategy: matrix: include: - - scala-version: 2.11.8 - spark-version: 2.3.0 - - scala-version: 2.11.8 - spark-version: 2.4.3 - scala-version: 2.12.11 spark-version: 3.0.0 - scala-version: 2.12.11 diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 581487c..2c83c83 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -12,10 +12,6 @@ jobs: strategy: matrix: include: - - scala-version: 2.11.8 - spark-version: 2.3.0 - - scala-version: 2.11.8 - spark-version: 2.4.3 - scala-version: 2.12.11 spark-version: 3.0.0 - scala-version: 2.12.11 diff --git a/python/setup.py b/python/setup.py index e26acea..8a858d9 100644 --- a/python/setup.py +++ b/python/setup.py @@ -18,7 +18,7 @@ long_description=long_description, long_description_content_type="text/markdown", python_requires=">=3.6", - install_requires=["pyspark>=2.3.0", "numpy"], + install_requires=["pyspark>=3.0.0", "numpy"], tests_require=["pytest"], project_urls={ "Source code": "https://github.com/Salmon-Brain/dead-salmon-brain/tree/main/python", diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BaseStatisticTransformer.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BaseStatisticTransformer.scala index d9997c2..48f37ff 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BaseStatisticTransformer.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/BaseStatisticTransformer.scala @@ -35,12 +35,11 @@ trait BaseStatisticTransformer override def transformSchema(schema: StructType): StructType = outputSchema - def srm(controlSize: Int, treatmentSize: Int, alpha: Double): Boolean = { + def srm(controlSize: Int, treatmentSize: Int): Double = { val uniform = (treatmentSize + controlSize).toDouble / 2 TestUtils.chiSquareTest( 
Array(uniform, uniform), Array(controlSize, treatmentSize), - alpha ) } diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/ExpData.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/ExpData.scala index c9cab85..1205eee 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/ExpData.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/ExpData.scala @@ -19,6 +19,8 @@ case class StatisticsReport( beta: Double, minValidSampleSize: Int, srm: Boolean, + srmAlpha: Double, + pValueSrm: Double, controlSize: Long, treatmentSize: Long, testType: String, diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyStatisticsTransformer.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyStatisticsTransformer.scala index 1e05a63..9bb318f 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyStatisticsTransformer.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/MannWhitneyStatisticsTransformer.scala @@ -70,15 +70,17 @@ class MannWhitneyStatisticsTransformer(override val uid: String) extends BaseSta beta, useLinearApproximationForVariance ), - srm(controlSize, treatmentSize, $(srmAlpha)) + srm(controlSize, treatmentSize) ) - else (getInvalidStatResult(CentralTendency.MEDIAN), false) + else (getInvalidStatResult(CentralTendency.MEDIAN), -1d) StatisticsReport( statResult, alpha, beta, minValidSampleSize, + /* -1d marks not-enough-data and must never be reported as SRM */ srmResult >= 0 && srmResult < $(srmAlpha), + $(srmAlpha), srmResult, controlSize, treatmentSize, diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/OutlierRemoveTransformer.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/OutlierRemoveTransformer.scala index fd452b8..65bd9fc 100644 --- a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/OutlierRemoveTransformer.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/OutlierRemoveTransformer.scala @@ -39,15 +39,34 @@ class OutlierRemoveTransformer(override val uid: String) 
set(upperPercentile, value) override def transform(dataset: Dataset[_]): DataFrame = { - assert($(lowerPercentile) > 0 && $(lowerPercentile) < 1, "lowerPercentile must be in (0, 1)") assert($(upperPercentile) > 0 && $(upperPercentile) < 1, "upperPercentile must be in (0, 1)") assert( $(upperPercentile) > $(lowerPercentile), "upperPercentile must be greater than lowerPercentile" ) + val isDisabledLower = $(lowerPercentile) <= 0 + import dataset.sparkSession.implicits._ + val aggFunc = Seq( + callUDF("percentile_approx", col($(valueColumn)), lit($(upperPercentile))) as "rightBound" + ) ++ + (if (isDisabledLower) Seq() + else + Seq( + callUDF( + "percentile_approx", + col($(valueColumn)), + lit($(lowerPercentile)) + ) as "leftBound" + )) + val filterFunc = + if (isDisabledLower) col($(valueColumn)) < $"rightBound" + else col($(valueColumn)) > $"leftBound" && col($(valueColumn)) < $"rightBound" + + val dropCols = if (isDisabledLower) Seq("rightBound") else Seq("rightBound", "leftBound") + val columns = Seq( $(variantColumn), $(experimentColumn), @@ -59,14 +78,14 @@ class OutlierRemoveTransformer(override val uid: String) val percentilesBound = dataset .groupBy(columns.head, columns: _*) .agg( - callUDF("percentile_approx", col($(valueColumn)), lit($(lowerPercentile))) as "leftBound", - callUDF("percentile_approx", col($(valueColumn)), lit($(upperPercentile))) as "rightBound" + aggFunc.head, + aggFunc.tail: _* ) dataset .join(broadcast(percentilesBound), columns) - .filter(col($(valueColumn)) > $"leftBound" && col($(valueColumn)) < $"rightBound") - .drop("leftBound", "rightBound") + .filter(filterFunc) + .drop(dropCols: _*) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) diff --git a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/WelchStatisticsTransformer.scala b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/WelchStatisticsTransformer.scala index 7df5b56..10c3172 100644 --- 
a/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/WelchStatisticsTransformer.scala +++ b/ruleofthumb/src/main/scala/ai/salmonbrain/ruleofthumb/WelchStatisticsTransformer.scala @@ -52,14 +52,17 @@ class WelchStatisticsTransformer(override val uid: String) extends BaseStatistic if (isEnoughData) ( WelchTTest.welchTTest(control, treatment, alpha, beta), - srm(controlSize.toInt, treatmentSize.toInt, $(srmAlpha)) + srm(controlSize.toInt, treatmentSize.toInt) ) - else (getInvalidStatResult(CentralTendency.MEAN), false) + else (getInvalidStatResult(CentralTendency.MEAN), -1d) + StatisticsReport( statResult, alpha, beta, minValidSampleSize, + /* -1d marks not-enough-data and must never be reported as SRM */ srmResult >= 0 && srmResult < $(srmAlpha), + $(srmAlpha), srmResult, controlSize, treatmentSize, diff --git a/ruleofthumb/src/test/scala/ai/salmonbrain/ruleofthumb/OutlierRemoveTransformerSpec.scala b/ruleofthumb/src/test/scala/ai/salmonbrain/ruleofthumb/OutlierRemoveTransformerSpec.scala index da53668..a070223 100644 --- a/ruleofthumb/src/test/scala/ai/salmonbrain/ruleofthumb/OutlierRemoveTransformerSpec.scala +++ b/ruleofthumb/src/test/scala/ai/salmonbrain/ruleofthumb/OutlierRemoveTransformerSpec.scala @@ -11,4 +11,10 @@ class OutlierRemoveTransformerSpec extends AnyFlatSpec with SparkHelper with Mat val clearData = new OutlierRemoveTransformer().transform(data) assert(clearData.count() == 26) } + + "OutlierRemoveTransformerSpec with 0 lower percentile" should "be" in { + val data = generateDataForWelchTest() + val clearData = new OutlierRemoveTransformer().setLowerPercentile(0).transform(data) + assert(clearData.count() == 28) + } }