From 97ca886d50cc17f72296871088dbe33fd50b0c06 Mon Sep 17 00:00:00 2001 From: NVnavkumar <97137715+NVnavkumar@users.noreply.github.com> Date: Fri, 4 Mar 2022 13:04:28 -0800 Subject: [PATCH] Enable fuzz testing for Regular Expression repetitions and move remaining edge cases to CPU (#4885) * Handle remaining issues with repetitions and enable repetition fuzz tests Signed-off-by: Navin Kumar * Add comment references to new issues created for string_split Signed-off-by: Navin Kumar --- .../com/nvidia/spark/rapids/RegexParser.scala | 22 ++++++++++--- .../RegularExpressionTranspilerSuite.scala | 32 +++++++++++-------- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index bae02e5af10..2589bf5c897 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -708,15 +708,29 @@ class CudfRegexTranspiler(mode: RegexMode) { throw new RegexUnsupportedException( "regexp_replace on GPU does not support repetition with ? or *") - case (_, QuantifierVariableLength(0,None)) if mode == RegexReplaceMode => + case (_, SimpleQuantifier(ch)) if mode == RegexSplitMode && "?*".contains(ch) => + // example: pattern " ?", input "] b[", replace with "X": + // java: X]XXbX[X + // cuDF: XXXX] b[ + // see https://github.com/NVIDIA/spark-rapids/issues/4884 + throw new RegexUnsupportedException( + "regexp_split on GPU does not support repetition with ? or * consistently with Spark") + + case (_, QuantifierVariableLength(0, _)) if mode == RegexReplaceMode => // see https://github.com/NVIDIA/spark-rapids/issues/4468 throw new RegexUnsupportedException( - "regexp_replace on GPU does not support repetition with {0,}") + "regexp_replace on GPU does not support repetition with {0,} or {0,n}") + + case (_, QuantifierVariableLength(0, _)) if mode == RegexSplitMode => + // see https://github.com/NVIDIA/spark-rapids/issues/4884 + throw new RegexUnsupportedException( + "regexp_split on GPU does not support repetition with {0,} or {0,n} " + + "consistently with Spark") - case (_, QuantifierFixedLength(0)) | (_, QuantifierVariableLength(0, Some(0))) + case (_, QuantifierFixedLength(0)) if mode != RegexFindMode => throw new RegexUnsupportedException( - "regex_replace and regex_split on GPU do not support repetition with {0} or {0,0}") + "regex_replace and regex_split on GPU do not support repetition with {0}") case (RegexGroup(_, term), SimpleQuantifier(ch)) if "+*".contains(ch) && !isSupportedRepetitionBase(term) => diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index fa72edff2fa..f1e2c0ca3ff 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -122,7 +122,6 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } test("cuDF does not support single repetition both inside and outside of capture groups") { - // see https://github.com/NVIDIA/spark-rapids/issues/4487 val patterns = Seq("(3?)+", "(3?)*", "(3*)+", "((3?))+") patterns.foreach(pattern => assertUnsupported(pattern, RegexFindMode, "nothing to repeat")) @@ -328,6 +327,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } test("regexp_replace - character class repetition - ? and * - fall back to CPU") { + // see https://github.com/NVIDIA/spark-rapids/issues/4468 val patterns = Seq(raw"[1a-zA-Z]?", raw"[1a-zA-Z]*") patterns.foreach(pattern => assertUnsupported(pattern, RegexReplaceMode, @@ -336,28 +336,34 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { ) } - test("regexp_replace - character class repetition - {0,} - fall back to CPU") { - val patterns = Seq(raw"[1a-zA-Z]{0,}") + test("regexp_replace - character class repetition - {0,} or {0,n} - fall back to CPU") { + // see https://github.com/NVIDIA/spark-rapids/issues/4468 + val patterns = Seq(raw"[1a-zA-Z]{0,}", raw"[1a-zA-Z]{0,2}") patterns.foreach(pattern => assertUnsupported(pattern, RegexReplaceMode, - "regexp_replace on GPU does not support repetition with {0,}" + "regexp_replace on GPU does not support repetition with {0,} or {0,n}" ) ) } - test("regexp_replace - fall back to CPU for {0} or {0,0}") { - val patterns = Seq("a{0}", raw"\02{0}", "a{0,0}", raw"\02{0,0}") + test("regexp_split - character class repetition - ? and * - fall back to CPU") { + // see https://github.com/NVIDIA/spark-rapids/issues/4884 + val patterns = Seq(raw"[1a-zA-Z]?", raw"[1a-zA-Z]*") patterns.foreach(pattern => - assertUnsupported(pattern, RegexReplaceMode, - "regex_replace and regex_split on GPU do not support repetition with {0} or {0,0}") + assertUnsupported(pattern, RegexSplitMode, + "regexp_split on GPU does not support repetition with ? or * " + + "consistently with Spark" + ) ) } - test("regexp_split - fall back to CPU for {0} or {0,0}") { - val patterns = Seq("a{0}", raw"\02{0}", "a{0,0}", raw"\02{0,0}") + test("regexp_split - fall back to CPU for {0,n}, or {0,}") { + // see https://github.com/NVIDIA/spark-rapids/issues/4884 + val patterns = Seq("a{0,}", raw"\02{0,}", "a{0,2}", raw"\02{0,10}") patterns.foreach(pattern => assertUnsupported(pattern, RegexSplitMode, - "regex_replace and regex_split on GPU do not support repetition with {0} or {0,0}") + "regexp_split on GPU does not support repetition with {0,} or {0,n} " + + "consistently with Spark") ) } @@ -754,12 +760,12 @@ class FuzzRegExp(suggestedChars: String, skipKnownIssues: Boolean = true) { () => predefinedCharacterClass, () => group(depth), () => boundaryMatch, - () => sequence(depth)) + () => sequence(depth), + () => repetition(depth)) val generators = if (skipKnownIssues) { baseGenerators } else { baseGenerators ++ Seq( - () => repetition(depth), // https://github.com/NVIDIA/spark-rapids/issues/4487 () => choice(depth)) // https://github.com/NVIDIA/spark-rapids/issues/4603 } generators(rr.nextInt(generators.length))()