Skip to content

Commit

Permalink
Handle `[^\r\n]` and `[^\\r\\n]` cases
Browse files Browse the repository at this point in the history
  • Loading branch information
andygrove committed Dec 4, 2021
1 parent ef4434c commit 57e9425
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 15 deletions.
37 changes: 24 additions & 13 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
Original file line number Diff line number Diff line change
Expand Up @@ -169,11 +169,9 @@ class RegexParser(pattern: String) {
throw new RegexUnsupportedException(
s"unexpected EOF while parsing escaped character", Some(pos))
case Some(ch) =>
ch match {
case '\\' | '^' | '-' | ']' | '+' =>
// escaped metacharacter within character class
characterClass.appendEscaped(consumeExpected(ch))
}
// typically an escaped metacharacter ('\\' | '^' | '-' | ']' | '+')
// within the character class, but could be any escaped character
characterClass.appendEscaped(consumeExpected(ch))
}
case '\u0000' =>
throw new RegexUnsupportedException(
Expand Down Expand Up @@ -508,23 +506,36 @@ class CudfRegexTranspiler(replace: Boolean) {
// `[^a\r]` => `(?:[\n]|[^a])`
// `[^a\n]` => `(?:[\r]|[^a])`
// `[^a\r\n]` => `[^a]`
// `[^\r\n]` => `[^\r\n]`

val newlineCharsInClass = characters.flatMap {
case RegexChar(ch) if ch == '\n' || ch == '\r' =>
Seq(ch)
case _ =>
Seq.empty
val allLinefeed = components.forall {
case RegexChar(ch) => ch == '\n' || ch == '\r'
case RegexEscaped(ch) => ch == 'n' || ch == 'r'
case _ => false
}

val newlineCharsInClass = components.flatMap {
case RegexChar(ch) if ch == '\n' || ch == '\r' => Seq(ch)
case RegexEscaped(ch) if ch == 'n' => Seq('\n')
case RegexEscaped(ch) if ch == 'r' => Seq('\r')
case _ => Seq.empty
}.distinct

val negatedNewlines = Seq('\r', '\n').diff(newlineCharsInClass)
if (negatedNewlines.isEmpty) {
RegexCharacterClass(negated, ListBuffer(components: _*))

if (allLinefeed && newlineCharsInClass.length == 2) {
// special case for `[^\r\n]`
RegexCharacterClass(negated = true, ListBuffer(components: _*))
} else if (negatedNewlines.isEmpty) {
RegexCharacterClass(negated = true, ListBuffer(components: _*))
} else {
RegexGroup(capture = false,
RegexChoice(
RegexCharacterClass(negated = false,
characters = ListBuffer(negatedNewlines.map(RegexChar): _*)),
RegexCharacterClass(negated, ListBuffer(components: _*))))
RegexCharacterClass(negated = true, ListBuffer(components: _*))))
}

} else {
RegexCharacterClass(negated, ListBuffer(components: _*))
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -249,8 +249,9 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
}

test("compare CPU and GPU: regexp replace negated character class") {
val inputs = Seq("a", "b", "a\nb")
val patterns = Seq("[^z]")
val inputs = Seq("a", "b", "a\nb", "a\r\nb\n\rc\rd")
val patterns = Seq("[^z]", "[^\r]", "[^\n]", "[^\r]", "[^\r\n]",
"[^a\n]", "[^b\r]", "[^bc\r\n]", "[^\\r\\n]")
assertCpuGpuMatchesRegexpReplace(patterns, inputs)
}

Expand Down

0 comments on commit 57e9425

Please sign in to comment.