From f468733b89f80431dbbd7ac0558a800988e3c34e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20Ili=C4=87?= Date: Wed, 28 Aug 2024 17:40:33 +0200 Subject: [PATCH] [Kernel] Change string comparing from UTF16 to UTF8 (#3611) ## Description Changed string comparing from UTF16 to UTF8. This fixes comparison issues around the characters with surrogate pairs. ## How was this patch tested? Tests added to `DefaultExpressionEvaluatorSuite.scala` --- .../expressions/DefaultExpressionUtils.java | 15 ++++- .../DefaultExpressionEvaluatorSuite.scala | 63 +++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/expressions/DefaultExpressionUtils.java b/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/expressions/DefaultExpressionUtils.java index e2fddee5b19..86e00d453e0 100644 --- a/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/expressions/DefaultExpressionUtils.java +++ b/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/expressions/DefaultExpressionUtils.java @@ -24,6 +24,7 @@ import io.delta.kernel.internal.util.Utils; import io.delta.kernel.types.*; import java.math.BigDecimal; +import java.nio.charset.StandardCharsets; import java.util.Comparator; import java.util.List; import java.util.function.Function; @@ -34,7 +35,19 @@ class DefaultExpressionUtils { static final Comparator BIGDECIMAL_COMPARATOR = Comparator.naturalOrder(); - static final Comparator STRING_COMPARATOR = Comparator.naturalOrder(); + static final Comparator STRING_COMPARATOR = + (leftOp, rightOp) -> { + byte[] leftBytes = leftOp.getBytes(StandardCharsets.UTF_8); + byte[] rightBytes = rightOp.getBytes(StandardCharsets.UTF_8); + int i = 0; + while (i < leftBytes.length && i < rightBytes.length) { + if (leftBytes[i] != rightBytes[i]) { + return Byte.toUnsignedInt(leftBytes[i]) - Byte.toUnsignedInt(rightBytes[i]); + } + i++; + } + return Integer.compare(leftBytes.length, rightBytes.length); + }; static final Comparator BINARY_COMPARTOR = (leftOp, rightOp) -> { int i = 0; diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/expressions/DefaultExpressionEvaluatorSuite.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/expressions/DefaultExpressionEvaluatorSuite.scala index d0544b88fdd..44ef0aca9a8 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/expressions/DefaultExpressionEvaluatorSuite.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/expressions/DefaultExpressionEvaluatorSuite.scala @@ -583,6 +583,9 @@ class DefaultExpressionEvaluatorSuite extends AnyFunSuite with ExpressionSuiteBa } test("evaluate expression: comparators (=, <, <=, >, >=)") { + val ASCII_MAX_CHARACTER = '\u007F' + val UTF8_MAX_CHARACTER = new String(Character.toChars(Character.MAX_CODE_POINT)) + // Literals for each data type from the data type value range, used as inputs to comparator // (small, big, small, null) val literals = Seq( @@ -607,6 +610,66 @@ class DefaultExpressionEvaluatorSuite extends AnyFunSuite with ExpressionSuiteBa ), (ofDate(-12123), ofDate(123123), ofDate(-12123), ofNull(DateType.DATE)), (ofString("apples"), ofString("oranges"), ofString("apples"), ofNull(StringType.STRING)), + (ofString(""), ofString("a"), ofString(""), ofNull(StringType.STRING)), + (ofString("abc"), ofString("abc0"), ofString("abc"), ofNull(StringType.STRING)), + (ofString("abc"), ofString("abcd"), ofString("abc"), ofNull(StringType.STRING)), + (ofString("abc"), ofString("abd"), ofString("abc"), ofNull(StringType.STRING)), + ( + ofString("Abcabcabc"), + ofString("aBcabcabc"), + ofString("Abcabcabc"), + ofNull(StringType.STRING) + ), + ( + ofString("abcabcabC"), + ofString("abcabcabc"), + ofString("abcabcabC"), + ofNull(StringType.STRING) + ), + // scalastyle:off nonascii + (ofString("abc"), ofString("世界"), ofString("abc"), ofNull(StringType.STRING)), + (ofString("世界"), ofString("你好"), ofString("世界"), ofNull(StringType.STRING)), + (ofString("你好122"), ofString("你好123"), ofString("你好122"), ofNull(StringType.STRING)), + (ofString("A"), ofString("Ā"), ofString("A"), ofNull(StringType.STRING)), + (ofString("»"), ofString("î"), ofString("»"), ofNull(StringType.STRING)), + (ofString("�"), ofString("🌼"), ofString("�"), ofNull(StringType.STRING)), + ( + ofString("abcdef🚀"), + ofString(s"abcdef$UTF8_MAX_CHARACTER"), + ofString("abcdef🚀"), + ofNull(StringType.STRING) + ), + ( + ofString("abcde�abcdef�abcdef�abcdef"), + ofString(s"abcde�$ASCII_MAX_CHARACTER"), + ofString("abcde�abcdef�abcdef�abcdef"), + ofNull(StringType.STRING) + ), + ( + ofString("abcde�abcdef�abcdef�abcdef"), + ofString(s"abcde�$ASCII_MAX_CHARACTER"), + ofString("abcde�abcdef�abcdef�abcdef"), + ofNull(StringType.STRING) + ), + ( + ofString("����"), + ofString(s"��$UTF8_MAX_CHARACTER"), + ofString("����"), + ofNull(StringType.STRING) + ), + ( + ofString(s"a${UTF8_MAX_CHARACTER}d"), + ofString(s"a$UTF8_MAX_CHARACTER$ASCII_MAX_CHARACTER"), + ofString(s"a${UTF8_MAX_CHARACTER}d"), + ofNull(StringType.STRING) + ), + ( + ofString("abcdefghijklm💞😉💕\n🥀🌹💐🌺🌷🌼🌻🌷🥀"), + ofString(s"abcdefghijklm💞😉💕\n🥀🌹💐🌺🌷🌼$UTF8_MAX_CHARACTER"), + ofString("abcdefghijklm💞😉💕\n🥀🌹💐🌺🌷🌼🌻🌷🥀"), + ofNull(StringType.STRING) + ), + // scalastyle:on nonascii ( ofBinary("apples".getBytes()), ofBinary("oranges".getBytes()),