Skip to content

Commit

Permalink
[Kernel] Change string comparing from UTF16 to UTF8 (#3611)
Browse files Browse the repository at this point in the history
## Description
Changed string comparing from UTF16 to UTF8. This fixes comparison
issues around the characters with surrogate pairs.

## How was this patch tested?
Tests added to `DefaultExpressionEvaluatorSuite.scala`
  • Loading branch information
ilicmarkodb authored Aug 28, 2024
1 parent a0beb10 commit f468733
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import io.delta.kernel.internal.util.Utils;
import io.delta.kernel.types.*;
import java.math.BigDecimal;
import java.nio.charset.StandardCharsets;
import java.util.Comparator;
import java.util.List;
import java.util.function.Function;
Expand All @@ -34,7 +35,19 @@
class DefaultExpressionUtils {

static final Comparator<BigDecimal> BIGDECIMAL_COMPARATOR = Comparator.naturalOrder();
static final Comparator<String> STRING_COMPARATOR = Comparator.naturalOrder();
static final Comparator<String> STRING_COMPARATOR =
(leftOp, rightOp) -> {
byte[] leftBytes = leftOp.getBytes(StandardCharsets.UTF_8);
byte[] rightBytes = rightOp.getBytes(StandardCharsets.UTF_8);
int i = 0;
while (i < leftBytes.length && i < rightBytes.length) {
if (leftBytes[i] != rightBytes[i]) {
return Byte.toUnsignedInt(leftBytes[i]) - Byte.toUnsignedInt(rightBytes[i]);
}
i++;
}
return Integer.compare(leftBytes.length, rightBytes.length);
};
static final Comparator<byte[]> BINARY_COMPARTOR =
(leftOp, rightOp) -> {
int i = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,9 @@ class DefaultExpressionEvaluatorSuite extends AnyFunSuite with ExpressionSuiteBa
}

test("evaluate expression: comparators (=, <, <=, >, >=)") {
val ASCII_MAX_CHARACTER = '\u007F'
val UTF8_MAX_CHARACTER = new String(Character.toChars(Character.MAX_CODE_POINT))

// Literals for each data type from the data type value range, used as inputs to comparator
// (small, big, small, null)
val literals = Seq(
Expand All @@ -607,6 +610,66 @@ class DefaultExpressionEvaluatorSuite extends AnyFunSuite with ExpressionSuiteBa
),
(ofDate(-12123), ofDate(123123), ofDate(-12123), ofNull(DateType.DATE)),
(ofString("apples"), ofString("oranges"), ofString("apples"), ofNull(StringType.STRING)),
(ofString(""), ofString("a"), ofString(""), ofNull(StringType.STRING)),
(ofString("abc"), ofString("abc0"), ofString("abc"), ofNull(StringType.STRING)),
(ofString("abc"), ofString("abcd"), ofString("abc"), ofNull(StringType.STRING)),
(ofString("abc"), ofString("abd"), ofString("abc"), ofNull(StringType.STRING)),
(
ofString("Abcabcabc"),
ofString("aBcabcabc"),
ofString("Abcabcabc"),
ofNull(StringType.STRING)
),
(
ofString("abcabcabC"),
ofString("abcabcabc"),
ofString("abcabcabC"),
ofNull(StringType.STRING)
),
// scalastyle:off nonascii
(ofString("abc"), ofString("世界"), ofString("abc"), ofNull(StringType.STRING)),
(ofString("世界"), ofString("你好"), ofString("世界"), ofNull(StringType.STRING)),
(ofString("你好122"), ofString("你好123"), ofString("你好122"), ofNull(StringType.STRING)),
(ofString("A"), ofString("Ā"), ofString("A"), ofNull(StringType.STRING)),
(ofString("»"), ofString("î"), ofString("»"), ofNull(StringType.STRING)),
(ofString(""), ofString("🌼"), ofString(""), ofNull(StringType.STRING)),
(
ofString("abcdef🚀"),
ofString(s"abcdef$UTF8_MAX_CHARACTER"),
ofString("abcdef🚀"),
ofNull(StringType.STRING)
),
(
ofString("abcde�abcdef�abcdef�abcdef"),
ofString(s"abcde�$ASCII_MAX_CHARACTER"),
ofString("abcde�abcdef�abcdef�abcdef"),
ofNull(StringType.STRING)
),
(
ofString("abcde�abcdef�abcdef�abcdef"),
ofString(s"abcde�$ASCII_MAX_CHARACTER"),
ofString("abcde�abcdef�abcdef�abcdef"),
ofNull(StringType.STRING)
),
(
ofString("����"),
ofString(s"��$UTF8_MAX_CHARACTER"),
ofString("����"),
ofNull(StringType.STRING)
),
(
ofString(s"a${UTF8_MAX_CHARACTER}d"),
ofString(s"a$UTF8_MAX_CHARACTER$ASCII_MAX_CHARACTER"),
ofString(s"a${UTF8_MAX_CHARACTER}d"),
ofNull(StringType.STRING)
),
(
ofString("abcdefghijklm💞😉💕\n🥀🌹💐🌺🌷🌼🌻🌷🥀"),
ofString(s"abcdefghijklm💞😉💕\n🥀🌹💐🌺🌷🌼$UTF8_MAX_CHARACTER"),
ofString("abcdefghijklm💞😉💕\n🥀🌹💐🌺🌷🌼🌻🌷🥀"),
ofNull(StringType.STRING)
),
// scalastyle:on nonascii
(
ofBinary("apples".getBytes()),
ofBinary("oranges".getBytes()),
Expand Down

0 comments on commit f468733

Please # to comment.