From 1a2d200d37236ed520f9e5e7fb03fd9b20bb51f8 Mon Sep 17 00:00:00 2001 From: belugabehr <12578579+belugabehr@users.noreply.github.com> Date: Fri, 27 Sep 2024 13:33:13 -0400 Subject: [PATCH] AVRO-4061: Use Default Value of 1 For UTF8 Hash (#3177) --- .../java/org/apache/avro/io/BinaryData.java | 2 +- .../main/java/org/apache/avro/util/Utf8.java | 4 +- .../java/org/apache/avro/util/TestUtf8.java | 37 ++++++++++--------- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/lang/java/avro/src/main/java/org/apache/avro/io/BinaryData.java b/lang/java/avro/src/main/java/org/apache/avro/io/BinaryData.java index e6fc7086eac..b6126ec236c 100644 --- a/lang/java/avro/src/main/java/org/apache/avro/io/BinaryData.java +++ b/lang/java/avro/src/main/java/org/apache/avro/io/BinaryData.java @@ -257,7 +257,7 @@ private static int hashCode(HashData data, Schema schema) throws IOException { case FIXED: return hashBytes(1, data, schema.getFixedSize(), false); case STRING: - return hashBytes(0, data, decoder.readInt(), false); + return hashBytes(1, data, decoder.readInt(), false); case BYTES: return hashBytes(1, data, decoder.readInt(), true); case NULL: diff --git a/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java b/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java index b38d237f212..ae4df8e5c42 100644 --- a/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java +++ b/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java @@ -41,7 +41,8 @@ public class Utf8 implements Comparable, CharSequence, Externalizable { private String string; public Utf8() { - bytes = EMPTY; + this.bytes = EMPTY; + this.hash = 1; } public Utf8(String string) { @@ -174,6 +175,7 @@ public int hashCode() { if (h == 0) { byte[] bytes = this.bytes; int length = this.length; + h = 1; for (int i = 0; i < length; i++) { h = h * 31 + bytes[i]; } diff --git a/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java b/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java index e0977ff9f96..91618ca5efc 100644 --- a/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java +++ b/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java @@ -59,43 +59,44 @@ void arrayReusedWhenLargerThanRequestedSize() { @Test void hashCodeReused() { - assertEquals(97, new Utf8("a").hashCode()); - assertEquals(3904, new Utf8("zz").hashCode()); - assertEquals(122, new Utf8("z").hashCode()); - assertEquals(99162322, new Utf8("hello").hashCode()); - assertEquals(3198781, new Utf8("hell").hashCode()); + assertEquals(1, new Utf8().hashCode()); + assertEquals(128, new Utf8("a").hashCode()); + assertEquals(4865, new Utf8("zz").hashCode()); + assertEquals(153, new Utf8("z").hashCode()); + assertEquals(127791473, new Utf8("hello").hashCode()); + assertEquals(4122302, new Utf8("hell").hashCode()); Utf8 u = new Utf8("a"); - assertEquals(97, u.hashCode()); - assertEquals(97, u.hashCode()); + assertEquals(128, u.hashCode()); + assertEquals(128, u.hashCode()); u.set("a"); - assertEquals(97, u.hashCode()); + assertEquals(128, u.hashCode()); u.setByteLength(1); - assertEquals(97, u.hashCode()); + assertEquals(128, u.hashCode()); u.setByteLength(2); - assertNotEquals(97, u.hashCode()); + assertNotEquals(128, u.hashCode()); u.set("zz"); - assertEquals(3904, u.hashCode()); + assertEquals(4865, u.hashCode()); u.setByteLength(1); - assertEquals(122, u.hashCode()); + assertEquals(153, u.hashCode()); u.set("hello"); - assertEquals(99162322, u.hashCode()); + assertEquals(127791473, u.hashCode()); u.setByteLength(4); - assertEquals(3198781, u.hashCode()); + assertEquals(4122302, u.hashCode()); u.set(new Utf8("zz")); - assertEquals(3904, u.hashCode()); + assertEquals(4865, u.hashCode()); u.setByteLength(1); - assertEquals(122, u.hashCode()); + assertEquals(153, u.hashCode()); u.set(new Utf8("hello")); - assertEquals(99162322, u.hashCode()); + assertEquals(127791473, u.hashCode()); u.setByteLength(4); - assertEquals(3198781, u.hashCode()); + assertEquals(4122302, u.hashCode()); } @Test