From 0cca0ef93526a529089165cee12dc602a4e3c821 Mon Sep 17 00:00:00 2001
From: David
Date: Tue, 24 Sep 2024 21:27:23 -0400
Subject: [PATCH] AVRO-4060: Use JDK to Hash Byte Array in UTF8

When the backing array is exactly as long as the string, delegate hashing to
Arrays.hashCode(byte[]). Arrays.hashCode seeds its accumulator with 1 while the
existing scalar loop seeds with 0, so the JDK result is corrected by 31^length
to keep hash values identical on both paths (and identical to the values
produced before this change).
---
Review note: the post-image blob ids on the "index" lines below are stale
after the review fix-up; regenerate the patch from the tree before merging.

 .../main/java/org/apache/avro/util/Utf8.java | 20 ++++++++++++++++++--
 .../java/org/apache/avro/util/TestUtf8.java  | 22 ++++++++++++++++++++++
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java b/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java
index b38d237f212..b7d88bfc3db 100644
--- a/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java
+++ b/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java
@@ -174,8 +174,24 @@ public int hashCode() {
     if (h == 0) {
       byte[] bytes = this.bytes;
       int length = this.length;
-      for (int i = 0; i < length; i++) {
-        h = h * 31 + bytes[i];
+      // If the array is filled, use the underlying JDK hash functionality.
+      // Starting with JDK 21, the underlying implementation is vectorized.
+      if (bytes.length == length) {
+        // Arrays.hashCode seeds its accumulator with 1 whereas the scalar loop
+        // below seeds with 0, so it returns (scalar hash + 31^length). Subtract
+        // 31^length (int arithmetic wraps consistently) so that equal strings
+        // hash identically regardless of buffer capacity.
+        int pow = 1;
+        for (int base = 31, n = length; n != 0; n >>>= 1, base *= base) {
+          if ((n & 1) != 0) {
+            pow *= base;
+          }
+        }
+        h = Arrays.hashCode(bytes) - pow;
+      } else {
+        for (int i = 0; i < length; i++) {
+          h = h * 31 + bytes[i];
+        }
       }
       this.hash = h;
     }
diff --git a/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java b/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java
index e0977ff9f96..a78d6b8509d 100644
--- a/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java
+++ b/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java
@@ -98,6 +98,28 @@ void hashCodeReused() {
     assertEquals(3198781, u.hashCode());
   }
 
+  /**
+   * There are two different code paths that hashCode() can take depending on the
+   * state of the internal buffer. If the buffer is full (string length eq. buffer
+   * length) then the JDK hash function can be used. This function can sometimes
+   * be vectorized on JDK 21+ and therefore should be preferable. However, if the
+   * buffer is not full (string length lt. buffer length), then the JDK does not
+   * support this and a scalar implementation is the only option as of today.
+   * Both paths must produce the same value for the same string content.
+   */
+  @Test
+  void hashCodeBasedOnCapacity() {
+    // string = 3; buffer = 3
+    Utf8 fullCapacity = new Utf8("abc");
+
+    // string = 3; buffer = 4
+    Utf8 partialCapacity = new Utf8("abcX").setByteLength(3);
+
+    assertEquals(fullCapacity.hashCode(), partialCapacity.hashCode());
+    // Pin the seed-0 value of "abc" so neither code path drifts.
+    assertEquals(96354, fullCapacity.hashCode());
+  }
+
   @Test
   void oversizeUtf8() {
     Utf8 u = new Utf8();