Skip to content

Commit

Permalink
AVRO-4060: Use JDK to Hash Byte Array in UTF8
Browse files Browse the repository at this point in the history
  • Loading branch information
belugabehr committed Sep 25, 2024
1 parent 005ee80 commit ba04247
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 2 deletions.
10 changes: 8 additions & 2 deletions lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,14 @@ public int hashCode() {
if (h == 0) {
byte[] bytes = this.bytes;
int length = this.length;
for (int i = 0; i < length; i++) {
h = h * 31 + bytes[i];
// If the array is filled, use the underlying JDK hash functionality.
// Starting with JDK 21, the underlying implementation is vectorized.
if (bytes.length == length) {
h = Arrays.hashCode(bytes);
} else {
for (int i = 0; i < length; i++) {
h = h * 31 + bytes[i];
}
}
this.hash = h;
}
Expand Down
18 changes: 18 additions & 0 deletions lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,24 @@ void hashCodeReused() {
assertEquals(3198781, u.hashCode());
}

/**
* There are two different code paths that hashcode() can call depending on the state of the
* internal buffer. If the buffer is full (string length eq. buffer length) then the JDK
* hashcode function can be used. This function can sometimes be vectorized JDK 21+ and therefore
* should be preferable. However, if the buffer is not full (string length le. buffer length), then the
* JDK does not support this and a scalar implementation is the only option as of today.
*/
@Test
void hashCodeBasedOnCapacity() {
// string = 3; buffer = 3
Utf8 fullCapacity = new Utf8("abc");

// string = 3; buffer = 4
Utf8 partialCapacity = new Utf8("abcX").setByteLength(3);

assertEquals(fullCapacity.hashCode(), partialCapacity.hashCode());
}

@Test
void oversizeUtf8() {
Utf8 u = new Utf8();
Expand Down

0 comments on commit ba04247

Please # to comment.