diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/types/CollationIdentifier.java b/kernel/kernel-api/src/main/java/io/delta/kernel/types/CollationIdentifier.java
new file mode 100644
index 00000000000..713a141f0a4
--- /dev/null
+++ b/kernel/kernel-api/src/main/java/io/delta/kernel/types/CollationIdentifier.java
@@ -0,0 +1,132 @@
+/*
+ * Copyright (2024) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.delta.kernel.types;
+
+import static io.delta.kernel.internal.util.Preconditions.checkArgument;
+
+import io.delta.kernel.annotation.Evolving;
+import java.util.Locale;
+import java.util.Objects;
+import java.util.Optional;
+
+/**
+ * Identifies the collation of a string type. An identifier consists of a provider, a collation
+ * name and an optional version, written as {@code PROVIDER.COLLATION_NAME[.COLLATION_VERSION]}.
+ *
+ * <p>All components are normalized to upper case with {@link Locale#ROOT}, so identifiers compare
+ * case-insensitively and independently of the JVM's default locale.
+ *
+ * @since 3.3.0
+ */
+@Evolving
+public class CollationIdentifier {
+
+  private final String provider;
+  private final String name;
+  private final Optional<String> version;
+
+  private CollationIdentifier(String provider, String collationName) {
+    this(provider, collationName, Optional.empty());
+  }
+
+  private CollationIdentifier(String provider, String collationName, Optional<String> version) {
+    Objects.requireNonNull(provider, "Collation provider cannot be null.");
+    Objects.requireNonNull(collationName, "Collation name cannot be null.");
+    Objects.requireNonNull(version, "Collation version cannot be null.");
+
+    // Locale.ROOT keeps the normalization stable under locales with special casing rules
+    // (e.g. Turkish, where "i" would otherwise upper-case to a dotted capital I).
+    this.provider = provider.toUpperCase(Locale.ROOT);
+    this.name = collationName.toUpperCase(Locale.ROOT);
+    this.version = version.map(v -> v.toUpperCase(Locale.ROOT));
+  }
+
+  /** @return collation provider. */
+  public String getProvider() {
+    return provider;
+  }
+
+  /** @return collation name. */
+  public String getName() {
+    return name;
+  }
+
+  /** @return collation version. */
+  public Optional<String> getVersion() {
+    return version;
+  }
+
+  /**
+   * Parses a collation identifier. Only the first two dots act as separators, so the version may
+   * itself contain dots (for example {@code ICU.sr_Cyrl_SRB.75.1}).
+   *
+   * @param identifier collation identifier in string form of
+   *     {@code PROVIDER.COLLATION_NAME[.COLLATION_VERSION]}.
+   * @return appropriate collation identifier object
+   * @throws NullPointerException if {@code identifier} is null
+   * @throws IllegalArgumentException if there is no provider/name pair or a component is empty
+   */
+  public static CollationIdentifier fromString(String identifier) {
+    Objects.requireNonNull(identifier, "Collation identifier cannot be null.");
+
+    // A positive limit keeps trailing empty strings, so inputs such as "SPARK." yield an empty
+    // component that is rejected below instead of a too-short array.
+    String[] parts = identifier.split("\\.", 3);
+    boolean wellFormed = parts.length >= 2;
+    for (String part : parts) {
+      wellFormed &= !part.isEmpty();
+    }
+    checkArgument(wellFormed, String.format("Invalid collation identifier: %s", identifier));
+
+    return parts.length == 2
+        ? new CollationIdentifier(parts[0], parts[1])
+        : new CollationIdentifier(parts[0], parts[1], Optional.of(parts[2]));
+  }
+
+  /** Collation identifiers are identical when the provider, name, and version are the same. */
+  @Override
+  public boolean equals(Object o) {
+    if (!(o instanceof CollationIdentifier)) {
+      return false;
+    }
+
+    CollationIdentifier other = (CollationIdentifier) o;
+    return this.provider.equals(other.provider)
+        && this.name.equals(other.name)
+        && this.version.equals(other.version);
+  }
+
+  /** @return hash code computed from the same fields that {@link #equals(Object)} compares. */
+  @Override
+  public int hashCode() {
+    return Objects.hash(provider, name, version);
+  }
+
+  /** @return collation identifier in form of {@code PROVIDER.COLLATION_NAME}. */
+  public String toStringWithoutVersion() {
+    return String.format("%s.%s", provider, name);
+  }
+
+  /** @return collation identifier in form of {@code PROVIDER.COLLATION_NAME[.COLLATION_VERSION]} */
+  @Override
+  public String toString() {
+    if (version.isPresent()) {
+      return String.format("%s.%s.%s", provider, name, version.get());
+    } else {
+      return String.format("%s.%s", provider, name);
+    }
+  }
+}
diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/types/StringType.java b/kernel/kernel-api/src/main/java/io/delta/kernel/types/StringType.java
index 08b5bbd1df7..a18d93cf804 100644
--- a/kernel/kernel-api/src/main/java/io/delta/kernel/types/StringType.java
+++ b/kernel/kernel-api/src/main/java/io/delta/kernel/types/StringType.java
@@ -24,9 +24,49 @@
  */
 @Evolving
 public class StringType extends BasePrimitiveType {
-  public static final StringType STRING = new StringType();
+  public static final StringType STRING =
+      new StringType(CollationIdentifier.fromString("SPARK.UTF8_BINARY"));
 
-  private StringType() {
+  private final CollationIdentifier collationIdentifier;
+
+  /**
+   * @param collationIdentifier An identifier representing the collation to be used for string
+   *     comparison and sorting. This determines how strings will be ordered and compared in query
+   *     operations.
+   */
+  public StringType(CollationIdentifier collationIdentifier) {
     super("string");
+    this.collationIdentifier = collationIdentifier;
+  }
+
+  /**
+   * @param collationName name of collation in which this StringType will be observed. In form of
+   *     {@code PROVIDER.COLLATION_NAME[.VERSION]}
+   */
+  public StringType(String collationName) {
+    super("string");
+    this.collationIdentifier = CollationIdentifier.fromString(collationName);
+  }
+
+  /** @return StringType's collation identifier */
+  public CollationIdentifier getCollationIdentifier() {
+    return collationIdentifier;
+  }
+
+  /** String types are equal when their collation identifiers are equal. */
+  @Override
+  public boolean equals(Object o) {
+    if (!(o instanceof StringType)) {
+      return false;
+    }
+
+    StringType that = (StringType) o;
+    return collationIdentifier.equals(that.collationIdentifier);
+  }
+
+  /** @return hash code derived from the collation identifier. */
+  @Override
+  public int hashCode() {
+    return collationIdentifier.hashCode();
   }
 }
diff --git a/kernel/kernel-api/src/test/scala/io/delta/kernel/types/CollationIdentifierSuite.scala b/kernel/kernel-api/src/test/scala/io/delta/kernel/types/CollationIdentifierSuite.scala
new file mode 100644
index 00000000000..2e4f8c29947
--- /dev/null
+++ b/kernel/kernel-api/src/test/scala/io/delta/kernel/types/CollationIdentifierSuite.scala
@@ -0,0 +1,116 @@
+/*
+ * Copyright (2024) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.delta.kernel.types
+
+import org.scalatest.funsuite.AnyFunSuite
+
+import java.util.Optional
+
+class CollationIdentifierSuite extends AnyFunSuite {
+  val PROVIDER_SPARK = "SPARK"
+  val PROVIDER_ICU = "ICU"
+  val DEFAULT_COLLATION_NAME = "UTF8_BINARY"
+  val DEFAULT_COLLATION_IDENTIFIER = CollationIdentifier.fromString("SPARK.UTF8_BINARY")
+
+  test("check fromString with valid string") {
+    Seq(
+      (
+        s"$PROVIDER_SPARK.$DEFAULT_COLLATION_NAME",
+        DEFAULT_COLLATION_IDENTIFIER
+      ),
+      (
+        s"$PROVIDER_ICU.sr_Cyrl_SRB",
+        CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB")
+      ),
+      (
+        s"$PROVIDER_ICU.sr_Cyrl_SRB.75.1",
+        CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB.75.1")
+      )
+    ).foreach {
+      case(stringIdentifier, collationIdentifier) =>
+        assert(CollationIdentifier.fromString(stringIdentifier).equals(collationIdentifier))
+    }
+  }
+
+  test("check fromString with invalid string") {
+    Seq(
+      PROVIDER_SPARK,
+      s"${PROVIDER_SPARK}_sr_Cyrl_SRB",
+      s"$PROVIDER_SPARK.",
+      s".$DEFAULT_COLLATION_NAME",
+      s"$PROVIDER_ICU..75.1"
+    ).foreach {
+      stringIdentifier =>
+        val e = intercept[IllegalArgumentException] {
+          val collationIdentifier = CollationIdentifier.fromString(stringIdentifier)
+        }
+        assert(e.getMessage == String.format("Invalid collation identifier: %s", stringIdentifier))
+    }
+  }
+
+  test("check toStringWithoutVersion") {
+    Seq(
+      (
+        DEFAULT_COLLATION_IDENTIFIER,
+        s"$PROVIDER_SPARK.$DEFAULT_COLLATION_NAME"
+      ),
+      (
+        CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB"),
+        s"$PROVIDER_ICU.SR_CYRL_SRB"
+      ),
+      (
+        CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB.75.1"),
+        s"$PROVIDER_ICU.SR_CYRL_SRB"
+      )
+    ).foreach {
+      case(collationIdentifier, toStringWithoutVersion) =>
+        assert(collationIdentifier.toStringWithoutVersion == toStringWithoutVersion)
+    }
+  }
+
+  test("check toString") {
+    Seq(
+      (
+        DEFAULT_COLLATION_IDENTIFIER,
+        s"$PROVIDER_SPARK.$DEFAULT_COLLATION_NAME"
+      ),
+      (
+        CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB"),
+        s"$PROVIDER_ICU.SR_CYRL_SRB"
+      ),
+      (
+        CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB.75.1"),
+        s"$PROVIDER_ICU.SR_CYRL_SRB.75.1"
+      )
+    ).foreach {
+      case(collationIdentifier, toString) =>
+        assert(collationIdentifier.toString == toString)
+    }
+  }
+
+  test("check equals and hashCode are consistent") {
+    Seq(
+      (s"$PROVIDER_SPARK.$DEFAULT_COLLATION_NAME", "spark.utf8_binary"),
+      (s"$PROVIDER_ICU.sr_Cyrl_SRB.75.1", "icu.SR_CYRL_SRB.75.1")
+    ).foreach {
+      case (first, second) =>
+        val id1 = CollationIdentifier.fromString(first)
+        val id2 = CollationIdentifier.fromString(second)
+        assert(id1.equals(id2))
+        assert(id1.hashCode == id2.hashCode)
+    }
+  }
+}
diff --git a/kernel/kernel-api/src/test/scala/io/delta/kernel/types/StringTypeSuite.scala b/kernel/kernel-api/src/test/scala/io/delta/kernel/types/StringTypeSuite.scala
new file mode 100644
index 00000000000..d6acfa47e93
--- /dev/null
+++ b/kernel/kernel-api/src/test/scala/io/delta/kernel/types/StringTypeSuite.scala
@@ -0,0 +1,63 @@
+/*
+ * Copyright (2024) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.delta.kernel.types
+
+import org.scalatest.funsuite.AnyFunSuite
+
+class StringTypeSuite extends AnyFunSuite {
+  test("check equals") {
+    // Testcase: (instance1, instance2, expected value for `instance1 == instance2`)
+    Seq(
+      (
+        StringType.STRING,
+        StringType.STRING,
+        true
+      ),
+      (
+        StringType.STRING,
+        new StringType("sPark.UTF8_bINary"),
+        true
+      ),
+      (
+        StringType.STRING,
+        new StringType("SPARK.UTF8_LCASE"),
+        false
+      ),
+      (
+        new StringType("ICU.UNICODE"),
+        new StringType("SPARK.UTF8_LCASE"),
+        false
+      ),
+      (
+        new StringType("ICU.UNICODE"),
+        new StringType("ICU.UNICODE_CI"),
+        false
+      ),
+      (
+        new StringType("ICU.UNICODE_CI"),
+        new StringType("icU.uniCODe_Ci"),
+        true
+      )
+    ).foreach {
+      case (st1, st2, expResult) =>
+        assert(st1.equals(st2) == expResult)
+        // Equal types must also agree on hashCode.
+        if (expResult) {
+          assert(st1.hashCode == st2.hashCode)
+        }
+    }
+  }
+}