diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/types/CollationIdentifier.java b/kernel/kernel-api/src/main/java/io/delta/kernel/types/CollationIdentifier.java
new file mode 100644
index 00000000000..713a141f0a4
--- /dev/null
+++ b/kernel/kernel-api/src/main/java/io/delta/kernel/types/CollationIdentifier.java
@@ -0,0 +1,111 @@
+/*
+ * Copyright (2024) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.delta.kernel.types;
+
+import static io.delta.kernel.internal.util.Preconditions.checkArgument;
+
+import io.delta.kernel.annotation.Evolving;
+import java.util.Locale;
+import java.util.Objects;
+import java.util.Optional;
+
+/**
+ * Identifies the collation of a string type.
+ *
+ * <p>An identifier consists of a provider, a collation name and an optional version, and has the
+ * string form {@code PROVIDER.COLLATION_NAME[.COLLATION_VERSION]}. All components are normalized to
+ * upper case on construction, so identifiers that differ only in letter case are equal.
+ *
+ * @since 3.3.0
+ */
+@Evolving
+public class CollationIdentifier {
+
+  private final String provider;
+  private final String name;
+  private final Optional<String> version;
+
+  private CollationIdentifier(String provider, String collationName) {
+    this(provider, collationName, Optional.empty());
+  }
+
+  private CollationIdentifier(String provider, String collationName, Optional<String> version) {
+    Objects.requireNonNull(provider, "Collation provider cannot be null.");
+    Objects.requireNonNull(collationName, "Collation name cannot be null.");
+    Objects.requireNonNull(version, "Collation version cannot be null.");
+
+    // Locale.ROOT keeps the normalization independent of the JVM default locale; with e.g. a
+    // Turkish default locale, "icu".toUpperCase() would not produce "ICU".
+    this.provider = provider.toUpperCase(Locale.ROOT);
+    this.name = collationName.toUpperCase(Locale.ROOT);
+    this.version = version.map(v -> v.toUpperCase(Locale.ROOT));
+  }
+
+  /** @return collation provider, in upper case. */
+  public String getProvider() {
+    return provider;
+  }
+
+  /** @return collation name, in upper case. */
+  public String getName() {
+    return name;
+  }
+
+  /** @return collation version in upper case, or empty if the identifier has no version. */
+  public Optional<String> getVersion() {
+    return version;
+  }
+
+  /**
+   * Parses a collation identifier from its string form.
+   *
+   * @param identifier collation identifier in string form of {@code
+   *     PROVIDER.COLLATION_NAME[.COLLATION_VERSION]}. Everything after the second dot is treated
+   *     as the version, so the version itself may contain dots (e.g. {@code 75.1}).
+   * @return appropriate collation identifier object
+   * @throws NullPointerException if {@code identifier} is null
+   * @throws IllegalArgumentException if {@code identifier} has no dot, or if its provider, name or
+   *     version component is empty
+   */
+  public static CollationIdentifier fromString(String identifier) {
+    Objects.requireNonNull(identifier, "Collation identifier cannot be null.");
+
+    // A positive limit makes split() keep trailing empty strings, so "SPARK." yields two parts
+    // (the second one empty) instead of a one-element array that would fail on parts[1].
+    String[] parts = identifier.split("\\.", 3);
+    boolean valid = parts.length >= 2;
+    for (String part : parts) {
+      valid = valid && !part.isEmpty();
+    }
+    checkArgument(valid, String.format("Invalid collation identifier: %s", identifier));
+
+    if (parts.length == 2) {
+      return new CollationIdentifier(parts[0], parts[1]);
+    }
+    return new CollationIdentifier(parts[0], parts[1], Optional.of(parts[2]));
+  }
+
+  /** Collation identifiers are identical when the provider, name, and version are the same. */
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (!(o instanceof CollationIdentifier)) {
+      return false;
+    }
+
+    CollationIdentifier other = (CollationIdentifier) o;
+    return this.provider.equals(other.provider)
+        && this.name.equals(other.name)
+        && this.version.equals(other.version);
+  }
+
+  /** Hash code over the same components that {@link #equals(Object)} compares. */
+  @Override
+  public int hashCode() {
+    return Objects.hash(provider, name, version);
+  }
+
+  /** @return collation identifier in form of {@code PROVIDER.COLLATION_NAME}. */
+  public String toStringWithoutVersion() {
+    return String.format("%s.%s", provider, name);
+  }
+
+  /** @return collation identifier in form of {@code PROVIDER.COLLATION_NAME[.COLLATION_VERSION]} */
+  @Override
+  public String toString() {
+    if (version.isPresent()) {
+      return String.format("%s.%s.%s", provider, name, version.get());
+    } else {
+      return String.format("%s.%s", provider, name);
+    }
+  }
+}
diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/types/StringType.java b/kernel/kernel-api/src/main/java/io/delta/kernel/types/StringType.java
index 08b5bbd1df7..a18d93cf804 100644
--- a/kernel/kernel-api/src/main/java/io/delta/kernel/types/StringType.java
+++ b/kernel/kernel-api/src/main/java/io/delta/kernel/types/StringType.java
@@ -24,9 +24,42 @@
*/
@Evolving
public class StringType extends BasePrimitiveType {
- public static final StringType STRING = new StringType();
+  /** String type that uses the {@code SPARK.UTF8_BINARY} collation. */
+  public static final StringType STRING =
+      new StringType(CollationIdentifier.fromString("SPARK.UTF8_BINARY"));
- private StringType() {
+  private final CollationIdentifier collationIdentifier;
+
+  /**
+   * @param collationIdentifier An identifier representing the collation to be used for string
+   *     comparison and sorting. This determines how strings will be ordered and compared in query
+   *     operations.
+   * @throws NullPointerException if {@code collationIdentifier} is null
+   */
+  public StringType(CollationIdentifier collationIdentifier) {
super("string");
+    // Fail fast here: equals() and hashCode() dereference the identifier unconditionally.
+    if (collationIdentifier == null) {
+      throw new NullPointerException("Collation identifier cannot be null.");
+    }
+    this.collationIdentifier = collationIdentifier;
+  }
+
+  /**
+   * @param collationName name of collation in which this StringType will be observed. In form of
+   *     {@code PROVIDER.COLLATION_NAME[.VERSION]}
+   */
+  public StringType(String collationName) {
+    this(CollationIdentifier.fromString(collationName));
+  }
+
+  /** @return StringType's collation identifier */
+  public CollationIdentifier getCollationIdentifier() {
+    return collationIdentifier;
+  }
+
+  /** Two string types are equal when their collation identifiers are equal. */
+  @Override
+  public boolean equals(Object o) {
+    if (!(o instanceof StringType)) {
+      return false;
+    }
+
+    StringType that = (StringType) o;
+    return collationIdentifier.equals(that.collationIdentifier);
+  }
+
+  /**
+   * Hash code consistent with {@link #equals(Object)}. It is computed from the identifier's
+   * provider, name and version, i.e. the components that identifier equality is based on.
+   */
+  @Override
+  public int hashCode() {
+    int result = collationIdentifier.getProvider().hashCode();
+    result = 31 * result + collationIdentifier.getName().hashCode();
+    result = 31 * result + collationIdentifier.getVersion().hashCode();
+    return result;
}
}
diff --git a/kernel/kernel-api/src/test/scala/io/delta/kernel/types/CollationIdentifierSuite.scala b/kernel/kernel-api/src/test/scala/io/delta/kernel/types/CollationIdentifierSuite.scala
new file mode 100644
index 00000000000..2e4f8c29947
--- /dev/null
+++ b/kernel/kernel-api/src/test/scala/io/delta/kernel/types/CollationIdentifierSuite.scala
@@ -0,0 +1,100 @@
+/*
+ * Copyright (2024) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.delta.kernel.types
+
+import org.scalatest.funsuite.AnyFunSuite
+
+import java.util.Optional
+
+/** Unit tests for parsing, comparing and printing [[CollationIdentifier]]. */
+class CollationIdentifierSuite extends AnyFunSuite {
+  val PROVIDER_SPARK = "SPARK"
+  val PROVIDER_ICU = "ICU"
+  val DEFAULT_COLLATION_NAME = "UTF8_BINARY"
+  val DEFAULT_COLLATION_IDENTIFIER = CollationIdentifier.fromString("SPARK.UTF8_BINARY")
+
+  test("check fromString with valid string") {
+    // Testcase: (identifier string, expected provider, expected name, expected version).
+    // The parsed components are checked directly; comparing against another fromString call on
+    // the same input would pass no matter what the parser does.
+    Seq(
+      (
+        s"$PROVIDER_SPARK.$DEFAULT_COLLATION_NAME",
+        PROVIDER_SPARK,
+        DEFAULT_COLLATION_NAME,
+        Optional.empty[String]()
+      ),
+      (
+        s"$PROVIDER_ICU.sr_Cyrl_SRB",
+        PROVIDER_ICU,
+        "SR_CYRL_SRB",
+        Optional.empty[String]()
+      ),
+      (
+        s"$PROVIDER_ICU.sr_Cyrl_SRB.75.1",
+        PROVIDER_ICU,
+        "SR_CYRL_SRB",
+        Optional.of("75.1")
+      )
+    ).foreach {
+      case (stringIdentifier, provider, name, version) =>
+        val collationIdentifier = CollationIdentifier.fromString(stringIdentifier)
+        assert(collationIdentifier.getProvider == provider)
+        assert(collationIdentifier.getName == name)
+        assert(collationIdentifier.getVersion.equals(version))
+    }
+  }
+
+  test("check fromString with invalid string") {
+    Seq(
+      PROVIDER_SPARK,
+      s"${PROVIDER_SPARK}_sr_Cyrl_SRB"
+    ).foreach {
+      stringIdentifier =>
+        val e = intercept[IllegalArgumentException] {
+          CollationIdentifier.fromString(stringIdentifier)
+        }
+        assert(e.getMessage == String.format("Invalid collation identifier: %s", stringIdentifier))
+    }
+  }
+
+  test("check equals") {
+    // Testcase: (instance1, instance2, expected value for `instance1 == instance2`)
+    Seq(
+      (
+        DEFAULT_COLLATION_IDENTIFIER,
+        CollationIdentifier.fromString("spark.utf8_binary"),
+        true
+      ),
+      (
+        CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB.75.1"),
+        CollationIdentifier.fromString(s"$PROVIDER_ICU.SR_CYRL_SRB.75.1"),
+        true
+      ),
+      (
+        CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB"),
+        CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB.75.1"),
+        false
+      ),
+      (
+        DEFAULT_COLLATION_IDENTIFIER,
+        CollationIdentifier.fromString(s"$PROVIDER_ICU.$DEFAULT_COLLATION_NAME"),
+        false
+      )
+    ).foreach {
+      case (identifier1, identifier2, expResult) =>
+        assert(identifier1.equals(identifier2) == expResult)
+    }
+    // An object of another type is never equal to a collation identifier.
+    assert(!DEFAULT_COLLATION_IDENTIFIER.equals(s"$PROVIDER_SPARK.$DEFAULT_COLLATION_NAME"))
+  }
+
+  test("check toStringWithoutVersion") {
+    Seq(
+      (
+        DEFAULT_COLLATION_IDENTIFIER,
+        s"$PROVIDER_SPARK.$DEFAULT_COLLATION_NAME"
+      ),
+      (
+        CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB"),
+        s"$PROVIDER_ICU.SR_CYRL_SRB"
+      ),
+      (
+        CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB.75.1"),
+        s"$PROVIDER_ICU.SR_CYRL_SRB"
+      )
+    ).foreach {
+      case (collationIdentifier, toStringWithoutVersion) =>
+        assert(collationIdentifier.toStringWithoutVersion == toStringWithoutVersion)
+    }
+  }
+
+  test("check toString") {
+    Seq(
+      (
+        DEFAULT_COLLATION_IDENTIFIER,
+        s"$PROVIDER_SPARK.$DEFAULT_COLLATION_NAME"
+      ),
+      (
+        CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB"),
+        s"$PROVIDER_ICU.SR_CYRL_SRB"
+      ),
+      (
+        CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB.75.1"),
+        s"$PROVIDER_ICU.SR_CYRL_SRB.75.1"
+      )
+    ).foreach {
+      case (collationIdentifier, toString) =>
+        assert(collationIdentifier.toString == toString)
+    }
+  }
+}
diff --git a/kernel/kernel-api/src/test/scala/io/delta/kernel/types/StringTypeSuite.scala b/kernel/kernel-api/src/test/scala/io/delta/kernel/types/StringTypeSuite.scala
new file mode 100644
index 00000000000..d6acfa47e93
--- /dev/null
+++ b/kernel/kernel-api/src/test/scala/io/delta/kernel/types/StringTypeSuite.scala
@@ -0,0 +1,59 @@
+/*
+ * Copyright (2024) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.delta.kernel.types
+
+import org.scalatest.funsuite.AnyFunSuite
+
+/** Checks that [[StringType]] equality follows the collation, ignoring letter case. */
+class StringTypeSuite extends AnyFunSuite {
+  test("check equals") {
+    // Each row: (left, right, whether `left.equals(right)` is expected to hold)
+    val equalityCases = Seq(
+      (StringType.STRING, StringType.STRING, true),
+      (StringType.STRING, new StringType("sPark.UTF8_bINary"), true),
+      (StringType.STRING, new StringType("SPARK.UTF8_LCASE"), false),
+      (new StringType("ICU.UNICODE"), new StringType("SPARK.UTF8_LCASE"), false),
+      (new StringType("ICU.UNICODE"), new StringType("ICU.UNICODE_CI"), false),
+      (new StringType("ICU.UNICODE_CI"), new StringType("icU.uniCODe_Ci"), true)
+    )
+
+    for ((left, right, shouldBeEqual) <- equalityCases) {
+      assert(left.equals(right) == shouldBeEqual)
+    }
+  }
+}