From cd30b9a54f6250b7f60c6610a54f26be8f912ded Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 21 Jul 2023 14:33:53 +0200 Subject: [PATCH 01/15] serialize/deserialize datatypes --- Cargo.toml | 8 + src/error.rs | 11 ++ src/lib.rs | 2 + src/spec/datatypes.rs | 388 ++++++++++++++++++++++++++++++++++++++++++ src/spec/mod.rs | 1 + 5 files changed, 410 insertions(+) create mode 100644 src/error.rs create mode 100644 src/spec/datatypes.rs create mode 100644 src/spec/mod.rs diff --git a/Cargo.toml b/Cargo.toml index 11de5047e..5b55fe3a7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,3 +27,11 @@ license = "Apache-2.0" keywords = ["iceberg"] [dependencies] +apache-avro = "0.15.0" +chrono = { version = "0.4.23", default-features = false, features = ["serde"] } +rust_decimal = "1.27.0" +serde = "^1.0" +serde_bytes = "0.11.8" +serde_json = "^1.0" +serde_derive = "^1.0" +thiserror = "1.0.44" diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 000000000..f0cc67eea --- /dev/null +++ b/src/error.rs @@ -0,0 +1,11 @@ +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum IcebergError { + #[error("The type `{0}` cannot be stored as bytes.")] + ValueByteConversion(String), + #[error("Failed to convert slice to array")] + TryFromSlice(#[from] std::array::TryFromSliceError), + #[error("Failed to convert u8 to string")] + Utf8(#[from] std::str::Utf8Error), +} diff --git a/src/lib.rs b/src/lib.rs index b2b60570c..99a35ac24 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,3 +16,5 @@ // under the License. //! Native Rust implementation of Apache Iceberg +pub mod error; +pub mod spec; diff --git a/src/spec/datatypes.rs b/src/spec/datatypes.rs new file mode 100644 index 000000000..8e04d76ae --- /dev/null +++ b/src/spec/datatypes.rs @@ -0,0 +1,388 @@ +/*! + * Data Types +*/ +use std::{fmt, ops::Index}; + +use serde::{ + de::{Error, IntoDeserializer}, + Deserialize, Deserializer, Serialize, Serializer, +}; + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +#[serde(untagged)] +/// All data types are either primitives or nested types, which are maps, lists, or structs. +pub enum Type { + /// Primitive types + Primitive(PrimitiveType), + /// Struct type + Struct(StructType), + /// List type. + List(ListType), + /// Map type + Map(MapType), +} + +impl fmt::Display for Type { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Type::Primitive(primitive) => write!(f, "{}", primitive), + Type::Struct(_) => write!(f, "struct"), + Type::List(_) => write!(f, "list"), + Type::Map(_) => write!(f, "map"), + } + } +} + +/// Primitive data types +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +#[serde(rename_all = "lowercase", remote = "Self")] +pub enum PrimitiveType { + /// True or False + Boolean, + /// 32-bit signed integer + Int, + /// 64-bit signed integer + Long, + /// 32-bit IEEE 753 floating bit. + Float, + /// 64-bit IEEE 753 floating bit. + Double, + /// Fixed point decimal + Decimal { + /// Precision + precision: u32, + /// Scale + scale: u32, + }, + /// Calendar date without timezone or time. + Date, + /// Time of day without date or timezone. + Time, + /// Timestamp without timezone + Timestamp, + /// Timestamp with timezone + Timestampz, + /// Arbitrary-length character sequences + String, + /// Universally Unique Identifiers + Uuid, + /// Fixed length byte array + Fixed(u64), + /// Arbitrary-length byte array. + Binary, +} + +impl<'de> Deserialize<'de> for PrimitiveType { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + if s.starts_with("decimal") { + deserialize_decimal(s.into_deserializer()) + } else if s.starts_with("fixed") { + deserialize_fixed(s.into_deserializer()) + } else { + PrimitiveType::deserialize(s.into_deserializer()) + } + } +} + +impl Serialize for PrimitiveType { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match self { + PrimitiveType::Decimal { precision, scale } => { + serialize_decimal(precision, scale, serializer) + } + PrimitiveType::Fixed(l) => serialize_fixed(l, serializer), + _ => PrimitiveType::serialize(self, serializer), + } + } +} + +fn deserialize_decimal<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + let s = String::deserialize(deserializer)?; + let (precision, scale) = s + .trim_start_matches(r"decimal(") + .trim_end_matches(')') + .split_once(',') + .ok_or_else(|| D::Error::custom("Decimal requires precision and scale: {s}"))?; + + Ok(PrimitiveType::Decimal { + precision: precision.parse().map_err(D::Error::custom)?, + scale: scale.parse().map_err(D::Error::custom)?, + }) +} + +fn serialize_decimal(precision: &u32, scale: &u32, serializer: S) -> Result +where + S: Serializer, +{ + serializer.serialize_str(&format!("decimal({precision},{scale})")) +} + +fn deserialize_fixed<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + let fixed = String::deserialize(deserializer)? + .trim_start_matches(r"fixed[") + .trim_end_matches(']') + .to_owned(); + + fixed + .parse() + .map(PrimitiveType::Fixed) + .map_err(D::Error::custom) +} + +fn serialize_fixed(value: &u64, serializer: S) -> Result +where + S: Serializer, +{ + serializer.serialize_str(&format!("fixed[{value}]")) +} + +impl fmt::Display for PrimitiveType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + PrimitiveType::Boolean => write!(f, "boolean"), + PrimitiveType::Int => write!(f, "int"), + PrimitiveType::Long => write!(f, "long"), + PrimitiveType::Float => write!(f, "float"), + PrimitiveType::Double => write!(f, "double"), + PrimitiveType::Decimal { + precision: _, + scale: _, + } => write!(f, "decimal"), + PrimitiveType::Date => write!(f, "date"), + PrimitiveType::Time => write!(f, "time"), + PrimitiveType::Timestamp => write!(f, "timestamp"), + PrimitiveType::Timestampz => write!(f, "timestampz"), + PrimitiveType::String => write!(f, "string"), + PrimitiveType::Uuid => write!(f, "uuid"), + PrimitiveType::Fixed(_) => write!(f, "fixed"), + PrimitiveType::Binary => write!(f, "binary"), + } + } +} + +/// DataType for a specific struct +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +#[serde(rename = "struct", tag = "type")] +pub struct StructType { + /// Struct fields + pub fields: Vec, +} + +impl StructType { + /// Get structfield with certain id + pub fn get(&self, index: usize) -> Option<&StructField> { + self.fields.iter().find(|field| field.id as usize == index) + } + /// Get structfield with certain name + pub fn get_name(&self, name: &str) -> Option<&StructField> { + self.fields.iter().find(|field| field.name == name) + } +} + +impl Index for StructType { + type Output = StructField; + + fn index(&self, index: usize) -> &Self::Output { + &self.fields[index] + } +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +/// A struct is a tuple of typed values. Each field in the tuple is named and has an integer id that is unique in the table schema. +/// Each field can be either optional or required, meaning that values can (or cannot) be null. Fields may be any type. +/// Fields may have an optional comment or doc string. Fields can have default values. +pub struct StructField { + /// Id unique in table schema + pub id: i32, + /// Field Name + pub name: String, + /// Optional or required + pub required: bool, + /// Datatype + #[serde(rename = "type")] + pub field_type: Type, + /// Fields may have an optional comment or doc string. + #[serde(skip_serializing_if = "Option::is_none")] + pub doc: Option, +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +#[serde(rename = "list", rename_all = "kebab-case", tag = "type")] +/// A list is a collection of values with some element type. The element field has an integer id that is unique in the table schema. +/// Elements can be either optional or required. Element types may be any type. +pub struct ListType { + /// Id unique in table schema + pub element_id: i32, + + /// Elements can be either optional or required. + pub element_required: bool, + + /// Datatype + pub element: Box, +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +#[serde(rename = "map", rename_all = "kebab-case", tag = "type")] +/// A map is a collection of key-value pairs with a key type and a value type. +/// Both the key field and value field each have an integer id that is unique in the table schema. +/// Map keys are required and map values can be either optional or required. +/// Both map keys and map values may be any type, including nested types. +pub struct MapType { + /// Key Id that is unique in table schema + pub key_id: i32, + /// Datatype of key + pub key: Box, + /// Value Id that is unique in table schema + pub value_id: i32, + /// If value is optional or required + pub value_required: bool, + /// Datatype of value + pub value: Box, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn decimal() { + let record = r#" + { + "type": "struct", + "fields": [ + { + "id": 1, + "name": "id", + "required": true, + "type": "decimal(9,2)" + } + ] + } + "#; + + let result: StructType = serde_json::from_str(record).unwrap(); + assert_eq!( + Type::Primitive(PrimitiveType::Decimal { + precision: 9, + scale: 2 + }), + result.fields[0].field_type + ); + let result_two: StructType = serde_json::from_str( + &serde_json::to_string(&result).expect("Failed to serialize result"), + ) + .expect("Failed to serialize json"); + assert_eq!(result, result_two); + } + + #[test] + fn fixed() { + let record = r#" + { + "type": "struct", + "fields": [ + { + "id": 1, + "name": "id", + "required": true, + "type": "fixed[8]" + } + ] + } + "#; + + let result: StructType = serde_json::from_str(record).unwrap(); + assert_eq!( + Type::Primitive(PrimitiveType::Fixed(8)), + result.fields[0].field_type + ); + let result_two: StructType = serde_json::from_str( + &serde_json::to_string(&result).expect("Failed to serialize result"), + ) + .expect("Failed to serialize json"); + assert_eq!(result, result_two); + } + + #[test] + fn struct_type() { + let record = r#" + { + "type": "struct", + "fields": [ { + "id": 1, + "name": "id", + "required": true, + "type": "uuid", + "initial-default": "0db3e2a8-9d1d-42b9-aa7b-74ebe558dceb", + "write-default": "ec5911be-b0a7-458c-8438-c9a3e53cffae" + }, { + "id": 2, + "name": "data", + "required": false, + "type": "int" + } ] + } + "#; + + let result: StructType = serde_json::from_str(record).unwrap(); + assert_eq!( + Type::Primitive(PrimitiveType::Uuid), + result.fields[0].field_type + ); + assert_eq!(1, result.fields[0].id); + assert_eq!(true, result.fields[0].required); + + assert_eq!( + Type::Primitive(PrimitiveType::Int), + result.fields[1].field_type + ); + assert_eq!(2, result.fields[1].id); + assert_eq!(false, result.fields[1].required); + } + + #[test] + fn list() { + let record = r#" + { + "type": "list", + "element-id": 3, + "element-required": true, + "element": "string" + } + "#; + + let result: ListType = serde_json::from_str(record).unwrap(); + assert_eq!(Type::Primitive(PrimitiveType::String), *result.element); + } + + #[test] + fn map() { + let record = r#" + { + "type": "map", + "key-id": 4, + "key": "string", + "value-id": 5, + "value-required": false, + "value": "double" + } + "#; + + let result: MapType = serde_json::from_str(record).unwrap(); + assert_eq!(Type::Primitive(PrimitiveType::String), *result.key); + assert_eq!(Type::Primitive(PrimitiveType::Double), *result.value); + } +} diff --git a/src/spec/mod.rs b/src/spec/mod.rs new file mode 100644 index 000000000..0d8681dc4 --- /dev/null +++ b/src/spec/mod.rs @@ -0,0 +1 @@ +pub mod datatypes; From f10de7b132da51a0e5dc2a1384fef9c1c486622f Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 21 Jul 2023 22:27:08 +0200 Subject: [PATCH 02/15] fix timestamptz --- src/spec/datatypes.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/spec/datatypes.rs b/src/spec/datatypes.rs index 8e04d76ae..dc28b8a1f 100644 --- a/src/spec/datatypes.rs +++ b/src/spec/datatypes.rs @@ -61,7 +61,7 @@ pub enum PrimitiveType { /// Timestamp without timezone Timestamp, /// Timestamp with timezone - Timestampz, + Timestamptz, /// Arbitrary-length character sequences String, /// Universally Unique Identifiers @@ -164,7 +164,7 @@ impl fmt::Display for PrimitiveType { PrimitiveType::Date => write!(f, "date"), PrimitiveType::Time => write!(f, "time"), PrimitiveType::Timestamp => write!(f, "timestamp"), - PrimitiveType::Timestampz => write!(f, "timestampz"), + PrimitiveType::Timestamptz => write!(f, "timestamptz"), PrimitiveType::String => write!(f, "string"), PrimitiveType::Uuid => write!(f, "uuid"), PrimitiveType::Fixed(_) => write!(f, "fixed"), From 25be27482fad795e26337be484f7c6290b8eb69d Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 21 Jul 2023 22:32:00 +0200 Subject: [PATCH 03/15] allow whitespace in decimal --- src/spec/datatypes.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/spec/datatypes.rs b/src/spec/datatypes.rs index dc28b8a1f..31479aa06 100644 --- a/src/spec/datatypes.rs +++ b/src/spec/datatypes.rs @@ -115,8 +115,8 @@ where .ok_or_else(|| D::Error::custom("Decimal requires precision and scale: {s}"))?; Ok(PrimitiveType::Decimal { - precision: precision.parse().map_err(D::Error::custom)?, - scale: scale.parse().map_err(D::Error::custom)?, + precision: precision.trim().parse().map_err(D::Error::custom)?, + scale: scale.trim().parse().map_err(D::Error::custom)?, }) } From 3e8ffb092bbe21a401e242edaae459a8a6981566 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 21 Jul 2023 22:35:09 +0200 Subject: [PATCH 04/15] format json --- src/spec/datatypes.rs | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/spec/datatypes.rs b/src/spec/datatypes.rs index 31479aa06..3263ea899 100644 --- a/src/spec/datatypes.rs +++ b/src/spec/datatypes.rs @@ -321,20 +321,22 @@ mod tests { let record = r#" { "type": "struct", - "fields": [ { - "id": 1, - "name": "id", - "required": true, - "type": "uuid", - "initial-default": "0db3e2a8-9d1d-42b9-aa7b-74ebe558dceb", - "write-default": "ec5911be-b0a7-458c-8438-c9a3e53cffae" - }, { - "id": 2, - "name": "data", - "required": false, - "type": "int" - } ] - } + "fields": [ + { + "id": 1, + "name": "id", + "required": true, + "type": "uuid", + "initial-default": "0db3e2a8-9d1d-42b9-aa7b-74ebe558dceb", + "write-default": "ec5911be-b0a7-458c-8438-c9a3e53cffae" + }, { + "id": 2, + "name": "data", + "required": false, + "type": "int" + } + ] + } "#; let result: StructType = serde_json::from_str(record).unwrap(); @@ -361,7 +363,7 @@ mod tests { "element-id": 3, "element-required": true, "element": "string" - } + } "#; let result: ListType = serde_json::from_str(record).unwrap(); @@ -378,7 +380,7 @@ mod tests { "value-id": 5, "value-required": false, "value": "double" - } + } "#; let result: MapType = serde_json::from_str(record).unwrap(); From e300cccbc34e8dcb65519d173be8bb5c61736f0b Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 21 Jul 2023 22:45:23 +0200 Subject: [PATCH 05/15] test for map with int key --- src/spec/datatypes.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/spec/datatypes.rs b/src/spec/datatypes.rs index 3263ea899..cc7d4ee52 100644 --- a/src/spec/datatypes.rs +++ b/src/spec/datatypes.rs @@ -387,4 +387,22 @@ mod tests { assert_eq!(Type::Primitive(PrimitiveType::String), *result.key); assert_eq!(Type::Primitive(PrimitiveType::Double), *result.value); } + + #[test] + fn map_int() { + let record = r#" + { + "type": "map", + "key-id": 4, + "key": "int", + "value-id": 5, + "value-required": false, + "value": "string" + } + "#; + + let result: MapType = serde_json::from_str(record).unwrap(); + assert_eq!(Type::Primitive(PrimitiveType::Int), *result.key); + assert_eq!(Type::Primitive(PrimitiveType::String), *result.value); + } } From 2f4e221e4c88fbe549e9969bb0f281ac1f87ce61 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Mon, 24 Jul 2023 07:42:13 +0200 Subject: [PATCH 06/15] Add apache headers --- src/error.rs | 17 +++++++++++++++++ src/spec/datatypes.rs | 17 +++++++++++++++++ src/spec/mod.rs | 17 +++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/src/error.rs b/src/error.rs index f0cc67eea..fbaa6fd7b 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use thiserror::Error; #[derive(Error, Debug)] diff --git a/src/spec/datatypes.rs b/src/spec/datatypes.rs index cc7d4ee52..6c60eb48f 100644 --- a/src/spec/datatypes.rs +++ b/src/spec/datatypes.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + /*! * Data Types */ diff --git a/src/spec/mod.rs b/src/spec/mod.rs index 0d8681dc4..ff69ae248 100644 --- a/src/spec/mod.rs +++ b/src/spec/mod.rs @@ -1 +1,18 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + pub mod datatypes; From f32ab09a788be84432d76c3dc305832d85a98287 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Mon, 24 Jul 2023 07:45:19 +0200 Subject: [PATCH 07/15] Fix clippy warnings --- src/spec/datatypes.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/spec/datatypes.rs b/src/spec/datatypes.rs index 6c60eb48f..0386e82be 100644 --- a/src/spec/datatypes.rs +++ b/src/spec/datatypes.rs @@ -362,14 +362,14 @@ mod tests { result.fields[0].field_type ); assert_eq!(1, result.fields[0].id); - assert_eq!(true, result.fields[0].required); + assert!(result.fields[0].required); assert_eq!( Type::Primitive(PrimitiveType::Int), result.fields[1].field_type ); assert_eq!(2, result.fields[1].id); - assert_eq!(false, result.fields[1].required); + assert!(!result.fields[1].required); } #[test] From 9d2ef4fc3faf1626c5e982070d24d6e3d0c2c428 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Mon, 24 Jul 2023 08:11:16 +0200 Subject: [PATCH 08/15] improve Dispay of decimal and fixed type --- src/spec/datatypes.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/spec/datatypes.rs b/src/spec/datatypes.rs index 0386e82be..3186a91c4 100644 --- a/src/spec/datatypes.rs +++ b/src/spec/datatypes.rs @@ -174,17 +174,16 @@ impl fmt::Display for PrimitiveType { PrimitiveType::Long => write!(f, "long"), PrimitiveType::Float => write!(f, "float"), PrimitiveType::Double => write!(f, "double"), - PrimitiveType::Decimal { - precision: _, - scale: _, - } => write!(f, "decimal"), + PrimitiveType::Decimal { precision, scale } => { + write!(f, "decimal({},{})", precision, scale) + } PrimitiveType::Date => write!(f, "date"), PrimitiveType::Time => write!(f, "time"), PrimitiveType::Timestamp => write!(f, "timestamp"), PrimitiveType::Timestamptz => write!(f, "timestamptz"), PrimitiveType::String => write!(f, "string"), PrimitiveType::Uuid => write!(f, "uuid"), - PrimitiveType::Fixed(_) => write!(f, "fixed"), + PrimitiveType::Fixed(size) => write!(f, "fixed({})", size), PrimitiveType::Binary => write!(f, "binary"), } } From f7737f1a3fafeeb8f88d592c4cb4597e967ddda9 Mon Sep 17 00:00:00 2001 From: JanKaul Date: Mon, 24 Jul 2023 08:19:50 +0200 Subject: [PATCH 09/15] Update datatype documentation Co-authored-by: Renjie Liu --- src/spec/datatypes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spec/datatypes.rs b/src/spec/datatypes.rs index 3186a91c4..dc8cf3c15 100644 --- a/src/spec/datatypes.rs +++ b/src/spec/datatypes.rs @@ -79,7 +79,7 @@ pub enum PrimitiveType { Timestamp, /// Timestamp with timezone Timestamptz, - /// Arbitrary-length character sequences + /// Arbitrary-length character sequences encoded in utf-8 String, /// Universally Unique Identifiers Uuid, From 420aaf21d3d6d0a09775e81b6f51213da7711339 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Mon, 24 Jul 2023 08:21:29 +0200 Subject: [PATCH 10/15] rename index in structtype getter --- src/spec/datatypes.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/spec/datatypes.rs b/src/spec/datatypes.rs index dc8cf3c15..5bf0e1979 100644 --- a/src/spec/datatypes.rs +++ b/src/spec/datatypes.rs @@ -199,8 +199,8 @@ pub struct StructType { impl StructType { /// Get structfield with certain id - pub fn get(&self, index: usize) -> Option<&StructField> { - self.fields.iter().find(|field| field.id as usize == index) + pub fn get(&self, id: usize) -> Option<&StructField> { + self.fields.iter().find(|field| field.id as usize == id) } /// Get structfield with certain name pub fn get_name(&self, name: &str) -> Option<&StructField> { From 4510f298f4bb7f25091bbb7597816cabdab36664 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Mon, 24 Jul 2023 09:05:35 +0200 Subject: [PATCH 11/15] add serialization to tests --- src/spec/datatypes.rs | 141 +++++++++++++++++++++++++++++------------- 1 file changed, 99 insertions(+), 42 deletions(-) diff --git a/src/spec/datatypes.rs b/src/spec/datatypes.rs index 5bf0e1979..0feb8ae93 100644 --- a/src/spec/datatypes.rs +++ b/src/spec/datatypes.rs @@ -217,6 +217,7 @@ impl Index for StructType { } #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +#[serde(rename_all = "kebab-case")] /// A struct is a tuple of typed values. Each field in the tuple is named and has an integer id that is unique in the table schema. /// Each field can be either optional or required, meaning that values can (or cannot) be null. Fields may be any type. /// Fields may have an optional comment or doc string. Fields can have default values. @@ -233,6 +234,12 @@ pub struct StructField { /// Fields may have an optional comment or doc string. #[serde(skip_serializing_if = "Option::is_none")] pub doc: Option, + /// Used to populate the field’s value for all records that were written before the field was added to the schema + #[serde(skip_serializing_if = "Option::is_none")] + pub initial_default: Option, + /// Used to populate the field’s value for any records written after the field was added to the schema, if the writer does not supply the field’s value + #[serde(skip_serializing_if = "Option::is_none")] + pub write_default: Option, } #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] @@ -273,6 +280,17 @@ pub struct MapType { mod tests { use super::*; + fn check_type_serde(json: &str, expected_type: Type) { + let desered_type: Type = serde_json::from_str(json).unwrap(); + assert_eq!(desered_type, expected_type); + + let sered_json = serde_json::to_string(&expected_type).unwrap(); + let parsed_json_value = serde_json::from_str::(&sered_json).unwrap(); + let raw_json_value = serde_json::from_str::(json).unwrap(); + + assert_eq!(parsed_json_value, raw_json_value); + } + #[test] fn decimal() { let record = r#" @@ -289,19 +307,23 @@ mod tests { } "#; - let result: StructType = serde_json::from_str(record).unwrap(); - assert_eq!( - Type::Primitive(PrimitiveType::Decimal { - precision: 9, - scale: 2 + check_type_serde( + record, + Type::Struct(StructType { + fields: vec![StructField { + id: 1, + name: "id".to_string(), + required: true, + field_type: Type::Primitive(PrimitiveType::Decimal { + precision: 9, + scale: 2, + }), + doc: None, + initial_default: None, + write_default: None, + }], }), - result.fields[0].field_type - ); - let result_two: StructType = serde_json::from_str( - &serde_json::to_string(&result).expect("Failed to serialize result"), ) - .expect("Failed to serialize json"); - assert_eq!(result, result_two); } #[test] @@ -320,16 +342,20 @@ mod tests { } "#; - let result: StructType = serde_json::from_str(record).unwrap(); - assert_eq!( - Type::Primitive(PrimitiveType::Fixed(8)), - result.fields[0].field_type - ); - let result_two: StructType = serde_json::from_str( - &serde_json::to_string(&result).expect("Failed to serialize result"), + check_type_serde( + record, + Type::Struct(StructType { + fields: vec![StructField { + id: 1, + name: "id".to_string(), + required: true, + field_type: Type::Primitive(PrimitiveType::Fixed(8)), + doc: None, + initial_default: None, + write_default: None, + }], + }), ) - .expect("Failed to serialize json"); - assert_eq!(result, result_two); } #[test] @@ -355,20 +381,31 @@ mod tests { } "#; - let result: StructType = serde_json::from_str(record).unwrap(); - assert_eq!( - Type::Primitive(PrimitiveType::Uuid), - result.fields[0].field_type - ); - assert_eq!(1, result.fields[0].id); - assert!(result.fields[0].required); - - assert_eq!( - Type::Primitive(PrimitiveType::Int), - result.fields[1].field_type - ); - assert_eq!(2, result.fields[1].id); - assert!(!result.fields[1].required); + check_type_serde( + record, + Type::Struct(StructType { + fields: vec![ + StructField { + id: 1, + name: "id".to_string(), + required: true, + field_type: Type::Primitive(PrimitiveType::Uuid), + doc: None, + initial_default: Some("0db3e2a8-9d1d-42b9-aa7b-74ebe558dceb".to_string()), + write_default: Some("ec5911be-b0a7-458c-8438-c9a3e53cffae".to_string()), + }, + StructField { + id: 2, + name: "data".to_string(), + required: false, + field_type: Type::Primitive(PrimitiveType::Int), + doc: None, + initial_default: None, + write_default: None, + }, + ], + }), + ) } #[test] @@ -382,8 +419,14 @@ mod tests { } "#; - let result: ListType = serde_json::from_str(record).unwrap(); - assert_eq!(Type::Primitive(PrimitiveType::String), *result.element); + check_type_serde( + record, + Type::List(ListType { + element_id: 3, + element_required: true, + element: Box::new(Type::Primitive(PrimitiveType::String)), + }), + ); } #[test] @@ -399,9 +442,16 @@ mod tests { } "#; - let result: MapType = serde_json::from_str(record).unwrap(); - assert_eq!(Type::Primitive(PrimitiveType::String), *result.key); - assert_eq!(Type::Primitive(PrimitiveType::Double), *result.value); + check_type_serde( + record, + Type::Map(MapType { + key_id: 4, + key: Box::new(Type::Primitive(PrimitiveType::String)), + value_id: 5, + value_required: false, + value: Box::new(Type::Primitive(PrimitiveType::Double)), + }), + ); } #[test] @@ -417,8 +467,15 @@ mod tests { } "#; - let result: MapType = serde_json::from_str(record).unwrap(); - assert_eq!(Type::Primitive(PrimitiveType::Int), *result.key); - assert_eq!(Type::Primitive(PrimitiveType::String), *result.value); + check_type_serde( + record, + Type::Map(MapType { + key_id: 4, + key: Box::new(Type::Primitive(PrimitiveType::Int)), + value_id: 5, + value_required: false, + value: Box::new(Type::Primitive(PrimitiveType::String)), + }), + ); } } From ff332f0132f5bf6c23d2d907c7b255c80d858718 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Mon, 24 Jul 2023 09:09:29 +0200 Subject: [PATCH 12/15] make structtype fields private --- src/spec/datatypes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spec/datatypes.rs b/src/spec/datatypes.rs index 0feb8ae93..5c80de73c 100644 --- a/src/spec/datatypes.rs +++ b/src/spec/datatypes.rs @@ -194,7 +194,7 @@ impl fmt::Display for PrimitiveType { #[serde(rename = "struct", tag = "type")] pub struct StructType { /// Struct fields - pub fields: Vec, + fields: Vec, } impl StructType { From f92a7691191d8b3395a570b14eea6dad0ef3ec22 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Mon, 24 Jul 2023 09:21:58 +0200 Subject: [PATCH 13/15] implement Display for StructType,StructField --- src/spec/datatypes.rs | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/spec/datatypes.rs b/src/spec/datatypes.rs index 5c80de73c..682c436c4 100644 --- a/src/spec/datatypes.rs +++ b/src/spec/datatypes.rs @@ -216,6 +216,16 @@ impl Index for StructType { } } +impl fmt::Display for StructType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "struct<")?; + for field in &self.fields { + write!(f, "{}", field.field_type)?; + } + write!(f, ">") + } +} + #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] #[serde(rename_all = "kebab-case")] /// A struct is a tuple of typed values. Each field in the tuple is named and has an integer id that is unique in the table schema. @@ -242,6 +252,23 @@ pub struct StructField { pub write_default: Option, } +impl fmt::Display for StructField { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}: ", self.id)?; + write!(f, "{}: ", self.name)?; + if self.required { + write!(f, "required ")?; + } else { + write!(f, "optional ")?; + } + write!(f, "{} ", self.field_type)?; + if let Some(doc) = &self.doc { + write!(f, "{}", doc)?; + } + Ok(()) + } +} + #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] #[serde(rename = "list", rename_all = "kebab-case", tag = "type")] /// A list is a collection of values with some element type. The element field has an integer id that is unique in the table schema. From debda0deef99155a23624d8959b3758faa59c9c1 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Mon, 24 Jul 2023 09:27:13 +0200 Subject: [PATCH 14/15] fix display structtype --- src/spec/datatypes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spec/datatypes.rs b/src/spec/datatypes.rs index 682c436c4..3005b2751 100644 --- a/src/spec/datatypes.rs +++ b/src/spec/datatypes.rs @@ -43,7 +43,7 @@ impl fmt::Display for Type { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Type::Primitive(primitive) => write!(f, "{}", primitive), - Type::Struct(_) => write!(f, "struct"), + Type::Struct(s) => write!(f, "{}", s), Type::List(_) => write!(f, "list"), Type::Map(_) => write!(f, "map"), } From cbb7e948165483d33269663c69f32980e77e3928 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Mon, 24 Jul 2023 10:42:45 +0200 Subject: [PATCH 15/15] remove dependencies --- Cargo.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5b55fe3a7..8cdf28e8e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,8 +28,6 @@ keywords = ["iceberg"] [dependencies] apache-avro = "0.15.0" -chrono = { version = "0.4.23", default-features = false, features = ["serde"] } -rust_decimal = "1.27.0" serde = "^1.0" serde_bytes = "0.11.8" serde_json = "^1.0"