apache · devinjdangelo · Dec 28, 2023 · Dec 28, 2023 · Dec 28, 2023 · Dec 28, 2023
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
@@ -98,7 +98,7 @@ jobs:
         with:
           rust-version: stable
       - name: Run tests (excluding doctests)
-        run: cargo test --lib --tests --bins --features avro,json,backtrace
+        run: RUST_MIN_STACK=504857600 cargo test --lib --tests --bins --features avro,json,backtrace
       - name: Verify Working Directory Clean
         run: git diff --exit-code
 

diff --git a/datafusion-examples/examples/bench.py b/datafusion-examples/examples/bench.py
@@ -0,0 +1,88 @@
+import polars as pl
+import time
+from datetime import date
+from datafusion import SessionContext
+
+t = time.time()
+
+#file = "/home/dev/arrow-datafusion/benchmarks/data/tpch_sf10/lineitem/part-0.parquet"
+
+
+file = "/home/dev/arrow-datafusion/test_out/benchon.parquet"
+#file = "/home/dev/arrow-datafusion/test_out/uncompressed.parquet"
+
+
+# Create a DataFusion context
+ctx = SessionContext()
+
+# Register table with context
+ctx.register_parquet('test', file)
+
+times = []
+for i in range(5):
+
+    t = time.time()
+    df = pl.scan_parquet(file) \
+        .filter(pl.col("l_shipdate") <= date(1998, 9, 2)) \
+        .group_by("l_returnflag", "l_linestatus") \
+        .agg([
+            pl.col("l_quantity").sum().alias("sum_qty"),
+            pl.col("l_extendedprice").sum().alias("sum_base_price"),
+            (pl.col("l_extendedprice") * (1 - pl.col("l_discount"))).sum().alias("sum_disc_price"),
+            (pl.col("l_extendedprice") * (1 - pl.col("l_discount")) * (1 + pl.col("l_tax"))).sum().alias("sum_charge"),
+            pl.col("l_quantity").mean().alias("avg_qty"),
+            pl.col("l_extendedprice").mean().alias("avg_price"),
+            pl.col("l_discount").mean().alias("avg_disc"),
+            pl.count().alias("count_order")
+        ]
+        ) \
+        .sort([pl.col("l_returnflag"), pl.col("l_linestatus")])
+    df = df.collect()
+
+    print(f"polars agg query {time.time()-t}s")
+
+    #t = time.time()
+    #pl.scan_parquet(file).sink_parquet("test_out/pl.parquet")
+    #print(f"polars re-encode job {time.time()-t}")
+
+
+    t = time.time()
+
+
+    query = """
+   select
+    l_returnflag,
+    l_linestatus,
+    sum(l_quantity) as sum_qty,
+    sum(l_extendedprice) as sum_base_price,
+    sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
+    sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
+    avg(l_quantity) as avg_qty,
+    avg(l_extendedprice) as avg_price,
+    avg(l_discount) as avg_disc,
+    count(*) as count_order
+from
+    test
+where
+        l_shipdate <= date '1998-09-02'
+group by
+    l_returnflag,
+    l_linestatus
+order by
+    l_returnflag,
+    l_linestatus;
+    """
+
+    # Execute SQL
+    df = ctx.sql(f"{query}").cache()
+    elapsed = time.time() - t
+    times.append(elapsed)
+    print(f"datafusion agg query {elapsed}s")
+
+    # t = time.time()
+    # df = ctx.sql("copy test to 'test_out/df.parquet'")
+    # df.show()
+    # print(f"datafusion re-encode job {time.time() - t}s")
+
+print(sum(times)/len(times))
+
diff --git a/datafusion-examples/test_csv/5idJzooCjySkTpLf_0.csv b/datafusion-examples/test_csv/5idJzooCjySkTpLf_0.csv
diff --git a/datafusion-examples/test_csv/HGQahh0J2jquvQ6M_0.csv b/datafusion-examples/test_csv/HGQahh0J2jquvQ6M_0.csv
@@ -0,0 +1,4 @@
+tablecol1
+a
+b
+c
diff --git a/datafusion-examples/test_json/Ml2qxq3EGuEpgtz6_0.json b/datafusion-examples/test_json/Ml2qxq3EGuEpgtz6_0.json
@@ -0,0 +1,3 @@
+{"tablecol1":"a"}
+{"tablecol1":"b"}
+{"tablecol1":"c"}
diff --git a/datafusion-examples/test_json/i87q26cL7eYNxxnQ_0.json b/datafusion-examples/test_json/i87q26cL7eYNxxnQ_0.json
@@ -0,0 +1,3 @@
+{"tablecol1":"a"}
+{"tablecol1":"b"}
+{"tablecol1":"c"}
diff --git a/datafusion-examples/test_parquet/YTByOUmxGAgr4Z8t_0.parquet b/datafusion-examples/test_parquet/YTByOUmxGAgr4Z8t_0.parquet
diff --git a/datafusion-examples/test_parquet/e9tydLQCWeJ8ul8D_0.parquet b/datafusion-examples/test_parquet/e9tydLQCWeJ8ul8D_0.parquet
diff --git a/datafusion-examples/test_table/EkdN1r77DpYeMh3w_0.parquet b/datafusion-examples/test_table/EkdN1r77DpYeMh3w_0.parquet
diff --git a/datafusion-examples/test_table/oUn5SdwSqetWEpqU_0.parquet b/datafusion-examples/test_table/oUn5SdwSqetWEpqU_0.parquet
diff --git a/datafusion/common/src/file_options/file_type.rs b/datafusion/common/src/file_options/file_type.rs
@@ -18,9 +18,11 @@
 //! File type abstraction
 
 use crate::error::{DataFusionError, Result};
+use crate::parsers::CompressionTypeVariant;
 
 use core::fmt;
 use std::fmt::Display;
+use std::hash::Hasher;
 use std::str::FromStr;
 
 /// The default file extension of arrow files
@@ -40,93 +42,39 @@ pub trait GetExt {
     fn get_ext(&self) -> String;
 }
 
-/// Readable file type
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub enum FileType {
-    /// Apache Arrow file
-    ARROW,
-    /// Apache Avro file
-    AVRO,
-    /// Apache Parquet file
-    #[cfg(feature = "parquet")]
-    PARQUET,
-    /// CSV file
-    CSV,
-    /// JSON file
-    JSON,
-}
+/// A trait which provides information at planning time about a type of file which may be
+/// defined externally. Use `SessionContext::register_file_type` to add new implementations.
+pub trait FileType:
+    std::fmt::Debug + FileTypeClone + Send + Sync
+{
+    /// Returns the default file extension for this type, e.g. CSV would return `".csv".to_owned()`.
+    /// The default extension is also used to uniquely identify a [`FileType`] implementation
+    /// (for example, it is the value hashed by the `Hash` impl for `Box<dyn FileType>`), so ensure
+    /// this `String` is unique from any built-in file type and any other externally defined one.
+    fn default_extension(&self) -> String;
 
-impl GetExt for FileType {
-    fn get_ext(&self) -> String {
-        match self {
-            FileType::ARROW => DEFAULT_ARROW_EXTENSION.to_owned(),
-            FileType::AVRO => DEFAULT_AVRO_EXTENSION.to_owned(),
-            #[cfg(feature = "parquet")]
-            FileType::PARQUET => DEFAULT_PARQUET_EXTENSION.to_owned(),
-            FileType::CSV => DEFAULT_CSV_EXTENSION.to_owned(),
-            FileType::JSON => DEFAULT_JSON_EXTENSION.to_owned(),
-        }
-    }
+    /// Returns the file extension when it is compressed with a given [`CompressionTypeVariant`].
+    fn extension_with_compression(
+        &self,
+        compression: CompressionTypeVariant,
+    ) -> Result<String>;
 }
 
-impl Display for FileType {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let out = match self {
-            FileType::CSV => "csv",
-            FileType::JSON => "json",
-            #[cfg(feature = "parquet")]
-            FileType::PARQUET => "parquet",
-            FileType::AVRO => "avro",
-            FileType::ARROW => "arrow",
-        };
-        write!(f, "{}", out)
-    }
+pub trait FileTypeClone {
+    fn clone_box(&self) -> Box<dyn FileType>;
 }
 
-impl FromStr for FileType {
-    type Err = DataFusionError;
-
-    fn from_str(s: &str) -> Result<Self> {
-        let s = s.to_uppercase();
-        match s.as_str() {
-            "ARROW" => Ok(FileType::ARROW),
-            "AVRO" => Ok(FileType::AVRO),
-            #[cfg(feature = "parquet")]
-            "PARQUET" => Ok(FileType::PARQUET),
-            "CSV" => Ok(FileType::CSV),
-            "JSON" | "NDJSON" => Ok(FileType::JSON),
-            _ => Err(DataFusionError::NotImplemented(format!(
-                "Unknown FileType: {s}"
-            ))),
-        }
+impl Clone for Box<dyn FileType> {
+    fn clone(&self) -> Box<dyn FileType> {
+        self.clone_box()
     }
 }
 
-#[cfg(test)]
-#[cfg(feature = "parquet")]
-mod tests {
-    use crate::error::DataFusionError;
-    use crate::file_options::FileType;
-    use std::str::FromStr;
-
-    #[test]
-    fn from_str() {
-        for (ext, file_type) in [
-            ("csv", FileType::CSV),
-            ("CSV", FileType::CSV),
-            ("json", FileType::JSON),
-            ("JSON", FileType::JSON),
-            ("avro", FileType::AVRO),
-            ("AVRO", FileType::AVRO),
-            ("parquet", FileType::PARQUET),
-            ("PARQUET", FileType::PARQUET),
-        ] {
-            assert_eq!(FileType::from_str(ext).unwrap(), file_type);
-        }
-
-        assert!(matches!(
-            FileType::from_str("Unknown"),
-            Err(DataFusionError::NotImplemented(_))
-        ));
+impl std::hash::Hash for Box<dyn FileType> {
+    fn hash<H>(&self, state: &mut H)
+    where
+        H: Hasher,
+    {
+        self.default_extension().hash(state)
     }
 }