From becead9666cd211e5f728ee5c02bd5e29eadfba6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rodrigo=20Gir=C3=A3o=20Serr=C3=A3o?= <5621605+rodrigogiraoserrao@users.noreply.github.com> Date: Tue, 1 Oct 2024 15:21:55 +0100 Subject: [PATCH] docs: Rewrite 'Getting started' page (#19028) --- docs/README.md | 9 + docs/source/_build/API_REFERENCE_LINKS.yml | 46 +++- docs/source/index.md | 4 +- .../src/python/user-guide/getting-started.py | 135 ++++++++++++ .../user-guide/getting-started/expressions.py | 87 -------- .../user-guide/getting-started/joins.py | 29 --- .../getting-started/reading-writing.py | 42 ---- docs/source/src/rust/Cargo.toml | 14 +- .../src/rust/user-guide/getting-started.rs | 196 ++++++++++++++++++ .../user-guide/getting-started/expressions.rs | 137 ------------ .../rust/user-guide/getting-started/joins.rs | 29 --- .../getting-started/reading-writing.rs | 71 ------- docs/source/user-guide/getting-started.md | 191 +++++++++-------- 13 files changed, 481 insertions(+), 509 deletions(-) create mode 100644 docs/README.md create mode 100644 docs/source/src/python/user-guide/getting-started.py delete mode 100644 docs/source/src/python/user-guide/getting-started/expressions.py delete mode 100644 docs/source/src/python/user-guide/getting-started/joins.py delete mode 100644 docs/source/src/python/user-guide/getting-started/reading-writing.py create mode 100644 docs/source/src/rust/user-guide/getting-started.rs delete mode 100644 docs/source/src/rust/user-guide/getting-started/expressions.rs delete mode 100644 docs/source/src/rust/user-guide/getting-started/joins.rs delete mode 100644 docs/source/src/rust/user-guide/getting-started/reading-writing.rs diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 000000000000..60b690c1c100 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,9 @@ +The documentation is split across two subfolders, `source` and `assets`. 
+The folder `source` contains the static source files that make up the user guide, which are mostly markdown files and the snippets of code. +The folder `assets` contains (dynamically generated) assets used by those files, including data files for the snippets and images with plots or diagrams. + +Do _not_ merge the two folders together. +In [PR #18773](https://github.com/pola-rs/polars/pull/18773) we introduced this split to fix the MkDocs server live reloading. +If everything is in one folder `docs`, the MkDocs server will watch the folder `docs`. +When you make one change the MkDocs server live reloads and rebuilds the docs. +This triggers scripts that build asset files, which change the folder `docs`, leading to an infinite reloading loop. diff --git a/docs/source/_build/API_REFERENCE_LINKS.yml b/docs/source/_build/API_REFERENCE_LINKS.yml index c39ffc310a07..2a9bc80237dc 100644 --- a/docs/source/_build/API_REFERENCE_LINKS.yml +++ b/docs/source/_build/API_REFERENCE_LINKS.yml @@ -8,8 +8,9 @@ python: filter: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.filter.html with_columns: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.with_columns.html group_by: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.group_by.html + agg: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.dataframe.group_by.GroupBy.agg.html join: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join.html - hstack: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.hstack.html + vstack: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.vstack.html read_csv: https://docs.pola.rs/api/python/stable/reference/api/polars.read_csv.html write_csv: https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_csv.html read_excel: 
https://docs.pola.rs/api/python/stable/reference/api/polars.read_excel.html @@ -34,7 +35,12 @@ python: fold: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.fold.html concat_str: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.concat_str.html str.split: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.split.html - Expr.list: https://docs.pola.rs/api/python/stable/reference/expressions/list.html + Expr.list: + name: "list namespace" + link: https://docs.pola.rs/api/python/stable/reference/expressions/list.html + Expr.str: + name: "str namespace" + link: https://docs.pola.rs/api/python/stable/reference/expressions/string.html element: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.element.html all: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.all.html exclude: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.exclude.html @@ -122,6 +128,9 @@ python: is_selector: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.is_selector expand_selector: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.expand_selector + Expr.dt: + name: "dt namespace" + link: https://docs.pola.rs/api/python/stable/reference/expressions/temporal.html dt.convert_time_zone: name: dt.convert_time_zone link: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.dt.convert_time_zone.html @@ -147,6 +156,10 @@ python: struct.field: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.struct.field.html struct.rename_fields: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.struct.rename_fields.html + Expr.name: + name: "name namespace" + link: https://docs.pola.rs/api/python/stable/reference/expressions/name.html + round: 
https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.round.html#polars.Expr.round rust: DataFrame: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html @@ -160,12 +173,13 @@ rust: filter: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.filter with_columns: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.with_columns group_by: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.group_by + agg: https://docs.rs/polars/latest/polars/prelude/struct.LazyGroupBy.html#method.agg group_by_dynamic: name: group_by_dynamic link: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.group_by_dynamic feature_flags: [dynamic_group_by] join: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.DataFrameJoinOps.html#method.join - hstack: https://docs.pola.rs/api/rust/dev/polars_core/frame/struct.DataFrame.html#method.hstack + vstack: https://docs.pola.rs/api/rust/dev/polars_core/frame/struct.DataFrame.html#method.vstack concat: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/functions/fn.concat.html explain: https://docs.rs/polars/latest/polars/prelude/struct.LazyFrame.html#method.explain @@ -283,7 +297,10 @@ rust: n_unique: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.n_unique null_count: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.null_count interpolate: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.interpolate - is_between: https://github.com/pola-rs/polars/issues/11285 + is_between: + name: is_between + link: https://docs.pola.rs/api/rust/dev/polars/prelude/enum.Expr.html#method.is_between + feature_flags: [is_between] is_duplicated: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.is_duplicated is_null: https://docs.pola.rs/api/rust/dev/polars/prelude/enum.Expr.html#method.is_null value_counts: @@ -291,7 +308,13 
@@ rust: link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.value_counts feature_flags: [dtype-struct] - Expr.list: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.ListNameSpace.html + Expr.list: + name: "list namespace" + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.ListNameSpace.html + Expr.str: + name: "str namespace" + link: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.StringNameSpaceImpl.html + feature_flags: [strings] Series.arr: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.ArrayNameSpace.html date_range: @@ -374,3 +397,16 @@ rust: struct.field: name: struct.field_by_name link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.StructNameSpace.html#method.field_by_name + + Expr.name: + name: "name namespace" + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.ExprNameNameSpace.html + feature_flags: [lazy] + Expr.dt: + name: "dt namespace" + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/dt/struct.DateLikeNameSpace.html + feature_flags: [temporal] + round: + name: "round" + link: https://docs.pola.rs/api/rust/dev/polars/prelude/enum.Expr.html#method.round + feature_flags: [round_series] diff --git a/docs/source/index.md b/docs/source/index.md index 279996457ddd..4fd988e974eb 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -23,7 +23,7 @@ Polars is a blazingly fast DataFrame library for manipulating structured data. T - **Fast**: Written from scratch in Rust, designed close to the machine and without external dependencies. - **I/O**: First class support for all common data storage layers: local, cloud storage & databases. - **Intuitive API**: Write your queries the way they were intended. Polars, internally, will determine the most efficient way to execute using its query optimizer. 
-- **Out of Core**: The streaming API allows you to process your results without requiring all your data to be in memory at the same time +- **Out of Core**: The streaming API allows you to process your results without requiring all your data to be in memory at the same time. - **Parallel**: Utilises the power of your machine by dividing the workload among the available CPU cores without any additional configuration. - **Vectorized Query Engine**: Using [Apache Arrow](https://arrow.apache.org/), a columnar data format, to process your queries in a vectorized manner and SIMD to optimize CPU usage. - **GPU Support**: Optionally run queries on NVIDIA GPUs for maximum performance for in-memory workloads. @@ -45,7 +45,7 @@ The goal of Polars is to provide a lightning fast DataFrame library that: - A consistent and predictable API. - Adheres to a strict schema (data-types should be known before running the query). -Polars is written in Rust which gives it C/C++ performance and allows it to fully control performance critical parts in a query engine. +Polars is written in Rust which gives it C/C++ performance and allows it to fully control performance-critical parts in a query engine. 
## Example diff --git a/docs/source/src/python/user-guide/getting-started.py b/docs/source/src/python/user-guide/getting-started.py new file mode 100644 index 000000000000..d68207ebd60d --- /dev/null +++ b/docs/source/src/python/user-guide/getting-started.py @@ -0,0 +1,135 @@ +# --8<-- [start:df] +import polars as pl +import datetime as dt + +df = pl.DataFrame( + { + "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"], + "birthdate": [ + dt.date(1997, 1, 10), + dt.date(1985, 2, 15), + dt.date(1983, 3, 22), + dt.date(1981, 4, 30), + ], + "weight": [57.9, 72.5, 53.6, 83.1], # (kg) + "height": [1.56, 1.77, 1.65, 1.75], # (m) + } +) + +print(df) +# --8<-- [end:df] + +# --8<-- [start:csv] +df.write_csv("docs/assets/data/output.csv") +df_csv = pl.read_csv("docs/assets/data/output.csv", try_parse_dates=True) +print(df_csv) +# --8<-- [end:csv] + +# --8<-- [start:select] +result = df.select( + pl.col("name"), + pl.col("birthdate").dt.year().alias("birth_year"), + (pl.col("weight") / (pl.col("height") ** 2)).alias("bmi"), +) +print(result) +# --8<-- [end:select] + +# --8<-- [start:expression-expansion] +result = df.select( + pl.col("name"), + (pl.col("weight", "height") * 0.95).round(2).name.suffix("-5%"), +) +print(result) +# --8<-- [end:expression-expansion] + +# --8<-- [start:with_columns] +result = df.with_columns( + birth_year=pl.col("birthdate").dt.year(), + bmi=pl.col("weight") / (pl.col("height") ** 2), +) +print(result) +# --8<-- [end:with_columns] + +# --8<-- [start:filter] +result = df.filter(pl.col("birthdate").dt.year() < 1990) +print(result) +# --8<-- [end:filter] + +# --8<-- [start:filter-multiple] +result = df.filter( + pl.col("birthdate").is_between(dt.date(1982, 12, 31), dt.date(1996, 1, 1)), + pl.col("height") > 1.7, +) +print(result) +# --8<-- [end:filter-multiple] + +# --8<-- [start:group_by] +result = df.group_by( + (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"), + maintain_order=True, +).len() +print(result) +# --8<-- 
[end:group_by] + +# --8<-- [start:group_by-agg] +result = df.group_by( + (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"), + maintain_order=True, +).agg( + pl.len().alias("sample_size"), + pl.col("weight").mean().round(2).alias("avg_weight"), + pl.col("height").max().alias("tallest"), +) +print(result) +# --8<-- [end:group_by-agg] + +# --8<-- [start:complex] +result = ( + df.with_columns( + (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"), + pl.col("name").str.split(by=" ").list.first(), + ) + .select( + pl.all().exclude("birthdate"), + ) + .group_by( + pl.col("decade"), + maintain_order=True, + ) + .agg( + pl.col("name"), + pl.col("weight", "height").mean().round(2).name.prefix("avg_"), + ) +) +print(result) +# --8<-- [end:complex] + +# --8<-- [start:join] +df2 = pl.DataFrame( + { + "name": ["Ben Brown", "Daniel Donovan", "Alice Archer", "Chloe Cooper"], + "parent": [True, False, False, False], + "siblings": [1, 2, 3, 4], + } +) + +print(df.join(df2, on="name", how="left")) +# --8<-- [end:join] + +# --8<-- [start:concat] +df3 = pl.DataFrame( + { + "name": ["Ethan Edwards", "Fiona Foster", "Grace Gibson", "Henry Harris"], + "birthdate": [ + dt.date(1977, 5, 10), + dt.date(1975, 6, 23), + dt.date(1973, 7, 22), + dt.date(1971, 8, 3), + ], + "weight": [67.9, 72.5, 57.6, 93.1], # (kg) + "height": [1.76, 1.6, 1.66, 1.8], # (m) + } +) + +print(pl.concat([df, df3], how="vertical")) +# --8<-- [end:concat] diff --git a/docs/source/src/python/user-guide/getting-started/expressions.py b/docs/source/src/python/user-guide/getting-started/expressions.py deleted file mode 100644 index dd27738f33ef..000000000000 --- a/docs/source/src/python/user-guide/getting-started/expressions.py +++ /dev/null @@ -1,87 +0,0 @@ -# --8<-- [start:setup] -from datetime import datetime - -import numpy as np -import polars as pl - -df = pl.DataFrame( - { - "a": range(5), - "b": np.random.rand(5), - "c": [ - datetime(2025, 12, 1), - datetime(2025, 12, 2), - datetime(2025, 12, 3), - 
datetime(2025, 12, 4), - datetime(2025, 12, 5), - ], - "d": [1.0, 2.0, float("nan"), -42.0, None], - } -) -# --8<-- [end:setup] - -# --8<-- [start:select] -df.select(pl.col("*")) -# --8<-- [end:select] - -# --8<-- [start:select2] -df.select(pl.col("a", "b")) -# --8<-- [end:select2] - -# --8<-- [start:select3] -df.select(pl.col("a"), pl.col("b")).limit(3) -# --8<-- [end:select3] - -# --8<-- [start:exclude] -df.select(pl.exclude(["a", "c"])) -# --8<-- [end:exclude] - -# --8<-- [start:filter] -df.filter( - pl.col("c").is_between(datetime(2025, 12, 2), datetime(2025, 12, 3)), -) -# --8<-- [end:filter] - -# --8<-- [start:filter2] -df.filter((pl.col("a") <= 3) & (pl.col("d").is_not_nan())) -# --8<-- [end:filter2] - -# --8<-- [start:with_columns] -df.with_columns(pl.col("b").sum().alias("e"), (pl.col("b") + 42).alias("b+42")) -# --8<-- [end:with_columns] - -# --8<-- [start:dataframe2] -df2 = pl.DataFrame( - { - "x": range(8), - "y": ["A", "A", "A", "B", "B", "C", "X", "X"], - } -) -# --8<-- [end:dataframe2] - -# --8<-- [start:group_by] -df2.group_by("y", maintain_order=True).len() -# --8<-- [end:group_by] - -# --8<-- [start:group_by2] -df2.group_by("y", maintain_order=True).agg( - pl.col("*").count().alias("count"), - pl.col("*").sum().alias("sum"), -) -# --8<-- [end:group_by2] - -# --8<-- [start:combine] -df_x = df.with_columns((pl.col("a") * pl.col("b")).alias("a * b")).select( - pl.all().exclude(["c", "d"]) -) - -print(df_x) -# --8<-- [end:combine] - -# --8<-- [start:combine2] -df_y = df.with_columns((pl.col("a") * pl.col("b")).alias("a * b")).select( - pl.all().exclude("d") -) - -print(df_y) -# --8<-- [end:combine2] diff --git a/docs/source/src/python/user-guide/getting-started/joins.py b/docs/source/src/python/user-guide/getting-started/joins.py deleted file mode 100644 index fd7dcc19eb4a..000000000000 --- a/docs/source/src/python/user-guide/getting-started/joins.py +++ /dev/null @@ -1,29 +0,0 @@ -# --8<-- [start:setup] -import numpy as np -import polars as pl - -# 
--8<-- [end:setup] - -# --8<-- [start:join] -df = pl.DataFrame( - { - "a": range(8), - "b": np.random.rand(8), - "d": [1.0, 2.0, float("nan"), float("nan"), 0.0, -5.0, -42.0, None], - } -) - -df2 = pl.DataFrame( - { - "x": range(8), - "y": ["A", "A", "A", "B", "B", "C", "X", "X"], - } -) -joined = df.join(df2, left_on="a", right_on="x") -print(joined) -# --8<-- [end:join] - -# --8<-- [start:hstack] -stacked = df.hstack(df2) -print(stacked) -# --8<-- [end:hstack] diff --git a/docs/source/src/python/user-guide/getting-started/reading-writing.py b/docs/source/src/python/user-guide/getting-started/reading-writing.py deleted file mode 100644 index 72eb819bba92..000000000000 --- a/docs/source/src/python/user-guide/getting-started/reading-writing.py +++ /dev/null @@ -1,42 +0,0 @@ -# --8<-- [start:dataframe] -import polars as pl -from datetime import datetime - -df = pl.DataFrame( - { - "integer": [1, 2, 3], - "date": [ - datetime(2025, 1, 1), - datetime(2025, 1, 2), - datetime(2025, 1, 3), - ], - "float": [4.0, 5.0, 6.0], - "string": ["a", "b", "c"], - } -) - -print(df) -# --8<-- [end:dataframe] - -# --8<-- [start:csv] -df.write_csv("docs/assets/data/output.csv") -df_csv = pl.read_csv("docs/assets/data/output.csv") -print(df_csv) -# --8<-- [end:csv] - -# --8<-- [start:csv2] -df_csv = pl.read_csv("docs/assets/data/output.csv", try_parse_dates=True) -print(df_csv) -# --8<-- [end:csv2] - -# --8<-- [start:json] -df.write_json("docs/assets/data/output.json") -df_json = pl.read_json("docs/assets/data/output.json") -print(df_json) -# --8<-- [end:json] - -# --8<-- [start:parquet] -df.write_parquet("docs/assets/data/output.parquet") -df_parquet = pl.read_parquet("docs/assets/data/output.parquet") -print(df_parquet) -# --8<-- [end:parquet] diff --git a/docs/source/src/rust/Cargo.toml b/docs/source/src/rust/Cargo.toml index c99561340b12..061c60d02948 100644 --- a/docs/source/src/rust/Cargo.toml +++ b/docs/source/src/rust/Cargo.toml @@ -26,16 +26,10 @@ path = "home/example.rs" 
required-features = ["polars/lazy", "polars/csv"] [[bin]] -name = "user-guide-getting-started-expressions" -path = "user-guide/getting-started/expressions.rs" -required-features = ["polars/lazy"] -[[bin]] -name = "user-guide-getting-started-joins" -path = "user-guide/getting-started/joins.rs" -[[bin]] -name = "user-guide-getting-started-reading-writing" -path = "user-guide/getting-started/reading-writing.rs" -required-features = ["polars/json"] +name = "user-guide-getting-started" +path = "user-guide/getting-started.rs" +required-features = ["polars/lazy", "polars/temporal", "polars/round_series", "polars/strings"] + [[bin]] name = "user-guide-concepts-data-structures" path = "user-guide/concepts/data-structures.rs" diff --git a/docs/source/src/rust/user-guide/getting-started.rs b/docs/source/src/rust/user-guide/getting-started.rs new file mode 100644 index 000000000000..362c99b533c9 --- /dev/null +++ b/docs/source/src/rust/user-guide/getting-started.rs @@ -0,0 +1,196 @@ +fn main() -> Result<(), Box> { + // --8<-- [start:df] + use chrono::prelude::*; + use polars::prelude::*; + + let mut df: DataFrame = df!( + "name" => ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"], + "birthdate" => [ + NaiveDate::from_ymd_opt(1997, 1, 10).unwrap(), + NaiveDate::from_ymd_opt(1985, 2, 15).unwrap(), + NaiveDate::from_ymd_opt(1983, 3, 22).unwrap(), + NaiveDate::from_ymd_opt(1981, 4, 30).unwrap(), + ], + "weight" => [57.9, 72.5, 53.6, 83.1], // (kg) + "height" => [1.56, 1.77, 1.65, 1.75], // (m) + ) + .unwrap(); + println!("{}", df); + // --8<-- [end:df] + + // --8<-- [start:csv] + use std::fs::File; + + let mut file = File::create("../../../assets/data/output.csv").expect("could not create file"); + CsvWriter::new(&mut file) + .include_header(true) + .with_separator(b',') + .finish(&mut df)?; + let df_csv = CsvReadOptions::default() + .with_infer_schema_length(None) + .with_has_header(true) + .with_parse_options(CsvParseOptions::default().with_try_parse_dates(true)) 
+ .try_into_reader_with_file_path(Some("../../../assets/data/output.csv".into()))? + .finish()?; + println!("{}", df_csv); + // --8<-- [end:csv] + + // --8<-- [start:select] + let result = df + .clone() + .lazy() + .select([ + col("name"), + col("birthdate").dt().year().alias("birth_year"), + (col("weight") / col("height").pow(2)).alias("bmi"), + ]) + .collect()?; + println!("{}", result); + // --8<-- [end:select] + + // --8<-- [start:expression-expansion] + let result = df + .clone() + .lazy() + .select([ + col("name"), + (cols(["weight", "height"]) * lit(0.95)) + .round(2) + .name() + .suffix("-5%"), + ]) + .collect()?; + println!("{}", result); + // --8<-- [end:expression-expansion] + + // --8<-- [start:with_columns] + let result = df + .clone() + .lazy() + .with_columns([ + col("birthdate").dt().year().alias("birth_year"), + (col("weight") / col("height").pow(2)).alias("bmi"), + ]) + .collect()?; + println!("{}", result); + // --8<-- [end:with_columns] + + // --8<-- [start:filter] + let result = df + .clone() + .lazy() + .filter(col("birthdate").dt().year().lt(lit(1990))) + .collect()?; + println!("{}", result); + // --8<-- [end:filter] + + // --8<-- [start:filter-multiple] + let result = df + .clone() + .lazy() + .filter( + col("birthdate") + .is_between( + lit(NaiveDate::from_ymd_opt(1982, 12, 31).unwrap()), + lit(NaiveDate::from_ymd_opt(1996, 1, 1).unwrap()), + ClosedInterval::Both, + ) + .and(col("height").gt(lit(1.7))), + ) + .collect()?; + println!("{}", result); + // --8<-- [end:filter-multiple] + + // --8<-- [start:group_by] + // Use `group_by_stable` if you want the Python behaviour of `maintain_order=True`. 
+ let result = df + .clone() + .lazy() + .group_by([(col("birthdate").dt().year() / lit(10) * lit(10)).alias("decade")]) + .agg([len()]) + .collect()?; + println!("{}", result); + // --8<-- [end:group_by] + + // --8<-- [start:group_by-agg] + let result = df + .clone() + .lazy() + .group_by([(col("birthdate").dt().year() / lit(10) * lit(10)).alias("decade")]) + .agg([ + len().alias("sample_size"), + col("weight").mean().round(2).alias("avg_weight"), + col("height").max().alias("tallest"), + ]) + .collect()?; + println!("{}", result); + // --8<-- [end:group_by-agg] + + // --8<-- [start:complex] + let result = df + .clone() + .lazy() + .with_columns([ + (col("birthdate").dt().year() / lit(10) * lit(10)).alias("decade"), + col("name").str().split(lit(" ")).list().first(), + ]) + .select([all().exclude(["birthdate"])]) + .group_by([col("decade")]) + .agg([ + col("name"), + cols(["weight", "height"]) + .mean() + .round(2) + .name() + .prefix("avg_"), + ]) + .collect()?; + println!("{}", result); + // --8<-- [end:complex] + + // --8<-- [start:join] + let df2: DataFrame = df!( + "name" => ["Ben Brown", "Daniel Donovan", "Alice Archer", "Chloe Cooper"], + "parent" => [true, false, false, false], + "siblings" => [1, 2, 3, 4], + ) + .unwrap(); + + let result = df + .clone() + .lazy() + .join( + df2.clone().lazy(), + [col("name")], + [col("name")], + JoinArgs::new(JoinType::Left), + ) + .collect()?; + + println!("{}", result); + // --8<-- [end:join] + + // --8<-- [start:concat] + let df3: DataFrame = df!( + "name" => ["Ethan Edwards", "Fiona Foster", "Grace Gibson", "Henry Harris"], + "birthdate" => [ + NaiveDate::from_ymd_opt(1977, 5, 10).unwrap(), + NaiveDate::from_ymd_opt(1975, 6, 23).unwrap(), + NaiveDate::from_ymd_opt(1973, 7, 22).unwrap(), + NaiveDate::from_ymd_opt(1971, 8, 3).unwrap(), + ], + "weight" => [67.9, 72.5, 57.6, 93.1], // (kg) + "height" => [1.76, 1.6, 1.66, 1.8], // (m) + ) + .unwrap(); + + let result = concat( + [df.clone().lazy(), df3.clone().lazy()], + 
UnionArgs::default(), + )? + .collect()?; + println!("{}", result); + // --8<-- [end:concat] + + Ok(()) +} diff --git a/docs/source/src/rust/user-guide/getting-started/expressions.rs b/docs/source/src/rust/user-guide/getting-started/expressions.rs deleted file mode 100644 index 757c52e3939f..000000000000 --- a/docs/source/src/rust/user-guide/getting-started/expressions.rs +++ /dev/null @@ -1,137 +0,0 @@ -use chrono::prelude::*; -use polars::prelude::*; -use rand::Rng; - -fn main() -> Result<(), Box> { - let mut rng = rand::thread_rng(); - - let df: DataFrame = df!( - "a" => 0..5, - "b"=> (0..5).map(|_| rng.gen::()).collect::>(), - "c"=> [ - NaiveDate::from_ymd_opt(2025, 12, 1).unwrap().and_hms_opt(0, 0, 0).unwrap(), - NaiveDate::from_ymd_opt(2025, 12, 2).unwrap().and_hms_opt(0, 0, 0).unwrap(), - NaiveDate::from_ymd_opt(2025, 12, 3).unwrap().and_hms_opt(0, 0, 0).unwrap(), - NaiveDate::from_ymd_opt(2025, 12, 4).unwrap().and_hms_opt(0, 0, 0).unwrap(), - NaiveDate::from_ymd_opt(2025, 12, 5).unwrap().and_hms_opt(0, 0, 0).unwrap(), - ], - "d"=> [Some(1.0), Some(2.0), None, Some(-42.), None] - ) - .unwrap(); - - // --8<-- [start:select] - let out = df.clone().lazy().select([col("*")]).collect()?; - println!("{}", out); - // --8<-- [end:select] - - // --8<-- [start:select2] - let out = df.clone().lazy().select([col("a"), col("b")]).collect()?; - println!("{}", out); - // --8<-- [end:select2] - - // --8<-- [start:select3] - let out = df - .clone() - .lazy() - .select([col("a"), col("b")]) - .limit(3) - .collect()?; - println!("{}", out); - // --8<-- [end:select3] - - // --8<-- [start:exclude] - let out = df - .clone() - .lazy() - .select([col("*").exclude(["a", "c"])]) - .collect()?; - println!("{}", out); - // --8<-- [end:exclude] - - // --8<-- [start:filter] - let start_date = NaiveDate::from_ymd_opt(2025, 12, 2) - .unwrap() - .and_hms_opt(0, 0, 0) - .unwrap(); - let end_date = NaiveDate::from_ymd_opt(2025, 12, 3) - .unwrap() - .and_hms_opt(0, 0, 0) - .unwrap(); - let out 
= df - .clone() - .lazy() - .filter( - col("c") - .gt_eq(lit(start_date)) - .and(col("c").lt_eq(lit(end_date))), - ) - .collect()?; - println!("{}", out); - // --8<-- [end:filter] - - // --8<-- [start:filter2] - let out = df - .clone() - .lazy() - .filter(col("a").lt_eq(3).and(col("d").is_not_null())) - .collect()?; - println!("{}", out); - // --8<-- [end:filter2] - - // --8<-- [start:with_columns] - let out = df - .clone() - .lazy() - .with_columns([ - col("b").sum().alias("e"), - (col("b") + lit(42)).alias("b+42"), - ]) - .collect()?; - println!("{}", out); - // --8<-- [end:with_columns] - - // --8<-- [start:dataframe2] - let df2: DataFrame = df!("x" => 0..8, - "y"=> &["A", "A", "A", "B", "B", "C", "X", "X"], - ) - .expect("should not fail"); - println!("{}", df2); - // --8<-- [end:dataframe2] - - // --8<-- [start:group_by] - let out = df2.clone().lazy().group_by(["y"]).agg([len()]).collect()?; - println!("{}", out); - // --8<-- [end:group_by] - - // --8<-- [start:group_by2] - let out = df2 - .clone() - .lazy() - .group_by(["y"]) - .agg([col("*").count().alias("count"), col("*").sum().alias("sum")]) - .collect()?; - println!("{}", out); - // --8<-- [end:group_by2] - - // --8<-- [start:combine] - let out = df - .clone() - .lazy() - .with_columns([(col("a") * col("b")).alias("a * b")]) - .select([col("*").exclude(["c", "d"])]) - .collect()?; - println!("{}", out); - // --8<-- [end:combine] - - // --8<-- [start:combine2] - let out = df - .clone() - .lazy() - .with_columns([(col("a") * col("b")).alias("a * b")]) - .select([col("*").exclude(["d"])]) - .collect()?; - println!("{}", out); - // --8<-- [end:combine2] - - Ok(()) -} diff --git a/docs/source/src/rust/user-guide/getting-started/joins.rs b/docs/source/src/rust/user-guide/getting-started/joins.rs deleted file mode 100644 index a5f36c73f342..000000000000 --- a/docs/source/src/rust/user-guide/getting-started/joins.rs +++ /dev/null @@ -1,29 +0,0 @@ -use polars::prelude::*; - -fn main() -> Result<(), Box> { - // 
--8<-- [start:join] - use rand::Rng; - let mut rng = rand::thread_rng(); - - let df: DataFrame = df!( - "a" => 0..8, - "b"=> (0..8).map(|_| rng.gen::()).collect::>(), - "d"=> [Some(1.0), Some(2.0), Some(f64::NAN), Some(f64::NAN), Some(0.0), Some(-5.0), Some(-42.), None] - ) - .unwrap(); - let df2: DataFrame = df!( - "x" => 0..8, - "y"=> &["A", "A", "A", "B", "B", "C", "X", "X"], - ) - .unwrap(); - let joined = df.join(&df2, ["a"], ["x"], JoinType::Left.into())?; - println!("{}", joined); - // --8<-- [end:join] - - // --8<-- [start:hstack] - let stacked = df.hstack(df2.get_columns())?; - println!("{}", stacked); - // --8<-- [end:hstack] - - Ok(()) -} diff --git a/docs/source/src/rust/user-guide/getting-started/reading-writing.rs b/docs/source/src/rust/user-guide/getting-started/reading-writing.rs deleted file mode 100644 index 5c0efc9e3c9f..000000000000 --- a/docs/source/src/rust/user-guide/getting-started/reading-writing.rs +++ /dev/null @@ -1,71 +0,0 @@ -fn main() -> Result<(), Box> { - // --8<-- [start:dataframe] - use std::fs::File; - - use chrono::prelude::*; - use polars::prelude::*; - - let mut df: DataFrame = df!( - "integer" => &[1, 2, 3], - "date" => &[ - NaiveDate::from_ymd_opt(2025, 1, 1).unwrap().and_hms_opt(0, 0, 0).unwrap(), - NaiveDate::from_ymd_opt(2025, 1, 2).unwrap().and_hms_opt(0, 0, 0).unwrap(), - NaiveDate::from_ymd_opt(2025, 1, 3).unwrap().and_hms_opt(0, 0, 0).unwrap(), - ], - "float" => &[4.0, 5.0, 6.0], - "string" => &["a", "b", "c"], - ) - .unwrap(); - println!("{}", df); - // --8<-- [end:dataframe] - - // --8<-- [start:csv] - let mut file = File::create("docs/assets/data/output.csv").expect("could not create file"); - CsvWriter::new(&mut file) - .include_header(true) - .with_separator(b',') - .finish(&mut df)?; - let df_csv = CsvReadOptions::default() - .with_infer_schema_length(None) - .with_has_header(true) - .try_into_reader_with_file_path(Some("docs/assets/data/output.csv".into()))? 
- .finish()?; - println!("{}", df_csv); - // --8<-- [end:csv] - - // --8<-- [start:csv2] - let mut file = File::create("docs/assets/data/output.csv").expect("could not create file"); - CsvWriter::new(&mut file) - .include_header(true) - .with_separator(b',') - .finish(&mut df)?; - let df_csv = CsvReadOptions::default() - .with_infer_schema_length(None) - .with_has_header(true) - .map_parse_options(|parse_options| parse_options.with_try_parse_dates(true)) - .try_into_reader_with_file_path(Some("docs/assets/data/output.csv".into()))? - .finish()?; - - println!("{}", df_csv); - // --8<-- [end:csv2] - - // --8<-- [start:json] - let mut file = File::create("docs/assets/data/output.json").expect("could not create file"); - JsonWriter::new(&mut file).finish(&mut df)?; - let f = File::open("docs/assets/data/output.json")?; - let df_json = JsonReader::new(f) - .with_json_format(JsonFormat::JsonLines) - .finish()?; - println!("{}", df_json); - // --8<-- [end:json] - - // --8<-- [start:parquet] - let mut file = File::create("docs/assets/data/output.parquet").expect("could not create file"); - ParquetWriter::new(&mut file).finish(&mut df)?; - let f = File::open("docs/assets/data/output.parquet")?; - let df_parquet = ParquetReader::new(f).finish()?; - println!("{}", df_parquet); - // --8<-- [end:parquet] - - Ok(()) -} diff --git a/docs/source/user-guide/getting-started.md b/docs/source/user-guide/getting-started.md index 8be3c2f54566..69aa8a02d8da 100644 --- a/docs/source/user-guide/getting-started.md +++ b/docs/source/user-guide/getting-started.md @@ -1,6 +1,6 @@ # Getting started -This chapter is here to help you get started with Polars. It covers all the fundamental features and functionalities of the library, making it easy for new users to familiarise themselves with the basics from initial installation and setup to core functionalities. 
If you're already an advanced user or familiar with Dataframes, feel free to skip ahead to the [next chapter about installation options](installation.md). +This chapter is here to help you get started with Polars. It covers all the fundamental features and functionalities of the library, making it easy for new users to familiarise themselves with the basics from initial installation and setup to core functionalities. If you're already an advanced user or familiar with dataframes, feel free to skip ahead to the [next chapter about installation options](installation.md). ## Installing Polars @@ -22,165 +22,162 @@ This chapter is here to help you get started with Polars. It covers all the fund ## Reading & writing -Polars supports reading and writing for common file formats (e.g. csv, json, parquet), cloud storage (S3, Azure Blob, BigQuery) and databases (e.g. postgres, mysql). Below we show the concept of reading and writing to disk. +Polars supports reading and writing for common file formats (e.g. csv, json, parquet), cloud storage (S3, Azure Blob, BigQuery) and databases (e.g. postgres, mysql). Below, we create a small dataframe and show how to write it to disk and read it back. -{{code_block('user-guide/getting-started/reading-writing','dataframe',['DataFrame'])}} +{{code_block('user-guide/getting-started','df',['DataFrame'])}} -```python exec="on" result="text" session="getting-started/reading" ---8<-- "python/user-guide/getting-started/reading-writing.py:dataframe" +```python exec="on" result="text" session="getting-started" +--8<-- "python/user-guide/getting-started.py:df" ``` -In the example below we write the DataFrame to a csv file called `output.csv`. After that, we read it back using `read_csv` and then `print` the result for inspection. +In the example below we write the dataframe to a csv file called `output.csv`. After that, we read it back using `read_csv` and then print the result for inspection. 
-{{code_block('user-guide/getting-started/reading-writing','csv',['read_csv','write_csv'])}}
+{{code_block('user-guide/getting-started','csv',['read_csv','write_csv'])}}

-```python exec="on" result="text" session="getting-started/reading"
---8<-- "python/user-guide/getting-started/reading-writing.py:csv"
+```python exec="on" result="text" session="getting-started"
+--8<-- "python/user-guide/getting-started.py:csv"
 ```

-For more examples on the CSV file format and other data formats, start with the [IO section](io/index.md) of the user guide.
+For more examples on the CSV file format and other data formats, see the [IO section](io/index.md) of the user guide.

-## Expressions
+## Expressions and contexts

-`Expressions` are the core strength of Polars. The `expressions` offer a modular structure that allows you to combine simple concepts into complex queries. Below we cover the basic components that serve as building blocks (or in Polars terminology contexts) for all your queries:
+_Expressions_ are one of the main strengths of Polars because they provide a modular and flexible way of expressing data transformations.
+
+Here is an example of a Polars expression:
+
+```py
+pl.col("weight") / (pl.col("height") ** 2)
+```
+
+As you might be able to guess, this expression takes the column named “weight” and divides its values by the square of the values in the column “height”, computing a person's BMI.
+Note that the code above expresses an abstract computation: it's only inside a Polars _context_ that the expression materializes into a series with the results.
+
+Below, we will show examples of Polars expressions inside different contexts:

 - `select`
-- `filter`
 - `with_columns`
+- `filter`
 - `group_by`

-To learn more about expressions and the context in which they operate, see the user guide sections: [Contexts](concepts/contexts.md) and [Expressions](concepts/expressions.md).
- -### Select - -To select a column we need to do two things: +For a more detailed exploration of contexts and expressions see the respective user guide sections: [Contexts](concepts/contexts.md) and [Expressions](concepts/expressions.md). -1. Define the `DataFrame` we want the data from. -2. Select the data that we need. +### `select` -In the example below you see that we select `col('*')`. The asterisk stands for all columns. +The context `select` allows you to select and manipulate columns from a dataframe. +In the simplest case, each expression you provide will map to a column in the result dataframe: -{{code_block('user-guide/getting-started/expressions','select',['select'])}} +{{code_block('user-guide/getting-started','select',['select','alias','Expr.dt'])}} -```python exec="on" result="text" session="getting-started/expressions" ---8<-- "python/user-guide/getting-started/expressions.py:setup" -print( - --8<-- "python/user-guide/getting-started/expressions.py:select" -) +```python exec="on" result="text" session="getting-started" +--8<-- "python/user-guide/getting-started.py:select" ``` -You can also specify the specific columns that you want to return. There are two ways to do this. The first option is to pass the column names, as seen below. +Polars also supports a feature called “expression expansion”, in which one expression acts as shorthand for multiple expressions. +In the example below, we use expression expansion to manipulate the columns “weight” and “height” with a single expression. 
+When using expression expansion you can use `.name.suffix` to add a suffix to the names of the original columns: -{{code_block('user-guide/getting-started/expressions','select2',['select'])}} +{{code_block('user-guide/getting-started','expression-expansion',['select','alias','Expr.name'])}} -```python exec="on" result="text" session="getting-started/expressions" -print( - --8<-- "python/user-guide/getting-started/expressions.py:select2" -) +```python exec="on" result="text" session="getting-started" +--8<-- "python/user-guide/getting-started.py:expression-expansion" ``` -Follow these links to other parts of the user guide to learn more about [basic operations](expressions/operators.md) or [column selections](expressions/column-selections.md). +You can check other sections of the user guide to learn more about [basic operations](expressions/operators.md) or [column selections](expressions/column-selections.md). -### Filter +### `with_columns` -The `filter` option allows us to create a subset of the `DataFrame`. We use the same `DataFrame` as earlier and we filter between two specified dates. +The context `with_columns` is very similar to the context `select` but `with_columns` adds columns to the dataframe instead of selecting them. +Notice how the resulting dataframe contains the four columns of the original dataframe plus the two new columns introduced by the expressions inside `with_columns`: -{{code_block('user-guide/getting-started/expressions','filter',['filter'])}} +{{code_block('user-guide/getting-started','with_columns',['with_columns'])}} -```python exec="on" result="text" session="getting-started/expressions" -print( - --8<-- "python/user-guide/getting-started/expressions.py:filter" -) +```python exec="on" result="text" session="getting-started" +--8<-- "python/user-guide/getting-started.py:with_columns" ``` -With `filter` you can also create more complex filters that include multiple columns. 
+In the example above we also decided to use named expressions instead of the method `alias` to specify the names of the new columns. +Other contexts like `select` and `group_by` also accept named expressions. -{{code_block('user-guide/getting-started/expressions','filter2',['filter'])}} +### `filter` -```python exec="on" result="text" session="getting-started/expressions" -print( - --8<-- "python/user-guide/getting-started/expressions.py:filter2" -) -``` - -### Add columns +The context `filter` allows us to create a second dataframe with a subset of the rows of the original one: -`with_columns` allows you to create new columns for your analyses. We create two new columns `e` and `b+42`. First we sum all values from column `b` and store the results in column `e`. After that we add `42` to the values of `b`. Creating a new column `b+42` to store these results. +{{code_block('user-guide/getting-started','filter',['filter','Expr.dt'])}} -{{code_block('user-guide/getting-started/expressions','with_columns',['with_columns'])}} - -```python exec="on" result="text" session="getting-started/expressions" -print( - --8<-- "python/user-guide/getting-started/expressions.py:with_columns" -) +```python exec="on" result="text" session="getting-started" +--8<-- "python/user-guide/getting-started.py:filter" ``` -### Group by - -We will create a new `DataFrame` for the Group by functionality. This new `DataFrame` will include several 'groups' that we want to group by. 
+You can also provide multiple predicate expressions as separate parameters, which is more convenient than putting them all together with `&`: -{{code_block('user-guide/getting-started/expressions','dataframe2',['DataFrame'])}} +{{code_block('user-guide/getting-started','filter-multiple',['filter','is_between'])}} -```python exec="on" result="text" session="getting-started/expressions" ---8<-- "python/user-guide/getting-started/expressions.py:dataframe2" -print(df2) +```python exec="on" result="text" session="getting-started" +--8<-- "python/user-guide/getting-started.py:filter-multiple" ``` -{{code_block('user-guide/getting-started/expressions','group_by',['group_by'])}} +### `group_by` -```python exec="on" result="text" session="getting-started/expressions" -print( - --8<-- "python/user-guide/getting-started/expressions.py:group_by" -) -``` +The context `group_by` can be used to group together the rows of the dataframe that share the same value across one or more expressions. +The example below counts how many people were born in each decade: -{{code_block('user-guide/getting-started/expressions','group_by2',['group_by'])}} +{{code_block('user-guide/getting-started','group_by',['group_by','alias','Expr.dt'])}} -```python exec="on" result="text" session="getting-started/expressions" -print( - --8<-- "python/user-guide/getting-started/expressions.py:group_by2" -) +```python exec="on" result="text" session="getting-started" +--8<-- "python/user-guide/getting-started.py:group_by" ``` -### Combination +The keyword argument `maintain_order` forces Polars to present the resulting groups in the same order as they appear in the original dataframe. +This slows down the grouping operation but is used here to ensure reproducibility of the examples. -Below are some examples on how to combine operations to create the `DataFrame` you require. 
+After using the context `group_by` we can use `agg` to compute aggregations over the resulting groups: -{{code_block('user-guide/getting-started/expressions','combine',['select','with_columns'])}} +{{code_block('user-guide/getting-started','group_by-agg',['group_by','agg'])}} -```python exec="on" result="text" session="getting-started/expressions" ---8<-- "python/user-guide/getting-started/expressions.py:combine" +```python exec="on" result="text" session="getting-started" +--8<-- "python/user-guide/getting-started.py:group_by-agg" ``` -{{code_block('user-guide/getting-started/expressions','combine2',['select','with_columns'])}} +### More complex queries + +Contexts and the expressions within can be chained to create more complex queries according to your needs. +In the example below we combine some of the contexts we have seen so far to create a more complex query: -```python exec="on" result="text" session="getting-started/expressions" ---8<-- "python/user-guide/getting-started/expressions.py:combine2" +{{code_block('user-guide/getting-started','complex',['group_by','agg','select','with_columns','Expr.str','Expr.list'])}} + +```python exec="on" result="text" session="getting-started" +--8<-- "python/user-guide/getting-started.py:complex" ``` -## Combining DataFrames +## Combining dataframes -There are two ways `DataFrame`s can be combined depending on the use case: join and concat. +Polars provides a number of tools to combine two dataframes. +In this section, we show an example of a join and an example of a concatenation. -### Join +### Joinining dataframes -Polars supports all types of join (e.g. left, right, inner, outer). Let's have a closer look on how to `join` two `DataFrames` into a single `DataFrame`. Our two `DataFrames` both have an 'id'-like column: `a` and `x`. We can use those columns to `join` the `DataFrames` in this example. +Polars provides many different join algorithms. 
+The example below shows how to use a left outer join to combine two dataframes when a column can be used as a unique identifier to establish a correspondence between rows across the dataframes: -{{code_block('user-guide/getting-started/joins','join',['join'])}} +{{code_block('user-guide/getting-started','join',['join'])}} -```python exec="on" result="text" session="getting-started/joins" ---8<-- "python/user-guide/getting-started/joins.py:setup" ---8<-- "python/user-guide/getting-started/joins.py:join" +```python exec="on" result="text" session="getting-started" +--8<-- "python/user-guide/getting-started.py:join" ``` -To see more examples with other types of joins, see the [Transformations section](transformations/joins.md) in the user guide. +Polars provides many different join algorithms that you can learn about in the [joins section of the user guide](transformations/joins.md). -### Concat +### Concatenating dataframes -We can also `concatenate` two `DataFrames`. Vertical concatenation will make the `DataFrame` longer. Horizontal concatenation will make the `DataFrame` wider. Below you can see the result of an horizontal concatenation of our two `DataFrames`. +Concatenating dataframes creates a taller or wider dataframe, depending on the method used. +Assuming we have a second dataframe with data from other people, we could use vertical concatenation to create a taller dataframe: -{{code_block('user-guide/getting-started/joins','hstack',['hstack'])}} +{{code_block('user-guide/getting-started','concat',['concat'])}} -```python exec="on" result="text" session="getting-started/joins" ---8<-- "python/user-guide/getting-started/joins.py:hstack" +```python exec="on" result="text" session="getting-started" +--8<-- "python/user-guide/getting-started.py:concat" ``` + +Polars provides vertical and horizontal concatenation, as well as diagonal concatenation. +You can learn more about these in the [concatenations section of the user guide](transformations/concatenation.md).