docs: Rewrite 'Getting started' page (#19028)
rodrigogiraoserrao authored Oct 1, 2024
1 parent 76d0363 commit becead9
Showing 13 changed files with 481 additions and 509 deletions.
9 changes: 9 additions & 0 deletions docs/README.md
@@ -0,0 +1,9 @@
The documentation is split across two subfolders, `source` and `assets`.
The folder `source` contains the static source files that make up the user guide, which are mostly markdown files and code snippets.
The folder `assets` contains (dynamically generated) assets used by those files, including data files for the snippets and images with plots or diagrams.

Do _not_ merge the two folders together.
In [PR #18773](https://github.com/pola-rs/polars/pull/18773) we introduced this split to fix live reloading in the MkDocs development server.
If everything lives in a single folder `docs`, the MkDocs server watches the whole folder `docs`.
Whenever you make a change, the server live reloads and rebuilds the docs.
The rebuild runs scripts that generate asset files, which in turn modify the folder `docs`, triggering yet another reload and resulting in an infinite reload loop.
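
The fix works because the MkDocs live-reload server watches the configured `docs_dir` (plus the configuration file) for changes. As a minimal, hypothetical sketch of the idea (not the actual Polars configuration), the relevant part of an `mkdocs.yml` could look like this:

```yaml
# Hypothetical excerpt, for illustration only; the real Polars config may differ.
docs_dir: docs/source # MkDocs watches this folder for live reload.
# Scripts that build assets write into docs/assets, which is outside docs_dir,
# so a rebuild no longer retriggers the live reload.
```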
46 changes: 41 additions & 5 deletions docs/source/_build/API_REFERENCE_LINKS.yml
@@ -8,8 +8,9 @@ python:
  filter: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.filter.html
  with_columns: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.with_columns.html
  group_by: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.group_by.html
  agg: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.dataframe.group_by.GroupBy.agg.html
  join: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join.html
  hstack: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.hstack.html
  vstack: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.vstack.html
  read_csv: https://docs.pola.rs/api/python/stable/reference/api/polars.read_csv.html
  write_csv: https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_csv.html
  read_excel: https://docs.pola.rs/api/python/stable/reference/api/polars.read_excel.html
@@ -34,7 +35,12 @@ python:
  fold: https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.fold.html
  concat_str: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.concat_str.html
  str.split: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.split.html
  Expr.list: https://docs.pola.rs/api/python/stable/reference/expressions/list.html
  Expr.list:
    name: "list namespace"
    link: https://docs.pola.rs/api/python/stable/reference/expressions/list.html
  Expr.str:
    name: "str namespace"
    link: https://docs.pola.rs/api/python/stable/reference/expressions/string.html
  element: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.element.html
  all: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.all.html
  exclude: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.exclude.html
@@ -122,6 +128,9 @@ python:
  is_selector: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.is_selector
  expand_selector: https://docs.pola.rs/api/python/stable/reference/selectors.html#polars.selectors.expand_selector

  Expr.dt:
    name: "dt namespace"
    link: https://docs.pola.rs/api/python/stable/reference/expressions/temporal.html
  dt.convert_time_zone:
    name: dt.convert_time_zone
    link: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.dt.convert_time_zone.html
@@ -147,6 +156,10 @@ python:

  struct.field: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.struct.field.html
  struct.rename_fields: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.struct.rename_fields.html
  Expr.name:
    name: "name namespace"
    link: https://docs.pola.rs/api/python/stable/reference/expressions/name.html
  round: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.round.html#polars.Expr.round

rust:
  DataFrame: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html
@@ -160,12 +173,13 @@ rust:
  filter: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.filter
  with_columns: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.with_columns
  group_by: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.group_by
  agg: https://docs.rs/polars/latest/polars/prelude/struct.LazyGroupBy.html#method.agg
  group_by_dynamic:
    name: group_by_dynamic
    link: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/struct.LazyFrame.html#method.group_by_dynamic
    feature_flags: [dynamic_group_by]
  join: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.DataFrameJoinOps.html#method.join
  hstack: https://docs.pola.rs/api/rust/dev/polars_core/frame/struct.DataFrame.html#method.hstack
  vstack: https://docs.pola.rs/api/rust/dev/polars_core/frame/struct.DataFrame.html#method.vstack
  concat: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/functions/fn.concat.html

  explain: https://docs.rs/polars/latest/polars/prelude/struct.LazyFrame.html#method.explain
@@ -283,15 +297,24 @@ rust:
  n_unique: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.n_unique
  null_count: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.null_count
  interpolate: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.interpolate
  is_between: https://github.com/pola-rs/polars/issues/11285
  is_between:
    name: is_between
    link: https://docs.pola.rs/api/rust/dev/polars/prelude/enum.Expr.html#method.is_between
    feature_flags: [is_between]
  is_duplicated: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.is_duplicated
  is_null: https://docs.pola.rs/api/rust/dev/polars/prelude/enum.Expr.html#method.is_null
  value_counts:
    name: value_counts
    link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.value_counts
    feature_flags: [dtype-struct]

  Expr.list: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.ListNameSpace.html
  Expr.list:
    name: "list namespace"
    link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.ListNameSpace.html
  Expr.str:
    name: "str namespace"
    link: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.StringNameSpaceImpl.html
    feature_flags: [strings]
  Series.arr: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.ArrayNameSpace.html

  date_range:
@@ -374,3 +397,16 @@ rust:
  struct.field:
    name: struct.field_by_name
    link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.StructNameSpace.html#method.field_by_name

  Expr.name:
    name: "name namespace"
    link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.ExprNameNameSpace.html
    feature_flags: [lazy]
  Expr.dt:
    name: "dt namespace"
    link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/dt/struct.DateLikeNameSpace.html
    feature_flags: [temporal]
  round:
    name: "round"
    link: https://docs.pola.rs/api/rust/dev/polars/prelude/enum.Expr.html#method.round
    feature_flags: [round_series]
4 changes: 2 additions & 2 deletions docs/source/index.md
@@ -23,7 +23,7 @@ Polars is a blazingly fast DataFrame library for manipulating structured data. T
- **Fast**: Written from scratch in Rust, designed close to the machine and without external dependencies.
- **I/O**: First class support for all common data storage layers: local, cloud storage & databases.
- **Intuitive API**: Write your queries the way they were intended. Polars, internally, will determine the most efficient way to execute using its query optimizer.
- **Out of Core**: The streaming API allows you to process your results without requiring all your data to be in memory at the same time
- **Out of Core**: The streaming API allows you to process your results without requiring all your data to be in memory at the same time (see the sketch after this list).
- **Parallel**: Utilises the power of your machine by dividing the workload among the available CPU cores without any additional configuration.
- **Vectorized Query Engine**: Using [Apache Arrow](https://arrow.apache.org/), a columnar data format, to process your queries in a vectorized manner and SIMD to optimize CPU usage.
- **GPU Support**: Optionally run queries on NVIDIA GPUs for maximum performance for in-memory workloads.
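
To make the "Out of Core" bullet concrete, here is a minimal, hypothetical sketch (the file name and column names are made up for illustration) of a lazy query collected with the streaming engine, so the data is processed in batches rather than loaded into memory all at once:

```python
import polars as pl

# Hypothetical file and columns, for illustration only.
lazy_query = (
    pl.scan_csv("very_large_file.csv")  # builds a lazy plan; nothing is read yet
    .filter(pl.col("amount") > 0)
    .group_by("category")
    .agg(pl.col("amount").sum())
)

# Collecting with streaming enabled processes the file in batches,
# so the full dataset never has to fit in memory at the same time.
result = lazy_query.collect(streaming=True)
print(result)
```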
@@ -45,7 +45,7 @@ The goal of Polars is to provide a lightning fast DataFrame library that:
- A consistent and predictable API.
- Adheres to a strict schema (data-types should be known before running the query).

Polars is written in Rust which gives it C/C++ performance and allows it to fully control performance critical parts in a query engine.
Polars is written in Rust which gives it C/C++ performance and allows it to fully control performance-critical parts in a query engine.

## Example

135 changes: 135 additions & 0 deletions docs/source/src/python/user-guide/getting-started.py
@@ -0,0 +1,135 @@
# --8<-- [start:df]
import polars as pl
import datetime as dt

df = pl.DataFrame(
    {
        "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
        "birthdate": [
            dt.date(1997, 1, 10),
            dt.date(1985, 2, 15),
            dt.date(1983, 3, 22),
            dt.date(1981, 4, 30),
        ],
        "weight": [57.9, 72.5, 53.6, 83.1],  # (kg)
        "height": [1.56, 1.77, 1.65, 1.75],  # (m)
    }
)

print(df)
# --8<-- [end:df]

# --8<-- [start:csv]
df.write_csv("docs/assets/data/output.csv")
df_csv = pl.read_csv("docs/assets/data/output.csv", try_parse_dates=True)
print(df_csv)
# --8<-- [end:csv]

# --8<-- [start:select]
result = df.select(
pl.col("name"),
pl.col("birthdate").dt.year().alias("birth_year"),
(pl.col("weight") / (pl.col("height") ** 2)).alias("bmi"),
)
print(result)
# --8<-- [end:select]

# --8<-- [start:expression-expansion]
result = df.select(
pl.col("name"),
(pl.col("weight", "height") * 0.95).round(2).name.suffix("-5%"),
)
print(result)
# --8<-- [end:expression-expansion]

# --8<-- [start:with_columns]
result = df.with_columns(
    birth_year=pl.col("birthdate").dt.year(),
    bmi=pl.col("weight") / (pl.col("height") ** 2),
)
print(result)
# --8<-- [end:with_columns]

# --8<-- [start:filter]
result = df.filter(pl.col("birthdate").dt.year() < 1990)
print(result)
# --8<-- [end:filter]

# --8<-- [start:filter-multiple]
result = df.filter(
pl.col("birthdate").is_between(dt.date(1982, 12, 31), dt.date(1996, 1, 1)),
pl.col("height") > 1.7,
)
print(result)
# --8<-- [end:filter-multiple]

# --8<-- [start:group_by]
result = df.group_by(
(pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
maintain_order=True,
).len()
print(result)
# --8<-- [end:group_by]

# --8<-- [start:group_by-agg]
result = df.group_by(
(pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
maintain_order=True,
).agg(
pl.len().alias("sample_size"),
pl.col("weight").mean().round(2).alias("avg_weight"),
pl.col("height").max().alias("tallest"),
)
print(result)
# --8<-- [end:group_by-agg]

# --8<-- [start:complex]
result = (
    df.with_columns(
        (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
        pl.col("name").str.split(by=" ").list.first(),
    )
    .select(
        pl.all().exclude("birthdate"),
    )
    .group_by(
        pl.col("decade"),
        maintain_order=True,
    )
    .agg(
        pl.col("name"),
        pl.col("weight", "height").mean().round(2).name.prefix("avg_"),
    )
)
print(result)
# --8<-- [end:complex]

# --8<-- [start:join]
df2 = pl.DataFrame(
    {
        "name": ["Ben Brown", "Daniel Donovan", "Alice Archer", "Chloe Cooper"],
        "parent": [True, False, False, False],
        "siblings": [1, 2, 3, 4],
    }
)

print(df.join(df2, on="name", how="left"))
# --8<-- [end:join]

# --8<-- [start:concat]
df3 = pl.DataFrame(
    {
        "name": ["Ethan Edwards", "Fiona Foster", "Grace Gibson", "Henry Harris"],
        "birthdate": [
            dt.date(1977, 5, 10),
            dt.date(1975, 6, 23),
            dt.date(1973, 7, 22),
            dt.date(1971, 8, 3),
        ],
        "weight": [67.9, 72.5, 57.6, 93.1],  # (kg)
        "height": [1.76, 1.6, 1.66, 1.8],  # (m)
    }
)

print(pl.concat([df, df3], how="vertical"))
# --8<-- [end:concat]
87 changes: 0 additions & 87 deletions docs/source/src/python/user-guide/getting-started/expressions.py

This file was deleted.

29 changes: 0 additions & 29 deletions docs/source/src/python/user-guide/getting-started/joins.py

This file was deleted.

