From ddc67f4dc161d75fad803bbac6e55df5d5866592 Mon Sep 17 00:00:00 2001
From: Adrian D'Alessandro <a.dalessandro@imperial.ac.uk>
Date: Tue, 1 Oct 2024 13:29:09 +0100
Subject: [PATCH 1/4] add polars as a dev dependency

---
 poetry.lock    | 47 ++++++++++++++++++++++++++++++++++++++++++++---
 pyproject.toml |  1 +
 2 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index b34ce94..6f7087c 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
 
 [[package]]
 name = "attrs"
@@ -879,8 +879,8 @@ files = [
 [package.dependencies]
 numpy = [
     {version = ">=1.20.3", markers = "python_version < \"3.10\""},
-    {version = ">=1.21.0", markers = "python_version >= \"3.10\""},
     {version = ">=1.23.2", markers = "python_version >= \"3.11\""},
+    {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""},
 ]
 python-dateutil = ">=2.8.1"
 pytz = ">=2020.1"
@@ -929,6 +929,47 @@ files = [
 dev = ["pre-commit", "tox"]
 testing = ["pytest", "pytest-benchmark"]
 
+[[package]]
+name = "polars"
+version = "1.8.2"
+description = "Blazingly fast DataFrame library"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "polars-1.8.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:114be1ebfb051b794fb9e1f15999430c79cc0824595e237d3f45632be3e56d73"},
+    {file = "polars-1.8.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:e4fc36cfe48972d4c5be21a7cb119d6378fb7af0bb3eeb61456b66a1f43228e3"},
+    {file = "polars-1.8.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67c1e448d6e38697650b22dd359f13c40b567c0b66686c8602e4367400e87801"},
+    {file = "polars-1.8.2-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:570ee86b033dc5a6dbe2cb0df48522301642f304dda3da48f53d7488899a2206"},
+    {file = "polars-1.8.2-cp38-abi3-win_amd64.whl", hash = "sha256:ce1a1c1e2150ffcc44a5f1c461d738e1dcd95abbd0f210af0271c7ac0c9f7ef9"},
+    {file = "polars-1.8.2.tar.gz", hash = "sha256:42f69277d5be2833b0b826af5e75dcf430222d65c9633872856e176a0bed27a0"},
+]
+
+[package.extras]
+adbc = ["adbc-driver-manager[dbapi]", "adbc-driver-sqlite[dbapi]"]
+all = ["polars[async,cloudpickle,database,deltalake,excel,fsspec,graph,iceberg,numpy,pandas,plot,pyarrow,pydantic,style,timezone]"]
+async = ["gevent"]
+calamine = ["fastexcel (>=0.9)"]
+cloudpickle = ["cloudpickle"]
+connectorx = ["connectorx (>=0.3.2)"]
+database = ["nest-asyncio", "polars[adbc,connectorx,sqlalchemy]"]
+deltalake = ["deltalake (>=0.15.0)"]
+excel = ["polars[calamine,openpyxl,xlsx2csv,xlsxwriter]"]
+fsspec = ["fsspec"]
+gpu = ["cudf-polars-cu12"]
+graph = ["matplotlib"]
+iceberg = ["pyiceberg (>=0.5.0)"]
+numpy = ["numpy (>=1.16.0)"]
+openpyxl = ["openpyxl (>=3.0.0)"]
+pandas = ["pandas", "polars[pyarrow]"]
+plot = ["altair (>=5.4.0)"]
+pyarrow = ["pyarrow (>=7.0.0)"]
+pydantic = ["pydantic"]
+sqlalchemy = ["polars[pandas]", "sqlalchemy"]
+style = ["great-tables (>=0.8.0)"]
+timezone = ["backports-zoneinfo", "tzdata"]
+xlsx2csv = ["xlsx2csv (>=0.8.0)"]
+xlsxwriter = ["xlsxwriter"]
+
 [[package]]
 name = "pre-commit"
 version = "2.21.0"
@@ -1341,4 +1382,4 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.8"
-content-hash = "2bcdaccad228261ec751162a1cf7bfb873a10f57b481e508da1c462431ce1eed"
+content-hash = "d9edd40c8b99bf315b02d29d2840a8c9f8adf30f86a8c21dabd437ea11c5ec5d"
diff --git a/pyproject.toml b/pyproject.toml
index 6f195c4..4aacdd0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,6 +42,7 @@ flake8 = "^4.0.1"
 types-PyYAML = "^6.0.7"
 bump2version = "^1.0.1"
 coverage = "^7.1.0"
+polars = "^1.8.2"
 
 [tool.poetry.group.docs]
 optional = true

From 6096feb11ceb356b0077eddecbc4f71cf281b0a0 Mon Sep 17 00:00:00 2001
From: Adrian D'Alessandro <a.dalessandro@imperial.ac.uk>
Date: Tue, 1 Oct 2024 13:29:54 +0100
Subject: [PATCH 2/4] Include a read_to_polars function

---
 csvy/__init__.py   |  2 ++
 csvy/readers.py    | 60 +++++++++++++++++++++++++++++++++++++++++++++-
 tests/test_read.py | 23 ++++++++++++++++++
 3 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/csvy/__init__.py b/csvy/__init__.py
index adb2a2b..9996fa1 100644
--- a/csvy/__init__.py
+++ b/csvy/__init__.py
@@ -1,11 +1,13 @@
 """
 Python reader/writer for CSV files with YAML header information.
 """
+
 __version__ = "0.2.2"
 from .readers import (  # noqa: F401
     read_header,
     read_metadata,
     read_to_array,
     read_to_dataframe,
+    read_to_polars,
 )
 from .writers import Writer, write, write_header  # noqa: F401
diff --git a/csvy/readers.py b/csvy/readers.py
index e51bcd9..90f6dc3 100644
--- a/csvy/readers.py
+++ b/csvy/readers.py
@@ -17,7 +17,17 @@
 except ModuleNotFoundError:
     DataFrame = None  # type: ignore
     logging.getLogger().debug(
-        "Pandas is not installed. Reading into a DataFrame will not work."
+        "Pandas is not installed. Reading into a pd.DataFrame will not work."
+    )
+
+try:
+    from polars import DataFrame as PolarsDataFrame
+    from polars import LazyFrame
+except ModuleNotFoundError:
+    LazyFrame = None  # type: ignore
+    PolarsDataFrame = None  # type: ignore
+    logging.getLogger().debug(
+        "Polars is not installed. Reading into a pl.DataFrame will not work."
     )
 
 
@@ -168,6 +178,54 @@ def read_to_dataframe(
     return pd.read_csv(filename, **options), header
 
 
+def read_to_polars(
+    filename: Union[Path, str],
+    marker: str = "---",
+    csv_options: Optional[Dict[str, Any]] = None,
+    yaml_options: Optional[Dict[str, Any]] = None,
+    eager: bool = False,
+) -> Tuple[Union[LazyFrame, PolarsDataFrame], Dict[str, Any]]:
+    """Reads a CSVY file into a header dict and a Polars LazyFrame (or DataFrame).
+
+    This uses the `scan_csv` method from Polars to read the data. This returns a polars
+    LazyFrame, which means the data is not loaded into memory until it is needed. To
+    get a collected polars DataFrame instead, set the `eager` parameter to `True`.
+
+    Any 'skip_rows' and 'comment_prefix' entries provided in the 'csv_options'
+    dictionary are overwritten with the values returned by `read_header`.
+
+    Args:
+        filename:  Name of the file to read.
+        marker: The marker characters that indicate the yaml header.
+        csv_options: Options to pass to pd.read_csv.
+        yaml_options: Options to pass to yaml.safe_load.
+        eager: Whether to load the data into memory.
+
+    Raises:
+        ModuleNotFoundError: If polars is not found.
+
+    Returns:
+        Tuple with the LazyFrame (DataFrame if `eager` is True) and the header dict.
+    """
+    if LazyFrame is None:
+        raise ModuleNotFoundError(
+            "Module polars is not present. Install it to read data into DataFrame."
+        )
+    import polars as pl
+
+    yaml_options = yaml_options if yaml_options is not None else {}
+    header, nlines, comment = read_header(filename, marker=marker, **yaml_options)
+
+    options = csv_options.copy() if csv_options is not None else {}
+    options["skip_rows"] = nlines
+    options["comment_prefix"] = comment[0] if len(comment) >= 1 else None
+
+    lf = pl.scan_csv(filename, **options)
+    if eager:
+        return lf.collect(), header
+    return lf, header
+
+
 def read_to_list(
     filename: Union[Path, str],
     marker: str = "---",
diff --git a/tests/test_read.py b/tests/test_read.py
index 32bf32d..df39077 100644
--- a/tests/test_read.py
+++ b/tests/test_read.py
@@ -84,6 +84,29 @@ def test_read_to_dataframe(data_path):
         read_to_dataframe(data_path)
 
 
+def test_read_to_polars(data_path, monkeypatch):
+    """Check lazy and eager reads agree, and that a missing polars raises."""
+    import polars as pl
+    from polars.testing import assert_frame_equal
+
+    import csvy.readers as readers
+    from csvy.readers import read_to_polars
+
+    lazy_data, header = read_to_polars(data_path)
+    assert isinstance(lazy_data, pl.LazyFrame)
+    assert tuple(lazy_data.collect_schema().names()) == ("Date", "WTI")
+    assert isinstance(header, dict)
+    assert len(header) > 0
+
+    eager_data, _ = read_to_polars(data_path, eager=True)
+    assert_frame_equal(lazy_data.collect(), eager_data)
+
+    # monkeypatch undoes this on teardown, so later tests still see polars
+    monkeypatch.setattr(readers, "LazyFrame", None)
+    with pytest.raises(ModuleNotFoundError):
+        read_to_polars(data_path)
+
+
 def test_read_to_list(array_data_path):
     from csvy.readers import read_to_list
 

From de32ca4ec314fed04700328a516b3ec610a7a905 Mon Sep 17 00:00:00 2001
From: Adrian D'Alessandro <a.dalessandro@imperial.ac.uk>
Date: Tue, 1 Oct 2024 15:06:47 +0100
Subject: [PATCH 3/4] Add a write_polars function

---
 csvy/writers.py     | 36 ++++++++++++++++++++++++++++++++++++
 tests/test_write.py | 23 +++++++++++++++++++++++
 2 files changed, 59 insertions(+)

diff --git a/csvy/writers.py b/csvy/writers.py
index 68234cd..a1c43f7 100644
--- a/csvy/writers.py
+++ b/csvy/writers.py
@@ -223,6 +223,42 @@ def write_pandas(
     return False
 
 
+@register_writer
+def write_polars(
+    filename: Union[Path, str], data: Any, comment: str = "", **kwargs: Any
+) -> bool:
+    """Writes the polars dataframe to the chosen file, adding it after the header.
+
+    Args:
+        filename: Name of the file to save the data into. The data will be added to the
+            end of the file.
+        data: The data. If it is a polars DataFrame or LazyFrame, it will be saved,
+            otherwise nothing is done.
+        comment: String to use to mark the header lines as comments. Unused here.
+        **kwargs: Arguments to be passed to the underlying saving method.
+
+    Returns:
+        True if the writer worked, False otherwise.
+    """
+    try:
+        import polars as pl
+
+        if isinstance(data, pl.LazyFrame):
+            # Streaming mode (saving with `LazyFrame.sink_csv`) is unstable, so we
+            # collect the data into a DataFrame first
+            data = data.collect()
+        if isinstance(data, pl.DataFrame):
+            with open(filename, "a", newline="") as f:
+                data.write_csv(f, **kwargs)
+
+            return True
+
+    except ModuleNotFoundError:
+        logging.getLogger().debug("Polars is not installed, so not using 'write_csv'.")
+
+    return False
+
+
 def write_csv(
     filename: Union[Path, str], data: Any, comment: str = "", **kwargs: Any
 ) -> bool:
diff --git a/tests/test_write.py b/tests/test_write.py
index 8e1c18f..5bfdf7d 100644
--- a/tests/test_write.py
+++ b/tests/test_write.py
@@ -77,6 +77,29 @@ def test_write_pandas(mock_save, tmpdir):
     mock_save.assert_called_once()
 
 
+@patch("polars.DataFrame.write_csv")
+def test_write_polars(mock_save, tmpdir, mocker):
+    import polars as pl
+
+    from csvy.writers import write_polars
+
+    filename = tmpdir / "some_file.csv"
+    # Data that is not a polars frame is declined by the writer.
+    data = []
+    assert not write_polars(filename, data)
+    # A DataFrame is handed to `write_csv` exactly once.
+    data = pl.DataFrame()
+    assert write_polars(filename, data)
+    mock_save.assert_called_once()
+    # A LazyFrame is collected once and then handed to `write_csv`.
+    data = pl.LazyFrame()
+    collect_spy = mocker.spy(data, "collect")
+    mock_save.reset_mock()
+    assert write_polars(filename, data)
+    collect_spy.assert_called_once()
+    mock_save.assert_called_once()
+
+
 @patch("csv.writer")
 def test_write_csv(mock_save, tmpdir):
     from csvy.writers import write_csv

From 33571d0fbb7dd480a19e4c49055dbb306591caed Mon Sep 17 00:00:00 2001
From: Adrian D'Alessandro <a.dalessandro@imperial.ac.uk>
Date: Tue, 1 Oct 2024 15:24:23 +0100
Subject: [PATCH 4/4] Update csvy/readers.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Diego Alonso Álvarez <6095790+dalonsoa@users.noreply.github.com>
---
 csvy/readers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csvy/readers.py b/csvy/readers.py
index 90f6dc3..f43b96b 100644
--- a/csvy/readers.py
+++ b/csvy/readers.py
@@ -197,7 +197,7 @@ def read_to_polars(
     Args:
         filename:  Name of the file to read.
         marker: The marker characters that indicate the yaml header.
-        csv_options: Options to pass to pd.read_csv.
+        csv_options: Options to pass to pl.scan_csv.
         yaml_options: Options to pass to yaml.safe_load.
         eager: Whether to load the data into memory.