ImperialCollegeLondon · dalonsoa · Oct 1, 2024 · Oct 1, 2024 · Oct 1, 2024 · Oct 1, 2024
diff --git a/csvy/__init__.py b/csvy/__init__.py
@@ -1,11 +1,13 @@
 """
 Python reader/writer for CSV files with YAML header information.
 """
+
 __version__ = "0.2.2"
 from .readers import (  # noqa: F401
     read_header,
     read_metadata,
     read_to_array,
     read_to_dataframe,
+    read_to_polars,
 )
 from .writers import Writer, write, write_header  # noqa: F401
diff --git a/csvy/readers.py b/csvy/readers.py
@@ -17,7 +17,17 @@
 except ModuleNotFoundError:
     DataFrame = None  # type: ignore
     logging.getLogger().debug(
-        "Pandas is not installed. Reading into a DataFrame will not work."
+        "Pandas is not installed. Reading into a pd.DataFrame will not work."
+    )
+
+try:
+    from polars import DataFrame as PolarsDataFrame
+    from polars import LazyFrame
+except ModuleNotFoundError:
+    LazyFrame = None  # type: ignore
+    PolarsDataFrame = None  # type: ignore
+    logging.getLogger().debug(
+        "Polars is not installed. Reading into a pl.DataFrame will not work."
     )
 
 
@@ -168,6 +178,54 @@ def read_to_dataframe(
     return pd.read_csv(filename, **options), header
 
 
+def read_to_polars(
+    filename: Union[Path, str],
+    marker: str = "---",
+    csv_options: Optional[Dict[str, Any]] = None,
+    yaml_options: Optional[Dict[str, Any]] = None,
+    eager: bool = False,
+) -> Tuple[Union[LazyFrame, PolarsDataFrame], Dict[str, Any]]:
+    """Reads a CSVY file into a Polars LazyFrame (or DataFrame) and a header dict.
+
+    This uses the `scan_csv` method from Polars to read the data. This returns a polars
+    LazyFrame, which means the data is not loaded into memory until it is needed. To
+    load the data into memory, set the `eager` parameter to `True`.
+
+    Any 'skip_rows' or 'comment_prefix' entries provided in the 'csv_options'
+    dictionary are ignored: both are overwritten with values from `read_header`.
+
+    Args:
+        filename: Name of the file to read.
+        marker: The marker characters that indicate the yaml header.
+        csv_options: Options to pass to pl.scan_csv.
+        yaml_options: Options to pass to yaml.safe_load.
+        eager: If True, collect the LazyFrame and return a polars DataFrame instead.
+
+    Raises:
+        ModuleNotFoundError: If polars is not found.
+
+    Returns:
+        Tuple containing: The LazyFrame (a DataFrame if `eager`) and the header dict.
+    """
+    if LazyFrame is None:
+        raise ModuleNotFoundError(
+            "Module polars is not present. Install it to read data into DataFrame."
+        )
+    import polars as pl
+
+    yaml_options = yaml_options if yaml_options is not None else {}
+    header, nlines, comment = read_header(filename, marker=marker, **yaml_options)
+
+    options = csv_options.copy() if csv_options is not None else {}
+    options["skip_rows"] = nlines
+    options["comment_prefix"] = comment[0] if len(comment) >= 1 else None
+
+    lf = pl.scan_csv(filename, **options)
+    if eager:
+        return lf.collect(), header
+    return lf, header
+
+
 def read_to_list(
     filename: Union[Path, str],
     marker: str = "---",

diff --git a/csvy/writers.py b/csvy/writers.py
@@ -223,6 +223,42 @@ def write_pandas(
     return False
 
 
+@register_writer
+def write_polars(
+    filename: Union[Path, str], data: Any, comment: str = "", **kwargs: Any
+) -> bool:
+    """Writes the polars dataframe to the chosen file, adding it after the header.
+
+    Args:
+        filename: Name of the file to save the data into. The data will be added to the
+            end of the file.
+        data: The data. If it is a polars DataFrame or LazyFrame, it will be saved,
+            otherwise nothing is done.
+        comment: String to use to mark the header lines as comments (unused here).
+        **kwargs: Arguments to be passed to the underlying saving method.
+
+    Returns:
+        True if the writer worked, False otherwise.
+    """
+    try:
+        import polars as pl
+
+        if isinstance(data, pl.LazyFrame):
+            # Streaming mode (saving with `LazyFrame.sink_csv`) is unstable, so we
+            # collect the data into a DataFrame first
+            data = data.collect()
+        if isinstance(data, pl.DataFrame):
+            with open(filename, "a", newline="") as f:
+                data.write_csv(f, **kwargs)
+
+            return True
+
+    except ModuleNotFoundError:
+        logging.getLogger().debug("Polars is not installed, so not using 'write_csv'.")
+
+    return False
+
+
 def write_csv(
     filename: Union[Path, str], data: Any, comment: str = "", **kwargs: Any
 ) -> bool:

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -44,6 +44,7 @@ flake8 = "^4.0.1"
 types-PyYAML = "^6.0.7"
 bump2version = "^1.0.1"
 coverage = "^7.1.0"
+polars = "^1.8.2"
 
 [tool.poetry.group.docs]
 optional = true

diff --git a/tests/test_read.py b/tests/test_read.py
@@ -84,6 +84,29 @@ def test_read_to_dataframe(data_path):
         read_to_dataframe(data_path)
 
 
+def test_read_to_polars(data_path, monkeypatch):
+    """Check lazy/eager reads with polars and the error when polars is absent."""
+    import polars as pl
+    from polars.testing import assert_frame_equal
+
+    import csvy.readers as readers
+    from csvy.readers import read_to_polars
+
+    lazy_data, header = read_to_polars(data_path)
+    assert isinstance(lazy_data, pl.LazyFrame)
+    assert tuple(lazy_data.collect_schema().names()) == ("Date", "WTI")
+    assert isinstance(header, dict)
+    assert len(header) > 0
+
+    eager_data, _ = read_to_polars(data_path, eager=True)
+    assert_frame_equal(lazy_data.collect(), eager_data)
+
+    # Simulate a missing polars; monkeypatch restores the real class afterwards.
+    monkeypatch.setattr(readers, "LazyFrame", None)
+    with pytest.raises(ModuleNotFoundError):
+        read_to_polars(data_path)
+
+
 def test_read_to_list(array_data_path):
     from csvy.readers import read_to_list
 

diff --git a/tests/test_write.py b/tests/test_write.py
@@ -77,6 +77,29 @@ def test_write_pandas(mock_save, tmpdir):
     mock_save.assert_called_once()
 
 
+@patch("polars.DataFrame.write_csv")
+def test_write_polars(mock_save, tmpdir, mocker):
+    import polars as pl
+
+    from csvy.writers import write_polars
+
+    filename = tmpdir / "some_file.csv"
+    # A non-polars object makes the writer return a falsy value.
+    data = []
+    assert not write_polars(filename, data)
+    # An eager DataFrame goes through the (mocked) DataFrame.write_csv once.
+    data = pl.DataFrame()
+    assert write_polars(filename, data)
+    mock_save.assert_called_once()
+    # A LazyFrame must be collected once before the same write_csv call.
+    data = pl.LazyFrame()
+    collect_spy = mocker.spy(data, "collect")
+    mock_save.reset_mock()
+    assert write_polars(filename, data)
+    collect_spy.assert_called_once()
+    mock_save.assert_called_once()
+
+
 @patch("csv.writer")
 def test_write_csv(mock_save, tmpdir):
     from csvy.writers import write_csv