From ddc67f4dc161d75fad803bbac6e55df5d5866592 Mon Sep 17 00:00:00 2001 From: Adrian D'Alessandro Date: Tue, 1 Oct 2024 13:29:09 +0100 Subject: [PATCH 1/4] add polars as a dev dependency --- poetry.lock | 47 ++++++++++++++++++++++++++++++++++++++++++++--- pyproject.toml | 1 + 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index b34ce94..6f7087c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "attrs" @@ -879,8 +879,8 @@ files = [ [package.dependencies] numpy = [ {version = ">=1.20.3", markers = "python_version < \"3.10\""}, - {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, ] python-dateutil = ">=2.8.1" pytz = ">=2020.1" @@ -929,6 +929,47 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "polars" +version = "1.8.2" +description = "Blazingly fast DataFrame library" +optional = false +python-versions = ">=3.8" +files = [ + {file = "polars-1.8.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:114be1ebfb051b794fb9e1f15999430c79cc0824595e237d3f45632be3e56d73"}, + {file = "polars-1.8.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:e4fc36cfe48972d4c5be21a7cb119d6378fb7af0bb3eeb61456b66a1f43228e3"}, + {file = "polars-1.8.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67c1e448d6e38697650b22dd359f13c40b567c0b66686c8602e4367400e87801"}, + {file = "polars-1.8.2-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:570ee86b033dc5a6dbe2cb0df48522301642f304dda3da48f53d7488899a2206"}, + {file = "polars-1.8.2-cp38-abi3-win_amd64.whl", hash = 
"sha256:ce1a1c1e2150ffcc44a5f1c461d738e1dcd95abbd0f210af0271c7ac0c9f7ef9"}, + {file = "polars-1.8.2.tar.gz", hash = "sha256:42f69277d5be2833b0b826af5e75dcf430222d65c9633872856e176a0bed27a0"}, +] + +[package.extras] +adbc = ["adbc-driver-manager[dbapi]", "adbc-driver-sqlite[dbapi]"] +all = ["polars[async,cloudpickle,database,deltalake,excel,fsspec,graph,iceberg,numpy,pandas,plot,pyarrow,pydantic,style,timezone]"] +async = ["gevent"] +calamine = ["fastexcel (>=0.9)"] +cloudpickle = ["cloudpickle"] +connectorx = ["connectorx (>=0.3.2)"] +database = ["nest-asyncio", "polars[adbc,connectorx,sqlalchemy]"] +deltalake = ["deltalake (>=0.15.0)"] +excel = ["polars[calamine,openpyxl,xlsx2csv,xlsxwriter]"] +fsspec = ["fsspec"] +gpu = ["cudf-polars-cu12"] +graph = ["matplotlib"] +iceberg = ["pyiceberg (>=0.5.0)"] +numpy = ["numpy (>=1.16.0)"] +openpyxl = ["openpyxl (>=3.0.0)"] +pandas = ["pandas", "polars[pyarrow]"] +plot = ["altair (>=5.4.0)"] +pyarrow = ["pyarrow (>=7.0.0)"] +pydantic = ["pydantic"] +sqlalchemy = ["polars[pandas]", "sqlalchemy"] +style = ["great-tables (>=0.8.0)"] +timezone = ["backports-zoneinfo", "tzdata"] +xlsx2csv = ["xlsx2csv (>=0.8.0)"] +xlsxwriter = ["xlsxwriter"] + [[package]] name = "pre-commit" version = "2.21.0" @@ -1341,4 +1382,4 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools" [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "2bcdaccad228261ec751162a1cf7bfb873a10f57b481e508da1c462431ce1eed" +content-hash = "d9edd40c8b99bf315b02d29d2840a8c9f8adf30f86a8c21dabd437ea11c5ec5d" diff --git a/pyproject.toml b/pyproject.toml index 6f195c4..4aacdd0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ flake8 = "^4.0.1" types-PyYAML = "^6.0.7" bump2version = "^1.0.1" coverage = "^7.1.0" +polars = "^1.8.2" [tool.poetry.group.docs] optional = true From 6096feb11ceb356b0077eddecbc4f71cf281b0a0 Mon Sep 17 00:00:00 2001 From: Adrian D'Alessandro Date: Tue, 1 Oct 2024 13:29:54 +0100 Subject: 
[PATCH 2/4] Include a read_to_polars function --- csvy/__init__.py | 2 ++ csvy/readers.py | 60 +++++++++++++++++++++++++++++++++++++++++++++- tests/test_read.py | 23 ++++++++++++++++++ 3 files changed, 84 insertions(+), 1 deletion(-) diff --git a/csvy/__init__.py b/csvy/__init__.py index adb2a2b..9996fa1 100644 --- a/csvy/__init__.py +++ b/csvy/__init__.py @@ -1,11 +1,13 @@ """ Python reader/writer for CSV files with YAML header information. """ + __version__ = "0.2.2" from .readers import ( # noqa: F401 read_header, read_metadata, read_to_array, read_to_dataframe, + read_to_polars, ) from .writers import Writer, write, write_header # noqa: F401 diff --git a/csvy/readers.py b/csvy/readers.py index e51bcd9..90f6dc3 100644 --- a/csvy/readers.py +++ b/csvy/readers.py @@ -17,7 +17,17 @@ except ModuleNotFoundError: DataFrame = None # type: ignore logging.getLogger().debug( - "Pandas is not installed. Reading into a DataFrame will not work." + "Pandas is not installed. Reading into a pd.DataFrame will not work." + ) + +try: + from polars import DataFrame as PolarsDataFrame + from polars import LazyFrame +except ModuleNotFoundError: + LazyFrame = None # type: ignore + PolarsDataFrame = None # type: ignore + logging.getLogger().debug( + "Polars is not installed. Reading into a pl.DataFrame will not work." ) @@ -168,6 +178,54 @@ def read_to_dataframe( return pd.read_csv(filename, **options), header +def read_to_polars( + filename: Union[Path, str], + marker: str = "---", + csv_options: Optional[Dict[str, Any]] = None, + yaml_options: Optional[Dict[str, Any]] = None, + eager: bool = False, +) -> Tuple[Union[LazyFrame, PolarsDataFrame], Dict[str, Any]]: + """Reads a CSVY file into dict with the header and a Polars LazyFrame with the data. + + This uses the `scan_csv` method from Polars to read the data. This returns a polars + LazyFrame, which means the data is not loaded into memory until it is needed. To + load the data into memory, set the `eager` parameter to `True`. 
+ + Any 'skip_rows' or 'comment_prefix' arguments provided in the 'csv_options' + dictionary will be ignored. + + Args: + filename: Name of the file to read. + marker: The marker characters that indicate the yaml header. + csv_options: Options to pass to pd.read_csv. + yaml_options: Options to pass to yaml.safe_load. + eager: Whether to load the data into memory. + + Raises: + ModuleNotFoundError: If polars is not found. + + Returns: + Tuple containing: The polars LazyFrame and the header as a dictionary. + """ + if LazyFrame is None: + raise ModuleNotFoundError( + "Module polars is not present. Install it to read data into DataFrame." + ) + import polars as pl + + yaml_options = yaml_options if yaml_options is not None else {} + header, nlines, comment = read_header(filename, marker=marker, **yaml_options) + + options = csv_options.copy() if csv_options is not None else {} + options["skip_rows"] = nlines + options["comment_prefix"] = comment[0] if len(comment) >= 1 else None + + lf = pl.scan_csv(filename, **options) + if eager: + return lf.collect(), header + return lf, header + + def read_to_list( filename: Union[Path, str], marker: str = "---", diff --git a/tests/test_read.py b/tests/test_read.py index 32bf32d..df39077 100644 --- a/tests/test_read.py +++ b/tests/test_read.py @@ -84,6 +84,29 @@ def test_read_to_dataframe(data_path): read_to_dataframe(data_path) +def test_read_to_polars(data_path): + import polars as pl + from polars.testing import assert_frame_equal + + from csvy.readers import read_to_polars + + lazy_data, header = read_to_polars(data_path) + assert isinstance(lazy_data, pl.LazyFrame) + assert tuple(lazy_data.columns) == ("Date", "WTI") + assert isinstance(header, dict) + assert len(header) > 0 + + eager_data, _ = read_to_polars(data_path, eager=True) + assert_frame_equal(lazy_data.collect(), eager_data) + + import csvy.readers as readers + + readers.LazyFrame = None + + with pytest.raises(ModuleNotFoundError): + read_to_polars(data_path) + 
+ def test_read_to_list(array_data_path): from csvy.readers import read_to_list From de32ca4ec314fed04700328a516b3ec610a7a905 Mon Sep 17 00:00:00 2001 From: Adrian D'Alessandro Date: Tue, 1 Oct 2024 15:06:47 +0100 Subject: [PATCH 3/4] Add a write_polars function --- csvy/writers.py | 36 ++++++++++++++++++++++++++++++++++++ tests/test_write.py | 23 +++++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/csvy/writers.py b/csvy/writers.py index 68234cd..a1c43f7 100644 --- a/csvy/writers.py +++ b/csvy/writers.py @@ -223,6 +223,42 @@ def write_pandas( return False +@register_writer +def write_polars( + filename: Union[Path, str], data: Any, comment: str = "", **kwargs: Any +) -> bool: + """Writes the polars dataframe to the chosen file, adding it after the header. + + Args: + filename: Name of the file to save the data into. The data will be added to the + end of the file. + data: The data. If it is a polars DataFrame or LazyFrame, it will be saved, + otherwise nothing is done. + comment: String to use to mark the header lines as comments. + **kwargs: Arguments to be passed to the underlying saving method. + + Returns: + True if the writer worked, False otherwise. 
+ """ + try: + import polars as pl + + if isinstance(data, pl.LazyFrame): + # Streaming mode (saving with `LazyFrame.sink_csv`) is unstable, so we + # collect the data into a DataFrame first + data = data.collect() + if isinstance(data, pl.DataFrame): + with open(filename, "a", newline="") as f: + data.write_csv(f, **kwargs) + + return True + + except ModuleNotFoundError: + logging.getLogger().debug("Polars is not installed, so not using 'write_csv'.") + + return False + + def write_csv( filename: Union[Path, str], data: Any, comment: str = "", **kwargs: Any ) -> bool: diff --git a/tests/test_write.py b/tests/test_write.py index 8e1c18f..5bfdf7d 100644 --- a/tests/test_write.py +++ b/tests/test_write.py @@ -77,6 +77,29 @@ def test_write_pandas(mock_save, tmpdir): mock_save.assert_called_once() +@patch("polars.DataFrame.write_csv") +def test_write_polars(mock_save, tmpdir, mocker): + import polars as pl + + from csvy.writers import write_polars + + filename = tmpdir / "some_file.csv" + + data = [] + assert not write_polars(filename, data) + + data = pl.DataFrame() + assert write_polars(filename, data) + mock_save.assert_called_once() + + data = pl.LazyFrame() + collect_spy = mocker.spy(data, "collect") + mock_save.reset_mock() + assert write_polars(filename, data) + collect_spy.assert_called_once() + mock_save.assert_called_once() + + @patch("csv.writer") def test_write_csv(mock_save, tmpdir): from csvy.writers import write_csv From 33571d0fbb7dd480a19e4c49055dbb306591caed Mon Sep 17 00:00:00 2001 From: Adrian D'Alessandro Date: Tue, 1 Oct 2024 15:24:23 +0100 Subject: [PATCH 4/4] Update csvy/readers.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Diego Alonso Álvarez <6095790+dalonsoa@users.noreply.github.com> --- csvy/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csvy/readers.py b/csvy/readers.py index 90f6dc3..f43b96b 100644 --- a/csvy/readers.py +++ b/csvy/readers.py @@ 
-197,7 +197,7 @@ def read_to_polars( Args: filename: Name of the file to read. marker: The marker characters that indicate the yaml header. - csv_options: Options to pass to pd.read_csv. + csv_options: Options to pass to pl.scan_csv. yaml_options: Options to pass to yaml.safe_load. eager: Whether to load the data into memory.