Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Include support for polars #94

Merged
merged 6 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions csvy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""
Python reader/writer for CSV files with YAML header information.
"""

# Version string of the package.
__version__ = "0.2.2"
# Re-export the reader helpers at package level. `noqa: F401` silences the
# "imported but unused" lint warning that such re-exports would otherwise raise.
from .readers import (  # noqa: F401
    read_header,
    read_metadata,
    read_to_array,
    read_to_dataframe,
    read_to_polars,
)
# Re-export the writer entry points in the same way.
from .writers import Writer, write, write_header  # noqa: F401
60 changes: 59 additions & 1 deletion csvy/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,17 @@
except ModuleNotFoundError:
DataFrame = None # type: ignore
logging.getLogger().debug(
"Pandas is not installed. Reading into a DataFrame will not work."
"Pandas is not installed. Reading into a pd.DataFrame will not work."
)

try:
from polars import DataFrame as PolarsDataFrame
from polars import LazyFrame
except ModuleNotFoundError:
LazyFrame = None # type: ignore
PolarsDataFrame = None # type: ignore
logging.getLogger().debug(
"Polars is not installed. Reading into a pl.DataFrame will not work."
)


Expand Down Expand Up @@ -168,6 +178,54 @@ def read_to_dataframe(
return pd.read_csv(filename, **options), header


def read_to_polars(
    filename: Union[Path, str],
    marker: str = "---",
    csv_options: Optional[Dict[str, Any]] = None,
    yaml_options: Optional[Dict[str, Any]] = None,
    eager: bool = False,
) -> Tuple[Union[LazyFrame, PolarsDataFrame], Dict[str, Any]]:
    """Reads a CSVY file into a Polars LazyFrame (or DataFrame) and a header dict.

    This uses the `scan_csv` method from Polars to read the data. This returns a polars
    LazyFrame, which means the data is not loaded into memory until it is needed. To
    load the data into memory, set the `eager` parameter to `True`; the LazyFrame is
    then collected into a polars DataFrame before being returned.

    Possible 'skip_rows' and 'comment_prefix' argument provided in the 'csv_options'
    dictionary will be ignored: both are overwritten with values derived from the
    header of the file.

    Args:
        filename: Name of the file to read.
        marker: The marker characters that indicate the yaml header.
        csv_options: Options to pass to pl.scan_csv.
        yaml_options: Options to pass to yaml.safe_load.
        eager: Whether to load the data into memory.

    Raises:
        ModuleNotFoundError: If polars is not found.

    Returns:
        Tuple containing: The polars LazyFrame (or the polars DataFrame if `eager` is
        True) and the header as a dictionary.
    """
    if LazyFrame is None:
        raise ModuleNotFoundError(
            "Module polars is not present. Install it to read data into DataFrame."
        )
    import polars as pl

    yaml_options = yaml_options if yaml_options is not None else {}
    header, nlines, comment = read_header(filename, marker=marker, **yaml_options)

    # Work on a copy so the dictionary supplied by the caller is never mutated.
    options = csv_options.copy() if csv_options is not None else {}
    options["skip_rows"] = nlines
    # Only the first character of the comment string is used as prefix; an empty
    # comment string means no comment prefix at all.
    options["comment_prefix"] = comment[0] if comment else None

    lf = pl.scan_csv(filename, **options)
    if eager:
        return lf.collect(), header
    return lf, header


def read_to_list(
filename: Union[Path, str],
marker: str = "---",
Expand Down
36 changes: 36 additions & 0 deletions csvy/writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,42 @@ def write_pandas(
return False


@register_writer
def write_polars(
    filename: Union[Path, str], data: Any, comment: str = "", **kwargs: Any
) -> bool:
    """Writes the polars dataframe to the chosen file, adding it after the header.

    Args:
        filename: Name of the file to save the data into. The data will be added to the
            end of the file.
        data: The data. If it is a polars DataFrame or LazyFrame, it will be saved,
            otherwise nothing is done.
        comment: String to use to mark the header lines as comments. Not used by this
            writer.
        **kwargs: Arguments to be passed to the underlying saving method.

    Returns:
        True if the writer worked, False otherwise.
    """
    # Guard only the import: a ModuleNotFoundError raised later, while collecting or
    # writing the data, must not be misreported as "polars is not installed".
    try:
        import polars as pl
    except ModuleNotFoundError:
        logging.getLogger().debug("Polars is not installed, so not using 'write_csv'.")
        return False

    if isinstance(data, pl.LazyFrame):
        # Streaming mode (saving with `LazyFrame.sink_csv`) is unstable, so we
        # collect the data into a DataFrame first
        data = data.collect()

    if not isinstance(data, pl.DataFrame):
        # Not polars data: report failure and leave the file untouched.
        return False

    # Append mode, so the data is added after whatever the file already contains.
    with open(filename, "a", newline="") as f:
        data.write_csv(f, **kwargs)

    return True


def write_csv(
filename: Union[Path, str], data: Any, comment: str = "", **kwargs: Any
) -> bool:
Expand Down
43 changes: 42 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ flake8 = "^4.0.1"
types-PyYAML = "^6.0.7"
bump2version = "^1.0.1"
coverage = "^7.1.0"
polars = "^1.8.2"

[tool.poetry.group.docs]
optional = true
Expand Down
23 changes: 23 additions & 0 deletions tests/test_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,29 @@ def test_read_to_dataframe(data_path):
read_to_dataframe(data_path)


def test_read_to_polars(data_path):
    """Check lazy and eager reading with polars, and the missing-polars error."""
    from unittest.mock import patch

    import polars as pl
    from polars.testing import assert_frame_equal

    import csvy.readers as readers
    from csvy.readers import read_to_polars

    lazy_data, header = read_to_polars(data_path)
    assert isinstance(lazy_data, pl.LazyFrame)
    # `LazyFrame.columns` emits a PerformanceWarning in polars 1.x; resolving the
    # schema explicitly is the supported way to get the column names.
    assert tuple(lazy_data.collect_schema().names()) == ("Date", "WTI")
    assert isinstance(header, dict)
    assert len(header) > 0

    eager_data, _ = read_to_polars(data_path, eager=True)
    assert_frame_equal(lazy_data.collect(), eager_data)

    # Simulate polars being unavailable. The patch is scoped to this block so the
    # module global is restored and later tests still see the real LazyFrame.
    with patch.object(readers, "LazyFrame", None):
        with pytest.raises(ModuleNotFoundError):
            read_to_polars(data_path)


def test_read_to_list(array_data_path):
from csvy.readers import read_to_list

Expand Down
23 changes: 23 additions & 0 deletions tests/test_write.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,29 @@ def test_write_pandas(mock_save, tmpdir):
mock_save.assert_called_once()


@patch("polars.DataFrame.write_csv")
def test_write_polars(mock_save, tmpdir, mocker):
    """Check write_polars with non-polars data, a DataFrame and a LazyFrame."""
    import polars as pl

    from csvy.writers import write_polars

    filename = tmpdir / "some_file.csv"

    # Data that is not a polars frame is rejected by the writer.
    assert not write_polars(filename, [])

    # An eager DataFrame is written through the patched `write_csv`.
    frame = pl.DataFrame()
    assert write_polars(filename, frame)
    mock_save.assert_called_once()

    # A LazyFrame must be collected exactly once and then written.
    mock_save.reset_mock()
    lazy_frame = pl.LazyFrame()
    collect_spy = mocker.spy(lazy_frame, "collect")
    assert write_polars(filename, lazy_frame)
    collect_spy.assert_called_once()
    mock_save.assert_called_once()


@patch("csv.writer")
def test_write_csv(mock_save, tmpdir):
from csvy.writers import write_csv
Expand Down
Loading