From a6c0ca84afa3ea139d2df5414453052c83f85480 Mon Sep 17 00:00:00 2001 From: Nathan McDougall Date: Fri, 19 Jul 2024 15:44:20 +1200 Subject: [PATCH 1/7] Add geopandas as an optional test dependency --- pyproject.toml | 1 + requirements/dev.txt | 25 +++++++++++++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3497c51a..4b394c01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ test = [ "pytest-dotenv", "pytest-parallel", "s3fs", + "geopandas", ] [build-system] diff --git a/requirements/dev.txt b/requirements/dev.txt index 2b2a43a0..3aa34309 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -47,7 +47,7 @@ azure-datalake-store==0.0.53 # via adlfs azure-identity==1.17.1 # via adlfs -azure-storage-blob==12.20.0 +azure-storage-blob==12.21.0 # via adlfs backcall==0.2.0 # via ipython @@ -61,6 +61,8 @@ cachetools==5.4.0 # via google-auth certifi==2024.7.4 # via + # pyogrio + # pyproj # requests # sphobjinv cffi==1.16.0 @@ -118,6 +120,8 @@ fsspec==2024.6.1 # s3fs gcsfs==2024.6.1 # via pins (setup.cfg) +geopandas==1.0.1 + # via pins (setup.cfg) google-api-core==2.19.1 # via # google-cloud-core @@ -209,7 +213,7 @@ matplotlib-inline==0.1.7 # ipython mdurl==0.1.2 # via markdown-it-py -msal==1.29.0 +msal==1.30.0 # via # azure-datalake-store # azure-identity @@ -235,20 +239,26 @@ nodeenv==1.9.1 numpy==2.0.0 # via # fastparquet + # geopandas # pandas # pyarrow + # pyogrio + # shapely oauthlib==3.2.2 # via requests-oauthlib packaging==24.1 # via # build # fastparquet + # geopandas # ipykernel + # pyogrio # pytest # pytest-cases pandas==2.2.2 # via # fastparquet + # geopandas # pins (setup.cfg) parso==0.8.4 # via jedi @@ -264,7 +274,7 @@ platformdirs==4.2.2 # virtualenv pluggy==1.5.0 # via pytest -plum-dispatch==2.5.1.post1 +plum-dispatch==2.5.2 # via quartodoc portalocker==2.10.1 # via msal-extensions @@ -287,7 +297,7 @@ pure-eval==0.2.2 # via stack-data py==1.11.0 # via pytest -pyarrow==16.1.0 +pyarrow==17.0.0 # via pins (setup.cfg) pyasn1==0.6.0 # via @@ -309,6 +319,10 @@ pyjwt==2.8.0 # via # msal # pyjwt +pyogrio==0.9.0 + # via geopandas +pyproj==3.6.1 + # via geopandas pyproject-hooks==1.1.0 # via # build @@ -373,6 +387,8 @@ rsa==4.9 # via google-auth s3fs==2024.6.1 # via pins (setup.cfg) +shapely==2.0.5 + # via geopandas six==1.16.0 # via # asttokens @@ -410,6 +426,7 @@ typing-extensions==4.12.2 # azure-core # azure-identity # azure-storage-blob + # plum-dispatch # pydantic # pydantic-core # quartodoc From cd68b2a80931a68eee15581f42f53680466a7a71 Mon Sep 17 00:00:00 2001 From: Nathan McDougall Date: Fri, 19 Jul 2024 16:43:39 +1200 Subject: [PATCH 2/7] Implement drivers for geoparquet. --- pins/drivers.py | 28 ++++++++++++++++++++++++++++ pins/tests/test_drivers.py | 21 +++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/pins/drivers.py b/pins/drivers.py index 5aa3e186..b9a7c720 100644 --- a/pins/drivers.py +++ b/pins/drivers.py @@ -22,6 +22,16 @@ def _assert_is_pandas_df(x, file_type: str) -> None: ) +def _assert_is_geopandas_df(x): + # Assume we have already protected against uninstalled geopandas + import geopandas as gpd + + if not isinstance(x, gpd.GeoDataFrame): + raise NotImplementedError( + "Currently only geopandas.GeoDataFrame can be saved to a GeoParquet." + ) + + def load_path(meta, path_to_version): # Check that only a single file name was given fnames = [meta.file] if isinstance(meta.file, str) else meta.file @@ -104,6 +114,17 @@ def load_data( return pd.read_csv(f) + elif meta.type == "geoparquet": + try: + import geopandas as gpd + except ModuleNotFoundError: + raise ModuleNotFoundError( + 'The "geopandas" package is required to read "geoparquet" type ' + "files." + ) from None + + return gpd.read_parquet(f) + elif meta.type == "joblib": import joblib @@ -144,6 +165,8 @@ def save_data(obj, fname, type=None, apply_suffix: bool = True) -> "str | Sequen if apply_suffix: if type == "file": suffix = "".join(Path(obj).suffixes) + elif type == "geoparquet": + suffix = ".parquet" else: suffix = f".{type}" else: @@ -175,6 +198,11 @@ def save_data(obj, fname, type=None, apply_suffix: bool = True) -> "str | Sequen obj.to_parquet(final_name) + elif type == "geoparquet": + _assert_is_geopandas_df(obj) + + obj.to_parquet(final_name) + elif type == "joblib": import joblib diff --git a/pins/tests/test_drivers.py b/pins/tests/test_drivers.py index 230f0e80..f6d33857 100644 --- a/pins/tests/test_drivers.py +++ b/pins/tests/test_drivers.py @@ -76,6 +76,27 @@ def test_driver_roundtrip(tmp_path: Path, type_): assert df.equals(obj) +def test_driver_geoparquet_roundtrip(tmp_dir2): + import geopandas as gpd + + gdf = gpd.GeoDataFrame( + {"x": [1, 2, 3], "geometry": gpd.points_from_xy([1, 2, 3], [1, 2, 3])} + ) + + fname = "some_gdf" + full_file = f"{fname}.parquet" + + p_obj = tmp_dir2 / fname + res_fname = save_data(gdf, p_obj, "geoparquet") + + assert Path(res_fname).name == full_file + + meta = MetaRaw(full_file, "geoparquet", "my_pin") + obj = load_data(meta, fsspec.filesystem("file"), tmp_dir2, allow_pickle_read=True) + + assert gdf.equals(obj) + + @pytest.mark.parametrize( "type_", [ From 807deedafad0d82ebce9bf54f72143321132b303 Mon Sep 17 00:00:00 2001 From: Nathan McDougall Date: Fri, 19 Jul 2024 16:48:21 +1200 Subject: [PATCH 3/7] Update documentation. --- docs/get_started.qmd | 1 + pins/boards.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/get_started.qmd b/docs/get_started.qmd index 2f658d7f..ef7f79dd 100644 --- a/docs/get_started.qmd +++ b/docs/get_started.qmd @@ -75,6 +75,7 @@ Above, we saved the data as a CSV, but you can choose another option depending o - `type = "arrow"` uses `to_feather()` from pandas to create an Arrow/Feather file. - `type = "joblib"` uses `joblib.dump()` to create a binary Python data file, such as for storing a trained model. See the [joblib docs](https://joblib.readthedocs.io/en/latest/) for more information. - `type = "json"` uses `json.dump()` to create a JSON file. Pretty much every programming language can read JSON files, but they only work well for nested lists. +- `type = "geoparquet"` uses `to_parquet()` from [geopandas](https://github.com/geopandas/geopandas) to create a [GeoParquet](https://github.com/opengeospatial/geoparquet) file, which is a specialized Parquet format for geospatial data. Note that when the data lives elsewhere, pins takes care of downloading and caching so that it's only re-downloaded when needed. That said, most boards transmit pins over HTTP, and this is going to be slow and possibly unreliable for very large pins. diff --git a/pins/boards.py b/pins/boards.py index f305488c..2c32c952 100644 --- a/pins/boards.py +++ b/pins/boards.py @@ -319,7 +319,7 @@ def pin_write( Pin name. type: File type used to save `x` to disk. May be "csv", "arrow", "parquet", - "joblib", or "json". + "joblib", "json", or "geoparquet". title: A title for the pin; most important for shared boards so that others can understand what the pin contains. If omitted, a brief description From f4773981e129595c84086b62c08d706be754d8f3 Mon Sep 17 00:00:00 2001 From: Nathan McDougall Date: Fri, 19 Jul 2024 16:53:33 +1200 Subject: [PATCH 4/7] Add lower bound to geopandas requirement --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4b394c01..294ec11f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ test = [ "pytest-dotenv", "pytest-parallel", "s3fs", - "geopandas", + "geopandas>=0.8.0", # At 0.8.0, the GeoParquet format was introduced. ] [build-system] From 52c0212c04ee7ce8921cfccbb6e86d9333b47a50 Mon Sep 17 00:00:00 2001 From: Nathan McDougall Date: Sat, 20 Jul 2024 06:58:23 +1200 Subject: [PATCH 5/7] Support GeoDataFrame in `default_title` --- pins/drivers.py | 12 +++++++++++- pins/tests/test_drivers.py | 5 +++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pins/drivers.py b/pins/drivers.py index b9a7c720..2cc5e796 100644 --- a/pins/drivers.py +++ b/pins/drivers.py @@ -231,10 +231,20 @@ def default_title(obj, name): import pandas as pd if isinstance(obj, pd.DataFrame): + try: + import geopandas as gpd + except ModuleNotFoundError: + obj_name = "DataFrame" + else: + if isinstance(obj, gpd.GeoDataFrame): + obj_name = "GeoDataFrame" + else: + obj_name = "DataFrame" + # TODO(compat): title says CSV rather than data.frame # see https://github.com/machow/pins-python/issues/5 shape_str = " x ".join(map(str, obj.shape)) - return f"{name}: a pinned {shape_str} DataFrame" + return f"{name}: a pinned {shape_str} {obj_name}" else: obj_name = type(obj).__qualname__ return f"{name}: a pinned {obj_name} object" diff --git a/pins/tests/test_drivers.py b/pins/tests/test_drivers.py index f6d33857..71d4b441 100644 --- a/pins/tests/test_drivers.py +++ b/pins/tests/test_drivers.py @@ -1,6 +1,7 @@ from pathlib import Path import fsspec +import geopandas as gpd import pandas as pd import pytest @@ -34,6 +35,10 @@ class D: [ (pd.DataFrame({"x": [1, 2]}), "somename: a pinned 2 x 1 DataFrame"), (pd.DataFrame({"x": [1], "y": [2]}), "somename: a pinned 1 x 2 DataFrame"), + ( + gpd.GeoDataFrame({"x": [1], "geometry": [None]}), + "somename: a pinned 1 x 2 GeoDataFrame", + ), (ExC(), "somename: a pinned ExC object"), (ExC().D(), "somename: a pinned ExC.D object"), ([1, 2, 3], "somename: a pinned list object"), From da7196653bf9108642ed6318b0b1d86f3911499f Mon Sep 17 00:00:00 2001 From: Nathan McDougall Date: Sat, 20 Jul 2024 14:17:12 +1200 Subject: [PATCH 6/7] Revert unnecessary bumps in dev.txt --- requirements/dev.txt | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/requirements/dev.txt b/requirements/dev.txt index 3aa34309..5331a037 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -47,7 +47,7 @@ azure-datalake-store==0.0.53 # via adlfs azure-identity==1.17.1 # via adlfs -azure-storage-blob==12.21.0 +azure-storage-blob==12.20.0 # via adlfs backcall==0.2.0 # via ipython @@ -213,7 +213,7 @@ matplotlib-inline==0.1.7 # ipython mdurl==0.1.2 # via markdown-it-py -msal==1.30.0 +msal==1.29.0 # via # azure-datalake-store # azure-identity @@ -274,7 +274,7 @@ platformdirs==4.2.2 # virtualenv pluggy==1.5.0 # via pytest -plum-dispatch==2.5.2 +plum-dispatch==2.5.1.post1 # via quartodoc portalocker==2.10.1 # via msal-extensions @@ -297,7 +297,7 @@ pure-eval==0.2.2 # via stack-data py==1.11.0 # via pytest -pyarrow==17.0.0 +pyarrow==16.1.0 # via pins (setup.cfg) pyasn1==0.6.0 # via @@ -426,7 +426,6 @@ typing-extensions==4.12.2 # azure-core # azure-identity # azure-storage-blob - # plum-dispatch # pydantic # pydantic-core # quartodoc From c59806e5b4471bc39e212e2731cffa23167cb5fc Mon Sep 17 00:00:00 2001 From: Nathan McDougall Date: Thu, 25 Jul 2024 20:10:10 +1200 Subject: [PATCH 7/7] Move from using tmp_dir2 to tmp_path in tests --- pins/tests/test_drivers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pins/tests/test_drivers.py b/pins/tests/test_drivers.py index 71d4b441..4588ab59 100644 --- a/pins/tests/test_drivers.py +++ b/pins/tests/test_drivers.py @@ -81,7 +81,7 @@ def test_driver_roundtrip(tmp_path: Path, type_): assert df.equals(obj) -def test_driver_geoparquet_roundtrip(tmp_dir2): +def test_driver_geoparquet_roundtrip(tmp_path): import geopandas as gpd gdf = gpd.GeoDataFrame( @@ -91,13 +91,13 @@ def test_driver_geoparquet_roundtrip(tmp_dir2): fname = "some_gdf" full_file = f"{fname}.parquet" - p_obj = tmp_dir2 / fname + p_obj = tmp_path / fname res_fname = save_data(gdf, p_obj, "geoparquet") assert Path(res_fname).name == full_file meta = MetaRaw(full_file, "geoparquet", "my_pin") - obj = load_data(meta, fsspec.filesystem("file"), tmp_dir2, allow_pickle_read=True) + obj = load_data(meta, fsspec.filesystem("file"), tmp_path, allow_pickle_read=True) assert gdf.equals(obj)