Skip to content

Commit

Permalink
ENH: New geoaccessor to_h3 to convert geometry to h3 index (#739)
Browse files Browse the repository at this point in the history
* ENH: Turn point to h3

* Drop activate to turn geodataframe to dataframe

* Add `column` for Series's xy_to_h3

* Add `drop` option

* Add basic documentation for xy_to_h3

* Add example for geodataframe.xy_to_h3

* Add example for geoseries.xy_to_h3

* Set default column name

* BUG: should return geodataframe

* Simplify a bit

* update default value

* Add new advice

* correct path

* New advice for h3-pandas

* Raise error while keep original data and missing column names

* simplify via to_geoframe

* Add raises part

* New advice for h3-pandas

* `xy_to_h3` -> `points_to_h3`

* index methodes

* adjust the place of import block

* require geometry is point type

* Add other error informations

* BOT: auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* lint codes

* correct method name

* use sphinx link

* relink

* add h3 into package

* Create polygons_to_h3.py

* merge `points_to_h3` and `polygons_to_h3` into `to_h3`

* do `rename `when drop is True

* Update error message

* lint codes

* miss importing `xy`

* update example

* merge `points_to_h3` and `polygons_to_h3` into `to_h3`

* lint codes

* update example

* Update example

* Simplify a bit

* simplify a bit

* use getattr to simplify

* move 'import' block to top

* `column` -> `name`

* `column` -> `name`

use `name` first

* BOT: auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update to_h3.py

* Simplify a bit

* adapt the new feature

* simplify a bit

* BOT: auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add blank lines

* Create test_to_h3.py

* BOT: auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* lint codes

* correct variable

* cut blank lines

* adjust parameters sequence

* Create test_to_h3.py

* add more examples

* should use and

* simplify a bit

* Simplify a bit

* correct logic

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
Zeroto521 and pre-commit-ci[bot] authored Dec 11, 2022
1 parent 54916ba commit 95618ef
Show file tree
Hide file tree
Showing 10 changed files with 416 additions and 0 deletions.
1 change: 1 addition & 0 deletions ci/env/latest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ dependencies:
- topojson
- joblib
- rapidfuzz
- h3-py
# testing
- pytest
- pytest-cov
Expand Down
1 change: 1 addition & 0 deletions doc/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@
# TODO: delete pygeos after shapely 2.x released
"pygeos": ("https://pygeos.readthedocs.io/en/stable/", None),
"joblib": ("https://joblib.readthedocs.io/en/latest/", None),
"h3": ("https://uber.github.io/h3-py/", None),
}

# extlinks alias
Expand Down
8 changes: 8 additions & 0 deletions doc/source/reference/geoaccessor/geodataframe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@ General methods and attributes
xy


Conversion
----------
.. autosummary::
:toctree: ../api/

to_h3


Projection handling
-------------------
.. autosummary::
Expand Down
8 changes: 8 additions & 0 deletions doc/source/reference/geoaccessor/geoseries.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@ General methods and attributes
xy


Conversion
----------
.. autosummary::
:toctree: ../api/

to_h3


Projection handling
-------------------
.. autosummary::
Expand Down
1 change: 1 addition & 0 deletions dtoolkit/geoaccessor/geodataframe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,6 @@
from dtoolkit.geoaccessor.geodataframe.reverse_geocode import ( # noqa: F401
reverse_geocode,
)
from dtoolkit.geoaccessor.geodataframe.to_h3 import to_h3 # noqa: F401
from dtoolkit.geoaccessor.geodataframe.toposimplify import toposimplify # noqa: F401
from dtoolkit.geoaccessor.geodataframe.xy import xy # noqa: F401
131 changes: 131 additions & 0 deletions dtoolkit/geoaccessor/geodataframe/to_h3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
from __future__ import annotations

from typing import Hashable

import geopandas as gpd
import pandas as pd

from dtoolkit.accessor.dataframe import repeat
from dtoolkit.accessor.series import len as s_len
from dtoolkit.geoaccessor.geodataframe.drop_geometry import drop_geometry
from dtoolkit.geoaccessor.geoseries.to_h3 import points_to_h3
from dtoolkit.geoaccessor.geoseries.to_h3 import polygons_to_h3
from dtoolkit.geoaccessor.register import register_geodataframe_method


@register_geodataframe_method
def to_h3(
df: gpd.GeoDataFrame,
/,
resolution: int,
drop: bool = True,
name: Hashable = "h3",
) -> pd.DataFrame | gpd.GeoDataFrame:
"""
Convert Point to containing H3 cell index.
Parameters
----------
resolution : int
H3 resolution.
drop : bool, default True
Whether to drop the geometry column.
name : Hashable, default "h3"
Name of the column to store the H3 cell index.
Returns
-------
DataFrame or GeoDataFrame
DataFrame if drop is True else GeoDataFrame.
Raises
------
ModuleNotFoundError
If don't have module named 'h3'.
TypeError
If the geometry is not Point.
ValueError
If the CRS is not WGS84 or EPSG:4326.
See Also
--------
h3.latlon_to_h3
dtoolkit.geoaccessor.geoseries.to_h3
Examples
--------
>>> import dtoolkit.geoaccessor
>>> import pandas as pd
Points to h3 indexes.
>>> df = pd.DataFrame({"x": [122, 100], "y": [55, 1]}).from_xy('x', 'y', crs=4326)
>>> df
x y geometry
0 122 55 POINT (122.00000 55.00000)
1 100 1 POINT (100.00000 1.00000)
>>> df.to_h3(8)
x y h3
0 122 55 612845052823076863
1 100 1 614269156845420543
Keep the geometry column.
>>> df.to_h3(8, drop=False)
x y geometry h3
0 122 55 POINT (122.00000 55.00000) 612845052823076863
1 100 1 POINT (100.00000 1.00000) 614269156845420543
Polygons to h3 indexes.
>>> df = pd.DataFrame(
... {
... "label": ["a", "b"],
... "wkt": [
... "POLYGON ((1 0, 1 1, 0 1, 0 0, 1 0))",
... "POLYGON ((2 1, 2 2, 1 2, 1 1, 2 1))",
... ],
... },
... ).from_wkt("wkt", crs=4326, drop=True)
>>> df
label geometry
0 a POLYGON ((1.00000 0.00000, 1.00000 1.00000, 0....
1 b POLYGON ((2.00000 1.00000, 2.00000 2.00000, 1....
>>> df.to_h3(4)
label h3
0 a 596538839648960511
0 a 596538693620072447
0 a 596538685030137855
0 a 596538848238895103
0 a 596537920525959167
0 a 596538813879156735
0 a 596538856828829695
0 a 596538805289222143
1 b 596538229763604479
1 b 596537946295762943
1 b 596540780974178303
1 b 596540729434570751
1 b 596540772384243711
1 b 596538212583735295
1 b 596540763794309119
1 b 596537954885697535
1 b 596540746614439935
1 b 596538195403866111
1 b 596541030082281471
"""

if df.crs != 4326:
raise ValueError(f"Only support 'EPSG:4326' CRS, but got {df.crs!r}.")

if all(df.geom_type == "Point"):
h3 = points_to_h3(df.geometry, resolution=resolution).rename(name)
elif all(df.geom_type == "Polygon"):
h3_list = polygons_to_h3(df.geometry, resolution=resolution)
h3 = h3_list.explode().rename(name)
df = repeat(df, s_len(h3_list))
else:
raise TypeError("Only support 'Point' or 'Polygon' geometry type.")

return pd.concat((drop_geometry(df) if drop else df, h3), axis=1)
1 change: 1 addition & 0 deletions dtoolkit/geoaccessor/geoseries/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,6 @@
from dtoolkit.geoaccessor.geoseries.has_hole import has_hole # noqa: F401
from dtoolkit.geoaccessor.geoseries.hole_counts import hole_counts # noqa: F401
from dtoolkit.geoaccessor.geoseries.reverse_geocode import reverse_geocode # noqa: F401
from dtoolkit.geoaccessor.geoseries.to_h3 import to_h3 # noqa: F401
from dtoolkit.geoaccessor.geoseries.toposimplify import toposimplify # noqa: F401
from dtoolkit.geoaccessor.geoseries.xy import xy # noqa: F401
181 changes: 181 additions & 0 deletions dtoolkit/geoaccessor/geoseries/to_h3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
from __future__ import annotations

from typing import Hashable

import geopandas as gpd
import pandas as pd

from dtoolkit.accessor.series import getattr as s_getattr
from dtoolkit.accessor.series import len as s_len
from dtoolkit.geoaccessor.geoseries.xy import xy
from dtoolkit.geoaccessor.register import register_geoseries_method
from dtoolkit.geoaccessor.series import to_geoframe


@register_geoseries_method
def to_h3(
s: gpd.GeoSeries,
/,
resolution: int,
drop: bool = True,
name: Hashable = "h3",
) -> pd.Series | gpd.GeoDataFrame:
"""
Convert Point or Polygon to H3 cell index.
Parameters
----------
resolution : int
H3 resolution.
name : Hashable, default "h3"
Name of the column to store the H3 cell index. If ``name`` is None, there will
use ``s.name`` as the column name.
drop : bool, default True
Whether to drop the geometry column.
Returns
-------
Series or GeoDataFrame
Series if drop is True else GeoDataFrame.
Raises
------
ModuleNotFoundError
If don't have module named 'h3'.
TypeError
If the geometry type is not Point or Polygon.
ValueError
- If the CRS is not WGS84 or EPSG:4326.
- If ``drop=False`` but ``name=None`` or ``s.name=None``.
See Also
--------
h3.geo_to_h3
dtoolkit.geoaccessor.geodataframe.to_h3
Examples
--------
>>> import dtoolkit.geoaccessor
>>> import pandas as pd
Points to h3 indexes.
>>> s = pd.Series(
... [
... "POINT (122 55)",
... "POINT (100 1)",
... ],
... ).from_wkt(crs=4326, drop=True)
>>> s
0 POINT (122.00000 55.00000)
1 POINT (100.00000 1.00000)
dtype: geometry
>>> s.to_h3(8)
0 612845052823076863
1 614269156845420543
dtype: int64
Keep the geometry column.
>>> s.to_h3(8, drop=False)
h3 geometry
0 612845052823076863 POINT (122.00000 55.00000)
1 614269156845420543 POINT (100.00000 1.00000)
Polygons to h3 indexes.
>>> s = pd.Series(
... [
... "POLYGON ((1 0, 1 1, 0 1, 0 0, 1 0))",
... "POLYGON ((2 1, 2 2, 1 2, 1 1, 2 1))",
... ],
... ).from_wkt(crs=4326, drop=True)
>>> s
0 POLYGON ((1.00000 0.00000, 1.00000 1.00000, 0....
1 POLYGON ((2.00000 1.00000, 2.00000 2.00000, 1....
dtype: geometry
>>> s.to_h3(4)
0 596538839648960511
0 596538693620072447
0 596538685030137855
0 596538848238895103
0 596537920525959167
0 596538813879156735
0 596538856828829695
0 596538805289222143
1 596538229763604479
1 596537946295762943
1 596540780974178303
1 596540729434570751
1 596540772384243711
1 596538212583735295
1 596540763794309119
1 596537954885697535
1 596540746614439935
1 596538195403866111
1 596541030082281471
dtype: object
"""

# TODO: Advices for h3-pandas
# 1. use `import h3.api.numpy_int as h3` instead of `import h3`
# 2. compat with h3-py 4
# 3. requires crs is 4326
# 4. consider h3-py as the accessor of Series
# 5. use h3 cell index as the index of DataFrame, this may be a problem,
# cause it is not unique actually.
# 6. Speed up creating points / polygons via pygeos

if s.crs != 4326:
raise ValueError(f"Only support 'EPSG:4326' CRS, but got {s.crs!r}.")
if not drop and name is None and s.name is None:
raise ValueError(
"to keep the original data requires setting the 'name' of "
f"{s.__class__.__name__!r} or 'name'.",
)

if all(s.geom_type == "Point"):
h3 = points_to_h3(s, resolution=resolution)
elif all(s.geom_type == "Polygon"):
h3_list = polygons_to_h3(s, resolution=resolution)
h3 = h3_list.explode()
s = s.repeat(s_len(h3_list))
else:
raise TypeError("Only support 'Point' or 'Polygon' geometry type.")

return h3 if drop else to_geoframe(h3.rename(name or s.name), geometry=s)


def points_to_h3(s: gpd.GeoSeries, /, resolution: int) -> pd.Series:
# TODO: Use `latlon_to_h3` instead of `geo_to_h3`
# While h3-py release 4, `latlon_to_h3` is not available.

# requires h3 >= 4
# from h3.api.numpy_int import latlng_to_cell
# requires h3 < 4
from h3.api.numpy_int import geo_to_h3

return xy(s, reverse=True, frame=False, name=None).apply(
lambda yx: geo_to_h3(*yx, resolution),
)


def polygons_to_h3(s: gpd.GeoSeries, /, resolution: int) -> pd.Series:
# TODO: Use `polygon_to_cells` instead of `geo_to_h3`
# While h3-py release 4, `polygon_to_cells` is not available.

# requires h3 >= 4
# from h3.api.numpy_int import polygon_to_cells
# requires h3 < 4
from h3.api.numpy_int import polyfill

# If `geo_json_conformant` is True, the coordinate could be (lon, lat).
return s_getattr(s, "__geo_interface__").apply(
polyfill,
res=resolution,
geo_json_conformant=True,
)
Loading

0 comments on commit 95618ef

Please # to comment.