ENH: New geoaccessor to_h3 to convert geometry to h3 index (#739)

* ENH: Turn point to h3 * Drop activate to turn geodataframe to dataframe * Add `column` for Series's xy_to_h3 * Add `drop` option * Add basic documentation for xy_to_h3 * Add example for geodataframe.xy_to_h3 * Add example for geoseries.xy_to_h3 * Set default column name * BUG: should return geodataframe * Simplify a bit * update default value * Add new advice * correct path * New advice for h3-pandas * Raise error while keep original data and missing column names * simplify via to_geoframe * Add raises part * New advice for h3-pandas * `xy_to_h3` -> `points_to_h3` * index methodes * adjust the place of import block * require geometry is point type * Add other error informations * BOT: auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * lint codes * correct method name * use sphinx link * relink * add h3 into package * Create polygons_to_h3.py * merge `points_to_h3` and `polygons_to_h3` into `to_h3` * do `rename `when drop is True * Update error message * lint codes * miss importing `xy` * update example * merge `points_to_h3` and `polygons_to_h3` into `to_h3` * lint codes * update example * Update example * Simplify a bit * simplify a bit * use getattr to simplify * move 'import' block to top * `column` -> `name` * `column` -> `name` use `name` first * BOT: auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update to_h3.py * Simplify a bit * adapt the new feature * simplify a bit * BOT: auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add blank lines * Create test_to_h3.py * BOT: auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * lint codes * correct variable * cut blank lines * adjust parameters sequence * Create test_to_h3.py * add more examples * should use and * simplify a bit * Simplify a bit * correct logic Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Zeroto521 · Dec 11, 2022 · 95618ef · 95618ef
1 parent 54916ba
commit 95618ef
Show file tree

Hide file tree

Showing 10 changed files with 416 additions and 0 deletions.
diff --git a/ci/env/latest.yaml b/ci/env/latest.yaml
@@ -15,6 +15,7 @@ dependencies:
   - topojson
   - joblib
   - rapidfuzz
+  - h3-py
   # testing
   - pytest
   - pytest-cov

diff --git a/doc/source/conf.py b/doc/source/conf.py
@@ -129,6 +129,7 @@
     # TODO: delete pygeos after shapely 2.x released
     "pygeos": ("https://pygeos.readthedocs.io/en/stable/", None),
     "joblib": ("https://joblib.readthedocs.io/en/latest/", None),
+    "h3": ("https://uber.github.io/h3-py/", None),
 }
 
 # extlinks alias

diff --git a/doc/source/reference/geoaccessor/geodataframe.rst b/doc/source/reference/geoaccessor/geodataframe.rst
@@ -21,6 +21,14 @@ General methods and attributes
     xy
 
 
+Conversion
+----------
+.. autosummary::
+    :toctree: ../api/
+
+    to_h3
+
+
 Projection handling
 -------------------
 .. autosummary::

diff --git a/doc/source/reference/geoaccessor/geoseries.rst b/doc/source/reference/geoaccessor/geoseries.rst
@@ -21,6 +21,14 @@ General methods and attributes
     xy
 
 
+Conversion
+----------
+.. autosummary::
+    :toctree: ../api/
+
+    to_h3
+
+
 Projection handling
 -------------------
 .. autosummary::

diff --git a/dtoolkit/geoaccessor/geodataframe/__init__.py b/dtoolkit/geoaccessor/geodataframe/__init__.py
@@ -29,5 +29,6 @@
 from dtoolkit.geoaccessor.geodataframe.reverse_geocode import (  # noqa: F401
     reverse_geocode,
 )
+from dtoolkit.geoaccessor.geodataframe.to_h3 import to_h3  # noqa: F401
 from dtoolkit.geoaccessor.geodataframe.toposimplify import toposimplify  # noqa: F401
 from dtoolkit.geoaccessor.geodataframe.xy import xy  # noqa: F401
diff --git a/dtoolkit/geoaccessor/geodataframe/to_h3.py b/dtoolkit/geoaccessor/geodataframe/to_h3.py
@@ -0,0 +1,131 @@
+from __future__ import annotations
+
+from typing import Hashable
+
+import geopandas as gpd
+import pandas as pd
+
+from dtoolkit.accessor.dataframe import repeat
+from dtoolkit.accessor.series import len as s_len
+from dtoolkit.geoaccessor.geodataframe.drop_geometry import drop_geometry
+from dtoolkit.geoaccessor.geoseries.to_h3 import points_to_h3
+from dtoolkit.geoaccessor.geoseries.to_h3 import polygons_to_h3
+from dtoolkit.geoaccessor.register import register_geodataframe_method
+
+
+@register_geodataframe_method
+def to_h3(
+    df: gpd.GeoDataFrame,
+    /,
+    resolution: int,
+    drop: bool = True,
+    name: Hashable = "h3",
+) -> pd.DataFrame | gpd.GeoDataFrame:
+    """
+    Convert Point to containing H3 cell index.
+
+    Parameters
+    ----------
+    resolution : int
+        H3 resolution.
+
+    drop : bool, default True
+        Whether to drop the geometry column.
+
+    name : Hashable, default "h3"
+        Name of the column to store the H3 cell index.
+
+    Returns
+    -------
+    DataFrame or GeoDataFrame
+        DataFrame if drop is True else GeoDataFrame.
+
+    Raises
+    ------
+    ModuleNotFoundError
+        If don't have module named 'h3'.
+    TypeError
+        If the geometry is not Point.
+    ValueError
+        If the CRS is not WGS84 or EPSG:4326.
+
+    See Also
+    --------
+    h3.latlon_to_h3
+    dtoolkit.geoaccessor.geoseries.to_h3
+
+    Examples
+    --------
+    >>> import dtoolkit.geoaccessor
+    >>> import pandas as pd
+
+    Points to h3 indexes.
+
+    >>> df = pd.DataFrame({"x": [122, 100], "y": [55, 1]}).from_xy('x', 'y', crs=4326)
+    >>> df
+         x   y                    geometry
+    0  122  55  POINT (122.00000 55.00000)
+    1  100   1   POINT (100.00000 1.00000)
+    >>> df.to_h3(8)
+         x   y                  h3
+    0  122  55  612845052823076863
+    1  100   1  614269156845420543
+
+    Keep the geometry column.
+
+    >>> df.to_h3(8, drop=False)
+         x   y                    geometry                  h3
+    0  122  55  POINT (122.00000 55.00000)  612845052823076863
+    1  100   1   POINT (100.00000 1.00000)  614269156845420543
+
+    Polygons to h3 indexes.
+
+    >>> df = pd.DataFrame(
+    ...     {
+    ...         "label": ["a", "b"],
+    ...         "wkt": [
+    ...             "POLYGON ((1 0, 1 1, 0 1, 0 0, 1 0))",
+    ...             "POLYGON ((2 1, 2 2, 1 2, 1 1, 2 1))",
+    ...         ],
+    ...     },
+    ... ).from_wkt("wkt", crs=4326, drop=True)
+    >>> df
+      label                                           geometry
+    0     a  POLYGON ((1.00000 0.00000, 1.00000 1.00000, 0....
+    1     b  POLYGON ((2.00000 1.00000, 2.00000 2.00000, 1....
+    >>> df.to_h3(4)
+      label                  h3
+    0     a  596538839648960511
+    0     a  596538693620072447
+    0     a  596538685030137855
+    0     a  596538848238895103
+    0     a  596537920525959167
+    0     a  596538813879156735
+    0     a  596538856828829695
+    0     a  596538805289222143
+    1     b  596538229763604479
+    1     b  596537946295762943
+    1     b  596540780974178303
+    1     b  596540729434570751
+    1     b  596540772384243711
+    1     b  596538212583735295
+    1     b  596540763794309119
+    1     b  596537954885697535
+    1     b  596540746614439935
+    1     b  596538195403866111
+    1     b  596541030082281471
+    """
+
+    if df.crs != 4326:
+        raise ValueError(f"Only support 'EPSG:4326' CRS, but got {df.crs!r}.")
+
+    if all(df.geom_type == "Point"):
+        h3 = points_to_h3(df.geometry, resolution=resolution).rename(name)
+    elif all(df.geom_type == "Polygon"):
+        h3_list = polygons_to_h3(df.geometry, resolution=resolution)
+        h3 = h3_list.explode().rename(name)
+        df = repeat(df, s_len(h3_list))
+    else:
+        raise TypeError("Only support 'Point' or 'Polygon' geometry type.")
+
+    return pd.concat((drop_geometry(df) if drop else df, h3), axis=1)
diff --git a/dtoolkit/geoaccessor/geoseries/__init__.py b/dtoolkit/geoaccessor/geoseries/__init__.py
@@ -22,5 +22,6 @@
 from dtoolkit.geoaccessor.geoseries.has_hole import has_hole  # noqa: F401
 from dtoolkit.geoaccessor.geoseries.hole_counts import hole_counts  # noqa: F401
 from dtoolkit.geoaccessor.geoseries.reverse_geocode import reverse_geocode  # noqa: F401
+from dtoolkit.geoaccessor.geoseries.to_h3 import to_h3  # noqa: F401
 from dtoolkit.geoaccessor.geoseries.toposimplify import toposimplify  # noqa: F401
 from dtoolkit.geoaccessor.geoseries.xy import xy  # noqa: F401
diff --git a/dtoolkit/geoaccessor/geoseries/to_h3.py b/dtoolkit/geoaccessor/geoseries/to_h3.py
@@ -0,0 +1,181 @@
+from __future__ import annotations
+
+from typing import Hashable
+
+import geopandas as gpd
+import pandas as pd
+
+from dtoolkit.accessor.series import getattr as s_getattr
+from dtoolkit.accessor.series import len as s_len
+from dtoolkit.geoaccessor.geoseries.xy import xy
+from dtoolkit.geoaccessor.register import register_geoseries_method
+from dtoolkit.geoaccessor.series import to_geoframe
+
+
+@register_geoseries_method
+def to_h3(
+    s: gpd.GeoSeries,
+    /,
+    resolution: int,
+    drop: bool = True,
+    name: Hashable = "h3",
+) -> pd.Series | gpd.GeoDataFrame:
+    """
+    Convert Point or Polygon to H3 cell index.
+
+    Parameters
+    ----------
+    resolution : int
+        H3 resolution.
+
+    name : Hashable, default "h3"
+        Name of the column to store the H3 cell index. If ``name`` is None, there will
+        use ``s.name`` as the column name.
+
+    drop : bool, default True
+        Whether to drop the geometry column.
+
+    Returns
+    -------
+    Series or GeoDataFrame
+        Series if drop is True else GeoDataFrame.
+
+    Raises
+    ------
+    ModuleNotFoundError
+        If don't have module named 'h3'.
+
+    TypeError
+        If the geometry type is not Point or Polygon.
+
+    ValueError
+        - If the CRS is not WGS84 or EPSG:4326.
+        - If ``drop=False`` but ``name=None`` or ``s.name=None``.
+
+    See Also
+    --------
+    h3.geo_to_h3
+    dtoolkit.geoaccessor.geodataframe.to_h3
+
+    Examples
+    --------
+    >>> import dtoolkit.geoaccessor
+    >>> import pandas as pd
+
+    Points to h3 indexes.
+
+    >>> s = pd.Series(
+    ...     [
+    ...         "POINT (122 55)",
+    ...         "POINT (100 1)",
+    ...     ],
+    ... ).from_wkt(crs=4326, drop=True)
+    >>> s
+    0    POINT (122.00000 55.00000)
+    1     POINT (100.00000 1.00000)
+    dtype: geometry
+    >>> s.to_h3(8)
+    0    612845052823076863
+    1    614269156845420543
+    dtype: int64
+
+    Keep the geometry column.
+
+    >>> s.to_h3(8, drop=False)
+                         h3                    geometry
+    0    612845052823076863  POINT (122.00000 55.00000)
+    1    614269156845420543   POINT (100.00000 1.00000)
+
+    Polygons to h3 indexes.
+
+    >>> s = pd.Series(
+    ...     [
+    ...         "POLYGON ((1 0, 1 1, 0 1, 0 0, 1 0))",
+    ...         "POLYGON ((2 1, 2 2, 1 2, 1 1, 2 1))",
+    ...     ],
+    ... ).from_wkt(crs=4326, drop=True)
+    >>> s
+    0    POLYGON ((1.00000 0.00000, 1.00000 1.00000, 0....
+    1    POLYGON ((2.00000 1.00000, 2.00000 2.00000, 1....
+    dtype: geometry
+    >>> s.to_h3(4)
+    0    596538839648960511
+    0    596538693620072447
+    0    596538685030137855
+    0    596538848238895103
+    0    596537920525959167
+    0    596538813879156735
+    0    596538856828829695
+    0    596538805289222143
+    1    596538229763604479
+    1    596537946295762943
+    1    596540780974178303
+    1    596540729434570751
+    1    596540772384243711
+    1    596538212583735295
+    1    596540763794309119
+    1    596537954885697535
+    1    596540746614439935
+    1    596538195403866111
+    1    596541030082281471
+    dtype: object
+    """
+
+    # TODO: Advices for h3-pandas
+    # 1. use `import h3.api.numpy_int as h3` instead of `import h3`
+    # 2. compat with h3-py 4
+    # 3. requires crs is 4326
+    # 4. consider h3-py as the accessor of Series
+    # 5. use h3 cell index as the index of DataFrame, this may be a problem,
+    # cause it is not unique actually.
+    # 6. Speed up creating points / polygons via pygeos
+
+    if s.crs != 4326:
+        raise ValueError(f"Only support 'EPSG:4326' CRS, but got {s.crs!r}.")
+    if not drop and name is None and s.name is None:
+        raise ValueError(
+            "to keep the original data requires setting the 'name' of "
+            f"{s.__class__.__name__!r} or 'name'.",
+        )
+
+    if all(s.geom_type == "Point"):
+        h3 = points_to_h3(s, resolution=resolution)
+    elif all(s.geom_type == "Polygon"):
+        h3_list = polygons_to_h3(s, resolution=resolution)
+        h3 = h3_list.explode()
+        s = s.repeat(s_len(h3_list))
+    else:
+        raise TypeError("Only support 'Point' or 'Polygon' geometry type.")
+
+    return h3 if drop else to_geoframe(h3.rename(name or s.name), geometry=s)
+
+
+def points_to_h3(s: gpd.GeoSeries, /, resolution: int) -> pd.Series:
+    # TODO: Use `latlon_to_h3` instead of `geo_to_h3`
+    # While h3-py release 4, `latlon_to_h3` is not available.
+
+    # requires h3 >= 4
+    # from h3.api.numpy_int import latlng_to_cell
+    # requires h3 < 4
+    from h3.api.numpy_int import geo_to_h3
+
+    return xy(s, reverse=True, frame=False, name=None).apply(
+        lambda yx: geo_to_h3(*yx, resolution),
+    )
+
+
+def polygons_to_h3(s: gpd.GeoSeries, /, resolution: int) -> pd.Series:
+    # TODO: Use `polygon_to_cells` instead of `geo_to_h3`
+    # While h3-py release 4, `polygon_to_cells` is not available.
+
+    # requires h3 >= 4
+    # from h3.api.numpy_int import polygon_to_cells
+    # requires h3 < 4
+    from h3.api.numpy_int import polyfill
+
+    # If `geo_json_conformant` is True, the coordinate could be (lon, lat).
+    return s_getattr(s, "__geo_interface__").apply(
+        polyfill,
+        res=resolution,
+        geo_json_conformant=True,
+    )