ENH: New accessor to_zh (#784)

* convert chinese words * index method * apply to dataframe * update inputing * BOT: auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add zhconv * rename variable * add quotes * Create test_to_zh.py * convert chinese words * index method * apply to dataframe * update inputing * BOT: auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add zhconv * rename variable * add quotes * Create test_to_zh.py * Update test_to_zh.py * Update test_to_zh.py Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Zeroto521 · Dec 11, 2022 · bb38b34 · bb38b34
1 parent 3eb6824
commit bb38b34
Show file tree

Hide file tree

Showing 8 changed files with 181 additions and 0 deletions.
diff --git a/ci/env/latest.yaml b/ci/env/latest.yaml
@@ -22,3 +22,7 @@ dependencies:
   - codecov
   # doctest testing
   - pytest-doctestplus
+
+  - pip
+  - pip:
+    - zhconv
diff --git a/doc/source/reference/accessor/dataframe.rst b/doc/source/reference/accessor/dataframe.rst
@@ -19,6 +19,7 @@ Conversion
 
     change_axis_type
     to_series
+    to_zh
     values_to_dict
 
 

diff --git a/doc/source/reference/accessor/series.rst b/doc/source/reference/accessor/series.rst
@@ -22,6 +22,7 @@ Conversion
     change_axis_type
     swap_index_values
     to_datetime
+    to_zh
     to_set
     values_to_dict
 

diff --git a/dtoolkit/accessor/dataframe/__init__.py b/dtoolkit/accessor/dataframe/__init__.py
@@ -15,6 +15,7 @@
 from dtoolkit.accessor.dataframe.repeat import repeat  # noqa: F401
 from dtoolkit.accessor.dataframe.set_unique_index import set_unique_index  # noqa: F401
 from dtoolkit.accessor.dataframe.to_series import to_series  # noqa: F401
+from dtoolkit.accessor.dataframe.to_zh import to_zh  # noqa: F401
 from dtoolkit.accessor.dataframe.top_n import top_n  # noqa: F401
 from dtoolkit.accessor.dataframe.values_to_dict import values_to_dict  # noqa: F401
 from dtoolkit.accessor.dataframe.weighted_mean import weighted_mean  # noqa: F401
diff --git a/dtoolkit/accessor/dataframe/to_zh.py b/dtoolkit/accessor/dataframe/to_zh.py
@@ -0,0 +1,74 @@
+from typing import Hashable
+
+import pandas as pd
+
+from dtoolkit.accessor.register import register_dataframe_method
+from dtoolkit.accessor.series.to_zh import LOCALIZATION
+from dtoolkit.accessor.series.to_zh import to_zh as s_to_zh
+
+
+@register_dataframe_method
+def to_zh(
+    df: pd.DataFrame,
+    /,
+    column: Hashable,
+    *,
+    locale: LOCALIZATION = "zh-cn",
+    dictionary: dict = None,
+) -> pd.DataFrame:
+    """
+    Simple conversion and localization between simplified and traditional Chinese.
+
+    Parameters
+    ----------
+    column : Hashable
+        The column to convert.
+
+    locale : {"zh-hans", "zh-hant", "zh-cn", "zh-sg", "zh-tw", "zh-hk", "zh-my", \
+"zh-mo"}, default "zh-cn"
+        Locale to convert to.
+
+    dictionary : dict, default None
+        A dictionary which updates the conversion table, eg.
+        ``{'from1': 'to1', 'from2': 'to2'}``
+
+    Returns
+    -------
+    DataFrame
+
+    Raises
+    ------
+    ModuleNotFoundError
+        If the 'zhconv' module is not installed.
+
+    TypeError
+        If ``df[column]`` is not string dtype.
+
+    See Also
+    --------
+    dtoolkit.accessor.series.to_zh
+
+    Examples
+    --------
+    >>> import dtoolkit
+    >>> import pandas as pd
+    >>> df = pd.DataFrame({'zh': ['漢', '字']})
+    >>> df
+       zh
+    0  漢
+    1  字
+    >>> df.to_zh('zh')
+       zh
+    0  汉
+    1  字
+    """
+
+    return df.assign(
+        **{
+            column: s_to_zh(
+                df[column],
+                locale=locale,
+                dictionary=dictionary,
+            ),
+        },
+    )
diff --git a/dtoolkit/accessor/series/__init__.py b/dtoolkit/accessor/series/__init__.py
@@ -22,5 +22,6 @@
 )
 from dtoolkit.accessor.series.to_datetime import to_datetime  # noqa: F401
 from dtoolkit.accessor.series.to_set import to_set  # noqa: F401
+from dtoolkit.accessor.series.to_zh import to_zh  # noqa: F401
 from dtoolkit.accessor.series.top_n import top_n  # noqa: F401
 from dtoolkit.accessor.series.values_to_dict import values_to_dict  # noqa: F401
diff --git a/dtoolkit/accessor/series/to_zh.py b/dtoolkit/accessor/series/to_zh.py
@@ -0,0 +1,78 @@
+from typing import Literal
+
+import pandas as pd
+from pandas.api.types import is_string_dtype
+
+from dtoolkit.accessor.register import register_series_method
+
+
+LOCALIZATION = Literal[
+    "zh-hans",
+    "zh-hant",
+    "zh-cn",
+    "zh-sg",
+    "zh-tw",
+    "zh-hk",
+    "zh-my",
+    "zh-mo",
+]
+
+
+@register_series_method
+def to_zh(
+    s: pd.Series,
+    /,
+    *,
+    locale: LOCALIZATION = "zh-cn",
+    dictionary: dict = None,
+) -> pd.Series:
+    """
+    Simple conversion and localization between simplified and traditional Chinese.
+
+    Parameters
+    ----------
+    locale : {"zh-hans", "zh-hant", "zh-cn", "zh-sg", "zh-tw", "zh-hk", "zh-my", \
+"zh-mo"}, default "zh-cn"
+        Locale to convert to.
+
+    dictionary : dict, default None
+        A dictionary which updates the conversion table, eg.
+        ``{'from1': 'to1', 'from2': 'to2'}``
+
+    Returns
+    -------
+    Series
+
+    Raises
+    ------
+    ModuleNotFoundError
+        If the 'zhconv' module is not installed.
+
+    TypeError
+        If ``s`` is not string dtype.
+
+    See Also
+    --------
+    dtoolkit.accessor.dataframe.to_zh
+
+    Examples
+    --------
+    >>> import dtoolkit
+    >>> import pandas as pd
+    >>> s = pd.Series(['漢', '字'])
+    >>> s
+    0    漢
+    1    字
+    dtype: object
+    >>> s.to_zh(locale="zh-cn")
+    0    汉
+    1    字
+    dtype: object
+    """
+
+    from zhconv import convert
+
+    if not is_string_dtype(s):
+        raise TypeError(f"Expected string dtype, but got {s.dtype!r}.")
+
+    return s.apply(convert, locale=locale, update=dictionary)
diff --git a/test/accessor/series/test_to_zh.py b/test/accessor/series/test_to_zh.py
@@ -0,0 +1,21 @@
+import pandas as pd
+import pytest
+
+from dtoolkit.accessor.series.to_zh import to_zh
+
+
+pytest.importorskip("zhconv")
+
+
+@pytest.mark.parametrize(
+    "s, error",
+    [
+        (
+            pd.Series([1, 2]),
+            TypeError,
+        ),
+    ],
+)
+def test_error(s, error):
+    with pytest.raises(error):
+        to_zh(s)
-Original file line number
+Diff line change
@@ Expand Up / @@ -22,3 +22,7 @@ dependencies: @@
       - codecov
       # doctest testing
       - pytest-doctestplus
+      - pip
+      - pip:
+        - zhconv