Skip to content

Commit

Permalink
ENH: New accessor to_zh (#784)
Browse files Browse the repository at this point in the history
* convert chinese words

* index method

* apply to dataframe

* update inputing

* BOT: auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add zhconv

* rename variable

* add quotes

* Create test_to_zh.py

* convert chinese words

* index method

* apply to dataframe

* update inputing

* BOT: auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add zhconv

* rename variable

* add quotes

* Create test_to_zh.py

* Update test_to_zh.py

* Update test_to_zh.py

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
Zeroto521 and pre-commit-ci[bot] authored Dec 11, 2022
1 parent 3eb6824 commit bb38b34
Show file tree
Hide file tree
Showing 8 changed files with 181 additions and 0 deletions.
4 changes: 4 additions & 0 deletions ci/env/latest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,7 @@ dependencies:
- codecov
# doctest testing
- pytest-doctestplus

- pip
- pip:
- zhconv
1 change: 1 addition & 0 deletions doc/source/reference/accessor/dataframe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Conversion

change_axis_type
to_series
to_zh
values_to_dict


Expand Down
1 change: 1 addition & 0 deletions doc/source/reference/accessor/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Conversion
change_axis_type
swap_index_values
to_datetime
to_zh
to_set
values_to_dict

Expand Down
1 change: 1 addition & 0 deletions dtoolkit/accessor/dataframe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from dtoolkit.accessor.dataframe.repeat import repeat # noqa: F401
from dtoolkit.accessor.dataframe.set_unique_index import set_unique_index # noqa: F401
from dtoolkit.accessor.dataframe.to_series import to_series # noqa: F401
from dtoolkit.accessor.dataframe.to_zh import to_zh # noqa: F401
from dtoolkit.accessor.dataframe.top_n import top_n # noqa: F401
from dtoolkit.accessor.dataframe.values_to_dict import values_to_dict # noqa: F401
from dtoolkit.accessor.dataframe.weighted_mean import weighted_mean # noqa: F401
74 changes: 74 additions & 0 deletions dtoolkit/accessor/dataframe/to_zh.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from typing import Hashable

import pandas as pd

from dtoolkit.accessor.register import register_dataframe_method
from dtoolkit.accessor.series.to_zh import LOCALIZATION
from dtoolkit.accessor.series.to_zh import to_zh as s_to_zh


@register_dataframe_method
def to_zh(
    df: pd.DataFrame,
    /,
    column: Hashable,
    *,
    locale: LOCALIZATION = "zh-cn",
    dictionary: dict = None,
) -> pd.DataFrame:
    """
    Simple conversion and localization between simplified and traditional Chinese.

    Only ``column`` is converted; the input DataFrame is left untouched and a
    new DataFrame is returned.

    Parameters
    ----------
    column : Hashable
        The column to convert. Any valid column label is accepted, not only
        strings.

    locale : {"zh-hans", "zh-hant", "zh-cn", "zh-sg", "zh-tw", "zh-hk", "zh-my", \
"zh-mo"}, default "zh-cn"
        Locale to convert to.

    dictionary : dict, default None
        A dictionary which updates the conversion table, eg.
        ``{'from1': 'to1', 'from2': 'to2'}``

    Returns
    -------
    DataFrame
        A copy of ``df`` whose ``column`` holds the converted text.

    Raises
    ------
    ModuleNotFoundError
        If don't have module named 'zhconv'.

    TypeError
        If ``df[column]`` is not string dtype.

    See Also
    --------
    dtoolkit.accessor.series.to_zh

    Examples
    --------
    >>> import dtoolkit
    >>> import pandas as pd
    >>> df = pd.DataFrame({'zh': ['漢', '字']})
    >>> df
      zh
    0  漢
    1  字
    >>> df.to_zh('zh')
      zh
    0  汉
    1  字
    """

    # Convert first so that a failure (missing zhconv, wrong dtype) happens
    # before any copy of the frame is made.
    converted = s_to_zh(df[column], locale=locale, dictionary=dictionary)

    # ``df.assign(**{column: ...})`` only works for string labels because
    # keyword names must be ``str``; item assignment on a copy supports every
    # hashable label while still leaving ``df`` unmodified.
    result = df.copy()
    result[column] = converted
    return result
1 change: 1 addition & 0 deletions dtoolkit/accessor/series/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,6 @@
)
from dtoolkit.accessor.series.to_datetime import to_datetime # noqa: F401
from dtoolkit.accessor.series.to_set import to_set # noqa: F401
from dtoolkit.accessor.series.to_zh import to_zh # noqa: F401
from dtoolkit.accessor.series.top_n import top_n # noqa: F401
from dtoolkit.accessor.series.values_to_dict import values_to_dict # noqa: F401
78 changes: 78 additions & 0 deletions dtoolkit/accessor/series/to_zh.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from typing import Literal

import pandas as pd
from pandas.api.types import is_string_dtype

from dtoolkit.accessor.register import register_series_method


# Type alias listing the locale codes accepted by the ``locale`` parameter of
# ``to_zh``; the same values are enumerated in that function's docstring.
LOCALIZATION = Literal[
"zh-hans",
"zh-hant",
"zh-cn",
"zh-sg",
"zh-tw",
"zh-hk",
"zh-my",
"zh-mo",
]


@register_series_method
def to_zh(
    s: pd.Series,
    /,
    *,
    locale: LOCALIZATION = "zh-cn",
    dictionary: dict = None,
) -> pd.Series:
    """
    Simple conversion and localization between simplified and traditional Chinese.

    Every element of the Series is passed through ``zhconv.convert``.

    Parameters
    ----------
    locale : {"zh-hans", "zh-hant", "zh-cn", "zh-sg", "zh-tw", "zh-hk", "zh-my", \
"zh-mo"}, default "zh-cn"
        Locale to convert to.

    dictionary : dict, default None
        A dictionary which updates the conversion table, eg.
        ``{'from1': 'to1', 'from2': 'to2'}``

    Returns
    -------
    Series

    Raises
    ------
    ModuleNotFoundError
        If don't have module named 'zhconv'.

    TypeError
        If ``s`` is not string dtype.

    See Also
    --------
    dtoolkit.accessor.dataframe.to_zh

    Examples
    --------
    >>> import dtoolkit
    >>> import pandas as pd
    >>> s = pd.Series(['漢', '字'])
    >>> s
    0    漢
    1    字
    dtype: object
    >>> s.to_zh(locale="zh-cn")
    0    汉
    1    字
    dtype: object
    """

    # Imported inside the function rather than at module level, so a missing
    # ``zhconv`` only raises when the method is actually called.
    from zhconv import convert

    if is_string_dtype(s):
        # ``locale`` and ``update`` are forwarded by ``Series.apply`` to
        # ``convert`` as keyword arguments for each element.
        return s.apply(convert, locale=locale, update=dictionary)

    raise TypeError(f"Expected string dtype, but got {s.dtype!r}.")
21 changes: 21 additions & 0 deletions test/accessor/series/test_to_zh.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import pandas as pd
import pytest

from dtoolkit.accessor.series.to_zh import to_zh


# Skip every test in this module when ``zhconv`` cannot be imported.
pytest.importorskip("zhconv")


@pytest.mark.parametrize(
    "s, error",
    [(pd.Series([1, 2]), TypeError)],
)
def test_error(s, error):
    """Check that ``to_zh`` raises ``error`` for the invalid input ``s``."""

    with pytest.raises(error):
        to_zh(s)

0 comments on commit bb38b34

Please sign in to comment.