-
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from jyyyeung/feat/pinyin
Feat/pinyin
- Loading branch information
Showing
12 changed files
with
265 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
"""Pinyin module for StringPod.""" | ||
|
||
import logging | ||
|
||
from pypinyin import Style, lazy_pinyin | ||
|
||
from stringpod.language import to_simplified_chinese | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def get_pinyin(text: str, **kwargs) -> list[str]:
    """Return the pinyin of a text as a list of strings.

    This is a thin wrapper around ``pypinyin.lazy_pinyin``; every keyword
    argument is forwarded to it unchanged. When no ``style`` is given the
    result carries no tone marks.

    >>> get_pinyin("李浩")
    ['li', 'hao']
    >>> get_pinyin("我爱北京天安门", style=Style.TONE3)
    ['wo3', 'ai4', 'bei3', 'jing1', 'tian1', 'an1', 'men2']
    >>> get_pinyin("重庆", style=Style.TONE)
    ['chóng', 'qìng']

    Reference: https://github.com/mozillazg/python-pinyin

    Args:
        text (str): The text to get the pinyin of.
        **kwargs: Additional keyword arguments for ``lazy_pinyin``
            (for example ``style`` or ``tone_sandhi``).

    Returns:
        list[str]: The pinyin strings produced by ``lazy_pinyin`` for ``text``.
    """
    return lazy_pinyin(text, **kwargs)
|
||
|
||
def match_pinyin(text1: str, text2: str, with_tone: bool = False, spoken_tone: bool = False) -> bool:
    """Check whether two texts of equal length have the same pinyin.

    Both texts are converted to Simplified Chinese, turned into pinyin with
    the same settings, and compared item by item.

    >>> match_pinyin("李浩", "理好", with_tone=False)
    True
    >>> match_pinyin("李浩", "理好", with_tone=True)
    False

    Args:
        text1 (str): The first text to compare.
        text2 (str): The second text to compare; must be as long as ``text1``.
        with_tone (bool, optional): Whether tones take part in the comparison
            (``Style.TONE3`` is used instead of ``Style.NORMAL``). Defaults to False.
        spoken_tone (bool, optional): Whether to use the spoken tone; forwarded
            to pypinyin as ``tone_sandhi``. Defaults to False.

    Returns:
        bool: True if the pinyin of text1 matches the pinyin of text2, False otherwise.

    Raises:
        ValueError: If ``text1`` and ``text2`` do not have the same length.
    """
    if len(text1) != len(text2):
        raise ValueError("The length of text1 and text2 must be the same.")

    style = Style.TONE3 if with_tone else Style.NORMAL
    tone_sandhi = bool(spoken_tone)

    # Normalise both inputs to Simplified Chinese before converting to pinyin.
    text1_cn = to_simplified_chinese(text1)
    text2_cn = to_simplified_chinese(text2)

    # Convert both texts with identical settings so the results are comparable.
    pinyin1 = get_pinyin(text1_cn, style=style, tone_sandhi=tone_sandhi)
    pinyin2 = get_pinyin(text2_cn, style=style, tone_sandhi=tone_sandhi)
    logger.debug("pinyin1: %s, pinyin2: %s", pinyin1, pinyin2)

    # Equal-length inputs do not guarantee equal-length pinyin lists (pypinyin
    # appears to emit a run of non-Chinese characters as a single item), so
    # compare the list lengths explicitly instead of indexing past the end of
    # the shorter list or accepting a mere prefix match.
    if len(pinyin1) != len(pinyin2):
        return False

    for py1, py2 in zip(pinyin1, pinyin2):
        logger.debug("pinyin1[i]: %s, pinyin2[i]: %s, %s", py1, py2, py1 == py2)
        if py1 != py2:
            return False
    return True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
"""Segmentation of Chinese text.""" | ||
|
||
import jieba | ||
|
||
|
||
def segment_text(text: str) -> list[str]:
    """Split the text into a list of segments using jieba's accurate mode.

    Punctuation marks come back as their own items.

    >>> segment_text("你好,世界!")
    ['你好', ',', '世界', '!']
    >>> segment_text("我爱北京天安门")
    ['我', '爱', '北京', '天安门']

    Reference: https://github.com/fxsjy/jieba
    """
    # lcut is the list-returning counterpart of cut.
    return jieba.lcut(text, cut_all=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
"""Test the pinyin module.""" | ||
|
||
import pytest | ||
from pypinyin import Style | ||
|
||
from stringpod.pinyin import get_pinyin, match_pinyin | ||
|
||
|
||
class TestGetPinyin:
    """Checks for get_pinyin with and without an explicit style."""

    @pytest.mark.parametrize(
        ("input_text", "expected", "style_kwarg"),
        [
            # No keyword arguments: toneless output.
            ("李浩", ["li", "hao"], {}),
            ("你好", ["ni", "hao"], {}),
            # Same word rendered in each explicitly requested style.
            ("重庆", ["chong", "qing"], {"style": Style.NORMAL}),
            ("重庆", ["chóng", "qìng"], {"style": Style.TONE}),
            ("重庆", ["chong2", "qing4"], {"style": Style.TONE3}),
        ],
    )
    def test_get_pinyin_basic(self, input_text, expected, style_kwarg):
        """get_pinyin returns the expected list for each text/style pair."""
        actual = get_pinyin(input_text, **style_kwarg)
        assert actual == expected
|
||
|
||
class TestMatchPinyin:
    """Checks for match_pinyin: tone handling, length validation, script variants."""

    @pytest.mark.parametrize(
        ("text1", "text2", "expected", "with_tone", "spoken_tone"),
        [
            ("李浩", "理好", True, False, False),
            ("李浩", "理好", False, True, True),
            ("妈妈", "马麻", True, False, False),
            ("是", "市", True, True, False),
            ("重庆", "重慶", True, False, True),
        ],
    )
    def test_match_cases(self, text1, text2, expected, with_tone, spoken_tone):
        """match_pinyin returns the expected verdict for each flag combination."""
        outcome = match_pinyin(text1, text2, with_tone=with_tone, spoken_tone=spoken_tone)
        assert outcome == expected

    def test_length_mismatch(self):
        """Texts of different lengths are rejected with ValueError."""
        with pytest.raises(ValueError):
            match_pinyin("你好", "你好吗")

    @pytest.mark.parametrize(
        ("text1", "text2"),
        [
            ("银行", "銀行"),  # Simplified vs. Traditional spelling of the same word
            ("发现", "髮現"),  # Traditional 髮 is expected to read like 发 here
        ],
    )
    def test_heteronym_matching(self, text1, text2):
        """Simplified and Traditional spellings match when tones are ignored."""
        matched = match_pinyin(text1, text2, with_tone=False)
        assert matched
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
"""Test the segmentation module.""" | ||
|
||
import pytest | ||
|
||
from stringpod.segmentation import segment_text | ||
|
||
|
||
class TestSegmentText:
    """Checks for segment_text on short sentences."""

    @pytest.mark.parametrize(
        ("text", "expected"),
        [
            ("你好,世界!", ["你好", ",", "世界", "!"]),
            ("我爱北京天安门", ["我", "爱", "北京", "天安门"]),
        ],
    )
    def test_segment_text(self, text, expected):
        """segment_text yields the expected segments, punctuation included."""
        segments = segment_text(text)
        assert segments == expected