Merge pull request #2 from jyyyeung/feat/pinyin

Feat/pinyin
jyyyeung · Feb 22, 2025 · 7cf2820 · 7cf2820
2 parents a421d87 + ed7f31c
commit 7cf2820
Show file tree

Hide file tree

Showing 12 changed files with 265 additions and 1 deletion.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -3,6 +3,10 @@ name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI
 on:
   workflow_dispatch:
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
   build:
     name: Build distribution 📦

diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml
@@ -2,6 +2,10 @@
 
 name: dev workflow
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 # Controls when the action will run.
 on:
   # Triggers the workflow on push or pull request events but only for the master branch

diff --git a/.github/workflows/preview.yml b/.github/workflows/preview.yml
@@ -11,6 +11,10 @@ on:
   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:
   publish_dev_build:

diff --git a/docs/usage.md b/docs/usage.md
@@ -61,3 +61,14 @@ True
 >>> number_parser("一萬三", language="zh")
 130000
 ```
+
+## Compare using Pinyin
+
+```bash
+>>> stringpod cmp-pinyin "你好" "你号"
+True # Default: 忽略聲調
+>>> stringpod cmp-pinyin "你好" "你号" -t
+False # 考慮聲調
+>>> stringpod cmp-pinyin "你好" "你号" -s
+False # 考慮聲調，使用口語變調
+```
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -49,6 +49,8 @@ bump2version = { version = "^1.0.1", optional = true }
 word2number = "^1.1"
 number-parser = "^0.3.2"
 langdetect = "^1.0.9"
+pypinyin = "^0.53.0"
+jieba = "^0.42.1"
 
 [tool.poetry.scripts]
 stringpod = 'stringpod.cli:main'

diff --git a/stringpod/cli.py b/stringpod/cli.py
@@ -4,6 +4,7 @@
 
 from stringpod.normalizer import Normalizer, NormalizerOptions
 from stringpod.number import to_number, to_number_with_language
+from stringpod.pinyin import match_pinyin
 from stringpod.stringpod import contains_substring
 
 
@@ -77,6 +78,35 @@ def number(text: str, language: str):
     click.echo(f"Result: {result}")
 
 
+@main.command()
+@click.argument("text1", type=str)
+@click.argument("text2", type=str)
+@click.option('-t', "--with-tone", is_flag=True, help="Whether to take tone into account", default=False)
+@click.option('-s', "--use-spoken-tone", is_flag=True, help="Whether to use the spoken tone", default=False)
+def cmp_pinyin(text1: str, text2: str, with_tone: bool, use_spoken_tone: bool):
+    """Compare the pinyin of two texts.
+
+    If --use-spoken-tone is set, --with-tone will be set to True.
+
+    >>> stringpod cmp-pinyin "你好" "你好"
+    True
+    >>> stringpod cmp-pinyin "你好" "你号"
+    True # Default: 忽略聲調
+    >>> stringpod cmp-pinyin "你好" "你号" -t
+    False # 考慮聲調，忽略變調
+    >>> stringpod cmp-pinyin "你好" "你号" -s
+    False # 考慮聲調，使用口語變調
+    """
+    if use_spoken_tone:
+        with_tone = True
+
+    click.echo(f"Text1: {text1}")
+    click.echo(f"Text2: {text2}")
+    click.echo(f"With tone: {with_tone}")
+    click.echo(f"Use spoken tone: {use_spoken_tone}")
+    click.echo(f"Result: {match_pinyin(text1, text2, with_tone, use_spoken_tone)}")
+
+
 if __name__ == "__main__":
     import doctest
 

diff --git a/stringpod/language.py b/stringpod/language.py
@@ -2,6 +2,7 @@
 
 from langdetect import DetectorFactory, detect_langs
 from langdetect.language import Language
+from opencc import OpenCC
 
 DetectorFactory.seed = 0
 
@@ -82,3 +83,13 @@ def detect_language(text: str) -> list[Language]:
     "zh-cn",
     "zh-tw",
 ]
+
+
+def to_simplified_chinese(text: str) -> str:
+    """Convert a text to simplified Chinese.
+
+    >>> to_simplified_chinese("你好，世界！")
+    "你好，世界！"
+    """
+    opencc = OpenCC("t2s.json")
+    return opencc.convert(text)
diff --git a/stringpod/pinyin.py b/stringpod/pinyin.py
@@ -0,0 +1,82 @@
+"""Pinyin module for StringPod."""
+
+import logging
+
+from pypinyin import Style, lazy_pinyin
+
+from stringpod.language import to_simplified_chinese
+
+logger = logging.getLogger(__name__)
+
+
+def get_pinyin(text: str, **kwargs) -> list[str]:
+    """Get the pinyin of a text.
+
+    >>> get_pinyin("李浩")
+    ['lǐ', 'hào']
+    >>> get_pinyin("我爱北京天安门", style=Style.TONE3)
+    ['wǒ', 'ài', 'běi', 'jīng', 'tiān', 'ān', 'mén']
+
+    Reference: https://github.com/mozillazg/python-pinyin
+
+    Args:
+        text (str): The text to get the pinyin of.
+        **kwargs: Additional keyword arguments for the pinyin function.
+    """
+    pinyin_list = lazy_pinyin(text, **kwargs)
+    return pinyin_list
+
+
+def match_pinyin(text1: str, text2: str, with_tone: bool = False, spoken_tone: bool = False) -> bool:
+    """Match the pinyin of a text with a pinyin string.
+
+    >>> match_pinyin("李浩", "理好", with_tone=False)
+    True
+    >>> match_pinyin("李浩", "理好", with_tone=True)
+    False
+
+    Args:
+        text1 (str): The text to match.
+        text2 (str): The pinyin string to match.
+        with_tone (bool, optional): Whether to include the tone in the pinyin. Defaults to False.
+        spoken_tone (bool, optional): Whether to use the spoken tone. Defaults to False.
+
+    Returns:
+        bool: True if the pinyin of text1 matches the pinyin of text2, False otherwise.
+    """
+    if len(text1) != len(text2):
+        raise ValueError("The length of text1 and text2 must be the same.")
+
+    style = Style.TONE3 if with_tone else Style.NORMAL
+    tone_sandhi = bool(spoken_tone)
+
+    # 以簡體中文為標准轉拼音
+    text1_cn = to_simplified_chinese(text1)
+    text2_cn = to_simplified_chinese(text2)
+
+    # 获取拼音
+    pinyin1 = get_pinyin(text1_cn, style=style, tone_sandhi=tone_sandhi)
+    pinyin2 = get_pinyin(text2_cn, style=style, tone_sandhi=tone_sandhi)
+    logger.debug("pinyin1: %s, pinyin2: %s", pinyin1, pinyin2)
+
+    length = len(pinyin1)
+
+    for i in range(length):
+        logger.debug("pinyin1[i]: %s, pinyin2[i]: %s, %s", pinyin1[i], pinyin2[i], pinyin1[i] == pinyin2[i])
+        if pinyin1[i] != pinyin2[i]:
+            return False
+
+        # # Character i
+        # char_list1 = pinyin1[i]
+        # char_list2 = pinyin2[i]
+
+        # char_py_matched = False
+        # # Ensure that at least one character in char_list1 is in char_list2
+        # for py1 in char_list1:
+        #     if py1 in char_list2:
+        #         char_py_matched = True
+        #         break
+
+        # if not char_py_matched:
+        #     return False
+    return True
diff --git a/stringpod/segmentation.py b/stringpod/segmentation.py
@@ -0,0 +1,17 @@
+"""Segmentation of Chinese text."""
+
+import jieba
+
+
+def segment_text(text: str) -> list[str]:
+    """Segment the text into characters.
+
+    >>> segment_text("你好，世界！")
+    ['你好', '，', '世界', '！']
+    >>> segment_text("我爱北京天安门")
+    ['我', '爱', '北京', '天安门']
+
+    Reference: https://github.com/fxsjy/jieba
+    """
+    # jieba.enable_paddle()
+    return list(jieba.cut(text, cut_all=False))
diff --git a/tests/test_pinyin.py b/tests/test_pinyin.py
@@ -0,0 +1,58 @@
+"""Test the pinyin module."""
+
+import pytest
+from pypinyin import Style
+
+from stringpod.pinyin import get_pinyin, match_pinyin
+
+
+class TestGetPinyin:
+    """Test the get_pinyin function."""
+
+    @pytest.mark.parametrize(
+        "input_text, expected, style_kwarg",
+        [
+            ("李浩", ['li', 'hao'], {}),
+            ("你好", ['ni', 'hao'], {}),
+            ("重庆", ['chong', 'qing'], {"style": Style.NORMAL}),
+            ("重庆", ['chóng', 'qìng'], {"style": Style.TONE}),
+            ("重庆", ['chong2', 'qing4'], {"style": Style.TONE3}),
+        ],
+    )
+    def test_get_pinyin_basic(self, input_text, expected, style_kwarg):
+        """Test the get_pinyin function with basic cases."""
+        assert get_pinyin(input_text, **style_kwarg) == expected
+
+
+class TestMatchPinyin:
+    """Test the match_pinyin function."""
+
+    @pytest.mark.parametrize(
+        "text1, text2, expected, with_tone, spoken_tone",
+        [
+            ("李浩", "理好", True, False, False),
+            ("李浩", "理好", False, True, True),
+            ("妈妈", "马麻", True, False, False),
+            ("是", "市", True, True, False),
+            ("重庆", "重慶", True, False, True),
+        ],
+    )
+    def test_match_cases(self, text1, text2, expected, with_tone, spoken_tone):
+        """Test the match_pinyin function with different cases."""
+        assert match_pinyin(text1, text2, with_tone, spoken_tone) == expected
+
+    def test_length_mismatch(self):
+        """Test the match_pinyin function with different length of text1 and text2."""
+        with pytest.raises(ValueError):
+            match_pinyin("你好", "你好吗")
+
+    @pytest.mark.parametrize(
+        "text1, text2",
+        [
+            ("银行", "銀行"),  # Different characters but same pronunciation
+            ("发现", "髮現"),  # Homophone in some contexts
+        ],
+    )
+    def test_heteronym_matching(self, text1, text2):
+        """Test the match_pinyin function with heteronym matching."""
+        assert match_pinyin(text1, text2, with_tone=False)
diff --git a/tests/test_segmentation.py b/tests/test_segmentation.py
@@ -0,0 +1,20 @@
+"""Test the segmentation module."""
+
+import pytest
+
+from stringpod.segmentation import segment_text
+
+
+class TestSegmentText:
+    """Test the segment_text function."""
+
+    @pytest.mark.parametrize(
+        "text, expected",
+        [
+            ("你好，世界！", ['你好', '，', '世界', '！']),
+            ("我爱北京天安门", ['我', '爱', '北京', '天安门']),
+        ],
+    )
+    def test_segment_text(self, text, expected):
+        """Test the segment_text function with basic cases."""
+        assert segment_text(text) == expected