Skip to content

Commit

Permalink
Merge pull request #2 from jyyyeung/feat/pinyin
Browse files Browse the repository at this point in the history
Feat/pinyin
  • Loading branch information
jyyyeung authored Feb 22, 2025
2 parents a421d87 + ed7f31c commit 7cf2820
Show file tree
Hide file tree
Showing 12 changed files with 265 additions and 1 deletion.
4 changes: 4 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI
on:
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
build:
name: Build distribution 📦
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

name: dev workflow

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

# Controls when the action will run.
on:
# Triggers the workflow on push or pull request events but only for the master branch
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/preview.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ on:
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
publish_dev_build:
Expand Down
11 changes: 11 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,14 @@ True
>>> number_parser("一萬三", language="zh")
130000
```

## Compare using Pinyin

```bash
>>> stringpod cmp-pinyin "你好" "你号"
True # Default: 忽略聲調
>>> stringpod cmp-pinyin "你好" "你号" -t
False # 考慮聲調
>>> stringpod cmp-pinyin "你好" "你号" -s
False # 考慮聲調,使用口語變調
```
23 changes: 22 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ bump2version = { version = "^1.0.1", optional = true }
word2number = "^1.1"
number-parser = "^0.3.2"
langdetect = "^1.0.9"
pypinyin = "^0.53.0"
jieba = "^0.42.1"

[tool.poetry.scripts]
stringpod = 'stringpod.cli:main'
Expand Down
30 changes: 30 additions & 0 deletions stringpod/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from stringpod.normalizer import Normalizer, NormalizerOptions
from stringpod.number import to_number, to_number_with_language
from stringpod.pinyin import match_pinyin
from stringpod.stringpod import contains_substring


Expand Down Expand Up @@ -77,6 +78,35 @@ def number(text: str, language: str):
click.echo(f"Result: {result}")


@main.command()
@click.argument("text1", type=str)
@click.argument("text2", type=str)
@click.option('-t', "--with-tone", is_flag=True, help="Whether to take tone into account", default=False)
@click.option('-s', "--use-spoken-tone", is_flag=True, help="Whether to use the spoken tone", default=False)
def cmp_pinyin(text1: str, text2: str, with_tone: bool, use_spoken_tone: bool):
    """Compare the pinyin of two texts.

    If --use-spoken-tone is set, --with-tone is implied.

    \b
    Examples (only the final "Result" line of the output is shown):
      $ stringpod cmp-pinyin "你好" "你好"
      Result: True
      $ stringpod cmp-pinyin "你好" "你号"
      Result: True    (default: tones are ignored)
      $ stringpod cmp-pinyin "你好" "你号" -t
      Result: False   (tones compared, no tone sandhi)
      $ stringpod cmp-pinyin "你好" "你号" -s
      Result: False   (tones compared, with spoken tone sandhi)

    Args:
        text1: First text to compare.
        text2: Second text to compare; must have the same length as text1.
        with_tone: Compare tones as well as syllables.
        use_spoken_tone: Apply spoken tone sandhi before comparing.

    Raises:
        click.ClickException: If match_pinyin rejects the inputs (texts of
            different lengths).
    """
    # --use-spoken-tone implies --with-tone.
    if use_spoken_tone:
        with_tone = True

    click.echo(f"Text1: {text1}")
    click.echo(f"Text2: {text2}")
    click.echo(f"With tone: {with_tone}")
    click.echo(f"Use spoken tone: {use_spoken_tone}")
    try:
        result = match_pinyin(text1, text2, with_tone, use_spoken_tone)
    except ValueError as err:
        # Bad user input: report it as a normal CLI error rather than a traceback.
        raise click.ClickException(str(err)) from err
    click.echo(f"Result: {result}")


if __name__ == "__main__":
import doctest

Expand Down
11 changes: 11 additions & 0 deletions stringpod/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from langdetect import DetectorFactory, detect_langs
from langdetect.language import Language
from opencc import OpenCC

DetectorFactory.seed = 0

Expand Down Expand Up @@ -82,3 +83,13 @@ def detect_language(text: str) -> list[Language]:
"zh-cn",
"zh-tw",
]


def to_simplified_chinese(text: str) -> str:
    """Convert a text to simplified Chinese.

    >>> to_simplified_chinese("漢語")
    '汉语'
    >>> to_simplified_chinese("你好,世界!")
    '你好,世界!'

    Args:
        text (str): The text to convert.

    Returns:
        str: The text converted with OpenCC's ``t2s.json`` configuration.
    """
    converter = OpenCC("t2s.json")
    return converter.convert(text)
82 changes: 82 additions & 0 deletions stringpod/pinyin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""Pinyin module for StringPod."""

import logging

from pypinyin import Style, lazy_pinyin

from stringpod.language import to_simplified_chinese

logger = logging.getLogger(__name__)


def get_pinyin(text: str, **kwargs) -> list[str]:
    """Get the pinyin of a text.

    Thin wrapper around ``pypinyin.lazy_pinyin``; without a ``style`` keyword
    the result carries no tone information.

    >>> get_pinyin("李浩")
    ['li', 'hao']
    >>> get_pinyin("我爱北京天安门", style=Style.TONE)
    ['wǒ', 'ài', 'běi', 'jīng', 'tiān', 'ān', 'mén']
    >>> get_pinyin("我爱北京天安门", style=Style.TONE3)
    ['wo3', 'ai4', 'bei3', 'jing1', 'tian1', 'an1', 'men2']

    Reference: https://github.com/mozillazg/python-pinyin

    Args:
        text (str): The text to get the pinyin of.
        **kwargs: Additional keyword arguments forwarded to ``lazy_pinyin``
            (for example ``style`` or ``tone_sandhi``).

    Returns:
        list[str]: The list returned by ``lazy_pinyin`` for ``text``.
    """
    return lazy_pinyin(text, **kwargs)


def match_pinyin(text1: str, text2: str, with_tone: bool = False, spoken_tone: bool = False) -> bool:
    """Check whether two texts have the same pinyin.

    Both texts are converted to simplified Chinese before their pinyin is
    looked up, so simplified and traditional spellings compare equal.

    >>> match_pinyin("李浩", "理好", with_tone=False)
    True
    >>> match_pinyin("李浩", "理好", with_tone=True)
    False

    Args:
        text1 (str): The first text to compare.
        text2 (str): The second text to compare.
        with_tone (bool, optional): Whether to include the tone in the pinyin. Defaults to False.
        spoken_tone (bool, optional): Whether to use the spoken tone (tone sandhi). Defaults to False.

    Returns:
        bool: True if the pinyin of text1 matches the pinyin of text2, False otherwise.

    Raises:
        ValueError: If text1 and text2 do not have the same length.
    """
    if len(text1) != len(text2):
        raise ValueError("The length of text1 and text2 must be the same.")

    style = Style.TONE3 if with_tone else Style.NORMAL
    tone_sandhi = bool(spoken_tone)

    # Use simplified Chinese as the common script for the pinyin lookup.
    pinyin1 = get_pinyin(to_simplified_chinese(text1), style=style, tone_sandhi=tone_sandhi)
    pinyin2 = get_pinyin(to_simplified_chinese(text2), style=style, tone_sandhi=tone_sandhi)
    logger.debug("pinyin1: %s, pinyin2: %s", pinyin1, pinyin2)

    # Compare the lists as a whole. Equal-length texts can still produce pinyin
    # lists of different lengths (pypinyin keeps a run of non-Chinese characters
    # as one item), and an index loop over pinyin1 would then raise IndexError
    # or silently ignore the extra items of pinyin2.
    return pinyin1 == pinyin2
17 changes: 17 additions & 0 deletions stringpod/segmentation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Segmentation of Chinese text."""

import jieba


def segment_text(text: str) -> list[str]:
    """Split the text into a list of words using jieba.

    >>> segment_text("你好,世界!")
    ['你好', ',', '世界', '!']
    >>> segment_text("我爱北京天安门")
    ['我', '爱', '北京', '天安门']

    Reference: https://github.com/fxsjy/jieba
    """
    # cut_all=False is passed explicitly; paddle mode is not enabled here.
    tokens = jieba.cut(text, cut_all=False)
    return list(tokens)
58 changes: 58 additions & 0 deletions tests/test_pinyin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Test the pinyin module."""

import pytest
from pypinyin import Style

from stringpod.pinyin import get_pinyin, match_pinyin


class TestGetPinyin:
    """Checks for get_pinyin."""

    # Each case: (text, expected syllables, keyword arguments passed to get_pinyin).
    BASIC_CASES = [
        ("李浩", ["li", "hao"], {}),
        ("你好", ["ni", "hao"], {}),
        ("重庆", ["chong", "qing"], {"style": Style.NORMAL}),
        ("重庆", ["chóng", "qìng"], {"style": Style.TONE}),
        ("重庆", ["chong2", "qing4"], {"style": Style.TONE3}),
    ]

    @pytest.mark.parametrize(("input_text", "expected", "style_kwarg"), BASIC_CASES)
    def test_get_pinyin_basic(self, input_text, expected, style_kwarg):
        """get_pinyin returns the expected syllables for the default and explicit styles."""
        actual = get_pinyin(input_text, **style_kwarg)
        assert actual == expected


class TestMatchPinyin:
    """Checks for match_pinyin."""

    # Each case: (text1, text2, expected result, with_tone, spoken_tone).
    MATCH_CASES = [
        ("李浩", "理好", True, False, False),
        ("李浩", "理好", False, True, True),
        ("妈妈", "马麻", True, False, False),
        ("是", "市", True, True, False),
        ("重庆", "重慶", True, False, True),
    ]

    # Pairs that should compare equal once tones are ignored.
    SAME_SOUND_PAIRS = [
        ("银行", "銀行"),  # Simplified and traditional spelling of the same word
        ("发现", "髮現"),  # 髮 also converts to the simplified character 发
    ]

    @pytest.mark.parametrize(
        ("text1", "text2", "expected", "with_tone", "spoken_tone"),
        MATCH_CASES,
    )
    def test_match_cases(self, text1, text2, expected, with_tone, spoken_tone):
        """match_pinyin gives the expected answer for each tone setting."""
        outcome = match_pinyin(text1, text2, with_tone, spoken_tone)
        assert outcome == expected

    def test_length_mismatch(self):
        """match_pinyin raises ValueError when the two texts differ in length."""
        with pytest.raises(ValueError):
            match_pinyin("你好", "你好吗")

    @pytest.mark.parametrize(("text1", "text2"), SAME_SOUND_PAIRS)
    def test_heteronym_matching(self, text1, text2):
        """Texts that differ only in script match when tones are ignored."""
        matched = match_pinyin(text1, text2, with_tone=False)
        assert matched
20 changes: 20 additions & 0 deletions tests/test_segmentation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""Test the segmentation module."""

import pytest

from stringpod.segmentation import segment_text


class TestSegmentText:
    """Checks for segment_text."""

    # Each case: (text, expected list of segments).
    SEGMENT_CASES = [
        ("你好,世界!", ["你好", ",", "世界", "!"]),
        ("我爱北京天安门", ["我", "爱", "北京", "天安门"]),
    ]

    @pytest.mark.parametrize(("text", "expected"), SEGMENT_CASES)
    def test_segment_text(self, text, expected):
        """segment_text splits each sample into the expected segments."""
        segments = segment_text(text)
        assert segments == expected

0 comments on commit 7cf2820

Please sign in to comment.