Skip to content

Commit

Permalink
Use Janome and NEologd instead of pykakasi
Browse files Browse the repository at this point in the history
Related to #32
Close #31
  • Loading branch information
ensan-hcl authored Nov 8, 2020
1 parent 3382036 commit 669f945
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 19 deletions.
4 changes: 4 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
python3 -m pip install --no-cache-dir -r ./requirements.txt && \
python3 -m pip check

RUN wget -q https://github.com/peaceiris/emoji-ime-dictionary/releases/download/v2.2.1/Janome-0.4.1-neologd-20200910.tar.gz && \
python3 -m pip install --no-cache-dir --no-compile Janome-0.4.1-neologd-20200910.tar.gz && \
rm Janome-0.4.1-neologd-20200910.tar.gz

RUN wget -q 'https://raw.githubusercontent.com/yagays/emoji-ja/20190726/data/emoji_ja.json' \
-O /root/emoji_ja.json

Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
.PHONY: build
build:
docker-compose run --rm -T dev python main.py
COMPOSE_DOCKER_CLI_BUILD=1 docker-compose run --rm -T dev python main.py

.PHONY: test
test:
Expand Down
28 changes: 20 additions & 8 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import urllib.request
import json
from janome.tokenizer import Tokenizer
from pykakasi import kakasi
import re

from pykakasi import kakasi, wakati

t = Tokenizer()

kakasi = kakasi()
kakasi.setMode("J","H")
Expand All @@ -11,13 +12,24 @@
conv_k2h = kakasi.getConverter()


def hiraganafy(keyword):
def hiraganafy_v1(keyword):
    """Return *keyword* converted with the pykakasi converters only.

    The keyword is upper-cased, passed through ``conv_j2h`` and the result
    is then passed through ``conv_k2h``.
    """
    upper = keyword.upper()
    return conv_k2h.do(conv_j2h.do(upper))


def hiraganafy(keyword):
    """Return *keyword* converted to hiragana via Janome token readings.

    The upper-cased keyword is tokenized with the module-level Janome
    tokenizer ``t``. Each token contributes its reading, or its surface
    form when the reading is reported as ``'*'``. The concatenated result
    is passed through ``conv_k2h`` (presumably the katakana-to-hiragana
    converter; confirm against the module setup).

    As a side effect, whenever the result differs from
    ``hiraganafy_v1(keyword)`` a Markdown table row showing the keyword and
    both conversions is printed.
    """
    pieces = []
    for token in t.tokenize(keyword.upper()):
        if token.reading == '*':
            # No reading available for this token: keep the text as written.
            pieces.append(token.surface)
        else:
            pieces.append(token.reading)
    hiragana = conv_k2h.do(''.join(pieces))

    # Compare with the previous implementation and report any difference.
    legacy = hiraganafy_v1(keyword)
    if hiragana != legacy:
        print(f'| {keyword} | {hiragana} | {legacy} |')
    return hiragana


def add_word_to_dict(emoji, keyword, emoji_dict):
valid_keyword = keyword.replace('ゔ', 'う゛')
word = f':{valid_keyword}\t{emoji}\t記号\t'
Expand Down Expand Up @@ -46,14 +58,14 @@ def create_emoji_dict(self) -> None:
for k in self.emoji_json[emoji]['keywords']:
if k.isalpha() is False:
continue
k = hiraganafy(k)
add_word_to_dict(emoji, k, self.emoji_dict)
hiragana = hiraganafy(k)
add_word_to_dict(emoji, hiragana, self.emoji_dict)

k = self.emoji_json[emoji]['short_name']
if k.isalpha() is False:
continue
k = hiraganafy(k)
add_word_to_dict(emoji, k, self.emoji_dict)
hiragana = hiraganafy(k)
add_word_to_dict(emoji, hiragana, self.emoji_dict)

self.emoji_dict.sort()
self.emoji_dict = '\n'.join(self.emoji_dict)
Expand Down
20 changes: 10 additions & 10 deletions main_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,12 @@ def test_hiraganafy(self):
{'in': '営マーク', 'out': 'えいまーく'},
{'in': '営業中', 'out': 'えいぎょうちゅう'},
{'in': '営業日', 'out': 'えいぎょうび'},
]
for t in test_cases:
self.assertEqual(main.hiraganafy(t['in']), t['out'])

def test_hiraganafy_todo(self):
test_cases = [
{'in': 'しかめ面', 'out': 'しかめっつら'},
{'in': 'しかめ面の人', 'out': 'しかめつらのひと'},
{'in': 'バルカン人', 'out': 'ばるかんじん'},
{'in': 'フェイスマッサージ中の人', 'out': 'ふぇいすまっさーじちゅうのひと'},
{'in': '水球をする人', 'out': 'すいきゅうをするひと'},
{'in': '人のシルエット', 'out': 'ひとのしるえっと'},
{'in': '話す人のシルエット', 'out': 'はなすひとのしるえっと'},
{'in': '2人のシルエット', 'out': 'ふたりのしるえっと'},
{'in': '六芒星', 'out': 'ろくぼうせい'},
{'in': '六角星', 'out': 'ろっかくせい'},
{'in': '波乗り', 'out': 'なみのり'},
{'in': 'きらきら星', 'out': 'きらきらぼし'},
{'in': 'くす玉', 'out': 'くすだま'},
Expand All @@ -40,6 +30,16 @@ def test_hiraganafy_todo(self):
{'in': '介助犬', 'out': 'かいじょけん'},
{'in': '筋トレ', 'out': 'きんとれ'},
]
for t in test_cases:
self.assertEqual(main.hiraganafy(t['in']), t['out'])

def test_hiraganafy_todo(self):
    """Check that hiraganafy does not yet yield these readings.

    Each pair is (input keyword, reading listed as the target); the test
    asserts that the current output differs from that reading.
    """
    pending = (
        ('しかめ面', 'しかめっつら'),
        ('しかめ面の人', 'しかめつらのひと'),
        ('フェイスマッサージ中の人', 'ふぇいすまっさーじちゅうのひと'),
        ('六角星', 'ろっかくせい'),
    )
    for keyword, target in pending:
        self.assertNotEqual(main.hiraganafy(keyword), target)

Expand Down

0 comments on commit 669f945

Please sign in to comment.