-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkatakana.py
32 lines (25 loc) · 825 Bytes
/
katakana.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import MeCab
import unidic
import pandas as pd
import alkana
import re
import sys
# Re-open the existing stdout file descriptor as a text stream that encodes
# output as UTF-8 and is line-buffered (buffering=1). Presumably this is so
# that printed Japanese/katakana text is not subject to the console's default
# encoding -- confirm against the environments this script runs in.
sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf8', buffering=1)
# One or more ASCII letters, anchored at both ends.
alphaReg = re.compile(r'^[a-zA-Z]+$')


def isalpha(s: str) -> bool:
    """Return True if *s* consists only of ASCII letters (a-z, A-Z).

    Unlike ``str.isalpha``, non-ASCII letters such as kana or kanji are
    rejected. The empty string is rejected as well.

    Args:
        s: The string to test.

    Returns:
        True when every character of *s* is an ASCII letter and *s* is not
        empty, otherwise False.
    """
    # fullmatch() instead of match(): with match(), the ``$`` anchor also
    # succeeds just before a trailing newline, so a string such as "abc"
    # followed by a newline would wrongly be accepted.
    return alphaReg.fullmatch(s) is not None
def katakana_converter(text):
    """Replace English words in *text* with their katakana readings.

    The text is tokenised with MeCab in wakati (space-separated) mode. Every
    token made up solely of ASCII letters is looked up with
    ``alkana.get_kana``, and occurrences of that word in *text* are replaced
    by the reading. Words for which no string reading is returned are left
    unchanged.

    Args:
        text: The input string, typically Japanese text containing English
            words.

    Returns:
        *text* with the convertible English words replaced by katakana.
        Empty input is returned as is.
    """
    if not text:
        return text

    wakati = MeCab.Tagger('-Owakati')
    # split() with no argument splits on any whitespace, so the newline that
    # terminates MeCab's output never stays glued to the last token.
    tokens = wakati.parse(text).split()

    # dict.fromkeys() removes duplicates while keeping first-seen order.
    english_words = list(dict.fromkeys(t for t in tokens if isalpha(t)))
    if not english_words:
        return text

    readings = {}
    for word in english_words:
        kana = alkana.get_kana(word)
        # A word without a usable reading (presumably None for words missing
        # from alkana's dictionary) maps to itself. Keeping it in the table
        # stops a shorter word from being replaced inside it.
        readings[word] = kana if isinstance(kana, str) else word

    # Substitute in a single pass, trying longer words first, so that e.g.
    # "cat" is never rewritten inside "category" and already-converted text
    # is never scanned again.
    longest_first = sorted(english_words, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(word) for word in longest_first))
    return pattern.sub(lambda m: readings[m.group(0)], text)