-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnormalizer.py
165 lines (140 loc) · 7.81 KB
/
normalizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import re
from unicodedata import normalize
import pandas as pd
import visen
from time import time
# Bracket-like characters that get rewritten to '(' / ')' so that one regex can later drop
# every bracketed segment. Repeated entries are harmless: the lists only feed the dict
# comprehensions below, which de-duplicate keys.
# The closing corner bracket '」' used to be listed here as well; because the opening map is
# applied first, it was turned into '(' and a '「...」' segment was never recognised as a pair.
opening_ls = ['[', '{', '⁅', '〈', '⎡', '⎢', '⎣', '⎧', '⎨', '⎩', '❬', '❰', '❲', '❴', '⟦', '⟨', '⟪', '⟬', '⦃', '⦇', '⦉',
              '⦋', '⦍', '⦏', '⦑', '⦓', '⦕', '⦗', '⧼', '⸂', '⸄', '⸉', '⸌', '⸜', '⸢', '⸤', '⸦', '〈', '《', '「', '『',
              '【', '〔', '〖', '〘', '〚', '﹛', '﹝', '[', '{', '「']
closing_ls = [']', '}', '⁆', '〉', '⎤', '⎥', '⎦', '⎫', '⎬', '⎭', '❭', '❱', '❳', '❵', '⟧', '⟩', '⟫', '⟭', '⦄', '⦈', '⦊',
              '⦌', '⦎', '⦐', '⦒', '⦔', '⦖', '⦘', '⧽', '⸃', '⸅', '⸊', '⸍', '⸝', '⸣', '⸥', '⸧', '〉', '》', '」', '』',
              '】', '〕', '〗', '〙', '〛', '﹜', '﹞', ']', '}', '」']

# Literal lookup tables (used with str.replace) ...
opening_bracket = {key: '(' for key in opening_ls}
closing_bracket = {key: ')' for key in closing_ls}
# ... and backslash-escaped variants whose keys are used as regex patterns.
opening_bracket_pattern = {f"\\{key}": "(" for key in opening_ls}
closing_bracket_pattern = {f"\\{key}": ")" for key in closing_ls}

# Characters treated as separators by remove_punc. The string deliberately contains a literal
# backslash before ']' (written as '\\' so it is a valid escape sequence).
PUNC = '!"#$&()*+,-–−./:;=?@[\\]^_`{|}~”“`°²ˈ‐ㄧ‛∼’'  # remove <> for number_sym and unknown_sym

# Regex patterns are raw strings so that '\-', '\d' and '\w' reach the regex engine unchanged
# without relying on Python's handling of invalid string escapes.
re_num_and_decimal = r'[0-9]*[,.\-]*[0-9]*[,.\-]*[0-9]*[.,\-]*[0-9]*[,.\-]*[0-9]+[.,]?'
re_unknown = r'[a-z]+[\d]+[\w]*|[\d]+[a-z]+[\w]*'
# Matches any single character that is NOT an ASCII letter/digit, a listed Vietnamese letter,
# whitespace, '|' or '_'.
re_vnese_txt = r'[^a-z0-9A-ZàáãạảăắằẳẵặâấầẩẫậèéẹẻẽêềếểễệđìíĩỉịòóõọỏôốồổỗộơớờởỡợùúũụủưứừửữựỳỵỷỹýÀÁÃẠẢĂẮẰẲẴẶÂẤẦẨẪẬÈÉ' \
               r'ẸẺẼÊỀẾỂỄỆĐÌÍĨỈỊÒÓÕỌỎÔỐỒỔỖỘƠỚỜỞỠỢÙÚŨỤỦƯỨỪỬỮỰỲỴỶỸÝ\s|_]'

# Typographic quotes folded to their ASCII counterparts.
# NOTE(review): the second key is an empty string as written, which makes that entry a no-op in
# str.replace; presumably a character was lost here - confirm against the original file.
special_punc = {'”': '"', '': '', "’": "'", "`": "'"}
def replace_all(replacer: dict, txt: str) -> str:
    """
    Apply every substitution of a mapping to a string, one after another.
    Substitutions run in the mapping's iteration order, so a later entry also sees
    text produced by an earlier one.
    :param replacer: mapping from the substring to look for to the substring to put in its place
    :param txt: subject string
    :return: the string once all substitutions have been applied
    """
    result = txt
    for target in replacer:
        result = result.replace(target, replacer[target])
    return result
def replace_num(txt: str) -> str:
    """
    Delete every match of the module-level number pattern (re_num_and_decimal) from the text.
    :param txt: subject string
    :return: the string with all matches removed
    """
    return re.sub(re_num_and_decimal, '', txt)
def replace_unknown(text: str) -> str:
    """
    Delete every match of the module-level re_unknown pattern (letters mixed with digits) from the text.
    :param text: subject string
    :return: the string with all matches removed
    """
    return re.sub(re_unknown, '', text)
def unicode_normalizer(text, forms: list = None) -> str:
    """
    Apply a sequence of Unicode normalization forms to the text, one after another.
    :param text: subject string
    :param forms: normalization form names applied in the given order; when omitted,
                  NFKC, NFKD, NFC and NFD are applied in that order
    :return: string after the last normalization
    :raises ValueError: if a form name is not accepted by unicodedata.normalize
    """
    if forms is None:
        # The former default list spelled 'NFKD' as 'NKFD', which made every call relying on
        # the default fail; an immutable tuple also avoids a shared mutable default argument.
        forms = ('NFKC', 'NFKD', 'NFC', 'NFD')
    for form in forms:
        text = normalize(form, text)
    return text
def normalize_bracket(text: str) -> str:
    """
    Remove bracketed segments from the text.
    All known opening/closing bracket characters are first rewritten to '(' and ')',
    then each shortest '(' ... ')' span (brackets included) is deleted.
    :param text: subject string
    :return: string without the bracketed segments
    """
    # Opening brackets must be rewritten before closing ones, as in the lookup tables' design.
    for mapping in (opening_bracket, closing_bracket):
        text = replace_all(mapping, text)
    return re.sub(r"[\(\[].*?[\)\]]", "", text)
def remove_punc(text: str) -> str:
    """
    Strip punctuation by splitting the text on runs of whitespace and PUNC characters.
    :param text: subject string
    :return: the non-empty pieces joined back together with single spaces
    """
    separator = re.compile(r'[\s{}]+'.format(re.escape(PUNC)))
    tokens = separator.split(text)
    # Splitting yields empty strings at the edges when the text starts/ends with a separator.
    return ' '.join(filter(None, tokens))
def norm(text: str) -> str:
    """
    Normalize a single text sample.
    Steps, in order: stringify and lower-case, keep only the first line, normalize unicode,
    fold special quotes, drop bracketed segments, drop letter/digit mixes, drop numbers,
    drop punctuation, drop characters outside the allowed Vietnamese/ASCII set, strip,
    and finally pass the result through visen.clean_tone.
    :param text: subject string (any object is accepted; it is converted with str())
    :return: normalized string as returned by visen.clean_tone
    """
    text = str(text)
    text = text.lower()
    text = text.split('\n')[0]
    text = unicode_normalizer(text, ["NFKC", "NFKD", "NFD", "NFC"])
    # Styled capitals (e.g. mathematical bold '𝐒') have no lower-case mapping, so the first
    # lower() leaves them alone and compatibility normalization then turns them into plain
    # upper-case letters. Lower-case again so the later [a-z]-based patterns see them.
    text = text.lower()
    text = replace_all(special_punc, text)
    text = normalize_bracket(text)
    text = replace_unknown(text)
    text = replace_num(text)
    text = remove_punc(text)
    text = re.sub(re_vnese_txt, "", text)
    text = text.strip()
    # NOTE(review): visen.clean_tone is a third-party helper; presumably it standardises
    # Vietnamese tone marks - confirm against the visen documentation.
    return visen.clean_tone(text)
def dataframe_normalize(df: pd.DataFrame = None) -> pd.DataFrame:
    """
    Normalize the 'sample' column of a dataframe in place and return the same dataframe.
    The column is lower-cased, stripped of letter/digit mixes, numbers and bracketed segments,
    unicode-normalized (ending in NFD), filtered down to the allowed character set,
    and finally 'đ' is replaced by 'd'. The elapsed time is printed.
    :param df: Dataframe to normalize; must contain a 'sample' column
    :return: the same dataframe object with its 'sample' column normalized
    :raises AssertionError: if the dataframe has no 'sample' column
    """
    assert 'sample' in df.columns, f"DataFrame with column name 'sample' expected, got {df.columns} "
    begin = time()
    sample = df['sample'].str.lower()
    # Reuse the module-level patterns instead of repeating them as non-raw string literals
    # (the inline copies relied on invalid escape sequences such as '\d' and '\-').
    sample = sample.str.replace(re_unknown, '', regex=True)  # replace all product codes with blank
    sample = sample.str.replace(re_num_and_decimal, '', regex=True)  # replace all numbers with blank
    sample = sample.replace(opening_bracket_pattern, regex=True)  # normalize opening brackets, listed above
    sample = sample.replace(closing_bracket_pattern, regex=True)  # normalize closing brackets, listed above
    # Remove all content inside brackets, eg: (Siêu sale 12/12)
    sample = sample.str.replace(r"[\(\[].*?[\)\]]", '', regex=True)
    # Ending on NFD leaves letters decomposed, so the character filter below also removes
    # the combining marks.
    for form in ('NFKC', 'NFKD', 'NFC', 'NFD'):
        sample = sample.str.normalize(form)
    # Compatibility normalization can introduce plain capitals (e.g. from '𝐒', which has no
    # lower-case mapping of its own), so lower-case once more.
    sample = sample.str.lower()
    # Remove non-Vietnamese text; seems unnecessary right now, might consider removing it later
    # to improve run time.
    sample = sample.str.replace(re_vnese_txt, '', regex=True)
    # Special case where the unicode normalizer does not consider 'đ' a combination of base
    # unicode characters.
    df['sample'] = sample.str.replace('đ', 'd').str.strip()
    print('Total enlapsed time:', time() - begin, 'seconds')
    return df
if __name__ == '__main__':
    # Demo run: the same phrase in various decorated spellings, printed before and after norm().
    sample_txts = [
        '〖𝔰𝔬𝔪𝔢 𝔣𝔞𝔫𝔠𝔶 𝔭𝔯𝔬𝔡𝔲𝔠𝔱 𝔫𝔞𝔪𝔢〗',
        '𝖘𝖔𝖒𝖊 𝖋𝖆𝖓𝖈𝖞 𝖕𝖗𝖔𝖉𝖚𝖈𝖙 𝖓𝖆𝖒𝖊',
        '❰some product code which will overfit this sample❱ 𝓼𝓸𝓶𝓮 𝓯𝓪𝓷𝓬𝔂 𝓹𝓻𝓸𝓭𝓾𝓬𝓽 𝓷𝓪𝓶𝓮',
        '𝓈𝑜𝓂𝑒 𝒻𝒶𝓃𝒸𝓎 𝓅𝓇𝑜𝒹𝓊𝒸𝓉 𝓃𝒶𝓂𝑒',
        '{𝕤𝕠𝕞𝕖 𝕗𝕒𝕟𝕔𝕪 𝕡𝕣𝕠𝕕𝕦𝕔𝕥 𝕟𝕒𝕞𝕖',
        '☯😝some fancy product name☯😝',
        '𝐬𝐨𝐦𝐞 𝐟𝐚𝐧𝐜𝐲 𝐩𝐫𝐨𝐝𝐮𝐜𝐭 𝐧𝐚𝐦𝐞',
        '𝘀𝗼𝗺𝗲 𝗳𝗮𝗻𝗰𝘆 𝗽𝗿𝗼𝗱𝘂𝗰𝘁 𝗻𝗮𝗺𝗲',
        '𝘴𝘰𝘮𝘦 𝘧𝘢𝘯𝘤𝘺 𝘱𝘳𝘰𝘥𝘶𝘤𝘵 𝘯𝘢𝘮𝘦',
        '▄▀▄▀▄▀ 𝙨𝙤𝙢𝙚 𝙛𝙖𝙣𝙘𝙮 𝙥𝙧𝙤𝙙𝙪𝙘𝙩 𝙣𝙖𝙢𝙚 ▄▀▄▀▄▀',
        '🌌 🎀 𝚜𝚘𝚖𝚎 𝚏𝚊𝚗𝚌𝚢 𝚙𝚛𝚘𝚍𝚞𝚌𝚝 𝚗𝚊𝚖𝚎🌌 🎀 ',
        '(っ◔◡◔)っ ♥ some fancy product name ♥',
    ]
    for sample in sample_txts:
        cleaned = norm(sample)
        print(f'sample \"{sample}\", after normalize: \"{cleaned}\"')