This repository has been archived by the owner on Aug 19, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_text.py
104 lines (102 loc) · 3.18 KB
/
clean_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import re
# Vulgar term -> family-friendly substitute. Keys are lowercase; lookups are
# case-insensitive and substring-based (no word-boundary check).
_REPLACEMENTS = {
    "damn": "darn",
    "shit": "crud",
    "ass": "rear end",
    "bitch": "female dog",
    "piss": "tinkle",
    "fuck": "fudge",
    "cock": "rooster",
    "dick": "Richard",
    "pussy": "kitty",
    "cunt": "kitty",
    "bastard": "illegitimate child",
    "asshole": "jerk",
    "motherfucker": "jerk",
    "son of a bitch": "jerk",
    "bullshit": "nonsense",
    "wanker": "jerk",
    "bollocks": "nonsense",
    "twat": "jerk",
    "arse": "rear end",
    "fanny": "rear end",
    "douchebag": "jerk",
    "dipshit": "idiot",
    "schmuck": "fool",
    "prick": "jerk",
    "slut": "promiscuous person",
    "whore": "sex worker",
    "jackass": "jerk",
    "moron": "idiot",
    "numbnuts": "idiot",
    "nimrod": "idiot",
    "knob": "jerk",
    "bonehead": "idiot",
    "cretin": "fool",
    "doofus": "idiot",
    "goofball": "fool",
    "nitwit": "fool",
    "simpleton": "fool",
    "dumbass": "idiot",
    "asswipe": "jerk",
    "butthead": "jerk",
    "dickhead": "jerk",
    "dingleberry": "fool",
    "numb-nuts": "idiot",
    "shithead": "jerk",
    "skank": "disgusting person",
    "sucka": "fool",
    "twit": "fool",
    "dam": "darn",
    "shite": "crud",
    "arsehole": "jerk",
    "basterd": "illegitimate child",
    "fuk": "fudge",
    "p*ssy": "kitty",
    "c*nt": "kitty",
    "w*nker": "jerk",
    "d*ck": "Richard",
    "a$$hole": "jerk",
    "mothafucka": "jerk",
    "shyt": "crud",
    "bullshyt": "nonsense",
    "a$$": "rear end",
    "biatch": "female dog",
    "douche": "jerk",
    "dipsht": "idiot",
    "pr*ck": "jerk",
    "sl*t": "promiscuous person",
    "wh*re": "sex worker",
    "jacka$$": "jerk",
    "morron": "idiot",
    "numb-nut": "idiot",
    "nimrodd": "idiot",
    "kn*b": "jerk",
    "cretinn": "fool",
    "doofuss": "idiot",
    "goofballl": "fool",
    "nitwitt": "fool",
    "simpelton": "fool",
    "dumbazz": "idiot",
}

# One compiled, case-insensitive pattern per term, ordered longest term first.
# Applying "ass" before "asshole" would rewrite the text to "rear endhole" and
# the longer entry could never match; longest-first lets every entry fire.
# sorted() is stable (also with reverse=True), so equal-length terms keep
# their table order.
_RULES = tuple(
    (re.compile(re.escape(term), re.IGNORECASE), substitute)
    for term, substitute in sorted(
        _REPLACEMENTS.items(), key=lambda item: len(item[0]), reverse=True
    )
)


def _match_case(matched, substitute):
    """Return *substitute* re-cased to follow the casing of *matched*."""
    if matched.islower():
        return substitute.lower()
    if matched.isupper():
        return substitute.upper()
    if matched.istitle():
        return substitute.title()
    # Mixed casing (e.g. "dIcK"): use the substitute exactly as tabulated.
    return substitute


def clean_strings(strings):
    """Replace vulgar terms in each string with milder substitutes.

    Matching is case-insensitive and substring-based. Each substitute follows
    the casing of the text it replaces: all-lowercase, all-uppercase and
    title-case matches yield lower-, upper- and title-cased substitutes;
    any other casing yields the substitute as written in the table.

    Args:
        strings: Iterable of ``str`` values to clean. It is not modified.

    Returns:
        A new list with one cleaned string per input string, in input order.

    Side effects:
        Prints ``Cleaning text...`` once per call.
    """
    print('Cleaning text...')
    cleaned_strings = []
    for text in strings:
        for pattern, substitute in _RULES:
            # sub() with a callback rebuilds the string in a single pass, so
            # several hits of one term stay correct even when the substitute
            # is longer or shorter than the match (manual slicing with the
            # offsets of earlier matches would drift after the first hit).
            text = pattern.sub(
                lambda m, substitute=substitute: _match_case(
                    m.group(0), substitute
                ),
                text,
            )
        cleaned_strings.append(text)
    return cleaned_strings