-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
116 lines (93 loc) · 3.34 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os
import re
from collections import defaultdict
from typing import Tuple
# 10k words from https://www.mit.edu/~ecprice/wordlist.10000
with open("10k.txt") as f:
tenk_words = f.read().split()
tenk_words = set(tenk_words)
# input, result
tests = [
[
"Dancing With our Hands Tied (DWHT) is my favourite song.",
("Dancing With our Hands Tied", "DWoHT"),
],
[
"Dancing With Our Hands Tied (DWHT) is my favourite song.",
("Dancing With Our Hands Tied", "DWOHT"),
],
[
"Dancing With Our Hands Tied is my favourite song.",
("Dancing With Our Hands Tied", "DWOHT"),
],
[
"Dancing With Our Hands Tied (D.W.O.H.T) is my favourite song.",
("Dancing With Our Hands Tied", "DWOHT"),
],
["Club Penguin is a video game.", ("Club Penguin", "CP")],
]
def get_abbreviations_and_acronyms(
text: str, lowercase_first_word=False
) -> Tuple[str, str]:
"""
Given a string, return the longest abbreviation or acronym in the form of:
- ... Word Like This (WLT) ... = ('Word Like This', 'WLT')
- ... Word Like This ... = ('Word Like This', 'WLT')
- ... Word Like This (W.L.T) ... = ('Word Like This', 'WLT')
- ... Word like This (WLT) ... = ('Word Like This', 'WlT')
Parameters:
- text: str: The text to extract the abbreviation / acronym from.
- lowercase_first_word: bool: Whether to lowercase the first word in the text.
You may want to set this to True if the text is a sentence and you are unsure if you know the first word
is not part of an abbreviation / acronym.
"""
if lowercase_first_word:
text = text[0].lower() + text[1:]
abbreviations_with_symbol_structure = re.findall(
r"([A-Z][a-z]+(?: [A-Z][a-z]+)+) \(([A-Z]+)\)", text
)
all_words = text.split()
words = defaultdict(str)
buffer = []
for i, word in enumerate(all_words):
if i == 0 and word.islower():
continue
if word.strip().islower() and (
i < len(all_words) - 1 and all_words[i + 1][0].islower()
):
continue
# if all caps, skip
# skip if next word starts with a capital letter
if (
i < len(all_words) - 1
and (word.istitle() and all_words[i + 1][0].istitle())
or (
i < len(all_words) - 2
and all_words[i + 1][0].islower()
and all_words[i + 2][0].istitle()
)
):
buffer.append(word)
continue
# if next word is upper, continue
if i < len(all_words) - 1 and all_words[i + 1].istitle():
buffer.append(word)
continue
if len(buffer) > 0:
# if current word starts with a capital
if word[0].isupper():
word = " ".join(buffer) + " " + word
else:
word = " ".join(buffer)
word = word.strip()
if not words.get(word):
words[word] = "".join([c[0] for c in word.split(" ") if c.isalpha()])
# get longest
if not words:
return []
longest = [max(words, key=lambda x: len(x))]
return (longest[0], words[longest[0]])
for test in tests:
input, result = test
output = get_abbreviations_and_acronyms(input)
assert output == result, f"{output} != {result}"