# 05_NER.py
# Stanford NER
article = '''
Asian shares skidded on Tuesday after a rout in tech stocks put Wall Street to the sword, while a
sharp drop in oil prices and political risks in Europe pushed the dollar to 16-month highs as investors dumped
riskier assets. MSCI’s broadest index of Asia-Pacific shares outside Japan dropped 1.7 percent to a 1-1/2
week trough, with Australian shares sinking 1.6 percent. Japan’s Nikkei dived 3.1 percent led by losses in
electric machinery makers and suppliers of Apple’s iphone parts. Sterling fell to $1.286 after three straight
sessions of losses took it to the lowest since Nov.1 as there were still considerable unresolved issues with the
European Union over Brexit, British Prime Minister Theresa May said on Monday.'''
import nltk
from nltk.tag import StanfordNERTagger
# print('NLTK Version: %s' % nltk.__version__)
# paths to a local Stanford NER download (a Java runtime must be installed)
stanford_ner_tagger = StanfordNERTagger(
    r"D:\Twitter Data\Data\NER\stanford-ner-2018-10-16\classifiers\english.muc.7class.distsim.crf.ser.gz",
    r"D:\Twitter Data\Data\NER\stanford-ner-2018-10-16\stanford-ner-3.9.2.jar"
)
results = stanford_ner_tagger.tag(article.split())  # naive whitespace tokenization
# print('Original Sentence: %s' % (article))
for tag_value, tag_type in results:
    if tag_type != 'O':  # 'O' marks tokens outside any named entity
        print('Value: %s\nType: %s' % (tag_value, tag_type))
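# The tagger labels one token at a time, so a multi-word name comes back as
# separate rows ('Theresa' PERSON, 'May' PERSON). A minimal sketch of merging
# consecutive tokens that share a tag; merge_entities is a hypothetical
# helper, not part of the Stanford NER API:
def merge_entities(tagged_tokens):
    entities, words, current = [], [], None
    for word, tag in tagged_tokens:
        if tag != 'O' and tag == current:
            words.append(word)  # extend the running entity
        else:
            if words:  # flush the finished entity
                entities.append((' '.join(words), current))
            words = [word] if tag != 'O' else []
            current = tag if tag != 'O' else None
    if words:  # flush the last entity, if any
        entities.append((' '.join(words), current))
    return entities

# print(merge_entities(results))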
# spaCy
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()
doc = nlp(article)
for X in doc.ents:
    print('Value: %s, Type: %s' % (X.text, X.label_))
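# Counter and displacy are imported above but never used. Two quick
# follow-ups: count entity labels, and render the entities as HTML
# (inside a notebook, displacy.render displays the markup inline):
print(Counter(ent.label_ for ent in doc.ents))
html = displacy.render(doc, style='ent')  # returns HTML markup as a string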
# NLTK
def fn_preprocess(art):
    art = nltk.word_tokenize(art)
    art = nltk.pos_tag(art)
    return art
art_processed = fn_preprocess(article)
print(art_processed)
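# The POS tags above are only a preprocessing step; NLTK's own NER is
# nltk.ne_chunk, which turns the tagged tokens into a chunk tree. It may
# require extra resources, e.g. nltk.download('maxent_ne_chunker') and
# nltk.download('words') (resource names can vary across NLTK versions):
tree = nltk.ne_chunk(art_processed)
for subtree in tree:
    if hasattr(subtree, 'label'):  # entity chunks are subtrees with a label
        value = ' '.join(token for token, pos in subtree.leaves())
        print('Value: %s, Type: %s' % (value, subtree.label()))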
# get runs of consecutive words that start with a capital letter
import re
# the trailing ！ and （ are full-width characters; an unescaped ASCII ( here
# would break re.split with an unterminated-subpattern error
pattern = r',|\.|/|;|\'|`|\[|\]|<|>|\?|:|"|\{|\}|\~|!|@|#|\$|%|\^|&|\(|\)|-|=|\_|\+|·|！|…|（'
a = "I am Alex Lee. I am from Denman Prospect and I love this place very much. We don't like apple. The big one is good."
# our goal is getting 'I', 'Alex Lee', 'Denman Prospect', 'I'
def get_capital(sentence):
    # split on punctuation, then collect runs of consecutive capitalized
    # words inside each punctuation-free section
    sections = [s.strip() for s in re.split(pattern, sentence)]
    caps = []
    for sec in sections:
        tmp = []
        for w in sec.split():
            if w[0].isupper():
                tmp.append(w)
            elif len(tmp) > 0:
                caps.append(tmp)
                tmp = []
        if len(tmp) > 0:  # flush a run that reaches the end of the section
            caps.append(tmp)
    caps = [' '.join(c) for c in caps]
    return list(set(caps))
print(get_capital(a))  # in arbitrary order: ['I', 'Alex Lee', 'Denman Prospect', 'We', 'The']
# Similar to the method above, but stopwords are deleted as well.
# method 1
import re
import nltk
from nltk.corpus import stopwords
pattern = r',|\.|/|;|\'|`|\[|\]|<|>|\?|:|"|\{|\}|\~|!|@|#|\$|%|\^|&|\(|\)|-|=|\_|\+|·|！|…|（'
a = "I am Alex Lee. I am from Denman Prospect and I love this place very much. We don't like apple. The big one is good."
# our goal is getting 'I', 'Alex Lee', 'Denman Prospect', 'I'
# note: 'i', 'we' and 'the' are all NLTK English stopwords, so this version
# drops 'I', 'We' and 'The' alike
def get_capital(sentence):
    sections = [s.strip() for s in re.split(pattern, sentence)]
    caps = []
    for sec in sections:
        tmp = []
        for w in sec.split():
            if w[0].isupper():
                tmp.append(w)
            elif len(tmp) > 0:
                caps.append(tmp)
                tmp = []
        if len(tmp) > 0:
            caps.append(tmp)
    caps = list(set(' '.join(c) for c in caps))
    stop = set(stopwords.words('english'))  # build the set once, not per candidate
    return [c for c in caps if c.lower() not in stop]
print(get_capital(a))
# method 2 (better, based on NLTK)
import nltk
from nltk.corpus import stopwords
a = "I am Alex Lee. I am from Denman Prospect and I love this place very much. We don't like apple. The big one is good."
tokens = nltk.word_tokenize(a)
caps = []
# collect every 1- to 3-gram whose tokens all start with an uppercase letter
for i in range(1, 4):
    for eles in nltk.ngrams(tokens, i):
        length = len(eles)
        for j in range(length):
            if eles[j][0].islower() or not eles[j][0].isalpha():
                break  # a lowercase or non-alphabetic token disqualifies the n-gram
            elif j == length - 1:
                caps.append(' '.join(eles))
caps = list(set(caps))
stop = set(stopwords.words('english'))
caps = [c for c in caps if c.lower() not in stop]
print(caps)
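# Method 2 keeps every capitalized n-gram, so sub-spans such as 'Alex' and
# 'Lee' survive alongside 'Alex Lee'. A minimal follow-up sketch that keeps
# only maximal spans, assuming a plain substring test is acceptable here:
maximal = [c for c in caps
           if not any(c != other and c in other for other in caps)]
print(maximal)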