combinedSignificance + visuals.py
# ##1## Install dependencies and import them
import re
import csv
import pandas as pd
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
import urllib.request
from bs4 import BeautifulSoup
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm
from PyPDF2 import PdfReader
from collections import Counter
import numpy as np
# Load spaCy model
nlp = spacy.load('en_core_web_sm')
# Configure pandas display options
pd.set_option('display.max_colwidth', 200)
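# Hedged setup note (not part of the original script): spacy.load() raises OSError
# when the 'en_core_web_sm' model has not been downloaded yet. A guarded load such
# as the sketch below is one way to handle a fresh environment:
#
#   try:
#       nlp = spacy.load('en_core_web_sm')
#   except OSError:
#       from spacy.cli import download
#       download('en_core_web_sm')   # one-time model download
#       nlp = spacy.load('en_core_web_sm')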
# ##2## Function to parse text from URL
def parse_text_from_url(url):
    html = urllib.request.urlopen(url)
    htmlParse = BeautifulSoup(html, 'html.parser')
    parsed_text = ""
    for para in htmlParse.find_all("p"):
        parsed_text += " " + para.get_text()
    return parsed_text
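# Minimal usage sketch (the URL below is a placeholder, not taken from this script):
# parse_text_from_url fetches the page and concatenates the text of every <p> tag.
#
#   article = parse_text_from_url("https://en.wikipedia.org/wiki/Knowledge_graph")
#   print(article[:300])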
# ##3## Function to parse text from PDF
def parse_text_from_pdf(pdf_path):
    parsed_text = ""
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        for page in reader.pages:
            # extract_text() can return None for image-only pages
            parsed_text += page.extract_text() or ""
    return parsed_text
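# Minimal usage sketch (the file name is a placeholder): the reader walks every page
# of a local PDF and concatenates whatever text each page exposes.
#
#   report = parse_text_from_pdf("article.pdf")
#   print(len(report), "characters extracted")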
# ##4## Convert text into sentences & save in CSV
def save_sentences_to_csv(text, filename='article_text.csv'):
    # Keep the sentence text only and skip empty or whitespace-only sentences
    sentences = [[sent.text.strip()] for sent in nlp(text).sents if sent.text.strip()]
    with open(filename, 'w', newline='', encoding='utf-8', errors="replace") as myfile:
        writer = csv.writer(myfile)
        writer.writerow(['sentence'])
        writer.writerows(sentences)
    return pd.read_csv(filename, encoding='utf-8')
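# Minimal usage sketch (sample text and file name are illustrative): one sentence is
# written per row, then the CSV is re-read, so the result is a DataFrame with a
# single 'sentence' column.
#
#   df = save_sentences_to_csv("Rome is a city. Paris is a city.", "demo.csv")
#   print(df['sentence'].tolist())   # ['Rome is a city.', 'Paris is a city.']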
# ##5## Extract entity pairs
def get_entities(sent):
    """Extract a [subject, object] entity pair from a sentence via its dependency parse."""
    ent1 = ""
    ent2 = ""
    prv_tok_dep = ""   # Dependency tag of previous token in the sentence
    prv_tok_text = ""  # Previous token in the sentence
    prefix = ""
    modifier = ""
    # Process each token in the sentence
    for tok in nlp(sent):
        if tok.dep_ != "punct":
            # Compound tokens become part of the entity prefix
            if tok.dep_ == "compound":
                prefix = tok.text
                if prv_tok_dep == "compound":
                    prefix = prv_tok_text + " " + tok.text
            # Modifiers (amod, nmod, ...) are attached to the next entity
            if tok.dep_.endswith("mod"):
                modifier = tok.text
                if prv_tok_dep == "compound":
                    modifier = prv_tok_text + " " + tok.text
            # A subject token closes the first entity
            if tok.dep_.find("subj") != -1:
                ent1 = modifier + " " + prefix + " " + tok.text
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""
            # An object token closes the second entity
            if tok.dep_.find("obj") != -1:
                ent2 = modifier + " " + prefix + " " + tok.text
            # Update variables
            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text
    return [ent1.strip() or None, ent2.strip() or None]
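# Illustrative example (sentence invented for demonstration; output depends on the
# spaCy model version): the loop keeps compound tokens and modifiers as prefixes and
# returns the last subject and last object it encounters.
#
#   get_entities("John Smith founded a small startup")
#   # expected to return something like ['John Smith', 'small startup']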
# ##6## Get relations for the entities
def get_relation(sent):
    doc = nlp(sent)
    matcher = Matcher(nlp.vocab)
    # Define the pattern
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': '?'},
               {'DEP': 'agent', 'OP': '?'},
               {'POS': 'ADJ', 'OP': '?'}]
    matcher.add("matching_1", [pattern])
    matches = matcher(doc)
    # If any match is found, extract the relation
    if matches:
        k = len(matches) - 1
        span = doc[matches[k][1]:matches[k][2]]
        return span.text
    return ""
# Score entity pairs and relations so weak extractions can be filtered out
def score_entity_relation(entities, relation):
    # No score if either entity is missing
    if not entities or len(entities) < 2 or not all(entities):
        return 0
    # Simple scoring heuristic: entities recognised by the NER model as
    # PERSON/ORG/GPE count double, and longer relations score higher
    entity_score = 0
    for ent in entities:
        labels = {e.label_ for e in nlp(ent).ents}
        entity_score += 2 if labels & {'PERSON', 'ORG', 'GPE'} else 1
    relation_score = len(relation) if isinstance(relation, str) else 0
    return entity_score * relation_score
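# Illustrative example (values depend on the NER output): an entity recognised as
# PERSON/ORG/GPE contributes 2, any other entity contributes 1, and the sum is
# multiplied by the length of the relation string.
#
#   score_entity_relation(['John Smith', 'small startup'], 'founded')
#   # roughly (2 + 1) * len('founded') = 21 if 'John Smith' is tagged as PERSON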
# ##7## Create entity pairs and relations
def process_sentences_to_graph(csv_sentences):
    entity_pairs = []
    relations = []
    scores = []
    for sent in tqdm(csv_sentences['sentence']):
        entities = get_entities(sent)
        relation = get_relation(sent)
        if all(entities):  # Only add if both entities are found
            entity_pairs.append(entities)
            relations.append(relation)
            scores.append(score_entity_relation(entities, relation))
    if not scores:
        print("No entity pairs were extracted; nothing to plot.")
        return
    # Filter based on scores
    threshold = np.percentile(scores, 75)  # Keep top 25% of entity-relation pairs
    filtered_pairs = [(pair, rel, score) for pair, rel, score in zip(entity_pairs, relations, scores) if score >= threshold]
    # Create a DataFrame for the graph
    source = [i[0][0] for i in filtered_pairs]
    target = [i[0][1] for i in filtered_pairs]
    edge = [i[1] for i in filtered_pairs]
    score = [i[2] for i in filtered_pairs]
    kg_df = pd.DataFrame({'source': source, 'target': target, 'edge': edge, 'score': score})
    # Create a directed graph from the DataFrame
    G = nx.from_pandas_edgelist(kg_df, "source", "target", edge_attr=True, create_using=nx.MultiDiGraph())
    # Plot the graph
    plt.figure(figsize=(12, 12))
    pos = nx.spring_layout(G, k=2, iterations=100, scale=2)
    nx.draw(G, pos, with_labels=True, node_color='lightblue', font_size=8, edge_color='gray',
            node_size=1000, alpha=0.7, font_weight='bold', width=1.5)
    # Use 2-tuple keys so edge labels draw correctly on a MultiDiGraph
    edge_labels = {(u, v): d['edge'] for u, v, d in G.edges(data=True)}
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8, font_color='darkred')
    plt.title("Entity Relation Graph", fontsize=15)
    plt.show()
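# Tuning note (not from the original script): the 75th-percentile cut-off keeps only
# the top quarter of scored pairs. A lower percentile, e.g.
#
#   threshold = np.percentile(scores, 50)
#
# keeps more edges at the cost of a denser, harder-to-read plot.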
# ##8## Main function
if __name__ == "__main__":
    # Choose input type
    input_type = input("Enter 'url' to process a web article or 'pdf' to process a PDF file: ").strip().lower()
    if input_type == 'url':
        url = input("Enter the URL: ").strip()
        parsed_text = parse_text_from_url(url)
    elif input_type == 'pdf':
        pdf_path = input("Enter the PDF file path: ").strip()
        parsed_text = parse_text_from_pdf(pdf_path)
    else:
        print("Invalid input type. Please enter either 'url' or 'pdf'.")
        exit()
    # Save sentences to CSV and re-import
    csv_sentences = save_sentences_to_csv(parsed_text)
    # Process and visualize the graph
    process_sentences_to_graph(csv_sentences)