-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathresultAnalysis.py
90 lines (71 loc) · 3.34 KB
/
resultAnalysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import csv
import datetime
import sys
from collections import Counter, defaultdict
from typing import List
from pydantic_core import from_json
from rapidfuzz import fuzz
from utils import PaperInfo
# The recursion limit needs to be extended for this script, so raise the
# interpreter's maximum recursion depth to 10000.
sys.setrecursionlimit(10000)
def frequency_analysis(tech_lst: List[tuple[str, int]]):
    """
    Cluster (technology, frequency) pairs into groups of synonyms.

    The work is delegated to ``fuzzy_matching_loop``, which is handed a fresh,
    empty ``defaultdict(list)`` to fill and is called with its default cutoff
    ratio. Whatever that call returns is returned unchanged.
    """
    return fuzzy_matching_loop(tech_lst, defaultdict(list))
def fuzzy_matching_loop(tech_lst: List[tuple[str, int]], cluster: dict,
                        cutoff_ratio=90.0):
    """
    Group technology names into synonym clusters using fuzzy matching.

    The first remaining entry of ``tech_lst`` is taken as the generic term and
    compared against every remaining entry with RapidFuzz's ``partial_ratio``.
    Each entry that scores at least ``cutoff_ratio`` is appended to the
    generic term's cluster and removed from further consideration. This is
    repeated until no entries are left.

    Args:
        tech_lst: ``(name, frequency)`` pairs. Order matters: earlier entries
            are the first to become generic terms. The list itself is not
            modified.
        cluster: Mapping from generic term to the list of matched
            ``(name, frequency)`` pairs. It is updated in place; a plain
            ``dict`` works as well as a ``defaultdict``.
        cutoff_ratio: Minimum ``partial_ratio`` score for two names to be
            treated as synonyms. It applies to every cluster, not only the
            first one.

    Returns:
        The ``cluster`` object that was passed in.
    """
    # Iterate instead of recursing: one pass per cluster, so the number of
    # clusters is not bounded by the interpreter's recursion limit.
    remaining = list(tech_lst)
    while remaining:
        # tech_a is compared to every tech_b and serves as the generic term.
        tech_a = remaining[0][0]
        matched_synonyms = set()
        for tech_b, frequency in remaining:
            # A name is always a synonym of itself. Short-circuiting on
            # equality guarantees that each pass removes at least the generic
            # term, so the loop terminates for any cutoff_ratio.
            if (tech_b == tech_a
                    or fuzz.partial_ratio(tech_a, tech_b) >= cutoff_ratio):
                cluster.setdefault(tech_a, []).append((tech_b, frequency))
                matched_synonyms.add(tech_b)
        # Drop everything that was just clustered; set membership keeps this
        # filter linear in the number of remaining entries.
        remaining = [(name, freq) for name, freq in remaining
                     if name not in matched_synonyms]
    return cluster
if __name__ == "__main__":
    # Load the output file from the abstract analyzer. The context manager
    # closes the handle as soon as the JSON has been parsed.
    with open("results_2000.json", "r", encoding="utf-8") as results:
        paper_list = from_json(results.read())

    # Count how often each cleaned technology name occurs over all papers;
    # this is the input for the frequency analysis.
    counter = Counter()
    for paper in paper_list:
        parsed_paper = PaperInfo(**paper)
        # Clean the technology names: hyphens become spaces and all
        # surrounding whitespace is trimmed, so spelling variants of the same
        # name are counted together. update() adds the counts in place
        # instead of re-scanning the whole counter for every paper.
        counter.update(
            x.replace("-", " ").strip() for x in parsed_paper.technology_used)

    # most_common() already returns (technology, frequency) pairs, ordered
    # from most to least frequent.
    tech_list = counter.most_common()
    analysis_clusters = frequency_analysis(tech_list)

    # Clusters whose summed frequency is below this value are not reported.
    cut_of_freq = 5
    csv_data = []
    for tech, syn in sorted(analysis_clusters.items()):
        syn_names = [name for name, _ in syn]
        total_freq = sum(freq for _, freq in syn)
        if total_freq >= cut_of_freq:
            csv_data.append({"Frequency": total_freq, "Technology Name": tech,
                             "Matched Synonyms": syn_names})
            print(f"Freq: {total_freq}, Technology: {tech}: {syn_names}")

    # Format the timestamp explicitly: the default string form of a datetime
    # contains a space and colons, which are not valid in file names on every
    # platform (e.g. Windows).
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    with open(f"analyzed_technology_{cut_of_freq}_{timestamp}.csv", "w",
              newline='') as csv_file:
        fieldnames = ["Frequency", "Technology Name", "Matched Synonyms"]
        csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(csv_data)