-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconnectivity_stats.py
executable file
·81 lines (49 loc) · 2.63 KB
/
connectivity_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#! /usr/bin/env python3
"""Print connectivity statistics linking projects, publications and patents.

Reads the CSV file locations from ``parameters.yaml`` (produced by the main
program), then reports, via outer merges with ``indicator=True``:
  * publications linked to nonexistent projects,
  * projects without linked publications,
  * publications whose link row has no publication data,
  * patents without a linked project and projects without linked patents.
"""
import pathlib
import yaml
import pandas as pd
# ==================== parameters reading
# parameters are read
with open('parameters.yaml') as yaml_data:
    # the parameters file is read to memory; safe_load suffices for a plain
    # config file and avoids arbitrary-object construction
    parameters = yaml.safe_load(yaml_data)
# ---
# all four CSV files live in the "key columns" subdirectory of the CSV output
input_directory = pathlib.Path(parameters['common']['output']['csv']['directory']) / parameters[
    'common']['output']['csv']['key columns subdirectory']
projects_csv_file = input_directory / (parameters['projects']['name'] + '.csv')
publications_csv_file = input_directory / (parameters['publications']['name'] + '.csv')
link_csv_file = input_directory / (parameters['link tables']['name'] + '.csv')
patents_csv_file = input_directory / (parameters['patents']['name'] + '.csv')
# check every input (including patents, which was previously unchecked and
# crashed later in read_csv); report all missing files, then exit non-zero
missing_files = [
    file
    for file in (projects_csv_file, publications_csv_file, link_csv_file, patents_csv_file)
    if not file.exists()
]
for file in missing_files:
    print(f"file {file} doesn't exist, please run the main program first")
if missing_files:
    raise SystemExit(1)  # bare SystemExit would exit with status 0
projects = pd.read_csv(projects_csv_file)
publications = pd.read_csv(publications_csv_file)
link = pd.read_csv(link_csv_file)
patents = pd.read_csv(patents_csv_file)
n_projects = len(projects)
n_publications = len(publications)
n_patents = len(patents)
# --- projects and publications
# outer merge + indicator: '_merge' tells which side(s) each row came from
projects_link = projects.merge(link, how='outer', left_on='CORE_PROJECT_NUM', right_on='PROJECT_NUMBER', indicator=True)
# bracket access is safer than attribute access for the indicator column
pub_linked_to_inexistent_proj = (projects_link['_merge'] == "right_only").sum()
print(
    f'# publications linked to inexistent projects: {pub_linked_to_inexistent_proj} '
    f'(out of {n_publications} *linked* publications)'
)
proj_with_no_linked_pub = (projects_link['_merge'] == "left_only").sum()
print(f'# projects with no linked publications: {proj_with_no_linked_pub} (out of {n_projects})')
# --- publications
publications_link = publications.merge(link, how='outer', on='PMID', indicator=True)
proj_with_no_linked_pub = None  # noqa: F841 -- not reused below; kept name free
n_orphan_publications = (publications_link['_merge'] != "both").sum()
print(f'# publications with not linked *data*: {n_orphan_publications}')
# --- projects and patents
patents_projects = patents.merge(projects, how='outer', left_on='PROJECT_ID', right_on='CORE_PROJECT_NUM', indicator=True)
pat_with_no_linked_proj = (patents_projects['_merge'] == "left_only").sum()
print(f'# patents with no linked project: {pat_with_no_linked_proj} (out of {n_patents})')
proj_with_no_linked_pat = (patents_projects['_merge'] == "right_only").sum()
print(f'# projects with no linked patents: {proj_with_no_linked_pat} (out of {n_projects})')
# ---
# NOTE(review): dead exploratory code kept for reference by the original author
# publications_title = pd.read_feather(publications_csv_file.with_suffix('.feather'), columns=['PMID', 'PUB_TITLE'])
# merged = projects_link.merge(publications_title, how='outer', on='PMID')