-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparameters.yaml
199 lines (134 loc) · 4.72 KB
/
parameters.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# Settings that are not specific to a single dataset.
common:
  # base URL; the per-dataset `relative path` values are relative to it
  homepage: "https://exporter.nih.gov"
# Output locations and CSV export options.
output:
  # relative to the directory where the script is run
  downloads directory: &output_downloads_directory downloads
  # relative to the `downloads directory`
  unzipped files subdirectory: unzipped
  # alias of `downloads directory`: both entries always hold the same value
  dataframes directory: *output_downloads_directory
  # Comma-Separated Values (CSV) format
  csv:
    # directory in which to save CSV files
    directory: downloads/csv
    # subdirectory of the one above in which to store CSV files that contain
    # only the "key" columns
    key columns subdirectory: key_columns
    # in addition to the designated binary format, every `DataFrame` is also
    # saved in a CSV file
    # NOTE(review): nesting under `csv` is reconstructed from the comment
    # above -- confirm against the code that reads this key
    save everything: true
# Settings for the "projects" dataset.
projects:
  # to be used in naming
  name: projects
  # the URL (relative to the "homepage" under `common`) where to find
  # projects data
  relative path: "ExPORTER_Catalog.aspx?sid=1&index=0"
  # the position of the table of interest when considering all the tables in
  # the URL; anchored because the other dataset sections alias this value
  table number: &nih_projects_table_number 13
  # (Python) regular expressions meant to match (linked) files to be ignored;
  # single-quoted so that backslashes are kept literally
  filename patterns to be ignored:
    - '^javascript.*'
    - '.*_DUNS_.*\.zip$'
    - '.*_PRJFUNDING_.*\.zip$'
  # columns that identify the elements of this dataset (their values need not
  # be unique)
  key columns: [CORE_PROJECT_NUM]
  # data types specification (you probably don't want to mess with this);
  # if blank, it is inferred
  data types:
    # these were obtained by *adapting* those returned by
    # `util.dataframe_types_to_yaml`
    ACTIVITY: object
    ADMINISTERING_IC: object
    APPLICATION_ID: int64
    APPLICATION_TYPE: float64
    ARRA_FUNDED: object
    AWARD_NOTICE_DATE: object
    BUDGET_END: object
    BUDGET_START: object
    CFDA_CODE: object
    CORE_PROJECT_NUM: object
    DIRECT_COST_AMT: float64
    ED_INST_TYPE: object
    FOA_NUMBER: object
    FULL_PROJECT_NUM: object
    FUNDING_ICs: object
    FUNDING_MECHANISM: object
    FY: int64
    IC_NAME: object
    INDIRECT_COST_AMT: float64
    NIH_SPENDING_CATS: object
    ORG_CITY: object
    ORG_COUNTRY: object
    ORG_DEPT: object
    ORG_DISTRICT: float64
    ORG_DUNS: object
    ORG_FIPS: object
    ORG_IPF_CODE: float64
    ORG_NAME: object
    ORG_STATE: object
    ORG_ZIPCODE: object
    PHR: object
    PI_IDS: object
    PI_NAMEs: object
    PROGRAM_OFFICER_NAME: object
    PROJECT_END: object
    PROJECT_START: object
    PROJECT_TERMS: object
    PROJECT_TITLE: object
    SERIAL_NUMBER: object
    STUDY_SECTION: object
    STUDY_SECTION_NAME: object
    SUBPROJECT_ID: float64
    SUFFIX: object
    SUPPORT_YEAR: object
    TOTAL_COST: float64
    TOTAL_COST_SUB_PROJECT: float64
# =======================================================================
# Settings for the "publications" dataset (same keys as `projects`).
publications:
  # to be used in naming
  name: publications
  # URL relative to the homepage; anchored because
  # `publications author affiliations` aliases it
  relative path: &publications_relative_path "ExPORTER_Catalog.aspx?sid=0&index=2"
  # alias of the value anchored at `projects` -> `table number`
  table number: *nih_projects_table_number
  # (Python) regular expressions matching linked files to be ignored; anchored
  # because `publications author affiliations` aliases the whole list
  filename patterns to be ignored: &publications_filename_patterns_to_be_ignored
    - '^javascript.*'
    # - .*_AFFLNK_.*\.zip$
  # columns that identify the elements of this dataset
  key columns: [PMID]
  # no explicit types: an empty (null) value means they are inferred; the
  # anchor is kept, although its only alias is currently commented out in
  # `publications author affiliations`
  data types: &publications_data_types null
# =======================================================================
# Settings for the author-affiliations data; it reuses the URL and the
# ignore-patterns anchored in the `publications` section.
publications author affiliations:
  # to be used in naming
  name: publications_author_affiliations
  # alias of `publications` -> `relative path`
  relative path: *publications_relative_path
  # alias of the value anchored at `projects` -> `table number`
  table number: *nih_projects_table_number
  # alias of `publications` -> `filename patterns to be ignored`
  filename patterns to be ignored: *publications_filename_patterns_to_be_ignored
  # columns that identify the elements of this dataset
  key columns: [AFFILIATION, AUTHOR_NAME, PMID]
  # explicit types are given here instead of reusing those of `publications`
  # data types: *publications_data_types
  data types:
    AFFILIATION: object
    AUTHOR_NAME: object
    AUTH_TRANS_ID: object
    PMID: int64
# =======================================================================
# Settings for the "link tables" dataset (same keys as `projects`).
link tables:
  # to be used in naming
  name: link
  # URL relative to the homepage
  relative path: "ExPORTER_Catalog.aspx?sid=2&index=5"
  # alias of the value anchored at `projects` -> `table number`
  table number: *nih_projects_table_number
  # no linked files are ignored
  filename patterns to be ignored: []
  # columns that identify the elements of this dataset
  key columns: [PMID, PROJECT_NUMBER]
  # no explicit types: an empty (null) value means they are inferred
  data types: null
# =======================================================================
# Settings for the "patents" dataset (same keys as `projects`).
patents:
  # to be used in naming
  name: patents
  # URL relative to the homepage
  relative path: "ExPORTER_Catalog.aspx?sid=2&index=3"
  # alias of the value anchored at `projects` -> `table number`
  table number: *nih_projects_table_number
  # no linked files are ignored
  filename patterns to be ignored: []
  # columns that identify the elements of this dataset
  key columns: [PATENT_ID, PROJECT_ID]
  # no explicit types: an empty (null) value means they are inferred
  data types: null
# =======================================================================
# Settings for the "abstracts" dataset (same keys as `projects`).
abstracts:
  # to be used in naming
  name: abstracts
  # URL relative to the homepage
  relative path: "ExPORTER_Catalog.aspx?sid=0&index=1"
  # alias of the value anchored at `projects` -> `table number`
  table number: *nih_projects_table_number
  # no linked files are ignored
  filename patterns to be ignored: []
  # columns that identify the elements of this dataset
  key columns: [APPLICATION_ID]
  # no explicit types: an empty (null) value means they are inferred
  data types: null
# =======================================================================
# Settings for the "clinical studies" dataset (same keys as `projects`).
clinical studies:
  # to be used in naming
  name: clinical_studies
  # URL relative to the homepage
  relative path: "ExPORTER_Catalog.aspx?sid=2&index=4"
  # alias of the value anchored at `projects` -> `table number`
  table number: *nih_projects_table_number
  # no linked files are ignored
  filename patterns to be ignored: []
  # columns that identify the elements of this dataset; unlike in the other
  # sections the names contain spaces and a dot, hence the quotes
  key columns: ['ClinicalTrials.gov ID', 'Core Project Number']
  # no explicit types: an empty (null) value means they are inferred
  data types: null