-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
executable file
·257 lines (210 loc) · 11.7 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
#!/usr/bin/env python
import secrets
import shlex
import sys

from snakemake import shell
from snakemake.io import expand

import nodes.utils as utils
import nodes.encodings as encodings
import nodes.filter as dataset_filter
import nodes.fps as fps
import nodes.benchmark as benchmark
import nodes.vis as vis
from peptidereactor.workflow_executer import WorkflowExecuter, WorkflowSetter
# Random 12-hex-digit tag; it namespaces this run's directories below data/temp/.
TOKEN = secrets.token_hex(nbytes=6)

# Core count passed to WorkflowSetter, WorkflowExecuter and the --cores option.
CORES = 32

# Values substituted for the {dataset} placeholder when path templates are expanded.
DATASETS = ["fps"]
with WorkflowSetter(cores=CORES, benchmark_dir="data/{dataset}/misc/benchmark/") as w:
    # Registers every pipeline rule on the setter ``w``, in a fixed order.
    # Path templates that occur more than once are named up front; "{dataset}"
    # stays a literal placeholder in all of them.

    def across_datasets(template):
        """Return expand(template, dataset=DATASETS)."""
        return expand(template, dataset=DATASETS)

    # -- sequence / class files and per-dataset directories -------------------
    raw_fasta = "data/{dataset}/seqs.fasta"
    mapped_fasta = "data/{dataset}/seqs_mapped.fasta"
    classes_txt = "data/{dataset}/classes.txt"
    sec_fasta = "data/{dataset}/seqs_sec.fasta"
    sec_classes = "data/{dataset}/classes_sec.txt"
    ter_fasta = "data/{dataset}/seqs_ter.fasta"
    ter_classes = "data/{dataset}/classes_ter.txt"
    msa_fasta = "data/{dataset}/seqs_msa.fasta"
    msa_sec_fasta = "data/{dataset}/seqs_msa_sec.fasta"
    msa_ter_fasta = "data/{dataset}/seqs_msa_ter.fasta"
    pdb_dir = "data/{dataset}/pdb/"
    profile_dir = "data/{dataset}/profile/"
    encoded_csv_dir = "data/{dataset}/csv/"

    # -- per-run scratch directories (namespaced by TOKEN) --------------------
    scratch = f"data/temp/{TOKEN}/{{dataset}}/"
    tmp_seq_original = scratch + "csv/original/sequence_based/"
    tmp_str_original = scratch + "csv/original/structure_based/"
    tmp_seq_non_empty = scratch + "csv/sequence_based/non_empty/"
    tmp_seq_aaindex = scratch + "csv/sequence_based/aaindex/"
    tmp_all = scratch + "all/"

    # -- final encoding directories -------------------------------------------
    sequence_based_encodings_dir = "data/{dataset}/csv/sequence_based/"
    structure_based_encodings_dir = "data/{dataset}/csv/structure_based/"
    all_encodings_dir = "data/{dataset}/csv/all/"

    # -- benchmark outputs ----------------------------------------------------
    single_cv_dir = "data/{dataset}/benchmark/single/"
    metrics_dir = "data/{dataset}/benchmark/metrics/"
    ens_seq_dir = "data/{dataset}/benchmark/ensemble/seq_vs_str/sequence_based/"
    ens_str_dir = "data/{dataset}/benchmark/ensemble/seq_vs_str/structure_based/"
    ens_all_1_dir = "data/{dataset}/benchmark/ensemble/all_vs_all/group_1/"
    ens_all_2_dir = "data/{dataset}/benchmark/ensemble/all_vs_all/group_2/"
    sim_seq_vs_str_dir = "data/{dataset}/benchmark/similarity/seq_vs_str/"
    sim_all_vs_all_dir = "data/{dataset}/benchmark/similarity/all_vs_all/"
    friedman_dir = "data/{dataset}/benchmark/friedman/"
    dataset_corr_csv = "data/{dataset}/benchmark/dataset_correlation.csv"
    all_benchmark_csvs = "data/{dataset}/misc/benchmark/benchmark.csv"

    # ======================= preprocessing ===================================

    # Disabled rule; with it off, no rule added below writes seqs_mapped.fasta.
    # w.add(utils.map_sequence_names.rule(
    #     fasta_in="data/{dataset}/seqs.fasta", classes_in="data/{dataset}/classes.txt",
    #     benchmark_dir=w.benchmark_dir,
    #     fasta_out="data/{dataset}/seqs_mapped.fasta",
    #     maps_out="data/{dataset}/misc/mapped_sequence_names.yaml"))

    w.add(utils.tertiary_structure_search.rule(
        fasta_in=mapped_fasta,
        classes_in=classes_txt,
        fasta_sec_out=sec_fasta,
        classes_sec_out=sec_classes,
        fasta_ter_out=ter_fasta,
        classes_ter_out=ter_classes,
        pdb_dir=pdb_dir,
        profile_dir=profile_dir,
        benchmark_dir=w.benchmark_dir,
    ))

    w.add(utils.multiple_sequence_alignment.rule(
        fastas_in=[mapped_fasta, sec_fasta, ter_fasta],
        fastas_out=[msa_fasta, msa_sec_fasta, msa_ter_fasta],
        benchmark_dir=w.benchmark_dir,
    ))

    # ======================= encodings =======================================

    seqb = encodings.sequence_based.Rule()
    w.add(seqb.rule(
        fasta_in=mapped_fasta,
        fasta_msa_in=msa_fasta,
        classes_in=classes_txt,
        path_to_config="config.yaml",
        exclude=["ngram_a2", "ngram_a3", "ngram_e3"],
        misc_dir="data/{dataset}/misc/",
        csv_dir=encoded_csv_dir,
        benchmark_dir=w.benchmark_dir,
    ))

    strb = encodings.structure_based.Rule()
    w.add(strb.rule(
        fasta_sec_in=sec_fasta,
        fasta_msa_sec_in=msa_sec_fasta,
        classes_sec_in=sec_classes,
        fasta_ter_in=ter_fasta,
        classes_ter_in=ter_classes,
        path_to_config="config.yaml",
        pdb_dir=pdb_dir,
        profile_dir=profile_dir,
        csv_dir=encoded_csv_dir,
        benchmark_dir=w.benchmark_dir,
    ))

    # The target CSVs of both encoding rules are collected into scratch space.
    w.add(utils.collect_encodings.rule(
        csv_seq_in=seqb.target_csvs,
        csv_str_in=strb.target_csvs,
        csv_seq_out=tmp_seq_original,
        csv_str_out=tmp_str_original,
    ))

    # ======================= filtering =======================================

    # Sequence-based chain: original -> non_empty -> aaindex -> psekraac.
    w.add(dataset_filter.non_empty.rule(
        csv_in=tmp_seq_original,
        csv_out=tmp_seq_non_empty,
        benchmark_dir=w.benchmark_dir,
    ))
    w.add(dataset_filter.aaindex.rule(
        csv_in=tmp_seq_non_empty,
        csv_out=tmp_seq_aaindex,
        benchmark_dir=w.benchmark_dir,
    ))
    w.add(dataset_filter.psekraac.rule(
        csv_in=tmp_seq_aaindex,
        csv_out=sequence_based_encodings_dir,
        benchmark_dir=w.benchmark_dir,
    ))

    # Structure-based encodings only pass through the non_empty filter.
    w.add(dataset_filter.non_empty.rule(
        csv_in=tmp_str_original,
        csv_out=structure_based_encodings_dir,
        benchmark_dir=w.benchmark_dir,
    ))

    # Both filtered directories are aggregated, then handed to append_linker,
    # which writes the "all" encodings directory.
    w.add(dataset_filter.aggregate_directories.rule(
        dirs_in=[sequence_based_encodings_dir, structure_based_encodings_dir],
        dir_out=tmp_all,
        benchmark_dir=w.benchmark_dir,
    ))
    w.add(fps.append_linker.rule(
        dir_in=tmp_all,
        linker_in="data/{dataset}/data.yaml",
        dir_out=all_encodings_dir,
        benchmark_dir=w.benchmark_dir,
    ))

    # ======================= benchmark =======================================

    w.add(benchmark.cross_validation.single.rule(
        csv_seq_in=sequence_based_encodings_dir,
        csv_str_in=structure_based_encodings_dir,
        csv_dir_out=single_cv_dir,
        benchmark_dir=w.benchmark_dir,
    ))

    w.add(benchmark.compute_metrics.rule(
        csv_dir_in=single_cv_dir,
        benchmark_dir=w.benchmark_dir,
        metrics_dir_out=metrics_dir,
    ))

    w.add(benchmark.cross_validation.ensemble.rule(
        group_1_in=sequence_based_encodings_dir,
        group_2_in=structure_based_encodings_dir,
        group_1_out=ens_seq_dir,
        group_2_out=ens_str_dir,
        benchmark_dir=w.benchmark_dir,
    ))

    w.add(benchmark.cross_validation.ensemble.rule(
        group_1_in=all_encodings_dir,
        group_2_in=all_encodings_dir,
        group_1_out=ens_all_1_dir,
        group_2_out=ens_all_2_dir,
        benchmark_dir=w.benchmark_dir,
    ))

    w.add(benchmark.similarity.rule(
        group_1_in=ens_seq_dir,
        group_2_in=ens_str_dir,
        corr_dir_out=sim_seq_vs_str_dir,
        benchmark_dir=w.benchmark_dir,
    ))

    w.add(benchmark.similarity.rule(
        group_1_in=ens_all_1_dir,
        group_2_in=ens_all_2_dir,
        corr_dir_out=sim_all_vs_all_dir,
        benchmark_dir=w.benchmark_dir,
    ))

    w.add(benchmark.critical_difference.rule(
        metrics_dir_in=metrics_dir,
        benchmark_dir=w.benchmark_dir,
        cd_dir_out=friedman_dir,
    ))

    w.add(benchmark.dataset_correlation.rule(
        group_1_in=sequence_based_encodings_dir,
        group_2_in=structure_based_encodings_dir,
        metrics_dir_in=metrics_dir,
        dataset_corr_out=dataset_corr_csv,
        benchmark_dir=w.benchmark_dir,
    ))

    w.add(utils.collect_benchmark.rule(
        final_dirs_in=[
            metrics_dir,
            sim_seq_vs_str_dir,
            sim_all_vs_all_dir,
            friedman_dir,
        ],
        final_files_in=[
            dataset_corr_csv,
        ],
        csv_out=w.benchmark_dir + "benchmark.csv",
        benchmark_dir=w.benchmark_dir,
    ))

    # ======================= single-dataset pages ============================

    # The first three pages take only the metrics directory as input.
    for page, page_name in (
        (vis.sds_1_Overview, "sds_1_Overview"),
        (vis.sds_2_Metrics, "sds_2_Metrics"),
        (vis.sds_3_Curves, "sds_3_Curves"),
    ):
        w.add(page.rule(
            metrics_dir_in=metrics_dir,
            html_dir_out="data/{dataset}/vis/" + page_name + "/",
        ))

    w.add(vis.sds_4_Similarity.rule(
        similarity_dir_group_1_in=sim_seq_vs_str_dir,
        similarity_dir_group_2_in=sim_all_vs_all_dir,
        html_dir_out="data/{dataset}/vis/sds_4_Similarity/",
    ))

    w.add(vis.sds_5_Diversity.rule(
        similarity_dir_group_1_in=sim_seq_vs_str_dir,
        similarity_dir_group_2_in=sim_all_vs_all_dir,
        ensemble_cv_group_1a_in=ens_seq_dir,
        ensemble_cv_group_1b_in=ens_str_dir,
        ensemble_cv_group_2a_in=ens_all_1_dir,
        ensemble_cv_group_2b_in=ens_all_2_dir,
        metrics_dir_in=metrics_dir,
        html_dir_out="data/{dataset}/vis/sds_5_Diversity/",
    ))

    w.add(vis.sds_6_Difference.rule(
        crit_diff_dir_in=friedman_dir,
        metrics_dir_in=metrics_dir,
        html_dir_out="data/{dataset}/vis/sds_6_Difference/",
    ))

    w.add(vis.sds_7_Composition.rule(
        fasta_in=mapped_fasta,
        classes_in=classes_txt,
        html_dir_out="data/{dataset}/vis/sds_7_Composition/",
    ))

    w.add(vis.sds_8_Correlation.rule(
        metrics_dir_in=metrics_dir,
        dataset_correlation_in=dataset_corr_csv,
        html_dir_out="data/{dataset}/vis/sds_8_Correlation/",
    ))

    w.add(vis.sds_9_Time.rule(
        benchmark_csv_in=w.benchmark_dir + "benchmark.csv",
        metrics_dir_in=metrics_dir,
        html_dir_out="data/{dataset}/vis/sds_9_Time/",
    ))

    # ======================= multi-dataset pages =============================

    # The first three pages take the metrics directories of all DATASETS.
    for page, page_name in (
        (vis.mds_1_Overview, "mds_1_Overview"),
        (vis.mds_2_Ranks, "mds_2_Ranks"),
        (vis.mds_3_Clustering, "mds_3_Clustering"),
    ):
        w.add(page.rule(
            metric_dirs_in=across_datasets(metrics_dir),
            html_dir_out="data/multiple_datasets/vis/" + page_name + "/",
        ))

    # Note: these pages read the unmapped seqs.fasta, not seqs_mapped.fasta.
    w.add(vis.mds_4_Embedding.rule(
        fastas_in=across_datasets(raw_fasta),
        classes_in=across_datasets(classes_txt),
        html_dir_out="data/multiple_datasets/vis/mds_4_Embedding/",
    ))

    w.add(vis.mds_5_Time.rule(
        fastas_in=across_datasets(raw_fasta),
        benchmark_csvs_in=across_datasets(all_benchmark_csvs),
        html_dir_out="data/multiple_datasets/vis/mds_5_Time/",
    ))

    w.add(vis.home_Home_tsne.rule(
        fastas_in=across_datasets(raw_fasta),
        classes_in=across_datasets(classes_txt),
        readmes_in=across_datasets("data/{dataset}/README.md"),
        benchmark_csvs_in=across_datasets(all_benchmark_csvs),
        html_dir_out="data/multiple_datasets/vis/home_Home_tsne/",
    ))
# Final outputs requested from the pipeline: nine per-dataset pages followed by
# six pages under data/multiple_datasets/, all expanded over DATASETS.
target = expand(
    [
        f"data/{{dataset}}/vis/{page}/"
        for page in (
            "sds_1_Overview",
            "sds_2_Metrics",
            "sds_3_Curves",
            "sds_4_Similarity",
            "sds_5_Diversity",
            "sds_6_Difference",
            "sds_7_Composition",
            "sds_8_Correlation",
            "sds_9_Time",
        )
    ]
    + [
        f"data/multiple_datasets/vis/{page}/"
        for page in (
            "mds_1_Overview",
            "mds_2_Ranks",
            "mds_3_Clustering",
            "mds_4_Embedding",
            "mds_5_Time",
            "home_Home_tsne",
        )
    ],
    dataset=DATASETS,
)
# Launch the pipeline. WorkflowExecuter is entered with an empty input dict,
# `target` as the "out" outputs, "peptidereactor.yaml" as its third argument
# and CORES cores; inside the context the launcher script is run through
# snakemake's shell().
# NOTE(review): the source lost its indentation. This block is placed after the
# WorkflowSetter block on the assumption that the setter must be closed before
# peptidereactor.smk is used -- confirm against workflow_executer.
with WorkflowExecuter(dict(), dict(out=target), "peptidereactor.yaml", cores=CORES) as e:
    main_cmd = "./peptidereactor/run_pipeline -s peptidereactor.smk --configfile peptidereactor.yaml"
    # Extra command-line arguments are forwarded to the launcher. Each one is
    # shell-quoted so that an argument containing spaces or shell
    # metacharacters arrives as a single, literal argument instead of being
    # re-split or interpreted by the shell.
    forwarded_args = " ".join(shlex.quote(arg) for arg in sys.argv[1:])
    options = f" --cores {CORES} --keep-going {forwarded_args}"
    shell(main_cmd + options)