-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSPA_glycosylation_model.py
142 lines (128 loc) · 6.68 KB
/
SPA_glycosylation_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import SPA
from pandas import read_csv
import numpy as np
import os
def Bydlinski_setup(exp_name):
    """
    Load the Bydlinski train/test tables for one experiment and enter its results folder.

    Parameters
    ----------
    exp_name : str
        Prefix of the y-data files ('<exp_name>_training-y.csv' and
        '<exp_name>_test-y.csv') and of the results folder '<exp_name>_results'.

    Returns
    -------
    tuple
        (glyco_labels, X_train, y_train, X_test, y_test), where glyco_labels is the
        list of column names of the training-y table and the other four entries are
        the raw value arrays of the corresponding .csv files.

    Raises
    ------
    FileExistsError
        If '<exp_name>_results' already exists in the current working directory.

    Notes
    -----
    All paths are relative to the current working directory, which is changed to
    the freshly created results folder before returning.
    """
    # Read every table first; the first column of each .csv is used as the index
    features_train = read_csv('../datasets/Training-X.csv', index_col = 0)
    targets_train = read_csv(f'../datasets/{exp_name}_training-y.csv', index_col = 0)
    features_test = read_csv('../datasets/Test-X.csv', index_col = 0)
    targets_test = read_csv(f'../datasets/{exp_name}_test-y.csv', index_col = 0)

    # Create the per-experiment output folder and work from inside it
    results_dir = f'{exp_name}_results'
    os.mkdir(results_dir)
    os.chdir(results_dir)

    return (
        targets_train.columns.to_list(),  # one label per y column
        features_train.values,
        targets_train.values,
        features_test.values,
        targets_test.values,
    )
def Kotidis_setup():
    """
    Load the NN_modelNSD (Kotidis) train/test tables and enter the results folder.

    Returns
    -------
    tuple
        (glyco_labels, X_train, y_train, X_test, y_test), where glyco_labels is the
        list of column names of the training-y table and the other four entries are
        the raw value arrays of the corresponding .csv files.

    Raises
    ------
    FileExistsError
        If 'NN_modelNSD_results' already exists in the current working directory.

    Notes
    -----
    All paths are relative to the current working directory, which is changed to
    the freshly created 'NN_modelNSD_results' folder before returning.
    """
    # Training tables (the y table stays a DataFrame until its header has been read)
    X_train = read_csv('../datasets/NN_modelNSD_training-X.csv', index_col = 0).values
    y_frame = read_csv('../datasets/NN_modelNSD_training-y.csv', index_col = 0)
    glyco_labels, y_train = y_frame.columns.to_list(), y_frame.values

    # Test tables
    X_test = read_csv('../datasets/NN_modelNSD_test-X.csv', index_col = 0).values
    y_test = read_csv('../datasets/NN_modelNSD_test-y.csv', index_col = 0).values

    # Create the output folder and make it the working directory
    os.mkdir('NN_modelNSD_results')
    os.chdir('NN_modelNSD_results')

    return glyco_labels, X_train, y_train, X_test, y_test
def run_SPA(exp_name, glyco_labels, X_train, y_train, X_test, y_test, nested = False):
    """
    Run SPA once per glycan, keeping each run's output files under a glycan-specific name.

    SPA can test only one y variable at a time, so for every label the X data is
    joined with the matching y column, written to temporary .csv files in the
    current working directory, and passed to SPA.main_SPA().

    Parameters
    ----------
    exp_name : str
        Dataset name. Used in the progress message, in the renamed output files,
        and to choose the group-names file ('NN_modelNSD' uses its own file).
    glyco_labels : sequence of str
        One label per column of y_train / y_test.
    X_train, X_test : 2-D arrays
        Predictor data; concatenated column-wise with the current y column.
    y_train, y_test : 2-D arrays
        Response data, indexed as y[:, i] for the i-th label.
    nested : bool, optional
        Forwarded to SPA.main_SPA() as nested_cv.

    Notes
    -----
    Every entry in the current working directory whose name starts with
    'SPA_results' is renamed to '<exp_name>_<glycan>_results<ext>' after each run.
    The group-names paths are relative ('../../...'), so the function expects to
    be called from two folder levels below the folder holding those files.
    """
    # The group-names file depends only on the dataset, so select it once
    if exp_name == 'NN_modelNSD':
        group_file = '../../group_names_NN.txt'
    else:
        group_file = '../../group_names.txt'

    for exp_idx, current_glyco in enumerate(glyco_labels):
        print(f'Beginning glyco run {exp_idx+1} out of {len(glyco_labels)} | {exp_name}-{current_glyco}')

        # Save the data as X data + one y variable (as the last column) for SPA
        concat_train = np.atleast_2d(y_train[:, exp_idx]).T # The current y variable
        train = np.concatenate((X_train, concat_train), axis = 1)
        np.savetxt('Current_training.csv', train, delimiter = ',')
        concat_test = np.atleast_2d(y_test[:, exp_idx]).T # The current y variable
        test = np.concatenate((X_test, concat_test), axis = 1)
        np.savetxt('Current_testing.csv', test, delimiter = ',')

        # Run SPA
        _ = SPA.main_SPA('Current_training.csv', test_data = 'Current_testing.csv', model_name = ['EN', 'RR', 'PLS'],
                         cv_method = 'groupkfold', group_name = group_file, K_fold = 4, nested_cv = nested)

        # Take a snapshot of the matching names BEFORE renaming anything: changing a
        # directory while its scandir() iterator is still live gives unspecified
        # results, and the context manager makes sure the iterator is closed.
        with os.scandir() as entries:
            spa_outputs = [entry.name for entry in entries if entry.name.startswith('SPA_results')]
        for old_name in spa_outputs:
            _, ext = os.path.splitext(old_name)
            os.rename(old_name, f'{exp_name}_{current_glyco}_results{ext}')

        # Removing temp files
        os.remove('Current_training.csv')
        os.remove('Current_testing.csv')
def _parse_json_number(line):
    """Return the number after the first ':' in *line*, ignoring whitespace and a trailing comma."""
    return float(line.partition(':')[2].strip().rstrip(','))


def get_mean_and_std():
    """
    Collect the train/test means and stdevs from every .json file in the current folder.

    Each .json file is scanned line by line for the keys 'train_nontrans_mean',
    'train_nontrans_std', 'test_nontrans_mean' and 'test_nontrans_std'; the number
    after the colon on a matching line is recorded. The results are written to
    'train_mean_std.csv' and 'test_mean_std.csv' (first row: means, second row:
    stdevs, one column per file), with the glycan names as a header line.

    The glycan name is the third '_'-separated token of the file name.
    NOTE(review): this assumes file names shaped like '<a>_<b>_<glycan>_results.json'
    and one "key": value pair per line in the .json files - confirm against the
    files SPA actually writes.

    If no means are found (e.g. the folder has no .json files), a message is
    printed and no .csv file is written.
    """
    # Checked in this order for every line; the first key found on a line wins
    stat_keys = ('train_nontrans_mean', 'train_nontrans_std', 'test_nontrans_mean', 'test_nontrans_std')
    collected = {key: [] for key in stat_keys}
    glyco_names = []

    # Collecting the means and stdevs
    with os.scandir() as entries:
        json_names = [entry.name for entry in entries if entry.name.endswith('.json')]
    for json_name in json_names:
        glyco_names.append(json_name.split('_')[2])
        with open(json_name) as f:
            for line in f:
                for key in stat_keys:
                    if key in line:
                        # Parsed by content rather than by fixed offsets, so the value is read
                        # correctly whether or not it is followed by ',' or a newline
                        collected[key].append(_parse_json_number(line))
                        break

    mylen = len(collected['train_nontrans_mean']) # Number of columns in the output
    if mylen == 0:
        # reshape(-1, 0) would raise on empty data, so stop before writing anything
        print('No mean/std values were found in the .json files of this folder; no .csv files were written.')
        return

    # Saving to .csv files
    # The trailing comma is kept so the header matches previously generated files
    # (np.savetxt also prefixes the header line with its default comment marker '# ')
    header = ','.join(glyco_names) + ','
    train = np.concatenate((collected['train_nontrans_mean'], collected['train_nontrans_std'])).reshape(-1, mylen)
    np.savetxt('train_mean_std.csv', train, delimiter = ',', fmt = '%.3g', header = header)
    test = np.concatenate((collected['test_nontrans_mean'], collected['test_nontrans_std'])).reshape(-1, mylen)
    np.savetxt('test_mean_std.csv', test, delimiter = ',', fmt = '%.3g', header = header)
def parse_bool_flag(text):
    """
    Convert a command-line token to a bool.

    argparse's type=bool is a trap: bool('False') is True because any non-empty
    string is truthy. This converter accepts the usual spellings of true/false
    (case-insensitive) and raises ValueError otherwise, which argparse reports
    as an invalid argument value.
    """
    if isinstance(text, bool):
        return text
    lowered = str(text).strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0'):
        return False
    raise ValueError(f'Expected True or False, got {text!r}')


if __name__ == '__main__':
    # Input setup (to allow passing --nested flag)
    import argparse
    parser = argparse.ArgumentParser(description = 'Runs SPA on the Bydlinski and Kotidis datasets to predict the N-glycosylation glycan distribution')
    # "--nested" alone means True (const); "--nested True/False" is parsed explicitly
    parser.add_argument('--nested', metavar='True | [False]', type=parse_bool_flag, nargs='?', default = False, const = True, choices = {True, False},
                        help = 'Set this flag to run nested cross validation (instead of regular cross validation)')
    nested = parser.parse_args().nested

    # Folder setup (for organization)
    folder_name = 'SPA_results_nested' if nested else 'SPA_results'
    os.makedirs(folder_name, exist_ok = True)
    os.chdir(folder_name)

    # Bydlinski data
    for exp_name in ('Asn_24', 'Asn_38', 'Asn_83', 'Asn_110', 'Asn_168', 'Asn_538', 'Asn_745', 'Fc_DAO', 'Fc_EPO'):
        # Only the setup call is guarded: it raises FileExistsError before changing
        # directory, so skipping here leaves the working directory untouched
        try:
            glyco_labels, X_train, y_train, X_test, y_test = Bydlinski_setup(exp_name)
        except FileExistsError:
            print(f'The folder {exp_name}_results already exists. Proceeding to the next site...')
            continue
        run_SPA(exp_name, glyco_labels, X_train, y_train, X_test, y_test, nested)
        get_mean_and_std()
        os.chdir('..') # Leave the per-site results folder entered by the setup call

    # Kotidis data
    try:
        glyco_labels, X_train, y_train, X_test, y_test = Kotidis_setup()
    except FileExistsError:
        print(f'The folder NN_modelNSD_results already exists.')
    else:
        run_SPA('NN_modelNSD', glyco_labels, X_train, y_train, X_test, y_test, nested)
        get_mean_and_std()
        os.chdir('..') # Leave the results folder entered by the setup call