-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path0_fragmentation.py
112 lines (96 loc) · 5.08 KB
/
0_fragmentation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import subprocess
import itertools
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
import rdkit.Chem.AllChem as Chem
from rdkit.Chem import AllChem
import matplotlib.pyplot as plt
# data cleaning
df_info = pd.read_csv('data/hMOF_CO2_info.csv')
df_info = df_info.dropna() # drop entries containing 'NaN'
df_info = df_info[df_info.CO2_wc_001>0] # only keep entries with positive CO2 working capacity
df_info = df_info[~df_info.MOFid.str.contains('ERROR')] # drop entries with error
df_info = df_info[~df_info.MOFid.str.contains('NA')] # drop entries with NA
# get node and linker information
metal_eles = ['Zn', 'Cu', 'Mn', 'Zr', 'Co', 'Ni', 'Fe', 'Cd', 'Pb', 'Al', 'Mg', 'V',
'Tb', 'Eu', 'Sm', 'Tm', 'Gd', 'Nd', 'Dy', 'La', 'Ba', 'Ga', 'In',
'Ti', 'Be', 'Ce', 'Li', 'Pd', 'Na', 'Er', 'Ho', 'Yb', 'Ag', 'Pr',
'Cs', 'Mo', 'Lu', 'Ca', 'Pt', 'Ge', 'Sc', 'Hf', 'Cr', 'Bi', 'Rh',
'Sn', 'Ir', 'Nb', 'Ru', 'Th', 'As', 'Sr']
# get a list of metal nodes & create a new column named "metal_nodes"
metal_nodes = []
organic_linkers = []
for i,mofid in tqdm(enumerate(df_info.MOFid)):
sbus = mofid.split()[0].split('.')
metal_nodes.append([c for c in sbus if any(e in c for e in metal_eles)][0])
organic_linkers.append([c for c in sbus if any(e in c for e in metal_eles)==False])
df_info['metal_node'] = metal_nodes
df_info['organic_linker'] = organic_linkers
# get most occuring nodes
unique_nodes = [n for n in list(df_info['metal_node'].unique()) if len(n)<=30] # node smiles should be shorter then 30 strings
df_info = df_info[df_info['metal_node'].isin(unique_nodes)] # filter df_info based on unique_nodes
freq = [df_info['metal_node'].value_counts()[value] for value in list(df_info.metal_node.unique())] # get frequency of unique nodes
df_freq = pd.DataFrame({'node':list(df_info.metal_node.unique()),'freq':freq})
print('node occurance:')
print(df_freq)
unique_node_select = list(df_freq[df_freq.freq>=5000].node) # select occuring nodes
df_info_select = df_info[df_info['metal_node'].isin(unique_node_select)] # select df_info with node only in list(unique_node_select)
# create necessary folders
os.makedirs(f'data/conformers',exist_ok=True)
os.makedirs(f'data/data_by_node',exist_ok=True)
os.makedirs(f'data/data_three_linkers',exist_ok=True)
os.makedirs(f'data/data_high_wc',exist_ok=True)
os.makedirs(f'data/data_high_wc_three_linkers',exist_ok=True)
os.makedirs(f'data/fragments_smi',exist_ok=True)
# output each node to a separate csv files
for n in unique_node_select:
n_name = n.replace('[','').replace(']','').replace('(','').replace(')','')
df_info_select_node = df_info[df_info.metal_node == n]
df_info_select_node.to_csv(f'data/data_by_node/{n_name}.csv',index=False)
# load data
for node in unique_node_select:
node_name = node.replace('[','').replace(']','').replace('(','').replace(')','')
print(f'Now on node {node_name} ... ')
input_data_path = f'data/data_by_node/{node_name}.csv'
three_linkers_data_path = f'data/data_three_linkers/{node_name}.csv'
high_wc_data_path = f'data/data_high_wc/{node_name}.csv'
high_wc_3_linkers_data_path = f'data/data_high_wc_three_linkers/{node_name}.csv'
df = pd.read_csv(input_data_path)
# MOFs with three linkers
len_linkers = [len(eval(df['organic_linker'].iloc[i])) for i in range(len(df['organic_linker']))]
df['len_linkers'] = len_linkers
df_three_linkers = df[df.len_linkers==3]
df_three_linkers.to_csv(three_linkers_data_path,index=False)
# MOFs with high working capactiy at (wc > 2mmol/g @ 0.1 bar)
df_high_wc = df[df['CO2_wc_01'] >=2]
df_high_wc.to_csv(high_wc_data_path,index=False)
# MOFs with high working capacity and three linkers
len_linkers = [len(eval(df_high_wc['organic_linker'].iloc[i])) for i in range(len(df_high_wc['organic_linker']))]
df_high_wc['len_linkers'] = len_linkers
df_high_wc_3_linkers = df_high_wc[df_high_wc.len_linkers==3]
df_high_wc_3_linkers.to_csv(high_wc_3_linkers_data_path,index=False)
# get list of SMILES for all linkers
list_smiles = [eval(i) for i in df_high_wc['organic_linker']]
all_smiles = list(itertools.chain(*list_smiles))
print(f'number of smiles: {len(all_smiles)}')
all_smiles_unique = list(pd.Series(all_smiles).unique())
print(f'number of unique_smiles: {len(all_smiles_unique)}')
# output to sdf
print('Outputting conformers to sdf ... ')
conformer_sdf_path = f'data/conformers/conformers_{node_name}.sdf'
writer = Chem.SDWriter(conformer_sdf_path)
for smile in tqdm(all_smiles_unique):
try:
mol = Chem.AddHs(Chem.MolFromSmiles(smile))
conformers = AllChem.EmbedMultipleConfs(mol, numConfs=1)
conformer = mol.GetConformer(0)
for cid in range(mol.GetNumConformers()):
writer.write(mol, confId=cid)
except:
pass
# generate SMILES
print('Generating SMILES ... ')
subprocess.run(f'python utils/prepare_data_from_sdf.py --sdf_path data/conformers/conformers_{node_name}.sdf --output_path data/fragments_smi/frag_{node_name}.txt --verbose',shell=True)