Skip to content

Commit

Permalink
Cleaning up Molecule.from_pdb, improving performance, extending tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
j-wags committed Oct 26, 2021
1 parent 5fd1a6f commit 354cc76
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 97 deletions.
16 changes: 16 additions & 0 deletions openff/toolkit/tests/test_molecule.py
Original file line number Diff line number Diff line change
Expand Up @@ -4009,6 +4009,8 @@ def test_molecule_from_pdb_mainchain_hid_dipeptide(self):
offmol = Molecule.from_pdb(get_data_file_path('proteins/MainChain_HID.pdb'))
assert offmol.n_atoms == 29
assert offmol.total_charge == 0 * unit.elementary_charge
assert sum([1 for atom in offmol.atoms if atom.is_aromatic]) == 5
assert sum([1 for bond in offmol.bonds if bond.is_aromatic]) == 5
expected_mol = Molecule.from_smiles('CC(=O)N[C@H](CC1NC=NC=1)C(=O)NC')
assert offmol.is_isomorphic_with(expected_mol,
atom_stereochemistry_matching=False,
Expand All @@ -4018,6 +4020,8 @@ def test_molecule_from_pdb_mainchain_hie_dipeptide(self):
offmol = Molecule.from_pdb(get_data_file_path('proteins/MainChain_HIE.pdb'))
assert offmol.n_atoms == 29
assert offmol.total_charge == 0 * unit.elementary_charge
assert sum([1 for atom in offmol.atoms if atom.is_aromatic]) == 5
assert sum([1 for bond in offmol.bonds if bond.is_aromatic]) == 5
expected_mol = Molecule.from_smiles('CC(=O)N[C@H](CC1N=C[NH]C=1)C(=O)NC')
assert offmol.is_isomorphic_with(expected_mol,
atom_stereochemistry_matching=False,
Expand All @@ -4027,6 +4031,9 @@ def test_molecule_from_pdb_mainchain_hip_dipeptide(self):
offmol = Molecule.from_pdb(get_data_file_path('proteins/MainChain_HIP.pdb'))
assert offmol.n_atoms == 30
assert offmol.total_charge == 1 * unit.elementary_charge
assert sum([1 for atom in offmol.atoms if atom.is_aromatic]) == 5
assert sum([1 for bond in offmol.bonds if bond.is_aromatic]) == 5

expected_mol = Molecule.from_smiles('CC(=O)N[C@H](CC1[N+H]=CNC=1)C(=O)NC')
assert offmol.is_isomorphic_with(expected_mol,
atom_stereochemistry_matching=False,
Expand All @@ -4036,6 +4043,9 @@ def test_molecule_from_pdb_mainchain_trp_dipeptide(self):
offmol = Molecule.from_pdb(get_data_file_path('proteins/MainChain_TRP.pdb'))
assert offmol.n_atoms == 36
assert offmol.total_charge == 0 * unit.elementary_charge
assert sum([1 for atom in offmol.atoms if atom.is_aromatic]) == 9
assert sum([1 for bond in offmol.bonds if bond.is_aromatic]) == 10

expected_mol = Molecule.from_smiles('CC(=O)N[C@H](CC1C2=CC=CC=C2NC=1)C(=O)NC')
assert offmol.is_isomorphic_with(expected_mol,
atom_stereochemistry_matching=False,
Expand All @@ -4046,6 +4056,9 @@ def test_molecule_from_pdb_cterminal_trp_dipeptide(self):
offmol = Molecule.from_pdb(get_data_file_path('proteins/CTerminal_TRP.pdb'))
assert offmol.n_atoms == 31
assert offmol.total_charge == -1 * unit.elementary_charge
assert sum([1 for atom in offmol.atoms if atom.is_aromatic]) == 9
assert sum([1 for bond in offmol.bonds if bond.is_aromatic]) == 10

expected_mol = Molecule.from_smiles('CC(=O)N[C@H](CC1C2=CC=CC=C2NC=1)C(=O)[O-]')
assert offmol.is_isomorphic_with(expected_mol,
atom_stereochemistry_matching=False,
Expand All @@ -4056,6 +4069,9 @@ def test_molecule_from_pdb_nterminal_trp_dipeptide(self):
offmol = Molecule.from_pdb(get_data_file_path('proteins/NTerminal_TRP.pdb'))
assert offmol.n_atoms == 32
assert offmol.total_charge == 1 * unit.elementary_charge
assert sum([1 for atom in offmol.atoms if atom.is_aromatic]) == 9
assert sum([1 for bond in offmol.bonds if bond.is_aromatic]) == 10

expected_mol = Molecule.from_smiles('[N+H3][C@H](CC1C2=CC=CC=C2NC=1)C(=O)NC')
assert offmol.is_isomorphic_with(expected_mol,
atom_stereochemistry_matching=False,
Expand Down
160 changes: 63 additions & 97 deletions openff/toolkit/topology/molecule.py
Original file line number Diff line number Diff line change
Expand Up @@ -5015,7 +5015,7 @@ def from_pdb(cls, file_path):
import networkx as nx
from networkx.algorithms import isomorphism

def rdmol_to_networkx(rdmol):
def _rdmol_to_networkx(rdmol, res_name):
_bondtypes = {
# 0: Chem.BondType.AROMATIC,
Chem.BondType.SINGLE: 1,
Expand All @@ -5027,10 +5027,26 @@ def rdmol_to_networkx(rdmol):
Chem.BondType.HEXTUPLE:6 ,
}
rdmol_G = nx.Graph()
n_hydrogens = [0] * rdmol.GetNumAtoms()
for atom in rdmol.GetAtoms():
atomic_number = atom.GetAtomicNum()
# Assign sequential negative numbers as atomic numbers for hydrogens attached to the same heavy atom.
# We do the same to hydrogens in the protein graph. This makes it so we
# don't have to deal with redundant self-symmetric matches.
if atomic_number == 1:
heavy_atom_idx = atom.GetNeighbors()[0].GetIdx()
n_hydrogens[heavy_atom_idx] += 1
atomic_number = -1 * n_hydrogens[heavy_atom_idx]

rdmol_G.add_node(
atom.GetIdx(), atomic_number=atom.GetAtomicNum(), formal_charge=atom.GetFormalCharge()
atom.GetIdx(),
atomic_number=atomic_number,
formal_charge=atom.GetFormalCharge(),
)
# These substructures (and only these substructures) should be able to overlap previous matches.
# They handle bonds between substructures.
if res_name in ['PEPTIDE_BOND', 'DISULFIDE']:
rdmol_G.nodes[atom.GetIdx()]['already_matched'] = True
for bond in rdmol.GetBonds():
bond_type = bond.GetBondType()

Expand All @@ -5049,98 +5065,94 @@ def rdmol_to_networkx(rdmol):
)
return rdmol_G

def protein_from_openmm(substructure_library, openmm_topology):
def _openmm_topology_to_networkx(substructure_library, openmm_topology):
"""
Construct an OpenFF Topology object from an OpenMM Topology object.
Parameters
----------
substructure_library : dict{str:list[str, list[str]]}
A dictionary of substructures. substructure_library[aa_name] = list[tagged SMARTS, list[atom_names]]
openmm_topology : simtk.openmm.app.Topology
An OpenMM Topology object
unique_molecules : iterable of objects that can be used to construct unique Molecule objects
All unique molecules must be provided, in any order, though multiple copies of each molecule are allowed.
The atomic elements and bond connectivity will be used to match the reference molecules
to molecule graphs appearing in the OpenMM ``Topology``. If bond orders are present in the
OpenMM ``Topology``, these will be used in matching as well.
Returns
-------
topology : openff.toolkit.topology.Topology
An OpenFF Topology object
omm_topology_G : networkx graph
A networkX graph representation of the openmm topology with chemical information added from the
substructure dictionary. Atoms are nodes and bonds are edges.
Nodes (atoms) have attributes for `atomic_number` (int) and `formal_charge` (int).
Edges (bonds) have attributes for `bond_order` (Chem.rdchem.BondType).
Any edges that are not assgined a bond order will have the value Chem.rdchem.BondType.UNSPECIFIED
and should be considered an error.
"""
import networkx as nx

from openff.toolkit.topology.molecule import Molecule

# Check to see if the openMM system has defined bond orders, by looping over all Bonds in the Topology.
# omm_has_bond_orders = True
# for omm_bond in openmm_topology.bonds():
# if omm_bond.order is None:
# omm_has_bond_orders = False

# if unique_molecules is None:
# raise MissingUniqueMoleculesError(
# "Topology.from_openmm requires a list of Molecule objects "
# "passed as unique_molecules, but None was passed."
# )

# Convert all unique mols to graphs
# topology = cls()

# Convert all openMM mols to graphs

omm_topology_G = nx.Graph()
for atom in openmm_topology.atoms():
omm_topology_G.add_node(
atom.index, atomic_number=atom.element.atomic_number, formal_charge=0.
atom.index,
atomic_number=atom.element.atomic_number,
formal_charge=0.
)

n_hydrogens = [0] * openmm_topology.getNumAtoms()
for bond in openmm_topology.bonds():
omm_topology_G.add_edge(
bond.atom1.index, bond.atom2.index, bond_order=Chem.rdchem.BondType.UNSPECIFIED # bond.order
)
# Assign sequential negative numbers as atomic numbers for hydrogens attached to the same heavy atom.
# We do the same to the substructure templates that are used for matching. This makes it so we
# don't deal with redundant self-symmetric matches.
if bond.atom1.element.atomic_number == 1:
h_index = bond.atom1.index
heavy_atom_index = bond.atom2.index
n_hydrogens[heavy_atom_index] += 1
omm_topology_G.nodes[h_index]['atomic_number'] = -1 * n_hydrogens[heavy_atom_index]
if bond.atom2.element.atomic_number == 1:
h_index = bond.atom2.index
heavy_atom_index = bond.atom1.index
n_hydrogens[heavy_atom_index] += 1
omm_topology_G.nodes[h_index]['atomic_number'] = -1 * n_hydrogens[heavy_atom_index]

# Try matching this substructure to the whole molecule graph
node_match = isomorphism.categorical_node_match('atomic_number', -1)
#node_match = isomorphism.categorical_node_match(['atomic_number', 'residue_name'], [-1, 'UNK'])
node_match = isomorphism.categorical_node_match(['atomic_number', 'already_matched'], [-100, False])

all_rdmol_graphs = []
already_assigned_nodes = set()
already_assigned_edges = set()

non_isomorphic_count = 0
for res_name in substructure_dictionary:
# TODO: Match peptide bonds in a way that they won't mis-label ring-closing C on PRO sidechain as CA
for res_name in substructure_library:
# TODO: This is a hack for the moment since we don't have a more sophisticated way to resolve clashes
# so it just does the biggest substructures first
sorted_substructure_smarts = sorted(substructure_dictionary[res_name], key=len, reverse=True)
sorted_substructure_smarts = sorted(substructure_library[res_name], key=len, reverse=True)
non_isomorphic_flag = True
for substructure_smarts in sorted_substructure_smarts:
rdmol = Chem.MolFromSmarts(substructure_smarts)
rdmol_G = rdmol_to_networkx(rdmol)
# all_rdmol_graphs.append(rdmol_G)
rdmol_G = _rdmol_to_networkx(rdmol, res_name)
GM = isomorphism.GraphMatcher(omm_topology_G, rdmol_G, node_match=node_match)
#GM = isomorphism.ISMAGS(omm_topology_G, rdmol_G, node_match=node_match)
if GM.subgraph_is_isomorphic():
print(res_name)
print(substructure_smarts)

# print(GM)
# print(dir(GM))
#for omm_idx_2_rdk_idx in GM.subgraph_isomorphisms_iter(symmetry=True):
for omm_idx_2_rdk_idx in GM.subgraph_isomorphisms_iter():
assert len(omm_idx_2_rdk_idx) == rdmol.GetNumAtoms()
# print(omm_idx_2_rdk_idx)
for omm_idx, rdk_idx in omm_idx_2_rdk_idx.items():
if omm_idx in already_assigned_nodes:
continue
already_assigned_nodes.add(omm_idx)
# Negative atomic numbers are really hydrogen, so we reassign them to be atomic number
# 1 once they've been matched
if omm_topology_G.nodes[omm_idx]['atomic_number'] < 0:
omm_topology_G.nodes[omm_idx]['atomic_number'] = 1
omm_topology_G.nodes[omm_idx]['formal_charge'] = rdmol_G.nodes[rdk_idx]['formal_charge']

omm_topology_G.nodes[omm_idx]['already_matched'] = True
rdk_idx_2_omm_idx = dict([(j, i) for i, j in omm_idx_2_rdk_idx.items()])
for edge in rdmol_G.edges:
# print(edge)

# print(dir(edge))
# print(rdmol_G.get_edge_data(*edge))
# 1/0
omm_edge_idx = rdk_idx_2_omm_idx[edge[0]], rdk_idx_2_omm_idx[edge[1]]
if omm_edge_idx in already_assigned_edges:
continue
Expand All @@ -5164,10 +5176,9 @@ def protein_from_openmm(substructure_library, openmm_topology):
with open(substructure_file_path, 'r') as subfile:
substructure_dictionary = json.load(subfile)
from simtk.openmm.app import PDBFile
from openff.toolkit.utils import get_data_file_path

pdb = PDBFile(file_path)
omm_topology_G = protein_from_openmm(substructure_dictionary, pdb.topology)
omm_topology_G = _openmm_topology_to_networkx(substructure_dictionary, pdb.topology)



Expand All @@ -5176,72 +5187,28 @@ def protein_from_openmm(substructure_library, openmm_topology):

for node_idx, node_data in omm_topology_G.nodes.items():
print(node_idx, node_data)
# node_data = omm_topology_G.get_node_data(node)
# rdatom = Chem.Atom(node_data['atomic_number'])
formal_charge = int(node_data['formal_charge'])
print(f'Formal charge: {formal_charge}')
offmol.add_atom(node_data['atomic_number'],
int(node_data['formal_charge']),
False, #node_data['is_aromatic']=="Y"
False,
)
# rdatom.SetFormalCharge(formal_charge # * unit.elementary_charge
# # atom.formal_charge.value_in_unit(unit.elementary_charge)
# )
# rdatom.SetIsAromatic(atom.is_aromatic)
# rdatom.SetProp("_Name", atom.name)

# TODO: Pull in stereochemistry

## Stereo handling code moved to after bonds are added
# if atom.stereochemistry == "S":
# rdatom.SetChiralTag(Chem.CHI_TETRAHEDRAL_CW)
# elif atom.stereochemistry == "R":
# rdatom.SetChiralTag(Chem.CHI_TETRAHEDRAL_CCW)

# rd_index = rdmol.AddAtom(rdatom)
# print(rd_index)

# Let's make sure al the atom indices in the two molecules
# are the same, otherwise we need to create an atom map.
# assert index == atom.molecule_atom_index
# assert index == node_idx

for edge, edge_data in omm_topology_G.edges.items():
print(edge, edge_data)
# atom_indices = (
# edge[0],
# edge[1],
# )
offmol.add_bond(edge[0],
edge[1],
edge_data['bond_order'],
False)
# rdmol.AddBond(*edge)
# rdbond = rdmol.GetBondBetweenAtoms(*edge)
# Assign bond type, which is based on order unless it is aromatic
# if bond.is_aromatic:
# rdbond.SetBondType(_bondtypes[1.5])
# rdbond.SetIsAromatic(True)
# else:
# rdbond.SetBondType(_bondtypes[edge_data['bond_order']])
# rdbond.SetBondType(edge_data['bond_order'])
# rdbond.SetIsAromatic(False)

# Get bond information for N 604
# n_604 = rdmol.GetAtomWithIdx(604)
# for bond in n_604.GetBonds():
# print(bond.GetBondType())

print(f"Number of atoms before sanitization: {len(rdmol.GetAtoms())}")

# TODO: Remove this hack for the uncapped C terminal once we've updated the substructure dict
#rdmol.GetBondBetweenAtoms(rdmol.GetNumAtoms() - 1, rdmol.GetNumAtoms() - 3).SetBondType(Chem.BondType.SINGLE)
#rdmol.GetAtomWithIdx(rdmol.GetNumAtoms() - 1).SetFormalCharge(-1)
# rdmol.GetBondBetweenAtoms(rdmol.GetNumAtoms() - 1, rdmol.GetNumAtoms() - 3).SetBondType(Chem.BondType.SINGLE)
#offmol.get_bond_between(offmol.n_atoms-1, offmol.n_atoms-3).bond_order = 1
# rdmol.GetAtomWithIdx(rdmol.GetNumAtoms() - 1).SetFormalCharge(-1)
#offmol.atoms[offmol.n_atoms-1].formal_charge = -1
# TODO: Does openff mol has some sanitization API point?
print(f"Number of atoms before sanitization: {len(rdmol.GetAtoms())}")
# TODO: Pull in coordinates and assign stereochemistry
# offmol.add_conformer()
# TODO: Ensure that this assigns aromaticity
rdmol = offmol.to_rdkit()
Chem.SanitizeMol(
rdmol,
Expand All @@ -5250,7 +5217,6 @@ def protein_from_openmm(substructure_library, openmm_topology):

print(f"Number of atoms after sanitization: {len(rdmol.GetAtoms())}")

# offmol = Molecule.from_rdkit(rdmol, allow_undefined_stereo=True)
offmol = cls.from_rdkit(rdmol, allow_undefined_stereo=True, hydrogens_are_explicit=True)
print(f"OFFMol number of atoms: {offmol.n_atoms}")
return offmol
Expand Down

0 comments on commit 354cc76

Please # to comment.