Cleaning up Molecule.from_pdb, improving performance, extending tests.

openforcefield · Oct 26, 2021 · 354cc76 · 354cc76
1 parent 5fd1a6f
commit 354cc76
Show file tree

Hide file tree

Showing 2 changed files with 79 additions and 97 deletions.
diff --git a/openff/toolkit/tests/test_molecule.py b/openff/toolkit/tests/test_molecule.py
@@ -4009,6 +4009,8 @@ def test_molecule_from_pdb_mainchain_hid_dipeptide(self):
         offmol = Molecule.from_pdb(get_data_file_path('proteins/MainChain_HID.pdb'))
         assert offmol.n_atoms == 29
         assert offmol.total_charge == 0 * unit.elementary_charge
+        assert sum([1 for atom in offmol.atoms if atom.is_aromatic]) == 5
+        assert sum([1 for bond in offmol.bonds if bond.is_aromatic]) == 5
         expected_mol = Molecule.from_smiles('CC(=O)N[C@H](CC1NC=NC=1)C(=O)NC')
         assert offmol.is_isomorphic_with(expected_mol,
                                          atom_stereochemistry_matching=False,
@@ -4018,6 +4020,8 @@ def test_molecule_from_pdb_mainchain_hie_dipeptide(self):
         offmol = Molecule.from_pdb(get_data_file_path('proteins/MainChain_HIE.pdb'))
         assert offmol.n_atoms == 29
         assert offmol.total_charge == 0 * unit.elementary_charge
+        assert sum([1 for atom in offmol.atoms if atom.is_aromatic]) == 5
+        assert sum([1 for bond in offmol.bonds if bond.is_aromatic]) == 5
         expected_mol = Molecule.from_smiles('CC(=O)N[C@H](CC1N=C[NH]C=1)C(=O)NC')
         assert offmol.is_isomorphic_with(expected_mol,
                                          atom_stereochemistry_matching=False,
@@ -4027,6 +4031,9 @@ def test_molecule_from_pdb_mainchain_hip_dipeptide(self):
         offmol = Molecule.from_pdb(get_data_file_path('proteins/MainChain_HIP.pdb'))
         assert offmol.n_atoms == 30
         assert offmol.total_charge == 1 * unit.elementary_charge
+        assert sum([1 for atom in offmol.atoms if atom.is_aromatic]) == 5
+        assert sum([1 for bond in offmol.bonds if bond.is_aromatic]) == 5
+
         expected_mol = Molecule.from_smiles('CC(=O)N[C@H](CC1[N+H]=CNC=1)C(=O)NC')
         assert offmol.is_isomorphic_with(expected_mol,
                                          atom_stereochemistry_matching=False,
@@ -4036,6 +4043,9 @@ def test_molecule_from_pdb_mainchain_trp_dipeptide(self):
         offmol = Molecule.from_pdb(get_data_file_path('proteins/MainChain_TRP.pdb'))
         assert offmol.n_atoms == 36
         assert offmol.total_charge == 0 * unit.elementary_charge
+        assert sum([1 for atom in offmol.atoms if atom.is_aromatic]) == 9
+        assert sum([1 for bond in offmol.bonds if bond.is_aromatic]) == 10
+
         expected_mol = Molecule.from_smiles('CC(=O)N[C@H](CC1C2=CC=CC=C2NC=1)C(=O)NC')
         assert offmol.is_isomorphic_with(expected_mol,
                                          atom_stereochemistry_matching=False,
@@ -4046,6 +4056,9 @@ def test_molecule_from_pdb_cterminal_trp_dipeptide(self):
         offmol = Molecule.from_pdb(get_data_file_path('proteins/CTerminal_TRP.pdb'))
         assert offmol.n_atoms == 31
         assert offmol.total_charge == -1 * unit.elementary_charge
+        assert sum([1 for atom in offmol.atoms if atom.is_aromatic]) == 9
+        assert sum([1 for bond in offmol.bonds if bond.is_aromatic]) == 10
+
         expected_mol = Molecule.from_smiles('CC(=O)N[C@H](CC1C2=CC=CC=C2NC=1)C(=O)[O-]')
         assert offmol.is_isomorphic_with(expected_mol,
                                          atom_stereochemistry_matching=False,
@@ -4056,6 +4069,9 @@ def test_molecule_from_pdb_nterminal_trp_dipeptide(self):
         offmol = Molecule.from_pdb(get_data_file_path('proteins/NTerminal_TRP.pdb'))
         assert offmol.n_atoms == 32
         assert offmol.total_charge == 1 * unit.elementary_charge
+        assert sum([1 for atom in offmol.atoms if atom.is_aromatic]) == 9
+        assert sum([1 for bond in offmol.bonds if bond.is_aromatic]) == 10
+
         expected_mol = Molecule.from_smiles('[N+H3][C@H](CC1C2=CC=CC=C2NC=1)C(=O)NC')
         assert offmol.is_isomorphic_with(expected_mol,
                                          atom_stereochemistry_matching=False,

diff --git a/openff/toolkit/topology/molecule.py b/openff/toolkit/topology/molecule.py
@@ -5015,7 +5015,7 @@ def from_pdb(cls, file_path):
         import networkx as nx
         from networkx.algorithms import isomorphism
 
-        def rdmol_to_networkx(rdmol):
+        def _rdmol_to_networkx(rdmol, res_name):
             _bondtypes = {
                 # 0: Chem.BondType.AROMATIC,
                 Chem.BondType.SINGLE: 1,
@@ -5027,10 +5027,26 @@ def rdmol_to_networkx(rdmol):
                 Chem.BondType.HEXTUPLE:6 ,
             }
             rdmol_G = nx.Graph()
+            n_hydrogens = [0] * rdmol.GetNumAtoms()
             for atom in rdmol.GetAtoms():
+                atomic_number = atom.GetAtomicNum()
+                # Assign sequential negative numbers as atomic numbers for hydrogens attached to the same heavy atom.
+                # We do the same to hydrogens in the protein graph. This makes it so we
+                # don't have to deal with redundant self-symmetric matches.
+                if atomic_number == 1:
+                    heavy_atom_idx = atom.GetNeighbors()[0].GetIdx()
+                    n_hydrogens[heavy_atom_idx] += 1
+                    atomic_number = -1 * n_hydrogens[heavy_atom_idx]
+
                 rdmol_G.add_node(
-                    atom.GetIdx(), atomic_number=atom.GetAtomicNum(), formal_charge=atom.GetFormalCharge()
+                    atom.GetIdx(),
+                    atomic_number=atomic_number,
+                    formal_charge=atom.GetFormalCharge(),
                 )
+                # These substructures (and only these substructures) should be able to overlap previous matches.
+                # They handle bonds between substructures.
+                if res_name in ['PEPTIDE_BOND', 'DISULFIDE']:
+                    rdmol_G.nodes[atom.GetIdx()]['already_matched'] = True
             for bond in rdmol.GetBonds():
                 bond_type = bond.GetBondType()
 
@@ -5049,98 +5065,94 @@ def rdmol_to_networkx(rdmol):
                 )
             return rdmol_G
 
-        def protein_from_openmm(substructure_library, openmm_topology):
+        def _openmm_topology_to_networkx(substructure_library, openmm_topology):
             """
             Construct an OpenFF Topology object from an OpenMM Topology object.
 
             Parameters
             ----------
+            substructure_library : dict{str:list[str, list[str]]}
+                A dictionary of substructures. substructure_library[aa_name] = list[tagged SMARTS, list[atom_names]]
             openmm_topology : simtk.openmm.app.Topology
                 An OpenMM Topology object
-            unique_molecules : iterable of objects that can be used to construct unique Molecule objects
-                All unique molecules must be provided, in any order, though multiple copies of each molecule are allowed.
-                The atomic elements and bond connectivity will be used to match the reference molecules
-                to molecule graphs appearing in the OpenMM ``Topology``. If bond orders are present in the
-                OpenMM ``Topology``, these will be used in matching as well.
 
             Returns
             -------
-            topology : openff.toolkit.topology.Topology
-                An OpenFF Topology object
+            omm_topology_G : networkx graph
+                A networkX graph representation of the openmm topology with chemical information added from the
+                substructure dictionary. Atoms are nodes and bonds are edges.
+                Nodes (atoms) have attributes for `atomic_number` (int) and `formal_charge` (int).
+                Edges (bonds) have attributes for `bond_order` (Chem.rdchem.BondType).
+                Any edges that are not assgined a bond order will have the value Chem.rdchem.BondType.UNSPECIFIED
+                and should be considered an error.
             """
             import networkx as nx
 
-            from openff.toolkit.topology.molecule import Molecule
-
-            # Check to see if the openMM system has defined bond orders, by looping over all Bonds in the Topology.
-            # omm_has_bond_orders = True
-            # for omm_bond in openmm_topology.bonds():
-            #    if omm_bond.order is None:
-            #        omm_has_bond_orders = False
-
-            # if unique_molecules is None:
-            #    raise MissingUniqueMoleculesError(
-            #        "Topology.from_openmm requires a list of Molecule objects "
-            #        "passed as unique_molecules, but None was passed."
-            #    )
-
-            # Convert all unique mols to graphs
-            # topology = cls()
-
-            # Convert all openMM mols to graphs
-
             omm_topology_G = nx.Graph()
             for atom in openmm_topology.atoms():
                 omm_topology_G.add_node(
-                    atom.index, atomic_number=atom.element.atomic_number, formal_charge=0.
+                    atom.index,
+                    atomic_number=atom.element.atomic_number,
+                    formal_charge=0.
                 )
 
+            n_hydrogens = [0] * openmm_topology.getNumAtoms()
             for bond in openmm_topology.bonds():
                 omm_topology_G.add_edge(
                     bond.atom1.index, bond.atom2.index, bond_order=Chem.rdchem.BondType.UNSPECIFIED  # bond.order
                 )
+                # Assign sequential negative numbers as atomic numbers for hydrogens attached to the same heavy atom.
+                # We do the same to the substructure templates that are used for matching. This makes it so we
+                # don't deal with redundant self-symmetric matches.
+                if bond.atom1.element.atomic_number == 1:
+                    h_index = bond.atom1.index
+                    heavy_atom_index = bond.atom2.index
+                    n_hydrogens[heavy_atom_index] += 1
+                    omm_topology_G.nodes[h_index]['atomic_number'] = -1 * n_hydrogens[heavy_atom_index]
+                if bond.atom2.element.atomic_number == 1:
+                    h_index = bond.atom2.index
+                    heavy_atom_index = bond.atom1.index
+                    n_hydrogens[heavy_atom_index] += 1
+                    omm_topology_G.nodes[h_index]['atomic_number'] = -1 * n_hydrogens[heavy_atom_index]
 
             # Try matching this substructure to the whole molecule graph
-            node_match = isomorphism.categorical_node_match('atomic_number', -1)
+            #node_match = isomorphism.categorical_node_match(['atomic_number', 'residue_name'], [-1, 'UNK'])
+            node_match = isomorphism.categorical_node_match(['atomic_number', 'already_matched'], [-100, False])
 
             all_rdmol_graphs = []
             already_assigned_nodes = set()
             already_assigned_edges = set()
 
             non_isomorphic_count = 0
-            for res_name in substructure_dictionary:
-                # TODO: Match peptide bonds in a way that they won't mis-label ring-closing C on PRO sidechain as CA
+            for res_name in substructure_library:
                 # TODO: This is a hack for the moment since we don't have a more sophisticated way to resolve clashes
                 # so it just does the biggest substructures first
-                sorted_substructure_smarts = sorted(substructure_dictionary[res_name], key=len, reverse=True)
+                sorted_substructure_smarts = sorted(substructure_library[res_name], key=len, reverse=True)
                 non_isomorphic_flag = True
                 for substructure_smarts in sorted_substructure_smarts:
                     rdmol = Chem.MolFromSmarts(substructure_smarts)
-                    rdmol_G = rdmol_to_networkx(rdmol)
-                    # all_rdmol_graphs.append(rdmol_G)
+                    rdmol_G = _rdmol_to_networkx(rdmol, res_name)
                     GM = isomorphism.GraphMatcher(omm_topology_G, rdmol_G, node_match=node_match)
+                    #GM = isomorphism.ISMAGS(omm_topology_G, rdmol_G, node_match=node_match)
                     if GM.subgraph_is_isomorphic():
                         print(res_name)
                         print(substructure_smarts)
 
-                        # print(GM)
-                        # print(dir(GM))
+                        #for omm_idx_2_rdk_idx in GM.subgraph_isomorphisms_iter(symmetry=True):
                         for omm_idx_2_rdk_idx in GM.subgraph_isomorphisms_iter():
                             assert len(omm_idx_2_rdk_idx) == rdmol.GetNumAtoms()
-                            # print(omm_idx_2_rdk_idx)
                             for omm_idx, rdk_idx in omm_idx_2_rdk_idx.items():
                                 if omm_idx in already_assigned_nodes:
                                     continue
                                 already_assigned_nodes.add(omm_idx)
+                                # Negative atomic numbers are really hydrogen, so we reassign them to be atomic number
+                                # 1 once they've been matched
+                                if omm_topology_G.nodes[omm_idx]['atomic_number'] < 0:
+                                    omm_topology_G.nodes[omm_idx]['atomic_number'] = 1
                                 omm_topology_G.nodes[omm_idx]['formal_charge'] = rdmol_G.nodes[rdk_idx]['formal_charge']
-
+                                omm_topology_G.nodes[omm_idx]['already_matched'] = True
                             rdk_idx_2_omm_idx = dict([(j, i) for i, j in omm_idx_2_rdk_idx.items()])
                             for edge in rdmol_G.edges:
-                                # print(edge)
-
-                                # print(dir(edge))
-                                # print(rdmol_G.get_edge_data(*edge))
-                                # 1/0
                                 omm_edge_idx = rdk_idx_2_omm_idx[edge[0]], rdk_idx_2_omm_idx[edge[1]]
                                 if omm_edge_idx in already_assigned_edges:
                                     continue
@@ -5164,10 +5176,9 @@ def protein_from_openmm(substructure_library, openmm_topology):
         with open(substructure_file_path, 'r') as subfile:
             substructure_dictionary = json.load(subfile)
         from simtk.openmm.app import PDBFile
-        from openff.toolkit.utils import get_data_file_path
 
         pdb = PDBFile(file_path)
-        omm_topology_G = protein_from_openmm(substructure_dictionary, pdb.topology)
+        omm_topology_G = _openmm_topology_to_networkx(substructure_dictionary, pdb.topology)
 
 
 
@@ -5176,72 +5187,28 @@ def protein_from_openmm(substructure_library, openmm_topology):
 
         for node_idx, node_data in omm_topology_G.nodes.items():
             print(node_idx, node_data)
-            # node_data = omm_topology_G.get_node_data(node)
-            # rdatom = Chem.Atom(node_data['atomic_number'])
             formal_charge = int(node_data['formal_charge'])
             print(f'Formal charge: {formal_charge}')
             offmol.add_atom(node_data['atomic_number'],
                             int(node_data['formal_charge']),
-                            False, #node_data['is_aromatic']=="Y"
+                            False,
                                 )
-            # rdatom.SetFormalCharge(formal_charge  # * unit.elementary_charge
-            #                        # atom.formal_charge.value_in_unit(unit.elementary_charge)
-            #                        )
-            # rdatom.SetIsAromatic(atom.is_aromatic)
-            # rdatom.SetProp("_Name", atom.name)
 
-            # TODO: Pull in stereochemistry
 
-            ## Stereo handling code moved to after bonds are added
-            # if atom.stereochemistry == "S":
-            #    rdatom.SetChiralTag(Chem.CHI_TETRAHEDRAL_CW)
-            # elif atom.stereochemistry == "R":
-            #    rdatom.SetChiralTag(Chem.CHI_TETRAHEDRAL_CCW)
 
-            # rd_index = rdmol.AddAtom(rdatom)
-            # print(rd_index)
-
-            # Let's make sure al the atom indices in the two molecules
-            # are the same, otherwise we need to create an atom map.
-            # assert index == atom.molecule_atom_index
-            # assert index == node_idx
 
         for edge, edge_data in omm_topology_G.edges.items():
             print(edge, edge_data)
-            # atom_indices = (
-            #    edge[0],
-            #    edge[1],
-            # )
             offmol.add_bond(edge[0],
                             edge[1],
                             edge_data['bond_order'],
                             False)
-            # rdmol.AddBond(*edge)
-            # rdbond = rdmol.GetBondBetweenAtoms(*edge)
-            # Assign bond type, which is based on order unless it is aromatic
-            # if bond.is_aromatic:
-            #    rdbond.SetBondType(_bondtypes[1.5])
-            #    rdbond.SetIsAromatic(True)
-            # else:
-            # rdbond.SetBondType(_bondtypes[edge_data['bond_order']])
-            # rdbond.SetBondType(edge_data['bond_order'])
-            # rdbond.SetIsAromatic(False)
-
-        # Get bond information for N 604
-        # n_604 = rdmol.GetAtomWithIdx(604)
-        # for bond in n_604.GetBonds():
-        #     print(bond.GetBondType())
 
-        print(f"Number of atoms before sanitization: {len(rdmol.GetAtoms())}")
 
-        # TODO: Remove this hack for the uncapped C terminal once we've updated the substructure dict
-        #rdmol.GetBondBetweenAtoms(rdmol.GetNumAtoms() - 1, rdmol.GetNumAtoms() - 3).SetBondType(Chem.BondType.SINGLE)
-        #rdmol.GetAtomWithIdx(rdmol.GetNumAtoms() - 1).SetFormalCharge(-1)
-        # rdmol.GetBondBetweenAtoms(rdmol.GetNumAtoms() - 1, rdmol.GetNumAtoms() - 3).SetBondType(Chem.BondType.SINGLE)
-        #offmol.get_bond_between(offmol.n_atoms-1, offmol.n_atoms-3).bond_order = 1
-        # rdmol.GetAtomWithIdx(rdmol.GetNumAtoms() - 1).SetFormalCharge(-1)
-        #offmol.atoms[offmol.n_atoms-1].formal_charge = -1
-        # TODO: Does openff mol has some sanitization API point?
+        print(f"Number of atoms before sanitization: {len(rdmol.GetAtoms())}")
+        # TODO: Pull in coordinates and assign stereochemistry
+        # offmol.add_conformer()
+        # TODO: Ensure that this assigns aromaticity
         rdmol = offmol.to_rdkit()
         Chem.SanitizeMol(
             rdmol,
@@ -5250,7 +5217,6 @@ def protein_from_openmm(substructure_library, openmm_topology):
 
         print(f"Number of atoms after sanitization: {len(rdmol.GetAtoms())}")
 
-        # offmol = Molecule.from_rdkit(rdmol, allow_undefined_stereo=True)
         offmol = cls.from_rdkit(rdmol, allow_undefined_stereo=True, hydrogens_are_explicit=True)
         print(f"OFFMol number of atoms: {offmol.n_atoms}")
         return offmol