Skip to content
New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

Cmiles options #535

Merged
merged 11 commits into from
Mar 31, 2020
8 changes: 8 additions & 0 deletions docs/releasehistory.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ New features
- `PR #529 <https://github.com/openforcefield/openforcefield/pull/529>`_: Adds the ability to write out to XYZ files via
:py:meth:`Molecule.to_file <openforcefield.topology.Molecule.to_file>` Both single frame and multiframe XYZ files are supported.
Note reading from XYZ files will not be supported due to the lack of connectivity information.
- `PR #535 <https://github.com/openforcefield/openforcefield/pull/535>`_: Extends the the API for the
:py:meth:`Molecule.to_smiles <openforcefield.topology.Molecule.to_smiles>` to allow for the creation of cmiles
identifiers through combinations of isomeric, explicit hydrogen and mapped smiles, the default settings will return
isomeric explicit hydrogen smiles as expected.
.. warning::
Atom maps can be supplied to the properties dictionary to modify which atoms have their map index included,
if no map is supplied all atoms will be mapped in the order they appear in the
:py:class:`Molecule <openforcefield.topology.Molecule>`.


Behavior changed
Expand Down
22 changes: 22 additions & 0 deletions openforcefield/tests/test_forcefield.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,28 @@ def round_charge(xml):
xmlsp[index] = chunk
return ' q="'.join(xmlsp)


def create_cis_1_2_dichloroethene():
"""
Creates an openforcefield.topology.Molecule representation of cis-1,2-dichloroethene
without the use of a cheminformatics toolkit.
"""

cis_dichloroethene = Molecule()
cis_dichloroethene.add_atom(17, 0, False)
cis_dichloroethene.add_atom(6, 0, False)
cis_dichloroethene.add_atom(6, 0, False)
cis_dichloroethene.add_atom(17, 0, False)
cis_dichloroethene.add_atom(1, 0, False)
cis_dichloroethene.add_atom(1, 0, False)
cis_dichloroethene.add_bond(0, 1, 1, False)
cis_dichloroethene.add_bond(1, 2, 2, False, 'Z')
cis_dichloroethene.add_bond(2, 3, 1, False)
cis_dichloroethene.add_bond(1, 4, 1, False)
cis_dichloroethene.add_bond(2, 5, 1, False)
return cis_dichloroethene


def create_ethanol():
"""
Creates an openforcefield.topology.Molecule representation of
Expand Down
139 changes: 134 additions & 5 deletions openforcefield/tests/test_molecule.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
# TODO: Will the ToolkitWrapper allow us to pare that down?
from openforcefield.utils.toolkits import OpenEyeToolkitWrapper, RDKitToolkitWrapper, AmberToolsToolkitWrapper, ToolkitRegistry
from openforcefield.tests.test_forcefield import create_ethanol, create_reversed_ethanol, create_acetaldehyde, \
create_benzene_no_aromatic, create_cyclohexane
create_benzene_no_aromatic, create_cyclohexane, create_cis_1_2_dichloroethene

#=============================================================================================
# TEST UTILITIES
Expand Down Expand Up @@ -310,9 +310,134 @@ def test_to_from_smiles(self, molecule, toolkit):
else:
molecule2 = Molecule.from_smiles(smiles1,
toolkit_registry=toolkit_wrapper)

smiles2 = molecule2.to_smiles(toolkit_registry=toolkit_wrapper)
assert (smiles1 == smiles2)

smiles_types = [{'isomeric': True, 'explicit_hydrogens': True, 'mapped': True, 'error': None},
{'isomeric': False, 'explicit_hydrogens': True, 'mapped': True, 'error': None},
{'isomeric': True, 'explicit_hydrogens': False, 'mapped': True, 'error': AssertionError},
{'isomeric': True, 'explicit_hydrogens': True, 'mapped': False, 'error': None},
{'isomeric': True, 'explicit_hydrogens': False, 'mapped': False, 'error': None},
{'isomeric': False, 'explicit_hydrogens': True, 'mapped': False, 'error': None},
{'isomeric': False, 'explicit_hydrogens': False, 'mapped': True, 'error': AssertionError},
{'isomeric': False, 'explicit_hydrogens': False, 'mapped': False, 'error': None},
]

@pytest.mark.parametrize('toolkit_class', [OpenEyeToolkitWrapper, RDKitToolkitWrapper])
@pytest.mark.parametrize('data', smiles_types)
def test_smiles_types(self, data, toolkit_class):
"""Test that the toolkit is passing the correct args to the toolkit backends across different combinations."""

if toolkit_class.is_available():
toolkit = toolkit_class()
mol = create_cis_1_2_dichloroethene()
isomeric, explicit_hs, mapped = data['isomeric'], data['explicit_hydrogens'], data['mapped']
if data['error'] is not None:
with pytest.raises(data['error']):
mol.to_smiles(isomeric=isomeric, explicit_hydrogens=explicit_hs,
mapped=mapped, toolkit_registry=toolkit)

else:

# make the smiles then do some checks on it
output_smiles = mol.to_smiles(isomeric=isomeric,
explicit_hydrogens=explicit_hs,
mapped=mapped, toolkit_registry=toolkit)
if isomeric:
assert '\\' in output_smiles
if explicit_hs:
assert 'H' in output_smiles
if mapped:
for i in range(1,7):
assert f':{i}' in output_smiles
# if the molecule is mapped make it using the mapping
mol2 = Molecule.from_mapped_smiles(mapped_smiles=output_smiles,
toolkit_registry=toolkit,
allow_undefined_stereo=not isomeric)
else:
# make a molecule from a standard smiles
mol2 = Molecule.from_smiles(smiles=output_smiles,
allow_undefined_stereo=not isomeric,
toolkit_registry=toolkit)

isomorphic, atom_map = Molecule.are_isomorphic(mol, mol2, return_atom_map=True,
aromatic_matching=True,
formal_charge_matching=True,
bond_order_matching=True,
atom_stereochemistry_matching=isomeric,
bond_stereochemistry_matching=isomeric)

assert isomorphic is True
if mapped:
assert {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5} == atom_map

else:
pytest.skip(f'The required toolkit ({toolkit_class.toolkit_name}) is not available.')

@pytest.mark.parametrize('toolkit_class', [OpenEyeToolkitWrapper, RDKitToolkitWrapper])
def test_smiles_cache(self, toolkit_class):
"""Make sure that the smiles cache is being used correctly."""

if toolkit_class.is_available():
toolkit = toolkit_class()
# this uses no toolkit back end so no smiles should be saved
mol = create_ethanol()

# now lets populate the cache with a test result
# first we need to make the cache key for the default input
isomeric, explicit_hydrogens, mapped = True, True, False
cache_key = toolkit.to_smiles.__qualname__ + str(isomeric) + str(explicit_hydrogens) + str(mapped)
cache_key += str(mol._properties.get('atom_map', None))
mol._cached_smiles = {cache_key: None}
assert mol.to_smiles(isomeric=isomeric, toolkit_registry=toolkit,
explicit_hydrogens=explicit_hydrogens, mapped=mapped) is None

# now make sure the cache is not used if we change an input arg
assert mol.to_smiles(isomeric=True, explicit_hydrogens=True,
mapped=True, toolkit_registry=toolkit) is not None

# now make sure the cache was updated
assert mol._cached_smiles != {cache_key: None}
assert len(mol._cached_smiles) == 2

else:
pytest.skip(f'The required toolkit ({toolkit_class.toolkit_name}) is not available.')

mapped_types = [{'atom_map': None},
{'atom_map': {0: 0}},
{'atom_map': {0: 0, 1: 0, 2: 0, 3: 0}},
{'atom_map': {0: 0, 1: 1, 2: 2, 3: 3}},
{'atom_map': {0: 1, 1: 2, 2: 3, 3: 4}}]

@pytest.mark.parametrize('toolkit_class', [OpenEyeToolkitWrapper, RDKitToolkitWrapper])
@pytest.mark.parametrize('data', mapped_types)
def test_partial_mapped_smiles(self, toolkit_class, data):

if toolkit_class.is_available():
toolkit = toolkit_class()
mol = create_cis_1_2_dichloroethene()
mol._properties['atom_map'] = data['atom_map']

smiles = mol.to_smiles(isomeric=True, explicit_hydrogens=True,
mapped=True, toolkit_registry=toolkit)

# now we just need to check the smiles generated
if data['atom_map'] is None:
for i, atom in enumerate(mol.atoms, 1):
assert f'[{atom.element.symbol}:{i}]' in smiles
else:
if 0 in data['atom_map'].values():
increment = True
else:
increment = False

for atom, index in data['atom_map'].items():
assert f'[{mol.atoms[atom].element.symbol}:{index + 1 if increment else index}]'

else:
pytest.skip(f'The required toolkit ({toolkit_class.toolkit_name}) is not available.')

@pytest.mark.parametrize('molecule', mini_drug_bank())
def test_unique_atom_names(self, molecule):
"""Test molecules have unique atom names"""
Expand Down Expand Up @@ -427,7 +552,7 @@ def test_to_from_rdkit(self, molecule):

# If this is a known failure, check that it raises UndefinedStereochemistryError
# and proceed with the test ignoring it.
test_mol = None
test_mol = None
if undefined_stereo:
with pytest.raises(UndefinedStereochemistryError):
Molecule(rdmol)
Expand Down Expand Up @@ -664,7 +789,7 @@ def test_to_from_oemol(self, molecule):

# If this is a known failure, check that it raises UndefinedStereochemistryError
# and proceed with the test ignoring it.
test_mol = None
test_mol = None
if undefined_stereo:
with pytest.raises(UndefinedStereochemistryError):
Molecule(oemol)
Expand Down Expand Up @@ -968,7 +1093,7 @@ def test_to_qcschema(self):
qcschema = ethanol.to_qcschema()
# make sure the properties match
charge = 0
connectivity = [(0, 1, 1.0), (0, 3, 1.0), (0, 4, 1.0), (0, 5, 1.0), (1, 2, 1.0), (1, 6, 1.0), (1, 7, 1.0), (2, 8, 1.0)]
connectivity = [(0, 1, 1.0), (0, 4, 1.0), (0, 5, 1.0), (0, 6, 1.0), (1, 2, 1.0), (1, 7, 1.0), (1, 8, 1.0), (2, 3, 1.0)]
symbols = ['C', 'C', 'O', 'H', 'H', 'H', 'H', 'H', 'H']
assert charge == qcschema.molecular_charge
assert connectivity == qcschema.connectivity
Expand Down Expand Up @@ -1388,7 +1513,11 @@ def test_add_bond_charge_virtual_site(self, molecule):
molecule_dict = molecule.to_dict()
molecule2 = Molecule.from_dict(molecule_dict)

assert hash(molecule) == hash(molecule2)
# test that the molecules are the same
assert molecule.is_isomorphic_with(molecule2)
# lets also make sure that the vsites were constructed correctly
for site1, site2 in zip(molecule.virtual_sites, molecule2.virtual_sites):
assert site1.to_dict() == site2.to_dict()

# TODO: Make a test for to_dict and from_dict for VirtualSites (even though they're currently just unloaded using
# (for example) Molecule._add_bond_virtual_site functions
Expand Down
40 changes: 24 additions & 16 deletions openforcefield/topology/molecule.py
Original file line number Diff line number Diff line change
Expand Up @@ -1826,14 +1826,25 @@ def __eq__(self, other):
# TODO the doc string did not match the previous function what matching should this method do?
return Molecule.are_isomorphic(self, other, return_atom_map=False)[0]

def to_smiles(self, toolkit_registry=GLOBAL_TOOLKIT_REGISTRY):
def to_smiles(self, isomeric=True, explicit_hydrogens=True, mapped=False, toolkit_registry=GLOBAL_TOOLKIT_REGISTRY):
"""
Return a canonical isomeric SMILES representation of the current molecule
Return a canonical isomeric SMILES representation of the current molecule.
A partially mapped smiles can also be generated for atoms of interest by supplying an `atom_map` to the
properties dictionary.
.. note :: RDKit and OpenEye versions will not necessarily return the same representation.
Parameters
----------
isomeric: bool optional, default= True
return an isomeric smiles
explicit_hydrogens: bool optional, default=True
return a smiles string containing all hydrogens explicitly
mapped: bool optional, default=False
return a explicit hydrogen mapped smiles, the atoms to be mapped can be controlled by supplying an
atom map into the properties dictionary. If no mapping is passed all atoms will be mapped in order, else
an atom map dictionary from the current atom index to the map id should be supplied with no duplicates.
The map ids (values) should start from 0 or 1.
toolkit_registry : openforcefield.utils.toolkits.ToolkitRegistry or openforcefield.utils.toolkits.ToolkitWrapper, optional, default=None
:class:`ToolkitRegistry` or :class:`ToolkitWrapper` to use for SMILES conversion
Expand Down Expand Up @@ -1868,14 +1879,14 @@ def to_smiles(self, toolkit_registry=GLOBAL_TOOLKIT_REGISTRY):
# Get a string representation of the function containing the toolkit name so we can check
# if a SMILES was already cached for this molecule. This will return, for example
# "RDKitToolkitWrapper.to_smiles"
func_qualname = to_smiles_method.__qualname__

smiles_hash = to_smiles_method.__qualname__ + str(isomeric) + str(explicit_hydrogens) + str(mapped)
smiles_hash += str(self._properties.get('atom_map', None))
# Check to see if a SMILES for this molecule was already cached using this method
if func_qualname in self._cached_smiles:
return self._cached_smiles[func_qualname]
if smiles_hash in self._cached_smiles:
return self._cached_smiles[smiles_hash]
else:
smiles = to_smiles_method(self)
self._cached_smiles[func_qualname] = smiles
smiles = to_smiles_method(self, isomeric, explicit_hydrogens, mapped)
self._cached_smiles[smiles_hash] = smiles
return smiles

@staticmethod
Expand Down Expand Up @@ -3676,7 +3687,7 @@ def from_openeye(oemol, allow_undefined_stereo=False):
def to_qcschema(self, multiplicity=1, conformer=0):
"""
Generate the qschema input format used to submit jobs to archive
or run qcengine calculations locally, the molecule is placed in canonical order first.
or run qcengine calculations locally,
spec can be found here <https://molssi-qc-schema.readthedocs.io/en/latest/index.html>
.. warning :: This API is experimental and subject to change.
Expand Down Expand Up @@ -3717,20 +3728,17 @@ def to_qcschema(self, multiplicity=1, conformer=0):
raise ImportError('Please install QCElemental via conda install -c conda-forge qcelemental '
'to validate the schema')

# get a canonical ordered version of the molecule
canonical_mol = self.canonical_order_atoms()

# get/ check the geometry
try:
geometry = canonical_mol.conformers[conformer].in_units_of(unit.bohr)
geometry = self.conformers[conformer].in_units_of(unit.bohr)
except (IndexError, TypeError):
raise InvalidConformerError('The molecule must have a conformation to produce a valid qcschema; '
f'no conformer was found at index {conformer}.')

# Gather the required qschema data
charge = sum([atom.formal_charge for atom in canonical_mol.atoms])
connectivity = [(bond.atom1_index, bond.atom2_index, bond.bond_order) for bond in canonical_mol.bonds]
symbols = [Element.getByAtomicNumber(atom.atomic_number).symbol for atom in canonical_mol.atoms]
charge = sum([atom.formal_charge for atom in self.atoms])
connectivity = [(bond.atom1_index, bond.atom2_index, bond.bond_order) for bond in self.bonds]
symbols = [Element.getByAtomicNumber(atom.atomic_number).symbol for atom in self.atoms]

schema_dict = {'symbols': symbols, 'geometry': geometry, 'connectivity': connectivity,
'molecular_charge': charge, 'molecular_multiplicity': multiplicity}
Expand Down
Loading