openforcefield · jthorton · Mar 31, 2020 · Mar 2, 2020 · Mar 2, 2020 · Mar 3, 2020
diff --git a/docs/releasehistory.rst b/docs/releasehistory.rst
@@ -57,6 +57,14 @@ New features
 - `PR #529 <https://github.com/openforcefield/openforcefield/pull/529>`_: Adds the ability to write out to XYZ files via
   :py:meth:`Molecule.to_file <openforcefield.topology.Molecule.to_file>` Both single frame and multiframe XYZ files are supported.
   Note reading from XYZ files will not be supported due to the lack of connectivity information.
+- `PR #535 <https://github.com/openforcefield/openforcefield/pull/535>`_: Extends the the API for the
+  :py:meth:`Molecule.to_smiles <openforcefield.topology.Molecule.to_smiles>` to allow for the creation of cmiles
+  identifiers through combinations of isomeric, explicit hydrogen and mapped smiles, the default settings will return
+  isomeric explicit hydrogen smiles as expected.
+        .. warning::
+           Atom maps can be supplied to the properties dictionary to modify which atoms have their map index included,
+           if no map is supplied all atoms will be mapped in the order they appear in the
+           :py:class:`Molecule <openforcefield.topology.Molecule>`.
 
 
 Behavior changed

diff --git a/openforcefield/tests/test_forcefield.py b/openforcefield/tests/test_forcefield.py
@@ -241,6 +241,28 @@ def round_charge(xml):
         xmlsp[index] = chunk
     return ' q="'.join(xmlsp)
 
+
+def create_cis_1_2_dichloroethene():
+    """
+    Creates an openforcefield.topology.Molecule representation of cis-1,2-dichloroethene
+    without the use of a cheminformatics toolkit.
+    """
+
+    cis_dichloroethene = Molecule()
+    cis_dichloroethene.add_atom(17, 0, False)
+    cis_dichloroethene.add_atom(6, 0, False)
+    cis_dichloroethene.add_atom(6, 0, False)
+    cis_dichloroethene.add_atom(17, 0, False)
+    cis_dichloroethene.add_atom(1, 0, False)
+    cis_dichloroethene.add_atom(1, 0, False)
+    cis_dichloroethene.add_bond(0, 1, 1, False)
+    cis_dichloroethene.add_bond(1, 2, 2, False, 'Z')
+    cis_dichloroethene.add_bond(2, 3, 1, False)
+    cis_dichloroethene.add_bond(1, 4, 1, False)
+    cis_dichloroethene.add_bond(2, 5, 1, False)
+    return cis_dichloroethene
+
+
 def create_ethanol():
     """
     Creates an openforcefield.topology.Molecule representation of

diff --git a/openforcefield/tests/test_molecule.py b/openforcefield/tests/test_molecule.py
@@ -36,7 +36,7 @@
 # TODO: Will the ToolkitWrapper allow us to pare that down?
 from openforcefield.utils.toolkits import OpenEyeToolkitWrapper, RDKitToolkitWrapper, AmberToolsToolkitWrapper, ToolkitRegistry
 from openforcefield.tests.test_forcefield import create_ethanol, create_reversed_ethanol, create_acetaldehyde, \
-    create_benzene_no_aromatic, create_cyclohexane
+    create_benzene_no_aromatic, create_cyclohexane, create_cis_1_2_dichloroethene
 
 #=============================================================================================
 # TEST UTILITIES
@@ -310,9 +310,134 @@ def test_to_from_smiles(self, molecule, toolkit):
         else:
             molecule2 = Molecule.from_smiles(smiles1,
                                              toolkit_registry=toolkit_wrapper)
+
         smiles2 = molecule2.to_smiles(toolkit_registry=toolkit_wrapper)
         assert (smiles1 == smiles2)
 
+    smiles_types = [{'isomeric': True, 'explicit_hydrogens': True, 'mapped': True, 'error': None},
+                    {'isomeric': False, 'explicit_hydrogens': True, 'mapped': True, 'error': None},
+                    {'isomeric': True, 'explicit_hydrogens': False, 'mapped': True, 'error': AssertionError},
+                    {'isomeric': True, 'explicit_hydrogens': True, 'mapped': False, 'error': None},
+                    {'isomeric': True, 'explicit_hydrogens': False, 'mapped': False, 'error': None},
+                    {'isomeric': False, 'explicit_hydrogens': True, 'mapped': False, 'error': None},
+                    {'isomeric': False, 'explicit_hydrogens': False, 'mapped': True, 'error': AssertionError},
+                    {'isomeric': False, 'explicit_hydrogens': False, 'mapped': False, 'error': None},
+                    ]
+
+    @pytest.mark.parametrize('toolkit_class', [OpenEyeToolkitWrapper, RDKitToolkitWrapper])
+    @pytest.mark.parametrize('data', smiles_types)
+    def test_smiles_types(self, data, toolkit_class):
+        """Test that the toolkit is passing the correct args to the toolkit backends across different combinations."""
+
+        if toolkit_class.is_available():
+            toolkit = toolkit_class()
+            mol = create_cis_1_2_dichloroethene()
+            isomeric, explicit_hs, mapped = data['isomeric'], data['explicit_hydrogens'], data['mapped']
+            if data['error'] is not None:
+                with pytest.raises(data['error']):
+                    mol.to_smiles(isomeric=isomeric, explicit_hydrogens=explicit_hs,
+                                  mapped=mapped, toolkit_registry=toolkit)
+
+            else:
+
+                # make the smiles then do some checks on it
+                output_smiles = mol.to_smiles(isomeric=isomeric,
+                                              explicit_hydrogens=explicit_hs,
+                                              mapped=mapped, toolkit_registry=toolkit)
+                if isomeric:
+                    assert '\\' in output_smiles
+                if explicit_hs:
+                    assert 'H' in output_smiles
+                if mapped:
+                    for i in range(1,7):
+                        assert f':{i}' in output_smiles
+                    # if the molecule is mapped make it using the mapping
+                    mol2 = Molecule.from_mapped_smiles(mapped_smiles=output_smiles,
+                                                       toolkit_registry=toolkit,
+                                                       allow_undefined_stereo=not isomeric)
+                else:
+                    # make a molecule from a standard smiles
+                    mol2 = Molecule.from_smiles(smiles=output_smiles,
+                                                allow_undefined_stereo=not isomeric,
+                                                toolkit_registry=toolkit)
+
+                isomorphic, atom_map = Molecule.are_isomorphic(mol, mol2, return_atom_map=True,
+                                                               aromatic_matching=True,
+                                                               formal_charge_matching=True,
+                                                               bond_order_matching=True,
+                                                               atom_stereochemistry_matching=isomeric,
+                                                               bond_stereochemistry_matching=isomeric)
+
+                assert isomorphic is True
+                if mapped:
+                    assert {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5} == atom_map
+
+        else:
+            pytest.skip(f'The required toolkit ({toolkit_class.toolkit_name}) is not available.')
+
+    @pytest.mark.parametrize('toolkit_class', [OpenEyeToolkitWrapper, RDKitToolkitWrapper])
+    def test_smiles_cache(self, toolkit_class):
+        """Make sure that the smiles cache is being used correctly."""
+
+        if toolkit_class.is_available():
+            toolkit = toolkit_class()
+            # this uses no toolkit back end so no smiles should be saved
+            mol = create_ethanol()
+
+            # now lets populate the cache with a test result
+            # first we need to make the cache key for the default input
+            isomeric, explicit_hydrogens, mapped = True, True, False
+            cache_key = toolkit.to_smiles.__qualname__ + str(isomeric) + str(explicit_hydrogens) + str(mapped)
+            cache_key += str(mol._properties.get('atom_map', None))
+            mol._cached_smiles = {cache_key: None}
+            assert mol.to_smiles(isomeric=isomeric, toolkit_registry=toolkit,
+                                 explicit_hydrogens=explicit_hydrogens, mapped=mapped) is None
+
+            # now make sure the cache is not used if we change an input arg
+            assert mol.to_smiles(isomeric=True, explicit_hydrogens=True,
+                                 mapped=True, toolkit_registry=toolkit) is not None
+
+            # now make sure the cache was updated
+            assert mol._cached_smiles != {cache_key: None}
+            assert len(mol._cached_smiles) == 2
+
+        else:
+            pytest.skip(f'The required toolkit ({toolkit_class.toolkit_name}) is not available.')
+
+    mapped_types = [{'atom_map': None},
+                    {'atom_map': {0: 0}},
+                    {'atom_map': {0: 0, 1: 0, 2: 0, 3: 0}},
+                    {'atom_map': {0: 0, 1: 1, 2: 2, 3: 3}},
+                    {'atom_map': {0: 1, 1: 2, 2: 3, 3: 4}}]
+
+    @pytest.mark.parametrize('toolkit_class', [OpenEyeToolkitWrapper, RDKitToolkitWrapper])
+    @pytest.mark.parametrize('data', mapped_types)
+    def test_partial_mapped_smiles(self, toolkit_class, data):
+
+        if toolkit_class.is_available():
+            toolkit = toolkit_class()
+            mol = create_cis_1_2_dichloroethene()
+            mol._properties['atom_map'] = data['atom_map']
+
+            smiles = mol.to_smiles(isomeric=True, explicit_hydrogens=True,
+                                   mapped=True, toolkit_registry=toolkit)
+
+            # now we just need to check the smiles generated
+            if data['atom_map'] is None:
+                for i, atom in enumerate(mol.atoms, 1):
+                    assert f'[{atom.element.symbol}:{i}]' in smiles
+            else:
+                if 0 in data['atom_map'].values():
+                    increment = True
+                else:
+                    increment = False
+
+                for atom, index in data['atom_map'].items():
+                    assert f'[{mol.atoms[atom].element.symbol}:{index + 1 if increment else index}]'
+
+        else:
+            pytest.skip(f'The required toolkit ({toolkit_class.toolkit_name}) is not available.')
+
     @pytest.mark.parametrize('molecule', mini_drug_bank())
     def test_unique_atom_names(self, molecule):
         """Test molecules have unique atom names"""
@@ -427,7 +552,7 @@ def test_to_from_rdkit(self, molecule):
 
         # If this is a known failure, check that it raises UndefinedStereochemistryError
         # and proceed with the test ignoring it.
-        test_mol =  None
+        test_mol = None
         if undefined_stereo:
             with pytest.raises(UndefinedStereochemistryError):
                 Molecule(rdmol)
@@ -664,7 +789,7 @@ def test_to_from_oemol(self, molecule):
 
         # If this is a known failure, check that it raises UndefinedStereochemistryError
         # and proceed with the test ignoring it.
-        test_mol =  None
+        test_mol = None
         if undefined_stereo:
             with pytest.raises(UndefinedStereochemistryError):
                 Molecule(oemol)
@@ -968,7 +1093,7 @@ def test_to_qcschema(self):
         qcschema = ethanol.to_qcschema()
         # make sure the properties match
         charge = 0
-        connectivity = [(0, 1, 1.0), (0, 3, 1.0), (0, 4, 1.0), (0, 5, 1.0), (1, 2, 1.0), (1, 6, 1.0), (1, 7, 1.0), (2, 8, 1.0)]
+        connectivity = [(0, 1, 1.0), (0, 4, 1.0), (0, 5, 1.0), (0, 6, 1.0), (1, 2, 1.0), (1, 7, 1.0), (1, 8, 1.0), (2, 3, 1.0)]
         symbols = ['C', 'C', 'O', 'H', 'H', 'H', 'H', 'H', 'H']
         assert charge == qcschema.molecular_charge
         assert connectivity == qcschema.connectivity
@@ -1388,7 +1513,11 @@ def test_add_bond_charge_virtual_site(self, molecule):
         molecule_dict = molecule.to_dict()
         molecule2 = Molecule.from_dict(molecule_dict)
 
-        assert hash(molecule) == hash(molecule2)
+        # test that the molecules are the same
+        assert molecule.is_isomorphic_with(molecule2)
+        # lets also make sure that the vsites were constructed correctly
+        for site1, site2 in zip(molecule.virtual_sites, molecule2.virtual_sites):
+            assert site1.to_dict() == site2.to_dict()
 
     # TODO: Make a test for to_dict and from_dict for VirtualSites (even though they're currently just unloaded using
     #      (for example) Molecule._add_bond_virtual_site functions

diff --git a/openforcefield/topology/molecule.py b/openforcefield/topology/molecule.py
@@ -1826,14 +1826,25 @@ def __eq__(self, other):
         # TODO the doc string did not match the previous function what matching should this method do?
         return Molecule.are_isomorphic(self, other, return_atom_map=False)[0]
 
-    def to_smiles(self, toolkit_registry=GLOBAL_TOOLKIT_REGISTRY):
+    def to_smiles(self, isomeric=True, explicit_hydrogens=True, mapped=False, toolkit_registry=GLOBAL_TOOLKIT_REGISTRY):
         """
-        Return a canonical isomeric SMILES representation of the current molecule
+        Return a canonical isomeric SMILES representation of the current molecule.
+        A partially mapped smiles can also be generated for atoms of interest by supplying an `atom_map` to the
+        properties dictionary.
 
         .. note :: RDKit and OpenEye versions will not necessarily return the same representation.
 
         Parameters
         ----------
+        isomeric: bool optional, default= True
+            return an isomeric smiles
+        explicit_hydrogens: bool optional, default=True
+            return a smiles string containing all hydrogens explicitly
+        mapped: bool optional, default=False
+            return a explicit hydrogen mapped smiles, the atoms to be mapped can be controlled by supplying an
+            atom map into the properties dictionary. If no mapping is passed all atoms will be mapped in order, else
+            an atom map dictionary from the current atom index to the map id should be supplied with no duplicates.
+            The map ids (values) should start from 0 or 1.
         toolkit_registry : openforcefield.utils.toolkits.ToolkitRegistry or openforcefield.utils.toolkits.ToolkitWrapper, optional, default=None
             :class:`ToolkitRegistry` or :class:`ToolkitWrapper` to use for SMILES conversion
 
@@ -1868,14 +1879,14 @@ def to_smiles(self, toolkit_registry=GLOBAL_TOOLKIT_REGISTRY):
         # Get a string representation of the function containing the toolkit name so we can check
         # if a SMILES was already cached for this molecule. This will return, for example
         # "RDKitToolkitWrapper.to_smiles"
-        func_qualname = to_smiles_method.__qualname__
-
+        smiles_hash = to_smiles_method.__qualname__ + str(isomeric) + str(explicit_hydrogens) + str(mapped)
+        smiles_hash += str(self._properties.get('atom_map', None))
         # Check to see if a SMILES for this molecule was already cached using this method
-        if func_qualname in self._cached_smiles:
-            return self._cached_smiles[func_qualname]
+        if smiles_hash in self._cached_smiles:
+            return self._cached_smiles[smiles_hash]
         else:
-            smiles = to_smiles_method(self)
-            self._cached_smiles[func_qualname] = smiles
+            smiles = to_smiles_method(self, isomeric, explicit_hydrogens, mapped)
+            self._cached_smiles[smiles_hash] = smiles
             return smiles
 
     @staticmethod
@@ -3676,7 +3687,7 @@ def from_openeye(oemol, allow_undefined_stereo=False):
     def to_qcschema(self, multiplicity=1, conformer=0):
         """
         Generate the qschema input format used to submit jobs to archive
-        or run qcengine calculations locally, the molecule is placed in canonical order first.
+        or run qcengine calculations locally,
         spec can be found here <https://molssi-qc-schema.readthedocs.io/en/latest/index.html>
 
         .. warning :: This API is experimental and subject to change.
@@ -3717,20 +3728,17 @@ def to_qcschema(self, multiplicity=1, conformer=0):
             raise ImportError('Please install QCElemental via conda install -c conda-forge qcelemental '
                               'to validate the schema')
 
-        # get a canonical ordered version of the molecule
-        canonical_mol = self.canonical_order_atoms()
-
         # get/ check the geometry
         try:
-            geometry = canonical_mol.conformers[conformer].in_units_of(unit.bohr)
+            geometry = self.conformers[conformer].in_units_of(unit.bohr)
         except (IndexError, TypeError):
             raise InvalidConformerError('The molecule must have a conformation to produce a valid qcschema; '
                                         f'no conformer was found at index {conformer}.')
 
         # Gather the required qschema data
-        charge = sum([atom.formal_charge for atom in canonical_mol.atoms])
-        connectivity = [(bond.atom1_index, bond.atom2_index, bond.bond_order) for bond in canonical_mol.bonds]
-        symbols = [Element.getByAtomicNumber(atom.atomic_number).symbol for atom in canonical_mol.atoms]
+        charge = sum([atom.formal_charge for atom in self.atoms])
+        connectivity = [(bond.atom1_index, bond.atom2_index, bond.bond_order) for bond in self.bonds]
+        symbols = [Element.getByAtomicNumber(atom.atomic_number).symbol for atom in self.atoms]
 
         schema_dict = {'symbols': symbols, 'geometry': geometry, 'connectivity': connectivity,
                        'molecular_charge': charge, 'molecular_multiplicity': multiplicity}