Skip to content

Commit

Permalink
Update PubChem crawler (#163)
Browse files — browse the repository at this point in the history
* feat: additional error logging in pubchem crawler

* test: enable pubchem tests

* ci: isort
  • Loading branch information
jannisborn authored Mar 21, 2023
1 parent 84c10fc commit e9cf78f
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 26 deletions.
2 changes: 1 addition & 1 deletion pytoda/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
name = 'pytoda'
__version__ = '1.1.2'
__version__ = '1.1.3'
13 changes: 5 additions & 8 deletions pytoda/preprocessing/crawlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import urllib.request as urllib_request
from itertools import filterfalse
from typing import Iterable, List, Tuple, Union
from urllib.error import HTTPError
from urllib.error import HTTPError, URLError

import pubchempy as pcp
from pubchempy import BadRequestError, PubChemHTTPError
Expand Down Expand Up @@ -180,23 +180,20 @@ def query_pubchem(smiles: str) -> Tuple[bool, int]:
raise TypeError(f'Please pass str, not {type(smiles)}')
try:
result = pcp.get_compounds(smiles, 'smiles')[0]
return (False, -1) if result.cid is None else (True, result.cid)
except BadRequestError:
logger.warning(f'Skipping SMILES. BadRequestError with: {smiles}')
return (False, -2)
except HTTPError:
logger.warning(f'Skipping SMILES. HTTPError with: {smiles}')
return (False, -2)
except TimeoutError:
logger.warning(f'Skipping SMILES. TimeoutError with: {smiles}')
return (False, -2)
except ConnectionResetError:
logger.warning(f'Skipping SMILES. ConnectionResetError with: {smiles}')
return (False, -2)
except PubChemHTTPError:
logger.warning(f'Skipping SMILES, server busy. with: {smiles}')
return (False, -2)

return (False, -1) if result.cid is None else (True, result.cid)
except URLError:
logger.error(f"Skipping SMILES, Network unreachable {smiles}")
return (False, -2)


def is_pubchem(smiles: str) -> bool:
Expand Down
36 changes: 19 additions & 17 deletions pytoda/preprocessing/tests/test_crawlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from pytoda.preprocessing.crawlers import ( # query_pubchem,; remove_pubchem_smiles,
get_smiles_from_pubchem,
get_smiles_from_zinc,
query_pubchem,
remove_pubchem_smiles,
)


Expand Down Expand Up @@ -83,29 +85,29 @@ def test_get_smiles_from_pubchem(self) -> None:

def test_query_pubchem(self) -> None:
"""Test query_pubchem"""
pass
# pass
# Disabled due to bug in pubchem api
# smiles_list = [
# 'O1C=CC=NC(=O)C1=O',
# 'CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1',
# 'Clc1ccccc2ccnc12',
# ]
# ground_truths = [(True, 67945516), (False, -2), (False, -1)]
# for gt, smiles in zip(ground_truths, smiles_list):
# self.assertTupleEqual(query_pubchem(smiles), gt)
smiles_list = [
'O1C=CC=NC(=O)C1=O',
'CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1',
'Clc1ccccc2ccnc12',
]
ground_truths = [(True, 67945516), (False, -2), (False, -1)]
for gt, smiles in zip(ground_truths, smiles_list):
self.assertTupleEqual(query_pubchem(smiles), gt)

def test_remove_pubchem_smiles(self) -> None:
"""Test remove_pubchem_smiles"""
pass
# pass

# Disabled due to bug in pubchem api
# smiles_list = [
# 'O1C=CC=NC(=O)C1=O',
# 'CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1',
# 'Clc1ccccc2ccnc12',
# ]
# ground_truth = ['CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1', 'Clc1ccccc2ccnc12']
# self.assertListEqual(remove_pubchem_smiles(smiles_list), ground_truth)
smiles_list = [
'O1C=CC=NC(=O)C1=O',
'CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1',
'Clc1ccccc2ccnc12',
]
ground_truth = ['CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1', 'Clc1ccccc2ccnc12']
self.assertListEqual(remove_pubchem_smiles(smiles_list), ground_truth)


if __name__ == '__main__':
Expand Down

0 comments on commit e9cf78f

Please sign in to comment.