From e9cf78fb27e165b245fe56a7425b1a5c87897b41 Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Tue, 21 Mar 2023 10:14:33 +0100 Subject: [PATCH] Update PubChem crawler (#163) * feat: additional error logging in pubchem crawler * test: enable pubchem tests * ci: isort --- pytoda/__init__.py | 2 +- pytoda/preprocessing/crawlers.py | 13 +++----- pytoda/preprocessing/tests/test_crawlers.py | 36 +++++++++++---------- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/pytoda/__init__.py b/pytoda/__init__.py index 72e12397..44eb3678 100644 --- a/pytoda/__init__.py +++ b/pytoda/__init__.py @@ -1,2 +1,2 @@ name = 'pytoda' -__version__ = '1.1.2' +__version__ = '1.1.3' diff --git a/pytoda/preprocessing/crawlers.py b/pytoda/preprocessing/crawlers.py index acdf8f3a..9a5e3ad6 100644 --- a/pytoda/preprocessing/crawlers.py +++ b/pytoda/preprocessing/crawlers.py @@ -3,7 +3,7 @@ import urllib.request as urllib_request from itertools import filterfalse from typing import Iterable, List, Tuple, Union -from urllib.error import HTTPError +from urllib.error import HTTPError, URLError import pubchempy as pcp from pubchempy import BadRequestError, PubChemHTTPError @@ -180,23 +180,20 @@ def query_pubchem(smiles: str) -> Tuple[bool, int]: raise TypeError(f'Please pass str, not {type(smiles)}') try: result = pcp.get_compounds(smiles, 'smiles')[0] + return (False, -1) if result.cid is None else (True, result.cid) except BadRequestError: logger.warning(f'Skipping SMILES. BadRequestError with: {smiles}') - return (False, -2) except HTTPError: logger.warning(f'Skipping SMILES. HTTPError with: {smiles}') - return (False, -2) except TimeoutError: logger.warning(f'Skipping SMILES. TimeoutError with: {smiles}') - return (False, -2) except ConnectionResetError: logger.warning(f'Skipping SMILES. ConnectionResetError with: {smiles}') - return (False, -2) except PubChemHTTPError: logger.warning(f'Skipping SMILES, server busy. with: {smiles}') - return (False, -2) - - return (False, -1) if result.cid is None else (True, result.cid) + except URLError: + logger.error(f"Skipping SMILES, Network unreachable {smiles}") + return (False, -2) def is_pubchem(smiles: str) -> bool: diff --git a/pytoda/preprocessing/tests/test_crawlers.py b/pytoda/preprocessing/tests/test_crawlers.py index a6eeafff..0f692862 100644 --- a/pytoda/preprocessing/tests/test_crawlers.py +++ b/pytoda/preprocessing/tests/test_crawlers.py @@ -4,6 +4,8 @@ from pytoda.preprocessing.crawlers import ( # query_pubchem,; remove_pubchem_smiles, get_smiles_from_pubchem, get_smiles_from_zinc, + query_pubchem, + remove_pubchem_smiles, ) @@ -83,29 +85,29 @@ def test_get_smiles_from_pubchem(self) -> None: def test_query_pubchem(self) -> None: """Test query_pubchem""" - pass + # pass # Disabled due to bug in pubchem api - # smiles_list = [ - # 'O1C=CC=NC(=O)C1=O', - # 'CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1', - # 'Clc1ccccc2ccnc12', - # ] - # ground_truths = [(True, 67945516), (False, -2), (False, -1)] - # for gt, smiles in zip(ground_truths, smiles_list): - # self.assertTupleEqual(query_pubchem(smiles), gt) + smiles_list = [ + 'O1C=CC=NC(=O)C1=O', + 'CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1', + 'Clc1ccccc2ccnc12', + ] + ground_truths = [(True, 67945516), (False, -2), (False, -1)] + for gt, smiles in zip(ground_truths, smiles_list): + self.assertTupleEqual(query_pubchem(smiles), gt) def test_remove_pubchem_smiles(self) -> None: """Test remove_pubchem_smiles""" - pass + # pass # Disabled due to bug in pubchem api - # smiles_list = [ - # 'O1C=CC=NC(=O)C1=O', - # 'CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1', - # 'Clc1ccccc2ccnc12', - # ] - # ground_truth = ['CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1', 'Clc1ccccc2ccnc12'] - # self.assertListEqual(remove_pubchem_smiles(smiles_list), ground_truth) + smiles_list = [ + 'O1C=CC=NC(=O)C1=O', + 'CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1', + 'Clc1ccccc2ccnc12', + ] + ground_truth = ['CC(N)S(O)(=O)C(C)CC(C(C)C)c1cc(F)cc(F)c1', 'Clc1ccccc2ccnc12'] + self.assertListEqual(remove_pubchem_smiles(smiles_list), ground_truth) if __name__ == '__main__':