diff --git a/docs/source/api.rst b/docs/source/api.rst
index b310355..59cec8a 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -97,14 +97,20 @@ callback
     - optional
 
     Must exist as method of scheduled spider, does not need to contain string "self".
-    If not passed or not found on spider default callback `parse`_ will be used.
+    If not passed, the default Scrapy callback `parse`_ will be used. If there is no spider method
+    with the name given by the callback argument, or the callback is not callable, the API will return a 400 HTTP error.
+
+    Example request with callback: ``/crawl.json?url=https://quotes.toscrape.com/&spider_name=toscrape-css&callback=parse_page``
 
 errback
     - type: string
     - optional
 
     Scrapy errback for request made from spider. It must exist as method of
-    scheduled spider, otherwise exception will be raised. String does not need to contain 'self'.
+    scheduled spider, otherwise the API will return a 400 HTTP error. String does not need to contain 'self'.
+    Defaults to ``None`` and can be adjusted with the `DEFAULT_ERRBACK_NAME`_ setting.
+
+    Example request with errback: ``/crawl.json?url=https://quotes.toscrape.com/&spider_name=toscrape-css&errback=my_errback``
 
 max_requests
     - type: integer
@@ -517,6 +523,18 @@
 Encoding that's used to encode log messages.
 
 Default: ``utf-8``.
 
+DEFAULT_ERRBACK_NAME
+~~~~~~~~~~~~~~~~~~~~
+
+Default: ``None``
+
+String with the name of the default errback_.
+
+Use this setting to set a default errback for Scrapy requests made from ScrapyRT.
+The errback must exist as a method of the spider and must be callable, otherwise a 400 HTTP error will be returned.
+
+.. _errback: https://docs.scrapy.org/en/latest/topics/request-response.html#using-errbacks-to-catch-exceptions-in-request-processing
+
 Spider settings
 ---------------
diff --git a/scrapyrt/conf/default_settings.py b/scrapyrt/conf/default_settings.py
index 99d7746..eea5e72 100644
--- a/scrapyrt/conf/default_settings.py
+++ b/scrapyrt/conf/default_settings.py
@@ -31,4 +31,6 @@
 # disable in production
 DEBUG = True
 
-TWISTED_REACTOR = None
\ No newline at end of file
+TWISTED_REACTOR = None
+
+DEFAULT_ERRBACK_NAME = None
diff --git a/scrapyrt/core.py b/scrapyrt/core.py
index c0b240a..ccff725 100644
--- a/scrapyrt/core.py
+++ b/scrapyrt/core.py
@@ -3,6 +3,7 @@
 from copy import deepcopy
 import datetime
 import os
+import traceback
 
 from scrapy import signals
 from scrapy.crawler import CrawlerRunner, Crawler
@@ -109,6 +110,7 @@ def __init__(self, spider_name, request_kwargs,
         self.items = []
         self.items_dropped = []
         self.errors = []
+        self.user_error = None
         self.max_requests = int(max_requests) if max_requests else None
         self.timeout_limit = int(app_settings.TIMEOUT_LIMIT)
         self.request_count = 0
@@ -120,7 +122,7 @@
         # because we need to know if spider has method available
         self.callback_name = request_kwargs.pop('callback', None) or 'parse'
         # do the same for errback
-        self.errback_name = request_kwargs.pop('errback', None) or 'parse'
+        self.errback_name = request_kwargs.pop('errback', None) or app_settings.DEFAULT_ERRBACK_NAME
 
         if request_kwargs.get("url"):
             self.request = self.create_spider_request(deepcopy(request_kwargs))
@@ -171,17 +173,30 @@
 
         """
         if spider is self.crawler.spider and self.request and not self._request_scheduled:
-            callback = getattr(self.crawler.spider, self.callback_name)
-            assert callable(callback), 'Invalid callback'
-            self.request = self.request.replace(callback=callback)
+            try:
+                callback = getattr(self.crawler.spider, self.callback_name)
+                assert callable(callback), 'Invalid callback'
+                self.request = self.request.replace(callback=callback)
+            except (AssertionError, AttributeError):
+                msg = f"Invalid spider callback {self.callback_name}: not callable or not a method of spider {self.spider_name}"
+                self.user_error = Error(400, message=msg)
+            try:
+                if self.errback_name:
+                    errback = getattr(self.crawler.spider, self.errback_name)
+                    assert callable(errback), 'Invalid errback'
+                    self.request = self.request.replace(errback=errback)
+            except (AssertionError, AttributeError):
+                msg = f"Invalid spider errback {self.errback_name}: not callable or not a method of spider {self.spider_name}"
+                self.user_error = Error(400, message=msg)
+            if self.user_error:
+                log.msg(self.user_error.message, level=log.ERROR)
+                return
-            errback = getattr(self.crawler.spider, self.errback_name)
-            assert callable(errback), 'Invalid errback'
-            self.request = self.request.replace(errback=errback)
             modify_request = getattr(
                 self.crawler.spider, "modify_realtime_request", None)
             if callable(modify_request):
                 self.request = modify_request(self.request)
+
             spider.crawler.engine.crawl(self.request)
             self._request_scheduled = True
         raise DontCloseSpider
 
@@ -238,6 +253,9 @@ def return_items(self, result):
             "stats": stats,
             "spider_name": self.spider_name,
         }
+
+        results["user_error"] = self.user_error
+
         if self.debug:
             results["errors"] = self.errors
         return results
diff --git a/scrapyrt/resources.py b/scrapyrt/resources.py
index b071175..bd673c2 100644
--- a/scrapyrt/resources.py
+++ b/scrapyrt/resources.py
@@ -261,6 +261,9 @@ def run_crawl(self, spider_name, scrapy_request_args,
 
     def prepare_response(self, result, *args, **kwargs):
         items = result.get("items")
+        user_error = result.get("user_error")
+        if user_error:
+            raise user_error
         response = {
             "status": "ok",
             "items": items,
diff --git a/tests/test_crawl_manager.py b/tests/test_crawl_manager.py
index 3bdb38c..3811d02 100644
--- a/tests/test_crawl_manager.py
+++ b/tests/test_crawl_manager.py
@@ -111,8 +111,10 @@ def test_spider_opened(self):
 
     def test_raise_error_if_not_callable(self):
         self.spider.parse_something = None
-        self.assertRaises(
-            AssertionError, self.crawl_manager.spider_idle, self.spider)
+        self._call_spider_idle()
+        self.assertIsNotNone(self.crawl_manager.user_error)
+        msg = "Invalid spider callback parse_something"
+        assert re.search(msg, self.crawl_manager.user_error.message)
         self.assertFalse(self.crawler.engine.crawl.called)
 
     def test_modify_realtime_request(self):
@@ -142,15 +144,17 @@
     def test_pass_wrong_spider_errback(self):
         mng = self.create_crawl_manager(
             {'url': 'http://localhost', 'errback': 'handle_error'}
         )
+
         try:
-            with pytest.raises(AttributeError) as err:
-                mng.spider_idle(self.spider)
+            mng.spider_idle(self.spider)
         except DontCloseSpider:
             pass
         assert mng.request.errback is None
-        msg = "has no attribute 'handle_error'"
-        assert re.search(msg, str(err))
+
+        self.assertIsNotNone(mng.user_error)
+        msg = "Invalid spider errback"
+        assert re.search(msg, mng.user_error.message)
 
     def test_pass_good_spider_errback(self):
         mng = self.create_crawl_manager(
@@ -330,6 +334,7 @@ def setUp(self):
             'items_dropped': self.crawl_manager.items_dropped,
             'stats': self.stats.copy(),
             'spider_name': self.spider.name,
+            'user_error': None,
         }
 
     def test_return_items(self):
diff --git a/tests/test_resource_crawl.py b/tests/test_resource_crawl.py
index 0900c26..7b9cc16 100644
--- a/tests/test_resource_crawl.py
+++ b/tests/test_resource_crawl.py
@@ -142,6 +142,17 @@ def test_prepare_response(self, resource):
         for key, value in expected:
             assert prepared_res[key] == value
 
+    def test_prepare_response_user_error_raised(self, resource):
+        result = {
+            'items': [1, 2],
+            'stats': [99],
+            'spider_name': 'test'
+        }
+        result['user_error'] = Exception("my exception")
+        with pytest.raises(Exception) as e_info:
+            resource.prepare_response(result)
+        assert str(e_info.value) == "my exception"
+
 
 class TestCrawlResourceGetRequiredArgument(unittest.TestCase):
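
Taken together, these changes let API users select an errback per request via the ``errback`` argument, or project-wide via the ``DEFAULT_ERRBACK_NAME`` setting, with invalid names rejected as 400 HTTP errors instead of crashing the crawl. For reference, a minimal sketch of a spider that would pass the new validation, reusing the ``toscrape-css``, ``parse_page`` and ``my_errback`` names from the documentation examples above (the spider body itself is illustrative and not part of this patch):

    import scrapy

    class ToScrapeCSSSpider(scrapy.Spider):
        name = 'toscrape-css'

        def parse(self, response):
            # Default callback, used when no callback argument is passed.
            for quote in response.css('div.quote'):
                yield {'text': quote.css('span.text::text').get()}

        def parse_page(self, response):
            # Alternative callback, selectable via callback=parse_page.
            yield {'title': response.css('title::text').get()}

        def my_errback(self, failure):
            # The errback must be a callable method on the spider; otherwise
            # ScrapyRT now returns a 400 HTTP error instead of raising.
            self.logger.error('Realtime request failed: %r', failure)

With ``DEFAULT_ERRBACK_NAME = 'my_errback'`` in the ScrapyRT settings, ``my_errback`` is also used when the ``errback`` argument is omitted.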