From fc97dd1e2231c70046b948d967f25fb69e06a1eb Mon Sep 17 00:00:00 2001 From: Jonathan Alvarado Date: Thu, 11 Jan 2024 16:31:46 -0600 Subject: [PATCH 1/6] errback should default to None rather than the 'parse' method The errback should never have been defaulted to the 'parse' method of the spider. By doing this it invalidates what the scrapy docs say. Also, there is no documentation on the scrapy site that says that exceptions get sent to the parse method. The reason this was found is because the error handling in the `process_spider_exception` middleware was never getting called as the scrapy docs said it should be. The workaround to get it to work the way it did before with the 'parse' method is to add `&errback=parse` to the request. --- scrapyrt/core.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scrapyrt/core.py b/scrapyrt/core.py index c0b240a..b2353ba 100644 --- a/scrapyrt/core.py +++ b/scrapyrt/core.py @@ -120,7 +120,7 @@ def __init__(self, spider_name, request_kwargs, # because we need to know if spider has method available self.callback_name = request_kwargs.pop('callback', None) or 'parse' # do the same for errback - self.errback_name = request_kwargs.pop('errback', None) or 'parse' + self.errback_name = request_kwargs.pop('errback', None) if request_kwargs.get("url"): self.request = self.create_spider_request(deepcopy(request_kwargs)) @@ -175,9 +175,11 @@ def spider_idle(self, spider): assert callable(callback), 'Invalid callback' self.request = self.request.replace(callback=callback) - errback = getattr(self.crawler.spider, self.errback_name) - assert callable(errback), 'Invalid errback' - self.request = self.request.replace(errback=errback) + + if self.errback_name: + errback = getattr(self.crawler.spider, self.errback_name) + assert callable(errback), 'Invalid errback' + self.request = self.request.replace(errback=errback) modify_request = getattr( self.crawler.spider, "modify_realtime_request", None) if
callable(modify_request): From 40aa643570ba690126054862ffd38f2d6f2ba878 Mon Sep 17 00:00:00 2001 From: Jonathan Alvarado Date: Mon, 15 Jan 2024 14:48:16 -0600 Subject: [PATCH 2/6] Add a DEFAULT_ERRBACK_NAME to settings This will allow the ability to change the non-standard behavior of sending exceptions to the `parse` method of the spider without introducing a breaking change to scrapyrt. It also introduces some documentation of the existing behavior. --- docs/source/api.rst | 7 +++++++ scrapyrt/conf/default_settings.py | 4 +++- scrapyrt/core.py | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index b310355..81db0b9 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -517,6 +517,13 @@ Encoding that's used to encode log messages. Default: ``utf-8``. +DEFAULT_ERRBACK_NAME +~~~~~~~~ + +The name of the default errback method to call on the spider in case of an exception. The default errback method is ``parse`` to maintain backwards compatibility but it is not standard to scrapy and may interfere with the use of middlewares which implement the ``process_spider_exception`` method. Use a setting of ``None`` if you don't want to use the default scrapy exception handling. + +Default: ``parse``. Use the ``parse`` method on scrapy spider to handle exceptions. Be aware that this is non-standard to typical scrapy spiders. 
+ Spider settings --------------- diff --git a/scrapyrt/conf/default_settings.py b/scrapyrt/conf/default_settings.py index 99d7746..a7b6f45 100644 --- a/scrapyrt/conf/default_settings.py +++ b/scrapyrt/conf/default_settings.py @@ -31,4 +31,6 @@ # disable in production DEBUG = True -TWISTED_REACTOR = None \ No newline at end of file +TWISTED_REACTOR = None + +DEFAULT_ERRBACK_NAME = 'parse' diff --git a/scrapyrt/core.py b/scrapyrt/core.py index b2353ba..c18fc54 100644 --- a/scrapyrt/core.py +++ b/scrapyrt/core.py @@ -120,7 +120,7 @@ def __init__(self, spider_name, request_kwargs, # because we need to know if spider has method available self.callback_name = request_kwargs.pop('callback', None) or 'parse' # do the same for errback - self.errback_name = request_kwargs.pop('errback', None) + self.errback_name = request_kwargs.pop('errback', None) or app_settings.DEFAULT_ERRBACK_NAME if request_kwargs.get("url"): self.request = self.create_spider_request(deepcopy(request_kwargs)) From 31cb1565788f85945de4d0a29f5d083589d6d2b9 Mon Sep 17 00:00:00 2001 From: Jonathan Alvarado Date: Fri, 19 Jan 2024 10:37:42 -0600 Subject: [PATCH 3/6] Add user_error support Currently the application is not reporting to the user when the user provides an invalid errback or callback method. The scheduling of the request and validation of the spider callback and errback happens in a different thread than the one which is handling the api request. So, we need a different mechanism to communicate with the api request thread than simply raising the exception. We already do this for other errors and responses by adding properties to the CrawlManager object. So it seems best to also communicate this exception to the api request by using a user_error property on the CrawlManager. Then the exception can be raised in the context of the api request. 
--- scrapyrt/core.py | 41 ++++++++++++++++++++++-------------- scrapyrt/resources.py | 3 +++ tests/test_crawl_manager.py | 15 ++++++++----- tests/test_resource_crawl.py | 11 ++++++++++ 4 files changed, 49 insertions(+), 21 deletions(-) diff --git a/scrapyrt/core.py b/scrapyrt/core.py index c18fc54..f65cc1d 100644 --- a/scrapyrt/core.py +++ b/scrapyrt/core.py @@ -3,6 +3,7 @@ from copy import deepcopy import datetime import os +import traceback from scrapy import signals from scrapy.crawler import CrawlerRunner, Crawler @@ -109,6 +110,7 @@ def __init__(self, spider_name, request_kwargs, self.items = [] self.items_dropped = [] self.errors = [] + self.user_error = None self.max_requests = int(max_requests) if max_requests else None self.timeout_limit = int(app_settings.TIMEOUT_LIMIT) self.request_count = 0 @@ -171,22 +173,26 @@ def spider_idle(self, spider): """ if spider is self.crawler.spider and self.request and not self._request_scheduled: - callback = getattr(self.crawler.spider, self.callback_name) - assert callable(callback), 'Invalid callback' - self.request = self.request.replace(callback=callback) - - - if self.errback_name: - errback = getattr(self.crawler.spider, self.errback_name) - assert callable(errback), 'Invalid errback' - self.request = self.request.replace(errback=errback) - modify_request = getattr( - self.crawler.spider, "modify_realtime_request", None) - if callable(modify_request): - self.request = modify_request(self.request) - spider.crawler.engine.crawl(self.request) - self._request_scheduled = True - raise DontCloseSpider + try: + callback = getattr(self.crawler.spider, self.callback_name) + assert callable(callback), 'Invalid callback' + self.request = self.request.replace(callback=callback) + + + if self.errback_name: + errback = getattr(self.crawler.spider, self.errback_name) + assert callable(errback), 'Invalid errback' + self.request = self.request.replace(errback=errback) + modify_request = getattr( + self.crawler.spider, 
"modify_realtime_request", None) + if callable(modify_request): + self.request = modify_request(self.request) + spider.crawler.engine.crawl(self.request) + self._request_scheduled = True + except Exception as e: + self.user_error = Error(400, message=traceback.format_exc()) + else: + raise DontCloseSpider def handle_scheduling(self, request, spider): """Handler of request_scheduled signal. @@ -240,6 +246,9 @@ def return_items(self, result): "stats": stats, "spider_name": self.spider_name, } + + results["user_error"] = self.user_error + if self.debug: results["errors"] = self.errors return results diff --git a/scrapyrt/resources.py b/scrapyrt/resources.py index b071175..bd673c2 100644 --- a/scrapyrt/resources.py +++ b/scrapyrt/resources.py @@ -261,6 +261,9 @@ def run_crawl(self, spider_name, scrapy_request_args, def prepare_response(self, result, *args, **kwargs): items = result.get("items") + user_error = result.get("user_error", None) + if user_error: + raise user_error response = { "status": "ok", "items": items, diff --git a/tests/test_crawl_manager.py b/tests/test_crawl_manager.py index 3bdb38c..63aec46 100644 --- a/tests/test_crawl_manager.py +++ b/tests/test_crawl_manager.py @@ -111,8 +111,10 @@ def test_spider_opened(self): def test_raise_error_if_not_callable(self): self.spider.parse_something = None - self.assertRaises( - AssertionError, self.crawl_manager.spider_idle, self.spider) + self._call_spider_idle() + self.assertIsNotNone(self.crawl_manager.user_error) + msg = "Invalid callback" + assert re.search(msg, self.crawl_manager.user_error.message) self.assertFalse(self.crawler.engine.crawl.called) def test_modify_realtime_request(self): @@ -142,15 +144,17 @@ def test_pass_wrong_spider_errback(self): mng = self.create_crawl_manager( {'url': 'http://localhost', 'errback': 'handle_error'} ) + try: - with pytest.raises(AttributeError) as err: - mng.spider_idle(self.spider) + mng.spider_idle(self.spider) except DontCloseSpider: pass assert mng.request.errback 
is None + + self.assertIsNotNone(mng.user_error) msg = "has no attribute 'handle_error'" - assert re.search(msg, str(err)) + assert re.search(msg, mng.user_error.message) def test_pass_good_spider_errback(self): mng = self.create_crawl_manager( @@ -330,6 +334,7 @@ def setUp(self): 'items_dropped': self.crawl_manager.items_dropped, 'stats': self.stats.copy(), 'spider_name': self.spider.name, + 'user_error': None, } def test_return_items(self): diff --git a/tests/test_resource_crawl.py b/tests/test_resource_crawl.py index 0900c26..7b9cc16 100644 --- a/tests/test_resource_crawl.py +++ b/tests/test_resource_crawl.py @@ -142,6 +142,17 @@ def test_prepare_response(self, resource): for key, value in expected: assert prepared_res[key] == value + def test_prepare_response_user_error_raised(self, resource): + result = { + 'items': [1, 2], + 'stats': [99], + 'spider_name': 'test' + } + result['user_error'] = Exception("my exception") + with pytest.raises(Exception) as e_info: + resource.prepare_response(result) + assert e_info.message == "my exception" + class TestCrawlResourceGetRequiredArgument(unittest.TestCase): From 13394dd50dd37494e002a8b2f54befbfa098657b Mon Sep 17 00:00:00 2001 From: Jonathan Alvarado Date: Mon, 22 Jan 2024 13:40:09 -0600 Subject: [PATCH 4/6] Update docs/source/api.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves --- docs/source/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 81db0b9..3a896b9 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -518,7 +518,7 @@ Encoding that's used to encode log messages. Default: ``utf-8``. DEFAULT_ERRBACK_NAME -~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~ The name of the default errback method to call on the spider in case of an exception. 
The default errback method is ``parse`` to maintain backwards compatibility but it is not standard to scrapy and may interfere with the use of middlewares which implement the ``process_spider_exception`` method. Use a setting of ``None`` if you don't want to use the default scrapy exception handling. From 4fe66154331293574b361e15ad4ddcd1fc57f9ad Mon Sep 17 00:00:00 2001 From: Jonathan Alvarado Date: Mon, 22 Jan 2024 13:40:15 -0600 Subject: [PATCH 5/6] Update docs/source/api.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves --- docs/source/api.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 3a896b9..e7e4d7b 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -520,9 +520,13 @@ Default: ``utf-8``. DEFAULT_ERRBACK_NAME ~~~~~~~~~~~~~~~~~~~~ -The name of the default errback method to call on the spider in case of an exception. The default errback method is ``parse`` to maintain backwards compatibility but it is not standard to scrapy and may interfere with the use of middlewares which implement the ``process_spider_exception`` method. Use a setting of ``None`` if you don't want to use the default scrapy exception handling. +Default: ``"parse"`` -Default: ``parse``. Use the ``parse`` method on scrapy spider to handle exceptions. Be aware that this is non-standard to typical scrapy spiders. +The name of the default errback_. + +Use an empty string or ``None`` to unset the errback altogether. + +.. 
_errback: https://docs.scrapy.org/en/latest/topics/request-response.html#using-errbacks-to-catch-exceptions-in-request-processing Spider settings From 3b5a3942ef07d1b29bb68f1a05c158df4fe8426f Mon Sep 17 00:00:00 2001 From: Pawel Miech Date: Wed, 14 Feb 2024 09:41:29 +0100 Subject: [PATCH 6/6] crawl_manager: default errback to None --- docs/source/api.rst | 17 ++++++++++++----- scrapyrt/conf/default_settings.py | 2 +- scrapyrt/core.py | 31 +++++++++++++++++++------------ tests/test_crawl_manager.py | 4 ++-- 4 files changed, 34 insertions(+), 20 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index e7e4d7b..59cec8a 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -97,14 +97,20 @@ callback - optional Must exist as method of scheduled spider, does not need to contain string "self". - If not passed or not found on spider default callback `parse`_ will be used. + If not passed default Scrapy callback `parse`_ will be used. If there is no spider method + with name specified by callback argument or callback is not callable API will return 400 HTTP error. + + Example request with callback: ``/crawl.json?url=https://quotes.toscrape.com/&spider_name=toscrape-css&callback=parse_page`` errback - type: string - optional Scrapy errback for request made from spider. It must exist as method of - scheduled spider, otherwise exception will be raised. String does not need to contain 'self'. + scheduled spider, otherwise API will return 400 HTTP error. String does not need to contain 'self'. + Defaults to None, can be adjusted with `DEFAULT_ERRBACK_NAME`_ setting. + + Example request with errback: ``/crawl.json?url=https://quotes.toscrape.com/&spider_name=toscrape-css&errback=my_errback`` max_requests - type: integer @@ -520,11 +526,12 @@ Default: ``utf-8``. DEFAULT_ERRBACK_NAME ~~~~~~~~~~~~~~~~~~~~ -Default: ``"parse"`` +Default: ``None`` -The name of the default errback_. +String with the name of the default errback_. 
-Use an empty string or ``None`` to unset the errback altogether. +Use this setting to set default errback for scrapy spider requests made from ScrapyRT. +Errback must exist as method of spider and must be callable, otherwise 400 HTTP error will be raised. .. _errback: https://docs.scrapy.org/en/latest/topics/request-response.html#using-errbacks-to-catch-exceptions-in-request-processing diff --git a/scrapyrt/conf/default_settings.py b/scrapyrt/conf/default_settings.py index a7b6f45..eea5e72 100644 --- a/scrapyrt/conf/default_settings.py +++ b/scrapyrt/conf/default_settings.py @@ -33,4 +33,4 @@ TWISTED_REACTOR = None -DEFAULT_ERRBACK_NAME = 'parse' +DEFAULT_ERRBACK_NAME = None diff --git a/scrapyrt/core.py b/scrapyrt/core.py index f65cc1d..ccff725 100644 --- a/scrapyrt/core.py +++ b/scrapyrt/core.py @@ -177,22 +177,29 @@ def spider_idle(self, spider): callback = getattr(self.crawler.spider, self.callback_name) assert callable(callback), 'Invalid callback' self.request = self.request.replace(callback=callback) - - + except (AssertionError, AttributeError): + msg = f"Invalid spider callback {self.callback_name}, callback not callable or not a method of a spider {self.spider_name}" + self.user_error = Error(400, message=msg) + try: if self.errback_name: errback = getattr(self.crawler.spider, self.errback_name) assert callable(errback), 'Invalid errback' self.request = self.request.replace(errback=errback) - modify_request = getattr( - self.crawler.spider, "modify_realtime_request", None) - if callable(modify_request): - self.request = modify_request(self.request) - spider.crawler.engine.crawl(self.request) - self._request_scheduled = True - except Exception as e: - self.user_error = Error(400, message=traceback.format_exc()) - else: - raise DontCloseSpider + except (AssertionError, AttributeError): + msg = f"Invalid spider errback {self.errback_name}, errback not callable or not a method of a spider {self.spider_name}" + self.user_error = Error(400, message=msg) + if 
self.user_error: + log.msg(self.user_error.message, level=log.ERROR) + return + + modify_request = getattr( + self.crawler.spider, "modify_realtime_request", None) + if callable(modify_request): + self.request = modify_request(self.request) + + spider.crawler.engine.crawl(self.request) + self._request_scheduled = True + raise DontCloseSpider def handle_scheduling(self, request, spider): """Handler of request_scheduled signal. diff --git a/tests/test_crawl_manager.py b/tests/test_crawl_manager.py index 63aec46..3811d02 100644 --- a/tests/test_crawl_manager.py +++ b/tests/test_crawl_manager.py @@ -113,7 +113,7 @@ def test_raise_error_if_not_callable(self): self.spider.parse_something = None self._call_spider_idle() self.assertIsNotNone(self.crawl_manager.user_error) - msg = "Invalid callback" + msg = "Invalid spider callback parse_something" assert re.search(msg, self.crawl_manager.user_error.message) self.assertFalse(self.crawler.engine.crawl.called) @@ -153,7 +153,7 @@ def test_pass_wrong_spider_errback(self): assert mng.request.errback is None self.assertIsNotNone(mng.user_error) - msg = "has no attribute 'handle_error'" + msg = "Invalid spider errback" assert re.search(msg, mng.user_error.message) def test_pass_good_spider_errback(self):