Retry mechanism for transient errors #49

Merged · 7 commits · Jul 3, 2024
2 changes: 1 addition & 1 deletion src/tiny_web_crawler/core/spider.py
@@ -85,7 +85,7 @@ def crawl(self, url: str) -> None:
             return
 
         logger.debug("Crawling: %s", url)
-        soup = fetch_url(url)
+        soup = fetch_url(url, retries=self.settings.max_retry_attempts)
         if not soup:
             return
 
1 change: 1 addition & 0 deletions src/tiny_web_crawler/core/spider_settings.py
@@ -40,6 +40,7 @@ class CrawlSettings:
     internal_links_only: bool = False
     external_links_only: bool = False
     respect_robots_txt: bool = True
+    max_retry_attempts: int = 5
 
 @dataclass
 class SpiderSettings(GeneralSettings, CrawlSettings):
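The new max_retry_attempts field defaults to 5 and flows into the fetcher through Spider.crawl() (see the spider.py hunk above). A minimal usage sketch based on the call patterns in the tests below; the URL and the value 3 are illustrative, not taken from this PR:

from tiny_web_crawler import Spider
from tiny_web_crawler import SpiderSettings

# Illustrative values: the URL and retry budget are examples, not from the diff.
spider = Spider(
    SpiderSettings(
        root_url="http://example.com",
        max_retry_attempts=3,  # overrides the default of 5
    )
)
spider.crawl("http://example.com")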
14 changes: 13 additions & 1 deletion src/tiny_web_crawler/networking/fetcher.py
@@ -1,20 +1,32 @@
 from typing import Optional
+import time
 
 import requests
 from bs4 import BeautifulSoup
 
 from tiny_web_crawler.logging import get_logger
 
+TRANSIENT_ERRORS = [408, 502, 503, 504]
+
 logger = get_logger()
 
-def fetch_url(url: str) -> Optional[BeautifulSoup]:
+def is_transient_error(status_code: int) -> bool:
+    return status_code in TRANSIENT_ERRORS
+
+def fetch_url(url: str, retries: int, attempts: int = 0) -> Optional[BeautifulSoup]:
     try:
         response = requests.get(url, timeout=10)
         response.raise_for_status()
         data = response.text
         return BeautifulSoup(data, 'lxml')
     except requests.exceptions.HTTPError as http_err:
+        if response.status_code and is_transient_error(response.status_code) and retries > 0:
+            logger.error("Transient HTTP error occurred: %s. Retrying...", http_err)
+            time.sleep(attempts + 1)
+            return fetch_url(url, retries - 1, attempts + 1)
+
         logger.error("HTTP error occurred: %s", http_err)
         return None
     except requests.exceptions.ConnectionError as conn_err:
         logger.error("Connection error occurred: %s", conn_err)
     except requests.exceptions.Timeout as timeout_err:
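The merged fetch_url retries recursively: each transient failure (HTTP 408, 502, 503, or 504) logs, sleeps attempts + 1 seconds, and recurses with retries - 1, so a budget of n retries costs one initial request plus up to n retries, with linear backoff delays of 1, 2, ..., n seconds. For clarity, an equivalent iterative sketch of that behavior (a restatement, not the code merged here):

import time
from typing import Optional

import requests
from bs4 import BeautifulSoup

TRANSIENT_ERRORS = [408, 502, 503, 504]

def fetch_url_iterative(url: str, retries: int) -> Optional[BeautifulSoup]:
    # One initial attempt plus up to `retries` retries; sleep 1s, 2s, ...
    # between attempts, mirroring the recursive version's linear backoff.
    for attempt in range(retries + 1):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return BeautifulSoup(response.text, 'lxml')
        except requests.exceptions.HTTPError:
            # raise_for_status() guarantees `response` is bound here.
            if response.status_code in TRANSIENT_ERRORS and attempt < retries:
                time.sleep(attempt + 1)
                continue
            return None
    return None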
98 changes: 92 additions & 6 deletions tests/networking/test_fetcher.py
@@ -1,3 +1,4 @@
+from unittest.mock import patch
 
 import responses
 import requests
@@ -15,7 +16,7 @@ def test_fetch_url() -> None:
         status=200
     )
 
-    resp = fetch_url("http://example.com")
+    resp = fetch_url("http://example.com", 1)
 
     assert resp is not None
     assert resp.text == "link"
@@ -26,15 +27,15 @@ def test_fetch_url_connection_error(caplog) -> None:  # type: ignore
 
     with caplog.at_level(ERROR):
         # Fetch url whose response isn't mocked to raise ConnectionError
-        resp = fetch_url("http://connection.error")
+        resp = fetch_url("http://connection.error", 1)
 
     assert "Connection error occurred:" in caplog.text
     assert resp is None
 
 
 @responses.activate
 def test_fetch_url_http_error(caplog) -> None:  # type: ignore
-    error_codes = [403, 404, 408]
+    error_codes = [403, 404, 412]
 
     for error_code in error_codes:
         setup_mock_response(
@@ -44,7 +45,7 @@ def test_fetch_url_http_error(caplog) -> None:  # type: ignore
         )
 
         with caplog.at_level(ERROR):
-            resp = fetch_url(f"http://http.error/{error_code}")
+            resp = fetch_url(f"http://http.error/{error_code}", 1)
 
         assert "HTTP error occurred:" in caplog.text
         assert resp is None
@@ -60,7 +61,7 @@ def test_fetch_url_timeout_error(caplog) -> None:  # type: ignore
 
     with caplog.at_level(ERROR):
         # Fetch url whose response isn't mocked to raise Timeout
-        resp = fetch_url("http://timeout.error")
+        resp = fetch_url("http://timeout.error", 1)
 
     assert "Timeout error occurred:" in caplog.text
     assert resp is None
@@ -76,7 +77,7 @@ def test_fetch_url_requests_exception(caplog) -> None:  # type: ignore
 
     with caplog.at_level(ERROR):
         # Fetch url whose response isn't mocked to raise RequestException
-        resp = fetch_url("http://requests.exception")
+        resp = fetch_url("http://requests.exception", 1)
 
     assert "Request error occurred:" in caplog.text
     assert resp is None


+@patch("time.sleep")
+@responses.activate
+def test_fetch_url_transient_error_retry_5(mock_sleep, caplog) -> None:  # type: ignore
+    setup_mock_response(
+        url="http://transient.error",
+        body="<html><body><a href='http://transient.error'>link</a></body></html>",
+        status=503
+    )
+
+    max_retry_attempts = 5
+
+    with caplog.at_level(ERROR):
+        resp = fetch_url("http://transient.error", max_retry_attempts)
+
+    assert resp is None
+
+    # Assert url was fetched once, then retried max_retry_attempts times
+    assert len(responses.calls) == max_retry_attempts + 1
+
+    # Assert sleep time grew with every request
+    expected_delays = [1, 2, 3, 4, 5]
+    actual_delays = [call.args[0] for call in mock_sleep.call_args_list]
+    assert actual_delays == expected_delays
+
+    assert "Transient HTTP error occurred:" in caplog.text
+
+
+@patch("time.sleep")
+@responses.activate
+def test_fetch_url_transient_error_retry_10(mock_sleep, caplog) -> None:  # type: ignore
+    setup_mock_response(
+        url="http://transient.error",
+        body="<html><body><a href='http://transient.error'>link</a></body></html>",
+        status=503
+    )
+
+    max_retry_attempts = 10
+
+    with caplog.at_level(ERROR):
+        resp = fetch_url("http://transient.error", max_retry_attempts)
+
+    assert resp is None
+
+    # Assert url was fetched once, then retried max_retry_attempts times
+    assert len(responses.calls) == max_retry_attempts + 1
+
+    # Assert sleep time grew with every request
+    expected_delays = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+    actual_delays = [call.args[0] for call in mock_sleep.call_args_list]
+    assert actual_delays == expected_delays
+
+    assert "Transient HTTP error occurred:" in caplog.text
+
+
+@patch("time.sleep")
+@responses.activate
+def test_fetch_url_transient_error_retry_success(mock_sleep, caplog) -> None:  # type: ignore
+    setup_mock_response(
+        url="http://transient.error",
+        body="<html><body><a href='http://transient.error'>link</a></body></html>",
+        status=503
+    )
+    setup_mock_response(
+        url="http://transient.error",
+        body="<html><body><a href='http://transient.error'>link</a></body></html>",
+        status=200
+    )
+
+    max_retry_attempts = 1
+
+    with caplog.at_level(ERROR):
+        resp = fetch_url("http://transient.error", max_retry_attempts)
+
+    assert resp is not None
+    assert resp.text == "link"
+
+    # Assert url was fetched 2 times
+    assert len(responses.calls) == 2
+
+    # Assert time.sleep was called
+    mock_sleep.assert_called_once_with(1)
+
+    assert "Transient HTTP error occurred:" in caplog.text
59 changes: 58 additions & 1 deletion tests/test_crawler.py
@@ -8,7 +8,7 @@
 
 from tiny_web_crawler import Spider
 from tiny_web_crawler import SpiderSettings
-from tiny_web_crawler.logging import DEBUG, WARNING
+from tiny_web_crawler.logging import DEBUG, WARNING, ERROR
 from tests.utils import setup_mock_response
 
 @responses.activate
@@ -490,3 +490,60 @@ def test_respect_robots_txt_crawl_delay(mock_sleep, mock_urlopen, caplog) -> None:
 def test_crawl_no_root_url() -> None:
     with pytest.raises(ValueError):
         Spider(SpiderSettings(verbose=False))


+@patch("time.sleep")
+@responses.activate
+def test_crawl_url_transient_retry(mock_sleep, caplog) -> None:  # type: ignore
+    setup_mock_response(
+        url="http://transient.error",
+        body="<html><body><a href='http://transient.error'>link</a></body></html>",
+        status=503
+    )
+
+    spider = Spider(
+        SpiderSettings(root_url="http://transient.error",
+                       respect_robots_txt=False)
+    )
+
+    with caplog.at_level(ERROR):
+        spider.crawl("http://transient.error")
+
+    assert spider.crawl_result == {}
+
+    assert len(responses.calls) == 6
+
+    expected_delays = [1, 2, 3, 4, 5]
+    actual_delays = [call.args[0] for call in mock_sleep.call_args_list]
+    assert actual_delays == expected_delays
+
+    assert "Transient HTTP error occurred:" in caplog.text
+
+
+@patch("time.sleep")
+@responses.activate
+def test_crawl_url_transient_retry_custom_retry_amount(mock_sleep, caplog) -> None:  # type: ignore
+    setup_mock_response(
+        url="http://transient.error",
+        body="<html><body><a href='http://transient.error'>link</a></body></html>",
+        status=503
+    )
+
+    spider = Spider(
+        SpiderSettings(root_url="http://transient.error",
+                       max_retry_attempts=10,
+                       respect_robots_txt=False)
+    )
+
+    with caplog.at_level(ERROR):
+        spider.crawl("http://transient.error")
+
+    assert spider.crawl_result == {}
+
+    assert len(responses.calls) == 11
+
+    expected_delays = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+    actual_delays = [call.args[0] for call in mock_sleep.call_args_list]
+    assert actual_delays == expected_delays
+
+    assert "Transient HTTP error occurred:" in caplog.text