From cc80b76a821f88d6d815314c2a119144165827d6 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 7 Apr 2025 19:38:57 +0000 Subject: [PATCH 1/9] add impit in dependencies --- pyproject.toml | 2 ++ uv.lock | 55 +++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 96db0b8cd0..4f349273b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ all = [ "curl-cffi>=0.9.0", "html5lib>=1.0", "inquirer>=3.3.0", + "impit>=0.1.0", "jaro-winkler>=2.0.3", "parsel>=1.10.0", "playwright>=1.27.0", @@ -76,6 +77,7 @@ adaptive-crawler = [ beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"] cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"] curl-impersonate = ["curl-cffi>=0.9.0"] +impit = ["impit>=0.1.0"] parsel = ["parsel>=1.10.0"] playwright = ["playwright>=1.27.0"] diff --git a/uv.lock b/uv.lock index 39af09df40..f9b2521fe0 100644 --- a/uv.lock +++ b/uv.lock @@ -600,7 +600,7 @@ toml = [ [[package]] name = "crawlee" -version = "0.6.6" +version = "0.6.7" source = { editable = "." } dependencies = [ { name = "apify-fingerprint-datapoints" }, @@ -633,6 +633,7 @@ all = [ { name = "cookiecutter" }, { name = "curl-cffi" }, { name = "html5lib" }, + { name = "impit" }, { name = "inquirer" }, { name = "jaro-winkler" }, { name = "parsel" }, @@ -654,6 +655,9 @@ cli = [ curl-impersonate = [ { name = "curl-cffi" }, ] +impit = [ + { name = "impit" }, +] parsel = [ { name = "parsel" }, ] @@ -701,6 +705,8 @@ requires-dist = [ { name = "html5lib", marker = "extra == 'all'", specifier = ">=1.0" }, { name = "html5lib", marker = "extra == 'beautifulsoup'", specifier = ">=1.0" }, { name = "httpx", extras = ["brotli", "http2", "zstd"], specifier = ">=0.27.0" }, + { name = "impit", marker = "extra == 'all'", specifier = ">=0.1.0" }, + { name = "impit", marker = "extra == 'impit'", specifier = ">=0.1.0" }, { name = "inquirer", marker = "extra == 'all'", specifier = ">=3.3.0" }, { name = "inquirer", marker = "extra == 'cli'", specifier = ">=3.3.0" }, { name = "jaro-winkler", marker = "extra == 'adaptive-crawler'", specifier = ">=2.0.3" }, @@ -728,7 +734,7 @@ requires-dist = [ { name = "typing-extensions", specifier = ">=4.1.0" }, { name = "yarl", specifier = ">=1.18.0" }, ] -provides-extras = ["all", "adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "parsel", "playwright"] +provides-extras = ["all", "adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "impit", "parsel", "playwright"] [package.metadata.requires-dev] dev = [ @@ -1144,12 +1150,55 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, ] +[[package]] +name = "impit" +version = "0.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0d/2c/b0a632b1c455ba97b77395fa57fd086fef4f8a8d9604b6de959dbd959a36/impit-0.1.0.tar.gz", hash = "sha256:7aadb7ed30b17515eabf53afd990675d3b097eedd538abe4f4754be8914924cd", size = 41141 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/5c/f2424cddfdad10cfcd92a38a1f3b31ff046db50d68a9b8f5e5a0b4ff41ad/impit-0.1.0-1-cp310-cp310-manylinux_2_34_x86_64.whl", hash = "sha256:8bbdf3785924a50ffb773f3ed12cb30f930ffe329d25f699b46608aac1b46e98", size = 5355850 }, + { url = "https://files.pythonhosted.org/packages/e7/bd/f34fa4678b1317a20d0eec57dc9372bde77462f92e798878bb4ccaad6621/impit-0.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02d818c3b076e0d665f2bc93bcd30fd20a7935787d717799836697529fb315db", size = 5335799 }, + { url = "https://files.pythonhosted.org/packages/17/d6/c1210c27d0632b3d51e62339d9c8d0b0f23699c875393465675d25fe6db4/impit-0.1.0-cp310-cp310-manylinux_2_34_x86_64.whl", hash = "sha256:ad54fae8954bd8366f2891c6b7d818eb207b4768b8456fcff478f0217d8d5a49", size = 47737431 }, + { url = "https://files.pythonhosted.org/packages/0c/80/06d7a7e557420ff979449e28d4190c5291fddfe8a745de3e0b6a35c7d85d/impit-0.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:85a43d70cb2ee95f3446f167c1ea235b0143fa4a4dbc9e499bd69625fe8c8338", size = 5626908 }, + { url = "https://files.pythonhosted.org/packages/6b/50/2cea3aae32bde1a535369f78c81ac530a6e801586daac482d9e8f378617b/impit-0.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:132ab63a471475286c163b19cb3b03ad78b8597cfdda792d35d681ec3bdb6b57", size = 5482225 }, + { url = "https://files.pythonhosted.org/packages/f9/a6/3efac7442f754be942d8195535f90229dad89657481ab87b2d708a989792/impit-0.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:fc347a36ec394761ffedaa84528e903d53c7a08ef5c6d43b2fe22a562db92d89", size = 3319369 }, + { url = "https://files.pythonhosted.org/packages/81/41/1331c44227440943a66ce2c6e1b9920b30e0d5e9d0756ffda58e678d8ba8/impit-0.1.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d74c079e059285059c325d18e856458969d85ec6a326af8e1d4f740f75a252fb", size = 3311457 }, + { url = "https://files.pythonhosted.org/packages/5c/1d/2a17a60b4b17448131bc1e24aa6c0c9fa309e548a26ab4e84464d09be46b/impit-0.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2cab190278e7bc0777463b522fdebf329b9ff1202d398472998c33910c92f9e9", size = 3124958 }, + { url = "https://files.pythonhosted.org/packages/4b/58/25930cc6dd91e11a0aff4910886d5246c7103942c4eb506aaef26d04c077/impit-0.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c16dc7f902237d81fe45b04d5ce744c29fdd433ad1493246af83cf3f2fff8d7", size = 5335511 }, + { url = "https://files.pythonhosted.org/packages/9c/b0/41a8cfe609fd121cf5456c456eb4eb1743f3caf73c13228a6fdfe1d0f455/impit-0.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:aa4b69ec94c4ed13c72fa782283faf1aaa8747648c83c9c2422c06be8181abee", size = 5626678 }, + { url = "https://files.pythonhosted.org/packages/38/a7/e26db34cf700ad1f41ca409ce062fb87d2abf5dda9d7bbbb49f8cb1185b0/impit-0.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:97c0687bd12c7532eb9a383ddcf92cb5d10bc19e8a62dbe3a5436bd14737f9a2", size = 5482274 }, + { url = "https://files.pythonhosted.org/packages/67/a1/1dfdcee3f4d2ce22dc36cc4c2894f126af3f3abac40eb2acebb84fb5679c/impit-0.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:d0624c286cfc02d59522740f4301fb1bc9a6398c5d8451a475523a5d0c41b791", size = 3319398 }, + { url = "https://files.pythonhosted.org/packages/13/34/3fe6e73f10fb685b8def6a55faf298a8e86e7080f1d4b917694f4c1dcbfb/impit-0.1.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:b3fbaf19cff978a6e076fa952241fe64c99c0af3e55e0acab0c1b68737112b16", size = 3310885 }, + { url = "https://files.pythonhosted.org/packages/be/93/de7fe2dd6fd5d8696145a0b60bd306c39cf64eeb70c0a3763779582883b4/impit-0.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:195de09d104f610891ee3ba5ca460d671a2a06abf4d0975be68d28397c3dd9a4", size = 3126163 }, + { url = "https://files.pythonhosted.org/packages/bf/36/ed7d89faa763b8cd2bddd3a7e274696fafd041c371490dd6b8edb875287c/impit-0.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:218570ec98e59c5e75bb83e211cc6107aabe1f63688f4c1ce518951e71287acf", size = 5335000 }, + { url = "https://files.pythonhosted.org/packages/72/37/d1fc899462f5ef4f95b6053bdc19ea470d6fa707c682d81c7a76beb71e38/impit-0.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9f776b6094e6b6413f06ec889051b931bce3f4d14fd0d8d78173b6e5b001dd66", size = 5626951 }, + { url = "https://files.pythonhosted.org/packages/c3/8b/95881985ca03698df43021883f53830765886fc7f6d4b67911dfe1f6720a/impit-0.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:517bf72aad0d2d364023559cdd670740a450419adf4ad96e8ba311e08d7504ac", size = 5481425 }, + { url = "https://files.pythonhosted.org/packages/b1/09/3b6526be6580cd8f011f944b8c309beaea2a725ca24568f79d9162e9275d/impit-0.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:fd1206d8c6590fa3c5acfb78c646d8fb2cc4f2ae01b9506781b71f0b20b27036", size = 3320896 }, + { url = "https://files.pythonhosted.org/packages/e7/c4/8ccf4542b67365594a20d8b2a5baf0c993a5786abb685d25766ee235e71f/impit-0.1.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:c3154f184dcf987969318f7c82356857a6159c6a39ec8162be345d25036e60ce", size = 3310831 }, + { url = "https://files.pythonhosted.org/packages/3b/c1/efc519bb3a59b297ba289ccec4c2b81cc99beae5874e56f2575e05176934/impit-0.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6c82429009280a975ddf13840da7bcfd1989fb9c8aba0ea86175f2ef44fe177b", size = 3126331 }, + { url = "https://files.pythonhosted.org/packages/61/e7/deca560d53d2c04e159d73cf8bb970122e72a0bd225b7cca26e16bf0ef5a/impit-0.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f4cdadf8f4ad5c61580eb8faeaced122346510492311389d1a1bc3c656d0a30e", size = 5334923 }, + { url = "https://files.pythonhosted.org/packages/ea/f1/6285cf49cf1ef55322705fc768ffcf6ca642ebee1011becee39469bd26c7/impit-0.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:67fc248a0c1e044a3ccfcf1889f92faadcd994dbc689573b3ca2e099037f7a79", size = 5626758 }, + { url = "https://files.pythonhosted.org/packages/f5/c0/326173747b91c52b862801984a11119ec4f37be8326f38e3ea2d0343316c/impit-0.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:43eaba84e90c177225a591870fb3fc0195870e2811b8e4b7bbcf4e98c02279e1", size = 5481611 }, + { url = "https://files.pythonhosted.org/packages/89/7b/859ada2d64d867e1582bc40ead115f80b55e9b75d33ca4940c81a3d7b0d8/impit-0.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:f6ea7e335739aff0a266357591cc87984e7e5cce1eed9b6797aab2cbc12cf558", size = 3320648 }, + { url = "https://files.pythonhosted.org/packages/a9/e4/7b043f5ceb1600e0420a261c11180f2c7b62f43b32e1ab476f9d98e9d435/impit-0.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:5262b4dd63fc770e22f4d0fc42fce5fb7386ee7bb8735ce52ccbf862ebd6715b", size = 5626627 }, + { url = "https://files.pythonhosted.org/packages/db/fb/b40896632145cc564501ef8286de2e0761514ab7f740bf30fbdc5f1e9460/impit-0.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:63a0613d9df5d89d981a6f4ac33c2d5b3850497ee197d527c2c370bbb3c10c6d", size = 5481815 }, + { url = "https://files.pythonhosted.org/packages/93/2b/d0a9220645f766513477b93f4054085f9b584bacc9b091eae88210ac1518/impit-0.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9794d90f965173e735ce73a32dbc72ed61c97a5bd62eff910acae450bee7549f", size = 5336070 }, + { url = "https://files.pythonhosted.org/packages/72/98/f34d7cff6ffcdee64b62959d22a44f026f3721a09aabfb700149be845335/impit-0.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fe0a78708dcb93d4abdad28f15a833ff35f818bf0e0ee87404291dbed1713865", size = 5627466 }, + { url = "https://files.pythonhosted.org/packages/19/28/fcbfee48222c3e9a20370ea021c03ad1cca0396b5ea48f165cab1ac37ca9/impit-0.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:19f4ff21e62ad8d25f1d5d8c86c199fe1455f5c2d953bbd1ba7b367e36ab7cc6", size = 5482722 }, + { url = "https://files.pythonhosted.org/packages/4e/3e/309467d5348dd405be6a312e811c00ac5c7adcd7f689cb145fe5aae5b51d/impit-0.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:c927330f4a520482f725d0428ae072e4acf19244bfd47026fc54d24f48ec4930", size = 3320035 }, + { url = "https://files.pythonhosted.org/packages/d6/00/976b399920f4d6398a911716c1254ebb26af9beb39a251f758bbcccd19dc/impit-0.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:81b0fa849966c2feeb7fd140bc7aac7b0badd241303e777696308dadf48bbe54", size = 5336092 }, + { url = "https://files.pythonhosted.org/packages/69/39/8da8bac42b497e972060bfe8e30ddaaaf8025510fab4034f4a2676e3d92a/impit-0.1.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:8d13d04906472c3785c3066c7bcd9e2b38ea686e94613caabbbe983ab893940f", size = 5626742 }, + { url = "https://files.pythonhosted.org/packages/d5/7c/7b6efa5828f1a92f666c4ec76211536168c8a1e44320bdf59ffd246ce265/impit-0.1.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:636a459fb7d20ec9515a0310b8c2c24742232629abd6a5dd0fff70ca40f6f653", size = 5482314 }, + { url = "https://files.pythonhosted.org/packages/e5/6d/25c1ad602dd3f9e0f1df2a753b231813ca56a33dc9dcfb7316dfb1461b0b/impit-0.1.0-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:ec77f5eb3e703ad617acb9812a20ca729da2dadf585e1911a7d7f44daf7d4294", size = 5627255 }, + { url = "https://files.pythonhosted.org/packages/09/5b/b9e424fdd3d623610f9831c0ca80eba712a9912e268826c0740ac0a99da0/impit-0.1.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:66296717b5eb14ec55d076d19779bff02b02922650be392f74cf02a13067be56", size = 5482436 }, +] + [[package]] name = "importlib-metadata" version = "8.6.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "zipp", marker = "python_full_version < '3.11'" }, + { name = "zipp" }, ] sdist = { url = "https://files.pythonhosted.org/packages/33/08/c1395a292bb23fd03bdf572a1357c5a733d3eecbab877641ceacab23db6e/importlib_metadata-8.6.1.tar.gz", hash = "sha256:310b41d755445d74569f993ccfc22838295d9fe005425094fad953d7f15c8580", size = 55767 } wheels = [ From 25cf4ed00f8c900fdc4c8a6c2658c1ddfc45bce0 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 7 Apr 2025 20:12:05 +0000 Subject: [PATCH 2/9] add base for client --- src/crawlee/http_clients/__init__.py | 4 + src/crawlee/http_clients/_impit.py | 167 +++++++++++++++++++++++++++ tests/unit/conftest.py | 5 +- 3 files changed, 175 insertions(+), 1 deletion(-) create mode 100644 src/crawlee/http_clients/_impit.py diff --git a/src/crawlee/http_clients/__init__.py b/src/crawlee/http_clients/__init__.py index 94df9980ec..51235c45b5 100644 --- a/src/crawlee/http_clients/__init__.py +++ b/src/crawlee/http_clients/__init__.py @@ -12,6 +12,9 @@ with _try_import(__name__, 'CurlImpersonateHttpClient'): from ._curl_impersonate import CurlImpersonateHttpClient +with _try_import(__name__, 'ImpitHttpClient'): + from ._impit import ImpitHttpClient + __all__ = [ 'CurlImpersonateHttpClient', @@ -19,4 +22,5 @@ 'HttpCrawlingResult', 'HttpResponse', 'HttpxHttpClient', + 'ImpitHttpClient', ] diff --git a/src/crawlee/http_clients/_impit.py b/src/crawlee/http_clients/_impit.py new file mode 100644 index 0000000000..a5fd8df9fa --- /dev/null +++ b/src/crawlee/http_clients/_impit.py @@ -0,0 +1,167 @@ +from __future__ import annotations + +from logging import getLogger +from typing import TYPE_CHECKING, Any, Optional + +import impit # type: ignore[import-untyped] +from typing_extensions import override + +from crawlee._types import HttpHeaders +from crawlee._utils.docs import docs_group +from crawlee.fingerprint_suite import HeaderGenerator +from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse + +if TYPE_CHECKING: + from crawlee import Request + from crawlee._types import HttpMethod, HttpPayload + from crawlee.proxy_configuration import ProxyInfo + from crawlee.sessions import Session + from crawlee.statistics import Statistics + +logger = getLogger(__name__) + + +class _ImpitResponse: + """Adapter class for `httpx.Response` to conform to the `HttpResponse` protocol.""" + + def __init__(self, response: impit.Response) -> None: + self._response = response + + @property + def http_version(self) -> str: + return str(self._response.http_version) + + @property + def status_code(self) -> int: + return int(self._response.status_code) + + @property + def headers(self) -> HttpHeaders: + return HttpHeaders(dict(self._response.headers)) + + def read(self) -> bytes: + return str(self._response.text).encode() + + +@docs_group('Classes') +class ImpitHttpClient(HttpClient): + """HTTP client based on the `HTTPX` library. + + This client uses the `HTTPX` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses) + and to manage sessions, proxies, and error handling. + + See the `HttpClient` class for more common information about HTTP clients. + + ### Usage + + ```python + from crawlee.crawlers import HttpCrawler # or any other HTTP client-based crawler + from crawlee.http_clients import HttpxHttpClient + + http_client = HttpxHttpClient() + crawler = HttpCrawler(http_client=http_client) + ``` + """ + + _DEFAULT_HEADER_GENERATOR = HeaderGenerator() + + def __init__( + self, + *, + persist_cookies_per_session: bool = True, + http3: bool = True, + verify: bool = True, + header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR, + **async_client_kwargs: Any, + ) -> None: + """Initialize a new instance. + + Args: + persist_cookies_per_session: Whether to persist cookies per HTTP session. + http3: Whether to enable HTTP/3 support. + verify: SSL certificates used to verify the identity of requested hosts. + header_generator: Header generator instance to use for generating common headers. + async_client_kwargs: Additional keyword arguments for `httpx.AsyncClient`. + """ + super().__init__( + persist_cookies_per_session=persist_cookies_per_session, + ) + self._http3 = http3 + self._verify = verify + + self._async_client_kwargs = async_client_kwargs + self._header_generator = header_generator + + self._client_by_proxy_url = dict[Optional[str], impit.AsyncClient]() + + @override + async def crawl( + self, + request: Request, + *, + session: Session | None = None, + proxy_info: ProxyInfo | None = None, + statistics: Statistics | None = None, + ) -> HttpCrawlingResult: + client = self._get_client(proxy_info.url if proxy_info else None) + + response = await client.request( + url=request.url, + method=request.method, + content=request.payload, + ) + + if statistics: + statistics.register_status_code(response.status_code) + + request.loaded_url = str(request.url) + + return HttpCrawlingResult( + http_response=_ImpitResponse(response), + ) + + @override + async def send_request( + self, + url: str, + *, + method: HttpMethod = 'GET', + headers: HttpHeaders | dict[str, str] | None = None, + payload: HttpPayload | None = None, + session: Session | None = None, + proxy_info: ProxyInfo | None = None, + ) -> HttpResponse: + if isinstance(headers, dict) or headers is None: + headers = HttpHeaders(headers or {}) + + client = self._get_client(proxy_info.url if proxy_info else None) + + response = await client.request( + url=url, + method=method, + headers=dict(headers) if headers else None, + content=payload, + ) + + return _ImpitResponse(response) + + def _get_client(self, proxy_url: str | None) -> impit.AsyncClient: + """Retrieve or create an HTTP client for the given proxy URL. + + If a client for the specified proxy URL does not exist, create and store a new one. + """ + if proxy_url not in self._client_by_proxy_url: + # Prepare a default kwargs for the new client. + kwargs: dict[str, Any] = { + 'proxy': proxy_url, + 'http3': self._http3, + 'verify': self._verify, + } + + # Update the default kwargs with any additional user-provided kwargs. + kwargs.update(self._async_client_kwargs) + + client = impit.AsyncClient(**kwargs) + self._client_by_proxy_url[proxy_url] = client + + return self._client_by_proxy_url[proxy_url] diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index b7ac06d124..842b0a328b 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -15,7 +15,7 @@ from crawlee import service_locator from crawlee.configuration import Configuration from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_network -from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient +from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient, ImpitHttpClient from crawlee.proxy_configuration import ProxyInfo from crawlee.storage_clients import MemoryStorageClient from crawlee.storages import KeyValueStore, _creation_management @@ -206,9 +206,12 @@ def redirect_server_url(redirect_http_server: TestServer) -> URL: params=[ pytest.param('curl', id='curl'), pytest.param('httpx', id='httpx'), + pytest.param('impit', id='impit'), ] ) async def http_client(request: pytest.FixtureRequest) -> HttpClient: if request.param == 'curl': return CurlImpersonateHttpClient(http_version=CurlHttpVersion.V1_1) + if request.param == 'impit': + return ImpitHttpClient(http3=False) return HttpxHttpClient(http2=False) From e0987759a969660ad9103df00fc9eea0b16e30d0 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 10 Apr 2025 21:21:21 +0000 Subject: [PATCH 3/9] update with new release --- src/crawlee/http_clients/_impit.py | 25 +++++---- uv.lock | 81 ++++++++++++++++-------------- 2 files changed, 54 insertions(+), 52 deletions(-) diff --git a/src/crawlee/http_clients/_impit.py b/src/crawlee/http_clients/_impit.py index a5fd8df9fa..675b0c5fc1 100644 --- a/src/crawlee/http_clients/_impit.py +++ b/src/crawlee/http_clients/_impit.py @@ -3,7 +3,7 @@ from logging import getLogger from typing import TYPE_CHECKING, Any, Optional -import impit # type: ignore[import-untyped] +from impit import AsyncClient, Response from typing_extensions import override from crawlee._types import HttpHeaders @@ -22,9 +22,9 @@ class _ImpitResponse: - """Adapter class for `httpx.Response` to conform to the `HttpResponse` protocol.""" + """Adapter class for `impit.Response` to conform to the `HttpResponse` protocol.""" - def __init__(self, response: impit.Response) -> None: + def __init__(self, response: Response) -> None: self._response = response @property @@ -40,14 +40,14 @@ def headers(self) -> HttpHeaders: return HttpHeaders(dict(self._response.headers)) def read(self) -> bytes: - return str(self._response.text).encode() + return self._response.content @docs_group('Classes') class ImpitHttpClient(HttpClient): - """HTTP client based on the `HTTPX` library. + """HTTP client based on the `impit` library. - This client uses the `HTTPX` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses) + This client uses the `impit` library to perform HTTP requests in crawlers (`BasicCrawler` subclasses) and to manage sessions, proxies, and error handling. See the `HttpClient` class for more common information about HTTP clients. @@ -56,9 +56,9 @@ class ImpitHttpClient(HttpClient): ```python from crawlee.crawlers import HttpCrawler # or any other HTTP client-based crawler - from crawlee.http_clients import HttpxHttpClient + from crawlee.http_clients import ImpitHttpClient - http_client = HttpxHttpClient() + http_client = ImpitHttpClient() crawler = HttpCrawler(http_client=http_client) ``` """ @@ -71,7 +71,6 @@ def __init__( persist_cookies_per_session: bool = True, http3: bool = True, verify: bool = True, - header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR, **async_client_kwargs: Any, ) -> None: """Initialize a new instance. @@ -90,9 +89,8 @@ def __init__( self._verify = verify self._async_client_kwargs = async_client_kwargs - self._header_generator = header_generator - self._client_by_proxy_url = dict[Optional[str], impit.AsyncClient]() + self._client_by_proxy_url = dict[Optional[str], AsyncClient]() @override async def crawl( @@ -145,7 +143,7 @@ async def send_request( return _ImpitResponse(response) - def _get_client(self, proxy_url: str | None) -> impit.AsyncClient: + def _get_client(self, proxy_url: str | None) -> AsyncClient: """Retrieve or create an HTTP client for the given proxy URL. If a client for the specified proxy URL does not exist, create and store a new one. @@ -156,12 +154,13 @@ def _get_client(self, proxy_url: str | None) -> impit.AsyncClient: 'proxy': proxy_url, 'http3': self._http3, 'verify': self._verify, + 'follow_redirects': True, } # Update the default kwargs with any additional user-provided kwargs. kwargs.update(self._async_client_kwargs) - client = impit.AsyncClient(**kwargs) + client = AsyncClient(**kwargs) self._client_by_proxy_url[proxy_url] = client return self._client_by_proxy_url[proxy_url] diff --git a/uv.lock b/uv.lock index f9b2521fe0..99a5a6fd2e 100644 --- a/uv.lock +++ b/uv.lock @@ -1152,45 +1152,48 @@ wheels = [ [[package]] name = "impit" -version = "0.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/0d/2c/b0a632b1c455ba97b77395fa57fd086fef4f8a8d9604b6de959dbd959a36/impit-0.1.0.tar.gz", hash = "sha256:7aadb7ed30b17515eabf53afd990675d3b097eedd538abe4f4754be8914924cd", size = 41141 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f4/5c/f2424cddfdad10cfcd92a38a1f3b31ff046db50d68a9b8f5e5a0b4ff41ad/impit-0.1.0-1-cp310-cp310-manylinux_2_34_x86_64.whl", hash = "sha256:8bbdf3785924a50ffb773f3ed12cb30f930ffe329d25f699b46608aac1b46e98", size = 5355850 }, - { url = "https://files.pythonhosted.org/packages/e7/bd/f34fa4678b1317a20d0eec57dc9372bde77462f92e798878bb4ccaad6621/impit-0.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02d818c3b076e0d665f2bc93bcd30fd20a7935787d717799836697529fb315db", size = 5335799 }, - { url = "https://files.pythonhosted.org/packages/17/d6/c1210c27d0632b3d51e62339d9c8d0b0f23699c875393465675d25fe6db4/impit-0.1.0-cp310-cp310-manylinux_2_34_x86_64.whl", hash = "sha256:ad54fae8954bd8366f2891c6b7d818eb207b4768b8456fcff478f0217d8d5a49", size = 47737431 }, - { url = "https://files.pythonhosted.org/packages/0c/80/06d7a7e557420ff979449e28d4190c5291fddfe8a745de3e0b6a35c7d85d/impit-0.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:85a43d70cb2ee95f3446f167c1ea235b0143fa4a4dbc9e499bd69625fe8c8338", size = 5626908 }, - { url = "https://files.pythonhosted.org/packages/6b/50/2cea3aae32bde1a535369f78c81ac530a6e801586daac482d9e8f378617b/impit-0.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:132ab63a471475286c163b19cb3b03ad78b8597cfdda792d35d681ec3bdb6b57", size = 5482225 }, - { url = "https://files.pythonhosted.org/packages/f9/a6/3efac7442f754be942d8195535f90229dad89657481ab87b2d708a989792/impit-0.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:fc347a36ec394761ffedaa84528e903d53c7a08ef5c6d43b2fe22a562db92d89", size = 3319369 }, - { url = "https://files.pythonhosted.org/packages/81/41/1331c44227440943a66ce2c6e1b9920b30e0d5e9d0756ffda58e678d8ba8/impit-0.1.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d74c079e059285059c325d18e856458969d85ec6a326af8e1d4f740f75a252fb", size = 3311457 }, - { url = "https://files.pythonhosted.org/packages/5c/1d/2a17a60b4b17448131bc1e24aa6c0c9fa309e548a26ab4e84464d09be46b/impit-0.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2cab190278e7bc0777463b522fdebf329b9ff1202d398472998c33910c92f9e9", size = 3124958 }, - { url = "https://files.pythonhosted.org/packages/4b/58/25930cc6dd91e11a0aff4910886d5246c7103942c4eb506aaef26d04c077/impit-0.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c16dc7f902237d81fe45b04d5ce744c29fdd433ad1493246af83cf3f2fff8d7", size = 5335511 }, - { url = "https://files.pythonhosted.org/packages/9c/b0/41a8cfe609fd121cf5456c456eb4eb1743f3caf73c13228a6fdfe1d0f455/impit-0.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:aa4b69ec94c4ed13c72fa782283faf1aaa8747648c83c9c2422c06be8181abee", size = 5626678 }, - { url = "https://files.pythonhosted.org/packages/38/a7/e26db34cf700ad1f41ca409ce062fb87d2abf5dda9d7bbbb49f8cb1185b0/impit-0.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:97c0687bd12c7532eb9a383ddcf92cb5d10bc19e8a62dbe3a5436bd14737f9a2", size = 5482274 }, - { url = "https://files.pythonhosted.org/packages/67/a1/1dfdcee3f4d2ce22dc36cc4c2894f126af3f3abac40eb2acebb84fb5679c/impit-0.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:d0624c286cfc02d59522740f4301fb1bc9a6398c5d8451a475523a5d0c41b791", size = 3319398 }, - { url = "https://files.pythonhosted.org/packages/13/34/3fe6e73f10fb685b8def6a55faf298a8e86e7080f1d4b917694f4c1dcbfb/impit-0.1.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:b3fbaf19cff978a6e076fa952241fe64c99c0af3e55e0acab0c1b68737112b16", size = 3310885 }, - { url = "https://files.pythonhosted.org/packages/be/93/de7fe2dd6fd5d8696145a0b60bd306c39cf64eeb70c0a3763779582883b4/impit-0.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:195de09d104f610891ee3ba5ca460d671a2a06abf4d0975be68d28397c3dd9a4", size = 3126163 }, - { url = "https://files.pythonhosted.org/packages/bf/36/ed7d89faa763b8cd2bddd3a7e274696fafd041c371490dd6b8edb875287c/impit-0.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:218570ec98e59c5e75bb83e211cc6107aabe1f63688f4c1ce518951e71287acf", size = 5335000 }, - { url = "https://files.pythonhosted.org/packages/72/37/d1fc899462f5ef4f95b6053bdc19ea470d6fa707c682d81c7a76beb71e38/impit-0.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9f776b6094e6b6413f06ec889051b931bce3f4d14fd0d8d78173b6e5b001dd66", size = 5626951 }, - { url = "https://files.pythonhosted.org/packages/c3/8b/95881985ca03698df43021883f53830765886fc7f6d4b67911dfe1f6720a/impit-0.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:517bf72aad0d2d364023559cdd670740a450419adf4ad96e8ba311e08d7504ac", size = 5481425 }, - { url = "https://files.pythonhosted.org/packages/b1/09/3b6526be6580cd8f011f944b8c309beaea2a725ca24568f79d9162e9275d/impit-0.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:fd1206d8c6590fa3c5acfb78c646d8fb2cc4f2ae01b9506781b71f0b20b27036", size = 3320896 }, - { url = "https://files.pythonhosted.org/packages/e7/c4/8ccf4542b67365594a20d8b2a5baf0c993a5786abb685d25766ee235e71f/impit-0.1.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:c3154f184dcf987969318f7c82356857a6159c6a39ec8162be345d25036e60ce", size = 3310831 }, - { url = "https://files.pythonhosted.org/packages/3b/c1/efc519bb3a59b297ba289ccec4c2b81cc99beae5874e56f2575e05176934/impit-0.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6c82429009280a975ddf13840da7bcfd1989fb9c8aba0ea86175f2ef44fe177b", size = 3126331 }, - { url = "https://files.pythonhosted.org/packages/61/e7/deca560d53d2c04e159d73cf8bb970122e72a0bd225b7cca26e16bf0ef5a/impit-0.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f4cdadf8f4ad5c61580eb8faeaced122346510492311389d1a1bc3c656d0a30e", size = 5334923 }, - { url = "https://files.pythonhosted.org/packages/ea/f1/6285cf49cf1ef55322705fc768ffcf6ca642ebee1011becee39469bd26c7/impit-0.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:67fc248a0c1e044a3ccfcf1889f92faadcd994dbc689573b3ca2e099037f7a79", size = 5626758 }, - { url = "https://files.pythonhosted.org/packages/f5/c0/326173747b91c52b862801984a11119ec4f37be8326f38e3ea2d0343316c/impit-0.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:43eaba84e90c177225a591870fb3fc0195870e2811b8e4b7bbcf4e98c02279e1", size = 5481611 }, - { url = "https://files.pythonhosted.org/packages/89/7b/859ada2d64d867e1582bc40ead115f80b55e9b75d33ca4940c81a3d7b0d8/impit-0.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:f6ea7e335739aff0a266357591cc87984e7e5cce1eed9b6797aab2cbc12cf558", size = 3320648 }, - { url = "https://files.pythonhosted.org/packages/a9/e4/7b043f5ceb1600e0420a261c11180f2c7b62f43b32e1ab476f9d98e9d435/impit-0.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:5262b4dd63fc770e22f4d0fc42fce5fb7386ee7bb8735ce52ccbf862ebd6715b", size = 5626627 }, - { url = "https://files.pythonhosted.org/packages/db/fb/b40896632145cc564501ef8286de2e0761514ab7f740bf30fbdc5f1e9460/impit-0.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:63a0613d9df5d89d981a6f4ac33c2d5b3850497ee197d527c2c370bbb3c10c6d", size = 5481815 }, - { url = "https://files.pythonhosted.org/packages/93/2b/d0a9220645f766513477b93f4054085f9b584bacc9b091eae88210ac1518/impit-0.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9794d90f965173e735ce73a32dbc72ed61c97a5bd62eff910acae450bee7549f", size = 5336070 }, - { url = "https://files.pythonhosted.org/packages/72/98/f34d7cff6ffcdee64b62959d22a44f026f3721a09aabfb700149be845335/impit-0.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fe0a78708dcb93d4abdad28f15a833ff35f818bf0e0ee87404291dbed1713865", size = 5627466 }, - { url = "https://files.pythonhosted.org/packages/19/28/fcbfee48222c3e9a20370ea021c03ad1cca0396b5ea48f165cab1ac37ca9/impit-0.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:19f4ff21e62ad8d25f1d5d8c86c199fe1455f5c2d953bbd1ba7b367e36ab7cc6", size = 5482722 }, - { url = "https://files.pythonhosted.org/packages/4e/3e/309467d5348dd405be6a312e811c00ac5c7adcd7f689cb145fe5aae5b51d/impit-0.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:c927330f4a520482f725d0428ae072e4acf19244bfd47026fc54d24f48ec4930", size = 3320035 }, - { url = "https://files.pythonhosted.org/packages/d6/00/976b399920f4d6398a911716c1254ebb26af9beb39a251f758bbcccd19dc/impit-0.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:81b0fa849966c2feeb7fd140bc7aac7b0badd241303e777696308dadf48bbe54", size = 5336092 }, - { url = "https://files.pythonhosted.org/packages/69/39/8da8bac42b497e972060bfe8e30ddaaaf8025510fab4034f4a2676e3d92a/impit-0.1.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:8d13d04906472c3785c3066c7bcd9e2b38ea686e94613caabbbe983ab893940f", size = 5626742 }, - { url = "https://files.pythonhosted.org/packages/d5/7c/7b6efa5828f1a92f666c4ec76211536168c8a1e44320bdf59ffd246ce265/impit-0.1.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:636a459fb7d20ec9515a0310b8c2c24742232629abd6a5dd0fff70ca40f6f653", size = 5482314 }, - { url = "https://files.pythonhosted.org/packages/e5/6d/25c1ad602dd3f9e0f1df2a753b231813ca56a33dc9dcfb7316dfb1461b0b/impit-0.1.0-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:ec77f5eb3e703ad617acb9812a20ca729da2dadf585e1911a7d7f44daf7d4294", size = 5627255 }, - { url = "https://files.pythonhosted.org/packages/09/5b/b9e424fdd3d623610f9831c0ca80eba712a9912e268826c0740ac0a99da0/impit-0.1.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:66296717b5eb14ec55d076d19779bff02b02922650be392f74cf02a13067be56", size = 5482436 }, +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/35/5d/fe90263ecc83806d367cebc8b461b084aa63c81a0fcd77da983fe124ae11/impit-0.2.0.tar.gz", hash = "sha256:dcb67255e0e8c1db567919f37aa790273e953abd63e47e0a2f04d85ab31c1376", size = 66535 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/73/0a14521f7860bae6d24584488229394636d4bdae4af4d8d2f6641a2f77f9/impit-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fb765a4b2b44d5c630f5c490ed3db811cd347b14fcfd6d06d9673a0ee8e2b5a", size = 5840160 }, + { url = "https://files.pythonhosted.org/packages/6a/6c/9b126bbec33060ff72fdb82b9c587c62e9d6fce7d14000b966017fffa82c/impit-0.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8a291e3ad79761c941550bead47e85e21158cac9a9a2b1cdf0e80e9fc44f0c79", size = 6132636 }, + { url = "https://files.pythonhosted.org/packages/ce/cc/1234a041adae0e6b6f1ffb46c2d1b37f7c381c92fb985934501568de7616/impit-0.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:19ba06a0feed8b707da0f7e39a8a73fee1f3275d59868276ae92c74f39685cc0", size = 5990395 }, + { url = "https://files.pythonhosted.org/packages/55/40/217950fdd425ff6a1b081d01e269b7360b8a7b29ae59a9092751017df2c3/impit-0.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:aeafa4f76ac0a6e30d3e2ccc63b200398f8a75ce18cdb1aea15ac627ed233f32", size = 3783316 }, + { url = "https://files.pythonhosted.org/packages/35/48/8902f61e1e9ab3b3360a4adf0015caab0bfe37983bc8a9f77d6b039fd324/impit-0.2.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:699745eeab39371571a9e812502b09e5b77358a8d117b1d7d1c979a19e51d9b7", size = 3759536 }, + { url = "https://files.pythonhosted.org/packages/fe/4b/cf5eb85e9c4bdd9b4fa02c48c1c9962f88b80bcb5cf41d7471d860207a0e/impit-0.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f0c69648c3297249dc4c629bdf78c232021d7b9679c47d0c1ce7eac71a894509", size = 3580949 }, + { url = "https://files.pythonhosted.org/packages/c3/53/d1f570a0971fcf7a4b6972e42fbbf54fe61bcedd2cb04ad48662854d1731/impit-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:298361d13be750d628066995cb319d0919bd3f3857e1ea2f713f53ab1daed9d6", size = 5839954 }, + { url = "https://files.pythonhosted.org/packages/26/78/224c13e3a1cbccf288fdbe07feda7daf25a6603c4e1a3bde6ca0c57472ce/impit-0.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6965767ecf268adb3f40953f6cc96ca4a3e839863680321ece75df53bf15c7ac", size = 6132517 }, + { url = "https://files.pythonhosted.org/packages/7c/03/df1158166451ed98e02bf2045cc88c677d724b4e4d335dae62d3ca2260cf/impit-0.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6b88e9d1ebf0e038c37950c83a2b4e1cfd3114a818567b92acb81cf2d3d48af9", size = 5989938 }, + { url = "https://files.pythonhosted.org/packages/9d/10/bd91cbd34554759092c3383fb3e866df0363ac8eb7427e5b5c269d68babf/impit-0.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:b4505179e41fb52887ba0798a7108a5aa385b1301f5995306f9ba927619cdd3f", size = 3783099 }, + { url = "https://files.pythonhosted.org/packages/eb/d6/588243182b2f402f5850cb806cb039e8ad497a1b535799667eac07a46fad/impit-0.2.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9d606f3303f76fc7c2cfc6e31b29d61e1b005a97381d9b58a6f5a08838ff7665", size = 3759480 }, + { url = "https://files.pythonhosted.org/packages/76/9f/240817d13c8122c8c5ed8e4cd8f761a5bfba2a638256e0d0fd11d3b8bcff/impit-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9956fef95a1f7d5567f8c5c848168d59651ae74e4385ffc90458cf674ecc9004", size = 3579564 }, + { url = "https://files.pythonhosted.org/packages/40/95/6992232cfd962d55f2ea5d5648a032387f7c348c2b8927c58fff96765288/impit-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7dc5e96f7f6ef7f2d429074282301c00b78773a5a4e8189ba7d9db9837a35d3", size = 5839026 }, + { url = "https://files.pythonhosted.org/packages/ca/47/09436266fb10a966788448dc2938d070f483739455c6e80306083581eeed/impit-0.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7084e027d7270b7b4c9aa4c2a3f6499c912517667070568d11f84c1261f1a8f9", size = 6130881 }, + { url = "https://files.pythonhosted.org/packages/30/82/4b8a73f099ae327f66bc11a7ebbd79efff21667d18ffa022cf650c1aa471/impit-0.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7a4fa3b251a3caf8f443ceca204e8725fd9897f9b7f1693969597b88577a6e1d", size = 5988971 }, + { url = "https://files.pythonhosted.org/packages/fd/be/07b3a159d28c6fada6ca5e5576809a795c62836b2b50856230a884cffb82/impit-0.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:a7818499b9ee24e6962142e35681f0a96b287dbc010ff7dd4bffa3043ed96a69", size = 3781597 }, + { url = "https://files.pythonhosted.org/packages/7c/08/b89b16946b6e4022c9022d13383c115f6a1ee4e8c84f6192d673e372fe11/impit-0.2.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:4a687b7af3f095253725838a72580718d68130698efaf612dfebf330f55c68fb", size = 3759434 }, + { url = "https://files.pythonhosted.org/packages/83/de/eae4226c7060db88d8b2a08cf3ce55ab9b0e7549938f132a410843cc43cb/impit-0.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3d91320e79077bdb58b4d6cfd9cfe2c30fdc7d77e56697ea67fbe037b83e6009", size = 3579552 }, + { url = "https://files.pythonhosted.org/packages/06/af/1ed4af85e5e24bbc4ec04dc9e5baf948c0d1c89b0b7f5012bac2a731e917/impit-0.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b02ec2ed3b10542a9789385e9d76b2d81dd858ff4b98ba7d7364f0f91747352", size = 5838677 }, + { url = "https://files.pythonhosted.org/packages/4e/d1/a0a7e0840e5668c794ae070fafd0c4475035fa5e8f8cb579ea174af9a7cf/impit-0.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7f6592e264b9c9918463c72054c23764f72dd54e776c4eb7cb610f030473d8d3", size = 6131055 }, + { url = "https://files.pythonhosted.org/packages/78/2e/823c7128200ff8a345dae07123a88ab7d2e1ab138b161b41fe06c9a195ab/impit-0.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1ca019a1a41d6fb75b9564503cf072527c36fb8f858b99a0eb01b9e80b0e5da7", size = 5989107 }, + { url = "https://files.pythonhosted.org/packages/68/c1/eb6e7984a667907002e735505f79ee6de9ca325088072a8daa150bebc2a5/impit-0.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:595f6bd53be59a75dc33b89be5b8d2505e6fb5226a69879c01efa00ec168689d", size = 3781262 }, + { url = "https://files.pythonhosted.org/packages/f5/04/786fc1817f488dda1bd323b76400031f9184a511895a10c81dcb0ace03a4/impit-0.2.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:164be83233cbc2010344e29106caf331fb6ec86dd1ee5cee5f0b5b01af49bd41", size = 6131593 }, + { url = "https://files.pythonhosted.org/packages/bb/e9/dc7c3b1c6317d2edf66cd35a018e206c49cd4553b9bab2b5b0326c09a1fb/impit-0.2.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0f7e9fe363f67391b97fcf88ca0e10f6f6b8c31145e4a1f94a959e826d79dd9b", size = 5989830 }, + { url = "https://files.pythonhosted.org/packages/ad/93/fc9cbc151f2e19bcbaa2efe4b02421ee400b6b1dab2ccb251fd18e4d6832/impit-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a324a3c11e6a85bc60a37c19a29c91b3806cd1e23955e1538b84790c3ba0f37", size = 5841580 }, + { url = "https://files.pythonhosted.org/packages/c5/f5/8e4e72b1116e3157a8ff20ed4e5f11073dbdfadf894dd8af6cd2864ed2e7/impit-0.2.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a38fbb4b2fe2e8ad0e95b12b7db6915eb00a1115313cbe8cb5f824384df76a6d", size = 6133679 }, + { url = "https://files.pythonhosted.org/packages/48/7e/f600db4c27e1faec7e533517306f59b554d9b6ef3afcbe26df6ea3c7c05d/impit-0.2.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:85831228c527e416a89d3b648d5b33954bb37aca2a429a75aa5e97a434042e05", size = 5991115 }, + { url = "https://files.pythonhosted.org/packages/cc/ed/fcd5270f61734ed23cd2c6cbe44b794406833950dc2eaf9507499ec3c423/impit-0.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:8b3145a0601d2ff2571da75887cc1f01f18fe866fce3f39e11643d93793ede67", size = 3783994 }, + { url = "https://files.pythonhosted.org/packages/da/91/d8df56db3d9a009bd700507434313ae4c5a002857999d5c65c8752273943/impit-0.2.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5eb89eb8b602c8742b9b795c1ac83d53429744ae2a259cf19996d87307584f21", size = 3759779 }, + { url = "https://files.pythonhosted.org/packages/0b/c4/265bd6b0f165111dbc4ffa016f2cec0ad85f4d581264b7d255a798114c8c/impit-0.2.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c1247ebf57a78f7ff2cacd2dfa93f3e11cfd1bacd105aa5c3e01f11526740b4", size = 5841462 }, + { url = "https://files.pythonhosted.org/packages/80/74/c3526fc610a103cd6b932b3c767d0ed9433ed6e6ed04a3bd1eb719e3cc41/impit-0.2.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:fd9843e654ec7c520f90ed1a1e9b49853cf942d1e9e040e64b5ac6fdbf7405b4", size = 6133281 }, + { url = "https://files.pythonhosted.org/packages/8b/98/2c31e46d88e9273fe5cb4c2253aeb6d83754617a1071f65d495c4f360691/impit-0.2.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:0c3621e76e65b2447420bbe2a9e42b8c9cb37461d78f00857fc770d3645fe7f6", size = 5990905 }, + { url = "https://files.pythonhosted.org/packages/e9/a9/eed45dd81b0b9b5a19891131cae6c6a4063bf83a1f690646e9fd43f7dc0c/impit-0.2.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2c0d4fd6ff16d910535a2ba986053f86b420638f9876d62640a9e7e0227a4d2", size = 5841231 }, + { url = "https://files.pythonhosted.org/packages/f9/75/fe5d5610801d8031fff348d824c8d65acc644cf666c21ab880ef649c4fbe/impit-0.2.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:1df570fe5a1fb0ea2cdd8a6b36875868a7d722021e858c1b260e9a8623f57c3f", size = 6133297 }, + { url = "https://files.pythonhosted.org/packages/f4/a6/12d366441287e5490297fec41da3916843fcd28da615d4f7e88f74907989/impit-0.2.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:feddc758b530c04344ec1a7a87b770072a1191d144b07ddc4dc5e226e44bfda1", size = 5990741 }, + { url = "https://files.pythonhosted.org/packages/60/d8/d5d29a60f9c88017b37c08541206b10e7590e87147121e75a41a2ddeb025/impit-0.2.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:4341cfadcd4047741e4c91a72bc690162c81fa483ceff1193f80b7bfa7aabbe8", size = 3760028 }, + { url = "https://files.pythonhosted.org/packages/60/af/a70b6a77811a1fc7fc23b5b34204515bc23a0336be5179312403e2f07409/impit-0.2.0-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:695c5598b1c80fae8614b2cb1facac5b575acb37db97a8fa426ae98f45b8e35d", size = 6133795 }, + { url = "https://files.pythonhosted.org/packages/8a/86/db31d933e64682a72cd114aa4eae7ab51bd128196cb94710e12126af6a00/impit-0.2.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:05d192f9148fed0afa3bad50d64440234e00e52e4858a5f2bb391653cfa6566d", size = 5991288 }, ] [[package]] From dfc55992481e4d894fbabe186e143e39e630888f Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 10 Apr 2025 21:24:22 +0000 Subject: [PATCH 4/9] update version --- pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4f349273b4..d996be5167 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,7 +77,7 @@ adaptive-crawler = [ beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"] cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"] curl-impersonate = ["curl-cffi>=0.9.0"] -impit = ["impit>=0.1.0"] +impit = ["impit>=0.2.0"] parsel = ["parsel>=1.10.0"] playwright = ["playwright>=1.27.0"] diff --git a/uv.lock b/uv.lock index 99a5a6fd2e..4020f908dd 100644 --- a/uv.lock +++ b/uv.lock @@ -706,7 +706,7 @@ requires-dist = [ { name = "html5lib", marker = "extra == 'beautifulsoup'", specifier = ">=1.0" }, { name = "httpx", extras = ["brotli", "http2", "zstd"], specifier = ">=0.27.0" }, { name = "impit", marker = "extra == 'all'", specifier = ">=0.1.0" }, - { name = "impit", marker = "extra == 'impit'", specifier = ">=0.1.0" }, + { name = "impit", marker = "extra == 'impit'", specifier = ">=0.2.0" }, { name = "inquirer", marker = "extra == 'all'", specifier = ">=3.3.0" }, { name = "inquirer", marker = "extra == 'cli'", specifier = ">=3.3.0" }, { name = "jaro-winkler", marker = "extra == 'adaptive-crawler'", specifier = ">=2.0.3" }, From 3bd2dc56622a495816ff8266558b6b0fdf0e3042 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Fri, 11 Apr 2025 02:56:19 +0000 Subject: [PATCH 5/9] fix headers --- src/crawlee/http_clients/_impit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/crawlee/http_clients/_impit.py b/src/crawlee/http_clients/_impit.py index 675b0c5fc1..6e283f362e 100644 --- a/src/crawlee/http_clients/_impit.py +++ b/src/crawlee/http_clients/_impit.py @@ -107,12 +107,13 @@ async def crawl( url=request.url, method=request.method, content=request.payload, + headers=dict(request.headers) if request.headers else None, ) if statistics: statistics.register_status_code(response.status_code) - request.loaded_url = str(request.url) + request.loaded_url = str(response.url) return HttpCrawlingResult( http_response=_ImpitResponse(response), From bb744c53a76a9238e462a6b328aa706f6ffc9f81 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Sun, 13 Apr 2025 22:16:23 +0000 Subject: [PATCH 6/9] update tests --- src/crawlee/_utils/blocked.py | 1 + src/crawlee/http_clients/_impit.py | 44 ++++++++--- tests/unit/http_clients/test_impit.py | 103 ++++++++++++++++++++++++++ 3 files changed, 136 insertions(+), 12 deletions(-) create mode 100644 tests/unit/http_clients/test_impit.py diff --git a/src/crawlee/_utils/blocked.py b/src/crawlee/_utils/blocked.py index 53d37e14e2..af158456cb 100644 --- a/src/crawlee/_utils/blocked.py +++ b/src/crawlee/_utils/blocked.py @@ -21,6 +21,7 @@ 'ERR_PROXY_CONNECTION_FAILED', 'ERR_TUNNEL_CONNECTION_FAILED', 'Proxy responded with', + 'unsuccessful tunnel', ] """ Content of proxy errors that should trigger a retry, as the proxy is likely getting blocked / is malfunctioning. diff --git a/src/crawlee/http_clients/_impit.py b/src/crawlee/http_clients/_impit.py index 6e283f362e..83e9b15af0 100644 --- a/src/crawlee/http_clients/_impit.py +++ b/src/crawlee/http_clients/_impit.py @@ -7,7 +7,9 @@ from typing_extensions import override from crawlee._types import HttpHeaders +from crawlee._utils.blocked import ROTATE_PROXY_ERRORS from crawlee._utils.docs import docs_group +from crawlee.errors import ProxyError from crawlee.fingerprint_suite import HeaderGenerator from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse @@ -103,12 +105,17 @@ async def crawl( ) -> HttpCrawlingResult: client = self._get_client(proxy_info.url if proxy_info else None) - response = await client.request( - url=request.url, - method=request.method, - content=request.payload, - headers=dict(request.headers) if request.headers else None, - ) + try: + response = await client.request( + url=request.url, + method=request.method, + content=request.payload, + headers=dict(request.headers) if request.headers else None, + ) + except RuntimeError as exc: + if self._is_proxy_error(exc): + raise ProxyError from exc + raise if statistics: statistics.register_status_code(response.status_code) @@ -135,12 +142,17 @@ async def send_request( client = self._get_client(proxy_info.url if proxy_info else None) - response = await client.request( - url=url, - method=method, - headers=dict(headers) if headers else None, - content=payload, - ) + try: + response = await client.request( + url=url, + method=method, + headers=dict(headers) if headers else None, + content=payload, + ) + except RuntimeError as exc: + if self._is_proxy_error(exc): + raise ProxyError from exc + raise return _ImpitResponse(response) @@ -165,3 +177,11 @@ def _get_client(self, proxy_url: str | None) -> AsyncClient: self._client_by_proxy_url[proxy_url] = client return self._client_by_proxy_url[proxy_url] + + @staticmethod + def _is_proxy_error(error: RuntimeError) -> bool: + """Determine whether the given error is related to a proxy issue. + + Check if the error message contains known proxy-related error keywords. + """ + return any(needle in str(error) for needle in ROTATE_PROXY_ERRORS) diff --git a/tests/unit/http_clients/test_impit.py b/tests/unit/http_clients/test_impit.py new file mode 100644 index 0000000000..406f75f3bf --- /dev/null +++ b/tests/unit/http_clients/test_impit.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +import os +from typing import TYPE_CHECKING + +import pytest + +from crawlee import Request +from crawlee.errors import ProxyError +from crawlee.http_clients import ImpitHttpClient +from crawlee.statistics import Statistics + +if TYPE_CHECKING: + from yarl import URL + + from crawlee.proxy_configuration import ProxyInfo + + +@pytest.fixture +def http_client() -> ImpitHttpClient: + return ImpitHttpClient() + + +async def test_http_1(server_url: URL) -> None: + http_client = ImpitHttpClient() + response = await http_client.send_request(str(server_url)) + assert response.http_version == 'HTTP/1.1' + + +@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows') +async def test_proxy( + http_client: ImpitHttpClient, + proxy: ProxyInfo, + server_url: URL, +) -> None: + url = str(server_url / 'status/222') + request = Request.from_url(url) + + async with Statistics.with_default_state() as statistics: + result = await http_client.crawl(request, proxy_info=proxy, statistics=statistics) + + assert result.http_response.status_code == 222 + + +@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows') +async def test_proxy_disabled( + http_client: ImpitHttpClient, + disabled_proxy: ProxyInfo, +) -> None: + url = 'https://apify.com/' + request = Request.from_url(url) + + with pytest.raises(ProxyError): + async with Statistics.with_default_state() as statistics: + await http_client.crawl(request, proxy_info=disabled_proxy, statistics=statistics) + + +@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows') +async def test_send_request_with_proxy( + http_client: ImpitHttpClient, + proxy: ProxyInfo, + server_url: URL, +) -> None: + url = str(server_url / 'status/222') + + response = await http_client.send_request(url, proxy_info=proxy) + assert response.status_code == 222 # 222 - authentication successful + + +@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows') +async def test_send_request_with_proxy_disabled( + http_client: ImpitHttpClient, + disabled_proxy: ProxyInfo, +) -> None: + url = 'https://apify.com/' + + with pytest.raises(ProxyError): + await http_client.send_request(url, proxy_info=disabled_proxy) + + +async def test_crawl_follow_redirects_by_default(http_client: ImpitHttpClient, server_url: URL) -> None: + target_url = str(server_url / 'status/200') + redirect_url = str((server_url / 'redirect').update_query(url=target_url)) + request = Request.from_url(redirect_url) + + crawling_result = await http_client.crawl(request) + + assert crawling_result.http_response.status_code == 200 + assert request.loaded_url == target_url + + +async def test_crawl_follow_redirects_false(server_url: URL) -> None: + http_client = ImpitHttpClient(follow_redirects=False) + + target_url = str(server_url / 'status/200') + redirect_url = str((server_url / 'redirect').update_query(url=target_url)) + request = Request.from_url(redirect_url) + + crawling_result = await http_client.crawl(request) + + assert crawling_result.http_response.status_code == 302 + assert crawling_result.http_response.headers['Location'] == target_url + assert request.loaded_url == redirect_url From d99c74ccba9dcb121eb36bbe9da52f28263daae4 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 14 Apr 2025 14:41:13 +0000 Subject: [PATCH 7/9] set default browser impersionate --- src/crawlee/http_clients/_impit.py | 1 + tests/unit/http_clients/test_impit.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/src/crawlee/http_clients/_impit.py b/src/crawlee/http_clients/_impit.py index 83e9b15af0..8ce4d6d618 100644 --- a/src/crawlee/http_clients/_impit.py +++ b/src/crawlee/http_clients/_impit.py @@ -168,6 +168,7 @@ def _get_client(self, proxy_url: str | None) -> AsyncClient: 'http3': self._http3, 'verify': self._verify, 'follow_redirects': True, + 'browser': 'firefox', } # Update the default kwargs with any additional user-provided kwargs. diff --git a/tests/unit/http_clients/test_impit.py b/tests/unit/http_clients/test_impit.py index 406f75f3bf..61db18da94 100644 --- a/tests/unit/http_clients/test_impit.py +++ b/tests/unit/http_clients/test_impit.py @@ -27,6 +27,12 @@ async def test_http_1(server_url: URL) -> None: assert response.http_version == 'HTTP/1.1' +async def test_http_2() -> None: + http_client = ImpitHttpClient() + response = await http_client.send_request('https://apify.com/') + assert response.http_version == 'HTTP/2' + + @pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows') async def test_proxy( http_client: ImpitHttpClient, From 8b49c2ef01b8ce29968912eb5c8bd0b0aa024c66 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 14 Apr 2025 14:49:26 +0000 Subject: [PATCH 8/9] docs fix --- src/crawlee/http_clients/_impit.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/crawlee/http_clients/_impit.py b/src/crawlee/http_clients/_impit.py index 8ce4d6d618..8a266e8332 100644 --- a/src/crawlee/http_clients/_impit.py +++ b/src/crawlee/http_clients/_impit.py @@ -10,7 +10,6 @@ from crawlee._utils.blocked import ROTATE_PROXY_ERRORS from crawlee._utils.docs import docs_group from crawlee.errors import ProxyError -from crawlee.fingerprint_suite import HeaderGenerator from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse if TYPE_CHECKING: @@ -65,8 +64,6 @@ class ImpitHttpClient(HttpClient): ``` """ - _DEFAULT_HEADER_GENERATOR = HeaderGenerator() - def __init__( self, *, @@ -82,7 +79,7 @@ def __init__( http3: Whether to enable HTTP/3 support. verify: SSL certificates used to verify the identity of requested hosts. header_generator: Header generator instance to use for generating common headers. - async_client_kwargs: Additional keyword arguments for `httpx.AsyncClient`. + async_client_kwargs: Additional keyword arguments for `impit.AsyncClient`. """ super().__init__( persist_cookies_per_session=persist_cookies_per_session, From 75f34d3cc4069797caa7caef4c7d9181024630f3 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 14 Apr 2025 15:28:43 +0000 Subject: [PATCH 9/9] fix version in pyproject --- pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d996be5167..a15ceb6097 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ all = [ "curl-cffi>=0.9.0", "html5lib>=1.0", "inquirer>=3.3.0", - "impit>=0.1.0", + "impit>=0.2.0", "jaro-winkler>=2.0.3", "parsel>=1.10.0", "playwright>=1.27.0", diff --git a/uv.lock b/uv.lock index d5acc6c316..951183412c 100644 --- a/uv.lock +++ b/uv.lock @@ -705,7 +705,7 @@ requires-dist = [ { name = "html5lib", marker = "extra == 'all'", specifier = ">=1.0" }, { name = "html5lib", marker = "extra == 'beautifulsoup'", specifier = ">=1.0" }, { name = "httpx", extras = ["brotli", "http2", "zstd"], specifier = ">=0.27.0" }, - { name = "impit", marker = "extra == 'all'", specifier = ">=0.1.0" }, + { name = "impit", marker = "extra == 'all'", specifier = ">=0.2.0" }, { name = "impit", marker = "extra == 'impit'", specifier = ">=0.2.0" }, { name = "inquirer", marker = "extra == 'all'", specifier = ">=3.3.0" }, { name = "inquirer", marker = "extra == 'cli'", specifier = ">=3.3.0" },