
"Hacking websites with CDP" is now on YouTube #3380

Open
mdmintz opened this issue Jan 1, 2025 · 10 comments
Open

"Hacking websites with CDP" is now on YouTube #3380

mdmintz opened this issue Jan 1, 2025 · 10 comments
Assignees: mdmintz
Labels: News / Announcements, Tutorials & Learning, UC Mode / CDP Mode

Comments


mdmintz commented Jan 1, 2025

"Hacking websites with CDP" is now on YouTube:

https://www.youtube.com/watch?v=vt2zsdiNh3U

mdmintz added the News / Announcements, Tutorials & Learning, and UC Mode / CDP Mode labels on Jan 1, 2025
mdmintz self-assigned this on Jan 1, 2025
mdmintz pinned this issue on Jan 4, 2025
mdmintz unpinned this issue on Jan 4, 2025

mdmintz commented Jan 6, 2025

The code for taking control of existing Chrome browsers via the remote-debugging-port is here:

#3354 (comment)
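
For reference, here is a minimal sketch of attaching to a Chrome that is already running with --remote-debugging-port=9222. The host/port keyword arguments to start_async() are an assumption for illustration, not something confirmed in this thread:

```python
import asyncio
from seleniumbase import cdp_driver


async def main():
    # Assumes Chrome was launched separately, e.g.:
    #   chrome --remote-debugging-port=9222 --user-data-dir=/tmp/profile
    # Attach to that browser instead of launching a new one.
    # (host/port are assumed pass-through options here.)
    driver = await cdp_driver.start_async(host="127.0.0.1", port=9222)
    tab = await driver.get("https://example.com")
    # tab.evaluate() is assumed from the underlying nodriver-style API.
    print(await tab.evaluate("document.title"))


asyncio.run(main())
```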


boludoz commented Feb 6, 2025

> The code for taking control of existing Chrome browsers via the remote-debugging-port is here:
>
> #3354 (comment)

So we don't need chromedriver anymore? Like nodriver or Selenium driverless?


mdmintz commented Feb 6, 2025

With pure CDP Mode, chromedriver isn’t necessary.
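
For example, a minimal pure-CDP sketch (no chromedriver binary involved), using the same async cdp_driver API that appears later in this thread; tab.evaluate() is assumed from the underlying nodriver-style API:

```python
import asyncio
from seleniumbase import cdp_driver


async def main():
    # Chrome is launched and driven over CDP only -- no chromedriver.
    driver = await cdp_driver.start_async()
    tab = await driver.get("https://example.com")
    print(await tab.evaluate("document.title"))
    await asyncio.sleep(2)


asyncio.run(main())
```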


mdmintz commented Feb 8, 2025

If stealth is important, you may have to use one of the CDP formats here: #3354 (comment)

Otherwise, you can try passing in the remote-debugging-port via chromium_arg, but that might not get you the results you’re looking for.
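
For instance, a rough sketch of that chromium_arg approach (port 9222 is just an example value):

```python
from seleniumbase import SB

# Launches the browser with a remote-debugging-port exposed so other CDP
# clients can attach to it. As noted above, this alone may not give the
# stealth results you're looking for.
with SB(uc=True, chromium_arg="--remote-debugging-port=9222") as sb:
    sb.open("https://example.com")
    sb.sleep(3)
```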



guocity commented Feb 27, 2025

The BaseCase class used with SB() is synchronous. Is there any way I can use the asynchronous driver from cdp_util instead of the BaseCase driver?
from seleniumbase.undetected.cdp_driver import cdp_util
driver = await cdp_util.start_async()


guocity commented Feb 27, 2025

How do I call sb.uc_gui_click_captcha() from async code?


mdmintz commented Feb 27, 2025

The async code can only call direct CDP methods.
(See https://chromedevtools.github.io/devtools-protocol/ for those methods)
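
For example, a hedged sketch of calling a direct CDP method (Browser.getVersion) from async code, following the same tab.send(mycdp...) pattern used in the code below:

```python
import asyncio
import mycdp
from seleniumbase import cdp_driver


async def main():
    driver = await cdp_driver.start_async()
    tab = await driver.get("https://example.com")
    # Direct CDP call: Browser.getVersion from the DevTools Protocol.
    # Higher-level helpers like sb.uc_gui_click_captcha() only exist on
    # the synchronous SB()/BaseCase side, not in the async driver.
    version = await tab.send(mycdp.browser.get_version())
    print(version)


asyncio.run(main())
```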

dima23113 commented:

> The async code can only call direct CDP methods. (See https://chromedevtools.github.io/devtools-protocol/ for those methods)

Hi! Please help; I've already racked my brains trying to find a solution. Why doesn't CDP return the response body, even though it's definitely there? If I uncomment the line xhr_responses = await receiveXHR(tab, xhr_requests.get(browser_index, []), browser_index) inside the while loop, then everything works fine, but then everything after the while loop gets messed up.

I'm listening for CDP.network.ResponseReceived events with CDP.network.ResourceType.XHR:

```python
import asyncio
import json
import random

import mycdp
import time
from seleniumbase import cdp_driver

import config
from utils import distribute_scrolls, is_have_products, get_xhr_products, scroll_down, need_click_load_more

products = []

last_xhr_request = {}

xhr_requests = {}


def listenXHR(page, index_):
    async def handler(evt: mycdp.network.ResponseReceived):
        # Get AJAX requests
        if evt.type_ is mycdp.network.ResourceType.XHR and 'api/catalog/products' in evt.response.url:
            if xhr_requests.get(index_):
                xhr_requests[index_].append([evt.response.url, evt.request_id])
                last_xhr_request[index_] = time.time()
            else:
                xhr_requests[index_] = [[evt.response.url, evt.request_id], ]
                last_xhr_request[index_] = time.time()

    page.add_handler(mycdp.network.ResponseReceived, handler)


async def receiveXHR(page, requests, index_):
    responses = []
    retries = 0
    max_retries = 5
    # Wait at least 2 seconds after last XHR request for more
    while True:
        if last_xhr_request[index_] is None or retries > max_retries:
            break
        if time.time() - last_xhr_request[index_] <= 3:
            retries = retries + 1
            await asyncio.sleep(2)
            continue
        else:
            break
    await page
    # Loop through gathered requests and get response body
    for request in requests:
        try:
            res = await page.send(mycdp.network.get_response_body(request[1]))
            if res is None:
                continue
            responses.append({
                "url": request[0],
                "body": json.loads(res[0]),
                "is_base64": res[1],
            })
        except Exception as e:
            print("Error getting response:", e)
    if responses:
        xhr_requests[index_] = []
    return responses


async def request_paused_handler(event, tab):
    r = event.request
    is_image = ".png" in r.url or ".jpg" in r.url or ".gif" in r.url or ".webp" in r.url or "pcdn.goldapple.ru" in r.url or "/front/api/apm/events" in r.url or '.mp4' in r.url
    if not is_image:  # Let the data through
        tab.feed_cdp(
            mycdp.fetch.continue_request(request_id=event.request_id)
        )
    else:  # Block the data (images)
        TIMED_OUT = mycdp.network.ErrorReason.TIMED_OUT
        tab.feed_cdp(
            mycdp.fetch.fail_request(event.request_id, TIMED_OUT)
        )


async def check_hxr(index_, tab):
    print("Starting check_hxr for index:", index_)  # Debug print
    products_ = []
    while True:
        print("Waiting for XHR responses...")  # Debug print
        xhr_responses = await receiveXHR(tab, xhr_requests.get(index_, []), index_)
        print("Received XHR responses:", xhr_responses)  # Debug print
        data = await get_xhr_products(xhr_responses)
        if data:
            products_.extend(data)
        print("request: ", xhr_requests.get(index_, []))  # Original print
        await asyncio.sleep(1)


async def crawl(browser_index, link, scrols, max_pages=None, start_pages=0):
    products_parsed = []
    retries = 5
    page_count = 0
    driver = await cdp_driver.start_async()
    tab = await driver.get("about:blank")
    listenXHR(tab, browser_index)
    tab.add_handler(mycdp.fetch.RequestPaused, request_paused_handler)
    url = f"{link}?p={start_pages}&storestocks=1"
    await tab.get(url)
    await asyncio.sleep(5)
    # check_hxr_task = asyncio.create_task(check_hxr(browser_index, tab))
    # print("hello")
    while True:
        # xhr_responses = await receiveXHR(tab, xhr_requests.get(browser_index, []), browser_index)
        # data = await get_xhr_products(xhr_responses)
        # if not data:
        #     retries -= 1
        # else:
        #     products_parsed.extend(data)
        if not await is_have_products(tab):
            break
        await scroll_down(tab)
        await need_click_load_more(tab)
        print(f"Браузер {browser_index} - Страница {page_count}; Всего продуктов: {len(products_parsed)}")
        # if retries == 0:
        #     break
        page_count += 1
        if page_count == 5:
            break
    xhr_responses = await receiveXHR(tab, xhr_requests.get(browser_index, []), browser_index)
    data = await get_xhr_products(xhr_responses)
    print(data)
    print(f"Браузер {browser_index} собрал {len(products_parsed)} продуктов на странице {url}")
    products.extend(products_parsed)


async def main():
    driver = await cdp_driver.start_async()
    for n, link in enumerate(config.links):
        tasks = []
        tab = await driver.get("https://goldapple.ru/" + link)
        await tab.wait_for("span[data-category-products-count]")
        product_count = await tab.select("span[data-category-products-count]")
        product_count = product_count.attrs.get("data-category-products-count")
        product_pages = int(int(product_count) / 24)
        scrolls_per_browser = distribute_scrolls(product_pages, 1)
        start_pages = []
        start = 0
        for s in scrolls_per_browser:
            start_pages.append(start)
            start += s
        for i in range(1):
            tasks.append(crawl(i, "https://goldapple.ru" + link, scrolls_per_browser[i], 100, start_pages[i]))
        await asyncio.gather(*tasks)


if __name__ == "__main__":
    asyncio.run(main())
```
