Skip to content

Commit

Permalink
feat: New "scrape install" command, to prepare sys dependencies
Browse files Browse the repository at this point in the history
This can be easily included in a Docker container as needed.
  • Loading branch information
clemlesne committed Nov 7, 2024
1 parent 62979a5 commit fc9f6b3
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 26 deletions.
16 changes: 15 additions & 1 deletion app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@
from app.persistence.iqueue import Provider as QueueProvider
from app.persistence.isearch import Provider as SearchProvider
from app.persistence.local_disk import BLOB_DEFAULT_PATH
from app.scrape import run as scrape_backend_run, state as scrape_backend_state
from app.scrape import (
install as scrape_backend_install,
run as scrape_backend_run,
state as scrape_backend_state,
)


def run_in_async(func):
Expand Down Expand Up @@ -101,6 +105,16 @@ def scrape() -> None:
pass


@scrape.command("install")
@common_params
@run_in_async
async def scrape_install() -> None:
    """
    Install all dependencies required for the scraper.
    """
    # Delegate to the scraper backend (app.scrape.install) so the CLI and the
    # scraping job share a single installation routine.
    await scrape_backend_install()


@click.option(
"--max-depth",
"-md",
Expand Down
36 changes: 19 additions & 17 deletions app/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -1174,16 +1174,8 @@ async def run( # noqa: PLR0913
) -> None:
logger.info("Start scraping job %s", job)

# Install Playwright
async with async_playwright() as p:
browser_type = getattr(p, BROWSER_NAME)

await asyncio.gather(
# Install Playwright
_install_browser(browser_type),
# Install Pandoc
_install_pandoc(),
)
# Make sure the dependencies are installed
await install()

# Parse cache_refresh
cache_refresh_parsed = timedelta(hours=cache_refresh)
Expand Down Expand Up @@ -1287,15 +1279,25 @@ async def state( # noqa: PLR0913
return model


async def _install_browser(
browser_type: BrowserType,
with_deps: bool = False,
) -> None:
async def install() -> None:
    """
    Make sure the scraper dependencies are installed.

    Runs the Playwright browser installer and the Pandoc installer concurrently and waits for both.
    """
    logger.info("Installing dependencies if needed, this may take a few minutes")

    # The two installers do not depend on each other, so await them together
    installers = (
        _install_browser(),
        _install_pandoc(),
    )
    await asyncio.gather(*installers)


async def _install_browser() -> None:
"""
Install Playwright selected browser.
Download is persisted in the application cache directory. If requested, also install the system dependencies required by the framework. Those require root permissions on Linux systems, as the system package manager will be called.
"""
logger.debug("Installing Playwright dependency")

# Add installation path to the environment
# See: https://playwright.dev/docs/browsers#hermetic-install
env["PLAYWRIGHT_BROWSERS_PATH"] = await browsers_install_path()
Expand All @@ -1306,9 +1308,7 @@ async def _install_browser(
# Ensure only one worker is installing the browser
async with file_lock(driver_executable):
# Build the command arguments
args = [driver_executable, driver_cli, "install", browser_type.name]
if with_deps:
args.append("--with-deps")
args = [driver_executable, driver_cli, "install", BROWSER_NAME]

# Run
proc = await asyncio.create_subprocess_shell(
Expand Down Expand Up @@ -1355,6 +1355,8 @@ async def _install_pandoc() -> None:
Download is persisted in the application cache directory.
"""
logger.debug("Installing Pandoc dependency")

# A fixed version is necessary to have reproducible builds
# See: https://github.com/jgm/pandoc/releases
version = "3.2.1"
Expand Down
11 changes: 3 additions & 8 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import asyncio
from collections.abc import AsyncGenerator

import pytest
from playwright.async_api import Browser, async_playwright

from app.scrape import BROWSER_NAME, _get_broswer, _install_browser, _install_pandoc
from app.scrape import BROWSER_NAME, _get_broswer, install


@pytest.fixture
Expand All @@ -15,13 +14,9 @@ async def browser() -> AsyncGenerator[Browser, None]:
async with async_playwright() as p:
browser_type = getattr(p, BROWSER_NAME)
# Make sure the browser and pandoc are installed
await asyncio.gather(
# Install Playwright
_install_browser(browser_type),
# Install Pandoc
_install_pandoc(),
)
await install()

# Restart context to reload PATH to the newly installed binaries
async with async_playwright() as p:
browser_type = getattr(p, BROWSER_NAME)
async with await _get_broswer(browser_type) as browser:
Expand Down

0 comments on commit fc9f6b3

Please sign in to comment.