feat: merged localscraper into smartscraper
PeriniM committed Feb 1, 2025
1 parent c898e99 commit 503dbd1
Showing 7 changed files with 48 additions and 191 deletions.
31 changes: 0 additions & 31 deletions scrapegraph-py/examples/localscraper_example.py

This file was deleted.

2 changes: 2 additions & 0 deletions scrapegraph-py/examples/smartscraper_example.py
@@ -9,9 +9,11 @@
 # SmartScraper request
 response = sgai_client.smartscraper(
     website_url="https://example.com",
+    # website_html="...", # Optional, if you want to pass in HTML content instead of a URL
     user_prompt="Extract the main heading, description, and summary of the webpage",
 )
+

 # Print the response
 print(f"Request ID: {response['request_id']}")
 print(f"Result: {response['result']}")
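The example file only hints at the new parameter in a comment, so here is a minimal sketch of the HTML-based path, assuming the same initialized sgai_client as in the example above (the HTML snippet and prompt are hypothetical):

# Sketch: pass raw HTML in place of a URL.
html_snippet = "<html><body><h1>Acme Corp</h1><p>We build widgets.</p></body></html>"

response = sgai_client.smartscraper(
    website_html=html_snippet,  # used in place of website_url
    user_prompt="Extract the main heading and description",
)
print(f"Request ID: {response['request_id']}")
print(f"Result: {response['result']}")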
1 change: 1 addition & 0 deletions scrapegraph-py/examples/smartscraper_schema_example.py
@@ -16,6 +16,7 @@ class WebpageSchema(BaseModel):
 # SmartScraper request with output schema
 response = sgai_client.smartscraper(
     website_url="https://example.com",
+    # website_html="...", # Optional, if you want to pass in HTML content instead of a URL
     user_prompt="Extract webpage information",
     output_schema=WebpageSchema,
 )
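The schema-driven request can likewise be fed raw HTML instead of a URL. A sketch reusing the WebpageSchema and sgai_client defined earlier in this example file (the HTML content is hypothetical):

response = sgai_client.smartscraper(
    website_html="<html><body><h1>Acme</h1><p>Widgets since 1999</p></body></html>",
    user_prompt="Extract webpage information",
    output_schema=WebpageSchema,
)
print(f"Result: {response['result']}")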
51 changes: 8 additions & 43 deletions scrapegraph-py/scrapegraph_py/async_client.py
@@ -9,10 +9,6 @@
 from scrapegraph_py.exceptions import APIError
 from scrapegraph_py.logger import sgai_logger as logger
 from scrapegraph_py.models.feedback import FeedbackRequest
-from scrapegraph_py.models.localscraper import (
-    GetLocalScraperRequest,
-    LocalScraperRequest,
-)
 from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest
 from scrapegraph_py.models.smartscraper import (
     GetSmartScraperRequest,
@@ -165,16 +161,22 @@ async def get_markdownify(self, request_id: str):

     async def smartscraper(
         self,
-        website_url: str,
         user_prompt: str,
+        website_url: Optional[str] = None,
+        website_html: Optional[str] = None,
         output_schema: Optional[BaseModel] = None,
     ):
         """Send a smartscraper request"""
-        logger.info(f"🔍 Starting smartscraper request for {website_url}")
+        logger.info("🔍 Starting smartscraper request")
+        if website_url:
+            logger.debug(f"🌐 URL: {website_url}")
+        if website_html:
+            logger.debug("📄 Using provided HTML content")
         logger.debug(f"📝 Prompt: {user_prompt}")

         request = SmartScraperRequest(
             website_url=website_url,
+            website_html=website_html,
             user_prompt=user_prompt,
             output_schema=output_schema,
         )
@@ -200,43 +202,6 @@ async def get_smartscraper(self, request_id: str):
logger.info(f"✨ Successfully retrieved result for request {request_id}")
return result

async def localscraper(
self,
user_prompt: str,
website_html: str,
output_schema: Optional[BaseModel] = None,
):
"""Send a localscraper request"""
logger.info("🔍 Starting localscraper request")
logger.debug(f"📝 Prompt: {user_prompt}")

request = LocalScraperRequest(
user_prompt=user_prompt,
website_html=website_html,
output_schema=output_schema,
)
logger.debug("✅ Request validation passed")

result = await self._make_request(
"POST", f"{API_BASE_URL}/localscraper", json=request.model_dump()
)
logger.info("✨ Localscraper request completed successfully")
return result

async def get_localscraper(self, request_id: str):
"""Get the result of a previous localscraper request"""
logger.info(f"🔍 Fetching localscraper result for request {request_id}")

# Validate input using Pydantic model
GetLocalScraperRequest(request_id=request_id)
logger.debug("✅ Request ID validation passed")

result = await self._make_request(
"GET", f"{API_BASE_URL}/localscraper/{request_id}"
)
logger.info(f"✨ Successfully retrieved result for request {request_id}")
return result

async def submit_feedback(
self, request_id: str, rating: int, feedback_text: Optional[str] = None
):
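A sketch of the merged async call path, assuming the async client is exported as AsyncClient and supports use as an async context manager (both are assumptions; adjust if the actual constructor differs):

import asyncio

from scrapegraph_py import AsyncClient  # assumed export name

async def main():
    # Placeholder API key; the HTML content is hypothetical.
    async with AsyncClient(api_key="sgai-...") as client:
        response = await client.smartscraper(
            website_html="<html><body><h1>Title</h1><p>Content</p></body></html>",
            user_prompt="Extract the main heading",
        )
        print(response["result"])

asyncio.run(main())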
49 changes: 8 additions & 41 deletions scrapegraph-py/scrapegraph_py/client.py
@@ -10,10 +10,6 @@
 from scrapegraph_py.exceptions import APIError
 from scrapegraph_py.logger import sgai_logger as logger
 from scrapegraph_py.models.feedback import FeedbackRequest
-from scrapegraph_py.models.localscraper import (
-    GetLocalScraperRequest,
-    LocalScraperRequest,
-)
 from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest
 from scrapegraph_py.models.smartscraper import (
     GetSmartScraperRequest,
@@ -175,16 +171,22 @@ def get_markdownify(self, request_id: str):

     def smartscraper(
         self,
-        website_url: str,
         user_prompt: str,
+        website_url: Optional[str] = None,
+        website_html: Optional[str] = None,
         output_schema: Optional[BaseModel] = None,
     ):
         """Send a smartscraper request"""
-        logger.info(f"🔍 Starting smartscraper request for {website_url}")
+        logger.info("🔍 Starting smartscraper request")
+        if website_url:
+            logger.debug(f"🌐 URL: {website_url}")
+        if website_html:
+            logger.debug("📄 Using provided HTML content")
         logger.debug(f"📝 Prompt: {user_prompt}")

         request = SmartScraperRequest(
             website_url=website_url,
+            website_html=website_html,
             user_prompt=user_prompt,
             output_schema=output_schema,
         )
@@ -208,41 +210,6 @@ def get_smartscraper(self, request_id: str):
logger.info(f"✨ Successfully retrieved result for request {request_id}")
return result

def localscraper(
self,
user_prompt: str,
website_html: str,
output_schema: Optional[BaseModel] = None,
):
"""Send a localscraper request"""
logger.info("🔍 Starting localscraper request")
logger.debug(f"📝 Prompt: {user_prompt}")

request = LocalScraperRequest(
user_prompt=user_prompt,
website_html=website_html,
output_schema=output_schema,
)
logger.debug("✅ Request validation passed")

result = self._make_request(
"POST", f"{API_BASE_URL}/localscraper", json=request.model_dump()
)
logger.info("✨ Localscraper request completed successfully")
return result

def get_localscraper(self, request_id: str):
"""Get the result of a previous localscraper request"""
logger.info(f"🔍 Fetching localscraper result for request {request_id}")

# Validate input using Pydantic model
GetLocalScraperRequest(request_id=request_id)
logger.debug("✅ Request ID validation passed")

result = self._make_request("GET", f"{API_BASE_URL}/localscraper/{request_id}")
logger.info(f"✨ Successfully retrieved result for request {request_id}")
return result

def submit_feedback(
self, request_id: str, rating: int, feedback_text: Optional[str] = None
):
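With localscraper and get_localscraper removed from both clients, callers migrate by renaming the methods; the keyword arguments carry over unchanged. A sketch, assuming an initialized Client bound to sgai_client and an html_snippet string holding the page's HTML:

# Before this commit:
# result = sgai_client.localscraper(
#     user_prompt="Extract info about the company",
#     website_html=html_snippet,
# )
# data = sgai_client.get_localscraper(result["request_id"])

# After this commit, the same request goes through smartscraper:
result = sgai_client.smartscraper(
    user_prompt="Extract info about the company",
    website_html=html_snippet,
)
data = sgai_client.get_smartscraper(result["request_id"])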
67 changes: 0 additions & 67 deletions scrapegraph-py/scrapegraph_py/models/localscraper.py

This file was deleted.

38 changes: 29 additions & 9 deletions scrapegraph-py/scrapegraph_py/models/smartscraper.py
@@ -3,6 +3,7 @@
 from typing import Optional, Type
 from uuid import UUID

+from bs4 import BeautifulSoup
 from pydantic import BaseModel, Field, model_validator


@@ -11,7 +12,14 @@ class SmartScraperRequest(BaseModel):
         ...,
         example="Extract info about the company",
     )
-    website_url: str = Field(..., example="https://scrapegraphai.com/")
+    website_url: Optional[str] = Field(
+        default=None, example="https://scrapegraphai.com/"
+    )
+    website_html: Optional[str] = Field(
+        default=None,
+        example="<html><body><h1>Title</h1><p>Content</p></body></html>",
+        description="HTML content, maximum size 2MB",
+    )
     output_schema: Optional[Type[BaseModel]] = None

     @model_validator(mode="after")
@@ -23,14 +31,26 @@ def validate_user_prompt(self) -> "SmartScraperRequest":
         return self

     @model_validator(mode="after")
-    def validate_url(self) -> "SmartScraperRequest":
-        if self.website_url is None or not self.website_url.strip():
-            raise ValueError("Website URL cannot be empty")
-        if not (
-            self.website_url.startswith("http://")
-            or self.website_url.startswith("https://")
-        ):
-            raise ValueError("Invalid URL")
+    def validate_url_and_html(self) -> "SmartScraperRequest":
+        if self.website_html is not None:
+            if len(self.website_html.encode("utf-8")) > 2 * 1024 * 1024:
+                raise ValueError("Website HTML content exceeds maximum size of 2MB")
+            try:
+                soup = BeautifulSoup(self.website_html, "html.parser")
+                if not soup.find():
+                    raise ValueError("Invalid HTML - no parseable content found")
+            except Exception as e:
+                raise ValueError(f"Invalid HTML structure: {str(e)}")
+        elif self.website_url is not None:
+            if not self.website_url.strip():
+                raise ValueError("Website URL cannot be empty")
+            if not (
+                self.website_url.startswith("http://")
+                or self.website_url.startswith("https://")
+            ):
+                raise ValueError("Invalid URL")
+        else:
+            raise ValueError("Either website_url or website_html must be provided")
         return self

     def model_dump(self, *args, **kwargs) -> dict:
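A sketch of how the new validator behaves at construction time, assuming pydantic v2 semantics (a ValueError raised inside a model_validator surfaces as a ValidationError):

from pydantic import ValidationError

from scrapegraph_py.models.smartscraper import SmartScraperRequest

# Providing neither input source fails fast, before any network call.
try:
    SmartScraperRequest(user_prompt="Extract the title")
except ValidationError as e:
    print(e)  # mentions "Either website_url or website_html must be provided"

# HTML over the 2MB limit is rejected client-side.
big_html = "<html><body>" + "x" * (2 * 1024 * 1024) + "</body></html>"
try:
    SmartScraperRequest(user_prompt="Extract the title", website_html=big_html)
except ValidationError as e:
    print(e)  # mentions "Website HTML content exceeds maximum size of 2MB"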
