app.py
from typing import Optional

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from selectorlib import Extractor
import httpx
import yaml
import git

# Resolve the short hash of the checked-out commit so the API can report
# which revision it is running (assumes the app runs from inside the repo).
repo = git.Repo()
commit = repo.head.object.hexsha[:7]
app = FastAPI()

# Allow cross-origin requests from any origin so the demo front-end can
# call the API from a different domain.
origins = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Browser-like request headers so fetched sites are less likely to block
# the scraper as a bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}
# Request body model: either raw HTML or a URL to fetch, plus the
# selectorlib selectors to apply.
class SelectorRequest(BaseModel):
    html: Optional[str] = None
    url: Optional[str] = None
    selectors: dict

# Response body model
class ExtractionResponse(BaseModel):
    data: dict
@app.get('/ping')
async def ping():
    return {
        'data': {
            'message': 'Pong'
        }
    }
@app.get('/')
async def index():
    """
    # Index Page of the API
    """
    return {
        "message": [
            """Welcome to CrawlX API""",
            """Use /docs to learn more about the API endpoints""",
            """Demo project at https://notoriousarnav.github.io/simple_alpine_axios_prj/""",
            """Source code available at https://github.com/NotoriousArnav/crawlx""",
            f"""Running Commit {commit}"""
        ]
    }
@app.post("/extract", response_model=ExtractionResponse)
async def extract_data(request: SelectorRequest):
    """
    # Simple Page Data Extraction/Scraping API
    Scrapes data from the provided HTML document or URL. Selectors must be
    provided to extract and format the data accordingly.
    """
    # Check that either html or url is provided
    if not request.html and not request.url:
        raise HTTPException(status_code=400, detail="Either 'html' or 'url' must be provided")
    # If a url is provided, fetch the HTML content
    if request.url:
        response = httpx.get(
            request.url,
            headers=headers
        )
        if response.status_code != 200:
            raise HTTPException(status_code=400, detail="Failed to fetch URL")
        html = response.text
    else:
        html = request.html
    # Convert the selectors from a JSON dict to the YAML string that
    # selectorlib expects, e.g. {"title": {"css": "h1", "type": "Text"}}
    # becomes "title:\n  css: h1\n  type: Text\n".
    selectors_yaml = yaml.dump(request.selectors)
    # Extract data using selectorlib
    extractor = Extractor.from_yaml_string(selectors_yaml)
    extracted_data = extractor.extract(html)
    return {"data": extracted_data}