
Commit 560a2fe

Merge pull request #963 from ScrapeGraphAI/pre/beta
Pre/beta
2 parents 0c34b76 + b1b8579 commit 560a2fe


63 files changed: +4421, -1137 lines changed


Diff for: .gitignore

+3
@@ -192,3 +192,6 @@ cython_debug/
 .DS_Store
 
 dev.ipynb
+
+# CodeBeaver reports and artifacts
+.codebeaver

Diff for: CHANGELOG.md

+15
@@ -1,8 +1,23 @@
+## [1.47.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.46.0...v1.47.0-beta.1) (2025-04-15)
+
+
+### Features
+
+* add new proxy rotation ([8913d8d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8913d8d3af3a2809d3ddcbfa09cbf2c9982a19cd))
+
+
+### CI
+
+* **release:** 1.44.0-beta.1 [skip ci] ([5e944cc](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5e944cc573f62585dbf3366aa840c997847523d1))
+
 ## [1.46.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.45.0...v1.46.0) (2025-03-27)
 
 
+
 ### Features
 
+* add new proxy rotation ([8913d8d](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/8913d8d3af3a2809d3ddcbfa09cbf2c9982a19cd))
+
 * add new logo ([c085d6c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c085d6c7ffcbf446439de97c9f88f8eadba5909c))
 
 ## [1.45.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.44.0...v1.45.0) (2025-03-27)

Diff for: examples/ScrapegraphAI_cookbook.ipynb

+902-903
Large diffs are not rendered by default.

Diff for: examples/code_generator_graph/ollama/code_generator_graph_ollama.py

-1
@@ -2,7 +2,6 @@
 Basic example of scraping pipeline using Code Generator with schema
 """
 
-import json
 from typing import List
 
 from dotenv import load_dotenv

Diff for: examples/custom_graph/ollama/custom_graph_ollama.py

-3
@@ -2,16 +2,13 @@
 Example of custom graph using existing nodes
 """
 
-import os
-
 from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 
 from scrapegraphai.graphs import BaseGraph
 from scrapegraphai.nodes import (
     FetchNode,
     GenerateAnswerNode,
     ParseNode,
-    RAGNode,
     RobotsNode,
 )

Diff for: examples/extras/chromium_selenium.py

-1
@@ -9,7 +9,6 @@
     ChromiumLoader,
 )
 from scrapegraphai.graphs import SmartScraperGraph
-from scrapegraphai.utils import prettify_exec_info
 
 # Load environment variables for API keys
 load_dotenv()

Diff for: examples/extras/no_cut.py

-1
@@ -3,7 +3,6 @@
 """
 
 import json
-import os
 
 from scrapegraphai.graphs import SmartScraperGraph
 from scrapegraphai.utils import prettify_exec_info

Diff for: examples/extras/serch_graph_scehma.py

+1-1
@@ -40,7 +40,7 @@ class Ceos(BaseModel):
 # ************************************************
 
 search_graph = SearchGraph(
-    prompt=f"Who is the ceo of Appke?",
+    prompt="Who is the ceo of Appke?",
     schema=Ceos,
     config=graph_config,
 )

Diff for: examples/script_generator_graph/ollama/script_multi_generator_ollama.py

-2
@@ -2,8 +2,6 @@
 Basic example of scraping pipeline using ScriptCreatorGraph
 """
 
-import os
-
 from dotenv import load_dotenv
 
 from scrapegraphai.graphs import ScriptCreatorMultiGraph

Diff for: pyproject.toml

+1-2
@@ -1,8 +1,7 @@
 [project]
 name = "scrapegraphai"
 
-version = "1.46.0"
-
+version = "1.47.0b1"
 
 
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."

Diff for: scrapegraphai/builders/graph_builder.py

+1-3
@@ -113,9 +113,7 @@ def _create_extraction_chain(self):
         {nodes_description}
 
         Based on the user's input: "{input}", identify the essential nodes required for the task and suggest a graph configuration that outlines the flow between the chosen nodes.
-        """.format(
-            nodes_description=self.nodes_description, input="{input}"
-        )
+        """.format(nodes_description=self.nodes_description, input="{input}")
         extraction_prompt = ChatPromptTemplate.from_template(
             create_graph_prompt_template
         )

Diff for: scrapegraphai/docloaders/chromium.py

+2-1
@@ -360,7 +360,8 @@ async def ascrape_playwright(self, url: str, browser_name: str = "chromium") ->
             else:
                 raise ValueError(f"Invalid browser name: {browser_name}")
             context = await browser.new_context(
-                storage_state=self.storage_state
+                storage_state=self.storage_state,
+                ignore_https_errors=True,
             )
             await Malenia.apply_stealth(context)
             page = await context.new_page()
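The added ignore_https_errors flag tells Playwright to proceed when a site presents an invalid or self-signed TLS certificate instead of failing the navigation. A minimal standalone sketch of the same option, assuming Playwright is installed; the URL and helper name are illustrative and not part of the commit:

import asyncio

from playwright.async_api import async_playwright


async def fetch_html(url: str) -> str:
    # Mirrors the ChromiumLoader change above: tolerate invalid/self-signed
    # TLS certificates by passing ignore_https_errors to the new context.
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(ignore_https_errors=True)
        page = await context.new_page()
        await page.goto(url)
        html = await page.content()
        await browser.close()
        return html


if __name__ == "__main__":
    # Any HTTPS page works; example.com is just a placeholder.
    print(asyncio.run(fetch_html("https://example.com"))[:200])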

Diff for: scrapegraphai/docloaders/scrape_do.py

+1-1
@@ -2,10 +2,10 @@
 Scrape_do module
 """
 
+import os
 import urllib.parse
 
 import requests
-import os
 import urllib3
 
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

Diff for: scrapegraphai/graphs/abstract_graph.py

+3-3
@@ -177,7 +177,7 @@ def _create_llm(self, llm_config: dict) -> object:
         ]
         if len(possible_providers) <= 0:
             raise ValueError(
-                f"""Provider {llm_params['model_provider']} is not supported.
+                f"""Provider {llm_params["model_provider"]} is not supported.
                 If possible, try to use a model instance instead."""
             )
         llm_params["model_provider"] = possible_providers[0]
@@ -190,7 +190,7 @@ def _create_llm(self, llm_config: dict) -> object:
 
         if llm_params["model_provider"] not in known_providers:
             raise ValueError(
-                f"""Provider {llm_params['model_provider']} is not supported.
+                f"""Provider {llm_params["model_provider"]} is not supported.
                 If possible, try to use a model instance instead."""
             )
 
@@ -201,7 +201,7 @@ def _create_llm(self, llm_config: dict) -> object:
             ]
         except KeyError:
             print(
-                f"""Max input tokens for model {llm_params['model_provider']}/{llm_params['model']} not found,
+                f"""Max input tokens for model {llm_params["model_provider"]}/{llm_params["model"]} not found,
                 please specify the model_tokens parameter in the llm section of the graph configuration.
                 Using default token size: 8192"""
             )
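The reworded messages above come from _create_llm, which falls back to a default window of 8192 tokens when a model's limit is unknown. A hedged sketch of a graph configuration that sets model_tokens explicitly so the fallback (and its warning) never triggers; the model name, prompt, and source are placeholders, not values from this commit:

from scrapegraphai.graphs import SmartScraperGraph

graph_config = {
    "llm": {
        "model": "ollama/llama3",  # placeholder model identifier
        "model_tokens": 8192,      # explicit limit; skips the default-8192 warning shown above
    },
}

# Usage follows the repository's own examples; prompt and source are illustrative.
smart_scraper = SmartScraperGraph(
    prompt="List all article titles",
    source="https://example.com",
    config=graph_config,
)
result = smart_scraper.run()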

Diff for: scrapegraphai/graphs/csv_scraper_multi_graph.py

-1
@@ -49,7 +49,6 @@ def __init__(
         config: dict,
         schema: Optional[Type[BaseModel]] = None,
     ):
-
         self.copy_config = safe_deepcopy(config)
         self.copy_schema = deepcopy(schema)

Diff for: scrapegraphai/graphs/json_scraper_multi_graph.py

-1
@@ -49,7 +49,6 @@ def __init__(
         config: dict,
         schema: Optional[Type[BaseModel]] = None,
     ):
-
         self.copy_config = safe_deepcopy(config)
         self.copy_schema = deepcopy(schema)

Diff for: scrapegraphai/graphs/omni_search_graph.py

-1
@@ -44,7 +44,6 @@ class OmniSearchGraph(AbstractGraph):
     def __init__(
         self, prompt: str, config: dict, schema: Optional[Type[BaseModel]] = None
     ):
-
         self.max_results = config.get("max_results", 3)
 
         self.copy_config = safe_deepcopy(config)

Diff for: scrapegraphai/graphs/script_creator_multi_graph.py

-1
@@ -48,7 +48,6 @@ def __init__(
         config: dict,
         schema: Optional[Type[BaseModel]] = None,
     ):
-
         self.copy_config = safe_deepcopy(config)
         self.copy_schema = deepcopy(schema)
         super().__init__(prompt, config, source, schema)

Diff for: scrapegraphai/graphs/smart_scraper_multi_concat_graph.py

-1
@@ -53,7 +53,6 @@ def __init__(
         config: dict,
         schema: Optional[Type[BaseModel]] = None,
     ):
-
         self.copy_config = safe_deepcopy(config)
         self.copy_schema = deepcopy(schema)

Diff for: scrapegraphai/graphs/smart_scraper_multi_graph.py

-1
@@ -55,7 +55,6 @@ def __init__(
         config: dict,
         schema: Optional[Type[BaseModel]] = None,
     ):
-
         self.max_results = config.get("max_results", 3)
         self.copy_config = safe_deepcopy(config)
         self.copy_schema = deepcopy(schema)

Diff for: scrapegraphai/graphs/smart_scraper_multi_lite_graph.py

-1
@@ -55,7 +55,6 @@ def __init__(
         config: dict,
         schema: Optional[Type[BaseModel]] = None,
     ):
-
         self.copy_config = safe_deepcopy(config)
         self.copy_schema = deepcopy(schema)
         super().__init__(prompt, config, source, schema)

Diff for: scrapegraphai/graphs/xml_scraper_multi_graph.py

-1
@@ -49,7 +49,6 @@ def __init__(
         config: dict,
         schema: Optional[Type[BaseModel]] = None,
     ):
-
         self.copy_config = safe_deepcopy(config)
         self.copy_schema = deepcopy(schema)
         super().__init__(prompt, config, source, schema)

Diff for: scrapegraphai/models/openai_tts.py

-1
@@ -19,7 +19,6 @@ class OpenAITextToSpeech:
     """
 
     def __init__(self, tts_config: dict):
-
         self.client = OpenAI(
             api_key=tts_config.get("api_key"), base_url=tts_config.get("base_url", None)
         )

Diff for: scrapegraphai/nodes/base_node.py

+1-3
@@ -54,7 +54,6 @@ def __init__(
         min_input_len: int = 1,
         node_config: Optional[dict] = None,
     ):
-
         self.node_name = node_name
         self.input = input
         self.output = output
@@ -197,7 +196,6 @@ def evaluate_simple_expression(exp: str) -> List[str]:
             """Evaluate an expression without parentheses."""
 
             for or_segment in exp.split("|"):
-
                 and_segment = or_segment.split("&")
                 if all(elem.strip() in state for elem in and_segment):
                     return [
@@ -226,7 +224,7 @@ def evaluate_expression(expression: str) -> List[str]:
                 raise ValueError(
                     f"""No state keys matched the expression.
                     Expression was {expression}.
-                    State contains keys: {', '.join(state.keys())}"""
+                    State contains keys: {", ".join(state.keys())}"""
                 )
 
             final_result = []
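For context, evaluate_simple_expression above treats "|" as OR between groups of state keys and "&" as AND within a group. A self-contained sketch reconstructed from the lines visible in this hunk; the return value (the list of matched keys) and the sample state are assumptions for illustration:

from typing import List


def evaluate_simple_expression(exp: str, state: dict) -> List[str]:
    # OR-segments are separated by "|"; within a segment, "&" requires every
    # key to already be present in the node state.
    for or_segment in exp.split("|"):
        and_segment = or_segment.split("&")
        if all(elem.strip() in state for elem in and_segment):
            # Assumed return shape: the keys that satisfied this segment.
            return [elem.strip() for elem in and_segment]
    return []


state = {"doc": "...", "parsed_doc": "..."}
print(evaluate_simple_expression("url | doc & parsed_doc", state))
# ['doc', 'parsed_doc']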

Diff for: scrapegraphai/nodes/concat_answers_node.py

+1-2
@@ -36,8 +36,7 @@ def __init__(
         )
 
     def _merge_dict(self, items):
-
-        return {"products": {f"item_{i+1}": item for i, item in enumerate(items)}}
+        return {"products": {f"item_{i + 1}": item for i, item in enumerate(items)}}
 
     def execute(self, state: dict) -> dict:
         """

Diff for: scrapegraphai/nodes/description_node.py

+1-1
@@ -58,7 +58,7 @@ def execute(self, state: dict) -> dict:
                 template=DESCRIPTION_NODE_PROMPT,
                 partial_variables={"content": chunk.get("document")},
             )
-            chain_name = f"chunk{i+1}"
+            chain_name = f"chunk{i + 1}"
             chains_dict[chain_name] = prompt | self.llm_model
 
         async_runner = RunnableParallel(**chains_dict)
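This node, like the generate-answer nodes further down, builds one chain per document chunk under the keys chunk1, chunk2, ... and fans them out with LangChain's RunnableParallel. A minimal hedged sketch of that fan-out pattern; the prompt text, chunk contents, and the ChatOllama stand-in for the node's llm_model are assumptions, not code from this commit:

from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableParallel
from langchain_ollama import ChatOllama

llm = ChatOllama(model="llama3")  # stand-in for whatever llm_model is configured to be

chunks = ["first chunk of the page", "second chunk of the page"]
chains_dict = {}
for i, chunk in enumerate(chunks):
    prompt = PromptTemplate(
        template="Summarize this content:\n{content}",
        input_variables=[],
        partial_variables={"content": chunk},
    )
    # Same naming scheme as the diff: chunk1, chunk2, ...
    chains_dict[f"chunk{i + 1}"] = prompt | llm

# RunnableParallel runs every chain and returns a dict keyed by chain name.
async_runner = RunnableParallel(**chains_dict)
results = async_runner.invoke({})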

Diff for: scrapegraphai/nodes/generate_answer_csv_node.py

+1-2
@@ -96,7 +96,6 @@ def execute(self, state):
         doc = input_data[1]
 
         if self.node_config.get("schema", None) is not None:
-
             if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)):
                 self.llm_model = self.llm_model.with_structured_output(
                     schema=self.node_config["schema"]
@@ -151,7 +150,7 @@ def execute(self, state):
                 },
             )
 
-            chain_name = f"chunk{i+1}"
+            chain_name = f"chunk{i + 1}"
             chains_dict[chain_name] = prompt | self.llm_model | output_parser
 
         async_runner = RunnableParallel(**chains_dict)

Diff for: scrapegraphai/nodes/generate_answer_from_image_node.py

+1-1
@@ -85,7 +85,7 @@ async def execute_async(self, state: dict) -> dict:
             raise ValueError(
                 f"""The model provided
                 is not supported. Supported models are:
-                {', '.join(supported_models)}."""
+                {", ".join(supported_models)}."""
             )
 
         api_key = self.node_config.get("config", {}).get("llm", {}).get("api_key", "")

Diff for: scrapegraphai/nodes/generate_answer_node.py

+1-1
@@ -221,7 +221,7 @@ def execute(self, state: dict) -> dict:
                     "format_instructions": format_instructions,
                 },
             )
-            chain_name = f"chunk{i+1}"
+            chain_name = f"chunk{i + 1}"
            chains_dict[chain_name] = prompt | self.llm_model
             if output_parser:
                 chains_dict[chain_name] = chains_dict[chain_name] | output_parser

Diff for: scrapegraphai/nodes/generate_answer_node_k_level.py

+1-1
@@ -155,7 +155,7 @@ def execute(self, state: dict) -> dict:
                     "chunk_id": i + 1,
                 },
             )
-            chain_name = f"chunk{i+1}"
+            chain_name = f"chunk{i + 1}"
             chains_dict[chain_name] = prompt | self.llm_model
 
         async_runner = RunnableParallel(**chains_dict)

Diff for: scrapegraphai/nodes/generate_answer_omni_node.py

+1-2
@@ -89,7 +89,6 @@ def execute(self, state: dict) -> dict:
         imag_desc = input_data[2]
 
         if self.node_config.get("schema", None) is not None:
-
             if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)):
                 self.llm_model = self.llm_model.with_structured_output(
                     schema=self.node_config["schema"]
@@ -151,7 +150,7 @@ def execute(self, state: dict) -> dict:
                 },
             )
 
-            chain_name = f"chunk{i+1}"
+            chain_name = f"chunk{i + 1}"
             chains_dict[chain_name] = prompt | self.llm_model | output_parser
 
         async_runner = RunnableParallel(**chains_dict)

Diff for: scrapegraphai/nodes/merge_answers_node.py

+1-2
@@ -82,10 +82,9 @@ def execute(self, state: dict) -> dict:
 
         answers_str = ""
         for i, answer in enumerate(answers):
-            answers_str += f"CONTENT WEBSITE {i+1}: {answer}\n"
+            answers_str += f"CONTENT WEBSITE {i + 1}: {answer}\n"
 
         if self.node_config.get("schema", None) is not None:
-
             if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)):
                 self.llm_model = self.llm_model.with_structured_output(
                     schema=self.node_config["schema"]

Diff for: scrapegraphai/nodes/merge_generated_scripts_node.py

+1-1
@@ -64,7 +64,7 @@ def execute(self, state: dict) -> dict:
         scripts_str = ""
         for i, script in enumerate(scripts):
             scripts_str += "-----------------------------------\n"
-            scripts_str += f"SCRIPT URL {i+1}\n"
+            scripts_str += f"SCRIPT URL {i + 1}\n"
             scripts_str += "-----------------------------------\n"
             scripts_str += script

Diff for: scrapegraphai/nodes/parse_node.py

+1-1
@@ -122,7 +122,7 @@ def execute(self, state: dict) -> dict:
         state.update({self.output[0]: chunks})
         state.update({"parsed_doc": chunks})
         state.update({"content": chunks})
-        
+
         if self.parse_urls:
             state.update({self.output[1]: link_urls})
             state.update({self.output[2]: img_urls})

Diff for: scrapegraphai/nodes/search_link_node.py

-1
@@ -122,7 +122,6 @@ def execute(self, state: dict) -> dict:
             )
         ):
             try:
-
                 links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content))
 
                 if not self.filter_links:

Diff for: scrapegraphai/utils/__init__.py

+1-1
@@ -1,5 +1,5 @@
 """
-__init__.py file for utils folder 
+__init__.py file for utils folder
 """
 
 from .cleanup_code import extract_code
