From d579ab617a544c55bc2d8abbc9d0bac2dd75bb88 Mon Sep 17 00:00:00 2001 From: ErfanMhi Date: Fri, 12 Apr 2024 10:31:46 -0600 Subject: [PATCH] fix: resolve the issue with the sync scraper. --- src/crawler/docs_scraper.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/crawler/docs_scraper.py b/src/crawler/docs_scraper.py index 01f0ac7..e635bac 100644 --- a/src/crawler/docs_scraper.py +++ b/src/crawler/docs_scraper.py @@ -344,6 +344,9 @@ def scrape(self, limit_tokens: bool = False): content.append(page_content) if char_count >= MAX_OVERVIEW_TOKENS and limit_tokens: return '\n\n'.join(content) + + code_blocks = soup.find_all('pre') + print('code_blocks:', code_blocks) # Find all links and add them to the queue if not visited for link in soup.find_all('a', href=True): @@ -360,7 +363,10 @@ def scrape(self, limit_tokens: bool = False): if limit_tokens: return '\n\n'.join(content) - return content + return { + 'content': content, + 'code': [] + } def get_doc_data(library: str, language: Optional[str]) -> dict: