Skip to content

Commit

Permalink
fix: the issue with sync scraper resolved.
Browse files Browse the repository at this point in the history
  • Loading branch information
erfanMhi committed Apr 12, 2024
1 parent 28abd6f commit d579ab6
Showing 1 changed file with 7 additions and 1 deletion.
8 changes: 7 additions & 1 deletion src/crawler/docs_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,9 @@ def scrape(self, limit_tokens: bool = False):
content.append(page_content)
if char_count >= MAX_OVERVIEW_TOKENS and limit_tokens:
return '\n\n'.join(content)

code_blocks = soup.find_all('pre')
print('code_blocks:', code_blocks)

# Find all links and add them to the queue if not visited
for link in soup.find_all('a', href=True):
Expand All @@ -360,7 +363,10 @@ def scrape(self, limit_tokens: bool = False):

if limit_tokens:
return '\n\n'.join(content)
return content
return {
'content': content,
'code': []
}


def get_doc_data(library: str, language: Optional[str]) -> dict:
Expand Down

0 comments on commit d579ab6

Please sign in to comment.