fix: resolve the issue with the sync scraper.

intractai · Apr 12, 2024 · d579ab6 · d579ab6
1 parent 28abd6f
commit d579ab6
Showing 1 changed file with 7 additions and 1 deletion.
diff --git a/src/crawler/docs_scraper.py b/src/crawler/docs_scraper.py
@@ -344,6 +344,9 @@ def scrape(self, limit_tokens: bool = False):
                 content.append(page_content)
                 if char_count >= MAX_OVERVIEW_TOKENS and limit_tokens:
                     return '\n\n'.join(content)
+
+                code_blocks = soup.find_all('pre')
+                print('code_blocks:', code_blocks)
 
                 # Find all links and add them to the queue if not visited
                 for link in soup.find_all('a', href=True):
@@ -360,7 +363,10 @@ def scrape(self, limit_tokens: bool = False):
 
         if limit_tokens:
             return '\n\n'.join(content)
-        return content
+        return {
+            'content': content,
+            'code': []
+        }
 
 
 def get_doc_data(library: str, language: Optional[str]) -> dict: