From ca19262d6d239aea07f9cca4755bb9ac595271f0 Mon Sep 17 00:00:00 2001
From: Andreas Motl <andreas.motl@panodata.org>
Date: Mon, 30 Oct 2023 06:34:52 +0100
Subject: [PATCH] Add `CachedWebResource`, a wrapper around requests-cache and
 langchain

---
 .github/workflows/main.yml |  2 +-
 CHANGES.md                 |  1 +
 pueblo/context.py          |  4 +++
 pueblo/nlp/__init__.py     |  0
 pueblo/nlp/resource.py     | 67 ++++++++++++++++++++++++++++++++++++++
 pyproject.toml             | 11 +++++++
 tests/test_web.py          | 11 +++++++
 7 files changed, 95 insertions(+), 1 deletion(-)
 create mode 100644 pueblo/context.py
 create mode 100644 pueblo/nlp/__init__.py
 create mode 100644 pueblo/nlp/resource.py
 create mode 100644 tests/test_web.py

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 1ec2d0f..88fdace 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -51,7 +51,7 @@ jobs:
         pip install --upgrade 'setuptools>=64'
 
         # Install package in editable mode.
-        pip install --use-pep517 --prefer-binary --editable='.[cli,develop,env,test]'
+        pip install --use-pep517 --prefer-binary --editable='.[all,develop,test]'
 
     - name: Run linter and software tests
       run: |
diff --git a/CHANGES.md b/CHANGES.md
index 938633b..fb49acc 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -5,3 +5,4 @@
 
 ## 2023-10-30 0.0.1
 - Initial thing
+- Add `CachedWebResource`, a wrapper around `requests-cache` and `langchain`
diff --git a/pueblo/context.py b/pueblo/context.py
new file mode 100644
index 0000000..8670e69
--- /dev/null
+++ b/pueblo/context.py
@@ -0,0 +1,4 @@
+import platformdirs
+
+pueblo_cache_path = platformdirs.user_cache_path() / "pueblo"  # Cache directory used by pueblo.
+pueblo_cache_path.mkdir(exist_ok=True, parents=True)  # Created when this module is imported.
diff --git a/pueblo/nlp/__init__.py b/pueblo/nlp/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pueblo/nlp/resource.py b/pueblo/nlp/resource.py
new file mode 100644
index 0000000..9aec214
--- /dev/null
+++ b/pueblo/nlp/resource.py
@@ -0,0 +1,67 @@
+import logging
+import typing as t
+
+import requests_cache
+
+from pueblo.context import pueblo_cache_path
+
+if t.TYPE_CHECKING:
+    from langchain.schema import Document
+
+# Shared HTTP session; `requests-cache` persists responses in a file below pueblo's cache directory.
+http_cache_file = pueblo_cache_path.joinpath(".httpcache.sqlite")
+http = requests_cache.CachedSession(cache_name=str(http_cache_file))
+logger = logging.getLogger(__name__)
+
+
+class CachedWebResource:
+    """
+    A basic wrapper around `requests-cache` and `langchain`.
+    """
+
+    def __init__(self, url: str):
+        logger.info("Using web cache: %s", http_cache_file)
+        self.url = url
+
+    def fetch_single(self) -> t.List["Document"]:
+        """Return the resource at `self.url` as a one-element list of documents."""
+        return [self.document_from_url()]
+
+    @staticmethod
+    def fetch_multi(urls: t.List[str]) -> t.List["Document"]:
+        """Load `urls` via `UnstructuredURLLoader`, bypassing the cached `http` session."""
+        from langchain.document_loaders import UnstructuredURLLoader
+
+        return UnstructuredURLLoader(urls=urls).load()
+
+    def document_from_url(self) -> "Document":
+        """
+        Convert URL resource into LangChain Document. Raises `requests.HTTPError` on 4xx/5xx.
+        """
+        logger.info("Acquiring web resource: %s", self.url)
+        from langchain.schema import Document
+        from unstructured.partition.html import partition_html
+
+        response = http.get(self.url)
+        response.raise_for_status()  # Fail loudly instead of parsing an HTTP error page.
+        text = "\n\n".join(str(el) for el in partition_html(text=response.text))
+        return Document(page_content=text, metadata={"source": self.url})
+
+    def langchain_documents(self, **kwargs) -> t.List["Document"]:
+        """
+        Load URL resource, and split paragraphs in response into individual documents.
+        All `kwargs` are forwarded to LangChain's `CharacterTextSplitter`.
+        """
+        from langchain.text_splitter import CharacterTextSplitter
+
+        documents = self.fetch_single()
+        return CharacterTextSplitter(**kwargs).split_documents(documents)
+
+
+if __name__ == "__main__":  # Manual smoke run: fetch, split, and print a version-pinned sample text.
+    from pueblo import setup_logging
+
+    setup_logging()
+    url = "https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt"
+    docs = CachedWebResource(url).langchain_documents(chunk_size=1000, chunk_overlap=0)
+    print("docs:", docs)  # noqa: T201
diff --git a/pyproject.toml b/pyproject.toml
index 0ece434..a0a8ae3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,9 +62,13 @@ dynamic = [
 ]
 
 dependencies = [
+  "platformdirs<4",
 ]
 
 [project.optional-dependencies]
+all = [
+  "pueblo[cli,env,nlp,web]",
+]
 cli = [
   "click<9",
   "click-aliases<2",
@@ -81,6 +85,10 @@ develop = [
 env = [
   "python-dotenv<2",
 ]
+nlp = [
+  "langchain==0.0.325",
+  "unstructured<0.11", "pueblo[web]",  # `pueblo.nlp.resource` imports `requests_cache`.
+]
 release = [
   "build<2",
   "twine<5",
@@ -91,6 +99,9 @@ test = [
   "pytest-cov<5",
   "pytest-mock<4",
 ]
+web = [
+  "requests-cache<2",
+]
 [project.scripts]
 pueblo = "pueblo.cli:cli"
 [tool.setuptools]
diff --git a/tests/test_web.py b/tests/test_web.py
new file mode 100644
index 0000000..90b74d6
--- /dev/null
+++ b/tests/test_web.py
@@ -0,0 +1,11 @@
+from pueblo.nlp.resource import CachedWebResource
+
+
+def test_cached_web_resource():
+    """Fetch a version-pinned sample text, and verify it is split into 42 LangChain documents."""
+    from langchain.schema import Document
+
+    url = "https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt"
+    chunks = CachedWebResource(url).langchain_documents(chunk_size=1000, chunk_overlap=0)
+    assert len(chunks) == 42
+    assert isinstance(chunks[0], Document)