Skip to content

Commit

Permalink
Add CachedWebResource, a wrapper around requests-cache and langchain
Browse files Browse the repository at this point in the history
  • Loading branch information
amotl committed Oct 30, 2023
1 parent 6a890c4 commit ca19262
Show file tree
Hide file tree
Showing 7 changed files with 95 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ jobs:
pip install --upgrade 'setuptools>=64'
# Install package in editable mode.
pip install --use-pep517 --prefer-binary --editable='.[cli,develop,env,test]'
pip install --use-pep517 --prefer-binary --editable='.[all,develop,test]'
- name: Run linter and software tests
run: |
Expand Down
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@

## 2023-10-30 0.0.1
- Initial thing
- Add `CachedWebResource`, a wrapper around `requests-cache` and `langchain`
4 changes: 4 additions & 0 deletions pueblo/context.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import platformdirs

# Per-user cache directory for pueblo; created at import time when missing.
pueblo_cache_path = platformdirs.user_cache_path() / "pueblo"
pueblo_cache_path.mkdir(exist_ok=True, parents=True)
Empty file added pueblo/nlp/__init__.py
Empty file.
67 changes: 67 additions & 0 deletions pueblo/nlp/resource.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import logging
import typing as t

import requests_cache

from pueblo.context import pueblo_cache_path

if t.TYPE_CHECKING:
from langchain.schema import Document

logger = logging.getLogger(__name__)

# Cache file below the pueblo cache directory, handed to `requests-cache`.
http_cache_file = pueblo_cache_path.joinpath(".httpcache.sqlite")

# Module-wide HTTP session, shared by all `CachedWebResource` instances.
http = requests_cache.CachedSession(str(http_cache_file))


class CachedWebResource:
    """
    A basic wrapper around `requests-cache` and `langchain`.

    Acquires a web resource through the module-level `http` session,
    and converts the response into LangChain `Document` objects.
    """

    def __init__(self, url: str):
        """
        :param url: Address of the web resource to acquire.
        """
        logger.info("Using web cache: %s", http_cache_file)
        self.url = url

    def fetch_single(self) -> t.List["Document"]:
        """
        Acquire the resource at `self.url`, and return it as a one-item list.
        """
        return [self.document_from_url()]

    @staticmethod
    def fetch_multi(urls: t.List[str]) -> t.List["Document"]:
        """
        Load multiple URLs using LangChain's `UnstructuredURLLoader`.

        NOTE(review): The module-level `http` session is not handed to the
        loader, so presumably those requests are not cached -- verify.

        :param urls: Addresses of the web resources to load.
        """
        from langchain.document_loaders import UnstructuredURLLoader

        loader = UnstructuredURLLoader(urls=urls)
        return loader.load()

    def document_from_url(self) -> "Document":
        """
        Convert URL resource into LangChain Document.

        The response body is partitioned as HTML, and the string forms of
        the resulting elements are joined by blank lines into the page content.
        The URL is recorded as `source` within the document metadata.

        :raises requests.HTTPError: When the server responds with an HTTP
                                    error status (4xx/5xx).
        """
        logger.info("Acquiring web resource: %s", self.url)
        from langchain.schema import Document
        from unstructured.partition.html import partition_html

        response = http.get(self.url)
        # Fail loudly on HTTP errors, instead of silently converting
        # an error page into a document.
        response.raise_for_status()
        elements = partition_html(text=response.text)
        text = "\n\n".join(str(el) for el in elements)
        metadata = {"source": self.url}
        return Document(page_content=text, metadata=metadata)

    def langchain_documents(self, **kwargs) -> t.List["Document"]:
        """
        Load URL resource, and split paragraphs in response into individual documents.

        :param kwargs: Forwarded verbatim to LangChain's `CharacterTextSplitter`,
                       e.g. `chunk_size` and `chunk_overlap`.
        """
        from langchain.text_splitter import CharacterTextSplitter

        documents = self.fetch_single()
        text_splitter = CharacterTextSplitter(**kwargs)
        return text_splitter.split_documents(documents)


if __name__ == "__main__":
    # Ad-hoc demonstration: Acquire a text file, split it, and print the outcome.
    from pueblo import setup_logging

    setup_logging()
    resource = CachedWebResource(
        "https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt"
    )
    chunks = resource.langchain_documents(chunk_size=1000, chunk_overlap=0)
    print("docs:", chunks)  # noqa: T201
11 changes: 11 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,13 @@ dynamic = [
]

dependencies = [
"platformdirs<4",
]

[project.optional-dependencies]
all = [
"pueblo[cli,env,nlp,web]",
]
cli = [
"click<9",
"click-aliases<2",
Expand All @@ -81,6 +85,10 @@ develop = [
env = [
"python-dotenv<2",
]
nlp = [
"langchain==0.0.325",
"unstructured<0.11",
]
release = [
"build<2",
"twine<5",
Expand All @@ -91,6 +99,9 @@ test = [
"pytest-cov<5",
"pytest-mock<4",
]
web = [
"requests-cache<2",
]
[project.scripts]
pueblo = "pueblo.cli:cli"
[tool.setuptools]
Expand Down
11 changes: 11 additions & 0 deletions tests/test_web.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from pueblo.nlp.resource import CachedWebResource


def test_cached_web_resource():
    """
    Acquire a text file from the web, split it, and verify the resulting documents.
    """
    from langchain.schema import Document

    resource = CachedWebResource(
        "https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt"
    )
    chunks = resource.langchain_documents(chunk_size=1000, chunk_overlap=0)

    assert len(chunks) == 42
    assert isinstance(chunks[0], Document)

0 comments on commit ca19262

Please sign in to comment.