diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 1ec2d0f..88fdace 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -51,7 +51,7 @@ jobs:
         pip install --upgrade 'setuptools>=64'
 
         # Install package in editable mode.
-        pip install --use-pep517 --prefer-binary --editable='.[cli,develop,env,test]'
+        pip install --use-pep517 --prefer-binary --editable='.[all,develop,test]'
 
     - name: Run linter and software tests
       run: |
diff --git a/CHANGES.md b/CHANGES.md
index 938633b..fb49acc 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -5,3 +5,4 @@
 
 ## 2023-10-30 0.0.1
 - Initial thing
+- Add `CachedWebResource`, a wrapper around `requests-cache` and `langchain`
diff --git a/pueblo/context.py b/pueblo/context.py
new file mode 100644
index 0000000..8670e69
--- /dev/null
+++ b/pueblo/context.py
@@ -0,0 +1,4 @@
+import platformdirs
+
+pueblo_cache_path = platformdirs.user_cache_path().joinpath("pueblo")
+pueblo_cache_path.mkdir(parents=True, exist_ok=True)
diff --git a/pueblo/nlp/__init__.py b/pueblo/nlp/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pueblo/nlp/resource.py b/pueblo/nlp/resource.py
new file mode 100644
index 0000000..9aec214
--- /dev/null
+++ b/pueblo/nlp/resource.py
@@ -0,0 +1,67 @@
+import logging
+import typing as t
+
+import requests_cache
+
+from pueblo.context import pueblo_cache_path
+
+if t.TYPE_CHECKING:
+    from langchain.schema import Document
+
+http_cache_file = pueblo_cache_path / ".httpcache.sqlite"
+http = requests_cache.CachedSession(str(http_cache_file))
+
+logger = logging.getLogger(__name__)
+
+
+class CachedWebResource:
+    """
+    A basic wrapper around `requests-cache` and `langchain`.
+    """
+
+    def __init__(self, url: str):
+        logger.info(f"Using web cache: {http_cache_file}")
+        self.url = url
+
+    def fetch_single(self) -> t.List["Document"]:
+        return [self.document_from_url()]
+
+    @staticmethod
+    def fetch_multi(urls) -> t.List["Document"]:
+        from langchain.document_loaders import UnstructuredURLLoader
+
+        loader = UnstructuredURLLoader(urls=urls)
+        return loader.load()
+
+    def document_from_url(self) -> "Document":
+        """
+        Converge URL resource into LangChain Document.
+        """
+        logger.info(f"Acquiring web resource: {self.url}")
+        from langchain.schema import Document
+        from unstructured.partition.html import partition_html
+
+        response = http.get(self.url)
+        elements = partition_html(text=response.text)
+        text = "\n\n".join([str(el) for el in elements])
+        metadata = {"source": self.url}
+        return Document(page_content=text, metadata=metadata)
+
+    def langchain_documents(self, **kwargs) -> t.List["Document"]:
+        """
+        Load URL resource, and split paragraphs in response into individual documents.
+        """
+        from langchain.text_splitter import CharacterTextSplitter
+
+        documents = self.fetch_single()
+        text_splitter = CharacterTextSplitter(**kwargs)
+        return text_splitter.split_documents(documents)
+
+
+if __name__ == "__main__":
+    from pueblo import setup_logging
+
+    setup_logging()
+    url = "https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt"
+    docs = CachedWebResource(url).langchain_documents(chunk_size=1000, chunk_overlap=0)
+    print("docs:", docs)  # noqa: T201
diff --git a/pyproject.toml b/pyproject.toml
index 0ece434..a0a8ae3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,9 +62,13 @@ dynamic = [
 ]
 
 dependencies = [
+    "platformdirs<4",
 ]
 
 [project.optional-dependencies]
+all = [
+    "pueblo[cli,env,nlp,web]",
+]
 cli = [
     "click<9",
     "click-aliases<2",
@@ -81,6 +85,10 @@ develop = [
 env = [
     "python-dotenv<2",
 ]
+nlp = [
+    "langchain==0.0.325",
+    "unstructured<0.11",
+]
 release = [
     "build<2",
     "twine<5",
@@ -91,6 +99,9 @@ test = [
     "pytest-cov<5",
     "pytest-mock<4",
 ]
+web = [
+    "requests-cache<2",
+]
 [project.scripts]
 pueblo = "pueblo.cli:cli"
 [tool.setuptools]
diff --git a/tests/test_web.py b/tests/test_web.py
new file mode 100644
index 0000000..90b74d6
--- /dev/null
+++ b/tests/test_web.py
@@ -0,0 +1,11 @@
+from pueblo.nlp.resource import CachedWebResource
+
+
+def test_cached_web_resource():
+    url = "https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt"
+    docs = CachedWebResource(url).langchain_documents(chunk_size=1000, chunk_overlap=0)
+    assert len(docs) == 42
+
+    from langchain.schema import Document
+
+    assert isinstance(docs[0], Document)