Unstructured-IO · krish-adi · Oct 6, 2024 · Oct 6, 2024 · Oct 6, 2024 · Oct 11, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## 0.0.25
+
+### Features
+
+* **Add Ollama embedder** Adds support for creating embedding vectors using Ollama
+
 ## 0.0.24
 
 ### Enhancements

diff --git a/requirements/embed/ollama.in b/requirements/embed/ollama.in
@@ -0,0 +1,3 @@
+-c ../common/constraints.txt
+
+ollama
diff --git a/requirements/embed/ollama.txt b/requirements/embed/ollama.txt
@@ -0,0 +1,28 @@
+# This file was autogenerated by uv via the following command:
+#    uv pip compile ./embed/ollama.in --output-file ./embed/ollama.txt --no-strip-extras --python-version 3.9
+anyio==4.6.0
+    # via httpx
+certifi==2024.8.30
+    # via
+    #   httpcore
+    #   httpx
+exceptiongroup==1.2.2
+    # via anyio
+h11==0.14.0
+    # via httpcore
+httpcore==1.0.6
+    # via httpx
+httpx==0.27.2
+    # via ollama
+idna==3.10
+    # via
+    #   anyio
+    #   httpx
+ollama==0.3.3
+    # via -r ./embed/ollama.in
+sniffio==1.3.1
+    # via
+    #   anyio
+    #   httpx
+typing-extensions==4.12.2
+    # via anyio
diff --git a/setup.py b/setup.py
@@ -128,6 +128,7 @@ def load_requirements(file: Union[str, Path]) -> List[str]:
 
 embed_reqs = {
     "embed-huggingface": load_requirements("requirements/embed/huggingface.in"),
+    "embed-ollama": load_requirements("requirements/embed/ollama.in"),
     "embed-octoai": load_requirements("requirements/embed/octoai.in"),
     "embed-vertexai": load_requirements("requirements/embed/vertexai.in"),
     "embed-voyageai": load_requirements("requirements/embed/voyageai.in"),

diff --git a/test/embed/test_ollama.py b/test/embed/test_ollama.py
@@ -0,0 +1,22 @@
+from unstructured_ingest.embed.ollama import OllamaEmbeddingConfig, OllamaEmbeddingEncoder
+
+
+def test_embed_documents_does_not_break_element_to_dict(mocker):
+    # Mocked client with the desired behavior for embed_documents
+    mock_response = mocker.MagicMock()
+    mocker.patch.object(mock_response, "embeddings", [1, 2])
+    mock_client = mocker.MagicMock()
+    mock_client.embed.return_value = mock_response
+
+    # Mock get_client to return our mock_client
+    mocker.patch.object(OllamaEmbeddingConfig, "get_client", return_value=mock_client)
+
+    encoder = OllamaEmbeddingEncoder(config=OllamaEmbeddingConfig(model_name="all-minilm"))
+    raw_elements = [{"text": f"This is sentence {i+1}"} for i in range(2)]
+
+    elements = encoder.embed_documents(
+        elements=raw_elements,
+    )
+    assert len(elements) == 2
+    assert elements[0]["text"] == "This is sentence 1"
+    assert elements[1]["text"] == "This is sentence 2"
diff --git a/test_e2e/src/local-embed-ollama.sh b/test_e2e/src/local-embed-ollama.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+# End-to-end check: run the local source connector with the "ollama" embedding
+# provider (model "all-minilm") and compare the result with the expected output.
+
+set -e
+
+# SRC_PATH is the directory holding this script; SCRIPT_DIR is its parent.
+# The run happens from the directory one level above SCRIPT_DIR.
+SRC_PATH=$(dirname "$(realpath "$0")")
+SCRIPT_DIR=$(dirname "$SRC_PATH")
+cd "$SCRIPT_DIR"/.. || exit 1
+OUTPUT_FOLDER_NAME=embed-ollama
+OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
+OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
+WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
+# Defaults MAX_PROCESSES to the CPU count reported by Python when it is unset.
+max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
+
+# cleanup_dir is expected to come from the sourced cleanup.sh.
+# shellcheck disable=SC1091
+source "$SCRIPT_DIR"/cleanup.sh
+function cleanup() {
+  cleanup_dir "$OUTPUT_DIR"
+  cleanup_dir "$WORK_DIR"
+}
+# Remove the output and work directories on every exit path, including failures.
+trap cleanup EXIT
+
+RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
+PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
+  local \
+  --api-key "$UNS_PAID_API_KEY" \
+  --partition-by-api \
+  --partition-endpoint "https://api.unstructuredapp.io" \
+  --num-processes "$max_processes" \
+  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
+  --output-dir "$OUTPUT_DIR" \
+  --verbose \
+  --reprocess \
+  --input-path example-docs/book-war-and-peace-1p.txt \
+  --work-dir "$WORK_DIR" \
+  --embedding-provider "ollama" \
+  --embedding-model-name "all-minilm"
+
+# Quoted so the folder name is always passed as a single argument.
+"$SCRIPT_DIR"/check-diff-expected-output.py --output-folder-name "$OUTPUT_FOLDER_NAME"
diff --git a/test_e2e/test-src.sh b/test_e2e/test-src.sh
@@ -63,6 +63,7 @@ all_tests=(
   'hubspot.sh'
   'local-embed.sh'
   'local-embed-bedrock.sh'
+  'local-embed-ollama.sh'
   # NOTE (yao): octoai url is giving 404
   # 'local-embed-octoai.sh'
   'local-embed-vertexai.sh'

diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.0.24"  # pragma: no cover
+__version__ = "0.0.25"  # pragma: no cover
diff --git a/unstructured_ingest/cli/interfaces.py b/unstructured_ingest/cli/interfaces.py
@@ -417,6 +417,7 @@ def get_cli_options() -> t.List[click.Option]:
         embed_providers = [
             "openai",
             "huggingface",
+            "ollama",
             "aws-bedrock",
             "vertexai",
             "voyageai",

diff --git a/unstructured_ingest/embed/ollama.py b/unstructured_ingest/embed/ollama.py
@@ -0,0 +1,62 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
+import numpy as np
+from pydantic import Field
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from ollama import embed as OllamaClient
+
+
+class OllamaEmbeddingConfig(EmbeddingConfig):
+    embedder_model_name: Optional[str] = Field(default="all-minilm", alias="model_name")
+
+    @requires_dependencies(
+        ["ollama"],
+        extras="embed-ollama",
+    )
+    def get_client(self) -> "OllamaClient":
+        from ollama import embed as OllamaClient
+
+        return OllamaClient
+
+
+@dataclass
+class OllamaEmbeddingEncoder(BaseEmbeddingEncoder):
+    config: OllamaEmbeddingConfig
+
+    def get_exemplary_embedding(self) -> list[float]:
+        return self.embed_query(query="Q")
+
+    def num_of_dimensions(self) -> tuple[int, ...]:
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    def is_unit_vector(self) -> bool:
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+    def embed_query(self, query: str) -> list[float]:
+        return self._embed_documents(texts=[query])[0]
+
+    def _embed_documents(self, texts: list[str]) -> list[list[float]]:
+        client = self.config.get_client()
+        _r = client(model=self.config.embedder_model_name, input=texts)
+        return _r["embeddings"]
+
+    def embed_documents(self, elements: list[dict]) -> list[dict]:
+        embeddings = self._embed_documents([e.get("text", "") for e in elements])
+        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
+        return elements_with_embeddings
+
+    def _add_embeddings_to_elements(self, elements: list[dict], embeddings: list) -> list[dict]:
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements
diff --git a/unstructured_ingest/interfaces.py b/unstructured_ingest/interfaces.py
@@ -218,6 +218,13 @@ def get_embedder(self) -> "BaseEmbeddingEncoder":
             )
 
             return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs))
+        elif self.provider == "ollama":
+            from unstructured_ingest.embed.ollama import (
+                OllamaEmbeddingConfig,
+                OllamaEmbeddingEncoder,
+            )
+
+            return OllamaEmbeddingEncoder(config=OllamaEmbeddingConfig(**kwargs))
         elif self.provider == "octoai":
             from unstructured_ingest.embed.octoai import (
                 OctoAiEmbeddingConfig,

diff --git a/unstructured_ingest/v2/processes/embedder.py b/unstructured_ingest/v2/processes/embedder.py
@@ -17,6 +17,7 @@ class EmbedderConfig(BaseModel):
         Literal[
             "openai",
             "huggingface",
+            "ollama",
             "aws-bedrock",
             "vertexai",
             "voyageai",
@@ -53,6 +54,11 @@ def get_huggingface_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEnco
             config=HuggingFaceEmbeddingConfig.model_validate(embedding_kwargs)
         )
 
+    def get_ollama_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
+        from unstructured_ingest.embed.ollama import OllamaEmbeddingConfig, OllamaEmbeddingEncoder
+
+        return OllamaEmbeddingEncoder(config=OllamaEmbeddingConfig.model_validate(embedding_kwargs))
+
     def get_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
         from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
 
@@ -120,6 +126,9 @@ def get_embedder(self) -> "BaseEmbeddingEncoder":
         if self.embedding_provider == "huggingface":
             return self.get_huggingface_embedder(embedding_kwargs=kwargs)
 
+        if self.embedding_provider == "ollama":
+            return self.get_ollama_embedder(embedding_kwargs=kwargs)
+
         if self.embedding_provider == "octoai":
             return self.get_octoai_embedder(embedding_kwargs=kwargs)
 
@@ -131,6 +140,7 @@ def get_embedder(self) -> "BaseEmbeddingEncoder":
 
         if self.embedding_provider == "voyageai":
             return self.get_voyageai_embedder(embedding_kwargs=kwargs)
+
         if self.embedding_provider == "mixedbread-ai":
             return self.get_mixedbread_embedder(embedding_kwargs=kwargs)
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.0.24" # pragma: no cover
		__version__ = "0.0.25" # pragma: no cover