Filimoa · leonardobaggio · Oct 17, 2024 · Oct 17, 2024 · Oct 17, 2024 · Oct 17, 2024
diff --git a/src/openparse/processing/__init__.py b/src/openparse/processing/__init__.py
@@ -15,7 +15,7 @@
     NoOpIngestionPipeline,
     SemanticIngestionPipeline,
 )
-from .semantic_transforms import CombineNodesSemantically, OpenAIEmbeddings
+from .semantic_transforms import CombineNodesSemantically, OpenAIEmbeddings, AzureOpenAIEmbeddings
 
 __all__ = [
     "ProcessingStep",
@@ -33,4 +33,5 @@
     "RemoveNodesBelowNTokens",
     "CombineNodesSemantically",
     "OpenAIEmbeddings",
+    "AzureOpenAIEmbeddings",
 ]
diff --git a/src/openparse/processing/ingest.py b/src/openparse/processing/ingest.py
@@ -17,6 +17,7 @@
     CombineNodesSemantically,
     EmbeddingModel,
     OpenAIEmbeddings,
+    AzureOpenAIEmbeddings
 )
 from openparse.schemas import Node
 
@@ -97,12 +98,47 @@ class SemanticIngestionPipeline(IngestionPipeline):
 
     def __init__(
         self,
         openai_api_key: str,
         model: EmbeddingModel = "text-embedding-3-large",
         min_tokens: int = consts.TOKENIZATION_LOWER_LIMIT,
         max_tokens: int = consts.TOKENIZATION_UPPER_LIMIT,
+        api_endpoint: str = "",
+        deployment: str = "",
+        api_version: str = "2024-02-15-preview",
     ) -> None:
-        embedding_client = OpenAIEmbeddings(api_key=openai_api_key, model=model)
+        """
+        Build the pipeline around an OpenAI or Azure OpenAI embedding client.
+
+        Args:
+            openai_api_key (str): API key for OpenAI, or for Azure OpenAI when
+                `api_endpoint` is given.
+            model (EmbeddingModel): Embedding model for the OpenAI client; it is
+                not passed to the Azure client.
+            min_tokens (int): Lower token limit.
+            max_tokens (int): Upper token limit.
+            api_endpoint (str): Azure OpenAI endpoint. When non-empty, Azure
+                embeddings are used instead of OpenAI embeddings.
+            deployment (str): Azure deployment name; required with `api_endpoint`.
+            api_version (str): Azure OpenAI API version.
+
+        Raises:
+            ValueError: If `api_endpoint` is given without `deployment`.
+        """
+        # The Azure settings are optional and trail the original parameters so
+        # existing OpenAI callers (positional or keyword) keep working.
+        if api_endpoint:
+            if not deployment:
+                raise ValueError(
+                    "`deployment` is required when `api_endpoint` is provided."
+                )
+            embedding_client = AzureOpenAIEmbeddings(
+                api_key=openai_api_key,
+                api_endpoint=api_endpoint,
+                deployment=deployment,
+                api_version=api_version,
+            )
+        else:
+            embedding_client = OpenAIEmbeddings(api_key=openai_api_key, model=model)
 
         self.transformations = [
             RemoveTextInsideTables(),

diff --git a/src/openparse/processing/semantic_transforms.py b/src/openparse/processing/semantic_transforms.py
@@ -1,3 +1,4 @@
+from abc import ABC, abstractmethod
 from typing import List, Literal, Union
 
 import numpy as np
@@ -14,10 +15,27 @@
 def cosine_similarity(
     a: Union[np.ndarray, List[float]], b: Union[np.ndarray, List[float]]
 ) -> float:
+    """
+    Calculate the cosine similarity between two vectors.
+
+    Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them.
+
+    Parameters:
+    a (Union[np.ndarray, List[float]]): The first vector.
+    b (Union[np.ndarray, List[float]]): The second vector.
+
+    Returns:
+    float: The cosine similarity between vector `a` and vector `b`. The value ranges from -1 meaning exactly opposite, to 1 meaning exactly the same, with 0 usually indicating orthogonality (independence), and in-between values indicating intermediate similarity or dissimilarity.
+    """
     return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
 
 
-class OpenAIEmbeddings:
+class BaseEmbeddings(ABC):
+    @abstractmethod
+    def embed_many(self, texts: List[str]) -> List[List[float]]:
+        pass
+
+class OpenAIEmbeddings(BaseEmbeddings):
     def __init__(
         self,
         model: EmbeddingModel,
@@ -68,15 +86,72 @@ def _create_client(self):
             ) from err
         return OpenAI(api_key=self.api_key)
 
+class AzureOpenAIEmbeddings(BaseEmbeddings):
+    def __init__(
+        self,
+        api_key: str,
+        api_endpoint: str,
+        deployment: str,
+        api_version: str = "2024-02-15-preview",
+        batch_size: int = 256,
+    ):
+        """
+        Used to generate embeddings for Nodes.
+
+        Args:
+            api_key (str): Your Azure OpenAI API key.
+            api_endpoint (str): The Azure endpoint to use.
+            deployment (str): The Azure deployment to use; its name is also
+                sent as the `model` of every embeddings request.
+            api_version (str): The version of the API to use.
+            batch_size (int): The number of texts to process in each API call.
+        """
+        self.api_key = api_key
+        self.api_endpoint = api_endpoint
+        self.api_version = api_version
+        self.deployment = deployment
+        self.batch_size = batch_size
+        self.client = self._create_client()
 
+    def embed_many(self, texts: List[str]) -> List[List[float]]:
+        """
+        Generate embeddings for a list of texts in batches.
+
+        Args:
+            texts (list[str]): The list of texts to embed; they are sent to
+                the API in chunks of at most `self.batch_size` texts.
+
+        Returns:
+            List[List[float]]: A list of embeddings.
+        """
+        res = []
+        for i in range(0, len(texts), self.batch_size):
+            batch_texts = texts[i : i + self.batch_size]
+            api_resp = self.client.embeddings.create(
+                input=batch_texts, model=self.deployment
+            )
+            batch_res = [val.embedding for val in api_resp.data]
+            res.extend(batch_res)
+
+        return res
+
+    def _create_client(self):
+        try:
+            from openai import AzureOpenAI
+        except ImportError as err:
+            raise ImportError(
+                "You need to install the openai package to use this feature."
+            ) from err
+        return AzureOpenAI(api_key=self.api_key, azure_endpoint=self.api_endpoint, azure_deployment=self.deployment, api_version=self.api_version)
+
 class CombineNodesSemantically(ProcessingStep):
     """
     Combines nodes that are semantically related.
     """
 
     def __init__(
         self,
-        embedding_client: OpenAIEmbeddings,
+        embedding_client: BaseEmbeddings,
         min_similarity: float,
         max_tokens: int,
     ):