Skip to content

Adding support for Azure OpenAI #73

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/openparse/processing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
NoOpIngestionPipeline,
SemanticIngestionPipeline,
)
from .semantic_transforms import CombineNodesSemantically, OpenAIEmbeddings
from .semantic_transforms import CombineNodesSemantically, OpenAIEmbeddings, AzureOpenAIEmbeddings

__all__ = [
"ProcessingStep",
Expand All @@ -33,4 +33,5 @@
"RemoveNodesBelowNTokens",
"CombineNodesSemantically",
"OpenAIEmbeddings",
"AzureOpenAIEmbeddings",
]
17 changes: 15 additions & 2 deletions src/openparse/processing/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
CombineNodesSemantically,
EmbeddingModel,
OpenAIEmbeddings,
AzureOpenAIEmbeddings
)
from openparse.schemas import Node

Expand Down Expand Up @@ -97,12 +98,24 @@ class SemanticIngestionPipeline(IngestionPipeline):

def __init__(
self,
openai_api_key: str,
api_key: str,
api_endpoint: str,
deployment: str,
api_version: str = "2024-02-15-preview",
model: EmbeddingModel = "text-embedding-3-large",
min_tokens: int = consts.TOKENIZATION_LOWER_LIMIT,
max_tokens: int = consts.TOKENIZATION_UPPER_LIMIT,
) -> None:
embedding_client = OpenAIEmbeddings(api_key=openai_api_key, model=model)
# if an api endpoint is provided, use AzureOpenAIEmbeddings
if api_endpoint is not None:
embedding_client = AzureOpenAIEmbeddings(
api_key=api_key,
api_endpoint=api_endpoint,
deployment=deployment,
api_version=api_version
)
else:
embedding_client = OpenAIEmbeddings(api_key=api_key, model=model)

self.transformations = [
RemoveTextInsideTables(),
Expand Down
79 changes: 77 additions & 2 deletions src/openparse/processing/semantic_transforms.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from abc import ABC, abstractmethod
from typing import List, Literal, Union

import numpy as np
Expand All @@ -14,10 +15,27 @@
def cosine_similarity(
    a: Union[np.ndarray, List[float]], b: Union[np.ndarray, List[float]]
) -> float:
    """
    Calculate the cosine similarity between two vectors.

    Cosine similarity measures the cosine of the angle between two vectors
    of an inner product space: the dot product divided by the product of
    the two Euclidean norms.

    Parameters:
        a (Union[np.ndarray, List[float]]): The first vector.
        b (Union[np.ndarray, List[float]]): The second vector.

    Returns:
        float: The cosine similarity between vector `a` and vector `b`. The
        value ranges from -1 (exactly opposite) to 1 (exactly the same
        direction), with 0 indicating orthogonality. If either vector has
        zero magnitude the similarity is mathematically undefined; 0.0 is
        returned in that case instead of NaN.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # A zero-length vector has no direction. Dividing by zero here would
        # yield NaN (plus a RuntimeWarning), so report "no similarity".
        return 0.0
    return np.dot(a, b) / denominator


class OpenAIEmbeddings:
class BaseEmbeddings(ABC):
    """Abstract base class for embedding clients."""

    @abstractmethod
    def embed_many(self, texts: List[str]) -> List[List[float]]:
        """Return a list of embeddings (lists of floats) for ``texts``."""

class OpenAIEmbeddings(BaseEmbeddings):
def __init__(
self,
model: EmbeddingModel,
Expand Down Expand Up @@ -68,15 +86,72 @@ def _create_client(self):
) from err
return OpenAI(api_key=self.api_key)

class AzureOpenAIEmbeddings(BaseEmbeddings):
    """Embedding client that calls an Azure OpenAI deployment."""

    def __init__(
        self,
        api_key: str,
        api_endpoint: str,
        deployment: str,
        api_version: str = "2024-02-15-preview",
        batch_size: int = 256,
    ):
        """
        Used to generate embeddings for Nodes.

        Args:
            api_key (str): Your Azure OpenAI API key.
            api_endpoint (str): The Azure endpoint to use.
            deployment (str): The deployment to use.
            api_version (str): The version of the API to use.
            batch_size (int): The number of texts to process in each api call.
                Must be at least 1.

        Raises:
            ValueError: If ``batch_size`` is less than 1.
            ImportError: If the ``openai`` package is not installed.
        """
        # Fail fast: a zero batch size would only surface later as an opaque
        # range() error, and a negative one would silently embed nothing.
        if batch_size < 1:
            raise ValueError(
                f"batch_size must be a positive integer, got {batch_size!r}"
            )

        self.api_key = api_key
        self.api_endpoint = api_endpoint
        self.api_version = api_version
        self.deployment = deployment
        self.batch_size = batch_size
        self.client = self._create_client()

    def embed_many(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for a list of texts in batches.

        The texts are sent to the API in slices of ``self.batch_size``.

        Args:
            texts (list[str]): The list of texts to embed.

        Returns:
            List[List[float]]: A list of embeddings.
        """
        res: List[List[float]] = []
        for i in range(0, len(texts), self.batch_size):
            batch_texts = texts[i : i + self.batch_size]
            api_resp = self.client.embeddings.create(
                input=batch_texts, model=self.deployment
            )
            res.extend(val.embedding for val in api_resp.data)

        return res

    def _create_client(self):
        """
        Build the Azure OpenAI client from the stored connection settings.

        The ``openai`` import is deferred to this call so the package is only
        required when this class is actually instantiated.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        try:
            from openai import AzureOpenAI
        except ImportError as err:
            raise ImportError(
                "You need to install the openai package to use this feature."
            ) from err
        return AzureOpenAI(
            api_key=self.api_key,
            azure_endpoint=self.api_endpoint,
            azure_deployment=self.deployment,
            api_version=self.api_version,
        )

class CombineNodesSemantically(ProcessingStep):
"""
Combines nodes that are semantically related.
"""

def __init__(
self,
embedding_client: OpenAIEmbeddings,
embedding_client: BaseEmbeddings,
min_similarity: float,
max_tokens: int,
):
Expand Down