diff --git a/libs/agno/agno/embedder/google.py b/libs/agno/agno/embedder/google.py index 72bc73eba..2922ddd22 100644 --- a/libs/agno/agno/embedder/google.py +++ b/libs/agno/agno/embedder/google.py @@ -45,7 +45,12 @@ def client(self): return self.gemini_client def _response(self, text: str) -> EmbedContentResponse: - _request_params: Dict[str, Any] = {"contents": text, "model": self.id, "config": {}} + # If a user provides a model id with the `models/` prefix, we need to remove it + _id = self.id + if _id.startswith("models/"): + _id = _id.split("/")[-1] + + _request_params: Dict[str, Any] = {"contents": text, "model": _id, "config": {}} if self.dimensions: _request_params["config"]["output_dimensionality"] = self.dimensions if self.task_type: diff --git a/libs/agno/agno/vectordb/chroma/chromadb.py b/libs/agno/agno/vectordb/chroma/chromadb.py index 6853a72d0..988dbe79c 100644 --- a/libs/agno/agno/vectordb/chroma/chromadb.py +++ b/libs/agno/agno/vectordb/chroma/chromadb.py @@ -90,14 +90,20 @@ def doc_exists(self, document: Document) -> bool: Returns: bool: True if document exists, False otherwise. """ - if self.client: - try: - collection: Collection = self.client.get_collection(name=self.collection_name) - collection_data: GetResult = collection.get(include=[IncludeEnum.documents]) - if collection_data.get("documents") != []: - return True - except Exception as e: - logger.error(f"Document does not exist: {e}") + if not self.client: + logger.warning("Client not initialized") + return False + + try: + collection: Collection = self.client.get_collection(name=self.collection_name) + collection_data: GetResult = collection.get(include=[IncludeEnum.documents]) + existing_documents = collection_data.get("documents", []) + cleaned_content = document.content.replace("\x00", "\ufffd") + if cleaned_content in existing_documents: # type: ignore + return True + except Exception as e: + logger.error(f"Document does not exist: {e}") + return False def name_exists(self, name: str) -> bool: @@ -217,7 +223,7 @@ def search(self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] = metadata = result.get("metadatas", [{}])[0] # type: ignore documents = result.get("documents", [[]])[0] # type: ignore embeddings = result.get("embeddings")[0] # type: ignore - embeddings = [e.tolist() if hasattr(e, "tolist") else e for e in embeddings] + embeddings = [e.tolist() if hasattr(e, "tolist") else e for e in embeddings] # type: ignore distances = result.get("distances", [[]])[0] # type: ignore for idx, distance in enumerate(distances): diff --git a/libs/agno/tests/unit/vectordb/test_chromadb.py b/libs/agno/tests/unit/vectordb/test_chromadb.py index 31b0a5abc..c9a83a67c 100644 --- a/libs/agno/tests/unit/vectordb/test_chromadb.py +++ b/libs/agno/tests/unit/vectordb/test_chromadb.py @@ -163,3 +163,21 @@ def test_custom_embedder(mock_embedder): finally: if os.path.exists(TEST_PATH): shutil.rmtree(TEST_PATH) + + +def test_multiple_document_operations(chroma_db, sample_documents): + """Test multiple document operations including batch inserts""" + # Test batch insert + first_batch = sample_documents[:2] + chroma_db.insert(first_batch) + assert chroma_db.get_count() == 2 + + # Test adding another document + second_batch = [sample_documents[2]] + chroma_db.insert(second_batch) + assert chroma_db.get_count() == 3 + + # Verify all documents are searchable + curry_results = chroma_db.search("curry", limit=1) + assert len(curry_results) == 1 + assert "curry" in curry_results[0].content.lower()