-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchroma.py
43 lines (33 loc) · 1.3 KB
/
chroma.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import MarkdownHeaderTextSplitter
import os
from dotenv import load_dotenv
# Load variables from a .env file into the environment before any client
# below is constructed (presumably this supplies the OpenAI API key -- confirm).
load_dotenv()

# Directory passed to the Chroma vector store as its persist_directory.
# NOTE(review): this is a relative path, so it resolves against the current
# working directory rather than this script's location.
persist_directory = "portfolio_db"

# (Markdown heading marker, label) pairs handed to the splitter as
# headers_to_split_on.
headers_on_split = [
    ("##", "Header 2"),
    ("#", "Header 1"),
    ("###", "Header 3"),
    ("####", "Header 4"),
]

# strip_headers=False: per the option name, heading lines are kept in the
# chunk text instead of being removed.
markdown_splitter = MarkdownHeaderTextSplitter(
    strip_headers=False,
    headers_to_split_on=headers_on_split,
)
def process_file(file_path, splitter=None):
    """Read a Markdown file and split its text into chunks.

    Args:
        file_path: Path of the Markdown file to read.
        splitter: Optional object exposing ``split_text(text)``. When omitted,
            the module-level ``markdown_splitter`` is used.

    Returns:
        Whatever ``splitter.split_text`` returns for the file's full text.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    if splitter is None:
        splitter = markdown_splitter
    # Decode explicitly as UTF-8: without an encoding, open() falls back to
    # the platform locale, which breaks on non-ASCII (e.g. Cyrillic) content
    # on systems whose default code page is not UTF-8.
    with open(file_path, "r", encoding="utf-8") as file:
        markdown_text = file.read()
    return splitter.split_text(markdown_text)
# Both source documents are looked up in the directory containing this script.
base_dir = os.path.dirname(__file__)
markdown_file_ru = os.path.join(base_dir, "document_ru.md")
markdown_file_en = os.path.join(base_dir, "document_en.md")

md_header_splits_ru = process_file(markdown_file_ru)
md_header_splits_en = process_file(markdown_file_en)

embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

# IDs are positional ("doc_0", "doc_1", ...) over the English chunks followed
# by the Russian chunks.
docs = md_header_splits_en + md_header_splits_ru
doc_ids = [f"doc_{i}" for i in range(len(docs))]
existing_ids = set(db.get()["ids"])

# Split the chunks into "already stored" and "new" so that each group can be
# sent to the store in a single batched call instead of one call per chunk.
ids_to_update, docs_to_update = [], []
ids_to_add, docs_to_add = [], []
for doc_id, doc in zip(doc_ids, docs):
    if doc_id in existing_ids:
        ids_to_update.append(doc_id)
        docs_to_update.append(doc)
    else:
        ids_to_add.append(doc_id)
        docs_to_add.append(doc)

if ids_to_update:
    db.update_documents(ids_to_update, docs_to_update)
if ids_to_add:
    db.add_documents(docs_to_add, ids=ids_to_add)

# Because IDs are positional, a previous run with more chunks leaves
# higher-numbered "doc_*" entries behind; remove them so outdated content is
# not returned by searches. Only IDs in this script's own "doc_" namespace
# are touched.
stale_ids = sorted(
    doc_id
    for doc_id in existing_ids.difference(doc_ids)
    if doc_id.startswith("doc_")
)
if stale_ids:
    db.delete(ids=stale_ids)

# NOTE(review): kept from the original; newer Chroma releases are documented
# as persisting automatically, making this call redundant -- confirm against
# the installed version before removing.
db.persist()