-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdeeplake_embed.py
31 lines (25 loc) · 1.18 KB
/
deeplake_embed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment before
# any client is constructed (presumably the OpenAI / Activeloop credentials
# -- confirm against the deployment's .env).
load_dotenv()

# Prepare the files
# MarkdownHeaderTextSplitter expects (header marker, metadata key) pairs.
# Note that ("##") without a trailing name is just the string "##", not a
# tuple, so the pair has to be spelled out explicitly.
headers_on_split = [("##", "Header 2")]
mardown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_on_split,
    strip_headers=False,  # keep the header line inside each chunk's text
)

# Both documents are resolved relative to this script, not the current
# working directory.
markdown_file_ru = os.path.join(os.path.dirname(__file__), "document_ru.md")
# Explicit UTF-8: the platform default encoding may be unable to decode the
# Russian text.
with open(markdown_file_ru, "r", encoding="utf-8") as file:
    markdown_text_ru = file.read()
md_header_splits_ru = mardown_splitter.split_text(markdown_text_ru)

markdown_file_en = os.path.join(os.path.dirname(__file__), "document_en.md")
with open(markdown_file_en, "r", encoding="utf-8") as file:
    markdown_text_en = file.read()
md_header_splits_en = mardown_splitter.split_text(markdown_text_en)

# Let's embed
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

# Target dataset, addressed as hub://<org id>/<dataset name>.
my_activeloop_org_id = "learningprocess123"
my_activeloop_dataset_name = "my_dataset"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"

# Store the chunks of both language versions in the same dataset.
db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings)
db.add_documents(md_header_splits_ru)
db.add_documents(md_header_splits_en)