load_data.py
#!/usr/bin/env python3
# This script loads data from a CSV file and builds a vector index using an embedding model.
from dotenv import load_dotenv
from modules.init import init_models
from sys import argv
from pathlib import Path
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings
from time import sleep, localtime, strftime
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
)
from modules.MyPandasCSVReader import MyPandasCSVReader
load_dotenv()
if len(argv) < 2:
    print("Provide a path to data to load")
    exit(1)

path = Path(argv[1])
if not path.exists():
    print("File does not exist")
    exit(1)
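
# Initialise the LLM and the embedding model via the project helper in modules.init
# (local=True is assumed to select locally served models).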
llm, embed_model = init_models("cohere/command-r", "OrdalieTech/Solon-embeddings-large-0.1", local=True)
if llm is None or embed_model is None:
print("Error while initializing the models")
exit(1)
# PandasCSVReader uses pandas.read_csv() to load data from a CSV file
reader = MyPandasCSVReader(
    concat_rows=False,  # keep one Document per CSV row (assumed, mirroring PandasCSVReader's behaviour)
)
print("Loading data from ", path)
documents = reader.load_data(file=path)
print("Number of documents loaded: ", len(documents))
Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 2048
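
# Pipeline transformations: only sentence splitting is enabled here; the
# commented-out extractors would add LLM-generated metadata (titles, summaries,
# keywords, ...) to each node.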
transformations = [
    SentenceSplitter(chunk_size=Settings.chunk_size),
    # TitleExtractor(nodes=5),
    # QuestionsAnsweredExtractor(questions=3),
    # SummaryExtractor(summaries=["prev", "self"]),
    # KeywordExtractor(keywords=10),
    # EntityExtractor(prediction_threshold=0.5),
]
pipeline = IngestionPipeline(
    transformations=transformations,
)
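
# Run the ingestion pipeline: documents are split into nodes (chunks) and any
# enabled extractors attach metadata to them.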
nodes = pipeline.run(documents=documents)
for i, node in enumerate(nodes):
    print(f"Node [{i}]")
    for key, value in node.metadata.items():
        print(key, " : ", value)
    print("\n")
    if i == 5:
        break
print("Number of documents ", len(documents))
index = VectorStoreIndex(nodes=nodes, show_progress=True)
persist_dir_name = Path("./storage_" + path.name + strftime("%Y-%m-%d_%H-%M-%S", localtime()))
index.storage_context.persist(persist_dir_name)
print("Index saved in ", persist_dir_name)