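"""LLM-chain-eval: a Streamlit app for evaluating LangChain question-answering
chains against a validation set of query/answer pairs.

Run with: streamlit run app.py
"""
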
import json
from datetime import datetime

import pandas as pd  # type: ignore
import streamlit as st
import tiktoken
from dotenv import load_dotenv
from langchain.chains.question_answering import stuff_prompt
from pydantic import BaseModel

from config import MATCH_TEMPLATE, Configuration
from evaluate import evaluate


class ValidationSample(BaseModel):
    """A single query/ground-truth pair used to score the chain."""

    query: str
    answer: str


class ValidationList(BaseModel):
    """Wrapper matching the uploaded validation JSON file."""

    validations: list[ValidationSample]
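
# A minimal validation file accepted by the uploader below, inferred from the
# two models above:
#
# {
#   "validations": [
#     {"query": "What is the capital of Japan?", "answer": "Tokyo"}
#   ]
# }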


def run_app():
    # Data populated by the uploaders below; a built-in sample keeps the app
    # usable before a validation file is provided.
    documents = None
    val_list = [
        ValidationSample(
            query="What is the capital of Japan?",
            answer="Tokyo",
        )
    ]

    st.set_page_config(
        page_title="LLM-chain-eval",
        page_icon="🤖",
        layout="wide",
        initial_sidebar_state="expanded",
    )
    st.title("LLM-chain-eval")

    # Sidebar: evaluation configuration.
    st.sidebar.title("Configuration")

    st.sidebar.subheader("LLM Model")
    llm_model = st.sidebar.selectbox(
        "LLM Model",
        [
            Configuration.LlmModel.gpt_3_turbo.value,
            Configuration.LlmModel.gpt_4.value,
            Configuration.LlmModel.google_flan_t5_large.value,
            Configuration.LlmModel.google_flan_t5_xl.value,
            Configuration.LlmModel.dolly_v2_3b.value,
        ],
        index=0,
    )

    st.sidebar.subheader("Chunk size")
    st.sidebar.caption("The number of characters per chunk.")
    chunk_size = st.sidebar.slider("chunk size", 100, 3000, 1000, 100, label_visibility="collapsed")

    st.sidebar.subheader("Chunk Overlap")
    st.sidebar.caption("The number of characters to overlap between chunks.")
    chunk_overlap = st.sidebar.slider("chunk overlap", 0, 1000, 0, 1, label_visibility="collapsed")

    st.sidebar.subheader("Chain type")
    st.sidebar.caption("How the retrieved chunks are combined to produce an answer.")
    chain_type = st.sidebar.selectbox(
        "chain type",
        ["stuff", "map_reduce", "refine", "map_rerank"],
        index=0,
        label_visibility="collapsed",
    )

    st.sidebar.subheader("Embedding Model")
    st.sidebar.caption("Model used to compute embeddings.")
    emb_type = st.sidebar.selectbox(
        "Embedding Model",
        [
            Configuration.EmbeddingModel.gpt_embedding.value,
            Configuration.EmbeddingModel.huggingface_embedding.value,
        ],
        index=0,
        label_visibility="collapsed",
    )

    st.sidebar.subheader("Top K Chunk")
    st.sidebar.caption("The number of chunks to retrieve.")
    top_k_chunk = st.sidebar.slider("top k chunk", 1, 10, 2, 1, label_visibility="collapsed")

    # The prompt template is only editable for the "stuff" chain; the other
    # chain types fall back to their library defaults.
    if chain_type == "stuff":
        st.sidebar.subheader("Prompt template (currently only for stuff)")
        st.sidebar.caption("Template used by the retrieval chain.")
        retrieve_chain_template = st.sidebar.text_area(
            "Retrieve Chain template", stuff_prompt.prompt_template, label_visibility="collapsed"
        )
    else:
        retrieve_chain_template = None

    st.sidebar.subheader("Match template")
    st.sidebar.caption("Template used to match the response against the ground-truth answer.")
    match_template = st.sidebar.text_area("Match template", MATCH_TEMPLATE, label_visibility="collapsed")

    config = Configuration(
        llm_model=llm_model,
        embedding_model=emb_type,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        chain_type=chain_type,
        top_k_chunk=top_k_chunk,
        retrieve_chain_template=retrieve_chain_template,
        match_template=match_template,
    )

    # Source file to evaluate against.
    st.subheader("Source documents")
    doc_upload = st.file_uploader("Text file you want to evaluate", type=["txt", "pdf"])
    if doc_upload is not None:
        documents = load_documents(doc_upload)
        st.caption(documents[0].page_content[:200] + "...")
        char_count = sum(len(doc.page_content) for doc in documents)
        encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
        token_count = sum(len(encoding.encode(doc.page_content)) for doc in documents)
        st.markdown(f":green[{char_count}] characters, :green[{token_count}] tokens (gpt-3.5-turbo)")

    # Validation file (optional); rows can also be added or edited inline.
    st.subheader("Validation datasets")
    val_upload = st.file_uploader("Validation json file. You can input query and answer directly.", type=["json"])
    if val_upload is not None:
        val_list = ValidationList.parse_raw(val_upload.read()).validations
    val_data = pd.DataFrame([v.dict() for v in val_list], columns=["query", "answer"])
    # Note: st.experimental_data_editor was renamed to st.data_editor in newer
    # Streamlit releases.
    edited_val_df = st.experimental_data_editor(val_data, num_rows="dynamic", use_container_width=True)
    # Cells left empty in the editor come back as None/NaN, so filter with
    # pd.notna rather than `is not None`.
    val_list = [
        ValidationSample(**v)
        for v in edited_val_df.to_dict("records")
        if pd.notna(v["query"]) and pd.notna(v["answer"])
    ]

    # Only offer evaluation once documents are loaded and at least one
    # validation row exists (initializing the flag avoids a NameError when
    # the button is never rendered).
    evaluate_button = False
    if documents is not None and val_list:
        evaluate_button = st.button("Run Evaluate", type="primary")
    if evaluate_button:
        with st.spinner(text="Evaluating..."):
            eval_results = evaluate(documents, val_list, config)
        score = 100 * sum(1 for res in eval_results if res["correct"]) / len(eval_results)
        st.divider()
        st.header(f"Accuracy: {round(score, 2)}%")
        st.subheader("Samples")
        for i, res in enumerate(eval_results):
            result_md = f"""
#### Validation {i}
###### {res["query"]}
###### AI Response: {res["response"]}
###### Ground Truth: {res["answer"]}
###### Correct: :{'green' if res["correct"] else 'red'}[{res["correct"]}]
"""
            st.markdown(result_md)

        # Save results and config to a timestamped file.
        with open(f"eval_{datetime.now().strftime('%Y%m%d%H%M%S')}.json", "w") as save_file:
            res = {
                "score": score,
                "eval_results": eval_results,
                "config": config.dict(),
            }
            json.dump(res, save_file, ensure_ascii=False, indent=2)
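        # The saved report has the shape (illustrative values):
        # {"score": 95.0,
        #  "eval_results": [{"query": ..., "response": ..., "answer": ..., "correct": ...}, ...],
        #  "config": {...}}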


def load_documents(uploaded_file):
    from langchain.document_loaders import PyPDFLoader, TextLoader

    # The loaders read from a file path, while Streamlit only provides an
    # in-memory buffer, so persist the uploaded bytes first.
    with open(uploaded_file.name, "wb") as f:
        f.write(uploaded_file.getbuffer())
    loader_cls = PyPDFLoader if uploaded_file.name.lower().endswith(".pdf") else TextLoader
    return loader_cls(uploaded_file.name).load()


if __name__ == "__main__":
    load_dotenv()  # Pick up API keys (e.g. OPENAI_API_KEY) from a local .env file.
    run_app()