main : improve UX for chat mode (to be discussed) #11203

Draft · wants to merge 5 commits into base: master
2 changes: 1 addition & 1 deletion examples/main/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-cli)
-add_executable(${TARGET} main.cpp)
+add_executable(${TARGET} main.cpp chat.hpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
224 changes: 224 additions & 0 deletions examples/main/chat.hpp
@@ -0,0 +1,224 @@
#include "arg.h"
#include "common.h"
#include "console.h"
#include "log.h"
#include "sampling.h"
#include "llama.h"

#include <fstream>
#include <sstream>

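// Interactive chat driver for llama-cli: keeps the chat history, reuses the
// common prefix of the KV cache across turns, and runs the input/generation loop.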
struct llama_cli_chat {
    struct llama_context * ctx;
    const struct llama_model * model;
    const struct llama_vocab * vocab;
    struct common_sampler * smpl;
    struct common_params params;

    bool interacting = false;
    std::vector<common_chat_msg> chat_msgs;
    std::ostringstream pending_input;

    struct llama_batch batch;
    llama_tokens cache_tokens;
    int n_past = 0;

    llama_cli_chat(
            struct common_params & params,
            struct llama_context * ctx,
            struct common_sampler * smpl) : ctx(ctx), smpl(smpl), params(params) {
        model = llama_get_model(ctx);
        vocab = llama_model_get_vocab(model);
        batch = llama_batch_init(params.n_batch, 0, 1);
    }

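    // Evaluate tokens with llama_decode in n_batch-sized chunks.
    // For a new prompt (is_generating == false), only the suffix that differs from the
    // cached tokens is decoded; during generation, a single sampled token is appended.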
    void decode(llama_tokens & eval_tokens, bool is_generating) {
        if (is_generating) {
            GGML_ASSERT(eval_tokens.size() == 1);
        } else {
            n_past = common_lcp(cache_tokens, eval_tokens);
            // in case we do a re-generation, we need to prevent eval_tokens from being empty
            if ((int) eval_tokens.size() == n_past) {
                n_past--;
            }
            if (n_past > 0) {
                eval_tokens.erase(eval_tokens.begin(), eval_tokens.begin() + n_past);
                cache_tokens.erase(cache_tokens.begin() + n_past, cache_tokens.end());
                LOG_DBG("remove from cache [%d, inf)\n", n_past);
                LOG_DBG("in cache: %s\n", common_detokenize(ctx, cache_tokens, true).c_str());
                LOG_DBG("to decode %d tokens\n", (int) eval_tokens.size());
                llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
            }
        }

        // decode
        for (size_t i = 0; i < eval_tokens.size(); i += params.n_batch) {
            if (interacting) {
                break;
            }

            common_batch_clear(batch);
            for (int j = 0; j < params.n_batch && i + j < eval_tokens.size(); ++j) {
                n_past++;
                bool is_last_token = i + j == eval_tokens.size() - 1;
                common_batch_add(batch, eval_tokens[i + j], n_past, {0}, is_last_token);
            }

            if (llama_decode(ctx, batch)) {
                GGML_ABORT("failed to decode\n");
            }
        }

        // update cache tokens
        if (is_generating) {
            cache_tokens.push_back(eval_tokens[0]);
        } else {
            cache_tokens.insert(cache_tokens.end(), eval_tokens.begin(), eval_tokens.end());
        }
    }

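    // Main loop: read user input (lines starting with '/' are treated as commands),
    // append it to the chat history, re-apply the chat template, decode the new
    // prompt and stream the sampled response.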
    [[noreturn]] void run() {
        while (true) {
            interacting = true;
            LOG("\n> ");

            // color user input only
            console::set_display(console::user_input);
            std::string line;
            bool another_line = true;
            bool continue_input = false;
            do {
                another_line = console::readline(line, params.multiline_input);
                if (handle_command(line, continue_input)) {
                    continue; // do not add this line to pending_input
                }
                pending_input << line;
            } while (another_line);

            if (continue_input) {
                continue;
            }

            if (pending_input.tellp() == 0) {
                LOG_DBG("empty line, passing control back\n");
                continue;
            }

            // done taking input, reset color
            console::set_display(console::reset);
            interacting = false;

            // add message and format chat
            if (!chat_msgs.empty() && chat_msgs.back().role == "user") {
                chat_msgs.pop_back();
            }
            chat_msgs.push_back({"user", string_strip(pending_input.str())});
            pending_input.str(""); // clear
            auto formatted = common_chat_apply_template(model, params.chat_template, chat_msgs, true);

            // tokenize the new chat history and decode
            llama_tokens prompt_tokens = common_tokenize(ctx, formatted, true, true);
            decode(prompt_tokens, false);

            // generate response
            llama_token new_token_id = LLAMA_TOKEN_NULL;
            llama_tokens generated_tokens;
            common_sampler_reset(smpl);
            while (true) {
                if (interacting) {
                    break;
                }

                // sample the next token
                new_token_id = common_sampler_sample(smpl, ctx, -1);

                // is it an end of generation?
                if (llama_vocab_is_eog(vocab, new_token_id)) {
                    break;
                }

                // print the token, then decode it
                printf("%s", common_token_to_piece(ctx, new_token_id, params.special).c_str());
                fflush(stdout);
                generated_tokens.push_back(new_token_id);
                llama_tokens new_tok = {new_token_id};
                decode(new_tok, true);
            }

            // add the generated tokens to the chat history
            std::string response = common_detokenize(ctx, generated_tokens, true);
            chat_msgs.push_back({"assistant", response});

            // print a new line if needed
            if (!response.empty() && response.back() != '\n') {
                printf("\n");
            }
        }
    }

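    // SIGINT hook: while waiting for input, print stats and exit; while generating,
    // stop the current response and return to the prompt.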
    void interrupt() {
        if (interacting) {
            // exit
            printf("\n");
            console::cleanup();
            common_perf_print(ctx, smpl);
            common_log_pause(common_log_main());
            exit(0);
        }
        interacting = true;
    }

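    // Handle a '/'-prefixed command; returns true if the line was consumed as a
    // command and should not be added to pending_input.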
    bool handle_command(std::string & inp, bool & continue_input) {
        if (inp.empty() || inp[0] != '/') {
            return false; // not a command
        }
        auto parts = string_split<std::string>(string_strip(inp), ' ');
        std::string & cmd = parts[0];
        if (cmd == "/help") {
            LOG("TODO\n");
            continue_input = true;
        } else if (cmd == "/history") {
            display_history();
            continue_input = true;
        } else if (cmd == "/regen") {
            if (chat_msgs.empty()) {
                LOG_ERR("no chat history to regenerate\n");
                continue_input = true;
                return true;
            }
            if (chat_msgs.back().role == "assistant") {
                chat_msgs.pop_back();
            }
            if (chat_msgs.back().role == "user") {
                pending_input.str(""); // clear
                pending_input << chat_msgs.back().content;
                chat_msgs.pop_back();
            }
            continue_input = false;
        } else if (cmd == "/readfile") {
            if (parts.size() < 2) {
                LOG_ERR("usage: /readfile <filename>\n");
                continue_input = true;
                return true;
            }
            const std::string filename = parts[1];
            LOG_DBG("reading file: '%s'\n", filename.c_str());
            std::ifstream text_file(filename);
            if (!text_file) {
                LOG("failed to open file '%s'\n", filename.c_str());
            } else {
                pending_input << text_file.rdbuf() << "\n\n";
                LOG("read %zu characters from file\n", (size_t) text_file.tellg());
            }
            continue_input = true;
        } else {
            LOG_ERR("unknown command: %s\n", cmd.c_str());
            continue_input = true;
        }
        return true;
    }

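    // Print the accumulated conversation (used by the /history command).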
    void display_history() {
        for (const auto & msg : chat_msgs) {
            LOG("%s: %s\n\n", msg.role.c_str(), msg.content.c_str());
        }
    }

    ~llama_cli_chat() {
        llama_batch_free(batch);
    }
};
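
For reviewers, a minimal sketch of how main.cpp could drive this struct. The setup of params, ctx and smpl is omitted (it would go through the existing common/llama initialization path), and the names g_chat, sigint_handler and main_chat_sketch are illustrative, not part of this PR; only run() and interrupt() come from the listing above.

#include <csignal>

static llama_cli_chat * g_chat = nullptr;

static void sigint_handler(int) {
    if (g_chat != nullptr) {
        g_chat->interrupt(); // during generation: stop the response; at the prompt: exit
    }
}

static void main_chat_sketch(common_params & params, llama_context * ctx, common_sampler * smpl) {
    llama_cli_chat chat(params, ctx, smpl);
    g_chat = &chat;
    std::signal(SIGINT, sigint_handler);
    chat.run(); // never returns; the process exits via interrupt()
}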