Commit 76e4416

Workaround llama_init_from_file parameter copy
This is needed until ggml-org/llama.cpp#1902 is addressed/merged.

Signed-off-by: mudler <mudler@mocaccino.org>
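At this point in llama.cpp's API, llama_init_from_file takes its llama_context_params argument by value, and that by-value copy across the binding boundary is what this commit works around: the new entry point accepts a pointer and performs the copy on the C++ side. A minimal sketch of the pattern follows; it is illustrative only (the name init_via_pointer is made up here), and the actual commit re-implements the whole loader body rather than forwarding, which is why it pulls in llama.cpp directly.

#include "llama.h"

// Illustrative sketch only, not the code added by this commit.
// Assumes the upstream signature of this era:
//   struct llama_context * llama_init_from_file(const char * path_model,
//                                               struct llama_context_params params);
struct llama_context * init_via_pointer(const char * path_model,
                                        const struct llama_context_params * params_ptr) {
    // Copy the params struct here, inside C++, instead of letting the
    // foreign-function layer pass the struct by value.
    struct llama_context_params params = *params_ptr;
    return llama_init_from_file(path_model, params);
}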
Parent commit: 75e99c8

File tree: 2 files changed, +117 −1 lines changed


binding.cpp (+113 −1)
@@ -1,5 +1,7 @@
 #include "common.h"
 #include "llama.h"
+#include "llama.cpp"
+
 #include "binding.h"
 
 #include <cassert>
@@ -630,11 +632,121 @@ void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool
     llama_init_backend();
     void* res = nullptr;
     try {
-        res = llama_init_from_file(fname, lparams);
+        res = llama_init_from_file2(fname, &lparams);
     } catch(std::runtime_error& e) {
         fprintf(stderr, "failed %s",e.what());
         return res;
     }
 
     return res;
 }
+
+
+struct llama_context * llama_init_from_file2(
+                        const char * path_model,
+                        const struct llama_context_params * params_ptr) {
+    struct llama_context_params params = *params_ptr;
+    ggml_time_init();
+
+    llama_context * ctx = new llama_context;
+
+    if (params.seed < 0) {
+        params.seed = time(NULL);
+    }
+
+    unsigned cur_percentage = 0;
+    if (params.progress_callback == NULL) {
+        params.progress_callback_user_data = &cur_percentage;
+        params.progress_callback = [](float progress, void * ctx) {
+            unsigned * cur_percentage_p = (unsigned *) ctx;
+            unsigned percentage = (unsigned) (100 * progress);
+            while (percentage > *cur_percentage_p) {
+                *cur_percentage_p = percentage;
+                fprintf(stderr, ".");
+                fflush(stderr);
+                if (percentage >= 100) {
+                    fprintf(stderr, "\n");
+                }
+            }
+        };
+    }
+
+    ctx->rng = std::mt19937(params.seed);
+    ctx->logits_all = params.logits_all;
+
+    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
+                          params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+                          params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+        fprintf(stderr, "%s: failed to load model\n", __func__);
+        llama_free(ctx);
+        return nullptr;
+    }
+
+    // reserve memory for context buffers
+    if (!params.vocab_only) {
+        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
+            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+
+        {
+            const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
+            fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+        }
+
+        const auto & hparams = ctx->model.hparams;
+
+        // resized during inference
+        if (params.logits_all) {
+            ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
+        } else {
+            ctx->logits.reserve(hparams.n_vocab);
+        }
+
+        if (params.embedding){
+            ctx->embedding.resize(hparams.n_embd);
+        }
+
+        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+
+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
+        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
+    }
+
+#ifdef GGML_USE_METAL
+    if (params.n_gpu_layers > 0) {
+        // this allocates all Metal resources and memory buffers
+        ctx->ctx_metal = ggml_metal_init();
+
+        void *data_ptr = NULL;
+        size_t data_size = 0;
+        if (params.use_mmap) {
+            data_ptr = ctx->model.mapping->addr;
+            data_size = ctx->model.mapping->size;
+        } else {
+            data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+            data_size = ggml_get_mem_size(ctx->model.ctx);
+        }
+
+#define LLAMA_METAL_CHECK_BUF(result)                            \
+    if (!(result)) {                                             \
+        fprintf(stderr, "%s: failed to add buffer\n", __func__); \
+        llama_free(ctx);                                         \
+        return NULL;                                             \
+    }
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+#undef LLAMA_METAL_CHECK_BUF
+    }
+#endif
+
+    return ctx;
+}
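Because llama_init_from_file2 duplicates the body of the upstream loader, it depends on symbols that llama.h does not expose (llama_model_load, kv_cache_init, the MEM_REQ_* tables, the llama_context members), which is what the new #include "llama.cpp" at the top of the file provides. A hypothetical caller of the new entry point could look like the sketch below; it is not part of this commit, llama_context_default_params() and llama_free() are the standard upstream helpers of this llama.cpp version, and the model path and n_ctx value are placeholders.

#include <stdio.h>
#include "llama.h"
#include "binding.h"

int main() {
    // Build the params struct on this side and hand over a pointer;
    // llama_init_from_file2 copies it internally.
    struct llama_context_params lparams = llama_context_default_params();
    lparams.n_ctx = 512;  // placeholder context size

    struct llama_context * ctx = llama_init_from_file2("ggml-model.bin", &lparams);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_free(ctx);
    return 0;
}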

binding.h (+4)
@@ -32,6 +32,10 @@ void llama_free_model(void* state);
 
 int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug);
 
+struct llama_context * llama_init_from_file2(
+                        const char * path_model,
+                        const struct llama_context_params * params_ptr);
+
 #ifdef __cplusplus
 }
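The header change simply exposes the new entry point alongside the existing binding functions, presumably inside the header's C-linkage section (the hunk ends right before the closing #ifdef __cplusplus / }), so the non-C++ side of the binding can call it without C++ name mangling.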
