|
1 | 1 | #include "common.h"
|
2 | 2 | #include "llama.h"
|
| 3 | +#include "llama.cpp" |
| 4 | + |
3 | 5 | #include "binding.h"
|
4 | 6 |
|
5 | 7 | #include <cassert>
|
@@ -630,11 +632,121 @@ void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool
|
630 | 632 | llama_init_backend();
|
631 | 633 | void* res = nullptr;
|
632 | 634 | try {
|
633 |
| - res = llama_init_from_file(fname, lparams); |
| 635 | + res = llama_init_from_file2(fname, &lparams); |
634 | 636 | } catch(std::runtime_error& e) {
|
635 | 637 | fprintf(stderr, "failed %s",e.what());
|
636 | 638 | return res;
|
637 | 639 | }
|
638 | 640 |
|
639 | 641 | return res;
|
640 | 642 | }
|
| 643 | + |
| 644 | + |
// Variant of upstream llama_init_from_file() that takes the context params
// by pointer instead of by value — presumably so an FFI caller (the binding
// layer above) can pass the struct across the language boundary; TODO confirm.
// Loads the model at `path_model`, sets up the KV cache and scratch buffers,
// and (under GGML_USE_METAL) registers the Metal memory buffers.
// Returns a heap-allocated llama_context on success, nullptr on failure.
// NOTE(review): this duplicates upstream llama.cpp internals (it relies on
// `#include "llama.cpp"` for llama_context, llama_model_load, kv_cache_init,
// MEM_REQ_* etc.) and must be kept in sync with that file by hand.
struct llama_context * llama_init_from_file2(
                             const char * path_model,
        const struct llama_context_params * params_ptr) {
    // Work on a local copy so the caller's params struct is never mutated
    // (seed and progress-callback fields may be filled in below).
    struct llama_context_params params = *params_ptr;
    ggml_time_init();

    llama_context * ctx = new llama_context;

    // Negative seed means "pick one": fall back to wall-clock time.
    if (params.seed < 0) {
        params.seed = time(NULL);
    }

    // Default progress callback: print one '.' per percent of loading
    // progress to stderr, and a newline once 100% is reached.
    unsigned cur_percentage = 0;
    if (params.progress_callback == NULL) {
        params.progress_callback_user_data = &cur_percentage;
        params.progress_callback = [](float progress, void * ctx) {
            unsigned * cur_percentage_p = (unsigned *) ctx;
            unsigned percentage = (unsigned) (100 * progress);
            // Loop (rather than a single print) so large jumps in progress
            // still emit one dot per percent.
            while (percentage > *cur_percentage_p) {
                *cur_percentage_p = percentage;
                fprintf(stderr, ".");
                fflush(stderr);
                if (percentage >= 100) {
                    fprintf(stderr, "\n");
                }
            }
        };
    }

    ctx->rng = std::mt19937(params.seed);
    ctx->logits_all = params.logits_all;

    // KV cache element type: fp16 when requested, fp32 otherwise.
    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
                          params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
                          params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
        fprintf(stderr, "%s: failed to load model\n", __func__);
        llama_free(ctx);  // releases ctx and anything the partial load attached to it
        return nullptr;
    }

    // reserve memory for context buffers
    // (skipped in vocab-only mode: no inference will run, so no KV cache,
    // logits, embeddings, or scratch space are needed)
    if (!params.vocab_only) {
        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
            llama_free(ctx);
            return nullptr;
        }

        {
            const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
            fprintf(stderr, "%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
        }

        const auto & hparams = ctx->model.hparams;

        // resized during inference
        if (params.logits_all) {
            // one logit row per context position
            ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
        } else {
            // only the last token's logits are kept
            ctx->logits.reserve(hparams.n_vocab);
        }

        if (params.embedding){
            ctx->embedding.resize(hparams.n_embd);
        }

        // Fixed-size work buffers; required sizes are looked up per model type
        // in tables defined by the included llama.cpp.
        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));

        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
    }

#ifdef GGML_USE_METAL
    if (params.n_gpu_layers > 0) {
        // this allocates all Metal resources and memory buffers
        ctx->ctx_metal = ggml_metal_init();

        // Model weight storage differs by load mode: mmap'd file region vs.
        // a ggml-owned heap buffer.
        void *data_ptr = NULL;
        size_t data_size = 0;
        if (params.use_mmap) {
            data_ptr = ctx->model.mapping->addr;
            data_size= ctx->model.mapping->size;
        } else {
            data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
            data_size= ggml_get_mem_size(ctx->model.ctx);
        }

// On any buffer-registration failure: log, free the context, bail out.
#define LLAMA_METAL_CHECK_BUF(result)                            \
    if (!(result)) {                                             \
        fprintf(stderr, "%s: failed to add buffer\n", __func__); \
        llama_free(ctx);                                         \
        return NULL;                                             \
    }

        // Register every host-side buffer Metal kernels must see:
        // weights, eval workspace, KV cache, and both scratch buffers.
        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));

        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr,    ctx->buf_scratch[0].size));
        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr,    ctx->buf_scratch[1].size));
#undef LLAMA_METAL_CHECK_BUF
    }
#endif

    return ctx;
}
0 commit comments