|
1 | 1 | #include "common.h"
|
2 | 2 | #include "llama.h"
|
| 3 | +#include "llama.cpp" |
| 4 | + |
3 | 5 | #include "binding.h"
|
4 | 6 |
|
5 | 7 | #include <cassert>
|
@@ -630,11 +632,121 @@ void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool
|
630 | 632 | llama_init_backend();
|
631 | 633 | void* res = nullptr;
|
632 | 634 | try {
|
633 |
| - res = llama_init_from_file(fname, lparams); |
| 635 | + res = llama_init_from_file2(fname, &lparams); |
634 | 636 | } catch(std::runtime_error& e) {
|
635 | 637 | fprintf(stderr, "failed %s",e.what());
|
636 | 638 | return res;
|
637 | 639 | }
|
638 | 640 |
|
639 | 641 | return res;
|
640 | 642 | }
|
| 643 | + |
| 644 | + |
// Variant of upstream llama_init_from_file() that takes the context params
// by pointer instead of by value — presumably so an FFI caller (the binding
// layer above) can pass the struct across the language boundary; TODO confirm.
// Loads the model at `path_model`, sets up the KV cache and scratch buffers,
// and (under GGML_USE_METAL) registers the Metal memory buffers.
// Returns a heap-allocated llama_context on success, nullptr on failure.
// NOTE(review): this duplicates upstream llama.cpp internals (it relies on
// `#include "llama.cpp"` for llama_context, llama_model_load, kv_cache_init,
// MEM_REQ_* etc.) and must be kept in sync with that file by hand.
struct llama_context * llama_init_from_file2(
                             const char * path_model,
        const struct llama_context_params * params_ptr) {
    // Work on a local copy so the caller's params struct is never mutated
    // (seed and progress-callback fields may be filled in below).
    struct llama_context_params params = *params_ptr;
    ggml_time_init();

    llama_context * ctx = new llama_context;

    // Negative seed means "pick one": fall back to wall-clock time.
    if (params.seed < 0) {
        params.seed = time(NULL);
    }

    // Default progress callback: print one '.' per percent of loading
    // progress to stderr, and a newline once 100% is reached.
    unsigned cur_percentage = 0;
    if (params.progress_callback == NULL) {
        params.progress_callback_user_data = &cur_percentage;
        params.progress_callback = [](float progress, void * ctx) {
            unsigned * cur_percentage_p = (unsigned *) ctx;
            unsigned percentage = (unsigned) (100 * progress);
            // Loop (rather than a single print) so large jumps in progress
            // still emit one dot per percent.
            while (percentage > *cur_percentage_p) {
                *cur_percentage_p = percentage;
                fprintf(stderr, ".");
                fflush(stderr);
                if (percentage >= 100) {
                    fprintf(stderr, "\n");
                }
            }
        };
    }

    ctx->rng = std::mt19937(params.seed);
    ctx->logits_all = params.logits_all;

    // KV cache element type: fp16 when requested, fp32 otherwise.
    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
                          params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
                          params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
        fprintf(stderr, "%s: failed to load model\n", __func__);
        llama_free(ctx);  // releases ctx and anything the partial load attached to it
        return nullptr;
    }

    // reserve memory for context buffers
    // (skipped in vocab-only mode: no inference will run, so no KV cache,
    // logits, embeddings, or scratch space are needed)
    if (!params.vocab_only) {
        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
            llama_free(ctx);
            return nullptr;
        }

        {
            const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
            fprintf(stderr, "%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
        }

        const auto & hparams = ctx->model.hparams;

        // resized during inference
        if (params.logits_all) {
            // one logit row per context position
            ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
        } else {
            // only the last token's logits are kept
            ctx->logits.reserve(hparams.n_vocab);
        }

        if (params.embedding){
            ctx->embedding.resize(hparams.n_embd);
        }

        // Fixed-size work buffers; required sizes are looked up per model type
        // in tables defined by the included llama.cpp.
        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));

        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
    }

#ifdef GGML_USE_METAL
    if (params.n_gpu_layers > 0) {
        // this allocates all Metal resources and memory buffers
        ctx->ctx_metal = ggml_metal_init();

        // Model weight storage differs by load mode: mmap'd file region vs.
        // a ggml-owned heap buffer.
        void *data_ptr = NULL;
        size_t data_size = 0;
        if (params.use_mmap) {
            data_ptr = ctx->model.mapping->addr;
            data_size= ctx->model.mapping->size;
        } else {
            data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
            data_size= ggml_get_mem_size(ctx->model.ctx);
        }

// On any buffer-registration failure: log, free the context, bail out.
#define LLAMA_METAL_CHECK_BUF(result)                            \
    if (!(result)) {                                             \
        fprintf(stderr, "%s: failed to add buffer\n", __func__); \
        llama_free(ctx);                                         \
        return NULL;                                             \
    }

        // Register every host-side buffer Metal kernels must see:
        // weights, eval workspace, KV cache, and both scratch buffers.
        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));

        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr,    ctx->buf_scratch[0].size));
        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr,    ctx->buf_scratch[1].size));
#undef LLAMA_METAL_CHECK_BUF
    }
#endif

    return ctx;
}
0 commit comments