@@ -15,10 +15,12 @@

 #include "model_adapter.h"

-
+#if defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
+#endif

 // load the model's weights from a file
-bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab) {
+bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab, int gpulayers) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

     auto fin = std::ifstream(fname, std::ios::binary);
@@ -75,7 +77,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
     std::string word;
     std::vector<char> buf(128);

-    for (int i = 0; i < n_vocab; i++) {
+    for (int i = 0; i < n_vocab; i++) {
         uint32_t len;
         fin.read((char *) &len, sizeof(len));
@@ -278,6 +280,29 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo

     fin.close();

+    //gpu offload
+    #if defined(GGML_USE_CLBLAST)
+    if(gpulayers>0)
+    {
+        const auto & hparams = model.hparams;
+        size_t vram_total = 0;
+        const int n_gpu = std::min(gpulayers, int(hparams.n_layers));
+        fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+            layer.ffn_up_proj->backend = GGML_BACKEND_GPU;
+            layer.ffn_down_proj->backend = GGML_BACKEND_GPU;
+            layer.c_attn_wqkv_weight->backend = GGML_BACKEND_GPU;
+            layer.c_attn_out_proj_weight->backend = GGML_BACKEND_GPU;
+            ggml_cl_transform_tensor(layer.ffn_up_proj->data, layer.ffn_up_proj); vram_total += ggml_nbytes(layer.ffn_up_proj);
+            ggml_cl_transform_tensor(layer.ffn_down_proj->data, layer.ffn_down_proj); vram_total += ggml_nbytes(layer.ffn_down_proj);
+            ggml_cl_transform_tensor(layer.c_attn_wqkv_weight->data, layer.c_attn_wqkv_weight); vram_total += ggml_nbytes(layer.c_attn_wqkv_weight);
+            ggml_cl_transform_tensor(layer.c_attn_out_proj_weight->data, layer.c_attn_out_proj_weight); vram_total += ggml_nbytes(layer.c_attn_out_proj_weight);
+        }
+        fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+    #endif
+
     return true;
 }

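The added block offloads the four large weight matrices of each of the first gpulayers transformer layers (clamped to hparams.n_layers via std::min). The per-tensor pattern is identical each time: tag the tensor's backend as GPU so ggml routes its matmuls through OpenCL, upload its data into a device buffer with ggml_cl_transform_tensor, and count its bytes toward the VRAM total. As a minimal sketch of that pattern, not part of the commit (the helper name is hypothetical; it assumes the same CLBlast-era ggml API used in the diff), the repeated pairs of lines could be factored as:

#if defined(GGML_USE_CLBLAST)
// Hypothetical helper illustrating the per-tensor offload pattern above.
static size_t offload_tensor_to_gpu(struct ggml_tensor * t) {
    t->backend = GGML_BACKEND_GPU;        // route this tensor's matmuls through OpenCL
    ggml_cl_transform_tensor(t->data, t); // copy the weights into a device buffer
    return ggml_nbytes(t);                // bytes now resident in VRAM, for the running total
}
#endif

Each layer's offload then reduces to four calls of the form vram_total += offload_tensor_to_gpu(layer.ffn_up_proj);.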