
Commit b6ff890

Merge branch 'LostRuins:concedo' into main
2 parents eb094f0 + e6ddb15 commit b6ff890

File tree

6 files changed: +109 −10 lines changed


gpttype_adapter.cpp

+2 −2
@@ -672,7 +672,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 {
     if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
     {
-        ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab, file_format);
+        ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab, file_format, inputs.gpulayers);
         if(res==ModelLoadResult::FAIL)
         {
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
@@ -734,7 +734,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     else if(file_format==FileFormat::MPT_1)
     {
-        bool res = mpt_model_load(params.model, mpt_ctx_v3, vocab);
+        bool res = mpt_model_load(params.model, mpt_ctx_v3, vocab, inputs.gpulayers);
         if(res==false)
         {
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
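For reference, the two call sites above line up with the loader signatures updated later in this commit; as they appear in the diffs below, the new declarations are:

    ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers);
    bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab, int gpulayers);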

otherarch/gpt2_v3.cpp

+23
@@ -345,6 +345,29 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 
     fin.close();
 
+    //gpu offload
+    #if defined(GGML_USE_CLBLAST)
+    if(gpulayers>0)
+    {
+        const auto & hparams = model.hparams;
+        size_t vram_total = 0;
+        const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
+        fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+            layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
+            ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+            ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+        }
+        fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+    #endif
+
     return ModelLoadResult::SUCCESS;
 }
 
otherarch/gptj_v3.cpp

+29 −2
@@ -15,7 +15,9 @@
 
 #include "model_adapter.h"
 
-
+#if defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
+#endif
 
 // load the model's weights from a file
 ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab, int gpulayers) {
@@ -331,7 +333,32 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 
     fin.close();
 
-
+    //gpu offload
+    #if defined(GGML_USE_CLBLAST)
+    if(gpulayers>0)
+    {
+        const auto & hparams = model.hparams;
+        size_t vram_total = 0;
+        const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
+        fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+            layer.c_attn_q_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_k_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_v_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
+            ggml_cl_transform_tensor(layer.c_attn_q_proj_w->data,layer.c_attn_q_proj_w); vram_total += ggml_nbytes(layer.c_attn_q_proj_w);
+            ggml_cl_transform_tensor(layer.c_attn_k_proj_w->data,layer.c_attn_k_proj_w); vram_total += ggml_nbytes(layer.c_attn_k_proj_w);
+            ggml_cl_transform_tensor(layer.c_attn_v_proj_w->data,layer.c_attn_v_proj_w); vram_total += ggml_nbytes(layer.c_attn_v_proj_w);
+            ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+        }
+        fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+    #endif
 
     return ModelLoadResult::SUCCESS;
 }
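The same [opencl] offload bookkeeping appears in gpt2_v3.cpp above and in the MPT and NeoX loaders below, varying only in which per-layer weight tensors are listed: clamp the requested layer count to the model's layer count, mark each weight as GPU-backed, hand it to ggml_cl_transform_tensor, and accumulate ggml_nbytes into a VRAM total. A rough, self-contained sketch of just that accounting logic, using stand-in types (fake_tensor, offload_layers are hypothetical names, not the real ggml API):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Stand-in for a ggml tensor; purely illustrative, not the real ggml struct.
    struct fake_tensor { size_t nbytes = 0; bool on_gpu = false; };

    // Mirrors the bookkeeping added in this commit: offload at most the number of
    // layers the model actually has, and report the total bytes moved in whole MB.
    static size_t offload_layers(std::vector<std::vector<fake_tensor*>> & layers, int gpulayers)
    {
        size_t vram_total = 0;
        const int n_gpu = std::min(gpulayers, int(layers.size()));
        std::fprintf(stderr, "[opencl] offloading %d layers to GPU\n", n_gpu);
        for (int i = 0; i < n_gpu; ++i) {
            for (fake_tensor * t : layers[i]) {
                t->on_gpu = true;          // stands in for tensor->backend = GGML_BACKEND_GPU
                vram_total += t->nbytes;   // stands in for ggml_cl_transform_tensor(...) + ggml_nbytes(...)
            }
        }
        std::fprintf(stderr, "[opencl] total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
        return vram_total;
    }

    int main() {
        // Two layers, each with one 512 MB weight tensor (hypothetical sizes).
        std::vector<std::vector<fake_tensor*>> layers(2);
        fake_tensor w0{512u * 1024 * 1024}, w1{512u * 1024 * 1024};
        layers[0].push_back(&w0);
        layers[1].push_back(&w1);
        offload_layers(layers, 4); // requesting more layers than exist clamps to 2
        return 0;
    }

Each real loader enumerates its own weights: the fused attention and MLP matrices for GPT-2 and NeoX, separate Q/K/V and projection matrices for GPT-J, and the fused QKV plus FFN up/down projections for MPT.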

otherarch/mpt_v3.cpp

+28 −3
@@ -15,10 +15,12 @@
 
 #include "model_adapter.h"
 
-
+#if defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
+#endif
 
 // load the model's weights from a file
-bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab) {
+bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab, int gpulayers) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     auto fin = std::ifstream(fname, std::ios::binary);
@@ -75,7 +77,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
         std::string word;
         std::vector<char> buf(128);
 
-        for (int i = 0; i < n_vocab; i++) {
+        for (int i = 0; i < n_vocab; i++) {
            uint32_t len;
            fin.read((char *) &len, sizeof(len));
 
@@ -278,6 +280,29 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
 
     fin.close();
 
+    //gpu offload
+    #if defined(GGML_USE_CLBLAST)
+    if(gpulayers>0)
+    {
+        const auto & hparams = model.hparams;
+        size_t vram_total = 0;
+        const int n_gpu = std::min(gpulayers, int(hparams.n_layers));
+        fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+            layer.ffn_up_proj->backend = GGML_BACKEND_GPU;
+            layer.ffn_down_proj->backend = GGML_BACKEND_GPU;
+            layer.c_attn_wqkv_weight->backend = GGML_BACKEND_GPU;
+            layer.c_attn_out_proj_weight->backend = GGML_BACKEND_GPU;
+            ggml_cl_transform_tensor(layer.ffn_up_proj->data,layer.ffn_up_proj); vram_total += ggml_nbytes(layer.ffn_up_proj);
+            ggml_cl_transform_tensor(layer.ffn_down_proj->data,layer.ffn_down_proj); vram_total += ggml_nbytes(layer.ffn_down_proj);
+            ggml_cl_transform_tensor(layer.c_attn_wqkv_weight->data,layer.c_attn_wqkv_weight); vram_total += ggml_nbytes(layer.c_attn_wqkv_weight);
+            ggml_cl_transform_tensor(layer.c_attn_out_proj_weight->data,layer.c_attn_out_proj_weight); vram_total += ggml_nbytes(layer.c_attn_out_proj_weight);
+        }
+        fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+    #endif
+
     return true;
 }
 
otherarch/neox_v3.cpp

+27 −2
@@ -13,10 +13,12 @@
 #include <vector>
 #include <iostream>
 
-
+#if defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
+#endif
 
 // load the model's weights from a file
-ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format) {
+ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     auto fin = std::ifstream(fname, std::ios::binary);
@@ -318,6 +320,29 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
 
     fin.close();
 
+    //gpu offload
+    #if defined(GGML_USE_CLBLAST)
+    if(gpulayers>0)
+    {
+        const auto & hparams = model.hparams;
+        size_t vram_total = 0;
+        const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
+        fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+            layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
+            ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+            ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+        }
+        fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+    #endif
+
     return ModelLoadResult::SUCCESS;
 }

otherarch/otherarch.h

-1
@@ -43,7 +43,6 @@ struct gptj_layer {
     struct ggml_tensor * c_mlp_fc_b;
 
     struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_w_trans; //for backwards compatibility
     struct ggml_tensor * c_mlp_proj_b;
 };
 struct gptj_layer_v2 {
