
Commit c8ae945: Merge 'origin/master' into hipblas
2 parents: c1e5c83 + 0be54f7

25 files changed: +2967, -534 lines

CMakeLists.txt (+9, -5)

@@ -76,6 +76,7 @@ option(LLAMA_HIPBLAS "llama: use hipBLAS"
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" OFF)
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)
+option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})

@@ -226,6 +227,14 @@ if (LLAMA_BLAS)
     endif()
 endif()

+if (LLAMA_K_QUANTS)
+    set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
+    add_compile_definitions(GGML_USE_K_QUANTS)
+    if (LLAMA_QKK_64)
+        add_compile_definitions(GGML_QKK_64)
+    endif()
+endif()
+
 if (LLAMA_CUBLAS)
     cmake_minimum_required(VERSION 3.17)

@@ -290,11 +299,6 @@ if (LLAMA_METAL)
     )
 endif()

-if (LLAMA_K_QUANTS)
-    set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
-    add_compile_definitions(GGML_USE_K_QUANTS)
-endif()
-
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
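The hunks above only wire the new LLAMA_QKK_64 option up to a GGML_QKK_64 compile definition; how that define is consumed lives in k_quants.h, which is not part of this diff. As a hedged sketch (an assumption based on the option's description, not code from this commit), the define would typically switch the k-quants super-block size constant like this:

```cpp
// Sketch only, not part of this commit's diff. Assumption: the k-quants code
// keys its super-block size QK_K off the GGML_QKK_64 define enabled above.
#ifdef GGML_QKK_64
#define QK_K 64    // smaller super-blocks, e.g. for models whose row sizes are not multiples of 256
#else
#define QK_K 256   // default k-quants super-block size
#endif
```

Either build system enables it the same way: `-DLLAMA_QKK_64=ON` with CMake, or `LLAMA_QKK_64=1` with the Makefile (see the Makefile hunk below).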

Makefile (+8, -1)

@@ -43,8 +43,11 @@ endif

 # keep standard at C11 and C++11
 # -Ofast tends to produce faster code, but may not be available for some compilers.
-#OPT = -Ofast
+ifdef LLAMA_FAST
+OPT = -Ofast
+else
 OPT = -O3
+endif
 CFLAGS = -I. $(OPT) -std=c11 -fPIC
 CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
 LDFLAGS =

@@ -131,6 +134,10 @@ ifndef LLAMA_NO_K_QUANTS
 CFLAGS += -DGGML_USE_K_QUANTS
 CXXFLAGS += -DGGML_USE_K_QUANTS
 OBJS += k_quants.o
+ifdef LLAMA_QKK_64
+    CFLAGS += -DGGML_QKK_64
+    CXXFLAGS += -DGGML_QKK_64
+endif
 endif

 ifndef LLAMA_NO_ACCELERATE

README.md (+4)

@@ -11,6 +11,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 **Hot topics:**

+- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
 - New roadmap: https://github.com/users/ggerganov/projects/7
 - Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
 - p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1

@@ -92,6 +93,7 @@ as the main playground for developing new features for the [ggml](https://github
 - Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
+- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)

 **UI:**

@@ -686,6 +688,8 @@ GGML_OPENCL_DEVICE=0
 export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
 ```

+(Note: some Android devices, like the Zenfone 8, need the following command instead - "export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH". Source: https://www.reddit.com/r/termux/comments/kc3ynp/opencl_working_in_termux_more_in_comments/ )
+
 For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.

 Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.

examples/baby-llama/baby-llama.cpp (+6, -6)

@@ -566,8 +566,8 @@ struct ggml_tensor * forward(
         // wk shape [n_embd, n_embd, 1, 1]
         // Qcur shape [n_embd/n_head, n_head, N, 1]
         // Kcur shape [n_embd/n_head, n_head, N, 1]
-        struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
-        struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+        struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+        struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);

         // store key and value to memory
         {

@@ -823,8 +823,8 @@ struct ggml_tensor * forward_batch(
         // wk shape [n_embd, n_embd, 1, 1]
         // Qcur shape [n_embd/n_head, n_head, N, n_batch]
         // Kcur shape [n_embd/n_head, n_head, N, n_batch]
-        struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
-        struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
+        struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
+        struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
         assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
         assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);

@@ -1116,7 +1116,7 @@ struct ggml_tensor * forward_lora(
                                         model->layers[il].wqb,
                                         cur)),
                                 n_embd/n_head, n_head, N),
-                        n_past, n_rot, 0);
+                        n_past, n_rot, 0, 0);
         struct ggml_tensor * Kcur = ggml_rope(ctx0,
                         ggml_reshape_3d(ctx0,
                             ggml_mul_mat(ctx0,

@@ -1125,7 +1125,7 @@ struct ggml_tensor * forward_lora(
                                         model->layers[il].wkb,
                                         cur)),
                                 n_embd/n_head, n_head, N),
-                        n_past, n_rot, 0);
+                        n_past, n_rot, 0, 0);

         // store key and value to memory
         {

examples/common.cpp (+5)

@@ -343,6 +343,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         params.use_mmap = false;
     } else if (arg == "--mtest") {
         params.mem_test = true;
+    } else if (arg == "--numa") {
+        params.numa = true;
     } else if (arg == "--export") {
         params.export_cgraph = true;
     } else if (arg == "--verbose-prompt") {

@@ -488,6 +490,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_mmap_supported()) {
         fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+    fprintf(stderr, "  --numa                attempt optimizations that help on some NUMA systems\n");
+    fprintf(stderr, "                        if run without this previously, it is recommended to drop the system page cache before using this\n");
+    fprintf(stderr, "                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, "                        number of layers to store in VRAM\n");

examples/common.h (+1)

@@ -76,6 +76,7 @@ struct gpt_params {
     bool use_mmap        = true;  // use mmap for faster loads
     bool use_mlock       = false; // use mlock to keep model in memory
     bool mem_test        = false; // compute maximum memory usage
+    bool numa            = false; // attempt optimizations that help on some NUMA systems
     bool export_cgraph   = false; // export the computation graph
     bool verbose_prompt  = false; // print prompt tokens before generation
 };

examples/embedding/embedding.cpp (+1, -1)

@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_init_backend();
+    llama_init_backend(params.numa);

     llama_model * model;
     llama_context * ctx;
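This and the remaining example programs all pick up the same API change: `llama_init_backend()` now takes a boolean that requests NUMA optimizations (`quantize` passes `false`, the others pass the new `params.numa`). A minimal caller sketch, assuming nothing beyond what these call sites show:

```cpp
// Minimal sketch of the updated initialization call, based only on the call
// sites in this diff: llama_init_backend() now takes a NUMA flag.
#include "llama.h"

int main() {
    bool numa = false;          // mirrors the new gpt_params::numa / --numa flag
    llama_init_backend(numa);   // was: llama_init_backend();

    // ... load the model, create a context, run inference ...
    return 0;
}
```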

examples/main/README.md (+4)

@@ -262,6 +262,10 @@ These options help improve the performance and memory usage of the LLaMA models.

 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.

+### NUMA support
+
+- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root.
+
 ### Memory Float 32

 - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
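The `--numa` description in the hunk above names two mechanisms: pinning an equal share of the threads to the cores of each NUMA node, and disabling prefetch/readahead on the memory-mapped model. The commit's actual implementation lives in ggml/llama and is not shown in this diff; the snippet below is only an illustrative sketch of those two mechanisms using standard Linux/POSIX calls, with hypothetical helper names:

```cpp
// Illustrative sketch only -- not the code added by this commit.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE         // for pthread_setaffinity_np / CPU_SET on glibc
#endif
#include <pthread.h>
#include <sched.h>
#include <sys/mman.h>

// Ask the kernel not to read ahead on the mapped model file, so pages are
// faulted in on first access by whichever thread touches them; first-touch
// placement then keeps them on that thread's NUMA node.
static void disable_readahead(void * addr, size_t len) {
    (void) madvise(addr, len, MADV_RANDOM);
}

// Pin the calling thread to one logical CPU. Spreading worker threads evenly
// over the CPUs of each NUMA node approximates the behaviour described above.
static int pin_to_cpu(int cpu) {
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(cpu, &set);
    return pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
}
```

As the README text notes, none of this helps if the model is already resident in the page cache from an earlier run, which is why dropping the cache first is recommended.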

examples/main/main.cpp (+1, -1)

@@ -105,7 +105,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_init_backend();
+    llama_init_backend(params.numa);

     llama_model * model;
     llama_context * ctx;

examples/perplexity/perplexity.cpp (+1, -1)

@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_init_backend();
+    llama_init_backend(params.numa);

     llama_model * model;
     llama_context * ctx;

examples/quantize/quantize.cpp (+1, -1)

@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
         usage(argv[0]);
     }

-    llama_init_backend();
+    llama_init_backend(false);

     // parse command line arguments
     const std::string fname_inp = argv[arg_idx];

examples/server/server.cpp (+1, -1)

@@ -789,7 +789,7 @@ int main(int argc, char ** argv) {
         params.model_alias = params.model;
     }

-    llama_init_backend();
+    llama_init_backend(params.numa);

     LOG_INFO("build info", {
         { "build", BUILD_NUMBER },

examples/simple/simple.cpp (+1, -1)

@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
     // Init LLM :
     //---------------------------------

-    llama_init_backend();
+    llama_init_backend(params.numa);

     llama_model * model;
     llama_context * ctx;
