
Commit 66a1bb4

add gpu index opts and update doc commands (ggml-org#2)

1 parent fe3bc49
7 files changed: +42 -40 lines


README.md (+28 -25)
@@ -50,7 +50,6 @@ The SparseLLM Team is currently converting the Mistral-7B model to a sparser ver
 
 - [Installation](##setup--installation)
 - [Model Weights](##model-weights)
-- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
 
 ## Setup & Installation
 ### Get the Code
@@ -60,28 +59,24 @@ git clone https://github.com/hodlen/PowerInfer
 cd PowerInfer
 ```
 ### Build
-In order to build PowerInfer you have two different options.
-
-- Using `make`:
-- On Linux or MacOS:
-```bash
-make
-```
-- Using `CMake`:
-- If you have one GPU:
-```bash
-mkdir build
-cd build
-cmake .. -DLLAMA_CUBLAS=ON
-cmake --build . --config Release
-```
-- If you just CPU:
-```bash
-mkdir build
-cd build
-cmake ..
-cmake --build . --config Release
-```
+In order to build PowerInfer you have two different options. These commands are supposed to be run from the root directory of the project.
+
+Using `make` on Linux or MacOS:
+```bash
+make
+```
+
+Using `CMake`:
+* If you have one GPU:
+```bash
+cmake -S . -B build -DLLAMA_CUBLAS=ON
+cmake --build build --config Release
+```
+* If you just CPU:
+```bash
+cmake -S . -B build
+cmake --build build --config Release
+```
 
 ## Model Weights
 
@@ -96,11 +91,19 @@ In order to build PowerInfer you have two different options.
 ```bash
 ./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
 ```
-- If you have CPU with one consumer grade GPU:
+- If you have CPU with one GPU:
 ```bash
-./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
+./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
 ```
 
+As for now, it requires a offline-generated "GPU index" file to split FFNs on GPU. If you want to try it, please use the following instruction to generate the GPU index file:
+```bash
+python scripts/export-gpu-split.py $(activation_count_path) $(output_idx_path) solver
+```
+Then, you can use the following instruction to run PowerInfer with GPU index:
+```bash
+./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt) --gpu-index $(split_path)
+```
 
 ## Evaluation
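The new README text maps directly onto the code changes below: `--gpu-index` stores a path in `gpt_params`, and for a sparse-inference model that file is applied right after the model is loaded. A minimal sketch of that post-load step (the helper name `maybe_apply_gpu_index` is hypothetical; the llama API names are the ones touched by this commit):

```cpp
#include "llama.h"

// Sketch: what --gpu-index triggers after the model is loaded.
static int maybe_apply_gpu_index(struct llama_model * model, const char * gpu_index_path) {
    // Only PowerInfer sparse-inference models use a GPU index; dense models skip it.
    if (!llama_use_sparse_inference(model) || gpu_index_path == nullptr || gpu_index_path[0] == '\0') {
        return 0; // nothing to apply
    }
    // use_mmap = true, matching the call sites in this commit;
    // a non-zero return means the index file could not be applied.
    return llama_model_apply_gpu_idx_from_file(model, gpu_index_path, /*use_mmap=*/true);
}
```

Call sites in this commit that cannot continue without the index (for example the batched example) log the failure and free the model.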

common/common.cpp (+5 -6)
@@ -471,12 +471,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.lora_base = argv[i];
-        } else if (arg == "--mlp-adapter") {
+        } else if (arg == "--gpu-index") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.mlp_adapter = argv[i];
+            params.gpu_index = argv[i];
         } else if (arg == "--mmproj") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -970,9 +970,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 
     if (llama_use_sparse_inference(model)) {
         fprintf(stderr, "%s: postprocessing PowerInfer model '%s'\n", __func__, params.model.c_str());
-        if (!params.mlp_adapter.empty()) {
-            fprintf(stderr, "%s: warning: --mlp-adapter is deprecated and has no effect\n", __func__);
-            int err = llama_model_apply_mlp_from_file(model, params.mlp_adapter.c_str(), true);
+        if (!params.gpu_index.empty()) {
+            int err = llama_model_apply_gpu_idx_from_file(model, params.gpu_index.c_str(), true);
             if (err != 0) {
                 fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__);
                 llama_free_model(model);
@@ -1358,7 +1357,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
         fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
-    fprintf(stream, "mlp_adapter: %s\n", params.mlp_adapter.c_str());
+    fprintf(stream, "gpu_index: %s\n", params.gpu_index.c_str());
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
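For readers unfamiliar with this parser, the `--gpu-index` branch follows the file's usual pattern for value-taking flags: advance the argument index, bail out if no value follows, then store `argv[i]`. A self-contained sketch of that pattern (a hypothetical standalone program, not the project's parser):

```cpp
#include <cstdio>
#include <cstring>
#include <string>

int main(int argc, char ** argv) {
    std::string gpu_index; // path to the offline-generated GPU index file

    for (int i = 1; i < argc; i++) {
        if (std::strcmp(argv[i], "--gpu-index") == 0) {
            // value-taking flag: make sure a value actually follows it
            if (++i >= argc) {
                fprintf(stderr, "error: --gpu-index requires a path argument\n");
                return 1;
            }
            gpu_index = argv[i];
        }
    }

    // analogous to the gpu_index line added to the YAML dump in this commit
    printf("gpu_index: %s\n", gpu_index.c_str());
    return 0;
}
```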

common/common.h (+1 -1)
@@ -91,7 +91,7 @@ struct gpt_params {
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
     std::string lora_base = ""; // base model path for the lora adapter
 
-    std::string mlp_adapter = ""; // sparse activation mlp adapter path
+    std::string gpu_index = ""; // sparse activation mlp adapter path
 
     int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line

examples/batched/batched.cpp (+5 -5)
@@ -49,11 +49,11 @@ int main(int argc, char ** argv) {
     }
 
     if (argc >= 8) {
-        params.mlp_adapter = argv[7];
+        params.gpu_index = argv[7];
     }
 
-    printf("params: model = %s, prompt = %s, n_parallel = %d, n_len = %d, n_gpu_layers = %d, n_threads = %d, mlp_adapter = %s\n",
-        params.model.c_str(), params.prompt.c_str(), n_parallel, n_len, n_gpu_layers, params.n_threads, params.mlp_adapter.c_str());
+    printf("params: model = %s, prompt = %s, n_parallel = %d, n_len = %d, n_gpu_layers = %d, n_threads = %d, gpu_index = %s\n",
+        params.model.c_str(), params.prompt.c_str(), n_parallel, n_len, n_gpu_layers, params.n_threads, params.gpu_index.c_str());
 
     if (params.prompt.empty()) {
         params.prompt = "Hello my name is";
@@ -76,8 +76,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    if (!params.mlp_adapter.empty()) {
-        int err = llama_model_apply_mlp_from_file(model, params.mlp_adapter.c_str(), true);
+    if (!params.gpu_index.empty()) {
+        int err = llama_model_apply_gpu_idx_from_file(model, params.gpu_index.c_str(), true);
         if (err != 0) {
             fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__);
             llama_free_model(model);
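Putting the pieces together, the batched example now reads the GPU index path as an optional trailing argument and gives up (freeing the model) if it cannot be applied. A compact end-to-end sketch of that flow, assuming the llama.cpp loading API present in this codebase (`llama_backend_init(bool)`, `llama_model_default_params`, `llama_load_model_from_file`); paths and argument order are illustrative only:

```cpp
#include <cstdio>
#include <string>

#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s MODEL [GPU_INDEX]\n", argv[0]);
        return 1;
    }
    const std::string model_path = argv[1];
    const std::string gpu_index  = argc >= 3 ? argv[2] : "";

    llama_backend_init(false /* numa */);

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file(model_path.c_str(), mparams);
    if (model == nullptr) {
        fprintf(stderr, "error: failed to load model '%s'\n", model_path.c_str());
        return 1;
    }

    // Apply the offline-generated GPU index, as the batched example does,
    // and free the model if that fails.
    if (!gpu_index.empty()) {
        const int err = llama_model_apply_gpu_idx_from_file(model, gpu_index.c_str(), true);
        if (err != 0) {
            fprintf(stderr, "error: failed to apply GPU index '%s'\n", gpu_index.c_str());
            llama_free_model(model);
            return 1;
        }
    }

    // ... create a context and run inference here ...

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```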

llama.cpp (+1 -1)
@@ -9660,7 +9660,7 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
     }
 }
 
-int llama_model_apply_mlp_from_file(struct llama_model * model, const char * path_mlp, bool use_mmap) {
+int llama_model_apply_gpu_idx_from_file(struct llama_model * model, const char * path_mlp, bool use_mmap) {
     llama_mlp_model_loader * mlp_ml = new llama_mlp_model_loader(path_mlp, use_mmap);
     if (mlp_ml -> apply_tensors_to_base_model(model) > 0) {
         LLAMA_LOG_ERROR("%s: failed to apply mlp adapter\n", __func__);

llama.h (+1 -1)
@@ -342,7 +342,7 @@ extern "C" {
             const char * path_base_model,
             int n_threads);
 
-    LLAMA_API int llama_model_apply_mlp_from_file(
+    LLAMA_API int llama_model_apply_gpu_idx_from_file(
             struct llama_model * model,
             const char * path_mlp,
             bool use_mmap);

scripts/export-gpu-split.py (+1 -1)
@@ -134,7 +134,7 @@ def main(predictors_path: str, output_path: str, solver_path: str):
     parser.add_argument(
         "output_path",
         help="path to the output GGML adapter",
-        default="./ggml-mlp-adapters.bin",
+        default="./gpu-index.bin",
     )
     parser.add_argument("solver", help="path to the solver")
 