
Commit bb486b8

Online GPU slicing (ggml-org#11)
* move gpu slicing python code into a module
* remove dead code in exporting gpu split
* streamline solver and export with one entrypoint
* new powerinfer.py module
* wip: invoke Python to generate gpu split on the fly
* wip: load gpu split on demand
* wip: new gpu split file format
* wip: generate and load new gpu idx format
* wip: generate and load gpu index on the fly
* minor: calculate total VRAM offloading via FFN splitting
* add option to disable gpu index
* bugfix
* wip: bug fix for segment fault
* bugfix
* bugfix and testing
* temporary fix for neuron factor in solving
* fix: generated gpu idx path
* Update README about gpu index

1 parent ded0613 commit bb486b8

File tree: 16 files changed (+417 / -479 lines)

README.md (+17 -6)

@@ -71,6 +71,7 @@ And new features coming soon:
 ```bash
 git clone https://github.com/SJTU-IPADS/PowerInfer
 cd PowerInfer
+pip install -r requirements.txt # install Python helpers' dependencies
 ```
 ### Build
 In order to build PowerInfer you have two different options. These commands are supposed to be run from the root directory of the project.
@@ -89,7 +90,8 @@ cmake --build build --config Release

 ## Model Weights

-PowerInfer models are stored in a special format called *PowerInfer GGUF* based on GGUF format, consisting of both LLM weights and predictor weights. You can download PowerInfer GGUF weights from Hugging Face or convert them from the original model weights and predictor weights.
+PowerInfer models are stored in a special format called *PowerInfer GGUF* based on GGUF format, consisting of both LLM weights and predictor weights.
+You can obtain PowerInfer GGUF weights at `*.powerinfer.gguf` as well as profiled model activation statistics under `activation/` for 'hot'-neuron offloading from each Hugging Face model repo under "PowerInfer GGUF Format" column. You can also convert them from the original model weights and predictor weights.

 | Base Model | PowerInfer GGUF Format | Original Model | Predictor |
 |------------|------------------|----------------|---------------------|
@@ -102,14 +104,16 @@ PowerInfer models are stored in a special format called *PowerInfer GGUF* based

 For CPU-only and CPU-GPU hybrid inference with all available VRAM, you can use the following instructions to run PowerInfer:
 ```bash
-./build/bin/main -m /PATH/TO/MODEL -n $output_token_count -t $thread_num -p $prompt
+./build/bin/main -m /PATH/TO/MODEL -n $output_token_count -t $thread_num -p $prompt
+# ./build/bin/main -m ./ReluFalcon-40B-PowerInfer-GGUF/falcon-40b-relu.q4.powerinfer.gguf -n 128 -t 8 -p "Once upon a time"
 ```
+
 If you want to limit the VRAM usage of GPU:
 ```bash
-./build/bin/main -m /PATH/TO/MODEL -n $output_token_count -t $thread_num -p $prompt --vram-budget $vram_gb
+./build/bin/main -m /PATH/TO/MODEL -n $output_token_count -t $thread_num -p $prompt --vram-budget $vram_gb
+# ./build/bin/main -m ./ReluLLaMA-7B-PowerInfer-GGUF/llama-7b-relu.powerinfer.gguf -n 128 -t 8 -p "Once upon a time" --vram-budget 8
 ```
-
-As for now, it requires an offline-generated "GPU index" file to split FFNs on GPU. And we found these files are hard to maintain and distribute. We will ship automatic FFN split based on VRAM capacity via [#11](https://github.com/SJTU-IPADS/PowerInfer/pull/11) very soon.
+Under CPU-GPU hybrid inference, PowerInfer will automatically offload all dense activation blocks to GPU and split FFN on GPU if possible.

 ## Evaluation

@@ -119,6 +123,13 @@ As for now, it requires an offline-generated "GPU index" file to split FFNs on G

 PowerInfer achieves up to 11x and 8x speedup for FP16 and INT4 models!

+## FAQs
+1. What if I encountered `CUDA_ERROR_OUT_OF_MEMORY`?
+   - You can try to run with `--reset-gpu-index` argument to rebuild GPU index for this model to avoid any stale cache.
+   - Due to our current implementation, model offloading might not be accurate as expected. You can try with `--vram-budget` with a slightly lower value or `--disable-gpu-index` to disable FFN offloading.
+2. What if...
+   - Issues are welcomed! Please feel free to open an issue and attach your running environment and running parameters. We will try our best to help you.
+
 ## TODOs
 We will release the code and data in the following order, please stay tuned!

@@ -130,7 +141,7 @@ We will release the code and data in the following order, please stay tuned!
 - [ ] Support Metal for Mac
 - [ ] Release code for OPT models
 - [ ] Release predictor training code
-- [ ] Support online split for FFN network
+- [x] Support online split for FFN network
 - [ ] Support Multi-GPU
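The updated README points to per-model Hugging Face repos that ship both the `*.powerinfer.gguf` weights and the profiled activation statistics under `activation/`. As a hedged illustration of that download step (not part of this commit), the sketch below uses `huggingface_hub.snapshot_download`; the repo id and local directory are assumed names for the example.

```python
# Sketch: fetch a PowerInfer GGUF model together with its activation statistics
# from a Hugging Face repo. Repo id and local path are illustrative assumptions.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="PowerInfer/ReluLLaMA-7B-PowerInfer-GGUF",     # assumed repo name
    allow_patterns=["*.powerinfer.gguf", "activation/*"],  # weights + 'hot'-neuron stats
    local_dir="./ReluLLaMA-7B-PowerInfer-GGUF",
)
print("downloaded to", local_dir)
```

The downloaded `*.powerinfer.gguf` file is what the `-m` flag in the commands above expects.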

common/common.cpp (+8 -25)

@@ -471,12 +471,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.lora_base = argv[i];
-        } else if (arg == "--gpu-index") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.gpu_index = argv[i];
+        } else if (arg == "--reset-gpu-index") {
+            params.reset_gpu_index = true;
+        } else if (arg == "--disable-gpu-index") {
+            params.disale_gpu_index = true;
         } else if (arg == "--mmproj") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -910,6 +908,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
+    mparams.reset_gpu_index = params.reset_gpu_index;
+    mparams.disable_gpu_index = params.disale_gpu_index;

     return mparams;
 }
@@ -968,24 +968,6 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         return std::make_tuple(nullptr, nullptr);
     }

-    if (llama_use_sparse_inference(model)) {
-        fprintf(stderr, "%s: postprocessing PowerInfer model '%s'\n", __func__, params.model.c_str());
-        if (!params.gpu_index.empty()) {
-            int err = llama_model_apply_gpu_idx_from_file(model, params.gpu_index.c_str(), true);
-            if (err != 0) {
-                fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__);
-                llama_free_model(model);
-                return std::make_tuple(nullptr, nullptr);
-            }
-        }
-
-        if (llama_model_apply_augmentation(model) != 0) {
-            fprintf(stderr, "%s: error: failed to apply augmentation\n", __func__);
-            llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
-        }
-    }
-
     auto cparams = llama_context_params_from_gpt_params(params);
     llama_context * lctx = llama_new_context_with_model(model, cparams);
     if (lctx == NULL) {
@@ -1357,7 +1339,8 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
         fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
-    fprintf(stream, "gpu_index: %s\n", params.gpu_index.c_str());
+    fprintf(stream, "reset_gpu_index: %s\n", params.reset_gpu_index ? "true" : "false");
+    fprintf(stream, "disable_gpu_index: %s\n", params.disale_gpu_index? "true": "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);

common/common.h (+2 -1)

@@ -91,7 +91,8 @@ struct gpt_params {
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
     std::string lora_base = ""; // base model path for the lora adapter

-    std::string gpu_index = ""; // sparse activation mlp adapter path
+    bool reset_gpu_index = false; // refresh the gpu index file
+    bool disale_gpu_index = false; // disable loading gpu index and splitting ffn

     int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line

examples/batched/batched.cpp (+4 -20)

@@ -48,12 +48,11 @@ int main(int argc, char ** argv) {
         params.n_threads = std::atoi(argv[6]);
     }

-    if (argc >= 8) {
-        params.gpu_index = argv[7];
-    }
+    // For testing purposes, we always reset the GPU index
+    params.reset_gpu_index = true;

-    printf("params: model = %s, prompt = %s, n_parallel = %d, n_len = %d, n_gpu_layers = %d, n_threads = %d, gpu_index = %s\n",
-        params.model.c_str(), params.prompt.c_str(), n_parallel, n_len, n_gpu_layers, params.n_threads, params.gpu_index.c_str());
+    printf("params: model = %s, prompt = %s, n_parallel = %d, n_len = %d, n_gpu_layers = %d, n_threads = %d, reset_gpu_index = true\n",
+        params.model.c_str(), params.prompt.c_str(), n_parallel, n_len, n_gpu_layers, params.n_threads);

     if (params.prompt.empty()) {
         params.prompt = "Hello my name is";
@@ -76,21 +75,6 @@ int main(int argc, char ** argv) {
         return 1;
     }

-    if (!params.gpu_index.empty()) {
-        int err = llama_model_apply_gpu_idx_from_file(model, params.gpu_index.c_str(), true);
-        if (err != 0) {
-            fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__);
-            llama_free_model(model);
-            return 1;
-        }
-    }
-
-    if (llama_model_apply_augmentation(model) != 0) {
-        fprintf(stderr, "%s: error: failed to apply model augmentation\n", __func__);
-        llama_free_model(model);
-        return 1;
-    }
-
     // tokenize the prompt

     std::vector<llama_token> tokens_list;

ggml.c (+4 -4)

@@ -17497,7 +17497,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     }

     const int n_threads = cplan->n_threads;
-#ifdef LLAMA_CUBLAS
+#ifdef GGML_USE_CUBLAS
     struct ggml_compute_state_shared state_shared = {
         /*.cgraph =*/ cgraph,
         /*.cgraph_plan =*/ cplan,
@@ -17534,7 +17534,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
             .ith = j,
             .shared = &state_shared,
         };
-#ifdef LLAMA_CUBLAS
+#ifdef GGML_USE_CUBLAS
         const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread_hybrid, &workers[j]);
 #else
         const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
@@ -17551,7 +17551,8 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     const int64_t perf_start_time_us = ggml_perf_time_us();

     // this is a work thread too
-#ifdef LLAMA_CUBLAS
+
+#ifdef GGML_USE_CUBLAS
     int compute_status = (size_t) ggml_graph_compute_thread_hybrid(&workers[0]);
 #else
     int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]);
@@ -19590,7 +19591,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         sparse_deriv = GGML_DENSE_INFERENCE;
     } else if (strncmp(magic, GGUF_POWERINFER_MAGIC, sizeof(magic)) == 0) {
         sparse_deriv = GGML_SPARSE_INFERENCE;
-        fprintf(stderr, "%s: PowerInfer derived model detected. Sparse inference will be used.\n", __func__);
     } else {
         fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
         fclose(file);

gguf-py/gguf/constants.py (+6)

@@ -74,6 +74,9 @@ class Tokenizer:
     class PowerInfer:
         SPARSE_THRESHOLD = "powerinfer.sparse_threshold"

+    class Split:
+        VRAM_CAPACITY = "split.vram_capacity"
+

 #
 # recommended mapping of model tensor names for storage in gguf
@@ -385,6 +388,9 @@ class GGMLQuantizationType(IntEnum):
     Q5_K = 13
     Q6_K = 14
     Q8_K = 15
+    I8 = 16,
+    I16 = 17
+    I32 = 18,


 class GGUFEndian(IntEnum):
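The new `Split.VRAM_CAPACITY` key gives the online solver a place to record the VRAM capacity that a generated GPU split was planned against. Below is a minimal sketch, assuming the gguf-py `GGUFWriter` API, of how such a key-value pair could be written; the output filename, architecture string, and 8 GiB figure are illustrative assumptions, not part of this commit.

```python
# Sketch: store a planned VRAM capacity under the new "split.vram_capacity" key.
# File name, architecture string, and budget value are illustrative only.
import gguf

VRAM_BUDGET_BYTES = 8 * 1024**3  # assumed 8 GiB budget

writer = gguf.GGUFWriter("gpu-split.example.gguf", "llama")
writer.add_uint64("split.vram_capacity", VRAM_BUDGET_BYTES)  # key added above as Split.VRAM_CAPACITY

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
```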
