
Commit c73def1

Merge 'origin/master' into hipblas

2 parents d8ea75e + f0d70f1
File tree: 11 files changed, +145 -91 lines


.gitignore
+1 -1

@@ -28,7 +28,7 @@ models/*
 /result
 /perplexity
 /embedding
-/benchmark-q4_0-matmult
+/benchmark-matmult
 /vdot
 /Pipfile
 

CMakeLists.txt
+1 -1

@@ -367,7 +367,7 @@ endif()
 add_library(llama
             llama.cpp
             llama.h
-            llama_util.h)
+            llama-util.h)

 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump

Makefile
+12 -7

@@ -34,10 +34,15 @@ endif
 #

 # keep standard at C11 and C++11
-CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
+CFLAGS   = -I.              -O3 -std=c11   -fPIC
+CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
 LDFLAGS  =

+ifndef LLAMA_DEBUG
+CFLAGS   += -DNDEBUG
+CXXFLAGS += -DNDEBUG
+endif
+
 # warnings
 CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
 CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar

@@ -181,14 +186,14 @@ $(info )
 ggml.o: ggml.c ggml.h ggml-cuda.h
 	$(CC) $(CFLAGS) -c $< -o $@

-llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama_util.h
+llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
+	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult

 main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

@@ -218,9 +223,9 @@ libllama.so: llama.o ggml.o $(OBJS)
 # Tests
 #

-benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
-	./benchmark-q4_0-matmult
+benchmark-matmult: examples/benchmark/benchmark-matmult.cpp ggml.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+	./$@

 .PHONY: tests
 tests:
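
Note on the new debug switch: -DNDEBUG is now appended only when LLAMA_DEBUG is undefined, so assert() calls stay active in a debug build. Since only ifndef is checked, presumably any definition works — for example, invoking make as LLAMA_DEBUG=1 make main (a usage sketch, not something the Makefile documents) builds with optimizations but without NDEBUG.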

examples/CMakeLists.txt
+1 -0

@@ -35,4 +35,5 @@ else()
     add_subdirectory(perplexity)
     add_subdirectory(embedding)
     add_subdirectory(save-load-state)
+    add_subdirectory(benchmark)
 endif()

examples/benchmark/CMakeLists.txt
+4 -0

@@ -0,0 +1,4 @@
+set(TARGET benchmark)
+add_executable(${TARGET} benchmark-matmult.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
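
With the example registered in both build systems, it should be reachable either through the Makefile target above (make benchmark-matmult, whose recipe also runs the resulting binary) or through CMake as a target named benchmark; the exact CMake invocation depends on the local build setup and is given here only as a usage sketch.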

examples/benchmark/benchmark-q4_0-matmult.c renamed to examples/benchmark/benchmark-matmult.cpp
+10 -20

@@ -1,11 +1,3 @@
-/*
-    License: MIT License
-
-    Changelog:
-    - 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel)
-
-*/
-
 #include <locale.h>
 #include "ggml.h"
 #include <assert.h>

@@ -45,7 +37,7 @@ float tensor_sum_elements(struct ggml_tensor * tensor) {

 #define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"

-#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
+#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5ld x %5ld x %5ld, nb = (%5li, %5li, %5li) - ", #TENSOR, \
         TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
         TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
         { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }

@@ -98,12 +90,9 @@ int main(int argc, char ** argv) {
         }
     }

-
     // create the ggml context
     printf("Starting Test\n");

-
-
     struct ggml_context * ctx;
     //const int sizex = 4096;
     //const int sizey = 11008;

@@ -125,16 +114,18 @@ int main(int argc, char ** argv) {
 #endif

     //printf("Memsize required = %i\n", sizex*sizex);
-    ggml_type wtype = GGML_TYPE_F32;

     size_t ctx_size = 0;
-    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
-    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizeof(float);
-    ctx_size += 1024*1024*100;
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
+    ctx_size += 1024*1024*16;

-    printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024));
+    printf("Allocating Memory of size %li bytes, %li MB\n",ctx_size, (ctx_size/1024/1024));

     struct ggml_init_params params = {
         /*.mem_size =*/ ctx_size,

@@ -217,7 +208,7 @@ int main(int argc, char ** argv) {
     const int dimz = sizez;
     long long int flops_per_dot_product = dimy + dimy;
     long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
-    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
+    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);


     // Let's use the F32 result from above as a reference for the q4_0 multiplication

@@ -234,7 +225,6 @@ int main(int argc, char ** argv) {
         ggml_graph_compute(ctx, &gf31);
         long long int stop = ggml_time_us();
         long long int usec = stop-start;
-        float sec = usec/1000000;
         float flops_per_usec = (1.0f*flops_per_matrix)/usec;
         printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
             i,
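
Two details in the rename are easy to miss: the TENSOR_DUMP format string switches from %5d to %5ld, presumably because the ne dimensions being printed are 64-bit integers in current ggml, and the context-size estimate is rebuilt from per-tensor terms (the F32 operands and result, the Q4_0 copies, and two BLAS staging buffers) with a 16 MB slack allowance in place of the earlier flat 100 MB margin.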
examples/save-load-state/save-load-state.cpp
+49 -31

@@ -1,12 +1,9 @@
-#include <vector>
-#include <cstdio>
-#include <chrono>
-
 #include "common.h"
 #include "llama.h"
-#include "llama.cpp"

-using namespace std;
+#include <vector>
+#include <cstdio>
+#include <chrono>

 int main(int argc, char ** argv) {
     gpt_params params;

@@ -20,21 +17,25 @@ int main(int argc, char ** argv) {
         return 1;
     }

+    if (params.n_predict < 0) {
+        params.n_predict = 16;
+    }
+
     auto lparams = llama_context_default_params();

-    lparams.n_ctx = params.n_ctx;
-    lparams.n_parts = params.n_parts;
-    lparams.seed = params.seed;
-    lparams.f16_kv = params.memory_f16;
-    lparams.use_mmap = params.use_mmap;
-    lparams.use_mlock = params.use_mlock;
+    lparams.n_ctx     = params.n_ctx;
+    lparams.n_parts   = params.n_parts;
+    lparams.seed      = params.seed;
+    lparams.f16_kv    = params.memory_f16;
+    lparams.use_mmap  = params.use_mmap;
+    lparams.use_mlock = params.use_mlock;

     auto n_past = 0;
-    auto last_n_tokens_data = vector<llama_token>(params.repeat_last_n, 0);
+    auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);

     // init
     auto ctx = llama_init_from_file(params.model.c_str(), lparams);
-    auto tokens = vector<llama_token>(params.n_ctx);
+    auto tokens = std::vector<llama_token>(params.n_ctx);
     auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);

     if (n_prompt_tokens < 1) {

@@ -43,26 +44,29 @@ int main(int argc, char ** argv) {
     }

     // evaluate prompt
-
     llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);

     last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
     n_past += n_prompt_tokens;

+    const size_t state_size = llama_get_state_size(ctx);
+    uint8_t * state_mem = new uint8_t[state_size];
+
     // Save state (rng, logits, embedding and kv_cache) to file
-    FILE *fp_write = fopen("dump_state.bin", "wb");
-    auto state_size = llama_get_state_size(ctx);
-    auto state_mem = new uint8_t[state_size];
-    llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
-    fwrite(state_mem, 1, state_size, fp_write);
-    fclose(fp_write);
+    {
+        FILE *fp_write = fopen("dump_state.bin", "wb");
+        llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
+        fwrite(state_mem, 1, state_size, fp_write);
+        fclose(fp_write);
+    }

     // save state (last tokens)
-    auto last_n_tokens_data_saved = vector<llama_token>(last_n_tokens_data);
-    auto n_past_saved = n_past;
+    const auto last_n_tokens_data_saved = std::vector<llama_token>(last_n_tokens_data);
+    const auto n_past_saved = n_past;

     // first run
     printf("\n%s", params.prompt.c_str());
+
     for (auto i = 0; i < params.n_predict; i++) {
         auto logits = llama_get_logits(ctx);
         auto n_vocab = llama_n_vocab(ctx);

@@ -75,31 +79,42 @@ int main(int argc, char ** argv) {
         auto next_token = llama_sample_token(ctx, &candidates_p);
         auto next_token_str = llama_token_to_str(ctx, next_token);
         last_n_tokens_data.push_back(next_token);
+
         printf("%s", next_token_str);
         if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             return 1;
         }
         n_past += 1;
     }
+
     printf("\n\n");

     // free old model
     llama_free(ctx);

     // load new model
-
     auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);

     // Load state (rng, logits, embedding and kv_cache) from file
-    FILE *fp_read = fopen("dump_state.bin", "rb");
-    auto state_size2 = llama_get_state_size(ctx2);
-    if (state_size != state_size2) {
-        fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+    {
+        FILE *fp_read = fopen("dump_state.bin", "rb");
+        if (state_size != llama_get_state_size(ctx2)) {
+            fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+            return 1;
+        }
+
+        const size_t ret = fread(state_mem, 1, state_size, fp_read);
+        if (ret != state_size) {
+            fprintf(stderr, "\n%s : failed to read state\n", __func__);
+            return 1;
+        }
+
+        llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
+        fclose(fp_read);
     }
-    fread(state_mem, 1, state_size, fp_read);
-    llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
-    fclose(fp_read);
+
+    delete[] state_mem;

     // restore state (last tokens)
     last_n_tokens_data = last_n_tokens_data_saved;

@@ -118,13 +133,16 @@ int main(int argc, char ** argv) {
         auto next_token = llama_sample_token(ctx2, &candidates_p);
         auto next_token_str = llama_token_to_str(ctx2, next_token);
         last_n_tokens_data.push_back(next_token);
+
         printf("%s", next_token_str);
         if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             return 1;
         }
         n_past += 1;
     }
+
     printf("\n\n");
+
     return 0;
 }
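
The reworked example reduces to a small pattern around the state API it exercises: size a buffer once with llama_get_state_size, serialize with llama_copy_state_data, and restore into a second context with llama_set_state_data after confirming the sizes match. Below is a minimal sketch of that round trip using hypothetical helper functions; it reuses only the calls that appear in the diff, takes std::vector in place of new[]/delete[], and omits the dump_state.bin file I/O that the real example performs in between.

    #include "llama.h"

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Capture the full context state (rng, logits, embedding, kv_cache) into a buffer.
    // Hypothetical helper; mirrors the "save" half of save-load-state.cpp.
    std::vector<uint8_t> capture_state(llama_context * ctx) {
        std::vector<uint8_t> buf(llama_get_state_size(ctx));
        llama_copy_state_data(ctx, buf.data()); // could also copy directly into a memory-mapped file
        return buf;
    }

    // Push a previously captured buffer into another context of identical configuration.
    // Hypothetical helper; mirrors the "load" half, including the size check the diff adds.
    bool restore_state(llama_context * ctx, std::vector<uint8_t> & buf) {
        if (buf.size() != llama_get_state_size(ctx)) {
            fprintf(stderr, "state size mismatch\n"); // contexts must share the same parameters
            return false;
        }
        llama_set_state_data(ctx, buf.data());
        return true;
    }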
