
Implementation of a sequence repetition penalty sampler #2593

Draft: KerfuffleV2 wants to merge 21 commits into master from feat-seqrep-sampler-simple
Commits (21)
11fa3df
Implementation of a sequence repetition penalty
KerfuffleV2 Aug 12, 2023
34175b0
Expand simple-inference command support
KerfuffleV2 Nov 2, 2023
e2990ff
Fix batched-bench directly depending on common.o
KerfuffleV2 Nov 2, 2023
a10f7cd
Fix logic in simple-inference chunk concat and dump
KerfuffleV2 Nov 2, 2023
a0c5587
Expand simple-inference command handling.
KerfuffleV2 Nov 3, 2023
87061ca
Remove build-info.h include
KerfuffleV2 Nov 3, 2023
63b3776
Fix invalid seqnum in commands when seqnum omitted in some cases.
KerfuffleV2 Nov 3, 2023
557d867
Minor cleanups.
KerfuffleV2 Nov 9, 2023
930e132
Let's try merging master instead of rebasing for a little change of pace
KerfuffleV2 Nov 13, 2023
3c76bd6
convert.py: also look for plain model.safetensors (#4043)
afrideva Nov 14, 2023
2751031
stablelm : StableLM support (#3586)
Galunid Nov 14, 2023
affa88b
Fix MacOS Sonoma model quantization (#4052)
TortoiseHam Nov 14, 2023
208bdcd
ggml-cuda : increase max graph size (#4084)
slaren Nov 15, 2023
4fc5f7d
llama : restore prefix space in llama tokenizer (#4081)
cebtenzzre Nov 15, 2023
b94b982
gguf : fix potential infinite loops while parsing (#4100)
texmex76 Nov 16, 2023
c301973
Respect tokenizer.ggml.add_bos_token value when tokenizing (#4040)
KerfuffleV2 Nov 17, 2023
16868e2
Merge branch 'master' into feat-seqrep-sampler-simple
KerfuffleV2 Nov 17, 2023
f109568
Merge branch 'master' into feat-seqrep-sampler-simple
KerfuffleV2 Nov 18, 2023
89262de
Merge branch 'master' into feat-seqrep-sampler-simple
KerfuffleV2 Nov 18, 2023
046a469
Fix(ish?) prompt tokenizing
KerfuffleV2 Nov 18, 2023
dc1e34a
Merge branch 'master' into feat-seqrep-sampler-simple
KerfuffleV2 Nov 24, 2023
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -96,6 +96,8 @@ option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging"
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

option(LLAMA_SEQREP_SAMPLER "llama: build with support for seqrep sampler" ON)

option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
15 changes: 13 additions & 2 deletions Makefile
@@ -2,7 +2,7 @@
BUILD_TARGETS = \
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
speculative infill tokenize benchmark-matmult parallel finetune export-lora tests/test-c.o
speculative infill tokenize benchmark-matmult parallel finetune export-lora simple-inference tests/test-c.o

# Binaries only useful for tests
TEST_TARGETS = \
@@ -572,6 +572,14 @@ grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
train.o: common/train.cpp common/train.h
$(CXX) $(CXXFLAGS) -c $< -o $@

ifndef LLAMA_NO_SEQREP_SAMPLER
COMMON_H_DEFS += common/seqrep-sampler.h
COMMON_DEPS += seqrep-sampler.o

seqrep-sampler.o: common/seqrep-sampler.cpp common/seqrep-sampler.h $(COMMON_H_DEPS)
$(CXX) $(CXXFLAGS) -c $< -o $@
endif

libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

@@ -594,13 +602,16 @@ infill: examples/infill/infill.cpp ggml.o llama.o $(C
simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

simple-inference: examples/simple-inference/simple-inference.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
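With the Makefile changes above, `make simple-inference` builds the new example, `batched-bench` links the full `$(COMMON_DEPS)` set instead of depending on `common.o` directly, and defining `LLAMA_NO_SEQREP_SAMPLER` skips building the sampler objects entirely.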
2 changes: 2 additions & 0 deletions build.zig
@@ -111,6 +111,8 @@ pub fn build(b: *std.build.Builder) !void {
var make = try Maker.init(b);
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;

try make.addFlag("-DLLAMA_NO_SEQREP_SAMPLER");

const ggml = make.obj("ggml", "ggml.c");
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
6 changes: 6 additions & 0 deletions common/CMakeLists.txt
@@ -54,6 +54,12 @@ add_library(${TARGET} STATIC
train.cpp
)

if (LLAMA_SEQREP_SAMPLER)
target_sources(${TARGET} PRIVATE seqrep-sampler.h seqrep-sampler.cpp)
else()
add_compile_definitions(LLAMA_NO_SEQREP_SAMPLER)
endif()

if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
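Taken together, the build changes keep the sampler optional: CMake exposes `LLAMA_SEQREP_SAMPLER` (ON by default; configuring with `-DLLAMA_SEQREP_SAMPLER=OFF` defines `LLAMA_NO_SEQREP_SAMPLER` instead), the Makefile honors a `LLAMA_NO_SEQREP_SAMPLER` variable, and build.zig currently opts out by always passing `-DLLAMA_NO_SEQREP_SAMPLER`.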
26 changes: 26 additions & 0 deletions common/common.cpp
@@ -1,6 +1,10 @@
#include "common.h"
#include "llama.h"

#ifndef LLAMA_NO_SEQREP_SAMPLER
#include "seqrep-sampler.h"
#endif

#include <algorithm>
#include <cassert>
#include <cmath>
@@ -336,6 +340,24 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
sparams.penalty_present = std::stof(argv[i]);
#ifndef LLAMA_NO_SEQREP_SAMPLER
} else if (arg == "-seqrep" || arg == "--seqrep-penalty") {
if (++i >= argc) {
invalid_param = true;
break;
}
if (std::strcmp(argv[i], "help") == 0) {
seqrep_sampler_help();
exit(0);
}
llama_sampler_seqrep_params sr_params;
seqrep_sampler_params_init(&sr_params);
if (!seqrep_sampler_params_parse(argv[i], &sr_params)) {
seqrep_sampler_help();
exit(1);
}
sparams.seqrep_params.push_back(sr_params);
#endif
} else if (arg == "--mirostat") {
if (++i >= argc) {
invalid_param = true;
@@ -770,6 +792,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
#ifndef LLAMA_NO_SEQREP_SAMPLER
printf(" -seqrep CFG, --seqrep-penalty CFG\n");
printf(" add a copy of the sequence repetition penalty sampler. may be specified multiple times. for help: -seqrep help\n");
#endif
printf(" --mirostat N use Mirostat sampling.\n");
printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
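As the help text above notes, each `-seqrep CFG` (or `--seqrep-penalty CFG`) flag adds one independently configured copy of the sampler, so several can be stacked in a single run; passing `-seqrep help` prints the CFG syntax and exits.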
10 changes: 9 additions & 1 deletion common/sampling.cpp
@@ -103,7 +103,8 @@ llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx) {
const int idx,
const std::vector<llama_token> & all_last_tokens) {
const llama_sampling_params & params = ctx_sampling->params;

const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
@@ -155,6 +156,13 @@ llama_token llama_sampling_sample(
prev.data() + prev.size() - penalty_last_n,
penalty_last_n, penalty_repeat, penalty_freq, penalty_present);

#ifndef LLAMA_NO_SEQREP_SAMPLER
for (auto & sr_params : params.seqrep_params) {
if ((sr_params.flags & LLAMA_SEQREP_REWIND_MODE) != 0) continue;
llama_sample_seqrep_penalty(ctx_main, &cur_p, all_last_tokens, &sr_params);
}
#endif

if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
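To show how the new trailing parameter is meant to be fed, here is a hypothetical caller sketch; `sample_next` and `history` are illustrative names, not part of this PR, and only the `llama_sampling_sample` signature from this diff is assumed:

```cpp
// Hypothetical caller sketch -- not part of this PR. It relies only on the
// llama_sampling_sample signature shown above; the trailing argument defaults
// to an empty vector, so existing call sites keep compiling unchanged.
#include "sampling.h"

#include <vector>

static llama_token sample_next(llama_sampling_context * ctx_sampling,
                               llama_context * ctx_main,
                               std::vector<llama_token> & history) {
    // Pass the full token history so non-rewind seqrep samplers can penalize
    // repeated sequences beyond the usual repeat-penalty window.
    const llama_token id = llama_sampling_sample(
        ctx_sampling, ctx_main, /*ctx_cfg =*/ nullptr, /*idx =*/ 0, history);

    history.push_back(id); // a real caller would also call llama_sampling_accept
    return id;
}
```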
12 changes: 11 additions & 1 deletion common/sampling.h
@@ -4,6 +4,10 @@

#include "grammar-parser.h"

#ifndef LLAMA_NO_SEQREP_SAMPLER
#include "seqrep-sampler.h"
#endif

#include <string>
#include <vector>
#include <unordered_map>
@@ -35,6 +39,11 @@ typedef struct llama_sampling_params {
float cfg_scale = 1.f; // how strong is guidance

std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens

#ifndef LLAMA_NO_SEQREP_SAMPLER
std::vector<llama_sampler_seqrep_params> seqrep_params;
#endif

} llama_sampling_params;

// general sampler context
@@ -101,7 +110,8 @@ llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
int idx = 0);
int idx = 0,
const std::vector<llama_token> & all_last_tokens = {});

void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
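For completeness, a minimal sketch of filling `seqrep_params` programmatically, assuming only the calls visible in this diff; `add_seqrep` is a hypothetical helper, and the CFG string format (documented by `-seqrep help`) is not reproduced here:

```cpp
// Minimal sketch, not part of this PR: mirrors the init/parse/push pattern
// from common.cpp above. add_seqrep() is a hypothetical helper; cfg uses the
// same CFG syntax as the -seqrep command-line flag.
#ifndef LLAMA_NO_SEQREP_SAMPLER
#include "seqrep-sampler.h"
#endif
#include "sampling.h"

static bool add_seqrep(llama_sampling_params & sparams, char * cfg) {
#ifndef LLAMA_NO_SEQREP_SAMPLER
    llama_sampler_seqrep_params sr_params;
    seqrep_sampler_params_init(&sr_params);              // start from defaults
    if (!seqrep_sampler_params_parse(cfg, &sr_params)) {
        return false;                                    // rejected CFG string
    }
    sparams.seqrep_params.push_back(sr_params);          // one entry per sampler copy
    return true;
#else
    (void) sparams; (void) cfg;                          // sampler compiled out
    return false;
#endif
}
```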