From 0e4602aa34f313a1ac215f2a252ca8332297e1c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=20Luis=20L=C3=B3pez=20L=C3=B3pez?= Date: Mon, 13 May 2024 12:42:46 +0200 Subject: [PATCH 1/8] =?UTF-8?q?Return=20quickly=20if=20there=20is=20no=20m?= =?UTF-8?q?odel=20in=20bearer=20token,=20an=20inputModel=20is=20specified?= =?UTF-8?q?=20but=20it=20doen=C2=B4t=20exist=20in=20modelPath?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jose Luis López López --- core/http/ctx/fiber.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/core/http/ctx/fiber.go b/core/http/ctx/fiber.go index ffb631112abe..6f3d6c06d59f 100644 --- a/core/http/ctx/fiber.go +++ b/core/http/ctx/fiber.go @@ -38,6 +38,17 @@ func ModelFromContext(ctx *fiber.Ctx, loader *model.ModelLoader, modelInput stri if bearerExists { log.Debug().Msgf("Using model from bearer token: %s", bearer) modelInput = bearer + //Takes precedence, then return inmediatly + return modelInput,nil } + + //if a model is specified in modelInput, and there is no bearertoken,but it doesn´t exists in the model path, return QUICKLY. https://github.com/mudler/LocalAI/issues/1076 + modelExists := modelInput != "" && loader.ExistsInModelPath(modelInput) + if !modelExists{ + log.Debug().Msgf("The specified model does not exist in the model PATH, returning error") + return "", fmt.Errorf("no model specified") + } + + //A model is specified, or the first available model is picked return modelInput, nil } From 1766deec45a6cc2b4ef31273ca6a2bf8233f5ae3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Serta=C3=A7=20=C3=96zercan?= <852750+sozercan@users.noreply.github.com> Date: Mon, 13 May 2024 02:37:52 -0700 Subject: [PATCH 2/8] feat: auto select llama-cpp cpu variant (#2305) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * auto select cpu variant Signed-off-by: Sertac Ozercan * remove cuda target for now Signed-off-by: Sertac Ozercan * fix metal Signed-off-by: Sertac Ozercan * fix path Signed-off-by: Sertac Ozercan --------- Signed-off-by: Sertac Ozercan Signed-off-by: Jose Luis López López --- Makefile | 42 +++++++------- go.mod | 77 ++++++++++++------------- go.sum | 18 +----- pkg/model/initializers.go | 114 +++++++++++++++++++++++++++----------- 4 files changed, 140 insertions(+), 111 deletions(-) diff --git a/Makefile b/Makefile index 7f3b9cf84007..9725faf1bdfa 100644 --- a/Makefile +++ b/Makefile @@ -70,7 +70,7 @@ UNAME_S := $(shell uname -s) endif ifeq ($(OS),Darwin) - + ifeq ($(OSX_SIGNING_IDENTITY),) OSX_SIGNING_IDENTITY := $(shell security find-identity -v -p codesigning | grep '"' | head -n 1 | sed -E 's/.*"(.*)"/\1/') endif @@ -154,8 +154,8 @@ endif ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings -ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp -ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-noavx +ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx +ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all @@ -652,30 +652,30 @@ else LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/${VARIANT} grpc-server endif -backend-assets/grpc/llama-cpp: backend-assets/grpc - $(info ${GREEN}I llama-cpp build info:standard${RESET}) - cp -rf backend/cpp/llama backend/cpp/llama-default - $(MAKE) -C backend/cpp/llama-default purge - $(MAKE) 
VARIANT="llama-default" build-llama-cpp-grpc-server - cp -rfv backend/cpp/llama-default/grpc-server backend-assets/grpc/llama-cpp -# TODO: every binary should have its own folder instead, so can have different metal implementations -ifeq ($(BUILD_TYPE),metal) - cp backend/cpp/llama-default/llama.cpp/build/bin/default.metallib backend-assets/grpc/ -endif +backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc + cp -rf backend/cpp/llama backend/cpp/llama-avx2 + $(MAKE) -C backend/cpp/llama-avx2 purge + $(info ${GREEN}I llama-cpp build info:avx2${RESET}) + CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server + cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2 -backend-assets/grpc/llama-cpp-noavx: backend-assets/grpc - cp -rf backend/cpp/llama backend/cpp/llama-noavx - $(MAKE) -C backend/cpp/llama-noavx purge - $(info ${GREEN}I llama-cpp build info:noavx${RESET}) - CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF" $(MAKE) VARIANT="llama-noavx" build-llama-cpp-grpc-server - cp -rfv backend/cpp/llama-noavx/grpc-server backend-assets/grpc/llama-cpp-noavx +backend-assets/grpc/llama-cpp-avx: backend-assets/grpc + cp -rf backend/cpp/llama backend/cpp/llama-avx + $(MAKE) -C backend/cpp/llama-avx purge + $(info ${GREEN}I llama-cpp build info:avx${RESET}) + CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-avx" build-llama-cpp-grpc-server + cp -rfv backend/cpp/llama-avx/grpc-server backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc cp -rf backend/cpp/llama backend/cpp/llama-fallback $(MAKE) -C backend/cpp/llama-fallback purge $(info ${GREEN}I llama-cpp build info:fallback${RESET}) - CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server + CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback +# TODO: every binary should have its own folder instead, so can have different metal implementations +ifeq ($(BUILD_TYPE),metal) + cp backend/cpp/llama-fallback/llama.cpp/build/bin/default.metallib backend-assets/grpc/ +endif backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \ @@ -719,7 +719,7 @@ docker: --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \ --build-arg BUILD_TYPE=$(BUILD_TYPE) \ -t $(DOCKER_IMAGE) . 
- + docker-aio: @echo "Building AIO image with base $(BASE_IMAGE) as $(DOCKER_AIO_IMAGE)" docker build \ diff --git a/go.mod b/go.mod index 75fd5a82afe2..833fea9f8c19 100644 --- a/go.mod +++ b/go.mod @@ -7,8 +7,11 @@ toolchain go1.22.2 require ( github.com/M0Rf30/go-tiny-dream v0.0.0-20231128165230-772a9c0d9aaf github.com/Masterminds/sprig/v3 v3.2.3 + github.com/alecthomas/kong v0.9.0 github.com/charmbracelet/glamour v0.7.0 + github.com/chasefleming/elem-go v0.25.0 github.com/donomii/go-rwkv.cpp v0.0.0-20230715075832-c898cd0f62df + github.com/elliotchance/orderedmap/v2 v2.2.0 github.com/fsnotify/fsnotify v1.7.0 github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230628193450-85ed71aaec8e github.com/go-audio/wav v1.1.0 @@ -18,10 +21,13 @@ require ( github.com/gofiber/swagger v1.0.0 github.com/gofiber/template/html/v2 v2.1.1 github.com/google/uuid v1.5.0 - github.com/hashicorp/go-multierror v1.1.1 github.com/hpcloud/tail v1.0.0 github.com/imdario/mergo v0.3.16 + github.com/jaypipes/ghw v0.12.0 + github.com/joho/godotenv v1.5.1 + github.com/klauspost/cpuid/v2 v2.2.7 github.com/mholt/archiver/v3 v3.5.1 + github.com/microcosm-cc/bluemonday v1.0.26 github.com/mudler/go-processmanager v0.0.0-20230818213616-f204007f963c github.com/mudler/go-stable-diffusion v0.0.0-20230605122230-d89260f598af github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231022042237-c25dc5193530 @@ -35,6 +41,7 @@ require ( github.com/russross/blackfriday v1.6.0 github.com/sashabaranov/go-openai v1.20.4 github.com/schollz/progressbar/v3 v3.13.1 + github.com/shirou/gopsutil/v3 v3.23.9 github.com/stretchr/testify v1.9.0 github.com/swaggo/swag v1.16.3 github.com/tmc/langchaingo v0.0.0-20231019140956-c636b3da7701 @@ -43,23 +50,13 @@ require ( go.opentelemetry.io/otel/exporters/prometheus v0.42.0 go.opentelemetry.io/otel/metric v1.19.0 go.opentelemetry.io/otel/sdk/metric v1.19.0 + golang.org/x/sys v0.19.0 google.golang.org/grpc v1.59.0 google.golang.org/protobuf v1.33.0 gopkg.in/yaml.v2 v2.4.0 gopkg.in/yaml.v3 v3.0.1 ) -require ( - github.com/go-ole/go-ole v1.2.6 // indirect - github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect - github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect - github.com/shirou/gopsutil/v3 v3.23.9 - github.com/shoenig/go-m1cpu v0.1.6 // indirect - github.com/tklauser/go-sysconf v0.3.12 // indirect - github.com/tklauser/numcpus v0.6.1 // indirect - github.com/yusufpapurcu/wmi v1.2.3 // indirect -) - require ( github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78 // indirect github.com/KyleBanks/depth v1.2.1 // indirect @@ -69,12 +66,12 @@ require ( github.com/Nvveen/Gotty v0.0.0-20120604004816-cd527374f1e5 // indirect github.com/StackExchange/wmi v1.2.1 // indirect github.com/alecthomas/chroma/v2 v2.8.0 // indirect + github.com/andybalholm/brotli v1.0.5 // indirect github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect github.com/aymerick/douceur v0.2.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cenkalti/backoff/v4 v4.1.3 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect - github.com/chasefleming/elem-go v0.25.0 // indirect github.com/containerd/continuity v0.3.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/dlclark/regexp2 v1.8.1 // indirect @@ -84,34 +81,45 @@ require ( github.com/docker/go-units v0.4.0 // indirect github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 // indirect github.com/ghodss/yaml v1.0.0 // indirect + github.com/go-audio/audio v1.0.0 // indirect + 
github.com/go-audio/riff v1.0.0 // indirect + github.com/go-logr/logr v1.2.4 // indirect github.com/go-logr/stdr v1.2.2 // indirect + github.com/go-ole/go-ole v1.2.6 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/jsonreference v0.21.0 // indirect github.com/go-openapi/spec v0.21.0 // indirect github.com/go-openapi/swag v0.23.0 // indirect + github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect + github.com/gofiber/contrib/fiberzerolog v1.0.0 github.com/gofiber/template v1.8.3 // indirect github.com/gofiber/utils v1.1.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/protobuf v1.5.3 // indirect github.com/golang/snappy v0.0.2 // indirect + github.com/google/go-cmp v0.6.0 // indirect + github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 // indirect github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect github.com/gorilla/css v1.0.1 // indirect github.com/huandu/xstrings v1.3.3 // indirect - github.com/jaypipes/ghw v0.12.0 // indirect github.com/jaypipes/pcidb v1.0.0 // indirect github.com/josharian/intern v1.0.0 // indirect - github.com/klauspost/cpuid/v2 v2.2.7 // indirect + github.com/klauspost/compress v1.17.0 // indirect github.com/klauspost/pgzip v1.2.5 // indirect github.com/lucasb-eyer/go-colorful v1.2.0 // indirect + github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect github.com/mailru/easyjson v0.7.7 // indirect + github.com/mattn/go-colorable v0.1.13 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/mattn/go-runewidth v0.0.15 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect - github.com/microcosm-cc/bluemonday v1.0.26 // indirect github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect github.com/mitchellh/copystructure v1.0.0 // indirect github.com/mitchellh/go-homedir v1.1.0 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/mitchellh/reflectwalk v1.0.0 // indirect github.com/moby/term v0.0.0-20201216013528-df9cb8a40635 // indirect + github.com/mudler/go-piper v0.0.0-20230621222733-56b8a81b4760 github.com/muesli/reflow v0.3.0 // indirect github.com/muesli/termenv v0.15.2 // indirect github.com/nwaples/rardecode v1.1.0 // indirect @@ -123,53 +131,38 @@ require ( github.com/pkg/errors v0.9.1 // indirect github.com/pkoukk/tiktoken-go v0.1.2 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 // indirect github.com/prometheus/common v0.44.0 // indirect github.com/prometheus/procfs v0.11.1 // indirect + github.com/rivo/uniseg v0.2.0 // indirect + github.com/shoenig/go-m1cpu v0.1.6 // indirect github.com/shopspring/decimal v1.2.0 // indirect github.com/sirupsen/logrus v1.8.1 // indirect github.com/spf13/cast v1.3.1 // indirect github.com/swaggo/files/v2 v2.0.0 // indirect + github.com/tklauser/go-sysconf v0.3.12 // indirect + github.com/tklauser/numcpus v0.6.1 // indirect github.com/ulikunitz/xz v0.5.9 // indirect + github.com/valyala/bytebufferpool v1.0.0 // indirect + github.com/valyala/tcplisten v1.0.0 // indirect github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f // indirect github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect github.com/xeipuuv/gojsonschema v1.2.0 // indirect github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect 
github.com/yuin/goldmark v1.5.4 // indirect github.com/yuin/goldmark-emoji v1.0.2 // indirect + github.com/yusufpapurcu/wmi v1.2.3 // indirect go.opentelemetry.io/otel/sdk v1.19.0 // indirect go.opentelemetry.io/otel/trace v1.19.0 // indirect golang.org/x/crypto v0.22.0 // indirect golang.org/x/mod v0.16.0 // indirect + golang.org/x/net v0.24.0 // indirect golang.org/x/term v0.19.0 // indirect + golang.org/x/text v0.14.0 // indirect + golang.org/x/tools v0.19.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20230822172742-b8732ec3820d // indirect gopkg.in/fsnotify.v1 v1.4.7 // indirect gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect howett.net/plist v1.0.0 // indirect ) - -require ( - github.com/alecthomas/kong v0.9.0 - github.com/andybalholm/brotli v1.0.5 // indirect - github.com/go-audio/audio v1.0.0 // indirect - github.com/go-audio/riff v1.0.0 // indirect - github.com/go-logr/logr v1.2.4 // indirect - github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect - github.com/gofiber/contrib/fiberzerolog v1.0.0 - github.com/google/go-cmp v0.6.0 // indirect - github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 // indirect - github.com/hashicorp/errwrap v1.0.0 // indirect - github.com/joho/godotenv v1.5.1 - github.com/klauspost/compress v1.17.0 // indirect - github.com/mattn/go-colorable v0.1.13 // indirect - github.com/mattn/go-isatty v0.0.20 // indirect - github.com/mattn/go-runewidth v0.0.15 // indirect - github.com/mudler/go-piper v0.0.0-20230621222733-56b8a81b4760 - github.com/rivo/uniseg v0.2.0 // indirect - github.com/valyala/bytebufferpool v1.0.0 // indirect - github.com/valyala/tcplisten v1.0.0 // indirect - golang.org/x/net v0.24.0 // indirect - golang.org/x/sys v0.19.0 // indirect - golang.org/x/text v0.14.0 // indirect - golang.org/x/tools v0.19.0 // indirect -) diff --git a/go.sum b/go.sum index 7f23b64d63ea..da2cef048842 100644 --- a/go.sum +++ b/go.sum @@ -67,6 +67,8 @@ github.com/donomii/go-rwkv.cpp v0.0.0-20230715075832-c898cd0f62df/go.mod h1:gWy7 github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 h1:iFaUwBSo5Svw6L7HYpRu/0lE3e0BaElwnNO1qkNQxBY= github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5/go.mod h1:qssHWj60/X5sZFNxpG4HBPDHVqxNm4DfnCKgrbZOT+s= github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY= +github.com/elliotchance/orderedmap/v2 v2.2.0 h1:7/2iwO98kYT4XkOjA9mBEIwvi4KpGB4cyHeOFOnj4Vk= +github.com/elliotchance/orderedmap/v2 v2.2.0/go.mod h1:85lZyVbpGaGvHvnKa7Qhx7zncAdBIBq6u56Hb1PRU5Q= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= @@ -149,14 +151,8 @@ github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3 github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.5.0 h1:1p67kYwdtXjb0gL0BPiP1Av9wiZPo5A8z2cWkTZ+eyU= github.com/google/uuid v1.5.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gorilla/css v1.0.0 h1:BQqNyPTi50JCFMTw/b67hByjMVXZRwGha6wxVGkeihY= -github.com/gorilla/css v1.0.0/go.mod h1:Dn721qIggHpt4+EFCcTLTU/vk5ySda2ReITrtgBl60c= github.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8= github.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0= 
-github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA= -github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= -github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= -github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg= github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI= @@ -299,8 +295,6 @@ github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJ github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= -github.com/rs/zerolog v1.31.0 h1:FcTR3NnLWW+NnTwwhFWiJSZr4ECLpqCm6QsEnyvbV4A= -github.com/rs/zerolog v1.31.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss= github.com/rs/zerolog v1.32.0 h1:keLypqrlIjaFsbmJOBdB/qvyF8KEtCWHwobLp5l/mQ0= github.com/rs/zerolog v1.32.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss= github.com/russross/blackfriday v1.6.0 h1:KqfZb0pUVN2lYqZUYRddxF4OR8ZMURnJIG5Y3VRLtww= @@ -391,8 +385,6 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.3.0/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= -golang.org/x/crypto v0.21.0 h1:X31++rzVUdKhX5sWmSOFZxx8UW/ldWx55cbf08iNAMA= -golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOMs= golang.org/x/crypto v0.22.0 h1:g1v0xeRhjcugydODzvb3mEM9SQ0HGp9s/nh3COQ/C30= golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= @@ -411,8 +403,6 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v golang.org/x/net v0.0.0-20210428140749-89ef3d95e781/go.mod h1:OJAsFXCWl8Ukc7SiCT/9KSuxbyM7479/AVlXFRxuMCk= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= -golang.org/x/net v0.22.0 h1:9sGLhx7iRIHEiX0oAJ3MRZMUCElJgy7Br1nO+AMN3Tc= -golang.org/x/net v0.22.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w= golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -450,16 +440,12 @@ golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4= -golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.19.0 
h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= -golang.org/x/term v0.18.0 h1:FcHjZXDMxI8mM3nwhX9HlKop4C0YQvCVCdwYl2wOtE8= -golang.org/x/term v0.18.0/go.mod h1:ILwASektA3OnRv7amZ1xhE/KTR+u50pbXfZ03+6Nx58= golang.org/x/term v0.19.0 h1:+ThwsDv+tYfnJFhF4L8jITxu1tdTWRTZpdsWgEgjL6Q= golang.org/x/term v0.19.0/go.mod h1:2CuTdWZ7KHSQwUzKva0cbMg6q2DMI3Mmxp+gKJbskEk= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index d5c77c3f390f..4ccc21145458 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -13,6 +13,9 @@ import ( grpc "github.com/go-skynet/LocalAI/pkg/grpc" "github.com/phayes/freeport" "github.com/rs/zerolog/log" + "golang.org/x/sys/cpu" + + "github.com/elliotchance/orderedmap/v2" ) var Aliases map[string]string = map[string]string{ @@ -24,8 +27,11 @@ var Aliases map[string]string = map[string]string{ const ( LlamaGGML = "llama-ggml" - LLamaCPP = "llama-cpp" + LLamaCPP = "llama-cpp" + LLamaCPPCUDA12 = "llama-cpp-cuda12" + LLamaCPPAVX2 = "llama-cpp-avx2" + LLamaCPPAVX = "llama-cpp-avx" LLamaCPPFallback = "llama-cpp-fallback" Gpt4AllLlamaBackend = "gpt4all-llama" @@ -50,14 +56,14 @@ func backendPath(assetDir, backend string) string { // backendsInAssetDir returns the list of backends in the asset directory // that should be loaded -func backendsInAssetDir(assetDir string) ([]string, error) { +func backendsInAssetDir(assetDir string) (*orderedmap.OrderedMap[string, any], error) { // Exclude backends from automatic loading excludeBackends := []string{LocalStoreBackend} entry, err := os.ReadDir(backendPath(assetDir, "")) if err != nil { return nil, err } - var backends []string + backends := make(map[string][]string) ENTRY: for _, e := range entry { for _, exclude := range excludeBackends { @@ -66,7 +72,28 @@ ENTRY: } } if !e.IsDir() { - backends = append(backends, e.Name()) + //backends = append(backends, e.Name()) + if !strings.Contains(e.Name(), LLamaCPP) { + backends[e.Name()] = []string{} + } + } + } + + foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback := false, false, false + if _, ok := backends[LLamaCPP]; !ok { + for _, e := range entry { + if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 { + backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX2) + foundLCPPAVX2 = true + } + if strings.Contains(e.Name(), LLamaCPPAVX) && !foundLCPPAVX { + backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX) + foundLCPPAVX = true + } + if strings.Contains(e.Name(), LLamaCPPFallback) && !foundLCPPFallback { + backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPFallback) + foundLCPPFallback = true + } } } @@ -77,37 +104,40 @@ ENTRY: // First has more priority priorityList := []string{ // First llama.cpp and llama-ggml - LLamaCPP, LLamaCPPFallback, LlamaGGML, Gpt4All, + LLamaCPP, LlamaGGML, Gpt4All, } + toTheEnd := []string{ // last has to be huggingface LCHuggingFaceBackend, // then bert embeddings BertEmbeddingsBackend, } - slices.Reverse(priorityList) - slices.Reverse(toTheEnd) - - // order certain backends 
first - for _, b := range priorityList { - for i, be := range backends { - if be == b { - backends = append([]string{be}, append(backends[:i], backends[i+1:]...)...) - break - } + + // create an ordered map + orderedBackends := orderedmap.NewOrderedMap[string, any]() + // add priorityList first + for _, p := range priorityList { + if _, ok := backends[p]; ok { + orderedBackends.Set(p, backends[p]) } } - // make sure that some others are pushed at the end - for _, b := range toTheEnd { - for i, be := range backends { - if be == b { - backends = append(append(backends[:i], backends[i+1:]...), be) - break + + for k, v := range backends { + if !slices.Contains(toTheEnd, k) { + if _, ok := orderedBackends.Get(k); !ok { + orderedBackends.Set(k, v) } } } - return backends, nil + for _, t := range toTheEnd { + if _, ok := backends[t]; ok { + orderedBackends.Set(t, backends[t]) + } + } + + return orderedBackends, nil } // starts the grpcModelProcess for the backend, and returns a grpc client @@ -159,6 +189,21 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string } } else { grpcProcess := backendPath(o.assetDir, backend) + + // for llama-cpp, check CPU capabilities and load the appropriate variant + if backend == LLamaCPP { + if cpu.X86.HasAVX2 { + log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) + grpcProcess = backendPath(o.assetDir, LLamaCPPAVX2) + } else if cpu.X86.HasAVX { + log.Info().Msgf("[%s] attempting to load with AVX variant", backend) + grpcProcess = backendPath(o.assetDir, LLamaCPPAVX) + } else { + log.Info().Msgf("[%s] attempting to load with fallback variant", backend) + grpcProcess = backendPath(o.assetDir, LLamaCPPFallback) + } + } + // Check if the file exists if _, err := os.Stat(grpcProcess); os.IsNotExist(err) { return "", fmt.Errorf("grpc process not found: %s. some backends(stablediffusion, tts) require LocalAI compiled with GO_TAGS", grpcProcess) @@ -301,25 +346,30 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) { var err error // autoload also external backends - allBackendsToAutoLoad := []string{} + allBackendsToAutoLoad := orderedmap.NewOrderedMap[string, any]() autoLoadBackends, err := backendsInAssetDir(o.assetDir) if err != nil { return nil, err } log.Debug().Msgf("Loading from the following backends (in order): %+v", autoLoadBackends) - allBackendsToAutoLoad = append(allBackendsToAutoLoad, autoLoadBackends...) + + for _, k := range autoLoadBackends.Keys() { + v, _ := autoLoadBackends.Get(k) + allBackendsToAutoLoad.Set(k, v) + } + for _, b := range o.externalBackends { - allBackendsToAutoLoad = append(allBackendsToAutoLoad, b) + allBackendsToAutoLoad.Set(b, []string{}) } if o.model != "" { - log.Info().Msgf("Trying to load the model '%s' with all the available backends: %s", o.model, strings.Join(allBackendsToAutoLoad, ", ")) + log.Info().Msgf("Trying to load the model '%s' with the backend '%s'", o.model, allBackendsToAutoLoad.Keys()) } - for _, b := range allBackendsToAutoLoad { - log.Info().Msgf("[%s] Attempting to load", b) + for _, key := range allBackendsToAutoLoad.Keys() { + log.Info().Msgf("[%s] Attempting to load", key) options := []Option{ - WithBackendString(b), + WithBackendString(key), WithModel(o.model), WithLoadGRPCLoadModelOpts(o.gRPCOptions), WithThreads(o.threads), @@ -332,14 +382,14 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) { model, modelerr := ml.BackendLoader(options...) 
if modelerr == nil && model != nil { - log.Info().Msgf("[%s] Loads OK", b) + log.Info().Msgf("[%s] Loads OK", key) return model, nil } else if modelerr != nil { err = errors.Join(err, modelerr) - log.Info().Msgf("[%s] Fails: %s", b, modelerr.Error()) + log.Info().Msgf("[%s] Fails: %s", key, modelerr.Error()) } else if model == nil { err = errors.Join(err, fmt.Errorf("backend returned no usable model")) - log.Info().Msgf("[%s] Fails: %s", b, "backend returned no usable model") + log.Info().Msgf("[%s] Fails: %s", key, "backend returned no usable model") } } From 2e38c4ef12dcf5918cf24dfbef0a2e2250bbabe4 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 13 May 2024 18:44:10 +0200 Subject: [PATCH 3/8] models(gallery): add aura-llama-Abliterated (#2309) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ettore Di Giacinto Signed-off-by: Jose Luis López López --- gallery/index.yaml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 66faea5f6a10..d50885d758ba 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -370,6 +370,27 @@ - filename: L3-Solana-8B-v1.q5_K_M.gguf sha256: 9b8cd2c3beaab5e4f82efd10e7d44f099ad40a4e0ee286ca9fce02c8eec26d2f uri: huggingface://Sao10K/L3-Solana-8B-v1-GGUF/L3-Solana-8B-v1.q5_K_M.gguf +- !!merge <<: *llama3 + name: "aura-llama-abliterated" + icon: https://cdn-uploads.huggingface.co/production/uploads/64545af5ec40bbbd01242ca6/AwLNDVB-GIY7k0wnVV_TX.png + license: apache-2.0 + urls: + - https://huggingface.co/TheSkullery/Aura-Llama-Abliterated + - https://huggingface.co/mudler/Aura-Llama-Abliterated-Q4_K_M-GGUF + description: | + Aura-llama is using the methodology presented by SOLAR for scaling LLMs called depth up-scaling (DUS), which encompasses architectural modifications with continued pretraining. Using the solar paper as a base, I integrated Llama-3 weights into the upscaled layers, and In the future plan to continue training the model. 
+ + Aura-llama is a merge of the following models to create a base model to work from: + + meta-llama/Meta-Llama-3-8B-Instruct + meta-llama/Meta-Llama-3-8B-Instruct + overrides: + parameters: + model: aura-llama-abliterated.Q4_K_M.gguf + files: + - filename: aura-llama-abliterated.Q4_K_M.gguf + sha256: ad4a16b90f1ffb5b49185b3fd00ed7adb1cda69c4fad0a1d987bd344ce601dcd + uri: huggingface://mudler/Aura-Llama-Abliterated-Q4_K_M-GGUF/aura-llama-abliterated.Q4_K_M.gguf - !!merge <<: *llama3 name: "average_normie_l3_v1_8b-gguf-iq-imatrix" urls: From 2f01e6f41adb19c98853eba44770088b6accf7eb Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 13 May 2024 18:44:25 +0200 Subject: [PATCH 4/8] models(gallery): add Bunny-llama (#2311) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ettore Di Giacinto Signed-off-by: Jose Luis López López --- gallery/index.yaml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index d50885d758ba..93cb489b62d6 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -852,6 +852,33 @@ - filename: Llava_1.5_Llama3_mmproj_updated.gguf sha256: 4f2bb77ca60f2c932d1c6647d334f5d2cd71966c19e850081030c9883ef1906c uri: https://huggingface.co/ChaoticNeutrals/LLaVA-Llama-3-8B-mmproj-Updated/resolve/main/llava-v1.5-8B-Updated-Stop-Token/mmproj-model-f16.gguf +- !!merge <<: *llama3 + name: "bunny-llama-3-8b-v" + urls: + - https://huggingface.co/BAAI/Bunny-Llama-3-8B-V-gguf + description: | + Bunny is a family of lightweight but powerful multimodal models. It offers multiple plug-and-play vision encoders, like EVA-CLIP, SigLIP and language backbones, including Llama-3-8B, Phi-1.5, StableLM-2, Qwen1.5, MiniCPM and Phi-2. To compensate for the decrease in model size, we construct more informative training data by curated selection from a broader data source. + + We provide Bunny-Llama-3-8B-V, which is built upon SigLIP and Llama-3-8B-Instruct. More details about this model can be found in GitHub. 
+ icon: https://huggingface.co/BAAI/Bunny-Llama-3-8B-V-gguf/resolve/main/icon.png + tags: + - llm + - multimodal + - gguf + - gpu + - llama3 + - cpu + overrides: + mmproj: Bunny-Llama-3-8B-Q4_K_M-mmproj.gguf + parameters: + model: Bunny-Llama-3-8B-Q4_K_M.gguf + files: + - filename: Bunny-Llama-3-8B-Q4_K_M-mmproj.gguf + sha256: 96d033387a91e56cf97fa5d60e02c0128ce07c8fa83aaaefb74ec40541615ea5 + uri: huggingace://BAAI/Bunny-Llama-3-8B-V-gguf/mmproj-model-f16.gguf + - filename: Bunny-Llama-3-8B-Q4_K_M.gguf + sha256: 88f0a61f947dbf129943328be7262ae82e3a582a0c75e53544b07f70355a7c30 + uri: huggingace://BAAI/Bunny-Llama-3-8B-V-gguf/ggml-model-Q4_K_M.gguf - !!merge <<: *llama3 name: "llava-llama-3-8b-v1_1" description: | From 78eba71c0f521117956ed279e179460965e485e5 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 13 May 2024 18:44:44 +0200 Subject: [PATCH 5/8] models(gallery): add lumimaidv2 (#2312) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ettore Di Giacinto Signed-off-by: Jose Luis López López --- gallery/index.yaml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 93cb489b62d6..f7dbf50fa753 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -572,6 +572,29 @@ - filename: Llama-3-Lumimaid-8B-v0.1-OAS-Q4_K_M-imat.gguf sha256: 1199440aa13c55f5f2cad1cb215535306f21e52a81de23f80a9e3586c8ac1c50 uri: huggingface://Lewdiculous/Llama-3-Lumimaid-8B-v0.1-OAS-GGUF-IQ-Imatrix/Llama-3-Lumimaid-8B-v0.1-OAS-Q4_K_M-imat.gguf +- !!merge <<: *llama3 + name: "llama-3-lumimaid-v2-8b-v0.1-oas-iq-imatrix" + urls: + - https://huggingface.co/Lewdiculous/Llama-3-Lumimaid-8B-v0.1-OAS-GGUF-IQ-Imatrix + icon: https://cdn-uploads.huggingface.co/production/uploads/65d4cf2693a0a3744a27536c/JUxfdTot7v7LTdIGYyzYM.png + license: cc-by-nc-4.0 + description: | + This model uses the Llama3 prompting format. + + Llama3 trained on our RP datasets, we tried to have a balance between the ERP and the RP, not too horny, but just enough. + + We also added some non-RP dataset, making the model less dumb overall. It should look like a 40%/60% ratio for Non-RP/RP+ERP data. + + "This model received the Orthogonal Activation Steering treatment, meaning it will rarely refuse any request." + + This is v2! 
+ overrides: + parameters: + model: v2-Llama-3-Lumimaid-8B-v0.1-OAS-Q4_K_M-imat.gguf + files: + - filename: v2-Llama-3-Lumimaid-8B-v0.1-OAS-Q4_K_M-imat.gguf + sha256: b00b4cc2ea4e06db592e5f581171758387106626bcbf445c03a1cb7b424be881 + uri: huggingface://Lewdiculous/Llama-3-Lumimaid-8B-v0.1-OAS-GGUF-IQ-Imatrix/v2-Llama-3-Lumimaid-8B-v0.1-OAS-Q4_K_M-imat.gguf - !!merge <<: *llama3 name: "suzume-llama-3-8B-multilingual" urls: From 118ff66930727d762e55243de373fbaa4be807b5 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 13 May 2024 18:45:58 +0200 Subject: [PATCH 6/8] models(gallery): add orthocopter (#2313) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ettore Di Giacinto Signed-off-by: Jose Luis López López --- gallery/index.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index f7dbf50fa753..88431cdd1f45 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -485,6 +485,24 @@ - filename: Llama-3-Unholy-8B.q8_0.gguf uri: huggingface://Undi95/Llama-3-Unholy-8B-GGUF/Llama-3-Unholy-8B.q8_0.gguf sha256: 419dd76f61afe586076323c17c3a1c983e591472717f1ea178167ede4dc864df +- !!merge <<: *llama3 + name: "orthocopter_8b-imatrix" + urls: + - https://huggingface.co/Lewdiculous/Orthocopter_8B-GGUF-Imatrix + icon: https://cdn-uploads.huggingface.co/production/uploads/65d4cf2693a0a3744a27536c/cxM5EaC6ilXnSo_10stA8.png + description: | + This model is thanks to the hard work of lucyknada with the Edgerunners. Her work produced the following model, which I used as the base: + + https://huggingface.co/Edgerunners/meta-llama-3-8b-instruct-hf-ortho-baukit-10fail-1000total + + I then applied two handwritten datasets over top of this and the results are pretty nice, with no refusals and plenty of personality. 
+ overrides: + parameters: + model: Orthocopter_8B-Q4_K_M-imat.gguf + files: + - filename: Orthocopter_8B-Q4_K_M-imat.gguf + uri: huggingface://Lewdiculous/Orthocopter_8B-GGUF-Imatrix/Orthocopter_8B-Q4_K_M-imat.gguf + sha256: ce93366c9eb20329530b19b9d6841a973d458bcdcfa8a521e9f9d0660cc94578 - !!merge <<: *llama3 name: "therapyllama-8b-v1" urls: From 52bd3dc2ef41b2b7f77f5febc61eda324729ee66 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 13 May 2024 19:07:51 +0200 Subject: [PATCH 7/8] feat(llama.cpp): add `flash_attention` and `no_kv_offloading` (#2310) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit feat(llama.cpp): add flash_attn and no_kv_offload Signed-off-by: Ettore Di Giacinto Signed-off-by: Jose Luis López López --- backend/backend.proto | 3 +++ backend/cpp/llama/grpc-server.cpp | 3 +++ core/backend/options.go | 2 ++ core/config/backend_config.go | 3 +++ 4 files changed, 11 insertions(+) diff --git a/backend/backend.proto b/backend/backend.proto index 778a96ffac97..cb87fe02d46f 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -212,6 +212,9 @@ message ModelOptions { float YarnBetaSlow = 47; string Type = 49; + + bool FlashAttention = 56; + bool NoKVOffload = 57; } message Result { diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index 6fb086585f4e..f9673b33ccfa 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -2254,6 +2254,9 @@ static void params_parse(const backend::ModelOptions* request, } params.use_mlock = request->mlock(); params.use_mmap = request->mmap(); + params.flash_attn = request->flashattention(); + params.no_kv_offload = request->nokvoffload(); + params.embedding = request->embeddings(); if (request->ropescaling() == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } diff --git a/core/backend/options.go b/core/backend/options.go index 4a7435e68586..c638ebd5ed83 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -77,6 +77,8 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions { MaxModelLen: int32(c.MaxModelLen), TensorParallelSize: int32(c.TensorParallelSize), MMProj: c.MMProj, + FlashAttention: c.FlashAttention, + NoKVOffload: c.NoKVOffloading, YarnExtFactor: c.YarnExtFactor, YarnAttnFactor: c.YarnAttnFactor, YarnBetaFast: c.YarnBetaFast, diff --git a/core/config/backend_config.go b/core/config/backend_config.go index cb1b7c2a360b..41c792fb1da3 100644 --- a/core/config/backend_config.go +++ b/core/config/backend_config.go @@ -132,6 +132,9 @@ type LLMConfig struct { TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM MMProj string `yaml:"mmproj"` + FlashAttention bool `yaml:"flash_attention"` + NoKVOffloading bool `yaml:"no_kv_offloading"` + RopeScaling string `yaml:"rope_scaling"` ModelType string `yaml:"type"` From fa02b567cff3e84fa24a362a38d3d4ca548f108d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=20Luis=20L=C3=B3pez=20L=C3=B3pez?= Date: Mon, 13 May 2024 20:39:53 +0200 Subject: [PATCH 8/8] better message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modified the message to report that the specified model doesn´t exist in the model Path. 
Signed-off-by: Jose Luis López López --- core/http/ctx/fiber.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/http/ctx/fiber.go b/core/http/ctx/fiber.go index 6f3d6c06d59f..5b2e1e0702e2 100644 --- a/core/http/ctx/fiber.go +++ b/core/http/ctx/fiber.go @@ -46,7 +46,7 @@ func ModelFromContext(ctx *fiber.Ctx, loader *model.ModelLoader, modelInput stri modelExists := modelInput != "" && loader.ExistsInModelPath(modelInput) if !modelExists{ log.Debug().Msgf("The specified model does not exist in the model PATH, returning error") - return "", fmt.Errorf("no model specified") + return "", fmt.Errorf("invalid model filename") } //A model is specified, or the first available model is picked
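
---

Editor's note on patch 2 (auto-selecting the llama.cpp CPU variant): the patch builds three gRPC server binaries (llama-cpp-avx2, llama-cpp-avx, llama-cpp-fallback) and, at load time, uses golang.org/x/sys/cpu to pick the one the host CPU can actually run. Below is a minimal, self-contained Go sketch of that selection logic, hedged as an illustration: the helper name selectLlamaCPPVariant, the main function, and the backend-assets/grpc path layout are assumptions made for the example; in the patch itself this logic lives inside (*ModelLoader).grpcModel in pkg/model/initializers.go and resolves paths through backendPath.

    package main

    import (
        "fmt"
        "path/filepath"

        "golang.org/x/sys/cpu" // CPU feature detection, as used by the patch
    )

    // Variant names mirror the constants added in pkg/model/initializers.go.
    const (
        llamaCPPAVX2     = "llama-cpp-avx2"
        llamaCPPAVX      = "llama-cpp-avx"
        llamaCPPFallback = "llama-cpp-fallback"
    )

    // selectLlamaCPPVariant picks the llama.cpp gRPC binary matching the host
    // CPU, with the same precedence as the patch: AVX2 first, then AVX, then
    // the no-SIMD fallback build. The directory layout is an assumption here.
    func selectLlamaCPPVariant(assetDir string) string {
        grpcDir := filepath.Join(assetDir, "backend-assets", "grpc")
        switch {
        case cpu.X86.HasAVX2:
            return filepath.Join(grpcDir, llamaCPPAVX2)
        case cpu.X86.HasAVX:
            return filepath.Join(grpcDir, llamaCPPAVX)
        default:
            return filepath.Join(grpcDir, llamaCPPFallback)
        }
    }

    func main() {
        // Example: print which variant would be launched from ./assets.
        fmt.Println(selectLlamaCPPVariant("./assets"))
    }

On non-x86 hosts both feature flags report false, so the fallback binary is chosen; this also matches the Makefile change, which now ships the metal library alongside the fallback build.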