docker-compose.gpu.yaml
version: '3'
services:
  guidance:
    image: guidance_server:gpu-0.5
    ports:
      - "9000:9000"
    volumes:
      - ./models:/models
    # Labels identifying the container, service, and project to Docker Compose
    labels:
      - "com.docker.compose.container-name=guidance-server"
      - "com.docker.compose.service=guidance-server"
      - "com.docker.compose.oneoff=False"
      - "com.docker.compose.project=brainchulo-guidance-server"
    # Use SIGINT so the container shuts down gracefully
    stop_signal: SIGINT
    environment:
      GENERAL_BASE_IMAGE: GPU
      # CPP Model Example:
      GENERAL_MODEL_PATH: /models/open-llama-7B-open-instruct.ggmlv3.q4_0.bin
      GENERAL_TOKENIZER_PATH: /models/VMware_open-llama-7b-open-instruct
      GENERAL_LOADING_METHOD: CPP
      # GPTQ Model Example:
      # GENERAL_MODEL_PATH: /models/vicuna-7B-1.1-GPTQ-4bit-128g
      # GENERAL_LOADING_METHOD: GPTQ
      # HF Model Example:
      # GENERAL_MODEL_PATH: /models/VMware_open-llama-7b-open-instruct
      # GENERAL_LOADING_METHOD: HUGGING_FACE
      # Guidance Settings
      GUIDANCE_AFTER_ROLE: "|>"
      GUIDANCE_BEFORE_ROLE: "<|"
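      # Assumption (not stated in the original file): combining the two tokens
      # above, a role tag renders as <|role|>, e.g. <|user|> or <|assistant|>.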
      # Tokenizer Settings
      TK_BOOL_USE_FAST: false
      # HuggingFace
      HF_BOOL_USE_8_BIT: true
      HF_BOOL_USE_4_BIT: false
      HF_DEVICE_MAP: auto
      # GPTQ
      GPTQ_INT_WBITS: 4
      GPTQ_INT_GROUP_SIZE: 128
      # Number of layers pre-loaded onto the GPU; decrease to save VRAM at the expense of inference speed
      GPTQ_INT_PRE_LOADED_LAYERS: 20
      GPTQ_DEVICE: "cuda"
      GPTQ_BOOL_CPU_OFFLOADING: true
      # LLaMA CPP
      CPP_INT_N_GPU_LAYERS: 300
      CPP_INT_N_THREADS: 12
      CPP_BOOL_CACHING: false
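      # Assumption: llama.cpp clamps n_gpu_layers to the model's actual layer
      # count (32 for a 7B LLaMA), so 300 here simply offloads every layer.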
    # Raise the memlock ulimit so llama.cpp can lock the model in RAM (8791351808 bytes ≈ 8.19 GiB)
    ulimits:
      memlock: 8791351808
    deploy:
      resources:
        limits:
          memory: 28000M
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
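
# Usage sketch (kept as comments so the file stays valid YAML), assuming
# Docker Compose v2 with the NVIDIA Container Toolkit installed and the
# guidance_server:gpu-0.5 image already built or pulled; the CUDA image tag
# below is only an example for the GPU check:
#
#   # Confirm Docker can see the GPU before starting the stack
#   docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi
#
#   # Start the service in the background, then follow its logs
#   docker compose -f docker-compose.gpu.yaml up -d
#   docker compose -f docker-compose.gpu.yaml logs -f guidance
#
#   # Stop and remove the stack
#   docker compose -f docker-compose.gpu.yaml down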