docker-compose.gpu.yaml
version: '3'
services:
  guidance:
    image: guidance_server:gpu-0.5
    ports:
      - "9000:9000"
    volumes:
      - ./models:/models
    # Labels identifying the container, service, and project to Docker Compose
    labels:
      - "com.docker.compose.container-name=guidance-server"
      - "com.docker.compose.service=guidance-server"
      - "com.docker.compose.oneoff=False"
      - "com.docker.compose.project=brainchulo-guidance-server"
    # Use SIGINT so the container shuts down gracefully
    stop_signal: SIGINT
    environment:
      GENERAL_BASE_IMAGE: GPU
      # CPP Model Example:
      GENERAL_MODEL_PATH: /models/open-llama-7B-open-instruct.ggmlv3.q4_0.bin
      GENERAL_TOKENIZER_PATH: /models/VMware_open-llama-7b-open-instruct
      GENERAL_LOADING_METHOD: CPP
      # GPTQ Model Example:
      # GENERAL_MODEL_PATH: /models/vicuna-7B-1.1-GPTQ-4bit-128g
      # GENERAL_LOADING_METHOD: GPTQ
      # HF Model Example:
      # GENERAL_MODEL_PATH: /models/VMware_open-llama-7b-open-instruct
      # GENERAL_LOADING_METHOD: HUGGING_FACE
      # Guidance Settings
      GUIDANCE_AFTER_ROLE: "|>"
      GUIDANCE_BEFORE_ROLE: "<|"
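      # Assumption (not stated in the original file): combining the two tokens
      # above, a role tag renders as <|role|>, e.g. <|user|> or <|assistant|>.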
      # Tokenizer Settings
      TK_BOOL_USE_FAST: false
      # HuggingFace
      HF_BOOL_USE_8_BIT: true
      HF_BOOL_USE_4_BIT: false
      HF_DEVICE_MAP: auto
      # GPTQ
      GPTQ_INT_WBITS: 4
      GPTQ_INT_GROUP_SIZE: 128
      # Number of layers pre-loaded onto the GPU; decrease to save VRAM at the expense of inference speed
      GPTQ_INT_PRE_LOADED_LAYERS: 20
      GPTQ_DEVICE: "cuda"
      GPTQ_BOOL_CPU_OFFLOADING: true
      # LLaMA CPP
      CPP_INT_N_GPU_LAYERS: 300
      CPP_INT_N_THREADS: 12
      CPP_BOOL_CACHING: false
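      # Assumption: llama.cpp clamps n_gpu_layers to the model's actual layer
      # count (32 for a 7B LLaMA), so 300 here simply offloads every layer.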
    # Raise the memlock ulimit so llama.cpp can lock the model in RAM (8791351808 bytes ≈ 8.19 GiB)
    ulimits:
      memlock: 8791351808
    deploy:
      resources:
        limits:
          memory: 28000M
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
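
# Usage sketch (kept as comments so the file stays valid YAML), assuming
# Docker Compose v2 with the NVIDIA Container Toolkit installed and the
# guidance_server:gpu-0.5 image already built or pulled; the CUDA image tag
# below is only an example for the GPU check:
#
#   # Confirm Docker can see the GPU before starting the stack
#   docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi
#
#   # Start the service in the background, then follow its logs
#   docker compose -f docker-compose.gpu.yaml up -d
#   docker compose -f docker-compose.gpu.yaml logs -f guidance
#
#   # Stop and remove the stack
#   docker compose -f docker-compose.gpu.yaml down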