Dockerfile.gpu
FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04
# Install Python
RUN apt-get update && apt-get install -y python3 python3-pip && python3 --version
# Install PyTorch with GPU support (the cu118 wheels bundle their own CUDA
# runtime, so they run on the CUDA 12.1 base image)
RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# Install git for bleeding-edge dependencies
RUN apt-get install -y git
COPY ./requirements.gpu.txt requirements.gpu.txt
RUN pip3 install -r requirements.gpu.txt
# Expose the CUDA libraries to the dynamic linker
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/targets/x86_64-linux/lib
# Tell bitsandbytes to use CUDA 12.1: it looks up an unversioned libcudart.so,
# so symlink that name to the versioned runtime library
RUN ln -s /usr/local/cuda-12/targets/x86_64-linux/lib/libcudart.so.12 /usr/local/cuda-12/targets/x86_64-linux/lib/libcudart.so
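# Optional sanity check for the symlink above (a hypothetical debugging step,
# not part of the original build; assumes bitsandbytes is pulled in via
# requirements.gpu.txt):
# RUN python3 -m bitsandbytes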
# Bleeding-edge dependency: a pinned commit of GPTQ-for-LLaMa
RUN pip3 install git+https://github.com/paolorechia/GPTQ-for-LLaMa@3797a5ced97e9075bd1786c5744091c146c7bc9f
# User-land application code
COPY ./guidance_server guidance_server
WORKDIR guidance_server
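# Build llama-cpp-python from source with cuBLAS enabled so inference runs on the GPU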
RUN LLAMA_CUBLAS=1 CMAKE_ARGS=-DLLAMA_CUBLAS=on FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir --force-reinstall --verbose
# Default command: serve the app with uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "9000"]
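# A minimal usage sketch (the image tag below is an assumption, not defined by
# this repo); --gpus all requires the NVIDIA Container Toolkit on the host:
#   docker build -f Dockerfile.gpu -t guidance-server-gpu .
#   docker run --gpus all -p 9000:9000 guidance-server-gpu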