From d6d1fea5d4729e567659e6114ae2427ce62af409 Mon Sep 17 00:00:00 2001
From: Yikun Jiang
Date: Sat, 8 Mar 2025 00:05:57 +0800
Subject: [PATCH] Re-arch on tutorials/quickstart/installation

Signed-off-by: Yikun Jiang
---
 docs/source/conf.py                           |   2 +
 docs/source/developer_guide/contributing.md   |  10 +-
 .../source/developer_guide/contributing.zh.md |  10 +-
 docs/source/index.md                          |   2 +-
 docs/source/installation.md                   |  41 ++-
 docs/source/quick_start.md                    |  31 +-
 docs/source/tutorials.md                      | 317 ------------------
 docs/source/tutorials/index.md                |   9 +
 docs/source/tutorials/multi_node.md           | 109 ++++++
 docs/source/tutorials/multi_npu.md            | 105 ++++++
 docs/source/tutorials/single_npu.md           | 133 ++++++++
 11 files changed, 412 insertions(+), 357 deletions(-)
 delete mode 100644 docs/source/tutorials.md
 create mode 100644 docs/source/tutorials/index.md
 create mode 100644 docs/source/tutorials/multi_node.md
 create mode 100644 docs/source/tutorials/multi_npu.md
 create mode 100644 docs/source/tutorials/single_npu.md

diff --git a/docs/source/conf.py b/docs/source/conf.py
index b1bb2bfa..4ec952a2 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -72,6 +72,8 @@
     # This value should be updated when cut down release.
     'pip_vllm_ascend_version': "0.7.3rc1",
     'pip_vllm_version': "0.7.3",
+    # CANN image tag
+    'cann_image_tag': "8.0.0-910b-ubuntu22.04-py3.10",
 }
 
 # Add any paths that contain templates here, relative to this directory.
diff --git a/docs/source/developer_guide/contributing.md b/docs/source/developer_guide/contributing.md
index 953550b5..5bfdfa21 100644
--- a/docs/source/developer_guide/contributing.md
+++ b/docs/source/developer_guide/contributing.md
@@ -53,21 +53,21 @@ locally. The simplest way to run these integration tests locally is through a co
 git clone https://github.com/vllm-project/vllm-ascend.git
 cd vllm-ascend
 
-IMAGE=vllm-ascend-dev-image
-CONTAINER_NAME=vllm-ascend-dev
-DEVICE=/dev/davinci1
+export IMAGE=vllm-ascend-dev-image
+export CONTAINER_NAME=vllm-ascend-dev
+export DEVICE=/dev/davinci1
 
 # The first build will take about 10 mins (10MB/s) to download the base image and packages
 docker build -t $IMAGE -f ./Dockerfile .
 
 # You can also specify the mirror repo via setting VLLM_REPO to speedup
 # docker build -t $IMAGE -f ./Dockerfile . --build-arg VLLM_REPO=https://gitee.com/mirrors/vllm
-docker run --name $CONTAINER_NAME --network host --device $DEVICE \
+docker run --rm --name $CONTAINER_NAME --network host --device $DEVICE \
     --device /dev/davinci_manager --device /dev/devmm_svm \
     --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi \
     -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
     -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-    -ti --rm $IMAGE bash
+    -ti $IMAGE bash
 
 cd vllm-ascend
 pip install -r requirements-dev.txt
diff --git a/docs/source/developer_guide/contributing.zh.md b/docs/source/developer_guide/contributing.zh.md
index 144ff35b..68b10446 100644
--- a/docs/source/developer_guide/contributing.zh.md
+++ b/docs/source/developer_guide/contributing.zh.md
@@ -48,21 +48,21 @@ git commit -sm "your commit info"
 git clone https://github.com/vllm-project/vllm-ascend.git
 cd vllm-ascend
 
-IMAGE=vllm-ascend-dev-image
-CONTAINER_NAME=vllm-ascend-dev
-DEVICE=/dev/davinci1
+export IMAGE=vllm-ascend-dev-image
+export CONTAINER_NAME=vllm-ascend-dev
+export DEVICE=/dev/davinci1
 
 # 首次构建会花费10分钟(10MB/s)下载基础镜像和包
 docker build -t $IMAGE -f ./Dockerfile .
 
 # 您还可以通过设置 VLLM_REPO 来指定镜像仓库以加速
 # docker build -t $IMAGE -f ./Dockerfile . --build-arg VLLM_REPO=https://gitee.com/mirrors/vllm
-docker run --name $CONTAINER_NAME --network host --device $DEVICE \
+docker run --rm --name $CONTAINER_NAME --network host --device $DEVICE \
     --device /dev/davinci_manager --device /dev/devmm_svm \
     --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi \
     -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
     -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-    -ti --rm $IMAGE bash
+    -ti $IMAGE bash
 
 cd vllm-ascend
 pip install -r requirements-dev.txt
diff --git a/docs/source/index.md b/docs/source/index.md
index e5f9b41b..0dee9993 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -35,7 +35,7 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l
 :maxdepth: 1
 
 quick_start
 installation
-tutorials
+tutorials/index.md
 faqs
 :::
diff --git a/docs/source/installation.md b/docs/source/installation.md
index beb8efcb..36189c35 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -44,10 +44,12 @@ Refer to [Ascend Environment Setup Guide](https://ascend.github.io/docs/sources/
 
 The easiest way to prepare your software environment is using CANN image directly:
 
-```bash
+```{code-block} bash
+ :substitutions:
 # Update DEVICE according to your device (/dev/davinci[0-7])
 export DEVICE=/dev/davinci7
-
+# Update the CANN image
+export IMAGE=quay.io/ascend/cann:|cann_image_tag|
 docker run --rm \
 --name vllm-ascend-env \
 --device $DEVICE \
@@ -59,14 +61,16 @@ docker run --rm \
 -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
 -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
 -v /etc/ascend_install.info:/etc/ascend_install.info \
- -it quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10 bash
+ -it $IMAGE bash
 ```
 
+:::{dropdown} Click here to see "Install CANN manually"
+:animate: fade-in-slide-down
 You can also install CANN manually:
 
-:::{note}
+```{note}
 This guide takes aarch64 as an example. If you run on x86, you need to replace `aarch64` with `x86_64` for the package name shown below.
-:::
+```
 
 ```bash
 # Create a virtual environment
@@ -94,6 +98,8 @@ chmod +x. /Ascend-cann-nnal_8.0.0_linux-aarch64.run
 source /usr/local/Ascend/nnal/atb/set_env.sh
 ```
 
+:::
+
 ::::
 
 ::::{tab-item} Before using docker
@@ -125,6 +131,7 @@ pip install vllm==|pip_vllm_version|
 pip install vllm-ascend==|pip_vllm_ascend_version| --extra-index https://download.pytorch.org/whl/cpu/
 ```
 
+:::{dropdown} Click here to see "Build from source code"
 or build from **source code**:
 
 ```{code-block} bash
@@ -140,6 +147,7 @@ git clone --depth 1 --branch |vllm_ascend_version| https://github.com/vllm-proj
 cd vllm-ascend
 pip install -e . --extra-index https://download.pytorch.org/whl/cpu/
 ```
+:::
 
 Current version depends on a unreleased `torch-npu`, you need to install manually:
 
@@ -167,14 +175,23 @@ pip install ./torch_npu-2.5.1.dev20250307-cp310-cp310-manylinux_2_17_aarch64.man
 
 You can just pull the **prebuilt image** and run it with bash.
 
+:::{dropdown} Click here to see "Build from Dockerfile"
+or build IMAGE from **source code**:
+
+```bash
+git clone https://github.com/vllm-project/vllm-ascend.git
+cd vllm-ascend
+docker build -t vllm-ascend-dev-image:latest -f ./Dockerfile .
+```
+:::
+
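+If you built the image from the Dockerfile above, you can point `IMAGE` at your local tag instead of the published image before running the container below (this assumes the default tag used by the build command):
+
+```bash
+export IMAGE=vllm-ascend-dev-image:latest
+```
+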
 ```{code-block} bash
 :substitutions:
 
 # Update DEVICE according to your device (/dev/davinci[0-7])
-DEVICE=/dev/davinci7
+export DEVICE=/dev/davinci7
 # Update the vllm-ascend image
-IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
-docker pull $IMAGE
+export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
 docker run --rm \
 --name vllm-ascend-env \
 --device $DEVICE \
@@ -189,14 +206,6 @@ docker run --rm \
 -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
 -v /etc/ascend_install.info:/etc/ascend_install.info \
 -it $IMAGE bash
 ```
-
-or build IMAGE from **source code**:
-
-```bash
-git clone https://github.com/vllm-project/vllm-ascend.git
-cd vllm-ascend
-docker build -t vllm-ascend-dev-image:latest -f ./Dockerfile .
-```
 
 ::::
 
 :::::
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
index ff3a9f3a..8fbf178e 100644
--- a/docs/source/quick_start.md
+++ b/docs/source/quick_start.md
@@ -11,12 +11,13 @@
 ```{code-block} bash
 :substitutions:
 
-# You can change version a suitable one base on your requirement, e.g. main
+# Update DEVICE according to your device (/dev/davinci[0-7])
+export DEVICE=/dev/davinci0
+# Update the vllm-ascend image
 export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
-
-docker run \
+docker run --rm \
 --name vllm-ascend \
---device /dev/davinci0 \
+--device $DEVICE \
 --device /dev/davinci_manager \
 --device /dev/devmm_svm \
 --device /dev/hisi_hdc \
@@ -32,17 +33,19 @@ docker run \
 
 ## Usage
 
-There are two ways to start vLLM on Ascend NPU:
-
-### Offline Batched Inference with vLLM
-
-With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing).
+You can use the ModelScope mirror to speed up the download:
 
 ```bash
-# Use Modelscope mirror to speed up download
 export VLLM_USE_MODELSCOPE=true
 ```
 
+There are two ways to start vLLM on Ascend NPU:
+
+:::::{tab-set}
+::::{tab-item} Offline Batched Inference
+
+With vLLM installed, you can start generating texts for a list of input prompts (i.e. offline batch inference).
+
 Try to run below Python script directly or use `python3` shell to generate texts:
 
 ```python
@@ -64,15 +67,15 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-### OpenAI Completions API with vLLM
+::::
+
+::::{tab-item} OpenAI Completions API
 
 vLLM can also be deployed as a server that implements the OpenAI API protocol.
 
 Run the following command to start the vLLM server with the [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) model:
 
 ```bash
-# Use Modelscope mirror to speed up download
-export VLLM_USE_MODELSCOPE=true
 # Deploy vLLM server (The first run will take about 3-5 mins (10 MB/s) to download models)
 vllm serve Qwen/Qwen2.5-0.5B-Instruct &
 ```
@@ -124,3 +127,5 @@ INFO: Application shutdown complete.
 ```
 
 Finally, you can exit container by using `ctrl-D`.
+::::
+:::::
\ No newline at end of file
diff --git a/docs/source/tutorials.md b/docs/source/tutorials.md
deleted file mode 100644
index a81e0380..00000000
--- a/docs/source/tutorials.md
+++ /dev/null
@@ -1,317 +0,0 @@
-# Tutorials
-
-## Run vllm-ascend on Single NPU
-
-### Offline Inference on Single NPU
-
-Run docker container:
-
-```{code-block} bash
- :substitutions:
-docker run \
---name vllm-ascend \
---device /dev/davinci0 \
---device /dev/davinci_manager \
---device /dev/devmm_svm \
---device /dev/hisi_hdc \
--v /usr/local/dcmi:/usr/local/dcmi \
--v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
--v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
--v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
--v /etc/ascend_install.info:/etc/ascend_install.info \
--v /root/.cache:/root/.cache \
--p 8000:8000 \
--it quay.io/ascend/vllm-ascend:|vllm_ascend_version| bash
-```
-
-Setup environment variables:
-
-```bash
-# Use Modelscope mirror to speed up model download
-export VLLM_USE_MODELSCOPE=True
-
-# To avoid NPU out of memory, set `max_split_size_mb` to any value lower than you need to allocate for Qwen2.5-7B-Instruct
-export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256
-```
-
-:::{note}
-`max_split_size_mb` prevents the native allocator from splitting blocks larger than this size (in MB). This can reduce fragmentation and may allow some borderline workloads to complete without running out of memory. You can find more details [here](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html).
-:::
-
-Run the following script to execute offline inference on a single NPU:
-
-```python
-from vllm import LLM, SamplingParams
-
-prompts = [
-    "Hello, my name is",
-    "The future of AI is",
-]
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-llm = LLM(model="Qwen/Qwen2.5-7B-Instruct", max_model_len=26240)
-
-outputs = llm.generate(prompts, sampling_params)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-```
-
-If you run this script successfully, you can see the info shown below:
-
-```bash
-Prompt: 'Hello, my name is', Generated text: ' Daniel and I am an 8th grade student at York Middle School. I'
-Prompt: 'The future of AI is', Generated text: ' following you. As the technology advances, a new report from the Institute for the'
-```
-
-### Online Serving on Single NPU
-
-Run docker container to start the vLLM server on a single NPU:
-
-```{code-block} bash
- :substitutions:
-
-docker run \
---name vllm-ascend \
---device /dev/davinci0 \
---device /dev/davinci_manager \
---device /dev/devmm_svm \
---device /dev/hisi_hdc \
--v /usr/local/dcmi:/usr/local/dcmi \
--v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
--v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
--v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
--v /etc/ascend_install.info:/etc/ascend_install.info \
--v /root/.cache:/root/.cache \
--p 8000:8000 \
--e VLLM_USE_MODELSCOPE=True \
--e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
--it quay.io/ascend/vllm-ascend:|vllm_ascend_version| \
-vllm serve Qwen/Qwen2.5-7B-Instruct --max_model_len 26240
-```
-
-:::{note}
-Add `--max_model_len` option to avoid ValueError that the Qwen2.5-7B model's max seq len (32768) is larger than the maximum number of tokens that can be stored in KV cache (26240). This will differ with different NPU series base on the HBM size. Please modify the value according to a suitable value for your NPU series.
-:::
-
-If your service start successfully, you can see the info shown below:
-
-```bash
-INFO: Started server process [6873]
-INFO: Waiting for application startup.
-INFO: Application startup complete.
-```
-
-Once your server is started, you can query the model with input prompts:
-
-```bash
-curl http://localhost:8000/v1/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-        "model": "Qwen/Qwen2.5-7B-Instruct",
-        "prompt": "The future of AI is",
-        "max_tokens": 7,
-        "temperature": 0
-    }'
-```
-
-If you query the server successfully, you can see the info shown below (client):
-
-```bash
-{"id":"cmpl-b25a59a2f985459781ce7098aeddfda7","object":"text_completion","created":1739523925,"model":"Qwen/Qwen2.5-7B-Instruct","choices":[{"index":0,"text":" here. It’s not just a","logprobs":null,"finish_reason":"length","stop_reason":null,"prompt_logprobs":null}],"usage":{"prompt_tokens":5,"total_tokens":12,"completion_tokens":7,"prompt_tokens_details":null}}
-```
-
-Logs of the vllm server:
-
-```bash
-INFO: 172.17.0.1:49518 - "POST /v1/completions HTTP/1.1" 200 OK
-INFO 02-13 08:34:35 logger.py:39] Received request cmpl-574f00e342904692a73fb6c1c986c521-0: prompt: 'San Francisco is a', params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=7, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None), prompt_token_ids: [23729, 12879, 374, 264], lora_request: None, prompt_adapter_request: None.
-```
-
-## Run vllm-ascend on Multi-NPU
-
-### Distributed Inference on Multi-NPU
-
-Run docker container:
-
-```{code-block} bash
- :substitutions:
-
-docker run \
---name vllm-ascend \
---device /dev/davinci0 \
---device /dev/davinci1 \
---device /dev/davinci_manager \
---device /dev/devmm_svm \
---device /dev/hisi_hdc \
--v /usr/local/dcmi:/usr/local/dcmi \
--v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
--v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
--v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
--v /etc/ascend_install.info:/etc/ascend_install.info \
--v /root/.cache:/root/.cache \
--p 8000:8000 \
--it quay.io/ascend/vllm-ascend:|vllm_ascend_version| bash
-```
-
-Setup environment variables:
-
-```bash
-# Use Modelscope mirror to speed up model download
-export VLLM_USE_MODELSCOPE=True
-
-# To avoid NPU out of memory, set `max_split_size_mb` to any value lower than you need to allocate for Qwen2.5-7B-Instruct
-export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256
-```
-
-Run the following script to execute offline inference on multi-NPU:
-
-```python
-import gc
-
-import torch
-
-from vllm import LLM, SamplingParams
-from vllm.distributed.parallel_state import (destroy_distributed_environment,
-                                             destroy_model_parallel)
-
-def clean_up():
-    destroy_model_parallel()
-    destroy_distributed_environment()
-    gc.collect()
-    torch.npu.empty_cache()
-
-prompts = [
-    "Hello, my name is",
-    "The future of AI is",
-]
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-llm = LLM(model="Qwen/Qwen2.5-7B-Instruct",
-          tensor_parallel_size=2,
-          distributed_executor_backend="mp",
-          max_model_len=26240)
-
-outputs = llm.generate(prompts, sampling_params)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-del llm
-clean_up()
-```
-
-If you run this script successfully, you can see the info shown below:
-
-```bash
-Prompt: 'Hello, my name is', Generated text: ' Daniel and I am an 8th grade student at York Middle School. I'
-Prompt: 'The future of AI is', Generated text: ' following you. As the technology advances, a new report from the Institute for the'
-```
-
-## Online Serving on Multi Machine
-
-Run docker container on each machine:
-
-```{code-block} bash
- :substitutions:
-
-docker run \
---name vllm-ascend \
---device /dev/davinci0 \
---device /dev/davinci1 \
---device /dev/davinci2\
---device /dev/davinci3 \
---device /dev/davinci4 \
---device /dev/davinci5 \
---device /dev/davinci6 \
---device /dev/davinci7 \
---device /dev/davinci_manager \
---device /dev/devmm_svm \
---device /dev/hisi_hdc \
--v /usr/local/dcmi:/usr/local/dcmi \
--v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
--v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
--v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
--v /etc/ascend_install.info:/etc/ascend_install.info \
--v /root/.cache:/root/.cache \
--p 8000:8000 \
--it quay.io/ascend/vllm-ascend:|vllm_ascend_version| bash
-```
-
-Choose one machine as head node, the other are worker nodes, then start ray on each machine:
-
-:::{note}
-Check out your `nic_name` by command `ip addr`.
-:::
-
-```shell
-# Head node
-export HCCL_IF_IP={local_ip}
-export GLOO_SOCKET_IFNAME={nic_name}
-export TP_SOCKET_IFNAME={nic_name}
-export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
-ray start --head --num-gpus=8
-
-# Worker node
-export HCCL_IF_IP={local_ip}
-export ASCEND_PROCESS_LOG_PATH={plog_save_path}
-export GLOO_SOCKET_IFNAME={nic_name}
-export TP_SOCKET_IFNAME={nic_name}
-export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
-export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-ray start --address='{head_node_ip}:{port_num}' --num-gpus=8 --node-ip-address={local_ip}
-```
-
-Start the vLLM server on head node:
-
-```shell
-export VLLM_HOST_IP={head_node_ip}
-export HCCL_CONNECT_TIMEOUT=120
-export ASCEND_PROCESS_LOG_PATH={plog_save_path}
-export HCCL_IF_IP={head_node_ip}
-
-if [ -d "{plog_save_path}" ]; then
-    rm -rf {plog_save_path}
-    echo ">>> remove {plog_save_path}"
-fi
-
-LOG_FILE="multinode_$(date +%Y%m%d_%H%M).log"
-VLLM_TORCH_PROFILER_DIR=./vllm_profile
-python -m vllm.entrypoints.openai.api_server \
-    --model="Deepseek/DeepSeek-V2-Lite-Chat" \
-    --trust-remote-code \
-    --enforce-eager \
-    --max-model-len {max_model_len} \
-    --distributed_executor_backend "ray" \
-    --tensor-parallel-size 16 \
-    --disable-log-requests \
-    --disable-log-stats \
-    --disable-frontend-multiprocessing \
-    --port {port_num} \
-```
-
-Once your server is started, you can query the model with input prompts:
-
-```shell
-curl -X POST http://127.0.0.1:{prot_num}/v1/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-        "model": "Deepseek/DeepSeek-V2-Lite-Chat",
-        "prompt": "The future of AI is",
-        "max_tokens": 24
-    }'
-```
-
-If you query the server successfully, you can see the info shown below (client):
-
-```
-{"id":"cmpl-6dfb5a8d8be54d748f0783285dd52303","object":"text_completion","created":1739957835,"model":"/home/data/DeepSeek-V2-Lite-Chat/","choices":[{"index":0,"text":" heavily influenced by neuroscience and cognitiveGuionistes. The goalochondria is to combine the efforts of researchers, technologists,","logprobs":null,"finish_reason":"length","stop_reason":null,"prompt_logprobs":null}],"usage":{"prompt_tokens":6,"total_tokens":30,"completion_tokens":24,"prompt_tokens_details":null}}
-```
-
-Logs of the vllm server:
-
-```
-INFO: 127.0.0.1:59384 - "POST /v1/completions HTTP/1.1" 200 OK
-INFO 02-19 17:37:35 metrics.py:453] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 1.9 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.
-```
\ No newline at end of file
diff --git a/docs/source/tutorials/index.md b/docs/source/tutorials/index.md
new file mode 100644
index 00000000..da513d8d
--- /dev/null
+++ b/docs/source/tutorials/index.md
@@ -0,0 +1,9 @@
+# Tutorials
+
+:::{toctree}
+:caption: Deployment
+:maxdepth: 1
+single_npu
+multi_npu
+multi_node
+:::
diff --git a/docs/source/tutorials/multi_node.md b/docs/source/tutorials/multi_node.md
new file mode 100644
index 00000000..324f414f
--- /dev/null
+++ b/docs/source/tutorials/multi_node.md
@@ -0,0 +1,109 @@
+# Multi-Node (DeepSeek)
+
+## Online Serving on Multi-Node
+
+Run docker container on each machine:
+
+```{code-block} bash
+ :substitutions:
+
+docker run --rm \
+--name vllm-ascend \
+--device /dev/davinci0 \
+--device /dev/davinci1 \
+--device /dev/davinci2 \
+--device /dev/davinci3 \
+--device /dev/davinci4 \
+--device /dev/davinci5 \
+--device /dev/davinci6 \
+--device /dev/davinci7 \
+--device /dev/davinci_manager \
+--device /dev/devmm_svm \
+--device /dev/hisi_hdc \
+-v /usr/local/dcmi:/usr/local/dcmi \
+-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
+-v /etc/ascend_install.info:/etc/ascend_install.info \
+-v /root/.cache:/root/.cache \
+-p 8000:8000 \
+-it quay.io/ascend/vllm-ascend:|vllm_ascend_version| bash
+```
+
+Choose one machine as the head node and the others as worker nodes, then start Ray on each machine:
+
+:::{note}
+Check your `nic_name` with the `ip addr` command.
+:::
+
+```shell
+# Head node
+export HCCL_IF_IP={local_ip}
+export GLOO_SOCKET_IFNAME={nic_name}
+export TP_SOCKET_IFNAME={nic_name}
+export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
+ray start --head --num-gpus=8
+
+# Worker node
+export HCCL_IF_IP={local_ip}
+export ASCEND_PROCESS_LOG_PATH={plog_save_path}
+export GLOO_SOCKET_IFNAME={nic_name}
+export TP_SOCKET_IFNAME={nic_name}
+export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
+export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ray start --address='{head_node_ip}:{port_num}' --num-gpus=8 --node-ip-address={local_ip}
+```
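+
+Before starting the vLLM server, you can optionally confirm that both machines have joined the same Ray cluster (this assumes the `ray start` commands above completed successfully; the exact output format depends on your Ray version):
+
+```shell
+# Run on the head node; the resource summary should list the devices of both nodes
+ray status
+```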
+
+Start the vLLM server on the head node:
+
+```shell
+export VLLM_HOST_IP={head_node_ip}
+export HCCL_CONNECT_TIMEOUT=120
+export ASCEND_PROCESS_LOG_PATH={plog_save_path}
+export HCCL_IF_IP={head_node_ip}
+
+if [ -d "{plog_save_path}" ]; then
+    rm -rf {plog_save_path}
+    echo ">>> remove {plog_save_path}"
+fi
+
+LOG_FILE="multinode_$(date +%Y%m%d_%H%M).log"
+VLLM_TORCH_PROFILER_DIR=./vllm_profile
+python -m vllm.entrypoints.openai.api_server \
+    --model="Deepseek/DeepSeek-V2-Lite-Chat" \
+    --trust-remote-code \
+    --enforce-eager \
+    --max-model-len {max_model_len} \
+    --distributed_executor_backend "ray" \
+    --tensor-parallel-size 16 \
+    --disable-log-requests \
+    --disable-log-stats \
+    --disable-frontend-multiprocessing \
+    --port {port_num}
+```
+
+Once your server is started, you can query the model with input prompts:
+
+```shell
+curl -X POST http://127.0.0.1:{port_num}/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "Deepseek/DeepSeek-V2-Lite-Chat",
+        "prompt": "The future of AI is",
+        "max_tokens": 24
+    }'
+```
+
+If you query the server successfully, you can see the info shown below (client):
+
+```
+{"id":"cmpl-6dfb5a8d8be54d748f0783285dd52303","object":"text_completion","created":1739957835,"model":"/home/data/DeepSeek-V2-Lite-Chat/","choices":[{"index":0,"text":" heavily influenced by neuroscience and cognitiveGuionistes. The goalochondria is to combine the efforts of researchers, technologists,","logprobs":null,"finish_reason":"length","stop_reason":null,"prompt_logprobs":null}],"usage":{"prompt_tokens":6,"total_tokens":30,"completion_tokens":24,"prompt_tokens_details":null}}
+```
+
+Logs of the vllm server:
+
+```
+INFO: 127.0.0.1:59384 - "POST /v1/completions HTTP/1.1" 200 OK
+INFO 02-19 17:37:35 metrics.py:453] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 1.9 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.
+```
\ No newline at end of file
diff --git a/docs/source/tutorials/multi_npu.md b/docs/source/tutorials/multi_npu.md
new file mode 100644
index 00000000..8bf80cac
--- /dev/null
+++ b/docs/source/tutorials/multi_npu.md
@@ -0,0 +1,105 @@
+# Multi-NPU (QwQ 32B)
+
+## Run vllm-ascend on Multi-NPU
+
+Run docker container:
+
+```{code-block} bash
+ :substitutions:
+# Update the vllm-ascend image
+export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
+docker run --rm \
+--name vllm-ascend \
+--device /dev/davinci0 \
+--device /dev/davinci1 \
+--device /dev/davinci2 \
+--device /dev/davinci3 \
+--device /dev/davinci_manager \
+--device /dev/devmm_svm \
+--device /dev/hisi_hdc \
+-v /usr/local/dcmi:/usr/local/dcmi \
+-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
+-v /etc/ascend_install.info:/etc/ascend_install.info \
+-v /root/.cache:/root/.cache \
+-p 8000:8000 \
+-it $IMAGE bash
+```
+
+Set up environment variables:
+
+```bash
+# Use Modelscope mirror to speed up model download
+export VLLM_USE_MODELSCOPE=True
+
+# To avoid NPU out of memory, set `max_split_size_mb` to any value lower than you need to allocate for QwQ-32B
+export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256
+```
+
+### Online Inference on Multi-NPU
+
+Run the following command to start the vLLM server on Multi-NPU:
+
+```bash
+vllm serve Qwen/QwQ-32B --max-model-len 4096 --port 8000 -tp 4
+```
+
+Once your server is started, you can query the model with input prompts:
+
+```bash
+curl http://localhost:8000/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "Qwen/QwQ-32B",
+        "prompt": "请问什么是深度学习?",
+        "max_tokens": 100,
+        "temperature": 0.7
+    }'
+```
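+
+If you prefer Python over curl, the snippet below is a minimal sketch using the `openai` client package (this assumes the package is installed and the server above is reachable at `localhost:8000`; the API key is a placeholder because the server does not enforce one by default):
+
+```python
+from openai import OpenAI
+
+# Point the OpenAI-compatible client at the local vLLM server.
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+
+completion = client.completions.create(
+    model="Qwen/QwQ-32B",
+    prompt="请问什么是深度学习?",
+    max_tokens=100,
+    temperature=0.7,
+)
+print(completion.choices[0].text)
+```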
+
+### Offline Inference on Multi-NPU
+
+Run the following script to execute offline inference on multi-NPU:
+
+```python
+import gc
+
+import torch
+
+from vllm import LLM, SamplingParams
+from vllm.distributed.parallel_state import (destroy_distributed_environment,
+                                             destroy_model_parallel)
+
+def clean_up():
+    destroy_model_parallel()
+    destroy_distributed_environment()
+    gc.collect()
+    torch.npu.empty_cache()
+
+prompts = [
+    "Hello, my name is",
+    "The future of AI is",
+]
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+llm = LLM(model="Qwen/QwQ-32B",
+          tensor_parallel_size=4,
+          distributed_executor_backend="mp",
+          max_model_len=26240)
+
+outputs = llm.generate(prompts, sampling_params)
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+del llm
+clean_up()
+```
+
+If you run this script successfully, you can see the info shown below:
+
+```bash
+Prompt: 'Hello, my name is', Generated text: ' Daniel and I am an 8th grade student at York Middle School. I'
+Prompt: 'The future of AI is', Generated text: ' following you. As the technology advances, a new report from the Institute for the'
+```
diff --git a/docs/source/tutorials/single_npu.md b/docs/source/tutorials/single_npu.md
new file mode 100644
index 00000000..d5bba0a7
--- /dev/null
+++ b/docs/source/tutorials/single_npu.md
@@ -0,0 +1,133 @@
+# Single NPU (Qwen 7B)
+
+## Run vllm-ascend on Single NPU
+
+### Offline Inference on Single NPU
+
+Run docker container:
+
+```{code-block} bash
+ :substitutions:
+# Update the vllm-ascend image
+export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
+docker run --rm \
+--name vllm-ascend \
+--device /dev/davinci0 \
+--device /dev/davinci_manager \
+--device /dev/devmm_svm \
+--device /dev/hisi_hdc \
+-v /usr/local/dcmi:/usr/local/dcmi \
+-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
+-v /etc/ascend_install.info:/etc/ascend_install.info \
+-v /root/.cache:/root/.cache \
+-p 8000:8000 \
+-it $IMAGE bash
+```
+
+Set up environment variables:
+
+```bash
+# Use Modelscope mirror to speed up model download
+export VLLM_USE_MODELSCOPE=True
+
+# To avoid NPU out of memory, set `max_split_size_mb` to any value lower than you need to allocate for Qwen2.5-7B-Instruct
+export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256
+```
+
+:::{note}
+`max_split_size_mb` prevents the native allocator from splitting blocks larger than this size (in MB). This can reduce fragmentation and may allow some borderline workloads to complete without running out of memory. You can find more details [here](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html).
+:::
+
+Run the following script to execute offline inference on a single NPU:
+
+```python
+from vllm import LLM, SamplingParams
+
+prompts = [
+    "Hello, my name is",
+    "The future of AI is",
+]
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+llm = LLM(model="Qwen/Qwen2.5-7B-Instruct", max_model_len=26240)
+
+outputs = llm.generate(prompts, sampling_params)
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
+
+If you run this script successfully, you can see the info shown below:
+
+```bash
+Prompt: 'Hello, my name is', Generated text: ' Daniel and I am an 8th grade student at York Middle School. I'
+Prompt: 'The future of AI is', Generated text: ' following you. As the technology advances, a new report from the Institute for the'
+```
+
+### Online Serving on Single NPU
+
+Run docker container to start the vLLM server on a single NPU:
+
+```{code-block} bash
+ :substitutions:
+
+# Update the vllm-ascend image
+export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
+docker run --rm \
+--name vllm-ascend \
+--device /dev/davinci0 \
+--device /dev/davinci_manager \
+--device /dev/devmm_svm \
+--device /dev/hisi_hdc \
+-v /usr/local/dcmi:/usr/local/dcmi \
+-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
+-v /etc/ascend_install.info:/etc/ascend_install.info \
+-v /root/.cache:/root/.cache \
+-p 8000:8000 \
+-e VLLM_USE_MODELSCOPE=True \
+-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
+-it $IMAGE \
+vllm serve Qwen/Qwen2.5-7B-Instruct --max_model_len 26240
+```
+
+:::{note}
+Add the `--max_model_len` option to avoid a ValueError caused by the Qwen2.5-7B model's max sequence length (32768) being larger than the maximum number of tokens that can be stored in the KV cache (26240). This limit differs across NPU series depending on the HBM size, so please adjust the value to one suitable for your NPU.
+:::
+
+If your service starts successfully, you can see the info shown below:
+
+```bash
+INFO: Started server process [6873]
+INFO: Waiting for application startup.
+INFO: Application startup complete.
+```
+
+Once your server is started, you can query the model with input prompts:
+
+```bash
+curl http://localhost:8000/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "Qwen/Qwen2.5-7B-Instruct",
+        "prompt": "The future of AI is",
+        "max_tokens": 7,
+        "temperature": 0
+    }'
+```
+
+If you query the server successfully, you can see the info shown below (client):
+
+```bash
+{"id":"cmpl-b25a59a2f985459781ce7098aeddfda7","object":"text_completion","created":1739523925,"model":"Qwen/Qwen2.5-7B-Instruct","choices":[{"index":0,"text":" here. It’s not just a","logprobs":null,"finish_reason":"length","stop_reason":null,"prompt_logprobs":null}],"usage":{"prompt_tokens":5,"total_tokens":12,"completion_tokens":7,"prompt_tokens_details":null}}
+```
+
+Logs of the vllm server:
+
+```bash
+INFO: 172.17.0.1:49518 - "POST /v1/completions HTTP/1.1" 200 OK
+INFO 02-13 08:34:35 logger.py:39] Received request cmpl-574f00e342904692a73fb6c1c986c521-0: prompt: 'San Francisco is a', params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=7, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None), prompt_token_ids: [23729, 12879, 374, 264], lora_request: None, prompt_adapter_request: None.
+```
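+
+Since Qwen2.5-7B-Instruct is a chat model, you can also query the OpenAI-compatible chat endpoint exposed by the same server (a sketch; the generated content will vary between runs):
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "Qwen/Qwen2.5-7B-Instruct",
+        "messages": [
+            {"role": "user", "content": "What is the future of AI?"}
+        ],
+        "max_tokens": 64
+    }'
+```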