Commit 7710e7f

Create a new "Usage" section in the docs

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

1 parent ef31eab commit 7710e7f

22 files changed: +34 −27 lines

docs/source/index.rst

+15 −10

@@ -85,12 +85,23 @@ Documentation
    serving/deploying_with_nginx
    serving/distributed_serving
    serving/metrics
-   serving/env_vars
-   serving/usage_stats
    serving/integrations
    serving/tensorizer
-   serving/compatibility_matrix
-   serving/faq
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Usage
+
+   usage/lora
+   usage/structured_outputs
+   usage/spec_decode
+   usage/vlm
+   usage/compatibility_matrix
+   usage/performance
+   usage/faq
+   usage/engine_args
+   usage/env_vars
+   usage/usage_stats
 
 .. toctree::
    :maxdepth: 1
@@ -99,12 +110,6 @@ Documentation
    models/supported_models
    models/adding_model
    models/enabling_multimodal_inputs
-   models/engine_args
-   models/lora
-   models/vlm
-   models/structured_outputs
-   models/spec_decode
-   models/performance
 
 .. toctree::
    :maxdepth: 1

docs/source/serving/openai_compatible_server.md

+2 −2

@@ -32,7 +32,7 @@ We currently support the following OpenAI APIs:
 - [Completions API](https://platform.openai.com/docs/api-reference/completions)
   - *Note: `suffix` parameter is not supported.*
 - [Chat Completions API](https://platform.openai.com/docs/api-reference/chat)
-  - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Using VLMs](../models/vlm.rst).
+  - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Using VLMs](../usage/vlm.rst).
     - *Note: `image_url.detail` parameter is not supported.*
   - We also support `audio_url` content type for audio files.
     - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema.
@@ -41,7 +41,7 @@ We currently support the following OpenAI APIs:
 - [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings)
   - Instead of `inputs`, you can pass in a list of `messages` (same schema as Chat Completions API),
     which will be treated as a single prompt to the model according to its chat template.
-  - This enables multi-modal inputs to be passed to embedding models, see [Using VLMs](../models/vlm.rst).
+  - This enables multi-modal inputs to be passed to embedding models, see [Using VLMs](../usage/vlm.rst).
   - *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode.*
 
 ## Score API for Cross Encoder Models
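
Editor's note: the hunk above only retargets the VLM doc link, but for readers following it, here is a minimal sketch of the vision-style Chat Completions usage it refers to. It assumes a vLLM OpenAI-compatible server already running at http://localhost:8000; the model name, image URL, and API key are illustrative placeholders, not part of this commit.

from openai import OpenAI

# Assumption: a local vLLM server started with something like
# `vllm serve <some multimodal model>`; all names below are placeholders.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="llava-hf/llava-1.5-7b-hf",  # placeholder multimodal model
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is shown in this image?"},
            {"type": "image_url",
             "image_url": {"url": "https://example.com/image.jpg"}},
        ],
    }],
)
print(response.choices[0].message.content)

As the doc notes, `image_url.detail` is not supported, and the same `messages` schema can be sent to the Embeddings API when the server is run with `--task embedding`.
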
File renamed without changes.
File renamed without changes.

docs/source/serving/faq.rst → docs/source/usage/faq.rst

+2

@@ -1,3 +1,5 @@
+.. _faq:
+
 Frequently Asked Questions
 ===========================
 

File renamed without changes.
File renamed without changes.

docs/source/models/spec_decode.rst → docs/source/usage/spec_decode.rst

+2 −2

@@ -182,7 +182,7 @@ speculative decoding, breaking down the guarantees into three key areas:
 3. **vLLM Logprob Stability**
    - vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the
      same request across runs. For more details, see the FAQ section
-     titled *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq>`_.
+     titled *Can the output of a prompt vary across runs in vLLM?* in the :ref:`FAQs <faq>`.
 
 
 **Conclusion**
@@ -197,7 +197,7 @@ can occur due to following factors:
 
 **Mitigation Strategies**
 
-For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq>`_.
+For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the :ref:`FAQs <faq>`.
 
 Resources for vLLM contributors
 -------------------------------
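
Editor's note: as a side note on the logprob-stability caveat quoted in this hunk, a rough sketch of how one might probe it with vLLM's offline `LLM` API; the model name and sampling settings are assumptions chosen for illustration, not taken from this commit.

from vllm import LLM, SamplingParams

# Sketch under assumptions: a small placeholder model and greedy sampling.
llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=0.0, max_tokens=32, logprobs=1)

first = llm.generate(["The capital of France is"], params)[0].outputs[0]
second = llm.generate(["The capital of France is"], params)[0].outputs[0]

# Per the FAQ entry referenced above, texts and logprobs are not guaranteed
# to be bitwise-identical across runs or batch compositions.
print(first.text == second.text)
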
File renamed without changes.
File renamed without changes.

vllm/attention/backends/rocm_flash_attn.py

+1 −1

@@ -429,7 +429,7 @@ def forward(
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
         # If the feature combo become valid
         if attn_type != AttentionType.DECODER:
             raise NotImplementedError("Encoder self-attention and "

vllm/config.py

+4 −4

@@ -509,7 +509,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
             self.use_async_output_proc = False
             return
 
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
         # If the feature combo become valid
         if device_config.device_type not in ("cuda", "tpu", "xpu", "hpu"):
             logger.warning(
@@ -525,7 +525,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
             self.use_async_output_proc = False
             return
 
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
         # If the feature combo become valid
         if device_config.device_type == "cuda" and self.enforce_eager:
             logger.warning(
@@ -540,7 +540,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
         if self.task == "embedding":
             self.use_async_output_proc = False
 
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
         # If the feature combo become valid
         if speculative_config:
             logger.warning("Async output processing is not supported with"
@@ -1721,7 +1721,7 @@ def verify_with_model_config(self, model_config: ModelConfig):
                            model_config.quantization)
 
     def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
         # If the feature combo become valid
         if scheduler_config.chunked_prefill_enabled:
             raise ValueError("LoRA is not supported with chunked prefill yet.")

vllm/engine/arg_utils.py

+1 −1

@@ -1110,7 +1110,7 @@ def create_engine_config(self,
             disable_logprobs=self.disable_logprobs_during_spec_decoding,
         )
 
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
         # If the feature combo become valid
         if self.num_scheduler_steps > 1:
             if speculative_config is not None:

vllm/engine/output_processor/multi_step.py

+1 −1

@@ -65,7 +65,7 @@ def process_prompt_logprob(self, seq_group: SequenceGroup,
     @staticmethod
     @functools.lru_cache
     def _log_prompt_logprob_unsupported_warning_once():
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
         # If the feature combo become valid
         logger.warning(
             "Prompt logprob is not supported by multi step workers. "

vllm/executor/cpu_executor.py

+1 −1

@@ -23,7 +23,7 @@ class CPUExecutor(ExecutorBase):
 
     def _init_executor(self) -> None:
         assert self.device_config.device_type == "cpu"
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
         # If the feature combo become valid
         assert self.lora_config is None, "cpu backend doesn't support LoRA"
 

vllm/platforms/cpu.py

+1 −1

@@ -46,7 +46,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         import vllm.envs as envs
         from vllm.utils import GiB_bytes
         model_config = vllm_config.model_config
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
         # If the feature combo become valid
         if not model_config.enforce_eager:
             logger.warning(

vllm/spec_decode/spec_decode_worker.py

+1 −1

@@ -104,7 +104,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
     return spec_decode_worker
 
 
-# Reminder: Please update docs/source/serving/compatibility_matrix.rst
+# Reminder: Please update docs/source/usage/compatibility_matrix.rst
 # If the feature combo become valid
 class SpecDecodeWorker(LoraNotSupportedWorkerBase):
     """Worker which implements speculative decoding.

vllm/utils.py

+1 −1

@@ -47,7 +47,7 @@
 
 # Exception strings for non-implemented encoder/decoder scenarios
 
-# Reminder: Please update docs/source/serving/compatibility_matrix.rst
+# Reminder: Please update docs/source/usage/compatibility_matrix.rst
 # If the feature combo become valid
 
 STR_NOT_IMPL_ENC_DEC_SWA = \

vllm/worker/multi_step_model_runner.py

+1 −1

@@ -817,7 +817,7 @@ def _pythonize_sampler_output(
 
     for sgdx, (seq_group,
                sample_result) in enumerate(zip(seq_groups, samples_list)):
-        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
         # If the feature combo become valid
         # (Check for Guided Decoding)
         if seq_group.sampling_params.logits_processors:

vllm/worker/utils.py

+1 −1

@@ -13,7 +13,7 @@ def assert_enc_dec_mr_supported_scenario(
     a supported scenario.
     '''
 
-    # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+    # Reminder: Please update docs/source/usage/compatibility_matrix.rst
     # If the feature combo become valid
 
     if enc_dec_mr.cache_config.enable_prefix_caching:
