Commit 00623a9

markmc authored and GWS0428 committed
[V1][Metrics] Add per-request prompt/generation_tokens histograms (vllm-project#12516)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
1 parent fa6ac5c commit 00623a9

File tree

5 files changed (+102, -14 lines)

tests/entrypoints/openai/test_metrics.py (+6)

@@ -202,6 +202,12 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:num_requests_waiting",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
+    "vllm:request_prompt_tokens_sum",
+    "vllm:request_prompt_tokens_bucket",
+    "vllm:request_prompt_tokens_count",
+    "vllm:request_generation_tokens_sum",
+    "vllm:request_generation_tokens_bucket",
+    "vllm:request_generation_tokens_count",
 ]
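
The six expected names above are not six separate metrics: they are the standard Prometheus exposition of the two new histograms. A minimal sketch, independent of the commit (the label value "my-model" and the bucket list are placeholders), showing that a single prometheus_client Histogram is exported as _sum, _bucket, and _count series:

import prometheus_client

# Placeholder histogram mirroring the naming used in the commit.
hist = prometheus_client.Histogram(
    name="vllm:request_prompt_tokens",
    documentation="Number of prefill tokens processed.",
    buckets=[1, 2, 5, 10],
    labelnames=["model_name"]).labels("my-model")
hist.observe(7)  # one finished request with 7 prompt tokens

# The default registry now exposes all three series names the test expects.
exposition = prometheus_client.generate_latest().decode()
for series in ("vllm:request_prompt_tokens_sum",
               "vllm:request_prompt_tokens_bucket",
               "vllm:request_prompt_tokens_count"):
    assert series in exposition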

vllm/v1/engine/async_llm.py (+1, -2)

@@ -53,8 +53,7 @@ def __init__(
         self.log_stats = log_stats
         self.stat_loggers: List[StatLoggerBase] = [
             LoggingStatLogger(),
-            PrometheusStatLogger(labels=dict(
-                model_name=self.model_config.served_model_name)),
+            PrometheusStatLogger(vllm_config.model_config),
         ]

         # Tokenizer (+ ensure liveness if running in another process).

vllm/v1/engine/output_processor.py (+9, -2)

@@ -8,7 +8,7 @@
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
 from vllm.v1.engine.detokenizer import (DetokenizerOutput,
                                         IncrementalDetokenizer)
-from vllm.v1.metrics.stats import IterationStats
+from vllm.v1.metrics.stats import IterationStats, RequestStateStats


 @dataclass
@@ -37,6 +37,8 @@ def __init__(
         self.is_prefilling = True
         self.queue = queue

+        self.stats = RequestStateStats()
+
     @classmethod
     def from_new_request(
         cls,
@@ -146,7 +148,8 @@ def process_outputs(
             # 1) Compute stats for this iteration.
             iteration_stats.update_from_output(engine_core_output,
                                                req_state.is_prefilling,
-                                               req_state.prompt_len)
+                                               req_state.prompt_len,
+                                               req_state.stats)
             req_state.is_prefilling = False

             # 2) Detokenize the token ids into text.
@@ -171,6 +174,10 @@ def process_outputs(
                     # detected stop string, abort needed in EngineCore.
                     reqs_to_abort.append(req_id)

+                # Track per-request stats
+                iteration_stats.update_from_finished_request(
+                    request_output, req_state.stats)
+
         return OutputProcessorOutput(
             request_outputs=request_outputs,
             reqs_to_abort=reqs_to_abort,
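
To make the accumulation pattern concrete, here is a toy walk-through, not vLLM code: SimpleNamespace stands in for EngineCoreOutput and RequestOutput, and the two dataclasses are copied from the stats.py change below. Per-step token counts accumulate in the request's RequestStateStats; only when the request finishes is a FinishedRequestStats snapshot appended for the histograms.

from dataclasses import dataclass
from types import SimpleNamespace
from typing import List

@dataclass
class RequestStateStats:
    num_generation_tokens: int = 0

@dataclass
class FinishedRequestStats:
    num_prompt_tokens: int = 0
    num_generation_tokens: int = 0

req_stats = RequestStateStats()
finished_requests: List[FinishedRequestStats] = []

# Three decode steps, each producing one new token for this request
# (what update_from_output accumulates per iteration).
for _ in range(3):
    output = SimpleNamespace(new_token_ids=[42])
    req_stats.num_generation_tokens += len(output.new_token_ids)

# On finish, snapshot the per-request totals
# (what update_from_finished_request does).
request_output = SimpleNamespace(prompt_token_ids=list(range(10)))
finished_requests.append(
    FinishedRequestStats(len(request_output.prompt_token_ids),
                         req_stats.num_generation_tokens))

print(finished_requests)
# [FinishedRequestStats(num_prompt_tokens=10, num_generation_tokens=3)]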

vllm/v1/metrics/loggers.py (+54, -6)

@@ -1,10 +1,11 @@
 import time
 from abc import ABC, abstractmethod
-from typing import Dict, List
+from typing import List

 import numpy as np
 import prometheus_client

+from vllm.config import ModelConfig
 from vllm.logger import init_logger
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats

@@ -78,13 +79,13 @@ def log(self, scheduler_stats: SchedulerStats,

 class PrometheusStatLogger(StatLoggerBase):

-    def __init__(self, labels: Dict[str, str]):
-        self.labels = labels
+    def __init__(self, model_config: ModelConfig):
+        self._unregister_vllm_metrics()

-        labelnames = self.labels.keys()
-        labelvalues = self.labels.values()
+        labelnames = ["model_name"]
+        labelvalues = [model_config.served_model_name]

-        self._unregister_vllm_metrics()
+        max_model_len = model_config.max_model_len

         self.gauge_scheduler_running = prometheus_client.Gauge(
             name="vllm:num_requests_running",
@@ -106,6 +107,20 @@ def __init__(self, labels: Dict[str, str]):
             documentation="Number of generation tokens processed.",
             labelnames=labelnames).labels(*labelvalues)

+        self.histogram_num_prompt_tokens_request = \
+            prometheus_client.Histogram(
+                name="vllm:request_prompt_tokens",
+                documentation="Number of prefill tokens processed.",
+                buckets=build_1_2_5_buckets(max_model_len),
+                labelnames=labelnames).labels(*labelvalues)
+
+        self.histogram_num_generation_tokens_request = \
+            prometheus_client.Histogram(
+                name="vllm:request_generation_tokens",
+                documentation="Number of generation tokens processed.",
+                buckets=build_1_2_5_buckets(max_model_len),
+                labelnames=labelnames).labels(*labelvalues)
+
     def log(self, scheduler_stats: SchedulerStats,
             iteration_stats: IterationStats):
         """Log to prometheus."""
@@ -116,9 +131,42 @@ def log(self, scheduler_stats: SchedulerStats,
         self.counter_generation_tokens.inc(
             iteration_stats.num_generation_tokens)

+        for finished_request in iteration_stats.finished_requests:
+            self.histogram_num_prompt_tokens_request.observe(
+                finished_request.num_prompt_tokens)
+            self.histogram_num_generation_tokens_request.observe(
+                finished_request.num_generation_tokens)
+
     @staticmethod
     def _unregister_vllm_metrics():
         # Unregister any existing vLLM collectors (for CI/CD)
         for collector in list(prometheus_client.REGISTRY._collector_to_names):
             if hasattr(collector, "_name") and "vllm" in collector._name:
                 prometheus_client.REGISTRY.unregister(collector)
+
+
+def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
+    """
+    Builds a list of buckets with increasing powers of 10 multiplied by
+    mantissa values until the value exceeds the specified maximum.
+    """
+    exponent = 0
+    buckets: List[int] = []
+    while True:
+        for m in mantissa_lst:
+            value = m * 10**exponent
+            if value <= max_value:
+                buckets.append(value)
+            else:
+                return buckets
+        exponent += 1
+
+
+def build_1_2_5_buckets(max_value: int) -> List[int]:
+    """
+    Example:
+    >>> build_1_2_5_buckets(100)
+    [1, 2, 5, 10, 20, 50, 100]
+    """
+    return build_buckets([1, 2, 5], max_value)
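
For intuition on the bucket boundaries the histograms receive: build_1_2_5_buckets() emits 1, 2, and 5 times increasing powers of ten up to max_model_len. A small self-contained check (the helper's logic is restated so the snippet runs on its own; 32768 is an assumed max_model_len, not a value from the commit):

from typing import List

def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
    # Same logic as the helper added above.
    exponent = 0
    buckets: List[int] = []
    while True:
        for m in mantissa_lst:
            value = m * 10**exponent
            if value <= max_value:
                buckets.append(value)
            else:
                return buckets
        exponent += 1

print(build_buckets([1, 2, 5], 32768))
# [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000]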

vllm/v1/metrics/stats.py (+32, -4)

@@ -1,7 +1,8 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, List

 if TYPE_CHECKING:
+    from vllm.outputs import RequestOutput
     from vllm.v1.engine import EngineCoreOutput


@@ -16,24 +17,51 @@ class SchedulerStats:
     # gpu_prefix_cache_hit_rate: float = 0.0


+@dataclass
+class RequestStateStats:
+    """Stats that need to be tracked across delta updates."""
+
+    num_generation_tokens: int = 0
+
+
+@dataclass
+class FinishedRequestStats:
+    """Stats associated with a finished request."""
+
+    num_prompt_tokens: int = 0
+    num_generation_tokens: int = 0
+
+
 class IterationStats:
     """Stats associated with a single set of EngineCoreOutputs."""

     def __init__(self, log_stats: bool):
         self.log_stats = log_stats
         self.num_generation_tokens = 0
         self.num_prompt_tokens = 0
+        self.finished_requests: List[FinishedRequestStats] = []

     def update_from_output(self, output: "EngineCoreOutput",
-                           is_prefilling: bool, prompt_len: int):
+                           is_prefilling: bool, prompt_len: int,
+                           request_state_stats: RequestStateStats):
         if not self.log_stats:
             return

-        self.num_generation_tokens += len(output.new_token_ids)
+        num_new_generation_tokens = len(output.new_token_ids)
+
+        self.num_generation_tokens += num_new_generation_tokens
         if is_prefilling:
             # This relies on the invariant that EngineCore does
             # not stream outputs for partially completed prefills
             # (scheduler.update_from_output makes EngineCoreOutput
             # iff num_computed_tokens == num_tokens).
-            assert (len(output.new_token_ids) > 0)
+            assert (num_new_generation_tokens > 0)
             self.num_prompt_tokens += prompt_len
+
+        request_state_stats.num_generation_tokens += num_new_generation_tokens
+
+    def update_from_finished_request(self, request_output: "RequestOutput",
+                                     request_state_stats: RequestStateStats):
+        self.finished_requests.append(
+            FinishedRequestStats(len(request_output.prompt_token_ids),
+                                 request_state_stats.num_generation_tokens))
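
A note on why per-request histograms are added alongside the existing totals counters: the _sum/_count pair lets a consumer derive per-request averages, and _bucket exposes the full distribution. A hedged arithmetic sketch with made-up scraped values:

# Hypothetical values scraped from /metrics after 100 finished requests.
request_prompt_tokens_sum = 1900.0    # vllm:request_prompt_tokens_sum
request_prompt_tokens_count = 100.0   # vllm:request_prompt_tokens_count

mean_prompt_tokens_per_request = (
    request_prompt_tokens_sum / request_prompt_tokens_count)
print(mean_prompt_tokens_per_request)  # 19.0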
