Skip to content

Commit ec7e49a

Browse files
markmc authored and kerthcet committed
[V1][Metrics] Add TTFT and TPOT histograms (vllm-project#12530)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
1 parent dbe5b1f commit ec7e49a

File tree

4 files changed

+45
-1
lines changed

4 files changed

+45
-1
lines changed

tests/entrypoints/openai/test_metrics.py

+6
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,12 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
208208
"vllm:request_generation_tokens_sum",
209209
"vllm:request_generation_tokens_bucket",
210210
"vllm:request_generation_tokens_count",
211+
"vllm:time_to_first_token_seconds_sum",
212+
"vllm:time_to_first_token_seconds_bucket",
213+
"vllm:time_to_first_token_seconds_count",
214+
"vllm:time_per_output_token_seconds_sum",
215+
"vllm:time_per_output_token_seconds_bucket",
216+
"vllm:time_per_output_token_seconds_count",
211217
]
212218

213219

vllm/v1/engine/output_processor.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def __init__(
2727
prompt: Optional[str],
2828
prompt_token_ids: List[int],
2929
detokenizer: IncrementalDetokenizer,
30+
arrival_time: float,
3031
queue: Optional[asyncio.Queue[RequestOutput]],
3132
):
3233
self.request_id = request_id
@@ -37,7 +38,7 @@ def __init__(
3738
self.is_prefilling = True
3839
self.queue = queue
3940

40-
self.stats = RequestStateStats()
41+
self.stats = RequestStateStats(last_token_time=arrival_time)
4142

4243
@classmethod
4344
def from_new_request(
@@ -54,6 +55,7 @@ def from_new_request(
5455
tokenizer=tokenizer,
5556
request=request,
5657
),
58+
arrival_time=request.arrival_time,
5759
queue=queue,
5860
)
5961

vllm/v1/metrics/loggers.py

+25
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,26 @@ def __init__(self, model_config: ModelConfig):
121121
buckets=build_1_2_5_buckets(max_model_len),
122122
labelnames=labelnames).labels(*labelvalues)
123123

124+
self.histogram_time_to_first_token = \
125+
prometheus_client.Histogram(
126+
name="vllm:time_to_first_token_seconds",
127+
documentation="Histogram of time to first token in seconds.",
128+
buckets=[
129+
0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
130+
0.75, 1.0, 2.5, 5.0, 7.5, 10.0
131+
],
132+
labelnames=labelnames).labels(*labelvalues)
133+
134+
self.histogram_time_per_output_token = \
135+
prometheus_client.Histogram(
136+
name="vllm:time_per_output_token_seconds",
137+
documentation="Histogram of time per output token in seconds.",
138+
buckets=[
139+
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5,
140+
0.75, 1.0, 2.5
141+
],
142+
labelnames=labelnames).labels(*labelvalues)
143+
124144
def log(self, scheduler_stats: SchedulerStats,
125145
iteration_stats: IterationStats):
126146
"""Log to prometheus."""
@@ -137,6 +157,11 @@ def log(self, scheduler_stats: SchedulerStats,
137157
self.histogram_num_generation_tokens_request.observe(
138158
finished_request.num_generation_tokens)
139159

160+
for ttft in iteration_stats.time_to_first_tokens_iter:
161+
self.histogram_time_to_first_token.observe(ttft)
162+
for tpot in iteration_stats.time_per_output_tokens_iter:
163+
self.histogram_time_per_output_token.observe(tpot)
164+
140165
@staticmethod
141166
def _unregister_vllm_metrics():
142167
# Unregister any existing vLLM collectors (for CI/CD

vllm/v1/metrics/stats.py

+11
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import time
12
from dataclasses import dataclass
23
from typing import TYPE_CHECKING, List
34

@@ -22,6 +23,7 @@ class RequestStateStats:
2223
"""Stats that need to be tracked across delta updates."""
2324

2425
num_generation_tokens: int = 0
26+
last_token_time: float = 0.0
2527

2628

2729
@dataclass
@@ -40,6 +42,8 @@ def __init__(self, log_stats: bool):
4042
self.num_generation_tokens = 0
4143
self.num_prompt_tokens = 0
4244
self.finished_requests: List[FinishedRequestStats] = []
45+
self.time_to_first_tokens_iter: List[float] = []
46+
self.time_per_output_tokens_iter: List[float] = []
4347

4448
def update_from_output(self, output: "EngineCoreOutput",
4549
is_prefilling: bool, prompt_len: int,
@@ -48,6 +52,8 @@ def update_from_output(self, output: "EngineCoreOutput",
4852
return
4953

5054
num_new_generation_tokens = len(output.new_token_ids)
55+
now = time.time()
56+
last_token_latency = now - request_state_stats.last_token_time
5157

5258
self.num_generation_tokens += num_new_generation_tokens
5359
if is_prefilling:
@@ -58,7 +64,12 @@ def update_from_output(self, output: "EngineCoreOutput",
5864
assert (num_new_generation_tokens > 0)
5965
self.num_prompt_tokens += prompt_len
6066

67+
self.time_to_first_tokens_iter.append(last_token_latency)
68+
else:
69+
self.time_per_output_tokens_iter.append(last_token_latency)
70+
6171
request_state_stats.num_generation_tokens += num_new_generation_tokens
72+
request_state_stats.last_token_time = now
6273

6374
def update_from_finished_request(self, request_output: "RequestOutput",
6475
request_state_stats: RequestStateStats):

0 commit comments

Comments
 (0)