Skip to content

Commit

Permalink
benchmark: include convert latency in bench_append_paged_kv_cache (#590)
Browse files Browse the repository at this point in the history
```
model: l1b      seqlens: [1, 1, 1, 1, 1, 1, 1, 1]                 convert: 45us 1layer:  7us 16layers: 151us throughput:    4.936GB/s
model: l1b      seqlens: [4993, 1, 1, 1, 1, 1, 1, 1]              convert: 42us 1layer: 14us 16layers: 271us throughput: 1434.769GB/s
model: l1b      seqlens: [5000]                                   convert: 44us 1layer: 14us 16layers: 272us throughput: 1438.581GB/s
model: l1b      seqlens: [625, 625, 625, 625, 625, 625, 625, 625] convert: 46us 1layer: 14us 16layers: 274us throughput: 1440.357GB/s
---
model: l3b      seqlens: [1, 1, 1, 1, 1, 1, 1, 1]                 convert: 42us 1layer:  7us 28layers: 226us throughput:    9.946GB/s
model: l3b      seqlens: [4993, 1, 1, 1, 1, 1, 1, 1]              convert: 43us 1layer: 22us 28layers: 647us throughput: 1896.687GB/s
model: l3b      seqlens: [5000]                                   convert: 42us 1layer: 22us 28layers: 646us throughput: 1898.796GB/s
model: l3b      seqlens: [625, 625, 625, 625, 625, 625, 625, 625] convert: 41us 1layer: 22us 28layers: 648us throughput: 1890.115GB/s
---
model: l8b      seqlens: [1, 1, 1, 1, 1, 1, 1, 1]                 convert: 41us 1layer:  7us 32layers: 252us throughput:    9.940GB/s
model: l8b      seqlens: [4993, 1, 1, 1, 1, 1, 1, 1]              convert: 42us 1layer: 21us 32layers: 730us throughput: 1905.826GB/s
model: l8b      seqlens: [5000]                                   convert: 41us 1layer: 22us 32layers: 729us throughput: 1903.697GB/s
model: l8b      seqlens: [625, 625, 625, 625, 625, 625, 625, 625] convert: 47us 1layer: 22us 32layers: 737us throughput: 1899.630GB/s
---
model: l70b-tp8 seqlens: [1, 1, 1, 1, 1, 1, 1, 1]                 convert: 42us 1layer:  6us 80layers: 552us throughput:    1.283GB/s
model: l70b-tp8 seqlens: [4993, 1, 1, 1, 1, 1, 1, 1]              convert: 41us 1layer:  9us 80layers: 800us throughput:  539.484GB/s
model: l70b-tp8 seqlens: [5000]                                   convert: 41us 1layer:  9us 80layers: 788us throughput:  548.648GB/s
model: l70b-tp8 seqlens: [625, 625, 625, 625, 625, 625, 625, 625] convert: 41us 1layer: 10us 80layers: 803us throughput:  537.731GB/s
```
  • Loading branch information
abcdabcd987 authored Nov 6, 2024
1 parent e15f7c9 commit d7300c4
Showing 1 changed file with 17 additions and 11 deletions.
28 changes: 17 additions & 11 deletions benchmarks/bench_append_paged_kv_cache.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import argparse
import dataclasses
from typing import cast
from typing import Tuple, cast

import torch
from triton.testing import do_bench
Expand Down Expand Up @@ -99,14 +99,19 @@ def main():
dtype=torch.int32,
)

batch_indices, positions = flashinfer.get_batch_indices_positions(
x_indptr,
flashinfer.get_seq_lens(kv_indptr, kv_last_page_len, page_len),
k.shape[0],
)
@torch.cuda.nvtx.range(f"convert model={model_name}, seqlens={seqlens}")
def fn_convert() -> Tuple[torch.Tensor, torch.Tensor]:
return flashinfer.get_batch_indices_positions(
x_indptr,
flashinfer.get_seq_lens(kv_indptr, kv_last_page_len, page_len),
k.shape[0],
)

batch_indices, positions = fn_convert()
convert_latency_ms = cast(float, do_bench(fn_convert))

@torch.cuda.nvtx.range(f"model={model_name}, seqlens={seqlens}")
def fn():
@torch.cuda.nvtx.range(f"append model={model_name}, seqlens={seqlens}")
def fn() -> None:
flashinfer.append_paged_kv_cache(
k,
v,
Expand All @@ -120,7 +125,7 @@ def fn():
)

latency_ms = cast(float, do_bench(fn))
all_layers_latency_ms = latency_ms * model.num_layers
all_layers_latency_ms = convert_latency_ms + latency_ms * model.num_layers
throughput = (
k.numel()
* k.element_size()
Expand All @@ -131,8 +136,9 @@ def fn():
print(
f"model: {model_name:8}",
f"seqlens: {seqlens!r:{seqlen_strlen}}",
f"single_layer: {latency_ms:5.3f}ms",
f"all_layers: {all_layers_latency_ms:7.3f}ms",
f"convert: {convert_latency_ms*1e3:2.0f}us",
f"1layer: {latency_ms*1e3:2.0f}us",
f"{model.num_layers}layers: {all_layers_latency_ms*1e3:3.0f}us",
f"throughput: {throughput*1e-9:8.3f}GB/s",
)
print("---")
Expand Down

0 comments on commit d7300c4

Please sign in to comment.