Fix error showing time spent in llama perf context print #1898

Merged
4 changes: 4 additions & 0 deletions llama_cpp/llama.py
@@ -94,6 +94,7 @@ def __init__(
offload_kqv: bool = True,
flash_attn: bool = False,
# Sampling Params
no_perf: bool = False,
last_n_tokens_size: int = 64,
# LoRA Params
lora_base: Optional[str] = None,
@@ -173,6 +174,7 @@ def __init__(
embedding: Embedding mode only.
offload_kqv: Offload K, Q, V to GPU.
flash_attn: Use flash attention.
no_perf: Whether to disable performance timing measurements.
last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
lora_path: Path to a LoRA file to apply to the model.
@@ -351,6 +353,7 @@ def __init__(
if type_v is not None:
self.context_params.type_v = type_v
# Sampling Params
self.context_params.no_perf = no_perf
self.last_n_tokens_size = last_n_tokens_size

self.cache: Optional[BaseLlamaCache] = None
@@ -2093,6 +2096,7 @@ def __getstate__(self):
offload_kqv=self.context_params.offload_kqv,
flash_attn=self.context_params.flash_attn,
# Sampling Params
no_perf=self.context_params.no_perf,
last_n_tokens_size=self.last_n_tokens_size,
# LoRA Params
lora_base=self.lora_base,
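A brief sketch of how the new parameter might be used from Python, assuming the constructor change above; the model path is a placeholder and the exact timing output depends on the underlying llama.cpp build:

```python
# Hypothetical usage sketch; the model path below is a placeholder.
from llama_cpp import Llama

llm = Llama(
    model_path="./models/model.gguf",  # placeholder path
    no_perf=False,  # keep performance timings enabled (the default)
)
# With no_perf left at False, the context collects timing data, so the
# time-spent figures reported by llama.cpp's perf context print reflect
# real measurements rather than uninitialized values.
```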
3 changes: 3 additions & 0 deletions llama_cpp/llama_cpp.py
@@ -780,6 +780,7 @@ class llama_context_params(ctypes.Structure):
embeddings (bool): if true, extract embeddings (together with logits)
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
flash_attn (bool): whether to use flash attention
no_perf (bool): whether to disable performance timing measurements
abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
"""
@@ -810,6 +811,7 @@ class llama_context_params(ctypes.Structure):
embeddings: bool
offload_kqv: bool
flash_attn: bool
no_perf: bool
abort_callback: Callable[[ctypes.c_void_p], bool]
abort_callback_data: ctypes.c_void_p

@@ -839,6 +841,7 @@ class llama_context_params(ctypes.Structure):
("embeddings", ctypes.c_bool),
("offload_kqv", ctypes.c_bool),
("flash_attn", ctypes.c_bool),
("no_perf", ctypes.c_bool),
("abort_callback", ggml_abort_callback),
("abort_callback_data", ctypes.c_void_p),
]
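Because `llama_context_params` mirrors a C struct, the new field has to be declared in the same position it occupies in llama.h. The standalone sketch below (not the library's actual definition, though the field names echo it) illustrates how ctypes derives offsets purely from `_fields_` declaration order:

```python
# Standalone illustration of ctypes field-order sensitivity.
import ctypes

class Demo(ctypes.Structure):
    _fields_ = [
        ("flash_attn", ctypes.c_bool),
        ("no_perf", ctypes.c_bool),            # must sit exactly where the C header puts it
        ("abort_callback_data", ctypes.c_void_p),
    ]

# Offsets are computed from declaration order (plus alignment), so a field
# added in the wrong place would shift every later field's offset and
# desynchronize the Python struct from the C one.
print(Demo.no_perf.offset, Demo.abort_callback_data.offset)
```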