From 923f036519fa2d7c386edd4784543cd701eadcd0 Mon Sep 17 00:00:00 2001
From: Joel Niklaus
Date: Wed, 5 Feb 2025 15:46:36 +0100
Subject: [PATCH 1/2] Improved stability of litellm models for reasoning models.

---
 src/lighteval/models/litellm_model.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/lighteval/models/litellm_model.py b/src/lighteval/models/litellm_model.py
index 840061788..d995f7369 100644
--- a/src/lighteval/models/litellm_model.py
+++ b/src/lighteval/models/litellm_model.py
@@ -22,6 +22,7 @@

 import logging
 import os
+import re
 import time
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
@@ -93,12 +94,17 @@ def __init__(self, config, env_config) -> None:
         litellm.drop_params = True
         litellm.set_verbose = False

+    def is_reasoning_model(self):
+        return "o1" in self.model or "o3" in self.model or "R1" in self.model
+
     def _prepare_stop_sequence(self, stop_sequence):
         """Prepare and validate stop sequence."""
         if self.provider == "anthropic":
             # Filter out whitespace-only stop sequences
             if stop_sequence:
                 stop_sequence = [s for s in stop_sequence if s and s.strip()]
+                if not stop_sequence:  # If empty after filtering
+                    stop_sequence = ["\n"]
         return stop_sequence

     def _prepare_max_new_tokens(self, max_new_tokens):
@@ -106,7 +112,7 @@ def _prepare_max_new_tokens(self, max_new_tokens):
         if not max_new_tokens or max_new_tokens <= 0:
             return None

-        if "o1" in self.model:
+        if self.is_reasoning_model():
             # We need to allow more tokens to include reasoning tokens
             max_new_tokens = min(max_new_tokens * 10, 32000)
         return max_new_tokens
@@ -132,8 +138,8 @@ def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_se
                 "n": num_samples,
                 "caching": True,
             }
-            if "o1" in self.model:
-                logger.warning("O1 models do not support temperature, top_p, stop sequence. Disabling.")
+            if self.is_reasoning_model():
+                logger.warning("Reasoning models do not support temperature, top_p, stop sequence. Disabling.")
Disabling.") else: kwargs["temperature"] = self.TEMPERATURE kwargs["top_p"] = self.TOP_P @@ -142,10 +148,17 @@ def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_se response = litellm.completion(**kwargs) # If response is empty, retry without caching (maybe the error is recoverable and solved with a retry) - if response.choices[0].message.content is None: + content = response.choices[0].message.content + if not content: kwargs["caching"] = False logger.info("Response is empty, retrying without caching") response = litellm.completion(**kwargs) + + if content and "" in content: + logger.debug(f"Removing tags from response: {content}") + response.choices[0].message.content = re.sub( + r".*?", "", content, flags=re.DOTALL + ).strip() return response except litellm.BadRequestError as e: if "message" in e.__dict__: From 1a468b7c8dd3c59fe5421e2faabb473e1f89ac80 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Wed, 12 Feb 2025 17:16:10 +0100 Subject: [PATCH 2/2] Update src/lighteval/models/litellm_model.py Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> --- src/lighteval/models/litellm_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/models/litellm_model.py b/src/lighteval/models/litellm_model.py index d995f7369..6c37c0e68 100644 --- a/src/lighteval/models/litellm_model.py +++ b/src/lighteval/models/litellm_model.py @@ -154,7 +154,7 @@ def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_se logger.info("Response is empty, retrying without caching") response = litellm.completion(**kwargs) - if content and "" in content: + if content is not None and "" in content: logger.debug(f"Removing tags from response: {content}") response.choices[0].message.content = re.sub( r".*?", "", content, flags=re.DOTALL