Skip to content

Commit 346353e

Browse files
gcalmettes authored and tlrmchlsmth committed
[Bugfix] Ensure special tokens are properly filtered out for guided structured output with MistralTokenizer (vllm-project#10363)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com> Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
1 parent 1f0d75b commit 346353e

File tree

2 files changed

+17
-6
lines changed

2 files changed

+17
-6
lines changed

requirements-common.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ pillow # Required for image processing
1717
prometheus_client >= 0.18.0
1818
prometheus-fastapi-instrumentator >= 7.0.0
1919
tiktoken >= 0.6.0 # Required for DBRX tokenizer
20-
lm-format-enforcer == 0.10.6
20+
lm-format-enforcer >= 0.10.9, < 0.11
2121
outlines >= 0.0.43, < 0.1
2222
typing_extensions >= 4.10
2323
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
@@ -31,4 +31,4 @@ pyyaml
3131
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
3232
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
3333
einops # Required for Qwen2-VL.
34-
compressed-tensors == 0.8.0 # required for compressed-tensors
34+
compressed-tensors == 0.8.0 # required for compressed-tensors

vllm/transformers_utils/tokenizers/mistral.py

+15-4
Original file line numberDiff line numberDiff line change
@@ -174,18 +174,29 @@ def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
174174
revision=revision)
175175
return tokenizer_file
176176

177-
# the following attributes are set to fit VLLM's design
177+
# the following attributes are set to fit VLLM's design and are used
178+
# by the guided structured output backends.
178179
@property
179180
def all_special_tokens_extended(self) -> List[str]:
180-
return []
181+
# tekken defines its own extended special tokens list
182+
if hasattr(self.tokenizer, "SPECIAL_TOKENS"):
183+
special_tokens = self.tokenizer.SPECIAL_TOKENS
184+
else:
185+
special_tokens = list(SpecialTokens)
186+
return [
187+
s.value if isinstance(s, SpecialTokens) else s
188+
for s in special_tokens
189+
]
181190

182191
@property
183192
def all_special_tokens(self) -> List[str]:
184-
return []
193+
return self.all_special_tokens_extended
185194

186195
@property
187196
def all_special_ids(self) -> List[int]:
188-
return []
197+
return [
198+
self.all_special_tokens.index(t) for t in self.all_special_tokens
199+
]
189200

190201
@property
191202
def bos_token_id(self) -> int:

0 commit comments

Comments (0)