
Commit 0102f31
[Tokenizer] Fix TokenizerFast missing clean_up_tokenization_spaces (#…)
dynamicheart authored Oct 23, 2024
1 parent a564483 commit 0102f31
Showing 1 changed file with 3 additions and 0 deletions.
paddlenlp/transformers/tokenizer_utils_base.py (3 additions, 0 deletions)
@@ -1371,6 +1371,9 @@ def __init__(self, **kwargs):

         self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
 
+        # By default, cleaning tokenization spaces for both fast and slow tokenizers
+        self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False)
+
         # By default, do not split special tokens for both fast and slow tokenizers
         self.split_special_tokens = kwargs.pop("split_special_tokens", False)

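For readers unfamiliar with the option, below is a minimal illustrative sketch (not PaddleNLP code; the helper name and the exact replacement rules are assumptions) of the kind of post-processing a clean_up_tokenization_spaces flag typically controls: stripping the extra spaces that detokenization leaves before punctuation and inside English contractions.

    # Illustrative sketch only, not the PaddleNLP implementation.
    def clean_up_tokenization(text: str) -> str:
        # Remove detokenization artifacts such as "hello ," or "it 's".
        for before, after in [
            (" .", "."), (" ?", "?"), (" !", "!"), (" ,", ","),
            (" n't", "n't"), (" 'm", "'m"), (" 's", "'s"),
            (" 've", "'ve"), (" 're", "'re"),
        ]:
            text = text.replace(before, after)
        return text

    print(clean_up_tokenization("hello , world ! it 's fine ."))
    # hello, world! it's fine.

With the default of False added in this commit, decoded text keeps these spaces unless the caller explicitly opts in to cleanup (typically via a clean_up_tokenization_spaces argument at decode time).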