diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py
index 32de7553992f..c5a54765b723 100644
--- a/paddlenlp/transformers/__init__.py
+++ b/paddlenlp/transformers/__init__.py
@@ -260,6 +260,7 @@
from .bloom.configuration import *
from .bloom.modeling import *
from .bloom.tokenizer import *
+from .bloom.tokenizer_fast import *
from .clipseg.configuration import *
from .clipseg.modeling import *
from .clipseg.processing import *
diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py
index 6fd8b5fcf2b0..84e537a1902f 100644
--- a/paddlenlp/transformers/auto/tokenizer.py
+++ b/paddlenlp/transformers/auto/tokenizer.py
@@ -54,7 +54,10 @@
),
),
("blenderbot", "BlenderbotTokenizer"),
- ("bloom", "BloomTokenizer"),
+ (
+ "bloom",
+ ("BloomTokenizer", "BloomTokenizerFast" if is_tokenizers_available() else None),
+ ),
("clip", "CLIPTokenizer"),
("codegen", "CodeGenTokenizer"),
("convbert", "ConvBertTokenizer"),
diff --git a/paddlenlp/transformers/bert/tokenizer_fast.py b/paddlenlp/transformers/bert/tokenizer_fast.py
index ba11e48f1b37..e8db2825dfb5 100644
--- a/paddlenlp/transformers/bert/tokenizer_fast.py
+++ b/paddlenlp/transformers/bert/tokenizer_fast.py
@@ -1,3 +1,4 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlenlp/transformers/bloom/tokenizer_fast.py b/paddlenlp/transformers/bloom/tokenizer_fast.py
new file mode 100644
index 000000000000..1658bcea9b98
--- /dev/null
+++ b/paddlenlp/transformers/bloom/tokenizer_fast.py
@@ -0,0 +1,132 @@
+# Copyright 2022 The HuggingFace Inc. team.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pickle
+from typing import Optional, Tuple
+
+from ..tokenizer_utils_base import BatchEncoding
+from ..tokenizer_utils_fast import PretrainedTokenizerFast
+from .tokenizer import BloomTokenizer
+
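+# The fast tokenizer carries its full state in a single serialized `tokenizer.json` file.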
+VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}
+
+
+class BloomTokenizerFast(PretrainedTokenizerFast):
+ r"""
+ Construct a "fast" Bloom tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
+ Byte-Pair-Encoding.
+
+ This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not.
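+
+    As a minimal illustration (a sketch only; the printed ids depend on the loaded vocabulary, and the
+    `bigscience/bloom-560m` checkpoint used in the tests is assumed here):
+
+    ```python
+    from paddlenlp.transformers import BloomTokenizerFast
+
+    tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
+    print(tokenizer("Hello world")["input_ids"])   # "Hello" encoded without a leading space
+    print(tokenizer(" Hello world")["input_ids"])  # the leading space is part of the first token, so the ids differ
+    ```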
+
+ You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
+ the model was not pretrained this way, it might yield a decrease in performance.
+
+    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
+
+    This tokenizer inherits from [`PretrainedTokenizerFast`] which contains most of the main methods. Users should
+ refer to this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ merges_file (`str`):
+ Path to the merges file.
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (`str`, *optional*, defaults to `<unk>`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
+            this token instead.
+        bos_token (`str`, *optional*, defaults to `<s>`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `</s>`):
+            The end of sequence token.
+        pad_token (`str`, *optional*, defaults to `<pad>`):
+            The token used for padding.
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial space to the input. This allows the leading word to be treated like
+            any other word. (The Bloom tokenizer detects the beginning of words by the preceding space.)
+ trim_offsets (`bool`, *optional*, defaults to `True`):
+ Whether or not the post-processing step should trim offsets to avoid including whitespaces.
+ """
+
+ resource_files_names = VOCAB_FILES_NAMES
+ model_input_names = ["input_ids", "attention_mask"]
+ slow_tokenizer_class = BloomTokenizer
+
+ def __init__(
+ self,
+ vocab_file=None,
+ merges_file=None,
+ tokenizer_file=None,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="<pad>",
+ add_prefix_space=False,
+ clean_up_tokenization_spaces=False,
+ **kwargs,
+ ):
+ super().__init__(
+ vocab_file=vocab_file,
+ merges_file=merges_file,
+ tokenizer_file=tokenizer_file,
+ unk_token=unk_token,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ pad_token=pad_token,
+ add_prefix_space=add_prefix_space,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ **kwargs,
+ )
+
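+        # Honor `add_prefix_space` by flipping the flag inside the pickled state of the backend
+        # pre-tokenizer and decoder, then rebuilding both objects from the patched payload.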
+ pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)
+ decoder_state = pickle.dumps(self.backend_tokenizer.decoder)
+
+ if add_prefix_space:
+ pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
+ decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
+ self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
+ self.backend_tokenizer.decoder = pickle.loads(decoder_state)
+
+ self.add_prefix_space = add_prefix_space
+
+ def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
+ is_split_into_words = kwargs.get("is_split_into_words", False)
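+        # Pre-split input (`is_split_into_words=True`) is only supported when the tokenizer was built with
+        # `add_prefix_space=True`, so that every word receives the leading space this byte-level BPE expects.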
+ if not (self.add_prefix_space or not is_split_into_words):
+ raise Exception(
+ f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
+ " pretokenized inputs."
+ )
+
+ return super()._batch_encode_plus(*args, **kwargs)
+
+ def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
+ is_split_into_words = kwargs.get("is_split_into_words", False)
+
+ if not (self.add_prefix_space or not is_split_into_words):
+ raise Exception(
+ f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
+ " pretokenized inputs."
+ )
+
+ return super()._encode_plus(*args, **kwargs)
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
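+        # Delegate to the backend BPE model, which writes its own files (typically vocab.json and merges.txt).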
+ files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+ return tuple(files)
diff --git a/paddlenlp/transformers/convert_slow_tokenizer.py b/paddlenlp/transformers/convert_slow_tokenizer.py
index 3cbd4a07cd9c..da36ec04d270 100644
--- a/paddlenlp/transformers/convert_slow_tokenizer.py
+++ b/paddlenlp/transformers/convert_slow_tokenizer.py
@@ -442,10 +442,7 @@ def pre_tokenizer(self, replacement, add_prefix_space):
return None
-SLOW_TO_FAST_CONVERTERS = {
- "LlamaTokenizer": LlamaConverter,
- "BertTokenizer": BertConverter,
-}
+SLOW_TO_FAST_CONVERTERS = {"LlamaTokenizer": LlamaConverter, "BertTokenizer": BertConverter}
def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokenizer:
diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py
index 8131c2c1a284..243e544da2ae 100644
--- a/tests/transformers/bloom/test_tokenizer.py
+++ b/tests/transformers/bloom/test_tokenizer.py
@@ -17,7 +17,7 @@
import os
import unittest
-from paddlenlp.transformers import BloomTokenizer
+from paddlenlp.transformers import BloomTokenizer, BloomTokenizerFast
from ..test_tokenizer_common import TokenizerTesterMixin
@@ -30,6 +30,7 @@
class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BloomTokenizer
+ rust_tokenizer_class = BloomTokenizerFast
from_pretrained_kwargs = {"add_prefix_space": True}
test_decode_token = True
test_seq2seq = False
@@ -90,8 +91,25 @@ def test_full_tokenizer(self):
input_tokens = tokens + [tokenizer.unk_token]
input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
+
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+
+    # check the fast tokenizer's encodings against reference values from a real checkpoint
+ def test_encodings_from_sample_data(self):
+ """
+        Assert that the generated token ids match the hard-coded ones
+ """
+ tokenizer = self.rust_tokenizer_class.from_pretrained("bigscience/bloom-560m")
+
+        INPUT_SENTENCES = ["The quick brown fox</s>", "jumps over the lazy dog</s>"]
+ TARGET_TOKENS = [[2175, 23714, 73173, 144252, 2], [77, 132619, 3478, 368, 109586, 35433, 2]]
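+        # the trailing 2 in each target sequence is the id bloom assigns to the literal "</s>" in the inputs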
+
+ computed_tokens = tokenizer.batch_encode(INPUT_SENTENCES)["input_ids"]
+ self.assertListEqual(TARGET_TOKENS, computed_tokens)
+
+ decoded_tokens = tokenizer.batch_decode(computed_tokens)
+ self.assertListEqual(decoded_tokens, INPUT_SENTENCES)
+
def test_pretokenized_inputs(self, *args, **kwargs):
pass