diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py
index 32de7553992f..c5a54765b723 100644
--- a/paddlenlp/transformers/__init__.py
+++ b/paddlenlp/transformers/__init__.py
@@ -260,6 +260,7 @@
 from .bloom.configuration import *
 from .bloom.modeling import *
 from .bloom.tokenizer import *
+from .bloom.tokenizer_fast import *
 from .clipseg.configuration import *
 from .clipseg.modeling import *
 from .clipseg.processing import *
diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py
index 6fd8b5fcf2b0..84e537a1902f 100644
--- a/paddlenlp/transformers/auto/tokenizer.py
+++ b/paddlenlp/transformers/auto/tokenizer.py
@@ -54,7 +54,10 @@
             ),
         ),
         ("blenderbot", "BlenderbotTokenizer"),
-        ("bloom", "BloomTokenizer"),
+        (
+            "bloom",
+            ("BloomTokenizer", "BloomTokenizerFast" if is_tokenizers_available() else None),
+        ),
         ("clip", "CLIPTokenizer"),
         ("codegen", "CodeGenTokenizer"),
         ("convbert", "ConvBertTokenizer"),
diff --git a/paddlenlp/transformers/bert/tokenizer_fast.py b/paddlenlp/transformers/bert/tokenizer_fast.py
index ba11e48f1b37..e8db2825dfb5 100644
--- a/paddlenlp/transformers/bert/tokenizer_fast.py
+++ b/paddlenlp/transformers/bert/tokenizer_fast.py
@@ -1,3 +1,4 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlenlp/transformers/bloom/tokenizer_fast.py b/paddlenlp/transformers/bloom/tokenizer_fast.py
new file mode 100644
index 000000000000..1658bcea9b98
--- /dev/null
+++ b/paddlenlp/transformers/bloom/tokenizer_fast.py
@@ -0,0 +1,132 @@
+# Copyright 2022 The HuggingFace Inc. team.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pickle
+from typing import Optional, Tuple
+
+from ..tokenizer_utils_base import BatchEncoding
+from ..tokenizer_utils_fast import PretrainedTokenizerFast
+from .tokenizer import BloomTokenizer
+
+VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}
+
+
+class BloomTokenizerFast(PretrainedTokenizerFast):
+    r"""
+    Construct a "fast" Bloom tokenizer (backed by HuggingFace's *tokenizers* library), based on byte-level
+    Byte-Pair-Encoding.
+
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece), so a word
+    will be encoded differently depending on whether it is at the beginning of the sentence (without a space) or not.
+
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
+    the model was not pretrained this way, it might yield a decrease in performance.
+
+    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
+
+    This tokenizer inherits from [`PretrainedTokenizerFast`], which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        merges_file (`str`):
+            Path to the merges file.
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See
+            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
+            this token instead.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial space to the input. This makes it possible to treat the leading word
+            just like any other word. (The Bloom tokenizer detects the beginning of words by the preceding space.)
+        trim_offsets (`bool`, *optional*, defaults to `True`):
+            Whether or not the post-processing step should trim offsets to avoid including whitespaces.
+    """
+
+    resource_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = BloomTokenizer
+
+    def __init__(
+        self,
+        vocab_file=None,
+        merges_file=None,
+        tokenizer_file=None,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="<pad>",
+        add_prefix_space=False,
+        clean_up_tokenization_spaces=False,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            tokenizer_file=tokenizer_file,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            add_prefix_space=add_prefix_space,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+        pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)
+        decoder_state = pickle.dumps(self.backend_tokenizer.decoder)
+
+        if add_prefix_space:
+            pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
+            decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
+            self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
+            self.backend_tokenizer.decoder = pickle.loads(decoder_state)
+
+        self.add_prefix_space = add_prefix_space
+
+    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
+        is_split_into_words = kwargs.get("is_split_into_words", False)
+        if not (self.add_prefix_space or not is_split_into_words):
+            raise Exception(
+                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
+                " pretokenized inputs."
+            )
+
+        return super()._batch_encode_plus(*args, **kwargs)
+
+    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
+        is_split_into_words = kwargs.get("is_split_into_words", False)
+
+        if not (self.add_prefix_space or not is_split_into_words):
+            raise Exception(
+                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
+                " pretokenized inputs."
+            )
+
+        return super()._encode_plus(*args, **kwargs)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+        return tuple(files)
diff --git a/paddlenlp/transformers/convert_slow_tokenizer.py b/paddlenlp/transformers/convert_slow_tokenizer.py
index 3cbd4a07cd9c..da36ec04d270 100644
--- a/paddlenlp/transformers/convert_slow_tokenizer.py
+++ b/paddlenlp/transformers/convert_slow_tokenizer.py
@@ -442,10 +442,7 @@ def pre_tokenizer(self, replacement, add_prefix_space):
         return None
 
 
-SLOW_TO_FAST_CONVERTERS = {
-    "LlamaTokenizer": LlamaConverter,
-    "BertTokenizer": BertConverter,
-}
+SLOW_TO_FAST_CONVERTERS = {"LlamaTokenizer": LlamaConverter, "BertTokenizer": BertConverter}
 
 
 def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokenizer:
diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py
index 8131c2c1a284..243e544da2ae 100644
--- a/tests/transformers/bloom/test_tokenizer.py
+++ b/tests/transformers/bloom/test_tokenizer.py
@@ -17,7 +17,7 @@
 import os
 import unittest
 
-from paddlenlp.transformers import BloomTokenizer
+from paddlenlp.transformers import BloomTokenizer, BloomTokenizerFast
 
 from ..test_tokenizer_common import TokenizerTesterMixin
 
@@ -30,6 +30,7 @@
 class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 
     tokenizer_class = BloomTokenizer
+    rust_tokenizer_class = BloomTokenizerFast
     from_pretrained_kwargs = {"add_prefix_space": True}
     test_decode_token = True
     test_seq2seq = False
@@ -90,8 +91,25 @@ def test_full_tokenizer(self):
         input_tokens = tokens + [tokenizer.unk_token]
         input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
+
         self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
+    # test encode_plus
+    def test_encodings_from_sample_data(self):
+        """
+        Assert that the created tokens are the same as the hard-coded ones
+        """
+        tokenizer = self.rust_tokenizer_class.from_pretrained("bigscience/bloom-560m")
+
+        INPUT_SENTENCES = ["The quick brown fox", "jumps over the lazy dog"]
+        TARGET_TOKENS = [[2175, 23714, 73173, 144252, 2], [77, 132619, 3478, 368, 109586, 35433, 2]]
+
+        computed_tokens = tokenizer.batch_encode(INPUT_SENTENCES)["input_ids"]
+        self.assertListEqual(TARGET_TOKENS, computed_tokens)
+
+        decoded_tokens = tokenizer.batch_decode(computed_tokens)
+        self.assertListEqual(decoded_tokens, INPUT_SENTENCES)
+
     def test_pretokenized_inputs(self, *args, **kwargs):
         pass
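
For reference, a minimal usage sketch of the new fast tokenizer (not part of the diff). It assumes the `tokenizers` backend is installed, that the `bigscience/bloom-560m` checkpoint referenced in the new test is reachable, and that `AutoTokenizer` may pick the fast class via the updated mapping; the pre-tokenized call at the end only illustrates the `add_prefix_space` guard enforced in `_encode_plus`/`_batch_encode_plus`.

```python
from paddlenlp.transformers import AutoTokenizer, BloomTokenizerFast

# Load the fast tokenizer directly, mirroring test_encodings_from_sample_data.
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")

# batch_encode returns a dict-like result; "input_ids" holds one id list per sentence.
ids = tokenizer.batch_encode(["The quick brown fox", "jumps over the lazy dog"])["input_ids"]
print(ids)
print(tokenizer.batch_decode(ids))  # round-trips back to the original strings

# With the updated mapping, AutoTokenizer may resolve to BloomTokenizerFast when the
# `tokenizers` package is available, falling back to the slow BloomTokenizer otherwise.
auto_tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")

# Pre-tokenized input requires add_prefix_space=True, as enforced in
# _batch_encode_plus / _encode_plus above.
pretok = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m", add_prefix_space=True)
word_ids = pretok(["The", "quick", "brown", "fox"], is_split_into_words=True)["input_ids"]
```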