From 565c6ead93e9df991e8c2faa8d55e4bde1ffa822 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Fri, 8 Nov 2024 00:24:25 +0800 Subject: [PATCH 01/12] add bloom tokenizer fast --- paddlenlp/transformers/auto/tokenizer.py | 8 +- .../transformers/bloom/tokenizer_fast.py | 132 ++++++++++++++++++ tests/transformers/auto/test_tokenizer.py | 1 + tests/transformers/bloom/test_tokenizer.py | 7 + 4 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 paddlenlp/transformers/bloom/tokenizer_fast.py diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index 6fd8b5fcf2b0..db2c36598856 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -54,7 +54,13 @@ ), ), ("blenderbot", "BlenderbotTokenizer"), - ("bloom", "BloomTokenizer"), + ( + "bloom", + ( + "BloomTokenizer", + "BloomTokenizerFast" if is_tokenizers_available() else None + ), + ), ("clip", "CLIPTokenizer"), ("codegen", "CodeGenTokenizer"), ("convbert", "ConvBertTokenizer"), diff --git a/paddlenlp/transformers/bloom/tokenizer_fast.py b/paddlenlp/transformers/bloom/tokenizer_fast.py new file mode 100644 index 000000000000..7c87a79a65e4 --- /dev/null +++ b/paddlenlp/transformers/bloom/tokenizer_fast.py @@ -0,0 +1,132 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from typing import List, Optional, Tuple + +from tokenizers import normalizers + +from ..tokenizer_utils_fast import PretrainedTokenizerFast +from .tokenizer import BloomTokenizer + +VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"} + + +class BloomTokenizerFast(PretrainedTokenizerFast): + r""" + Construct a "fast" Bloom tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level + Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since + the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. + + + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + unk_token (`str`, *optional*, defaults to `<|endoftext|>`): + The unknown token. 
A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The end of sequence token.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows the leading word to be treated like any
            other word (the Bloom tokenizer detects the beginning of a word by the preceding space).
        trim_offsets (`bool`, *optional*, defaults to `True`):
            Whether or not the post-processing step should trim offsets to avoid including whitespaces.
    """

    resource_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = None

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        add_prefix_space=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

        # The backend pre_tokenizer and decoder expose no setter for add_prefix_space,
        # so flip the flag inside their serialized state and reload them.
        pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)
        decoder_state = pickle.dumps(self.backend_tokenizer.decoder)

        if add_prefix_space:
            pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
            decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
        self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
        self.backend_tokenizer.decoder = pickle.loads(decoder_state)

        self.add_prefix_space = add_prefix_space

    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
        is_split_into_words = kwargs.get("is_split_into_words", False)
        if not (self.add_prefix_space or not is_split_into_words):
            raise Exception(
                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
                " pretokenized inputs."
            )

        return super()._batch_encode_plus(*args, **kwargs)

    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
        is_split_into_words = kwargs.get("is_split_into_words", False)

        if not (self.add_prefix_space or not is_split_into_words):
            raise Exception(
                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
                " pretokenized inputs."
+ ) + + return super()._encode_plus(*args, **kwargs) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) diff --git a/tests/transformers/auto/test_tokenizer.py b/tests/transformers/auto/test_tokenizer.py index 54c568113023..70cbdcc1036b 100644 --- a/tests/transformers/auto/test_tokenizer.py +++ b/tests/transformers/auto/test_tokenizer.py @@ -24,6 +24,7 @@ from paddlenlp.transformers.bert.configuration import BertConfig from paddlenlp.transformers.bert.tokenizer import BertTokenizer from paddlenlp.transformers.bert.tokenizer_fast import BertTokenizerFast + from paddlenlp.utils.env import TOKENIZER_CONFIG_NAME from ...utils.test_module.custom_configuration import CustomConfig diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py index 8131c2c1a284..5afb502deaa8 100644 --- a/tests/transformers/bloom/test_tokenizer.py +++ b/tests/transformers/bloom/test_tokenizer.py @@ -18,6 +18,8 @@ import unittest from paddlenlp.transformers import BloomTokenizer +# bloom +from paddlenlp.transformers.bloom.tokenizer_fast import BloomTokenizerFast from ..test_tokenizer_common import TokenizerTesterMixin @@ -30,6 +32,7 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = BloomTokenizer + rust_tokenizer_class = BloomTokenizerFast from_pretrained_kwargs = {"add_prefix_space": True} test_decode_token = True test_seq2seq = False @@ -76,6 +79,10 @@ def get_tokenizer(self, **kwargs): kwargs.update(self.special_tokens_map) return BloomTokenizer.from_pretrained(self.tmpdirname, **kwargs) + def get_tokenizer_fast(self, **kwargs): + kwargs.update(self.special_tokens_map) + return BloomTokenizer.from_pretrained(self.tmpdirname, **kwargs) + def get_input_output_texts(self, tokenizer): input_text = "lower newer" output_text = "lower newer" From bbe555e2bc6afc9c8b91cd5e9ff572f7c2908435 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Wed, 13 Nov 2024 02:30:54 +0800 Subject: [PATCH 02/12] fix fast --- paddlenlp/transformers/__init__.py | 1 + .../transformers/bloom/tokenizer_fast.py | 2 ++ .../transformers/convert_slow_tokenizer.py | 2 +- tests/transformers/bloom/test_tokenizer.py | 21 ++++++++++++------- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py index ab7510e0897e..159ba2725f35 100644 --- a/paddlenlp/transformers/__init__.py +++ b/paddlenlp/transformers/__init__.py @@ -258,6 +258,7 @@ from .bloom.configuration import * from .bloom.modeling import * from .bloom.tokenizer import * +from .bloom.tokenizer_fast import * from .clipseg.configuration import * from .clipseg.modeling import * from .clipseg.processing import * diff --git a/paddlenlp/transformers/bloom/tokenizer_fast.py b/paddlenlp/transformers/bloom/tokenizer_fast.py index 7c87a79a65e4..2e66474f7663 100644 --- a/paddlenlp/transformers/bloom/tokenizer_fast.py +++ b/paddlenlp/transformers/bloom/tokenizer_fast.py @@ -13,10 +13,12 @@ # limitations under the License. 
import json +import pickle from typing import List, Optional, Tuple from tokenizers import normalizers +from ..tokenizer_utils_base import BatchEncoding from ..tokenizer_utils_fast import PretrainedTokenizerFast from .tokenizer import BloomTokenizer diff --git a/paddlenlp/transformers/convert_slow_tokenizer.py b/paddlenlp/transformers/convert_slow_tokenizer.py index 3cbd4a07cd9c..78882431fddc 100644 --- a/paddlenlp/transformers/convert_slow_tokenizer.py +++ b/paddlenlp/transformers/convert_slow_tokenizer.py @@ -444,7 +444,7 @@ def pre_tokenizer(self, replacement, add_prefix_space): SLOW_TO_FAST_CONVERTERS = { "LlamaTokenizer": LlamaConverter, - "BertTokenizer": BertConverter, + "BertTokenizer": BertConverter } diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py index 5afb502deaa8..ad29d6212eb8 100644 --- a/tests/transformers/bloom/test_tokenizer.py +++ b/tests/transformers/bloom/test_tokenizer.py @@ -18,10 +18,11 @@ import unittest from paddlenlp.transformers import BloomTokenizer -# bloom -from paddlenlp.transformers.bloom.tokenizer_fast import BloomTokenizerFast +from paddlenlp.transformers import BloomTokenizerFast -from ..test_tokenizer_common import TokenizerTesterMixin +from tests.transformers.test_tokenizer_common import ( + TokenizerTesterMixin +) VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", @@ -79,10 +80,6 @@ def get_tokenizer(self, **kwargs): kwargs.update(self.special_tokens_map) return BloomTokenizer.from_pretrained(self.tmpdirname, **kwargs) - def get_tokenizer_fast(self, **kwargs): - kwargs.update(self.special_tokens_map) - return BloomTokenizer.from_pretrained(self.tmpdirname, **kwargs) - def get_input_output_texts(self, tokenizer): input_text = "lower newer" output_text = "lower newer" @@ -90,14 +87,22 @@ def get_input_output_texts(self, tokenizer): def test_full_tokenizer(self): tokenizer = BloomTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) + tokenizer2 = BloomTokenizerFast(self.vocab_file, self.merges_file, **self.special_tokens_map) text = "lower newer" bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] tokens = tokenizer.tokenize(text, add_prefix_space=True) self.assertListEqual(tokens, bpe_tokens) + tokens2 = tokenizer2.tokenize(text, add_prefix_space=True) + self.assertListEqual(tokens2, bpe_tokens) + input_tokens = tokens + [tokenizer.unk_token] + + input_tokens2 = tokens2 + [tokenizer.unk_token] input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens2), input_bpe_tokens) def test_pretokenized_inputs(self, *args, **kwargs): pass @@ -152,6 +157,7 @@ def test_padding_if_pad_token_set_slow(self): # short slice pair does have padding self.assertTrue(pad_token_id in out_p2["input_ids"][1]) self.assertTrue(0 in out_p2["attention_mask"][1]) + def test_add_bos_token_slow(self): bos_token = "$$$" @@ -174,6 +180,7 @@ def test_add_bos_token_slow(self): self.assertEqual(decode_s.split()[0], bos_token) self.assertTrue(all(d.split()[0] == bos_token for d in decode_s2)) + # tokenizer has no padding token def test_padding_different_model_input_name(self): pass From d6c371fd271418140ccd5a9a80a4206ec4cd7cb9 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Wed, 13 Nov 2024 12:33:39 +0800 Subject: [PATCH 03/12] Update test_tokenizer.py --- tests/transformers/bloom/test_tokenizer.py | 7 ------- 1 file changed, 7 deletions(-) diff 
--git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py index ad29d6212eb8..91853ded75e7 100644 --- a/tests/transformers/bloom/test_tokenizer.py +++ b/tests/transformers/bloom/test_tokenizer.py @@ -87,22 +87,15 @@ def get_input_output_texts(self, tokenizer): def test_full_tokenizer(self): tokenizer = BloomTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) - tokenizer2 = BloomTokenizerFast(self.vocab_file, self.merges_file, **self.special_tokens_map) text = "lower newer" bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] tokens = tokenizer.tokenize(text, add_prefix_space=True) self.assertListEqual(tokens, bpe_tokens) - tokens2 = tokenizer2.tokenize(text, add_prefix_space=True) - self.assertListEqual(tokens2, bpe_tokens) - input_tokens = tokens + [tokenizer.unk_token] - - input_tokens2 = tokens2 + [tokenizer.unk_token] input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) - self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens2), input_bpe_tokens) def test_pretokenized_inputs(self, *args, **kwargs): pass From 6d95920a67f420a23550fe2816395c984dc1c400 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Wed, 13 Nov 2024 16:04:10 +0800 Subject: [PATCH 04/12] fix lint --- paddlenlp/transformers/auto/tokenizer.py | 5 +---- paddlenlp/transformers/bloom/tokenizer_fast.py | 6 +----- paddlenlp/transformers/convert_slow_tokenizer.py | 5 +---- tests/transformers/auto/test_tokenizer.py | 1 - tests/transformers/bloom/test_tokenizer.py | 12 +++--------- 5 files changed, 6 insertions(+), 23 deletions(-) diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index db2c36598856..84e537a1902f 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -56,10 +56,7 @@ ("blenderbot", "BlenderbotTokenizer"), ( "bloom", - ( - "BloomTokenizer", - "BloomTokenizerFast" if is_tokenizers_available() else None - ), + ("BloomTokenizer", "BloomTokenizerFast" if is_tokenizers_available() else None), ), ("clip", "CLIPTokenizer"), ("codegen", "CodeGenTokenizer"), diff --git a/paddlenlp/transformers/bloom/tokenizer_fast.py b/paddlenlp/transformers/bloom/tokenizer_fast.py index 2e66474f7663..618a7b8e2fbd 100644 --- a/paddlenlp/transformers/bloom/tokenizer_fast.py +++ b/paddlenlp/transformers/bloom/tokenizer_fast.py @@ -12,15 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import json import pickle -from typing import List, Optional, Tuple - -from tokenizers import normalizers +from typing import Optional, Tuple from ..tokenizer_utils_base import BatchEncoding from ..tokenizer_utils_fast import PretrainedTokenizerFast -from .tokenizer import BloomTokenizer VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"} diff --git a/paddlenlp/transformers/convert_slow_tokenizer.py b/paddlenlp/transformers/convert_slow_tokenizer.py index 78882431fddc..da36ec04d270 100644 --- a/paddlenlp/transformers/convert_slow_tokenizer.py +++ b/paddlenlp/transformers/convert_slow_tokenizer.py @@ -442,10 +442,7 @@ def pre_tokenizer(self, replacement, add_prefix_space): return None -SLOW_TO_FAST_CONVERTERS = { - "LlamaTokenizer": LlamaConverter, - "BertTokenizer": BertConverter -} +SLOW_TO_FAST_CONVERTERS = {"LlamaTokenizer": LlamaConverter, "BertTokenizer": BertConverter} def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokenizer: diff --git a/tests/transformers/auto/test_tokenizer.py b/tests/transformers/auto/test_tokenizer.py index 70cbdcc1036b..54c568113023 100644 --- a/tests/transformers/auto/test_tokenizer.py +++ b/tests/transformers/auto/test_tokenizer.py @@ -24,7 +24,6 @@ from paddlenlp.transformers.bert.configuration import BertConfig from paddlenlp.transformers.bert.tokenizer import BertTokenizer from paddlenlp.transformers.bert.tokenizer_fast import BertTokenizerFast - from paddlenlp.utils.env import TOKENIZER_CONFIG_NAME from ...utils.test_module.custom_configuration import CustomConfig diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py index 91853ded75e7..91e8e176e89c 100644 --- a/tests/transformers/bloom/test_tokenizer.py +++ b/tests/transformers/bloom/test_tokenizer.py @@ -17,12 +17,8 @@ import os import unittest -from paddlenlp.transformers import BloomTokenizer -from paddlenlp.transformers import BloomTokenizerFast - -from tests.transformers.test_tokenizer_common import ( - TokenizerTesterMixin -) +from paddlenlp.transformers import BloomTokenizer, BloomTokenizerFast +from tests.transformers.test_tokenizer_common import TokenizerTesterMixin VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", @@ -94,7 +90,7 @@ def test_full_tokenizer(self): input_tokens = tokens + [tokenizer.unk_token] input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] - + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) def test_pretokenized_inputs(self, *args, **kwargs): @@ -150,7 +146,6 @@ def test_padding_if_pad_token_set_slow(self): # short slice pair does have padding self.assertTrue(pad_token_id in out_p2["input_ids"][1]) self.assertTrue(0 in out_p2["attention_mask"][1]) - def test_add_bos_token_slow(self): bos_token = "$$$" @@ -173,7 +168,6 @@ def test_add_bos_token_slow(self): self.assertEqual(decode_s.split()[0], bos_token) self.assertTrue(all(d.split()[0] == bos_token for d in decode_s2)) - # tokenizer has no padding token def test_padding_different_model_input_name(self): pass From bef43ae392c84be07c39a7e36bdc28a4221dd1d3 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Wed, 20 Nov 2024 15:55:37 +0800 Subject: [PATCH 05/12] reopen ci --- tests/transformers/bloom/test_tokenizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py index 91e8e176e89c..54cae2f7dc02 100644 --- a/tests/transformers/bloom/test_tokenizer.py +++ b/tests/transformers/bloom/test_tokenizer.py @@ -20,6 +20,7 
@@ from paddlenlp.transformers import BloomTokenizer, BloomTokenizerFast from tests.transformers.test_tokenizer_common import TokenizerTesterMixin + VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", "merges_file": "merges.txt", From 4ffdaa483c63875a1e91258cae9c56bd52587a37 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Wed, 20 Nov 2024 18:34:01 +0800 Subject: [PATCH 06/12] rerun ci --- tests/transformers/bloom/test_tokenizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py index 54cae2f7dc02..91e8e176e89c 100644 --- a/tests/transformers/bloom/test_tokenizer.py +++ b/tests/transformers/bloom/test_tokenizer.py @@ -20,7 +20,6 @@ from paddlenlp.transformers import BloomTokenizer, BloomTokenizerFast from tests.transformers.test_tokenizer_common import TokenizerTesterMixin - VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", "merges_file": "merges.txt", From b0eb5edc9ff5757f5a39270115eea947439e5acf Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Fri, 22 Nov 2024 19:29:07 +0800 Subject: [PATCH 07/12] fix ci --- tests/transformers/bloom/test_tokenizer.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py index 91e8e176e89c..c6a8e6abc91f 100644 --- a/tests/transformers/bloom/test_tokenizer.py +++ b/tests/transformers/bloom/test_tokenizer.py @@ -93,6 +93,22 @@ def test_full_tokenizer(self): self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + # test encode_plus + def test_encodings_from_sample_data(self): + """ + Assert that the created tokens are the same than the hard-coded ones + """ + tokenizer = self.get_rust_tokenizer() + + INPUT_SENTENCES = ["The quick brown fox", "jumps over the lazy dog"] + TARGET_TOKENS = [[2175, 23714, 73173, 144252, 2], [77, 132619, 3478, 368, 109586, 35433, 2]] + + computed_tokens = tokenizer.batch_encode_plus(INPUT_SENTENCES)["input_ids"] + self.assertListEqual(TARGET_TOKENS, computed_tokens) + + decoded_tokens = tokenizer.batch_decode(computed_tokens) + self.assertListEqual(decoded_tokens, INPUT_SENTENCES) + def test_pretokenized_inputs(self, *args, **kwargs): pass From 3dea2e82d6aa1b23471e3176c1006c86cae3bae3 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Fri, 22 Nov 2024 20:55:45 +0800 Subject: [PATCH 08/12] fix bloom test --- tests/transformers/bloom/test_tokenizer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py index c6a8e6abc91f..7a576395b8d2 100644 --- a/tests/transformers/bloom/test_tokenizer.py +++ b/tests/transformers/bloom/test_tokenizer.py @@ -76,6 +76,10 @@ def get_tokenizer(self, **kwargs): kwargs.update(self.special_tokens_map) return BloomTokenizer.from_pretrained(self.tmpdirname, **kwargs) + def get_rust_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return BloomTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + def get_input_output_texts(self, tokenizer): input_text = "lower newer" output_text = "lower newer" From f6d0fcf5c373a1bad6faa1384a1423c2d2b95d8c Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Sat, 23 Nov 2024 00:54:06 +0800 Subject: [PATCH 09/12] fix bloom coverage test --- paddlenlp/transformers/bloom/tokenizer_fast.py | 3 ++- tests/transformers/bloom/test_tokenizer.py | 8 ++------ 2 files changed, 4 insertions(+), 7 
deletions(-) diff --git a/paddlenlp/transformers/bloom/tokenizer_fast.py b/paddlenlp/transformers/bloom/tokenizer_fast.py index 618a7b8e2fbd..c4aa109d097d 100644 --- a/paddlenlp/transformers/bloom/tokenizer_fast.py +++ b/paddlenlp/transformers/bloom/tokenizer_fast.py @@ -17,6 +17,7 @@ from ..tokenizer_utils_base import BatchEncoding from ..tokenizer_utils_fast import PretrainedTokenizerFast +from .tokenizer import BloomTokenizer VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"} @@ -65,7 +66,7 @@ class BloomTokenizerFast(PretrainedTokenizerFast): resource_files_names = VOCAB_FILES_NAMES model_input_names = ["input_ids", "attention_mask"] - slow_tokenizer_class = None + slow_tokenizer_class = BloomTokenizer def __init__( self, diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py index 7a576395b8d2..29364fb96d8d 100644 --- a/tests/transformers/bloom/test_tokenizer.py +++ b/tests/transformers/bloom/test_tokenizer.py @@ -76,10 +76,6 @@ def get_tokenizer(self, **kwargs): kwargs.update(self.special_tokens_map) return BloomTokenizer.from_pretrained(self.tmpdirname, **kwargs) - def get_rust_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return BloomTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) - def get_input_output_texts(self, tokenizer): input_text = "lower newer" output_text = "lower newer" @@ -102,12 +98,12 @@ def test_encodings_from_sample_data(self): """ Assert that the created tokens are the same than the hard-coded ones """ - tokenizer = self.get_rust_tokenizer() + tokenizer = self.rust_tokenizer_class.from_pretrained("bigscience/bloom-560m") INPUT_SENTENCES = ["The quick brown fox", "jumps over the lazy dog"] TARGET_TOKENS = [[2175, 23714, 73173, 144252, 2], [77, 132619, 3478, 368, 109586, 35433, 2]] - computed_tokens = tokenizer.batch_encode_plus(INPUT_SENTENCES)["input_ids"] + computed_tokens = tokenizer.batch_encode(INPUT_SENTENCES)["input_ids"] self.assertListEqual(TARGET_TOKENS, computed_tokens) decoded_tokens = tokenizer.batch_decode(computed_tokens) From 7161b6031a95708090674d665bbe52544fdbe7fa Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Sat, 23 Nov 2024 01:46:23 +0800 Subject: [PATCH 10/12] fix bloom coverage test --- tests/transformers/bloom/test_tokenizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py index 29364fb96d8d..243e544da2ae 100644 --- a/tests/transformers/bloom/test_tokenizer.py +++ b/tests/transformers/bloom/test_tokenizer.py @@ -18,7 +18,8 @@ import unittest from paddlenlp.transformers import BloomTokenizer, BloomTokenizerFast -from tests.transformers.test_tokenizer_common import TokenizerTesterMixin + +from ..test_tokenizer_common import TokenizerTesterMixin VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", From 13dd590af8bbf273b6f648a905466cb422e51b30 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Tue, 26 Nov 2024 15:46:41 +0800 Subject: [PATCH 11/12] add copyright for bert tokenizer fast --- paddlenlp/transformers/bert/tokenizer_fast.py | 1 + 1 file changed, 1 insertion(+) diff --git a/paddlenlp/transformers/bert/tokenizer_fast.py b/paddlenlp/transformers/bert/tokenizer_fast.py index ba11e48f1b37..e8db2825dfb5 100644 --- a/paddlenlp/transformers/bert/tokenizer_fast.py +++ b/paddlenlp/transformers/bert/tokenizer_fast.py @@ -1,3 +1,4 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); From 8907b0cfb413cacc7d306e0a67c32b7fb7cf7c55 Mon Sep 17 00:00:00 2001 From: yinfan98 <1106310035@qq.com> Date: Tue, 26 Nov 2024 15:47:39 +0800 Subject: [PATCH 12/12] add copyright for bloom tokenizer fast --- paddlenlp/transformers/bloom/tokenizer_fast.py | 1 + 1 file changed, 1 insertion(+) diff --git a/paddlenlp/transformers/bloom/tokenizer_fast.py b/paddlenlp/transformers/bloom/tokenizer_fast.py index c4aa109d097d..1658bcea9b98 100644 --- a/paddlenlp/transformers/bloom/tokenizer_fast.py +++ b/paddlenlp/transformers/bloom/tokenizer_fast.py @@ -1,3 +1,4 @@ +# Copyright 2022 The HuggingFace Inc. team. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License");
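A minimal usage sketch of what this series enables, kept separate from the patches themselves. It is written against the API as it appears in these diffs; the `use_fast=True` flag on `AutoTokenizer.from_pretrained` and access to the `bigscience/bloom-560m` checkpoint used by the tests are assumptions about the surrounding PaddleNLP environment, not something the series itself guarantees.

# Sketch only: exercise the new BloomTokenizerFast end to end (assumptions noted inline).
from paddlenlp.transformers import AutoTokenizer, BloomTokenizerFast

# With the ("BloomTokenizer", "BloomTokenizerFast") entry added to auto/tokenizer.py,
# AutoTokenizer is expected to resolve to the fast class when the `tokenizers` backend
# is installed; the `use_fast=True` argument is an assumption about the auto API.
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m", use_fast=True)
print(type(tokenizer).__name__)  # expected: BloomTokenizerFast

# Byte-level BPE keeps the leading space as part of a token, so the same word encodes
# differently at the start of a sentence and after a space (see the class docstring).
print(tokenizer("Hello world")["input_ids"])
print(tokenizer(" Hello world")["input_ids"])

# Batch round trip, mirroring test_encodings_from_sample_data in the patched tests.
sentences = ["The quick brown fox", "jumps over the lazy dog"]
ids = tokenizer.batch_encode(sentences)["input_ids"]
assert tokenizer.batch_decode(ids) == sentences

# Pretokenized input is only accepted when the tokenizer was built with
# add_prefix_space=True, which is what the guard in _batch_encode_plus enforces.
fast = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m", add_prefix_space=True)
print(fast(["lower", "newer"], is_split_into_words=True)["input_ids"])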