From 565c6ead93e9df991e8c2faa8d55e4bde1ffa822 Mon Sep 17 00:00:00 2001
From: yinfan98 <1106310035@qq.com>
Date: Fri, 8 Nov 2024 00:24:25 +0800
Subject: [PATCH 01/12] add bloom tokenizer fast
---
paddlenlp/transformers/auto/tokenizer.py | 8 +-
.../transformers/bloom/tokenizer_fast.py | 132 ++++++++++++++++++
tests/transformers/auto/test_tokenizer.py | 1 +
tests/transformers/bloom/test_tokenizer.py | 7 +
4 files changed, 147 insertions(+), 1 deletion(-)
create mode 100644 paddlenlp/transformers/bloom/tokenizer_fast.py
diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py
index 6fd8b5fcf2b0..db2c36598856 100644
--- a/paddlenlp/transformers/auto/tokenizer.py
+++ b/paddlenlp/transformers/auto/tokenizer.py
@@ -54,7 +54,13 @@
),
),
("blenderbot", "BlenderbotTokenizer"),
- ("bloom", "BloomTokenizer"),
+ (
+ "bloom",
+ (
+ "BloomTokenizer",
+ "BloomTokenizerFast" if is_tokenizers_available() else None
+ ),
+ ),
("clip", "CLIPTokenizer"),
("codegen", "CodeGenTokenizer"),
("convbert", "ConvBertTokenizer"),
diff --git a/paddlenlp/transformers/bloom/tokenizer_fast.py b/paddlenlp/transformers/bloom/tokenizer_fast.py
new file mode 100644
index 000000000000..7c87a79a65e4
--- /dev/null
+++ b/paddlenlp/transformers/bloom/tokenizer_fast.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from typing import List, Optional, Tuple
+
+from tokenizers import normalizers
+
+from ..tokenizer_utils_fast import PretrainedTokenizerFast
+from .tokenizer import BloomTokenizer
+
+VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}
+
+
+class BloomTokenizerFast(PretrainedTokenizerFast):
+ r"""
+ Construct a "fast" Bloom tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
+ Byte-Pair-Encoding.
+
+ This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+ be encoded differently whether it is at the beginning of the sentence (without space) or not.
+
+ You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
+ the model was not pretrained this way, it might yield a decrease in performance.
+
+
+
+ When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
+
+
+
+ This tokenizer inherits from [`PretrainedTokenizerFast`], which contains most of the main methods. Users should
+ refer to this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ merges_file (`str`):
+ Path to the merges file.
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
+ The beginning of sequence token.
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
+ The end of sequence token.
+ add_prefix_space (`bool`, *optional*, defaults to `False`):
+ Whether or not to add an initial space to the input. This allows the leading word to be treated like any
+ other word (the Bloom tokenizer detects the beginning of a word by the preceding space).
+ trim_offsets (`bool`, *optional*, defaults to `True`):
+ Whether or not the post-processing step should trim offsets to avoid including whitespaces.
+ """
+
+ resource_files_names = VOCAB_FILES_NAMES
+ model_input_names = ["input_ids", "attention_mask"]
+ slow_tokenizer_class = None
+
+ def __init__(
+ self,
+ vocab_file=None,
+ merges_file=None,
+ tokenizer_file=None,
+ unk_token="",
+ bos_token="",
+ eos_token="",
+ pad_token="",
+ add_prefix_space=False,
+ clean_up_tokenization_spaces=False,
+ **kwargs,
+ ):
+ super().__init__(
+ vocab_file=vocab_file,
+ merges_file=merges_file,
+ tokenizer_file=tokenizer_file,
+ unk_token=unk_token,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ pad_token=pad_token,
+ add_prefix_space=add_prefix_space,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ **kwargs,
+ )
+
+ pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)
+ decoder_state = pickle.dumps(self.backend_tokenizer.decoder)
+
+ if add_prefix_space:
+ pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
+ decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
+ self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
+ self.backend_tokenizer.decoder = pickle.loads(decoder_state)
+
+ self.add_prefix_space = add_prefix_space
+
+ def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
+ is_split_into_words = kwargs.get("is_split_into_words", False)
+ if not (self.add_prefix_space or not is_split_into_words):
+ raise Exception(
+ f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
+ " pretokenized inputs."
+ )
+
+ return super()._batch_encode_plus(*args, **kwargs)
+
+ def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
+ is_split_into_words = kwargs.get("is_split_into_words", False)
+
+ if not (self.add_prefix_space or not is_split_into_words):
+ raise Exception(
+ f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
+ " pretokenized inputs."
+ )
+
+ return super()._encode_plus(*args, **kwargs)
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+ return tuple(files)
diff --git a/tests/transformers/auto/test_tokenizer.py b/tests/transformers/auto/test_tokenizer.py
index 54c568113023..70cbdcc1036b 100644
--- a/tests/transformers/auto/test_tokenizer.py
+++ b/tests/transformers/auto/test_tokenizer.py
@@ -24,6 +24,7 @@
from paddlenlp.transformers.bert.configuration import BertConfig
from paddlenlp.transformers.bert.tokenizer import BertTokenizer
from paddlenlp.transformers.bert.tokenizer_fast import BertTokenizerFast
+
from paddlenlp.utils.env import TOKENIZER_CONFIG_NAME
from ...utils.test_module.custom_configuration import CustomConfig
diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py
index 8131c2c1a284..5afb502deaa8 100644
--- a/tests/transformers/bloom/test_tokenizer.py
+++ b/tests/transformers/bloom/test_tokenizer.py
@@ -18,6 +18,8 @@
import unittest
from paddlenlp.transformers import BloomTokenizer
+# bloom
+from paddlenlp.transformers.bloom.tokenizer_fast import BloomTokenizerFast
from ..test_tokenizer_common import TokenizerTesterMixin
@@ -30,6 +32,7 @@
class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BloomTokenizer
+ rust_tokenizer_class = BloomTokenizerFast
from_pretrained_kwargs = {"add_prefix_space": True}
test_decode_token = True
test_seq2seq = False
@@ -76,6 +79,10 @@ def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return BloomTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+ def get_tokenizer_fast(self, **kwargs):
+ kwargs.update(self.special_tokens_map)
+ return BloomTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
output_text = "lower newer"
From bbe555e2bc6afc9c8b91cd5e9ff572f7c2908435 Mon Sep 17 00:00:00 2001
From: yinfan98 <1106310035@qq.com>
Date: Wed, 13 Nov 2024 02:30:54 +0800
Subject: [PATCH 02/12] fix fast
---
paddlenlp/transformers/__init__.py | 1 +
.../transformers/bloom/tokenizer_fast.py | 2 ++
.../transformers/convert_slow_tokenizer.py | 2 +-
tests/transformers/bloom/test_tokenizer.py | 21 ++++++++++++-------
4 files changed, 18 insertions(+), 8 deletions(-)
diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py
index ab7510e0897e..159ba2725f35 100644
--- a/paddlenlp/transformers/__init__.py
+++ b/paddlenlp/transformers/__init__.py
@@ -258,6 +258,7 @@
from .bloom.configuration import *
from .bloom.modeling import *
from .bloom.tokenizer import *
+from .bloom.tokenizer_fast import *
from .clipseg.configuration import *
from .clipseg.modeling import *
from .clipseg.processing import *
diff --git a/paddlenlp/transformers/bloom/tokenizer_fast.py b/paddlenlp/transformers/bloom/tokenizer_fast.py
index 7c87a79a65e4..2e66474f7663 100644
--- a/paddlenlp/transformers/bloom/tokenizer_fast.py
+++ b/paddlenlp/transformers/bloom/tokenizer_fast.py
@@ -13,10 +13,12 @@
# limitations under the License.
import json
+import pickle
from typing import List, Optional, Tuple
from tokenizers import normalizers
+from ..tokenizer_utils_base import BatchEncoding
from ..tokenizer_utils_fast import PretrainedTokenizerFast
from .tokenizer import BloomTokenizer
diff --git a/paddlenlp/transformers/convert_slow_tokenizer.py b/paddlenlp/transformers/convert_slow_tokenizer.py
index 3cbd4a07cd9c..78882431fddc 100644
--- a/paddlenlp/transformers/convert_slow_tokenizer.py
+++ b/paddlenlp/transformers/convert_slow_tokenizer.py
@@ -444,7 +444,7 @@ def pre_tokenizer(self, replacement, add_prefix_space):
SLOW_TO_FAST_CONVERTERS = {
"LlamaTokenizer": LlamaConverter,
- "BertTokenizer": BertConverter,
+ "BertTokenizer": BertConverter
}
diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py
index 5afb502deaa8..ad29d6212eb8 100644
--- a/tests/transformers/bloom/test_tokenizer.py
+++ b/tests/transformers/bloom/test_tokenizer.py
@@ -18,10 +18,11 @@
import unittest
from paddlenlp.transformers import BloomTokenizer
-# bloom
-from paddlenlp.transformers.bloom.tokenizer_fast import BloomTokenizerFast
+from paddlenlp.transformers import BloomTokenizerFast
-from ..test_tokenizer_common import TokenizerTesterMixin
+from tests.transformers.test_tokenizer_common import (
+ TokenizerTesterMixin
+)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
@@ -79,10 +80,6 @@ def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return BloomTokenizer.from_pretrained(self.tmpdirname, **kwargs)
- def get_tokenizer_fast(self, **kwargs):
- kwargs.update(self.special_tokens_map)
- return BloomTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
output_text = "lower newer"
@@ -90,14 +87,22 @@ def get_input_output_texts(self, tokenizer):
def test_full_tokenizer(self):
tokenizer = BloomTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
+ tokenizer2 = BloomTokenizerFast(self.vocab_file, self.merges_file, **self.special_tokens_map)
text = "lower newer"
bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
tokens = tokenizer.tokenize(text, add_prefix_space=True)
self.assertListEqual(tokens, bpe_tokens)
+ tokens2 = tokenizer2.tokenize(text, add_prefix_space=True)
+ self.assertListEqual(tokens2, bpe_tokens)
+
input_tokens = tokens + [tokenizer.unk_token]
+
+ input_tokens2 = tokens2 + [tokenizer.unk_token]
input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
+
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+ self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens2), input_bpe_tokens)
def test_pretokenized_inputs(self, *args, **kwargs):
pass
@@ -152,6 +157,7 @@ def test_padding_if_pad_token_set_slow(self):
# short slice pair does have padding
self.assertTrue(pad_token_id in out_p2["input_ids"][1])
self.assertTrue(0 in out_p2["attention_mask"][1])
+
def test_add_bos_token_slow(self):
bos_token = "$$$"
@@ -174,6 +180,7 @@ def test_add_bos_token_slow(self):
self.assertEqual(decode_s.split()[0], bos_token)
self.assertTrue(all(d.split()[0] == bos_token for d in decode_s2))
+
# tokenizer has no padding token
def test_padding_different_model_input_name(self):
pass
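With the export added to paddlenlp/transformers/__init__.py here and the (slow, fast) pair registered in the auto mapping in the previous patch, the new class is reachable both directly and through `AutoTokenizer`. A sketch of the intended resolution (the `use_fast=True` flag and the checkpoint name are assumptions, not something this patch asserts):

    from paddlenlp.transformers import AutoTokenizer, BloomTokenizerFast

    # Direct import works once `from .bloom.tokenizer_fast import *` is in place.
    fast_tok = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")

    # When the `tokenizers` backend is installed, the auto mapping can resolve
    # "bloom" to the fast class as well.
    auto_tok = AutoTokenizer.from_pretrained("bigscience/bloom-560m", use_fast=True)
    print(type(auto_tok).__name__)  # expected: BloomTokenizerFast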
From d6c371fd271418140ccd5a9a80a4206ec4cd7cb9 Mon Sep 17 00:00:00 2001
From: yinfan98 <1106310035@qq.com>
Date: Wed, 13 Nov 2024 12:33:39 +0800
Subject: [PATCH 03/12] Update test_tokenizer.py
---
tests/transformers/bloom/test_tokenizer.py | 7 -------
1 file changed, 7 deletions(-)
diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py
index ad29d6212eb8..91853ded75e7 100644
--- a/tests/transformers/bloom/test_tokenizer.py
+++ b/tests/transformers/bloom/test_tokenizer.py
@@ -87,22 +87,15 @@ def get_input_output_texts(self, tokenizer):
def test_full_tokenizer(self):
tokenizer = BloomTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
- tokenizer2 = BloomTokenizerFast(self.vocab_file, self.merges_file, **self.special_tokens_map)
text = "lower newer"
bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
tokens = tokenizer.tokenize(text, add_prefix_space=True)
self.assertListEqual(tokens, bpe_tokens)
- tokens2 = tokenizer2.tokenize(text, add_prefix_space=True)
- self.assertListEqual(tokens2, bpe_tokens)
-
input_tokens = tokens + [tokenizer.unk_token]
-
- input_tokens2 = tokens2 + [tokenizer.unk_token]
input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
- self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens2), input_bpe_tokens)
def test_pretokenized_inputs(self, *args, **kwargs):
pass
From 6d95920a67f420a23550fe2816395c984dc1c400 Mon Sep 17 00:00:00 2001
From: yinfan98 <1106310035@qq.com>
Date: Wed, 13 Nov 2024 16:04:10 +0800
Subject: [PATCH 04/12] fix lint
---
paddlenlp/transformers/auto/tokenizer.py | 5 +----
paddlenlp/transformers/bloom/tokenizer_fast.py | 6 +-----
paddlenlp/transformers/convert_slow_tokenizer.py | 5 +----
tests/transformers/auto/test_tokenizer.py | 1 -
tests/transformers/bloom/test_tokenizer.py | 12 +++---------
5 files changed, 6 insertions(+), 23 deletions(-)
diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py
index db2c36598856..84e537a1902f 100644
--- a/paddlenlp/transformers/auto/tokenizer.py
+++ b/paddlenlp/transformers/auto/tokenizer.py
@@ -56,10 +56,7 @@
("blenderbot", "BlenderbotTokenizer"),
(
"bloom",
- (
- "BloomTokenizer",
- "BloomTokenizerFast" if is_tokenizers_available() else None
- ),
+ ("BloomTokenizer", "BloomTokenizerFast" if is_tokenizers_available() else None),
),
("clip", "CLIPTokenizer"),
("codegen", "CodeGenTokenizer"),
diff --git a/paddlenlp/transformers/bloom/tokenizer_fast.py b/paddlenlp/transformers/bloom/tokenizer_fast.py
index 2e66474f7663..618a7b8e2fbd 100644
--- a/paddlenlp/transformers/bloom/tokenizer_fast.py
+++ b/paddlenlp/transformers/bloom/tokenizer_fast.py
@@ -12,15 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import json
import pickle
-from typing import List, Optional, Tuple
-
-from tokenizers import normalizers
+from typing import Optional, Tuple
from ..tokenizer_utils_base import BatchEncoding
from ..tokenizer_utils_fast import PretrainedTokenizerFast
-from .tokenizer import BloomTokenizer
VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}
diff --git a/paddlenlp/transformers/convert_slow_tokenizer.py b/paddlenlp/transformers/convert_slow_tokenizer.py
index 78882431fddc..da36ec04d270 100644
--- a/paddlenlp/transformers/convert_slow_tokenizer.py
+++ b/paddlenlp/transformers/convert_slow_tokenizer.py
@@ -442,10 +442,7 @@ def pre_tokenizer(self, replacement, add_prefix_space):
return None
-SLOW_TO_FAST_CONVERTERS = {
- "LlamaTokenizer": LlamaConverter,
- "BertTokenizer": BertConverter
-}
+SLOW_TO_FAST_CONVERTERS = {"LlamaTokenizer": LlamaConverter, "BertTokenizer": BertConverter}
def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokenizer:
diff --git a/tests/transformers/auto/test_tokenizer.py b/tests/transformers/auto/test_tokenizer.py
index 70cbdcc1036b..54c568113023 100644
--- a/tests/transformers/auto/test_tokenizer.py
+++ b/tests/transformers/auto/test_tokenizer.py
@@ -24,7 +24,6 @@
from paddlenlp.transformers.bert.configuration import BertConfig
from paddlenlp.transformers.bert.tokenizer import BertTokenizer
from paddlenlp.transformers.bert.tokenizer_fast import BertTokenizerFast
-
from paddlenlp.utils.env import TOKENIZER_CONFIG_NAME
from ...utils.test_module.custom_configuration import CustomConfig
diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py
index 91853ded75e7..91e8e176e89c 100644
--- a/tests/transformers/bloom/test_tokenizer.py
+++ b/tests/transformers/bloom/test_tokenizer.py
@@ -17,12 +17,8 @@
import os
import unittest
-from paddlenlp.transformers import BloomTokenizer
-from paddlenlp.transformers import BloomTokenizerFast
-
-from tests.transformers.test_tokenizer_common import (
- TokenizerTesterMixin
-)
+from paddlenlp.transformers import BloomTokenizer, BloomTokenizerFast
+from tests.transformers.test_tokenizer_common import TokenizerTesterMixin
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
@@ -94,7 +90,7 @@ def test_full_tokenizer(self):
input_tokens = tokens + [tokenizer.unk_token]
input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
-
+
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
def test_pretokenized_inputs(self, *args, **kwargs):
@@ -150,7 +146,6 @@ def test_padding_if_pad_token_set_slow(self):
# short slice pair does have padding
self.assertTrue(pad_token_id in out_p2["input_ids"][1])
self.assertTrue(0 in out_p2["attention_mask"][1])
-
def test_add_bos_token_slow(self):
bos_token = "$$$"
@@ -173,7 +168,6 @@ def test_add_bos_token_slow(self):
self.assertEqual(decode_s.split()[0], bos_token)
self.assertTrue(all(d.split()[0] == bos_token for d in decode_s2))
-
# tokenizer has no padding token
def test_padding_different_model_input_name(self):
pass
From bef43ae392c84be07c39a7e36bdc28a4221dd1d3 Mon Sep 17 00:00:00 2001
From: yinfan98 <1106310035@qq.com>
Date: Wed, 20 Nov 2024 15:55:37 +0800
Subject: [PATCH 05/12] reopen ci
---
tests/transformers/bloom/test_tokenizer.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py
index 91e8e176e89c..54cae2f7dc02 100644
--- a/tests/transformers/bloom/test_tokenizer.py
+++ b/tests/transformers/bloom/test_tokenizer.py
@@ -20,6 +20,7 @@
from paddlenlp.transformers import BloomTokenizer, BloomTokenizerFast
from tests.transformers.test_tokenizer_common import TokenizerTesterMixin
+
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
From 4ffdaa483c63875a1e91258cae9c56bd52587a37 Mon Sep 17 00:00:00 2001
From: yinfan98 <1106310035@qq.com>
Date: Wed, 20 Nov 2024 18:34:01 +0800
Subject: [PATCH 06/12] rerun ci
---
tests/transformers/bloom/test_tokenizer.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py
index 54cae2f7dc02..91e8e176e89c 100644
--- a/tests/transformers/bloom/test_tokenizer.py
+++ b/tests/transformers/bloom/test_tokenizer.py
@@ -20,7 +20,6 @@
from paddlenlp.transformers import BloomTokenizer, BloomTokenizerFast
from tests.transformers.test_tokenizer_common import TokenizerTesterMixin
-
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
From b0eb5edc9ff5757f5a39270115eea947439e5acf Mon Sep 17 00:00:00 2001
From: yinfan98 <1106310035@qq.com>
Date: Fri, 22 Nov 2024 19:29:07 +0800
Subject: [PATCH 07/12] fix ci
---
tests/transformers/bloom/test_tokenizer.py | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py
index 91e8e176e89c..c6a8e6abc91f 100644
--- a/tests/transformers/bloom/test_tokenizer.py
+++ b/tests/transformers/bloom/test_tokenizer.py
@@ -93,6 +93,22 @@ def test_full_tokenizer(self):
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+ # test encode_plus
+ def test_encodings_from_sample_data(self):
+ """
+ Assert that the created tokens are the same as the hard-coded ones
+ """
+ tokenizer = self.get_rust_tokenizer()
+
+ INPUT_SENTENCES = ["The quick brown fox</s>", "jumps over the lazy dog</s>"]
+ TARGET_TOKENS = [[2175, 23714, 73173, 144252, 2], [77, 132619, 3478, 368, 109586, 35433, 2]]
+
+ computed_tokens = tokenizer.batch_encode_plus(INPUT_SENTENCES)["input_ids"]
+ self.assertListEqual(TARGET_TOKENS, computed_tokens)
+
+ decoded_tokens = tokenizer.batch_decode(computed_tokens)
+ self.assertListEqual(decoded_tokens, INPUT_SENTENCES)
+
def test_pretokenized_inputs(self, *args, **kwargs):
pass
From 3dea2e82d6aa1b23471e3176c1006c86cae3bae3 Mon Sep 17 00:00:00 2001
From: yinfan98 <1106310035@qq.com>
Date: Fri, 22 Nov 2024 20:55:45 +0800
Subject: [PATCH 08/12] fix bloom test
---
tests/transformers/bloom/test_tokenizer.py | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py
index c6a8e6abc91f..7a576395b8d2 100644
--- a/tests/transformers/bloom/test_tokenizer.py
+++ b/tests/transformers/bloom/test_tokenizer.py
@@ -76,6 +76,10 @@ def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return BloomTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+ def get_rust_tokenizer(self, **kwargs):
+ kwargs.update(self.special_tokens_map)
+ return BloomTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
output_text = "lower newer"
From f6d0fcf5c373a1bad6faa1384a1423c2d2b95d8c Mon Sep 17 00:00:00 2001
From: yinfan98 <1106310035@qq.com>
Date: Sat, 23 Nov 2024 00:54:06 +0800
Subject: [PATCH 09/12] fix bloom coverage test
---
paddlenlp/transformers/bloom/tokenizer_fast.py | 3 ++-
tests/transformers/bloom/test_tokenizer.py | 8 ++------
2 files changed, 4 insertions(+), 7 deletions(-)
diff --git a/paddlenlp/transformers/bloom/tokenizer_fast.py b/paddlenlp/transformers/bloom/tokenizer_fast.py
index 618a7b8e2fbd..c4aa109d097d 100644
--- a/paddlenlp/transformers/bloom/tokenizer_fast.py
+++ b/paddlenlp/transformers/bloom/tokenizer_fast.py
@@ -17,6 +17,7 @@
from ..tokenizer_utils_base import BatchEncoding
from ..tokenizer_utils_fast import PretrainedTokenizerFast
+from .tokenizer import BloomTokenizer
VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}
@@ -65,7 +66,7 @@ class BloomTokenizerFast(PretrainedTokenizerFast):
resource_files_names = VOCAB_FILES_NAMES
model_input_names = ["input_ids", "attention_mask"]
- slow_tokenizer_class = None
+ slow_tokenizer_class = BloomTokenizer
def __init__(
self,
diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py
index 7a576395b8d2..29364fb96d8d 100644
--- a/tests/transformers/bloom/test_tokenizer.py
+++ b/tests/transformers/bloom/test_tokenizer.py
@@ -76,10 +76,6 @@ def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return BloomTokenizer.from_pretrained(self.tmpdirname, **kwargs)
- def get_rust_tokenizer(self, **kwargs):
- kwargs.update(self.special_tokens_map)
- return BloomTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
-
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
output_text = "lower newer"
@@ -102,12 +98,12 @@ def test_encodings_from_sample_data(self):
"""
Assert that the created tokens are the same as the hard-coded ones
"""
- tokenizer = self.get_rust_tokenizer()
+ tokenizer = self.rust_tokenizer_class.from_pretrained("bigscience/bloom-560m")
INPUT_SENTENCES = ["The quick brown fox</s>", "jumps over the lazy dog</s>"]
TARGET_TOKENS = [[2175, 23714, 73173, 144252, 2], [77, 132619, 3478, 368, 109586, 35433, 2]]
- computed_tokens = tokenizer.batch_encode_plus(INPUT_SENTENCES)["input_ids"]
+ computed_tokens = tokenizer.batch_encode(INPUT_SENTENCES)["input_ids"]
self.assertListEqual(TARGET_TOKENS, computed_tokens)
decoded_tokens = tokenizer.batch_decode(computed_tokens)
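The switch from `batch_encode_plus` to `batch_encode` matches the public method name on PaddleNLP tokenizers, and the tokenizer is now pulled by checkpoint name instead of from the test fixture. The round trip the updated test exercises looks roughly like this (a sketch under the same `bigscience/bloom-560m` assumption the test makes):

    from paddlenlp.transformers import BloomTokenizerFast

    tok = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
    sentences = ["The quick brown fox</s>", "jumps over the lazy dog</s>"]

    ids = tok.batch_encode(sentences)["input_ids"]  # list of token-id lists
    assert tok.batch_decode(ids) == sentences       # decoding restores the text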
From 7161b6031a95708090674d665bbe52544fdbe7fa Mon Sep 17 00:00:00 2001
From: yinfan98 <1106310035@qq.com>
Date: Sat, 23 Nov 2024 01:46:23 +0800
Subject: [PATCH 10/12] fix bloom coverage test
---
tests/transformers/bloom/test_tokenizer.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py
index 29364fb96d8d..243e544da2ae 100644
--- a/tests/transformers/bloom/test_tokenizer.py
+++ b/tests/transformers/bloom/test_tokenizer.py
@@ -18,7 +18,8 @@
import unittest
from paddlenlp.transformers import BloomTokenizer, BloomTokenizerFast
-from tests.transformers.test_tokenizer_common import TokenizerTesterMixin
+
+from ..test_tokenizer_common import TokenizerTesterMixin
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
From 13dd590af8bbf273b6f648a905466cb422e51b30 Mon Sep 17 00:00:00 2001
From: yinfan98 <1106310035@qq.com>
Date: Tue, 26 Nov 2024 15:46:41 +0800
Subject: [PATCH 11/12] add copyright for bert tokenizer fast
---
paddlenlp/transformers/bert/tokenizer_fast.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/paddlenlp/transformers/bert/tokenizer_fast.py b/paddlenlp/transformers/bert/tokenizer_fast.py
index ba11e48f1b37..e8db2825dfb5 100644
--- a/paddlenlp/transformers/bert/tokenizer_fast.py
+++ b/paddlenlp/transformers/bert/tokenizer_fast.py
@@ -1,3 +1,4 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
From 8907b0cfb413cacc7d306e0a67c32b7fb7cf7c55 Mon Sep 17 00:00:00 2001
From: yinfan98 <1106310035@qq.com>
Date: Tue, 26 Nov 2024 15:47:39 +0800
Subject: [PATCH 12/12] add copyright for bloom tokenizer fast
---
paddlenlp/transformers/bloom/tokenizer_fast.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/paddlenlp/transformers/bloom/tokenizer_fast.py b/paddlenlp/transformers/bloom/tokenizer_fast.py
index c4aa109d097d..1658bcea9b98 100644
--- a/paddlenlp/transformers/bloom/tokenizer_fast.py
+++ b/paddlenlp/transformers/bloom/tokenizer_fast.py
@@ -1,3 +1,4 @@
+# Copyright 2022 The HuggingFace Inc. team.
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");