From 6d95920a67f420a23550fe2816395c984dc1c400 Mon Sep 17 00:00:00 2001
From: yinfan98 <1106310035@qq.com>
Date: Wed, 13 Nov 2024 16:04:10 +0800
Subject: [PATCH] fix lint

---
 paddlenlp/transformers/auto/tokenizer.py         |  5 +----
 paddlenlp/transformers/bloom/tokenizer_fast.py   |  6 +-----
 paddlenlp/transformers/convert_slow_tokenizer.py |  5 +----
 tests/transformers/auto/test_tokenizer.py        |  1 -
 tests/transformers/bloom/test_tokenizer.py       | 12 +++---------
 5 files changed, 6 insertions(+), 23 deletions(-)

diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py
index db2c36598856..84e537a1902f 100644
--- a/paddlenlp/transformers/auto/tokenizer.py
+++ b/paddlenlp/transformers/auto/tokenizer.py
@@ -56,10 +56,7 @@
         ("blenderbot", "BlenderbotTokenizer"),
         (
             "bloom",
-            (
-                "BloomTokenizer",
-                "BloomTokenizerFast" if is_tokenizers_available() else None
-            ),
+            ("BloomTokenizer", "BloomTokenizerFast" if is_tokenizers_available() else None),
         ),
         ("clip", "CLIPTokenizer"),
         ("codegen", "CodeGenTokenizer"),
diff --git a/paddlenlp/transformers/bloom/tokenizer_fast.py b/paddlenlp/transformers/bloom/tokenizer_fast.py
index 2e66474f7663..618a7b8e2fbd 100644
--- a/paddlenlp/transformers/bloom/tokenizer_fast.py
+++ b/paddlenlp/transformers/bloom/tokenizer_fast.py
@@ -12,15 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import json
 import pickle
-from typing import List, Optional, Tuple
-
-from tokenizers import normalizers
+from typing import Optional, Tuple
 
 from ..tokenizer_utils_base import BatchEncoding
 from ..tokenizer_utils_fast import PretrainedTokenizerFast
-from .tokenizer import BloomTokenizer
 
 VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}
 
diff --git a/paddlenlp/transformers/convert_slow_tokenizer.py b/paddlenlp/transformers/convert_slow_tokenizer.py
index 78882431fddc..da36ec04d270 100644
--- a/paddlenlp/transformers/convert_slow_tokenizer.py
+++ b/paddlenlp/transformers/convert_slow_tokenizer.py
@@ -442,10 +442,7 @@ def pre_tokenizer(self, replacement, add_prefix_space):
         return None
 
 
-SLOW_TO_FAST_CONVERTERS = {
-    "LlamaTokenizer": LlamaConverter,
-    "BertTokenizer": BertConverter
-}
+SLOW_TO_FAST_CONVERTERS = {"LlamaTokenizer": LlamaConverter, "BertTokenizer": BertConverter}
 
 
 def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokenizer:
diff --git a/tests/transformers/auto/test_tokenizer.py b/tests/transformers/auto/test_tokenizer.py
index 70cbdcc1036b..54c568113023 100644
--- a/tests/transformers/auto/test_tokenizer.py
+++ b/tests/transformers/auto/test_tokenizer.py
@@ -24,7 +24,6 @@
 from paddlenlp.transformers.bert.configuration import BertConfig
 from paddlenlp.transformers.bert.tokenizer import BertTokenizer
 from paddlenlp.transformers.bert.tokenizer_fast import BertTokenizerFast
-
 from paddlenlp.utils.env import TOKENIZER_CONFIG_NAME
 
 from ...utils.test_module.custom_configuration import CustomConfig
diff --git a/tests/transformers/bloom/test_tokenizer.py b/tests/transformers/bloom/test_tokenizer.py
index 91853ded75e7..91e8e176e89c 100644
--- a/tests/transformers/bloom/test_tokenizer.py
+++ b/tests/transformers/bloom/test_tokenizer.py
@@ -17,12 +17,8 @@
 import os
 import unittest
 
-from paddlenlp.transformers import BloomTokenizer
-from paddlenlp.transformers import BloomTokenizerFast
-
-from tests.transformers.test_tokenizer_common import (
-    TokenizerTesterMixin
-)
+from paddlenlp.transformers import BloomTokenizer, BloomTokenizerFast
+from tests.transformers.test_tokenizer_common import TokenizerTesterMixin
 
 VOCAB_FILES_NAMES = {
     "vocab_file": "vocab.json",
@@ -94,7 +90,7 @@ def test_full_tokenizer(self):
 
         input_tokens = tokens + [tokenizer.unk_token]
         input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
-        
+
         self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
     def test_pretokenized_inputs(self, *args, **kwargs):
@@ -150,7 +146,6 @@ def test_padding_if_pad_token_set_slow(self):
         # short slice pair does have padding
         self.assertTrue(pad_token_id in out_p2["input_ids"][1])
         self.assertTrue(0 in out_p2["attention_mask"][1])
-
 
     def test_add_bos_token_slow(self):
         bos_token = "$$$"
@@ -173,7 +168,6 @@ def test_add_bos_token_slow(self):
         self.assertEqual(decode_s.split()[0], bos_token)
         self.assertTrue(all(d.split()[0] == bos_token for d in decode_s2))
 
-
     # tokenizer has no padding token
     def test_padding_different_model_input_name(self):
         pass