【Hackathon 7th No.43】Improve TokenizerFast feature support, part 1 #9407

Merged (15 commits) on Nov 27, 2024
1 change: 1 addition & 0 deletions paddlenlp/transformers/__init__.py
@@ -260,6 +260,7 @@
from .bloom.configuration import *
from .bloom.modeling import *
from .bloom.tokenizer import *
from .bloom.tokenizer_fast import *
from .clipseg.configuration import *
from .clipseg.modeling import *
from .clipseg.processing import *
5 changes: 4 additions & 1 deletion paddlenlp/transformers/auto/tokenizer.py
@@ -54,7 +54,10 @@
),
),
("blenderbot", "BlenderbotTokenizer"),
("bloom", "BloomTokenizer"),
(
"bloom",
("BloomTokenizer", "BloomTokenizerFast" if is_tokenizers_available() else None),
Collaborator: Suggest adding a line break here to keep the formatting consistent.
),
("clip", "CLIPTokenizer"),
("codegen", "CodeGenTokenizer"),
("convbert", "ConvBertTokenizer"),
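The entry above registers both the slow `BloomTokenizer` and, when the optional `tokenizers` package is installed, `BloomTokenizerFast`, so the auto class can pick the fast implementation. A minimal usage sketch (the `use_fast` flag is assumed to behave as in the HF-style `AutoTokenizer.from_pretrained` API; it is not shown in this diff):

from paddlenlp.transformers import AutoTokenizer

# Expected to resolve to BloomTokenizerFast when the `tokenizers` backend is
# available; without it the fast entry is None and the slow BloomTokenizer is used.
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m", use_fast=True)  # use_fast: assumed flag
print(type(tokenizer).__name__)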
1 change: 1 addition & 0 deletions paddlenlp/transformers/bert/tokenizer_fast.py
@@ -1,3 +1,4 @@
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
132 changes: 132 additions & 0 deletions paddlenlp/transformers/bloom/tokenizer_fast.py
@@ -0,0 +1,132 @@
# Copyright 2022 The HuggingFace Inc. team.
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
Collaborator: Please also add the HuggingFace copyright notice here.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pickle
from typing import Optional, Tuple

from ..tokenizer_utils_base import BatchEncoding
from ..tokenizer_utils_fast import PretrainedTokenizerFast
from .tokenizer import BloomTokenizer

VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}


class BloomTokenizerFast(PretrainedTokenizerFast):
r"""
Construct a "fast" Bloom tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
Byte-Pair-Encoding.

This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece), so a word will
be encoded differently depending on whether it is at the beginning of the sentence (without a space) or not:
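For illustration, a minimal sketch (token ids are deliberately omitted, since they depend on the released
vocabulary; `bigscience/bloom-560m` is the checkpoint exercised in the accompanying tests):

    tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
    tokenizer("Hello world")["input_ids"]   # "Hello" encoded without a leading space
    tokenizer(" Hello world")["input_ids"]  # " Hello" encoded with a leading space, so the first id differs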

You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
the model was not pretrained this way, it might yield a decrease in performance.

<Tip>

When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.

</Tip>

This tokenizer inherits from [`PretrainedTokenizerFast`], which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.

Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
unk_token (`str`, *optional*, defaults to `<unk>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `<s>`):
The beginning of sequence token.
eos_token (`str`, *optional*, defaults to `</s>`):
The end of sequence token.
pad_token (`str`, *optional*, defaults to `<pad>`):
The token used for padding.
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows the leading word to be treated just like
any other word (the Bloom tokenizer detects the beginning of a word by the preceding space).
trim_offsets (`bool`, *optional*, defaults to `True`):
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
"""

resource_files_names = VOCAB_FILES_NAMES
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = BloomTokenizer

def __init__(
self,
vocab_file=None,
merges_file=None,
tokenizer_file=None,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
pad_token="<pad>",
add_prefix_space=False,
clean_up_tokenization_spaces=False,
**kwargs,
):
super().__init__(
vocab_file=vocab_file,
merges_file=merges_file,
tokenizer_file=tokenizer_file,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
add_prefix_space=add_prefix_space,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

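# The pickled state of the backend pre-tokenizer and decoder embeds `add_prefix_space`
# as JSON text, so the flag is flipped directly in those bytes and both components are
# rebuilt from the patched state below.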
pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)
decoder_state = pickle.dumps(self.backend_tokenizer.decoder)

if add_prefix_space:
pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
self.backend_tokenizer.decoder = pickle.loads(decoder_state)

self.add_prefix_space = add_prefix_space

def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
is_split_into_words = kwargs.get("is_split_into_words", False)
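# Pretokenized input (is_split_into_words=True) is only valid when the tokenizer was
# built with add_prefix_space=True, since byte-level BPE relies on the leading space
# to mark word boundaries.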
if not (self.add_prefix_space or not is_split_into_words):
raise Exception(
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
" pretokenized inputs."
)

return super()._batch_encode_plus(*args, **kwargs)

def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
is_split_into_words = kwargs.get("is_split_into_words", False)

if not (self.add_prefix_space or not is_split_into_words):
raise Exception(
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
" pretokenized inputs."
)

return super()._encode_plus(*args, **kwargs)

def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
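# Delegate to the backend `tokenizers` model; for byte-level BPE this writes the
# vocabulary and merges files into `save_directory`.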
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)

5 changes: 1 addition & 4 deletions paddlenlp/transformers/convert_slow_tokenizer.py
@@ -442,10 +442,7 @@ def pre_tokenizer(self, replacement, add_prefix_space):
return None


SLOW_TO_FAST_CONVERTERS = {
"LlamaTokenizer": LlamaConverter,
"BertTokenizer": BertConverter,
}
SLOW_TO_FAST_CONVERTERS = {"LlamaTokenizer": LlamaConverter, "BertTokenizer": BertConverter}
Collaborator: Is the converter here usable as a generic one? We can verify that later.

Contributor Author: Nothing new is added for Bloom here; from what I can see, Bloom on HF only ships a fast tokenizer and has no slow-to-fast conversion flow.



def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokenizer:
20 changes: 19 additions & 1 deletion tests/transformers/bloom/test_tokenizer.py
@@ -17,7 +17,7 @@
import os
import unittest

from paddlenlp.transformers import BloomTokenizer
from paddlenlp.transformers import BloomTokenizer, BloomTokenizerFast

from ..test_tokenizer_common import TokenizerTesterMixin

@@ -30,6 +30,7 @@
class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

tokenizer_class = BloomTokenizer
rust_tokenizer_class = BloomTokenizerFast
from_pretrained_kwargs = {"add_prefix_space": True}
test_decode_token = True
test_seq2seq = False
@@ -90,8 +91,25 @@ def test_full_tokenizer(self):

input_tokens = tokens + [tokenizer.unk_token]
input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]

self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

# test batch_encode / batch_decode of the fast tokenizer
def test_encodings_from_sample_data(self):
"""
Assert that the created tokens are the same than the hard-coded ones
"""
tokenizer = self.rust_tokenizer_class.from_pretrained("bigscience/bloom-560m")

INPUT_SENTENCES = ["The quick brown fox</s>", "jumps over the lazy dog</s>"]
TARGET_TOKENS = [[2175, 23714, 73173, 144252, 2], [77, 132619, 3478, 368, 109586, 35433, 2]]

computed_tokens = tokenizer.batch_encode(INPUT_SENTENCES)["input_ids"]
self.assertListEqual(TARGET_TOKENS, computed_tokens)

decoded_tokens = tokenizer.batch_decode(computed_tokens)
self.assertListEqual(decoded_tokens, INPUT_SENTENCES)

def test_pretokenized_inputs(self, *args, **kwargs):
pass
