From 477d35fa68f8d4fca51f481652326ec23aa7fa89 Mon Sep 17 00:00:00 2001 From: ghlai9665 Date: Thu, 21 Apr 2022 07:07:54 -0500 Subject: [PATCH] tiny tweak to allow BatchEncoding.token_to_chars when token doesn't correspond to chars (#15901) * tweak to allow BatchEncoding.char_to_token(0) * update docstring * remove trailing whitespace * make fixup * make value checking for span_indices explicit Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/tokenization_utils_base.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 899c5d3a0284d1..d75b05c057866a 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -503,7 +503,8 @@ def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = the sequence. Returns: - [`~tokenization_utils_base.CharSpan`]: Span of characters in the original string. + [`~tokenization_utils_base.CharSpan`]: Span of characters in the original string, or None, if the token + (e.g. <s>, </s>) doesn't correspond to any chars in the origin string. """ if not self._encodings: @@ -513,7 +514,9 @@ def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = else: batch_index = 0 token_index = batch_or_token_index - return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index))) + span_indices = self._encodings[batch_index].token_to_chars(token_index) + + return CharSpan(*span_indices) if span_indices is not None else None def char_to_token( self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0