diff --git a/whisper/tokenizer.py b/whisper/tokenizer.py index 5ae691d5f..88142c43c 100644 --- a/whisper/tokenizer.py +++ b/whisper/tokenizer.py @@ -245,9 +245,7 @@ def non_speech_tokens(self) -> Tuple[int]: keeping basic punctuations like commas, periods, question marks, exclamation points, etc. """ - - result = set() - symbols = list("'\"#()*+-/:;<=>@[\\]^_`{|}~「」『』") + symbols = list("\"#()*+/:;<=>@[\\]^_`{|}~「」『』") symbols += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split() # symbols that may be a single token or multiple tokens depending on the tokenizer. @@ -257,6 +255,8 @@ def non_speech_tokens(self) -> Tuple[int]: miscellaneous = set("♩♪♫♬♭♮♯") assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous) + # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word + result = {self.tokenizer.encode(" -")[0], self.tokenizer.encode(" '")[0]} for symbol in symbols + list(miscellaneous): for tokens in [self.tokenizer.encode(symbol), self.tokenizer.encode(" " + symbol)]: if len(tokens) == 1 or symbol in miscellaneous: