From fe6bbd5f404e01fd11f829288da7aa30b64381c3 Mon Sep 17 00:00:00 2001
From: c
Date: Sun, 23 Feb 2025 13:53:10 +0100
Subject: [PATCH] replace lazy_static with stabilized std::sync::LazyLock in 1.80

---
 tokenizers/Cargo.toml                        |  1 -
 tokenizers/src/lib.rs                        |  2 --
 tokenizers/src/normalizers/byte_level.rs     |  7 ++-----
 tokenizers/src/pre_tokenizers/byte_level.rs  | 21 ++++++++++----------
 tokenizers/src/pre_tokenizers/whitespace.rs  |  6 +++---
 tokenizers/src/tokenizer/added_vocabulary.rs | 11 +++++-----
 6 files changed, 20 insertions(+), 28 deletions(-)

diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index 0633b8ef6..5a9da3d52 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -42,7 +42,6 @@ required-features = ["http"]
 harness = false
 
 [dependencies]
-lazy_static = "1.4"
 rand = "0.8"
 onig = { version = "6.4", default-features = false, optional = true }
 regex = "1.10"
diff --git a/tokenizers/src/lib.rs b/tokenizers/src/lib.rs
index 441612717..821357273 100644
--- a/tokenizers/src/lib.rs
+++ b/tokenizers/src/lib.rs
@@ -130,8 +130,6 @@
 
 #[macro_use]
 extern crate log;
-#[macro_use]
-extern crate lazy_static;
 #[macro_use]
 extern crate derive_builder;
 
diff --git a/tokenizers/src/normalizers/byte_level.rs b/tokenizers/src/normalizers/byte_level.rs
index 130e2ce1e..ae47de5ac 100644
--- a/tokenizers/src/normalizers/byte_level.rs
+++ b/tokenizers/src/normalizers/byte_level.rs
@@ -2,16 +2,13 @@ use crate::processors::byte_level::bytes_char;
 use crate::tokenizer::{NormalizedString, Normalizer, Result};
 use crate::utils::macro_rules_attribute;
 use std::collections::{HashMap, HashSet};
+use std::sync::LazyLock;
 
 #[derive(Clone, Debug)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct ByteLevel;
 
-lazy_static! {
-    static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
-    static ref CHAR_BYTES: HashMap<char, u8> =
-        bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
-}
+static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);
 
 impl Default for ByteLevel {
     fn default() -> Self {
diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs
index 8396f1a7b..e761cbc96 100644
--- a/tokenizers/src/pre_tokenizers/byte_level.rs
+++ b/tokenizers/src/pre_tokenizers/byte_level.rs
@@ -1,4 +1,5 @@
 use std::collections::{HashMap, HashSet};
+use std::sync::LazyLock;
 
 use crate::utils::SysRegex;
 use serde::{Deserialize, Serialize};
@@ -37,17 +38,15 @@ pub(crate) fn bytes_char() -> HashMap<u8, char> {
     .collect()
 }
 
-lazy_static! {
-    /// Regex that matches exactly one token.
-    /// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
-    static ref RE: SysRegex = SysRegex::new(
-        r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
-    )
-    .unwrap();
-    static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
-    static ref CHAR_BYTES: HashMap<char, u8> =
-        bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
-}
+/// Regex that matches exactly one token.
+/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
+static RE: LazyLock<SysRegex> = LazyLock::new(|| {
+    SysRegex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+")
+        .unwrap()
+});
+static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);
+static CHAR_BYTES: LazyLock<HashMap<char, u8>> =
+    LazyLock::new(|| bytes_char().into_iter().map(|(c, b)| (b, c)).collect());
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 /// Provides all the necessary steps to handle the BPE tokenization at the byte-level. Takes care
diff --git a/tokenizers/src/pre_tokenizers/whitespace.rs b/tokenizers/src/pre_tokenizers/whitespace.rs
index 8c24e8efb..20cfb6519 100644
--- a/tokenizers/src/pre_tokenizers/whitespace.rs
+++ b/tokenizers/src/pre_tokenizers/whitespace.rs
@@ -1,3 +1,5 @@
+use std::sync::LazyLock;
+
 use regex::Regex;
 
 use crate::tokenizer::{
@@ -17,9 +19,7 @@ impl Default for Whitespace {
 
 impl PreTokenizer for Whitespace {
     fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
-        lazy_static! {
-            static ref RE: Regex = Regex::new(r"\w+|[^\w\s]+").unwrap();
-        }
+        static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w+|[^\w\s]+").unwrap());
         let re_ref: &Regex = &RE;
 
         pretokenized.split(|_, normalized| {
diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index a0c2f4542..f988477be 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -5,6 +5,7 @@ use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
 use regex::Regex;
 use serde::{ser::SerializeSeq, Deserialize, Serialize, Serializer};
 use std::collections::{HashMap, HashSet};
+use std::sync::LazyLock;
 
 /// Represent a token added by the user on top of the existing Model vocabulary.
 /// AddedToken can be configured to specify the behavior they should have in various situations
@@ -94,12 +95,10 @@ impl std::hash::Hash for AddedToken {
 
 type MatchingSet = (AhoCorasick, Vec<u32>);
 
-lazy_static! {
-    static ref STARTS_WITH_WORD: Regex = Regex::new(r"^\w").unwrap();
-    static ref ENDS_WITH_WORD: Regex = Regex::new(r"\w$").unwrap();
-    static ref RIGHTMOST_SPACE_AT_START: Regex = Regex::new(r"^\s*").unwrap();
-    static ref LEFTMOST_SPACE_AT_END: Regex = Regex::new(r"\s*$").unwrap();
-}
+static STARTS_WITH_WORD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\w").unwrap());
+static ENDS_WITH_WORD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w$").unwrap());
+static RIGHTMOST_SPACE_AT_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*").unwrap());
+static LEFTMOST_SPACE_AT_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s*$").unwrap());
 
 fn ends_with_word(sentence: &str) -> bool {
     ENDS_WITH_WORD.is_match(sentence)
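
Note for reviewers (not part of the diff above): std::sync::LazyLock, stabilized in Rust 1.80, covers the common lazy_static! use case directly; the initializer runs at most once, on first dereference, and the result is cached behind the static. Below is a minimal, self-contained sketch of the migration pattern this patch applies, using only the standard library; the bytes_char() function here is a hypothetical stand-in for the crate's helper of the same name.

// Standalone sketch of the lazy_static -> LazyLock migration pattern.
use std::collections::HashMap;
use std::sync::LazyLock;

// Hypothetical stand-in for tokenizers' bytes_char(); any fn() -> T (or
// closure) works as the argument to LazyLock::new.
fn bytes_char() -> HashMap<u8, char> {
    (b'a'..=b'z').map(|b| (b, b as char)).collect()
}

// Before (lazy_static 1.x):
//     lazy_static! {
//         static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
//     }
// After (std, Rust 1.80+): the initializer runs once, on first deref.
static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);
static CHAR_BYTES: LazyLock<HashMap<char, u8>> =
    LazyLock::new(|| bytes_char().into_iter().map(|(b, c)| (c, b)).collect());

fn main() {
    // The statics deref to the inner HashMap; initialization happens lazily here.
    assert_eq!(BYTES_CHAR.get(&b'a'), Some(&'a'));
    assert_eq!(CHAR_BYTES.get(&'a'), Some(&b'a'));
    println!("{} byte->char and {} char->byte entries", BYTES_CHAR.len(), CHAR_BYTES.len());
}

One behavioral point that carries over unchanged: like lazy_static, LazyLock initializes on first access, so a panicking initializer (for example the .unwrap() on the regexes above) still surfaces at first use rather than at program startup.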