replace lazy_static with stabilized std::sync::LazyLock in 1.80 #1739

Open · wants to merge 1 commit into base: main
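Every hunk in this PR follows the same shape: a `lazy_static!` block becomes a plain `static` whose type is wrapped in `std::sync::LazyLock` (stabilized in Rust 1.80), and the initializer is handed to `LazyLock::new` as a function or closure. A minimal, self-contained sketch of the pattern; the `bytes_char` helper below is a simplified stand-in, not the crate's real implementation:

```rust
use std::collections::HashMap;
use std::sync::LazyLock;

// Stand-in for the crate's bytes_char(); the real mapping differs.
fn bytes_char() -> HashMap<u8, char> {
    (b'!'..=b'~').map(|b| (b, b as char)).collect()
}

// Before (lazy_static 1.4):
//     lazy_static! {
//         static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
//     }
// After (std only, Rust 1.80+): initialized once, on first access.
static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);

// Closures work too, e.g. for the inverse map.
static CHAR_BYTES: LazyLock<HashMap<char, u8>> =
    LazyLock::new(|| bytes_char().into_iter().map(|(b, c)| (c, b)).collect());

fn main() {
    // LazyLock derefs to the inner value, so call sites stay unchanged.
    assert_eq!(BYTES_CHAR.get(&b'A'), Some(&'A'));
    assert_eq!(CHAR_BYTES.get(&'A'), Some(&b'A'));
}
```

Since `LazyLock::new` is a `const fn`, these statics compile without any macro or external crate; the main user-visible consequence of the change is that building the crate then requires Rust 1.80 or newer.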
1 change: 0 additions & 1 deletion tokenizers/Cargo.toml
@@ -42,7 +42,6 @@ required-features = ["http"]
harness = false

[dependencies]
- lazy_static = "1.4"
rand = "0.8"
onig = { version = "6.4", default-features = false, optional = true }
regex = "1.10"
2 changes: 0 additions & 2 deletions tokenizers/src/lib.rs
@@ -130,8 +130,6 @@

#[macro_use]
extern crate log;
- #[macro_use]
- extern crate lazy_static;

#[macro_use]
extern crate derive_builder;
7 changes: 2 additions & 5 deletions tokenizers/src/normalizers/byte_level.rs
@@ -2,16 +2,13 @@ use crate::processors::byte_level::bytes_char;
use crate::tokenizer::{NormalizedString, Normalizer, Result};
use crate::utils::macro_rules_attribute;
use std::collections::{HashMap, HashSet};
+ use std::sync::LazyLock;

#[derive(Clone, Debug)]
#[macro_rules_attribute(impl_serde_type!)]
pub struct ByteLevel;

- lazy_static! {
-     static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
-     static ref CHAR_BYTES: HashMap<char, u8> =
-         bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
- }
+ static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);

impl Default for ByteLevel {
fn default() -> Self {
21 changes: 10 additions & 11 deletions tokenizers/src/pre_tokenizers/byte_level.rs
@@ -1,4 +1,5 @@
use std::collections::{HashMap, HashSet};
+ use std::sync::LazyLock;

use crate::utils::SysRegex;
use serde::{Deserialize, Serialize};
@@ -37,17 +38,15 @@ pub(crate) fn bytes_char() -> HashMap<u8, char> {
.collect()
}

- lazy_static! {
-     /// Regex that matches exactly one token.
-     /// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
-     static ref RE: SysRegex = SysRegex::new(
-         r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
-     )
-     .unwrap();
-     static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
-     static ref CHAR_BYTES: HashMap<char, u8> =
-         bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
- }
+ /// Regex that matches exactly one token.
+ /// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
+ static RE: LazyLock<SysRegex> = LazyLock::new(|| {
+     SysRegex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+")
+         .unwrap()
+ });
+ static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);
+ static CHAR_BYTES: LazyLock<HashMap<char, u8>> =
+     LazyLock::new(|| bytes_char().into_iter().map(|(c, b)| (b, c)).collect());

#[derive(Copy, Clone, Debug, PartialEq, Eq)]
/// Provides all the necessary steps to handle the BPE tokenization at the byte-level. Takes care
6 changes: 3 additions & 3 deletions tokenizers/src/pre_tokenizers/whitespace.rs
@@ -1,3 +1,5 @@
+ use std::sync::LazyLock;

use regex::Regex;

use crate::tokenizer::{
@@ -17,9 +19,7 @@ impl Default for Whitespace {

impl PreTokenizer for Whitespace {
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
- lazy_static! {
-     static ref RE: Regex = Regex::new(r"\w+|[^\w\s]+").unwrap();
- }
+ static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w+|[^\w\s]+").unwrap());
let re_ref: &Regex = &RE;

pretokenized.split(|_, normalized| {
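One detail from the hunk above: the `static` can stay local to `pre_tokenize`, just as it did inside `lazy_static!`, and it is still built only on the first call. A small sketch of that shape; the function name and test string are made up for illustration:

```rust
use regex::Regex; // assumes a `regex` dependency, as in this crate
use std::sync::LazyLock;

fn split_words(s: &str) -> Vec<&str> {
    // Function-local LazyLock static: initialized on the first call,
    // then reused by every later call.
    static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w+|[^\w\s]+").unwrap());
    RE.find_iter(s).map(|m| m.as_str()).collect()
}

fn main() {
    assert_eq!(split_words("Hey, friend!"), ["Hey", ",", "friend", "!"]);
}
```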
11 changes: 5 additions & 6 deletions tokenizers/src/tokenizer/added_vocabulary.rs
@@ -5,6 +5,7 @@ use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
use regex::Regex;
use serde::{ser::SerializeSeq, Deserialize, Serialize, Serializer};
use std::collections::{HashMap, HashSet};
+ use std::sync::LazyLock;

/// Represent a token added by the user on top of the existing Model vocabulary.
/// AddedToken can be configured to specify the behavior they should have in various situations
@@ -94,12 +95,10 @@ impl std::hash::Hash for AddedToken {

type MatchingSet = (AhoCorasick, Vec<u32>);

- lazy_static! {
-     static ref STARTS_WITH_WORD: Regex = Regex::new(r"^\w").unwrap();
-     static ref ENDS_WITH_WORD: Regex = Regex::new(r"\w$").unwrap();
-     static ref RIGHTMOST_SPACE_AT_START: Regex = Regex::new(r"^\s*").unwrap();
-     static ref LEFTMOST_SPACE_AT_END: Regex = Regex::new(r"\s*$").unwrap();
- }
+ static STARTS_WITH_WORD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\w").unwrap());
+ static ENDS_WITH_WORD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w$").unwrap());
+ static RIGHTMOST_SPACE_AT_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*").unwrap());
+ static LEFTMOST_SPACE_AT_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s*$").unwrap());

fn ends_with_word(sentence: &str) -> bool {
ENDS_WITH_WORD.is_match(sentence)