replace lazy_static with stabilized std::sync::LazyLock in 1.80 #1739

Open · wants to merge 1 commit into base: main
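Every hunk in this PR follows the same shape: a `lazy_static!` block becomes a plain `static` whose type is wrapped in `std::sync::LazyLock` (stabilized in Rust 1.80), and the initializer is handed to `LazyLock::new` as a function or closure. A minimal, self-contained sketch of the pattern; the `bytes_char` helper below is a simplified stand-in, not the crate's real implementation:

```rust
use std::collections::HashMap;
use std::sync::LazyLock;

// Stand-in for the crate's bytes_char(); the real mapping differs.
fn bytes_char() -> HashMap<u8, char> {
    (b'!'..=b'~').map(|b| (b, b as char)).collect()
}

// Before (lazy_static 1.4):
//     lazy_static! {
//         static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
//     }
// After (std only, Rust 1.80+): initialized once, on first access.
static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);

// Closures work too, e.g. for the inverse map.
static CHAR_BYTES: LazyLock<HashMap<char, u8>> =
    LazyLock::new(|| bytes_char().into_iter().map(|(b, c)| (c, b)).collect());

fn main() {
    // LazyLock derefs to the inner value, so call sites stay unchanged.
    assert_eq!(BYTES_CHAR.get(&b'A'), Some(&'A'));
    assert_eq!(CHAR_BYTES.get(&'A'), Some(&b'A'));
}
```

Since `LazyLock::new` is a `const fn`, these statics compile without any macro or external crate; the main user-visible consequence of the change is that building the crate then requires Rust 1.80 or newer.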
1 change: 0 additions & 1 deletion tokenizers/Cargo.toml
@@ -42,7 +42,6 @@ required-features = ["http"]
harness = false

[dependencies]
- lazy_static = "1.4"
rand = "0.8"
onig = { version = "6.4", default-features = false, optional = true }
regex = "1.10"
2 changes: 0 additions & 2 deletions tokenizers/src/lib.rs
@@ -130,8 +130,6 @@

#[macro_use]
extern crate log;
- #[macro_use]
- extern crate lazy_static;

#[macro_use]
extern crate derive_builder;
7 changes: 2 additions & 5 deletions tokenizers/src/normalizers/byte_level.rs
@@ -2,16 +2,13 @@ use crate::processors::byte_level::bytes_char;
use crate::tokenizer::{NormalizedString, Normalizer, Result};
use crate::utils::macro_rules_attribute;
use std::collections::{HashMap, HashSet};
+ use std::sync::LazyLock;

#[derive(Clone, Debug)]
#[macro_rules_attribute(impl_serde_type!)]
pub struct ByteLevel;

- lazy_static! {
-     static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
-     static ref CHAR_BYTES: HashMap<char, u8> =
-         bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
- }
+ static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);

impl Default for ByteLevel {
fn default() -> Self {
21 changes: 10 additions & 11 deletions tokenizers/src/pre_tokenizers/byte_level.rs
@@ -1,4 +1,5 @@
use std::collections::{HashMap, HashSet};
+ use std::sync::LazyLock;

use crate::utils::SysRegex;
use serde::{Deserialize, Serialize};
@@ -37,17 +38,15 @@ pub(crate) fn bytes_char() -> HashMap<u8, char> {
.collect()
}

- lazy_static! {
-     /// Regex that matches exactly one token.
-     /// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
-     static ref RE: SysRegex = SysRegex::new(
-         r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
-     )
-     .unwrap();
-     static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
-     static ref CHAR_BYTES: HashMap<char, u8> =
-         bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
- }
+ /// Regex that matches exactly one token.
+ /// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
+ static RE: LazyLock<SysRegex> = LazyLock::new(|| {
+     SysRegex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+")
+         .unwrap()
+ });
+ static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);
+ static CHAR_BYTES: LazyLock<HashMap<char, u8>> =
+     LazyLock::new(|| bytes_char().into_iter().map(|(c, b)| (b, c)).collect());

#[derive(Copy, Clone, Debug, PartialEq, Eq)]
/// Provides all the necessary steps to handle the BPE tokenization at the byte-level. Takes care
6 changes: 3 additions & 3 deletions tokenizers/src/pre_tokenizers/whitespace.rs
@@ -1,3 +1,5 @@
+ use std::sync::LazyLock;

use regex::Regex;

use crate::tokenizer::{
@@ -17,9 +19,7 @@ impl Default for Whitespace {

impl PreTokenizer for Whitespace {
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
- lazy_static! {
-     static ref RE: Regex = Regex::new(r"\w+|[^\w\s]+").unwrap();
- }
+ static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w+|[^\w\s]+").unwrap());
let re_ref: &Regex = &RE;

pretokenized.split(|_, normalized| {
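One detail from the hunk above: the `static` can stay local to `pre_tokenize`, just as it did inside `lazy_static!`, and it is still built only on the first call. A small sketch of that shape; the function name and test string are made up for illustration:

```rust
use regex::Regex; // assumes a `regex` dependency, as in this crate
use std::sync::LazyLock;

fn split_words(s: &str) -> Vec<&str> {
    // Function-local LazyLock static: initialized on the first call,
    // then reused by every later call.
    static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w+|[^\w\s]+").unwrap());
    RE.find_iter(s).map(|m| m.as_str()).collect()
}

fn main() {
    assert_eq!(split_words("Hey, friend!"), ["Hey", ",", "friend", "!"]);
}
```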
11 changes: 5 additions & 6 deletions tokenizers/src/tokenizer/added_vocabulary.rs
@@ -5,6 +5,7 @@ use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
use regex::Regex;
use serde::{ser::SerializeSeq, Deserialize, Serialize, Serializer};
use std::collections::{HashMap, HashSet};
+ use std::sync::LazyLock;

/// Represent a token added by the user on top of the existing Model vocabulary.
/// AddedToken can be configured to specify the behavior they should have in various situations
@@ -94,12 +95,10 @@ impl std::hash::Hash for AddedToken {

type MatchingSet = (AhoCorasick, Vec<u32>);

- lazy_static! {
-     static ref STARTS_WITH_WORD: Regex = Regex::new(r"^\w").unwrap();
-     static ref ENDS_WITH_WORD: Regex = Regex::new(r"\w$").unwrap();
-     static ref RIGHTMOST_SPACE_AT_START: Regex = Regex::new(r"^\s*").unwrap();
-     static ref LEFTMOST_SPACE_AT_END: Regex = Regex::new(r"\s*$").unwrap();
- }
+ static STARTS_WITH_WORD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\w").unwrap());
+ static ENDS_WITH_WORD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w$").unwrap());
+ static RIGHTMOST_SPACE_AT_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*").unwrap());
+ static LEFTMOST_SPACE_AT_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s*$").unwrap());

fn ends_with_word(sentence: &str) -> bool {
ENDS_WITH_WORD.is_match(sentence)