From ca07ce416e966af42f87de465f571db68ee5299c Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Tue, 7 Jan 2025 16:50:51 +0100 Subject: [PATCH] chore: Replace lazy static (#219) * chore: replace lazy_static with LazyLock from std, generate lookup tables at build time * remove unused code * bump MSRV to 1.80 --- Cargo.toml | 3 +- benches/translate_cds.rs | 21 +- build.rs | 754 +++++++++++++++++++++++++++++++++++++ src/parser/display.rs | 16 +- src/sequences.rs | 783 +++------------------------------------ tables.in | 594 +++++++++++++++++++++++++++++ 6 files changed, 1411 insertions(+), 760 deletions(-) create mode 100644 build.rs create mode 100644 tables.in diff --git a/Cargo.toml b/Cargo.toml index 4d09390..0e22e24 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ description = "Port of biocommons/hgvs to Rust" license = "Apache-2.0" repository = "https://github.com/varfish-org/hgvs-rs" readme = "README.md" -rust-version = "1.64.0" +rust-version = "1.80.0" [lib] name = "hgvs" @@ -19,7 +19,6 @@ bio = "2.0" chrono = "0.4" enum-map = "2.4" flate2 = "1.0" -lazy_static = "1.4" log = "0.4" md-5 = "0.10" nom = "7.1" diff --git a/benches/translate_cds.rs b/benches/translate_cds.rs index b1c9175..43cd6c9 100644 --- a/benches/translate_cds.rs +++ b/benches/translate_cds.rs @@ -1,21 +1,20 @@ use criterion::{criterion_group, criterion_main, Criterion}; use hgvs::sequences::{translate_cds, TranslationTable}; +use std::sync::LazyLock; /// TTN FASTA string from https://www.ncbi.nlm.nih.gov/nuccore/NM_001126114.1 static TTN_FASTA: &str = include_str!("TTN.fasta"); -lazy_static::lazy_static! { - /// Raw TTN sequence. - static ref SEQ_TTN: String = { - let mut seq = String::new(); - for line in TTN_FASTA.lines() { - if !line.starts_with('>') { - seq.push_str(line); - } +/// Raw TTN sequence. 
+static SEQ_TTN: LazyLock = LazyLock::new(|| { + let mut seq = String::new(); + for line in TTN_FASTA.lines() { + if !line.starts_with('>') { + seq.push_str(line); } - seq - }; -} + } + seq +}); fn criterion_benchmark(c: &mut Criterion) { c.bench_function("translate_cds TTN", |b| { diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..2be8998 --- /dev/null +++ b/build.rs @@ -0,0 +1,754 @@ +use std::env; +use std::fs::File; +use std::io::{BufWriter, Result, Write}; +use std::path::Path; + +fn main() -> Result<()> { + let out_dir = env::var("OUT_DIR").unwrap(); + let dest_path = Path::new(&out_dir).join("tables_gen.rs"); + let mut f = File::create(&dest_path).map(BufWriter::new)?; + + include_hardcoded_translation_tables(&mut f)?; + generate_dna_ascii_map(&mut f)?; + + generate_codon_2bit_to_aa1_lut(&mut f)?; + generate_codon_2bit_to_aa1_sec(&mut f)?; + generate_codon_2bit_to_aa1_chrmt_vertebrate(&mut f)?; + + generate_aa1_to_aa3_str_lookup_function(&mut f)?; + generate_aa1_to_aa3_str_lookup_table(&mut f)?; + generate_aa3_to_aa1_lookup_function(&mut f)?; + + f.flush()?; + println!("cargo::rerun-if-changed=build.rs"); + Ok(()) +} + +fn generate_dna_ascii_map(f: &mut BufWriter) -> Result<()> { + let mut result = [0; 256]; + for c in 0..=255 { + if c == b'u' || c == b'U' { + result[c as usize] = b'T'; + } else if c.is_ascii_lowercase() { + result[c as usize] = c.to_ascii_uppercase(); + } else { + result[c as usize] = c; + } + } + + writeln!(f, "/// Mapping for DNA characters for normalization.")?; + write!(f, "const DNA_ASCII_MAP: [u8; 256] = [")?; + for v in result { + write!(f, "{}, ", v)?; + } + writeln!(f, "];")?; + Ok(()) +} +fn generate_codon_2bit_to_aa1_lut(f: &mut BufWriter) -> Result<()> { + let mut result = [0; 64]; + for (i, (dna3, aa1)) in DNA_TO_AA1_LUT_VEC.iter().enumerate() { + if i > 63 { + break; // skip degenerate codons + } + let dna3_2bit = dna3_to_2bit(dna3.as_bytes()).expect("invalid dna3"); + result[dna3_2bit as usize] = 
aa1.as_bytes()[0]; + } + write!(f, "const CODON_2BIT_TO_AA1_LUT: [u8; 64] = [")?; + for v in result { + write!(f, "{}, ", v)?; + } + writeln!(f, "];")?; + Ok(()) +} + +fn generate_codon_2bit_to_aa1_sec(f: &mut BufWriter) -> Result<()> { + let mut result = [0; 64]; + for (i, (dna3, aa1)) in DNA_TO_AA1_SEC_VEC.iter().enumerate() { + if i > 63 { + break; // skip degenerate codons + } + let dna3_2bit = dna3_to_2bit(dna3.as_bytes()).expect("invalid dna3"); + result[dna3_2bit as usize] = aa1.as_bytes()[0]; + } + write!(f, "const CODON_2BIT_TO_AA1_SEC: [u8; 64] = [")?; + for v in result { + write!(f, "{}, ", v)?; + } + writeln!(f, "];")?; + Ok(()) +} + +fn generate_codon_2bit_to_aa1_chrmt_vertebrate(f: &mut BufWriter) -> Result<()> { + let mut result = [0; 64]; + for (i, (dna3, aa1)) in DNA_TO_AA1_CHRMT_VERTEBRATE_VEC.iter().enumerate() { + if i > 63 { + break; // skip degenerate codons + } + let dna3_2bit = dna3_to_2bit(dna3.as_bytes()).expect("invalid dna3"); + result[dna3_2bit as usize] = aa1.as_bytes()[0]; + } + write!(f, "const CODON_2BIT_TO_AA1_CHRMT_VERTEBRATE: [u8; 64] = [")?; + for v in result { + write!(f, "{}, ", v)?; + } + writeln!(f, "];")?; + Ok(()) +} + +fn generate_aa1_to_aa3_str_lookup_function(f: &mut BufWriter) -> Result<()> { + writeln!( + f, + "const fn _aa1_to_aa3_str(aa1: u8) -> Option<&'static str> {{" + )?; + writeln!(f, " match aa1 {{")?; + for (aa3, aa1) in AA3_TO_AA1_VEC { + writeln!(f, " b'{}' => Some(\"{}\"),", aa1, aa3)?; + } + writeln!(f, r" _ => None,")?; + writeln!(f, " }}")?; + writeln!(f, "}}")?; + Ok(()) +} + +fn generate_aa1_to_aa3_str_lookup_table(f: &mut BufWriter) -> Result<()> { + let mut result = [""; 256]; + for (aa3, aa1) in AA3_TO_AA1_VEC { + result[aa1.as_bytes()[0] as usize] = aa3; + } + write!(f, "const AA1_TO_AA3_STR: [Option<&str>; 256] = [")?; + for v in result { + if v.is_empty() { + write!(f, "None, ")?; + } else { + write!(f, r##"Some("{}"), "##, v)?; + } + } + writeln!(f, "];")?; + Ok(()) +} + +fn 
generate_aa3_to_aa1_lookup_function(f: &mut BufWriter) -> Result<()> { + writeln!(f, "const fn _aa3_to_aa1(aa3: &[u8]) -> Option {{")?; + writeln!(f, " match aa3 {{")?; + for (aa3, aa1) in AA3_TO_AA1_VEC { + writeln!(f, " b\"{}\" => Some(b'{}'),", aa3, aa1)?; + } + writeln!(f, " _ => None,")?; + writeln!(f, " }}")?; + writeln!(f, "}}")?; + Ok(()) +} + +fn include_hardcoded_translation_tables(f: &mut BufWriter) -> Result<()> { + let text = include_str!("tables.in"); + writeln!(f, "{}", text)?; + Ok(()) +} + +const DNA_ASCII_TO_2BIT: [u8; 256] = { + let mut result = [255; 256]; + + result[b'A' as usize] = 0; + result[b'a' as usize] = 0; + + result[b'C' as usize] = 1; + result[b'c' as usize] = 1; + + result[b'G' as usize] = 2; + result[b'g' as usize] = 2; + + result[b'T' as usize] = 3; + result[b't' as usize] = 3; + result[b'U' as usize] = 3; + result[b'u' as usize] = 3; + result +}; + +fn dna3_to_2bit(c: &[u8]) -> Option { + let mut result = 0; + for i in 0..3 { + result <<= 2; + let tmp = DNA_ASCII_TO_2BIT[c[i] as usize]; + if tmp == 255 { + return None; + } + result |= tmp; + } + Some(result) +} + +// Hard-coded translation tables from src/tables.rs + +pub const AA3_TO_AA1_VEC: &[(&str, &str)] = &[ + ("Ala", "A"), + ("Arg", "R"), + ("Asn", "N"), + ("Asp", "D"), + ("Cys", "C"), + ("Gln", "Q"), + ("Glu", "E"), + ("Gly", "G"), + ("His", "H"), + ("Ile", "I"), + ("Leu", "L"), + ("Lys", "K"), + ("Met", "M"), + ("Phe", "F"), + ("Pro", "P"), + ("Ser", "S"), + ("Thr", "T"), + ("Trp", "W"), + ("Tyr", "Y"), + ("Val", "V"), + ("Xaa", "X"), + ("Ter", "*"), + ("Sec", "U"), +]; + +const DNA_TO_AA1_LUT_VEC: &[(&str, &str)] = &[ + ("AAA", "K"), + ("AAC", "N"), + ("AAG", "K"), + ("AAT", "N"), + ("ACA", "T"), + ("ACC", "T"), + ("ACG", "T"), + ("ACT", "T"), + ("AGA", "R"), + ("AGC", "S"), + ("AGG", "R"), + ("AGT", "S"), + ("ATA", "I"), + ("ATC", "I"), + ("ATG", "M"), + ("ATT", "I"), + ("CAA", "Q"), + ("CAC", "H"), + ("CAG", "Q"), + ("CAT", "H"), + ("CCA", "P"), + ("CCC", "P"), + 
("CCG", "P"), + ("CCT", "P"), + ("CGA", "R"), + ("CGC", "R"), + ("CGG", "R"), + ("CGT", "R"), + ("CTA", "L"), + ("CTC", "L"), + ("CTG", "L"), + ("CTT", "L"), + ("GAA", "E"), + ("GAC", "D"), + ("GAG", "E"), + ("GAT", "D"), + ("GCA", "A"), + ("GCC", "A"), + ("GCG", "A"), + ("GCT", "A"), + ("GGA", "G"), + ("GGC", "G"), + ("GGG", "G"), + ("GGT", "G"), + ("GTA", "V"), + ("GTC", "V"), + ("GTG", "V"), + ("GTT", "V"), + ("TAA", "*"), + ("TAC", "Y"), + ("TAG", "*"), + ("TAT", "Y"), + ("TCA", "S"), + ("TCC", "S"), + ("TCG", "S"), + ("TCT", "S"), + // caveat lector + ("TGA", "*"), + ("TGC", "C"), + ("TGG", "W"), + ("TGT", "C"), + ("TTA", "L"), + ("TTC", "F"), + ("TTG", "L"), + ("TTT", "F"), + // degenerate codons + ("AAR", "K"), + ("AAY", "N"), + ("ACB", "T"), + ("ACD", "T"), + ("ACH", "T"), + ("ACK", "T"), + ("ACM", "T"), + ("ACN", "T"), + ("ACR", "T"), + ("ACS", "T"), + ("ACV", "T"), + ("ACW", "T"), + ("ACY", "T"), + ("AGR", "R"), + ("AGY", "S"), + ("ATH", "I"), + ("ATM", "I"), + ("ATW", "I"), + ("ATY", "I"), + ("CAR", "Q"), + ("CAY", "H"), + ("CCB", "P"), + ("CCD", "P"), + ("CCH", "P"), + ("CCK", "P"), + ("CCM", "P"), + ("CCN", "P"), + ("CCR", "P"), + ("CCS", "P"), + ("CCV", "P"), + ("CCW", "P"), + ("CCY", "P"), + ("CGB", "R"), + ("CGD", "R"), + ("CGH", "R"), + ("CGK", "R"), + ("CGM", "R"), + ("CGN", "R"), + ("CGR", "R"), + ("CGS", "R"), + ("CGV", "R"), + ("CGW", "R"), + ("CGY", "R"), + ("CTB", "L"), + ("CTD", "L"), + ("CTH", "L"), + ("CTK", "L"), + ("CTM", "L"), + ("CTN", "L"), + ("CTR", "L"), + ("CTS", "L"), + ("CTV", "L"), + ("CTW", "L"), + ("CTY", "L"), + ("GAR", "E"), + ("GAY", "D"), + ("GCB", "A"), + ("GCD", "A"), + ("GCH", "A"), + ("GCK", "A"), + ("GCM", "A"), + ("GCN", "A"), + ("GCR", "A"), + ("GCS", "A"), + ("GCV", "A"), + ("GCW", "A"), + ("GCY", "A"), + ("GGB", "G"), + ("GGD", "G"), + ("GGH", "G"), + ("GGK", "G"), + ("GGM", "G"), + ("GGN", "G"), + ("GGR", "G"), + ("GGS", "G"), + ("GGV", "G"), + ("GGW", "G"), + ("GGY", "G"), + ("GTB", "V"), + ("GTD", "V"), + 
("GTH", "V"), + ("GTK", "V"), + ("GTM", "V"), + ("GTN", "V"), + ("GTR", "V"), + ("GTS", "V"), + ("GTV", "V"), + ("GTW", "V"), + ("GTY", "V"), + ("MGA", "R"), + ("MGG", "R"), + ("MGR", "R"), + ("TAR", "*"), + ("TAY", "Y"), + ("TCB", "S"), + ("TCD", "S"), + ("TCH", "S"), + ("TCK", "S"), + ("TCM", "S"), + ("TCN", "S"), + ("TCR", "S"), + ("TCS", "S"), + ("TCV", "S"), + ("TCW", "S"), + ("TCY", "S"), + ("TGY", "C"), + ("TRA", "*"), + ("TTR", "L"), + ("TTY", "F"), + ("YTA", "L"), + ("YTG", "L"), + ("YTR", "L"), +]; + +/// Translation table for selenocysteine. +const DNA_TO_AA1_SEC_VEC: &[(&str, &str)] = &[ + ("AAA", "K"), + ("AAC", "N"), + ("AAG", "K"), + ("AAT", "N"), + ("ACA", "T"), + ("ACC", "T"), + ("ACG", "T"), + ("ACT", "T"), + ("AGA", "R"), + ("AGC", "S"), + ("AGG", "R"), + ("AGT", "S"), + ("ATA", "I"), + ("ATC", "I"), + ("ATG", "M"), + ("ATT", "I"), + ("CAA", "Q"), + ("CAC", "H"), + ("CAG", "Q"), + ("CAT", "H"), + ("CCA", "P"), + ("CCC", "P"), + ("CCG", "P"), + ("CCT", "P"), + ("CGA", "R"), + ("CGC", "R"), + ("CGG", "R"), + ("CGT", "R"), + ("CTA", "L"), + ("CTC", "L"), + ("CTG", "L"), + ("CTT", "L"), + ("GAA", "E"), + ("GAC", "D"), + ("GAG", "E"), + ("GAT", "D"), + ("GCA", "A"), + ("GCC", "A"), + ("GCG", "A"), + ("GCT", "A"), + ("GGA", "G"), + ("GGC", "G"), + ("GGG", "G"), + ("GGT", "G"), + ("GTA", "V"), + ("GTC", "V"), + ("GTG", "V"), + ("GTT", "V"), + ("TAA", "*"), + ("TAC", "Y"), + ("TAG", "*"), + ("TAT", "Y"), + ("TCA", "S"), + ("TCC", "S"), + ("TCG", "S"), + ("TCT", "S"), + // caveat lector + ("TGA", "U"), + ("TGC", "C"), + ("TGG", "W"), + ("TGT", "C"), + ("TTA", "L"), + ("TTC", "F"), + ("TTG", "L"), + ("TTT", "F"), + // degenerate codons + ("AAR", "K"), + ("AAY", "N"), + ("ACB", "T"), + ("ACD", "T"), + ("ACH", "T"), + ("ACK", "T"), + ("ACM", "T"), + ("ACN", "T"), + ("ACR", "T"), + ("ACS", "T"), + ("ACV", "T"), + ("ACW", "T"), + ("ACY", "T"), + ("AGR", "R"), + ("AGY", "S"), + ("ATH", "I"), + ("ATM", "I"), + ("ATW", "I"), + ("ATY", "I"), + ("CAR", "Q"), + 
("CAY", "H"), + ("CCB", "P"), + ("CCD", "P"), + ("CCH", "P"), + ("CCK", "P"), + ("CCM", "P"), + ("CCN", "P"), + ("CCR", "P"), + ("CCS", "P"), + ("CCV", "P"), + ("CCW", "P"), + ("CCY", "P"), + ("CGB", "R"), + ("CGD", "R"), + ("CGH", "R"), + ("CGK", "R"), + ("CGM", "R"), + ("CGN", "R"), + ("CGR", "R"), + ("CGS", "R"), + ("CGV", "R"), + ("CGW", "R"), + ("CGY", "R"), + ("CTB", "L"), + ("CTD", "L"), + ("CTH", "L"), + ("CTK", "L"), + ("CTM", "L"), + ("CTN", "L"), + ("CTR", "L"), + ("CTS", "L"), + ("CTV", "L"), + ("CTW", "L"), + ("CTY", "L"), + ("GAR", "E"), + ("GAY", "D"), + ("GCB", "A"), + ("GCD", "A"), + ("GCH", "A"), + ("GCK", "A"), + ("GCM", "A"), + ("GCN", "A"), + ("GCR", "A"), + ("GCS", "A"), + ("GCV", "A"), + ("GCW", "A"), + ("GCY", "A"), + ("GGB", "G"), + ("GGD", "G"), + ("GGH", "G"), + ("GGK", "G"), + ("GGM", "G"), + ("GGN", "G"), + ("GGR", "G"), + ("GGS", "G"), + ("GGV", "G"), + ("GGW", "G"), + ("GGY", "G"), + ("GTB", "V"), + ("GTD", "V"), + ("GTH", "V"), + ("GTK", "V"), + ("GTM", "V"), + ("GTN", "V"), + ("GTR", "V"), + ("GTS", "V"), + ("GTV", "V"), + ("GTW", "V"), + ("GTY", "V"), + ("MGA", "R"), + ("MGG", "R"), + ("MGR", "R"), + ("TAR", "*"), + ("TAY", "Y"), + ("TCB", "S"), + ("TCD", "S"), + ("TCH", "S"), + ("TCK", "S"), + ("TCM", "S"), + ("TCN", "S"), + ("TCR", "S"), + ("TCS", "S"), + ("TCV", "S"), + ("TCW", "S"), + ("TCY", "S"), + ("TGY", "C"), + ("TRA", "*"), + ("TTR", "L"), + ("TTY", "F"), + ("YTA", "L"), + ("YTG", "L"), + ("YTR", "L"), +]; + +/// Vertebrate mitochondrial code, cf. 
https://en.wikipedia.org/wiki/Vertebrate_mitochondrial_code +const DNA_TO_AA1_CHRMT_VERTEBRATE_VEC: &[(&str, &str)] = &[ + ("AAA", "K"), + ("AAC", "N"), + ("AAG", "K"), + ("AAT", "N"), + ("ACA", "T"), + ("ACC", "T"), + ("ACG", "T"), + ("ACT", "T"), + // caveat lector + ("AGA", "*"), + ("AGC", "S"), + // caveat lector + ("AGG", "*"), + ("AGT", "S"), + // caveat lector + ("ATA", "M"), + ("ATC", "I"), + ("ATG", "M"), + ("ATT", "I"), + ("CAA", "Q"), + ("CAC", "H"), + ("CAG", "Q"), + ("CAT", "H"), + ("CCA", "P"), + ("CCC", "P"), + ("CCG", "P"), + ("CCT", "P"), + ("CGA", "R"), + ("CGC", "R"), + ("CGG", "R"), + ("CGT", "R"), + ("CTA", "L"), + ("CTC", "L"), + ("CTG", "L"), + ("CTT", "L"), + ("GAA", "E"), + ("GAC", "D"), + ("GAG", "E"), + ("GAT", "D"), + ("GCA", "A"), + ("GCC", "A"), + ("GCG", "A"), + ("GCT", "A"), + ("GGA", "G"), + ("GGC", "G"), + ("GGG", "G"), + ("GGT", "G"), + ("GTA", "V"), + ("GTC", "V"), + ("GTG", "V"), + ("GTT", "V"), + ("TAA", "*"), + ("TAC", "Y"), + ("TAG", "*"), + ("TAT", "Y"), + ("TCA", "S"), + ("TCC", "S"), + ("TCG", "S"), + ("TCT", "S"), + // caveat lector + ("TGA", "W"), + ("TGC", "C"), + ("TGG", "W"), + ("TGT", "C"), + ("TTA", "L"), + ("TTC", "F"), + ("TTG", "L"), + ("TTT", "F"), + // degenerate codons + ("AAR", "K"), + ("AAY", "N"), + ("ACB", "T"), + ("ACD", "T"), + ("ACH", "T"), + ("ACK", "T"), + ("ACM", "T"), + ("ACN", "T"), + ("ACR", "T"), + ("ACS", "T"), + ("ACV", "T"), + ("ACW", "T"), + ("ACY", "T"), + ("AGR", "R"), + ("AGY", "S"), + ("ATH", "I"), + ("ATM", "I"), + ("ATW", "I"), + ("ATY", "I"), + ("CAR", "Q"), + ("CAY", "H"), + ("CCB", "P"), + ("CCD", "P"), + ("CCH", "P"), + ("CCK", "P"), + ("CCM", "P"), + ("CCN", "P"), + ("CCR", "P"), + ("CCS", "P"), + ("CCV", "P"), + ("CCW", "P"), + ("CCY", "P"), + ("CGB", "R"), + ("CGD", "R"), + ("CGH", "R"), + ("CGK", "R"), + ("CGM", "R"), + ("CGN", "R"), + ("CGR", "R"), + ("CGS", "R"), + ("CGV", "R"), + ("CGW", "R"), + ("CGY", "R"), + ("CTB", "L"), + ("CTD", "L"), + ("CTH", "L"), + ("CTK", "L"), + 
("CTM", "L"), + ("CTN", "L"), + ("CTR", "L"), + ("CTS", "L"), + ("CTV", "L"), + ("CTW", "L"), + ("CTY", "L"), + ("GAR", "E"), + ("GAY", "D"), + ("GCB", "A"), + ("GCD", "A"), + ("GCH", "A"), + ("GCK", "A"), + ("GCM", "A"), + ("GCN", "A"), + ("GCR", "A"), + ("GCS", "A"), + ("GCV", "A"), + ("GCW", "A"), + ("GCY", "A"), + ("GGB", "G"), + ("GGD", "G"), + ("GGH", "G"), + ("GGK", "G"), + ("GGM", "G"), + ("GGN", "G"), + ("GGR", "G"), + ("GGS", "G"), + ("GGV", "G"), + ("GGW", "G"), + ("GGY", "G"), + ("GTB", "V"), + ("GTD", "V"), + ("GTH", "V"), + ("GTK", "V"), + ("GTM", "V"), + ("GTN", "V"), + ("GTR", "V"), + ("GTS", "V"), + ("GTV", "V"), + ("GTW", "V"), + ("GTY", "V"), + ("MGA", "R"), + ("MGG", "R"), + ("MGR", "R"), + ("TAR", "*"), + ("TAY", "Y"), + ("TCB", "S"), + ("TCD", "S"), + ("TCH", "S"), + ("TCK", "S"), + ("TCM", "S"), + ("TCN", "S"), + ("TCR", "S"), + ("TCS", "S"), + ("TCV", "S"), + ("TCW", "S"), + ("TCY", "S"), + ("TGY", "C"), + ("TRA", "*"), + ("TTR", "L"), + ("TTY", "F"), + ("YTA", "L"), + ("YTG", "L"), + ("YTR", "L"), +]; diff --git a/src/parser/display.rs b/src/parser/display.rs index 7d45d5a..6aafeaf 100644 --- a/src/parser/display.rs +++ b/src/parser/display.rs @@ -96,7 +96,7 @@ impl Display for NaEdit { } } -impl<'a> Display for NoRef<'a, NaEdit> { +impl Display for NoRef<'_, NaEdit> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { NoRef(NaEdit::RefAlt { @@ -309,7 +309,7 @@ impl Display for ProtLocEdit { } } -impl<'a> Display for NoRef<'a, ProtLocEdit> { +impl Display for NoRef<'_, ProtLocEdit> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.inner().fmt(f) } @@ -321,7 +321,7 @@ impl Display for CdsLocEdit { } } -impl<'a> Display for NoRef<'a, CdsLocEdit> { +impl Display for NoRef<'_, CdsLocEdit> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}{}", self.inner().loc, NoRef(&self.inner().edit)) } @@ -362,7 +362,7 @@ impl Display for TxLocEdit { } } -impl<'a> 
Display for NoRef<'a, TxLocEdit> { +impl Display for NoRef<'_, TxLocEdit> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}{}", self.inner().loc, NoRef(&self.inner().edit)) } @@ -399,7 +399,7 @@ impl Display for RnaLocEdit { } } -impl<'a> Display for NoRef<'a, RnaLocEdit> { +impl Display for NoRef<'_, RnaLocEdit> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}{}", self.inner().loc, NoRef(&self.inner().edit)) } @@ -436,7 +436,7 @@ impl Display for GenomeLocEdit { } } -impl<'a> Display for NoRef<'a, GenomeLocEdit> { +impl Display for NoRef<'_, GenomeLocEdit> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}{}", self.inner().loc, NoRef(&self.inner().edit)) } @@ -464,7 +464,7 @@ impl Display for MtLocEdit { } } -impl<'a> Display for NoRef<'a, MtLocEdit> { +impl Display for NoRef<'_, MtLocEdit> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}{}", self.inner().loc, NoRef(&self.inner().edit)) } @@ -559,7 +559,7 @@ impl Display for HgvsVariant { } } -impl<'a> Display for NoRef<'a, HgvsVariant> { +impl Display for NoRef<'_, HgvsVariant> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { NoRef(HgvsVariant::CdsVariant { diff --git a/src/sequences.rs b/src/sequences.rs index 9da74f2..08ff7e0 100644 --- a/src/sequences.rs +++ b/src/sequences.rs @@ -4,9 +4,12 @@ use ahash::AHashMap; use md5::{Digest, Md5}; +use std::sync::LazyLock; pub use crate::sequences::error::Error; +include!(concat!(env!("OUT_DIR"), "/tables_gen.rs")); + mod error { /// Error type for normalization of HGVS expressins. #[derive(thiserror::Error, Debug, Clone)] @@ -83,736 +86,6 @@ pub fn revcomp(seq: &str) -> String { .to_string() } -/// Mapping for DNA characters for normalization. 
-/// Built via -/// ```rust,no_run -/// let mut result = [0; 256]; -/// for c in 0..=255 { -/// if c == b'u' || c == b'U' { -/// result[c as usize] = b'T'; -/// } else if c.is_ascii_lowercase() { -/// result[c as usize] = c.to_ascii_uppercase(); -/// } else { -/// result[c as usize] = c; -/// } -/// } -/// ``` -/// Could probably be done in build.rs -const DNA_ASCII_MAP: [u8; 256] = [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, - 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, - 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 84, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 65, - 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 84, 86, 87, 88, 89, - 90, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, - 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, - 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, - 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, - 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, - 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, - 255, -]; -const DNA_ASCII_TO_2BIT: [u8; 256] = { - let mut result = [255; 256]; - - result[b'A' as usize] = 0; - result[b'a' as usize] = 0; - - result[b'C' as usize] = 1; - result[b'c' as usize] = 1; - - result[b'G' as usize] = 2; - result[b'g' as usize] = 2; - - result[b'T' as usize] = 3; - result[b't' as usize] = 3; - result[b'U' as usize] = 3; - result[b'u' as usize] = 3; - result -}; - -fn dna3_to_2bit(c: &[u8]) -> Option { - let mut 
result = 0; - for i in 0..3 { - result <<= 2; - let tmp = DNA_ASCII_TO_2BIT[c[i] as usize]; - if tmp == 255 { - return None; - } - result |= tmp; - } - Some(result) -} - -pub const AA3_TO_AA1_VEC: &[(&str, &str)] = &[ - ("Ala", "A"), - ("Arg", "R"), - ("Asn", "N"), - ("Asp", "D"), - ("Cys", "C"), - ("Gln", "Q"), - ("Glu", "E"), - ("Gly", "G"), - ("His", "H"), - ("Ile", "I"), - ("Leu", "L"), - ("Lys", "K"), - ("Met", "M"), - ("Phe", "F"), - ("Pro", "P"), - ("Ser", "S"), - ("Thr", "T"), - ("Trp", "W"), - ("Tyr", "Y"), - ("Val", "V"), - ("Xaa", "X"), - ("Ter", "*"), - ("Sec", "U"), -]; - -const DNA_TO_AA1_LUT_VEC: &[(&str, &str)] = &[ - ("AAA", "K"), - ("AAC", "N"), - ("AAG", "K"), - ("AAT", "N"), - ("ACA", "T"), - ("ACC", "T"), - ("ACG", "T"), - ("ACT", "T"), - ("AGA", "R"), - ("AGC", "S"), - ("AGG", "R"), - ("AGT", "S"), - ("ATA", "I"), - ("ATC", "I"), - ("ATG", "M"), - ("ATT", "I"), - ("CAA", "Q"), - ("CAC", "H"), - ("CAG", "Q"), - ("CAT", "H"), - ("CCA", "P"), - ("CCC", "P"), - ("CCG", "P"), - ("CCT", "P"), - ("CGA", "R"), - ("CGC", "R"), - ("CGG", "R"), - ("CGT", "R"), - ("CTA", "L"), - ("CTC", "L"), - ("CTG", "L"), - ("CTT", "L"), - ("GAA", "E"), - ("GAC", "D"), - ("GAG", "E"), - ("GAT", "D"), - ("GCA", "A"), - ("GCC", "A"), - ("GCG", "A"), - ("GCT", "A"), - ("GGA", "G"), - ("GGC", "G"), - ("GGG", "G"), - ("GGT", "G"), - ("GTA", "V"), - ("GTC", "V"), - ("GTG", "V"), - ("GTT", "V"), - ("TAA", "*"), - ("TAC", "Y"), - ("TAG", "*"), - ("TAT", "Y"), - ("TCA", "S"), - ("TCC", "S"), - ("TCG", "S"), - ("TCT", "S"), - // caveat lector - ("TGA", "*"), - ("TGC", "C"), - ("TGG", "W"), - ("TGT", "C"), - ("TTA", "L"), - ("TTC", "F"), - ("TTG", "L"), - ("TTT", "F"), - // degenerate codons - ("AAR", "K"), - ("AAY", "N"), - ("ACB", "T"), - ("ACD", "T"), - ("ACH", "T"), - ("ACK", "T"), - ("ACM", "T"), - ("ACN", "T"), - ("ACR", "T"), - ("ACS", "T"), - ("ACV", "T"), - ("ACW", "T"), - ("ACY", "T"), - ("AGR", "R"), - ("AGY", "S"), - ("ATH", "I"), - ("ATM", "I"), - ("ATW", "I"), - 
("ATY", "I"), - ("CAR", "Q"), - ("CAY", "H"), - ("CCB", "P"), - ("CCD", "P"), - ("CCH", "P"), - ("CCK", "P"), - ("CCM", "P"), - ("CCN", "P"), - ("CCR", "P"), - ("CCS", "P"), - ("CCV", "P"), - ("CCW", "P"), - ("CCY", "P"), - ("CGB", "R"), - ("CGD", "R"), - ("CGH", "R"), - ("CGK", "R"), - ("CGM", "R"), - ("CGN", "R"), - ("CGR", "R"), - ("CGS", "R"), - ("CGV", "R"), - ("CGW", "R"), - ("CGY", "R"), - ("CTB", "L"), - ("CTD", "L"), - ("CTH", "L"), - ("CTK", "L"), - ("CTM", "L"), - ("CTN", "L"), - ("CTR", "L"), - ("CTS", "L"), - ("CTV", "L"), - ("CTW", "L"), - ("CTY", "L"), - ("GAR", "E"), - ("GAY", "D"), - ("GCB", "A"), - ("GCD", "A"), - ("GCH", "A"), - ("GCK", "A"), - ("GCM", "A"), - ("GCN", "A"), - ("GCR", "A"), - ("GCS", "A"), - ("GCV", "A"), - ("GCW", "A"), - ("GCY", "A"), - ("GGB", "G"), - ("GGD", "G"), - ("GGH", "G"), - ("GGK", "G"), - ("GGM", "G"), - ("GGN", "G"), - ("GGR", "G"), - ("GGS", "G"), - ("GGV", "G"), - ("GGW", "G"), - ("GGY", "G"), - ("GTB", "V"), - ("GTD", "V"), - ("GTH", "V"), - ("GTK", "V"), - ("GTM", "V"), - ("GTN", "V"), - ("GTR", "V"), - ("GTS", "V"), - ("GTV", "V"), - ("GTW", "V"), - ("GTY", "V"), - ("MGA", "R"), - ("MGG", "R"), - ("MGR", "R"), - ("TAR", "*"), - ("TAY", "Y"), - ("TCB", "S"), - ("TCD", "S"), - ("TCH", "S"), - ("TCK", "S"), - ("TCM", "S"), - ("TCN", "S"), - ("TCR", "S"), - ("TCS", "S"), - ("TCV", "S"), - ("TCW", "S"), - ("TCY", "S"), - ("TGY", "C"), - ("TRA", "*"), - ("TTR", "L"), - ("TTY", "F"), - ("YTA", "L"), - ("YTG", "L"), - ("YTR", "L"), -]; - -/// Translation table for selenocysteine. 
-const DNA_TO_AA1_SEC_VEC: &[(&str, &str)] = &[ - ("AAA", "K"), - ("AAC", "N"), - ("AAG", "K"), - ("AAT", "N"), - ("ACA", "T"), - ("ACC", "T"), - ("ACG", "T"), - ("ACT", "T"), - ("AGA", "R"), - ("AGC", "S"), - ("AGG", "R"), - ("AGT", "S"), - ("ATA", "I"), - ("ATC", "I"), - ("ATG", "M"), - ("ATT", "I"), - ("CAA", "Q"), - ("CAC", "H"), - ("CAG", "Q"), - ("CAT", "H"), - ("CCA", "P"), - ("CCC", "P"), - ("CCG", "P"), - ("CCT", "P"), - ("CGA", "R"), - ("CGC", "R"), - ("CGG", "R"), - ("CGT", "R"), - ("CTA", "L"), - ("CTC", "L"), - ("CTG", "L"), - ("CTT", "L"), - ("GAA", "E"), - ("GAC", "D"), - ("GAG", "E"), - ("GAT", "D"), - ("GCA", "A"), - ("GCC", "A"), - ("GCG", "A"), - ("GCT", "A"), - ("GGA", "G"), - ("GGC", "G"), - ("GGG", "G"), - ("GGT", "G"), - ("GTA", "V"), - ("GTC", "V"), - ("GTG", "V"), - ("GTT", "V"), - ("TAA", "*"), - ("TAC", "Y"), - ("TAG", "*"), - ("TAT", "Y"), - ("TCA", "S"), - ("TCC", "S"), - ("TCG", "S"), - ("TCT", "S"), - // caveat lector - ("TGA", "U"), - ("TGC", "C"), - ("TGG", "W"), - ("TGT", "C"), - ("TTA", "L"), - ("TTC", "F"), - ("TTG", "L"), - ("TTT", "F"), - // degenerate codons - ("AAR", "K"), - ("AAY", "N"), - ("ACB", "T"), - ("ACD", "T"), - ("ACH", "T"), - ("ACK", "T"), - ("ACM", "T"), - ("ACN", "T"), - ("ACR", "T"), - ("ACS", "T"), - ("ACV", "T"), - ("ACW", "T"), - ("ACY", "T"), - ("AGR", "R"), - ("AGY", "S"), - ("ATH", "I"), - ("ATM", "I"), - ("ATW", "I"), - ("ATY", "I"), - ("CAR", "Q"), - ("CAY", "H"), - ("CCB", "P"), - ("CCD", "P"), - ("CCH", "P"), - ("CCK", "P"), - ("CCM", "P"), - ("CCN", "P"), - ("CCR", "P"), - ("CCS", "P"), - ("CCV", "P"), - ("CCW", "P"), - ("CCY", "P"), - ("CGB", "R"), - ("CGD", "R"), - ("CGH", "R"), - ("CGK", "R"), - ("CGM", "R"), - ("CGN", "R"), - ("CGR", "R"), - ("CGS", "R"), - ("CGV", "R"), - ("CGW", "R"), - ("CGY", "R"), - ("CTB", "L"), - ("CTD", "L"), - ("CTH", "L"), - ("CTK", "L"), - ("CTM", "L"), - ("CTN", "L"), - ("CTR", "L"), - ("CTS", "L"), - ("CTV", "L"), - ("CTW", "L"), - ("CTY", "L"), - ("GAR", "E"), - 
("GAY", "D"), - ("GCB", "A"), - ("GCD", "A"), - ("GCH", "A"), - ("GCK", "A"), - ("GCM", "A"), - ("GCN", "A"), - ("GCR", "A"), - ("GCS", "A"), - ("GCV", "A"), - ("GCW", "A"), - ("GCY", "A"), - ("GGB", "G"), - ("GGD", "G"), - ("GGH", "G"), - ("GGK", "G"), - ("GGM", "G"), - ("GGN", "G"), - ("GGR", "G"), - ("GGS", "G"), - ("GGV", "G"), - ("GGW", "G"), - ("GGY", "G"), - ("GTB", "V"), - ("GTD", "V"), - ("GTH", "V"), - ("GTK", "V"), - ("GTM", "V"), - ("GTN", "V"), - ("GTR", "V"), - ("GTS", "V"), - ("GTV", "V"), - ("GTW", "V"), - ("GTY", "V"), - ("MGA", "R"), - ("MGG", "R"), - ("MGR", "R"), - ("TAR", "*"), - ("TAY", "Y"), - ("TCB", "S"), - ("TCD", "S"), - ("TCH", "S"), - ("TCK", "S"), - ("TCM", "S"), - ("TCN", "S"), - ("TCR", "S"), - ("TCS", "S"), - ("TCV", "S"), - ("TCW", "S"), - ("TCY", "S"), - ("TGY", "C"), - ("TRA", "*"), - ("TTR", "L"), - ("TTY", "F"), - ("YTA", "L"), - ("YTG", "L"), - ("YTR", "L"), -]; - -/// Vertebrate mitochondrial code, cf. https://en.wikipedia.org/wiki/Vertebrate_mitochondrial_code -const DNA_TO_AA1_CHRMT_VERTEBRATE_VEC: &[(&str, &str)] = &[ - ("AAA", "K"), - ("AAC", "N"), - ("AAG", "K"), - ("AAT", "N"), - ("ACA", "T"), - ("ACC", "T"), - ("ACG", "T"), - ("ACT", "T"), - // caveat lector - ("AGA", "*"), - ("AGC", "S"), - // caveat lector - ("AGG", "*"), - ("AGT", "S"), - // caveat lector - ("ATA", "M"), - ("ATC", "I"), - ("ATG", "M"), - ("ATT", "I"), - ("CAA", "Q"), - ("CAC", "H"), - ("CAG", "Q"), - ("CAT", "H"), - ("CCA", "P"), - ("CCC", "P"), - ("CCG", "P"), - ("CCT", "P"), - ("CGA", "R"), - ("CGC", "R"), - ("CGG", "R"), - ("CGT", "R"), - ("CTA", "L"), - ("CTC", "L"), - ("CTG", "L"), - ("CTT", "L"), - ("GAA", "E"), - ("GAC", "D"), - ("GAG", "E"), - ("GAT", "D"), - ("GCA", "A"), - ("GCC", "A"), - ("GCG", "A"), - ("GCT", "A"), - ("GGA", "G"), - ("GGC", "G"), - ("GGG", "G"), - ("GGT", "G"), - ("GTA", "V"), - ("GTC", "V"), - ("GTG", "V"), - ("GTT", "V"), - ("TAA", "*"), - ("TAC", "Y"), - ("TAG", "*"), - ("TAT", "Y"), - ("TCA", "S"), - ("TCC", "S"), - 
("TCG", "S"), - ("TCT", "S"), - // caveat lector - ("TGA", "W"), - ("TGC", "C"), - ("TGG", "W"), - ("TGT", "C"), - ("TTA", "L"), - ("TTC", "F"), - ("TTG", "L"), - ("TTT", "F"), - // degenerate codons - ("AAR", "K"), - ("AAY", "N"), - ("ACB", "T"), - ("ACD", "T"), - ("ACH", "T"), - ("ACK", "T"), - ("ACM", "T"), - ("ACN", "T"), - ("ACR", "T"), - ("ACS", "T"), - ("ACV", "T"), - ("ACW", "T"), - ("ACY", "T"), - ("AGR", "R"), - ("AGY", "S"), - ("ATH", "I"), - ("ATM", "I"), - ("ATW", "I"), - ("ATY", "I"), - ("CAR", "Q"), - ("CAY", "H"), - ("CCB", "P"), - ("CCD", "P"), - ("CCH", "P"), - ("CCK", "P"), - ("CCM", "P"), - ("CCN", "P"), - ("CCR", "P"), - ("CCS", "P"), - ("CCV", "P"), - ("CCW", "P"), - ("CCY", "P"), - ("CGB", "R"), - ("CGD", "R"), - ("CGH", "R"), - ("CGK", "R"), - ("CGM", "R"), - ("CGN", "R"), - ("CGR", "R"), - ("CGS", "R"), - ("CGV", "R"), - ("CGW", "R"), - ("CGY", "R"), - ("CTB", "L"), - ("CTD", "L"), - ("CTH", "L"), - ("CTK", "L"), - ("CTM", "L"), - ("CTN", "L"), - ("CTR", "L"), - ("CTS", "L"), - ("CTV", "L"), - ("CTW", "L"), - ("CTY", "L"), - ("GAR", "E"), - ("GAY", "D"), - ("GCB", "A"), - ("GCD", "A"), - ("GCH", "A"), - ("GCK", "A"), - ("GCM", "A"), - ("GCN", "A"), - ("GCR", "A"), - ("GCS", "A"), - ("GCV", "A"), - ("GCW", "A"), - ("GCY", "A"), - ("GGB", "G"), - ("GGD", "G"), - ("GGH", "G"), - ("GGK", "G"), - ("GGM", "G"), - ("GGN", "G"), - ("GGR", "G"), - ("GGS", "G"), - ("GGV", "G"), - ("GGW", "G"), - ("GGY", "G"), - ("GTB", "V"), - ("GTD", "V"), - ("GTH", "V"), - ("GTK", "V"), - ("GTM", "V"), - ("GTN", "V"), - ("GTR", "V"), - ("GTS", "V"), - ("GTV", "V"), - ("GTW", "V"), - ("GTY", "V"), - ("MGA", "R"), - ("MGG", "R"), - ("MGR", "R"), - ("TAR", "*"), - ("TAY", "Y"), - ("TCB", "S"), - ("TCD", "S"), - ("TCH", "S"), - ("TCK", "S"), - ("TCM", "S"), - ("TCN", "S"), - ("TCR", "S"), - ("TCS", "S"), - ("TCV", "S"), - ("TCW", "S"), - ("TCY", "S"), - ("TGY", "C"), - ("TRA", "*"), - ("TTR", "L"), - ("TTY", "F"), - ("YTA", "L"), - ("YTG", "L"), - ("YTR", "L"), -]; - 
-/// Generated via: -/// ```rust,no_run -/// const _: &str = stringify!{ -/// let mut result = [0; 64]; -/// for (i, (dna3, aa1)) in DNA_TO_AA1_LUT_VEC.iter().enumerate() { -/// if i > 63 { -/// break; // skip degenerate codons -/// } -/// let dna3_2bit = dna3_to_2bit(dna3.as_bytes()).expect("invalid dna3"); -/// result[dna3_2bit as usize] = aa1.as_bytes()[0]; -/// } -/// }; -/// ``` -/// -const CODON_2BIT_TO_AA1_LUT: [u8; 64] = [ - 75, 78, 75, 78, 84, 84, 84, 84, 82, 83, 82, 83, 73, 73, 77, 73, 81, 72, 81, 72, 80, 80, 80, 80, - 82, 82, 82, 82, 76, 76, 76, 76, 69, 68, 69, 68, 65, 65, 65, 65, 71, 71, 71, 71, 86, 86, 86, 86, - 42, 89, 42, 89, 83, 83, 83, 83, 42, 67, 87, 67, 76, 70, 76, 70, -]; - -lazy_static::lazy_static! { - static ref AA1_TO_AA3: AHashMap<&'static [u8], &'static str> = { - let mut m = AHashMap::default(); - for (aa3, aa1) in AA3_TO_AA1_VEC.iter() { - m.insert(aa1.as_bytes(), *aa3); - } - m - }; - - static ref AA3_TO_AA1: AHashMap<&'static [u8], &'static str> = { - let mut m = AHashMap::default(); - for (aa3, aa1) in AA3_TO_AA1_VEC.iter() { - m.insert(aa3.as_bytes(), *aa1); - } - m - }; - - static ref DNA_TO_AA1_LUT: AHashMap = { - let mut m = AHashMap::default(); - for (dna, aa1) in DNA_TO_AA1_LUT_VEC { - assert_eq!(dna.len(), 3); - let d = dna.as_bytes(); - m.insert([d[0], d[1], d[2]], aa1.as_bytes()[0]); - } - m - }; - - static ref DNA_TO_AA1_SEC: AHashMap = { - let mut m = AHashMap::default(); - for (dna, aa1) in DNA_TO_AA1_SEC_VEC { - assert_eq!(dna.len(), 3); - let d = dna.as_bytes(); - m.insert([d[0], d[1], d[2]], aa1.as_bytes()[0]); - } - m - }; - - static ref DNA_TO_AA1_CHRMT_VERTEBRATE: AHashMap = { - let mut m = AHashMap::default(); - for (dna, aa1) in DNA_TO_AA1_CHRMT_VERTEBRATE_VEC { - assert_eq!(dna.len(), 3); - let d = dna.as_bytes(); - m.insert([d[0], d[1], d[2]], aa1.as_bytes()[0]); - } - m - }; - - static ref CODON_2BIT_TO_AA1_SEC: [u8; 64] = { - let mut result = [0; 64]; - for (i, (dna3, aa1)) in 
DNA_TO_AA1_SEC_VEC.iter().enumerate() { - if i > 63 { - break; // skip degenerate codons - } - let dna3_2bit = dna3_to_2bit(dna3.as_bytes()).expect("invalid dna3"); - result[dna3_2bit as usize] = aa1.as_bytes()[0]; - } - result - }; - - static ref CODON_2BIT_TO_AA1_CHRMT_VERTEBRATE: [u8; 64] = { - let mut result = [0; 64]; - for (i, (dna3, aa1)) in DNA_TO_AA1_CHRMT_VERTEBRATE_VEC.iter().enumerate() { - if i > 63 { - break; // skip degenerate codons - } - let dna3_2bit = dna3_to_2bit(dna3.as_bytes()).expect("invalid dna3"); - result[dna3_2bit as usize] = aa1.as_bytes()[0]; - } - result - }; -} - -const IUPAC_AMBIGUITY_CODES: [u8; 13] = *b"BDHVNUWSMKRYZ"; - /// Allow selection of translation table. #[derive( Debug, @@ -893,8 +166,8 @@ pub fn aa1_to_aa3(seq: &str) -> Result { let mut result = String::with_capacity(seq.len() * 3); - for (i, aa1) in seq.as_bytes().chunks(1).enumerate() { - let aa3 = AA1_TO_AA3.get(aa1).ok_or_else(|| { + for (i, aa1) in seq.as_bytes().iter().enumerate() { + let aa3 = AA1_TO_AA3_STR[*aa1 as usize].ok_or_else(|| { Error::InvalidOneLetterAminoAcid(format!("{:?}", aa1), format!("{}", i + 1)) })?; result.push_str(aa3); @@ -923,11 +196,10 @@ pub fn aa3_to_aa1(seq: &str) -> Result { let mut result = String::with_capacity(seq.len() / 3); for (i, aa3) in seq.as_bytes().chunks(3).enumerate() { - let aa1 = AA3_TO_AA1.get(aa3).ok_or(Error::InvalidOneLetterAminoAcid( - format!("{:?}", aa3), - format!("{}", i + 1), - ))?; - result.push_str(aa1); + let aa1 = _aa3_to_aa1(aa3).ok_or_else(|| { + Error::InvalidThreeLetterAminoAcid(format!("{:?}", aa3), format!("{}", i + 1)) + })? as char; + result.push(aa1); } Ok(result) @@ -955,13 +227,16 @@ type Codon = [u8; 3]; struct CodonTranslator { /// Mapping for "normalizing" DNA ASCII character (to upper case and `U -> T`). dna_ascii_map: [u8; 256], + /// Mapping from DNA ASCII to 2-bit representation. dna_ascii_to_2bit: [u8; 256], + /// IUPAC ambiguity codes. 
iupac_ambiguity_codes: [u8; 13], /// Mapping from 2bit DNA codon to amino acid 1-letter ASCII. codon_2bit_to_aa1: [u8; 64], + /// Mapping from DNA 2-bit to amino acid 1-letter ASCII including degenerate codons. full_dna_to_aa1: &'static AHashMap, @@ -969,6 +244,36 @@ struct CodonTranslator { codon: Codon, } +static DNA_TO_AA1_LUT: LazyLock> = LazyLock::new(|| { + let mut m = AHashMap::default(); + for (dna, aa1) in DNA_TO_AA1_LUT_VEC { + assert_eq!(dna.len(), 3); + let d = dna.as_bytes(); + m.insert([d[0], d[1], d[2]], aa1.as_bytes()[0]); + } + m +}); + +static DNA_TO_AA1_SEC: LazyLock> = LazyLock::new(|| { + let mut m = AHashMap::default(); + for (dna, aa1) in DNA_TO_AA1_SEC_VEC { + assert_eq!(dna.len(), 3); + let d = dna.as_bytes(); + m.insert([d[0], d[1], d[2]], aa1.as_bytes()[0]); + } + m +}); + +static DNA_TO_AA1_CHRMT_VERTEBRATE: LazyLock> = LazyLock::new(|| { + let mut m = AHashMap::default(); + for (dna, aa1) in DNA_TO_AA1_CHRMT_VERTEBRATE_VEC { + assert_eq!(dna.len(), 3); + let d = dna.as_bytes(); + m.insert([d[0], d[1], d[2]], aa1.as_bytes()[0]); + } + m +}); + impl CodonTranslator { /// Initialize the struct. 
pub fn new(table: TranslationTable) -> Self { @@ -979,8 +284,8 @@ impl CodonTranslator { codon_2bit_to_aa1: match table { TranslationTable::Standard => CODON_2BIT_TO_AA1_LUT, - TranslationTable::Selenocysteine => *CODON_2BIT_TO_AA1_SEC, - TranslationTable::VertebrateMitochondrial => *CODON_2BIT_TO_AA1_CHRMT_VERTEBRATE, + TranslationTable::Selenocysteine => CODON_2BIT_TO_AA1_SEC, + TranslationTable::VertebrateMitochondrial => CODON_2BIT_TO_AA1_CHRMT_VERTEBRATE, }, full_dna_to_aa1: match table { TranslationTable::Standard => &DNA_TO_AA1_LUT, diff --git a/tables.in b/tables.in new file mode 100644 index 0000000..ec760c2 --- /dev/null +++ b/tables.in @@ -0,0 +1,594 @@ +const DNA_ASCII_TO_2BIT: [u8; 256] = { + let mut result = [255; 256]; + + result[b'A' as usize] = 0; + result[b'a' as usize] = 0; + + result[b'C' as usize] = 1; + result[b'c' as usize] = 1; + + result[b'G' as usize] = 2; + result[b'g' as usize] = 2; + + result[b'T' as usize] = 3; + result[b't' as usize] = 3; + result[b'U' as usize] = 3; + result[b'u' as usize] = 3; + result +}; + +pub const AA3_TO_AA1_VEC: &[(&str, &str)] = &[ + ("Ala", "A"), + ("Arg", "R"), + ("Asn", "N"), + ("Asp", "D"), + ("Cys", "C"), + ("Gln", "Q"), + ("Glu", "E"), + ("Gly", "G"), + ("His", "H"), + ("Ile", "I"), + ("Leu", "L"), + ("Lys", "K"), + ("Met", "M"), + ("Phe", "F"), + ("Pro", "P"), + ("Ser", "S"), + ("Thr", "T"), + ("Trp", "W"), + ("Tyr", "Y"), + ("Val", "V"), + ("Xaa", "X"), + ("Ter", "*"), + ("Sec", "U"), +]; + +const DNA_TO_AA1_LUT_VEC: &[(&str, &str)] = &[ + ("AAA", "K"), + ("AAC", "N"), + ("AAG", "K"), + ("AAT", "N"), + ("ACA", "T"), + ("ACC", "T"), + ("ACG", "T"), + ("ACT", "T"), + ("AGA", "R"), + ("AGC", "S"), + ("AGG", "R"), + ("AGT", "S"), + ("ATA", "I"), + ("ATC", "I"), + ("ATG", "M"), + ("ATT", "I"), + ("CAA", "Q"), + ("CAC", "H"), + ("CAG", "Q"), + ("CAT", "H"), + ("CCA", "P"), + ("CCC", "P"), + ("CCG", "P"), + ("CCT", "P"), + ("CGA", "R"), + ("CGC", "R"), + ("CGG", "R"), + ("CGT", "R"), + ("CTA", "L"), + 
("CTC", "L"), + ("CTG", "L"), + ("CTT", "L"), + ("GAA", "E"), + ("GAC", "D"), + ("GAG", "E"), + ("GAT", "D"), + ("GCA", "A"), + ("GCC", "A"), + ("GCG", "A"), + ("GCT", "A"), + ("GGA", "G"), + ("GGC", "G"), + ("GGG", "G"), + ("GGT", "G"), + ("GTA", "V"), + ("GTC", "V"), + ("GTG", "V"), + ("GTT", "V"), + ("TAA", "*"), + ("TAC", "Y"), + ("TAG", "*"), + ("TAT", "Y"), + ("TCA", "S"), + ("TCC", "S"), + ("TCG", "S"), + ("TCT", "S"), + // caveat lector + ("TGA", "*"), + ("TGC", "C"), + ("TGG", "W"), + ("TGT", "C"), + ("TTA", "L"), + ("TTC", "F"), + ("TTG", "L"), + ("TTT", "F"), + // degenerate codons + ("AAR", "K"), + ("AAY", "N"), + ("ACB", "T"), + ("ACD", "T"), + ("ACH", "T"), + ("ACK", "T"), + ("ACM", "T"), + ("ACN", "T"), + ("ACR", "T"), + ("ACS", "T"), + ("ACV", "T"), + ("ACW", "T"), + ("ACY", "T"), + ("AGR", "R"), + ("AGY", "S"), + ("ATH", "I"), + ("ATM", "I"), + ("ATW", "I"), + ("ATY", "I"), + ("CAR", "Q"), + ("CAY", "H"), + ("CCB", "P"), + ("CCD", "P"), + ("CCH", "P"), + ("CCK", "P"), + ("CCM", "P"), + ("CCN", "P"), + ("CCR", "P"), + ("CCS", "P"), + ("CCV", "P"), + ("CCW", "P"), + ("CCY", "P"), + ("CGB", "R"), + ("CGD", "R"), + ("CGH", "R"), + ("CGK", "R"), + ("CGM", "R"), + ("CGN", "R"), + ("CGR", "R"), + ("CGS", "R"), + ("CGV", "R"), + ("CGW", "R"), + ("CGY", "R"), + ("CTB", "L"), + ("CTD", "L"), + ("CTH", "L"), + ("CTK", "L"), + ("CTM", "L"), + ("CTN", "L"), + ("CTR", "L"), + ("CTS", "L"), + ("CTV", "L"), + ("CTW", "L"), + ("CTY", "L"), + ("GAR", "E"), + ("GAY", "D"), + ("GCB", "A"), + ("GCD", "A"), + ("GCH", "A"), + ("GCK", "A"), + ("GCM", "A"), + ("GCN", "A"), + ("GCR", "A"), + ("GCS", "A"), + ("GCV", "A"), + ("GCW", "A"), + ("GCY", "A"), + ("GGB", "G"), + ("GGD", "G"), + ("GGH", "G"), + ("GGK", "G"), + ("GGM", "G"), + ("GGN", "G"), + ("GGR", "G"), + ("GGS", "G"), + ("GGV", "G"), + ("GGW", "G"), + ("GGY", "G"), + ("GTB", "V"), + ("GTD", "V"), + ("GTH", "V"), + ("GTK", "V"), + ("GTM", "V"), + ("GTN", "V"), + ("GTR", "V"), + ("GTS", "V"), + ("GTV", "V"), + 
("GTW", "V"), + ("GTY", "V"), + ("MGA", "R"), + ("MGG", "R"), + ("MGR", "R"), + ("TAR", "*"), + ("TAY", "Y"), + ("TCB", "S"), + ("TCD", "S"), + ("TCH", "S"), + ("TCK", "S"), + ("TCM", "S"), + ("TCN", "S"), + ("TCR", "S"), + ("TCS", "S"), + ("TCV", "S"), + ("TCW", "S"), + ("TCY", "S"), + ("TGY", "C"), + ("TRA", "*"), + ("TTR", "L"), + ("TTY", "F"), + ("YTA", "L"), + ("YTG", "L"), + ("YTR", "L"), +]; + +/// Translation table for selenocysteine. +const DNA_TO_AA1_SEC_VEC: &[(&str, &str)] = &[ + ("AAA", "K"), + ("AAC", "N"), + ("AAG", "K"), + ("AAT", "N"), + ("ACA", "T"), + ("ACC", "T"), + ("ACG", "T"), + ("ACT", "T"), + ("AGA", "R"), + ("AGC", "S"), + ("AGG", "R"), + ("AGT", "S"), + ("ATA", "I"), + ("ATC", "I"), + ("ATG", "M"), + ("ATT", "I"), + ("CAA", "Q"), + ("CAC", "H"), + ("CAG", "Q"), + ("CAT", "H"), + ("CCA", "P"), + ("CCC", "P"), + ("CCG", "P"), + ("CCT", "P"), + ("CGA", "R"), + ("CGC", "R"), + ("CGG", "R"), + ("CGT", "R"), + ("CTA", "L"), + ("CTC", "L"), + ("CTG", "L"), + ("CTT", "L"), + ("GAA", "E"), + ("GAC", "D"), + ("GAG", "E"), + ("GAT", "D"), + ("GCA", "A"), + ("GCC", "A"), + ("GCG", "A"), + ("GCT", "A"), + ("GGA", "G"), + ("GGC", "G"), + ("GGG", "G"), + ("GGT", "G"), + ("GTA", "V"), + ("GTC", "V"), + ("GTG", "V"), + ("GTT", "V"), + ("TAA", "*"), + ("TAC", "Y"), + ("TAG", "*"), + ("TAT", "Y"), + ("TCA", "S"), + ("TCC", "S"), + ("TCG", "S"), + ("TCT", "S"), + // caveat lector + ("TGA", "U"), + ("TGC", "C"), + ("TGG", "W"), + ("TGT", "C"), + ("TTA", "L"), + ("TTC", "F"), + ("TTG", "L"), + ("TTT", "F"), + // degenerate codons + ("AAR", "K"), + ("AAY", "N"), + ("ACB", "T"), + ("ACD", "T"), + ("ACH", "T"), + ("ACK", "T"), + ("ACM", "T"), + ("ACN", "T"), + ("ACR", "T"), + ("ACS", "T"), + ("ACV", "T"), + ("ACW", "T"), + ("ACY", "T"), + ("AGR", "R"), + ("AGY", "S"), + ("ATH", "I"), + ("ATM", "I"), + ("ATW", "I"), + ("ATY", "I"), + ("CAR", "Q"), + ("CAY", "H"), + ("CCB", "P"), + ("CCD", "P"), + ("CCH", "P"), + ("CCK", "P"), + ("CCM", "P"), + ("CCN", "P"), + 
("CCR", "P"), + ("CCS", "P"), + ("CCV", "P"), + ("CCW", "P"), + ("CCY", "P"), + ("CGB", "R"), + ("CGD", "R"), + ("CGH", "R"), + ("CGK", "R"), + ("CGM", "R"), + ("CGN", "R"), + ("CGR", "R"), + ("CGS", "R"), + ("CGV", "R"), + ("CGW", "R"), + ("CGY", "R"), + ("CTB", "L"), + ("CTD", "L"), + ("CTH", "L"), + ("CTK", "L"), + ("CTM", "L"), + ("CTN", "L"), + ("CTR", "L"), + ("CTS", "L"), + ("CTV", "L"), + ("CTW", "L"), + ("CTY", "L"), + ("GAR", "E"), + ("GAY", "D"), + ("GCB", "A"), + ("GCD", "A"), + ("GCH", "A"), + ("GCK", "A"), + ("GCM", "A"), + ("GCN", "A"), + ("GCR", "A"), + ("GCS", "A"), + ("GCV", "A"), + ("GCW", "A"), + ("GCY", "A"), + ("GGB", "G"), + ("GGD", "G"), + ("GGH", "G"), + ("GGK", "G"), + ("GGM", "G"), + ("GGN", "G"), + ("GGR", "G"), + ("GGS", "G"), + ("GGV", "G"), + ("GGW", "G"), + ("GGY", "G"), + ("GTB", "V"), + ("GTD", "V"), + ("GTH", "V"), + ("GTK", "V"), + ("GTM", "V"), + ("GTN", "V"), + ("GTR", "V"), + ("GTS", "V"), + ("GTV", "V"), + ("GTW", "V"), + ("GTY", "V"), + ("MGA", "R"), + ("MGG", "R"), + ("MGR", "R"), + ("TAR", "*"), + ("TAY", "Y"), + ("TCB", "S"), + ("TCD", "S"), + ("TCH", "S"), + ("TCK", "S"), + ("TCM", "S"), + ("TCN", "S"), + ("TCR", "S"), + ("TCS", "S"), + ("TCV", "S"), + ("TCW", "S"), + ("TCY", "S"), + ("TGY", "C"), + ("TRA", "*"), + ("TTR", "L"), + ("TTY", "F"), + ("YTA", "L"), + ("YTG", "L"), + ("YTR", "L"), +]; + +/// Vertebrate mitochondrial code, cf. 
https://en.wikipedia.org/wiki/Vertebrate_mitochondrial_code +const DNA_TO_AA1_CHRMT_VERTEBRATE_VEC: &[(&str, &str)] = &[ + ("AAA", "K"), + ("AAC", "N"), + ("AAG", "K"), + ("AAT", "N"), + ("ACA", "T"), + ("ACC", "T"), + ("ACG", "T"), + ("ACT", "T"), + // caveat lector + ("AGA", "*"), + ("AGC", "S"), + // caveat lector + ("AGG", "*"), + ("AGT", "S"), + // caveat lector + ("ATA", "M"), + ("ATC", "I"), + ("ATG", "M"), + ("ATT", "I"), + ("CAA", "Q"), + ("CAC", "H"), + ("CAG", "Q"), + ("CAT", "H"), + ("CCA", "P"), + ("CCC", "P"), + ("CCG", "P"), + ("CCT", "P"), + ("CGA", "R"), + ("CGC", "R"), + ("CGG", "R"), + ("CGT", "R"), + ("CTA", "L"), + ("CTC", "L"), + ("CTG", "L"), + ("CTT", "L"), + ("GAA", "E"), + ("GAC", "D"), + ("GAG", "E"), + ("GAT", "D"), + ("GCA", "A"), + ("GCC", "A"), + ("GCG", "A"), + ("GCT", "A"), + ("GGA", "G"), + ("GGC", "G"), + ("GGG", "G"), + ("GGT", "G"), + ("GTA", "V"), + ("GTC", "V"), + ("GTG", "V"), + ("GTT", "V"), + ("TAA", "*"), + ("TAC", "Y"), + ("TAG", "*"), + ("TAT", "Y"), + ("TCA", "S"), + ("TCC", "S"), + ("TCG", "S"), + ("TCT", "S"), + // caveat lector + ("TGA", "W"), + ("TGC", "C"), + ("TGG", "W"), + ("TGT", "C"), + ("TTA", "L"), + ("TTC", "F"), + ("TTG", "L"), + ("TTT", "F"), + // degenerate codons + ("AAR", "K"), + ("AAY", "N"), + ("ACB", "T"), + ("ACD", "T"), + ("ACH", "T"), + ("ACK", "T"), + ("ACM", "T"), + ("ACN", "T"), + ("ACR", "T"), + ("ACS", "T"), + ("ACV", "T"), + ("ACW", "T"), + ("ACY", "T"), + ("AGR", "R"), + ("AGY", "S"), + ("ATH", "I"), + ("ATM", "I"), + ("ATW", "I"), + ("ATY", "I"), + ("CAR", "Q"), + ("CAY", "H"), + ("CCB", "P"), + ("CCD", "P"), + ("CCH", "P"), + ("CCK", "P"), + ("CCM", "P"), + ("CCN", "P"), + ("CCR", "P"), + ("CCS", "P"), + ("CCV", "P"), + ("CCW", "P"), + ("CCY", "P"), + ("CGB", "R"), + ("CGD", "R"), + ("CGH", "R"), + ("CGK", "R"), + ("CGM", "R"), + ("CGN", "R"), + ("CGR", "R"), + ("CGS", "R"), + ("CGV", "R"), + ("CGW", "R"), + ("CGY", "R"), + ("CTB", "L"), + ("CTD", "L"), + ("CTH", "L"), + ("CTK", "L"), + 
("CTM", "L"), + ("CTN", "L"), + ("CTR", "L"), + ("CTS", "L"), + ("CTV", "L"), + ("CTW", "L"), + ("CTY", "L"), + ("GAR", "E"), + ("GAY", "D"), + ("GCB", "A"), + ("GCD", "A"), + ("GCH", "A"), + ("GCK", "A"), + ("GCM", "A"), + ("GCN", "A"), + ("GCR", "A"), + ("GCS", "A"), + ("GCV", "A"), + ("GCW", "A"), + ("GCY", "A"), + ("GGB", "G"), + ("GGD", "G"), + ("GGH", "G"), + ("GGK", "G"), + ("GGM", "G"), + ("GGN", "G"), + ("GGR", "G"), + ("GGS", "G"), + ("GGV", "G"), + ("GGW", "G"), + ("GGY", "G"), + ("GTB", "V"), + ("GTD", "V"), + ("GTH", "V"), + ("GTK", "V"), + ("GTM", "V"), + ("GTN", "V"), + ("GTR", "V"), + ("GTS", "V"), + ("GTV", "V"), + ("GTW", "V"), + ("GTY", "V"), + ("MGA", "R"), + ("MGG", "R"), + ("MGR", "R"), + ("TAR", "*"), + ("TAY", "Y"), + ("TCB", "S"), + ("TCD", "S"), + ("TCH", "S"), + ("TCK", "S"), + ("TCM", "S"), + ("TCN", "S"), + ("TCR", "S"), + ("TCS", "S"), + ("TCV", "S"), + ("TCW", "S"), + ("TCY", "S"), + ("TGY", "C"), + ("TRA", "*"), + ("TTR", "L"), + ("TTY", "F"), + ("YTA", "L"), + ("YTG", "L"), + ("YTR", "L"), +]; + +const IUPAC_AMBIGUITY_CODES: [u8; 13] = *b"BDHVNUWSMKRYZ";