From 3ef2c3d887aae5af0abb0e5880b540ce2cf800db Mon Sep 17 00:00:00 2001 From: Lorenzo Cimini Date: Tue, 23 Jan 2024 11:04:50 +0100 Subject: [PATCH] radix and fidelity became parameters --- Cargo.toml | 3 +- examples/bench_bvgraph.rs | 8 +- src/ans/decoder.rs | 4 +- src/ans/mod.rs | 20 +- src/ans/model4decoder.rs | 4 +- src/bin/bvcomp.rs | 16 +- src/bin/bvtest.rs | 10 +- src/bvgraph/mock_writers.rs | 301 ++++++++------- src/bvgraph/mod.rs | 24 +- src/bvgraph/reader.rs | 44 +-- src/bvgraph/writer.rs | 206 +++++----- src/lib.rs | 19 +- src/multi_model_ans/decoder.rs | 138 ++++--- src/multi_model_ans/encoder.rs | 100 +++-- src/multi_model_ans/mod.rs | 21 +- src/multi_model_ans/model4decoder.rs | 231 ++--------- src/multi_model_ans/model4encoder.rs | 18 +- src/multi_model_ans/model4encoder_builder.rs | 56 +-- src/utils/ans_utilities.rs | 1 - tests/compressor_tests.rs | 379 ++++--------------- tests/decoder_model_tests.rs | 37 +- tests/encoder_model_tests.rs | 42 -- tests/test_bvgraph.rs | 92 ++--- tests/utils/mod.rs | 15 +- 24 files changed, 683 insertions(+), 1106 deletions(-) delete mode 100644 tests/encoder_model_tests.rs diff --git a/Cargo.toml b/Cargo.toml index 4ac6403..b0e113e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,9 +22,8 @@ env_logger = "0.10.1" tempfile = "3.9.0" epserde = "0.2.1" mem_dbg = "0.1.3" -lender = "0.2.9" dsi-progress-logger = "0.2.2" -thiserror = "1.0" + [dev-dependencies] criterion = "0.5.1" pprof = { version = "0.13.0", features = ["flamegraph", "criterion", "frame-pointer"] } diff --git a/examples/bench_bvgraph.rs b/examples/bench_bvgraph.rs index 3d8dcd7..b6518f0 100644 --- a/examples/bench_bvgraph.rs +++ b/examples/bench_bvgraph.rs @@ -1,3 +1,4 @@ +/* use epserde::prelude::{Deserialize, Serialize}; use webgraph::prelude::{BVComp, BVGraph, EmptyDict, RandomAccessLabelling, SequentialLabelling}; @@ -7,7 +8,7 @@ use folded_streaming_rans::bvgraph::writer::{BVGraphModelBuilder, BVGraphWriter} use folded_streaming_rans::multi_model_ans::Prelude; use anyhow::Result; -use folded_streaming_rans::bvgraph::mock_writers::{ANSymbolTable, EntropyMockWriter, Log2MockWriter, MockWriter}; +use folded_streaming_rans::bvgraph::mock_writers::{ANSymbolTable, EntropyMockWriter, Log2MockWriter}; use folded_streaming_rans::multi_model_ans::encoder::ANSCompressorPhase; const NODES: usize = 325557; @@ -120,4 +121,7 @@ fn main() -> Result<()> { dbg!(now.elapsed().as_nanos() / arcs as u128); Ok(()) -} \ No newline at end of file +} +*/ + +fn main() {} \ No newline at end of file diff --git a/src/ans/decoder.rs b/src/ans/decoder.rs index 1d0ad14..d50e56c 100644 --- a/src/ans/decoder.rs +++ b/src/ans/decoder.rs @@ -1,8 +1,8 @@ use crate::ans::model4decoder::VecFrame; -use crate::ans::{Prelude, K_LOG2}; +use crate::ans::{Prelude, K_LOG2, DecoderModelEntry}; use crate::traits::folding::FoldRead; use crate::traits::quasi::Quasi; -use crate::{DecoderModelEntry, RawSymbol, State, FASTER_RADIX, LOG2_B}; +use crate::{RawSymbol, State, FASTER_RADIX, LOG2_B}; use epserde::traits::ZeroCopy; use std::ops::Index; diff --git a/src/ans/mod.rs b/src/ans/mod.rs index 975fbe5..7a72d95 100644 --- a/src/ans/mod.rs +++ b/src/ans/mod.rs @@ -1,11 +1,14 @@ +use epserde::Epserde; +use epserde::prelude::ZeroCopy; use crate::traits::folding::FoldRead; use crate::{Freq, State}; use strength_reduce::StrengthReducedU64; +use crate::traits::quasi::Quasi; -mod decoder; -mod encoder; -mod model4decoder; -mod model4encoder; +pub mod decoder; +pub mod encoder; +pub mod model4decoder; +pub mod model4encoder; pub const K: usize = 16; pub const K_LOG2: usize = 4; @@ -62,3 +65,12 @@ impl From<(Freq, u64, Freq)> for EncoderModelEntry { } } } + +#[derive(Clone, Copy, Debug, Default, Epserde)] +#[repr(C)] +#[zero_copy] +pub struct DecoderModelEntry + ZeroCopy + 'static> { + pub freq: Freq, + pub cumul_freq: Freq, + pub quasi_folded: T, +} diff --git a/src/ans/model4decoder.rs b/src/ans/model4decoder.rs index 7ea0408..049889c 100644 --- a/src/ans/model4decoder.rs +++ b/src/ans/model4decoder.rs @@ -1,10 +1,10 @@ use crate::traits::quasi::Quasi; -use crate::{DecoderModelEntry, State, Symbol}; +use crate::{State, Symbol}; use epserde::prelude::*; use std::ops::Index; use sucds::bit_vectors::{Rank, Rank9Sel}; use sux::prelude::*; -use crate::ans::EncoderModelEntry; +use crate::ans::{DecoderModelEntry, EncoderModelEntry}; #[derive(Epserde)] pub struct EliasFanoFrame diff --git a/src/bin/bvcomp.rs b/src/bin/bvcomp.rs index 72984d8..166cabd 100644 --- a/src/bin/bvcomp.rs +++ b/src/bin/bvcomp.rs @@ -1,3 +1,4 @@ +/* use std::path::PathBuf; use anyhow::Result; use clap::{Parser}; @@ -8,7 +9,7 @@ use folded_streaming_rans::bvgraph::mock_writers::{ANSymbolTable, EntropyMockWri use folded_streaming_rans::bvgraph::writer::{BVGraphModelBuilder, BVGraphWriter}; const FIDELITY: usize = 2; -const RADIX: usize = 4; +const RADIX: usize = 6; #[derive(Parser, Debug)] #[command(about = "Recompress a BVGraph", long_about = None)] @@ -46,7 +47,7 @@ pub fn main() -> Result<()> { FIDELITY, RADIX, Log2MockWriter - >>::new(model_builder, 7, 2, 3, 0); + >>::new(model_builder, 16, 2, 2147483647, 0); bvcomp.extend(seq_graph.iter())?; @@ -67,7 +68,7 @@ pub fn main() -> Result<()> { FIDELITY, RADIX, EntropyMockWriter - >>::new(model_builder, 7, 2, 3, 0); + >>::new(model_builder, 16, 2, 2147483647, 0); bvcomp.extend(seq_graph.iter())?; @@ -75,9 +76,9 @@ pub fn main() -> Result<()> { let mut bvcomp = BVComp::>::new( BVGraphWriter::new(model4encoder, entropy_costs), - 7, + 16, 2, - 3, + 2147483647, 0 ); @@ -97,4 +98,7 @@ pub fn main() -> Result<()> { Ok(()) -} \ No newline at end of file +} +*/ + +fn main() {} \ No newline at end of file diff --git a/src/bin/bvtest.rs b/src/bin/bvtest.rs index b095525..0994b7e 100644 --- a/src/bin/bvtest.rs +++ b/src/bin/bvtest.rs @@ -1,3 +1,4 @@ +/* use std::{ hint::black_box, path::{PathBuf}, @@ -7,12 +8,6 @@ use anyhow::Result; use clap::Parser; use dsi_progress_logger::*; use epserde::prelude::*; -use folded_streaming_rans::{ - bvgraph::{ - reader::ANSBVGraphReaderBuilder, - }, - multi_model_ans::{encoder::ANSCompressorPhase, Prelude}, -}; use rand::rngs::SmallRng; use rand::Rng; use rand::SeedableRng; @@ -74,3 +69,6 @@ pub fn main() -> Result<()> { Ok(()) } +*/ + +fn main() {} \ No newline at end of file diff --git a/src/bvgraph/mock_writers.rs b/src/bvgraph/mock_writers.rs index 13dc697..d10399d 100644 --- a/src/bvgraph/mock_writers.rs +++ b/src/bvgraph/mock_writers.rs @@ -1,43 +1,41 @@ use std::convert::Infallible; use webgraph::prelude::BVGraphCodesWriter; -use crate::bvgraph::Component; +use crate::bvgraph::BVGraphComponent; use crate::{Freq, MAX_RAW_SYMBOL, Symbol}; use crate::utils::ans_utilities::{folding_without_streaming_out}; -/// All mock writer have to be buildable. -pub trait MockWriter { - fn build(costs_table: ANSymbolTable) -> Self; +/// A trait for those mock writers that can be buildable. +pub trait MockWriter { + + /// Builds a mock writer from a given costs table. + fn build(costs_table: ANSymbolTable, fidelity: usize, radix: usize) -> Self; } #[derive(Clone)] -pub struct ANSymbolTable { +pub struct ANSymbolTable { /// A table containing a list of costs for each model. Each list containing, at index i, the cost of encoding the /// symbol i. pub table: Vec>, - - /// A table containing the frame size for each model, intended as log(M). - frame_sizes: Vec, } -impl ANSymbolTable { +impl ANSymbolTable { - const FOLDING_THRESHOLD: u16 = 1 << (FIDELITY + RADIX - 1); - const FOLDING_OFFSET: u16 = (1 << (FIDELITY - 1)) * ((1 << RADIX) - 1); - - pub fn new(symbol_freqs: Vec>, frame_sizes: Vec) -> Self { - let mut table = Self::initialize_with_binary_cost(symbol_freqs.len()).table; + pub fn new(symbol_freqs: Vec>, frame_sizes: Vec, fidelity: usize, radix: usize) -> Self { + let mut table = Self::initialize_with_binary_cost(fidelity, radix).table; + let folding_threshold = 1u16 << (fidelity + radix - 1); + let folding_offset = ((1u16 << radix) - 1) * (1 << (fidelity - 1)); table .iter_mut() .enumerate() - .for_each(|(model_index, current_table)| { + .for_each(|(component, current_table)| { (0..current_table.len()) .into_iter() .for_each(|symbol| { - let symbol_freq = match symbol_freqs[model_index].get(symbol) { + let symbol_freq = match symbol_freqs[component].get(symbol) { Some(freq) => match *freq { 0 => 1, // we can have 0 frequencies for symbols that exists due to bigger ones _ => *freq, @@ -48,172 +46,189 @@ impl ANSymbolTable current_table[symbol] = Self::calculate_symbol_cost( symbol as Symbol, symbol_freq, - frame_sizes[model_index] + frame_sizes[component], + folding_offset, + folding_threshold, + radix, ); }); }); - Self { - table, - frame_sizes - } + Self { table } } - /// Creates a a table of `model_number` lists, each containing, at the index i, the cost of the symbol i calculated - /// as follow: `cost = log2(freq) + bytes_to_unfold * RADIX`. - pub fn initialize_with_binary_cost(model_number: usize) -> Self { - let max_folded_sym = folding_without_streaming_out(MAX_RAW_SYMBOL, RADIX, FIDELITY); - - let table = (0..model_number) - .into_iter() - .map(|model_index| { + /// Creates a a table of [`BVGraphComponent::COMPONENTS`] lists, each containing, at the index i, the cost of the + /// symbol i calculate as follow: + /// ```text + /// C(x) = (floor(log2(x)) + 1) + (bytes_to_unfold * radix) + /// ``` + pub fn initialize_with_binary_cost(fidelity: usize, radix: usize) -> Self { + let max_folded_sym = folding_without_streaming_out(MAX_RAW_SYMBOL, radix, fidelity); + let folding_threshold = 1u16 << (fidelity + radix - 1); + let folding_offset = ((1u16 << radix) - 1) * (1 << (fidelity - 1)); + + let table = (0..BVGraphComponent::COMPONENTS) + .map(|_component| { (0..max_folded_sym + 1) - .into_iter() - .map(|symbol| { - Self::get_binary_cost(symbol) - }) + .map(|symbol| Self::get_binary_cost(symbol, folding_threshold, folding_offset, radix)) .collect::>() }) .collect::>(); - Self { - table, - frame_sizes: Vec::new(), - } + Self { table } } - fn calculate_symbol_cost(symbol: Symbol, symbol_freq: Freq, frame_size: usize) -> usize { + fn calculate_symbol_cost ( + symbol: Symbol, + freq: Freq, + frame_size: usize, + folding_offset: u16, + folding_threshold:u16, + radix: usize, + ) -> usize + { // we shouldn't have a symbol with frequency 0 since we want to have the cost for each symbol - debug_assert!(symbol_freq != 0); + debug_assert!(freq != 0); - let bytes_to_unfold = match symbol < Self::FOLDING_THRESHOLD { + let bytes_to_unfold = match symbol < folding_threshold { true => 0_u16, - false => (symbol - Self::FOLDING_THRESHOLD) / Self::FOLDING_OFFSET + 1u16, + false => (symbol - folding_threshold) / folding_offset + 1u16, }; - let probability = symbol_freq as f64 / (1u64 << frame_size) as f64; + let probability = freq as f64 / (1u64 << frame_size) as f64; let inverse = 1.0 / probability; let shifted = (inverse * ((1 << 16) as f64)).round() as usize; - shifted + ((bytes_to_unfold as usize * RADIX) * (1 << 16)) + shifted + ((bytes_to_unfold as usize * radix) * (1 << 16)) } - fn get_binary_cost(symbol: Symbol) -> usize { - let bytes_to_unfold = match symbol < Self::FOLDING_THRESHOLD { + fn get_binary_cost(symbol: Symbol, folding_threshold: u16, folding_offset: u16, radix: usize) -> usize { + let bytes_to_unfold = match symbol < folding_threshold { true => 0_usize, - false => ((symbol - Self::FOLDING_THRESHOLD) / Self::FOLDING_OFFSET) as usize + 1_usize, + false => ((symbol - folding_threshold) / folding_offset) as usize + 1_usize, }; - (symbol.checked_ilog2().unwrap_or(0) as usize + 1) + bytes_to_unfold * RADIX + (symbol.checked_ilog2().unwrap_or(0) as usize + 1) + bytes_to_unfold * radix } } #[derive(Clone)] -pub struct EntropyMockWriter { - costs_table: ANSymbolTable, -} +pub struct EntropyMockWriter { + + costs_table: ANSymbolTable, + + fidelity: usize, -impl EntropyMockWriter { - const FOLDING_THRESHOLD: u64 = 1 << (FIDELITY + RADIX - 1); + radix: usize, + + folding_threshold: u64, } -impl MockWriter for EntropyMockWriter { +impl MockWriter for EntropyMockWriter { - fn build(costs_table: ANSymbolTable) -> Self { + fn build(costs_table: ANSymbolTable, fidelity: usize, radix: usize) -> Self { Self { costs_table, + fidelity, + radix, + folding_threshold: 1u64 << (fidelity + radix - 1), } } } -impl BVGraphCodesWriter for EntropyMockWriter { +impl BVGraphCodesWriter for EntropyMockWriter { type Error = Infallible; type MockWriter = Self; // it's essentially a marker fn mock(&self) -> Self::MockWriter { - Self {costs_table: ANSymbolTable::initialize_with_binary_cost(0)} // thus we can return a fake one + Self { + costs_table: ANSymbolTable::initialize_with_binary_cost(1, 1), + fidelity: 0, + radix: 0, + folding_threshold: 0, + } // thus we can return a fake one } fn write_outdegree(&mut self, value: u64) -> Result { - if value < Self::FOLDING_THRESHOLD { - return Ok(self.costs_table.table[Component::Outdegree as usize][value as usize]); + if value < self.folding_threshold { + return Ok(self.costs_table.table[BVGraphComponent::Outdegree as usize][value as usize]); } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.costs_table.table[Component::Outdegree as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.costs_table.table[BVGraphComponent::Outdegree as usize][folded_sym as usize]) } fn write_reference_offset(&mut self, value: u64) -> Result { - if value < Self::FOLDING_THRESHOLD { - return Ok(self.costs_table.table[Component::ReferenceOffset as usize][value as usize]); + if value < self.folding_threshold { + return Ok(self.costs_table.table[BVGraphComponent::ReferenceOffset as usize][value as usize]); } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.costs_table.table[Component::ReferenceOffset as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.costs_table.table[BVGraphComponent::ReferenceOffset as usize][folded_sym as usize]) } fn write_block_count(&mut self, value: u64) -> Result { - if value < Self::FOLDING_THRESHOLD { - return Ok(self.costs_table.table[Component::BlockCount as usize][value as usize]); + if value < self.folding_threshold { + return Ok(self.costs_table.table[BVGraphComponent::BlockCount as usize][value as usize]); } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.costs_table.table[Component::BlockCount as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.costs_table.table[BVGraphComponent::BlockCount as usize][folded_sym as usize]) } fn write_blocks(&mut self, value: u64) -> Result { - if value < Self::FOLDING_THRESHOLD { - return Ok(self.costs_table.table[Component::Blocks as usize][value as usize]); + if value < self.folding_threshold { + return Ok(self.costs_table.table[BVGraphComponent::Blocks as usize][value as usize]); } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.costs_table.table[Component::Blocks as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.costs_table.table[BVGraphComponent::Blocks as usize][folded_sym as usize]) } fn write_interval_count(&mut self, value: u64) -> Result { - if value < Self::FOLDING_THRESHOLD { - return Ok(self.costs_table.table[Component::IntervalCount as usize][value as usize]); + if value < self.folding_threshold { + return Ok(self.costs_table.table[BVGraphComponent::IntervalCount as usize][value as usize]); } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.costs_table.table[Component::IntervalCount as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.costs_table.table[BVGraphComponent::IntervalCount as usize][folded_sym as usize]) } fn write_interval_start(&mut self, value: u64) -> Result { - if value < Self::FOLDING_THRESHOLD { - return Ok(self.costs_table.table[Component::IntervalStart as usize][value as usize]); + if value < self.folding_threshold { + return Ok(self.costs_table.table[BVGraphComponent::IntervalStart as usize][value as usize]); } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.costs_table.table[Component::IntervalStart as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.costs_table.table[BVGraphComponent::IntervalStart as usize][folded_sym as usize]) } fn write_interval_len(&mut self, value: u64) -> Result { - if value < Self::FOLDING_THRESHOLD { - return Ok(self.costs_table.table[Component::IntervalLen as usize][value as usize]); + if value < self.folding_threshold { + return Ok(self.costs_table.table[BVGraphComponent::IntervalLen as usize][value as usize]); } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.costs_table.table[Component::IntervalLen as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.costs_table.table[BVGraphComponent::IntervalLen as usize][folded_sym as usize]) } fn write_first_residual(&mut self, value: u64) -> Result { - if value < Self::FOLDING_THRESHOLD { - return Ok(self.costs_table.table[Component::FirstResidual as usize][value as usize]); + if value < self.folding_threshold { + return Ok(self.costs_table.table[BVGraphComponent::FirstResidual as usize][value as usize]); } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.costs_table.table[Component::FirstResidual as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.costs_table.table[BVGraphComponent::FirstResidual as usize][folded_sym as usize]) } fn write_residual(&mut self, value: u64) -> Result { - if value < Self::FOLDING_THRESHOLD { - return Ok(self.costs_table.table[Component::Residual as usize][value as usize]); + if value < self.folding_threshold { + return Ok(self.costs_table.table[BVGraphComponent::Residual as usize][value as usize]); } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.costs_table.table[Component::Residual as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.costs_table.table[BVGraphComponent::Residual as usize][folded_sym as usize]) } fn flush(&mut self) -> Result<(), Self::Error> { @@ -222,111 +237,121 @@ impl BVGraphCodesWriter for EntropyM } -pub struct Log2MockWriter { - costs_table: ANSymbolTable, -} +pub struct Log2MockWriter { + costs_table: ANSymbolTable, + + fidelity: usize, + + radix: usize, -impl Log2MockWriter { - const FOLDING_THRESHOLD: u64 = 1 << (FIDELITY + RADIX - 1); + folding_threshold: u64, } -impl MockWriter for Log2MockWriter { +impl MockWriter for Log2MockWriter { - fn build(_costs_table: ANSymbolTable) -> Self { + fn build(_costs_table: ANSymbolTable, fidelity: usize, radix: usize) -> Self { Self { - costs_table: ANSymbolTable::initialize_with_binary_cost(9) + costs_table: ANSymbolTable::initialize_with_binary_cost(fidelity, radix), + fidelity, + radix, + folding_threshold: 1 << (fidelity + radix - 1), } } } -impl BVGraphCodesWriter for Log2MockWriter { +impl BVGraphCodesWriter for Log2MockWriter { type Error = Infallible; type MockWriter = Self; // it's essentially a marker fn mock(&self) -> Self::MockWriter { - Log2MockWriter { costs_table: ANSymbolTable::initialize_with_binary_cost(0)} // thus we can return a fake one + Log2MockWriter { + costs_table: ANSymbolTable::initialize_with_binary_cost(1, 1), + radix: 1, + fidelity: 1, + folding_threshold:1 + } // thus we can return a fake one } fn write_outdegree(&mut self, value: u64) -> Result { - if value < Self::FOLDING_THRESHOLD { - return Ok(self.costs_table.table[Component::Outdegree as usize][value as usize]); + if value < self.folding_threshold { + return Ok(self.costs_table.table[BVGraphComponent::Outdegree as usize][value as usize]); } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.costs_table.table[Component::Outdegree as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.costs_table.table[BVGraphComponent::Outdegree as usize][folded_sym as usize]) } fn write_reference_offset(&mut self, value: u64) -> Result { - if value < Self::FOLDING_THRESHOLD { - return Ok(self.costs_table.table[Component::ReferenceOffset as usize][value as usize]); + if value < self.folding_threshold { + return Ok(self.costs_table.table[BVGraphComponent::ReferenceOffset as usize][value as usize]); } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.costs_table.table[Component::ReferenceOffset as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.costs_table.table[BVGraphComponent::ReferenceOffset as usize][folded_sym as usize]) } fn write_block_count(&mut self, value: u64) -> Result { - if value < Self::FOLDING_THRESHOLD { - return Ok(self.costs_table.table[Component::BlockCount as usize][value as usize]); + if value < self.folding_threshold { + return Ok(self.costs_table.table[BVGraphComponent::BlockCount as usize][value as usize]); } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.costs_table.table[Component::BlockCount as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.costs_table.table[BVGraphComponent::BlockCount as usize][folded_sym as usize]) } fn write_blocks(&mut self, value: u64) -> Result { - if value < Self::FOLDING_THRESHOLD { - return Ok(self.costs_table.table[Component::Blocks as usize][value as usize]); + if value < self.folding_threshold { + return Ok(self.costs_table.table[BVGraphComponent::Blocks as usize][value as usize]); } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.costs_table.table[Component::Blocks as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.costs_table.table[BVGraphComponent::Blocks as usize][folded_sym as usize]) } fn write_interval_count(&mut self, value: u64) -> Result { - if value < Self::FOLDING_THRESHOLD { - return Ok(self.costs_table.table[Component::IntervalCount as usize][value as usize]); + if value < self.folding_threshold { + return Ok(self.costs_table.table[BVGraphComponent::IntervalCount as usize][value as usize]); } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.costs_table.table[Component::IntervalCount as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.costs_table.table[BVGraphComponent::IntervalCount as usize][folded_sym as usize]) } fn write_interval_start(&mut self, value: u64) -> Result { - if value < Self::FOLDING_THRESHOLD { - return Ok(self.costs_table.table[Component::IntervalStart as usize][value as usize]); + if value < self.folding_threshold { + return Ok(self.costs_table.table[BVGraphComponent::IntervalStart as usize][value as usize]); } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.costs_table.table[Component::IntervalStart as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.costs_table.table[BVGraphComponent::IntervalStart as usize][folded_sym as usize]) } fn write_interval_len(&mut self, value: u64) -> Result { - if value < Self::FOLDING_THRESHOLD { - return Ok(self.costs_table.table[Component::IntervalLen as usize][value as usize]); + if value < self.folding_threshold { + return Ok(self.costs_table.table[BVGraphComponent::IntervalLen as usize][value as usize]); } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.costs_table.table[Component::IntervalLen as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.costs_table.table[BVGraphComponent::IntervalLen as usize][folded_sym as usize]) } fn write_first_residual(&mut self, value: u64) -> Result { - if value < Self::FOLDING_THRESHOLD { - return Ok(self.costs_table.table[Component::FirstResidual as usize][value as usize]); + if value < self.folding_threshold { + return Ok(self.costs_table.table[BVGraphComponent::FirstResidual as usize][value as usize]); } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.costs_table.table[Component::FirstResidual as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.costs_table.table[BVGraphComponent::FirstResidual as usize][folded_sym as usize]) } fn write_residual(&mut self, value: u64) -> Result { - if value < Self::FOLDING_THRESHOLD { - return Ok(self.costs_table.table[Component::Residual as usize][value as usize]); + if value < self.folding_threshold { + return Ok(self.costs_table.table[BVGraphComponent::Residual as usize][value as usize]); } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.costs_table.table[Component::Residual as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.costs_table.table[BVGraphComponent::Residual as usize][folded_sym as usize]) } fn flush(&mut self) -> Result<(), Self::Error> { diff --git a/src/bvgraph/mod.rs b/src/bvgraph/mod.rs index 7d691d5..421144b 100644 --- a/src/bvgraph/mod.rs +++ b/src/bvgraph/mod.rs @@ -4,7 +4,8 @@ pub mod mock_writers; /// An enumeration of the components getting a different model in the Rust /// implementation of the BV format. -pub enum Component { +#[derive(Clone, Copy)] +pub enum BVGraphComponent { Outdegree, ReferenceOffset, BlockCount, @@ -15,3 +16,24 @@ pub enum Component { FirstResidual, Residual, } + +impl From for BVGraphComponent { + fn from(value: usize) -> Self { + match value { + 0 => Self::Outdegree, + 1 => Self::ReferenceOffset, + 2 => Self::BlockCount, + 3 => Self::Blocks, + 4 => Self::IntervalCount, + 5 => Self::IntervalStart, + 6 => Self::IntervalLen, + 7 => Self::FirstResidual, + 8 => Self::Residual, + _ => panic!("Invalid component."), + } + } +} + +impl BVGraphComponent { + pub const COMPONENTS: usize = 9; +} diff --git a/src/bvgraph/reader.rs b/src/bvgraph/reader.rs index 586643a..62d857f 100644 --- a/src/bvgraph/reader.rs +++ b/src/bvgraph/reader.rs @@ -3,45 +3,43 @@ use std::error::Error; use webgraph::prelude::{BVGraphCodesReaderBuilder}; use crate::multi_model_ans::decoder::ANSDecoder; -use crate::multi_model_ans::encoder::ANSCompressorPhase; use crate::multi_model_ans::model4decoder::VecFrame; -use crate::multi_model_ans::Prelude; +use crate::multi_model_ans::{ANSCompressorPhase, Prelude}; -pub struct ANSBVGraphReaderBuilder { +pub struct ANSBVGraphReaderBuilder<'a> { /// The vec of ANSCompressorPhase, one for each node of the graph. phases: Vec, /// The prelude resulting from the encoding process of the graph. - prelude: Prelude, + prelude: &'a Prelude, - decoder_model: VecFrame, -} + decoder_model: VecFrame, -impl ANSBVGraphReaderBuilder { - pub fn new(prelude: Prelude, phases: Vec) -> Self { - let folding_offset = (1u64 << (FIDELITY - 1)) * ((1 << RADIX) - 1); - let folding_threshold = 1u64 << (FIDELITY + RADIX - 1); + fidelity: usize, - let decoder_model = VecFrame::::new( - &prelude.tables, - &prelude.frame_sizes, - folding_offset, - folding_threshold, - ); + radix: usize, +} +impl <'a> ANSBVGraphReaderBuilder<'a> { + pub fn new(prelude: &'a Prelude, phases: Vec, fidelity: usize, radix: usize) -> Self { Self { prelude, phases, - decoder_model, + decoder_model: VecFrame::new( + &prelude.tables, + &prelude.frame_sizes, + fidelity, + radix, + ), + fidelity, + radix, } } } -impl BVGraphCodesReaderBuilder for ANSBVGraphReaderBuilder { - type Reader<'a> = ANSDecoder<'a, FIDELITY, RADIX, u64, VecFrame> - where - Self: 'a; +impl <'a> BVGraphCodesReaderBuilder for ANSBVGraphReaderBuilder<'a> { + type Reader<'b> = ANSDecoder<'b> where Self: 'b; fn get_reader(&self, node: usize) -> Result, Box> { let phase = self @@ -49,10 +47,12 @@ impl BVGraphCodesReaderBuilder for AN .get(node) .expect("The node must have a phase associated to it."); - Ok(ANSDecoder::>::from_raw_parts( + Ok(ANSDecoder::from_raw_parts( &self.prelude, &self.decoder_model, *phase, + self.fidelity, + self.radix, )) } } \ No newline at end of file diff --git a/src/bvgraph/writer.rs b/src/bvgraph/writer.rs index d341cda..b95165b 100644 --- a/src/bvgraph/writer.rs +++ b/src/bvgraph/writer.rs @@ -2,39 +2,41 @@ use std::{convert::Infallible}; use std::marker::PhantomData; use webgraph::graph::bvgraph::BVGraphCodesWriter; -use crate::bvgraph::Component; -use crate::multi_model_ans::encoder::ANSCompressorPhase; -use crate::{ - multi_model_ans::{ - encoder::ANSEncoder, model4encoder::ANSModel4Encoder, - model4encoder_builder::ANSModel4EncoderBuilder, - }, -}; +use crate::bvgraph::BVGraphComponent; + use crate::bvgraph::mock_writers::{ANSymbolTable, EntropyMockWriter, MockWriter}; +use crate::multi_model_ans::ANSCompressorPhase; +use crate::multi_model_ans::encoder::ANSEncoder; +use crate::multi_model_ans::model4encoder::ANSModel4Encoder; +use crate::multi_model_ans::model4encoder_builder::ANSModel4EncoderBuilder; use crate::utils::ans_utilities::folding_without_streaming_out; -pub struct BVGraphModelBuilder -where - MW: BVGraphCodesWriter + MockWriter, -{ - model_builder: ANSModel4EncoderBuilder, - symbol_costs: ANSymbolTable, +pub struct BVGraphModelBuilder { + model_builder: ANSModel4EncoderBuilder, + + symbol_costs: ANSymbolTable, + + /// The type of the mock writer. _marker: PhantomData, + + fidelity: usize, + + radix: usize, + + folding_threshold: u64, } -impl BVGraphModelBuilder -where - MW: BVGraphCodesWriter + MockWriter, -{ - const FOLDING_THRESHOLD: u64 = 1 << (FIDELITY + RADIX - 1); +impl BVGraphModelBuilder { - // symbol_costs should be ANSymbolTable::initialize_with_binary_cost(9) here - pub fn new(symbol_costs: ANSymbolTable) -> Self { + pub fn new(symbol_costs: ANSymbolTable, fidelity: usize, radix: usize) -> Self { Self { - model_builder: ANSModel4EncoderBuilder::::new(9), + model_builder: ANSModel4EncoderBuilder::new(fidelity, radix), symbol_costs, _marker: PhantomData, + fidelity, + radix, + folding_threshold: 1u64 << (fidelity + radix - 1), } } @@ -45,125 +47,122 @@ where } } -impl BVGraphCodesWriter for BVGraphModelBuilder -where - MW: BVGraphCodesWriter + MockWriter, -{ +impl BVGraphCodesWriter for BVGraphModelBuilder { type Error = Infallible; type MockWriter = MW; fn mock(&self) -> Self::MockWriter { // !!!!! now it's a clone since it's &Self. Otherwise i would give ownership !!!!! - MW::build(self.symbol_costs.clone()) + MW::build(self.symbol_costs.clone(), self.fidelity, self.radix) } fn write_outdegree(&mut self, value: u64) -> Result { self.model_builder - .push_symbol(value, Component::Outdegree as usize); + .push_symbol(value, BVGraphComponent::Outdegree); - if value < Self::FOLDING_THRESHOLD { - return Ok(self.symbol_costs.table[Component::Outdegree as usize][value as usize]) + if value < self.folding_threshold { + return Ok(self.symbol_costs.table[BVGraphComponent::Outdegree as usize][value as usize]) } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.symbol_costs.table[Component::Outdegree as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.symbol_costs.table[BVGraphComponent::Outdegree as usize][folded_sym as usize]) } fn write_reference_offset(&mut self, value: u64) -> Result { self.model_builder - .push_symbol(value, Component::ReferenceOffset as usize); + .push_symbol(value, BVGraphComponent::ReferenceOffset); - if value < Self::FOLDING_THRESHOLD { - return Ok(self.symbol_costs.table[Component::ReferenceOffset as usize][value as usize]) + if value < self.folding_threshold { + return Ok(self.symbol_costs.table[BVGraphComponent::ReferenceOffset as usize][value as usize]) } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.symbol_costs.table[Component::ReferenceOffset as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.symbol_costs.table[BVGraphComponent::ReferenceOffset as usize][folded_sym as usize]) } fn write_block_count(&mut self, value: u64) -> Result { self.model_builder - .push_symbol(value, Component::BlockCount as usize); + .push_symbol(value, BVGraphComponent::BlockCount); - if value < Self::FOLDING_THRESHOLD { - return Ok(self.symbol_costs.table[Component::BlockCount as usize][value as usize]) + if value < self.folding_threshold { + return Ok(self.symbol_costs.table[BVGraphComponent::BlockCount as usize][value as usize]) } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.symbol_costs.table[Component::BlockCount as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.symbol_costs.table[BVGraphComponent::BlockCount as usize][folded_sym as usize]) } fn write_blocks(&mut self, value: u64) -> Result { self.model_builder - .push_symbol(value, Component::Blocks as usize); + .push_symbol(value, BVGraphComponent::Blocks); - if value < Self::FOLDING_THRESHOLD { - return Ok(self.symbol_costs.table[Component::Blocks as usize][value as usize]) + if value < self.folding_threshold { + return Ok(self.symbol_costs.table[BVGraphComponent::Blocks as usize][value as usize]) } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.symbol_costs.table[Component::Blocks as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.symbol_costs.table[BVGraphComponent::Blocks as usize][folded_sym as usize]) } fn write_interval_count(&mut self, value: u64) -> Result { self.model_builder - .push_symbol(value, Component::IntervalCount as usize); + .push_symbol(value, BVGraphComponent::IntervalCount); - if value < Self::FOLDING_THRESHOLD { - return Ok(self.symbol_costs.table[Component::IntervalCount as usize][value as usize]) + if value < self.folding_threshold { + return Ok(self.symbol_costs.table[BVGraphComponent::IntervalCount as usize][value as usize]) } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.symbol_costs.table[Component::IntervalCount as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.symbol_costs.table[BVGraphComponent::IntervalCount as usize][folded_sym as usize]) } fn write_interval_start(&mut self, value: u64) -> Result { self.model_builder - .push_symbol(value, Component::IntervalStart as usize); + .push_symbol(value, BVGraphComponent::IntervalStart); - if value < Self::FOLDING_THRESHOLD { - return Ok(self.symbol_costs.table[Component::IntervalStart as usize][value as usize]) + if value < self.folding_threshold { + return Ok(self.symbol_costs.table[BVGraphComponent::IntervalStart as usize][value as usize]) } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.symbol_costs.table[Component::IntervalStart as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.symbol_costs.table[BVGraphComponent::IntervalStart as usize][folded_sym as usize]) } fn write_interval_len(&mut self, value: u64) -> Result { self.model_builder - .push_symbol(value, Component::IntervalLen as usize); + .push_symbol(value, BVGraphComponent::IntervalLen); - if value < Self::FOLDING_THRESHOLD { - return Ok(self.symbol_costs.table[Component::IntervalLen as usize][value as usize]) + if value < self.folding_threshold { + return Ok(self.symbol_costs.table[BVGraphComponent::IntervalLen as usize][value as usize]) } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.symbol_costs.table[Component::IntervalLen as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.symbol_costs.table[BVGraphComponent::IntervalLen as usize][folded_sym as usize]) } fn write_first_residual(&mut self, value: u64) -> Result { self.model_builder - .push_symbol(value, Component::FirstResidual as usize); + .push_symbol(value, BVGraphComponent::FirstResidual); - if value < Self::FOLDING_THRESHOLD { - return Ok(self.symbol_costs.table[Component::FirstResidual as usize][value as usize]) + if value < self.folding_threshold { + return Ok(self.symbol_costs.table[BVGraphComponent::FirstResidual as usize][value as usize]) } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.symbol_costs.table[Component::FirstResidual as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.symbol_costs.table[BVGraphComponent::FirstResidual as usize][folded_sym as usize]) } fn write_residual(&mut self, value: u64) -> Result { self.model_builder - .push_symbol(value, Component::Residual as usize); + .push_symbol(value, BVGraphComponent::Residual); - if value < Self::FOLDING_THRESHOLD { - return Ok(self.symbol_costs.table[Component::Residual as usize][value as usize]) + if value < self.folding_threshold { + return Ok(self.symbol_costs.table[BVGraphComponent::Residual as usize][value as usize]) } - let folded_sym = folding_without_streaming_out(value, RADIX, FIDELITY); - Ok(self.symbol_costs.table[Component::Residual as usize][folded_sym as usize]) + let folded_sym = folding_without_streaming_out(value, self.radix, self.fidelity); + Ok(self.symbol_costs.table[BVGraphComponent::Residual as usize][folded_sym as usize]) } fn flush(&mut self) -> Result<(), Self::Error> { @@ -177,7 +176,7 @@ where /// Data is gathered in a number of buffers, one for each [component](`Component`). /// At the next node (i.e. when `write_outdegree` is called again), the buffers /// are emptied in reverse order. -pub struct BVGraphWriter { +pub struct BVGraphWriter { /// The container containing the buffers (one for each [component](`Component`)) where symbols are collected. data: [Vec; 9], @@ -185,18 +184,17 @@ pub struct BVGraphWriter { curr_node: usize, /// The encoder used by this writer to encode symbols. - encoder: ANSEncoder, + encoder: ANSEncoder, /// A buffer containing a [`ANSCompressorPhase`], one for each node. phases: Vec, - mock_writer: EntropyMockWriter, + mock_writer: EntropyMockWriter, } -impl BVGraphWriter { +impl BVGraphWriter { - // costs_table should be ANSymbolTable::new by passing the table of entries of the encoder - pub fn new(model: ANSModel4Encoder, costs_table: ANSymbolTable) -> Self { + pub fn new(model: ANSModel4Encoder, costs_table: ANSymbolTable, fidelity: usize, radix: usize) -> Self { Self { curr_node: usize::MAX, data: [ @@ -210,22 +208,22 @@ impl BVGraphWriter { Vec::new(), Vec::new(), ], - mock_writer: EntropyMockWriter::build(costs_table), - encoder: ANSEncoder::::new(model), + mock_writer: EntropyMockWriter::build(costs_table, fidelity, radix), + encoder: ANSEncoder::new(model, fidelity, radix), phases: Vec::new(), } } /// Consume self and return the encoder. - pub fn into_inner(self) -> (ANSEncoder, Vec) { + pub fn into_inner(self) -> (ANSEncoder, Vec) { (self.encoder, self.phases) } } -impl BVGraphCodesWriter for BVGraphWriter { +impl BVGraphCodesWriter for BVGraphWriter { type Error = Infallible; - type MockWriter = EntropyMockWriter; + type MockWriter = EntropyMockWriter; fn mock(&self) -> Self::MockWriter { self.mock_writer.clone() // i must return costs even below so i have to keep an instance of the mock @@ -234,41 +232,41 @@ impl BVGraphCodesWriter for BVGraphWr fn write_outdegree(&mut self, value: u64) -> Result { if self.curr_node != usize::MAX { for (component, symbols) in self.data - [Component::FirstResidual as usize..=Component::Residual as usize] + [BVGraphComponent::FirstResidual as usize..=BVGraphComponent::Residual as usize] .iter() .enumerate() .rev() { for &symbol in symbols.iter().rev() { self.encoder - .encode(symbol as u64, component + Component::FirstResidual as usize); + .encode(symbol as u64, BVGraphComponent::from(component + BVGraphComponent::FirstResidual as usize)); } } debug_assert_eq!( - self.data[Component::IntervalStart as usize].len(), - self.data[Component::IntervalLen as usize].len() + self.data[BVGraphComponent::IntervalStart as usize].len(), + self.data[BVGraphComponent::IntervalLen as usize].len() ); - for i in (0..self.data[Component::IntervalStart as usize].len()).rev() { + for i in (0..self.data[BVGraphComponent::IntervalStart as usize].len()).rev() { self.encoder.encode( - self.data[Component::IntervalLen as usize][i] as u64, - Component::IntervalLen as usize, + self.data[BVGraphComponent::IntervalLen as usize][i] as u64, + BVGraphComponent::IntervalLen, ); self.encoder.encode( - self.data[Component::IntervalStart as usize][i] as u64, - Component::IntervalStart as usize, + self.data[BVGraphComponent::IntervalStart as usize][i] as u64, + BVGraphComponent::IntervalStart, ); } for (component, symbols) in self.data - [Component::Outdegree as usize..=Component::IntervalCount as usize] + [BVGraphComponent::Outdegree as usize..=BVGraphComponent::IntervalCount as usize] .iter() .enumerate() .rev() { for &symbol in symbols.iter().rev() { - self.encoder.encode(symbol as u64, component); + self.encoder.encode(symbol as u64, BVGraphComponent::from(component)); } } // save state of the encoder as soon as it finishes encoding the node @@ -282,47 +280,47 @@ impl BVGraphCodesWriter for BVGraphWr symbols.clear(); } - self.data[Component::Outdegree as usize].push(value as usize); + self.data[BVGraphComponent::Outdegree as usize].push(value as usize); self.mock_writer.write_outdegree(value) } fn write_reference_offset(&mut self, value: u64) -> Result { - self.data[Component::ReferenceOffset as usize].push(value as usize); + self.data[BVGraphComponent::ReferenceOffset as usize].push(value as usize); self.mock_writer.write_reference_offset(value) } fn write_block_count(&mut self, value: u64) -> Result { - self.data[Component::BlockCount as usize].push(value as usize); + self.data[BVGraphComponent::BlockCount as usize].push(value as usize); self.mock_writer.write_block_count(value) } fn write_blocks(&mut self, value: u64) -> Result { - self.data[Component::Blocks as usize].push(value as usize); + self.data[BVGraphComponent::Blocks as usize].push(value as usize); self.mock_writer.write_blocks(value) } fn write_interval_count(&mut self, value: u64) -> Result { - self.data[Component::IntervalCount as usize].push(value as usize); + self.data[BVGraphComponent::IntervalCount as usize].push(value as usize); self.mock_writer.write_interval_count(value) } fn write_interval_start(&mut self, value: u64) -> Result { - self.data[Component::IntervalStart as usize].push(value as usize); + self.data[BVGraphComponent::IntervalStart as usize].push(value as usize); self.mock_writer.write_interval_start(value) } fn write_interval_len(&mut self, value: u64) -> Result { - self.data[Component::IntervalLen as usize].push(value as usize); + self.data[BVGraphComponent::IntervalLen as usize].push(value as usize); self.mock_writer.write_interval_len(value) } fn write_first_residual(&mut self, value: u64) -> Result { - self.data[Component::FirstResidual as usize].push(value as usize); + self.data[BVGraphComponent::FirstResidual as usize].push(value as usize); self.mock_writer.write_first_residual(value) } fn write_residual(&mut self, value: u64) -> Result { - self.data[Component::Residual as usize].push(value as usize); + self.data[BVGraphComponent::Residual as usize].push(value as usize); self.mock_writer.write_residual(value) } @@ -330,7 +328,7 @@ impl BVGraphCodesWriter for BVGraphWr fn flush(&mut self) -> Result<(), Self::Error> { for (component, symbols) in self.data.iter().enumerate().rev() { for &symbol in symbols.iter().rev() { - self.encoder.encode(symbol as u64, component); + self.encoder.encode(symbol as u64, BVGraphComponent::from(component)); } } self.phases diff --git a/src/lib.rs b/src/lib.rs index 8cef55a..b584a81 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,15 +1,10 @@ -#![allow(dead_code)] #![allow(unused_must_use)] -#![allow(unused_variables)] +#![allow(unconditional_recursion)] +#![allow(dead_code)] -use crate::traits::quasi::Quasi; -use epserde::prelude::*; -use epserde::traits::ZeroCopy; pub mod ans; pub mod multi_model_ans; - pub mod bvgraph; - mod traits; pub mod utils; @@ -42,13 +37,3 @@ pub type Freq = u16; /// The default value for RADIX used by both the encoder and the decoder. pub const FASTER_RADIX: usize = 8; - - -#[derive(Clone, Copy, Debug, Default, Epserde)] -#[repr(C)] -#[zero_copy] -pub struct DecoderModelEntry + ZeroCopy + 'static> { - pub freq: Freq, - pub cumul_freq: Freq, - pub quasi_folded: T, -} diff --git a/src/multi_model_ans/decoder.rs b/src/multi_model_ans/decoder.rs index 167ffb9..dcc0fbe 100644 --- a/src/multi_model_ans/decoder.rs +++ b/src/multi_model_ans/decoder.rs @@ -1,65 +1,81 @@ use webgraph::prelude::BVGraphCodesReader; -use crate::multi_model_ans::encoder::ANSCompressorPhase; + use crate::multi_model_ans::model4decoder::VecFrame; use crate::multi_model_ans::model4encoder::SymbolLookup; -use crate::multi_model_ans::Prelude; -use crate::traits::quasi::{Decode, Quasi}; -use crate::{DecoderModelEntry, RawSymbol, State, FASTER_RADIX, LOG2_B}; -use crate::bvgraph::Component; +use crate::multi_model_ans::{ANSCompressorPhase, Prelude}; +use crate::{RawSymbol, State, LOG2_B}; +use crate::bvgraph::BVGraphComponent; #[derive(Clone)] -pub struct ANSDecoder<'a, const FIDELITY: usize, const RADIX: usize = FASTER_RADIX, H = u64, M = VecFrame> -where - H: Quasi, - M: Decode + SymbolLookup>, -{ - pub model: &'a M, +pub struct ANSDecoder<'a> { + /// The model used to decode the sequence. + pub model: &'a VecFrame, /// The normalized bits during the encoding process. - pub normalized_bits: &'a Vec, + pub stream: &'a Vec, + /// The current state of the decoder. pub state: State, - pub normalized_pointer: usize, + /// The pointer to the next normalized chunk of 32 bits to be read. + pub stream_pointer: usize, + + /// The value of fidelity currently used by the decoder. + pub fidelity: usize, + + /// The value of radix currently used by the decoder. + pub radix: usize, } -impl<'a, const FIDELITY: usize, const RADIX: usize, H, M> ANSDecoder<'a, FIDELITY, RADIX, H, M> -where - H: Quasi, - M: Decode + SymbolLookup>, +impl<'a> ANSDecoder<'a> { /// The lower bound of the interval. const LOWER_BOUND: State = 1 << 32; - pub fn new(prelude: &'a Prelude, model: &'a M) -> Self { + /// The number of bits reserved to represent the symbol in the quasi-folded value. + const BIT_RESERVED_FOR_SYMBOL: u64 = 48; + + pub fn new(prelude: &'a Prelude, model: &'a VecFrame, fidelity: usize, radix: usize) -> Self { Self { - normalized_pointer: prelude.normalized_bits.len(), + stream_pointer: prelude.normalized_bits.len(), model, - normalized_bits: &prelude.normalized_bits, + stream: &prelude.normalized_bits, state: prelude.state, + fidelity, + radix, } } - pub fn from_raw_parts (prelude: &'a Prelude, model: &'a M, phase: ANSCompressorPhase) -> Self { + /// Initialize a new ANSDecoder from its raw parts. + /// + /// Note: the next decoded symbol will be the last one encoded in the given [`phase`](ANSCompressorPhase) + pub fn from_raw_parts( + prelude: &'a Prelude, + model: &'a VecFrame, + phase: ANSCompressorPhase, + fidelity: usize, + radix: usize, + ) -> Self + { Self { model, - normalized_bits: &prelude.normalized_bits, + stream: &prelude.normalized_bits, state: phase.state, - normalized_pointer: phase.normalized, + stream_pointer: phase.stream_pointer, + fidelity, + radix, } } } -impl<'a, const FIDELITY: usize, const RADIX: usize, H, M> ANSDecoder<'a, FIDELITY, RADIX, H, M> -where - H: Quasi, - M: Decode + SymbolLookup>, -{ - pub fn decode(&mut self, model_index: usize) -> RawSymbol { - let slot = self.state & self.model.get_frame_mask(model_index); - let symbol_entry = self.model.symbol(slot, model_index); +impl<'a> ANSDecoder<'a> { - self.state = (self.state >> self.model.get_log2_frame_size(model_index)) + /// Decodes a single symbol of a specific [`Component`](BVGraphComponent). + pub fn decode(&mut self, component: BVGraphComponent) -> RawSymbol { + let slot = self.state & self.model.get_frame_mask(component); + let symbol_entry = self.model.symbol(slot, component); + + self.state = (self.state >> self.model.get_log2_frame_size(component)) * (symbol_entry.freq as State) + slot as State - (symbol_entry.cumul_freq as State); @@ -68,88 +84,70 @@ where self.extend_state(); } - let (quasi_unfolded, folds) = H::quasi_unfold(symbol_entry.quasi_folded); + let (quasi_unfolded, folds) = self.quasi_unfold(symbol_entry.quasi_folded); let mut fold = 0u64; for _ in 0..folds { if self.state < Self::LOWER_BOUND { self.extend_state(); } - fold = (fold << RADIX) | self.state & ((1 << RADIX) - 1); - self.state >>= RADIX; + fold = (fold << self.radix) | self.state & ((1 << self.radix) - 1); + self.state >>= self.radix; if self.state < Self::LOWER_BOUND { self.extend_state(); } } - quasi_unfolded.into() | fold + quasi_unfolded | fold } fn extend_state(&mut self) { - self.normalized_pointer -= 1; - let bits = self.normalized_bits[self.normalized_pointer]; + self.stream_pointer -= 1; + let bits = self.stream[self.stream_pointer]; self.state = (self.state << LOG2_B) | bits as State; } - /* - pub fn decode_from_phase( - &mut self, - phase: ANSCompressorPhase, - model_index: usize, - ) -> RawSymbol { - self.state = phase.state; - self.last_unfolded_pos = phase.folded; - self.normalized_pointer = phase.normalized; - - Self::decode(self, model_index) - } - - pub fn set_compressor_at_phase(&mut self, phase: &ANSCompressorPhase) { - self.state = phase.state; - self.last_unfolded_pos = phase.folded; - self.normalized_pointer = phase.normalized; + fn quasi_unfold(&self, quasi_folded: u64) -> (u64, u32) { + let symbol = quasi_folded & ((1 << Self::BIT_RESERVED_FOR_SYMBOL) - 1); + let folds = quasi_folded >> Self::BIT_RESERVED_FOR_SYMBOL; + (symbol, folds as u32) } - */ } -impl<'a, const FIDELITY: usize, const RADIX: usize, H, M> BVGraphCodesReader for ANSDecoder<'a, FIDELITY, RADIX, H, M> -where - H: Quasi, - M: Decode + SymbolLookup>, -{ +impl<'a> BVGraphCodesReader for ANSDecoder<'a> { fn read_outdegree(&mut self) -> u64 { - self.decode(Component::Outdegree as usize) + self.decode(BVGraphComponent::Outdegree) } fn read_reference_offset(&mut self) -> u64 { - self.decode(Component::ReferenceOffset as usize) + self.decode(BVGraphComponent::ReferenceOffset) } fn read_block_count(&mut self) -> u64 { - self.decode(Component::BlockCount as usize) + self.decode(BVGraphComponent::BlockCount) } fn read_blocks(&mut self) -> u64 { - self.decode(Component::Blocks as usize) + self.decode(BVGraphComponent::Blocks) } fn read_interval_count(&mut self) -> u64 { - self.decode(Component::IntervalCount as usize) + self.decode(BVGraphComponent::IntervalCount) } fn read_interval_start(&mut self) -> u64 { - self.decode(Component::IntervalStart as usize) + self.decode(BVGraphComponent::IntervalStart) } fn read_interval_len(&mut self) -> u64 { - self.decode(Component::IntervalLen as usize) + self.decode(BVGraphComponent::IntervalLen) } fn read_first_residual(&mut self) -> u64 { - self.decode(Component::FirstResidual as usize) + self.decode(BVGraphComponent::FirstResidual) } fn read_residual(&mut self) -> u64 { - self.decode(Component::Residual as usize) + self.decode(BVGraphComponent::Residual) } } diff --git a/src/multi_model_ans/encoder.rs b/src/multi_model_ans/encoder.rs index bd1fcb9..cffaf1f 100644 --- a/src/multi_model_ans/encoder.rs +++ b/src/multi_model_ans/encoder.rs @@ -1,80 +1,84 @@ -use epserde::Epserde; -use mem_dbg::{MemDbg, MemSize}; +use crate::bvgraph::BVGraphComponent; use crate::multi_model_ans::model4encoder::{ANSModel4Encoder, SymbolLookup}; -use crate::multi_model_ans::Prelude; -use crate::traits::quasi::Decode; -use crate::{RawSymbol, State, Symbol, FASTER_RADIX, LOG2_B}; +use crate::multi_model_ans::{ANSCompressorPhase, Prelude}; +use crate::{RawSymbol, State, Symbol, LOG2_B}; /// Used to extract the 32 LSB from a 64-bit state. const NORMALIZATION_MASK: u64 = 0xFFFFFFFF; #[derive(Clone)] -pub struct ANSEncoder { +pub struct ANSEncoder { pub model: ANSModel4Encoder, pub state: State, /// The normalized bits during the encoding process. - pub normalized_bits: Vec, -} + pub stream: Vec, + + /// Represent the threshold starting from which a symbol has to be folded. + pub folding_threshold: u64, + + pub folding_offset: u64, -impl ANSEncoder { - /// The biggest singleton symbol, i.e. the biggest symbol that doesn't need to be folded. - const FOLDING_THRESHOLD: RawSymbol = (1 << (FIDELITY + RADIX - 1)) as RawSymbol; + pub radix: usize, + + pub fidelity: usize, +} - const FOLDING_OFFSET: RawSymbol = ((1 << RADIX) - 1) * (1 << (FIDELITY - 1)); +impl ANSEncoder { - pub fn new(model: ANSModel4Encoder) -> Self { + pub fn new(model: ANSModel4Encoder, fidelity: usize, radix: usize) -> Self { Self { state: 1_u64 << 32, model, - normalized_bits: Vec::new(), + stream: Vec::new(), + folding_threshold: (1 << (fidelity + radix - 1)) as u64, + folding_offset: ((1 << radix) - 1) * (1 << (fidelity - 1)), + radix, + fidelity, } } - fn get_folds_number(symbol: RawSymbol) -> usize { - ((u64::ilog2(symbol) + 1) as usize - FIDELITY) / RADIX + fn get_folds_number(&self, symbol: RawSymbol) -> usize { + ((u64::ilog2(symbol) + 1) as usize - self.fidelity) / self.radix } } -/// Encoding functions -impl ANSEncoder { - /// Encodes a single symbol by using the data in the model with the given index. +impl ANSEncoder { + /// Encodes a single symbol of a specific [`Component`](BVGraphComponent). /// /// Note that the ANS decodes the sequence in reverse order. - pub fn encode(&mut self, mut symbol: RawSymbol, model_index: usize) { + pub fn encode(&mut self, mut symbol: RawSymbol, component: BVGraphComponent) { // if symbol has to be folded, dump the bytes we have to fold - if symbol >= Self::FOLDING_THRESHOLD { - let folds = Self::get_folds_number(symbol); + if symbol >= self.folding_threshold { + let folds = self.get_folds_number(symbol); for _ in 0..folds { - let bits_to_push = symbol & ((1 << RADIX) - 1); + let bits_to_push = symbol & ((1 << self.radix) - 1); - // dump in the space if there is enough space - if self.state.leading_zeros() >= RADIX as u32 { - self.state <<= RADIX; + // dump in the state if there is enough space + if self.state.leading_zeros() >= self.radix as u32 { + self.state <<= self.radix; self.state += bits_to_push; - } - // otherwise, normalize the state and push the bits in the normalized bits - else { - self.state = Self::shrink_state(self.state, &mut self.normalized_bits); - self.state <<= RADIX; + } else { // otherwise, normalize the state and push the bits in the normalized bits + self.state = Self::shrink_state(self.state, &mut self.stream); + self.state <<= self.radix; self.state += bits_to_push; } - symbol >>= RADIX; + symbol >>= self.radix; } - symbol += Self::FOLDING_OFFSET * folds as RawSymbol; + symbol += self.folding_offset * folds as RawSymbol; } let symbol = symbol as Symbol; - let sym_data = self.model.symbol(symbol, model_index); + let sym_data = self.model.symbol(symbol, component); if self.state >= sym_data.upperbound { - self.state = Self::shrink_state(self.state, &mut self.normalized_bits); + self.state = Self::shrink_state(self.state, &mut self.stream); } let block = self.state / sym_data.freq as u64; - self.state = (block << self.model.get_log2_frame_size(model_index)) + self.state = (block << self.model.get_log2_frame_size(component)) + sym_data.cumul_freq as u64 + (self.state - (block * sym_data.freq as u64)); } @@ -86,17 +90,17 @@ impl ANSEncoder { state } - pub fn serialize(&mut self) -> Prelude { + pub fn serialize(self) -> Prelude { Prelude { - tables: self.model.tables.clone(), - normalized_bits: self.normalized_bits.clone(), - frame_sizes: self.model.frame_sizes.clone(), + tables: self.model.tables, + normalized_bits: self.stream, + frame_sizes: self.model.frame_sizes, state: self.state, } } - /// Returns the current phase of the compressor, that is: the current state, the index of the last chunk of 32 bits - /// that have been normalized, and the index of the last chunk of [`RADIX`] bits that have been folded. + /// Returns the current phase of the compressor, that is: the current state and the index of the last chunk of 32 bits + /// that have been normalized. /// /// An [`ANSCompressorPhase`] can be utilized to restore the state of the compressor at a given point in time. In the /// specific, if the compressor actual phase is `phase`, then the next decode symbol will be the same as the one @@ -104,15 +108,7 @@ impl ANSEncoder { pub fn get_current_compressor_phase(&self) -> ANSCompressorPhase { ANSCompressorPhase { state: self.state, - normalized: self.normalized_bits.len(), + stream_pointer: self.stream.len(), } } -} - -#[derive(Debug, Clone, Copy, Epserde, MemDbg, MemSize)] -#[zero_copy] -#[repr(C)] -pub struct ANSCompressorPhase { - pub state: State, - pub normalized: usize, -} +} \ No newline at end of file diff --git a/src/multi_model_ans/mod.rs b/src/multi_model_ans/mod.rs index cbe8824..7f20d57 100644 --- a/src/multi_model_ans/mod.rs +++ b/src/multi_model_ans/mod.rs @@ -7,11 +7,10 @@ pub mod model4encoder_builder; use epserde::prelude::*; use mem_dbg::*; -use crate::traits::folding::FoldRead; use crate::{Freq, State}; #[derive(Clone, Debug, Epserde, MemDbg, MemSize)] -pub struct Prelude { +pub struct Prelude { /// Contains, for each index, the data associated to the symbol equal to that index. pub tables: Vec>, @@ -24,6 +23,15 @@ pub struct Prelude { pub state: State, } + +#[derive(Debug, Clone, Copy, Epserde, MemDbg, MemSize)] +#[zero_copy] +#[repr(C)] +pub struct ANSCompressorPhase { + pub state: State, + pub stream_pointer: usize, +} + #[derive(Clone, Copy, Debug, Epserde, MemDbg, MemSize)] #[repr(C)] #[zero_copy] @@ -50,3 +58,12 @@ impl From<(Freq, u64, Freq)> for EncoderModelEntry { } } } + +#[derive(Clone, Copy, Debug, Default, Epserde)] +#[repr(C)] +#[zero_copy] +pub struct DecoderModelEntry { + pub freq: Freq, + pub cumul_freq: Freq, + pub quasi_folded: u64, +} diff --git a/src/multi_model_ans/model4decoder.rs b/src/multi_model_ans/model4decoder.rs index 9613cf7..aba2606 100644 --- a/src/multi_model_ans/model4decoder.rs +++ b/src/multi_model_ans/model4decoder.rs @@ -1,197 +1,31 @@ -use epserde::Epserde; -use sucds::bit_vectors::{Rank, Rank9Sel}; -use sux::prelude::*; - use crate::multi_model_ans::model4encoder::SymbolLookup; -use crate::traits::quasi::{Decode, Quasi}; -use crate::{DecoderModelEntry, State, Symbol}; +use crate::{RawSymbol, State, Symbol}; +use crate::bvgraph::BVGraphComponent; use crate::multi_model_ans::EncoderModelEntry; - -#[derive(Epserde)] -pub struct EliasFanoFrame -where - T: Quasi, -{ - /// Contains the log2 of the frame size for each model. - frame_sizes: Vec, - - /// Contains a list of vector of entries for each model where, in each index, the data associated to the symbol equal to that index. - symbols: Vec>>, - - /// The mapped frames as Elias-Fano structures. - frames: Vec, -} - -impl> Decode for EliasFanoFrame { - #[inline(always)] - fn get_frame_mask(&self, model_index: usize) -> u64 { - (1 << self.frame_sizes[model_index]) - 1 - } - - #[inline(always)] - fn get_log2_frame_size(&self, model_index: usize) -> usize { - self.frame_sizes[model_index] - } -} - -impl> EliasFanoFrame { - pub fn new( - tables: &Vec>, - frame_sizes: &Vec, - folding_offset: u64, - folding_threshold: u64, - ) -> Self { - let mut symbols_table = Vec::with_capacity(tables.len()); - let mut elias_table = Vec::with_capacity(tables.len()); - - tables.iter().enumerate().for_each(|(model_index, table)| { - let nonzero_symbols = tables[model_index] - .iter() - .filter(|sym| sym.freq > 0) - .count(); - let mut symbols = Vec::with_capacity(nonzero_symbols); - let mut frame_builder = - EliasFanoBuilder::new(nonzero_symbols + 1, (1 << frame_sizes[model_index]) + 1); - - for (sym, sym_data) in table.iter().enumerate() { - if sym_data.freq == 0 { - continue; - } - - frame_builder.push(sym_data.cumul_freq as usize).unwrap(); - - symbols.push(DecoderModelEntry { - freq: sym_data.freq, - cumul_freq: sym_data.cumul_freq, - quasi_folded: T::quasi_fold(sym as Symbol, folding_threshold, folding_offset), - }); - } - - frame_builder.push(1 << frame_sizes[model_index]).unwrap(); - let frame: EliasFano = frame_builder.build().convert_to().unwrap(); - symbols_table.push(symbols); - elias_table.push(frame); - }); - - Self { - frame_sizes: frame_sizes.clone(), - symbols: symbols_table, - frames: elias_table, - } - } -} - -impl> SymbolLookup for EliasFanoFrame { - type Output = DecoderModelEntry; - - #[inline(always)] - fn symbol(&self, slot: State, model_index: usize) -> &Self::Output { - let symbol_index = unsafe { - self.frames[model_index] - .pred_unchecked::(&(slot as usize)) - .0 as Symbol - }; - &self.symbols[model_index][symbol_index as usize] - } -} - -#[derive(Clone)] -pub struct Rank9SelFrame> { - /// Contains the log2 of the frame size for each model. - frame_sizes: Vec, - - /// Contains a list of vector of entries for each model where, in each index, the data associated to the symbol equal to that index. - symbols: Vec>>, - - frames: Vec, -} - -impl> Rank9SelFrame { - pub fn new( - tables: &Vec>, - frame_sizes: &Vec, - folding_offset: u64, - folding_threshold: u64, - ) -> Self { - let mut symbols_table = Vec::with_capacity(tables.len()); - let mut rank9_table = Vec::with_capacity(tables.len()); - - tables.iter().enumerate().for_each(|(model_index, table)| { - let nonzero_symbols = tables[model_index] - .iter() - .filter(|sym| sym.freq > 0) - .count(); - let mut symbols = Vec::with_capacity(nonzero_symbols); - let mut vec = vec![false; 1 << frame_sizes[model_index]]; - - for (sym, sym_data) in table.iter().enumerate() { - if sym_data.freq == 0 { - continue; - } - - match sym_data.cumul_freq { - 0 => (), - _ => *vec.get_mut(sym_data.cumul_freq as usize).unwrap() = true, - } - - symbols.push(DecoderModelEntry { - freq: sym_data.freq, - cumul_freq: sym_data.cumul_freq, - quasi_folded: T::quasi_fold(sym as Symbol, folding_threshold, folding_offset), - }); - } - - rank9_table.push(Rank9Sel::from_bits(vec)); - symbols_table.push(symbols); - }); - - Self { - frame_sizes: frame_sizes.clone(), - symbols: symbols_table, - frames: rank9_table, - } - } -} - -impl> Decode for Rank9SelFrame { - #[inline(always)] - fn get_frame_mask(&self, model_index: usize) -> u64 { - (1 << self.frame_sizes[model_index]) - 1 - } - - #[inline(always)] - fn get_log2_frame_size(&self, model_index: usize) -> usize { - self.frame_sizes[model_index] - } -} - -impl> SymbolLookup for Rank9SelFrame { - type Output = DecoderModelEntry; - - #[inline(always)] - fn symbol(&self, slot: State, model_index: usize) -> &Self::Output { - let symbol_index = self.frames[model_index].rank1((slot + 1) as usize).unwrap() as Symbol; - &self.symbols[model_index][symbol_index as usize] - } -} +use crate::multi_model_ans::DecoderModelEntry; #[derive(Clone)] -pub struct VecFrame> { +pub struct VecFrame { /// Contains the log2 of the frame size for each model. frame_sizes: Vec, /// Contains a set of vectors, one each for model. Within each vector, each index contains the data associated to /// the symbol equal to that index. - symbols: Vec>>, + symbols: Vec>, } -impl> VecFrame { +impl VecFrame { + const BIT_RESERVED_FOR_SYMBOL: u64 = 48; + pub fn new( tables: &Vec>, frame_sizes: &Vec, - folding_offset: u64, - folding_threshold: u64, - ) -> Self { + fidelity: usize, + radix: usize, + ) -> Self + { + let folding_offset = ((1 << radix) - 1) * (1 << (fidelity - 1)); + let folding_threshold = (1 << (fidelity + radix - 1)) as u64; let mut vectors = Vec::with_capacity(tables.len()); tables.iter().enumerate().for_each(|(model_index, table)| { @@ -208,10 +42,11 @@ impl> VecFrame { *vec.get_mut(slot as usize).unwrap() = DecoderModelEntry { freq: symbol_entry.freq, cumul_freq: symbol_entry.cumul_freq, - quasi_folded: T::quasi_fold ( + quasi_folded: Self::quasi_fold( sym as Symbol, - folding_threshold, folding_offset, + folding_threshold, + radix, ), }; } @@ -225,25 +60,37 @@ impl> VecFrame { symbols: vectors, } } -} -impl> Decode for VecFrame { + fn quasi_fold(sym: Symbol, folding_offset: u64, folding_threshold: u64, radix: usize) -> u64 { + if sym < folding_threshold as Symbol { + return sym as u64; + } + + let mut symbol = sym as u64; + let folds = (symbol - folding_threshold) / folding_offset + 1_u64; + let folds_bits = folds << Self::BIT_RESERVED_FOR_SYMBOL; + + symbol -= folding_offset * folds as RawSymbol; + symbol <<= folds * radix as u64; + symbol | folds_bits + } + #[inline(always)] - fn get_frame_mask(&self, model_index: usize) -> u64 { - (1 << self.frame_sizes[model_index]) - 1 + pub fn get_frame_mask(&self, component: BVGraphComponent) -> u64 { + (1 << self.frame_sizes[component as usize]) - 1 } #[inline(always)] - fn get_log2_frame_size(&self, model_index: usize) -> usize { - self.frame_sizes[model_index] + pub fn get_log2_frame_size(&self, component: BVGraphComponent) -> usize { + self.frame_sizes[component as usize] } } -impl> SymbolLookup for VecFrame { - type Output = DecoderModelEntry; +impl SymbolLookup for VecFrame { + type Output = DecoderModelEntry; #[inline(always)] - fn symbol(&self, slot: State, model_index: usize) -> &Self::Output { - &self.symbols[model_index][slot as usize] + fn symbol(&self, slot: State, component: BVGraphComponent) -> &Self::Output { + &self.symbols[component as usize][slot as usize] } } diff --git a/src/multi_model_ans/model4encoder.rs b/src/multi_model_ans/model4encoder.rs index 1db89f0..e328ebc 100644 --- a/src/multi_model_ans/model4encoder.rs +++ b/src/multi_model_ans/model4encoder.rs @@ -1,11 +1,11 @@ use crate::multi_model_ans::{EncoderModelEntry}; -use crate::traits::quasi::Decode; use crate::{Freq, Symbol}; +use crate::bvgraph::BVGraphComponent; pub trait SymbolLookup { type Output; - fn symbol(&self, data: Idx, model_index: usize) -> &Self::Output; + fn symbol(&self, data: Idx, component: BVGraphComponent) -> &Self::Output; } #[derive(Clone)] @@ -26,17 +26,15 @@ impl ANSModel4Encoder { .map(|x| x.iter().map(|y| y.freq).collect::>()) .collect::>() } -} -impl Decode for ANSModel4Encoder { #[inline(always)] - fn get_frame_mask(&self, model_index: usize) -> u64 { - (1 << self.frame_sizes[model_index]) - 1 + pub fn get_frame_mask(&self, component: BVGraphComponent) -> u64 { + (1 << self.frame_sizes[component as usize]) - 1 } #[inline(always)] - fn get_log2_frame_size(&self, model_index: usize) -> usize { - self.frame_sizes[model_index] + pub fn get_log2_frame_size(&self, component: BVGraphComponent) -> usize { + self.frame_sizes[component as usize] } } @@ -44,7 +42,7 @@ impl SymbolLookup for ANSModel4Encoder { type Output = EncoderModelEntry; #[inline(always)] - fn symbol(&self, symbol: Symbol, model_index: usize) -> &Self::Output { - &self.tables[model_index][symbol as usize] + fn symbol(&self, symbol: Symbol, component: BVGraphComponent) -> &Self::Output { + &self.tables[component as usize][symbol as usize] } } diff --git a/src/multi_model_ans/model4encoder_builder.rs b/src/multi_model_ans/model4encoder_builder.rs index 36dbbf9..ba18684 100644 --- a/src/multi_model_ans/model4encoder_builder.rs +++ b/src/multi_model_ans/model4encoder_builder.rs @@ -3,6 +3,7 @@ use anyhow::{bail, Result}; use crate::{LOG2_B, MAX_RAW_SYMBOL, RawSymbol, Symbol}; +use crate::bvgraph::BVGraphComponent; use crate::multi_model_ans::EncoderModelEntry; use crate::multi_model_ans::model4encoder::ANSModel4Encoder; use crate::utils::ans_utilities::folding_without_streaming_out; @@ -16,50 +17,62 @@ use crate::utils::data_utilities::{cross_entropy, entropy, try_scale_freqs}; const THETA: f64 = 1.001; -pub struct ANSModel4EncoderBuilder { - models: usize, +pub struct ANSModel4EncoderBuilder { + frequencies: Vec>, + + /// For each max_sym: Vec, + + /// The current fidelity used to fold the symbols. + fidelity: usize, + + /// The current radix used to fold the symbols. + radix: usize, + + /// Represent the threshold starting from which a symbol has to be folded. + folding_threshold: RawSymbol, } -impl ANSModel4EncoderBuilder { - const FOLDING_THRESHOLD: RawSymbol = (1 << (FIDELITY + RADIX - 1)) as RawSymbol; +impl ANSModel4EncoderBuilder { /// Creates a new AnsModel4EncoderBuilder with the given number of models. - pub fn new(models: usize) -> Self { + pub fn new(fidelity: usize, radix: usize) -> Self { // we can calculate the biggest folded sym that we can see. This is used to create a vec with already-allocated frequencies - let presumed_max_bucket: Symbol = folding_without_streaming_out(MAX_RAW_SYMBOL, RADIX, FIDELITY); - let frequencies = vec![ vec![0_usize; presumed_max_bucket as usize]; models]; + let presumed_max_bucket: Symbol = folding_without_streaming_out(MAX_RAW_SYMBOL, radix, fidelity); + let frequencies = vec![ vec![0_usize; presumed_max_bucket as usize]; BVGraphComponent::COMPONENTS]; Self { - models, frequencies, - max_sym: vec![0; models], + max_sym: vec![0; BVGraphComponent::COMPONENTS], + fidelity, + radix, + folding_threshold: (1 << (fidelity + radix - 1)) as u64, } } - pub fn push_symbol(&mut self, symbol: RawSymbol, model_index: usize) -> Result<()> { + pub fn push_symbol(&mut self, symbol: RawSymbol, component: BVGraphComponent) -> Result<()> { if symbol > MAX_RAW_SYMBOL { bail!("Symbol can't be bigger than u48::MAX"); } - let folded_sym = match symbol < Self::FOLDING_THRESHOLD { + let folded_sym = match symbol < self.folding_threshold { true => symbol as Symbol, - false => folding_without_streaming_out(symbol, RADIX, FIDELITY), + false => folding_without_streaming_out(symbol, self.radix, self.fidelity), }; // this unwrap is safe since we have already filled the vec with all zeros - *self.frequencies[model_index].get_mut(folded_sym as usize).unwrap() += 1; - self.max_sym[model_index] = max(self.max_sym[model_index], folded_sym); + *self.frequencies[component as usize].get_mut(folded_sym as usize).unwrap() += 1; + self.max_sym[component as usize] = max(self.max_sym[component as usize], folded_sym); Ok(()) } pub fn build(self) -> ANSModel4Encoder { - let mut tables: Vec> = Vec::with_capacity(self.models); - let mut frame_sizes = Vec::with_capacity(self.models); + let mut tables: Vec> = Vec::with_capacity(BVGraphComponent::COMPONENTS); + let mut frame_sizes = Vec::with_capacity(BVGraphComponent::COMPONENTS); - for model_index in 0..self.models { + for model_index in 0..BVGraphComponent::COMPONENTS { let symbols = self.frequencies[model_index].iter().filter(|freq| **freq > 0).count(); let (approx_freqs, m) = Self::approx_freqs( &self.frequencies[model_index], @@ -72,11 +85,11 @@ impl ANSModel4EncoderBuilder 0 {k} else {31}; // TODO: (2) Addressed here for now + k = if log_m > 0 {k} else {31}; table.push(EncoderModelEntry { freq: *freq as u16, - upperbound: (1_u64 << (k + LOG2_B)) * *freq as u64, // TODO: (1) If M is 0 (not used a specific model at all) this panics + upperbound: (1_u64 << (k + LOG2_B)) * *freq as u64, cumul_freq: last_covered_freq, }); last_covered_freq += *freq as u16; @@ -138,9 +151,8 @@ impl ANSModel4EncoderBuilder { - panic!("\ - We can't approximate the frequencies with a frame size bigger than 2^15. You may want - to change RADIX and/or FIDELITY to handle this distribution. + panic!("The distribution of the symbols cannot be satisfactorily approximated with a frame + size smaller than 2^16. You may want to change RADIX and/or FIDELITY to make the compressor work. "); } } diff --git a/src/utils/ans_utilities.rs b/src/utils/ans_utilities.rs index 32a7fa7..c46ff3e 100644 --- a/src/utils/ans_utilities.rs +++ b/src/utils/ans_utilities.rs @@ -1,5 +1,4 @@ use crate::{RawSymbol, Symbol}; -use crate::multi_model_ans::EncoderModelEntry; /// Folds a symbol without streaming out the bits. pub fn folding_without_streaming_out(mut sym: RawSymbol, radix: usize, fidelity: usize) -> Symbol { diff --git a/tests/compressor_tests.rs b/tests/compressor_tests.rs index effd404..8ce8ccd 100644 --- a/tests/compressor_tests.rs +++ b/tests/compressor_tests.rs @@ -1,43 +1,46 @@ mod utils; -use utils::*; - -use rand::prelude::{IteratorRandom, SliceRandom}; +use rand::prelude::SliceRandom; +use folded_streaming_rans::bvgraph::BVGraphComponent; use folded_streaming_rans::RawSymbol; use folded_streaming_rans::multi_model_ans::decoder::ANSDecoder; use folded_streaming_rans::multi_model_ans::encoder::ANSEncoder; use folded_streaming_rans::multi_model_ans::model4decoder::VecFrame; use folded_streaming_rans::multi_model_ans::model4encoder_builder::ANSModel4EncoderBuilder; +use crate::utils::{get_zipfian_distr, SYMBOL_LIST_LENGTH}; + +const FIDELITY: usize = 2; +const RADIX: usize = 4; #[test] fn decoder_decodes_correctly_single_dummy_sequence() { let source = vec![1_u64, 1, 1, 2, 2, 2, 3, 3, 4, 5]; - let mut model4encoder_builder = ANSModel4EncoderBuilder::::new(1); + let mut model4encoder_builder = ANSModel4EncoderBuilder::new(FIDELITY, RADIX); for symbol in &source { - model4encoder_builder.push_symbol(*symbol, 0).unwrap(); + model4encoder_builder.push_symbol(*symbol, BVGraphComponent::Outdegree).unwrap(); } let encoder_model = model4encoder_builder.build(); - let mut encoder = ANSEncoder::::new(encoder_model); // if not specified is always 8 + let mut encoder = ANSEncoder::new(encoder_model, FIDELITY, RADIX); for symbol in &source { - encoder.encode(*symbol, 0); + encoder.encode(*symbol, BVGraphComponent::Outdegree); } let prelude = encoder.serialize(); - let model = VecFrame::::new( - &prelude.tables.clone(), - &prelude.frame_sizes.clone(), - get_folding_offset(FASTER_RADIX, FIDELITY), - get_folding_threshold(FASTER_RADIX, FIDELITY), + let model = VecFrame::new( + &prelude.tables, + &prelude.frame_sizes, + FIDELITY, + RADIX, ); - let mut decoder = ANSDecoder::::new(&prelude, &model); + let mut decoder = ANSDecoder::new(&prelude, &model, FIDELITY, RADIX); let mut decoded_symbols: Vec = Vec::new(); for _ in 0..source.len() { - decoded_symbols.push(decoder.decode(0)); + decoded_symbols.push(decoder.decode(BVGraphComponent::Outdegree)); } decoded_symbols.reverse(); // since encodes as a LIFO @@ -48,32 +51,32 @@ fn decoder_decodes_correctly_single_dummy_sequence() { fn decoder_decodes_correctly_dummy_sequence_with_folding() { let source = vec![1000, 1000, 2000]; - let mut model4encoder_builder = ANSModel4EncoderBuilder::::new(1); + let mut model4encoder_builder = ANSModel4EncoderBuilder::new(FIDELITY, RADIX); for symbol in &source { - model4encoder_builder.push_symbol(*symbol, 0).unwrap(); + model4encoder_builder.push_symbol(*symbol, BVGraphComponent::Outdegree).unwrap(); } let encoder_model = model4encoder_builder.build(); - let mut encoder = ANSEncoder::::new(encoder_model); // if not specified is always 8 + let mut encoder = ANSEncoder::new(encoder_model, FIDELITY, RADIX); for symbol in &source { - encoder.encode(*symbol, 0); + encoder.encode(*symbol, BVGraphComponent::Outdegree); } let prelude = encoder.serialize(); - let model = VecFrame::<4, u64>::new( - &prelude.tables.clone(), - &prelude.frame_sizes.clone(), - get_folding_offset(4, FIDELITY), - get_folding_threshold(4, FIDELITY), + let model = VecFrame::new( + &prelude.tables, + &prelude.frame_sizes, + FIDELITY, + RADIX, ); - let mut decoder = ANSDecoder::>::new(&prelude, &model); + let mut decoder = ANSDecoder::new(&prelude, &model, FIDELITY, RADIX); let mut decoded_symbols: Vec = Vec::new(); for _ in 0..source.len() { - decoded_symbols.push(decoder.decode(0)); + decoded_symbols.push(decoder.decode(BVGraphComponent::Outdegree)); } decoded_symbols.reverse(); // since encodes as a LIFO @@ -84,61 +87,27 @@ fn decoder_decodes_correctly_dummy_sequence_with_folding() { fn decoder_decodes_correctly_real_sequence() { let source = get_zipfian_distr(0, 1.2).to_vec(); - let mut model4encoder_builder = ANSModel4EncoderBuilder::::new(1); + let mut model4encoder_builder = ANSModel4EncoderBuilder::new(FIDELITY, RADIX); for symbol in &source { - model4encoder_builder.push_symbol(*symbol, 0).unwrap(); + model4encoder_builder.push_symbol(*symbol, BVGraphComponent::Outdegree).unwrap(); } let encoder_model = model4encoder_builder.build(); - let mut encoder = ANSEncoder::::new(encoder_model); + let mut encoder = ANSEncoder::new(encoder_model, FIDELITY, RADIX); for symbol in &source { - encoder.encode(*symbol, 0); + encoder.encode(*symbol, BVGraphComponent::Outdegree); } let prelude = encoder.serialize(); - let model = VecFrame::<6, u64>::new( - &prelude.tables.clone(), - &prelude.frame_sizes.clone(), - get_folding_offset(6, FIDELITY), - get_folding_threshold(6, FIDELITY), - ); + let model = VecFrame::new(&prelude.tables.clone(), &prelude.frame_sizes.clone(), FIDELITY, RADIX); - let mut decoder = ANSDecoder::>::new(&prelude, &model); + let mut decoder = ANSDecoder::new(&prelude, &model, FIDELITY, RADIX); let mut decoded_symbols: Vec = Vec::new(); for _ in 0..source.len() { - decoded_symbols.push(decoder.decode(0)); - } - decoded_symbols.reverse(); // since encodes as a LIFO - - assert_eq!(decoded_symbols, source); -} - -/* -#[test] -fn decoder_decodes_correctly_a_single_dummy_sequence() { - let source = vec![1_u64, 1, 1, 2, 2, 2, 3, 3, 4, 5]; - let mut encoder_model_builder = ANSModel4EncoderBuilder::::new(1); - - for symbol in &source { - encoder_model_builder.push_symbol(*symbol, 0).unwrap(); // first traversal to build the statistics - } - - let encoder_model = encoder_model_builder.build(); - let mut encoder = ANSEncoder::::new(encoder_model); - - for symbol in &source { - encoder.encode(*symbol, 0); // second traversal to encode the symbols - } - - let prelude = encoder.serialize(); - let mut decoder = ANSDecoder::::new(&prelude); - let mut decoded_symbols: Vec = Vec::new(); - - for _ in 0..source.len() { - decoded_symbols.push(decoder.decode(0)); + decoded_symbols.push(decoder.decode(BVGraphComponent::Outdegree)); } decoded_symbols.reverse(); // since encodes as a LIFO @@ -149,30 +118,31 @@ fn decoder_decodes_correctly_a_single_dummy_sequence() { fn decoder_decodes_correctly_dummy_sequences() { let first_source = vec![1_u64, 1, 1, 2, 2, 2, 3, 3, 4, 5]; let second_source = vec![1_u64, 3, 3, 3, 2, 2, 3, 3, 4, 5]; - let mut encoder_model_builder = ANSModel4EncoderBuilder::::new(2); + let mut encoder_model_builder = ANSModel4EncoderBuilder::new(FIDELITY, RADIX); for index in 0..first_source.len() { - encoder_model_builder.push_symbol(first_source[index], 0).unwrap(); - encoder_model_builder.push_symbol(second_source[index], 1).unwrap(); + encoder_model_builder.push_symbol(first_source[index], BVGraphComponent::Outdegree).unwrap(); + encoder_model_builder.push_symbol(second_source[index], BVGraphComponent::BlockCount).unwrap(); } let encoder_model = encoder_model_builder.build(); - let mut encoder = ANSEncoder::::new(encoder_model); + let mut encoder = ANSEncoder::new(encoder_model, FIDELITY, RADIX); for index in 0..first_source.len() { - encoder.encode(first_source[index], 0); - encoder.encode(second_source[index], 1); + encoder.encode(first_source[index], BVGraphComponent::Outdegree); + encoder.encode(second_source[index], BVGraphComponent::BlockCount); } let prelude = encoder.serialize(); - let mut decoder = ANSDecoder::::new(&prelude); + let model = VecFrame::new(&prelude.tables.clone(), &prelude.frame_sizes.clone(), FIDELITY, RADIX); + let mut decoder = ANSDecoder::new(&prelude, &model, FIDELITY, RADIX); let mut first_decoded_sequence: Vec = Vec::new(); let mut second_decoded_sequence: Vec = Vec::new(); for _ in 0..first_source.len() { - second_decoded_sequence.push(decoder.decode(1)); // let's start from the last encoded - first_decoded_sequence.push(decoder.decode(0)); + second_decoded_sequence.push(decoder.decode(BVGraphComponent::BlockCount)); // let's start from the last encoded + first_decoded_sequence.push(decoder.decode(BVGraphComponent::Outdegree)); } first_decoded_sequence.reverse(); // since encodes as a LIFO @@ -182,161 +152,58 @@ fn decoder_decodes_correctly_dummy_sequences() { assert_eq!(second_decoded_sequence, second_source); } -#[test] -fn decoder_decodes_correctly_dummy_interleaved_sequences() { - // (model_index, symbol) - let first_source = vec![(0, 1_u64),(0, 1),(0, 1),(0, 2),(0, 2),(0,2),(0, 3),(0, 3),(0, 4),(0,5)]; - let second_source = vec![(1, 1_u64),(1, 1),(1, 1),(1, 1),(1, 4),(1,3),(1, 3),(1, 3),(1, 4),(1,10)]; - let mut encoder_model_builder = ANSModel4EncoderBuilder::::new(2); - - for index in 0..first_source.len() { - encoder_model_builder.push_symbol(first_source[index].1, 0).unwrap(); - encoder_model_builder.push_symbol(second_source[index].1, 1).unwrap(); - } - - let encoder_model = encoder_model_builder.build(); - let mut encoder = ANSEncoder::::new(encoder_model); - - // create a unique source of symbols and randomize it - let mut random_unified_source = vec![first_source, second_source].concat(); - random_unified_source.shuffle(&mut rand::thread_rng()); - - let expected_first_source = random_unified_source - .iter() - .filter(|(model_index, _)| *model_index == 0) - .map(|(_, symbol)| *symbol).collect::>(); - - let expected_second_source = random_unified_source - .iter() - .filter(|(model_index, _)| *model_index == 1) - .map(|(_, symbol)| *symbol).collect::>(); - - // now encode each symbol (in random order) with the corresponding model previously associated - for (model_index, symbol) in random_unified_source.iter() { - encoder.encode(*symbol, *model_index); - } - - let prelude = encoder.serialize(); - let mut decoder = ANSDecoder::::new(&prelude); - let mut decoded: Vec> = vec![Vec::new(), Vec::new()]; - - random_unified_source.reverse(); // now let's reverse the order of the model_index-symbol pairs to decode in reverse - - for (model_index, _symbol) in &random_unified_source { - decoded[*model_index].push(decoder.decode(*model_index)); - } - - decoded[0].reverse(); // they have been decoded in reversed order - decoded[1].reverse(); - - assert_eq!(expected_first_source, decoded[0]); - assert_eq!(expected_second_source, decoded[1]); -} - - -#[test] -fn decoder_decodes_correctly_real_interleaved_sequences() { - // (model_index, symbol) - let first_sequence = get_zipfian_distr(0, 1.0).iter().map(|symbol| (0, *symbol)).collect::>(); - let second_sequence = get_zipfian_distr(1, 1.0).iter().map(|symbol| (1, *symbol)).collect::>(); - let third_sequence = get_zipfian_distr(2, 1.0).iter().map(|symbol| (2, *symbol)).collect::>(); - let fourth_sequence = get_zipfian_distr(1, 1.0).iter().map(|symbol| (3, *symbol)).collect::>(); - - let mut encoder_model_builder = ANSModel4EncoderBuilder::::new(4); - - for index in 0..first_sequence.len() { - encoder_model_builder.push_symbol(first_sequence[index].1, 0).unwrap(); - encoder_model_builder.push_symbol(second_sequence[index].1, 1).unwrap(); - encoder_model_builder.push_symbol(third_sequence[index].1, 2).unwrap(); - encoder_model_builder.push_symbol(fourth_sequence[index].1, 3).unwrap(); - } - let encoder_model = encoder_model_builder.build(); - let mut encoder = ANSEncoder::::new(encoder_model); - let mut source = vec![first_sequence, second_sequence, third_sequence, fourth_sequence].concat(); - source.shuffle(&mut rand::thread_rng()); // randomize the order of the symbols to encode - - let mut expected = vec![Vec::new(), Vec::new(), Vec::new(), Vec::new()]; - - for (model_index, symbol) in &source { - expected[*model_index].push(*symbol); - } - - // now encode each symbol with the corresponding model previously associated - for (model_index, symbol) in source.iter() { - encoder.encode(*symbol, *model_index); - } - - let prelude = encoder.serialize(); - let mut decoder = ANSDecoder::::new(&prelude); - let mut decoded: Vec> = vec![Vec::new(), Vec::new(), Vec::new(), Vec::new()]; - - source.reverse(); // now let's reverse the order of the model_index-symbol pairs to decode in reverse - - for (model_index, _symbol) in &source { - decoded[*model_index].push(decoder.decode(*model_index)); - } - - decoded.iter_mut().for_each(|sequence| sequence.reverse()); // they have been decoded in reversed order - - assert_eq!(expected[0], decoded[0]); - assert_eq!(expected[1], decoded[1]); - assert_eq!(expected[2], decoded[2]); - assert_eq!(expected[3], decoded[3]); -} - #[test] // Frame sizes: [9, 14, 13, 10] (note that these are actually log_2 of the frame sizes) fn decoder_decodes_correctly_real_interleaved_sequences_with_different_frame_sizes() { - // let's get a random sequence of symbols to encode and map them to have this shape: (model_index, symbol) + // let's get a random sequence of symbols to encode and map them to have this shape: (component, symbol) let first_sequence = get_zipfian_distr(0, 1.3) .iter() - .map(|symbol| (0, *symbol)).collect::>()[..SYMBOL_LIST_LENGTH/2000].to_vec(); + .map(|symbol| (BVGraphComponent::Outdegree, *symbol)) + .collect::>()[..SYMBOL_LIST_LENGTH/2000] + .to_vec(); let second_sequence = get_zipfian_distr(1, 1.2) .iter() - .map(|symbol| (1, *symbol)).collect::>(); + .map(|symbol| (BVGraphComponent::BlockCount, *symbol)) + .collect::>(); let third_sequence = get_zipfian_distr(2, 1.0) .iter() - .map(|symbol| (2, *symbol)) - .collect::>(); - - let fourth_sequence = get_zipfian_distr(3, 1.4) - .iter() - .map(|symbol| (3, *symbol)) - .collect::>()[..SYMBOL_LIST_LENGTH/1000].to_vec(); + .map(|symbol| (BVGraphComponent::Residual, *symbol)) + .collect::>(); // now let's unify each source in a single one and randomize it - let mut source = vec![first_sequence, second_sequence, third_sequence, fourth_sequence].concat(); + let mut source = vec![first_sequence, second_sequence, third_sequence].concat(); source.shuffle(&mut rand::thread_rng()); - let mut encoder_model_builder = ANSModel4EncoderBuilder::::new(4); + let mut encoder_model_builder = ANSModel4EncoderBuilder::new(FIDELITY, RADIX); - for (model_index, symbol) in &source { - encoder_model_builder.push_symbol(*symbol, *model_index).unwrap(); + for (component, symbol) in &source { + encoder_model_builder.push_symbol(*symbol, *component).unwrap(); } let encoder_model = encoder_model_builder.build(); - let mut encoder = ANSEncoder::::new(encoder_model); - let mut expected = vec![Vec::new(), Vec::new(), Vec::new(), Vec::new()]; + let mut encoder = ANSEncoder::new(encoder_model, FIDELITY, RADIX); + let mut expected = vec![Vec::new(); BVGraphComponent::COMPONENTS]; - for (model_index, symbol) in &source { - expected[*model_index].push(*symbol); + for (component, symbol) in &source { + expected[*component as usize].push(*symbol); } // now encode each symbol with the corresponding model previously associated - for (model_index, symbol) in source.iter() { - encoder.encode(*symbol, *model_index); + for (component, symbol) in source.iter() { + encoder.encode(*symbol, *component); } let prelude = encoder.serialize(); - let mut decoder = ANSDecoder::::new(&prelude); - let mut decoded: Vec> = vec![Vec::new(), Vec::new(), Vec::new(), Vec::new()]; + let model4decoder = VecFrame::new(&prelude.tables, &prelude.frame_sizes, FIDELITY, RADIX); + let mut decoder = ANSDecoder::new(&prelude, &model4decoder, FIDELITY, RADIX); + let mut decoded: Vec> = vec![Vec::new(); BVGraphComponent::COMPONENTS]; source.reverse(); // now let's reverse the order of the model_index-symbol pairs to decode in reverse - for (model_index, _symbol) in &source { - decoded[*model_index].push(decoder.decode(*model_index)); + for (component, _symbol) in &source { + decoded[*component as usize].push(decoder.decode(*component)); } decoded.iter_mut().for_each(|sequence| sequence.reverse()); // they have been decoded in reversed order @@ -345,112 +212,4 @@ fn decoder_decodes_correctly_real_interleaved_sequences_with_different_frame_siz assert_eq!(expected[1], decoded[1]); assert_eq!(expected[2], decoded[2]); assert_eq!(expected[3], decoded[3]); -} - -#[test] -fn test_random_access() { - // let's get a random sequence of symbols to encode and map them to have this shape: (model_index, symbol) - let first_sequence = get_zipfian_distr(0, 1.3) - .iter() - .map(|symbol| (0, *symbol)).collect::>(); - - let second_sequence = get_zipfian_distr(1, 1.2) - .iter() - .map(|symbol| (1, *symbol)).collect::>(); - - let third_sequence = get_zipfian_distr(2, 1.0) - .iter() - .map(|symbol| (2, *symbol)) - .collect::>(); - - let fourth_sequence = get_zipfian_distr(3, 1.4) - .iter() - .map(|symbol| (3, *symbol)) - .collect::>(); - - // now let's unify each source in a single one and randomize it - let mut source = vec![first_sequence, second_sequence, third_sequence, fourth_sequence].concat(); - source.shuffle(&mut rand::thread_rng()); - - let mut encoder_model_builder = ANSModel4EncoderBuilder::::new(4); - - for (model_index, symbol) in &source { - encoder_model_builder.push_symbol(*symbol, *model_index).unwrap(); - } - - let encoder_model = encoder_model_builder.build(); - let mut encoder = ANSEncoder::::new(encoder_model); - - // let's take 100 random indexes of symbols that will lately want to decode - let random_symbols_indexes = (0..source.len()).into_iter().choose_multiple(&mut rand::thread_rng(), 100); - let mut phases = Vec::new(); - let mut expected = Vec::new(); - - for (model_index, symbol) in &source { - encoder.encode(*symbol, *model_index); - - if random_symbols_indexes.contains(&model_index) { - phases.push(encoder.get_current_compressor_phase()); // save the phase of the symbol at index i - expected.push(*symbol); // save the symbol at index i - } - } - - let prelude = encoder.serialize(); - let mut decoder = ANSDecoder::::new(&prelude); - - for phase_index in 0..phases.len() { - let phase = phases[phase_index].clone(); - assert_eq!(decoder.decode_from_phase(phase, 0), expected[phase_index]); - } -} - -#[test] -fn test_random_access_with_bitvec() { - let sequence = get_zipfian_distr(0, 1.2); - let mut encoder_model_builder = ANSModel4EncoderBuilder::::new(1); - - for symbol in &sequence { - encoder_model_builder.push_symbol(*symbol, 0).unwrap(); - } - - let encoder_model = encoder_model_builder.build(); - let mut encoder = ANSEncoder::>::with_parameters(encoder_model, BitVec::new()); - - // let's take 100 random indexes of symbols that will lately want to decode - let random_symbols_indexes = (0..sequence.len()).into_iter().choose_multiple(&mut rand::thread_rng(), 100); - - let mut phases = Vec::new(); - let mut expected = Vec::new(); - - for index in 0..sequence.len() { - encoder.encode(sequence[index], 0); - - if random_symbols_indexes.contains(&index) { - phases.push(encoder.get_current_compressor_phase()); // save the phase of the symbol at index i - expected.push(sequence[index]); // save the symbol at index i - } - } - - let prelude = encoder.serialize(); - - let frame = VecFrame::<5, u64>::new( - &prelude.tables.clone(), - &prelude.frame_sizes.clone(), - get_folding_offset(5, FIDELITY), - get_folding_threshold(5, FIDELITY), - ); - - let mut decoder = ANSDecoder::< - FIDELITY, - 5, - u64, - VecFrame<5, u64>, - BitVec - >::with_parameters(&prelude, frame); - - for phase_index in 0..phases.len() { - let phase = phases[phase_index].clone(); - assert_eq!(decoder.decode_from_phase(phase, 0), expected[phase_index]); - } -} -*/ +} \ No newline at end of file diff --git a/tests/decoder_model_tests.rs b/tests/decoder_model_tests.rs index 0d69a56..75ffc57 100644 --- a/tests/decoder_model_tests.rs +++ b/tests/decoder_model_tests.rs @@ -1,12 +1,11 @@ mod utils; use rstest::*; -use utils::*; use folded_streaming_rans::{RawSymbol, State}; -use folded_streaming_rans::multi_model_ans::model4decoder::{EliasFanoFrame, Rank9SelFrame, VecFrame}; +use folded_streaming_rans::ans::model4decoder::*; +use folded_streaming_rans::ans::model4encoder::SingleANSModel4Encoder; use folded_streaming_rans::multi_model_ans::model4encoder::SymbolLookup; -use folded_streaming_rans::multi_model_ans::model4encoder_builder::ANSModel4EncoderBuilder; const RADIX: usize = 4; const FIDELITY: usize = 2; @@ -25,33 +24,25 @@ fn probe_works_for_all_types_of_frames( #[case] slots: Vec, #[case] expected_symbols: Vec, ) { - let mut encoder_model_builder = ANSModel4EncoderBuilder::::new(1); - - for symbol in symbols { - encoder_model_builder.push_symbol(symbol, 0).unwrap(); - } - - let encoder_model = encoder_model_builder.build(); - let tables = encoder_model.tables; - let frame_sizes = encoder_model.frame_sizes; - let folding_offset = get_folding_offset(RADIX, FIDELITY); - let folding_threshold = get_folding_threshold(RADIX, FIDELITY); + let model4encoder = SingleANSModel4Encoder::new(&symbols, FIDELITY, RADIX); + let folding_threshold = (1 << (FIDELITY + RADIX - 1)) as u64; + let folding_offset = ((1 << RADIX) - 1) * (1 << (FIDELITY - 1)); let bitvec_frame = Rank9SelFrame::::new( - &tables, - &frame_sizes, + &model4encoder.table, + model4encoder.log2_frame_size, folding_offset, folding_threshold, ); let vec_frame = VecFrame::::new( - &tables, - &frame_sizes, + &model4encoder.table, + model4encoder.log2_frame_size, folding_offset, folding_threshold, ); let elias_frame = EliasFanoFrame::::new( - &tables, - &frame_sizes, + &model4encoder.table, + model4encoder.log2_frame_size, folding_offset, folding_threshold, ); @@ -59,8 +50,8 @@ fn probe_works_for_all_types_of_frames( for i in 0..slots.len() { let slot_to_probe = slots[i] as State; - assert_eq!(expected_symbols[i], bitvec_frame.symbol(slot_to_probe, 0).quasi_folded); - assert_eq!(expected_symbols[i], elias_frame.symbol(slot_to_probe, 0).quasi_folded); - assert_eq!(expected_symbols[i], vec_frame.symbol(slot_to_probe, 0).quasi_folded); + assert_eq!(expected_symbols[i], bitvec_frame[slot_to_probe].quasi_folded); + assert_eq!(expected_symbols[i], elias_frame[slot_to_probe].quasi_folded); + assert_eq!(expected_symbols[i], vec_frame[slot_to_probe].quasi_folded); } } \ No newline at end of file diff --git a/tests/encoder_model_tests.rs b/tests/encoder_model_tests.rs deleted file mode 100644 index 5781b44..0000000 --- a/tests/encoder_model_tests.rs +++ /dev/null @@ -1,42 +0,0 @@ -use folded_streaming_rans::multi_model_ans::encoder::ANSEncoder; -use folded_streaming_rans::multi_model_ans::model4encoder_builder::ANSModel4EncoderBuilder; - -const RADIX: usize = 4; -const FIDELITY: usize = 2; - -#[test] -fn builder_is_created_without_errors() { - let first_symbols = vec![1,1,1,2,2,2,3,3,4,5]; - let second_symbols = vec![1,1,1,2,2,2,3,3,4,5]; - let third_symbols = vec![1,1,1,2,2,2,3,3,4,5]; - - let mut builder = ANSModel4EncoderBuilder::::new(3); - - for index in 0..first_symbols.len() { - builder.push_symbol(first_symbols[index], 0).unwrap(); - builder.push_symbol(second_symbols[index], 1).unwrap(); - builder.push_symbol(third_symbols[index], 2).unwrap(); - } -} - -#[test] -fn encoder_encodes_without_errors() { - let first_symbols = vec![1, 1, 1, 2, 2, 2, 3, 3, 4, 5]; - let second_symbols = vec![1, 1, 1, 2, 2, 2, 3, 3, 4, 5]; - let third_symbols = vec![1, 1, 1, 2, 2, 2, 3, 3, 4, 5]; - - let mut builder = ANSModel4EncoderBuilder::::new(3); - - for index in 0..first_symbols.len() { - builder.push_symbol(first_symbols[index], 0).unwrap(); - builder.push_symbol(second_symbols[index], 1).unwrap(); - builder.push_symbol(third_symbols[index], 2).unwrap(); - } - let model = builder.build(); - let mut encoder = ANSEncoder::::new(model); - - for index in 0..first_symbols.len() { - encoder.encode(first_symbols[index], 0); - } -} - diff --git a/tests/test_bvgraph.rs b/tests/test_bvgraph.rs index b68dfef..d1eecd1 100644 --- a/tests/test_bvgraph.rs +++ b/tests/test_bvgraph.rs @@ -1,8 +1,8 @@ - use anyhow::Result; -use folded_streaming_rans::bvgraph::writer::{BVGraphModelBuilder, BVGraphWriter}; +use folded_streaming_rans::bvgraph::writer::{BVGraphModelBuilder, BVGraphWriter}; use folded_streaming_rans::bvgraph::reader::ANSBVGraphReaderBuilder; + use webgraph::prelude::{BVGraph, EmptyDict, RandomAccessLabelling}; use webgraph::{graph::bvgraph::BVComp, traits::SequentialLabelling}; use folded_streaming_rans::bvgraph::mock_writers::{ANSymbolTable, EntropyMockWriter, Log2MockWriter}; @@ -28,14 +28,9 @@ fn decoder_decodes_correctly_dummy_graph() -> Result<()> { // 1 -> 5 // let's pass a dummy table since the Log2MockWriter it's not going to use it - let binary_costs_table = ANSymbolTable::::initialize_with_binary_cost(9); - let model_builder = BVGraphModelBuilder::>::new(binary_costs_table); - - let mut bvcomp = BVComp:: - >>::new(model_builder, 7, 2, 3, 0); + let binary_costs_table = ANSymbolTable::initialize_with_binary_cost(FIDELITY, RADIX); + let model_builder = BVGraphModelBuilder::new(binary_costs_table, FIDELITY, RADIX); + let mut bvcomp = BVComp::>::new(model_builder, 7, 2, 3, 0); // first iteration -> build the model with log2 mock writer for node_index in 0..graph.num_nodes() { @@ -46,23 +41,13 @@ fn decoder_decodes_correctly_dummy_graph() -> Result<()> { bvcomp.push(successors)?; } - let mut model4encoder = bvcomp.flush()?.build(); + let model4encoder = bvcomp.flush()?.build(); let symbol_freqs = model4encoder.get_symbol_freqs(); - let entropic_costs_table = ANSymbolTable::::new( - symbol_freqs, - model4encoder.frame_sizes.clone() - ); - - let model_builder = BVGraphModelBuilder::>::new( - entropic_costs_table.clone(), - ); + let entropic_costs_table = ANSymbolTable::new(symbol_freqs, model4encoder.frame_sizes, FIDELITY, RADIX); + let model_builder = BVGraphModelBuilder::::new(entropic_costs_table.clone(), FIDELITY, RADIX); - let mut bvcomp = BVComp:: - >>::new(model_builder, 7, 2, 3, 0); + let mut bvcomp = BVComp::>::new(model_builder, 7, 2, 3, 0); // second iteration -> build the model with entropic mock writer for node_index in 0..graph.num_nodes() { @@ -74,16 +59,15 @@ fn decoder_decodes_correctly_dummy_graph() -> Result<()> { } let model4encoder = bvcomp.flush()?.build(); - - let mut bvcomp = BVComp::>::new( - BVGraphWriter::new(model4encoder, entropic_costs_table), + let mut bvcomp = BVComp::::new( + BVGraphWriter::new(model4encoder, entropic_costs_table, FIDELITY, RADIX), 7, 2, 3, 0 ); - // second iteration: encodes the graph + // now encode the graph for node_index in 0..graph.num_nodes() { let successors = graph .successors(node_index) @@ -93,12 +77,12 @@ fn decoder_decodes_correctly_dummy_graph() -> Result<()> { bvcomp.push(successors)?; } - let (mut encoder, phases) = bvcomp.flush()?.into_inner(); + let (encoder, phases) = bvcomp.flush()?.into_inner(); let prelude = encoder.serialize(); - let code_reader_builder = ANSBVGraphReaderBuilder::::new(prelude, phases); + let code_reader_builder = ANSBVGraphReaderBuilder::new(&prelude, phases, FIDELITY, RADIX); - let decoded_graph = BVGraph::, EmptyDict>::new( + let decoded_graph = BVGraph::>::new( code_reader_builder, 2, 7, @@ -112,49 +96,32 @@ fn decoder_decodes_correctly_dummy_graph() -> Result<()> { Ok(()) } - #[test] fn decoder_decodes_correctly_cnr_graph() -> Result<()> { let graph = webgraph::graph::bvgraph::load("tests/data/cnr-2000/cnr-2000")?; let num_nodes = graph.num_nodes(); let num_arcs = graph.num_arcs_hint().unwrap(); - let binary_costs_table = ANSymbolTable::::initialize_with_binary_cost(9); - let model_builder = BVGraphModelBuilder::>::new(binary_costs_table); - let mut bvcomp = BVComp:: - >>::new(model_builder, 7, 2, 3, 0); + let binary_costs_table = ANSymbolTable::initialize_with_binary_cost(FIDELITY, RADIX); + let model_builder = BVGraphModelBuilder::::new(binary_costs_table, FIDELITY, RADIX); + let mut bvcomp = BVComp::>::new(model_builder, 7, 2, 3, 0); // First iteration with Log2MockWriter - bvcomp.extend(graph.iter())?; // I SIMBOLI VANNO UNFOLDATI PRIMA DI CHIEDERE COSTO!!!!! + bvcomp.extend(graph.iter())?; - let mut model4encoder = bvcomp.flush()?.build(); + let model4encoder = bvcomp.flush()?.build(); let symbol_freqs = model4encoder.get_symbol_freqs(); - - let entropic_costs_table = ANSymbolTable::::new( - symbol_freqs, - model4encoder.frame_sizes.clone() - ); - - let model_builder = BVGraphModelBuilder::>::new( - entropic_costs_table.clone(), - ); - - let mut bvcomp = BVComp:: - >>::new(model_builder, 7, 2, 3, 0); + let entropic_costs_table = ANSymbolTable::new(symbol_freqs, model4encoder.frame_sizes, FIDELITY, RADIX); + let model_builder = BVGraphModelBuilder::::new(entropic_costs_table.clone(), FIDELITY, RADIX); + let mut bvcomp = BVComp::>::new(model_builder, 7, 2, 3, 0); // second iteration with EntropyMockWriter bvcomp.extend(graph.iter())?; let model4encoder = bvcomp.flush()?.build(); - let mut bvcomp = BVComp::>::new( - BVGraphWriter::new(model4encoder, entropic_costs_table), + let mut bvcomp = BVComp::::new( + BVGraphWriter::new(model4encoder, entropic_costs_table, FIDELITY, RADIX), 7, 2, 3, @@ -164,12 +131,11 @@ fn decoder_decodes_correctly_cnr_graph() -> Result<()> { // Encoding the graph bvcomp.extend(graph.iter())?; - let (mut encoder, phases) = bvcomp.flush()?.into_inner(); + let (encoder, phases) = bvcomp.flush()?.into_inner(); let prelude = encoder.serialize(); + let code_reader_builder = ANSBVGraphReaderBuilder::new(&prelude, phases, FIDELITY, RADIX); - let code_reader_builder = ANSBVGraphReaderBuilder::::new(prelude, phases); - - let decoded_graph = BVGraph::, EmptyDict>::new( + let decoded_graph = BVGraph::>::new( code_reader_builder, 2, 7, @@ -185,4 +151,4 @@ fn decoder_decodes_correctly_cnr_graph() -> Result<()> { } Ok(()) -} +} \ No newline at end of file diff --git a/tests/utils/mod.rs b/tests/utils/mod.rs index 0be7410..66f64b2 100644 --- a/tests/utils/mod.rs +++ b/tests/utils/mod.rs @@ -3,21 +3,10 @@ use rand::SeedableRng; use rand_distr::Zipf; use folded_streaming_rans::RawSymbol; -pub const FIDELITY: usize = 2; -pub const FASTER_RADIX: usize = 8; -/// Size of the list of symbols used to test. + pub const SYMBOL_LIST_LENGTH: usize = 1_000_000; /// Maximum value that the zipfian distribution can output. -const MAXIMUM_SYMBOL: u64 = 1 << 20; - - -pub fn get_folding_offset(radix: usize, fidelity: usize) -> u64 { - (1 << (fidelity - 1)) * ((1 << radix) - 1) -} - -pub fn get_folding_threshold(radix: usize, fidelity: usize) -> u64 { - 1 << (fidelity + radix - 1) -} +const MAXIMUM_SYMBOL: u64 = 1 << 30; /// Creates a sequence of size [`SYMBOL_LIST_LENGTH`], containing symbols sampled from a Zipfian /// distribution that can output values up to [`MAXIMUM_SYMBOL`].