Skip to content

Commit

Permalink
move file operations from b3sum to blake3
Browse files Browse the repository at this point in the history
  • Loading branch information
Banyc authored and oconnor663 committed Sep 16, 2023
1 parent 12b3685 commit e0bb915
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 56 deletions.
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ no_avx2 = []
no_avx512 = []
no_neon = []

file = ["memmap2", "rayon", "std"]

[package.metadata.docs.rs]
# Document Hasher::update_rayon on docs.rs.
features = ["rayon", "zeroize"]
Expand All @@ -93,6 +95,7 @@ rayon = { version = "1.2.1", optional = true }
cfg-if = "1.0.0"
digest = { version = "0.10.1", features = [ "mac" ], optional = true }
zeroize = { version = "1", default-features = false, features = ["zeroize_derive"], optional = true }
memmap2 = { version = "0.7.1", optional = true }

[dev-dependencies]
hex = "0.4.2"
Expand Down
1 change: 1 addition & 0 deletions b3sum/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion b3sum/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ pure = ["blake3/pure"]

[dependencies]
anyhow = "1.0.25"
blake3 = { version = "1", path = "..", features = ["rayon"] }
blake3 = { version = "1", path = "..", features = ["file", "rayon"] }
clap = { version = "4.0.8", features = ["derive", "wrap_help"] }
hex = "0.4.0"
memmap2 = "0.7.0"
Expand Down
58 changes: 3 additions & 55 deletions b3sum/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ impl Input {
}
let file = File::open(path)?;
if !args.no_mmap() {
if let Some(mmap) = maybe_memmap_file(&file)? {
if let Some(mmap) = blake3::file::maybe_memmap_file(&file)? {
return Ok(Self::Mmap(io::Cursor::new(mmap)));
}
}
Expand All @@ -208,12 +208,12 @@ impl Input {
// one. We might implement that in the future, but since this is
// the slow path anyway, it's not high priority.
Self::File(file) => {
copy_wide(file, &mut hasher)?;
blake3::copy_wide(file, &mut hasher)?;
}
Self::Stdin => {
let stdin = io::stdin();
let lock = stdin.lock();
copy_wide(lock, &mut hasher)?;
blake3::copy_wide(lock, &mut hasher)?;
}
}
let mut output_reader = hasher.finalize_xof();
Expand All @@ -232,58 +232,6 @@ impl Read for Input {
}
}

// Feed everything `reader` yields into `hasher` and return the number of
// bytes that were hashed.
//
// Reads go through a 64 KiB buffer. 16 KiB would already be enough to take
// advantage of all the SIMD instruction sets that we support, and
// `std::io::copy` currently uses only 8 KiB, but most platforms can support
// at least 64 KiB and there's some performance benefit to bigger reads.
fn copy_wide(mut reader: impl Read, hasher: &mut blake3::Hasher) -> io::Result<u64> {
    let mut chunk = [0; 65536];
    let mut hashed: u64 = 0;
    loop {
        let len = match reader.read(&mut chunk) {
            Ok(len) => len,
            // Interrupted reads are retried rather than reported.
            Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
            Err(err) => return Err(err),
        };
        if len == 0 {
            // `Ok(0)` is treated as the end of the input.
            return Ok(hashed);
        }
        hasher.update(&chunk[..len]);
        hashed += len as u64;
    }
}

// Try to memory-map `file`, but only when that looks like a good idea.
//
// Yields `Ok(None)` in the cases where we know mmap will fail, or where the
// file is short enough that mapping isn't worth it (each check below says
// which). If a mapping is actually attempted and fails, that error is
// returned to the caller.
fn maybe_memmap_file(file: &File) -> Result<Option<memmap2::Mmap>> {
    let metadata = file.metadata()?;
    let len = metadata.len();

    // Not a real file.
    if !metadata.is_file() {
        return Ok(None);
    }
    // Too long to safely map.
    // https://github.com/danburkert/memmap-rs/issues/69
    if len > isize::MAX as u64 {
        return Ok(None);
    }
    // Mapping an empty file currently fails.
    // https://github.com/danburkert/memmap-rs/issues/72
    if len == 0 {
        return Ok(None);
    }
    // Mapping small files is not worth it.
    if len < 16 * 1024 {
        return Ok(None);
    }

    // Explicitly set the length of the memory map, so that filesystem
    // changes can't race to violate the invariants we just checked. The
    // `as usize` cast cannot truncate: `len` was bounded by `isize::MAX`
    // above.
    // NOTE(review): the mapped bytes can presumably still change if the file
    // is modified while mapped — confirm this is acceptable for callers.
    let mmap = unsafe { memmap2::MmapOptions::new().len(len as usize).map(file)? };
    Ok(Some(mmap))
}

fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> {
// Encoding multiples of the 64 bytes is most efficient.
// TODO: This computes each output block twice when the --seek argument isn't a multiple of 64.
Expand Down
67 changes: 67 additions & 0 deletions src/file.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
//! The file-related utilities.
//!
//! # Examples
//!
//! ```no_run
//! use std::io;
//!
//! use blake3::file::hash_path_maybe_mmap;
//!
//! fn main() -> io::Result<()> {
//! let args: Vec<_> = std::env::args_os().collect();
//! assert_eq!(args.len(), 2);
//! let path = &args[1];
//! let mut hasher = blake3::Hasher::new();
//! hash_path_maybe_mmap(&mut hasher, path)?;
//! println!("{}", hasher.finalize());
//! Ok(())
//! }
//! ```
use std::{fs::File, io, path::Path};

/// Memory-map `file` when that looks like a good idea.
///
/// Returns `Ok(None)` when the mapping is known to fail or is not worth
/// attempting: the handle is not a regular file, the file is longer than
/// `isize::MAX` bytes, it is empty, or it is smaller than 16 KiB.
///
/// # Errors
///
/// Returns the error from querying the file's metadata, or from the mapping
/// itself if one is attempted and fails.
pub fn maybe_memmap_file(file: &File) -> io::Result<Option<memmap2::Mmap>> {
    let metadata = file.metadata()?;
    let len = metadata.len();

    // Not a real file.
    if !metadata.is_file() {
        return Ok(None);
    }
    // Too long to safely map.
    // https://github.com/danburkert/memmap-rs/issues/69
    if len > isize::MAX as u64 {
        return Ok(None);
    }
    // Mapping an empty file currently fails.
    // https://github.com/danburkert/memmap-rs/issues/72
    if len == 0 {
        return Ok(None);
    }
    // Mapping small files is not worth it.
    if len < 16 * 1024 {
        return Ok(None);
    }

    // Explicitly set the length of the memory map, so that filesystem
    // changes can't race to violate the invariants we just checked. The
    // `as usize` cast cannot truncate: `len` was bounded by `isize::MAX`
    // above.
    // NOTE(review): the mapped bytes can presumably still change if the file
    // is modified while mapped — confirm this is acceptable for callers.
    let mmap = unsafe { memmap2::MmapOptions::new().len(len as usize).map(file)? };
    Ok(Some(mmap))
}

/// Hash the contents of the file at `path` into `hasher`.
///
/// The file is opened and offered to [`maybe_memmap_file`]. If that yields a
/// mapping, the mapped bytes are hashed with `Hasher::update_rayon`.
/// Otherwise the file is streamed into the hasher with [`crate::copy_wide`].
///
/// The hasher is only updated, never finalized, so the caller decides how to
/// finalize it.
///
/// # Errors
///
/// Returns any I/O error from opening the file, from attempting to map it,
/// or from reading it.
pub fn hash_path_maybe_mmap(hasher: &mut crate::Hasher, path: impl AsRef<Path>) -> io::Result<()> {
    let file = File::open(path.as_ref())?;
    match maybe_memmap_file(&file)? {
        Some(mmap) => {
            hasher.update_rayon(&mmap);
        }
        None => {
            crate::copy_wide(&file, hasher)?;
        }
    }
    Ok(())
}
26 changes: 26 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,9 @@ mod sse41;
#[cfg(feature = "traits-preview")]
pub mod traits;

#[cfg(feature = "file")]
pub mod file;

mod join;

use arrayref::{array_mut_ref, array_ref};
Expand Down Expand Up @@ -1352,6 +1355,29 @@ impl std::io::Write for Hasher {
}
}

/// Read `reader` to the end, feeding every byte into `hasher`, and return the
/// number of bytes read.
///
/// Reads go through a 64 KiB buffer. 16 KiB would already be enough to take
/// advantage of all the SIMD instruction sets that we support, and
/// `std::io::copy` currently uses only 8 KiB, but most platforms can support
/// at least 64 KiB and there's some performance benefit to bigger reads.
///
/// # Errors
///
/// Returns the first read error whose kind is not
/// `std::io::ErrorKind::Interrupted`; interrupted reads are retried.
#[cfg(feature = "std")]
pub fn copy_wide(mut reader: impl std::io::Read, hasher: &mut Hasher) -> std::io::Result<u64> {
    let mut chunk = [0; 65536];
    let mut hashed: u64 = 0;
    loop {
        let len = match reader.read(&mut chunk) {
            Ok(len) => len,
            // Interrupted reads are retried rather than reported.
            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(err) => return Err(err),
        };
        if len == 0 {
            // `Ok(0)` is treated as the end of the input.
            return Ok(hashed);
        }
        hasher.update(&chunk[..len]);
        hashed += len as u64;
    }
}

/// An incremental reader for extended output, returned by
/// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof).
///
Expand Down

0 comments on commit e0bb915

Please sign in to comment.