diff --git a/Cargo.toml b/Cargo.toml
index e9ab95e67..74aed3099 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -81,6 +81,8 @@ no_avx2 = []
 no_avx512 = []
 no_neon = []
 
+file = ["memmap2", "rayon", "std"]
+
 [package.metadata.docs.rs]
 # Document Hasher::update_rayon on docs.rs.
 features = ["rayon", "zeroize"]
@@ -93,6 +95,7 @@ rayon = { version = "1.2.1", optional = true }
 cfg-if = "1.0.0"
 digest = { version = "0.10.1", features = [ "mac" ], optional = true }
 zeroize = { version = "1", default-features = false, features = ["zeroize_derive"], optional = true }
+memmap2 = { version = "0.7.1", optional = true }
 
 [dev-dependencies]
 hex = "0.4.2"
diff --git a/b3sum/Cargo.lock b/b3sum/Cargo.lock
index d1049af22..3c7c7379c 100644
--- a/b3sum/Cargo.lock
+++ b/b3sum/Cargo.lock
@@ -110,6 +110,7 @@ dependencies = [
  "cc",
  "cfg-if",
  "constant_time_eq",
+ "memmap2",
  "rayon",
 ]
 
diff --git a/b3sum/Cargo.toml b/b3sum/Cargo.toml
index 02c9405f6..19b617e6c 100644
--- a/b3sum/Cargo.toml
+++ b/b3sum/Cargo.toml
@@ -15,7 +15,7 @@ pure = ["blake3/pure"]
 
 [dependencies]
 anyhow = "1.0.25"
-blake3 = { version = "1", path = "..", features = ["rayon"] }
+blake3 = { version = "1", path = "..", features = ["file", "rayon"] }
 clap = { version = "4.0.8", features = ["derive", "wrap_help"] }
 hex = "0.4.0"
 memmap2 = "0.7.0"
diff --git a/b3sum/src/main.rs b/b3sum/src/main.rs
index fd35f6861..165c57933 100644
--- a/b3sum/src/main.rs
+++ b/b3sum/src/main.rs
@@ -182,7 +182,7 @@ impl Input {
         }
         let file = File::open(path)?;
         if !args.no_mmap() {
-            if let Some(mmap) = maybe_memmap_file(&file)? {
+            if let Some(mmap) = blake3::file::maybe_memmap_file(&file)? {
                 return Ok(Self::Mmap(io::Cursor::new(mmap)));
             }
         }
@@ -208,12 +208,12 @@ impl Input {
             // one. We might implement that in the future, but since this is
             // the slow path anyway, it's not high priority.
             Self::File(file) => {
-                copy_wide(file, &mut hasher)?;
+                blake3::copy_wide(file, &mut hasher)?;
             }
             Self::Stdin => {
                 let stdin = io::stdin();
                 let lock = stdin.lock();
-                copy_wide(lock, &mut hasher)?;
+                blake3::copy_wide(lock, &mut hasher)?;
             }
         }
         let mut output_reader = hasher.finalize_xof();
@@ -232,58 +232,6 @@ impl Read for Input {
     }
 }
 
-// A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets
-// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms
-// can support at least 64 KiB, and there's some performance benefit to using
-// bigger reads, so that's what we use here.
-fn copy_wide(mut reader: impl Read, hasher: &mut blake3::Hasher) -> io::Result<u64> {
-    let mut buffer = [0; 65536];
-    let mut total = 0;
-    loop {
-        match reader.read(&mut buffer) {
-            Ok(0) => return Ok(total),
-            Ok(n) => {
-                hasher.update(&buffer[..n]);
-                total += n as u64;
-            }
-            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
-            Err(e) => return Err(e),
-        }
-    }
-}
-
-// Mmap a file, if it looks like a good idea. Return None in cases where we
-// know mmap will fail, or if the file is short enough that mmapping isn't
-// worth it. However, if we do try to mmap and it fails, return the error.
-fn maybe_memmap_file(file: &File) -> Result<Option<memmap2::Mmap>> {
-    let metadata = file.metadata()?;
-    let file_size = metadata.len();
-    Ok(if !metadata.is_file() {
-        // Not a real file.
-        None
-    } else if file_size > isize::max_value() as u64 {
-        // Too long to safely map.
-        // https://github.com/danburkert/memmap-rs/issues/69
-        None
-    } else if file_size == 0 {
-        // Mapping an empty file currently fails.
-        // https://github.com/danburkert/memmap-rs/issues/72
-        None
-    } else if file_size < 16 * 1024 {
-        // Mapping small files is not worth it.
-        None
-    } else {
-        // Explicitly set the length of the memory map, so that filesystem
-        // changes can't race to violate the invariants we just checked.
-        let map = unsafe {
-            memmap2::MmapOptions::new()
-                .len(file_size as usize)
-                .map(file)?
-        };
-        Some(map)
-    })
-}
-
 fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> {
     // Encoding multiples of the 64 bytes is most efficient.
     // TODO: This computes each output block twice when the --seek argument isn't a multiple of 64.
diff --git a/src/file.rs b/src/file.rs
new file mode 100644
index 000000000..81ccbbe60
--- /dev/null
+++ b/src/file.rs
@@ -0,0 +1,67 @@
+//! File-related utilities.
+//!
+//! # Examples
+//!
+//! ```no_run
+//! use std::io;
+//!
+//! use blake3::file::hash_path_maybe_mmap;
+//!
+//! fn main() -> io::Result<()> {
+//!     let args: Vec<_> = std::env::args_os().collect();
+//!     assert_eq!(args.len(), 2);
+//!     let path = &args[1];
+//!     let mut hasher = blake3::Hasher::new();
+//!     hash_path_maybe_mmap(&mut hasher, path)?;
+//!     println!("{}", hasher.finalize());
+//!     Ok(())
+//! }
+//! ```
+
+use std::{fs::File, io, path::Path};
+
+/// Mmap a file, if it looks like a good idea. Return None in cases where we
+/// know mmap will fail, or if the file is short enough that mmapping isn't
+/// worth it. However, if we do try to mmap and it fails, return the error.
+pub fn maybe_memmap_file(file: &File) -> io::Result<Option<memmap2::Mmap>> {
+    let metadata = file.metadata()?;
+    let file_size = metadata.len();
+    #[allow(clippy::if_same_then_else)]
+    if !metadata.is_file() {
+        // Not a real file.
+        Ok(None)
+    } else if file_size > isize::max_value() as u64 {
+        // Too long to safely map.
+        // https://github.com/danburkert/memmap-rs/issues/69
+        Ok(None)
+    } else if file_size == 0 {
+        // Mapping an empty file currently fails.
+        // https://github.com/danburkert/memmap-rs/issues/72
+        Ok(None)
+    } else if file_size < 16 * 1024 {
+        // Mapping small files is not worth it.
+        Ok(None)
+    } else {
+        // Explicitly set the length of the memory map, so that filesystem
+        // changes can't race to violate the invariants we just checked.
+        let map = unsafe {
+            memmap2::MmapOptions::new()
+                .len(file_size as usize)
+                .map(file)?
+        };
+        Ok(Some(map))
+    }
+}
+
+/// Hash a file fast.
+///
+/// If the file looks like a good mmap candidate, it maps it and hashes with multiple threads via `Hasher::update_rayon`; otherwise it falls back to reading the file in 64 KiB chunks with [`copy_wide`](crate::copy_wide).
+pub fn hash_path_maybe_mmap(hasher: &mut crate::Hasher, path: impl AsRef<Path>) -> io::Result<()> {
+    let file = File::open(path.as_ref())?;
+    if let Some(mmap) = maybe_memmap_file(&file)? {
+        hasher.update_rayon(&mmap);
+    } else {
+        crate::copy_wide(&file, hasher)?;
+    }
+    Ok(())
+}
diff --git a/src/lib.rs b/src/lib.rs
index 52971b727..b26238096 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -115,6 +115,9 @@ mod sse41;
 #[cfg(feature = "traits-preview")]
 pub mod traits;
 
+#[cfg(feature = "file")]
+pub mod file;
+
 mod join;
 
 use arrayref::{array_mut_ref, array_ref};
@@ -1352,6 +1355,29 @@ impl std::io::Write for Hasher {
     }
 }
 
+/// Copy from `reader` to `hasher`, returning the number of bytes read.
+///
+/// A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets
+/// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms
+/// can support at least 64 KiB, and there's some performance benefit to using
+/// bigger reads, so that's what we use here.
+#[cfg(feature = "std")]
+pub fn copy_wide(mut reader: impl std::io::Read, hasher: &mut Hasher) -> std::io::Result<u64> {
+    let mut buffer = [0; 65536];
+    let mut total = 0;
+    loop {
+        match reader.read(&mut buffer) {
+            Ok(0) => return Ok(total),
+            Ok(n) => {
+                hasher.update(&buffer[..n]);
+                total += n as u64;
+            }
+            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
+            Err(e) => return Err(e),
+        }
+    }
+}
+
 /// An incremental reader for extended output, returned by
 /// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof).
 ///
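For reviewers, here is a minimal sketch of how a downstream crate could exercise the new public API once the `file` feature is enabled, as b3sum now does above. The dependency line and the input path are illustrative assumptions; only `blake3::file::hash_path_maybe_mmap` and `blake3::copy_wide` come from this diff.

```rust
// Illustrative sketch. Assumes Cargo.toml declares something like:
//     blake3 = { version = "1", features = ["file"] }
// (match whatever version this change ships in).
use std::io;

fn main() -> io::Result<()> {
    // Large regular files take the mmap + update_rayon path inside
    // hash_path_maybe_mmap; small files and non-regular files fall back
    // to buffered reads automatically.
    let mut file_hasher = blake3::Hasher::new();
    blake3::file::hash_path_maybe_mmap(&mut file_hasher, "big-input.bin")?; // placeholder path
    println!("file:  {}", file_hasher.finalize());

    // Readers that can never be mmapped (e.g. stdin) can use copy_wide
    // directly; it returns the number of bytes hashed.
    let mut stdin_hasher = blake3::Hasher::new();
    let n = blake3::copy_wide(io::stdin().lock(), &mut stdin_hasher)?;
    println!("stdin: {} ({} bytes)", stdin_hasher.finalize(), n);
    Ok(())
}
```

Note that the mmap path relies on `Hasher::update_rayon`, which is why the new `file` feature pulls in `rayon` and `std` alongside `memmap2` in Cargo.toml above.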