From ac94d7b1e9db237aa6e491185206d438f455a454 Mon Sep 17 00:00:00 2001 From: Gustavo Noronha Silva Date: Sat, 5 Oct 2024 08:09:27 -0300 Subject: [PATCH] cmp: print verbose diffs as we find them MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before this change, we would first find all changes so we could obtain the largest offset we will report and use that to set up the padding. Now we use the file sizes to estimate the largest possible offset. Not only does this allow us to print earlier, reduces memory usage, as we do not store diffs to report later, but it also fixes a case in which our output was different to GNU cmp's - because it also seems to estimate based on size. Memory usage drops by a factor of 1000(!), without losing performance while comparing 2 binaries of hundreds of MBs: Before: Maximum resident set size (kbytes): 2489260 Benchmark 1: ../target/release/diffutils \ cmp -l -b /usr/lib64/chromium-browser/chromium-browser /usr/lib64/firefox/libxul.so Time (mean ± σ): 14.466 s ± 0.166 s [User: 12.367 s, System: 2.012 s] Range (min … max): 14.350 s … 14.914 s 10 runs After: Maximum resident set size (kbytes): 2636 Benchmark 1: ../target/release/diffutils \ cmp -l -b /usr/lib64/chromium-browser/chromium-browser /usr/lib64/firefox/libxul.so Time (mean ± σ): 13.724 s ± 0.038 s [User: 12.263 s, System: 1.372 s] Range (min … max): 13.667 s … 13.793 s 10 runs --- src/cmp.rs | 194 +++++++++++++++++++++---------------------- tests/integration.rs | 2 +- 2 files changed, 97 insertions(+), 99 deletions(-) diff --git a/src/cmp.rs b/src/cmp.rs index 1d9ca9e..3a67de8 100644 --- a/src/cmp.rs +++ b/src/cmp.rs @@ -9,7 +9,7 @@ use std::ffi::OsString; use std::io::{BufRead, BufReader, BufWriter, Read, Write}; use std::iter::Peekable; use std::process::ExitCode; -use std::{fs, io}; +use std::{cmp, fs, io}; #[cfg(not(target_os = "windows"))] use std::os::fd::{AsRawFd, FromRawFd}; @@ -320,10 +320,39 @@ pub fn cmp(params: &Params) -> Result { let mut from = prepare_reader(¶ms.from, ¶ms.skip_a, params)?; let mut to = prepare_reader(¶ms.to, ¶ms.skip_b, params)?; + let mut offset_width = if let Some(max_bytes) = params.max_bytes { + max_bytes + } else { + usize::MAX + }; + + if let (Ok(a_meta), Ok(b_meta)) = (fs::metadata(¶ms.from), fs::metadata(¶ms.to)) { + #[cfg(not(target_os = "windows"))] + let (a_size, b_size) = (a_meta.size(), b_meta.size()); + + #[cfg(target_os = "windows")] + let (a_size, b_size) = (a_meta.file_size(), b_meta.file_size()); + + let smaller = cmp::min(a_size, b_size) as usize; + offset_width = cmp::min(smaller, offset_width); + + // If the files have different sizes, we already know they are not identical. If we have not + // been asked to show even the first difference, we can quit early. + if params.quiet && a_size != b_size { + return Ok(Cmp::Different); + } + } + + let offset_width = 1 + offset_width.checked_ilog10().unwrap_or(1) as usize; + + // Capacity calc: at_byte width + 2 x 3-byte octal numbers + 2 x 4-byte value + 4 spaces + let mut output = Vec::::with_capacity(offset_width + 3 * 2 + 4 * 2 + 4); + let mut at_byte = 1; let mut at_line = 1; let mut start_of_line = true; - let mut verbose_diffs = vec![]; + let mut stdout = BufWriter::new(io::stdout().lock()); + let mut compare = Cmp::Equal; loop { // Fill up our buffers. let from_buf = match from.fill_buf() { @@ -360,10 +389,6 @@ pub fn cmp(params: &Params) -> Result { ¶ms.to.to_string_lossy() }; - if params.verbose { - report_verbose_diffs(verbose_diffs, params)?; - } - report_eof(at_byte, at_line, start_of_line, eof_on, params); return Ok(Cmp::Different); } @@ -395,8 +420,24 @@ pub fn cmp(params: &Params) -> Result { // first one runs out. for (&from_byte, &to_byte) in from_buf.iter().zip(to_buf.iter()) { if from_byte != to_byte { + compare = Cmp::Different; + if params.verbose { - verbose_diffs.push((at_byte, from_byte, to_byte)); + format_verbose_difference( + from_byte, + to_byte, + at_byte, + offset_width, + &mut output, + params, + )?; + stdout.write_all(output.as_slice()).map_err(|e| { + format!( + "{}: error printing output: {e}", + params.executable.to_string_lossy() + ) + })?; + output.clear(); } else { report_difference(from_byte, to_byte, at_byte, at_line, params); return Ok(Cmp::Different); @@ -422,12 +463,7 @@ pub fn cmp(params: &Params) -> Result { to.consume(consumed); } - if params.verbose && !verbose_diffs.is_empty() { - report_verbose_diffs(verbose_diffs, params)?; - return Ok(Cmp::Different); - } - - Ok(Cmp::Equal) + Ok(compare) } // Exit codes are documented at @@ -450,21 +486,6 @@ pub fn main(opts: Peekable) -> ExitCode { return ExitCode::SUCCESS; } - // If the files have different sizes, we already know they are not identical. If we have not - // been asked to show even the first difference, we can quit early. - if params.quiet { - if let (Ok(a_meta), Ok(b_meta)) = (fs::metadata(¶ms.from), fs::metadata(¶ms.to)) { - #[cfg(not(target_os = "windows"))] - if a_meta.size() != b_meta.size() { - return ExitCode::from(1); - } - #[cfg(target_os = "windows")] - if a_meta.file_size() != b_meta.file_size() { - return ExitCode::from(1); - } - } - } - match cmp(¶ms) { Ok(Cmp::Equal) => ExitCode::SUCCESS, Ok(Cmp::Different) => ExitCode::from(1), @@ -533,99 +554,76 @@ fn format_byte(byte: u8) -> String { // This function has been optimized to not use the Rust fmt system, which // leads to a massive speed up when processing large files: cuts the time // for comparing 2 ~36MB completely different files in half on an M1 Max. -fn report_verbose_diffs(diffs: Vec<(usize, u8, u8)>, params: &Params) -> Result<(), String> { +#[inline] +fn format_verbose_difference( + from_byte: u8, + to_byte: u8, + at_byte: usize, + offset_width: usize, + output: &mut Vec, + params: &Params, +) -> Result<(), String> { assert!(!params.quiet); - let mut stdout = BufWriter::new(io::stdout().lock()); - if let Some((offset, _, _)) = diffs.last() { - // Obtain the width of the first column from the last byte offset. - let width = format!("{}", offset).len(); - - let mut at_byte_buf = itoa::Buffer::new(); - let mut from_oct = [0u8; 3]; // for octal conversions - let mut to_oct = [0u8; 3]; - - // Capacity calc: at_byte width + 2 x 3-byte octal numbers + 4-byte value + up to 2 byte value + 4 spaces - let mut output = Vec::::with_capacity(width + 3 * 2 + 4 + 2 + 4); - - if params.print_bytes { - for (at_byte, from_byte, to_byte) in diffs { - output.clear(); + let mut at_byte_buf = itoa::Buffer::new(); + let mut from_oct = [0u8; 3]; // for octal conversions + let mut to_oct = [0u8; 3]; - // "{:>width$} {:>3o} {:4} {:>3o} {}", - let at_byte_str = at_byte_buf.format(at_byte); - let at_byte_padding = width - at_byte_str.len(); - - for _ in 0..at_byte_padding { - output.push(b' ') - } - - output.extend_from_slice(at_byte_str.as_bytes()); + if params.print_bytes { + // "{:>width$} {:>3o} {:4} {:>3o} {}", + let at_byte_str = at_byte_buf.format(at_byte); + let at_byte_padding = offset_width.saturating_sub(at_byte_str.len()); - output.push(b' '); + for _ in 0..at_byte_padding { + output.push(b' ') + } - output.extend_from_slice(format_octal(from_byte, &mut from_oct).as_bytes()); + output.extend_from_slice(at_byte_str.as_bytes()); - output.push(b' '); + output.push(b' '); - let from_byte_str = format_byte(from_byte); - let from_byte_padding = 4 - from_byte_str.len(); + output.extend_from_slice(format_octal(from_byte, &mut from_oct).as_bytes()); - output.extend_from_slice(from_byte_str.as_bytes()); + output.push(b' '); - for _ in 0..from_byte_padding { - output.push(b' ') - } + let from_byte_str = format_byte(from_byte); + let from_byte_padding = 4 - from_byte_str.len(); - output.push(b' '); + output.extend_from_slice(from_byte_str.as_bytes()); - output.extend_from_slice(format_octal(to_byte, &mut to_oct).as_bytes()); - - output.push(b' '); + for _ in 0..from_byte_padding { + output.push(b' ') + } - output.extend_from_slice(format_byte(to_byte).as_bytes()); + output.push(b' '); - output.push(b'\n'); + output.extend_from_slice(format_octal(to_byte, &mut to_oct).as_bytes()); - stdout.write_all(output.as_slice()).map_err(|e| { - format!( - "{}: error printing output: {e}", - params.executable.to_string_lossy() - ) - })?; - } - } else { - for (at_byte, from_byte, to_byte) in diffs { - output.clear(); + output.push(b' '); - // "{:>width$} {:>3o} {:>3o}" - let at_byte_str = at_byte_buf.format(at_byte); - let at_byte_padding = width - at_byte_str.len(); + output.extend_from_slice(format_byte(to_byte).as_bytes()); - for _ in 0..at_byte_padding { - output.push(b' ') - } + output.push(b'\n'); + } else { + // "{:>width$} {:>3o} {:>3o}" + let at_byte_str = at_byte_buf.format(at_byte); + let at_byte_padding = offset_width - at_byte_str.len(); - output.extend_from_slice(at_byte_str.as_bytes()); + for _ in 0..at_byte_padding { + output.push(b' ') + } - output.push(b' '); + output.extend_from_slice(at_byte_str.as_bytes()); - output.extend_from_slice(format_octal(from_byte, &mut from_oct).as_bytes()); + output.push(b' '); - output.push(b' '); + output.extend_from_slice(format_octal(from_byte, &mut from_oct).as_bytes()); - output.extend_from_slice(format_octal(to_byte, &mut to_oct).as_bytes()); + output.push(b' '); - output.push(b'\n'); + output.extend_from_slice(format_octal(to_byte, &mut to_oct).as_bytes()); - stdout.write_all(output.as_slice()).map_err(|e| { - format!( - "{}: error printing output: {e}", - params.executable.to_string_lossy() - ) - })?; - } - } + output.push(b'\n'); } Ok(()) diff --git a/tests/integration.rs b/tests/integration.rs index 4cff8ff..5619b1a 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -616,7 +616,7 @@ mod cmp { .code(predicate::eq(1)) .failure() .stderr(predicate::str::is_empty()) - .stdout(predicate::eq("4 40 144 d\n8 40 150 h\n")); + .stdout(predicate::eq(" 4 40 144 d\n 8 40 150 h\n")); Ok(()) }