diff --git a/BENCHMARKING.md b/BENCHMARKING.md new file mode 100644 index 0000000..01a9736 --- /dev/null +++ b/BENCHMARKING.md @@ -0,0 +1,63 @@ +# Benchmarking diff + +The engine used by our diff tool tries to balance execution time with patch +quality. It implements the Myers algorithm with a few heuristics which are also +used by GNU diff to avoid pathological cases. + +The original paper can be found here: +- https://link.springer.com/article/10.1007/BF01840446 + +Currently, not all tricks used by GNU diff are adopted by our implementation. +For instance, GNU diff will isolate lines that only exist in each of the files +and not include them on the diffing process. It also does post-processing of the +edits to produce more cohesive hunks. Both of these combinar should make it +produce better patches for large files which are very different. + +Run `cargo build --release` before benchmarking after you make a change! + +## How to benchmark + +It is recommended that you use the 'hyperfine' tool to run your benchmarks. This +is an example of how to run a comparison with GNU diff: + +``` +> hyperfine -N -i --warmup 2 --output=pipe 'diff t/huge t/huge.3' +'./target/release/diffutils diff t/huge t/huge.3' +Benchmark 1: diff t/huge t/huge.3 + Time (mean ± σ): 136.3 ms ± 3.0 ms [User: 88.5 ms, System: 17.9 ms] + Range (min … max): 131.8 ms … 144.4 ms 21 runs + + Warning: Ignoring non-zero exit code. + +Benchmark 2: ./target/release/diffutils diff t/huge t/huge.3 + Time (mean ± σ): 74.4 ms ± 1.0 ms [User: 47.6 ms, System: 24.9 ms] + Range (min … max): 72.9 ms … 77.1 ms 41 runs + + Warning: Ignoring non-zero exit code. + +Summary + ./target/release/diffutils diff t/huge t/huge.3 ran + 1.83 ± 0.05 times faster than diff t/huge t/huge.3 +> +``` + +As you can see, you should provide both commands you want to compare on a single +invocation of 'hyperfine'. Each as a single argument, so use quotes. These are +the relevant parameters: + +- -N: avoids using a shell as intermediary to run the command +- -i: ignores non-zero exit code, which diff uses to mean files differ +- --warmup 2: 2 runs before measuring, warms up I/O cache for large files +- --output=pipe: disable any potential optimizations based on output destination + +## Inputs + +Performance will vary based on several factors, the main ones being: + +- how large the files being compared are +- how different the files being compared are +- how large and far between sequences of equal lines are + +When looking at performance improvements, testing small and large (tens of MBs) +which have few differences, many differences, completely different is important +to cover all of the potential pathological cases. diff --git a/Cargo.lock b/Cargo.lock index fe461de..0dd14f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -67,7 +67,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" dependencies = [ "memchr", - "regex-automata", + "regex-automata 0.4.8", "serde", ] @@ -77,6 +77,12 @@ version = "3.15.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ff69b9dd49fd426c69a0db9fc04dd934cdb6645ff000864d98f7e2af8830eaa" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "cc" version = "1.0.90" @@ -127,13 +133,15 @@ version = "0.4.2" dependencies = [ "assert_cmd", "chrono", - "diff", "itoa", "predicates", "pretty_assertions", + "rand", "regex", "same-file", "tempfile", + "tracing", + "tracing-subscriber", "unicode-width", ] @@ -168,6 +176,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + [[package]] name = "iana-time-zone" version = "0.1.60" @@ -206,6 +225,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + [[package]] name = "libc" version = "0.2.159" @@ -224,6 +249,15 @@ version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +[[package]] +name = "matchers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" +dependencies = [ + "regex-automata 0.1.10", +] + [[package]] name = "memchr" version = "2.7.1" @@ -236,6 +270,16 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be" +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num-traits" version = "0.2.18" @@ -251,6 +295,27 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + +[[package]] +name = "pin-project-lite" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff" + +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] + [[package]] name = "predicates" version = "3.1.2" @@ -309,6 +374,36 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + [[package]] name = "regex" version = "1.11.0" @@ -317,8 +412,17 @@ checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" dependencies = [ "aho-corasick", "memchr", - "regex-automata", - "regex-syntax", + "regex-automata 0.4.8", + "regex-syntax 0.8.5", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax 0.6.29", ] [[package]] @@ -329,9 +433,15 @@ checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.8.5", ] +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + [[package]] name = "regex-syntax" version = "0.8.5" @@ -380,6 +490,21 @@ dependencies = [ "syn", ] +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + [[package]] name = "syn" version = "2.0.50" @@ -410,6 +535,77 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" +[[package]] +name = "thread_local" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" +dependencies = [ + "cfg-if", + "once_cell", +] + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", +] + [[package]] name = "unicode-ident" version = "1.0.12" @@ -422,6 +618,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" +[[package]] +name = "valuable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" + [[package]] name = "wait-timeout" version = "0.2.0" @@ -431,6 +633,12 @@ dependencies = [ "libc", ] +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + [[package]] name = "wasm-bindgen" version = "0.2.92" @@ -612,3 +820,24 @@ name = "yansi" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index 6fa1a3c..b94725b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,11 +16,13 @@ path = "src/main.rs" [dependencies] chrono = "0.4.38" -diff = "0.1.13" itoa = "1.0.11" regex = "1.10.4" same-file = "1.0.6" unicode-width = "0.2.0" +tracing = "0.1.40" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +rand = "0.8.5" [dev-dependencies] pretty_assertions = "1.4.0" @@ -42,6 +44,11 @@ ci = ["github"] # The installers to generate for each app installers = [] # Target platforms to build apps for (Rust target-triple syntax) -targets = ["aarch64-apple-darwin", "x86_64-apple-darwin", "x86_64-unknown-linux-gnu", "x86_64-pc-windows-msvc"] +targets = [ + "aarch64-apple-darwin", + "x86_64-apple-darwin", + "x86_64-unknown-linux-gnu", + "x86_64-pc-windows-msvc", +] # Publish jobs to run in CI pr-run-mode = "plan" diff --git a/src/context_diff.rs b/src/context_diff.rs index 873fc3d..8e568eb 100644 --- a/src/context_diff.rs +++ b/src/context_diff.rs @@ -6,6 +6,7 @@ use std::collections::VecDeque; use std::io::Write; +use crate::engine::{self, Edit}; use crate::params::Params; use crate::utils::do_write_line; use crate::utils::get_modification_time; @@ -77,9 +78,9 @@ fn make_diff( // Rust only allows allocations to grow to isize::MAX, and this is bigger than that. let mut expected_lines_change_idx: usize = !0; - for result in diff::slice(&expected_lines, &actual_lines) { + for result in engine::diff(&expected_lines, &actual_lines) { match result { - diff::Result::Left(str) => { + Edit::Delete(str) => { if lines_since_mismatch > context_size && lines_since_mismatch > 0 { results.push(mismatch); mismatch = Mismatch::new( @@ -101,7 +102,7 @@ fn make_diff( line_number_expected += 1; lines_since_mismatch = 0; } - diff::Result::Right(str) => { + Edit::Insert(str) => { if lines_since_mismatch > context_size && lines_since_mismatch > 0 { results.push(mismatch); mismatch = Mismatch::new( @@ -132,7 +133,7 @@ fn make_diff( line_number_actual += 1; lines_since_mismatch = 0; } - diff::Result::Both(str, _) => { + Edit::Keep(str) => { expected_lines_change_idx = !0; // if one of them is missing a newline and the other isn't, then they don't actually match if (line_number_actual > actual_lines_count) @@ -616,6 +617,7 @@ mod tests { let _ = fb; let output = Command::new("patch") .arg("-p0") + .arg(format!("{target}/alefx")) .arg("--context") .stdin(File::open(format!("{target}/abx.diff")).unwrap()) .output() diff --git a/src/ed_diff.rs b/src/ed_diff.rs index b8cdbc5..9b3ff78 100644 --- a/src/ed_diff.rs +++ b/src/ed_diff.rs @@ -5,6 +5,7 @@ use std::io::Write; +use crate::engine::{self, Edit}; use crate::params::Params; use crate::utils::do_write_line; @@ -71,9 +72,9 @@ fn make_diff(expected: &[u8], actual: &[u8], stop_early: bool) -> Result { + Edit::Delete(str) => { if !mismatch.actual.is_empty() { results.push(mismatch); mismatch = Mismatch::new(line_number_expected, line_number_actual); @@ -81,11 +82,11 @@ fn make_diff(expected: &[u8], actual: &[u8], stop_early: bool) -> Result { + Edit::Insert(str) => { mismatch.actual.push(str.to_vec()); line_number_actual += 1; } - diff::Result::Both(_str, _) => { + Edit::Keep(_str) => { line_number_expected += 1; line_number_actual += 1; if !mismatch.actual.is_empty() || !mismatch.expected.is_empty() { diff --git a/src/engine.rs b/src/engine.rs new file mode 100644 index 0000000..56d9d14 --- /dev/null +++ b/src/engine.rs @@ -0,0 +1,514 @@ +// This file is part of the uutils diffutils package. +// +// For the full copyright and license information, please view the LICENSE-* +// files that was distributed with this source code. + +// This engine implements the Myers diff algorithm, which uses a double-ended +// diagonal search to identify the longest common subsequence (LCS) between two +// collections. The original paper can be found here: +// +// https://link.springer.com/article/10.1007/BF01840446 +// +// Unlike a naive LCS implementation, which covers all possible combinations, +// the Myers algorithm gradualy expands the search space, and only encodes +// the furthest progress made by each diagonal rather than storing each step +// of the search on a matrix. +// +// This makes it a lot more memory-efficient, as it only needs 2 * (m + n) +// positions to represent the state of the search, where m and n are the number +// of items in the collections being compared, whereas the naive LCS requires +// m * n positions. +// +// The downside is it is more compute-intensive than the naive method when +// searching through very different files. This may lead to unnacceptable run +// time in pathological cases (large, completely different files), so heuristics +// are often used to bail on the search if it gets too costly and/or a good enough +// subsequence has been found. +// +// We implement 3 main heuristics that are also used by GNU diff: +// +// 1. if we found a large enough common subsequence (also known as a 'snake') +// and have searched for a while, we return that one +// +// 2. if we have searched for a significant chunk of the collections (with a +// minimum of 4096 iterations, so we cover easy cases fully) and have not found +// one, we use whatever we have, even if it is a small snake or no snake at all +// +// 3. we keep track of the overall cost of the various searches that are done +// over the course of the divide and conquer strategy, and if that becomes too +// large we give up on trying to find long similarities altogether +// +// This last heuristic could be improved significantly in the future if we +// implement an optimization that separates items that only appear in either +// collection and remove them from the diffing process, like GNU diff does. +use std::fmt::Debug; +use std::ops::{Index, IndexMut, RangeInclusive}; + +use rand::Rng as _; +use tracing::{info, instrument, trace, Level}; + +#[derive(Debug, Default, PartialEq)] +struct Snake { + x: usize, + y: usize, + length: usize, +} + +impl Snake { + fn is_good(&self) -> bool { + // This magic number comes from GNU diff. + self.length > 20 + } + + fn maybe_update(&mut self, x: isize, y: isize, length: isize) { + let length = length as usize; + if length > self.length { + trace!(x = x, y = y, length = length, "new best snake"); + self.x = x as usize; + self.y = y as usize; + self.length = length; + } + } + + fn maybe_set(&mut self, x: isize, y: isize) { + if self.length == 0 { + self.x = x as usize; + self.y = y as usize; + } + } +} + +#[instrument(skip_all)] +fn find_split_point>>( + left: &[T], + right: &[T], + total_cost: &mut usize, +) -> Snake { + let left_length = left.len() as isize; + let right_length = right.len() as isize; + + let max_cost = left_length + right_length; + + // This constant is the value used by GNU diff; using it should give us + // more similar diffs. + const HIGH_COST: isize = 200; + + // This magic number was borrowed from GNU diff - apparently this is a + // good number for modern CPUs. + let too_expensive: isize = ((max_cost as f64).sqrt() as isize).max(4096); + info!(too_expensive = too_expensive); + + // We've been constantly hitting the too expensive heuristic, this means the + // files are too different for us to get a good diff in reasonable amount of + // time. Do naive splits from now on. + if *total_cost as isize > too_expensive * 10 { + info!( + total_cost = total_cost, + "hit too costly overall heuristic, creating naive split" + ); + let mut rng = rand::thread_rng(); + let x = if left_length == 0 { + 0 + } else { + rng.gen_range(0..left.len()) + }; + let y = if right_length == 0 { + 0 + } else { + rng.gen_range(0..right.len()) + }; + return Snake { x, y, length: 0 }; + } + + // For collections of different sizes, the diagonals will not neatly balance. That means the + // "middle" diagonal for the backwards search will be offset from the forward one, so we need + // to keep track of that so we start at the right point. + let backwards_mid = left_length - right_length; + + // Since we explore in steps of 2, if the offset mentioned above is odd the diagonals will + // not align during exploration. We use this to know if we check for meeting in the middle + // in the forwards or backwards search. + let ends_align = backwards_mid & 1 != 1; + + trace!(backwards_mid = backwards_mid, ends_align = ends_align); + + // The diagonals are initialized with values that are outside of the limits of the expected + // values so that the edit choices at the frontiers are always correct. We set the values at + // the mid diagonals to their correct initial values, though. + // + // The conceptual model of this algorithm is that 'left' is the title row of a matrix, and + // 'right' is the title column. The vector positions represent the best value of x we managed + // to achieve so far for each of those diagonals, rather then filling in the whole matrix. Note + // that "best" will be "the highest" for forward searches, and "the lowest" for backward, since + // we start from the high end on that one. + // + // Let's focus on the forward one, with x as an index for 'left', y as index for 'right', and + // d as the index of the vector. At the start, d = 0 means x = 0, y = 0, no offsets. If we go + // to the previous position on the vector, that conceptually means increasing the offset of y, + // since its value is derived from x - d. Offsetting 'right' means we are exploring an insertion. + // Going to the next position on the other hand means we are decreasing the offset of y, which + // means we are exploring a deletion (offsetting x relative to y). + let mut forward_diagonals = Diagonals::new(-1isize, left_length, right_length); + forward_diagonals[0] = 0; + + let mut backward_diagonals = Diagonals::new(isize::MAX, left_length, right_length); + backward_diagonals[backwards_mid] = left_length; + + let in_bounds = |x: isize, y: isize, offset: isize| -> bool { + x >= offset && y >= offset && x < left_length + offset && y < right_length + offset + }; + + let mut best_snake = Snake::default(); + + let forward_span = tracing::span!(Level::TRACE, "forward"); + let backward_span = tracing::span!(Level::TRACE, "backward"); + 'outer: for c in 1..max_cost { + *total_cost += 1; + + info!(c = c, snake_length = best_snake.length); + // The files appear to be large and too different. Go for good enough + if c > too_expensive { + break 'outer; + } + + // Forwards search + forward_diagonals.expand_search(); + let fwd = forward_span.enter(); + trace!( + low = forward_diagonals.search_range.start(), + high = forward_diagonals.search_range.end(), + "search space" + ); + for d in forward_diagonals.search_range.clone().rev().step_by(2) { + let mut x = if forward_diagonals[d - 1] < forward_diagonals[d + 1] { + trace!( + insertion = forward_diagonals[d - 1], + deletion = forward_diagonals[d + 1], + "exploring deletion" + ); + forward_diagonals[d + 1] + } else { + trace!( + insertion = forward_diagonals[d - 1], + deletion = forward_diagonals[d + 1], + "exploring insertion" + ); + forward_diagonals[d - 1] + 1 + }; + debug_assert!(x != -1); + + let initial_x = x; + let mut y = x - d; + + trace!(d = d, x = x, y = y, "before snaking"); + while in_bounds(x, y, 0) && left[x as usize] == right[y as usize] { + x += 1; + y += 1; + } + trace!(d = d, x = x, y = y, "after snaking"); + + forward_diagonals[d] = x; + + let snake_length = x - initial_x; + best_snake.maybe_update(initial_x, y - snake_length, snake_length); + + if !ends_align + && backward_diagonals.search_range.contains(&d) + && x >= backward_diagonals[d] + { + trace!("met backward at mid point"); + best_snake.maybe_set(x, y); + break 'outer; + } + } + drop(fwd); + + // Backwards search + backward_diagonals.expand_search(); + let bwd = backward_span.enter(); + trace!( + low = backward_diagonals.search_range.start(), + high = backward_diagonals.search_range.end(), + "search space" + ); + for d in backward_diagonals.search_range.clone().rev().step_by(2) { + // If we hit this assert we went outside the explored boundaries. + debug_assert!( + backward_diagonals[d - 1] != isize::MAX || backward_diagonals[d + 1] != isize::MAX + ); + let mut x = if backward_diagonals[d - 1] < backward_diagonals[d + 1] { + trace!( + insertion = backward_diagonals[d - 1], + deletion = backward_diagonals[d + 1], + "exploring insertion" + ); + backward_diagonals[d - 1] + } else { + trace!( + insertion = backward_diagonals[d - 1], + deletion = backward_diagonals[d + 1], + "exploring deletion" + ); + backward_diagonals[d + 1] - 1 + }; + + let initial_x = x; + let mut y = x - d; + + trace!(d = d, x = x, y = y, "before snaking"); + while in_bounds(x, y, 1) && left[x as usize - 1] == right[y as usize - 1] { + x -= 1; + y -= 1; + } + trace!(d = d, x = x, y = y, "after snaking"); + + backward_diagonals[d] = x; + + best_snake.maybe_update(x, y, initial_x - x); + + if ends_align + && forward_diagonals.search_range.contains(&d) + && x <= forward_diagonals[d] + { + trace!("met forward at mid point"); + best_snake.maybe_set(x, y); + break 'outer; + } + } + drop(bwd); + + if c > HIGH_COST && best_snake.is_good() { + info!("met criteria for high cost with good snake heuristic"); + break 'outer; + } + } + + // If we hit this condition, the search ran too long and found 0 matches. + // Get the best we can do as a split point - furthest diagonal. + if best_snake.length == 0 { + let (x, y) = forward_diagonals.get_furthest_progress(); + best_snake.x = x; + best_snake.y = y; + } + + info!( + x = best_snake.x, + y = best_snake.y, + length = best_snake.length, + "** DONE best snake:" + ); + best_snake +} + +// Delete: we skip that line from 'left' +// Insert: we add that line from 'right' +// Keep: both have that line, leave untouched +#[derive(Debug)] +pub enum Edit<'a, T: Debug + PartialEq> { + Delete(&'a T), + Insert(&'a T), + Keep(&'a T), +} + +#[instrument(skip_all)] +pub fn diff<'a, T: Clone + Debug + PartialEq + Into>>( + left: &'a [T], + right: &'a [T], +) -> Vec> { + trace!(left_length = left.len(), right_length = right.len()); + let mut edits = vec![]; + let mut total_cost = 0; + do_diff(left, right, &mut edits, &mut total_cost); + edits +} + +#[instrument(skip_all)] +fn do_diff<'a, T: Clone + Debug + PartialEq + Into>>( + left: &'a [T], + right: &'a [T], + edits: &mut Vec>, + total_cost: &mut usize, +) { + if left.is_empty() { + right.iter().for_each(|r| edits.push(Edit::Insert(r))); + return; + } else if right.is_empty() { + left.iter().for_each(|l| edits.push(Edit::Delete(l))); + return; + } + + // Add leading matches to our edits while finding them. + let leading_matches = left + .iter() + .zip(right.iter()) + .take_while(|(l, r)| l == r) + .map(|(l, _)| edits.push(Edit::Keep(l))) + .count(); + + // We need to hold on to add the trailing ones to keep ordering + // so just calculate how many there are. + let trailing_matches = left[leading_matches..] + .iter() + .rev() + .zip(right[leading_matches..].iter().rev()) + .take_while(|(l, r)| l == r) + .count(); + + trace!( + leading_matches = leading_matches, + trailing_matches = trailing_matches + ); + + let left_remaining = &left[leading_matches..left.len() - trailing_matches]; + let right_remaining = &right[leading_matches..right.len() - trailing_matches]; + + let snake = find_split_point(left_remaining, right_remaining, total_cost); + + trace!(x = snake.x, y = snake.y, length = snake.length, "snake"); + + // No more matches were found, do all deletions / insertions. + if snake.length == 0 { + left_remaining + .iter() + .for_each(|l| edits.push(Edit::Delete(l))); + right_remaining + .iter() + .for_each(|r| edits.push(Edit::Insert(r))); + } else { + // Divide and conquer based on the best snake we found. + let (l1, l2) = left_remaining.split_at(snake.x); + let (r1, r2) = right_remaining.split_at(snake.y); + + trace!( + a = l1.len(), + b = r1.len(), + a = l2.len(), + b = r2.len(), + "split" + ); + + do_diff(l1, r1, edits, total_cost); + do_diff(l2, r2, edits, total_cost); + } + + // Finally add the trailing matches. + left[left.len() - trailing_matches..] + .iter() + .for_each(|l| edits.push(Edit::Keep(l))); +} + +struct Diagonals { + data: Vec, + center: usize, + search_range: RangeInclusive, + + min_diag: isize, + max_diag: isize, +} + +impl Debug for Diagonals { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + for (i, v) in self.data[..self.center].iter().enumerate() { + let _ = write!(f, "({}: {v})", i as isize - self.center as isize); + } + + let _ = writeln!(f, "\ncenter: ({}: {})", self.center, self.data[self.center]); + + for (i, v) in self.data[self.center + 1..].iter().enumerate() { + let _ = write!(f, "({}: {v})", i + 1); + } + + Ok(()) + } +} + +impl Diagonals { + pub fn new(filler: isize, left_length: isize, right_length: isize) -> Self { + let size = left_length + .checked_add(right_length) + .and_then(|s| s.checked_add(3)); + let Some(size) = size else { + panic!("Tried to create Diagonals of a size we cannot represent: {left_length} + {right_length} + 3"); + }; + + // Our internal representaiton has 3 more positions than the sum of the lengths. + // That is because we always look at diagonals of either side when evaluating a + // diagonal, so we need room for an "out of bounds" value when checking the extremes. + // We also need room to represent the middle diagonal at the middle. + let mid_diag = left_length - right_length; + Self { + data: vec![filler; size as usize], + center: (right_length + 1) as usize, + search_range: mid_diag..=mid_diag, + + min_diag: -(right_length), + max_diag: left_length, + } + } + + fn actual_index(&self, index: isize) -> usize { + (self.center as isize + index) as usize + } + + fn in_bounds(&self, index: isize) -> bool { + let actual = self.center as isize + index; + actual >= 0 && (actual as usize) < self.data.len() + } + + fn get_furthest_progress(&self) -> (usize, usize) { + let (d, x) = self + .data + .iter() + .enumerate() + .filter(|(d, &x)| x - (*d as isize) >= 0) + .max_by_key(|(_, &x)| x) + .map(|(i, x)| (i as isize, *x)) + .unwrap_or((0isize, 0isize)); + let y = x - d; + debug_assert!(x >= 0); + debug_assert!(y >= 0); + (x as usize, y as usize) + } + + fn expand_search(&mut self) { + let upper = if *self.search_range.end() == self.max_diag { + self.search_range.end() - 1 + } else { + self.search_range.end() + 1 + }; + let lower = (self.search_range.start() - 1).max(self.min_diag); + + trace!( + min_diag = self.min_diag, + max_diag = self.max_diag, + prev_lower = self.search_range.start(), + prev_upper = self.search_range.end(), + new_lower = lower, + new_upper = upper, + ); + + self.search_range = lower..=upper; + } +} + +impl Index for Diagonals { + type Output = isize; + + fn index(&self, index: isize) -> &Self::Output { + if !self.in_bounds(index) { + panic!("Index out of bounds: {} for SignedVec", index); + } + let actual_index = self.actual_index(index); + &self.data[actual_index] + } +} + +impl IndexMut for Diagonals { + fn index_mut(&mut self, index: isize) -> &mut Self::Output { + if !self.in_bounds(index) { + panic!("Index out of bounds: {} for SignedVec", index); + } + let actual_index = self.actual_index(index); + &mut self.data[actual_index] + } +} diff --git a/src/lib.rs b/src/lib.rs index a20ac56..bf9e507 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,7 @@ pub mod cmp; pub mod context_diff; pub mod ed_diff; +pub mod engine; pub mod macros; pub mod normal_diff; pub mod params; diff --git a/src/main.rs b/src/main.rs index 8194d00..a6a08d9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,6 +15,7 @@ mod cmp; mod context_diff; mod diff; mod ed_diff; +mod engine; mod macros; mod normal_diff; mod params; @@ -52,6 +53,8 @@ fn second_arg_error(name: &OsStr) -> ! { } fn main() -> ExitCode { + tracing_subscriber::fmt::init(); + let mut args = std::env::args_os().peekable(); let exe_path = binary_path(&mut args); diff --git a/src/normal_diff.rs b/src/normal_diff.rs index 002cd01..ebb7a3e 100644 --- a/src/normal_diff.rs +++ b/src/normal_diff.rs @@ -5,6 +5,7 @@ use std::io::Write; +use crate::engine::{self, Edit}; use crate::params::Params; use crate::utils::do_write_line; @@ -54,9 +55,9 @@ fn make_diff(expected: &[u8], actual: &[u8], stop_early: bool) -> Vec actual_lines.pop(); } - for result in diff::slice(&expected_lines, &actual_lines) { + for result in engine::diff(&expected_lines, &actual_lines) { match result { - diff::Result::Left(str) => { + Edit::Delete(str) => { if !mismatch.actual.is_empty() && !mismatch.actual_missing_nl { results.push(mismatch); mismatch = Mismatch::new(line_number_expected, line_number_actual); @@ -65,12 +66,12 @@ fn make_diff(expected: &[u8], actual: &[u8], stop_early: bool) -> Vec mismatch.expected_missing_nl = line_number_expected > expected_lines_count; line_number_expected += 1; } - diff::Result::Right(str) => { + Edit::Insert(str) => { mismatch.actual.push(str.to_vec()); mismatch.actual_missing_nl = line_number_actual > actual_lines_count; line_number_actual += 1; } - diff::Result::Both(str, _) => { + Edit::Keep(str) => { match ( line_number_expected > expected_lines_count, line_number_actual > actual_lines_count, diff --git a/src/unified_diff.rs b/src/unified_diff.rs index 0f504a8..abdb5dd 100644 --- a/src/unified_diff.rs +++ b/src/unified_diff.rs @@ -6,6 +6,7 @@ use std::collections::VecDeque; use std::io::Write; +use crate::engine::{self, Edit}; use crate::params::Params; use crate::utils::do_write_line; use crate::utils::get_modification_time; @@ -65,9 +66,9 @@ fn make_diff( actual_lines.pop(); } - for result in diff::slice(&expected_lines, &actual_lines) { + for result in engine::diff(&expected_lines, &actual_lines) { match result { - diff::Result::Left(str) => { + Edit::Delete(str) => { if lines_since_mismatch >= context_size && lines_since_mismatch > 0 { results.push(mismatch); mismatch = Mismatch::new( @@ -104,7 +105,7 @@ fn make_diff( line_number_expected += 1; lines_since_mismatch = 0; } - diff::Result::Right(str) => { + Edit::Insert(str) => { if lines_since_mismatch >= context_size && lines_since_mismatch > 0 { results.push(mismatch); mismatch = Mismatch::new( @@ -125,7 +126,7 @@ fn make_diff( line_number_actual += 1; lines_since_mismatch = 0; } - diff::Result::Both(str, _) => { + Edit::Keep(str) => { // if one of them is missing a newline and the other isn't, then they don't actually match if (line_number_actual > actual_lines_count) && (line_number_expected > expected_lines_count) @@ -771,6 +772,7 @@ mod tests { let _ = fb; let output = Command::new("patch") .arg("-p0") + .arg(format!("{target}/alefx")) .stdin(File::open(format!("{target}/abx.diff")).unwrap()) .output() .unwrap(); diff --git a/tests/integration.rs b/tests/integration.rs index c11726e..40b497c 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -95,6 +95,8 @@ mod common { } mod diff { + use std::process::Stdio; + use diffutilslib::assert_diff_eq; use super::*; @@ -341,6 +343,97 @@ mod diff { Ok(()) } + + fn str_bar_diff(bar: &[u8]) -> String { + String::from_utf8( + bar.split(|b| *b == b'\n') + .filter(|b| b != b"") + .flat_map(|b| [b">", b" ", b, b"\n"].concat()) + .collect::>(), + ) + .unwrap() + } + + #[test] + fn large_similar_files() -> Result<(), Box> { + // Large similar files should still produce ideal diffs, not + // triggering the total cost heuristic. + let foo = b"f\n".repeat(16 * 1024 * 1024); + let bar = b"b\n".repeat(26); + + let mut file1 = NamedTempFile::new()?; + file1.write_all(&foo)?; + file1.write_all(&foo)?; + let mut file2 = NamedTempFile::new()?; + file2.write_all(&foo)?; + file2.write_all(&bar)?; + file2.write_all(&foo)?; + + let mut cmd = Command::cargo_bin("diffutils")?; + cmd.arg("diff").arg(file1.path()).arg(file2.path()); + cmd.assert() + .code(predicate::eq(1)) + .failure() + .stdout(predicate::eq(format!( + "16777216a16777217,16777242\n{}", + str_bar_diff(&bar) + ))); + + let mut file1 = NamedTempFile::new()?; + file1.write_all(&bar)?; + file1.write_all(&foo)?; + file1.write_all(&foo)?; + let mut file2 = NamedTempFile::new()?; + file2.write_all(&foo)?; + file2.write_all(&foo)?; + + let mut cmd = Command::cargo_bin("diffutils")?; + cmd.arg("diff").arg(file1.path()).arg(file2.path()); + cmd.assert() + .code(predicate::eq(1)) + .failure() + .stdout(predicate::eq(format!( + "1,26d0\n{}", + str_bar_diff(&bar).replace(">", "<") + ))); + + Ok(()) + } + + #[test] + fn large_different_files() -> Result<(), Box> { + let foo = b"f\n".repeat(4 * 1024 * 1024); + let bar = b"b\n".repeat(26); + let baz = b"z\n".repeat(4 * 1024 * 1024); + + let mut file1 = NamedTempFile::new()?; + file1.write_all(&foo)?; + file1.write_all(&bar)?; + let mut file2 = NamedTempFile::new()?; + file2.write_all(&baz)?; + file2.write_all(&bar)?; + + let mut child = std::process::Command::new(assert_cmd::cargo::cargo_bin("diffutils")) + .arg("diff") + .arg(file1.path()) + .arg(file2.path()) + .stdout(Stdio::null()) + .spawn() + .unwrap(); + + // The total cost heuristic should give up trying to find good split points + // in a reasonable amount of time (can still be a fairly big in debug builds) + for retries in 0.. { + std::thread::sleep(std::time::Duration::from_secs(1)); + if let Some(status) = child.try_wait()? { + assert_eq!(status.code(), Some(1)); + break; + } + assert!(retries < 10); + } + + Ok(()) + } } mod cmp {