diff --git a/BENCHMARKING.md b/BENCHMARKING.md
new file mode 100644
index 0000000..01a9736
--- /dev/null
+++ b/BENCHMARKING.md
@@ -0,0 +1,63 @@
+# Benchmarking diff
+
+The engine used by our diff tool tries to balance execution time with patch
+quality. It implements the Myers algorithm with a few heuristics which are also
+used by GNU diff to avoid pathological cases.
+
+The original paper can be found here:
+- https://link.springer.com/article/10.1007/BF01840446
+
+Currently, not all tricks used by GNU diff are adopted by our implementation.
+For instance, GNU diff will isolate lines that only exist in one of the files
+and not include them in the diffing process. It also does post-processing of the
+edits to produce more cohesive hunks. Both of these combined should make it
+produce better patches for large files which are very different.
+
+Run `cargo build --release` before benchmarking after you make a change!
+
+## How to benchmark
+
+It is recommended that you use the 'hyperfine' tool to run your benchmarks. This
+is an example of how to run a comparison with GNU diff:
+
+```
+> hyperfine -N -i --warmup 2 --output=pipe 'diff t/huge t/huge.3' \
+'./target/release/diffutils diff t/huge t/huge.3'
+Benchmark 1: diff t/huge t/huge.3
+  Time (mean ± σ):     136.3 ms ±   3.0 ms    [User: 88.5 ms, System: 17.9 ms]
+  Range (min … max):   131.8 ms … 144.4 ms    21 runs
+
+  Warning: Ignoring non-zero exit code.
+
+Benchmark 2: ./target/release/diffutils diff t/huge t/huge.3
+  Time (mean ± σ):      74.4 ms ±   1.0 ms    [User: 47.6 ms, System: 24.9 ms]
+  Range (min … max):    72.9 ms …  77.1 ms    41 runs
+
+  Warning: Ignoring non-zero exit code.
+
+Summary
+  ./target/release/diffutils diff t/huge t/huge.3 ran
+    1.83 ± 0.05 times faster than diff t/huge t/huge.3
+>
+```
+
+As you can see, you should provide both commands you want to compare in a single
+invocation of 'hyperfine', each as a single argument, so use quotes. These are
+the relevant parameters:
+
+- -N: avoids using a shell as intermediary to run the command
+- -i: ignores non-zero exit code, which diff uses to mean files differ
+- --warmup 2: 2 runs before measuring, warms up I/O cache for large files
+- --output=pipe: disable any potential optimizations based on output destination
+
+## Inputs
+
+Performance will vary based on several factors, the main ones being:
+
+- how large the files being compared are
+- how different the files being compared are
+- how large and far between sequences of equal lines are
+
+When looking at performance improvements, testing small and large (tens of MBs)
+files which have few differences, many differences, or are completely different
+is important to cover all of the potential pathological cases.
diff --git a/Cargo.lock b/Cargo.lock
index fe461de..0dd14f1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -67,7 +67,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc"
 dependencies = [
  "memchr",
- "regex-automata",
+ "regex-automata 0.4.8",
  "serde",
 ]
 
@@ -77,6 +77,12 @@ version = "3.15.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7ff69b9dd49fd426c69a0db9fc04dd934cdb6645ff000864d98f7e2af8830eaa"
 
+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
 [[package]]
 name = "cc"
 version = "1.0.90"
@@ -127,13 +133,15 @@ version = "0.4.2"
 dependencies = [
  "assert_cmd",
  "chrono",
- "diff",
  "itoa",
  "predicates",
  "pretty_assertions",
+ "rand",
  "regex",
  "same-file",
  "tempfile",
+ "tracing",
+ "tracing-subscriber",
  "unicode-width",
 ]
 
@@ -168,6 +176,17 @@ dependencies = [
  "num-traits",
 ]
 
+[[package]]
+name = "getrandom"
+version = "0.2.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasi",
+]
+
 [[package]]
 name = "iana-time-zone"
 version = "0.1.60"
@@ -206,6 +225,12 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "lazy_static"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
+
 [[package]]
 name = "libc"
 version = "0.2.159"
@@ -224,6 +249,15 @@ version = "0.4.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
 
+[[package]]
+name = "matchers"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
+dependencies = [
+ "regex-automata 0.1.10",
+]
+
 [[package]]
 name = "memchr"
 version = "2.7.1"
@@ -236,6 +270,16 @@ version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be"
 
+[[package]]
+name = "nu-ansi-term"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
+dependencies = [
+ "overload",
+ "winapi",
+]
+
 [[package]]
 name = "num-traits"
 version = "0.2.18"
@@ -251,6 +295,27 @@ version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
 
+[[package]]
+name = "overload"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
+
+[[package]]
+name = "pin-project-lite"
+version = "0.2.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff"
+
+[[package]]
+name = "ppv-lite86"
+version = "0.2.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
+dependencies = [
+ "zerocopy",
+]
+
 [[package]]
 name = "predicates"
 version = "3.1.2"
@@ -309,6 +374,36 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "rand"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
+dependencies = [
+ "libc",
+ "rand_chacha",
+ "rand_core",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
+dependencies = [
+ "ppv-lite86",
+ "rand_core",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+dependencies = [
+ "getrandom",
+]
+
 [[package]]
 name = "regex"
 version = "1.11.0"
@@ -317,8 +412,17 @@ checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8"
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-automata",
- "regex-syntax",
+ "regex-automata 0.4.8",
+ "regex-syntax 0.8.5",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
+dependencies = [
+ "regex-syntax 0.6.29",
 ]
 
 [[package]]
@@ -329,9 +433,15 @@ checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3"
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-syntax",
+ "regex-syntax 0.8.5",
 ]
 
+[[package]]
+name = "regex-syntax"
+version = "0.6.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
+
 [[package]]
 name = "regex-syntax"
 version = "0.8.5"
@@ -380,6 +490,21 @@ dependencies = [
  "syn",
 ]
 
+[[package]]
+name = "sharded-slab"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
+dependencies = [
+ "lazy_static",
+]
+
+[[package]]
+name = "smallvec"
+version = "1.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
+
 [[package]]
 name = "syn"
 version = "2.0.50"
@@ -410,6 +535,77 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
 
+[[package]]
+name = "thread_local"
+version = "1.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+]
+
+[[package]]
+name = "tracing"
+version = "0.1.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
+dependencies = [
+ "pin-project-lite",
+ "tracing-attributes",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-attributes"
+version = "0.1.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
+dependencies = [
+ "once_cell",
+ "valuable",
+]
+
+[[package]]
+name = "tracing-log"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
+dependencies = [
+ "log",
+ "once_cell",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-subscriber"
+version = "0.3.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b"
+dependencies = [
+ "matchers",
+ "nu-ansi-term",
+ "once_cell",
+ "regex",
+ "sharded-slab",
+ "smallvec",
+ "thread_local",
+ "tracing",
+ "tracing-core",
+ "tracing-log",
+]
+
 [[package]]
 name = "unicode-ident"
 version = "1.0.12"
@@ -422,6 +618,12 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
 
+[[package]]
+name = "valuable"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
+
 [[package]]
 name = "wait-timeout"
 version = "0.2.0"
@@ -431,6 +633,12 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "wasi"
+version = "0.11.0+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
+
 [[package]]
 name = "wasm-bindgen"
 version = "0.2.92"
@@ -612,3 +820,24 @@ name = "yansi"
 version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049"
+
+[[package]]
+name = "zerocopy"
+version = "0.7.35"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
+dependencies = [
+ "byteorder",
+ "zerocopy-derive",
+]
+
+[[package]]
+name = "zerocopy-derive"
+version = "0.7.35"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
diff --git a/Cargo.toml b/Cargo.toml
index 6fa1a3c..b94725b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,11 +16,13 @@ path = "src/main.rs"
 
 [dependencies]
 chrono = "0.4.38"
-diff = "0.1.13"
 itoa = "1.0.11"
 regex = "1.10.4"
 same-file = "1.0.6"
 unicode-width = "0.2.0"
+tracing = "0.1.40"
+tracing-subscriber = { version = "0.3", features = ["env-filter"] }
+rand = "0.8.5"
 
 [dev-dependencies]
 pretty_assertions = "1.4.0"
@@ -42,6 +44,11 @@ ci = ["github"]
 # The installers to generate for each app
 installers = []
 # Target platforms to build apps for (Rust target-triple syntax)
-targets = ["aarch64-apple-darwin", "x86_64-apple-darwin", "x86_64-unknown-linux-gnu", "x86_64-pc-windows-msvc"]
+targets = [
+    "aarch64-apple-darwin",
+    "x86_64-apple-darwin",
+    "x86_64-unknown-linux-gnu",
+    "x86_64-pc-windows-msvc",
+]
 # Publish jobs to run in CI
 pr-run-mode = "plan"
diff --git a/src/context_diff.rs b/src/context_diff.rs
index 873fc3d..8e568eb 100644
--- a/src/context_diff.rs
+++ b/src/context_diff.rs
@@ -6,6 +6,7 @@
 use std::collections::VecDeque;
 use std::io::Write;
 
+use crate::engine::{self, Edit};
 use crate::params::Params;
 use crate::utils::do_write_line;
 use crate::utils::get_modification_time;
@@ -77,9 +78,9 @@ fn make_diff(
     // Rust only allows allocations to grow to isize::MAX, and this is bigger than that.
     let mut expected_lines_change_idx: usize = !0;
 
-    for result in diff::slice(&expected_lines, &actual_lines) {
+    for result in engine::diff(&expected_lines, &actual_lines) {
         match result {
-            diff::Result::Left(str) => {
+            Edit::Delete(str) => {
                 if lines_since_mismatch > context_size && lines_since_mismatch > 0 {
                     results.push(mismatch);
                     mismatch = Mismatch::new(
@@ -101,7 +102,7 @@ fn make_diff(
                 line_number_expected += 1;
                 lines_since_mismatch = 0;
             }
-            diff::Result::Right(str) => {
+            Edit::Insert(str) => {
                 if lines_since_mismatch > context_size && lines_since_mismatch > 0 {
                     results.push(mismatch);
                     mismatch = Mismatch::new(
@@ -132,7 +133,7 @@ fn make_diff(
                 line_number_actual += 1;
                 lines_since_mismatch = 0;
             }
-            diff::Result::Both(str, _) => {
+            Edit::Keep(str) => {
                 expected_lines_change_idx = !0;
                 // if one of them is missing a newline and the other isn't, then they don't actually match
                 if (line_number_actual > actual_lines_count)
@@ -616,6 +617,7 @@ mod tests {
                                 let _ = fb;
                                 let output = Command::new("patch")
                                     .arg("-p0")
+                                    .arg(format!("{target}/alefx"))
                                     .arg("--context")
                                     .stdin(File::open(format!("{target}/abx.diff")).unwrap())
                                     .output()
diff --git a/src/ed_diff.rs b/src/ed_diff.rs
index b8cdbc5..9b3ff78 100644
--- a/src/ed_diff.rs
+++ b/src/ed_diff.rs
@@ -5,6 +5,7 @@
 
 use std::io::Write;
 
+use crate::engine::{self, Edit};
 use crate::params::Params;
 use crate::utils::do_write_line;
 
@@ -71,9 +72,9 @@ fn make_diff(expected: &[u8], actual: &[u8], stop_early: bool) -> Result<Vec<Mis
         return Err(DiffError::MissingNL);
     }
 
-    for result in diff::slice(&expected_lines, &actual_lines) {
+    for result in engine::diff(&expected_lines, &actual_lines) {
         match result {
-            diff::Result::Left(str) => {
+            Edit::Delete(str) => {
                 if !mismatch.actual.is_empty() {
                     results.push(mismatch);
                     mismatch = Mismatch::new(line_number_expected, line_number_actual);
@@ -81,11 +82,11 @@ fn make_diff(expected: &[u8], actual: &[u8], stop_early: bool) -> Result<Vec<Mis
                 mismatch.expected.push(str.to_vec());
                 line_number_expected += 1;
             }
-            diff::Result::Right(str) => {
+            Edit::Insert(str) => {
                 mismatch.actual.push(str.to_vec());
                 line_number_actual += 1;
             }
-            diff::Result::Both(_str, _) => {
+            Edit::Keep(_str) => {
                 line_number_expected += 1;
                 line_number_actual += 1;
                 if !mismatch.actual.is_empty() || !mismatch.expected.is_empty() {
diff --git a/src/engine.rs b/src/engine.rs
new file mode 100644
index 0000000..56d9d14
--- /dev/null
+++ b/src/engine.rs
@@ -0,0 +1,514 @@
+// This file is part of the uutils diffutils package.
+//
+// For the full copyright and license information, please view the LICENSE-*
+// files that was distributed with this source code.
+
+// This engine implements the Myers diff algorithm, which uses a double-ended
+// diagonal search to identify the longest common subsequence (LCS) between two
+// collections. The original paper can be found here:
+//
+// https://link.springer.com/article/10.1007/BF01840446
+//
+// Unlike a naive LCS implementation, which covers all possible combinations,
+// the Myers algorithm gradually expands the search space, and only encodes
+// the furthest progress made by each diagonal rather than storing each step
+// of the search on a matrix.
+//
+// This makes it a lot more memory-efficient, as it only needs 2 * (m + n)
+// positions to represent the state of the search, where m and n are the number
+// of items in the collections being compared, whereas the naive LCS requires
+// m * n positions.
+//
+// The downside is it is more compute-intensive than the naive method when
+// searching through very different files. This may lead to unacceptable run
+// time in pathological cases (large, completely different files), so heuristics
+// are often used to bail on the search if it gets too costly and/or a good enough
+// subsequence has been found.
+//
+// We implement 3 main heuristics that are also used by GNU diff:
+//
+// 1. if we found a large enough common subsequence (also known as a 'snake')
+// and have searched for a while, we return that one
+//
+// 2. if we have searched for a significant chunk of the collections (with a
+// minimum of 4096 iterations, so we cover easy cases fully) and have not found
+// one, we use whatever we have, even if it is a small snake or no snake at all
+//
+// 3. we keep track of the overall cost of the various searches that are done
+// over the course of the divide and conquer strategy, and if that becomes too
+// large we give up on trying to find long similarities altogether
+//
+// This last heuristic could be improved significantly in the future if we
+// implement an optimization that separates items that only appear in either
+// collection and remove them from the diffing process, like GNU diff does.
+use std::fmt::Debug;
+use std::ops::{Index, IndexMut, RangeInclusive};
+
+use rand::Rng as _;
+use tracing::{info, instrument, trace, Level};
+
+#[derive(Debug, Default, PartialEq)]
+struct Snake {
+    x: usize,
+    y: usize,
+    length: usize,
+}
+
+impl Snake {
+    fn is_good(&self) -> bool {
+        // This magic number comes from GNU diff.
+        self.length > 20
+    }
+
+    fn maybe_update(&mut self, x: isize, y: isize, length: isize) {
+        let length = length as usize;
+        if length > self.length {
+            trace!(x = x, y = y, length = length, "new best snake");
+            self.x = x as usize;
+            self.y = y as usize;
+            self.length = length;
+        }
+    }
+
+    fn maybe_set(&mut self, x: isize, y: isize) {
+        if self.length == 0 {
+            self.x = x as usize;
+            self.y = y as usize;
+        }
+    }
+}
+
+#[instrument(skip_all)]
+fn find_split_point<T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
+    left: &[T],
+    right: &[T],
+    total_cost: &mut usize,
+) -> Snake {
+    let left_length = left.len() as isize;
+    let right_length = right.len() as isize;
+
+    let max_cost = left_length + right_length;
+
+    // This constant is the value used by GNU diff; using it should give us
+    // more similar diffs.
+    const HIGH_COST: isize = 200;
+
+    // This magic number was borrowed from GNU diff - apparently this is a
+    // good number for modern CPUs.
+    let too_expensive: isize = ((max_cost as f64).sqrt() as isize).max(4096);
+    info!(too_expensive = too_expensive);
+
+    // We've been constantly hitting the too expensive heuristic, this means the
+    // files are too different for us to get a good diff in a reasonable amount
+    // of time. Do naive splits from now on.
+    if *total_cost as isize > too_expensive * 10 {
+        info!(
+            total_cost = total_cost,
+            "hit too costly overall heuristic, creating naive split"
+        );
+        let mut rng = rand::thread_rng();
+        let x = if left_length == 0 {
+            0
+        } else {
+            rng.gen_range(0..left.len())
+        };
+        let y = if right_length == 0 {
+            0
+        } else {
+            rng.gen_range(0..right.len())
+        };
+        return Snake { x, y, length: 0 };
+    }
+
+    // For collections of different sizes, the diagonals will not neatly balance. That means the
+    // "middle" diagonal for the backwards search will be offset from the forward one, so we need
+    // to keep track of that so we start at the right point.
+    let backwards_mid = left_length - right_length;
+
+    // Since we explore in steps of 2, if the offset mentioned above is odd the diagonals will
+    // not align during exploration. We use this to know if we check for meeting in the middle
+    // in the forwards or backwards search.
+    let ends_align = backwards_mid & 1 != 1;
+
+    trace!(backwards_mid = backwards_mid, ends_align = ends_align);
+
+    // The diagonals are initialized with values that are outside of the limits of the expected
+    // values so that the edit choices at the frontiers are always correct. We set the values at
+    // the mid diagonals to their correct initial values, though.
+    //
+    // The conceptual model of this algorithm is that 'left' is the title row of a matrix, and
+    // 'right' is the title column. The vector positions represent the best value of x we managed
+    // to achieve so far for each of those diagonals, rather than filling in the whole matrix. Note
+    // that "best" will be "the highest" for forward searches, and "the lowest" for backward, since
+    // we start from the high end on that one.
+    //
+    // Let's focus on the forward one, with x as an index for 'left', y as index for 'right', and
+    // d as the index of the vector. At the start, d = 0 means x = 0, y = 0, no offsets. If we go
+    // to the previous position on the vector, that conceptually means increasing the offset of y,
+    // since its value is derived from x - d. Offsetting 'right' means we are exploring an insertion.
+    // Going to the next position on the other hand means we are decreasing the offset of y, which
+    // means we are exploring a deletion (offsetting x relative to y).
+    let mut forward_diagonals = Diagonals::new(-1isize, left_length, right_length);
+    forward_diagonals[0] = 0;
+
+    let mut backward_diagonals = Diagonals::new(isize::MAX, left_length, right_length);
+    backward_diagonals[backwards_mid] = left_length;
+
+    let in_bounds = |x: isize, y: isize, offset: isize| -> bool {
+        x >= offset && y >= offset && x < left_length + offset && y < right_length + offset
+    };
+
+    let mut best_snake = Snake::default();
+
+    let forward_span = tracing::span!(Level::TRACE, "forward");
+    let backward_span = tracing::span!(Level::TRACE, "backward");
+    'outer: for c in 1..max_cost {
+        *total_cost += 1;
+
+        info!(c = c, snake_length = best_snake.length);
+        // The files appear to be large and too different. Go for good enough
+        if c > too_expensive {
+            break 'outer;
+        }
+
+        // Forwards search
+        forward_diagonals.expand_search();
+        let fwd = forward_span.enter();
+        trace!(
+            low = forward_diagonals.search_range.start(),
+            high = forward_diagonals.search_range.end(),
+            "search space"
+        );
+        for d in forward_diagonals.search_range.clone().rev().step_by(2) {
+            let mut x = if forward_diagonals[d - 1] < forward_diagonals[d + 1] {
+                trace!(
+                    insertion = forward_diagonals[d - 1],
+                    deletion = forward_diagonals[d + 1],
+                    "exploring deletion"
+                );
+                forward_diagonals[d + 1]
+            } else {
+                trace!(
+                    insertion = forward_diagonals[d - 1],
+                    deletion = forward_diagonals[d + 1],
+                    "exploring insertion"
+                );
+                forward_diagonals[d - 1] + 1
+            };
+            debug_assert!(x != -1);
+
+            let initial_x = x;
+            let mut y = x - d;
+
+            trace!(d = d, x = x, y = y, "before snaking");
+            while in_bounds(x, y, 0) && left[x as usize] == right[y as usize] {
+                x += 1;
+                y += 1;
+            }
+            trace!(d = d, x = x, y = y, "after snaking");
+
+            forward_diagonals[d] = x;
+
+            let snake_length = x - initial_x;
+            best_snake.maybe_update(initial_x, y - snake_length, snake_length);
+
+            if !ends_align
+                && backward_diagonals.search_range.contains(&d)
+                && x >= backward_diagonals[d]
+            {
+                trace!("met backward at mid point");
+                best_snake.maybe_set(x, y);
+                break 'outer;
+            }
+        }
+        drop(fwd);
+
+        // Backwards search
+        backward_diagonals.expand_search();
+        let bwd = backward_span.enter();
+        trace!(
+            low = backward_diagonals.search_range.start(),
+            high = backward_diagonals.search_range.end(),
+            "search space"
+        );
+        for d in backward_diagonals.search_range.clone().rev().step_by(2) {
+            // If we hit this assert we went outside the explored boundaries.
+            debug_assert!(
+                backward_diagonals[d - 1] != isize::MAX || backward_diagonals[d + 1] != isize::MAX
+            );
+            let mut x = if backward_diagonals[d - 1] < backward_diagonals[d + 1] {
+                trace!(
+                    insertion = backward_diagonals[d - 1],
+                    deletion = backward_diagonals[d + 1],
+                    "exploring insertion"
+                );
+                backward_diagonals[d - 1]
+            } else {
+                trace!(
+                    insertion = backward_diagonals[d - 1],
+                    deletion = backward_diagonals[d + 1],
+                    "exploring deletion"
+                );
+                backward_diagonals[d + 1] - 1
+            };
+
+            let initial_x = x;
+            let mut y = x - d;
+
+            trace!(d = d, x = x, y = y, "before snaking");
+            while in_bounds(x, y, 1) && left[x as usize - 1] == right[y as usize - 1] {
+                x -= 1;
+                y -= 1;
+            }
+            trace!(d = d, x = x, y = y, "after snaking");
+
+            backward_diagonals[d] = x;
+
+            best_snake.maybe_update(x, y, initial_x - x);
+
+            if ends_align
+                && forward_diagonals.search_range.contains(&d)
+                && x <= forward_diagonals[d]
+            {
+                trace!("met forward at mid point");
+                best_snake.maybe_set(x, y);
+                break 'outer;
+            }
+        }
+        drop(bwd);
+
+        if c > HIGH_COST && best_snake.is_good() {
+            info!("met criteria for high cost with good snake heuristic");
+            break 'outer;
+        }
+    }
+
+    // If we hit this condition, the search ran too long and found 0 matches.
+    // Get the best we can do as a split point - furthest diagonal.
+    if best_snake.length == 0 {
+        let (x, y) = forward_diagonals.get_furthest_progress();
+        best_snake.x = x;
+        best_snake.y = y;
+    }
+
+    info!(
+        x = best_snake.x,
+        y = best_snake.y,
+        length = best_snake.length,
+        "** DONE best snake:"
+    );
+    best_snake
+}
+
+// Delete: we skip that line from 'left'
+// Insert: we add that line from 'right'
+// Keep: both have that line, leave untouched
+#[derive(Debug)]
+pub enum Edit<'a, T: Debug + PartialEq> {
+    Delete(&'a T),
+    Insert(&'a T),
+    Keep(&'a T),
+}
+
+#[instrument(skip_all)]
+pub fn diff<'a, T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
+    left: &'a [T],
+    right: &'a [T],
+) -> Vec<Edit<'a, T>> {
+    trace!(left_length = left.len(), right_length = right.len());
+    let mut edits = vec![];
+    let mut total_cost = 0;
+    do_diff(left, right, &mut edits, &mut total_cost);
+    edits
+}
+
+#[instrument(skip_all)]
+fn do_diff<'a, T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
+    left: &'a [T],
+    right: &'a [T],
+    edits: &mut Vec<Edit<'a, T>>,
+    total_cost: &mut usize,
+) {
+    if left.is_empty() {
+        right.iter().for_each(|r| edits.push(Edit::Insert(r)));
+        return;
+    } else if right.is_empty() {
+        left.iter().for_each(|l| edits.push(Edit::Delete(l)));
+        return;
+    }
+
+    // Add leading matches to our edits while finding them.
+    let leading_matches = left
+        .iter()
+        .zip(right.iter())
+        .take_while(|(l, r)| l == r)
+        .map(|(l, _)| edits.push(Edit::Keep(l)))
+        .count();
+
+    // We need to hold off on adding the trailing ones to preserve ordering,
+    // so just calculate how many there are.
+    let trailing_matches = left[leading_matches..]
+        .iter()
+        .rev()
+        .zip(right[leading_matches..].iter().rev())
+        .take_while(|(l, r)| l == r)
+        .count();
+
+    trace!(
+        leading_matches = leading_matches,
+        trailing_matches = trailing_matches
+    );
+
+    let left_remaining = &left[leading_matches..left.len() - trailing_matches];
+    let right_remaining = &right[leading_matches..right.len() - trailing_matches];
+
+    let snake = find_split_point(left_remaining, right_remaining, total_cost);
+
+    trace!(x = snake.x, y = snake.y, length = snake.length, "snake");
+
+    // No more matches were found, do all deletions / insertions.
+    if snake.length == 0 {
+        left_remaining
+            .iter()
+            .for_each(|l| edits.push(Edit::Delete(l)));
+        right_remaining
+            .iter()
+            .for_each(|r| edits.push(Edit::Insert(r)));
+    } else {
+        // Divide and conquer based on the best snake we found.
+        let (l1, l2) = left_remaining.split_at(snake.x);
+        let (r1, r2) = right_remaining.split_at(snake.y);
+
+        trace!(
+            a = l1.len(),
+            b = r1.len(),
+            a = l2.len(),
+            b = r2.len(),
+            "split"
+        );
+
+        do_diff(l1, r1, edits, total_cost);
+        do_diff(l2, r2, edits, total_cost);
+    }
+
+    // Finally add the trailing matches.
+    left[left.len() - trailing_matches..]
+        .iter()
+        .for_each(|l| edits.push(Edit::Keep(l)));
+}
+
+struct Diagonals {
+    data: Vec<isize>,
+    center: usize,
+    search_range: RangeInclusive<isize>,
+
+    min_diag: isize,
+    max_diag: isize,
+}
+
+impl Debug for Diagonals {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        for (i, v) in self.data[..self.center].iter().enumerate() {
+            let _ = write!(f, "({}: {v})", i as isize - self.center as isize);
+        }
+
+        let _ = writeln!(f, "\ncenter: ({}: {})", self.center, self.data[self.center]);
+
+        for (i, v) in self.data[self.center + 1..].iter().enumerate() {
+            let _ = write!(f, "({}: {v})", i + 1);
+        }
+
+        Ok(())
+    }
+}
+
+impl Diagonals {
+    pub fn new(filler: isize, left_length: isize, right_length: isize) -> Self {
+        let size = left_length
+            .checked_add(right_length)
+            .and_then(|s| s.checked_add(3));
+        let Some(size) = size else {
+            panic!("Tried to create Diagonals of a size we cannot represent: {left_length} + {right_length} + 3");
+        };
+
+        // Our internal representation has 3 more positions than the sum of the lengths.
+        // That is because we always look at diagonals of either side when evaluating a
+        // diagonal, so we need room for an "out of bounds" value when checking the extremes.
+        // We also need room to represent the middle diagonal at the middle.
+        let mid_diag = left_length - right_length;
+        Self {
+            data: vec![filler; size as usize],
+            center: (right_length + 1) as usize,
+            search_range: mid_diag..=mid_diag,
+
+            min_diag: -(right_length),
+            max_diag: left_length,
+        }
+    }
+
+    fn actual_index(&self, index: isize) -> usize {
+        (self.center as isize + index) as usize
+    }
+
+    fn in_bounds(&self, index: isize) -> bool {
+        let actual = self.center as isize + index;
+        actual >= 0 && (actual as usize) < self.data.len()
+    }
+
+    fn get_furthest_progress(&self) -> (usize, usize) {
+        let (d, x) = self
+            .data
+            .iter()
+            .enumerate()
+            .filter(|(d, &x)| x - (*d as isize) >= 0)
+            .max_by_key(|(_, &x)| x)
+            .map(|(i, x)| (i as isize, *x))
+            .unwrap_or((0isize, 0isize));
+        let y = x - d;
+        debug_assert!(x >= 0);
+        debug_assert!(y >= 0);
+        (x as usize, y as usize)
+    }
+
+    fn expand_search(&mut self) {
+        let upper = if *self.search_range.end() == self.max_diag {
+            self.search_range.end() - 1
+        } else {
+            self.search_range.end() + 1
+        };
+        let lower = (self.search_range.start() - 1).max(self.min_diag);
+
+        trace!(
+            min_diag = self.min_diag,
+            max_diag = self.max_diag,
+            prev_lower = self.search_range.start(),
+            prev_upper = self.search_range.end(),
+            new_lower = lower,
+            new_upper = upper,
+        );
+
+        self.search_range = lower..=upper;
+    }
+}
+
+impl Index<isize> for Diagonals {
+    type Output = isize;
+
+    fn index(&self, index: isize) -> &Self::Output {
+        if !self.in_bounds(index) {
+            panic!("Index out of bounds: {} for SignedVec", index);
+        }
+        let actual_index = self.actual_index(index);
+        &self.data[actual_index]
+    }
+}
+
+impl IndexMut<isize> for Diagonals {
+    fn index_mut(&mut self, index: isize) -> &mut Self::Output {
+        if !self.in_bounds(index) {
+            panic!("Index out of bounds: {} for SignedVec", index);
+        }
+        let actual_index = self.actual_index(index);
+        &mut self.data[actual_index]
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index a20ac56..bf9e507 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,6 +1,7 @@
 pub mod cmp;
 pub mod context_diff;
 pub mod ed_diff;
+pub mod engine;
 pub mod macros;
 pub mod normal_diff;
 pub mod params;
diff --git a/src/main.rs b/src/main.rs
index 8194d00..a6a08d9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -15,6 +15,7 @@ mod cmp;
 mod context_diff;
 mod diff;
 mod ed_diff;
+mod engine;
 mod macros;
 mod normal_diff;
 mod params;
@@ -52,6 +53,8 @@ fn second_arg_error(name: &OsStr) -> ! {
 }
 
 fn main() -> ExitCode {
+    tracing_subscriber::fmt::init();
+
     let mut args = std::env::args_os().peekable();
 
     let exe_path = binary_path(&mut args);
diff --git a/src/normal_diff.rs b/src/normal_diff.rs
index 002cd01..ebb7a3e 100644
--- a/src/normal_diff.rs
+++ b/src/normal_diff.rs
@@ -5,6 +5,7 @@
 
 use std::io::Write;
 
+use crate::engine::{self, Edit};
 use crate::params::Params;
 use crate::utils::do_write_line;
 
@@ -54,9 +55,9 @@ fn make_diff(expected: &[u8], actual: &[u8], stop_early: bool) -> Vec<Mismatch>
         actual_lines.pop();
     }
 
-    for result in diff::slice(&expected_lines, &actual_lines) {
+    for result in engine::diff(&expected_lines, &actual_lines) {
         match result {
-            diff::Result::Left(str) => {
+            Edit::Delete(str) => {
                 if !mismatch.actual.is_empty() && !mismatch.actual_missing_nl {
                     results.push(mismatch);
                     mismatch = Mismatch::new(line_number_expected, line_number_actual);
@@ -65,12 +66,12 @@ fn make_diff(expected: &[u8], actual: &[u8], stop_early: bool) -> Vec<Mismatch>
                 mismatch.expected_missing_nl = line_number_expected > expected_lines_count;
                 line_number_expected += 1;
             }
-            diff::Result::Right(str) => {
+            Edit::Insert(str) => {
                 mismatch.actual.push(str.to_vec());
                 mismatch.actual_missing_nl = line_number_actual > actual_lines_count;
                 line_number_actual += 1;
             }
-            diff::Result::Both(str, _) => {
+            Edit::Keep(str) => {
                 match (
                     line_number_expected > expected_lines_count,
                     line_number_actual > actual_lines_count,
diff --git a/src/unified_diff.rs b/src/unified_diff.rs
index 0f504a8..abdb5dd 100644
--- a/src/unified_diff.rs
+++ b/src/unified_diff.rs
@@ -6,6 +6,7 @@
 use std::collections::VecDeque;
 use std::io::Write;
 
+use crate::engine::{self, Edit};
 use crate::params::Params;
 use crate::utils::do_write_line;
 use crate::utils::get_modification_time;
@@ -65,9 +66,9 @@ fn make_diff(
         actual_lines.pop();
     }
 
-    for result in diff::slice(&expected_lines, &actual_lines) {
+    for result in engine::diff(&expected_lines, &actual_lines) {
         match result {
-            diff::Result::Left(str) => {
+            Edit::Delete(str) => {
                 if lines_since_mismatch >= context_size && lines_since_mismatch > 0 {
                     results.push(mismatch);
                     mismatch = Mismatch::new(
@@ -104,7 +105,7 @@ fn make_diff(
                 line_number_expected += 1;
                 lines_since_mismatch = 0;
             }
-            diff::Result::Right(str) => {
+            Edit::Insert(str) => {
                 if lines_since_mismatch >= context_size && lines_since_mismatch > 0 {
                     results.push(mismatch);
                     mismatch = Mismatch::new(
@@ -125,7 +126,7 @@ fn make_diff(
                 line_number_actual += 1;
                 lines_since_mismatch = 0;
             }
-            diff::Result::Both(str, _) => {
+            Edit::Keep(str) => {
                 // if one of them is missing a newline and the other isn't, then they don't actually match
                 if (line_number_actual > actual_lines_count)
                     && (line_number_expected > expected_lines_count)
@@ -771,6 +772,7 @@ mod tests {
                                 let _ = fb;
                                 let output = Command::new("patch")
                                     .arg("-p0")
+                                    .arg(format!("{target}/alefx"))
                                     .stdin(File::open(format!("{target}/abx.diff")).unwrap())
                                     .output()
                                     .unwrap();
diff --git a/tests/integration.rs b/tests/integration.rs
index c11726e..40b497c 100644
--- a/tests/integration.rs
+++ b/tests/integration.rs
@@ -95,6 +95,8 @@ mod common {
 }
 
 mod diff {
+    use std::process::Stdio;
+
     use diffutilslib::assert_diff_eq;
 
     use super::*;
@@ -341,6 +343,97 @@ mod diff {
 
         Ok(())
     }
+
+    /// Formats each non-empty line of `bar` as `> <line>\n`; panics on non-UTF-8.
+    fn str_bar_diff(bar: &[u8]) -> String {
+        let mut out: Vec<u8> = Vec::new();
+        for line in bar.split(|c| *c == b'\n').filter(|l| !l.is_empty()) {
+            out.extend_from_slice(b"> ");
+            out.extend(line.iter().copied().chain([b'\n']));
+        }
+        String::from_utf8(out).unwrap()
+    }
+
+    #[test]
+    fn large_similar_files() -> Result<(), Box<dyn std::error::Error>> {
+        // Inputs that differ only by a small block must still get the ideal
+        // diff; their sheer size must not trigger the total cost heuristic.
+        let filler = b"f\n".repeat(16 * 1024 * 1024);
+        let block = b"b\n".repeat(26);
+        let inserted = format!("16777216a16777217,16777242\n{}", str_bar_diff(&block));
+        let removed = format!("1,26d0\n{}", str_bar_diff(&block).replace(">", "<"));
+
+        // First pair: the block sits between the two filler halves, and only
+        // in the second file.
+        let mut old = NamedTempFile::new()?;
+        old.write_all(&filler)?;
+        old.write_all(&filler)?;
+        let mut new = NamedTempFile::new()?;
+        for part in [&filler, &block, &filler] {
+            new.write_all(part)?;
+        }
+
+        let mut cmd = Command::cargo_bin("diffutils")?;
+        cmd.arg("diff").arg(old.path()).arg(new.path());
+        cmd.assert()
+            .code(predicate::eq(1))
+            .failure()
+            .stdout(predicate::eq(inserted));
+
+        // Second pair: the block leads the first file and is absent from the
+        // second one.
+        let mut old = NamedTempFile::new()?;
+        for part in [&block, &filler, &filler] {
+            old.write_all(part)?;
+        }
+        let mut new = NamedTempFile::new()?;
+        new.write_all(&filler)?;
+        new.write_all(&filler)?;
+
+        let mut cmd = Command::cargo_bin("diffutils")?;
+        cmd.arg("diff").arg(old.path()).arg(new.path());
+        cmd.assert()
+            .code(predicate::eq(1))
+            .failure()
+            .stdout(predicate::eq(removed));
+
+        Ok(())
+    }
+
+    #[test]
+    fn large_different_files() -> Result<(), Box<dyn std::error::Error>> {
+        let foo = b"f\n".repeat(4 * 1024 * 1024);
+        let bar = b"b\n".repeat(26);
+        let baz = b"z\n".repeat(4 * 1024 * 1024);
+
+        let mut file1 = NamedTempFile::new()?;
+        file1.write_all(&foo)?;
+        file1.write_all(&bar)?;
+        let mut file2 = NamedTempFile::new()?;
+        file2.write_all(&baz)?;
+        file2.write_all(&bar)?;
+
+        let mut child = std::process::Command::new(assert_cmd::cargo::cargo_bin("diffutils"))
+            .arg("diff")
+            .arg(file1.path())
+            .arg(file2.path())
+            .stdout(Stdio::null())
+            .spawn()?;
+
+        // The total cost heuristic should give up trying to find good split points
+        // in a reasonable amount of time (can still be fairly long in debug builds).
+        for _ in 0..=10 {
+            std::thread::sleep(std::time::Duration::from_secs(1));
+            if let Some(status) = child.try_wait()? {
+                assert_eq!(status.code(), Some(1));
+                return Ok(());
+            }
+        }
+
+        // Timed out: kill and reap the child so a failing run leaves no stray process.
+        child.kill().and_then(|()| child.wait())?;
+        panic!("diffutils was still running after 11 one-second polls")
+    }
 }
 
 mod cmp {