Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Configure splitting on graphemes #314

Merged
merged 1 commit into from
Mar 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions assets/sample_config.json5
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,7 @@
// ${fallback_cmd} ${old} ${new}
// ```
"fallback-cmd": "diff",
"input-processing": {
"split-graphemes": true,
}
}
5 changes: 5 additions & 0 deletions src/config.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! Utilities and definitions for config handling

use crate::input_processing::TreeSitterProcessor;
use crate::{formatting::DiffWriter, parse::GrammarConfig};
use anyhow::{Context, Result};
use json5 as json;
Expand Down Expand Up @@ -35,6 +36,10 @@ pub struct Config {
/// Options for loading
pub grammar: GrammarConfig,

/// Options for processing tree-sitter input.
#[serde(default)]
pub input_processing: TreeSitterProcessor,

/// The program to invoke if the given files can not be parsed by the available tree-sitter
/// parsers.
///
Expand Down
31 changes: 30 additions & 1 deletion src/diff.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
//! Structs and other convenience methods for handling logical concepts pertaining to diffs, such
//! as hunks.

use crate::ast::{EditType, Entry};
use crate::input_processing::{EditType, Entry};
use crate::neg_idx_vec::NegIdxVec;
use anyhow::Result;
use logging_timer::time;
use std::collections::VecDeque;
use std::fmt::Debug;
use std::iter::FromIterator;
Expand Down Expand Up @@ -691,6 +692,34 @@ impl Myers {
}
}

/// Compute the hunks corresponding to the minimum edit path between two documents.
///
/// Runs the Myers diff engine over two pre-processed AST vectors and splits the resulting
/// edit script into the edits that apply to each document.
///
/// This will return two groups of [hunks](Hunks) in a tuple of the form
/// `(old_hunks, new_hunks)`.
#[time("info", "diff::{}")]
pub fn compute_edit_script<'a>(old: &[Entry<'a>], new: &[Entry<'a>]) -> (Hunks<'a>, Hunks<'a>) {
    let edit_script = Myers::default().diff(old, new);
    let capacity = edit_script.len();

    // Partition the edit script: deletions belong to the old document, additions to the new.
    let mut old_edits = Vec::with_capacity(capacity);
    let mut new_edits = Vec::with_capacity(capacity);

    for edit in edit_script {
        match edit {
            EditType::Deletion(&entry) => old_edits.push(entry),
            EditType::Addition(&entry) => new_edits.push(entry),
        }
    }

    // Consolidate the flat edit lists into displayable hunks.
    (
        old_edits.into_iter().collect(),
        new_edits.into_iter().collect(),
    )
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
104 changes: 65 additions & 39 deletions src/ast.rs → src/input_processing.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,64 @@
//! Utilities for processing the ASTs provided by `tree_sitter`

use crate::diff::{Engine, Hunks, Myers};
use logging_timer::time;
use serde::{Deserialize, Serialize};
use std::hash::{Hash, Hasher};
use std::{cell::RefCell, ops::Index, path::PathBuf};
use tree_sitter::Node as TSNode;
use tree_sitter::Point;
use tree_sitter::Tree as TSTree;
use unicode_segmentation as us;

/// The configuration options for processing tree-sitter output.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "kebab-case")]
pub struct TreeSitterProcessor {
    /// Whether nodes should be split into their constituent graphemes.
    ///
    /// If this is disabled, then the direct tree-sitter nodes will be used and diffs will be less
    /// granular. This has the advantage of being faster and using less memory.
    pub split_graphemes: bool,
}

impl Default for TreeSitterProcessor {
    // Grapheme splitting defaults to on, preserving the tool's original (more granular)
    // diffing behavior for users who don't set `split-graphemes` in their config.
    fn default() -> Self {
        Self {
            split_graphemes: true,
        }
    }
}

impl TreeSitterProcessor {
    /// Process a tree-sitter tree into the flat vector of [Entry] nodes used for diffing.
    ///
    /// The tree is flattened to its leaves; if `split_graphemes` is enabled, each leaf is
    /// further split into one entry per grapheme, producing more granular diffs.
    // NOTE(review): the timer label still says `ast::` from before this module was renamed to
    // `input_processing` — confirm whether the label should be updated to match.
    #[time("info", "ast::{}")]
    pub fn process<'a>(&self, tree: &'a TSTree, text: &'a str) -> Vec<Entry<'a>> {
        let ast_vector = from_ts_tree(tree, text);
        // Return the expression directly instead of binding it to a temporary
        // (clippy::let_and_return).
        if self.split_graphemes {
            ast_vector
                .leaves
                .iter()
                .flat_map(|leaf| leaf.split_on_graphemes())
                .collect()
        } else {
            // No splitting requested: convert each leaf directly into an entry.
            ast_vector.leaves.iter().copied().map(Entry::from).collect()
        }
    }
}

/// Create a [Vector] from a `tree_sitter` tree.
///
/// This method calls a helper function (`build`) that does an in-order traversal of the tree
/// and adds leaf nodes to a vector; the resulting flat list is paired with the source text it
/// references.
// NOTE(review): timer label predates the ast -> input_processing module rename — confirm.
#[time("info", "ast::{}")]
fn from_ts_tree<'a>(tree: &'a TSTree, text: &'a str) -> Vector<'a> {
    // Leaves are accumulated through a RefCell, presumably because `build` traverses the tree
    // with a shared reference to the accumulator — TODO confirm against `build`'s signature.
    let leaves = RefCell::new(Vec::new());
    build(&leaves, tree.root_node(), text);
    Vector {
        leaves: leaves.into_inner(),
        source_text: text,
    }
}

/// The leaves of an AST vector
///
/// This is used as an intermediate struct for flattening the tree structure.
Expand Down Expand Up @@ -50,7 +100,7 @@ impl<'a> VectorLeaf<'a> {
///
/// Each grapheme will get its own [Entry] struct. This method will resolve the
/// indices/positioning of each grapheme from the `self.text` field.
fn split_graphemes(self) -> Vec<Entry<'a>> {
fn split_on_graphemes(self) -> Vec<Entry<'a>> {
let mut entries = Vec::new();
let indices: Vec<(usize, &str)> =
us::UnicodeSegmentation::grapheme_indices(self.text, true).collect();
Expand Down Expand Up @@ -98,6 +148,18 @@ impl<'a> VectorLeaf<'a> {
}
}

impl<'a> From<VectorLeaf<'a>> for Entry<'a> {
    /// Convert a raw leaf into an [Entry] without any grapheme splitting.
    fn from(leaf: VectorLeaf<'a>) -> Self {
        Self {
            reference: leaf.reference,
            text: leaf.text,
            start_position: leaf.reference.start_position(),
            // BUG FIX: this previously copied `start_position()` into `end_position`,
            // collapsing every entry to a zero-width span in the diff output.
            end_position: leaf.reference.end_position(),
            kind_id: leaf.reference.kind_id(),
        }
    }
}

impl<'a> Entry<'a> {
/// Get the start position of an entry
pub fn start_position(&self) -> Point {
Expand All @@ -116,7 +178,7 @@ impl<'a> From<&'a Vector<'a>> for Vec<Entry<'a>> {
entries.reserve(ast_vector.leaves.len());

for entry in &ast_vector.leaves {
entries.extend(entry.split_graphemes().iter());
entries.extend(entry.split_on_graphemes().iter());
}
entries
}
Expand Down Expand Up @@ -252,39 +314,3 @@ pub enum EditType<T> {
/// An element that was deleted in the edit script
Deletion(T),
}

/// Compute the hunks corresponding to the minimum edit path between two documents.
///
/// This method computes an edit script between two `Vector`s — the flattened leaf nodes of
/// each AST — so that instead of operating on raw text, the diff operates on AST leaves.
///
// NOTE(review): earlier docs claimed an O(mn) DP/LCS approach, but the code dispatches to
// `Myers::diff`; the actual complexity depends on that engine — confirm before stating bounds.
/// This will return two groups of [hunks](diff::Hunks) in a tuple of the form
/// `(old_hunks, new_hunks)`.
#[time("info", "ast::{}")]
pub fn compute_edit_script<'a>(a: &'a Vector, b: &'a Vector) -> (Hunks<'a>, Hunks<'a>) {
    let myers = Myers::default();
    // `Vec<Entry>: From<&Vector>` splits each document's leaves into grapheme-level entries.
    let a_graphemes: Vec<Entry> = a.into();
    let b_graphemes: Vec<Entry> = b.into();
    let edit_script = myers.diff(&a_graphemes[..], &b_graphemes[..]);
    let edit_script_len = edit_script.len();

    let mut old_edits = Vec::with_capacity(edit_script_len);
    let mut new_edits = Vec::with_capacity(edit_script_len);

    // Partition the edit script: deletions belong to the old document, additions to the new.
    for edit in edit_script {
        match edit {
            EditType::Deletion(&e) => old_edits.push(e),
            EditType::Addition(&e) => new_edits.push(e),
        };
    }

    // Convert the vectors of edits into hunks that can be displayed
    let old_hunks = old_edits.into_iter().collect();
    let new_hunks = new_edits.into_iter().collect();
    (old_hunks, new_hunks)
}
51 changes: 24 additions & 27 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
mod ast;
mod cli;
mod config;
mod diff;
mod formatting;
mod input_processing;
mod neg_idx_vec;
mod parse;

use crate::parse::supported_languages;
use anyhow::Result;
use ast::{Vector, VectorData};
use clap::IntoApp;
use clap::Parser;
use cli::{Args, ColorOutputPolicy};
use config::{Config, ReadError};
use console::Term;
use formatting::{DisplayParameters, DocumentDiffData};
use input_processing::VectorData;
use log::{debug, error, info, warn, LevelFilter};
use parse::{generate_language, language_from_ext, GrammarConfig};
use serde_json as json;
Expand Down Expand Up @@ -96,19 +96,6 @@ fn generate_ast_vector_data(
Ok(VectorData { text, tree, path })
}

/// Generate an AST vector from the underlying data.
///
/// Flattens the parsed tree into the list of AST nodes that correspond to graphemes, logging
/// how many nodes were produced and for which file.
fn generate_ast_vector(data: &VectorData) -> Vector<'_> {
    let vector = Vector::from_ts_tree(&data.tree, &data.text);
    info!(
        "Constructed a diff vector with {} nodes for {}",
        vector.len(),
        data.path.to_string_lossy(),
    );
    vector
}

/// Check if the input files are supported by this program.
///
/// If the user provides a language override, this will check that the language is supported by the
Expand Down Expand Up @@ -165,10 +152,15 @@ fn run_diff(args: &Args, config: &Config) -> Result<()> {
let ast_data_a = generate_ast_vector_data(path_a.clone(), file_type, &config.grammar)?;
let ast_data_b = generate_ast_vector_data(path_b.clone(), file_type, &config.grammar)?;

let diff_vec_a = generate_ast_vector(&ast_data_a);
let diff_vec_b = generate_ast_vector(&ast_data_b);
let diff_vec_a = config
.input_processing
.process(&ast_data_a.tree, &ast_data_a.text);

let (old_hunks, new_hunks) = ast::compute_edit_script(&diff_vec_a, &diff_vec_b);
let diff_vec_b = config
.input_processing
.process(&ast_data_b.tree, &ast_data_b.text);

let (old_hunks, new_hunks) = diff::compute_edit_script(&diff_vec_a, &diff_vec_b);
let params = DisplayParameters {
old: DocumentDiffData {
filename: &ast_data_a.path.to_string_lossy(),
Expand Down Expand Up @@ -330,23 +322,28 @@ mod tests {
(path_a, path_b)
}

#[test_case("short", "rust", "rs")]
#[test_case("short", "python", "py")]
#[test_case("medium", "rust", "rs")]
#[test_case("medium", "cpp", "cpp")]
fn diff_hunks_snapshot(test_type: &str, name: &str, ext: &str) {
#[test_case("short", "rust", "rs", true)]
#[test_case("short", "python", "py", true)]
#[test_case("medium", "rust", "rs", true)]
#[test_case("medium", "rust", "rs", false)]
#[test_case("medium", "cpp", "cpp", true)]
#[test_case("medium", "cpp", "cpp", false)]
fn diff_hunks_snapshot(test_type: &str, name: &str, ext: &str, split_graphemes: bool) {
let (path_a, path_b) = get_test_paths(test_type, name, ext);
let config = GrammarConfig::default();
let ast_data_a = generate_ast_vector_data(path_a, None, &config).unwrap();
let ast_data_b = generate_ast_vector_data(path_b, None, &config).unwrap();
let diff_vec_a = generate_ast_vector(&ast_data_a);
let diff_vec_b = generate_ast_vector(&ast_data_b);
let diff_hunks = ast::compute_edit_script(&diff_vec_a, &diff_vec_b);

let processor = input_processing::TreeSitterProcessor { split_graphemes };

let diff_vec_a = processor.process(&ast_data_a.tree, &ast_data_a.text);
let diff_vec_b = processor.process(&ast_data_b.tree, &ast_data_b.text);
let diff_hunks = diff::compute_edit_script(&diff_vec_a, &diff_vec_b);

// We have to set the snapshot name manually, otherwise there appear to be threading issues
// and we end up with more snapshot files than there are tests, which cause
// nondeterministic errors.
let snapshot_name = format!("{test_type}_{name}");
let snapshot_name = format!("{test_type}_{name}_{split_graphemes}");
assert_debug_snapshot!(snapshot_name, diff_hunks);
}
}
Loading