Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Configure splitting on graphemes #314

Merged
merged 1 commit into from
Mar 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions assets/sample_config.json5
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,7 @@
// ${fallback_cmd} ${old} ${new}
// ```
"fallback-cmd": "diff",
"input-processing": {
"split-graphemes": true,
}
}
5 changes: 5 additions & 0 deletions src/config.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! Utilities and definitions for config handling

use crate::input_processing::TreeSitterProcessor;
use crate::{formatting::DiffWriter, parse::GrammarConfig};
use anyhow::{Context, Result};
use json5 as json;
Expand Down Expand Up @@ -35,6 +36,10 @@ pub struct Config {
/// Options for loading
pub grammar: GrammarConfig,

/// Options for processing tree-sitter input.
#[serde(default)]
pub input_processing: TreeSitterProcessor,

/// The program to invoke if the given files can not be parsed by the available tree-sitter
/// parsers.
///
Expand Down
31 changes: 30 additions & 1 deletion src/diff.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
//! Structs and other convenience methods for handling logical concepts pertaining to diffs, such
//! as hunks.

use crate::ast::{EditType, Entry};
use crate::input_processing::{EditType, Entry};
use crate::neg_idx_vec::NegIdxVec;
use anyhow::Result;
use logging_timer::time;
use std::collections::VecDeque;
use std::fmt::Debug;
use std::iter::FromIterator;
Expand Down Expand Up @@ -691,6 +692,34 @@ impl Myers {
}
}

/// Compute the hunks corresponding to the minimum edit path between two documents.
///
/// Runs the Myers diff engine over two pre-processed AST vectors and splits the resulting
/// edit script into the edits that apply to each document.
///
/// This will return two groups of [hunks](Hunks) in a tuple of the form
/// `(old_hunks, new_hunks)`.
#[time("info", "diff::{}")]
pub fn compute_edit_script<'a>(old: &[Entry<'a>], new: &[Entry<'a>]) -> (Hunks<'a>, Hunks<'a>) {
    let edit_script = Myers::default().diff(old, new);
    let capacity = edit_script.len();

    // Partition the edit script: deletions belong to the old document, additions to the new.
    let mut old_edits = Vec::with_capacity(capacity);
    let mut new_edits = Vec::with_capacity(capacity);

    for edit in edit_script {
        match edit {
            EditType::Deletion(&entry) => old_edits.push(entry),
            EditType::Addition(&entry) => new_edits.push(entry),
        }
    }

    // Consolidate the flat edit lists into displayable hunks.
    (
        old_edits.into_iter().collect(),
        new_edits.into_iter().collect(),
    )
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
104 changes: 65 additions & 39 deletions src/ast.rs → src/input_processing.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,64 @@
//! Utilities for processing the ASTs provided by `tree_sitter`

use crate::diff::{Engine, Hunks, Myers};
use logging_timer::time;
use serde::{Deserialize, Serialize};
use std::hash::{Hash, Hasher};
use std::{cell::RefCell, ops::Index, path::PathBuf};
use tree_sitter::Node as TSNode;
use tree_sitter::Point;
use tree_sitter::Tree as TSTree;
use unicode_segmentation as us;

/// The configuration options for processing tree-sitter output.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "kebab-case")]
pub struct TreeSitterProcessor {
    /// Whether nodes should be split into their constituent graphemes.
    ///
    /// If this is disabled, then the direct tree-sitter nodes will be used and diffs will be less
    /// granular. This has the advantage of being faster and using less memory.
    pub split_graphemes: bool,
}

impl Default for TreeSitterProcessor {
    // Grapheme splitting defaults to on, preserving the tool's original (more granular)
    // diffing behavior for users who don't set `split-graphemes` in their config.
    fn default() -> Self {
        Self {
            split_graphemes: true,
        }
    }
}

impl TreeSitterProcessor {
    /// Process a tree-sitter tree into the flat vector of [Entry] nodes used for diffing.
    ///
    /// The tree is flattened to its leaves; if `split_graphemes` is enabled, each leaf is
    /// further split into one entry per grapheme, producing more granular diffs.
    // NOTE(review): the timer label still says `ast::` from before this module was renamed to
    // `input_processing` — confirm whether the label should be updated to match.
    #[time("info", "ast::{}")]
    pub fn process<'a>(&self, tree: &'a TSTree, text: &'a str) -> Vec<Entry<'a>> {
        let ast_vector = from_ts_tree(tree, text);
        // Return the expression directly instead of binding it to a temporary
        // (clippy::let_and_return).
        if self.split_graphemes {
            ast_vector
                .leaves
                .iter()
                .flat_map(|leaf| leaf.split_on_graphemes())
                .collect()
        } else {
            // No splitting requested: convert each leaf directly into an entry.
            ast_vector.leaves.iter().copied().map(Entry::from).collect()
        }
    }
}

/// Create a [Vector] from a `tree_sitter` tree.
///
/// This method calls a helper function (`build`) that does an in-order traversal of the tree
/// and adds leaf nodes to a vector; the resulting flat list is paired with the source text it
/// references.
// NOTE(review): timer label predates the ast -> input_processing module rename — confirm.
#[time("info", "ast::{}")]
fn from_ts_tree<'a>(tree: &'a TSTree, text: &'a str) -> Vector<'a> {
    // Leaves are accumulated through a RefCell, presumably because `build` traverses the tree
    // with a shared reference to the accumulator — TODO confirm against `build`'s signature.
    let leaves = RefCell::new(Vec::new());
    build(&leaves, tree.root_node(), text);
    Vector {
        leaves: leaves.into_inner(),
        source_text: text,
    }
}

/// The leaves of an AST vector
///
/// This is used as an intermediate struct for flattening the tree structure.
Expand Down Expand Up @@ -50,7 +100,7 @@ impl<'a> VectorLeaf<'a> {
///
/// Each grapheme will get its own [Entry] struct. This method will resolve the
/// indices/positioning of each grapheme from the `self.text` field.
fn split_graphemes(self) -> Vec<Entry<'a>> {
fn split_on_graphemes(self) -> Vec<Entry<'a>> {
let mut entries = Vec::new();
let indices: Vec<(usize, &str)> =
us::UnicodeSegmentation::grapheme_indices(self.text, true).collect();
Expand Down Expand Up @@ -98,6 +148,18 @@ impl<'a> VectorLeaf<'a> {
}
}

impl<'a> From<VectorLeaf<'a>> for Entry<'a> {
    /// Convert a raw leaf into an [Entry] without any grapheme splitting.
    fn from(leaf: VectorLeaf<'a>) -> Self {
        Self {
            reference: leaf.reference,
            text: leaf.text,
            start_position: leaf.reference.start_position(),
            // BUG FIX: this previously copied `start_position()` into `end_position`,
            // collapsing every entry to a zero-width span in the diff output.
            end_position: leaf.reference.end_position(),
            kind_id: leaf.reference.kind_id(),
        }
    }
}

impl<'a> Entry<'a> {
/// Get the start position of an entry
pub fn start_position(&self) -> Point {
Expand All @@ -116,7 +178,7 @@ impl<'a> From<&'a Vector<'a>> for Vec<Entry<'a>> {
entries.reserve(ast_vector.leaves.len());

for entry in &ast_vector.leaves {
entries.extend(entry.split_graphemes().iter());
entries.extend(entry.split_on_graphemes().iter());
}
entries
}
Expand Down Expand Up @@ -252,39 +314,3 @@ pub enum EditType<T> {
/// An element that was deleted in the edit script
Deletion(T),
}

/// Compute the hunks corresponding to the minimum edit path between two documents.
///
/// This method computes an edit script between two `Vector`s — the flattened leaf nodes of
/// each AST — so that instead of operating on raw text, the diff operates on AST leaves.
///
// NOTE(review): earlier docs claimed an O(mn) DP/LCS approach, but the code dispatches to
// `Myers::diff`; the actual complexity depends on that engine — confirm before stating bounds.
/// This will return two groups of [hunks](diff::Hunks) in a tuple of the form
/// `(old_hunks, new_hunks)`.
#[time("info", "ast::{}")]
pub fn compute_edit_script<'a>(a: &'a Vector, b: &'a Vector) -> (Hunks<'a>, Hunks<'a>) {
    let myers = Myers::default();
    // `Vec<Entry>: From<&Vector>` splits each document's leaves into grapheme-level entries.
    let a_graphemes: Vec<Entry> = a.into();
    let b_graphemes: Vec<Entry> = b.into();
    let edit_script = myers.diff(&a_graphemes[..], &b_graphemes[..]);
    let edit_script_len = edit_script.len();

    let mut old_edits = Vec::with_capacity(edit_script_len);
    let mut new_edits = Vec::with_capacity(edit_script_len);

    // Partition the edit script: deletions belong to the old document, additions to the new.
    for edit in edit_script {
        match edit {
            EditType::Deletion(&e) => old_edits.push(e),
            EditType::Addition(&e) => new_edits.push(e),
        };
    }

    // Convert the vectors of edits into hunks that can be displayed
    let old_hunks = old_edits.into_iter().collect();
    let new_hunks = new_edits.into_iter().collect();
    (old_hunks, new_hunks)
}
51 changes: 24 additions & 27 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
mod ast;
mod cli;
mod config;
mod diff;
mod formatting;
mod input_processing;
mod neg_idx_vec;
mod parse;

use crate::parse::supported_languages;
use anyhow::Result;
use ast::{Vector, VectorData};
use clap::IntoApp;
use clap::Parser;
use cli::{Args, ColorOutputPolicy};
use config::{Config, ReadError};
use console::Term;
use formatting::{DisplayParameters, DocumentDiffData};
use input_processing::VectorData;
use log::{debug, error, info, warn, LevelFilter};
use parse::{generate_language, language_from_ext, GrammarConfig};
use serde_json as json;
Expand Down Expand Up @@ -96,19 +96,6 @@ fn generate_ast_vector_data(
Ok(VectorData { text, tree, path })
}

/// Generate an AST vector from the underlying data.
///
/// Flattens the parsed tree into the list of AST nodes that correspond to graphemes, logging
/// how many nodes were produced and for which file.
fn generate_ast_vector(data: &VectorData) -> Vector<'_> {
    let vector = Vector::from_ts_tree(&data.tree, &data.text);
    info!(
        "Constructed a diff vector with {} nodes for {}",
        vector.len(),
        data.path.to_string_lossy(),
    );
    vector
}

/// Check if the input files are supported by this program.
///
/// If the user provides a language override, this will check that the language is supported by the
Expand Down Expand Up @@ -165,10 +152,15 @@ fn run_diff(args: &Args, config: &Config) -> Result<()> {
let ast_data_a = generate_ast_vector_data(path_a.clone(), file_type, &config.grammar)?;
let ast_data_b = generate_ast_vector_data(path_b.clone(), file_type, &config.grammar)?;

let diff_vec_a = generate_ast_vector(&ast_data_a);
let diff_vec_b = generate_ast_vector(&ast_data_b);
let diff_vec_a = config
.input_processing
.process(&ast_data_a.tree, &ast_data_a.text);

let (old_hunks, new_hunks) = ast::compute_edit_script(&diff_vec_a, &diff_vec_b);
let diff_vec_b = config
.input_processing
.process(&ast_data_b.tree, &ast_data_b.text);

let (old_hunks, new_hunks) = diff::compute_edit_script(&diff_vec_a, &diff_vec_b);
let params = DisplayParameters {
old: DocumentDiffData {
filename: &ast_data_a.path.to_string_lossy(),
Expand Down Expand Up @@ -330,23 +322,28 @@ mod tests {
(path_a, path_b)
}

#[test_case("short", "rust", "rs")]
#[test_case("short", "python", "py")]
#[test_case("medium", "rust", "rs")]
#[test_case("medium", "cpp", "cpp")]
fn diff_hunks_snapshot(test_type: &str, name: &str, ext: &str) {
#[test_case("short", "rust", "rs", true)]
#[test_case("short", "python", "py", true)]
#[test_case("medium", "rust", "rs", true)]
#[test_case("medium", "rust", "rs", false)]
#[test_case("medium", "cpp", "cpp", true)]
#[test_case("medium", "cpp", "cpp", false)]
fn diff_hunks_snapshot(test_type: &str, name: &str, ext: &str, split_graphemes: bool) {
let (path_a, path_b) = get_test_paths(test_type, name, ext);
let config = GrammarConfig::default();
let ast_data_a = generate_ast_vector_data(path_a, None, &config).unwrap();
let ast_data_b = generate_ast_vector_data(path_b, None, &config).unwrap();
let diff_vec_a = generate_ast_vector(&ast_data_a);
let diff_vec_b = generate_ast_vector(&ast_data_b);
let diff_hunks = ast::compute_edit_script(&diff_vec_a, &diff_vec_b);

let processor = input_processing::TreeSitterProcessor { split_graphemes };

let diff_vec_a = processor.process(&ast_data_a.tree, &ast_data_a.text);
let diff_vec_b = processor.process(&ast_data_b.tree, &ast_data_b.text);
let diff_hunks = diff::compute_edit_script(&diff_vec_a, &diff_vec_b);

// We have to set the snapshot name manually, otherwise there appear to be threading issues
// and we end up with more snapshot files than there are tests, which cause
// nondeterministic errors.
let snapshot_name = format!("{test_type}_{name}");
let snapshot_name = format!("{test_type}_{name}_{split_graphemes}");
assert_debug_snapshot!(snapshot_name, diff_hunks);
}
}
Loading