diff --git a/gix-blame/src/error.rs b/gix-blame/src/error.rs index 979cc8cd6ef..4d50668837d 100644 --- a/gix-blame/src/error.rs +++ b/gix-blame/src/error.rs @@ -17,6 +17,8 @@ pub enum Error { /// The commit whose tree didn't contain `file_path`. commit_id: gix_hash::ObjectId, }, + #[error("No `BlameState` found, so no state to start blaming from")] + BlameStateNotSet, #[error("Couldn't find commit or tree in the object database")] FindObject(#[from] gix_object::find::Error), #[error("Could not find existing blob or commit")] diff --git a/gix-blame/src/file/checkpoint.rs b/gix-blame/src/file/checkpoint.rs new file mode 100644 index 00000000000..2a55434d432 --- /dev/null +++ b/gix-blame/src/file/checkpoint.rs @@ -0,0 +1,228 @@ +use crate::types::{Change, UnblamedHunk}; +use crate::{BlameCheckpoint, BlameEntry}; +use gix_hash::ObjectId; +use std::num::NonZeroU32; +use std::ops::Range; + +/// Updates blame entries from a checkpoint with newly detected changes. +/// +/// # Arguments +/// * `checkpoint` - Previous checkpoint with blame entries +/// * `changes` - Changes since the checkpoint +/// * `suspect` - Commit ID being investigated +/// +/// # Returns +/// Updated blame entries and new hunks to blame +pub(crate) fn update_checkpoint_blames_with_changes( + checkpoint: BlameCheckpoint, + changes: Vec, + suspect: ObjectId, +) -> (Vec, Vec) { + fn blame_fully_contained_by_change( + blame_lines: &BlameLines, + blame: &BlameEntry, + change_lines: &ChangeLines, + change: &Change, + ) -> bool { + blame_lines.get_remaining(blame) < change_lines.get_remaining(change) + } + + let mut updated_blames = Vec::new(); + let mut new_hunks_to_blame = Vec::new(); + + let mut blame_iter = checkpoint.entries.into_iter().peekable(); + + // This nested loop iterates through changes and blame entries in parallel, tracking how many + // lines have been processed in each. For each change type: + // - Unchanged: Keep the original blame but adjust line numbers + // - Deleted: Remove blame entries for deleted lines + // - Added/Replaced: Create new hunks to be blamed later + // The tracking ensures we correctly handle partial overlaps between changes and blame entries. + 'change: for change in changes { + let mut change_assigned = ChangeLines::default(); + while let Some(blame) = blame_iter.peek_mut() { + let mut blame_assigned = BlameLines::default(); + + // For each of the three cases we have to check if the blame is fully contained by the change. + // If so we can update the blame with the remaining length of the blame. + // If not we have to update the blame with the remaining length of the change. + match change { + Change::Unchanged(ref range) => { + match blame_fully_contained_by_change(&blame_assigned, blame, &change_assigned, &change) { + true => { + updated_blames.push(BlameEntry { + start_in_blamed_file: range.start + change_assigned.assigned.get_assigned(), + start_in_source_file: blame.start_in_source_file, + len: blame.len, + commit_id: blame.commit_id, + }); + + change_assigned.assigned.add_assigned(blame.len.get()); + blame_assigned.assigned.add_assigned(blame.len.get()); + } + false => { + updated_blames.push(BlameEntry { + start_in_blamed_file: range.start + change_assigned.assigned.get_assigned(), + start_in_source_file: blame.start_in_source_file, + len: NonZeroU32::new(change_assigned.get_remaining(&change)).unwrap(), + commit_id: blame.commit_id, + }); + + blame_assigned + .assigned + .add_assigned(change_assigned.get_remaining(&change)); + change_assigned + .assigned + .add_assigned(change_assigned.get_remaining(&change)); + } + } + } + Change::Deleted(_start_deletion, _lines_deleted) => { + match blame_fully_contained_by_change(&blame_assigned, blame, &change_assigned, &change) { + true => { + blame_assigned.assigned.add_assigned(blame.len.get()); + change_assigned.assigned.add_assigned(blame.len.get()); + } + false => { + blame_assigned + .assigned + .add_assigned(change_assigned.get_remaining(&change)); + change_assigned + .assigned + .add_assigned(change_assigned.get_remaining(&change)); + } + } + } + Change::AddedOrReplaced(ref range, lines_deleted) => { + let new_unblamed_hunk = |range: &Range, suspect: ObjectId| UnblamedHunk { + range_in_blamed_file: range.clone(), + suspects: [(suspect, range.clone())].into(), + }; + match blame_fully_contained_by_change(&blame_assigned, blame, &change_assigned, &change) { + true => { + if lines_deleted == 0 { + new_hunks_to_blame.push(new_unblamed_hunk(range, suspect)); + } + + change_assigned.assigned.add_assigned(blame.len.get()); + blame_assigned.assigned.add_assigned(blame.len.get()); + } + false => { + new_hunks_to_blame.push(new_unblamed_hunk(range, suspect)); + + blame_assigned + .assigned + .add_assigned(change_assigned.get_remaining(&change)); + change_assigned + .assigned + .add_assigned(change_assigned.get_remaining(&change)); + } + } + } + } + + // Check if the blame or the change is fully assigned. + // If the blame is fully assigned we can continue with the next blame. + // If the change is fully assigned we can continue with the next change. + // Since we have a mutable reference to the blame we can update it and reset the assigned blame lines. + // If both are fully assigned we can continue with the next blame and change. + match ( + blame_assigned.has_remaining(blame), + change_assigned.has_remaining(&change), + ) { + (true, true) => { + // Both have remaining + blame.update_blame(&blame_assigned.assigned); + } + (true, false) => { + // Change is fully assigned + blame.update_blame(&blame_assigned.assigned); + continue 'change; + } + (false, true) => { + // Blame is fully assigned + blame_iter.next(); + } + (false, false) => { + // Both are fully assigned + blame_iter.next(); + continue 'change; + } + } + } + } + (updated_blames, new_hunks_to_blame) +} + +impl BlameEntry { + /// Updates the blame entry's line numbers by the given offset. + /// + /// This is used when processing changes to adjust the line numbers in the blame entry + /// to account for lines that have already been processed. It updates: + /// * The starting line in the blamed file + /// * The starting line in the source file + /// * The length of the blame entry + pub(crate) fn update_blame(&mut self, offset: &LinesAssigned) { + self.start_in_blamed_file += offset.get_assigned(); + self.start_in_source_file += offset.get_assigned(); + self.len = NonZeroU32::new(u32::from(self.len) - offset.get_assigned()).unwrap(); + } +} + +/// Tracks the number of lines processed during blame updates +#[derive(Debug, Default)] +pub(crate) struct LinesAssigned { + lines_assigned: u32, +} + +impl LinesAssigned { + /// Add lines to the count + fn add_assigned(&mut self, lines: u32) { + self.lines_assigned += lines; + } + + /// Get current count + fn get_assigned(&self) -> u32 { + self.lines_assigned + } +} + +/// Tracks line assignments for blame entries +#[derive(Debug, Default)] +struct BlameLines { + assigned: LinesAssigned, +} + +impl BlameLines { + /// Calculate remaining lines in a blame entry + fn get_remaining(&self, blame: &BlameEntry) -> u32 { + blame.len.get() - self.assigned.get_assigned() + } + + /// Check if any lines remain + fn has_remaining(&self, blame: &BlameEntry) -> bool { + self.get_remaining(blame) > 0 + } +} + +/// Tracks line assignments for changes +#[derive(Debug, Default)] +struct ChangeLines { + assigned: LinesAssigned, +} + +impl ChangeLines { + /// Calculate remaining lines in a change + fn get_remaining(&self, change: &Change) -> u32 { + match &change { + Change::Unchanged(range) => range.len() as u32 - self.assigned.get_assigned(), + Change::AddedOrReplaced(_, deleted_in_before) => *deleted_in_before - self.assigned.get_assigned(), + Change::Deleted(_, deleted_in_before) => *deleted_in_before - self.assigned.get_assigned(), + } + } + + /// Check if any lines remain + fn has_remaining(&self, change: &Change) -> bool { + self.get_remaining(change) > 0 + } +} diff --git a/gix-blame/src/file/function.rs b/gix-blame/src/file/function.rs index 05293053231..6e533a0877a 100644 --- a/gix-blame/src/file/function.rs +++ b/gix-blame/src/file/function.rs @@ -10,12 +10,13 @@ use gix_traverse::commit::find as find_commit; use smallvec::SmallVec; use super::{process_changes, Change, UnblamedHunk}; -use crate::{BlameEntry, Error, Options, Outcome, Statistics}; +use crate::file::checkpoint::update_checkpoint_blames_with_changes; +use crate::{BlameCheckpoint, BlameEntry, Error, Options, Outcome, Statistics}; /// Produce a list of consecutive [`BlameEntry`] instances to indicate in which commits the ranges of the file /// at `suspect:` originated in. /// -/// ## Paramters +/// ## Parameters /// /// * `odb` /// - Access to database objects, also for used for diffing. @@ -26,10 +27,8 @@ use crate::{BlameEntry, Error, Options, Outcome, Statistics}; /// - Optionally, the commitgraph cache. /// * `file_path` /// - A *slash-separated* worktree-relative path to the file to blame. -/// * `range` -/// - A 1-based inclusive range, in order to mirror `git`’s behaviour. `Some(20..40)` represents -/// 21 lines, spanning from line 20 up to and including line 40. This will be converted to -/// `19..40` internally as the algorithm uses 0-based ranges that are exclusive at the end. +/// * `options` +/// - Options to control the blame operation. /// * `resource_cache` /// - Used for diffing trees. /// @@ -49,7 +48,7 @@ use crate::{BlameEntry, Error, Options, Outcome, Statistics}; /// /// - get the commit /// - walk through its parents -/// - for each parent, do a diff and mark lines that don’t have a suspect yet (this is the term +/// - for each parent, do a diff and mark lines that don't have a suspect yet (this is the term /// used in `libgit2`), but that have been changed in this commit /// /// The algorithm in `libgit2` works by going through parents and keeping a linked list of blame @@ -71,307 +70,43 @@ pub fn file( ) -> Result { let _span = gix_trace::coarse!("gix_blame::file()", ?file_path, ?suspect); - let mut stats = Statistics::default(); - let (mut buf, mut buf2, mut buf3) = (Vec::new(), Vec::new(), Vec::new()); - let blamed_file_entry_id = find_path_entry_in_commit( - &odb, - &suspect, - file_path, - cache.as_ref(), - &mut buf, - &mut buf2, - &mut stats, - )? - .ok_or_else(|| Error::FileMissing { - file_path: file_path.to_owned(), - commit_id: suspect, - })?; - let blamed_file_blob = odb.find_blob(&blamed_file_entry_id, &mut buf)?.data.to_vec(); - let num_lines_in_blamed = tokens_for_diffing(&blamed_file_blob).tokenize().count() as u32; - - // Binary or otherwise empty? - if num_lines_in_blamed == 0 { - return Ok(Outcome::default()); - } - - let ranges = options.range.to_zero_based_exclusive(num_lines_in_blamed)?; - let mut hunks_to_blame = Vec::with_capacity(ranges.len()); - - for range in ranges { - hunks_to_blame.push(UnblamedHunk { - range_in_blamed_file: range.clone(), - suspects: [(suspect, range)].into(), - }); - } - - let (mut buf, mut buf2) = (Vec::new(), Vec::new()); - let commit = find_commit(cache.as_ref(), &odb, &suspect, &mut buf)?; - let mut queue: gix_revwalk::PriorityQueue = gix_revwalk::PriorityQueue::new(); - queue.insert(commit_time(commit)?, suspect); - - let mut out = Vec::new(); - let mut diff_state = gix_diff::tree::State::default(); - let mut previous_entry: Option<(ObjectId, ObjectId)> = None; - 'outer: while let Some(suspect) = queue.pop_value() { - stats.commits_traversed += 1; - if hunks_to_blame.is_empty() { - break; - } - - let is_still_suspect = hunks_to_blame.iter().any(|hunk| hunk.has_suspect(&suspect)); - if !is_still_suspect { - // There are no `UnblamedHunk`s associated with this `suspect`, so we can continue with - // the next one. - continue 'outer; - } - - let commit = find_commit(cache.as_ref(), &odb, &suspect, &mut buf)?; - let commit_time = commit_time(commit)?; - - if let Some(since) = options.since { - if commit_time < since.seconds { - if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) { - break 'outer; - } - - continue; - } - } - - let parent_ids: ParentIds = collect_parents(commit, &odb, cache.as_ref(), &mut buf2)?; - - if parent_ids.is_empty() { - if queue.is_empty() { - // I’m not entirely sure if this is correct yet. `suspect`, at this point, is the - // `id` of the last `item` that was yielded by `queue`, so it makes sense to assign - // the remaining lines to it, even though we don’t explicitly check whether that is - // true here. We could perhaps use diff-tree-to-tree to compare `suspect` against - // an empty tree to validate this assumption. - if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) { - break 'outer; - } - } - // There is more, keep looking. - continue; - } - - let mut entry = previous_entry - .take() - .filter(|(id, _)| *id == suspect) - .map(|(_, entry)| entry); - if entry.is_none() { - entry = find_path_entry_in_commit( - &odb, - &suspect, - file_path, - cache.as_ref(), - &mut buf, - &mut buf2, - &mut stats, - )?; - } - - let Some(entry_id) = entry else { - continue; - }; - - // This block asserts that, for every `UnblamedHunk`, all lines in the *Blamed File* are - // identical to the corresponding lines in the *Source File*. - #[cfg(debug_assertions)] - { - let source_blob = odb.find_blob(&entry_id, &mut buf)?.data.to_vec(); - let mut source_interner = gix_diff::blob::intern::Interner::new(source_blob.len() / 100); - let source_lines_as_tokens: Vec<_> = tokens_for_diffing(&source_blob) - .tokenize() - .map(|token| source_interner.intern(token)) - .collect(); - - let mut blamed_interner = gix_diff::blob::intern::Interner::new(blamed_file_blob.len() / 100); - let blamed_lines_as_tokens: Vec<_> = tokens_for_diffing(&blamed_file_blob) - .tokenize() - .map(|token| blamed_interner.intern(token)) - .collect(); - - for hunk in hunks_to_blame.iter() { - if let Some(range_in_suspect) = hunk.get_range(&suspect) { - let range_in_blamed_file = hunk.range_in_blamed_file.clone(); - - for (blamed_line_number, source_line_number) in range_in_blamed_file.zip(range_in_suspect.clone()) { - let source_token = source_lines_as_tokens[source_line_number as usize]; - let blame_token = blamed_lines_as_tokens[blamed_line_number as usize]; - - let source_line = BString::new(source_interner[source_token].into()); - let blamed_line = BString::new(blamed_interner[blame_token].into()); - - assert_eq!(source_line, blamed_line); - } - } - } - } - - for (pid, (parent_id, parent_commit_time)) in parent_ids.iter().enumerate() { - if let Some(parent_entry_id) = find_path_entry_in_commit( - &odb, - parent_id, - file_path, - cache.as_ref(), - &mut buf, - &mut buf2, - &mut stats, - )? { - let no_change_in_entry = entry_id == parent_entry_id; - if pid == 0 { - previous_entry = Some((*parent_id, parent_entry_id)); - } - if no_change_in_entry { - pass_blame_from_to(suspect, *parent_id, &mut hunks_to_blame); - queue.insert(*parent_commit_time, *parent_id); - continue 'outer; - } - } - } - - let more_than_one_parent = parent_ids.len() > 1; - for (parent_id, parent_commit_time) in parent_ids { - queue.insert(parent_commit_time, parent_id); - let changes_for_file_path = tree_diff_at_file_path( - &odb, - file_path, - suspect, - parent_id, - cache.as_ref(), - &mut stats, - &mut diff_state, - &mut buf, - &mut buf2, - &mut buf3, - )?; - let Some(modification) = changes_for_file_path else { - if more_than_one_parent { - // None of the changes affected the file we’re currently blaming. - // Copy blame to parent. - for unblamed_hunk in &mut hunks_to_blame { - unblamed_hunk.clone_blame(suspect, parent_id); - } - } else { - pass_blame_from_to(suspect, parent_id, &mut hunks_to_blame); - } - continue; - }; - - match modification { - gix_diff::tree::recorder::Change::Addition { .. } => { - if more_than_one_parent { - // Do nothing under the assumption that this always (or almost always) - // implies that the file comes from a different parent, compared to which - // it was modified, not added. - } else if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) { - break 'outer; - } - } - gix_diff::tree::recorder::Change::Deletion { .. } => { - unreachable!("We already found file_path in suspect^{{tree}}, so it can't be deleted") - } - gix_diff::tree::recorder::Change::Modification { previous_oid, oid, .. } => { - let changes = blob_changes( - &odb, - resource_cache, - oid, - previous_oid, - file_path, - options.diff_algorithm, - &mut stats, - )?; - hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent_id); - } - } - } - - hunks_to_blame.retain_mut(|unblamed_hunk| { - if unblamed_hunk.suspects.len() == 1 { - if let Some(entry) = BlameEntry::from_unblamed_hunk(unblamed_hunk, suspect) { - // At this point, we have copied blame for every hunk to a parent. Hunks - // that have only `suspect` left in `suspects` have not passed blame to any - // parent, and so they can be converted to a `BlameEntry` and moved to - // `out`. - out.push(entry); - return false; - } - } - unblamed_hunk.remove_blame(suspect); - true - }); - - // This block asserts that line ranges for each suspect never overlap. If they did overlap - // this would mean that the same line in a *Source File* would map to more than one line in - // the *Blamed File* and this is not possible. - #[cfg(debug_assertions)] - { - let ranges = hunks_to_blame.iter().fold( - std::collections::BTreeMap::>>::new(), - |mut acc, hunk| { - for (suspect, range) in hunk.suspects.clone() { - acc.entry(suspect).or_default().push(range); - } - - acc - }, - ); - - for (_, mut ranges) in ranges { - ranges.sort_by(|a, b| a.start.cmp(&b.start)); - - for window in ranges.windows(2) { - if let [a, b] = window { - assert!(a.end <= b.start, "#{hunks_to_blame:#?}"); - } - } - } - } - } - - debug_assert_eq!( - hunks_to_blame, - vec![], - "only if there is no portion of the file left we have completed the blame" - ); - - // I don’t know yet whether it would make sense to use a data structure instead that preserves - // order on insertion. - out.sort_by(|a, b| a.start_in_blamed_file.cmp(&b.start_in_blamed_file)); - Ok(Outcome { - entries: coalesce_blame_entries(out), - blob: blamed_file_blob, - statistics: stats, - }) + let mut processor = BlameProcessor::new(&odb, cache, resource_cache, file_path, options)?; + processor.set_blame_state(suspect)?; + processor.process_blame() } -/// Pass ownership of each unblamed hunk of `from` to `to`. +/// Process blame information starting from a checkpoint. /// -/// This happens when `from` didn't actually change anything in the blamed file. -fn pass_blame_from_to(from: ObjectId, to: ObjectId, hunks_to_blame: &mut Vec) { - for unblamed_hunk in hunks_to_blame { - unblamed_hunk.pass_blame(from, to); - } -} - -/// Convert each of the unblamed hunk in `hunks_to_blame` into a [`BlameEntry`], consuming them in the process. +/// Uses the same core blame algorithm as `file()` but continues from a previously provided checkpoint, +/// allowing for incremental blame operations by reusing previously computed information. +/// +/// # Parameters /// -/// Return `true` if we are done because `hunks_to_blame` is empty. -fn unblamed_to_out_is_done( - hunks_to_blame: &mut Vec, - out: &mut Vec, +/// * `odb` - Access to database objects, used for diffing and object lookup +/// * `suspect` - The commit to start blame processing from +/// * `cache` - Optional commitgraph cache for performance +/// * `blame_checkpoint` - Previous blame information to start from +/// * `resource_cache` - Cache used for diffing trees +/// * `file_path` - Path to the file being blamed (slash-separated, worktree-relative) +/// * `options` - Options controlling the blame operation +/// +/// # Returns +/// +/// Returns an `Outcome` containing the blame entries and statistics, or an error if the operation fails. +pub fn file_checkpoint( + odb: impl gix_object::Find + gix_object::FindHeader, suspect: ObjectId, -) -> bool { - let mut without_suspect = Vec::new(); - out.extend(hunks_to_blame.drain(..).filter_map(|hunk| { - BlameEntry::from_unblamed_hunk(&hunk, suspect).or_else(|| { - without_suspect.push(hunk); - None - }) - })); - *hunks_to_blame = without_suspect; - hunks_to_blame.is_empty() + cache: Option, + blame_checkpoint: BlameCheckpoint, + resource_cache: &mut gix_diff::blob::Platform, + file_path: &BStr, + options: Options, +) -> Result { + let _span = gix_trace::coarse!("gix_blame::file_checkpoint()", ?file_path, ?suspect); + + let mut processor = BlameProcessor::new(&odb, cache, resource_cache, file_path, options)?; + processor.set_blame_state_from_checkpoint(blame_checkpoint, suspect)?; + processor.process_blame() } /// This function merges adjacent blame entries. It merges entries that are adjacent both in the @@ -662,7 +397,7 @@ fn commit_time(commit: gix_traverse::commit::Either<'_, '_>) -> Result; +type ParentIds = SmallVec<[(ObjectId, i64); 2]>; fn collect_parents( commit: gix_traverse::commit::Either<'_, '_>, @@ -699,3 +434,561 @@ fn collect_parents( pub(crate) fn tokens_for_diffing(data: &[u8]) -> impl TokenSource { gix_diff::blob::sources::byte_lines_with_terminator(data) } + +/// State maintained during the blame operation +/// +/// Tracks both the current processing state and final results, including unblamed hunks, +/// completed entries, file content, and statistics. Can be created either from scratch +/// or from a checkpoint. +struct BlameState { + /// The current state of unblamed hunks that still need processing + hunks_to_blame: Vec, + /// The final blame entries that have been processed + out: Vec, + /// The content of the file being blamed + blamed_file_blob: Vec, + /// Statistics gathered during the blame operation + stats: Statistics, + /// Priority queue for processing commits in chronological order + queue: gix_revwalk::PriorityQueue, + /// Cache for the previous entry to avoid redundant lookups + previous_entry: Option<(ObjectId, ObjectId)>, +} + +/// Processor for handling blame operations +/// +/// Encapsulates the logic for processing blame operations, whether starting +/// from scratch or continuing from a checkpoint. +struct BlameProcessor<'a, T: gix_object::Find + gix_object::FindHeader> { + odb: &'a T, + commit_graph_cache: Option, + resource_cache: &'a mut gix_diff::blob::Platform, + file_path: &'a BStr, + options: Options, + blame_state: Option, +} + +impl<'a, T: gix_object::Find + gix_object::FindHeader> BlameProcessor<'a, T> { + /// Creates a new blame processor with the given parameters + fn new( + odb: &'a T, + commit_graph_cache: Option, + resource_cache: &'a mut gix_diff::blob::Platform, + file_path: &'a BStr, + options: Options, + ) -> Result { + Ok(Self { + odb, + commit_graph_cache, + resource_cache, + file_path, + blame_state: None, + options, + }) + } + + /// Sets up the initial blame state for processing from scratch + fn set_blame_state(&mut self, suspect: ObjectId) -> Result<(), Error> { + let blame_state = self.create_blame_state(suspect)?; + self.blame_state = Some(blame_state); + Ok(()) + } + + /// Sets up the initial blame state using information from a checkpoint + fn set_blame_state_from_checkpoint( + &mut self, + blame_checkpoint: BlameCheckpoint, + suspect: ObjectId, + ) -> Result<(), Error> { + let blame_state = self.create_blame_state_from_checkpoint(blame_checkpoint, suspect)?; + self.blame_state = Some(blame_state); + Ok(()) + } + + /// Helper function to find a file entry in a commit and return its blob data + fn find_file_entry_and_blob( + &self, + commit_id: &ObjectId, + stats: &mut Statistics, + buf: &mut Vec, + buf2: &mut Vec, + ) -> Result<(ObjectId, Vec), Error> { + let entry_id = find_path_entry_in_commit( + self.odb, + commit_id, + self.file_path, + self.commit_graph_cache.as_ref(), + buf, + buf2, + stats, + )? + .ok_or_else(|| Error::FileMissing { + file_path: self.file_path.to_owned(), + commit_id: *commit_id, + })?; + + let blob_data = self.odb.find_blob(&entry_id, buf)?.data.to_vec(); + Ok((entry_id, blob_data)) + } + + /// Creates the initial blame state from a checkpoint + /// + /// Sets up blame state by: + /// 1. Getting file contents from checkpoint and suspect commits + /// 2. Computing and applying changes + /// 3. Setting up remaining hunks for processing + fn create_blame_state_from_checkpoint( + &mut self, + blame_checkpoint: BlameCheckpoint, + suspect: ObjectId, + ) -> Result { + let mut stats = Statistics::default(); + let (mut buf, mut buf2) = (Vec::new(), Vec::new()); + + let (checkpoint_file_entry_id, _) = + self.find_file_entry_and_blob(&blame_checkpoint.checkpoint_commit_id, &mut stats, &mut buf, &mut buf2)?; + + let (blamed_file_entry_id, blamed_file_blob) = + self.find_file_entry_and_blob(&suspect, &mut stats, &mut buf, &mut buf2)?; + + let changes_with_checkpoint = blob_changes( + self.odb, + self.resource_cache, + checkpoint_file_entry_id, + blamed_file_entry_id, + self.file_path, + self.options.diff_algorithm, + &mut stats, + )?; + + let mut blame_state = BlameState::empty(blamed_file_blob, suspect, stats); + + // If there are no changes, we update the blame_state with the checkpoint entries. + // Because the range stays 0..0 we are returning early during processing + if changes_with_checkpoint + .iter() + .all(|change| matches!(change, Change::Unchanged(_))) + { + blame_state.set_out(blame_checkpoint.entries); + return Ok(blame_state); + } + + let (blame_entries, hunks_to_blame) = + update_checkpoint_blames_with_changes(blame_checkpoint, changes_with_checkpoint, suspect); + + blame_state.set_out(blame_entries); + if !hunks_to_blame.is_empty() { + blame_state.set_hunks(hunks_to_blame); + } + Ok(blame_state) + } + + /// Set a new BlameState for the given suspect commit + fn create_blame_state(&self, suspect: ObjectId) -> Result { + let mut stats = Statistics::default(); + let (mut buf, mut buf2) = (Vec::new(), Vec::new()); + + let (_, blamed_file_blob) = self.find_file_entry_and_blob(&suspect, &mut stats, &mut buf, &mut buf2)?; + + let num_lines_in_blamed = tokens_for_diffing(&blamed_file_blob).tokenize().count() as u32; + + // Binary or otherwise empty? + if num_lines_in_blamed == 0 { + return Ok(BlameState::empty(blamed_file_blob, suspect, stats)); + } + + let ranges_in_blamed_file = self + .options + .range + .to_zero_based_exclusive(num_lines_in_blamed) + .unwrap_or_default(); + + let commit = find_commit(self.commit_graph_cache.as_ref(), self.odb, &suspect, &mut buf)?; + let initial_commit_time = commit_time(commit)?; + + Ok(BlameState::new( + blamed_file_blob, + ranges_in_blamed_file, + suspect, + initial_commit_time, + Some(stats), + )) + } + + fn process_blame(self) -> Result { + let mut state = match self.blame_state { + Some(state) => state, + None => { + return Err(Error::BlameStateNotSet); + } + }; + + let mut diff_state = gix_diff::tree::State::default(); + let (mut buf, mut buf2, mut buf3) = (Vec::new(), Vec::new(), Vec::new()); + + 'outer: while let Some(suspect) = state.queue.pop_value() { + state.stats.commits_traversed += 1; + if !state.has_unblamed_hunks() { + break; + } + + let is_still_suspect = state.hunks_to_blame.iter().any(|hunk| hunk.has_suspect(&suspect)); + if !is_still_suspect { + // There are no `UnblamedHunk`s associated with this `suspect`, so we can continue with + // the next one. + continue 'outer; + } + + let commit = find_commit(self.commit_graph_cache.as_ref(), self.odb, &suspect, &mut buf)?; + let commit_time = commit_time(commit)?; + + if let Some(since) = self.options.since { + if commit_time < since.seconds { + if state.has_unblamed_hunks() && state.unblamed_to_out_is_done(suspect) { + break 'outer; + } + continue; + } + } + + let parent_ids: ParentIds = collect_parents(commit, self.odb, self.commit_graph_cache.as_ref(), &mut buf2)?; + + if parent_ids.is_empty() { + if state.queue.is_empty() { + // TODO I'm not entirely sure if this is correct yet. `suspect`, at this point, is the + // `id` of the last `item` that was yielded by `queue`, so it makes sense to assign + // the remaining lines to it, even though we don't explicitly check whether that is + // true here. We could perhaps use diff-tree-to-tree to compare `suspect` against + // an empty tree to validate this assumption. + if state.unblamed_to_out_is_done(suspect) { + break 'outer; + } + } + // There is more, keep looking. + continue; + } + + let mut entry = state + .previous_entry + .take() + .filter(|(id, _)| *id == suspect) + .map(|(_, entry)| entry); + if entry.is_none() { + entry = find_path_entry_in_commit( + self.odb, + &suspect, + self.file_path, + self.commit_graph_cache.as_ref(), + &mut buf, + &mut buf2, + &mut state.stats, + )?; + } + + let Some(entry_id) = entry else { + continue; + }; + + #[cfg(debug_assertions)] + state.assert_hunk_and_blame_ranges_identical(self.odb, entry_id, &mut buf)?; + + if state.check_for_unchanged_parents( + self.odb, + self.commit_graph_cache.as_ref(), + self.file_path, + entry_id, + suspect, + &parent_ids, + &mut buf, + &mut buf2, + )? { + continue 'outer; + } + + let more_than_one_parent = parent_ids.len() > 1; + for (parent_id, parent_commit_time) in parent_ids { + state.queue.insert(parent_commit_time, parent_id); + let changes_for_file_path = tree_diff_at_file_path( + self.odb, + self.file_path, + suspect, + parent_id, + self.commit_graph_cache.as_ref(), + &mut state.stats, + &mut diff_state, + &mut buf, + &mut buf2, + &mut buf3, + )?; + + let Some(modification) = changes_for_file_path else { + if more_than_one_parent { + // None of the changes affected the file we're currently blaming. + // Copy blame to parent + state.clone_blame(suspect, parent_id); + } else { + state.pass_blame(suspect, parent_id); + } + continue; + }; + + match modification { + gix_diff::tree::recorder::Change::Addition { .. } => { + if more_than_one_parent { + // Do nothing under the assumption that this always (or almost always) + // implies that the file comes from a different parent, compared to which + // it was modified, not added. + } else if state.unblamed_to_out_is_done(suspect) { + break 'outer; + } + } + gix_diff::tree::recorder::Change::Deletion { .. } => { + unreachable!("We already found file_path in suspect^{{tree}}, so it can't be deleted") + } + gix_diff::tree::recorder::Change::Modification { previous_oid, oid, .. } => { + let changes = blob_changes( + self.odb, + self.resource_cache, + oid, + previous_oid, + self.file_path, + self.options.diff_algorithm, + &mut state.stats, + )?; + state.hunks_to_blame = process_changes(state.hunks_to_blame, changes, suspect, parent_id); + } + } + } + + state.process_completed_hunks(suspect); + } + + #[cfg(debug_assertions)] + state.assert_no_overlapping_ranges(); + + Ok(state.finalize()) + } +} + +impl BlameState { + /// Create a new BlameState with initial values + fn new( + blamed_file_blob: Vec, + initial_ranges: Vec>, + initial_suspect: ObjectId, + initial_commit_time: CommitTime, + statistics: Option, + ) -> Self { + let mut queue = gix_revwalk::PriorityQueue::new(); + queue.insert(initial_commit_time, initial_suspect); + + let mut hunks_to_blame = Vec::with_capacity(initial_ranges.len()); + + for range in initial_ranges { + hunks_to_blame.push(UnblamedHunk { + range_in_blamed_file: range.clone(), + suspects: [(initial_suspect, range)].into(), + }); + } + + Self { + hunks_to_blame, + out: Vec::new(), + blamed_file_blob, + stats: statistics.unwrap_or_default(), + queue, + previous_entry: None, + } + } + + /// Create a BlameState with an empty range, used when there are no lines to blame + /// or when starting from a checkpoint + fn empty(blamed_file_blob: Vec, suspect: ObjectId, stats: Statistics) -> Self { + let empty_range = 0..0; + Self::new(blamed_file_blob, vec![empty_range], suspect, 0, Some(stats)) + } + + /// Check if there are any remaining hunks to process + fn has_unblamed_hunks(&self) -> bool { + !self.hunks_to_blame.is_empty() + } + + /// Convert each of the unblamed hunk in `hunks_to_blame` into a [`BlameEntry`], consuming them in the process. + /// + /// Return `true` if we are done because `hunks_to_blame` is empty. + fn unblamed_to_out_is_done(&mut self, suspect: ObjectId) -> bool { + let mut without_suspect = Vec::new(); + self.out.extend(self.hunks_to_blame.drain(..).filter_map(|hunk| { + BlameEntry::from_unblamed_hunk(&hunk, suspect).or_else(|| { + without_suspect.push(hunk); + None + }) + })); + self.hunks_to_blame = without_suspect; + self.hunks_to_blame.is_empty() + } + + fn set_out(&mut self, out: Vec) { + self.out = out; + } + + fn set_hunks(&mut self, unblamed_hunks: Vec) { + self.hunks_to_blame = unblamed_hunks; + } + + /// Process completed hunks for a suspect, moving them to the output + fn process_completed_hunks(&mut self, suspect: ObjectId) { + self.hunks_to_blame.retain_mut(|unblamed_hunk| { + if unblamed_hunk.suspects.len() == 1 { + if let Some(entry) = BlameEntry::from_unblamed_hunk(unblamed_hunk, suspect) { + // At this point, we have copied blame for every hunk to a parent. Hunks + // that have only `suspect` left in `suspects` have not passed blame to any + // parent, and so they can be converted to a `BlameEntry` and moved to + // `out`. + self.out.push(entry); + return false; + } + } + unblamed_hunk.remove_blame(suspect); + true + }); + } + + /// Pass blame from one commit to another for all hunks + fn pass_blame(&mut self, from: ObjectId, to: ObjectId) { + for unblamed_hunk in &mut self.hunks_to_blame { + unblamed_hunk.pass_blame(from, to); + } + } + + /// Clone blame from one commit to another for all hunks + fn clone_blame(&mut self, from: ObjectId, to: ObjectId) { + for unblamed_hunk in &mut self.hunks_to_blame { + unblamed_hunk.clone_blame(from, to); + } + } + + /// Finalize the blame state and return the outcome + fn finalize(mut self) -> Outcome { + debug_assert_eq!( + self.hunks_to_blame, + vec![], + "only if there is no portion of the file left we have completed the blame" + ); + + // TODO I don't know yet whether it would make sense to use a data structure instead that preserves + // order on insertion. + self.out + .sort_by(|a, b| a.start_in_blamed_file.cmp(&b.start_in_blamed_file)); + Outcome { + entries: coalesce_blame_entries(self.out), + blob: self.blamed_file_blob, + statistics: self.stats, + } + } + + /// Check if any parent has the exact same file content and if so, pass blame to that parent. + /// This is an optimization to avoid unnecessary diffing. + /// Returns true if an unchanged parent was found and handled. + #[allow(clippy::too_many_arguments)] + fn check_for_unchanged_parents( + &mut self, + odb: &impl gix_object::Find, + commit_graph_cache: Option<&gix_commitgraph::Graph>, + file_path: &BStr, + entry_id: ObjectId, + suspect: ObjectId, + parent_ids: &[(ObjectId, i64)], + buf: &mut Vec, + buf2: &mut Vec, + ) -> Result { + for (pid, (parent_id, parent_commit_time)) in parent_ids.iter().enumerate() { + if let Some(parent_entry_id) = find_path_entry_in_commit( + odb, + parent_id, + file_path, + commit_graph_cache, + buf, + buf2, + &mut self.stats, + )? { + let no_change_in_entry = entry_id == parent_entry_id; + if pid == 0 { + self.previous_entry = Some((*parent_id, parent_entry_id)); + } + if no_change_in_entry { + self.pass_blame(suspect, *parent_id); + self.queue.insert(*parent_commit_time, *parent_id); + return Ok(true); + } + } + } + Ok(false) + } + + /// Assert that line ranges for each suspect never overlap + /// This block asserts that line ranges for each suspect never overlap. If they did overlap + // this would mean that the same line in a *Source File* would map to more than one line in + // the *Blamed File* and this is not possible. + #[cfg(debug_assertions)] + fn assert_no_overlapping_ranges(&self) { + let ranges = self.hunks_to_blame.iter().fold( + std::collections::BTreeMap::>>::new(), + |mut acc, hunk| { + for (suspect, range) in hunk.suspects.clone() { + acc.entry(suspect).or_default().push(range); + } + acc + }, + ); + + for (_, mut ranges) in ranges { + ranges.sort_by(|a, b| a.start.cmp(&b.start)); + for window in ranges.windows(2) { + if let [a, b] = window { + assert!(a.end <= b.start, "#{:?}", self.hunks_to_blame); + } + } + } + } + + /// Assert that for every UnblamedHunk, all lines in the Blamed File are identical + /// to the corresponding lines in the Source File. + #[cfg(debug_assertions)] + fn assert_hunk_and_blame_ranges_identical( + &self, + odb: &impl gix_object::Find, + entry_id: ObjectId, + buf: &mut Vec, + ) -> Result<(), Error> { + let source_blob = odb.find_blob(&entry_id, buf)?.data.to_vec(); + let mut source_interner = gix_diff::blob::intern::Interner::new(source_blob.len() / 100); + let source_lines_as_tokens: Vec<_> = tokens_for_diffing(&source_blob) + .tokenize() + .map(|token| source_interner.intern(token)) + .collect(); + + let mut blamed_interner = gix_diff::blob::intern::Interner::new(self.blamed_file_blob.len() / 100); + let blamed_lines_as_tokens: Vec<_> = tokens_for_diffing(&self.blamed_file_blob) + .tokenize() + .map(|token| blamed_interner.intern(token)) + .collect(); + + for hunk in self.hunks_to_blame.iter() { + if let Some(range_in_suspect) = hunk.get_range(&entry_id) { + let range_in_blamed_file = hunk.range_in_blamed_file.clone(); + + for (blamed_line_number, source_line_number) in range_in_blamed_file.zip(range_in_suspect.clone()) { + let source_token = source_lines_as_tokens[source_line_number as usize]; + let blame_token = blamed_lines_as_tokens[blamed_line_number as usize]; + + let source_line = BString::new(source_interner[source_token].into()); + let blamed_line = BString::new(blamed_interner[blame_token].into()); + + assert_eq!(source_line, blamed_line); + } + } + } + Ok(()) + } +} diff --git a/gix-blame/src/file/mod.rs b/gix-blame/src/file/mod.rs index bbca9681837..9528da2cfa2 100644 --- a/gix-blame/src/file/mod.rs +++ b/gix-blame/src/file/mod.rs @@ -6,6 +6,7 @@ use gix_hash::ObjectId; use crate::types::{BlameEntry, Change, Either, LineRange, Offset, UnblamedHunk}; +pub(crate) mod checkpoint; pub(super) mod function; /// Compare a section from the *Blamed File* (`hunk`) with a change from a diff and see if there diff --git a/gix-blame/src/file/tests.rs b/gix-blame/src/file/tests.rs index 02e7d89ebd1..4db06f18964 100644 --- a/gix-blame/src/file/tests.rs +++ b/gix-blame/src/file/tests.rs @@ -29,6 +29,265 @@ fn one_sha() -> ObjectId { ObjectId::from_str("1111111111111111111111111111111111111111").unwrap() } +fn two_sha() -> ObjectId { + use std::str::FromStr; + + ObjectId::from_str("2222222222222222222222222222222222222222").unwrap() +} + +mod blame_cache_to_hunks { + use super::*; + use crate::file::{checkpoint::update_checkpoint_blames_with_changes, BlameEntry, Change, UnblamedHunk}; + use crate::BlameCheckpoint; + + fn single_blame_entry() -> BlameCheckpoint { + BlameCheckpoint { + entries: vec![BlameEntry::new(0..5, 0..5, zero_sha())], + checkpoint_commit_id: zero_sha(), + } + } + + fn multiple_blame_entries() -> BlameCheckpoint { + BlameCheckpoint { + entries: vec![ + BlameEntry::new(0..5, 0..5, zero_sha()), + BlameEntry::new(5..10, 0..5, one_sha()), + ], + checkpoint_commit_id: zero_sha(), + } + } + + #[test] + fn no_changes() { + let checkpoint = single_blame_entry(); + let changes = vec![Change::Unchanged(0..5)]; + + let expected_blame = single_blame_entry().entries; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, one_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert!(new_unblamed_hunks.is_empty()); + } + + #[test] + fn deleted_line_full_blame() { + let checkpoint = single_blame_entry(); + let changes = vec![Change::Deleted(0, 5)]; + let expected_blame = vec![]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, one_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert!(new_unblamed_hunks.is_empty()); + } + + #[test] + fn deleted_line_partial_first_delete() { + let checkpoint = single_blame_entry(); + let changes = vec![Change::Deleted(0, 3), Change::Unchanged(0..2)]; + let expected_blame = vec![BlameEntry::new(0..2, 3..5, zero_sha())]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, one_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert!(new_unblamed_hunks.is_empty()); + } + + #[test] + fn deleted_line_partial_first_unchanged() { + let checkpoint = single_blame_entry(); + let changes = vec![Change::Unchanged(0..2), Change::Deleted(0, 3)]; + let expected_blame = vec![BlameEntry::new(0..2, 0..2, zero_sha())]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, one_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert!(new_unblamed_hunks.is_empty()); + } + + #[test] + fn deleted_line_spanning() { + let checkpoint = single_blame_entry(); + let changes = vec![Change::Unchanged(0..1), Change::Deleted(1, 3), Change::Unchanged(1..2)]; + let expected_blame = vec![ + BlameEntry::new(0..1, 0..1, zero_sha()), + BlameEntry::new(1..2, 4..5, zero_sha()), + ]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, one_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert!(new_unblamed_hunks.is_empty()); + } + + #[test] + fn add_or_replace_full_blame_delete() { + let checkpoint = single_blame_entry(); + let changes = vec![Change::AddedOrReplaced(0..5, 5)]; + let expected_blame = vec![]; + let expected_unblamed_hunks = vec![UnblamedHunk { + range_in_blamed_file: 0..5, + suspects: [(two_sha(), 0..5)].into(), + }]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, two_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert_eq!(new_unblamed_hunks, expected_unblamed_hunks); + } + + #[test] + fn add_or_replace_first_partial_blame_delete() { + let checkpoint = single_blame_entry(); + let changes = vec![Change::AddedOrReplaced(0..5, 3), Change::Unchanged(5..7)]; + let expected_blame = vec![BlameEntry::new(5..7, 3..5, zero_sha())]; + let expected_unblamed_hunks = vec![UnblamedHunk { + range_in_blamed_file: 0..5, + suspects: [(two_sha(), 0..5)].into(), + }]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, two_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert_eq!(new_unblamed_hunks, expected_unblamed_hunks); + } + + #[test] + fn add_or_replace_first_unchanged_partial_blame_delete() { + let checkpoint = single_blame_entry(); + let changes = vec![Change::Unchanged(0..3), Change::AddedOrReplaced(0..5, 2)]; + let expected_blame = vec![BlameEntry::new(0..3, 0..3, zero_sha())]; + let expected_unblamed_hunks = vec![UnblamedHunk { + range_in_blamed_file: 0..5, + suspects: [(two_sha(), 0..5)].into(), + }]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, two_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert_eq!(new_unblamed_hunks, expected_unblamed_hunks); + } + + #[test] + fn add_or_replace_unchanged_replace_unchanged() { + let checkpoint = single_blame_entry(); + let changes = vec![ + Change::Unchanged(0..2), + Change::AddedOrReplaced(0..5, 1), + Change::Unchanged(7..9), + ]; + let expected_blame = vec![ + BlameEntry::new(0..2, 0..2, zero_sha()), + BlameEntry::new(7..9, 3..5, zero_sha()), + ]; + let expected_unblamed_hunks = vec![UnblamedHunk { + range_in_blamed_file: 0..5, + suspects: [(two_sha(), 0..5)].into(), + }]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, two_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert_eq!(new_unblamed_hunks, expected_unblamed_hunks); + } + + #[test] + fn add_or_replace_no_deletion() { + let checkpoint = single_blame_entry(); + let changes = vec![ + Change::Unchanged(0..2), + Change::AddedOrReplaced(0..5, 0), + Change::Unchanged(6..9), + ]; + let expected_blame = vec![ + BlameEntry::new(0..2, 0..2, zero_sha()), + BlameEntry::new(6..9, 2..5, zero_sha()), + ]; + let expected_unblamed_hunks = vec![UnblamedHunk { + range_in_blamed_file: 0..5, + suspects: [(two_sha(), 0..5)].into(), + }]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, two_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert_eq!(new_unblamed_hunks, expected_unblamed_hunks); + } + + #[test] + fn multiple_blames_no_change() { + let checkpoint = multiple_blame_entries(); + let changes = vec![Change::Unchanged(0..10)]; + let expected_blame = multiple_blame_entries().entries; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, one_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert!(new_unblamed_hunks.is_empty()); + } + + #[test] + fn multiple_blames_change_spans_multiple_lines_first_unchanged() { + let checkpoint = multiple_blame_entries(); + let changes = vec![Change::Unchanged(0..6), Change::Deleted(6, 4)]; + let expected_blame = vec![ + BlameEntry::new(0..5, 0..5, zero_sha()), + BlameEntry::new(5..6, 0..1, one_sha()), + ]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, one_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert!(new_unblamed_hunks.is_empty()); + } + + #[test] + fn multiple_blames_change_spans_multiple_lines_first_delete() { + let checkpoint = multiple_blame_entries(); + let changes = vec![Change::Deleted(0, 4), Change::Unchanged(0..6)]; + let expected_blame = vec![ + BlameEntry::new(0..1, 4..5, zero_sha()), + BlameEntry::new(1..6, 0..5, one_sha()), + ]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, one_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert!(new_unblamed_hunks.is_empty()); + } + + #[test] + fn multiple_blames_change_spans_delete_spanning() { + let checkpoint = multiple_blame_entries(); + let changes = vec![Change::Unchanged(0..4), Change::Deleted(4, 4), Change::Unchanged(4..6)]; + let expected_blame = vec![ + BlameEntry::new(0..4, 0..4, zero_sha()), + BlameEntry::new(4..6, 3..5, one_sha()), + ]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, one_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert!(new_unblamed_hunks.is_empty()); + } + + #[test] + fn multiple_blames_add_or_replace_blame_delete_spanning() { + let checkpoint = multiple_blame_entries(); + let changes = vec![ + Change::Unchanged(0..4), + Change::AddedOrReplaced(4..9, 3), + Change::Unchanged(9..10), + Change::AddedOrReplaced(10..12, 2), + ]; + let expected_blame = vec![ + BlameEntry::new(0..4, 0..4, zero_sha()), + BlameEntry::new(9..10, 2..3, one_sha()), + ]; + let expected_unblamed_hunks = vec![ + UnblamedHunk { + range_in_blamed_file: 4..9, + suspects: [(two_sha(), 4..9)].into(), + }, + UnblamedHunk { + range_in_blamed_file: 10..12, + suspects: [(two_sha(), 10..12)].into(), + }, + ]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, two_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert_eq!(new_unblamed_hunks, expected_unblamed_hunks); + } +} + mod process_change { use super::*; use crate::file::{process_change, Change, Offset, UnblamedHunk}; diff --git a/gix-blame/src/lib.rs b/gix-blame/src/lib.rs index aca4299d41c..399c7e1e619 100644 --- a/gix-blame/src/lib.rs +++ b/gix-blame/src/lib.rs @@ -17,7 +17,7 @@ mod error; pub use error::Error; mod types; -pub use types::{BlameEntry, BlameRanges, Options, Outcome, Statistics}; +pub use types::{BlameCheckpoint, BlameEntry, BlameRanges, Options, Outcome, Statistics}; mod file; -pub use file::function::file; +pub use file::function::{file, file_checkpoint}; diff --git a/gix-blame/src/types.rs b/gix-blame/src/types.rs index d60b723e598..184eb5bb523 100644 --- a/gix-blame/src/types.rs +++ b/gix-blame/src/types.rs @@ -312,6 +312,23 @@ impl BlameEntry { } } +/// A checkpoint for resuming blame operations. +/// +/// Contains previously computed blame entries that can be used with `file_checkpoint()` to: +/// - Resume processing from a previous point +/// - Enable incremental updates when history changes +/// - Speed up subsequent operations through caching +/// +/// When used, entries are validated against changes since checkpoint creation +/// and serve as the starting point for remaining blame computation. +#[derive(Debug, PartialEq)] +pub struct BlameCheckpoint { + /// The entries of the cache. + pub entries: Vec, + /// The commit that was blamed to produce these entries. + pub checkpoint_commit_id: ObjectId, +} + pub(crate) trait LineRange { fn shift_by(&self, offset: Offset) -> Self; } diff --git a/gix-blame/tests/blame.rs b/gix-blame/tests/blame.rs index 99e5105f6df..8f14b600ff5 100644 --- a/gix-blame/tests/blame.rs +++ b/gix-blame/tests/blame.rs @@ -1,6 +1,7 @@ use std::path::PathBuf; -use gix_blame::BlameRanges; +use gix_blame::{BlameCheckpoint, BlameRanges}; +use gix_ref::file::ReferenceExt; use gix_hash::ObjectId; use gix_object::bstr; @@ -417,6 +418,77 @@ mod blame_ranges { } } +#[test] +fn blame_with_checkpoint() -> gix_testtools::Result<()> { + let Fixture { + odb, + mut resource_cache, + suspect, + } = Fixture::new()?; + + // Find the checkpoint commit using the tag + let store = gix_ref::file::Store::at( + fixture_path().join(".git"), + gix_ref::store::init::Options { + write_reflog: gix_ref::store::WriteReflog::Disable, + ..Default::default() + }, + ); + let mut checkpoint_ref = gix_ref::file::Store::find(&store, "refs/tags/checkpoint-commit")?; + let checkpoint_commit = checkpoint_ref.peel_to_id_in_place(&store, &odb)?; + assert_ne!( + checkpoint_commit, + suspect, + "should not be the same commit as the suspect"); + + // First run blame on the checkpoint commit + let checkpoint_blame = gix_blame::file( + &odb, + checkpoint_commit, + None, + &mut resource_cache, + "checkpoint-test.txt".into(), + gix_blame::Options { + diff_algorithm: gix_diff::blob::Algorithm::Histogram, + range: BlameRanges::default(), + since: None, + }, + )?; + + // Create checkpoint + let checkpoint = BlameCheckpoint { + entries: checkpoint_blame.entries.clone(), + checkpoint_commit_id: checkpoint_commit, + }; + + // Run blame from checkpoint up to suspect commit + let blame_from_checkpoint = gix_blame::file_checkpoint( + &odb, + suspect, + None, + checkpoint, + &mut resource_cache, + "checkpoint-test.txt".into(), + gix_blame::Options { + diff_algorithm: gix_diff::blob::Algorithm::Histogram, + range: BlameRanges::default(), + since: None, + }, + )?; + + // Compare with git's blame output + let git_dir = fixture_path().join(".git"); + let baseline = Baseline::collect(git_dir.join("checkpoint-test.baseline"))?; + assert_eq!(blame_from_checkpoint.entries, baseline); + + // Also verify the checkpoint state matches git's blame at that commit + let checkpoint_baseline = Baseline::collect(git_dir.join("checkpoint-test-initial.baseline"))?; + assert_eq!(checkpoint_blame.entries, checkpoint_baseline); + assert_ne!(blame_from_checkpoint.entries, checkpoint_blame.entries); + + Ok(()) +} + fn fixture_path() -> PathBuf { gix_testtools::scripted_fixture_read_only("make_blame_repo.sh").unwrap() } diff --git a/gix-blame/tests/fixtures/make_blame_repo.sh b/gix-blame/tests/fixtures/make_blame_repo.sh index fe5ca4f1af0..5afcf46d1e0 100755 --- a/gix-blame/tests/fixtures/make_blame_repo.sh +++ b/gix-blame/tests/fixtures/make_blame_repo.sh @@ -6,6 +6,10 @@ git config --local diff.algorithm histogram git config merge.ff false +# Set up committer info with explicit dates +export GIT_COMMITTER_DATE="2024-01-01 12:00:00 +0000" +export GIT_AUTHOR_DATE="2024-01-01 12:00:00 +0000" + git checkout -q -b main echo "line 1" >> simple.txt @@ -53,6 +57,7 @@ echo -e "line 1\nline in between\nline 2" > coalesce-adjacent-hunks.txt git add same-line-changed-twice.txt git add coalesce-adjacent-hunks.txt git commit -q -m c2.4 +git tag c2.4 # Add tag for the commit echo "line 3" >> simple.txt git add simple.txt @@ -128,9 +133,24 @@ cp empty-lines-histogram.txt empty-lines-myers.txt git add empty-lines-histogram.txt empty-lines-myers.txt git commit -q -m c5.4 +# Add test case for checkpoint functionality +echo -e "line 1\nline 2\nline 3" > checkpoint-test.txt +git add checkpoint-test.txt +git commit -q -m c5.5 +git tag checkpoint-commit # Tag this commit for easy reference + +echo -e "line 1\nline 2 modified\nline 3" > checkpoint-test.txt +git add checkpoint-test.txt +git commit -q -m c5.6 + +# Generate baseline for both states +git blame --porcelain checkpoint-test.txt > .git/checkpoint-test.baseline +git checkout checkpoint-commit +git blame --porcelain checkpoint-test.txt > .git/checkpoint-test-initial.baseline + # The commit history created by the commits above this line is linear, it only # contains commits that have exactly one parent. -# Below this line, there’s also commits that have more than one parent. +# Below this line, there's also commits that have more than one parent. echo -e "line 1 original\nline 2\n line 3" > resolved-conflict.txt git add resolved-conflict.txt @@ -200,7 +220,7 @@ git merge branch-that-has-one-of-the-changes || true # This is to verify that, even though commits `c15`, `c15.1` and `merge` are # not chronologically ordered (`c15.1` has a `CommitDate` that is before -# `c15`’s `CommitDate`), the resulting blame is still correct. +# `c15`'s `CommitDate`), the resulting blame is still correct. # # ---c15------c15.2 # \ \ @@ -253,3 +273,5 @@ git blame --porcelain empty-lines-histogram.txt > .git/empty-lines-histogram.bas git config --local diff.algorithm myers git blame --porcelain empty-lines-myers.txt > .git/empty-lines-myers.baseline + +