From d46766aa29c4ac0bb198aa74fadb5b07ba82f03b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20R=C3=BC=C3=9Fler?= Date: Sat, 3 May 2025 10:42:35 +0200 Subject: [PATCH 1/5] Provide more context in assertion --- gix-blame/src/file/function.rs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/gix-blame/src/file/function.rs b/gix-blame/src/file/function.rs index fbbb5c4248f..2a5666d9de6 100644 --- a/gix-blame/src/file/function.rs +++ b/gix-blame/src/file/function.rs @@ -197,15 +197,16 @@ pub fn file( if let Some(range_in_suspect) = hunk.get_range(&suspect) { let range_in_blamed_file = hunk.range_in_blamed_file.clone(); - for (blamed_line_number, source_line_number) in range_in_blamed_file.zip(range_in_suspect.clone()) { - let source_token = source_lines_as_tokens[source_line_number as usize]; - let blame_token = blamed_lines_as_tokens[blamed_line_number as usize]; - - let source_line = BString::new(source_interner[source_token].into()); - let blamed_line = BString::new(blamed_interner[blame_token].into()); - - assert_eq!(source_line, blamed_line); - } + let source_lines = range_in_suspect + .clone() + .map(|i| BString::new(source_interner[source_lines_as_tokens[i as usize]].into())) + .collect::>(); + let blamed_lines = range_in_blamed_file + .clone() + .map(|i| BString::new(blamed_interner[blamed_lines_as_tokens[i as usize]].into())) + .collect::>(); + + assert_eq!(source_lines, blamed_lines); } } } From 6e1ea6d85b8396b8348498c643d92eafb832987c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20R=C3=BC=C3=9Fler?= Date: Sat, 3 May 2025 10:43:48 +0200 Subject: [PATCH 2/5] Correctly process overlapping unblamed hunks Previously, `process_changes` assumed that `UnblamedHunk`s would never overlap. This constraint has been lifted. As a consequence, the high-level assumption that two different lines from a *Blamed File* can never map to the same line in a *Source File* has been lifted as well. There is not really a way to enforce this constraint in the blame algorithm alone as it heavily depends on the algorithm used for diffing. --- Cargo.lock | 1 + gix-blame/Cargo.toml | 1 + gix-blame/src/file/function.rs | 27 ------------------ gix-blame/src/file/mod.rs | 51 +++++++++++++++++++--------------- gix-blame/src/file/tests.rs | 36 ++++++++++++++++++++++++ gix-blame/src/types.rs | 5 +++- 6 files changed, 71 insertions(+), 50 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c9c3de2b29b..28286ceb130 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1567,6 +1567,7 @@ dependencies = [ "gix-trace 0.1.12", "gix-traverse 0.46.1", "gix-worktree 0.41.0", + "pretty_assertions", "smallvec", "thiserror 2.0.12", ] diff --git a/gix-blame/Cargo.toml b/gix-blame/Cargo.toml index 6731854c123..8a047a2fc3d 100644 --- a/gix-blame/Cargo.toml +++ b/gix-blame/Cargo.toml @@ -31,3 +31,4 @@ gix-fs = { path = "../gix-fs" } gix-index = { path = "../gix-index" } gix-odb = { path = "../gix-odb" } gix-testtools = { path = "../tests/tools" } +pretty_assertions = "1.4.0" diff --git a/gix-blame/src/file/function.rs b/gix-blame/src/file/function.rs index 2a5666d9de6..a0197b44c15 100644 --- a/gix-blame/src/file/function.rs +++ b/gix-blame/src/file/function.rs @@ -303,33 +303,6 @@ pub fn file( unblamed_hunk.remove_blame(suspect); true }); - - // This block asserts that line ranges for each suspect never overlap. If they did overlap - // this would mean that the same line in a *Source File* would map to more than one line in - // the *Blamed File* and this is not possible. - #[cfg(debug_assertions)] - { - let ranges = hunks_to_blame.iter().fold( - std::collections::BTreeMap::>>::new(), - |mut acc, hunk| { - for (suspect, range) in hunk.suspects.clone() { - acc.entry(suspect).or_default().push(range); - } - - acc - }, - ); - - for (_, mut ranges) in ranges { - ranges.sort_by(|a, b| a.start.cmp(&b.start)); - - for window in ranges.windows(2) { - if let [a, b] = window { - assert!(a.end <= b.start, "#{hunks_to_blame:#?}"); - } - } - } - } } debug_assert_eq!( diff --git a/gix-blame/src/file/mod.rs b/gix-blame/src/file/mod.rs index 7470d71e037..bf67ea62060 100644 --- a/gix-blame/src/file/mod.rs +++ b/gix-blame/src/file/mod.rs @@ -320,36 +320,43 @@ fn process_change( /// Consume `hunks_to_blame` and `changes` to pair up matches ranges (also overlapping) with each other. /// Once a match is found, it's pushed onto `out`. +/// +/// `process_changes` assumes that ranges coming from the same *Source File* can and do +/// occasionally overlap. If it were a desirable property of the blame algorithm as a whole to +/// never have two different lines from a *Blamed File* mapped to the same line in a *Source File*, +/// this property would need to be enforced at a higher level than `process_changes` if which case +/// the nested loops could potentially be flattened into one. fn process_changes( hunks_to_blame: Vec, changes: Vec, suspect: ObjectId, parent: ObjectId, ) -> Vec { - let mut hunks_iter = hunks_to_blame.into_iter(); - let mut changes_iter = changes.into_iter(); + let mut new_hunks_to_blame = Vec::new(); - let mut hunk = hunks_iter.next(); - let mut change = changes_iter.next(); + for hunk in hunks_to_blame { + let mut hunk = Some(hunk); - let mut new_hunks_to_blame = Vec::new(); - let mut offset_in_destination = Offset::Added(0); - - loop { - (hunk, change) = process_change( - &mut new_hunks_to_blame, - &mut offset_in_destination, - suspect, - parent, - hunk, - change, - ); - - hunk = hunk.or_else(|| hunks_iter.next()); - change = change.or_else(|| changes_iter.next()); - - if hunk.is_none() && change.is_none() { - break; + let mut offset_in_destination = Offset::Added(0); + + let mut changes_iter = changes.iter(); + let mut change: Option = changes_iter.next().map(|change| change.clone()); + + loop { + (hunk, change) = process_change( + &mut new_hunks_to_blame, + &mut offset_in_destination, + suspect, + parent, + hunk, + change, + ); + + change = change.or_else(|| changes_iter.next().map(|change| change.clone())); + + if hunk.is_none() { + break; + } } } new_hunks_to_blame diff --git a/gix-blame/src/file/tests.rs b/gix-blame/src/file/tests.rs index 02e7d89ebd1..9185ca50633 100644 --- a/gix-blame/src/file/tests.rs +++ b/gix-blame/src/file/tests.rs @@ -959,6 +959,8 @@ mod process_change { } mod process_changes { + use pretty_assertions::assert_eq; + use crate::file::{ process_changes, tests::{new_unblamed_hunk, one_sha, zero_sha}, @@ -1337,4 +1339,38 @@ mod process_changes { ] ); } + + #[test] + fn subsequent_hunks_overlapping_end_of_addition() { + let suspect = zero_sha(); + let parent = one_sha(); + let hunks_to_blame = vec![ + new_unblamed_hunk(13..16, suspect, Offset::Added(0)), + new_unblamed_hunk(10..17, suspect, Offset::Added(0)), + ]; + let changes = vec![Change::AddedOrReplaced(10..14, 0)]; + let new_hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent); + + assert_eq!( + new_hunks_to_blame, + [ + UnblamedHunk { + range_in_blamed_file: 13..14, + suspects: [(suspect, 13..14)].into() + }, + UnblamedHunk { + range_in_blamed_file: 14..16, + suspects: [(parent, 10..12)].into(), + }, + UnblamedHunk { + range_in_blamed_file: 10..14, + suspects: [(suspect, 10..14)].into(), + }, + UnblamedHunk { + range_in_blamed_file: 14..17, + suspects: [(parent, 10..13)].into(), + }, + ] + ); + } } diff --git a/gix-blame/src/types.rs b/gix-blame/src/types.rs index d60b723e598..84b107b6f8d 100644 --- a/gix-blame/src/types.rs +++ b/gix-blame/src/types.rs @@ -353,7 +353,10 @@ pub(crate) enum Either { } /// A single change between two blobs, or an unchanged region. -#[derive(Debug, PartialEq)] +/// +/// Line numbers refer to the file that is referred to as `after` or `NewOrDestination`, depending +/// on the context. +#[derive(Clone, Debug, PartialEq)] pub enum Change { /// A range of tokens that wasn't changed. Unchanged(Range), From 2f6786b08a0c94106b4e93f7835a708adc859fed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20R=C3=BC=C3=9Fler?= Date: Sat, 3 May 2025 13:22:06 +0200 Subject: [PATCH 3/5] Use *Blamed File* and *Source File* more consistently This change also reflects the way `blamed_file` and `source_file` are used throughout the code. --- gix-blame/src/file/function.rs | 3 ++- gix-blame/src/file/mod.rs | 9 +++++---- gix-blame/src/lib.rs | 6 +++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/gix-blame/src/file/function.rs b/gix-blame/src/file/function.rs index a0197b44c15..03429b61b12 100644 --- a/gix-blame/src/file/function.rs +++ b/gix-blame/src/file/function.rs @@ -37,7 +37,8 @@ use crate::{BlameEntry, Error, Options, Outcome, Statistics}; /// /// *For brevity, `HEAD` denotes the starting point of the blame operation. It could be any commit, or even commits that /// represent the worktree state. -/// We begin with a single *Unblamed Hunk* and a single suspect, usually the `HEAD` commit as the commit containing the +/// +/// We begin with one or more *Unblamed Hunks* and a single suspect, usually the `HEAD` commit as the commit containing the /// *Blamed File*, so that it contains the entire file, with the first commit being a candidate for the entire *Blamed File*. /// We traverse the commit graph starting at the first suspect, and see if there have been changes to `file_path`. /// If so, we have found a *Source File* and a *Suspect* commit, and have hunks that represent these changes. diff --git a/gix-blame/src/file/mod.rs b/gix-blame/src/file/mod.rs index bf67ea62060..f35cdbe9e61 100644 --- a/gix-blame/src/file/mod.rs +++ b/gix-blame/src/file/mod.rs @@ -8,11 +8,12 @@ use crate::types::{BlameEntry, Change, Either, LineRange, Offset, UnblamedHunk}; pub(super) mod function; -/// Compare a section from the *Blamed File* (`hunk`) with a change from a diff and see if there -/// is an intersection with `change`. Based on that intersection, we may generate a [`BlameEntry`] for `out` -/// and/or split the `hunk` into multiple. +/// Compare a section from a potential *Source File* (`hunk`) with a change from a diff and see if +/// there is an intersection with `change`. Based on that intersection, we may generate a +/// [`BlameEntry`] for `out` and/or split the `hunk` into multiple. /// -/// This is the core of the blame implementation as it matches regions in *Source File* to the *Blamed File*. +/// This is the core of the blame implementation as it matches regions in *Blamed File* to +/// corresponding regions in one or more than one *Source File*. fn process_change( new_hunks_to_blame: &mut Vec, offset: &mut Offset, diff --git a/gix-blame/src/lib.rs b/gix-blame/src/lib.rs index aca4299d41c..1cdbeb010d2 100644 --- a/gix-blame/src/lib.rs +++ b/gix-blame/src/lib.rs @@ -2,10 +2,10 @@ //! //! ### Terminology //! -//! * **Source File** -//! - The file as it exists in `HEAD`. -//! - the initial state with all lines that we need to associate with a *Source File*. //! * **Blamed File** +//! - The file as it exists in `HEAD`. +//! - the initial state with all lines that we need to associate with a *Blamed File*. +//! * **Source File** //! - A file at a version (i.e. commit) that introduces hunks into the final 'image'. //! * **Suspects** //! - The versions of the files that can contain hunks that we could use in the final 'image' From ee6f5cc1dc08975da364836adf3a3261d20c7ded Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20R=C3=BC=C3=9Fler?= Date: Sat, 3 May 2025 16:03:06 +0200 Subject: [PATCH 4/5] Thanks clippy --- gix-blame/src/file/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gix-blame/src/file/mod.rs b/gix-blame/src/file/mod.rs index f35cdbe9e61..3e729cec002 100644 --- a/gix-blame/src/file/mod.rs +++ b/gix-blame/src/file/mod.rs @@ -341,7 +341,7 @@ fn process_changes( let mut offset_in_destination = Offset::Added(0); let mut changes_iter = changes.iter(); - let mut change: Option = changes_iter.next().map(|change| change.clone()); + let mut change: Option = changes_iter.next().cloned(); loop { (hunk, change) = process_change( @@ -353,7 +353,7 @@ fn process_changes( change, ); - change = change.or_else(|| changes_iter.next().map(|change| change.clone())); + change = change.or_else(|| changes_iter.next().cloned()); if hunk.is_none() { break; From b2121bcd8be3546cf708242dae070c7173a7d384 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 4 May 2025 20:42:36 +0200 Subject: [PATCH 5/5] refactor all just minor tweaks. --- gix-blame/src/file/mod.rs | 14 ++++++-------- gix-blame/src/lib.rs | 6 +++--- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/gix-blame/src/file/mod.rs b/gix-blame/src/file/mod.rs index 3e729cec002..935fb09eed8 100644 --- a/gix-blame/src/file/mod.rs +++ b/gix-blame/src/file/mod.rs @@ -325,8 +325,8 @@ fn process_change( /// `process_changes` assumes that ranges coming from the same *Source File* can and do /// occasionally overlap. If it were a desirable property of the blame algorithm as a whole to /// never have two different lines from a *Blamed File* mapped to the same line in a *Source File*, -/// this property would need to be enforced at a higher level than `process_changes` if which case -/// the nested loops could potentially be flattened into one. +/// this property would need to be enforced at a higher level than `process_changes`. +/// Then the nested loops could potentially be flattened into one. fn process_changes( hunks_to_blame: Vec, changes: Vec, @@ -335,13 +335,11 @@ fn process_changes( ) -> Vec { let mut new_hunks_to_blame = Vec::new(); - for hunk in hunks_to_blame { - let mut hunk = Some(hunk); - + for mut hunk in hunks_to_blame.into_iter().map(Some) { let mut offset_in_destination = Offset::Added(0); - let mut changes_iter = changes.iter(); - let mut change: Option = changes_iter.next().cloned(); + let mut changes_iter = changes.iter().cloned(); + let mut change = changes_iter.next(); loop { (hunk, change) = process_change( @@ -353,7 +351,7 @@ fn process_changes( change, ); - change = change.or_else(|| changes_iter.next().cloned()); + change = change.or_else(|| changes_iter.next()); if hunk.is_none() { break; diff --git a/gix-blame/src/lib.rs b/gix-blame/src/lib.rs index 1cdbeb010d2..e811ab88ddb 100644 --- a/gix-blame/src/lib.rs +++ b/gix-blame/src/lib.rs @@ -4,13 +4,13 @@ //! //! * **Blamed File** //! - The file as it exists in `HEAD`. -//! - the initial state with all lines that we need to associate with a *Blamed File*. +//! - the initial state with all lines that we need to associate with a *Source File*. //! * **Source File** -//! - A file at a version (i.e. commit) that introduces hunks into the final 'image'. +//! - A file at a version (i.e., commit) that introduces hunks into the final 'image' of the *Blamed File*. //! * **Suspects** //! - The versions of the files that can contain hunks that we could use in the final 'image' //! - multiple at the same time as the commit-graph may split up. -//! - turns into *Source File* once we have found an association into the *Blamed File*. +//! - They turn into a *Source File* once we have found an association into the *Blamed File*. #![deny(rust_2018_idioms, missing_docs)] #![forbid(unsafe_code)]