Skip to content

Add it blame-copy-royal #2041

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 84 additions & 13 deletions gix-blame/src/file/function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use gix_traverse::commit::find as find_commit;
use smallvec::SmallVec;

use super::{process_changes, Change, UnblamedHunk};
use crate::{BlameEntry, Error, Options, Outcome, Statistics};
use crate::{types::BlamePathEntry, BlameEntry, Error, Options, Outcome, Statistics};

/// Produce a list of consecutive [`BlameEntry`] instances to indicate in which commits the ranges of the file
/// at `suspect:<file_path>` originated in.
Expand Down Expand Up @@ -115,6 +115,12 @@ pub fn file(
let mut out = Vec::new();
let mut diff_state = gix_diff::tree::State::default();
let mut previous_entry: Option<(ObjectId, ObjectId)> = None;
let mut blame_path = if options.debug_track_path {
Some(Vec::new())
} else {
None
};

'outer: while let Some(suspect) = queue.pop_value() {
stats.commits_traversed += 1;
if hunks_to_blame.is_empty() {
Expand Down Expand Up @@ -156,6 +162,23 @@ pub fn file(
// true here. We could perhaps use diff-tree-to-tree to compare `suspect` against
// an empty tree to validate this assumption.
if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) {
if let Some(ref mut blame_path) = blame_path {
let entry = previous_entry
.take()
.filter(|(id, _)| *id == suspect)
.map(|(_, entry)| entry);

let blame_path_entry = BlamePathEntry {
source_file_path: current_file_path.clone(),
previous_source_file_path: None,
commit_id: suspect,
blob_id: entry.unwrap_or(ObjectId::null(gix_hash::Kind::Sha1)),
previous_blob_id: ObjectId::null(gix_hash::Kind::Sha1),
index: 0,
};
blame_path.push(blame_path_entry);
}

break 'outer;
}
}
Expand Down Expand Up @@ -241,13 +264,13 @@ pub fn file(
}

let more_than_one_parent = parent_ids.len() > 1;
for (parent_id, parent_commit_time) in parent_ids {
queue.insert(parent_commit_time, parent_id);
for (index, (parent_id, parent_commit_time)) in parent_ids.iter().enumerate() {
queue.insert(*parent_commit_time, *parent_id);
let changes_for_file_path = tree_diff_at_file_path(
&odb,
current_file_path.as_ref(),
suspect,
parent_id,
*parent_id,
cache.as_ref(),
&mut stats,
&mut diff_state,
Expand All @@ -262,21 +285,33 @@ pub fn file(
// None of the changes affected the file we’re currently blaming.
// Copy blame to parent.
for unblamed_hunk in &mut hunks_to_blame {
unblamed_hunk.clone_blame(suspect, parent_id);
unblamed_hunk.clone_blame(suspect, *parent_id);
}
} else {
pass_blame_from_to(suspect, parent_id, &mut hunks_to_blame);
pass_blame_from_to(suspect, *parent_id, &mut hunks_to_blame);
}
continue;
};

match modification {
TreeDiffChange::Addition => {
TreeDiffChange::Addition { id } => {
if more_than_one_parent {
// Do nothing under the assumption that this always (or almost always)
// implies that the file comes from a different parent, compared to which
// it was modified, not added.
} else if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) {
if let Some(ref mut blame_path) = blame_path {
let blame_path_entry = BlamePathEntry {
source_file_path: current_file_path.clone(),
previous_source_file_path: None,
commit_id: suspect,
blob_id: id,
previous_blob_id: ObjectId::null(gix_hash::Kind::Sha1),
index,
};
blame_path.push(blame_path_entry);
}

break 'outer;
}
}
Expand All @@ -294,7 +329,22 @@ pub fn file(
options.diff_algorithm,
&mut stats,
)?;
hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent_id);
hunks_to_blame = process_changes(hunks_to_blame, changes.clone(), suspect, *parent_id);
if let Some(ref mut blame_path) = blame_path {
let has_blame_been_passed = hunks_to_blame.iter().any(|hunk| hunk.has_suspect(parent_id));

if has_blame_been_passed {
let blame_path_entry = BlamePathEntry {
source_file_path: current_file_path.clone(),
previous_source_file_path: Some(current_file_path.clone()),
commit_id: suspect,
blob_id: id,
previous_blob_id: previous_id,
index,
};
blame_path.push(blame_path_entry);
}
}
}
TreeDiffChange::Rewrite {
source_location,
Expand All @@ -311,11 +361,29 @@ pub fn file(
options.diff_algorithm,
&mut stats,
)?;
hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent_id);
hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, *parent_id);

let mut has_blame_been_passed = false;

for hunk in hunks_to_blame.iter_mut() {
if hunk.has_suspect(&parent_id) {
if hunk.has_suspect(parent_id) {
hunk.source_file_name = Some(source_location.clone());

has_blame_been_passed = true;
}
}

if has_blame_been_passed {
if let Some(ref mut blame_path) = blame_path {
let blame_path_entry = BlamePathEntry {
source_file_path: current_file_path.clone(),
previous_source_file_path: Some(source_location.clone()),
commit_id: suspect,
blob_id: id,
previous_blob_id: source_id,
index,
};
blame_path.push(blame_path_entry);
}
}
}
Expand Down Expand Up @@ -351,6 +419,7 @@ pub fn file(
entries: coalesce_blame_entries(out),
blob: blamed_file_blob,
statistics: stats,
blame_path,
})
}

Expand Down Expand Up @@ -435,7 +504,9 @@ fn coalesce_blame_entries(lines_blamed: Vec<BlameEntry>) -> Vec<BlameEntry> {
/// The union of [`gix_diff::tree::recorder::Change`] and [`gix_diff::tree_with_rewrites::Change`],
/// keeping only the blame-relevant information.
enum TreeDiffChange {
Addition,
Addition {
id: ObjectId,
},
Deletion,
Modification {
previous_id: ObjectId,
Expand All @@ -453,7 +524,7 @@ impl From<gix_diff::tree::recorder::Change> for TreeDiffChange {
use gix_diff::tree::recorder::Change;

match value {
Change::Addition { .. } => Self::Addition,
Change::Addition { oid, .. } => Self::Addition { id: oid },
Change::Deletion { .. } => Self::Deletion,
Change::Modification { previous_oid, oid, .. } => Self::Modification {
previous_id: previous_oid,
Expand All @@ -468,7 +539,7 @@ impl From<gix_diff::tree_with_rewrites::Change> for TreeDiffChange {
use gix_diff::tree_with_rewrites::Change;

match value {
Change::Addition { .. } => Self::Addition,
Change::Addition { id, .. } => Self::Addition { id },
Change::Deletion { .. } => Self::Deletion,
Change::Modification { previous_id, id, .. } => Self::Modification { previous_id, id },
Change::Rewrite {
Expand Down
2 changes: 1 addition & 1 deletion gix-blame/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
mod error;
pub use error::Error;
mod types;
pub use types::{BlameEntry, BlameRanges, Options, Outcome, Statistics};
pub use types::{BlameEntry, BlamePathEntry, BlameRanges, Options, Outcome, Statistics};

mod file;
pub use file::function::file;
29 changes: 29 additions & 0 deletions gix-blame/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,33 @@ pub struct Options {
pub since: Option<gix_date::Time>,
/// Determine if rename tracking should be performed, and how.
pub rewrites: Option<gix_diff::Rewrites>,
/// Collect debug information whenever there's a diff or rename that affects the outcome of a
/// blame.
pub debug_track_path: bool,
}

/// Represents a change during history traversal for blame. It is supposed to capture enough
/// information to allow reconstruction of the way a blame was performed, i.e. the path that the
/// history traversal, combined with repeated diffing of two subsequent states in this history, has
/// taken.
///
/// Entries are only collected when `Options::debug_track_path` is set, and are returned in
/// `Outcome::blame_path`.
///
/// This is intended for debugging purposes.
#[derive(Clone, Debug)]
pub struct BlamePathEntry {
/// The path of the *Source File* after the change.
pub source_file_path: BString,
/// The path of the *Source File* before the change. Allows detection of renames: it equals
/// `source_file_path` for a plain modification and differs from it for a rewrite.
///
/// `None` if there is no previous version of the file, i.e. the file was added in `commit_id`
/// or the traversal ended at `commit_id` (for instance at a root commit).
pub previous_source_file_path: Option<BString>,
/// The commit id associated with the state after the change.
pub commit_id: ObjectId,
/// The blob id associated with the state after the change.
///
/// This can be the null object id if the blob id was not known when the entry was recorded.
pub blob_id: ObjectId,
/// The blob id associated with the state before the change.
///
/// This is the null object id if there is no previous version of the file.
pub previous_blob_id: ObjectId,
/// When there is more than one `BlamePathEntry` for a commit, this indicates to which parent
/// commit the change is related. It is the position of that parent among the parents of
/// `commit_id` as they were visited, and `0` if the entry is not related to a parent.
pub index: usize,
}

/// The outcome of [`file()`](crate::file()).
Expand All @@ -161,6 +188,8 @@ pub struct Outcome {
pub blob: Vec<u8>,
/// Additional information about the amount of work performed to produce the blame.
pub statistics: Statistics,
/// Contains a log of all changes that affected the outcome of this blame.
pub blame_path: Option<Vec<BlamePathEntry>>,
}

/// Additional information about the performed operations.
Expand Down
7 changes: 7 additions & 0 deletions gix-blame/tests/blame.rs
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ macro_rules! mktest {
range: BlameRanges::default(),
since: None,
rewrites: Some(gix_diff::Rewrites::default()),
debug_track_path: false,
},
)?
.entries;
Expand Down Expand Up @@ -317,6 +318,7 @@ fn diff_disparity() {
range: BlameRanges::default(),
since: None,
rewrites: Some(gix_diff::Rewrites::default()),
debug_track_path: false,
},
)
.unwrap()
Expand Down Expand Up @@ -352,6 +354,7 @@ fn since() -> gix_testtools::Result {
range: BlameRanges::default(),
since: Some(gix_date::parse("2025-01-31", None)?),
rewrites: Some(gix_diff::Rewrites::default()),
debug_track_path: false,
},
)?
.entries;
Expand Down Expand Up @@ -391,6 +394,7 @@ mod blame_ranges {
range: BlameRanges::from_range(1..=2),
since: None,
rewrites: Some(gix_diff::Rewrites::default()),
debug_track_path: false,
},
)?
.entries;
Expand Down Expand Up @@ -431,6 +435,7 @@ mod blame_ranges {
range: ranges,
since: None,
rewrites: None,
debug_track_path: false,
},
)?
.entries;
Expand Down Expand Up @@ -471,6 +476,7 @@ mod blame_ranges {
range: ranges,
since: None,
rewrites: None,
debug_track_path: false,
},
)?
.entries;
Expand Down Expand Up @@ -516,6 +522,7 @@ mod rename_tracking {
range: BlameRanges::default(),
since: None,
rewrites: Some(gix_diff::Rewrites::default()),
debug_track_path: false,
},
)?
.entries;
Expand Down
1 change: 1 addition & 0 deletions src/plumbing/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1591,6 +1591,7 @@ pub fn main() -> Result<()> {
range: gix::blame::BlameRanges::from_ranges(ranges),
since,
rewrites: Some(gix::diff::Rewrites::default()),
debug_track_path: false,
},
out,
statistics.then_some(err),
Expand Down
2 changes: 1 addition & 1 deletion tests/it/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,6 @@ path = "src/main.rs"
[dependencies]
anyhow = "1.0.98"
clap = { version = "4.5.39", features = ["derive"] }
gix = { version = "^0.72.1", path = "../../gix", default-features = false, features = ["attributes", "revision"] }
gix = { version = "^0.72.1", path = "../../gix", default-features = false, features = ["attributes", "blame", "blob-diff", "revision"] }
once_cell = "1.21.3"
regex = { version = "1.11.1", default-features = false, features = ["std"] }
46 changes: 46 additions & 0 deletions tests/it/src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,52 @@ pub struct Args {

#[derive(Debug, clap::Subcommand)]
pub enum Subcommands {
/// Generate a shell script that creates a git repository containing all commits that are
/// traversed when a blame is generated.
///
/// This command extracts the file’s history so that blame, when run on the repository created
/// by the script, shows the same characteristics, in particular bugs, as the original, but in
/// a way that the original source file's content cannot be reconstructed.
///
/// The idea is that by obfuscating the file's content we make it easier for people to share
/// the subset of data that's required for debugging purposes from repositories that are not
/// public.
///
/// Note that the obfuscation leaves certain properties of the source intact, so they can still
/// be inferred from the extracted history. Among these properties are directory structure
/// (though not the directories' names), renames, number of lines, and whitespace.
///
/// This command can also be helpful in debugging the blame algorithm itself.
///
/// ### Terminology
///
/// A **blame history** is the set of commits that the blame algorithm, at some point, treated
/// as potential suspects for any line in a file. It is a subset of all commits that ever
/// changed a file in its history.
///
/// With respect to branches and merge commits, the **blame history** will not necessarily be
/// identical to the file's history in the source repository. This is because the blame
/// algorithm will stop following a file's history for branches that only touch lines for which
/// the source has already been found. The **blame history**, thus, will likely look "cleaner" and
/// "simpler" than the source history.
#[clap(visible_alias = "bcr")]
BlameCopyRoyal {
/// Don't really copy anything.
#[clap(long, short = 'n')]
dry_run: bool,
/// The git root whose history to extract the blame-relevant parts from.
worktree_dir: PathBuf,
/// The directory into which to copy the files.
destination_dir: PathBuf,
/// The file to extract the history for.
file: std::ffi::OsString,
/// Do not use `copy-royal` to obfuscate the content of blobs, but copy it verbatim.
///
/// Note that this should only be done if the source history does not contain information
/// you're not willing to share.
#[clap(long)]
verbatim: bool,
},
/// Copy a tree so that it diffs the same but can't be traced back uniquely to its source.
///
/// The idea is that we don't want to deal with licensing, it's more about patterns in order to
Expand Down
Loading
Loading