diff --git a/gix-blame/src/file/function.rs b/gix-blame/src/file/function.rs index b69780ae244..b5a4067171c 100644 --- a/gix-blame/src/file/function.rs +++ b/gix-blame/src/file/function.rs @@ -10,7 +10,7 @@ use gix_traverse::commit::find as find_commit; use smallvec::SmallVec; use super::{process_changes, Change, UnblamedHunk}; -use crate::{BlameEntry, Error, Options, Outcome, Statistics}; +use crate::{types::BlamePathEntry, BlameEntry, Error, Options, Outcome, Statistics}; /// Produce a list of consecutive [`BlameEntry`] instances to indicate in which commits the ranges of the file /// at `suspect:` originated in. @@ -115,6 +115,12 @@ pub fn file( let mut out = Vec::new(); let mut diff_state = gix_diff::tree::State::default(); let mut previous_entry: Option<(ObjectId, ObjectId)> = None; + let mut blame_path = if options.debug_track_path { + Some(Vec::new()) + } else { + None + }; + 'outer: while let Some(suspect) = queue.pop_value() { stats.commits_traversed += 1; if hunks_to_blame.is_empty() { @@ -156,6 +162,23 @@ pub fn file( // true here. We could perhaps use diff-tree-to-tree to compare `suspect` against // an empty tree to validate this assumption. 
if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) { + if let Some(ref mut blame_path) = blame_path { + let entry = previous_entry + .take() + .filter(|(id, _)| *id == suspect) + .map(|(_, entry)| entry); + + let blame_path_entry = BlamePathEntry { + source_file_path: current_file_path.clone(), + previous_source_file_path: None, + commit_id: suspect, + blob_id: entry.unwrap_or_else(|| ObjectId::null(suspect.kind())), + previous_blob_id: ObjectId::null(suspect.kind()), + index: 0, + }; + blame_path.push(blame_path_entry); + } + break 'outer; } } @@ -241,13 +264,13 @@ pub fn file( } let more_than_one_parent = parent_ids.len() > 1; - for (parent_id, parent_commit_time) in parent_ids { - queue.insert(parent_commit_time, parent_id); + for (index, (parent_id, parent_commit_time)) in parent_ids.iter().enumerate() { + queue.insert(*parent_commit_time, *parent_id); let changes_for_file_path = tree_diff_at_file_path( &odb, current_file_path.as_ref(), suspect, - parent_id, + *parent_id, cache.as_ref(), &mut stats, &mut diff_state, @@ -262,21 +285,33 @@ pub fn file( // None of the changes affected the file we’re currently blaming. // Copy blame to parent. for unblamed_hunk in &mut hunks_to_blame { - unblamed_hunk.clone_blame(suspect, parent_id); + unblamed_hunk.clone_blame(suspect, *parent_id); } } else { - pass_blame_from_to(suspect, parent_id, &mut hunks_to_blame); + pass_blame_from_to(suspect, *parent_id, &mut hunks_to_blame); } continue; }; match modification { - TreeDiffChange::Addition => { + TreeDiffChange::Addition { id } => { if more_than_one_parent { // Do nothing under the assumption that this always (or almost always) // implies that the file comes from a different parent, compared to which // it was modified, not added. 
} else if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) { + if let Some(ref mut blame_path) = blame_path { + let blame_path_entry = BlamePathEntry { + source_file_path: current_file_path.clone(), + previous_source_file_path: None, + commit_id: suspect, + blob_id: id, + previous_blob_id: ObjectId::null(suspect.kind()), + index, + }; + blame_path.push(blame_path_entry); + } + break 'outer; } } @@ -294,7 +329,22 @@ pub fn file( options.diff_algorithm, &mut stats, )?; - hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent_id); + hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, *parent_id); + if let Some(ref mut blame_path) = blame_path { + let has_blame_been_passed = hunks_to_blame.iter().any(|hunk| hunk.has_suspect(parent_id)); + + if has_blame_been_passed { + let blame_path_entry = BlamePathEntry { + source_file_path: current_file_path.clone(), + previous_source_file_path: Some(current_file_path.clone()), + commit_id: suspect, + blob_id: id, + previous_blob_id: previous_id, + index, + }; + blame_path.push(blame_path_entry); + } + } } TreeDiffChange::Rewrite { source_location, @@ -311,11 +361,29 @@ pub fn file( options.diff_algorithm, &mut stats, )?; - hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent_id); + hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, *parent_id); + + let mut has_blame_been_passed = false; for hunk in hunks_to_blame.iter_mut() { - if hunk.has_suspect(&parent_id) { + if hunk.has_suspect(parent_id) { hunk.source_file_name = Some(source_location.clone()); + + has_blame_been_passed = true; + } + } + + if has_blame_been_passed { + if let Some(ref mut blame_path) = blame_path { + let blame_path_entry = BlamePathEntry { + source_file_path: current_file_path.clone(), + previous_source_file_path: Some(source_location.clone()), + commit_id: suspect, + blob_id: id, + previous_blob_id: source_id, + index, + }; + blame_path.push(blame_path_entry); 
} } } @@ -351,6 +419,7 @@ pub fn file( entries: coalesce_blame_entries(out), blob: blamed_file_blob, statistics: stats, + blame_path, }) } @@ -435,7 +504,9 @@ fn coalesce_blame_entries(lines_blamed: Vec) -> Vec { /// The union of [`gix_diff::tree::recorder::Change`] and [`gix_diff::tree_with_rewrites::Change`], /// keeping only the blame-relevant information. enum TreeDiffChange { - Addition, + Addition { + id: ObjectId, + }, Deletion, Modification { previous_id: ObjectId, @@ -453,7 +524,7 @@ impl From for TreeDiffChange { use gix_diff::tree::recorder::Change; match value { - Change::Addition { .. } => Self::Addition, + Change::Addition { oid, .. } => Self::Addition { id: oid }, Change::Deletion { .. } => Self::Deletion, Change::Modification { previous_oid, oid, .. } => Self::Modification { previous_id: previous_oid, @@ -468,7 +539,7 @@ impl From for TreeDiffChange { use gix_diff::tree_with_rewrites::Change; match value { - Change::Addition { .. } => Self::Addition, + Change::Addition { id, .. } => Self::Addition { id }, Change::Deletion { .. } => Self::Deletion, Change::Modification { previous_id, id, .. } => Self::Modification { previous_id, id }, Change::Rewrite { diff --git a/gix-blame/src/lib.rs b/gix-blame/src/lib.rs index e811ab88ddb..2a31c874f8f 100644 --- a/gix-blame/src/lib.rs +++ b/gix-blame/src/lib.rs @@ -17,7 +17,7 @@ mod error; pub use error::Error; mod types; -pub use types::{BlameEntry, BlameRanges, Options, Outcome, Statistics}; +pub use types::{BlameEntry, BlamePathEntry, BlameRanges, Options, Outcome, Statistics}; mod file; pub use file::function::file; diff --git a/gix-blame/src/types.rs b/gix-blame/src/types.rs index 5672eaf679d..bd3502ef993 100644 --- a/gix-blame/src/types.rs +++ b/gix-blame/src/types.rs @@ -149,6 +149,33 @@ pub struct Options { pub since: Option, /// Determine if rename tracking should be performed, and how. 
pub rewrites: Option<gix_diff::Rewrites>, + /// Collect debug information whenever there's a diff or rename that affects the outcome of a + /// blame. + pub debug_track_path: bool, +} + +/// Represents a change during history traversal for blame. It is supposed to capture enough +/// information to allow reconstruction of the way a blame was performed, i.e. the path that the +/// history traversal, combined with repeated diffing of two subsequent states in this history, has +/// taken. +/// +/// This is intended for debugging purposes. +#[derive(Clone, Debug)] +pub struct BlamePathEntry { + /// The path to the *Source File* in the blob after the change. + pub source_file_path: BString, + /// The path to the *Source File* in the blob before the change. Allows + /// detection of renames. `None` for root commits and for additions of the file. + pub previous_source_file_path: Option<BString>, + /// The commit id associated with the state after the change. + pub commit_id: ObjectId, + /// The blob id associated with the state after the change. + pub blob_id: ObjectId, + /// The blob id associated with the state before the change, or the null id if there is none. + pub previous_blob_id: ObjectId, + /// When there is more than one `BlamePathEntry` for a commit, this indicates to which parent + /// commit the change is related. + pub index: usize, } /// The outcome of [`file()`](crate::file()). @@ -161,6 +188,8 @@ pub struct Outcome { pub blob: Vec<u8>, /// Additional information about the amount of work performed to produce the blame. pub statistics: Statistics, + /// Contains a log of all changes that affected the outcome of this blame. + pub blame_path: Option<Vec<BlamePathEntry>>, } /// Additional information about the performed operations. diff --git a/gix-blame/tests/blame.rs b/gix-blame/tests/blame.rs index bec002c6dd7..cdd91f852a9 100644 --- a/gix-blame/tests/blame.rs +++ b/gix-blame/tests/blame.rs @@ -232,6 +232,7 @@ macro_rules! mktest { range: BlameRanges::default(), since: None, rewrites: Some(gix_diff::Rewrites::default()), + debug_track_path: false, }, )? 
.entries; @@ -317,6 +318,7 @@ fn diff_disparity() { range: BlameRanges::default(), since: None, rewrites: Some(gix_diff::Rewrites::default()), + debug_track_path: false, }, ) .unwrap() @@ -352,6 +354,7 @@ fn since() -> gix_testtools::Result { range: BlameRanges::default(), since: Some(gix_date::parse("2025-01-31", None)?), rewrites: Some(gix_diff::Rewrites::default()), + debug_track_path: false, }, )? .entries; @@ -391,6 +394,7 @@ mod blame_ranges { range: BlameRanges::from_range(1..=2), since: None, rewrites: Some(gix_diff::Rewrites::default()), + debug_track_path: false, }, )? .entries; @@ -431,6 +435,7 @@ mod blame_ranges { range: ranges, since: None, rewrites: None, + debug_track_path: false, }, )? .entries; @@ -471,6 +476,7 @@ mod blame_ranges { range: ranges, since: None, rewrites: None, + debug_track_path: false, }, )? .entries; @@ -516,6 +522,7 @@ mod rename_tracking { range: BlameRanges::default(), since: None, rewrites: Some(gix_diff::Rewrites::default()), + debug_track_path: false, }, )? 
.entries; diff --git a/src/plumbing/main.rs b/src/plumbing/main.rs index aedee5cc052..8db7d3603be 100644 --- a/src/plumbing/main.rs +++ b/src/plumbing/main.rs @@ -1591,6 +1591,7 @@ pub fn main() -> Result<()> { range: gix::blame::BlameRanges::from_ranges(ranges), since, rewrites: Some(gix::diff::Rewrites::default()), + debug_track_path: false, }, out, statistics.then_some(err), diff --git a/tests/it/Cargo.toml b/tests/it/Cargo.toml index 5895ee5dab8..197d37a2179 100644 --- a/tests/it/Cargo.toml +++ b/tests/it/Cargo.toml @@ -17,6 +17,6 @@ path = "src/main.rs" [dependencies] anyhow = "1.0.98" clap = { version = "4.5.39", features = ["derive"] } -gix = { version = "^0.72.1", path = "../../gix", default-features = false, features = ["attributes", "revision"] } +gix = { version = "^0.72.1", path = "../../gix", default-features = false, features = ["attributes", "blame", "blob-diff", "revision"] } once_cell = "1.21.3" regex = { version = "1.11.1", default-features = false, features = ["std"] } diff --git a/tests/it/src/args.rs b/tests/it/src/args.rs index 85c95d92648..d3b1bf75131 100644 --- a/tests/it/src/args.rs +++ b/tests/it/src/args.rs @@ -14,6 +14,52 @@ pub struct Args { #[derive(Debug, clap::Subcommand)] pub enum Subcommands { + /// Generate a shell script that creates a git repository containing all commits that are + /// traversed when a blame is generated. + /// + /// This command extracts the file’s history so that blame, when run on the repository created + /// by the script, shows the same characteristics, in particular bugs, as the original, but in + /// a way that the original source file's content cannot be reconstructed. + /// + /// The idea is that by obfuscating the file's content we make it easier for people to share + /// the subset of data that's required for debugging purposes from repositories that are not + /// public. 
+ /// + /// Note that the obfuscation leaves certain properties of the source intact, so they can still + /// be inferred from the extracted history. Among these properties are directory structure + /// (though not the directories' names), renames, number of lines, and whitespace. + /// + /// This command can also be helpful in debugging the blame algorithm itself. + /// + /// ### Terminology + /// + /// A **blame history** is the set of commits that the blame algorithm, at some point, treated + /// as potential suspects for any line in a file. It is a subset of all commits that ever + /// changed a file in its history. + /// + /// With respect to branches and merge commits, the **blame history** will not necessarily be + /// identical to the file's history in the source repository. This is because the blame + /// algorithm will stop following a file's history for branches that only touch lines for which + /// the source has already been found. The **blame history**, thus, likely looks "cleaner" and + /// "simpler" than the source history. + #[clap(visible_alias = "bcr")] + BlameCopyRoyal { + /// Don't really copy anything. + #[clap(long, short = 'n')] + dry_run: bool, + /// The git root whose history to extract the blame-relevant parts from. + worktree_dir: PathBuf, + /// The directory into which to copy the files. + destination_dir: PathBuf, + /// The file to extract the history for. + file: std::ffi::OsString, + /// Do not use `copy-royal` to obfuscate the content of blobs, but copy it verbatim. + /// + /// Note that this should only be done if the source history does not contain information + /// you're not willing to share. + #[clap(long)] + verbatim: bool, + }, /// Copy a tree so that it diffs the same but can't be traced back uniquely to its source. 
/// /// The idea is that we don't want to deal with licensing, it's more about patterns in order to diff --git a/tests/it/src/commands/blame_copy_royal.rs b/tests/it/src/commands/blame_copy_royal.rs new file mode 100644 index 00000000000..d73741cd8f4 --- /dev/null +++ b/tests/it/src/commands/blame_copy_royal.rs @@ -0,0 +1,340 @@ +pub struct Options { + pub verbatim: bool, +} + +pub(super) mod function { + use anyhow::Context; + use gix::{ + blame::BlamePathEntry, + bstr::{BStr, BString, ByteSlice}, + objs::FindExt, + ObjectId, + }; + use std::{ + collections::BTreeSet, + ffi::OsStr, + fmt::Display, + path::{Path, PathBuf}, + }; + + use super::Options; + + pub fn blame_copy_royal( + dry_run: bool, + worktree_dir: &Path, + destination_dir: PathBuf, + file: &OsStr, + Options { verbatim }: Options, + ) -> anyhow::Result<()> { + let prefix = if dry_run { "WOULD" } else { "Will" }; + let repo = gix::open(worktree_dir)?; + + let suspect: gix::ObjectId = repo.head()?.into_peeled_id()?.into(); + let cache: Option = repo.commit_graph_if_enabled()?; + let mut resource_cache = repo.diff_resource_cache_for_tree_diff()?; + let diff_algorithm = repo.diff_algorithm()?; + + let options = gix::blame::Options { + diff_algorithm, + range: gix::blame::BlameRanges::default(), + since: None, + rewrites: Some(gix::diff::Rewrites::default()), + debug_track_path: true, + }; + + let index = repo.index_or_empty()?; + + // The following block, including the `TODO` comment, comes from + // `gitoxide_core::repository::blame`. + let file = gix::path::os_str_into_bstr(file)?; + let specs = repo.pathspec( + false, + [file], + true, + &index, + gix::worktree::stack::state::attributes::Source::WorktreeThenIdMapping.adjust_for_bare(repo.is_bare()), + )?; + // TODO: there should be a way to normalize paths without going through patterns, at least in this case maybe? + // `Search` actually sorts patterns by excluding or not, all that can lead to strange results. 
+ let file = specs + .search() + .patterns() + .map(|p| p.path().to_owned()) + .next() + .expect("exactly one pattern"); + + let outcome = gix::blame::file( + &repo.objects, + suspect, + cache, + &mut resource_cache, + file.as_bstr(), + options, + )?; + + let blame_path = outcome + .blame_path + .expect("blame path to be present as `debug_track_path == true`"); + + // TODO + // Potentially make `"assets"` configurable (it is in `git_to_sh`). + let assets = destination_dir.join("assets"); + + eprintln!("{prefix} create directory '{assets}'", assets = assets.display()); + + if !dry_run { + std::fs::create_dir_all(&assets)?; + } + + let mut buf = Vec::new(); + + for blame_path_entry in &blame_path { + let src: &BStr = blame_path_entry.source_file_path.as_bstr(); + let dst = assets.join(format!("{}.commit", blame_path_entry.commit_id)); + + eprintln!( + "{prefix} copy file '{}' at commit {} to '{dst}'", + src, + blame_path_entry.commit_id, + dst = dst.display() + ); + + if !dry_run { + let blob = repo.objects.find_blob(&blame_path_entry.blob_id, &mut buf)?.data; + + if verbatim { + std::fs::write(dst, blob)?; + } else { + let blob = std::str::from_utf8(blob).with_context(|| { + format!( + "Entry in blob '{blob_id}' was not valid UTF8 and can't be remapped", + blob_id = blame_path_entry.blob_id + ) + })?; + + let blob = crate::commands::copy_royal::remapped(blob); + + std::fs::write(dst, blob)?; + } + } + } + + let mut blame_script = BlameScript::new(blame_path, Options { verbatim }); + + blame_script.generate()?; + + let script_file = destination_dir.join("create-history.sh"); + + eprintln!( + "{prefix} write script file at '{script_file}'", + script_file = script_file.display() + ); + + if !dry_run { + let blocks: Vec<_> = blame_script + .script + .iter() + .map(std::string::ToString::to_string) + .collect(); + + std::fs::write(script_file, blocks.join(""))?; + } + + Ok(()) + } + + enum BlameScriptOperation { + InitRepository, + RemoveFile(String), + 
CommitFile(BString, ObjectId), + CheckoutTag(ObjectId), + PrepareMerge(Vec), + CreateTag(ObjectId), + } + + impl Display for BlameScriptOperation { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + BlameScriptOperation::InitRepository => write!( + f, + r"#!/bin/sh + +set -e + +git init +echo .gitignore >> .gitignore +echo assets/ >> .gitignore +echo create-history.sh >> .gitignore + +" + ), + BlameScriptOperation::RemoveFile(src) => write!( + f, + r"# delete previous version of file +git rm {src} +" + ), + BlameScriptOperation::CommitFile(src, commit_id) => write!( + f, + r"# make file {src} contain content at commit {commit_id} +mkdir -p $(dirname {src}) +cp ./assets/{commit_id}.commit ./{src} +# create commit +git add {src} +git commit -m {commit_id} +" + ), + BlameScriptOperation::CheckoutTag(commit_id) => writeln!(f, "git checkout tag-{commit_id}"), + BlameScriptOperation::PrepareMerge(commit_ids) => writeln!( + f, + "git merge --no-commit {} || true", + commit_ids + .iter() + .map(|commit_id| format!("tag-{commit_id}")) + .collect::>() + .join(" ") + ), + BlameScriptOperation::CreateTag(commit_id) => write!(f, "git tag tag-{commit_id}\n\n"), + } + } + } + + struct BlameScript { + blame_path: Vec, + seen: BTreeSet, + script: Vec, + options: Options, + } + + impl BlameScript { + fn new(blame_path: Vec, options: Options) -> Self { + let script = vec![BlameScriptOperation::InitRepository]; + + Self { + blame_path, + seen: BTreeSet::default(), + script, + options, + } + } + + fn generate(&mut self) -> anyhow::Result<()> { + // `self.blame_path`, before calling `reverse`, has parents before children, with the + // history’s root being the last element. We reverse the order in place so that all + // methods can rely on the assumption that the root comes first, followed by its + // descendants. That way, we can use a simple `for` loop to iterate through + // `self.blame_path` below. 
+ self.blame_path.reverse(); + + for blame_path_entry in self.blame_path.clone() { + if !self.seen.contains(&blame_path_entry.commit_id) { + self.process_entry(&blame_path_entry)?; + } + + self.seen.insert(blame_path_entry.commit_id); + } + + Ok(()) + } + + fn process_entry(&mut self, blame_path_entry: &BlamePathEntry) -> anyhow::Result<()> { + let source_file_path = blame_path_entry.source_file_path.clone(); + let parents = self.parents_of(blame_path_entry); + + let src = if self.options.verbatim { + source_file_path.clone() + } else { + let source_file_path = std::str::from_utf8(source_file_path.as_slice()).with_context(|| { + format!("Source file path '{source_file_path}' was not valid UTF8 and can't be remapped",) + })?; + + crate::commands::copy_royal::remapped(source_file_path).into() + }; + let commit_id = blame_path_entry.commit_id; + + let delete_previous_file_operation = match &blame_path_entry.previous_source_file_path { + Some(previous_source_file_path) if source_file_path != *previous_source_file_path => { + let src = if self.options.verbatim { + previous_source_file_path.to_string() + } else { + let source_file_path = + std::str::from_utf8(previous_source_file_path.as_slice()).with_context(|| { + format!("Source file path '{previous_source_file_path}' was not valid UTF8 and can't be remapped",) + })?; + + crate::commands::copy_royal::remapped(source_file_path) + }; + + Some(BlameScriptOperation::RemoveFile(src)) + } + _ => None, + }; + + if parents.is_empty() { + if let Some(delete_previous_file_operation) = delete_previous_file_operation { + self.script.push(delete_previous_file_operation); + } + self.script.push(BlameScriptOperation::CommitFile(src, commit_id)); + } else { + let ([first], rest) = parents.split_at(1) else { + unreachable!(); + }; + + self.script.push(BlameScriptOperation::CheckoutTag(first.commit_id)); + + if rest.is_empty() { + if let Some(delete_previous_file_operation) = delete_previous_file_operation { + 
self.script.push(delete_previous_file_operation); + } + self.script.push(BlameScriptOperation::CommitFile(src, commit_id)); + } else { + self.script.push(BlameScriptOperation::PrepareMerge( + rest.iter().map(|blame_path_entry| blame_path_entry.commit_id).collect(), + )); + + if let Some(delete_previous_file_operation) = delete_previous_file_operation { + self.script.push(delete_previous_file_operation); + } + self.script.push(BlameScriptOperation::CommitFile(src, commit_id)); + } + } + + self.script.push(BlameScriptOperation::CreateTag(commit_id)); + + Ok(()) + } + + fn parents_of(&self, child: &BlamePathEntry) -> Vec { + // In almost all cases, `children` will only have one element. The exception are merge + // commits where there’s changes against each parent. Each of these changes would + // produce a diff that’s represented in `self.blame_path`. + let mut children: Vec<_> = self + .blame_path + .iter() + .enumerate() + .filter(|(_, x)| x.commit_id == child.commit_id) + .collect(); + + children.sort_by_key(|(_, x)| x.index); + + let parents = children + .iter() + .filter_map(|(index, child)| { + let parent_blob_id = child.previous_blob_id; + let parent_source_file_path = &child.previous_source_file_path; + + // When we search for a parent we only have to consider entries up to and + // excluding `index` as anything after `index` can only be a child. 
+ self.blame_path[..(*index)] + .iter() + .rfind(|&x| { + x.blob_id == parent_blob_id && Some(&x.source_file_path) == parent_source_file_path.as_ref() + }) + .cloned() + }) + .collect(); + + parents + } + } +} diff --git a/tests/it/src/commands/mod.rs b/tests/it/src/commands/mod.rs index ae00a26a8d7..6bec01671df 100644 --- a/tests/it/src/commands/mod.rs +++ b/tests/it/src/commands/mod.rs @@ -1,3 +1,6 @@ +pub mod blame_copy_royal; +pub use blame_copy_royal::function::blame_copy_royal; + pub mod copy_royal; pub use copy_royal::function::copy_royal; diff --git a/tests/it/src/main.rs b/tests/it/src/main.rs index f18dd3450be..d2799b99cb4 100644 --- a/tests/it/src/main.rs +++ b/tests/it/src/main.rs @@ -25,6 +25,19 @@ fn main() -> anyhow::Result<()> { max_count: count, }, ), + Subcommands::BlameCopyRoyal { + dry_run, + worktree_dir: worktree_root, + destination_dir, + file, + verbatim, + } => commands::blame_copy_royal( + dry_run, + &worktree_root, + destination_dir, + &file, + commands::blame_copy_royal::Options { verbatim }, + ), Subcommands::CopyRoyal { dry_run, worktree_dir: worktree_root,