Skip to content

Commit b01d624

Browse files
committed
Add it blame-copy-royal
1 parent 4afc51d commit b01d624

File tree

5 files changed

+343
-1
lines changed

5 files changed

+343
-1
lines changed

tests/it/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,6 @@ path = "src/main.rs"
1717
[dependencies]
1818
anyhow = "1.0.98"
1919
clap = { version = "4.5.39", features = ["derive"] }
20-
gix = { version = "^0.72.1", path = "../../gix", default-features = false, features = ["attributes", "revision"] }
20+
gix = { version = "^0.72.1", path = "../../gix", default-features = false, features = ["attributes", "blame", "blob-diff", "revision"] }
2121
once_cell = "1.21.3"
2222
regex = { version = "1.11.1", default-features = false, features = ["std"] }

tests/it/src/args.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,30 @@ pub struct Args {
1414

1515
#[derive(Debug, clap::Subcommand)]
1616
pub enum Subcommands {
17+
/// Extract a file’s history so that its blame shows the same characteristics, in particular
18+
/// bugs, as the original, but in a way that can't be traced back uniquely to its source.
19+
///
20+
/// The idea is that we don't want to deal with licensing, it's more about patterns in order to
21+
/// reproduce cases for tests.
22+
#[clap(visible_alias = "bcr")]
23+
BlameCopyRoyal {
24+
/// Don't really copy anything.
25+
#[clap(long, short = 'n')]
26+
dry_run: bool,
27+
/// The git root whose history to extract the blame-relevant parts from.
28+
worktree_dir: PathBuf,
29+
/// The directory into which to copy the files.
30+
destination_dir: PathBuf,
31+
/// The file to extract the history for.
32+
file: std::ffi::OsString,
33+
/// Do not use `copy-royal` to obfuscate the content of blobs, but copy it verbatim.
34+
///
35+
/// Note that this should only be done if the source repository only contains information
36+
/// you’re willing to share. Also note that the obfuscation leaves the structure of the
37+
/// source intact, so a few of its properties can still be inferred.
38+
#[clap(long)]
39+
verbatim: bool,
40+
},
1741
/// Copy a tree so that it diffs the same but can't be traced back uniquely to its source.
1842
///
1943
/// The idea is that we don't want to deal with licensing, it's more about patterns in order to
Lines changed: 302 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,302 @@
1+
/// Options controlling how `blame_copy_royal` copies data out of the source repository.
#[derive(Debug, Clone, Copy)]
pub struct Options {
    /// If `true`, blob contents and file paths are copied as they are instead of being
    /// remapped (obfuscated) first.
    pub verbatim: bool,
}
4+
5+
pub(super) mod function {
6+
use anyhow::Context;
7+
use gix::{
8+
blame::BlamePathEntry,
9+
bstr::{BStr, ByteSlice},
10+
objs::FindExt,
11+
ObjectId,
12+
};
13+
use std::{
14+
collections::BTreeSet,
15+
ffi::OsStr,
16+
path::{Path, PathBuf},
17+
};
18+
19+
use super::Options;
20+
21+
pub fn blame_copy_royal(
22+
dry_run: bool,
23+
worktree_dir: &Path,
24+
destination_dir: PathBuf,
25+
file: &OsStr,
26+
Options { verbatim }: Options,
27+
) -> anyhow::Result<()> {
28+
let prefix = if dry_run { "WOULD" } else { "Will" };
29+
let repo = gix::open(worktree_dir)?;
30+
31+
let suspect: gix::ObjectId = repo.head()?.into_peeled_id()?.into();
32+
let cache: Option<gix::commitgraph::Graph> = repo.commit_graph_if_enabled()?;
33+
let mut resource_cache = repo.diff_resource_cache_for_tree_diff()?;
34+
let diff_algorithm = repo.diff_algorithm()?;
35+
36+
let options = gix::blame::Options {
37+
diff_algorithm,
38+
range: gix::blame::BlameRanges::default(),
39+
since: None,
40+
rewrites: Some(gix::diff::Rewrites::default()),
41+
debug_track_path: true,
42+
};
43+
44+
let index = repo.index_or_empty()?;
45+
46+
// The following block, including the `TODO` comment, comes from
47+
// `gitoxide_core::repository::blame`.
48+
let file = gix::path::os_str_into_bstr(file)?;
49+
let specs = repo.pathspec(
50+
false,
51+
[file],
52+
true,
53+
&index,
54+
gix::worktree::stack::state::attributes::Source::WorktreeThenIdMapping.adjust_for_bare(repo.is_bare()),
55+
)?;
56+
// TODO: there should be a way to normalize paths without going through patterns, at least in this case maybe?
57+
// `Search` actually sorts patterns by excluding or not, all that can lead to strange results.
58+
let file = specs
59+
.search()
60+
.patterns()
61+
.map(|p| p.path().to_owned())
62+
.next()
63+
.expect("exactly one pattern");
64+
65+
let outcome = gix::blame::file(
66+
&repo.objects,
67+
suspect,
68+
cache,
69+
&mut resource_cache,
70+
file.as_bstr(),
71+
options,
72+
)?;
73+
74+
let blame_path = outcome
75+
.blame_path
76+
.expect("blame path to be present as `debug_track_path == true`");
77+
78+
// TODO
79+
// Potentially make `"assets"` configurable (it is in `git_to_sh`).
80+
let assets = destination_dir.join("assets");
81+
82+
eprintln!("{prefix} create directory '{assets}'", assets = assets.display());
83+
84+
if !dry_run {
85+
std::fs::create_dir_all(&assets)?;
86+
}
87+
88+
let mut buf = Vec::new();
89+
90+
for blame_path_entry in &blame_path {
91+
let src: &BStr = blame_path_entry.source_file_path.as_bstr();
92+
let dst = assets.join(format!("{}.commit", blame_path_entry.commit_id));
93+
94+
eprintln!(
95+
"{prefix} copy file '{}' at commit {} to '{dst}'",
96+
src,
97+
blame_path_entry.commit_id,
98+
dst = dst.display()
99+
);
100+
101+
if !dry_run {
102+
let blob = repo.objects.find_blob(&blame_path_entry.blob_id, &mut buf)?.data;
103+
104+
if verbatim {
105+
std::fs::write(dst, blob)?;
106+
} else {
107+
let blob = std::str::from_utf8(blob).with_context(|| {
108+
format!(
109+
"Entry in blob '{blob_id}' was not valid UTF8 and can't be remapped",
110+
blob_id = blame_path_entry.blob_id
111+
)
112+
})?;
113+
114+
let blob = crate::commands::copy_royal::remapped(blob);
115+
116+
std::fs::write(dst, blob)?;
117+
}
118+
}
119+
}
120+
121+
let mut blame_script = BlameScript::new(blame_path, Options { verbatim });
122+
123+
blame_script.generate()?;
124+
125+
let script_file = destination_dir.join("create-history.sh");
126+
127+
eprintln!(
128+
"{prefix} write script file at '{script_file}'",
129+
script_file = script_file.display()
130+
);
131+
132+
if !dry_run {
133+
std::fs::write(script_file, blame_script.script)?;
134+
}
135+
136+
Ok(())
137+
}
138+
139+
struct BlameScript {
140+
blame_path: Vec<BlamePathEntry>,
141+
seen: BTreeSet<ObjectId>,
142+
script: String,
143+
options: Options,
144+
}
145+
146+
impl BlameScript {
147+
fn new(blame_path: Vec<BlamePathEntry>, options: Options) -> Self {
148+
let mut script = String::new();
149+
150+
script.push_str(
151+
r"#!/bin/sh
152+
153+
set -e
154+
155+
git init
156+
echo .gitignore >> .gitignore
157+
echo assets/ >> .gitignore
158+
echo create-history.sh >> .gitignore
159+
160+
",
161+
);
162+
163+
Self {
164+
blame_path,
165+
seen: BTreeSet::default(),
166+
script,
167+
options,
168+
}
169+
}
170+
171+
/// Append the commands for every commit in the blame path to the script, visiting each
/// commit id only once.
fn generate(&mut self) -> anyhow::Result<()> {
    // As produced by the blame, the history's root is the last element of `self.blame_path`.
    // Reversing in place puts the root first, followed by its descendants, so that every
    // method can rely on that order and a plain front-to-back traversal suffices here.
    self.blame_path.reverse();

    // A snapshot is needed as `process_entry` borrows `self` mutably while we iterate.
    let ordered = self.blame_path.clone();

    ordered.iter().try_for_each(|entry| -> anyhow::Result<()> {
        let first_visit = !self.seen.contains(&entry.commit_id);

        if first_visit {
            self.process_entry(entry)?;
        }

        self.seen.insert(entry.commit_id);

        Ok(())
    })
}
189+
190+
fn process_entry(&mut self, blame_path_entry: &BlamePathEntry) -> anyhow::Result<()> {
191+
let source_file_path = blame_path_entry.source_file_path.clone();
192+
let parents = self.parents_of(blame_path_entry);
193+
194+
let src = if self.options.verbatim {
195+
source_file_path.clone()
196+
} else {
197+
let source_file_path = std::str::from_utf8(source_file_path.as_slice()).with_context(|| {
198+
format!("Source file path '{source_file_path}' was not valid UTF8 and can't be remapped",)
199+
})?;
200+
201+
crate::commands::copy_royal::remapped(source_file_path).into()
202+
};
203+
let commit_id = blame_path_entry.commit_id;
204+
205+
let delete_previous_file_script = match &blame_path_entry.previous_source_file_path {
206+
Some(previous_source_file_path) if source_file_path != *previous_source_file_path => {
207+
let src = if self.options.verbatim {
208+
previous_source_file_path.to_string()
209+
} else {
210+
let source_file_path =
211+
std::str::from_utf8(previous_source_file_path.as_slice()).with_context(|| {
212+
format!("Source file path '{previous_source_file_path}' was not valid UTF8 and can't be remapped",)
213+
})?;
214+
215+
crate::commands::copy_royal::remapped(source_file_path)
216+
};
217+
218+
format!(
219+
r"# delete previous version of file
220+
git rm {src}
221+
"
222+
)
223+
}
224+
_ => String::new(),
225+
};
226+
227+
let script = format!(
228+
r"# make file {src} contain content at commit {commit_id}
229+
mkdir -p $(dirname {src})
230+
cp ./assets/{commit_id}.commit ./{src}
231+
# create commit
232+
git add {src}
233+
git commit -m {commit_id}
234+
"
235+
);
236+
237+
if parents.is_empty() {
238+
self.script.push_str(delete_previous_file_script.as_str());
239+
self.script.push_str(script.as_str());
240+
} else {
241+
let ([first], rest) = parents.split_at(1) else {
242+
unreachable!();
243+
};
244+
245+
self.script
246+
.push_str(format!("git checkout tag-{}\n", first.commit_id).as_str());
247+
248+
if rest.is_empty() {
249+
self.script.push_str(delete_previous_file_script.as_str());
250+
self.script.push_str(script.as_str());
251+
} else {
252+
self.script.push_str(
253+
format!(
254+
"git merge --no-commit {} || true\n",
255+
rest.iter()
256+
.map(|blame_path_entry| format!("tag-{}", blame_path_entry.commit_id))
257+
.collect::<Vec<_>>()
258+
.join(" ")
259+
)
260+
.as_str(),
261+
);
262+
263+
self.script.push_str(delete_previous_file_script.as_str());
264+
self.script.push_str(script.as_str());
265+
}
266+
}
267+
268+
self.script.push_str(format!("git tag tag-{commit_id}\n\n").as_str());
269+
270+
Ok(())
271+
}
272+
273+
/// Return the blame path entries that are parents of `child`.
///
/// An entry counts as a parent if it comes before one of the entries for `child`'s commit
/// in `self.blame_path` and its blob id and file path equal that entry's previous blob id
/// and previous file path. For each entry of `child`'s commit, the closest such
/// predecessor is used.
fn parents_of(&self, child: &BlamePathEntry) -> Vec<BlamePathEntry> {
    let mut parents = Vec::new();

    // In almost all cases, only one entry will match `child`'s commit. The exception is
    // merge commits where there are changes against each parent. Each of these changes
    // produces a diff that's represented by its own entry in `self.blame_path`.
    for (position, entry) in self.blame_path.iter().enumerate() {
        if entry.commit_id != child.commit_id {
            continue;
        }

        // Without a previous path there is nothing a parent could match against.
        let Some(wanted_path) = entry.previous_source_file_path.as_ref() else {
            continue;
        };
        let wanted_blob_id = entry.previous_blob_id;

        // When we search for a parent we only have to consider entries up to and
        // excluding `position` as anything after `position` can only be a child.
        let closest_parent = self.blame_path[..position]
            .iter()
            .rev()
            .find(|candidate| candidate.blob_id == wanted_blob_id && &candidate.source_file_path == wanted_path);

        if let Some(parent) = closest_parent {
            parents.push(parent.clone());
        }
    }

    parents
}
301+
}
302+
}

tests/it/src/commands/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
pub mod blame_copy_royal;
2+
pub use blame_copy_royal::function::blame_copy_royal;
3+
14
pub mod copy_royal;
25
pub use copy_royal::function::copy_royal;
36

tests/it/src/main.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,19 @@ fn main() -> anyhow::Result<()> {
2525
max_count: count,
2626
},
2727
),
28+
Subcommands::BlameCopyRoyal {
29+
dry_run,
30+
worktree_dir: worktree_root,
31+
destination_dir,
32+
file,
33+
verbatim,
34+
} => commands::blame_copy_royal(
35+
dry_run,
36+
&worktree_root,
37+
destination_dir,
38+
&file,
39+
commands::blame_copy_royal::Options { verbatim },
40+
),
2841
Subcommands::CopyRoyal {
2942
dry_run,
3043
worktree_dir: worktree_root,

0 commit comments

Comments
 (0)