Skip to content

Commit e8534d0

Browse files
committed
fix(vendor)!: direct extraction for registry sources
`PathSource::list_files` has some heurstic rules for listing files. Those rules are mainly designed for `cargo package`. Previously, cargo-vendor relies on those rules to understand what files to vendor. However, it shouldn't use those rules because: * Package extracted from a `.crate` tarball isn't Git-controlled, some rules may apply differently. * The extracted package already went through `PathSource::list_files` during packaging. It should be clean enough. * Should keep crate sources from registry sources in a pristine state, which is exactly what vendoring is meant for. Instead, we switch to direct extraction into the vendor directory to ensure source code is the same as in the `.crate` tarball. There are some caveats: * The overwrite protection in `unpack_package` assumes the unpack directory is always `<pkg>-<version`>. We don't want to remove this, but for cargo-vendor supports vendoring without version suffix. For that case, we need a temporary staging area, and move the unpacked source then. * The heurstic in `PathSource::list_files` did something "good" in general cases, like excluding hidden directories. That means common directorys like `.github` or `.config` won't be vendored. After this, those get included. This is another round of churns. We might want to get other `cargo-vendor` changes along with this in one single release.
1 parent 060ab41 commit e8534d0

File tree

3 files changed

+130
-24
lines changed

3 files changed

+130
-24
lines changed

src/cargo/ops/vendor.rs

Lines changed: 78 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,18 @@ use crate::core::{GitReference, Package, Workspace};
44
use crate::ops;
55
use crate::sources::path::PathSource;
66
use crate::sources::PathEntry;
7+
use crate::sources::RegistrySource;
78
use crate::sources::SourceConfigMap;
89
use crate::sources::CRATES_IO_REGISTRY;
910
use crate::util::cache_lock::CacheLockMode;
1011
use crate::util::{try_canonicalize, CargoResult, GlobalContext};
12+
1113
use anyhow::{bail, Context as _};
1214
use cargo_util::{paths, Sha256};
15+
use cargo_util_schemas::core::SourceKind;
1316
use serde::Serialize;
17+
use walkdir::WalkDir;
18+
1419
use std::collections::HashSet;
1520
use std::collections::{BTreeMap, BTreeSet, HashMap};
1621
use std::ffi::OsStr;
@@ -86,9 +91,16 @@ struct SourceReplacementCache<'gctx> {
8691
}
8792

8893
impl SourceReplacementCache<'_> {
89-
fn new(gctx: &GlobalContext) -> CargoResult<SourceReplacementCache<'_>> {
94+
fn new(
95+
gctx: &GlobalContext,
96+
respect_source_config: bool,
97+
) -> CargoResult<SourceReplacementCache<'_>> {
9098
Ok(SourceReplacementCache {
91-
map: SourceConfigMap::new(gctx)?,
99+
map: if respect_source_config {
100+
SourceConfigMap::new(gctx)
101+
} else {
102+
SourceConfigMap::empty(gctx)
103+
}?,
92104
cache: Default::default(),
93105
})
94106
}
@@ -130,7 +142,8 @@ fn sync(
130142
}
131143
}
132144

133-
let mut source_replacement_cache = SourceReplacementCache::new(gctx)?;
145+
let mut source_replacement_cache =
146+
SourceReplacementCache::new(gctx, opts.respect_source_config)?;
134147

135148
// First up attempt to work around rust-lang/cargo#5956. Apparently build
136149
// artifacts sprout up in Cargo's global cache for whatever reason, although
@@ -263,11 +276,69 @@ fn sync(
263276
)?;
264277

265278
let _ = fs::remove_dir_all(&dst);
266-
let pathsource = PathSource::new(src, id.source_id(), gctx);
267-
let paths = pathsource.list_files(pkg)?;
279+
268280
let mut file_cksums = BTreeMap::new();
269-
cp_sources(pkg, src, &paths, &dst, &mut file_cksums, &mut tmp_buf, gctx)
270-
.with_context(|| format!("failed to copy over vendored sources for: {}", id))?;
281+
282+
// Need this mapping anyway because we will directly consult registry sources,
283+
// otherwise builtin source replacement (sparse registry) won't be respected.
284+
let sid = source_replacement_cache.get(id.source_id())?;
285+
286+
if sid.is_registry() {
287+
// To keep the unpacked source from registry in a pristine state,
288+
// we'll do a direct extraction into the vendor directory.
289+
let registry = match sid.kind() {
290+
SourceKind::Registry | SourceKind::SparseRegistry => {
291+
RegistrySource::remote(sid, &Default::default(), gctx)?
292+
}
293+
SourceKind::LocalRegistry => {
294+
let path = sid.url().to_file_path().expect("local path");
295+
RegistrySource::local(sid, &path, &Default::default(), gctx)
296+
}
297+
_ => unreachable!("not registry source: {sid}"),
298+
};
299+
300+
let mut compute_file_cksums = |root| {
301+
let walkdir = WalkDir::new(root)
302+
.into_iter()
303+
// It is safe to skip errors,
304+
// since we'll hit them during copying/reading later anyway.
305+
.filter_map(|e| e.ok())
306+
// There should be no symlink in tarballs on crates.io,
307+
// but might be wrong for local registries.
308+
// Hence here be conservative and include symlinks.
309+
.filter(|e| e.file_type().is_file() || e.file_type().is_symlink());
310+
for e in walkdir {
311+
let path = e.path();
312+
let relative = path.strip_prefix(&dst).unwrap();
313+
let cksum = Sha256::new()
314+
.update_path(path)
315+
.map(Sha256::finish_hex)
316+
.with_context(|| format!("failed to checksum `{}`", path.display()))?;
317+
file_cksums.insert(relative.to_str().unwrap().replace("\\", "/"), cksum);
318+
}
319+
Ok::<_, anyhow::Error>(())
320+
};
321+
if dir_has_version_suffix {
322+
registry.unpack_package_in(id, &vendor_dir, &vendor_this)?;
323+
compute_file_cksums(&dst)?;
324+
} else {
325+
// Due to the extra sanity check in registry unpack
326+
// (ensure it contain only one top-level directory with name `pkg-version`),
327+
// we can only unpack a directory with version suffix,
328+
// and move it to the no suffix directory.
329+
let staging_dir = tempfile::Builder::new()
330+
.prefix(".vendor-staging")
331+
.tempdir_in(vendor_dir)?;
332+
let unpacked_src =
333+
registry.unpack_package_in(id, staging_dir.path(), &vendor_this)?;
334+
fs::rename(&unpacked_src, &dst)?;
335+
compute_file_cksums(&dst)?;
336+
}
337+
} else {
338+
let paths = PathSource::new(src, sid, gctx).list_files(pkg)?;
339+
cp_sources(pkg, src, &paths, &dst, &mut file_cksums, &mut tmp_buf, gctx)
340+
.with_context(|| format!("failed to copy vendored sources for {id}"))?;
341+
}
271342

272343
// Finally, emit the metadata about this package
273344
let json = serde_json::json!({

src/cargo/sources/registry/mod.rs

Lines changed: 46 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,8 @@ pub struct RegistrySource<'gctx> {
248248
source_id: SourceId,
249249
/// The path where crate files are extracted (`$CARGO_HOME/registry/src/$REG-HASH`).
250250
src_path: Filesystem,
251+
/// Path to the cache of `.crate` files (`$CARGO_HOME/registry/cache/$REG-HASH`).
252+
cache_path: Filesystem,
251253
/// Local reference to [`GlobalContext`] for convenience.
252254
gctx: &'gctx GlobalContext,
253255
/// Abstraction for interfacing to the different registry kinds.
@@ -532,6 +534,7 @@ impl<'gctx> RegistrySource<'gctx> {
532534
RegistrySource {
533535
name: name.into(),
534536
src_path: gctx.registry_source_path().join(name),
537+
cache_path: gctx.registry_cache_path().join(name),
535538
gctx,
536539
source_id,
537540
index: index::RegistryIndex::new(source_id, ops.index_path(), gctx),
@@ -631,7 +634,7 @@ impl<'gctx> RegistrySource<'gctx> {
631634
}
632635
dst.create_dir()?;
633636

634-
let bytes_written = unpack(self.gctx, tarball, unpack_dir)?;
637+
let bytes_written = unpack(self.gctx, tarball, unpack_dir, &|_| true)?;
635638

636639
// Now that we've finished unpacking, create and write to the lock file to indicate that
637640
// unpacking was successful.
@@ -656,6 +659,29 @@ impl<'gctx> RegistrySource<'gctx> {
656659
Ok(unpack_dir.to_path_buf())
657660
}
658661

662+
/// Unpacks the `.crate` tarball of the package in a given directory.
663+
///
664+
/// Returns the path to the crate tarball directory,
665+
/// whch is always `<unpack_dir>/<pkg>-<version>`.
666+
///
667+
/// This holds an assumption that the associated tarball already exists.
668+
pub fn unpack_package_in(
669+
&self,
670+
pkg: &PackageId,
671+
unpack_dir: &Path,
672+
include: &dyn Fn(&Path) -> bool,
673+
) -> CargoResult<PathBuf> {
674+
let path = self.cache_path.join(pkg.tarball_name());
675+
let path = self
676+
.gctx
677+
.assert_package_cache_locked(CacheLockMode::DownloadExclusive, &path);
678+
let dst = unpack_dir.join(format!("{}-{}", pkg.name(), pkg.version()));
679+
let tarball =
680+
File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
681+
unpack(self.gctx, &tarball, &dst, include)?;
682+
Ok(dst)
683+
}
684+
659685
/// Turns the downloaded `.crate` tarball file into a [`Package`].
660686
///
661687
/// This unconditionally sets checksum for the returned package, so it
@@ -996,7 +1022,12 @@ fn set_mask<R: Read>(tar: &mut Archive<R>) {
9961022
}
9971023

9981024
/// Unpack a tarball with zip bomb and overwrite protections.
999-
fn unpack(gctx: &GlobalContext, tarball: &File, unpack_dir: &Path) -> CargoResult<u64> {
1025+
fn unpack(
1026+
gctx: &GlobalContext,
1027+
tarball: &File,
1028+
unpack_dir: &Path,
1029+
include: &dyn Fn(&Path) -> bool,
1030+
) -> CargoResult<u64> {
10001031
let mut tar = {
10011032
let size_limit = max_unpack_size(gctx, tarball.metadata()?.len());
10021033
let gz = GzDecoder::new(tarball);
@@ -1015,20 +1046,23 @@ fn unpack(gctx: &GlobalContext, tarball: &File, unpack_dir: &Path) -> CargoResul
10151046
.context("failed to read entry path")?
10161047
.into_owned();
10171048

1018-
// We're going to unpack this tarball into the global source
1019-
// directory, but we want to make sure that it doesn't accidentally
1020-
// (or maliciously) overwrite source code from other crates. Cargo
1021-
// itself should never generate a tarball that hits this error, and
1022-
// crates.io should also block uploads with these sorts of tarballs,
1023-
// but be extra sure by adding a check here as well.
1024-
if !entry_path.starts_with(prefix) {
1049+
if let Ok(path) = entry_path.strip_prefix(prefix) {
1050+
if !include(path) {
1051+
continue;
1052+
}
1053+
} else {
1054+
// We're going to unpack this tarball into the global source
1055+
// directory, but we want to make sure that it doesn't accidentally
1056+
// (or maliciously) overwrite source code from other crates. Cargo
1057+
// itself should never generate a tarball that hits this error, and
1058+
// crates.io should also block uploads with these sorts of tarballs,
1059+
// but be extra sure by adding a check here as well.
10251060
anyhow::bail!(
10261061
"invalid tarball downloaded, contains \
1027-
a file at {:?} which isn't under {:?}",
1028-
entry_path,
1029-
prefix
1062+
a file at {entry_path:?} which isn't under {prefix:?}",
10301063
)
10311064
}
1065+
10321066
// Prevent unpacking the lockfile from the crate itself.
10331067
if entry_path
10341068
.file_name()

tests/testsuite/vendor.rs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -213,12 +213,13 @@ fn package_exclude() {
213213

214214
p.cargo("vendor --respect-source-config").run();
215215
let csum = p.read_file("vendor/bar/.cargo-checksum.json");
216+
// Everything is included because `cargo-vendor`
217+
// do direct extractions from tarballs
218+
// (Some are excluded like `.git` or `.cargo-ok` though.)
216219
assert!(csum.contains(".include"));
217-
assert!(!csum.contains(".exclude"));
218-
assert!(!csum.contains(".dotdir/exclude"));
219-
// Gitignore doesn't re-include a file in an excluded parent directory,
220-
// even if negating it explicitly.
221-
assert!(!csum.contains(".dotdir/include"));
220+
assert!(csum.contains(".exclude"));
221+
assert!(csum.contains(".dotdir/exclude"));
222+
assert!(csum.contains(".dotdir/include"));
222223
}
223224

224225
#[cargo_test]

0 commit comments

Comments
 (0)