From bbf656daaa25034d7e3756ef7329a10914a04202 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Mon, 23 Jun 2025 15:48:53 +1000 Subject: [PATCH] Intern filenames. So that each unique filename only occurs once in the JSON output. This reduces the JSON file size by about 9%. --- src/librustdoc/json/conversions.rs | 20 +++++++++++++++--- src/librustdoc/json/mod.rs | 21 +++++++++++------- src/rustdoc-json-types/lib.rs | 34 ++++++++++++++++++------------ 3 files changed, 50 insertions(+), 25 deletions(-) diff --git a/src/librustdoc/json/conversions.rs b/src/librustdoc/json/conversions.rs index abad6e480291e..47472b46391a8 100644 --- a/src/librustdoc/json/conversions.rs +++ b/src/librustdoc/json/conversions.rs @@ -92,7 +92,7 @@ impl JsonRenderer<'_> { } } -pub(crate) trait FromClean { +pub(crate) trait FromClean { fn from_clean(f: &T, renderer: &JsonRenderer<'_>) -> Self; } @@ -100,7 +100,7 @@ pub(crate) trait IntoJson { fn into_json(&self, renderer: &JsonRenderer<'_>) -> T; } -impl IntoJson for T +impl IntoJson for T where U: FromClean, { @@ -153,7 +153,7 @@ impl FromClean for Option { let hi = span.hi(renderer.sess()); let lo = span.lo(renderer.sess()); Some(Span { - filename: local_path, + filename: local_path.as_path().into_json(renderer), begin: (lo.line, lo.col.to_usize() + 1), end: (hi.line, hi.col.to_usize() + 1), }) @@ -180,6 +180,20 @@ impl FromClean>> for Visibility { } } +impl FromClean for FilenameId { + fn from_clean(path: &std::path::Path, renderer: &JsonRenderer<'_>) -> Self { + let mut filenames = renderer.filenames.borrow_mut(); + + // To minimize calls to `to_path_buf` (which allocates) we call + // `get_index_of` first, which only needs `&Path` and usually succeeds. + // If it fails we do a separate insert, which requires `PathBuf`. + let idx = filenames + .get_index_of(path) + .unwrap_or_else(|| filenames.insert_full(path.to_path_buf()).0); + FilenameId(idx as u32) + } +} + impl FromClean for Deprecation { fn from_clean(deprecation: &attrs::Deprecation, _renderer: &JsonRenderer<'_>) -> Self { let attrs::Deprecation { since, note, suggestion: _ } = deprecation; diff --git a/src/librustdoc/json/mod.rs b/src/librustdoc/json/mod.rs index 600a4b429f3c7..3c12e67cf30fb 100644 --- a/src/librustdoc/json/mod.rs +++ b/src/librustdoc/json/mod.rs @@ -14,7 +14,7 @@ use std::io::{BufWriter, Write, stdout}; use std::path::PathBuf; use std::rc::Rc; -use rustc_data_structures::fx::FxHashSet; +use rustc_data_structures::fx::{FxHashSet, FxIndexSet}; use rustc_hir::def_id::{DefId, DefIdSet}; use rustc_middle::ty::TyCtxt; use rustc_session::Session; @@ -41,6 +41,8 @@ pub(crate) struct JsonRenderer<'tcx> { /// A mapping of IDs that contains all local items for this crate which gets output as a top /// level field of the JSON blob. index: FxHashMap, + // Interned filenames. + filenames: RefCell>, /// The directory where the JSON blob should be written to. /// /// If this is `None`, the blob will be printed to `stdout` instead. @@ -107,12 +109,12 @@ impl<'tcx> JsonRenderer<'tcx> { } fn serialize_and_write( - &self, + sess: &Session, output_crate: types::Crate, mut writer: BufWriter, path: &str, ) -> Result<(), Error> { - self.sess().time("rustdoc_json_serialize_and_write", || { + sess.time("rustdoc_json_serialize_and_write", || { try_err!( serde_json::ser::to_writer(&mut writer, &output_crate).map_err(|e| e.to_string()), path @@ -199,6 +201,7 @@ impl<'tcx> FormatRenderer<'tcx> for JsonRenderer<'tcx> { index: FxHashMap::default(), out_dir: if options.output_to_stdout { None } else { Some(options.output) }, cache: Rc::new(cache), + filenames: Default::default(), imported_items, id_interner: Default::default(), }, @@ -318,6 +321,7 @@ impl<'tcx> FormatRenderer<'tcx> for JsonRenderer<'tcx> { let target = target(self.tcx.sess); debug!("Constructing Output"); + let sess = self.sess(); let output_crate = types::Crate { root: self.id_from_item_default(e.def_id().into()), crate_version: self.cache.crate_version.clone(), @@ -339,6 +343,7 @@ impl<'tcx> FormatRenderer<'tcx> for JsonRenderer<'tcx> { ) }) .collect(), + filenames: self.filenames.into_inner().into_iter().collect(), external_crates: self .cache .extern_locations @@ -367,13 +372,14 @@ impl<'tcx> FormatRenderer<'tcx> for JsonRenderer<'tcx> { p.push(output_crate.index.get(&output_crate.root).unwrap().name.clone().unwrap()); p.set_extension("json"); - self.serialize_and_write( + Self::serialize_and_write( + sess, output_crate, try_err!(File::create_buffered(&p), p), &p.display().to_string(), ) } else { - self.serialize_and_write(output_crate, BufWriter::new(stdout().lock()), "") + Self::serialize_and_write(sess, output_crate, BufWriter::new(stdout().lock()), "") } } } @@ -389,7 +395,7 @@ mod size_asserts { use super::types::*; // tidy-alphabetical-start static_assert_size!(AssocItemConstraint, 112); - static_assert_size!(Crate, 184); + static_assert_size!(Crate, 208); static_assert_size!(ExternalCrate, 48); static_assert_size!(FunctionPointer, 168); static_assert_size!(GenericArg, 80); @@ -397,8 +403,7 @@ mod size_asserts { static_assert_size!(GenericBound, 72); static_assert_size!(GenericParamDef, 136); static_assert_size!(Impl, 304); - // `Item` contains a `PathBuf`, which is different sizes on different OSes. - static_assert_size!(Item, 528 + size_of::()); + static_assert_size!(Item, 544); static_assert_size!(ItemSummary, 32); static_assert_size!(PolyTrait, 64); static_assert_size!(PreciseCapturingArg, 32); diff --git a/src/rustdoc-json-types/lib.rs b/src/rustdoc-json-types/lib.rs index e5c246cb69c3e..4231ed157c2be 100644 --- a/src/rustdoc-json-types/lib.rs +++ b/src/rustdoc-json-types/lib.rs @@ -37,8 +37,8 @@ pub type FxHashMap = HashMap; // re-export for use in src/librustdoc // will instead cause conflicts. See #94591 for more. (This paragraph and the "Latest feature" line // are deliberately not in a doc comment, because they need not be in public docs.) // -// Latest feature: Pretty printing of must_use attributes changed -pub const FORMAT_VERSION: u32 = 52; +// Latest feature: Intern filenames +pub const FORMAT_VERSION: u32 = 53; /// The root of the emitted JSON blob. /// @@ -58,6 +58,8 @@ pub struct Crate { pub index: HashMap, /// Maps IDs to fully qualified paths and other info helpful for generating links. pub paths: HashMap, + /// Interned filenames. `FilenameId`s index into this. + pub filenames: Vec, /// Maps `crate_id` of items to a crate name and html_root_url if it exists. pub external_crates: HashMap, /// Information about the target for which this documentation was generated @@ -205,8 +207,9 @@ pub struct Item { /// A range of source code. #[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct Span { - /// The path to the source file for this span relative to the path `rustdoc` was invoked with. - pub filename: PathBuf, + /// ID of the path to the source file for this span relative to the path `rustdoc` was invoked + /// with. + pub filename: FilenameId, /// One indexed Line and Column of the first character of the `Span`. pub begin: (usize, usize), /// One indexed Line and Column of the last character of the `Span`. @@ -385,21 +388,24 @@ pub enum AssocItemConstraintKind { Constraint(Vec), } -/// An opaque identifier for an item. +/// An opaque identifier for an item. The integer within indexes into +/// [`Crate::index`] to resolve it to an [`Item`], or into [`Crate::paths`] +/// to resolve it to an [`ItemSummary`]. /// -/// It can be used to lookup in [`Crate::index`] or [`Crate::paths`] to resolve it -/// to an [`Item`]. -/// -/// Id's are only valid within a single JSON blob. They cannot be used to -/// resolve references between the JSON output's for different crates. -/// -/// Rustdoc makes no guarantees about the inner value of Id's. Applications -/// should treat them as opaque keys to lookup items, and avoid attempting -/// to parse them, or otherwise depend on any implementation details. +/// `Id`s are only valid within a single JSON blob. They cannot be used to +/// resolve references between the JSON output for different crates. #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] // FIXME(aDotInTheVoid): Consider making this non-public in rustdoc-types. pub struct Id(pub u32); +/// An identifier for a filename. The integer within indexes into +/// [`Crate::filenames`] to resolve to the actual filename. +/// +/// `FilenameId`s are only valid within a single JSON blob. They cannot be +/// used to resolve references between the JSON output for different crates. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct FilenameId(pub u32); + /// The fundamental kind of an item. Unlike [`ItemEnum`], this does not carry any additional info. /// /// Part of [`ItemSummary`].