From 3785196dd4a804dd7e988f28397d345c89289d5b Mon Sep 17 00:00:00 2001 From: konstin Date: Tue, 5 Nov 2024 00:55:38 +0100 Subject: [PATCH 1/2] `Range::from_iter` and document ranges invariants ## Motivation https://github.com/astral-sh/uv/pull/8797/files#diff-17170cd58cc6474d6c6ee0413af4b3b28610aca267bbe7863411627219c26549R232 needs to modify an existing `Ranges`. We don't want to allow direct access to ranges since this allows breaking the version-ranges invariants. This led me to two questions: How do we best construct ranges from a number of versions (#249), does the user need to hold up the invariants or do we sort and merge the given ranges? And: What are our invariants? Given my previous profiling, construction ranges isn't nearly noticeable in benchmarks (but i'm gladly proven otherwise), the input ranges are too small and we do orders of magnitude more ranges constructions in the pubgrub algorithm itself, so the primary problem of https://github.com/astral-sh/uv/pull/8797/files#diff-17170cd58cc6474d6c6ee0413af4b3b28610aca267bbe7863411627219c26549R232 and #249 is ergonomics and zero-to-low overhead construction is a secondary problem, one that i'm tackling only because it would be inconsistent to have a slow API in pubgrub. Currently, we don't have a method for constructing ranges from multiple segments and this makes for an unergonomic API. ## Invariants For the invariants, we need 1), but i'm not clear if we also need 2) and 3) as strong as they are. They are currently invariants in the version ranges, but we could weaken them. What i like about them is that they imply that any instance of ranges is "fully reduced", it can't be simplified further (without knowning the actual versions available). 1. The segments are sorted, from lowest to highest (through `Ord`). 2. Each segment contains at least one version (start < end). 3. There is at least one version between two segments. ## API I added two functions: A `from_iter` where the user has to pass valid segments. This wants to be a `try_collect`, but that's still unstable. It is targeted at https://github.com/astral-sh/uv/pull/8797/files#diff-17170cd58cc6474d6c6ee0413af4b3b28610aca267bbe7863411627219c26549R232, which currently does this in a less secure fashion. There, you'd replace `iter_mut().for_each(...)` with an `.into_iter().map(...).collect()` (With the `IntoIter` i've also added this shouldn't even be an allocation; i'm optimizing this too for consistency's sake but i'd be surprised if it mattered to users). The other is `from_unsorted` which is ergonomics and could be used for https://github.com/astral-sh/uv/blob/0335efd0a901ee2351e663d0488b1ce87fd4cc80/crates/pep508-rs/src/marker/algebra.rs#L638-L645 and https://github.com/astral-sh/uv/blob/e1a8beb64b9f6eb573b3af616ffff680c282a316/crates/uv-resolver/src/pubgrub/report.rs#L1098-L1102 from #249. The user passes arbitrary segments and we're merging them into valid ranges. Please discuss :) --- version-ranges/src/lib.rs | 163 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) diff --git a/version-ranges/src/lib.rs b/version-ranges/src/lib.rs index 0e33bd5c..90c9e2d9 100644 --- a/version-ranges/src/lib.rs +++ b/version-ranges/src/lib.rs @@ -50,6 +50,13 @@ use smallvec::{smallvec, SmallVec}; #[cfg_attr(feature = "serde", derive(serde::Serialize))] #[cfg_attr(feature = "serde", serde(transparent))] pub struct Ranges { + /// Internally, [`Ranges`] are an ordered list of segments, where segment is a bounds pair. + /// + /// Invariants: + /// 1. The segments are sorted, from lowest to highest (through `Ord`). + /// 2. Each segment contains at least one version (start < end). + /// 3. There is at least one version between two segments. + /// /// Profiling in showed /// that a single stack entry is the most efficient. This is most likely due to `Interval` /// being large. @@ -283,6 +290,37 @@ impl Ranges { } } + /// We want to use `iterator_try_collect`, but since it's unstable at the time of writing, + /// we expose a public `FromIterator<(Bound, Bound)>` method and use this for internal + /// testing. + fn try_from( + into_iter: impl IntoIterator, Bound)>, + ) -> Result { + let mut iter = into_iter.into_iter(); + let Some(mut previous) = iter.next() else { + return Ok(Self { + segments: SmallVec::new(), + }); + }; + let mut segments = SmallVec::with_capacity(iter.size_hint().0); + for current in iter { + if !valid_segment(&previous.start_bound(), &previous.end_bound()) { + return Err(FromIterError::InvalidSegment); + } + if !end_before_start_with_gap(&previous.end_bound(), ¤t.start_bound()) { + return Err(FromIterError::OverlappingSegments); + } + segments.push(previous); + previous = current; + } + if !valid_segment(&previous.start_bound(), &previous.end_bound()) { + return Err(FromIterError::InvalidSegment); + } + segments.push(previous); + Ok(Self { segments }) + } + + /// See `Range.segments` docstring. fn check_invariants(self) -> Self { if cfg!(debug_assertions) { for p in self.segments.as_slice().windows(2) { @@ -820,12 +858,81 @@ impl Ranges { Self { segments }.check_invariants() } + /// Constructor from arbitrary, unsorted and potentially overlapping ranges. + // TODO(konsti): If we handroll this, we don't need the clone bound + pub fn from_unsorted(segments: impl IntoIterator, Bound)>) -> Self { + // We have three constraints we need to fulfil: + // 1. The segments are sorted, from lowest to highest (through `Ord`): By sorting. + // 2. Each segment contains at least one version (start < end): By `union`. + // 3. There is at least one version between two segments: By `union`. + let mut segments: SmallVec<_> = segments.into_iter().collect(); + segments.sort_by(|a: &Interval, b: &Interval| { + if a.start_bound() == b.start_bound() { + // The ends don't matter, we merge them anyway. + Ordering::Equal + } else if left_start_is_smaller(a.start_bound(), b.start_bound()) { + Ordering::Less + } else { + Ordering::Greater + } + }); + // TODO(konsti): Handroll this. We're relying on the union implementation treating each + // segment as potentially coming from the other ranges and merging overlapping bounds. + Self { segments }.union(&Self { + segments: SmallVec::new(), + }) + } + /// Iterate over the parts of the range. pub fn iter(&self) -> impl Iterator, &Bound)> { self.segments.iter().map(|(start, end)| (start, end)) } } +impl IntoIterator for Ranges { + type Item = (Bound, Bound); + // TODO(konsti): We must not leak the type here! + type IntoIter = smallvec::IntoIter<[Interval; 1]>; + + fn into_iter(self) -> Self::IntoIter { + self.segments.into_iter() + } +} + +/// User provided segment iterator breaks [`Ranges`] invariants. +/// +/// Not user accessible since `FromIterator<(Bound, Bound)>` panics and `iterator_try_collect` +/// is unstable. +#[derive(Debug, PartialEq, Eq)] +enum FromIterError { + /// The start of a segment must be before its end, and a segment must contain at least one + /// version. + InvalidSegment, + /// The end of a segment is not before the start of the next segment, leaving at least one + /// version space. + OverlappingSegments, +} + +impl Display for FromIterError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + FromIterError::InvalidSegment => f.write_str("segment must be valid"), + FromIterError::OverlappingSegments => { + f.write_str("end of a segment and start of the next segment must not overlap") + } + } + } +} + +impl FromIterator<(Bound, Bound)> for Ranges { + /// Construct a [`Ranges`] from an ordered list of bounds. + /// + /// Panics if the bounds aren't sorted, are empty or have no space to the next bound. + fn from_iter, Bound)>>(iter: T) -> Self { + Self::try_from(iter).unwrap() + } +} + // REPORT ###################################################################### impl Display for Ranges { @@ -1130,6 +1237,29 @@ pub mod tests { } assert!(simp.segments.len() <= range.segments.len()) } + + #[test] + fn from_iter_valid(segments in proptest::collection::vec(any::<(Bound, Bound)>(), ..30)) { + match Ranges::try_from(segments.clone()) { + Ok(ranges) => { + ranges.check_invariants(); + } + Err(_) => { + assert!( + segments + .as_slice() + .windows(2) + .any(|p| !end_before_start_with_gap(&p[0].1, &p[1].0)) + || segments.iter().any(|(start, end)| !valid_segment(start, end)) + ); + } + } + } + + #[test] + fn from_unsorted_valid(segments in proptest::collection::vec(any::<(Bound, Bound)>(), ..30)) { + Ranges::from_unsorted(segments.clone()).check_invariants(); + } } #[test] @@ -1194,4 +1324,37 @@ pub mod tests { version_reverse_sorted.sort(); assert_eq!(version_reverse_sorted, versions); } + + /// Test all error conditions in [`Ranges::try_from`]. + #[test] + fn from_iter_errors() { + // Unbounded in not at an end + let result = Ranges::try_from([ + (Bound::Included(1), Bound::Unbounded), + (Bound::Included(2), Bound::Unbounded), + ]); + assert_eq!(result, Err(FromIterError::OverlappingSegments)); + // Not a version in between + let result = Ranges::try_from([ + (Bound::Included(1), Bound::Excluded(2)), + (Bound::Included(2), Bound::Unbounded), + ]); + assert_eq!(result, Err(FromIterError::OverlappingSegments)); + // First segment + let result = Ranges::try_from([(Bound::Excluded(2), Bound::Included(2))]); + assert_eq!(result, Err(FromIterError::InvalidSegment)); + // Middle segment + let result = Ranges::try_from([ + (Bound::Included(1), Bound::Included(2)), + (Bound::Included(3), Bound::Included(2)), + (Bound::Included(4), Bound::Included(5)), + ]); + assert_eq!(result, Err(FromIterError::InvalidSegment)); + // Last segment + let result = Ranges::try_from([ + (Bound::Included(1), Bound::Included(2)), + (Bound::Included(3), Bound::Included(2)), + ]); + assert_eq!(result, Err(FromIterError::InvalidSegment)); + } } From 0e1502b269b4b4cbcf01275d0c64b592e19b0ce8 Mon Sep 17 00:00:00 2001 From: konstin Date: Tue, 5 Nov 2024 01:17:41 +0100 Subject: [PATCH 2/2] Filter to valid segments --- version-ranges/src/lib.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/version-ranges/src/lib.rs b/version-ranges/src/lib.rs index 90c9e2d9..9d457e5d 100644 --- a/version-ranges/src/lib.rs +++ b/version-ranges/src/lib.rs @@ -865,7 +865,10 @@ impl Ranges { // 1. The segments are sorted, from lowest to highest (through `Ord`): By sorting. // 2. Each segment contains at least one version (start < end): By `union`. // 3. There is at least one version between two segments: By `union`. - let mut segments: SmallVec<_> = segments.into_iter().collect(); + let mut segments: SmallVec<_> = segments + .into_iter() + .filter(|segment| valid_segment(&segment.start_bound(), &segment.end_bound())) + .collect(); segments.sort_by(|a: &Interval, b: &Interval| { if a.start_bound() == b.start_bound() { // The ends don't matter, we merge them anyway.