Skip to content

Multiaddr: Replace Arc<Vec<u8>> with inline storage #1510

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions misc/multiaddr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,15 @@ serde = "1.0.70"
static_assertions = "1.1"
unsigned-varint = "0.3"
url = { version = "2.1.0", default-features = false }
smallvec = { version = "1.0", features = ["write"] }

[dev-dependencies]
bincode = "1"
quickcheck = "0.9.0"
rand = "0.7.2"
serde_json = "1.0"
criterion = "0.3"

[[bench]]
name = "clone"
harness = false
52 changes: 52 additions & 0 deletions misc/multiaddr/benches/clone.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Copyright 2019 Parity Technologies (UK) Ltd.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//! This benchmark tests the speed of cloning of the largest possible inlined
//! multiaddr vs the smalles possible heap allocated multiaddr.
//!
//! Note that the main point of the storage optimization is not to speed up clone
//! but to avoid allocating on the heap at all, but still you see a nice benefit
//! in the speed of cloning.
use criterion::{Bencher, Criterion, criterion_main, criterion_group, black_box};
use parity_multiaddr::Multiaddr;

fn do_clone(multiaddr: &Multiaddr) -> usize {
let mut res = 0usize;
for _ in 0..10 {
res += multiaddr.clone().as_ref().len()
}
res
}

fn clone(bench: &mut Bencher, addr: &Multiaddr) {
bench.iter(|| do_clone(black_box(addr)))
}

fn criterion_benchmarks(bench: &mut Criterion) {
let inlined: Multiaddr = "/dns4/01234567890123456789123/tcp/80/ws".parse().unwrap();
let heap: Multiaddr = "/dns4/0123456789012345678901234/tcp/80/ws".parse().unwrap();
assert_eq!(inlined.as_ref().len(), 30);
assert_eq!(heap.as_ref().len(), 32);

bench.bench_function("clone 10 max inlined", |b| clone(b, &inlined));
bench.bench_function("clone 10 min heap", |b| clone(b, &heap));
}

criterion_group!(benches, criterion_benchmarks);
criterion_main!(benches);
66 changes: 37 additions & 29 deletions misc/multiaddr/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ mod protocol;
mod onion_addr;
mod errors;
mod from_url;
mod storage;

use serde::{
Deserialize,
Expand All @@ -17,13 +18,13 @@ use serde::{
use std::{
convert::TryFrom,
fmt,
io,
hash,
iter::FromIterator,
net::{IpAddr, Ipv4Addr, Ipv6Addr},
result::Result as StdResult,
str::FromStr,
sync::Arc
};
use storage::Storage;
pub use self::errors::{Result, Error};
pub use self::from_url::{FromUrlErr, from_url, from_url_lossy};
pub use self::protocol::Protocol;
Expand All @@ -36,28 +37,23 @@ static_assertions::const_assert! {
}

/// Representation of a Multiaddr.
#[derive(PartialEq, Eq, Clone, Hash)]
pub struct Multiaddr { bytes: Arc<Vec<u8>> }
#[derive(Clone)]
pub struct Multiaddr { storage: Storage }

impl Multiaddr {
/// Create a new, empty multiaddress.
pub fn empty() -> Self {
Self { bytes: Arc::new(Vec::new()) }
}

/// Create a new, empty multiaddress with the given capacity.
pub fn with_capacity(n: usize) -> Self {
Self { bytes: Arc::new(Vec::with_capacity(n)) }
Self { storage: Storage::from_slice(&[]) }
}

/// Return the length in bytes of this multiaddress.
pub fn len(&self) -> usize {
self.bytes.len()
self.as_ref().len()
}

/// Return a copy of this [`Multiaddr`]'s byte representation.
pub fn to_vec(&self) -> Vec<u8> {
Vec::from(&self.bytes[..])
self.as_ref().to_vec()
}

/// Adds an already-parsed address component to the end of this multiaddr.
Expand All @@ -73,9 +69,9 @@ impl Multiaddr {
/// ```
///
pub fn push(&mut self, p: Protocol<'_>) {
let mut w = io::Cursor::<&mut Vec<u8>>::new(Arc::make_mut(&mut self.bytes));
w.set_position(w.get_ref().len() as u64);
p.write_bytes(&mut w).expect("Writing to a `io::Cursor<&mut Vec<u8>>` never fails.")
let mut w: smallvec::SmallVec<[u8; 32]> = self.as_ref().into();
p.write_bytes(&mut w).expect("Writing to a `Buffer` never fails.");
self.storage = Storage::from_slice(&w);
}

/// Pops the last `Protocol` of this multiaddr, or `None` if the multiaddr is empty.
Expand All @@ -89,7 +85,7 @@ impl Multiaddr {
/// ```
///
pub fn pop<'a>(&mut self) -> Option<Protocol<'a>> {
let mut slice = &self.bytes[..]; // the remaining multiaddr slice
let mut slice = self.as_ref(); // the remaining multiaddr slice
if slice.is_empty() {
return None
}
Expand All @@ -100,16 +96,14 @@ impl Multiaddr {
}
slice = s
};
let remaining_len = self.bytes.len() - slice.len();
Arc::make_mut(&mut self.bytes).truncate(remaining_len);
let remaining_len = self.as_ref().len() - slice.len();
self.storage = Storage::from_slice(&self.as_ref()[..remaining_len]);
Some(protocol)
}

/// Like [`Multiaddr::push`] but consumes `self`.
pub fn with(mut self, p: Protocol<'_>) -> Self {
let mut w = io::Cursor::<&mut Vec<u8>>::new(Arc::make_mut(&mut self.bytes));
w.set_position(w.get_ref().len() as u64);
p.write_bytes(&mut w).expect("Writing to a `io::Cursor<&mut Vec<u8>>` never fails.");
self.push(p);
self
}

Expand All @@ -130,7 +124,7 @@ impl Multiaddr {
/// ```
///
pub fn iter(&self) -> Iter<'_> {
Iter(&self.bytes)
Iter(&self.as_ref())
}

/// Replace a [`Protocol`] at some position in this `Multiaddr`.
Expand All @@ -145,7 +139,7 @@ impl Multiaddr {
where
F: FnOnce(&Protocol) -> Option<Protocol<'a>>
{
let mut address = Multiaddr::with_capacity(self.len());
let mut address = Multiaddr::empty();
let mut fun = Some(by);
let mut replaced = false;

Expand Down Expand Up @@ -192,9 +186,23 @@ impl fmt::Display for Multiaddr {
}
}

impl PartialEq for Multiaddr {
fn eq(&self, other: &Self) -> bool {
self.as_ref() == other.as_ref()
}
}

impl Eq for Multiaddr {}

impl hash::Hash for Multiaddr {
fn hash<H: hash::Hasher>(&self, state: &mut H) {
self.as_ref().hash(state);
}
}

impl AsRef<[u8]> for Multiaddr {
fn as_ref(&self) -> &[u8] {
self.bytes.as_ref()
self.storage.bytes()
}
}

Expand All @@ -203,7 +211,7 @@ impl<'a> IntoIterator for &'a Multiaddr {
type IntoIter = Iter<'a>;

fn into_iter(self) -> Iter<'a> {
Iter(&self.bytes)
Iter(&self.as_ref())
}
}

Expand All @@ -216,7 +224,7 @@ impl<'a> FromIterator<Protocol<'a>> for Multiaddr {
for cmp in iter {
cmp.write_bytes(&mut writer).expect("Writing to a `Vec` never fails.");
}
Multiaddr { bytes: Arc::new(writer) }
Multiaddr { storage: Storage::from_slice(&writer) }
}
}

Expand All @@ -237,7 +245,7 @@ impl FromStr for Multiaddr {
p.write_bytes(&mut writer).expect("Writing to a `Vec` never fails.");
}

Ok(Multiaddr { bytes: Arc::new(writer) })
Ok(Multiaddr { storage: Storage::from_slice(&writer) })
}
}

Expand All @@ -264,7 +272,7 @@ impl<'a> From<Protocol<'a>> for Multiaddr {
fn from(p: Protocol<'a>) -> Multiaddr {
let mut w = Vec::new();
p.write_bytes(&mut w).expect("Writing to a `Vec` never fails.");
Multiaddr { bytes: Arc::new(w) }
Multiaddr { storage: Storage::from_slice(&w) }
}
}

Expand Down Expand Up @@ -299,7 +307,7 @@ impl TryFrom<Vec<u8>> for Multiaddr {
let (_, s) = Protocol::from_bytes(slice)?;
slice = s
}
Ok(Multiaddr { bytes: Arc::new(v) })
Ok(Multiaddr { storage: Storage::from_slice(&v) })
}
}

Expand Down
110 changes: 110 additions & 0 deletions misc/multiaddr/src/storage.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
// Copyright 2020 Parity Technologies (UK) Ltd.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
use std::sync::Arc;

/// MAX_INLINE is the maximum size of a multiaddr that can be stored inline.
/// There is an overhead of 2 bytes, 1 for the length and 1 for the enum discriminator.
/// 30 is chosen so that the overall size is 32. This should still be big enough to fit
/// a multiaddr containing an ipv4 or ipv6 address and port.
///
/// More complex multiaddrs like those containing peer ids will be stored on the heap.
const MAX_INLINE: usize = 30;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not 62? Most peer IDs would fit in there as well and peer IDs are quite a common case.

Copy link
Contributor Author

@rklaehn rklaehn Mar 26, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could make it 62. I have no idea about the size distribution of the typical multiaddr. From what I have seen in rust-libp2p, and for what I intend to use this for, you often retain addresses for other nodes, which are just host/port/protocol. If you say that the ones containing a peer id are sufficiently common, I will change this to 62.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unless large multiaddrs are way more popular than I think, 64 would be too big. If you go to the arced version for large multihashes, it is not the end of the world. And there are situations where having lots of 64 byte objects would be worse than having a bunch of pointers to arcs.


#[derive(Clone)]
pub(crate) enum Storage {
/// hash is stored inline if it is smaller than MAX_INLINE
Inline(u8, [u8; MAX_INLINE]),
/// hash is stored on the heap. this must be only used if the hash is actually larger than
/// MAX_INLINE bytes to ensure a unique representation.
Heap(Arc<[u8]>),
}

impl Storage {
/// The raw bytes.
pub fn bytes(&self) -> &[u8] {
match self {
Storage::Inline(len, bytes) => &bytes[..(*len as usize)],
Storage::Heap(data) => &data,
}
}

/// Creates storage from a slice.
/// For a size up to MAX_INLINE, this will not allocate.
pub fn from_slice(slice: &[u8]) -> Self {
let len = slice.len();
if len <= MAX_INLINE {
let mut data: [u8; MAX_INLINE] = [0; MAX_INLINE];
data[..len].copy_from_slice(slice);
Storage::Inline(len as u8, data)
} else {
Storage::Heap(slice.into())
}
}
}

#[cfg(test)]
mod tests {
use crate::Multiaddr;
use super::{Storage, MAX_INLINE};
use quickcheck::quickcheck;

#[test]
fn multihash_size() {
fn assert_size(ma: &str, n: usize, inline: bool) {
let ma: Multiaddr = ma.parse().unwrap();
assert_eq!(ma.as_ref().len(), n);
assert_eq!(n <= MAX_INLINE, inline);
}
assert_size("/ip4/127.0.0.1", 5, true);
assert_size("/ip6/2001:8a0:7ac5:4201:3ac9:86ff:fe31:7095/tcp/8000", 20, true);
assert_size("/dns4/0123456789012345678901234/tcp/8000", 30, true);
assert_size("/ip6/2001:8a0:7ac5:4201:3ac9:86ff:fe31:7095/tcp/8000/ws/p2p/QmcgpsyWgH8Y8ajJz1Cu72KnS5uo2Aa2LpzU7kinSupNKC", 59, false);
}

#[test]
fn struct_size() {
// this should be true for both 32 and 64 bit archs
assert_eq!(std::mem::size_of::<Storage>(), 32);
}

#[test]
fn roundtrip() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this test not redundant given the property test below?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, this one is guaranteed to hit all the interesting cases, whereas with the property based test you depend on luck. But this was copied from multihash, which is an independent repository. So if you want to reduce the number of tests for rust-libp2p, which is a bit bigger, I can remove it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you are concerned about the test runtime, it would probably be best to keep this one and drop the property based test.

// check that .bytes() returns whatever the storage was created with
for i in 0..((MAX_INLINE + 10) as u8) {
let data = (0..i).collect::<Vec<u8>>();
let storage = Storage::from_slice(&data);
assert_eq!(data, storage.bytes());
}
}

fn check_invariants(storage: Storage) -> bool {
match storage {
Storage::Inline(len, _) => len as usize <= MAX_INLINE,
Storage::Heap(arc) => arc.len() > MAX_INLINE,
}
}

quickcheck! {
fn roundtrip_check(data: Vec<u8>) -> bool {
let storage = Storage::from_slice(&data);
storage.bytes() == data.as_slice() && check_invariants(storage)
}
}
}