Skip to content

Commit 0ec803a

Browse files
authored
Use inline storage for small hashes (#47)
fix: use inline storage for small hashes. Replace the `bytes` crate with a custom storage type that doesn't heap-allocate for hashes of up to 38 bytes and uses an `Arc<[u8]>` for bigger hashes.
1 parent d1214d5 commit 0ec803a

File tree

3 files changed

+153
-25
lines changed

3 files changed

+153
-25
lines changed

Cargo.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,10 @@ edition = "2018"
2020
[dependencies]
2121
blake2b_simd = { version = "0.5.9", default-features = false }
2222
blake2s_simd = { version = "0.5.9", default-features = false }
23-
bytes = "0.5"
2423
sha1 = "0.5"
2524
sha2 = { version = "0.7", default-features = false }
2625
tiny-keccak = "1.4"
2726
unsigned-varint = "0.3"
27+
28+
[dev-dependencies]
29+
quickcheck = "0.9.2"

src/lib.rs

Lines changed: 43 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,22 @@
88
99
mod errors;
1010
mod hashes;
11+
mod storage;
1112

1213
use std::convert::TryFrom;
14+
use std::fmt::Debug;
15+
use std::hash;
1316

1417
use blake2b_simd::{blake2b, Params as Blake2bVariable};
1518
use blake2s_simd::{blake2s, Params as Blake2sVariable};
16-
use bytes::{BufMut, Bytes, BytesMut};
1719
use sha2::Digest;
1820
use tiny_keccak::Keccak;
1921
use unsigned_varint::{decode, encode};
2022

2123
pub use errors::{DecodeError, DecodeOwnedError, EncodeError};
2224
pub use hashes::Hash;
25+
use std::fmt;
26+
use storage::Storage;
2327

2428
// Helper macro for encoding input into output using sha1, sha2, tiny_keccak, or blake2
2529
macro_rules! encode {
@@ -104,15 +108,8 @@ pub fn encode(hash: Hash, input: &[u8]) -> Result<Multihash, EncodeError> {
104108
let code = encode::u16(hash.code(), &mut buf);
105109
let mut len_buf = encode::u32_buffer();
106110
let size = encode::u32(input.len() as u32, &mut len_buf);
107-
108-
let total_len = code.len() + size.len() + input.len();
109-
110-
let mut output = BytesMut::with_capacity(total_len);
111-
output.put_slice(code);
112-
output.put_slice(size);
113-
output.put_slice(input);
114111
Ok(Multihash {
115-
bytes: output.freeze(),
112+
storage: Storage::from_slices(&[&code, &size, &input]),
116113
})
117114
} else {
118115
let (offset, mut output) = encode_hash(hash);
@@ -135,31 +132,51 @@ pub fn encode(hash: Hash, input: &[u8]) -> Result<Multihash, EncodeError> {
135132
});
136133

137134
Ok(Multihash {
138-
bytes: output.freeze(),
135+
storage: Storage::from_slice(&output),
139136
})
140137
}
141138
}
142139

143-
// Encode the given [`Hash`] value and ensure the returned [`BytesMut`]
140+
// Encode the given [`Hash`] value and ensure the returned [`Vec<u8>`]
144141
// has enough capacity to hold the actual digest.
145-
fn encode_hash(hash: Hash) -> (usize, BytesMut) {
142+
fn encode_hash(hash: Hash) -> (usize, Vec<u8>) {
146143
let mut buf = encode::u16_buffer();
147144
let code = encode::u16(hash.code(), &mut buf);
148145

149146
let len = code.len() + 1 + usize::from(hash.size());
150147

151-
let mut output = BytesMut::with_capacity(len);
152-
output.put_slice(code);
153-
output.put_u8(hash.size());
148+
let mut output = Vec::with_capacity(len);
149+
output.extend_from_slice(code);
150+
output.push(hash.size());
154151
output.resize(len, 0);
155152

156153
(code.len() + 1, output)
157154
}
158155

159156
/// Represents a valid multihash.
160-
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
157+
#[derive(Clone)]
161158
pub struct Multihash {
162-
bytes: Bytes,
159+
storage: Storage,
160+
}
161+
162+
impl Debug for Multihash {
163+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
164+
f.debug_tuple("Multihash").field(&self.as_bytes()).finish()
165+
}
166+
}
167+
168+
impl PartialEq for Multihash {
169+
fn eq(&self, other: &Self) -> bool {
170+
self.storage.bytes() == other.storage.bytes()
171+
}
172+
}
173+
174+
impl Eq for Multihash {}
175+
176+
impl hash::Hash for Multihash {
177+
fn hash<H: hash::Hasher>(&self, state: &mut H) {
178+
self.storage.bytes().hash(state);
179+
}
163180
}
164181

165182
impl Multihash {
@@ -172,7 +189,7 @@ impl Multihash {
172189
});
173190
}
174191
Ok(Multihash {
175-
bytes: Bytes::from(bytes),
192+
storage: Storage::from_slice(&bytes),
176193
})
177194
}
178195

@@ -183,17 +200,19 @@ impl Multihash {
183200

184201
/// Returns the bytes representation of the multihash.
185202
pub fn to_vec(&self) -> Vec<u8> {
186-
Vec::from(&self.bytes[..])
203+
Vec::from(self.as_bytes())
187204
}
188205

189206
/// Returns the bytes representation of this multihash.
190207
pub fn as_bytes(&self) -> &[u8] {
191-
&self.bytes
208+
self.storage.bytes()
192209
}
193210

194211
/// Builds a `MultihashRef` corresponding to this `Multihash`.
195212
pub fn as_ref(&self) -> MultihashRef {
196-
MultihashRef { bytes: &self.bytes }
213+
MultihashRef {
214+
bytes: self.as_bytes(),
215+
}
197216
}
198217

199218
/// Returns which hashing algorithm is used in this multihash.
@@ -215,7 +234,7 @@ impl AsRef<[u8]> for Multihash {
215234

216235
impl<'a> PartialEq<MultihashRef<'a>> for Multihash {
217236
fn eq(&self, other: &MultihashRef<'a>) -> bool {
218-
&*self.bytes == other.bytes
237+
&*self.as_bytes() == other.as_bytes()
219238
}
220239
}
221240

@@ -290,7 +309,7 @@ impl<'a> MultihashRef<'a> {
290309
/// This operation allocates.
291310
pub fn to_owned(&self) -> Multihash {
292311
Multihash {
293-
bytes: Bytes::copy_from_slice(self.bytes),
312+
storage: Storage::from_slice(self.bytes),
294313
}
295314
}
296315

@@ -302,7 +321,7 @@ impl<'a> MultihashRef<'a> {
302321

303322
impl<'a> PartialEq<Multihash> for MultihashRef<'a> {
304323
fn eq(&self, other: &Multihash) -> bool {
305-
self.bytes == &*other.bytes
324+
self.as_bytes() == &*other.as_bytes()
306325
}
307326
}
308327

src/storage.rs

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
use std::sync::Arc;
2+
3+
/// MAX_INLINE is the maximum size of a multihash that can be stored inline
4+
///
5+
/// We want the currently most common multihashes using 256bit hashes to be stored inline. These
6+
/// hashes are 34 bytes long. An overall size of 38 seems like a good compromise. It allows storing
7+
/// any 256bit hash with some room to spare and gives an overall size for Storage of 40 bytes, which
8+
/// is a multiple of 8. We need 2 extra bytes, one for the size and one for the enum discriminant.
9+
const MAX_INLINE: usize = 38;
10+
11+
#[derive(Clone)]
12+
pub(crate) enum Storage {
13+
/// hash is stored inline if it is at most MAX_INLINE bytes long
14+
Inline(u8, [u8; MAX_INLINE]),
15+
/// hash is stored on the heap. this must only be used if the hash is actually larger than
16+
/// MAX_INLINE bytes to ensure a unique representation.
17+
Heap(Arc<[u8]>),
18+
}
19+
20+
impl Storage {
21+
/// The raw bytes.
22+
pub fn bytes(&self) -> &[u8] {
23+
match self {
24+
Storage::Inline(len, bytes) => &bytes[..(*len as usize)],
25+
Storage::Heap(data) => &data,
26+
}
27+
}
28+
29+
/// creates storage from a slice. For a size up to MAX_INLINE, this will not allocate.
30+
pub fn from_slice(slice: &[u8]) -> Self {
31+
let len = slice.len();
32+
if len <= MAX_INLINE {
33+
let mut data: [u8; MAX_INLINE] = [0; MAX_INLINE];
34+
data[..len].copy_from_slice(slice);
35+
Storage::Inline(len as u8, data)
36+
} else {
37+
Storage::Heap(slice.into())
38+
}
39+
}
40+
41+
/// creates storage from multiple slices. For a size up to MAX_INLINE, this will not allocate.
42+
pub fn from_slices(slices: &[&[u8]]) -> Self {
43+
let n = slices.iter().fold(0usize, |a, s| a.saturating_add(s.len()));
44+
if n <= MAX_INLINE {
45+
let s = slices
46+
.iter()
47+
.fold(([0; MAX_INLINE], 0), |(mut array, i), s| {
48+
array[i..i + s.len()].copy_from_slice(s);
49+
(array, i + s.len())
50+
});
51+
Storage::Inline(n as u8, s.0)
52+
} else {
53+
let mut v = Vec::with_capacity(n);
54+
for s in slices {
55+
v.extend_from_slice(s)
56+
}
57+
Storage::Heap(v.into())
58+
}
59+
}
60+
}
61+
62+
#[cfg(test)]
63+
mod tests {
64+
use super::{Storage, MAX_INLINE};
65+
use quickcheck::quickcheck;
66+
67+
#[test]
68+
fn struct_size() {
69+
// this should be true for both 32 and 64 bit archs
70+
assert_eq!(std::mem::size_of::<Storage>(), 40);
71+
}
72+
73+
#[test]
74+
fn roundtrip() {
75+
// check that .bytes() returns whatever the storage was created with
76+
for i in 0..((MAX_INLINE + 10) as u8) {
77+
let data = (0..i).collect::<Vec<u8>>();
78+
let storage = Storage::from_slice(&data);
79+
assert_eq!(data, storage.bytes());
80+
}
81+
}
82+
83+
fn check_invariants(storage: Storage) -> bool {
84+
match storage {
85+
Storage::Inline(len, _) => len as usize <= MAX_INLINE,
86+
Storage::Heap(arc) => arc.len() > MAX_INLINE,
87+
}
88+
}
89+
90+
quickcheck! {
91+
fn roundtrip_check(data: Vec<u8>) -> bool {
92+
let storage = Storage::from_slice(&data);
93+
storage.bytes() == data.as_slice() && check_invariants(storage)
94+
}
95+
96+
fn from_slices_roundtrip_check(data: Vec<Vec<u8>>) -> bool {
97+
let mut slices = Vec::new();
98+
let mut expected = Vec::new();
99+
for v in data.iter() {
100+
slices.push(v.as_slice());
101+
expected.extend_from_slice(&v);
102+
}
103+
let storage = Storage::from_slices(&slices);
104+
storage.bytes() == expected.as_slice() && check_invariants(storage)
105+
}
106+
}
107+
}

0 commit comments

Comments
 (0)