Skip to content

Use inline storage for small hashes #47

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Feb 21, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ edition = "2018"
[dependencies]
blake2b_simd = { version = "0.5.9", default-features = false }
blake2s_simd = { version = "0.5.9", default-features = false }
bytes = "0.5"
sha1 = "0.5"
sha2 = { version = "0.7", default-features = false }
tiny-keccak = "1.4"
Expand Down
94 changes: 73 additions & 21 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,22 @@

mod errors;
mod hashes;
mod storage;

use std::convert::TryFrom;
use std::fmt::Debug;
use std::hash;

use blake2b_simd::{blake2b, Params as Blake2bVariable};
use blake2s_simd::{blake2s, Params as Blake2sVariable};
use bytes::{BufMut, Bytes, BytesMut};
use sha2::Digest;
use tiny_keccak::Keccak;
use unsigned_varint::{decode, encode};

pub use errors::{DecodeError, DecodeOwnedError, EncodeError};
pub use hashes::Hash;
use std::fmt;
use storage::Storage;

// Helper macro for encoding input into output using sha1, sha2, tiny_keccak, or blake2
macro_rules! encode {
Expand Down Expand Up @@ -107,12 +111,12 @@ pub fn encode(hash: Hash, input: &[u8]) -> Result<Multihash, EncodeError> {

let total_len = code.len() + size.len() + input.len();

let mut output = BytesMut::with_capacity(total_len);
output.put_slice(code);
output.put_slice(size);
output.put_slice(input);
let mut output = Vec::with_capacity(total_len);
output.extend_from_slice(code);
output.extend_from_slice(size);
output.extend_from_slice(input);
Ok(Multihash {
bytes: output.freeze(),
storage: Storage::copy_from_slice(&output),
})
} else {
let (offset, mut output) = encode_hash(hash);
Expand All @@ -135,31 +139,51 @@ pub fn encode(hash: Hash, input: &[u8]) -> Result<Multihash, EncodeError> {
});

Ok(Multihash {
bytes: output.freeze(),
storage: Storage::copy_from_slice(&output),
})
}
}

// Encode the given [`Hash`] value and ensure the returned [`BytesMut`]
// Encode the given [`Hash`] value and ensure the returned [`Vec<u8>`]
// has enough capacity to hold the actual digest.
fn encode_hash(hash: Hash) -> (usize, BytesMut) {
fn encode_hash(hash: Hash) -> (usize, Vec<u8>) {
let mut buf = encode::u16_buffer();
let code = encode::u16(hash.code(), &mut buf);

let len = code.len() + 1 + usize::from(hash.size());

let mut output = BytesMut::with_capacity(len);
output.put_slice(code);
output.put_u8(hash.size());
let mut output = Vec::with_capacity(len);
output.extend_from_slice(code);
output.push(hash.size());
output.resize(len, 0);

(code.len() + 1, output)
}

/// Represents a valid multihash.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[derive(Clone)]
pub struct Multihash {
bytes: Bytes,
storage: Storage,
}

/// Formats the multihash as `Multihash([..bytes..])`.
///
/// Only the bytes returned by `as_bytes` are shown, so any padding held by the
/// underlying storage does not appear in the output.
impl Debug for Multihash {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Printing the bare type name would hide the value entirely, which makes
        // assertion failures and logs useless; include the encoded bytes instead.
        f.debug_tuple("Multihash").field(&self.as_bytes()).finish()
    }
}

/// Two multihashes are equal when the raw bytes of their storages are equal.
impl PartialEq for Multihash {
    fn eq(&self, other: &Self) -> bool {
        // Compare the raw storage slices of both sides.
        let lhs = self.storage.bytes();
        let rhs = other.storage.bytes();
        lhs == rhs
    }
}

// Marker impl: the `PartialEq` impl compares byte slices, which is a total equivalence.
impl Eq for Multihash {}

/// Hashes the raw bytes of the storage, i.e. the same bytes that the `PartialEq`
/// impl compares.
impl hash::Hash for Multihash {
    fn hash<H: hash::Hasher>(&self, state: &mut H) {
        let raw = self.storage.bytes();
        hash::Hash::hash(raw, state)
    }
}

impl Multihash {
Expand All @@ -172,7 +196,7 @@ impl Multihash {
});
}
Ok(Multihash {
bytes: Bytes::from(bytes),
storage: Storage::copy_from_slice(&bytes),
})
}

Expand All @@ -183,17 +207,21 @@ impl Multihash {

/// Returns the bytes representation of the multihash.
pub fn to_vec(&self) -> Vec<u8> {
Vec::from(&self.bytes[..])
Vec::from(self.as_bytes())
}

/// Returns the bytes representation of this multihash.
pub fn as_bytes(&self) -> &[u8] {
&self.bytes
let bytes = self.storage.bytes();
let size = multihash_size(bytes).expect("storage contains a valid multihash");
&bytes[..size]
}

/// Builds a `MultihashRef` corresponding to this `Multihash`.
pub fn as_ref(&self) -> MultihashRef {
MultihashRef { bytes: &self.bytes }
MultihashRef {
bytes: self.as_bytes(),
}
}

/// Returns which hashing algorithm is used in this multihash.
Expand All @@ -215,7 +243,7 @@ impl AsRef<[u8]> for Multihash {

impl<'a> PartialEq<MultihashRef<'a>> for Multihash {
fn eq(&self, other: &MultihashRef<'a>) -> bool {
&*self.bytes == other.bytes
&*self.as_bytes() == other.as_bytes()
}
}

Expand All @@ -233,6 +261,30 @@ pub struct MultihashRef<'a> {
bytes: &'a [u8],
}

/// Given a buffer starting with a valid multihash, returns the size of the multihash.
///
/// The size is the sum of the varint-encoded hash code length, the varint-encoded
/// digest-length prefix length, and the digest length itself. Any bytes following the
/// two varints are not inspected; in particular this function does not check that
/// `input` actually contains `size` bytes.
///
/// # Errors
///
/// Returns [`DecodeError::BadInputLength`] if `input` is empty, if either varint cannot
/// be decoded, or if the total size does not fit in a `usize`.
fn multihash_size(input: &[u8]) -> Result<usize, DecodeError> {
    if input.is_empty() {
        return Err(DecodeError::BadInputLength);
    }

    // Ensure `Hash::code` returns a `u16` so that our `decode::u16` here is correct.
    std::convert::identity::<fn(Hash) -> u16>(Hash::code);
    let (code, rest) = decode::u16(input).map_err(|_| DecodeError::BadInputLength)?;

    // The decoder does not report how many bytes it consumed, so re-encode the code
    // to learn the length of its varint representation.
    let mut code_buf = [0u8; 3];
    let code_len = unsigned_varint::encode::u16(code, &mut code_buf).len();

    let (hash_len, _) = decode::u32(rest).map_err(|_| DecodeError::BadInputLength)?;

    // Same trick for the digest-length prefix.
    let mut len_buf = [0u8; 5];
    let prefix_len = unsigned_varint::encode::u32(hash_len, &mut len_buf).len();

    // `code_len + prefix_len` is at most 8. The `u32 -> usize` cast is lossless on 32- and
    // 64-bit targets, but the final sum could overflow a 32-bit `usize`, so add checked.
    (code_len + prefix_len)
        .checked_add(hash_len as usize)
        .ok_or(DecodeError::BadInputLength)
}

impl<'a> MultihashRef<'a> {
/// Creates a `MultihashRef` from the given `input`.
pub fn from_slice(input: &'a [u8]) -> Result<Self, DecodeError> {
Expand Down Expand Up @@ -290,7 +342,7 @@ impl<'a> MultihashRef<'a> {
/// This operation allocates.
pub fn to_owned(&self) -> Multihash {
Multihash {
bytes: Bytes::copy_from_slice(self.bytes),
storage: Storage::copy_from_slice(self.bytes),
}
}

Expand All @@ -302,7 +354,7 @@ impl<'a> MultihashRef<'a> {

impl<'a> PartialEq<Multihash> for MultihashRef<'a> {
fn eq(&self, other: &Multihash) -> bool {
self.bytes == &*other.bytes
self.as_bytes() == &*other.as_bytes()
}
}

Expand Down
43 changes: 43 additions & 0 deletions src/storage.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
use std::sync::Arc;

/// Maximum number of bytes that `Storage::Inline` can hold.
///
/// The unit test in this module asserts that, with this value, `Storage` occupies
/// exactly 40 bytes.
const MAX_INLINE: usize = 39;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why 39?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the total size is 40, a multiple of 8. 1 byte is needed for the enum discriminator.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

39 fits all 256 bit hashes with some room to spare. 512 bit hashes won't work though, and I think once you go to such large hashes, you are better off with an Arc<[u8]>.

But the size of the inline buffer can of course be adjusted when even larger hashes become common. The whole mechanism is completely opaque.


/// Backing storage for the bytes of a multihash: a fixed-size inline buffer or a
/// reference-counted heap slice.
#[derive(Clone)]
pub enum Storage {
    /// The hash is stored inline. If it is shorter than `MAX_INLINE` (39) bytes, the rest of
    /// the array should be padded with `0u8`.
    Inline([u8; MAX_INLINE]),
    /// The hash is stored on the heap. This variant must only be used if the hash is actually
    /// larger than `MAX_INLINE` (39) bytes, to ensure a unique representation.
    Heap(Arc<[u8]>),
}

impl Storage {
    /// Returns the raw backing bytes.
    ///
    /// For inline storage this is always the whole `MAX_INLINE`-byte buffer, so the
    /// returned slice can be longer than the data this storage was created from.
    pub fn bytes(&self) -> &[u8] {
        match self {
            Storage::Inline(bytes) => bytes,
            Storage::Heap(data) => data,
        }
    }

    /// Creates storage holding a copy of `slice`.
    ///
    /// Slices of at most `MAX_INLINE` bytes are copied into a zero-initialised inline
    /// buffer; longer slices are copied to the heap. Note that the inline representation
    /// does not preserve the original length.
    pub fn copy_from_slice(slice: &[u8]) -> Self {
        if slice.len() <= MAX_INLINE {
            let mut data = [0u8; MAX_INLINE];
            // Fill the prefix; the remaining bytes stay zero as padding.
            data[..slice.len()].copy_from_slice(slice);
            Storage::Inline(data)
        } else {
            Storage::Heap(slice.into())
        }
    }
}

#[cfg(test)]
mod tests {
    use super::Storage;
    use std::mem::size_of;

    /// Pins the in-memory size of `Storage` at 40 bytes so that layout changes are noticed.
    #[test]
    fn test_size() {
        let actual = size_of::<Storage>();
        assert_eq!(actual, 40);
    }
}