
feat: Add ZIP support (+ EPUB, Office Open XML, Open Document, and OpenXPS) and Collection Data Hash assertion #499


Open · wants to merge 24 commits into main from ok-nick/zip
Changes from 14 commits

Commits (24)
2c22931
Foundation for ZIP support
ok-nick Jul 8, 2024
a84411a
Foundation for collection data hash assertion
ok-nick Jul 8, 2024
c03d653
Collection assertion hash resolver
ok-nick Jul 9, 2024
9e4af21
Rework collection hash assertion
ok-nick Jul 9, 2024
6559e6f
More collection assertion validation
ok-nick Jul 9, 2024
ac1a12d
Fix ZIP unit tests
ok-nick Jul 9, 2024
d1f3eab
Update memchr dep
ok-nick Jul 9, 2024
9b5be12
Update memchr dep in make_test_images
ok-nick Jul 9, 2024
f57b832
Add ZIP unit tests
ok-nick Jul 9, 2024
5cf73e0
Merge branch 'main' into ok-nick/zip
ok-nick Jul 9, 2024
7700453
Collection Hash Assertion relative path validation
ok-nick Jul 10, 2024
7994817
Add collection hash unit tests
ok-nick Jul 10, 2024
484cca8
Pass CI for collection hash
ok-nick Jul 10, 2024
32fc674
Fix ZIP offsets/lens
ok-nick Jul 10, 2024
d668888
Collection assertion docs, optimizations, and cleanup
ok-nick Jul 11, 2024
11bef80
Cleanup collection hash errors
ok-nick Jul 12, 2024
d383ce8
Rework collection hash and add better validation
ok-nick Jul 12, 2024
744045d
More file types for ZIP unit tests
ok-nick Jul 12, 2024
d0704e9
Merge remote-tracking branch 'origin' into ok-nick/zip
ok-nick Aug 8, 2024
db83807
Hash central directory and add unit tests
ok-nick Aug 8, 2024
c2feb82
Fix thiserror dependency conflict
ok-nick Aug 8, 2024
97ebd56
Use latest zip crate (with fix)
ok-nick Aug 29, 2024
e29fe8b
Merge remote-tracking branch 'origin' into ok-nick/zip
ok-nick Aug 29, 2024
a5d0533
Update log crate to fix dependency conflict
ok-nick Aug 29, 2024
2 changes: 1 addition & 1 deletion make_test_images/Cargo.toml
@@ -18,7 +18,7 @@
image = { version = "0.24.7", default-features = false, features = [
"jpeg",
"png",
] }
memchr = "2.7.1"
memchr = "2.7.4"
nom = "7.1.3"
regex = "1.5.6"
serde = "1.0.197"
6 changes: 3 additions & 3 deletions sdk/Cargo.toml
@@ -96,7 +96,7 @@
jfifdump = "0.5.1"
log = "0.4.8"
lopdf = { version = "0.31.0", optional = true }
lazy_static = "1.4.0"
memchr = "2.7.1"
memchr = "2.7.4"
multibase = "0.9.0"
multihash = "0.11.4"
mp4 = "0.13.0"
@@ -121,11 +121,11 @@
sha2 = "0.10.2"
tempfile = "3.10.1"
thiserror = "1.0.61"
treeline = "0.1.0"
url = "2.2.2, <2.5.1" # Can't use 2.5.1 or newer until new license is reviewed.
url = "2.2.2, <2.5.1" # Can't use 2.5.1 or newer until new license is reviewed.
uuid = { version = "1.3.1", features = ["serde", "v4", "wasm-bindgen"] }
x509-parser = "0.15.1"
x509-certificate = "0.19.0"
zip = { version = "0.6.6", default-features = false }
zip = { version = "2.1.3", default-features = false }

[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
ureq = "2.4.0"
288 changes: 288 additions & 0 deletions sdk/src/assertions/collection_hash.rs
@@ -0,0 +1,288 @@
use std::{
fs::File,
io::{Read, Seek},
path::{Component, Path, PathBuf},
};

use serde::{Deserialize, Serialize};

use crate::{
assertions::AssetType, asset_handlers::zip_io, hash_stream_by_alg,
hash_utils::verify_stream_by_alg, Error, HashRange, Result,
};

#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Default)]
pub struct CollectionHash {
pub uris: Vec<UriHashedDataMap>,

#[serde(skip_serializing_if = "Option::is_none")]
pub alg: Option<String>,

#[serde(skip_serializing_if = "Option::is_none", with = "serde_bytes")]
pub zip_central_directory_hash: Option<Vec<u8>>,
}

#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
pub struct UriHashedDataMap {
pub uri: PathBuf,

#[serde(with = "serde_bytes")]
pub hash: Vec<u8>,

#[serde(skip_serializing_if = "Option::is_none")]
pub size: Option<u64>,

#[serde(rename = "dc:format", skip_serializing_if = "Option::is_none")]
pub dc_format: Option<String>,

#[serde(skip_serializing_if = "Option::is_none")]
pub data_types: Option<Vec<AssetType>>,
}

impl CollectionHash {
pub fn new() -> Self {
Self::default()
}

pub fn add_uri_map(&mut self, uri_map: UriHashedDataMap) {
self.uris.push(uri_map);
}

// The base path MUST be the folder of the manifest. A URI MUST NOT reference a path outside of that folder.
pub fn gen_hash<R>(&mut self, base_path: &Path) -> Result<()>
where
R: Read + Seek + ?Sized,
{
self.validate_paths()?;

let alg = self.alg().to_owned();
for uri_map in &mut self.uris {
let path = base_path.join(&uri_map.uri);
let mut file = File::open(path)?;
let file_len = file.metadata()?.len();

uri_map.hash = hash_stream_by_alg(
&alg,
&mut file,
// TODO: temp unwrap
#[allow(clippy::unwrap_used)]
Some(vec![HashRange::new(0, usize::try_from(file_len).unwrap())]),
false,
)?;
}

Ok(())
}

pub fn verify_hash<R>(&self, alg: Option<&str>, base_path: &Path) -> Result<()>
where
R: Read + Seek + ?Sized,
{
self.validate_paths()?;

let alg = alg.unwrap_or_else(|| self.alg());
for uri_map in &self.uris {
let path = base_path.join(&uri_map.uri);
let mut file = File::open(&path)?;
let file_len = file.metadata()?.len();

if !verify_stream_by_alg(
alg,
&uri_map.hash,
&mut file,
// TODO: temp unwrap
#[allow(clippy::unwrap_used)]
Some(vec![HashRange::new(0, usize::try_from(file_len).unwrap())]),
false,
) {
return Err(Error::HashMismatch(format!(
"hash for {} does not match",
path.display()
)));
}
}

Ok(())
}

    // Overwrites any existing URIs with the entries found in the ZIP, since the ZIP
    // defines the complete set of valid URIs and we don't want duplicates.
pub fn gen_uris_from_zip_stream<R>(&mut self, stream: &mut R) -> Result<()>
where
R: Read + Seek + ?Sized,
{
self.uris = zip_io::uri_maps(stream)?;
Ok(())
}

pub fn gen_hash_from_zip_stream<R>(&mut self, stream: &mut R) -> Result<()>
where
R: Read + Seek + ?Sized,
{
let alg = self.alg().to_owned();

let zip_central_directory_inclusions = zip_io::central_directory_inclusions(stream)?;
let zip_central_directory_hash =
hash_stream_by_alg(&alg, stream, Some(zip_central_directory_inclusions), false)?;
if zip_central_directory_hash.is_empty() {
return Err(Error::BadParam("could not generate data hash".to_string()));
}
self.zip_central_directory_hash = Some(zip_central_directory_hash);

let hash_ranges = zip_io::uri_inclusions(stream, &self.uris)?;
for (uri_map, hash_range) in self.uris.iter_mut().zip(hash_ranges) {
let hash = hash_stream_by_alg(&alg, stream, Some(vec![hash_range]), false)?;
if hash.is_empty() {
return Err(Error::BadParam("could not generate data hash".to_string()));
}

uri_map.hash = hash;
}

Ok(())
}

pub fn verify_zip_stream_hash<R>(&self, stream: &mut R, alg: Option<&str>) -> Result<()>
where
R: Read + Seek + ?Sized,
{
let alg = alg.unwrap_or_else(|| self.alg());
let central_directory_hash = match &self.zip_central_directory_hash {
Some(hash) => Ok(hash),
None => Err(Error::BadParam(
"Missing zip central directory hash".to_owned(),
)),
}?;
let zip_central_directory_inclusions = zip_io::central_directory_inclusions(stream)?;
if !verify_stream_by_alg(
alg,
central_directory_hash,
stream,
Some(zip_central_directory_inclusions),
false,
) {
return Err(Error::HashMismatch(
"Hashes do not match for zip central directory".to_owned(),
));
}

let hash_ranges = zip_io::uri_inclusions(stream, &self.uris)?;
for (uri_map, hash_range) in self.uris.iter().zip(hash_ranges) {
if !verify_stream_by_alg(alg, &uri_map.hash, stream, Some(vec![hash_range]), false) {
return Err(Error::HashMismatch(format!(
"hash for {} does not match",
uri_map.uri.display()
)));
}
}

Ok(())
}

fn alg(&self) -> &str {
self.alg.as_deref().unwrap_or("sha256")
}

fn validate_paths(&self) -> Result<()> {
for uri_map in &self.uris {
for component in uri_map.uri.components() {
                match component {
                    // Reject `.`, `..`, and absolute components so a URI can never
                    // resolve to a path outside of the base (manifest) folder.
                    Component::CurDir
                    | Component::ParentDir
                    | Component::RootDir
                    | Component::Prefix(_) => {
                        return Err(Error::BadParam(format!(
                            "URI `{}` must be a relative path without `.` or `..` components",
                            uri_map.uri.display()
                        )));
                    }
                    Component::Normal(_) => {}
                }
}
}

Ok(())
}
}

#[cfg(test)]
mod tests {
use std::io::Cursor;

use super::*;

const ZIP_SAMPLE1: &[u8] = include_bytes!("../../tests/fixtures/sample1.zip");

#[test]
fn test_zip_uri_gen() -> Result<()> {
let mut stream = Cursor::new(ZIP_SAMPLE1);

let mut collection = CollectionHash::new();
collection.gen_uris_from_zip_stream(&mut stream)?;

assert_eq!(
collection.uris.first(),
Some(&UriHashedDataMap {
uri: PathBuf::from("sample1/test1.txt"),
hash: Vec::new(),
size: Some(47),
dc_format: None,
data_types: None
})
);
assert_eq!(
collection.uris.get(1),
Some(&UriHashedDataMap {
uri: PathBuf::from("sample1/test1/test1.txt"),
hash: Vec::new(),
size: Some(57),
dc_format: None,
data_types: None
})
);
assert_eq!(
collection.uris.get(2),
Some(&UriHashedDataMap {
uri: PathBuf::from("sample1/test1/test2.txt"),
hash: Vec::new(),
size: Some(53),
dc_format: None,
data_types: None
})
);
assert_eq!(
collection.uris.get(3),
Some(&UriHashedDataMap {
uri: PathBuf::from("sample1/test1/test3.txt"),
hash: Vec::new(),
size: Some(68),
dc_format: None,
data_types: None
})
);
assert_eq!(
collection.uris.get(4),
Some(&UriHashedDataMap {
uri: PathBuf::from("sample1/test2.txt"),
hash: Vec::new(),
size: Some(56),
dc_format: None,
data_types: None
})
);
assert_eq!(collection.uris.len(), 5);

Ok(())
}

#[test]
fn test_zip_hash_gen() -> Result<()> {
// let mut stream = Cursor::new(ZIP_SAMPLE1);

// TODO: blocked by zip_io::central_directory_inclusions
// let mut collection = CollectionHash::new();
// collection.gen_uris_from_zip_stream(&mut stream)?;
// collection.gen_hash_from_zip_stream(&mut stream)?;

// TODO: assert central dir hash + uri map hashes

Ok(())
}
}
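For orientation, here is a minimal usage sketch of the new collection hash against a ZIP stream. It is not part of this diff: the `c2pa::assertions::CollectionHash` path and the `c2pa::Result` alias are assumed from the crate's existing re-exports, and the helper names and fixture path are illustrative only.

```rust
use std::{fs::File, io::BufReader};

use c2pa::{assertions::CollectionHash, Result};

// Build a collection hash for a ZIP-based asset.
fn generate(path: &str) -> Result<CollectionHash> {
    let mut stream = BufReader::new(File::open(path)?);

    let mut collection = CollectionHash::new();
    // Derive the URI list from the ZIP entries, then hash each entry and the
    // central directory itself (mirrors the sequence in the commented-out test).
    collection.gen_uris_from_zip_stream(&mut stream)?;
    collection.gen_hash_from_zip_stream(&mut stream)?;
    Ok(collection)
}

// Validate a previously generated collection hash against the same asset.
fn verify(collection: &CollectionHash, path: &str) -> Result<()> {
    let mut stream = BufReader::new(File::open(path)?);
    // Passing `None` falls back to the assertion's own `alg` field (sha256 by default).
    collection.verify_zip_stream_hash(&mut stream, None)
}
```

The same flow should apply to the other ZIP-based formats named in the PR title (EPUB, Office Open XML, Open Document, OpenXPS), since they share the ZIP container layout.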
3 changes: 3 additions & 0 deletions sdk/src/assertions/mod.rs
@@ -25,6 +25,9 @@
pub use box_hash::{BoxHash, BoxMap, C2PA_BOXHASH};
mod data_hash;
pub use data_hash::DataHash;

mod collection_hash;
pub use collection_hash::{CollectionHash, UriHashedDataMap};

mod creative_work;
pub use creative_work::CreativeWork;

1 change: 1 addition & 0 deletions sdk/src/asset_handlers/mod.rs
@@ -19,6 +19,7 @@
pub mod png_io;
pub mod riff_io;
pub mod svg_io;
pub mod tiff_io;
pub mod zip_io;

#[cfg(feature = "pdf")]
pub(crate) mod pdf;