From df5ab9c012c09bebac75e59c0830e62e139aa0f9 Mon Sep 17 00:00:00 2001 From: Jonas Hahnfeld Date: Fri, 2 May 2025 13:00:30 +0200 Subject: [PATCH 1/2] Add tests for compression algorithms --- compression/README.md | 7 +++++ compression/algorithms/README.md | 6 ++++ compression/algorithms/lz4/read.C | 6 ++++ compression/algorithms/lz4/write.C | 5 ++++ compression/algorithms/lzma/read.C | 6 ++++ compression/algorithms/lzma/write.C | 5 ++++ compression/algorithms/write_algorithm.hxx | 32 ++++++++++++++++++++++ compression/algorithms/zlib/read.C | 6 ++++ compression/algorithms/zlib/write.C | 5 ++++ compression/algorithms/zstd/read.C | 6 ++++ compression/algorithms/zstd/write.C | 5 ++++ compression/read_compression.hxx | 27 ++++++++++++++++++ 12 files changed, 116 insertions(+) create mode 100644 compression/README.md create mode 100644 compression/algorithms/README.md create mode 100644 compression/algorithms/lz4/read.C create mode 100644 compression/algorithms/lz4/write.C create mode 100644 compression/algorithms/lzma/read.C create mode 100644 compression/algorithms/lzma/write.C create mode 100644 compression/algorithms/write_algorithm.hxx create mode 100644 compression/algorithms/zlib/read.C create mode 100644 compression/algorithms/zlib/write.C create mode 100644 compression/algorithms/zstd/read.C create mode 100644 compression/algorithms/zstd/write.C create mode 100644 compression/read_compression.hxx diff --git a/compression/README.md b/compression/README.md new file mode 100644 index 0000000..e5753de --- /dev/null +++ b/compression/README.md @@ -0,0 +1,7 @@ +# Compression + +For the RNTuple Validation Suite, we assume that compression is orthogonal to the supported types and serialized data. +Therefore all tests in this category write a single `Int64` field with type `std::int64_t` and column type `SplitInt64`. +The entries have ascending values and the reference `.json` files only contain the sum of all elements. 
+ + * [`algorithms`](algorithms): `zlib`, `lzma`, `lz4`, `zstd` diff --git a/compression/algorithms/README.md b/compression/algorithms/README.md new file mode 100644 index 0000000..d336d56 --- /dev/null +++ b/compression/algorithms/README.md @@ -0,0 +1,6 @@ +# Compression Algorithms + + * [`zlib`](zlib): level 1 (`101`) + * [`lzma`](lzma): level 7 (`207`) + * [`lz4`](lz4): level 4 (`404`) + * [`zstd`](zstd): level 5 (`505`) diff --git a/compression/algorithms/lz4/read.C b/compression/algorithms/lz4/read.C new file mode 100644 index 0000000..8ae21eb --- /dev/null +++ b/compression/algorithms/lz4/read.C @@ -0,0 +1,6 @@ +#include "../../read_compression.hxx" + +void read(std::string_view input = "compression.algorithms.lz4.root", + std::string_view output = "compression.algorithms.lz4.json") { + read_compression(input, output); +} diff --git a/compression/algorithms/lz4/write.C b/compression/algorithms/lz4/write.C new file mode 100644 index 0000000..cd57d32 --- /dev/null +++ b/compression/algorithms/lz4/write.C @@ -0,0 +1,5 @@ +#include "../write_algorithm.hxx" + +void write(std::string_view filename = "compression.algorithms.lz4.root") { + write_algorithm(filename, 404); +} diff --git a/compression/algorithms/lzma/read.C b/compression/algorithms/lzma/read.C new file mode 100644 index 0000000..7a189bf --- /dev/null +++ b/compression/algorithms/lzma/read.C @@ -0,0 +1,6 @@ +#include "../../read_compression.hxx" + +void read(std::string_view input = "compression.algorithms.lzma.root", + std::string_view output = "compression.algorithms.lzma.json") { + read_compression(input, output); +} diff --git a/compression/algorithms/lzma/write.C b/compression/algorithms/lzma/write.C new file mode 100644 index 0000000..aa55668 --- /dev/null +++ b/compression/algorithms/lzma/write.C @@ -0,0 +1,5 @@ +#include "../write_algorithm.hxx" + +void write(std::string_view filename = "compression.algorithms.lzma.root") { + write_algorithm(filename, 207); +} diff --git 
a/compression/algorithms/write_algorithm.hxx b/compression/algorithms/write_algorithm.hxx new file mode 100644 index 0000000..177f64b --- /dev/null +++ b/compression/algorithms/write_algorithm.hxx @@ -0,0 +1,32 @@ +#include +#include +#include +#include + +using ROOT::Experimental::EColumnType; +using ROOT::Experimental::RNTupleModel; +using ROOT::Experimental::RNTupleWriteOptions; +using ROOT::Experimental::RNTupleWriter; + +#include +#include +#include + +void write_algorithm(std::string_view filename, std::uint32_t compression) { + auto model = RNTupleModel::Create(); + + auto Int64 = model->MakeField<std::int64_t>("Int64"); + model->GetMutableField("Int64").SetColumnRepresentatives( + {{EColumnType::kSplitInt64}}); + + RNTupleWriteOptions options; + options.SetCompression(compression); + auto writer = + RNTupleWriter::Recreate(std::move(model), "ntpl", filename, options); + + // Write 32 entries to make sure the compression block is not too small. + for (int i = 0; i < 32; i++) { + *Int64 = i; + writer->Fill(); + } +} diff --git a/compression/algorithms/zlib/read.C b/compression/algorithms/zlib/read.C new file mode 100644 index 0000000..3768b14 --- /dev/null +++ b/compression/algorithms/zlib/read.C @@ -0,0 +1,6 @@ +#include "../../read_compression.hxx" + +void read(std::string_view input = "compression.algorithms.zlib.root", + std::string_view output = "compression.algorithms.zlib.json") { + read_compression(input, output); +} diff --git a/compression/algorithms/zlib/write.C b/compression/algorithms/zlib/write.C new file mode 100644 index 0000000..684586f --- /dev/null +++ b/compression/algorithms/zlib/write.C @@ -0,0 +1,5 @@ +#include "../write_algorithm.hxx" + +void write(std::string_view filename = "compression.algorithms.zlib.root") { + write_algorithm(filename, 101); +} diff --git a/compression/algorithms/zstd/read.C b/compression/algorithms/zstd/read.C new file mode 100644 index 0000000..90b5eb2 --- /dev/null +++ b/compression/algorithms/zstd/read.C @@ -0,0 +1,6 @@ 
+#include "../../read_compression.hxx" + +void read(std::string_view input = "compression.algorithms.zstd.root", + std::string_view output = "compression.algorithms.zstd.json") { + read_compression(input, output); +} diff --git a/compression/algorithms/zstd/write.C b/compression/algorithms/zstd/write.C new file mode 100644 index 0000000..9bd2b1a --- /dev/null +++ b/compression/algorithms/zstd/write.C @@ -0,0 +1,5 @@ +#include "../write_algorithm.hxx" + +void write(std::string_view filename = "compression.algorithms.zstd.root") { + write_algorithm(filename, 505); +} diff --git a/compression/read_compression.hxx b/compression/read_compression.hxx new file mode 100644 index 0000000..141d031 --- /dev/null +++ b/compression/read_compression.hxx @@ -0,0 +1,27 @@ +#include +#include + +using ROOT::Experimental::REntry; +using ROOT::Experimental::RNTupleReader; + +#include +#include +#include +#include +#include + +void read_compression(std::string_view input, std::string_view output) { + auto reader = RNTupleReader::Open("ntpl", input); + auto Int64 = + reader->GetModel().GetDefaultEntry().GetPtr<std::int64_t>("Int64"); + std::int64_t sum = 0; + for (auto index : *reader) { + reader->LoadEntry(index); + sum += *Int64; + } + + std::ofstream os(std::string{output}); + os << "{\n"; + os << " \"Int64\": " << sum << "\n"; + os << "}\n"; +} From 127bdbebffe1c3544059f7ce137cb3eef8be58e9 Mon Sep 17 00:00:00 2001 From: Jonas Hahnfeld Date: Fri, 2 May 2025 13:33:15 +0200 Subject: [PATCH 2/2] Add tests for compression blocks --- compression/README.md | 1 + compression/block/README.md | 4 ++++ compression/block/big/read.C | 6 ++++++ compression/block/big/write.C | 38 +++++++++++++++++++++++++++++++++ compression/block/short/read.C | 6 ++++++ compression/block/short/write.C | 33 ++++++++++++++++++++++++++++ 6 files changed, 88 insertions(+) create mode 100644 compression/block/README.md create mode 100644 compression/block/big/read.C create mode 100644 compression/block/big/write.C create mode 
100644 compression/block/short/read.C create mode 100644 compression/block/short/write.C diff --git a/compression/README.md b/compression/README.md index e5753de..b3a45d9 100644 --- a/compression/README.md +++ b/compression/README.md @@ -5,3 +5,4 @@ Therefore all tests in this category write a single `Int64` field with type `std The entries have ascending values and the reference `.json` files only contain the sum of all elements. * [`algorithms`](algorithms): `zlib`, `lzma`, `lz4`, `zstd` + * [`block`](block): big and short compression blocks diff --git a/compression/block/README.md b/compression/block/README.md new file mode 100644 index 0000000..a3e2827 --- /dev/null +++ b/compression/block/README.md @@ -0,0 +1,4 @@ +# Compression Blocks + + * [`big`](big): big compression blocks, larger than 16 MiB + * [`short`](short): a "short" compression block that is actually stored uncompressed diff --git a/compression/block/big/read.C b/compression/block/big/read.C new file mode 100644 index 0000000..b981a81 --- /dev/null +++ b/compression/block/big/read.C @@ -0,0 +1,6 @@ +#include "../../read_compression.hxx" + +void read(std::string_view input = "compression.block.big.root", + std::string_view output = "compression.block.big.json") { + read_compression(input, output); +} diff --git a/compression/block/big/write.C b/compression/block/big/write.C new file mode 100644 index 0000000..79152da --- /dev/null +++ b/compression/block/big/write.C @@ -0,0 +1,38 @@ +#include +#include +#include +#include + +using ROOT::Experimental::EColumnType; +using ROOT::Experimental::RNTupleModel; +using ROOT::Experimental::RNTupleWriteOptions; +using ROOT::Experimental::RNTupleWriter; + +#include +#include +#include + +void write(std::string_view filename = "compression.block.big.root") { + auto model = RNTupleModel::Create(); + + auto Int64 = model->MakeField<std::int64_t>("Int64"); + model->GetMutableField("Int64").SetColumnRepresentatives( + {{EColumnType::kSplitInt64}}); + + RNTupleWriteOptions options; + // Crank 
up the zstd compression level to reduce the output file size by + // approximately a factor of 6 (from 76K with 505 to 12K). + options.SetCompression(509); + // Increase the maximum unzipped page size to make it bigger than the maximum + // size of a compression block, which is 16 MiB. + options.SetMaxUnzippedPageSize(128 * 1024 * 1024); + auto writer = + RNTupleWriter::Recreate(std::move(model), "ntpl", filename, options); + + // Write 40 MiB of entries that will be split into three compression blocks. + static constexpr int Entries = 40 * 1024 * 1024 / sizeof(std::int64_t); + for (int i = 0; i < Entries; i++) { + *Int64 = i; + writer->Fill(); + } +} diff --git a/compression/block/short/read.C b/compression/block/short/read.C new file mode 100644 index 0000000..ad9c2fb --- /dev/null +++ b/compression/block/short/read.C @@ -0,0 +1,6 @@ +#include "../../read_compression.hxx" + +void read(std::string_view input = "compression.block.short.root", + std::string_view output = "compression.block.short.json") { + read_compression(input, output); +} diff --git a/compression/block/short/write.C b/compression/block/short/write.C new file mode 100644 index 0000000..e262002 --- /dev/null +++ b/compression/block/short/write.C @@ -0,0 +1,33 @@ +#include +#include +#include +#include + +using ROOT::Experimental::EColumnType; +using ROOT::Experimental::RNTupleModel; +using ROOT::Experimental::RNTupleWriteOptions; +using ROOT::Experimental::RNTupleWriter; + +#include +#include +#include + +void write(std::string_view filename = "compression.block.short.root") { + auto model = RNTupleModel::Create(); + + auto Int64 = model->MakeField<std::int64_t>("Int64"); + model->GetMutableField("Int64").SetColumnRepresentatives( + {{EColumnType::kSplitInt64}}); + + RNTupleWriteOptions options; + options.SetCompression(505); + auto writer = + RNTupleWriter::Recreate(std::move(model), "ntpl", filename, options); + + // Write only 2 entries to make sure the compression block is small and + // actually stored 
uncompressed. + for (int i = 0; i < 2; i++) { + *Int64 = i; + writer->Fill(); + } +}