Skip to content

Preliminary benchmarking framework #1463

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 25 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
cb2394d
feat: flag-gated benchmarking storage for fake latency
kyteware Jun 19, 2025
af20743
feat: first benchmark
kyteware Jun 19, 2025
5e5858e
chore: remove unused imports
kyteware Jun 19, 2025
1d79162
chore: cargo fmt
kyteware Jun 19, 2025
4182bd8
chore: cargo fmt
kyteware Jun 19, 2025
ceb9d15
ref: clarify naming of benchmark
kyteware Jun 19, 2025
1b3dab3
fix: change confidence interval
kyteware Jun 19, 2025
56333bf
fix: include needed sqlx features for sqlite
kyteware Jun 20, 2025
7bdcf44
chore: rename artifact of old benchmarks crate name
kyteware Jun 20, 2025
62e5b57
ref: rename the first benchmark to a more descriptive name
kyteware Jun 22, 2025
d0f40a6
feat: make latency static and use it for writes as well
kyteware Jun 22, 2025
ad63b89
doc: document reason for python script
kyteware Jun 22, 2025
97df541
ref: move dep documentation to a req.txt file
kyteware Jun 22, 2025
91a1d0a
doc: clarify sqlite usage
kyteware Jun 22, 2025
3335921
chore: add newline to gitignore
kyteware Jun 22, 2025
34d2ad2
doc: clarify benchmarking readme
kyteware Jun 22, 2025
fa1273f
doc: clarify iter_custom usage
kyteware Jun 22, 2025
4c73001
doc: include reasoning for benchmarking storage
kyteware Jun 22, 2025
f8a1e79
fix: lower verison of sqlx in benchmarking for lockfile
kyteware Jun 22, 2025
f76809b
chore: add license to requirements.txt
kyteware Jun 22, 2025
5f8cb97
fix: dont run benchmark if benchmarking flag isn't enabled
kyteware Jun 22, 2025
9a52a9e
chore: turn off warning for benchmarking cfg
kyteware Jun 22, 2025
9f12ba9
chore: cargo fmt
kyteware Jun 22, 2025
3cab046
chore: changes to pass cargo clippy
kyteware Jun 23, 2025
a529afd
chore: ignore sqlx in cargo machete because we only use it to bumb th…
kyteware Jun 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
[workspace]
exclude = ["bindings/python"]
members = [
"crates/benchmarks",
"crates/catalog/*",
"crates/examples",
"crates/iceberg",
Expand Down Expand Up @@ -76,6 +77,7 @@ http = "1.2"
iceberg = { version = "0.5.1", path = "./crates/iceberg" }
iceberg-catalog-memory = { version = "0.5.1", path = "./crates/catalog/memory" }
iceberg-catalog-rest = { version = "0.5.1", path = "./crates/catalog/rest" }
iceberg-catalog-sql = { version = "0.5.1", path = "./crates/catalog/sql" }
iceberg-datafusion = { version = "0.5.1", path = "./crates/integrations/datafusion" }
indicatif = "0.17"
itertools = "0.13"
Expand Down
18 changes: 18 additions & 0 deletions crates/benchmarks/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

src/construction_scripts/__pycache__/
51 changes: 51 additions & 0 deletions crates/benchmarks/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

[package]
name = "benchmarks"
edition.workspace = true
homepage.workspace = true
version.workspace = true
license.workspace = true
repository.workspace = true
rust-version.workspace = true

[dependencies]
iceberg = { workspace = true }
iceberg-catalog-sql = { workspace = true }
rand = { workspace = true }
walkdir = "2.5.0"
tokio = { features = ["rt-multi-thread"], workspace = true }
bytes = { workspace = true }
futures = { workspace = true }
arrow-array.workspace = true
sqlx = { version = "0.8.1", features = ["any", "runtime-tokio", "sqlite"] }

[dev-dependencies]
criterion = { version = "0.6.0", features = ["async_tokio"] }

[[bench]]
name = "sql-catalog-projection-once"
harness = false

[lints.rust]
# so the #[cfg(benchmarking)] tag doesn't create warnings
unexpected_cfgs = { level = "warn", check-cfg = ['cfg(benchmarking)'] }

# this is done so we can get sqlite feature support without a warning
[package.metadata.cargo-machete]
ignored = ["sqlx"]
26 changes: 26 additions & 0 deletions crates/benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<!-- Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License. -->

Where `iceberg-rust` benchmarks are kept.

To run the benchmarks, we must pass the `benchmarking` rustflag so that the benchmarking storage is compiled and we can use a storage with a predictable and reliable latency.

This command includes it: `RUSTFLAGS="--cfg benchmarking" cargo bench`

## Existing benchmarks

- `sql-catalog-projection-once`: Benchmarks scanning a table (we use a month of NYC taxicab data) and projecting two columns.
112 changes: 112 additions & 0 deletions crates/benchmarks/benches/sql-catalog-projection-once.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::hint::black_box;
use std::path::PathBuf;
use std::time::{Duration, Instant};

use arrow_array::RecordBatch;
use benchmarks::{construct_table, copy_dir_to_fileio};
use criterion::{Criterion, criterion_group, criterion_main};
use futures::TryStreamExt;
use iceberg::io::FileIOBuilder;
use iceberg::table::Table;
use iceberg::{Catalog, TableIdent};
use iceberg_catalog_sql::{SqlBindStyle, SqlCatalog, SqlCatalogConfig};
use tokio::runtime::Runtime;

#[allow(unused_variables)]
pub fn bench_sql_catalog_projection_once(c: &mut Criterion) {
#[cfg(not(benchmarking))]
{
eprintln!(r#"benchmarking flag not enabled, must pass RUSTFLAGS="--cfg benchmarking""#);
return;
}

#[allow(unreachable_code)]
let mut group = c.benchmark_group("sql_catalog_projection_once");
group
.measurement_time(Duration::from_secs(20))
.sample_size(50);

let table_dir = construct_table("sql-catalog-taxicab");
let mut db_path = table_dir.clone();
db_path.push("benchmarking-catalog.db");
let uri = format!("sqlite:{}", db_path.to_str().unwrap());

group.bench_function(
"sql_catalog_projection_once",
// an issue with criterion (https://github.com/bheisler/criterion.rs/issues/751) means we can't do a normal benchmark here,
// it doesn't let use provide an async setup function to build `FileIO`, etc. instead, we have to use `iter_custom` and do
// the measurements ourselves
|b| {
b.to_async(Runtime::new().unwrap()).iter_custom(async |n| {
let mut total_elapsed = Duration::default();

for _ in 0..n {
let table = setup_table(table_dir.clone(), uri.clone()).await;

let start = Instant::now();
let output = scan_table(black_box(table)).await;
let dur = start.elapsed();

drop(black_box(output));
total_elapsed += dur;
}

total_elapsed
})
},
);
group.finish()
}

async fn setup_table(table_dir: PathBuf, uri: String) -> Table {
let file_io = FileIOBuilder::new("iceberg_benchmarking_storage")
.build()
.unwrap();
copy_dir_to_fileio(table_dir.clone(), &file_io).await;

let config = SqlCatalogConfig::builder()
.file_io(file_io)
.uri(format!("sqlite:{uri}"))
.name("default".to_owned())
.sql_bind_style(SqlBindStyle::QMark)
.warehouse_location(table_dir.to_str().unwrap().to_owned())
.build();
let catalog = SqlCatalog::new(config).await.unwrap();
catalog
.load_table(&TableIdent::from_strs(["default", "taxi_dataset"]).unwrap())
.await
.unwrap()
}

async fn scan_table(table: Table) -> Vec<RecordBatch> {
let stream = table
.scan()
.select(["passenger_count", "fare_amount"])
.build()
.unwrap()
.to_arrow()
.await
.unwrap();

stream.try_collect().await.unwrap()
}

criterion_group!(benches, bench_sql_catalog_projection_once);
criterion_main!(benches);
19 changes: 19 additions & 0 deletions crates/benchmarks/src/construction_scripts/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

pyiceberg[sql-sqlite]
pyarrow
60 changes: 60 additions & 0 deletions crates/benchmarks/src/construction_scripts/sql-catalog-taxicab.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# reason for existence explained in ../lib.rs

from pathlib import Path
from sys import argv
from urllib.request import urlretrieve
import sqlite3

from pyiceberg.catalog import load_catalog
from pyarrow import parquet

working_dir = Path(argv[1])

warehouse_path = working_dir
catalog = load_catalog(
"default",
**{
'type': 'sql',
"uri": f"sqlite:///{warehouse_path}/benchmarking-catalog.db",
"warehouse": f"file://{warehouse_path}",
},
)

data_file_path = working_dir / "yellow_tripdata_2023-01.parquet"
urlretrieve("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet", data_file_path)

df = parquet.read_table(data_file_path)

catalog.create_namespace("default")

table = catalog.create_table(
"default.taxi_dataset",
schema=df.schema,
)

table.append(df)


# the rust sql catalog needs an extra column for each table that pyiceberg doesn't include, this adds it
conn = sqlite3.connect(f"{warehouse_path}/benchmarking-catalog.db")
c = conn.cursor()
c.execute("ALTER TABLE iceberg_tables ADD column iceberg_type VARCHAR(5) DEFAULT 'TABLE'")
conn.commit()
c.close()
74 changes: 74 additions & 0 deletions crates/benchmarks/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::env::{set_current_dir, temp_dir};
use std::fs::{create_dir_all, read};
use std::path::PathBuf;
use std::process;
use std::str::FromStr;

use bytes::Bytes;
use iceberg::io::FileIO;
use rand::{RngCore, thread_rng};
use walkdir::WalkDir;

/// As of now, write support for tables is not complete for iceberg rust, so we can't construct
/// tables for benchmarking ourselves.
///
/// As a temporary (and admittedly ugly) measure, we run a python script to generate the tables
/// in a temp directory instead and then migrate all files created to a memory-storage `FileIO`.
/// When write support is complete, we can easily swap out the Python scripts and do this natively.
pub fn construct_table(table_name: &str) -> PathBuf {
let script_filename = [table_name, "py"].join(".");
let mut script_path = PathBuf::from_str(env!("CARGO_MANIFEST_DIR")).unwrap();
script_path.push("src");
script_path.push("construction_scripts");

set_current_dir(&script_path).unwrap();

script_path.push(script_filename);

// should look like /tmp/iceberg_benchmark_tables/<script_name>-<random number>>
let mut working_dir = temp_dir();
working_dir.push("iceberg_benchmark_tables");
working_dir.push([table_name, &thread_rng().next_u64().to_string()].join("-"));

create_dir_all(&working_dir).unwrap();

process::Command::new("python")
.arg(script_path.to_str().unwrap())
.arg(working_dir.to_str().unwrap())
.spawn()
.unwrap()
.wait()
.unwrap();

working_dir
}

pub async fn copy_dir_to_fileio(dir: PathBuf, fileio: &FileIO) {
for entry in WalkDir::new(dir)
.into_iter()
.map(|res| res.unwrap())
.filter(|entry| entry.file_type().is_file())
{
let path = entry.into_path();
let output = fileio.new_output(path.to_str().unwrap()).unwrap();
let bytes = Bytes::from(read(path).unwrap());
output.write(bytes).await.unwrap();
}
}
4 changes: 4 additions & 0 deletions crates/iceberg/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,7 @@ tera = { workspace = true }
[package.metadata.cargo-machete]
# These dependencies are added to ensure minimal dependency version
ignored = ["tap"]

[lints.rust]
# so the #[cfg(benchmarking)] tag doesn't create warnings
unexpected_cfgs = { level = "warn", check-cfg = ['cfg(benchmarking)'] }
Loading
Loading