
Feature/add bsbm business intelligence #13

Merged · 4 commits · Jun 25, 2025

4 changes: 3 additions & 1 deletion .github/workflows/benchmark.yml
@@ -40,7 +40,9 @@ jobs:

- name: Prepare BSBM Benchmark
  working-directory: bench
  run: cargo run --bin rdf-fusion-bench --profile codspeed -- prepare bsbm --dataset-size 1000
  run: |
    cargo run --bin rdf-fusion-bench --profile codspeed -- prepare bsbm-explore --dataset-size 1000
    cargo run --bin rdf-fusion-bench --profile codspeed -- prepare bsbm-business-intelligence --dataset-size 1000

- name: Build the benchmark target(s)
  # Limiting the number of jobs is an attempt to not violate GitHub's runner resources.
6 changes: 5 additions & 1 deletion bench/Cargo.toml
@@ -38,5 +38,9 @@ codspeed-criterion-compat = { workspace = true, features = ["async_tokio"] }
workspace = true

[[bench]]
name = "bsbm"
name = "bsbm_explore"
harness = false

[[bench]]
name = "bsbm_business_intelligence"
harness = false
420 changes: 420 additions & 0 deletions bench/benches/bsbm_business_intelligence.rs

Large diffs are not rendered by default.
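
Since the new 420-line bench target is not rendered here, the following is a minimal, hypothetical sketch of what a harness = false bench entry point wired to codspeed-criterion-compat (with the async_tokio feature declared in bench/Cargo.toml) typically looks like. The function and benchmark names are assumptions for illustration, not the PR's actual code.

```rust
use codspeed_criterion_compat::{criterion_group, criterion_main, Criterion};

// Illustrative benchmark function; the real setup and queries live in the unrendered file.
fn bsbm_business_intelligence(c: &mut Criterion) {
    let runtime = tokio::runtime::Runtime::new().expect("create Tokio runtime");
    c.bench_function("bsbm_business_intelligence", |b| {
        // Drive async store operations through the Tokio runtime, as enabled by `async_tokio`.
        b.to_async(&runtime).iter(|| async {
            // Load the prepared dataset and evaluate one pre-generated BI query here.
        });
    });
}

criterion_group!(benches, bsbm_business_intelligence);
criterion_main!(benches);
```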

File renamed without changes.
235 changes: 235 additions & 0 deletions bench/src/benchmarks/bsbm/business_intelligence/benchmark.rs
@@ -0,0 +1,235 @@
use crate::benchmarks::bsbm::business_intelligence::operation::{
    list_raw_operations, BsbmBusinessIntelligenceOperation, BsbmBusinessIntelligenceRawOperation,
};
use crate::benchmarks::bsbm::business_intelligence::report::{
    BusinessIntelligenceReport, BusinessIntelligenceReportBuilder,
};
use crate::benchmarks::bsbm::BsbmDatasetSize;
use crate::benchmarks::{Benchmark, BenchmarkName};
use crate::environment::{BenchmarkContext, RdfFusionBenchContext};
use crate::prepare::{ArchiveType, FileDownloadAction, PrepRequirement};
use crate::report::BenchmarkReport;
use crate::runs::BenchmarkRun;
use async_trait::async_trait;
use futures::StreamExt;
use rdf_fusion::io::RdfFormat;
use rdf_fusion::store::Store;
use rdf_fusion::{Query, QueryOptions, QueryResults};
use reqwest::Url;
use std::fs;
use std::fs::File;
use std::path::PathBuf;
use tokio::time::Instant;

/// The [Berlin SPARQL Benchmark](http://wbsg.informatik.uni-mannheim.de/bizer/berlinsparqlbenchmark/)
/// is a widely adopted benchmark built around an e-commerce use case.
///
/// This version of the benchmark uses the [pre-prepared datasets](https://zenodo.org/records/12663333)
/// from Oxigraph.
pub struct BsbmBusinessIntelligenceBenchmark {
    name: BenchmarkName,
    dataset_size: BsbmDatasetSize,
    max_query_count: Option<u64>,
}

impl BsbmBusinessIntelligenceBenchmark {
    /// Creates a new [BsbmBusinessIntelligenceBenchmark] with the given sizes.
    pub fn new(dataset_size: BsbmDatasetSize, max_query_count: Option<u64>) -> Self {
        let name = BenchmarkName::BsbmBusinessIntelligence {
            dataset_size,
            max_query_count,
        };
        Self {
            name,
            dataset_size,
            max_query_count,
        }
    }

    /// The BSBM also generates many queries that are tailored to the generated data. This method
    /// returns a list of queries that should be executed during this run.
    fn list_operations(
        &self,
        env: &RdfFusionBenchContext,
    ) -> anyhow::Result<Vec<BsbmBusinessIntelligenceOperation>> {
        println!("Loading queries ...");

        let queries_path = env.join_data_dir(
            PathBuf::from(format!("businessIntelligence-{}.csv", self.dataset_size)).as_path(),
        )?;
        let result = match self.max_query_count {
            None => list_raw_operations(&queries_path)?
                .filter_map(parse_query)
                .collect(),
            Some(max_query_count) => list_raw_operations(&queries_path)?
                .filter_map(parse_query)
                .take(usize::try_from(max_query_count)?)
                .collect(),
        };

        println!("Queries loaded.");
        Ok(result)
    }

    async fn prepare_store(&self, bench_context: &BenchmarkContext<'_>) -> anyhow::Result<Store> {
        println!("Creating in-memory store and loading data ...");
        let data_path = bench_context
            .parent()
            .join_data_dir(PathBuf::from(format!("dataset-{}.nt", self.dataset_size)).as_path())?;
        let data = fs::read(data_path)?;
        let memory_store = Store::new();
        memory_store
            .load_from_reader(RdfFormat::NTriples, data.as_slice())
            .await?;
        println!("Store created and data loaded.");
        Ok(memory_store)
    }
}

#[async_trait]
impl Benchmark for BsbmBusinessIntelligenceBenchmark {
    fn name(&self) -> BenchmarkName {
        self.name
    }

    #[allow(clippy::expect_used)]
    fn requirements(&self) -> Vec<PrepRequirement> {
        let dataset_size = self.dataset_size;
        let download_bsbm_tools = PrepRequirement::FileDownload {
            url: Url::parse("https://github.com/Tpt/bsbm-tools/archive/59d0a8a605b26f21506789fa1a713beb5abf1cab.zip")
                .expect("parse dataset-name"),
            file_name: PathBuf::from("bsbmtools"),
            action: Some(FileDownloadAction::Unpack(ArchiveType::Zip)),
        };
        let generate_dataset = PrepRequirement::RunCommand {
            workdir: PathBuf::from("./bsbmtools"),
            program: "./generate".to_owned(),
            args: vec![
                "-fc".to_owned(),
                "-pc".to_owned(),
                format!("{}", dataset_size),
                "-dir".to_owned(),
                "../td_data".to_owned(),
                "-fn".to_owned(),
                format!("../dataset-{}", dataset_size),
            ],
            check_requirement: Box::new(move || {
                let exists = File::open(format!("./data/dataset-{dataset_size}.nt")).is_ok();
                Ok(exists)
            }),
        };
        let download_pregenerated_queries = PrepRequirement::FileDownload {
            url: Url::parse(
                "https://zenodo.org/records/12663333/files/businessIntelligence-1000.csv.bz2",
            )
            .expect("parse dataset-name"),
            file_name: PathBuf::from("businessIntelligence-1000.csv"),
            action: Some(FileDownloadAction::Unpack(ArchiveType::Bz2)),
        };

        vec![
            download_bsbm_tools,
            generate_dataset,
            download_pregenerated_queries,
        ]
    }

    async fn execute(
        &self,
        bench_context: &BenchmarkContext<'_>,
    ) -> anyhow::Result<Box<dyn BenchmarkReport>> {
        let operations = self.list_operations(bench_context.parent())?;
        let memory_store = self.prepare_store(bench_context).await?;
        let report = execute_benchmark(bench_context, operations, &memory_store).await?;
        Ok(Box::new(report))
    }
}

fn parse_query(
    query: BsbmBusinessIntelligenceRawOperation,
) -> Option<BsbmBusinessIntelligenceOperation> {
    match query {
        BsbmBusinessIntelligenceRawOperation::Query(name, query) => {
            // TODO remove once describe is supported
            if query.contains("DESCRIBE") {
                None
            } else {
                Some(BsbmBusinessIntelligenceOperation::Query(
                    name,
                    Query::parse(&query.replace('#', ""), None).unwrap(),
                ))
            }
        }
    }
}

async fn execute_benchmark(
    context: &BenchmarkContext<'_>,
    operations: Vec<BsbmBusinessIntelligenceOperation>,
    memory_store: &Store,
) -> anyhow::Result<BusinessIntelligenceReport> {
    println!("Evaluating queries ...");

    let mut report = BusinessIntelligenceReportBuilder::new();
    let len = operations.len();
    for (idx, operation) in operations.iter().enumerate() {
        if idx % 25 == 0 {
            println!("Progress: {idx}/{len}");
        }

        run_operation(context, &mut report, memory_store, operation).await?;
    }
    let report = report.build();

    println!("Progress: {len}/{len}");
    println!("All queries evaluated.");

    Ok(report)
}

/// Executes a single [BsbmBusinessIntelligenceOperation], profiles the execution, and stores the
/// results of the profiling in the `report`.
async fn run_operation(
    context: &BenchmarkContext<'_>,
    report: &mut BusinessIntelligenceReportBuilder,
    store: &Store,
    operation: &BsbmBusinessIntelligenceOperation,
) -> anyhow::Result<()> {
    let guard = pprof::ProfilerGuardBuilder::default()
        .frequency(1000)
        .blocklist(&["libc", "libgcc", "pthread", "vdso"])
        .build()?;
    let start = Instant::now();

    let options = QueryOptions;
    let (name, explanation) = match operation {
        BsbmBusinessIntelligenceOperation::Query(name, q) => {
            let (result, explanation) = store.explain_query_opt(q.clone(), options.clone()).await?;
            match result {
                QueryResults::Boolean(_) => (),
                QueryResults::Solutions(mut s) => {
                    while let Some(s) = s.next().await {
                        s?;
                    }
                }
                QueryResults::Graph(mut g) => {
                    while let Some(t) = g.next().await {
                        t?;
                    }
                }
            }
            (*name, explanation)
        }
    };

    let run = BenchmarkRun {
        duration: start.elapsed(),
        report: Some(guard.report().build()?),
    };
    report.add_run(name, run);
    if context.parent().options().verbose_results {
        report.add_explanation(explanation);
    }

    Ok(())
}
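
As a note on the profiling above: the pprof report captured for each run can also be rendered as a flamegraph. The following is a minimal sketch under the assumption that the pprof crate's "flamegraph" feature is enabled; the helper name and output path are illustrative and not part of this PR.

```rust
use std::fs::File;

// Illustrative helper: render a captured pprof report as an SVG flamegraph.
fn write_flamegraph(report: &pprof::Report) -> anyhow::Result<()> {
    let file = File::create("profile.svg")?; // hypothetical output path
    report.flamegraph(file)?;
    Ok(())
}
```
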
69 changes: 69 additions & 0 deletions bench/src/benchmarks/bsbm/business_intelligence/mod.rs
@@ -0,0 +1,69 @@
mod benchmark;
mod operation;
mod report;

use clap::ValueEnum;
use std::fmt::{Display, Formatter};

pub use benchmark::BsbmBusinessIntelligenceBenchmark;

pub(super) const BSBM_BUSINESS_INTELLIGENCE_QUERIES: [BsbmBusinessIntelligenceQueryName; 8] = [
    BsbmBusinessIntelligenceQueryName::Q1,
    BsbmBusinessIntelligenceQueryName::Q2,
    BsbmBusinessIntelligenceQueryName::Q3,
    BsbmBusinessIntelligenceQueryName::Q4,
    BsbmBusinessIntelligenceQueryName::Q5,
    BsbmBusinessIntelligenceQueryName::Q6,
    BsbmBusinessIntelligenceQueryName::Q7,
    BsbmBusinessIntelligenceQueryName::Q8,
];

/// The BSBM business intelligence query names.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, ValueEnum)]
pub(super) enum BsbmBusinessIntelligenceQueryName {
    Q1,
    Q2,
    Q3,
    Q4,
    Q5,
    Q6,
    Q7,
    Q8,
}

impl TryFrom<u8> for BsbmBusinessIntelligenceQueryName {
    type Error = anyhow::Error;

    fn try_from(value: u8) -> Result<Self, Self::Error> {
        match value {
            1 => Ok(BsbmBusinessIntelligenceQueryName::Q1),
            2 => Ok(BsbmBusinessIntelligenceQueryName::Q2),
            3 => Ok(BsbmBusinessIntelligenceQueryName::Q3),
            4 => Ok(BsbmBusinessIntelligenceQueryName::Q4),
            5 => Ok(BsbmBusinessIntelligenceQueryName::Q5),
            6 => Ok(BsbmBusinessIntelligenceQueryName::Q6),
            7 => Ok(BsbmBusinessIntelligenceQueryName::Q7),
            8 => Ok(BsbmBusinessIntelligenceQueryName::Q8),
            _ => Err(anyhow::anyhow!(
                "Invalid BSBM Business Intelligence query name: {}",
                value
            )),
        }
    }
}

impl Display for BsbmBusinessIntelligenceQueryName {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let string = match self {
            BsbmBusinessIntelligenceQueryName::Q1 => "Q1",
            BsbmBusinessIntelligenceQueryName::Q2 => "Q2",
            BsbmBusinessIntelligenceQueryName::Q3 => "Q3",
            BsbmBusinessIntelligenceQueryName::Q4 => "Q4",
            BsbmBusinessIntelligenceQueryName::Q5 => "Q5",
            BsbmBusinessIntelligenceQueryName::Q6 => "Q6",
            BsbmBusinessIntelligenceQueryName::Q7 => "Q7",
            BsbmBusinessIntelligenceQueryName::Q8 => "Q8",
        };
        write!(f, "{string}")
    }
}
42 changes: 42 additions & 0 deletions bench/src/benchmarks/bsbm/business_intelligence/operation.rs
@@ -0,0 +1,42 @@
use crate::benchmarks::bsbm::business_intelligence::BsbmBusinessIntelligenceQueryName;
use rdf_fusion::Query;
use std::fs;
use std::path::Path;

#[allow(clippy::panic)]
#[allow(clippy::panic_in_result_fn)]
#[allow(clippy::expect_used)]
pub(super) fn list_raw_operations(
    path: &Path,
) -> anyhow::Result<impl Iterator<Item = BsbmBusinessIntelligenceRawOperation>> {
    let reader = fs::read(path)?;
    let result = csv::Reader::from_reader(reader.as_slice())
        .records()
        .collect::<Result<Vec<_>, _>>()?
        .into_iter()
        .map(|record| {
            let query_id = record[0].parse::<u8>().expect("Can't parse query id");
            let query_name =
                BsbmBusinessIntelligenceQueryName::try_from(query_id).expect("Invalid query id");

            match &record[1] {
                "query" => {
                    BsbmBusinessIntelligenceRawOperation::Query(query_name, record[2].into())
                }
                _ => panic!("Unexpected operation kind {}", &record[1]),
            }
        });
    Ok(result)
}

#[allow(dead_code)]
#[derive(Clone)]
pub(super) enum BsbmBusinessIntelligenceRawOperation {
    Query(BsbmBusinessIntelligenceQueryName, String),
}

#[allow(clippy::large_enum_variant, clippy::allow_attributes)]
#[derive(Clone)]
pub(super) enum BsbmBusinessIntelligenceOperation {
    Query(BsbmBusinessIntelligenceQueryName, Query),
}
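
For reference, list_raw_operations expects each CSV record to contain the query id, the operation kind, and the query text, in that order. Below is a small, self-contained sketch of the same parsing pattern on an in-memory record; the header and row contents are illustrative and not taken from the actual pre-generated file.

```rust
use csv::Reader;

fn main() -> anyhow::Result<()> {
    // Illustrative data: a header row followed by a single "query" operation.
    let data = "id,kind,query\n1,query,SELECT * WHERE { ?s ?p ?o } LIMIT 10\n";
    let mut reader = Reader::from_reader(data.as_bytes());
    for record in reader.records() {
        let record = record?;
        let query_id = record[0].parse::<u8>()?;
        println!("Q{query_id} ({}): {}", &record[1], &record[2]);
    }
    Ok(())
}
```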