
Feature/add bsbm business intelligence #13

Merged · 4 commits · Jun 25, 2025

4 changes: 3 additions & 1 deletion .github/workflows/benchmark.yml
@@ -40,7 +40,9 @@ jobs:

- name: Prepare BSBM Benchmark
  working-directory: bench
  run: cargo run --bin rdf-fusion-bench --profile codspeed -- prepare bsbm --dataset-size 1000
  run: |
    cargo run --bin rdf-fusion-bench --profile codspeed -- prepare bsbm-explore --dataset-size 1000
    cargo run --bin rdf-fusion-bench --profile codspeed -- prepare bsbm-business-intelligence --dataset-size 1000

- name: Build the benchmark target(s)
  # Limiting the number of jobs is an attempt to not violate GitHub's runner resources.
6 changes: 5 additions & 1 deletion bench/Cargo.toml
@@ -38,5 +38,9 @@ codspeed-criterion-compat = { workspace = true, features = ["async_tokio"] }
workspace = true

[[bench]]
name = "bsbm"
name = "bsbm_explore"
harness = false

[[bench]]
name = "bsbm_business_intelligence"
harness = false
420 changes: 420 additions & 0 deletions bench/benches/bsbm_business_intelligence.rs

Large diffs are not rendered by default.
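
Since the new 420-line bench target is not rendered here, the following is a minimal, hypothetical sketch of what a harness = false bench entry point wired to codspeed-criterion-compat (with the async_tokio feature declared in bench/Cargo.toml) typically looks like. The function and benchmark names are assumptions for illustration, not the PR's actual code.

```rust
use codspeed_criterion_compat::{criterion_group, criterion_main, Criterion};

// Illustrative benchmark function; the real setup and queries live in the unrendered file.
fn bsbm_business_intelligence(c: &mut Criterion) {
    let runtime = tokio::runtime::Runtime::new().expect("create Tokio runtime");
    c.bench_function("bsbm_business_intelligence", |b| {
        // Drive async store operations through the Tokio runtime, as enabled by `async_tokio`.
        b.to_async(&runtime).iter(|| async {
            // Load the prepared dataset and evaluate one pre-generated BI query here.
        });
    });
}

criterion_group!(benches, bsbm_business_intelligence);
criterion_main!(benches);
```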

File renamed without changes.
235 changes: 235 additions & 0 deletions bench/src/benchmarks/bsbm/business_intelligence/benchmark.rs
@@ -0,0 +1,235 @@
use crate::benchmarks::bsbm::business_intelligence::operation::{
    list_raw_operations, BsbmBusinessIntelligenceOperation, BsbmBusinessIntelligenceRawOperation,
};
use crate::benchmarks::bsbm::business_intelligence::report::{
    BusinessIntelligenceReport, BusinessIntelligenceReportBuilder,
};
use crate::benchmarks::bsbm::BsbmDatasetSize;
use crate::benchmarks::{Benchmark, BenchmarkName};
use crate::environment::{BenchmarkContext, RdfFusionBenchContext};
use crate::prepare::{ArchiveType, FileDownloadAction, PrepRequirement};
use crate::report::BenchmarkReport;
use crate::runs::BenchmarkRun;
use async_trait::async_trait;
use futures::StreamExt;
use rdf_fusion::io::RdfFormat;
use rdf_fusion::store::Store;
use rdf_fusion::{Query, QueryOptions, QueryResults};
use reqwest::Url;
use std::fs;
use std::fs::File;
use std::path::PathBuf;
use tokio::time::Instant;

/// The [Berlin SPARQL Benchmark](http://wbsg.informatik.uni-mannheim.de/bizer/berlinsparqlbenchmark/)
/// is a widely adopted benchmark built around an e-commerce use case.
///
/// This version of the benchmark uses the [pre-prepared datasets](https://zenodo.org/records/12663333)
/// from Oxigraph.
pub struct BsbmBusinessIntelligenceBenchmark {
    name: BenchmarkName,
    dataset_size: BsbmDatasetSize,
    max_query_count: Option<u64>,
}

impl BsbmBusinessIntelligenceBenchmark {
    /// Creates a new [BsbmBusinessIntelligenceBenchmark] with the given sizes.
    pub fn new(dataset_size: BsbmDatasetSize, max_query_count: Option<u64>) -> Self {
        let name = BenchmarkName::BsbmBusinessIntelligence {
            dataset_size,
            max_query_count,
        };
        Self {
            name,
            dataset_size,
            max_query_count,
        }
    }

    /// The BSBM also generates many queries that are tailored to the generated data. This method
    /// returns a list of queries that should be executed during this run.
    fn list_operations(
        &self,
        env: &RdfFusionBenchContext,
    ) -> anyhow::Result<Vec<BsbmBusinessIntelligenceOperation>> {
        println!("Loading queries ...");

        let queries_path = env.join_data_dir(
            PathBuf::from(format!("businessIntelligence-{}.csv", self.dataset_size)).as_path(),
        )?;
        let result = match self.max_query_count {
            None => list_raw_operations(&queries_path)?
                .filter_map(parse_query)
                .collect(),
            Some(max_query_count) => list_raw_operations(&queries_path)?
                .filter_map(parse_query)
                .take(usize::try_from(max_query_count)?)
                .collect(),
        };

        println!("Queries loaded.");
        Ok(result)
    }

    async fn prepare_store(&self, bench_context: &BenchmarkContext<'_>) -> anyhow::Result<Store> {
        println!("Creating in-memory store and loading data ...");
        let data_path = bench_context
            .parent()
            .join_data_dir(PathBuf::from(format!("dataset-{}.nt", self.dataset_size)).as_path())?;
        let data = fs::read(data_path)?;
        let memory_store = Store::new();
        memory_store
            .load_from_reader(RdfFormat::NTriples, data.as_slice())
            .await?;
        println!("Store created and data loaded.");
        Ok(memory_store)
    }
}

#[async_trait]
impl Benchmark for BsbmBusinessIntelligenceBenchmark {
    fn name(&self) -> BenchmarkName {
        self.name
    }

    #[allow(clippy::expect_used)]
    fn requirements(&self) -> Vec<PrepRequirement> {
        let dataset_size = self.dataset_size;
        let download_bsbm_tools = PrepRequirement::FileDownload {
            url: Url::parse("https://github.com/Tpt/bsbm-tools/archive/59d0a8a605b26f21506789fa1a713beb5abf1cab.zip")
                .expect("parse dataset-name"),
            file_name: PathBuf::from("bsbmtools"),
            action: Some(FileDownloadAction::Unpack(ArchiveType::Zip)),
        };
        let generate_dataset = PrepRequirement::RunCommand {
            workdir: PathBuf::from("./bsbmtools"),
            program: "./generate".to_owned(),
            args: vec![
                "-fc".to_owned(),
                "-pc".to_owned(),
                format!("{}", dataset_size),
                "-dir".to_owned(),
                "../td_data".to_owned(),
                "-fn".to_owned(),
                format!("../dataset-{}", dataset_size),
            ],
            check_requirement: Box::new(move || {
                let exists = File::open(format!("./data/dataset-{dataset_size}.nt")).is_ok();
                Ok(exists)
            }),
        };
        let download_pregenerated_queries = PrepRequirement::FileDownload {
            url: Url::parse(
                "https://zenodo.org/records/12663333/files/businessIntelligence-1000.csv.bz2",
            )
            .expect("parse dataset-name"),
            file_name: PathBuf::from("businessIntelligence-1000.csv"),
            action: Some(FileDownloadAction::Unpack(ArchiveType::Bz2)),
        };

        vec![
            download_bsbm_tools,
            generate_dataset,
            download_pregenerated_queries,
        ]
    }

    async fn execute(
        &self,
        bench_context: &BenchmarkContext<'_>,
    ) -> anyhow::Result<Box<dyn BenchmarkReport>> {
        let operations = self.list_operations(bench_context.parent())?;
        let memory_store = self.prepare_store(bench_context).await?;
        let report = execute_benchmark(bench_context, operations, &memory_store).await?;
        Ok(Box::new(report))
    }
}

fn parse_query(
    query: BsbmBusinessIntelligenceRawOperation,
) -> Option<BsbmBusinessIntelligenceOperation> {
    match query {
        BsbmBusinessIntelligenceRawOperation::Query(name, query) => {
            // TODO remove once describe is supported
            if query.contains("DESCRIBE") {
                None
            } else {
                Some(BsbmBusinessIntelligenceOperation::Query(
                    name,
                    Query::parse(&query.replace('#', ""), None).unwrap(),
                ))
            }
        }
    }
}

async fn execute_benchmark(
    context: &BenchmarkContext<'_>,
    operations: Vec<BsbmBusinessIntelligenceOperation>,
    memory_store: &Store,
) -> anyhow::Result<BusinessIntelligenceReport> {
    println!("Evaluating queries ...");

    let mut report = BusinessIntelligenceReportBuilder::new();
    let len = operations.len();
    for (idx, operation) in operations.iter().enumerate() {
        if idx % 25 == 0 {
            println!("Progress: {idx}/{len}");
        }

        run_operation(context, &mut report, memory_store, operation).await?;
    }
    let report = report.build();

    println!("Progress: {len}/{len}");
    println!("All queries evaluated.");

    Ok(report)
}

/// Executes a single [BsbmBusinessIntelligenceOperation], profiles the execution, and stores the
/// results of the profiling in the `report`.
async fn run_operation(
    context: &BenchmarkContext<'_>,
    report: &mut BusinessIntelligenceReportBuilder,
    store: &Store,
    operation: &BsbmBusinessIntelligenceOperation,
) -> anyhow::Result<()> {
    let guard = pprof::ProfilerGuardBuilder::default()
        .frequency(1000)
        .blocklist(&["libc", "libgcc", "pthread", "vdso"])
        .build()?;
    let start = Instant::now();

    let options = QueryOptions;
    let (name, explanation) = match operation {
        BsbmBusinessIntelligenceOperation::Query(name, q) => {
            let (result, explanation) = store.explain_query_opt(q.clone(), options.clone()).await?;
            match result {
                QueryResults::Boolean(_) => (),
                QueryResults::Solutions(mut s) => {
                    while let Some(s) = s.next().await {
                        s?;
                    }
                }
                QueryResults::Graph(mut g) => {
                    while let Some(t) = g.next().await {
                        t?;
                    }
                }
            }
            (*name, explanation)
        }
    };

    let run = BenchmarkRun {
        duration: start.elapsed(),
        report: Some(guard.report().build()?),
    };
    report.add_run(name, run);
    if context.parent().options().verbose_results {
        report.add_explanation(explanation);
    }

    Ok(())
}
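
As a note on the profiling above: the pprof report captured for each run can also be rendered as a flamegraph. The following is a minimal sketch under the assumption that the pprof crate's "flamegraph" feature is enabled; the helper name and output path are illustrative and not part of this PR.

```rust
use std::fs::File;

// Illustrative helper: render a captured pprof report as an SVG flamegraph.
fn write_flamegraph(report: &pprof::Report) -> anyhow::Result<()> {
    let file = File::create("profile.svg")?; // hypothetical output path
    report.flamegraph(file)?;
    Ok(())
}
```
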
69 changes: 69 additions & 0 deletions bench/src/benchmarks/bsbm/business_intelligence/mod.rs
@@ -0,0 +1,69 @@
mod benchmark;
mod operation;
mod report;

use clap::ValueEnum;
use std::fmt::{Display, Formatter};

pub use benchmark::BsbmBusinessIntelligenceBenchmark;

pub(super) const BSBM_BUSINESS_INTELLIGENCE_QUERIES: [BsbmBusinessIntelligenceQueryName; 8] = [
    BsbmBusinessIntelligenceQueryName::Q1,
    BsbmBusinessIntelligenceQueryName::Q2,
    BsbmBusinessIntelligenceQueryName::Q3,
    BsbmBusinessIntelligenceQueryName::Q4,
    BsbmBusinessIntelligenceQueryName::Q5,
    BsbmBusinessIntelligenceQueryName::Q6,
    BsbmBusinessIntelligenceQueryName::Q7,
    BsbmBusinessIntelligenceQueryName::Q8,
];

/// The BSBM business intelligence query names.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, ValueEnum)]
pub(super) enum BsbmBusinessIntelligenceQueryName {
    Q1,
    Q2,
    Q3,
    Q4,
    Q5,
    Q6,
    Q7,
    Q8,
}

impl TryFrom<u8> for BsbmBusinessIntelligenceQueryName {
    type Error = anyhow::Error;

    fn try_from(value: u8) -> Result<Self, Self::Error> {
        match value {
            1 => Ok(BsbmBusinessIntelligenceQueryName::Q1),
            2 => Ok(BsbmBusinessIntelligenceQueryName::Q2),
            3 => Ok(BsbmBusinessIntelligenceQueryName::Q3),
            4 => Ok(BsbmBusinessIntelligenceQueryName::Q4),
            5 => Ok(BsbmBusinessIntelligenceQueryName::Q5),
            6 => Ok(BsbmBusinessIntelligenceQueryName::Q6),
            7 => Ok(BsbmBusinessIntelligenceQueryName::Q7),
            8 => Ok(BsbmBusinessIntelligenceQueryName::Q8),
            _ => Err(anyhow::anyhow!(
                "Invalid BSBM Business Intelligence query name: {}",
                value
            )),
        }
    }
}

impl Display for BsbmBusinessIntelligenceQueryName {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let string = match self {
            BsbmBusinessIntelligenceQueryName::Q1 => "Q1",
            BsbmBusinessIntelligenceQueryName::Q2 => "Q2",
            BsbmBusinessIntelligenceQueryName::Q3 => "Q3",
            BsbmBusinessIntelligenceQueryName::Q4 => "Q4",
            BsbmBusinessIntelligenceQueryName::Q5 => "Q5",
            BsbmBusinessIntelligenceQueryName::Q6 => "Q6",
            BsbmBusinessIntelligenceQueryName::Q7 => "Q7",
            BsbmBusinessIntelligenceQueryName::Q8 => "Q8",
        };
        write!(f, "{string}")
    }
}
42 changes: 42 additions & 0 deletions bench/src/benchmarks/bsbm/business_intelligence/operation.rs
@@ -0,0 +1,42 @@
use crate::benchmarks::bsbm::business_intelligence::BsbmBusinessIntelligenceQueryName;
use rdf_fusion::Query;
use std::fs;
use std::path::Path;

#[allow(clippy::panic)]
#[allow(clippy::panic_in_result_fn)]
#[allow(clippy::expect_used)]
pub(super) fn list_raw_operations(
    path: &Path,
) -> anyhow::Result<impl Iterator<Item = BsbmBusinessIntelligenceRawOperation>> {
    let reader = fs::read(path)?;
    let result = csv::Reader::from_reader(reader.as_slice())
        .records()
        .collect::<Result<Vec<_>, _>>()?
        .into_iter()
        .map(|record| {
            let query_id = record[0].parse::<u8>().expect("Can't parse query id");
            let query_name =
                BsbmBusinessIntelligenceQueryName::try_from(query_id).expect("Invalid query id");

            match &record[1] {
                "query" => {
                    BsbmBusinessIntelligenceRawOperation::Query(query_name, record[2].into())
                }
                _ => panic!("Unexpected operation kind {}", &record[1]),
            }
        });
    Ok(result)
}

#[allow(dead_code)]
#[derive(Clone)]
pub(super) enum BsbmBusinessIntelligenceRawOperation {
    Query(BsbmBusinessIntelligenceQueryName, String),
}

#[allow(clippy::large_enum_variant, clippy::allow_attributes)]
#[derive(Clone)]
pub(super) enum BsbmBusinessIntelligenceOperation {
    Query(BsbmBusinessIntelligenceQueryName, Query),
}
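
For reference, list_raw_operations expects each CSV record to contain the query id, the operation kind, and the query text, in that order. Below is a small, self-contained sketch of the same parsing pattern on an in-memory record; the header and row contents are illustrative and not taken from the actual pre-generated file.

```rust
use csv::Reader;

fn main() -> anyhow::Result<()> {
    // Illustrative data: a header row followed by a single "query" operation.
    let data = "id,kind,query\n1,query,SELECT * WHERE { ?s ?p ?o } LIMIT 10\n";
    let mut reader = Reader::from_reader(data.as_bytes());
    for record in reader.records() {
        let record = record?;
        let query_id = record[0].parse::<u8>()?;
        println!("Q{query_id} ({}): {}", &record[1], &record[2]);
    }
    Ok(())
}
```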