diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 730e5501..7bbe190d 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -40,7 +40,9 @@ jobs: - name: Prepare BSBM Benchmark working-directory: bench - run: cargo run --bin rdf-fusion-bench --profile codspeed -- prepare bsbm --dataset-size 1000 + run: | + cargo run --bin rdf-fusion-bench --profile codspeed -- prepare bsbm-explore --dataset-size 1000 + cargo run --bin rdf-fusion-bench --profile codspeed -- prepare bsbm-business-intelligence --dataset-size 1000 - name: Build the benchmark target(s) # Limiting the number of jobs is an attempt to not violate GitHub's runner resources. diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 4237b704..abc58ba5 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -38,5 +38,9 @@ codspeed-criterion-compat = { workspace = true, features = ["async_tokio"] } workspace = true [[bench]] -name = "bsbm" +name = "bsbm_explore" +harness = false + +[[bench]] +name = "bsbm_business_intelligence" harness = false \ No newline at end of file diff --git a/bench/benches/bsbm_business_intelligence.rs b/bench/benches/bsbm_business_intelligence.rs new file mode 100644 index 00000000..d83bf4e0 --- /dev/null +++ b/bench/benches/bsbm_business_intelligence.rs @@ -0,0 +1,420 @@ +//! Runs certain queries from the BSBM benchmark suite as part of the regular benchmark suite. +//! +//! The particular instance of a query (they are generated randomly) is picked arbitrarily. If we +//! ever decide that queries in this file are not representative, we can easily change the query. +//! +//! The tests assume the presence of the benchmark data. + +use codspeed_criterion_compat::{criterion_group, criterion_main, Criterion}; +use futures::StreamExt; +use rdf_fusion::io::RdfFormat; +use rdf_fusion::store::Store; +use rdf_fusion::{QueryOptions, QueryResults}; +use std::fs; +use std::path::PathBuf; +use tokio::runtime::{Builder, Runtime}; + +fn bsbm_business_intelligence_q1(c: &mut Criterion) { + let runtime = create_runtime(); + let store = runtime.block_on(load_bsbm_1000()).unwrap(); + + c.bench_function("BSBM Business Intelligence 1000 - Query 1", |b| { + b.to_async(&runtime).iter(|| async { + let result = store + .query_opt( + " + prefix bsbm: + prefix rev: + + Select ?productType ?reviewCount + { + { + Select ?productType (count(?review) As ?reviewCount) + { + ?productType a bsbm:ProductType . + ?product a ?productType . + ?product bsbm:producer ?producer . + ?producer bsbm:country . + ?review bsbm:reviewFor ?product . + ?review rev:reviewer ?reviewer . + ?reviewer bsbm:country . + } + Group By ?productType + } + } + Order By desc(?reviewCount) ?productType + Limit 10 + ", + QueryOptions::default(), + ) + .await + .unwrap(); + assert_number_of_results(result, 10).await; + }); + }); +} + +fn bsbm_business_intelligence_q2(c: &mut Criterion) { + let runtime = create_runtime(); + let store = runtime.block_on(load_bsbm_1000()).unwrap(); + + c.bench_function("BSBM Business Intelligence 1000 - Query 2", |b| { + b.to_async(&runtime).iter(|| async { + let result = store.query_opt(" + prefix bsbm: + + SELECT ?otherProduct ?sameFeatures + { + ?otherProduct a bsbm:Product . + FILTER(?otherProduct != ) + { + SELECT ?otherProduct (count(?otherFeature) As ?sameFeatures) + { + bsbm:productFeature ?feature . + ?otherProduct bsbm:productFeature ?otherFeature . + FILTER(?feature=?otherFeature) + } + Group By ?otherProduct + } + } + Order By desc(?sameFeatures) ?otherProduct + Limit 10 + ", QueryOptions::default()).await.unwrap(); + assert_number_of_results(result, 10).await; + }); + }); +} + +fn bsbm_business_intelligence_q3(c: &mut Criterion) { + let runtime = create_runtime(); + let store = runtime.block_on(load_bsbm_1000()).unwrap(); + + c.bench_function("BSBM Business Intelligence 1000 - Query 3", |b| { + b.to_async(&runtime).iter(|| async { + let result = store.query_opt(" + prefix bsbm: + prefix bsbm-inst: + prefix rev: prefix dc: + prefix xsd: + Select ?product (xsd:float(?monthCount)/?monthBeforeCount As ?ratio) + { + { + Select ?product (count(?review) As ?monthCount) + { + ?review bsbm:reviewFor ?product . + ?review dc:date ?date . + Filter(?date >= \"2007-10-10\"^^ && ?date < \"2007-11-07\"^^)\ + } + Group By ?product + } + { + Select ?product (count(?review) As ?monthBeforeCount) + { + ?review bsbm:reviewFor ?product . + ?review dc:date ?date . + Filter(?date >= \"2007-09-12\"^^ && ?date < \"2007-10-10\"^^)\ + } + Group By ?product + Having (count(?review)>0) + } + } + Order By desc(xsd:float(?monthCount) / ?monthBeforeCount) ?product + Limit 10 + ", QueryOptions::default()).await.unwrap(); + assert_number_of_results(result, 10).await; + }); + }); +} + +fn bsbm_business_intelligence_q4(c: &mut Criterion) { + let runtime = create_runtime(); + let store = runtime.block_on(load_bsbm_1000()).unwrap(); + + c.bench_function("BSBM Business Intelligence 1000 - Query 4", |b| { + b.to_async(&runtime).iter(|| async { + let result = store.query_opt(" + prefix bsbm: + prefix bsbm-inst: + prefix xsd: + + Select ?feature (?withFeaturePrice/?withoutFeaturePrice As ?priceRatio) + { + { + Select ?feature (avg(xsd:float(xsd:string(?price))) As ?withFeaturePrice) + { + ?product a ; + bsbm:productFeature ?feature . + ?offer bsbm:product ?product ; + bsbm:price ?price . + } + Group By ?feature + } + { + Select ?feature (avg(xsd:float(xsd:string(?price))) As ?withoutFeaturePrice) + { + { + Select distinct ?feature + { + ?p a ; + bsbm:productFeature ?feature . + } + } + ?product a . + ?offer bsbm:product ?product ; + bsbm:price ?price . + FILTER NOT EXISTS { ?product bsbm:productFeature ?feature } + } + Group By ?feature + } + } + Order By desc(?withFeaturePrice/?withoutFeaturePrice) ?feature + Limit 10 + ", QueryOptions::default()).await.unwrap(); + assert_number_of_results(result, 10).await; + }); + }); +} + +fn bsbm_business_intelligence_q5(c: &mut Criterion) { + let runtime = create_runtime(); + let store = runtime.block_on(load_bsbm_1000()).unwrap(); + + c.bench_function("BSBM Business Intelligence 1000 - Query 5", |b| { + b.to_async(&runtime).iter(|| async { + let result = store.query_opt(" + prefix bsbm: + prefix bsbm-inst: + prefix rev: + prefix xsd: + + Select ?country ?product ?nrOfReviews ?avgPrice + { + { + Select ?country (max(?nrOfReviews) As ?maxReviews) + { + { + Select ?country ?product (count(?review) As ?nrOfReviews) + { + ?product a . + ?review bsbm:reviewFor ?product ; + rev:reviewer ?reviewer . + ?reviewer bsbm:country ?country . + } + Group By ?country ?product + } + } + Group By ?country + } + { + Select ?country ?product (avg(xsd:float(xsd:string(?price))) As ?avgPrice) + { + ?product a . + ?offer bsbm:product ?product . + ?offer bsbm:price ?price . + } + Group By ?country ?product + } + { + Select ?country ?product (count(?review) As ?nrOfReviews) + { + ?product a . + ?review bsbm:reviewFor ?product . + ?review rev:reviewer ?reviewer . + ?reviewer bsbm:country ?country . + } + Group By ?country ?product + } + FILTER(?nrOfReviews=?maxReviews) + } + Order By desc(?nrOfReviews) ?country ?product + ", QueryOptions::default()).await.unwrap(); + assert_number_of_results(result, 12).await; + }); + }); +} + +fn bsbm_business_intelligence_q6(c: &mut Criterion) { + let runtime = create_runtime(); + let store = runtime.block_on(load_bsbm_1000()).unwrap(); + + c.bench_function("BSBM Business Intelligence 1000 - Query 6", |b| { + b.to_async(&runtime).iter(|| async { + let result = store.query_opt(" + prefix bsbm: + prefix bsbm-inst: + prefix rev: + prefix xsd: + + Select ?reviewer (avg(xsd:float(?score)) As ?reviewerAvgScore) + { + { + Select (avg(xsd:float(?score)) As ?avgScore) + { + ?product bsbm:producer . + ?review bsbm:reviewFor ?product . + { ?review bsbm:rating1 ?score . } UNION + { ?review bsbm:rating2 ?score . } UNION + { ?review bsbm:rating3 ?score . } UNION + { ?review bsbm:rating4 ?score . } + } + } + ?product bsbm:producer . + ?review bsbm:reviewFor ?product . + ?review rev:reviewer ?reviewer . + { ?review bsbm:rating1 ?score . } UNION + { ?review bsbm:rating2 ?score . } UNION + { ?review bsbm:rating3 ?score . } UNION + { ?review bsbm:rating4 ?score . } + } + Group By ?reviewer + Having (avg(xsd:float(?score)) > min(?avgScore) * 1.5) + ", QueryOptions::default()).await.unwrap(); + assert_number_of_results(result, 12).await; + }); + }); +} + +fn bsbm_business_intelligence_q7(c: &mut Criterion) { + let runtime = create_runtime(); + let store = runtime.block_on(load_bsbm_1000()).unwrap(); + + c.bench_function("BSBM Business Intelligence 1000 - Query 7", |b| { + b.to_async(&runtime).iter(|| async { + let result = store.query_opt(" + prefix bsbm: + prefix bsbm-inst: + prefix xsd: + + Select ?product + { + { + Select ?product + { + { + Select ?product (count(?offer) As ?offerCount) + { + ?product a . + ?offer bsbm:product ?product . + } + Group By ?product + } + } + Order By desc(?offerCount) + Limit 1000 + } + FILTER NOT EXISTS + { + ?offer bsbm:product ?product . + ?offer bsbm:vendor ?vendor . + ?vendor bsbm:country ?country . + FILTER(?country=) + } + } + ", QueryOptions::default()).await.unwrap(); + assert_number_of_results(result, 16).await; + }); + }); +} + +fn bsbm_business_intelligence_q8(c: &mut Criterion) { + let runtime = create_runtime(); + let store = runtime.block_on(load_bsbm_1000()).unwrap(); + + c.bench_function("BSBM Business Intelligence 1000 - Query 8", |b| { + b.to_async(&runtime).iter(|| async { + let result = store.query_opt(" + prefix bsbm: + prefix bsbm-inst: + prefix xsd: + + Select ?vendor (xsd:float(?belowAvg)/?offerCount As ?cheapExpensiveRatio) + { + { + Select ?vendor (count(?offer) As ?belowAvg) + { + { + ?product a . + ?offer bsbm:product ?product . + ?offer bsbm:vendor ?vendor . + ?offer bsbm:price ?price . + { + Select ?product (avg(xsd:float(xsd:string(?price))) As ?avgPrice) + { + ?product a . + ?offer bsbm:product ?product . + ?offer bsbm:vendor ?vendor . + ?offer bsbm:price ?price . + } + Group By ?product + } + } . + FILTER (xsd:float(xsd:string(?price)) < ?avgPrice) + } + Group By ?vendor + } + { + Select ?vendor (count(?offer) As ?offerCount) + { + ?product a . + ?offer bsbm:product ?product . + ?offer bsbm:vendor ?vendor . + } + Group By ?vendor + } + } + Order by desc(xsd:float(?belowAvg)/?offerCount) ?vendor + limit 10 + ", QueryOptions::default()).await.unwrap(); + assert_number_of_results(result, 10).await; + }); + }); +} + +criterion_group!( + bsbm_business_intelligence, + bsbm_business_intelligence_q1, + bsbm_business_intelligence_q2, + // bsbm_business_intelligence_q3, TODO failing + // bsbm_business_intelligence_q4, TODO failing + bsbm_business_intelligence_q5, + bsbm_business_intelligence_q6, + bsbm_business_intelligence_q7, + // bsbm_business_intelligence_q8, TODO failing +); +criterion_main!(bsbm_business_intelligence); + +fn create_runtime() -> Runtime { + Builder::new_current_thread().enable_all().build().unwrap() +} + +async fn load_bsbm_1000() -> anyhow::Result { + let data_path = PathBuf::from("./data/dataset-1000.nt"); + let data = fs::read(data_path)?; + let memory_store = Store::new(); + memory_store + .load_from_reader(RdfFormat::NTriples, data.as_slice()) + .await?; + Ok(memory_store) +} + +async fn assert_number_of_results(result: QueryResults, n: usize) { + match result { + QueryResults::Solutions(mut solutions) => { + let mut count = 0; + while let Some(sol) = solutions.next().await { + sol.unwrap(); + count += 1; + } + assert_eq!(count, n); + } + QueryResults::Graph(mut triples) => { + let mut count = 0; + while let Some(sol) = triples.next().await { + sol.unwrap(); + count += 1; + } + assert_eq!(count, n); + } + _ => panic!("Unexpected QueryResults"), + } +} diff --git a/bench/benches/bsbm.rs b/bench/benches/bsbm_explore.rs similarity index 100% rename from bench/benches/bsbm.rs rename to bench/benches/bsbm_explore.rs diff --git a/bench/src/benchmarks/bsbm/business_intelligence/benchmark.rs b/bench/src/benchmarks/bsbm/business_intelligence/benchmark.rs new file mode 100644 index 00000000..382e832e --- /dev/null +++ b/bench/src/benchmarks/bsbm/business_intelligence/benchmark.rs @@ -0,0 +1,235 @@ +use crate::benchmarks::bsbm::business_intelligence::operation::{ + list_raw_operations, BsbmBusinessIntelligenceOperation, BsbmBusinessIntelligenceRawOperation, +}; +use crate::benchmarks::bsbm::business_intelligence::report::{ + BusinessIntelligenceReport, BusinessIntelligenceReportBuilder, +}; +use crate::benchmarks::bsbm::BsbmDatasetSize; +use crate::benchmarks::{Benchmark, BenchmarkName}; +use crate::environment::{BenchmarkContext, RdfFusionBenchContext}; +use crate::prepare::{ArchiveType, FileDownloadAction, PrepRequirement}; +use crate::report::BenchmarkReport; +use crate::runs::BenchmarkRun; +use async_trait::async_trait; +use futures::StreamExt; +use rdf_fusion::io::RdfFormat; +use rdf_fusion::store::Store; +use rdf_fusion::{Query, QueryOptions, QueryResults}; +use reqwest::Url; +use std::fs; +use std::fs::File; +use std::path::PathBuf; +use tokio::time::Instant; + +/// The [Berlin SPARQL Benchmark](http://wbsg.informatik.uni-mannheim.de/bizer/berlinsparqlbenchmark/) +/// is a widely adopted benchmark built around an e-commerce use case. +/// +/// This version of the benchmark uses the [pre-prepared datasets](https://zenodo.org/records/12663333) +/// from Oxigraph. +pub struct BsbmBusinessIntelligenceBenchmark { + name: BenchmarkName, + dataset_size: BsbmDatasetSize, + max_query_count: Option, +} + +impl BsbmBusinessIntelligenceBenchmark { + /// Creates a new [BsbmBusinessIntelligenceBenchmark] with the given sizes. + pub fn new(dataset_size: BsbmDatasetSize, max_query_count: Option) -> Self { + let name = BenchmarkName::BsbmBusinessIntelligence { + dataset_size, + max_query_count, + }; + Self { + name, + dataset_size, + max_query_count, + } + } + + /// The BSBM also generates many queries that are tailored to the generated data. This method + /// returns a list of queries that should be executed during this run. + fn list_operations( + &self, + env: &RdfFusionBenchContext, + ) -> anyhow::Result> { + println!("Loading queries ..."); + + let queries_path = env.join_data_dir( + PathBuf::from(format!("businessIntelligence-{}.csv", self.dataset_size)).as_path(), + )?; + let result = match self.max_query_count { + None => list_raw_operations(&queries_path)? + .filter_map(parse_query) + .collect(), + Some(max_query_count) => list_raw_operations(&queries_path)? + .filter_map(parse_query) + .take(usize::try_from(max_query_count)?) + .collect(), + }; + + println!("Queries loaded."); + Ok(result) + } + + async fn prepare_store(&self, bench_context: &BenchmarkContext<'_>) -> anyhow::Result { + println!("Creating in-memory store and loading data ..."); + let data_path = bench_context + .parent() + .join_data_dir(PathBuf::from(format!("dataset-{}.nt", self.dataset_size)).as_path())?; + let data = fs::read(data_path)?; + let memory_store = Store::new(); + memory_store + .load_from_reader(RdfFormat::NTriples, data.as_slice()) + .await?; + println!("Store created and data loaded."); + Ok(memory_store) + } +} + +#[async_trait] +impl Benchmark for BsbmBusinessIntelligenceBenchmark { + fn name(&self) -> BenchmarkName { + self.name + } + + #[allow(clippy::expect_used)] + fn requirements(&self) -> Vec { + let dataset_size = self.dataset_size; + let download_bsbm_tools = PrepRequirement::FileDownload { + url: Url::parse("https://github.com/Tpt/bsbm-tools/archive/59d0a8a605b26f21506789fa1a713beb5abf1cab.zip") + .expect("parse dataset-name"), + file_name: PathBuf::from("bsbmtools"), + action: Some(FileDownloadAction::Unpack(ArchiveType::Zip)), + }; + let generate_dataset = PrepRequirement::RunCommand { + workdir: PathBuf::from("./bsbmtools"), + program: "./generate".to_owned(), + args: vec![ + "-fc".to_owned(), + "-pc".to_owned(), + format!("{}", dataset_size), + "-dir".to_owned(), + "../td_data".to_owned(), + "-fn".to_owned(), + format!("../dataset-{}", dataset_size), + ], + check_requirement: Box::new(move || { + let exists = File::open(format!("./data/dataset-{dataset_size}.nt")).is_ok(); + Ok(exists) + }), + }; + let download_pregenerated_queries = PrepRequirement::FileDownload { + url: Url::parse( + "https://zenodo.org/records/12663333/files/businessIntelligence-1000.csv.bz2", + ) + .expect("parse dataset-name"), + file_name: PathBuf::from("businessIntelligence-1000.csv"), + action: Some(FileDownloadAction::Unpack(ArchiveType::Bz2)), + }; + + vec![ + download_bsbm_tools, + generate_dataset, + download_pregenerated_queries, + ] + } + + async fn execute( + &self, + bench_context: &BenchmarkContext<'_>, + ) -> anyhow::Result> { + let operations = self.list_operations(bench_context.parent())?; + let memory_store = self.prepare_store(bench_context).await?; + let report = execute_benchmark(bench_context, operations, &memory_store).await?; + Ok(Box::new(report)) + } +} + +fn parse_query( + query: BsbmBusinessIntelligenceRawOperation, +) -> Option { + match query { + BsbmBusinessIntelligenceRawOperation::Query(name, query) => { + // TODO remove once describe is supported + if query.contains("DESCRIBE") { + None + } else { + Some(BsbmBusinessIntelligenceOperation::Query( + name, + Query::parse(&query.replace('#', ""), None).unwrap(), + )) + } + } + } +} + +async fn execute_benchmark( + context: &BenchmarkContext<'_>, + operations: Vec, + memory_store: &Store, +) -> anyhow::Result { + println!("Evaluating queries ..."); + + let mut report = BusinessIntelligenceReportBuilder::new(); + let len = operations.len(); + for (idx, operation) in operations.iter().enumerate() { + if idx % 25 == 0 { + println!("Progress: {idx}/{len}"); + } + + run_operation(context, &mut report, memory_store, operation).await?; + } + let report = report.build(); + + println!("Progress: {len}/{len}"); + println!("All queries evaluated."); + + Ok(report) +} + +/// Executes a single [BsbmBusinessIntelligenceOperation], profiles the execution, and stores the +/// results of the profiling in the `report`. +async fn run_operation( + context: &BenchmarkContext<'_>, + report: &mut BusinessIntelligenceReportBuilder, + store: &Store, + operation: &BsbmBusinessIntelligenceOperation, +) -> anyhow::Result<()> { + let guard = pprof::ProfilerGuardBuilder::default() + .frequency(1000) + .blocklist(&["libc", "libgcc", "pthread", "vdso"]) + .build()?; + let start = Instant::now(); + + let options = QueryOptions; + let (name, explanation) = match operation { + BsbmBusinessIntelligenceOperation::Query(name, q) => { + let (result, explanation) = store.explain_query_opt(q.clone(), options.clone()).await?; + match result { + QueryResults::Boolean(_) => (), + QueryResults::Solutions(mut s) => { + while let Some(s) = s.next().await { + s?; + } + } + QueryResults::Graph(mut g) => { + while let Some(t) = g.next().await { + t?; + } + } + } + (*name, explanation) + } + }; + + let run = BenchmarkRun { + duration: start.elapsed(), + report: Some(guard.report().build()?), + }; + report.add_run(name, run); + if context.parent().options().verbose_results { + report.add_explanation(explanation); + } + + Ok(()) +} diff --git a/bench/src/benchmarks/bsbm/business_intelligence/mod.rs b/bench/src/benchmarks/bsbm/business_intelligence/mod.rs new file mode 100644 index 00000000..8aca3b65 --- /dev/null +++ b/bench/src/benchmarks/bsbm/business_intelligence/mod.rs @@ -0,0 +1,69 @@ +mod benchmark; +mod operation; +mod report; + +use clap::ValueEnum; +use std::fmt::{Display, Formatter}; + +pub use benchmark::BsbmBusinessIntelligenceBenchmark; + +pub(super) const BSBM_BUSINESS_INTELLIGENCE_QUERIES: [BsbmBusinessIntelligenceQueryName; 8] = [ + BsbmBusinessIntelligenceQueryName::Q1, + BsbmBusinessIntelligenceQueryName::Q2, + BsbmBusinessIntelligenceQueryName::Q3, + BsbmBusinessIntelligenceQueryName::Q4, + BsbmBusinessIntelligenceQueryName::Q5, + BsbmBusinessIntelligenceQueryName::Q6, + BsbmBusinessIntelligenceQueryName::Q7, + BsbmBusinessIntelligenceQueryName::Q8, +]; + +/// The BSBM business intelligence query names. +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, ValueEnum)] +pub(super) enum BsbmBusinessIntelligenceQueryName { + Q1, + Q2, + Q3, + Q4, + Q5, + Q6, + Q7, + Q8, +} + +impl TryFrom for BsbmBusinessIntelligenceQueryName { + type Error = anyhow::Error; + + fn try_from(value: u8) -> Result { + match value { + 1 => Ok(BsbmBusinessIntelligenceQueryName::Q1), + 2 => Ok(BsbmBusinessIntelligenceQueryName::Q2), + 3 => Ok(BsbmBusinessIntelligenceQueryName::Q3), + 4 => Ok(BsbmBusinessIntelligenceQueryName::Q4), + 5 => Ok(BsbmBusinessIntelligenceQueryName::Q5), + 6 => Ok(BsbmBusinessIntelligenceQueryName::Q6), + 7 => Ok(BsbmBusinessIntelligenceQueryName::Q7), + 8 => Ok(BsbmBusinessIntelligenceQueryName::Q8), + _ => Err(anyhow::anyhow!( + "Invalid BSBM Business Intelligence query name: {}", + value + )), + } + } +} + +impl Display for BsbmBusinessIntelligenceQueryName { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let string = match self { + BsbmBusinessIntelligenceQueryName::Q1 => "Q1", + BsbmBusinessIntelligenceQueryName::Q2 => "Q2", + BsbmBusinessIntelligenceQueryName::Q3 => "Q3", + BsbmBusinessIntelligenceQueryName::Q4 => "Q4", + BsbmBusinessIntelligenceQueryName::Q5 => "Q5", + BsbmBusinessIntelligenceQueryName::Q6 => "Q6", + BsbmBusinessIntelligenceQueryName::Q7 => "Q7", + BsbmBusinessIntelligenceQueryName::Q8 => "Q8", + }; + write!(f, "{string}") + } +} diff --git a/bench/src/benchmarks/bsbm/business_intelligence/operation.rs b/bench/src/benchmarks/bsbm/business_intelligence/operation.rs new file mode 100644 index 00000000..30629aa0 --- /dev/null +++ b/bench/src/benchmarks/bsbm/business_intelligence/operation.rs @@ -0,0 +1,42 @@ +use crate::benchmarks::bsbm::business_intelligence::BsbmBusinessIntelligenceQueryName; +use rdf_fusion::Query; +use std::fs; +use std::path::Path; + +#[allow(clippy::panic)] +#[allow(clippy::panic_in_result_fn)] +#[allow(clippy::expect_used)] +pub(super) fn list_raw_operations( + path: &Path, +) -> anyhow::Result> { + let reader = fs::read(path)?; + let result = csv::Reader::from_reader(reader.as_slice()) + .records() + .collect::, _>>()? + .into_iter() + .map(|record| { + let query_id = record[0].parse::().expect("Can't parse query id"); + let query_name = + BsbmBusinessIntelligenceQueryName::try_from(query_id).expect("Invalid query id"); + + match &record[1] { + "query" => { + BsbmBusinessIntelligenceRawOperation::Query(query_name, record[2].into()) + } + _ => panic!("Unexpected operation kind {}", &record[1]), + } + }); + Ok(result) +} + +#[allow(dead_code)] +#[derive(Clone)] +pub(super) enum BsbmBusinessIntelligenceRawOperation { + Query(BsbmBusinessIntelligenceQueryName, String), +} + +#[allow(clippy::large_enum_variant, clippy::allow_attributes)] +#[derive(Clone)] +pub(super) enum BsbmBusinessIntelligenceOperation { + Query(BsbmBusinessIntelligenceQueryName, Query), +} diff --git a/bench/src/benchmarks/bsbm/business_intelligence/report.rs b/bench/src/benchmarks/bsbm/business_intelligence/report.rs new file mode 100644 index 00000000..b16b965c --- /dev/null +++ b/bench/src/benchmarks/bsbm/business_intelligence/report.rs @@ -0,0 +1,212 @@ +use crate::benchmarks::bsbm::business_intelligence::{ + BsbmBusinessIntelligenceQueryName, BSBM_BUSINESS_INTELLIGENCE_QUERIES, +}; +use crate::report::BenchmarkReport; +use crate::runs::{BenchmarkRun, BenchmarkRuns}; +use crate::utils::write_flamegraph; +use anyhow::{bail, Context}; +use datafusion::physical_plan::displayable; +use prettytable::{row, Table}; +use rdf_fusion::QueryExplanation; +use std::collections::HashMap; +use std::fs; +use std::io::Write; +use std::path::Path; + +/// Stores the final report of executing a BSBM business intelligence benchmark. +pub struct BusinessIntelligenceReport { + /// Stores all runs of the benchmark grouped by the query name. + /// A single query name can have multiple instances (with random variables) in BSBM. + runs: HashMap, + /// Query explanations for each run. + explanations: Vec, +} + +impl BusinessIntelligenceReport { + /// Writes a tabular summary of the query execution time. + fn write_summary(&self, writer: &mut W) -> anyhow::Result<()> { + // Create the table + let mut table = Table::new(); + table.add_row(row!["Query", "Samples", "Average Duration"]); + for query in BSBM_BUSINESS_INTELLIGENCE_QUERIES { + let summary = self + .runs + .get(&query) + .map(BenchmarkRuns::summarize) + .transpose()?; + + let samples = summary + .as_ref() + .map_or_else(|| "-".to_owned(), |s| s.number_of_samples.to_string()); + let average_duration = summary + .as_ref() + .map_or_else(|| "-".to_owned(), |s| format!("{:?}", s.avg_duration)); + + table.add_row(row![query.to_string(), samples, average_duration]); + } + table.print(writer)?; + + Ok(()) + } + + /// Write aggregated flamegraph. + fn write_aggregated_flamegraphs(&self, output_directory: &Path) -> anyhow::Result<()> { + if !output_directory.is_dir() { + bail!( + "Output directory {} does not exist", + output_directory.display() + ); + } + + for query in BSBM_BUSINESS_INTELLIGENCE_QUERIES { + let frames = self + .runs + .get(&query) + .map(BenchmarkRuns::accumulate_profiles) + .transpose()?; + if let Some(frames) = frames { + let flamegraph_file = output_directory.join(format!("{query}.svg")); + let mut flamegraph_file = + fs::File::create(flamegraph_file).context("Cannot create flamegraph file")?; + write_flamegraph(&mut flamegraph_file, &frames)?; + } + } + + Ok(()) + } + + fn write_query_results(&self, output_directory: &Path, index: usize) -> anyhow::Result<()> { + let query_i_path = output_directory.join(format!("query{index}")); + fs::create_dir_all(&query_i_path).context("Cannot create query directory")?; + + let summary_file = query_i_path.join("0_summary.txt"); + let initial_logical_plan_file = query_i_path.join("1_initial_logical_plan.txt"); + let optimized_logical_plan_file = query_i_path.join("2_optimized_logical_plan.txt"); + let execution_plan_file = query_i_path.join("3_execution_plan.txt"); + + let explanation = self + .explanations + .get(index) + .context("Cannot get explanation")?; + + // Write the initial logical plan + fs::write( + &summary_file, + format!("Planning Time:{:?}", explanation.planning_time), + ) + .with_context(|| { + format!( + "Failed to write summary plan to {}", + initial_logical_plan_file.display() + ) + })?; + + // Write the initial logical plan + let initial_logical_plan = explanation.initial_logical_plan.to_string(); + fs::write( + &initial_logical_plan_file, + format!("Initial Logical Plan:\n\n{initial_logical_plan}"), + ) + .with_context(|| { + format!( + "Failed to write initial logical plan to {}", + initial_logical_plan_file.display() + ) + })?; + + // Write the optimized logical plan + let optimized_logical_plan = explanation.optimized_logical_plan.to_string(); + fs::write( + &optimized_logical_plan_file, + format!("Optimized Logical Plan:\n\n{optimized_logical_plan}"), + ) + .with_context(|| { + format!( + "Failed to write optimized logical plan to {}", + optimized_logical_plan_file.display() + ) + })?; + + // Write the execution plan + let execution_plan = displayable(explanation.execution_plan.as_ref()).indent(false); + fs::write( + &execution_plan_file, + format!("Execution Plan:\n\n{execution_plan}"), + ) + .with_context(|| { + format!( + "Failed to write execution plan to {}", + execution_plan_file.display() + ) + })?; + + Ok(()) + } +} + +impl BenchmarkReport for BusinessIntelligenceReport { + fn write_results(&self, output_dir: &Path) -> anyhow::Result<()> { + let summary_txt = output_dir.join("summary.txt"); + let mut summary_file = fs::File::create(summary_txt)?; + self.write_summary(&mut summary_file)?; + + let flamegraphs_dir = output_dir.join("flamegraphs"); + fs::create_dir_all(&flamegraphs_dir) + .context("Cannot create flamegraphs directory before writing flamegraphs")?; + self.write_aggregated_flamegraphs(&flamegraphs_dir)?; + + if !self.explanations.is_empty() { + let queries_path = output_dir.join("queries"); + fs::create_dir_all(&queries_path).context("Cannot create queries directory")?; + for i in 0..self.explanations.len() { + self.write_query_results(&queries_path, i)?; + } + } + + Ok(()) + } +} + +/// Builder for the [`BusinessIntelligenceReport`]. +/// +/// This should only be accessible to the benchmark code. +pub(super) struct BusinessIntelligenceReportBuilder { + /// The inner report that is being built. + report: BusinessIntelligenceReport, +} + +impl BusinessIntelligenceReportBuilder { + /// Creates a new builder. + pub(super) fn new() -> Self { + Self { + report: BusinessIntelligenceReport { + runs: HashMap::new(), + explanations: Vec::new(), + }, + } + } + + /// Adds a run to a particular query. + pub(super) fn add_run(&mut self, name: BsbmBusinessIntelligenceQueryName, run: BenchmarkRun) { + let runs = self.report.runs.entry(name).or_default(); + runs.add_run(run); + } + + /// Adds an explanation for a particular query. + /// + /// It is expected that the n-th call of this method is the explanation of the n-th query. + pub(super) fn add_explanation(&mut self, explanation: QueryExplanation) { + self.report.explanations.push(explanation) + } + + /// Finalizes the report. + pub(super) fn build(self) -> BusinessIntelligenceReport { + self.report + } +} + +impl Default for BusinessIntelligenceReportBuilder { + fn default() -> Self { + Self::new() + } +} diff --git a/bench/src/benchmarks/bsbm/explore/benchmark.rs b/bench/src/benchmarks/bsbm/explore/benchmark.rs index 95c3366d..9e56085a 100644 --- a/bench/src/benchmarks/bsbm/explore/benchmark.rs +++ b/bench/src/benchmarks/bsbm/explore/benchmark.rs @@ -33,7 +33,7 @@ pub struct BsbmExploreBenchmark { impl BsbmExploreBenchmark { /// Creates a new [BsbmExploreBenchmark] with the given sizes. pub fn new(dataset_size: BsbmDatasetSize, max_query_count: Option) -> Self { - let name = BenchmarkName::Bsbm { + let name = BenchmarkName::BsbmExplore { dataset_size, max_query_count, }; diff --git a/bench/src/benchmarks/bsbm/mod.rs b/bench/src/benchmarks/bsbm/mod.rs index c6d522a8..6f5ebf93 100644 --- a/bench/src/benchmarks/bsbm/mod.rs +++ b/bench/src/benchmarks/bsbm/mod.rs @@ -1,8 +1,10 @@ use clap::ValueEnum; use std::fmt::{Display, Formatter}; +mod business_intelligence; mod explore; +pub use business_intelligence::BsbmBusinessIntelligenceBenchmark; pub use explore::BsbmExploreBenchmark; /// Indicates the size of the dataset. diff --git a/bench/src/benchmarks/name.rs b/bench/src/benchmarks/name.rs index 92aaa3c6..f507da53 100644 --- a/bench/src/benchmarks/name.rs +++ b/bench/src/benchmarks/name.rs @@ -4,8 +4,17 @@ use std::fmt::{Display, Formatter}; #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Subcommand)] pub enum BenchmarkName { - /// Runs a BSBM instance with already generated queries from Oxigraph. - Bsbm { + /// Represents the BSBM explore benchmark. + BsbmExplore { + /// Indicates the scaling of the dataset. + #[arg(short, long, default_value = "1000")] + dataset_size: BsbmDatasetSize, + /// Provides an upper bound on the number of queries to be executed. + #[arg(short, long)] + max_query_count: Option, + }, + /// Represents the BSBM business intelligence benchmark. + BsbmBusinessIntelligence { /// Indicates the scaling of the dataset. #[arg(short, long, default_value = "1000")] dataset_size: BsbmDatasetSize, @@ -19,12 +28,19 @@ impl BenchmarkName { /// Returns a directory name for the benchmark. pub fn dir_name(&self) -> String { match self { - BenchmarkName::Bsbm { + BenchmarkName::BsbmExplore { + dataset_size, + max_query_count, + } => match max_query_count { + Some(max_query_count) => format!("bsbm-explore-{dataset_size}-{max_query_count}"), + None => format!("bsbm-explore-{dataset_size}"), + }, + BenchmarkName::BsbmBusinessIntelligence { dataset_size, max_query_count, } => match max_query_count { - Some(max_query_count) => format!("bsbm-{dataset_size}-{max_query_count}"), - None => format!("bsbm-{dataset_size}"), + Some(max_query_count) => format!("bsbm-bi-{dataset_size}-{max_query_count}"), + None => format!("bsbm-bi-{dataset_size}"), }, } } @@ -33,7 +49,7 @@ impl BenchmarkName { impl Display for BenchmarkName { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { - BenchmarkName::Bsbm { + BenchmarkName::BsbmExplore { dataset_size, max_query_count, } => match max_query_count { @@ -43,6 +59,16 @@ impl Display for BenchmarkName { ), None => write!(f, "BSBM Explore: dataset_size={dataset_size}"), }, + BenchmarkName::BsbmBusinessIntelligence { + dataset_size, + max_query_count, + } => match max_query_count { + Some(max_query_count) => write!( + f, + "BSBM Business Intelligence: dataset_size={dataset_size}, max_query_count={max_query_count}" + ), + None => write!(f, "BSBM Business Intelligence: dataset_size={dataset_size}"), + }, } } } diff --git a/bench/src/lib.rs b/bench/src/lib.rs index 8b865058..3fe40ec2 100644 --- a/bench/src/lib.rs +++ b/bench/src/lib.rs @@ -1,6 +1,6 @@ #![allow(clippy::print_stdout)] -use crate::benchmarks::bsbm::BsbmExploreBenchmark; +use crate::benchmarks::bsbm::{BsbmBusinessIntelligenceBenchmark, BsbmExploreBenchmark}; use crate::benchmarks::{Benchmark, BenchmarkName}; use crate::environment::RdfFusionBenchContext; use clap::ValueEnum; @@ -80,9 +80,16 @@ pub async fn execute_benchmark_operation( fn create_benchmark_instance(benchmark: BenchmarkName) -> Box { match benchmark { - BenchmarkName::Bsbm { + BenchmarkName::BsbmExplore { dataset_size, max_query_count: query_size, } => Box::new(BsbmExploreBenchmark::new(dataset_size, query_size)), + BenchmarkName::BsbmBusinessIntelligence { + dataset_size, + max_query_count: query_size, + } => Box::new(BsbmBusinessIntelligenceBenchmark::new( + dataset_size, + query_size, + )), } }