Add pegasus implementations of kBET (#53)

mumichae · lazappi · rcannood · web-flow · commit 7e4d3b5fc7e5 · 2025-04-11T13:29:23.000+02:00
* add global kbet metric from pegasus implementation

* add per label averaged kBET implementation from pegasus

* fixed label kbet naming

* add new kbet components to workflow

* update CHANGELOG

* update documentation on running the workflow

* Apply suggestions from code review

Co-authored-by: Luke Zappia <lazappi@users.noreply.github.com>

* move changelog message to devel section

* set to maximum available number of threads

* Apply suggestions from code review

Co-authored-by: Robrecht Cannoodt <rcannood@gmail.com>

* Apply suggestions from code review

---------

Co-authored-by: Luke Zappia <lazappi@users.noreply.github.com>
Co-authored-by: Robrecht Cannoodt <rcannood@gmail.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # task_batch_integration devel
 
+## New functionality
+
+* Added `metrics/kbet_pg` and `metrics/kbet_pg_label` components (PR #53).
+
 ## Minor changes
 
 * Un-pin the scPRINT version and update parameters (PR #51)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -141,12 +141,14 @@ viash ns test --parallel
 
 ### Running the benchmark
 
-To run the benchmark, you can use the following command:
+To locally test the benchmark workflow, you can use the following command:
 
 ```bash
-scripts/run_benchmark/run.sh
+scripts/run_benchmark/run_test_local.sh
 ```
 
+Other scripts in `scripts/run_benchmark/` provide commands for testing on Seqera Cloud and for production runs. 
+
 ## Debugging nf-tower runs
 
 The actual benchmark is run on the [nf-tower platform](https://cloud.seqera.io/orgs/openproblems-bio/workspaces/openproblems-bio/watch).
diff --git a/src/metrics/kbet_pg/config.vsh.yaml b/src/metrics/kbet_pg/config.vsh.yaml
@@ -0,0 +1,49 @@
+__merge__: /src/api/comp_metric.yaml
+name: kbet_pg
+info:
+  metric_type: embedding
+  metrics:
+    - name: kbet_pg
+      label: kBET pegasus
+      summary: kBET algorithm to determine how well batches are mixed across all
+        cells
+      # TODO: transform into more readable markdown with proper formulae formatting
+      description: |
+        The kBET algorithm (v.0.99.6, release 4c9dafa) determines whether the label composition
+        of a k nearest neighborhood of a cell is similar to the expected (global) label
+        composition (Buettner et al., Nat Meth 2019). The test is repeated for a random subset
+        of cells, and the results are summarized as a rejection rate over all tested
+        neighborhoods.
+
+        This implementation uses the `pegasus.calc_kBET` function and reports its acceptance rate rather than the rejection rate, so higher values are better.
+
+        In Open Problems we do not run kBET on graph outputs to avoid computation-intensive
+        diffusion processes being run.
+      references:
+        doi: 10.1038/s41592-021-01336-8
+      links:
+        homepage: https://pegasus.readthedocs.io/en/stable/
+        documentation: https://pegasus.readthedocs.io/en/stable/api/pegasus.calc_kBET.html
+        repository: https://github.com/lilab-bcb/pegasus
+      min: 0
+      max: 1
+      maximize: true
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/read_anndata_partial.py
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    setup:
+      - type: python
+        pypi:
+          - pegasuspy
+          - zarr<3.0
+          - pandas<2.0
+          - numpy<2.0
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [hightime, veryhighmem, lowcpu]
diff --git a/src/metrics/kbet_pg/script.py b/src/metrics/kbet_pg/script.py
@@ -0,0 +1,64 @@
+from tqdm import tqdm
+import sys
+import numpy as np
+import anndata as ad
+import pegasus as pg
+import pegasusio
+from scipy.sparse import csr_matrix
+
+
+def compute_kbet(mmdata, *args, **kwargs):  # thin wrapper: forwards mmdata and all extra arguments to pg.calc_kBET
+    stat_mean, pvalue_mean, accept_rate = pg.calc_kBET(mmdata, *args, **kwargs)  # unpack the 3-tuple result
+    return accept_rate  # only the acceptance rate is returned; the other two values are discarded
+
+
+## VIASH START
+par = {  # development stub; NOTE(review): 'input_solution' is read below but not defined here — confirm it is injected at run time
+    'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad',
+    'output': 'output.h5ad',
+}
+
+meta = {  # NOTE(review): 'resources_dir' and 'cpus' are read below but not defined in this stub
+    'name': 'kbet_pg',
+}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])  # make the read_anndata_partial helper importable
+from read_anndata_partial import read_anndata
+
+n_threads = meta["cpus"] or -1  # falls back to -1 when meta["cpus"] is falsy (e.g. None)
+
+print('Read input...', flush=True)
+adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns')  # load only obs, obsm and uns
+adata.obs = read_anndata(par['input_solution'], obs='obs').obs  # replace obs with the solution file's obs
+adata.uns |= read_anndata(par['input_solution'], uns='uns').uns  # merge the solution file's uns entries into adata.uns
+print(adata, flush=True)
+
+print('Convert to pegasusio.MultimodalData...', flush=True)
+adata.X = csr_matrix(adata.shape)  # empty sparse placeholder; presumably required for the conversion below — TODO confirm
+mmdata = pegasusio.MultimodalData(adata)
+
+print('Compute global kBET...', flush=True)
+score = compute_kbet(  # one kBET run over all cells; returns the acceptance rate
+    mmdata,
+    attr="batch",
+    rep="emb",
+    K=50,
+    n_jobs=n_threads,
+)
+print('Global kBET score:', score, flush=True)
+
+print('Create output AnnData object', flush=True)
+metric_name = meta['name']  # the component name doubles as the metric id
+output = ad.AnnData(  # result object carries only uns: identifiers copied from the input plus the score
+    uns={
+        'dataset_id': adata.uns['dataset_id'],
+        'normalization_id': adata.uns['normalization_id'],
+        'method_id': adata.uns['method_id'],
+        'metric_ids': [ metric_name ],
+        'metric_values': [ score ]
+    }
+)
+
+print('Write data to file', flush=True)
+output.write_h5ad(par['output'], compression='gzip')
diff --git a/src/metrics/kbet_pg_label/config.vsh.yaml b/src/metrics/kbet_pg_label/config.vsh.yaml
@@ -0,0 +1,49 @@
+__merge__: /src/api/comp_metric.yaml
+name: kbet_pg_label
+info:
+  metric_type: embedding
+  metrics:
+    - name: kbet_pg_label
+      label: kBET pegasus label
+      summary: kBET algorithm to determine how well batches are mixed within a cell
+        type
+      description: |
+        The kBET algorithm (v.0.99.6, release 4c9dafa) determines whether the label composition
+        of a k nearest neighborhood of a cell is similar to the expected (global) label
+        composition (Buettner et al., Nat Meth 2019). The test is repeated for a random subset
+        of cells, and the results are summarized as a rejection rate over all tested
+        neighborhoods.
+
+        This implementation uses the `pegasus.calc_kBET` function per cell type and the acceptance
+        rate is averaged over all cell types that occur in more than one batch.
+
+        In Open Problems we do not run kBET on graph outputs to avoid computation-intensive
+        diffusion processes being run.
+      references:
+        doi: 10.1038/s41592-021-01336-8
+      links:
+        homepage: https://pegasus.readthedocs.io/en/stable/
+        documentation: https://pegasus.readthedocs.io/en/stable/api/pegasus.calc_kBET.html
+        repository: https://github.com/lilab-bcb/pegasus
+      min: 0
+      max: 1
+      maximize: true
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/read_anndata_partial.py
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    setup:
+      - type: python
+        pypi:
+          - pegasuspy
+          - zarr<3.0
+          - pandas<2.0
+          - numpy<2.0
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [hightime, veryhighmem, lowcpu]
diff --git a/src/metrics/kbet_pg_label/script.py b/src/metrics/kbet_pg_label/script.py
@@ -0,0 +1,84 @@
+from tqdm import tqdm
+import sys
+import numpy as np
+import anndata as ad
+import pegasus as pg
+import pegasusio
+from scipy.sparse import csr_matrix
+from joblib import Parallel, delayed
+
+
+def compute_kbet(mmdata, *args, **kwargs):  # thin wrapper: forwards mmdata and all extra arguments to pg.calc_kBET
+    stat_mean, pvalue_mean, accept_rate = pg.calc_kBET(mmdata, *args, **kwargs)  # unpack the 3-tuple result
+    return accept_rate  # only the acceptance rate is returned; the other two values are discarded
+
+
+## VIASH START
+par = {
+    'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad',
+    'output': 'output.h5ad',
+}
+
+meta = {
+    'name': 'foo',
+}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+from read_anndata_partial import read_anndata
+
+n_threads = meta["cpus"] or -1
+
+print('Read input...', flush=True)
+adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns')
+adata.obs = read_anndata(par['input_solution'], obs='obs').obs
+adata.uns |= read_anndata(par['input_solution'], uns='uns').uns
+adata.X = csr_matrix(adata.shape)
+print(adata, flush=True)
+
+print('Compute cell type averaged kBET...', flush=True)
+cell_types = adata.obs['cell_type'].unique()
+scores = []
+mmdata_per_cell_type = []
+
+# collect all adata subsets
+for cell_type in cell_types:
+    ad_sub = adata[adata.obs['cell_type'] == cell_type]
+    if ad_sub.obs['batch'].nunique() <= 1:
+        print(f'Skipping cell type {cell_type} because it\'s present in only one batch', flush=True)
+        continue
+    _mmdata = pegasusio.MultimodalData(ad_sub.copy())
+    mmdata_per_cell_type.append(_mmdata)
+
+# compute kBET scores
+scores = Parallel(n_jobs=n_threads)(
+    delayed(compute_kbet)(
+        _mmdata,
+        attr="batch",
+        rep="emb",
+        K=50,
+        use_cache=False,
+        n_jobs=1,
+    ) for _mmdata in tqdm(
+        mmdata_per_cell_type,
+        desc=f'Compute per cell type with {n_threads} threads',
+        miniters=1,
+    )
+)
+score = np.nanmean(scores)
+print('Cell type averaged kBET score:', score, flush=True)
+
+print('Create output AnnData object', flush=True)
+metric_name = meta['name']
+output = ad.AnnData(
+    uns={
+        'dataset_id': adata.uns['dataset_id'],
+        'normalization_id': adata.uns['normalization_id'],
+        'method_id': adata.uns['method_id'],
+        'metric_ids': [ metric_name ],
+        'metric_values': [ score ]
+    }
+)
+
+print('Write data to file', flush=True)
+output.write_h5ad(par['output'], compression='gzip')
diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
@@ -117,6 +117,8 @@ dependencies:
   - name: metrics/isolated_label_asw
   - name: metrics/isolated_label_f1
   - name: metrics/kbet
+  - name: metrics/kbet_pg
+  - name: metrics/kbet_pg_label
   - name: metrics/lisi
   - name: metrics/pcr
   # data processors
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
@@ -56,6 +56,8 @@ metrics = [
   isolated_label_asw,
   isolated_label_f1,
   kbet,
+  kbet_pg,
+  kbet_pg_label,
   lisi,
   pcr
 ]

Original file line number	Diff line number	Diff line change
`@@ -56,6 +56,8 @@ metrics = [`
`56`	`56`	`isolated_label_asw,`
`57`	`57`	`isolated_label_f1,`
`58`	`58`	`kbet,`
	`59`	`+ kbet_pg,`
	`60`	`+ kbet_pg_label,`
`59`	`61`	`lisi,`
`60`	`62`	`pcr`
`61`	`63`	`]`