Improved result matrix stacking for Hamming GPU implementation (#617)

felixpetschko · pre-commit-ci[bot] · grst · web-flow · commit 7b769330fa3e · 2025-06-11T14:34:21.000+02:00
* Additional Numba implementation for GPU result matrix stacking
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Changelog updated for GPU Hamming result matrix block stacking with Numba
* Raise the lower bound of the numba dependency (>=0.57)
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Gregor Sturm <mail@gregor-sturm.de>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,10 @@ and this project adheres to [Semantic Versioning][].
 
 ## [Unreleased]
 
+### Performance improvements
+
+- The result matrix blocks produced by the GPU implementation of the Hamming distance metric are now stacked with a Numba-compiled routine ([#617](https://github.com/scverse/scirpy/pull/617)).
+
 ### Fixes
 
 - Ensure that clonotype network plots don't have any axis ticks ([#607](https://github.com/scverse/scirpy/pull/607)).
diff --git a/pyproject.toml b/pyproject.toml
@@ -34,7 +34,7 @@ dependencies = [
   "logomaker!=0.8.5",
   "mudata>=0.2.3",
   "networkx>=2.5",
-  "numba>=0.41",
+  "numba>=0.57",
   "numpy>=1.17",
   "pandas>=1.5,!=2.1.2",
   "pooch>=1.7",
diff --git a/src/scirpy/ir_dist/metrics.py b/src/scirpy/ir_dist/metrics.py
@@ -1107,12 +1107,44 @@ def calc_block_gpu(
         Current number: {num_elements}, Maximum number: {np.iinfo(np.int32).max}.
         Consider choosing a smaller cutoff to resolve this issue."""
 
-        result_sparse = result_blocks[0]
-        for i in range(1, len(result_blocks)):
-            result_sparse += result_blocks[i]
+        @nb.njit
+        def csr_union_numba(block_data, block_indices, block_indptrs, num_rows, num_elements):
+            data = np.empty(num_elements, dtype=block_data[0].dtype)
+            indices = np.empty(num_elements, dtype=block_indices[0].dtype)
+            indptr = np.zeros(num_rows + 1, dtype=np.int32)
 
-        row_element_counts_gpu = np.diff(result_sparse.indptr)
+            ptr = 0
+            for row in range(num_rows):
+                for b in range(len(block_indptrs)):
+                    start = block_indptrs[b][row]
+                    end = block_indptrs[b][row + 1]
+                    count = end - start
+
+                    for j in range(count):
+                        data[ptr + j] = block_data[b][start + j]
+                        indices[ptr + j] = block_indices[b][start + j]
+
+                    ptr += count
+                indptr[row + 1] = ptr
+
+            return data, indices, indptr
+
+        def csr_union(blocks):
+            num_rows = blocks[0].shape[0]
+            num_elements = sum(b.nnz for b in blocks)
 
+            block_data = [b.data for b in blocks]
+            block_indices = [b.indices for b in blocks]
+            block_indptrs = [b.indptr for b in blocks]
+
+            data, indices, indptr = csr_union_numba(block_data, block_indices, block_indptrs, num_rows, num_elements)
+
+            shape = blocks[0].shape
+            return csr_matrix((data, indices, indptr), shape=shape)
+
+        result_sparse = csr_union(result_blocks)
+
+        row_element_counts_gpu = np.diff(result_sparse.indptr)
         result_sparse.sort_indices()
 
         # Returns the results in a way that fits the current interface, could be improved later