From 1f9eb682edc29de9c648e00acc300d5d9a020f1f Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Sun, 5 May 2024 21:41:58 -0400 Subject: [PATCH 1/7] Micro optimize dataset.isel for speed on large datasets This targets optimization for datasets with many "scalar" variables (that is, variables without any dimensions). This can happen in the context where you have many pieces of small metadata that relate to various facts about an experimental condition. For example, we have about 80 of these in our datasets (and I want to increase this number) Our datasets are quite large (On the order of 1TB uncompressed) so we often have one dimension that is in the 10's of thousands. However, it has become quite slow to index in the dataset. We therefore often "carefully slice out the metadata we need" prior to doing anything with our dataset, but that isn't quite possible when you want to orchestrate things with a parent application. These optimizations are likely "minor" but considering the results of the benchmark, I think they are quite worthwhile: * main (as of #9001) - 2.5k its/s * With #9002 - 4.2k its/s * With this Pull Request (on top of #9002) -- 6.1k its/s Thanks for considering. 
--- xarray/core/dataset.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 0b8be674675..b2da2c78e61 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2983,20 +2983,30 @@ def isel( coord_names = self._coord_names.copy() indexes, index_variables = isel_indexes(self.xindexes, indexers) + all_keys = set(indexers.keys()) for name, var in self._variables.items(): # preserve variable order if name in index_variables: var = index_variables[name] - else: - var_indexers = {k: v for k, v in indexers.items() if k in var.dims} - if var_indexers: + dims.update(zip(var.dims, var.shape)) + # Fastpath, skip all of this for variables with no dimensions + # Keep the result cached for future dictionary update + elif var_dims := var.dims: + # Large datasets with alot of metadata may have many scalars + # without any relevant dimensions for slicing. + # Pick those out quickly and avoid paying the cost below + # of resolving the var_indexers variables + if var_indexer_keys := all_keys.intersection(var_dims): + var_indexers = {k: indexers[k] for k in var_indexer_keys} var = var.isel(var_indexers) if drop and var.ndim == 0 and name in coord_names: coord_names.remove(name) continue + # Update our reference to `var_dims` after the call to isel + var_dims = var.dims + dims.update(zip(var_dims, var.shape)) variables[name] = var - dims.update(zip(var.dims, var.shape)) return self._construct_direct( variables=variables, From 53dc76ee94f9f8e90703097aa9914e6ec8f4537b Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Tue, 11 Jun 2024 23:04:32 -0400 Subject: [PATCH 2/7] Use len to improve readability --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index b2da2c78e61..bc827539d20 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2992,7 +2992,7 @@ def isel( 
dims.update(zip(var.dims, var.shape)) # Fastpath, skip all of this for variables with no dimensions # Keep the result cached for future dictionary update - elif var_dims := var.dims: + elif len(var_dims := var.dims): # Large datasets with alot of metadata may have many scalars # without any relevant dimensions for slicing. # Pick those out quickly and avoid paying the cost below From b016dceadf99c557881f861e76f7a57860bbb358 Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Tue, 11 Jun 2024 23:28:39 -0400 Subject: [PATCH 3/7] Add more comment --- xarray/core/dataset.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index bc827539d20..0e16d723eb4 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2990,8 +2990,18 @@ def isel( if name in index_variables: var = index_variables[name] dims.update(zip(var.dims, var.shape)) - # Fastpath, skip all of this for variables with no dimensions - # Keep the result cached for future dictionary update + # Fastpath, skip all this metadata analysis for variables + # with no dimensions + # Keep the result of var.dims cached for future accesss to it + # + # Optimization Note from hmaarrfk - 2024/06 + # https://github.com/pydata/xarray/pull/9003#discussion_r1592767493 + # It was found that accessing var.dims is faster than + # using var.shape or var.ndim since resolving both is typically + # left to the underlying array that each Xarray structure wraps. + # By using var.dims, we can avoid the cost of resolving the + # underlying array's shape and ndim since the dims are already + # cached by the Variable elif len(var_dims := var.dims): # Large datasets with alot of metadata may have many scalars # without any relevant dimensions for slicing. 
From 4e0701a25b27fac7601ae5db241ccc9d762179db Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Tue, 11 Jun 2024 23:29:06 -0400 Subject: [PATCH 4/7] Add whats-new --- doc/whats-new.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 51a2c98fb9c..5e1262707c5 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -25,6 +25,15 @@ New Features - Allow chunking for arrays with duplicated dimension names (:issue:`8759`, :pull:`9099`). By `Martin Raspaud `_. +Performance +~~~~~~~~~~~ + +- Small optimization to the netCDF4 and h5netcdf backends (:issue:`9058`, :pull:`9067`). + By `Deepak Cherian `_. +- Small optimizations to help reduce indexing speed of datasets (:pull:`9002`, :pull:`9003`). + By `Mark Harfouche `_. + + Breaking changes ~~~~~~~~~~~~~~~~ From 5bcf40c24a06a435382bf281f4af71fb1d5b9fe3 Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Tue, 18 Jun 2024 20:17:23 -0400 Subject: [PATCH 5/7] update release note --- doc/whats-new.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 5e1262707c5..4ef2519dce1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -28,9 +28,7 @@ New Features Performance ~~~~~~~~~~~ -- Small optimization to the netCDF4 and h5netcdf backends (:issue:`9058`, :pull:`9067`). - By `Deepak Cherian `_. -- Small optimizations to help reduce indexing speed of datasets (:pull:`9002`, :pull:`9003`). +- Small optimizations to help reduce indexing speed of datasets (:pull:`9003`). By `Mark Harfouche `_. 
From f7945a311bb339045af9736216c5d42d77ecdf91 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 21 Jun 2024 14:38:03 -0600 Subject: [PATCH 6/7] cleanup --- xarray/core/dataset.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 0e16d723eb4..34acac1514f 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2986,14 +2986,6 @@ def isel( all_keys = set(indexers.keys()) for name, var in self._variables.items(): - # preserve variable order - if name in index_variables: - var = index_variables[name] - dims.update(zip(var.dims, var.shape)) - # Fastpath, skip all this metadata analysis for variables - # with no dimensions - # Keep the result of var.dims cached for future accesss to it - # # Optimization Note from hmaarrfk - 2024/06 # https://github.com/pydata/xarray/pull/9003#discussion_r1592767493 # It was found that accessing var.dims is faster than @@ -3002,21 +2994,24 @@ def isel( # By using var.dims, we can avoid the cost of resolving the # underlying array's shape and ndim since the dims are already # cached by the Variable - elif len(var_dims := var.dims): - # Large datasets with alot of metadata may have many scalars - # without any relevant dimensions for slicing. 
- # Pick those out quickly and avoid paying the cost below - # of resolving the var_indexers variables - if var_indexer_keys := all_keys.intersection(var_dims): + var_dims = var.dims + # Fastpath, skip all this metadata analysis for variables + # with no dimensions + if var_dims: + # preserve variable order + if name in index_variables: + var = index_variables[name] + elif var_indexer_keys := all_keys.intersection(var_dims): var_indexers = {k: indexers[k] for k in var_indexer_keys} var = var.isel(var_indexers) if drop and var.ndim == 0 and name in coord_names: coord_names.remove(name) continue # Update our reference to `var_dims` after the call to isel + # to reflect any dropped dimensions var_dims = var.dims - dims.update(zip(var_dims, var.shape)) variables[name] = var + dims.update(zip(var_dims, var.shape)) return self._construct_direct( variables=variables, From 1c2e5c65efddf503503fde39f673484ee4c089c9 Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Sat, 22 Jun 2024 16:49:35 -0400 Subject: [PATCH 7/7] Revert "cleanup" This reverts commit f7945a311bb339045af9736216c5d42d77ecdf91. 
--- xarray/core/dataset.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 34acac1514f..0e16d723eb4 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2986,6 +2986,14 @@ def isel( all_keys = set(indexers.keys()) for name, var in self._variables.items(): + # preserve variable order + if name in index_variables: + var = index_variables[name] + dims.update(zip(var.dims, var.shape)) + # Fastpath, skip all this metadata analysis for variables + # with no dimensions + # Keep the result of var.dims cached for future accesss to it + # # Optimization Note from hmaarrfk - 2024/06 # https://github.com/pydata/xarray/pull/9003#discussion_r1592767493 # It was found that accessing var.dims is faster than @@ -2994,24 +3002,21 @@ def isel( # By using var.dims, we can avoid the cost of resolving the # underlying array's shape and ndim since the dims are already # cached by the Variable - var_dims = var.dims - # Fastpath, skip all this metadata analysis for variables - # with no dimensions - if var_dims: - # preserve variable order - if name in index_variables: - var = index_variables[name] - elif var_indexer_keys := all_keys.intersection(var_dims): + elif len(var_dims := var.dims): + # Large datasets with alot of metadata may have many scalars + # without any relevant dimensions for slicing. 
+ # Pick those out quickly and avoid paying the cost below + # of resolving the var_indexers variables + if var_indexer_keys := all_keys.intersection(var_dims): var_indexers = {k: indexers[k] for k in var_indexer_keys} var = var.isel(var_indexers) if drop and var.ndim == 0 and name in coord_names: coord_names.remove(name) continue # Update our reference to `var_dims` after the call to isel - # to reflect any dropped dimensions var_dims = var.dims + dims.update(zip(var_dims, var.shape)) variables[name] = var - dims.update(zip(var_dims, var.shape)) return self._construct_direct( variables=variables,