
Commit 17933e7

Add lazy backend ASV test (#7426)
* Update dataset_io.py
* Update dataset_io.py
* Update dataset_io.py
* move _skip_slow to setup
* Add timing for all engines.
* Update dataset_io.py
* Update dataset_io.py
* Update dataset_io.py
* Update dataset_io.py
* Update dataset_io.py
* Update dataset_io.py
1 parent f3b7c69 commit 17933e7
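For orientation, ASV (airspeed velocity) discovers these benchmarks by convention: a class's setup() runs before the measurement (so skip logic and expensive preparation placed there, as this commit does with _skip_slow, does not pollute the timing), and every method whose name starts with time_ is what gets timed. The sketch below only illustrates that convention; the class IOReadTinyDataset and the file tiny.nc are made up, and it assumes the parameterized helper imported in this diff sets ASV's usual params/param_names attributes on the decorated method.

import numpy as np
import xarray as xr


class IOReadTinyDataset:
    # ASV sweeps every value in `params`; `param_names` labels the parameter axis.
    params = [None, {}]
    param_names = ["chunks"]

    def setup(self, chunks):
        # Runs before each timed call and is excluded from the measurement.
        self.ds = xr.Dataset({"x": ("t", np.arange(10))})
        self.ds.to_netcdf("tiny.nc")  # needs a netCDF backend (netcdf4 or scipy)

    def time_open_dataset(self, chunks):
        # Only the body of time_* methods is measured by ASV.
        xr.open_dataset("tiny.nc", chunks=chunks)  # chunks={} additionally requires dask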

File tree: 1 file changed (+191 −17 lines)


asv_bench/benchmarks/dataset_io.py

Lines changed: 191 additions & 17 deletions
@@ -1,11 +1,14 @@
+from __future__ import annotations
+
 import os
+from dataclasses import dataclass
 
 import numpy as np
 import pandas as pd
 
 import xarray as xr
 
-from . import _skip_slow, randint, randn, requires_dask
+from . import _skip_slow, parameterized, randint, randn, requires_dask
 
 try:
     import dask
@@ -16,6 +19,8 @@
 
 os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
 
+_ENGINES = tuple(xr.backends.list_engines().keys() - {"store"})
+
 
 class IOSingleNetCDF:
     """
@@ -28,10 +33,6 @@ class IOSingleNetCDF:
     number = 5
 
     def make_ds(self):
-        # TODO: Lazily skipped in CI as it is very demanding and slow.
-        # Improve times and remove errors.
-        _skip_slow()
-
         # single Dataset
         self.ds = xr.Dataset()
         self.nt = 1000
@@ -95,6 +96,10 @@ def make_ds(self):
 
 class IOWriteSingleNetCDF3(IOSingleNetCDF):
     def setup(self):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
+
         self.format = "NETCDF3_64BIT"
         self.make_ds()
 
@@ -107,6 +112,9 @@ def time_write_dataset_scipy(self):
 
 class IOReadSingleNetCDF4(IOSingleNetCDF):
     def setup(self):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
 
         self.make_ds()
 
@@ -128,6 +136,9 @@ def time_vectorized_indexing(self):
 
 class IOReadSingleNetCDF3(IOReadSingleNetCDF4):
     def setup(self):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
 
         self.make_ds()
 
@@ -149,6 +160,9 @@ def time_vectorized_indexing(self):
 
 class IOReadSingleNetCDF4Dask(IOSingleNetCDF):
     def setup(self):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
 
         requires_dask()
 
@@ -189,6 +203,9 @@ def time_load_dataset_netcdf4_with_time_chunks_multiprocessing(self):
 
 class IOReadSingleNetCDF3Dask(IOReadSingleNetCDF4Dask):
     def setup(self):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
 
         requires_dask()
 
@@ -230,10 +247,6 @@ class IOMultipleNetCDF:
     number = 5
 
     def make_ds(self, nfiles=10):
-        # TODO: Lazily skipped in CI as it is very demanding and slow.
-        # Improve times and remove errors.
-        _skip_slow()
-
         # multiple Dataset
         self.ds = xr.Dataset()
         self.nt = 1000
@@ -298,6 +311,10 @@ def make_ds(self, nfiles=10):
 
 class IOWriteMultipleNetCDF3(IOMultipleNetCDF):
     def setup(self):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
+
         self.make_ds()
         self.format = "NETCDF3_64BIT"
 
@@ -314,6 +331,9 @@ def time_write_dataset_scipy(self):
 
 class IOReadMultipleNetCDF4(IOMultipleNetCDF):
     def setup(self):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
 
         requires_dask()
 
@@ -330,6 +350,9 @@ def time_open_dataset_netcdf4(self):
 
 class IOReadMultipleNetCDF3(IOReadMultipleNetCDF4):
     def setup(self):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
 
         requires_dask()
 
@@ -346,6 +369,9 @@ def time_open_dataset_scipy(self):
 
 class IOReadMultipleNetCDF4Dask(IOMultipleNetCDF):
     def setup(self):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
 
         requires_dask()
 
@@ -400,6 +426,9 @@ def time_open_dataset_netcdf4_with_time_chunks_multiprocessing(self):
 
 class IOReadMultipleNetCDF3Dask(IOReadMultipleNetCDF4Dask):
     def setup(self):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
 
         requires_dask()
 
@@ -435,10 +464,6 @@ def time_open_dataset_scipy_with_time_chunks(self):
 def create_delayed_write():
     import dask.array as da
 
-    # TODO: Lazily skipped in CI as it is very demanding and slow.
-    # Improve times and remove errors.
-    _skip_slow()
-
     vals = da.random.random(300, chunks=(1,))
     ds = xr.Dataset({"vals": (["a"], vals)})
     return ds.to_netcdf("file.nc", engine="netcdf4", compute=False)
@@ -450,7 +475,12 @@ class IOWriteNetCDFDask:
     number = 5
 
     def setup(self):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
+
         requires_dask()
+
         self.write = create_delayed_write()
 
     def time_write(self):
@@ -459,15 +489,17 @@ def time_write(self):
 
 class IOWriteNetCDFDaskDistributed:
     def setup(self):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
+
+        requires_dask()
+
         try:
             import distributed
         except ImportError:
             raise NotImplementedError()
 
-        # TODO: Lazily skipped in CI as it is very demanding and slow.
-        # Improve times and remove errors.
-        _skip_slow()
-
         self.client = distributed.Client()
         self.write = create_delayed_write()
 
@@ -476,3 +508,145 @@ def cleanup(self):
 
     def time_write(self):
         self.write.compute()
+
+
+class IOReadSingleFile(IOSingleNetCDF):
+    def setup(self, *args, **kwargs):
+        self.make_ds()
+
+        self.filepaths = {}
+        for engine in _ENGINES:
+            self.filepaths[engine] = f"test_single_file_with_{engine}.nc"
+            self.ds.to_netcdf(self.filepaths[engine], engine=engine)
+
+    @parameterized(["engine", "chunks"], (_ENGINES, [None, {}]))
+    def time_read_dataset(self, engine, chunks):
+        xr.open_dataset(self.filepaths[engine], engine=engine, chunks=chunks)
+
+
+class IOReadCustomEngine:
+    def setup(self, *args, **kwargs):
+        """
+        The custom backend does the bare mininum to be considered a lazy backend. But
+        the data in it is still in memory so slow file reading shouldn't affect the
+        results.
+        """
+        requires_dask()
+
+        @dataclass
+        class PerformanceBackendArray(xr.backends.BackendArray):
+            filename_or_obj: str | os.PathLike | None
+            shape: tuple[int, ...]
+            dtype: np.dtype
+            lock: xr.backends.locks.SerializableLock
+
+            def __getitem__(self, key: tuple):
+                return xr.core.indexing.explicit_indexing_adapter(
+                    key,
+                    self.shape,
+                    xr.core.indexing.IndexingSupport.BASIC,
+                    self._raw_indexing_method,
+                )
+
+            def _raw_indexing_method(self, key: tuple):
+                raise NotImplementedError
+
+        @dataclass
+        class PerformanceStore(xr.backends.common.AbstractWritableDataStore):
+            manager: xr.backends.CachingFileManager
+            mode: str | None = None
+            lock: xr.backends.locks.SerializableLock | None = None
+            autoclose: bool = False
+
+            def __post_init__(self):
+                self.filename = self.manager._args[0]
+
+            @classmethod
+            def open(
+                cls,
+                filename: str | os.PathLike | None,
+                mode: str = "r",
+                lock: xr.backends.locks.SerializableLock | None = None,
+                autoclose: bool = False,
+            ):
+                if lock is None:
+                    if mode == "r":
+                        locker = xr.backends.locks.SerializableLock()
+                    else:
+                        locker = xr.backends.locks.SerializableLock()
+                else:
+                    locker = lock
+
+                manager = xr.backends.CachingFileManager(
+                    xr.backends.DummyFileManager,
+                    filename,
+                    mode=mode,
+                )
+                return cls(manager, mode=mode, lock=locker, autoclose=autoclose)
+
+            def load(self) -> tuple:
+                """
+                Load a bunch of test data quickly.
+
+                Normally this method would've opened a file and parsed it.
+                """
+                n_variables = 2000
+
+                # Important to have a shape and dtype for lazy loading.
+                shape = (1,)
+                dtype = np.dtype(int)
+                variables = {
+                    f"long_variable_name_{v}": xr.Variable(
+                        data=PerformanceBackendArray(
+                            self.filename, shape, dtype, self.lock
+                        ),
+                        dims=("time",),
+                        fastpath=True,
+                    )
+                    for v in range(0, n_variables)
+                }
+                attributes = {}
+
+                return variables, attributes
+
+        class PerformanceBackend(xr.backends.BackendEntrypoint):
+            def open_dataset(
+                self,
+                filename_or_obj: str | os.PathLike | None,
+                drop_variables: tuple[str] = None,
+                *,
+                mask_and_scale=True,
+                decode_times=True,
+                concat_characters=True,
+                decode_coords=True,
+                use_cftime=None,
+                decode_timedelta=None,
+                lock=None,
+                **kwargs,
+            ) -> xr.Dataset:
+                filename_or_obj = xr.backends.common._normalize_path(filename_or_obj)
+                store = PerformanceStore.open(filename_or_obj, lock=lock)
+
+                store_entrypoint = xr.backends.store.StoreBackendEntrypoint()
+
+                ds = store_entrypoint.open_dataset(
+                    store,
+                    mask_and_scale=mask_and_scale,
+                    decode_times=decode_times,
+                    concat_characters=concat_characters,
+                    decode_coords=decode_coords,
+                    drop_variables=drop_variables,
+                    use_cftime=use_cftime,
+                    decode_timedelta=decode_timedelta,
+                )
+                return ds
+
+        self.engine = PerformanceBackend
+
+    @parameterized(["chunks"], ([None, {}]))
+    def time_open_dataset(self, chunks):
+        """
+        Time how fast xr.open_dataset is without the slow data reading part.
+        Test with and without dask.
+        """
+        xr.open_dataset(None, engine=self.engine, chunks=chunks)
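As a usage note, the new IOReadCustomEngine benchmark passes the backend class itself as engine=, which xr.open_dataset accepts in addition to registered engine names. A rough interactive equivalent of what time_open_dataset measures might look like the sketch below; it assumes the PerformanceBackend class from the diff is available at module level (in the benchmark it is defined inside setup()) and that dask is installed for the chunks={} case.

import xarray as xr

# Passing the BackendEntrypoint subclass directly avoids entry-point registration.
ds_eager = xr.open_dataset(None, engine=PerformanceBackend, chunks=None)
ds_lazy = xr.open_dataset(None, engine=PerformanceBackend, chunks={})  # dask-backed

# The 2000 variables stay lazy: actually reading their values would end up in
# PerformanceBackendArray._raw_indexing_method, which raises NotImplementedError.
print(len(ds_eager.data_vars))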

0 commit comments
