Skip to content

Commit d1de5aa

Browse files
authored
Add ability to ignore the local cache (#119)
Fixes #44 Adds `ignore_cache` in various places to allow for the ignoring of cache. Useful for debugging and demos
1 parent 9496d1a commit d1de5aa

File tree

7 files changed

+281
-6
lines changed

7 files changed

+281
-6
lines changed

.vscode/settings.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,5 +79,8 @@
7979
"xaod",
8080
"xrootd"
8181
],
82-
"python.analysis.typeCheckingMode": "basic"
82+
"python.analysis.typeCheckingMode": "basic",
83+
"python.testing.pytestArgs": [
84+
"--no-cov"
85+
]
8386
}

README.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,47 @@ If you'd like to be able to submit multiple queries and have them run on the `Se
9090

9191
For documentation of `get_data` and `get_data_async` see the `servicex.py` source file.
9292

93+
### The Local Cache
94+
95+
To speed things up - especially when you run the same query multiple times - the `servicex` package caches the query data that comes back from ServiceX. You can control where this is stored with the `cache_path` setting in the `.servicex` file (see below).
96+
97+
There are times when you want the system to ignore the cache when it is running. You can do this by using `ignore_cache()`:
98+
99+
```python
100+
from servicex import ignore_cache
101+
102+
with ignore_cache():
103+
do_query()
104+
```
105+
106+
If you are using a Jupyter notebook, the `with` statement can't really span cells. So use `ignore_cache().__enter__()` instead. Or you can do something like:
107+
108+
```python
109+
from servicex import ignore_cache
110+
111+
ic = ignore_cache()
112+
ic.__enter__()
113+
114+
...
115+
116+
ic.__exit__(None, None, None)
117+
```
118+
119+
If you wish to disable the cache for a single dataset, use the `ignore_cache` parameter when you create it:
120+
121+
```python
122+
ds = ServiceXDataset(dataset, ignore_cache=True)
123+
```
124+
125+
Finally, you can ignore the cache for a dataset for a short period of time by using the same context manager pattern:
126+
127+
```python
128+
ds = ServiceXDataset(dataset)
129+
with ds.ignore_cache():
130+
do_query(ds) # Cache is ignored
131+
do_query(ds) # Cache is not ignored
132+
```
133+
93134
## Configuration
94135

95136
As mentioned above, the `.servicex` file is read to pull a configuration. The search path for this file:

servicex/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,4 @@
1010
)
1111
from .servicex_adaptor import ServiceXAdaptor # NOQA
1212
from .minio_adaptor import MinioAdaptor # NOQA
13-
from .cache import Cache # NOQA
13+
from .cache import Cache, ignore_cache # NOQA

servicex/cache.py

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,53 @@
11
import json
22
from pathlib import Path
33
from typing import Dict, List, Optional, Tuple, Any
4+
from contextlib import contextmanager
45

56
from .utils import ServiceXException, _query_cache_hash, sanitize_filename
67

8+
_ignore_cache = False
9+
10+
11+
@contextmanager
12+
def ignore_cache():
13+
'''This will cause all caches to be ignored while it is invoked:
14+
15+
```
16+
with ignore_cache():
17+
ServiceXDataset(...).get_data...()
18+
```
19+
20+
If you want to do this globally, you can just use the `__enter__()` method.
21+
This is probably the only way to do this accross cells in a notebook.
22+
23+
```
24+
i = ignore_cache()
25+
i.__enter__()
26+
... Query code, jupyter notebook cells, etc. go here
27+
i.__exit(None, None, None)
28+
```
29+
30+
Note:
31+
32+
- The only time the cache is checked is when the query is actually made, not when
33+
the servicex dataset object is created!
34+
- Calls to this can be safely nested.
35+
- Note that calling this doesn't clear the cache or delete anything. It
36+
just prevents the cache lookup from working while it is in effect.
37+
'''
38+
global _ignore_cache
39+
old_value = _ignore_cache
40+
_ignore_cache = True
41+
yield
42+
_ignore_cache = old_value
43+
744

845
class Cache:
946
'''
1047
Caching for all data returns from the system. It provides both in-memory
1148
and on-disk cache.
49+
50+
TODO: Rename this to be an adaptor, unifying how we name things
1251
'''
1352
_in_memory_cache = {}
1453

@@ -17,22 +56,34 @@ def reset_cache(cls):
1756
'Reset the internal cache, usually used for testing'
1857
cls._in_memory_cache = {}
1958

20-
def __init__(self, cache_path: Path):
59+
def __init__(self, cache_path: Path, ignore_cache: bool = False):
2160
'''
2261
Create the cache object
2362
2463
Arguments:
2564
2665
cache_path The path to the cache directory. Only sub-directories
2766
will be created in this path.
67+
ignore_cache If true, then always ignore the cache for any queries
68+
against this dataset.
2869
'''
2970
self._path = cache_path
71+
self._ignore_cache = ignore_cache
3072

3173
@property
3274
def path(self) -> Path:
3375
'Return root path of cache directory'
3476
return self._path
3577

78+
@contextmanager
79+
def ignore_cache(self):
80+
'''Ignore the cache as long as we are held. Supports nesting.
81+
'''
82+
old_ignore = self._ignore_cache
83+
self._ignore_cache = True
84+
yield
85+
self._ignore_cache = old_ignore
86+
3687
def _query_cache_file(self, json: Dict[str, str]) -> Path:
3788
'Return the query cache file'
3889
h = _query_cache_hash(json)
@@ -47,6 +98,10 @@ def _files_cache_file(self, id: str) -> Path:
4798
return self._path / 'file_list_cache' / id
4899

49100
def lookup_query(self, json: Dict[str, str]) -> Optional[str]:
101+
global _ignore_cache
102+
if _ignore_cache or self._ignore_cache:
103+
return None
104+
50105
f = self._query_cache_file(json)
51106
if not f.exists():
52107
return None
@@ -108,6 +163,10 @@ def set_inmem(self, id: str, v: Any):
108163
self._in_memory_cache[id] = v
109164

110165
def lookup_inmem(self, id: str) -> Optional[Any]:
166+
global _ignore_cache
167+
if _ignore_cache or self._ignore_cache:
168+
return None
169+
111170
if id not in self._in_memory_cache:
112171
return None
113172
return self._in_memory_cache[id]

servicex/servicex.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ def __init__(self,
4747
local_log: log_adaptor = None,
4848
session_generator: Callable[[], Awaitable[aiohttp.ClientSession]] = None,
4949
config_adaptor: Optional[ServiceXConfigAdaptor] = None,
50-
data_convert_adaptor: Optional[DataConverterAdaptor] = None):
50+
data_convert_adaptor: Optional[DataConverterAdaptor] = None,
51+
ignore_cache: bool = False):
5152
'''
5253
Create and configure a ServiceX object for a dataset.
5354
@@ -81,6 +82,9 @@ def __init__(self,
8182
data_convert_adaptor Manages conversions between root and parquet and `pandas`
8283
and `awkward`, including default settings for expected
8384
datatypes from the backend.
85+
ignore_cache Always ignore the cache on any query for this dataset. This
86+
is only meaningful if no cache adaptor is provided. Defaults
87+
to false - the cache is used if possible.
8488
8589
Notes:
8690
@@ -101,7 +105,7 @@ def __init__(self,
101105
else ServiceXConfigAdaptor()
102106

103107
# Establish the cache that will store all our queries
104-
self._cache = Cache(get_configured_cache_path(config.settings)) \
108+
self._cache = Cache(get_configured_cache_path(config.settings), ignore_cache) \
105109
if cache_adaptor is None \
106110
else cache_adaptor
107111

@@ -127,6 +131,15 @@ def __init__(self,
127131
self._converter = data_convert_adaptor if data_convert_adaptor is not None \
128132
else DataConverterAdaptor(config.get_default_returned_datatype(backend_type))
129133

134+
def ignore_cache(self):
135+
'''Return a context manager that, as long as it is held, will cause any queries against just
136+
this dataset to ignore any locally cached data.
137+
138+
Returns:
139+
ContextManager: As long as this is held, the local query cache will be ignored.
140+
'''
141+
return self._cache.ignore_cache()
142+
130143
@functools.wraps(ServiceXABC.get_data_rootfiles_async, updated=())
131144
@_wrap_in_memory_sx_cache
132145
async def get_data_rootfiles_async(self, selection_query: str) -> List[Path]:

tests/test_cache.py

Lines changed: 98 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from servicex.utils import ServiceXException
22
import pytest
33

4-
from servicex.cache import Cache
4+
from servicex.cache import Cache, ignore_cache
55

66

77
@pytest.fixture()
@@ -27,6 +27,26 @@ def test_query_hit_1(tmp_path):
2727
assert c.lookup_query({'hi': 'there'}) == 'dude'
2828

2929

30+
def test_ic_query(tmp_path):
31+
c = Cache(tmp_path)
32+
c.set_query({'hi': 'there'}, 'dude')
33+
with ignore_cache():
34+
assert c.lookup_query({'hi': 'there'}) is None
35+
36+
37+
def test_ic_query_query_context(tmp_path):
38+
c = Cache(tmp_path)
39+
c.set_query({'hi': 'there'}, 'dude')
40+
with c.ignore_cache():
41+
assert c.lookup_query({'hi': 'there'}) is None
42+
43+
44+
def test_ic_query_ds_level(tmp_path):
45+
c = Cache(tmp_path, ignore_cache=True)
46+
c.set_query({'hi': 'there'}, 'dude')
47+
assert c.lookup_query({'hi': 'there'}) is None
48+
49+
3050
def test_query_hit_2(tmp_path):
3151
c = Cache(tmp_path)
3252
c.set_query({'hi': 'there'}, 'dude1')
@@ -61,6 +81,14 @@ def test_files_hit(tmp_path):
6181
assert c.lookup_files('1234') == [['hi', '1'], ['there', '1']]
6282

6383

84+
def test_ic_files_hit(tmp_path):
85+
'The file list should not be affected by cache ignores'
86+
c = Cache(tmp_path)
87+
c.set_files('1234', [('hi', '1'), ('there', '1')])
88+
with ignore_cache():
89+
assert c.lookup_files('1234') == [['hi', '1'], ['there', '1']]
90+
91+
6492
def test_files_hit_reloaded(tmp_path):
6593
c1 = Cache(tmp_path)
6694
c1.set_files('1234', [('hi', '1'), ('there', '1')])
@@ -80,6 +108,29 @@ def test_memory_hit(tmp_path):
80108
assert c.lookup_inmem('dude') is r
81109

82110

111+
def test_ic_memory_hit(tmp_path):
112+
c = Cache(tmp_path)
113+
r = 10
114+
c.set_inmem('dude', r)
115+
with ignore_cache():
116+
assert c.lookup_inmem('dude') is None
117+
118+
119+
def test_ic_memory_hit_ds_context(tmp_path):
120+
c = Cache(tmp_path)
121+
r = 10
122+
c.set_inmem('dude', r)
123+
with c.ignore_cache():
124+
assert c.lookup_inmem('dude') is None
125+
126+
127+
def test_ic_memory_hit_ds_level(tmp_path):
128+
c = Cache(tmp_path, ignore_cache=True)
129+
r = 10
130+
c.set_inmem('dude', r)
131+
assert c.lookup_inmem('dude') is None
132+
133+
83134
def test_memory_hit_accross(tmp_path):
84135
c1 = Cache(tmp_path)
85136
r = 10
@@ -131,3 +182,49 @@ def test_query_cache_status_bad(tmp_path):
131182

132183
with pytest.raises(ServiceXException):
133184
c.lookup_query_status('111-222-333')
185+
186+
187+
def test_ic_query_cache_status(tmp_path):
188+
'Query status should be cached and accessed *during* a query'
189+
c = Cache(tmp_path)
190+
info = {'request_id': '111-222-333', 'key': 'bogus'}
191+
c.set_query_status(info)
192+
with ignore_cache():
193+
info1 = c.lookup_query_status('111-222-333')
194+
assert info1['key'] == 'bogus'
195+
196+
197+
def test_ic_restore(tmp_path):
198+
c = Cache(tmp_path)
199+
c.set_query({'hi': 'there'}, 'dude')
200+
with ignore_cache():
201+
pass
202+
assert c.lookup_query({'hi': 'there'}) == 'dude'
203+
204+
205+
def test_ic_nesting(tmp_path):
206+
c = Cache(tmp_path)
207+
c.set_query({'hi': 'there'}, 'dude')
208+
with ignore_cache():
209+
with ignore_cache():
210+
pass
211+
assert c.lookup_query({'hi': 'there'}) is None
212+
213+
214+
def test_ic_nesting_ds_context(tmp_path):
215+
c = Cache(tmp_path)
216+
c.set_query({'hi': 'there'}, 'dude')
217+
with c.ignore_cache():
218+
with c.ignore_cache():
219+
pass
220+
assert c.lookup_query({'hi': 'there'}) is None
221+
222+
223+
def test_ic_enter_exit(tmp_path):
224+
c = Cache(tmp_path)
225+
c.set_query({'hi': 'there'}, 'dude')
226+
i = ignore_cache()
227+
i.__enter__()
228+
assert c.lookup_query({'hi': 'there'}) is None
229+
i.__exit__(None, None, None)
230+
assert c.lookup_query({'hi': 'there'}) == 'dude'

0 commit comments

Comments
 (0)