Skip to content

Commit 9341628

Browse files
authored
Awkward 2.0 support in servicex 2.0 (#317)
Enable awkward 2.0 support in `servicex` * Setup configured to support only awkward 1.0 if you are on Python 3.7 (we need to drop support for this!) * If you load an `awkward` array using this library, it will give you a `dask_awkward` array if `awkward` 2.0 is installed, or a `lazy` array if `awkward` 1.0 is in your environment. Fixes #316
1 parent 9789b7d commit 9341628

File tree

4 files changed

+40
-15
lines changed

4 files changed

+40
-15
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ The file can contain an `api_endpoint` as mentioned earlier. In addition the oth
195195
All strings are expanded using Python's [os.path.expandvars](https://docs.python.org/3/library/os.path.html#os.path.expandvars) function - so `$NAME` and `${NAME}` will work to expand existing environment variables.
196196

197197
For non-standard use cases, the user can specify:
198+
198199
- The code generator that is used by the backend. This is done by passing a `codegen` argument to ServiceXDataset. This argument is normally inherited from the backend type set in `servicex.yaml`, but can be overridden with any valid `codegen` contained in the default type listing. A `codegen` entry can also be added to a backend in the yaml file to use as default.
199200
- The type of backend, using the `backend_type` argument on ServiceXDataset. This overrides the backend type setting in the `servicex.yaml` file.
200201

@@ -206,7 +207,8 @@ Implemented:
206207
- Exceptions are used to report back errors of all sorts from the service to the user's code.
207208
- Data is returned in the following forms:
208209
- `pandas.DataFrame` an in process DataFrame of all the data requested
209-
- `awkward` an in process `JaggedArray` or dictionary of `JaggedArray`s
210+
- `awkward` an in process `JaggedArray` or dictionary of `JaggedArray`s.
211+
- If you have `awkward` 2.0 installed, then a `dask_awkward` array is returned instead.
210212
- A list of root files that can be opened with `uproot` and used as desired.
211213
- Not all output formats are compatible with all transformations.
212214
- Complete returned data must fit in the process' memory

servicex/data_conversions.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ async def _convert_root_to_pandas(self, file: Path):
9797
def do_the_work(file: Path) -> DataFrame:
9898
import uproot as uproot
9999

100-
with uproot.open(file) as f_in:
100+
with uproot.open(file) as f_in: # type: ignore
101101
r = f_in[f_in.keys()[0]]
102102
return r.arrays(library="pd") # type: ignore
103103

@@ -152,10 +152,18 @@ async def _convert_root_to_awkward(self, file: Path):
152152
def do_the_work(file: Path) -> ak.Array:
153153
import uproot as uproot
154154

155-
with uproot.open(file) as f_in:
155+
with uproot.open(file) as f_in: # type: ignore
156156
tree_name = f_in.keys()[0]
157157

158-
return uproot.lazy(f"{file}:{tree_name}")
158+
if hasattr(uproot, "lazy"):
159+
return uproot.lazy(f"{file}:{tree_name}") # type: ignore
160+
161+
if hasattr(uproot, "dask"):
162+
return uproot.dask(f"{file}:{tree_name}") # type: ignore
163+
164+
assert (
165+
False
166+
), "Uproot version does not have either `dask` or `lazy` - please fix environment!"
159167

160168
return await asyncio.wrap_future(_conversion_pool.submit(do_the_work, file))
161169

setup.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,21 @@
1818
else:
1919
version = version.split("/")[-1]
2020

21+
# Awkward 2.0 is only allowed on Python 3.8+ - so we need to shift the
22+
# awkward requirement a little bit.
23+
# TODO: Remove this when we stop supporting 3.7.
24+
if sys.version_info < (3, 8):
25+
awkward_requirements = [
26+
"awkward>=1.0.1,<2",
27+
"uproot>=4.0.1,<5",
28+
]
29+
else:
30+
awkward_requirements = [
31+
"awkward>=1.0.1",
32+
"dask_awkward",
33+
"fsspec",
34+
"uproot>=4.0.1",
35+
]
2136
setup(
2237
name="servicex",
2338
version=version,
@@ -37,8 +52,7 @@
3752
install_requires=[
3853
"idna==2.10", # Required to thread version needle with requests library
3954
"pandas~=1.0",
40-
"uproot>=4.0.1, <5",
41-
"awkward>=1.0.1, <2",
55+
"uproot>=4.0.1",
4256
"backoff>=2.0",
4357
"aiohttp~=3.6",
4458
"minio~=5.0",
@@ -48,7 +62,8 @@
4862
"google-auth",
4963
"confuse",
5064
"pyarrow>=1.0",
51-
],
65+
]
66+
+ awkward_requirements,
5267
extras_require={
5368
"test": [
5469
"pytest>=3.9",

tests/test_data_conversions.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
def check_awkward_accessible(col: ak.Array):
99
"Check to make sure we can look at every item in column"
10-
ak.repartition(col, 3) # type: ignore
10+
ak.sum(col)
1111

1212

1313
def check_pandas_accessible(col):
@@ -51,14 +51,14 @@ async def test_parquet_to_pandas(good_uproot_file_path):
5151
@pytest.mark.asyncio
5252
async def test_parquet_to_awkward(good_uproot_file_path):
5353
df = await DataConverterAdaptor("parquet").convert_to_awkward(good_uproot_file_path)
54-
assert len(df["JetPT"]) == 115714
54+
assert len(df["JetPT"]) == 115714 # type: ignore
5555
check_awkward_accessible(df["JetPT"]) # type: ignore
5656

5757

5858
@pytest.mark.asyncio
5959
async def test_root_to_awkward(good_root_file_path):
6060
df = await DataConverterAdaptor("root-file").convert_to_awkward(good_root_file_path)
61-
assert len(df["JetPt"]) == 283458
61+
assert len(df["JetPt"]) == 283458 # type: ignore
6262
check_awkward_accessible(df["JetPt"]) # type: ignore
6363

6464

@@ -84,7 +84,7 @@ def test_combine_pandas_from_root(good_root_file_path):
8484
def load_df():
8585
import uproot as uproot
8686

87-
with uproot.open(good_root_file_path) as f_in:
87+
with uproot.open(good_root_file_path) as f_in: # type: ignore
8888
r = f_in[f_in.keys()[0]]
8989
return r.arrays(library="pd") # type: ignore
9090

@@ -120,9 +120,9 @@ def test_combine_awkward_from_root(good_root_file_path):
120120
def load_df():
121121
import uproot as uproot
122122

123-
with uproot.open(good_root_file_path) as f_in:
123+
with uproot.open(good_root_file_path) as f_in: # type: ignore
124124
tree_name = f_in.keys()[0]
125-
return uproot.lazy(f"{good_root_file_path}:{tree_name}")
125+
return f_in[tree_name].arrays() # type: ignore
126126

127127
df1 = load_df()
128128
df2 = load_df()
@@ -142,7 +142,7 @@ def load_df():
142142
df1 = load_df()
143143
df2 = load_df()
144144

145-
combined = DataConverterAdaptor("root-file").combine_awkward([df1, df2])
145+
combined = DataConverterAdaptor("root-file").combine_awkward([df1, df2]) # type: ignore
146146

147-
assert len(combined) == len(df1) + len(df2)
147+
assert len(combined) == len(df1) + len(df2) # type: ignore
148148
check_awkward_accessible(combined["JetPT"]) # type: ignore

0 commit comments

Comments
 (0)