Commit 1ddbf03

Do not request parquet when we want root-file (#257)
Bug fixes:

* Make sure to be consistent about passing `root-file` and not `root-files`.
* Limit the result format types allowed to `root-file` and `parquet`. There is a global var in `servicex.py` that contains the legal list, which can be updated when testing new formats.
* Change where `flake8` in `vscode` gets its info from.

Fixes #253
1 parent b1df19a commit 1ddbf03
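
A minimal sketch of what the new client-side guard amounts to (a hedged reconstruction for illustration, not code from the commit; check_format is a made-up helper name):

# g_allowed_formats and the assert come from the diff below; check_format is hypothetical.
g_allowed_formats = ["parquet", "root-file"]

def check_format(data_format: str) -> None:
    # Mirrors the assert this commit adds to _build_json_query.
    assert data_format in g_allowed_formats, f"unsupported result format: {data_format}"

check_format("root-file")   # passes: the spelling the backend expects
check_format("parquet")     # passes
check_format("root-files")  # raises AssertionError: the old plural spelling now fails fast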

2 files changed: +19 −14 lines

.vscode/settings.json

Lines changed: 3 additions & 0 deletions
@@ -101,4 +101,7 @@
     "editor.formatOnSave": true,
     "python.formatting.provider": "black",
     "restructuredtext.preview.docutils.disabled": true,
+    "python.linting.flake8Args": [
+        "--config=.flake8"
+    ],
 }

servicex/servicex.py

Lines changed: 16 additions & 14 deletions
@@ -55,6 +55,10 @@
     stream_unique_updates_only,
 )

+# The allowed file formats.
+# You could modify this if you wanted to add new...
+g_allowed_formats = ["parquet", "root-file"]
+

 class StreamInfoBase:
     """Contains base information about results that are streamed back from
@@ -320,8 +324,8 @@ def first_supported_datatype(
         return None

     def ignore_cache(self):
-        """Return a context manager that, as long as it is held, will cause any queries against just
-        this dataset to ignore any locally cached data.
+        """Return a context manager that, as long as it is held, will cause any queries against
+        just this dataset to ignore any locally cached data.

         Returns:
             ContextManager: As long as this is held, the local query cache will be ignored.
@@ -352,7 +356,7 @@ async def get_data_rootfiles_stream(
         file locally.
         """
         async for f_info in self._stream_local_files(
-            selection_query, title, "root-files"
+            selection_query, title, "root-file"
         ):  # type: ignore
             yield f_info
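
For context, a hedged usage sketch of the method this hunk fixes; the import path, dataset name, and query string are placeholders rather than anything taken from the commit:

import asyncio
from servicex import ServiceXDataset  # assumed public import path

async def dump_local_files():
    ds = ServiceXDataset("made.up.rucio.dataset")  # placeholder dataset name
    # "(call ...)" stands in for a real qastle selection query.
    async for f_info in ds.get_data_rootfiles_stream("(call ...)"):
        print(f_info)  # each item points at a ROOT file written locally

asyncio.run(dump_local_files())

With this fix the stream requests the "root-file" format the backend recognizes, instead of "root-files".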

@@ -462,7 +466,7 @@ async def get_data_rootfiles_uri_stream(
             as_signed_url (bool): Return the uri as a presigned http url?
         """
         async for f_info in self._stream_url_buckets(
-            selection_query, "root-files", title, as_signed_url
+            selection_query, "root-file", title, as_signed_url
         ):  # type: ignore
             yield f_info

@@ -591,7 +595,6 @@ async def _stream_url_buckets(
         query = self._build_json_query(selection_query, data_format, title)

         async with aiohttp.ClientSession() as client:
-
             # Get a request id - which might be cached, but if not, submit it.
             request_id = await self._get_request_id(client, query)

@@ -768,7 +771,7 @@ async def _stream_local_files(
     async def _get_files(
         self,
         selection_query: str,
-        data_type: str,
+        data_format: str,
         notifier: _status_update_wrapper,
         title: Optional[str],
     ) -> AsyncIterator[Tuple[str, Awaitable[Path]]]:
@@ -787,7 +790,7 @@ async def _get_files(
         Arguments:

             selection_query The query string to send to ServiceX
-            data_type The type of data that we want to come back.
+            data_format The type of data that we want to come back.
             notifier Status callback to let our progress be advertised
             title Title to pass to servicex backend.
@@ -797,10 +800,9 @@ async def _get_files(
             This is returned this way so a number of downloads can run
             simultaneously.
         """
-        query = self._build_json_query(selection_query, data_type, title)
+        query = self._build_json_query(selection_query, data_format, title)

         async with aiohttp.ClientSession() as client:
-
             # Get a request id - which might be cached, but if not, submit it.
             request_id = await self._get_request_id(client, query)

@@ -950,7 +952,6 @@ async def _get_files_from_servicex(
         start_time = time.monotonic()
         good = True
         try:
-
             # Get the stream of minio bucket new files.
             stream_new_object = self._get_minio_bucket_files_from_servicex(
                 request_id, client, minio_adaptor, notifier
@@ -1007,7 +1008,6 @@ async def _get_minio_bucket_files_from_servicex(
         """
         start_time = time.monotonic()
         try:
-
             # Setup the status sequence from servicex
             stream_status = transform_status_stream(
                 self._servicex_adaptor, client, request_id
@@ -1034,23 +1034,25 @@ async def _get_minio_bucket_files_from_servicex(
         )

     def _build_json_query(
-        self, selection_query: str, data_type: str, title: Optional[str]
+        self, selection_query: str, data_format: str, title: Optional[str]
     ) -> Dict[str, Union[str, Iterable[str]]]:
         """
         Returns a list of locally written files for a given selection query.

         Arguments:
             selection_query The query to be send into the ServiceX API
-            data_type What is the output data type (parquet, root-file, etc.)
+            data_format What is the output data type (parquet, root-file, etc.)

         Notes:
             - Internal routine.
         """
+        assert data_format in g_allowed_formats
+
         # Items that must always be present
         json_query: Dict[str, Union[str, Iterable[str]]] = {
             "selection": selection_query,
             "result-destination": self._result_destination,
-            "result-format": "parquet" if data_type == "parquet" else "root-file",
+            "result-format": "parquet" if data_format == "parquet" else "root-file",
             "chunk-size": "1000",
             "workers": str(self._max_workers),
         }
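
Roughly the JSON body _build_json_query now produces for a ROOT-file request, as a hedged reconstruction; the "selection", "result-destination", and "workers" values depend on the caller and dataset configuration and are assumed here:

# Example return value for data_format="root-file" (values marked assumed):
json_query = {
    "selection": "(call ...)",             # placeholder qastle query from the caller
    "result-destination": "object-store",  # assumed value of self._result_destination
    "result-format": "root-file",          # never the misspelled "root-files" anymore
    "chunk-size": "1000",
    "workers": "20",                       # str(self._max_workers); 20 is assumed
}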
