@@ -40,6 +40,14 @@ class StreamInfoUrl:
40
40
bucket : str
41
41
42
42
43
+ @dataclass
44
+ class StreamInfoPath :
45
+ '''Contains information on accessing ServiceX data via a local Path
46
+ '''
47
+ path : Path
48
+ file : str
49
+
50
+
43
51
class ServiceXDataset (ServiceXABC ):
44
52
'''
45
53
Used to access an instance of ServiceX at an end point on the internet. Support convieration
@@ -159,11 +167,47 @@ def ignore_cache(self):
159
167
async def get_data_rootfiles_async (self , selection_query : str ) -> List [Path ]:
160
168
return await self ._file_return (selection_query , 'root-file' )
161
169
170
+ async def get_data_rootfiles_stream (self , selection_query : str ) \
171
+ -> AsyncIterator [StreamInfoPath ]:
172
+ '''Returns, as an async iterator, each completed batch of work from Servicex.
173
+ The `StreamInfoPath` contains a path where downstream consumers can directly
174
+ access the data.
175
+
176
+ Args:
177
+ selection_query (str): The `qastle` query for the data to retreive.
178
+
179
+ Yields:
180
+ AsyncIterator[StreamInfoPath]: As ServiceX completes the data, and it is downloaded
181
+ to the local machine, the async iterator returns
182
+ a `StreamInfoPath` which can be used to access the
183
+ file locally.
184
+ '''
185
+ async for f_info in self ._stream_local_files (selection_query , 'root-files' ):
186
+ yield f_info
187
+
162
188
@functools .wraps (ServiceXABC .get_data_parquet_async , updated = ())
163
189
@_wrap_in_memory_sx_cache
164
190
async def get_data_parquet_async (self , selection_query : str ) -> List [Path ]:
165
191
return await self ._file_return (selection_query , 'parquet' )
166
192
193
+ async def get_data_parquet_stream (self , selection_query : str ) \
194
+ -> AsyncIterator [StreamInfoPath ]:
195
+ '''Returns, as an async iterator, each completed batch of work from Servicex.
196
+ The `StreamInfoPath` contains a path where downstream consumers can directly
197
+ access the data.
198
+
199
+ Args:
200
+ selection_query (str): The `qastle` query for the data to retreive.
201
+
202
+ Yields:
203
+ AsyncIterator[StreamInfoPath]: As ServiceX completes the data, and it is downloaded
204
+ to the local machine, the async iterator returns
205
+ a `StreamInfoPath` which can be used to access the
206
+ file locally.
207
+ '''
208
+ async for f_info in self ._stream_local_files (selection_query , 'parquet' ):
209
+ yield f_info
210
+
167
211
@functools .wraps (ServiceXABC .get_data_pandas_df_async , updated = ())
168
212
@_wrap_in_memory_sx_cache
169
213
async def get_data_pandas_df_async (self , selection_query : str ):
@@ -185,18 +229,18 @@ async def get_data_rootfiles_url_stream(self, selection_query: str) \
185
229
Args:
186
230
selection_query (str): The ServiceX Selection
187
231
'''
188
- async for f_info in self ._get_minio_buckets (selection_query , 'root-files' ):
232
+ async for f_info in self ._stream_url_buckets (selection_query , 'root-files' ):
189
233
yield f_info
190
234
191
- async def get_data_parquet_minio_stream (self , selection_query : str ) \
235
+ async def get_data_parquet_url_stream (self , selection_query : str ) \
192
236
-> AsyncIterator [StreamInfoUrl ]:
193
237
'''Returns, as an async iterator, each of the files from the minio bucket,
194
238
as the files are added there.
195
239
196
240
Args:
197
241
selection_query (str): The ServiceX Selection
198
242
'''
199
- async for f_info in self ._get_minio_buckets (selection_query , 'parquet' ):
243
+ async for f_info in self ._stream_url_buckets (selection_query , 'parquet' ):
200
244
yield f_info
201
245
202
246
async def _file_return (self , selection_query : str , data_format : str ):
@@ -224,7 +268,7 @@ async def convert_to_file(f: Path) -> Path:
224
268
return await self ._data_return (selection_query , convert_to_file , data_format )
225
269
226
270
@on_exception (backoff .constant , ServiceXUnknownRequestID , interval = 0.1 , max_tries = 3 )
227
- async def _get_minio_buckets (self , selection_query : str , data_format : str ) \
271
+ async def _stream_url_buckets (self , selection_query : str , data_format : str ) \
228
272
-> AsyncIterator [StreamInfoUrl ]:
229
273
'''Get a list of files back for a request
230
274
@@ -280,12 +324,10 @@ async def _get_minio_buckets(self, selection_query: str, data_format: str) \
280
324
await self ._servicex_adaptor .dump_query_errors (client , request_id )
281
325
raise ServiceXException (f'Failed to transform all files in { request_id } ' ) from e
282
326
283
- @on_exception (backoff .constant , ServiceXUnknownRequestID , interval = 0.1 , max_tries = 3 )
284
327
async def _data_return (self , selection_query : str ,
285
328
converter : Callable [[Path ], Awaitable [Any ]],
286
- data_format : str = 'root-file' ):
287
- '''
288
- Given a query, return the data, in a unique order, that hold
329
+ data_format : str = 'root-file' ) -> List [Any ]:
330
+ '''Given a query, return the data, in a unique order, that hold
289
331
the data for the query.
290
332
291
333
For certian types of exceptions, the queries will be repeated. For example,
@@ -303,25 +345,51 @@ async def _data_return(self, selection_query: str,
303
345
data Data converted to the "proper" format, depending
304
346
on the converter call.
305
347
'''
306
- # Get a notifier to update anyone who wants to listen.
307
- notifier = self ._create_notifier ( )
348
+ as_data = (( f . file , asyncio . ensure_future ( converter ( f . path )))
349
+ async for f in self ._stream_local_files ( selection_query , data_format ) )
308
350
309
- # Get all the files
310
- as_files = \
311
- (f async for f in
312
- self ._get_files (selection_query , data_format , notifier ))
351
+ all_data = {d [0 ]: await d [1 ] async for d in as_data }
313
352
314
353
# Convert them to the proper format
315
- as_data = ((f [0 ], asyncio .ensure_future (converter (await f [1 ])))
316
- async for f in as_files )
317
354
318
355
# Finally, we need them in the proper order so we append them
319
356
# all together
320
- all_data = {f [0 ]: await f [1 ] async for f in as_data }
321
357
ordered_data = [all_data [k ] for k in sorted (all_data .keys ())]
322
358
323
359
return ordered_data
324
360
361
+ @on_exception (backoff .constant , ServiceXUnknownRequestID , interval = 0.1 , max_tries = 3 )
362
+ async def _stream_local_files (self , selection_query : str ,
363
+ data_format : str = 'root-file' ):
364
+ '''
365
+ Given a query, return the data as a list of paths pointing to local files
366
+ that contain the results of the query. This is an async generator, and files
367
+ are returned as they arrive.
368
+
369
+ For certian types of exceptions, the queries will be repeated. For example,
370
+ if `ServiceX` indicates that it was restarted in the middle of the query, then
371
+ the query will be re-submitted.
372
+
373
+ Arguments:
374
+
375
+ selection_query `qastle` data that makes up the selection request.
376
+
377
+ Returns:
378
+
379
+ data Data converted to the "proper" format, depending
380
+ on the converter call.
381
+ '''
382
+ # Get a notifier to update anyone who wants to listen.
383
+ notifier = self ._create_notifier ()
384
+
385
+ # Get all the files
386
+ as_files = \
387
+ (f async for f in
388
+ self ._get_files (selection_query , data_format , notifier ))
389
+
390
+ async for name , a_path in as_files :
391
+ yield StreamInfoPath (Path (await a_path ), name )
392
+
325
393
async def _get_files (self , selection_query : str , data_type : str ,
326
394
notifier : _status_update_wrapper ) \
327
395
-> AsyncIterator [Tuple [str , Awaitable [Path ]]]:
0 commit comments