22
33import argparse
44import datetime
5+ import logging
56import os
67import shutil
78import sys
8- from collections .abc import Iterable
99
1010import rich
1111import rich .table
1616from cumulus_etl .etl .config import JobConfig , JobSummary
1717from cumulus_etl .etl .tasks import task_factory
1818
19+ TaskList = list [type [tasks .EtlTask ]]
20+
21+
1922###############################################################################
2023#
2124# Main Pipeline (run all tasks)
2427
2528
2629async def etl_job (
27- config : JobConfig , selected_tasks : list [ type [ tasks . EtlTask ]] , use_philter : bool = False
30+ config : JobConfig , selected_tasks : TaskList , use_philter : bool = False
2831) -> list [JobSummary ]:
2932 """
3033 :param config: job config
@@ -68,7 +71,7 @@ def check_mstool() -> None:
6871 raise SystemExit (errors .MSTOOL_MISSING )
6972
7073
71- async def check_requirements (selected_tasks : Iterable [ type [ tasks . EtlTask ]] ) -> None :
74+ async def check_requirements (selected_tasks : TaskList ) -> None :
7275 """
7376 Verifies that all external services and programs are ready
7477
@@ -118,6 +121,11 @@ def define_etl_parser(parser: argparse.ArgumentParser) -> None:
118121 parser .add_argument (
119122 "--errors-to" , metavar = "DIR" , help = "where to put resources that could not be processed"
120123 )
124+ parser .add_argument (
125+ "--allow-missing-resources" ,
126+ action = "store_true" ,
127+ help = "run tasks even if their resources are not present" ,
128+ )
121129
122130 cli_utils .add_aws (parser )
123131 cli_utils .add_auth (parser )
@@ -143,7 +151,7 @@ def define_etl_parser(parser: argparse.ArgumentParser) -> None:
143151
144152
145153def print_config (
146- args : argparse .Namespace , job_datetime : datetime .datetime , all_tasks : Iterable [ tasks . EtlTask ]
154+ args : argparse .Namespace , job_datetime : datetime .datetime , all_tasks : TaskList
147155) -> None :
148156 """
149157 Prints the ETL configuration to the console.
@@ -214,6 +222,49 @@ def handle_completion_args(
214222 return export_group_name , export_datetime
215223
216224
async def check_available_resources(
    loader: loaders.Loader,
    *,
    requested_resources: set[str],
    args: argparse.Namespace,
    is_default_tasks: bool,
) -> set[str]:
    """
    Reconciles the resources the user requested with those present in the input root.

    - If the user didn't specify a specific task, the requested resources are scoped down to
      what is actually present in the input.
    - If they did, a fatal error is reported when their required resources are not available.

    Reconciling is helpful for performance reasons (don't need to finalize untouched tables),
    UX reasons (can tell user if they made a CLI mistake), and completion tracking (don't
    mark a resource as complete if we didn't even export it).

    The `requested_resources` set passed in is never modified; when scoping down is needed,
    a new set is returned instead.

    :param loader: loader used to detect which resources exist in the input
    :param requested_resources: resource names needed by the selected tasks
    :param args: parsed CLI arguments (only `allow_missing_resources` is read here)
    :param is_default_tasks: True if the user did not explicitly choose tasks
    :returns: the set of resources that should actually be processed
    """
    if args.allow_missing_resources:
        return requested_resources

    detected = await loader.detect_resources()
    if detected is None:
        return requested_resources  # likely we haven't run bulk export yet

    missing_resources = requested_resources - detected
    if not missing_resources:
        return requested_resources

    for resource in sorted(missing_resources):
        # Log the same message we would print in common.py if we ran tasks anyway
        logging.warning("No %s files found in %s", resource, loader.root.path)

    if is_default_tasks:
        # Scope down to detected resources. Build a new set (rather than using `-=`) so that
        # the caller's set is not silently changed underneath them.
        requested_resources = requested_resources - missing_resources
        if not requested_resources:
            errors.fatal(
                "No supported resources found.",
                errors.MISSING_REQUESTED_RESOURCES,
            )
    else:
        msg = "Required resources not found.\n "
        msg += "Add --allow-missing-resources to run related tasks anyway with no input."
        errors.fatal(msg, errors.MISSING_REQUESTED_RESOURCES)

    return requested_resources
266+
267+
217268async def etl_main (args : argparse .Namespace ) -> None :
218269 # Set up some common variables
219270
@@ -227,6 +278,7 @@ async def etl_main(args: argparse.Namespace) -> None:
227278 job_datetime = common .datetime_now () # grab timestamp before we do anything
228279
229280 selected_tasks = task_factory .get_selected_tasks (args .task , args .task_filter )
281+ is_default_tasks = not args .task and not args .task_filter
230282
231283 # Print configuration
232284 print_config (args , job_datetime , selected_tasks )
@@ -261,8 +313,17 @@ async def etl_main(args: argparse.Namespace) -> None:
261313 resume = args .resume ,
262314 )
263315
316+ required_resources = await check_available_resources (
317+ config_loader ,
318+ args = args ,
319+ is_default_tasks = is_default_tasks ,
320+ requested_resources = required_resources ,
321+ )
322+ # Drop any tasks that we didn't find resources for
323+ selected_tasks = [t for t in selected_tasks if t .resource in required_resources ]
324+
264325 # Pull down resources from any remote location (like s3), convert from i2b2, or do a bulk export
265- loader_results = await config_loader .load_all ( list ( required_resources ) )
326+ loader_results = await config_loader .load_resources ( required_resources )
266327
267328 # Establish the group name and datetime of the loaded dataset (from CLI args or Loader)
268329 export_group_name , export_datetime = handle_completion_args (args , loader_results )
0 commit comments