13 changes: 12 additions & 1 deletion docs/CHANGELOG.md
@@ -1,6 +1,17 @@
All notable changes to this project will be documented in this file.
We follow the [Semantic Versioning 2.0.0](http://semver.org/) format.

## v4.5.6.1 - 2024-09-13 - [PR#1271](https://github.com/NOAA-OWP/inundation-mapping/pull/1271)

Upgrades `test_case_by_hydro_id.py` to run on HUCs with differing projections (e.g. Alaska) and adds a logging system.

### Changes

- `tools/test_case_by_hydro_id.py`: Moved the reprojection step to accommodate multiple input projections and cleaned up unnecessary logic. Also added an optional logging system, activated by the new `-l` flag (see the sketch below).
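
  A minimal sketch of the reprojection pattern, with hypothetical names (an illustration of the approach, not the PR's exact code):

  ```python
  import geopandas as gpd
  import rasterio


  def load_catchments_in_raster_crs(catchment_gpkg, agreement_raster):
      # Reproject the catchments to the agreement raster's CRS before zonal
      # stats so HUCs in other projections (e.g. Alaska) line up correctly.
      catchments = gpd.read_file(catchment_gpkg)
      with rasterio.open(agreement_raster) as src:
          raster_crs = src.crs
      return catchments.to_crs(raster_crs)
  ```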

<br/><br/>


## v4.5.6.0 - 2024-08-23 - [PR#1253](https://github.com/NOAA-OWP/inundation-mapping/pull/1253)

Upgrades Python packages and dependencies and fixes backwards incompatibilities with new version of `geopandas`. Major changes include:
@@ -38,7 +49,7 @@ Updated the gauge crosswalk and SRC adjustment routine to use the ras2fim v2 fil
- `src/src_adjust_ras2fim_rating.py`: Updated code logic to use the HUC-specific input files containing the ras2fim rating curve data (the previous ras2fim input file contained all HUCs in one CSV)
- `src/utils/shared_functions.py`: Added a function to find HUC subdirectories with the same name between two parent folders (see the sketch below)
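
  A hedged sketch of that helper's idea (names hypothetical, not the actual `shared_functions.py` code):

  ```python
  import os


  def common_huc_subdirs(parent_a, parent_b):
      # HUC subdirectories that exist, by name, under both parent folders
      subdirs_a = {d for d in os.listdir(parent_a) if os.path.isdir(os.path.join(parent_a, d))}
      subdirs_b = {d for d in os.listdir(parent_b) if os.path.isdir(os.path.join(parent_b, d))}
      return sorted(subdirs_a & subdirs_b)
  ```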

<br/><br/>
<br/><br/>

## v4.5.4.4 - 2024-08-02 - [PR#1238](https://github.com/NOAA-OWP/inundation-mapping/pull/1238)

219 changes: 139 additions & 80 deletions tools/test_case_by_hydro_id.py
@@ -2,6 +2,7 @@

import argparse
import os
import traceback
from datetime import datetime

import geopandas as gpd
@@ -10,10 +11,25 @@
from run_test_case import Test_Case
from shapely.validation import make_valid
from tools_shared_functions import compute_stats_from_contingency_table
from tqdm import tqdm


gpd.options.io_engine = "pyogrio"

"""
This module uses zonal stats to subdivide alpha metrics by each HAND catchment.
The output is a vector geopackage and is also known as the "FIM Performance" layer
when loaded into HydroVIS. At the time of this commit, it takes approximately
32 hours to complete.

Example usage:
python /foss_fim/tools/test_case_by_hydro_id.py \
-b all \
-v fim_4_5_2_11 \
-g /outputs/fim_performance_v4_5_2_11.gpkg \
-l
"""


#####################################################
# perform_zonal_stats is a function stored in pixel_counter.py.
@@ -180,39 +196,7 @@ def assemble_hydro_alpha_for_single_huc(stats, huc8, mag, bench):
return in_mem_df


if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Produces alpha metrics by HydroID.')

parser.add_argument(
'-b',
'--benchmark_category',
help='Choice of truth data. Options are: all, ble, ifc, nws, usgs, ras2fim',
required=True,
)
parser.add_argument(
'-v', '--version', help='The fim version to use. Should be similar to fim_3_0_24_14_ms', required=True
)
parser.add_argument(
'-g',
'--gpkg',
help='filepath and filename to hold exported gpkg (and csv) file. '
'Similar to /data/path/fim_performance_catchments.gpkg Need to use gpkg as output.',
required=True,
)

# Assign variables from arguments.
args = vars(parser.parse_args())
benchmark_category = args['benchmark_category']
version = args['version']
csv = args['gpkg']

print("================================")
print("Start test_case_by_hydroid.py")
start_time = datetime.now()
dt_string = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
print(f"started: {dt_string}")
print()

def catchment_zonal_stats(benchmark_category, version, csv, log):
# Execution code
csv_output = gpd.GeoDataFrame(
columns=[
@@ -237,92 +221,167 @@
'geometry',
],
geometry='geometry',
)
).set_crs('EPSG:3857')
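# Assigning EPSG:3857 (Web Mercator) up front keeps the empty frame consistent with the reprojected per-HUC outputs concatenated into it below.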

# This function relies on the Test_Case class defined in run_test_case.py to list all available test cases
print('listing_test_cases_with_updates')
all_test_cases = Test_Case.list_all_test_cases(
version=version,
archive=True,
benchmark_categories=[] if benchmark_category == "all" else [benchmark_category],
)
print(f'Found {len(all_test_cases)} test cases')
if log:
log.write(f'Found {len(all_test_cases)} test cases...\n')
missing_hucs = []

for test_case_class in all_test_cases:
for test_case_class in tqdm(all_test_cases, desc=f'Running {len(all_test_cases)} test cases'):
if not os.path.exists(test_case_class.fim_dir):
print(f'{test_case_class.fim_dir} does not exist')
missing_hucs.append(test_case_class)
if log:
log.write(f'{test_case_class.fim_dir} does not exist\n')
continue

print(test_case_class.fim_dir, end='\r')
if log:
log.write(test_case_class.test_id + '\n')

agreement_dict = test_case_class.get_current_agreements()

for agree_rast in agreement_dict:
print(f'performing_zonal_stats for {agree_rast}')

branches_dir = os.path.join(test_case_class.fim_dir, 'branches')
for branches in os.listdir(branches_dir):
if branches != "0":
continue
huc_gpkg = os.path.join(branches_dir, branches)
# We are only using branch 0 catchments to define boundaries for zonal stats
catchment_gpkg = os.path.join(
test_case_class.fim_dir,
'branches',
"gw_catchments_reaches_filtered_addedAttributes_crosswalked_0.gpkg",
)

string_manip = (
"gw_catchments_reaches_filtered_addedAttributes_crosswalked_" + branches + ".gpkg"
)
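# The path segment immediately after the version directory names the benchmark magnitude (e.g. '100yr' or 'action'), parsed from the agreement raster path below.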
define_mag = agree_rast.split(version)
define_mag_1 = define_mag[1].split('/')
mag = define_mag_1[1]

huc_gpkg = os.path.join(huc_gpkg, string_manip)
if log:
log.write(f' {define_mag[1]}\n')

define_mag = agree_rast.split(version)
stats = perform_zonal_stats(catchment_gpkg, agree_rast)
if stats == []:
continue

define_mag_1 = define_mag[1].split('/')
get_geom = gpd.read_file(catchment_gpkg)

mag = define_mag_1[1]
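# make_valid repairs invalid catchment geometries (e.g. self-intersections) so the merge and file writes below do not fail.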
get_geom['geometry'] = get_geom.apply(lambda row: make_valid(row.geometry), axis=1)

stats = perform_zonal_stats(huc_gpkg, agree_rast)
if stats == []:
continue
in_mem_df = assemble_hydro_alpha_for_single_huc(
stats, test_case_class.huc, mag, test_case_class.benchmark_cat
)

print('assembling_hydroalpha_for_single_huc')
get_geom = gpd.read_file(huc_gpkg)
hydro_geom_df = get_geom[["HydroID", "geometry"]]

get_geom['geometry'] = get_geom.apply(lambda row: make_valid(row.geometry), axis=1)
geom_output = hydro_geom_df.merge(in_mem_df, on='HydroID', how='inner').to_crs('EPSG:3857')

in_mem_df = assemble_hydro_alpha_for_single_huc(
stats, test_case_class.huc, mag, test_case_class.benchmark_cat
)
concat_df_list = [geom_output, csv_output]

hydro_geom_df = get_geom[["HydroID", "geometry"]]
csv_output = pd.concat(concat_df_list, sort=False)

geom_output = hydro_geom_df.merge(in_mem_df, on='HydroID', how='inner')
if missing_hucs and log:
log.write(
f"There were {len(missing_hucs)} HUCs missing from the input FIM version:\n"
+ "\n".join([h.fim_dir for h in missing_hucs])
)

concat_df_list = [geom_output, csv_output]
print()
print(csv_output.groupby('BENCH').size())
print(f'total {len(csv_output)}')
log.write("\n------------------------------------\n")
csv_output.groupby('BENCH').size().to_string(log)
log.write(f'\ntotal {len(csv_output)}\n')

print('Writing to GPKG')
if log:
log.write(f'Writing geopackage {csv}\n')
csv_output.to_file(csv, driver="GPKG")

csv_output = pd.concat(concat_df_list, sort=False)
# Add version information to csv_output dataframe
csv_output['version'] = version

print('projecting to 3857')
csv_output = csv_output.to_crs('EPSG:3857')
print('Writing to CSV')
csv_path = csv.replace(".gpkg", ".csv")
if log:
log.write(f'Writing CSV {csv_path}\n')
csv_output.to_csv(csv_path) # Save to CSV

print('manipulating the input string to exclude gpkg and include csv')
csv_path_list = csv.split(".")
csv_path = csv_path_list[0]
csv_path_dot = csv_path + ".csv"

print('writing_to_gpkg')
csv_output.to_file(csv, driver="GPKG")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Produces alpha metrics by HydroID.')

# Add version information to csv_output dataframe
csv_output['version'] = version
parser.add_argument(
'-b',
'--benchmark_category',
help='Choice of truth data. Options are: all, ble, ifc, nws, usgs, ras2fim',
required=True,
)
parser.add_argument(
'-v', '--version', help='The fim version to use. Should be similar to fim_3_0_24_14_ms', required=True
)
parser.add_argument(
'-g',
'--gpkg',
help='Filepath and filename to hold exported gpkg file. '
'Similar to /data/path/fim_performance_catchments.gpkg. A CSV with the same name will also be written.',
required=True,
)
parser.add_argument(
'-l',
'--log',
help='Optional flag to write a log file with the same name as the --gpkg output.',
required=False,
default=None,
action='store_true',
)

print('writing_to_csv')
csv_output.to_csv(csv_path_dot) # Save to CSV
# Assign variables from arguments.
args = vars(parser.parse_args())
benchmark_category = args['benchmark_category']
version = args['version']
csv = args['gpkg']
log = args['log']
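# At this point 'log' is just a store_true flag; when set, it is rebound below to an open file handle that the rest of the script writes to.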

print("================================")
print("End test_case_by_hydroid.py")
print("Start test_case_by_hydroid.py")
start_time = datetime.now()
dt_string = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
print(f"started: {dt_string}")
print()

## Initiate log file
if log:
log = open(csv.replace('.gpkg', '.log'), "w")
log.write('START TIME: ' + str(start_time) + '\n')
log.write('#########################################################\n\n')
log.write('')
log.write(f'Runtime args:\n {args}\n\n')

# This is the main execution -- try block is to catch and log errors
try:
catchment_zonal_stats(benchmark_category, version, csv, log)
except Exception as ex:
print(f"ERROR: Execution failed. Please check the log file for details. \n {log.name if log else ''}")
if log:
log.write(f"ERROR -->\n{ex}")
traceback.print_exc(file=log)
log.write(f'Errored at: {str(datetime.now().strftime("%m/%d/%Y %H:%M:%S"))} \n')

end_time = datetime.now()
dt_string = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
dt_string = end_time.strftime("%m/%d/%Y %H:%M:%S")
tot_run_time = end_time - start_time
if log:
log.write(f'END TIME: {str(end_time)} \n')
log.write(f'TOTAL RUN TIME: {str(tot_run_time)} \n')
log.close()

print("================================")
print("End test_case_by_hydroid.py")

print(f"ended: {dt_string}")

# calculate duration
time_duration = end_time - start_time
print(f"Duration: {str(time_duration).split('.')[0]}")
print(f"Duration: {str(tot_run_time).split('.')[0]}")
print()