nasa · joeyschultz · Oct 15, 2025 · Oct 13, 2025 · Oct 14, 2025 · Oct 14, 2025
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
@@ -13,5 +13,6 @@ A short description of the changes in this PR.
 * [ ] Jira ticket acceptance criteria met.
 * [ ] `CHANGELOG.md` updated to include high level summary of PR changes.
 * [ ] `docker/service_version.txt` updated if publishing a release.
+* [ ] Fix version [harmony-metadata-annotator-X.Y.Z] added to Jira ticket if publishing a release.
 * [ ] Tests added/updated and passing.
 * [ ] Documentation updated (if needed).
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v1.5.0] - 2025-10-13
+
+### Changed
+
+- Adds ability to exclude variables from the output using earthdata-varinfo configuration.
+- Adds configuration entries to exclude SMAP L3 string variables.
+- Changes xarray engine from default netcdf4 to h5netcdf.
+
 ## [v1.4.0] - 2025-09-30
 
 ### Changed

diff --git a/docker/service_version.txt b/docker/service_version.txt
@@ -1 +1 @@
-1.4.0
+1.5.0
diff --git a/metadata_annotator/annotate.py b/metadata_annotator/annotate.py
@@ -46,8 +46,12 @@ def annotate_granule(
         config_file=varinfo_config_file,
     )
 
-    if len(granule_varinfo.cf_config.metadata_overrides):
-        # There are metadata overrides applicable to the granule's collection:
+    if (
+        len(granule_varinfo.cf_config.metadata_overrides)
+        or granule_varinfo.cf_config.excluded_science_variables
+    ):
+        # There are metadata overrides or excluded variables
+        # applicable to the granule's collection:
         amend_in_file_metadata(input_file_name, output_file_name, granule_varinfo)
     else:
         # There are no updates required, so copy the input file as-is:
@@ -60,10 +64,10 @@ def amend_in_file_metadata(
     """Update metadata attributes according to known rules.
 
     First, identify the variables or groups needing to be updated, or variables
-    that need to be created. Next create any missing, attribute only, variables.
-    Update the metadata attributes of all variables listed in overrides, or
-    removing any attributes with an overriding value of None. Lastly, update
-    the `history` global attribute.
+    that need to be created. Then, delete any variables that are configured to be
+    excluded. Next create any missing, attribute only, variables. Update the metadata
+    attributes of all variables listed in overrides, or removing any attributes with an
+    overriding value of None. Lastly, update the `history` global attribute.
 
     When opening the file as a DataTree, attempts to decode times, coordinates
     and other CF-Convention metadata are disabled, to allow updates to be made
@@ -74,7 +78,7 @@ def amend_in_file_metadata(
     items_to_update, variables_to_create = get_matching_groups_and_variables(
         granule_varinfo,
     )
-
+    variables_to_delete = get_variables_to_delete(granule_varinfo)
     with xr.open_datatree(
         input_file_name,
         decode_times=False,
@@ -83,7 +87,15 @@ def amend_in_file_metadata(
         concat_characters=True,
         use_cftime=False,
         mask_and_scale=False,
+        engine='h5netcdf',
     ) as datatree:
+        # Delete the excluded variables from the datatree and remove them from
+        # the set of items to update
+        for variable in variables_to_delete:
+            if variable in items_to_update:
+                items_to_update.remove(variable)
+            delete_variable(datatree, variable)
+
         # Update all pre-existing variables or groups with metadata overrides including
         # dimension renaming where applicable.
         update_group_and_variable_attributes(datatree, items_to_update, granule_varinfo)
@@ -117,7 +129,7 @@ def amend_in_file_metadata(
         # whole `xarray.DataTree` in one operation. Making this write variables
         # and group separately reduces the memory usage, but makes the
         # operation slower. (See Harmony SMAP L2 Gridder implementation)
-        datatree.to_netcdf(output_file_name)
+        datatree.to_netcdf(output_file_name, engine='h5netcdf')
 
 
 def get_matching_groups_and_variables(
@@ -510,3 +522,26 @@ def get_referenced_variables(
         )
 
     return referenced_variables
+
+
+def get_variables_to_delete(
+    var_info: VarInfoFromNetCDF4,
+) -> list[str]:
+    """Returns a list of variables to delete identified by VarInfo configuration."""
+    var_list = var_info.get_all_variables()
+    return [var for var in var_list if is_excluded_science_variable(var_info, var)]
+
+
+def is_excluded_science_variable(var_info: VarInfoFromNetCDF4, var) -> bool:
+    """Returns True if variable is explicitly excluded by VarInfo configuration."""
+    exclusions_pattern = re.compile(
+        '|'.join(var_info.cf_config.excluded_science_variables)
+    )
+    return var_info.variable_is_excluded(var, exclusions_pattern)
+
+
+def delete_variable(datatree, full_variable_path: str) -> None:
+    """Delete a variable from the DataTree."""
+    parent_group, variable_name = full_variable_path.rsplit('/', 1)
+    node = datatree[parent_group] if parent_group else datatree
+    del node[variable_name]
diff --git a/metadata_annotator/earthdata_varinfo_config.json b/metadata_annotator/earthdata_varinfo_config.json
@@ -16,6 +16,26 @@
   "Mission": {
     "SPL[1234].+": "SMAP"
   },
+  "ExcludedScienceVariables": [
+    {
+      "Applicability": {
+        "Mission": "SMAP"
+      },
+      "VariablePattern": [
+	      "/.*time_utc.*"
+      ]
+    },
+    {
+      "Applicability": {
+        "Mission": "SMAP",
+        "ShortNamePath": "SPL3FTA"
+      },
+      "VariablePattern": [
+	      "/Freeze_Thaw_Retrieval_Data/freeze_reference_date",
+	      "/Freeze_Thaw_Retrieval_Data/thaw_reference_date"
+      ]
+    }
+  ],
   "MetadataOverrides": [
     {
       "Applicability": {
@@ -1915,7 +1935,7 @@
       ],
       "_Description": "SMAP L3 data are HDF5 and without dimension settings. Overrides here define the dimensions, a useful reference name, and critically, the dimension order."
     },
-        {
+    {
       "Applicability": {
         "Mission": "SMAP",
         "ShortNamePath": "SPL2SMAP_S",

diff --git a/requirements.txt b/requirements.txt
@@ -4,3 +4,4 @@ earthdata-varinfo ~= 3.0.2
 harmony-service-lib ~= 2.5.0
 netCDF4 ~= 1.6.5
 xarray == 2025.9.0
+h5netcdf ~= 1.6.4
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -394,3 +394,79 @@ def sample_varinfo_test04(
     return VarInfoFromNetCDF4(
         sample_netcdf4_file, config_file=varinfo_config_file, short_name='TEST04'
     )
+
+
+@fixture(scope='function')
+def sample_netcdf4_file_test05(temp_dir) -> str:
+    """Create a sample NetCDF-4 file for testing excluding variables."""
+    file_name = path_join(temp_dir, 'test_input_05.nc')
+
+    sample_datatree = xr.DataTree(
+        xr.Dataset(
+            data_vars={
+                'string_time_utc_seconds': xr.DataArray(['time1', 'time2', 'time3']),
+                'string_time_seconds': xr.DataArray(np.ones((3, 3))),
+            },
+        )
+    )
+
+    sample_datatree['/sub_group'] = xr.Dataset(
+        data_vars={
+            'string_time_utc_seconds': xr.DataArray(['time1', 'time2', 'time3']),
+            'string_time_seconds': xr.DataArray(np.ones((3, 3))),
+        },
+    )
+
+    sample_datatree['/sub_group/nested_group'] = xr.Dataset(
+        data_vars={
+            'string_time_utc_seconds': xr.DataArray(['time1', 'time2', 'time3']),
+            'string_time_seconds': xr.DataArray(np.ones((3, 3))),
+        },
+    )
+
+    sample_datatree.to_netcdf(file_name, encoding=None)
+    return file_name
+
+
+@fixture(scope='function')
+def expected_output_netcdf4_file_test05(temp_dir) -> str:
+    """Create a sample NetCDF-4 file for testing excluding variables.
+
+    The generated file omits the 'string_time_utc_seconds' variable from each group.
+    This ensures that the metadata annotator correctly excludes these variables
+    from its output during testing.
+    """
+    file_name = path_join(temp_dir, 'test_input_05.nc')
+
+    sample_datatree = xr.DataTree(
+        xr.Dataset(
+            data_vars={
+                'string_time_seconds': xr.DataArray(np.ones((3, 3))),
+            },
+        )
+    )
+
+    sample_datatree['/sub_group'] = xr.Dataset(
+        data_vars={
+            'string_time_seconds': xr.DataArray(np.ones((3, 3))),
+        },
+    )
+
+    sample_datatree['/sub_group/nested_group'] = xr.Dataset(
+        data_vars={
+            'string_time_seconds': xr.DataArray(np.ones((3, 3))),
+        },
+    )
+
+    sample_datatree.to_netcdf(file_name, encoding=None)
+    return file_name
+
+
+@fixture(scope='function')
+def sample_varinfo_test05(
+    sample_netcdf4_file_test05, varinfo_config_file
+) -> VarInfoFromNetCDF4:
+    """Create sample VarInfoFromNetCDF4 instance."""
+    return VarInfoFromNetCDF4(
+        sample_netcdf4_file_test05, config_file=varinfo_config_file, short_name='TEST05'
+    )
diff --git a/tests/data/earthdata_varinfo_test_config.json b/tests/data/earthdata_varinfo_test_config.json
@@ -15,6 +15,17 @@
   "Mission": {
     "TEST\\d{2}": "TEST_MISSION"
   },
+    "ExcludedScienceVariables": [
+    {
+      "Applicability": {
+        "Mission": "TEST_MISSION",
+        "ShortNamePath": "TEST05"
+      },
+      "VariablePattern": [
+        "/.*time_utc.*"
+      ]
+    }
+  ],
   "MetadataOverrides": [
     {
       "Applicability": {

diff --git a/tests/unit/test_annotate.py b/tests/unit/test_annotate.py
@@ -11,6 +11,7 @@
 from metadata_annotator.annotate import (
     annotate_granule,
     create_new_variable,
+    delete_variable,
     get_dimension_variables,
     get_geotransform_config,
     get_grid_start_index,
@@ -19,7 +20,9 @@
     get_spatial_dimension_type,
     get_spatial_dimension_variables,
     get_start_index_from_row_col_variable,
+    get_variables_to_delete,
     is_exact_path,
+    is_excluded_science_variable,
     is_temporary_attribute,
     update_dimension_names,
     update_dimension_variable_attributes,
@@ -250,6 +253,31 @@ def test_annotate_granule_no_changes(
         assert results_datatree.identical(expected_datatree)
 
 
+def test_annotate_granule_variable_exclusions_only(
+    sample_netcdf4_file_test05,
+    expected_output_netcdf4_file_test05,
+    temp_output_file_path,
+    varinfo_config_file,
+    mocker,
+):
+    """Confirm that a granule with only variable exclusion configuration is updated."""
+    _ = mocker.patch('metadata_annotator.annotate.update_history_metadata')
+    annotate_granule(
+        sample_netcdf4_file_test05,
+        temp_output_file_path,
+        varinfo_config_file,
+        'TEST05',
+    )
+
+    with (
+        xr.open_datatree(
+            expected_output_netcdf4_file_test05, decode_times=False
+        ) as expected_datatree,
+        xr.open_datatree(temp_output_file_path, decode_times=False) as results_datatree,
+    ):
+        assert results_datatree.identical(expected_datatree)
+
+
 def test_annotate_granule_with_dimension_variable_updates(temp_output_file_path):
     """Confirm that a granule has all metadata updated as expected.
 
@@ -716,3 +744,65 @@ def test_update_dimension_variables(sample_netcdf4_file_test04, sample_varinfo_t
             assert np.allclose(
                 test_datatree['sub_group'].dataset['y'], expected_y_values
             )
+
+
+def test_get_variables_to_delete(sample_varinfo_test05):
+    """Ensure correct list of variables to delete is obtained."""
+    expected_result = set(
+        [
+            '/string_time_utc_seconds',
+            '/sub_group/string_time_utc_seconds',
+            '/sub_group/nested_group/string_time_utc_seconds',
+        ]
+    )
+    assert set(get_variables_to_delete(sample_varinfo_test05)) == expected_result
+
+
+def test_is_excluded_science_variable(sample_varinfo_test05):
+    """Ensure excluded science variables are determined correctly."""
+    assert is_excluded_science_variable(
+        sample_varinfo_test05, '/string_time_utc_seconds'
+    )
+    assert is_excluded_science_variable(
+        sample_varinfo_test05, '/sub_group/string_time_utc_seconds'
+    )
+    assert is_excluded_science_variable(
+        sample_varinfo_test05, '/sub_group/nested_group/string_time_utc_seconds'
+    )
+    assert not is_excluded_science_variable(
+        sample_varinfo_test05, '/string_time_seconds'
+    )
+    assert not is_excluded_science_variable(
+        sample_varinfo_test05, '/sub_group/string_time_seconds'
+    )
+    assert not is_excluded_science_variable(
+        sample_varinfo_test05, '/sub_group/nested/string_time_seconds'
+    )
+
+
+@pytest.mark.parametrize(
+    'group_name',
+    [
+        '/',
+        '/sub_group',
+        '/sub_group/nested_group',
+    ],
+)
+def test_delete_variable(sample_netcdf4_file_test05, group_name):
+    """Ensure correct variables are deleted from the datatree."""
+    with xr.open_datatree(
+        sample_netcdf4_file_test05, decode_times=False
+    ) as test_datatree:
+        variable_name = 'string_time_utc_seconds'
+        if group_name == '/':
+            full_path = f'/{variable_name}'
+        else:
+            full_path = f'{group_name}/{variable_name}'
+
+        # check that variable is in datatree before deletion
+        assert variable_name in test_datatree[group_name].ds.data_vars
+        initial_var_count = len(test_datatree[group_name].ds.data_vars)
+
+        delete_variable(test_datatree, full_path)
+        assert variable_name not in test_datatree[group_name].ds.data_vars
+        assert len(test_datatree[group_name].ds.data_vars) == initial_var_count - 1