Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/pull_request_template.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,6 @@ A short description of the changes in this PR.
* [ ] Jira ticket acceptance criteria met.
* [ ] `CHANGELOG.md` updated to include high level summary of PR changes.
* [ ] `docker/service_version.txt` updated if publishing a release.
* [ ] Fix version [harmony-metadata-annotator-X.Y.Z] added to Jira ticket if publishing a release.
* [ ] Tests added/updated and passing.
* [ ] Documentation updated (if needed).
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v1.5.0] - 2025-10-13

### Changed

- Adds ability to exclude variables from the output using earthdata-varinfo configuration.
- Adds configuration entries to exclude SMAP L3 string variables.
- Changes xarray engine from default netcdf4 to h5netcdf.

## [v1.4.0] - 2025-09-30

### Changed
Expand Down
2 changes: 1 addition & 1 deletion docker/service_version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.4.0
1.5.0
51 changes: 43 additions & 8 deletions metadata_annotator/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,12 @@ def annotate_granule(
config_file=varinfo_config_file,
)

if len(granule_varinfo.cf_config.metadata_overrides):
# There are metadata overrides applicable to the granule's collection:
if (
len(granule_varinfo.cf_config.metadata_overrides)
or granule_varinfo.cf_config.excluded_science_variables
):
# There are metadata overrides or excluded variables
# applicable to the granule's collection:
amend_in_file_metadata(input_file_name, output_file_name, granule_varinfo)
else:
# There are no updates required, so copy the input file as-is:
Expand All @@ -60,10 +64,10 @@ def amend_in_file_metadata(
"""Update metadata attributes according to known rules.

First, identify the variables or groups needing to be updated, or variables
that need to be created. Next create any missing, attribute only, variables.
Update the metadata attributes of all variables listed in overrides, or
removing any attributes with an overriding value of None. Lastly, update
the `history` global attribute.
that need to be created. Then, delete any variables that are configured to be
excluded. Next create any missing, attribute only, variables. Update the metadata
attributes of all variables listed in overrides, or removing any attributes with an
overriding value of None. Lastly, update the `history` global attribute.

When opening the file as a DataTree, attempts to decode times, coordinates
and other CF-Convention metadata are disabled, to allow updates to be made
Expand All @@ -74,7 +78,7 @@ def amend_in_file_metadata(
items_to_update, variables_to_create = get_matching_groups_and_variables(
granule_varinfo,
)

variables_to_delete = get_variables_to_delete(granule_varinfo)
with xr.open_datatree(
input_file_name,
decode_times=False,
Expand All @@ -83,7 +87,15 @@ def amend_in_file_metadata(
concat_characters=True,
use_cftime=False,
mask_and_scale=False,
engine='h5netcdf',
) as datatree:
# Delete the excluded variables from the datatree and remove them from
# the set of items to update
for variable in variables_to_delete:
if variable in items_to_update:
items_to_update.remove(variable)
delete_variable(datatree, variable)

# Update all pre-existing variables or groups with metadata overrides including
# dimension renaming where applicable.
update_group_and_variable_attributes(datatree, items_to_update, granule_varinfo)
Expand Down Expand Up @@ -117,7 +129,7 @@ def amend_in_file_metadata(
# whole `xarray.DataTree` in one operation. Making this write variables
# and group separately reduces the memory usage, but makes the
# operation slower. (See Harmony SMAP L2 Gridder implementation)
datatree.to_netcdf(output_file_name)
datatree.to_netcdf(output_file_name, engine='h5netcdf')


def get_matching_groups_and_variables(
Expand Down Expand Up @@ -510,3 +522,26 @@ def get_referenced_variables(
)

return referenced_variables


def get_variables_to_delete(
var_info: VarInfoFromNetCDF4,
) -> list[str]:
"""Returns a list of variables to delete identified by VarInfo configuration."""
var_list = var_info.get_all_variables()
return [var for var in var_list if is_excluded_science_variable(var_info, var)]


def is_excluded_science_variable(var_info: VarInfoFromNetCDF4, var) -> bool:
"""Returns True if variable is explicitly excluded by VarInfo configuration."""
exclusions_pattern = re.compile(
'|'.join(var_info.cf_config.excluded_science_variables)
)
return var_info.variable_is_excluded(var, exclusions_pattern)
Comment on lines +535 to +540
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this is probably the time to move this to earthdata-varinfo. Because I'll bet Josie has a similar function in her PR.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I certainly do.



def delete_variable(datatree, full_variable_path: str) -> None:
"""Delete a variable from the DataTree."""
parent_group, variable_name = full_variable_path.rsplit('/', 1)
node = datatree[parent_group] if parent_group else datatree
del node[variable_name]
22 changes: 21 additions & 1 deletion metadata_annotator/earthdata_varinfo_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,26 @@
"Mission": {
"SPL[1234].+": "SMAP"
},
"ExcludedScienceVariables": [
{
"Applicability": {
"Mission": "SMAP"
},
"VariablePattern": [
"/.*time_utc.*"
]
},
{
"Applicability": {
"Mission": "SMAP",
"ShortNamePath": "SPL3FTA"
},
"VariablePattern": [
"/Freeze_Thaw_Retrieval_Data/freeze_reference_date",
"/Freeze_Thaw_Retrieval_Data/thaw_reference_date"
]
}
],
"MetadataOverrides": [
{
"Applicability": {
Expand Down Expand Up @@ -1915,7 +1935,7 @@
],
"_Description": "SMAP L3 data are HDF5 and without dimension settings. Overrides here define the dimensions, a useful reference name, and critically, the dimension order."
},
{
{
"Applicability": {
"Mission": "SMAP",
"ShortNamePath": "SPL2SMAP_S",
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ earthdata-varinfo ~= 3.0.2
harmony-service-lib ~= 2.5.0
netCDF4 ~= 1.6.5
xarray == 2025.9.0
h5netcdf ~= 1.6.4
76 changes: 76 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,3 +394,79 @@ def sample_varinfo_test04(
return VarInfoFromNetCDF4(
sample_netcdf4_file, config_file=varinfo_config_file, short_name='TEST04'
)


@fixture(scope='function')
def sample_netcdf4_file_test05(temp_dir) -> str:
"""Create a sample NetCDF-4 file for testing excluding variables."""
file_name = path_join(temp_dir, 'test_input_05.nc')

sample_datatree = xr.DataTree(
xr.Dataset(
data_vars={
'string_time_utc_seconds': xr.DataArray(['time1', 'time2', 'time3']),
'string_time_seconds': xr.DataArray(np.ones((3, 3))),
},
)
)

sample_datatree['/sub_group'] = xr.Dataset(
data_vars={
'string_time_utc_seconds': xr.DataArray(['time1', 'time2', 'time3']),
'string_time_seconds': xr.DataArray(np.ones((3, 3))),
},
)

sample_datatree['/sub_group/nested_group'] = xr.Dataset(
data_vars={
'string_time_utc_seconds': xr.DataArray(['time1', 'time2', 'time3']),
'string_time_seconds': xr.DataArray(np.ones((3, 3))),
},
)

sample_datatree.to_netcdf(file_name, encoding=None)
return file_name


@fixture(scope='function')
def expected_output_netcdf4_file_test05(temp_dir) -> str:
"""Create a sample NetCDF-4 file for testing excluding variables.

The generated file omits the 'string_time_utc_seconds' variable from each group.
This ensures that the metadata annotator correctly excludes these variables
from its output during testing.
"""
file_name = path_join(temp_dir, 'test_input_05.nc')

sample_datatree = xr.DataTree(
xr.Dataset(
data_vars={
'string_time_seconds': xr.DataArray(np.ones((3, 3))),
},
)
)

sample_datatree['/sub_group'] = xr.Dataset(
data_vars={
'string_time_seconds': xr.DataArray(np.ones((3, 3))),
},
)

sample_datatree['/sub_group/nested_group'] = xr.Dataset(
data_vars={
'string_time_seconds': xr.DataArray(np.ones((3, 3))),
},
)

sample_datatree.to_netcdf(file_name, encoding=None)
return file_name


@fixture(scope='function')
def sample_varinfo_test05(
sample_netcdf4_file_test05, varinfo_config_file
) -> VarInfoFromNetCDF4:
"""Create sample VarInfoFromNetCDF4 instance."""
return VarInfoFromNetCDF4(
sample_netcdf4_file_test05, config_file=varinfo_config_file, short_name='TEST05'
)
11 changes: 11 additions & 0 deletions tests/data/earthdata_varinfo_test_config.json

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just commenting on this file as a whole - I never thought about creating a test config file like this, kinda cool! I suppose I wonder if it might be an issue that we're not directly testing the changes to earthdata_varinfo_config.json, but if this is an accepted practice I'll accept it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While I think both approaches are okay, the earthdata_varinfo_test_config.json file includes configuration items for things like deleting attributes that don't currently exist in the earthdata_varinfo_config.json. I think using the test config file is beneficial for our unit tests as it won't change as desired collection specific output annotations evolve (ex. unit test using a configuration item that is later removed).

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, okay got it! And that makes sense especially for the metadata annotator.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, do you think it'd be worthwhile to test the specific SPL3FTA configuration, or nah?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The precedent set in this repository has been to avoid tests for specific collection configuration rules. I think this is mainly because it would inflate the test suite (would require tests for every collection that is configured) and we'd also need to store granules for these tests in the repository.

We've been reliant on properly testing these collection specific configuration rules in the test instructions for the PRs that add them.

Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,17 @@
"Mission": {
"TEST\\d{2}": "TEST_MISSION"
},
"ExcludedScienceVariables": [
{
"Applicability": {
"Mission": "TEST_MISSION",
"ShortNamePath": "TEST05"
},
"VariablePattern": [
"/.*time_utc.*"
]
}
],
"MetadataOverrides": [
{
"Applicability": {
Expand Down
90 changes: 90 additions & 0 deletions tests/unit/test_annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from metadata_annotator.annotate import (
annotate_granule,
create_new_variable,
delete_variable,
get_dimension_variables,
get_geotransform_config,
get_grid_start_index,
Expand All @@ -19,7 +20,9 @@
get_spatial_dimension_type,
get_spatial_dimension_variables,
get_start_index_from_row_col_variable,
get_variables_to_delete,
is_exact_path,
is_excluded_science_variable,
is_temporary_attribute,
update_dimension_names,
update_dimension_variable_attributes,
Expand Down Expand Up @@ -250,6 +253,31 @@ def test_annotate_granule_no_changes(
assert results_datatree.identical(expected_datatree)


def test_annotate_granule_variable_exclusions_only(
sample_netcdf4_file_test05,
expected_output_netcdf4_file_test05,
temp_output_file_path,
varinfo_config_file,
mocker,
):
"""Confirm that a granule with only variable exclusion configuration is updated."""
_ = mocker.patch('metadata_annotator.annotate.update_history_metadata')
annotate_granule(
sample_netcdf4_file_test05,
temp_output_file_path,
varinfo_config_file,
'TEST05',
)

with (
xr.open_datatree(
expected_output_netcdf4_file_test05, decode_times=False
) as expected_datatree,
xr.open_datatree(temp_output_file_path, decode_times=False) as results_datatree,
):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The first time I really hate black's formatting. 😬

assert results_datatree.identical(expected_datatree)


def test_annotate_granule_with_dimension_variable_updates(temp_output_file_path):
"""Confirm that a granule has all metadata updated as expected.

Expand Down Expand Up @@ -716,3 +744,65 @@ def test_update_dimension_variables(sample_netcdf4_file_test04, sample_varinfo_t
assert np.allclose(
test_datatree['sub_group'].dataset['y'], expected_y_values
)


def test_get_variables_to_delete(sample_varinfo_test05):
"""Ensure correct list of variables to delete is obtained."""
expected_result = set(
[
'/string_time_utc_seconds',
'/sub_group/string_time_utc_seconds',
'/sub_group/nested_group/string_time_utc_seconds',
]
)
assert set(get_variables_to_delete(sample_varinfo_test05)) == expected_result


def test_is_excluded_science_variable(sample_varinfo_test05):
"""Ensure excluded science variables are determined correctly."""
assert is_excluded_science_variable(
sample_varinfo_test05, '/string_time_utc_seconds'
)
assert is_excluded_science_variable(
sample_varinfo_test05, '/sub_group/string_time_utc_seconds'
)
assert is_excluded_science_variable(
sample_varinfo_test05, '/sub_group/nested_group/string_time_utc_seconds'
)
assert not is_excluded_science_variable(
sample_varinfo_test05, '/string_time_seconds'
)
assert not is_excluded_science_variable(
sample_varinfo_test05, '/sub_group/string_time_seconds'
)
assert not is_excluded_science_variable(
sample_varinfo_test05, '/sub_group/nested/string_time_seconds'
)


@pytest.mark.parametrize(
'group_name',
[
'/',
'/sub_group',
'/sub_group/nested_group',
],
)
def test_delete_variable(sample_netcdf4_file_test05, group_name):
"""Ensure correct variables are deleted from the datatree."""
with xr.open_datatree(
sample_netcdf4_file_test05, decode_times=False
) as test_datatree:
variable_name = 'string_time_utc_seconds'
if group_name == '/':
full_path = f'/{variable_name}'
else:
full_path = f'{group_name}/{variable_name}'

# check that variable is in datatree before deletion
assert variable_name in test_datatree[group_name].ds.data_vars
initial_var_count = len(test_datatree[group_name].ds.data_vars)

delete_variable(test_datatree, full_path)
assert variable_name not in test_datatree[group_name].ds.data_vars
assert len(test_datatree[group_name].ds.data_vars) == initial_var_count - 1