-
Notifications
You must be signed in to change notification settings - Fork 1
DAS-2418: Add exclusions for SMAP L3 string variables #33
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
89a158a
fcd2987
b099eb7
dd1fcc8
69d3374
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1 +1 @@ | ||
| 1.4.0 | ||
| 1.5.0 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -46,8 +46,12 @@ def annotate_granule( | |
| config_file=varinfo_config_file, | ||
| ) | ||
|
|
||
| if len(granule_varinfo.cf_config.metadata_overrides): | ||
| # There are metadata overrides applicable to the granule's collection: | ||
| if ( | ||
| len(granule_varinfo.cf_config.metadata_overrides) | ||
| or granule_varinfo.cf_config.excluded_science_variables | ||
| ): | ||
| # There are metadata overrides or excluded variables | ||
| # applicable to the granule's collection: | ||
| amend_in_file_metadata(input_file_name, output_file_name, granule_varinfo) | ||
| else: | ||
| # There are no updates required, so copy the input file as-is: | ||
|
|
@@ -60,10 +64,10 @@ def amend_in_file_metadata( | |
| """Update metadata attributes according to known rules. | ||
|
|
||
| First, identify the variables or groups needing to be updated, or variables | ||
| that need to be created. Next create any missing, attribute only, variables. | ||
| Update the metadata attributes of all variables listed in overrides, or | ||
| removing any attributes with an overriding value of None. Lastly, update | ||
| the `history` global attribute. | ||
| that need to be created. Then, delete any variables that are configured to be | ||
| excluded. Next create any missing, attribute only, variables. Update the metadata | ||
| attributes of all variables listed in overrides, or removing any attributes with an | ||
| overriding value of None. Lastly, update the `history` global attribute. | ||
|
|
||
| When opening the file as a DataTree, attempts to decode times, coordinates | ||
| and other CF-Convention metadata are disabled, to allow updates to be made | ||
|
|
@@ -74,7 +78,7 @@ def amend_in_file_metadata( | |
| items_to_update, variables_to_create = get_matching_groups_and_variables( | ||
| granule_varinfo, | ||
| ) | ||
|
|
||
| variables_to_delete = get_variables_to_delete(granule_varinfo) | ||
| with xr.open_datatree( | ||
| input_file_name, | ||
| decode_times=False, | ||
|
|
@@ -83,7 +87,15 @@ def amend_in_file_metadata( | |
| concat_characters=True, | ||
| use_cftime=False, | ||
| mask_and_scale=False, | ||
| engine='h5netcdf', | ||
| ) as datatree: | ||
| # Delete the excluded variables from the datatree and remove them from | ||
| # the set of items to update | ||
| for variable in variables_to_delete: | ||
| if variable in items_to_update: | ||
| items_to_update.remove(variable) | ||
| delete_variable(datatree, variable) | ||
|
|
||
| # Update all pre-existing variables or groups with metadata overrides including | ||
| # dimension renaming where applicable. | ||
| update_group_and_variable_attributes(datatree, items_to_update, granule_varinfo) | ||
|
|
@@ -117,7 +129,7 @@ def amend_in_file_metadata( | |
| # whole `xarray.DataTree` in one operation. Making this write variables | ||
| # and group separately reduces the memory usage, but makes the | ||
| # operation slower. (See Harmony SMAP L2 Gridder implementation) | ||
| datatree.to_netcdf(output_file_name) | ||
| datatree.to_netcdf(output_file_name, engine='h5netcdf') | ||
|
|
||
|
|
||
| def get_matching_groups_and_variables( | ||
|
|
@@ -510,3 +522,31 @@ def get_referenced_variables( | |
| ) | ||
|
|
||
| return referenced_variables | ||
|
|
||
|
|
||
| def get_variables_to_delete( | ||
| var_info: VarInfoFromNetCDF4, | ||
| ) -> list[str]: | ||
| """Returns a list of variables to delete identified by VarInfo configuration.""" | ||
| vars_to_delete = [] | ||
flamingbear marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| var_list = var_info.get_all_variables() | ||
|
|
||
| vars_to_delete = [ | ||
| var for var in var_list if is_excluded_science_variable(var_info, var) | ||
| ] | ||
| return vars_to_delete | ||
|
|
||
|
|
||
| def is_excluded_science_variable(var_info: VarInfoFromNetCDF4, var) -> bool: | ||
| """Returns True if variable is explicitly excluded by VarInfo configuration.""" | ||
| exclusions_pattern = re.compile( | ||
| '|'.join(var_info.cf_config.excluded_science_variables) | ||
| ) | ||
| return var_info.variable_is_excluded(var, exclusions_pattern) | ||
|
Comment on lines
+535
to
+540
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So this is probably the time to move this to earthdata-varinfo. Because I'll bet Josie has a similar function in her PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I certainly do. |
||
|
|
||
|
|
||
| def delete_variable(datatree, full_variable_path: str) -> None: | ||
| """Delete a variable from the DataTree.""" | ||
| parent_group, variable_name = full_variable_path.rsplit('/', 1) | ||
| node = datatree[parent_group] if parent_group else datatree | ||
| del node[variable_name] | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,3 +4,4 @@ earthdata-varinfo ~= 3.0.2 | |
| harmony-service-lib ~= 2.5.0 | ||
| netCDF4 ~= 1.6.5 | ||
| xarray == 2025.9.0 | ||
| h5netcdf ~= 1.6.4 | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -394,3 +394,74 @@ def sample_varinfo_test04( | |
| return VarInfoFromNetCDF4( | ||
| sample_netcdf4_file, config_file=varinfo_config_file, short_name='TEST04' | ||
| ) | ||
|
|
||
|
|
||
| @fixture(scope='function') | ||
| def sample_netcdf4_file_test05(temp_dir) -> str: | ||
| """Create a sample NetCDF-4 file for testing excluding variables.""" | ||
| file_name = path_join(temp_dir, 'test_input_05.nc') | ||
|
|
||
| sample_datatree = xr.DataTree( | ||
| xr.Dataset( | ||
| data_vars={ | ||
| 'string_time_utc_seconds': xr.DataArray(np.ones((3, 3))), | ||
|
||
| 'string_time_seconds': xr.DataArray(np.ones((3, 3))), | ||
| }, | ||
| ) | ||
| ) | ||
|
|
||
| sample_datatree['/sub_group'] = xr.Dataset( | ||
| data_vars={ | ||
| 'string_time_utc_seconds': xr.DataArray(np.ones((3, 3))), | ||
| 'string_time_seconds': xr.DataArray(np.ones((3, 3))), | ||
| }, | ||
| ) | ||
|
|
||
| sample_datatree['/sub_group/nested_group'] = xr.Dataset( | ||
| data_vars={ | ||
| 'string_time_utc_seconds': xr.DataArray(np.ones((3, 3))), | ||
| 'string_time_seconds': xr.DataArray(np.ones((3, 3))), | ||
| }, | ||
| ) | ||
|
|
||
| sample_datatree.to_netcdf(file_name, encoding=None) | ||
| return file_name | ||
|
|
||
|
|
||
| @fixture(scope='function') | ||
| def expected_output_netcdf4_file_test05(temp_dir) -> str: | ||
| """Create a sample NetCDF-4 file for testing excluding variables.""" | ||
flamingbear marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| file_name = path_join(temp_dir, 'test_input_05.nc') | ||
|
|
||
| sample_datatree = xr.DataTree( | ||
| xr.Dataset( | ||
| data_vars={ | ||
| 'string_time_seconds': xr.DataArray(np.ones((3, 3))), | ||
| }, | ||
| ) | ||
| ) | ||
|
|
||
| sample_datatree['/sub_group'] = xr.Dataset( | ||
| data_vars={ | ||
| 'string_time_seconds': xr.DataArray(np.ones((3, 3))), | ||
| }, | ||
| ) | ||
|
|
||
| sample_datatree['/sub_group/nested_group'] = xr.Dataset( | ||
| data_vars={ | ||
| 'string_time_seconds': xr.DataArray(np.ones((3, 3))), | ||
| }, | ||
| ) | ||
|
|
||
| sample_datatree.to_netcdf(file_name, encoding=None) | ||
| return file_name | ||
|
|
||
|
|
||
| @fixture(scope='function') | ||
| def sample_varinfo_test05( | ||
| sample_netcdf4_file_test05, varinfo_config_file | ||
| ) -> VarInfoFromNetCDF4: | ||
| """Create sample VarInfoFromNetCDF4 instance.""" | ||
| return VarInfoFromNetCDF4( | ||
| sample_netcdf4_file_test05, config_file=varinfo_config_file, short_name='TEST05' | ||
| ) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just commenting on this file as a whole - I never thought about creating a test config file like this, kinda cool! I suppose I wonder if it might be an issue that we're not directly testing the changes to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While I think both approaches are okay, the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, okay got it! And that makes sense especially for the metadata annotator. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, do you think it'd be worthwhile to test the specific SPL3FTA configuration, or nah? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The precedent set in this repository has been to avoid tests for specific collection configuration rules. I think this is mainly because it would inflate the test suite (would require tests for every collection that is configured) and we'd also need to store granules for these tests in the repository. We've been reliant on properly testing these collection specific configuration rules in the test instructions for the PRs that add them. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,6 +11,7 @@ | |
| from metadata_annotator.annotate import ( | ||
| annotate_granule, | ||
| create_new_variable, | ||
| delete_variable, | ||
| get_dimension_variables, | ||
| get_geotransform_config, | ||
| get_grid_start_index, | ||
|
|
@@ -19,7 +20,9 @@ | |
| get_spatial_dimension_type, | ||
| get_spatial_dimension_variables, | ||
| get_start_index_from_row_col_variable, | ||
| get_variables_to_delete, | ||
| is_exact_path, | ||
| is_excluded_science_variable, | ||
| is_temporary_attribute, | ||
| update_dimension_names, | ||
| update_dimension_variable_attributes, | ||
|
|
@@ -250,6 +253,31 @@ def test_annotate_granule_no_changes( | |
| assert results_datatree.identical(expected_datatree) | ||
|
|
||
|
|
||
| def test_annotate_granule_variable_exclusions_only( | ||
| sample_netcdf4_file_test05, | ||
| expected_output_netcdf4_file_test05, | ||
| temp_output_file_path, | ||
| varinfo_config_file, | ||
| mocker, | ||
| ): | ||
| """Confirm that a granule with only variable exclusion configuration is updated.""" | ||
| _ = mocker.patch('metadata_annotator.annotate.update_history_metadata') | ||
| annotate_granule( | ||
| sample_netcdf4_file_test05, | ||
| temp_output_file_path, | ||
| varinfo_config_file, | ||
| 'TEST05', | ||
| ) | ||
|
|
||
| with ( | ||
| xr.open_datatree( | ||
| expected_output_netcdf4_file_test05, decode_times=False | ||
| ) as expected_datatree, | ||
| xr.open_datatree(temp_output_file_path, decode_times=False) as results_datatree, | ||
| ): | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The first time I really hate black's formatting. 😬 |
||
| assert results_datatree.identical(expected_datatree) | ||
|
|
||
|
|
||
| def test_annotate_granule_with_dimension_variable_updates(temp_output_file_path): | ||
| """Confirm that a granule has all metadata updated as expected. | ||
|
|
||
|
|
@@ -716,3 +744,65 @@ def test_update_dimension_variables(sample_netcdf4_file_test04, sample_varinfo_t | |
| assert np.allclose( | ||
| test_datatree['sub_group'].dataset['y'], expected_y_values | ||
| ) | ||
|
|
||
|
|
||
| def test_get_variables_to_delete(sample_varinfo_test05): | ||
| """Ensure correct list of variables to delete is obtained.""" | ||
| expected_result = set( | ||
| [ | ||
| '/string_time_utc_seconds', | ||
| '/sub_group/string_time_utc_seconds', | ||
| '/sub_group/nested_group/string_time_utc_seconds', | ||
| ] | ||
| ) | ||
| assert set(get_variables_to_delete(sample_varinfo_test05)) == expected_result | ||
|
|
||
|
|
||
| def test_is_excluded_science_variable(sample_varinfo_test05): | ||
| """Ensure excluded science variables are determined correctly.""" | ||
| assert is_excluded_science_variable( | ||
| sample_varinfo_test05, '/string_time_utc_seconds' | ||
| ) | ||
| assert is_excluded_science_variable( | ||
| sample_varinfo_test05, '/sub_group/string_time_utc_seconds' | ||
| ) | ||
| assert is_excluded_science_variable( | ||
| sample_varinfo_test05, '/sub_group/nested_group/string_time_utc_seconds' | ||
| ) | ||
| assert not is_excluded_science_variable( | ||
| sample_varinfo_test05, '/string_time_seconds' | ||
| ) | ||
| assert not is_excluded_science_variable( | ||
| sample_varinfo_test05, '/sub_group/string_time_seconds' | ||
| ) | ||
| assert not is_excluded_science_variable( | ||
| sample_varinfo_test05, '/sub_group/nested/string_time_seconds' | ||
| ) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| 'group_name', | ||
| [ | ||
| '/', | ||
| '/sub_group', | ||
| '/sub_group/nested_group', | ||
| ], | ||
| ) | ||
| def test_delete_variable(sample_netcdf4_file_test05, group_name): | ||
| """Ensure correct variables are deleted from the datatree.""" | ||
| with xr.open_datatree( | ||
| sample_netcdf4_file_test05, decode_times=False | ||
| ) as test_datatree: | ||
| variable_name = 'string_time_utc_seconds' | ||
| if group_name == '/': | ||
| full_path = f'/{variable_name}' | ||
| else: | ||
| full_path = f'{group_name}/{variable_name}' | ||
|
|
||
| # check that variable is in datatree before deletion | ||
| assert variable_name in test_datatree[group_name].ds.data_vars | ||
| initial_var_count = len(test_datatree[group_name].ds.data_vars) | ||
|
|
||
| delete_variable(test_datatree, full_path) | ||
| assert variable_name not in test_datatree[group_name].ds.data_vars | ||
| assert len(test_datatree[group_name].ds.data_vars) == initial_var_count - 1 | ||
Uh oh!
There was an error while loading. Please reload this page.