From 130a2be16e6b797996a4594d73451e6eb624475f Mon Sep 17 00:00:00 2001 From: Vlad0n20 Date: Mon, 9 Jun 2025 19:49:48 +0300 Subject: [PATCH 1/2] Add new tests for update Datacite metadata celery task --- .../test_datacite_verified_links.py | 209 ++++++++++++++++++ tests/identifiers/test_tasks.py | 125 +++++++++++ website/identifiers/tasks.py | 12 +- 3 files changed, 345 insertions(+), 1 deletion(-) create mode 100644 tests/identifiers/test_datacite_verified_links.py create mode 100644 tests/identifiers/test_tasks.py diff --git a/tests/identifiers/test_datacite_verified_links.py b/tests/identifiers/test_datacite_verified_links.py new file mode 100644 index 00000000000..1005f380d64 --- /dev/null +++ b/tests/identifiers/test_datacite_verified_links.py @@ -0,0 +1,209 @@ +import pytest +from unittest import mock + +from osf.metadata.serializers.datacite.datacite_tree_walker import DataciteTreeWalker +from osf.metadata import gather + +from osf_tests.factories import NodeFactory, UserFactory + + +@pytest.mark.django_db +class TestDataCiteVerifiedLinks: + + @pytest.fixture + def node(self): + return NodeFactory(is_public=True) + + @pytest.fixture + def basket(self, node): + from osf.metadata.osf_gathering import OsfFocus + focus = OsfFocus(node) + return gather.Basket(focus) + + @pytest.fixture + def tree_walker(self, basket): + def mock_visit(parent, tag, text=None, attrib=None, is_list=False): + element = {'tag': tag, 'text': text, 'attrib': attrib or {}} + if is_list: + element['children'] = [] + return element + + walker = DataciteTreeWalker(basket, None, mock_visit) + walker.visit = mock_visit + return walker + + @mock.patch('framework.sentry.log_message') + def test_valid_verified_links(self, mock_sentry_log, tree_walker, mock_gravy_valet_get_verified_links): + mock_gravy_valet_get_verified_links.return_value = [ + {'target_url': 'https://example.com/dataset', 'resource_type': 'dataset'}, + {'target_url': 'https://github.com/user/repo', 'resource_type': 'software'}, + {'target_url': 'https://osf.io/abc123/', 'resource_type': 'text'} + ] + + parent_el = {'tag': 'resource', 'children': []} + + tree_walker._visit_related_and_verified_links(parent_el) + + mock_gravy_valet_get_verified_links.assert_called_once_with(tree_walker.basket.focus.dbmodel._id) + + mock_sentry_log.assert_not_called() + + @mock.patch('framework.sentry.log_message') + def test_missing_target_url(self, mock_sentry_log, tree_walker, mock_gravy_valet_get_verified_links): + mock_gravy_valet_get_verified_links.return_value = [ + {'resource_type': 'dataset'}, + {'target_url': 'https://example.com/valid', 'resource_type': 'software'} # Valid link + ] + + parent_el = {'tag': 'resource', 'children': []} + tree_walker._visit_related_and_verified_links(parent_el) + + mock_sentry_log.assert_called_once() + call_args = mock_sentry_log.call_args[0] + assert 'Skipped items for node' in call_args[0] + assert 'Missing data: [link=None, type=dataset]' in call_args[0] + + @mock.patch('framework.sentry.log_message') + def test_missing_resource_type(self, mock_sentry_log, tree_walker, mock_gravy_valet_get_verified_links): + mock_gravy_valet_get_verified_links.return_value = [ + {'target_url': 'https://example.com/dataset'}, # Missing resource_type + {'target_url': 'https://example.com/valid', 'resource_type': 'software'} # Valid link + ] + + parent_el = {'tag': 'resource', 'children': []} + tree_walker._visit_related_and_verified_links(parent_el) + + mock_sentry_log.assert_called_once() + call_args = mock_sentry_log.call_args[0] + assert 'Skipped items for node' in call_args[0] + assert 'Missing data: [link=https://example.com/dataset, type=None]' in call_args[0] + + @mock.patch('framework.sentry.log_message') + def test_invalid_url_format(self, mock_sentry_log, tree_walker, mock_gravy_valet_get_verified_links): + mock_gravy_valet_get_verified_links.return_value = [ + {'target_url': 'not-a-valid-url', 'resource_type': 'dataset'}, + {'target_url': 'also.invalid', 'resource_type': 'software'}, + {'target_url': 'https://example.com/valid', 'resource_type': 'text'} + ] + + parent_el = {'tag': 'resource', 'children': []} + tree_walker._visit_related_and_verified_links(parent_el) + + mock_sentry_log.assert_called_once() + call_args = mock_sentry_log.call_args[0] + assert 'Skipped items for node' in call_args[0] + assert 'Invalid link: [link=not-a-valid-url, type=dataset]' in call_args[0] + assert 'Invalid link: [link=also.invalid, type=software]' in call_args[0] + + @mock.patch('framework.sentry.log_message') + def test_multiple_issues_combined(self, mock_sentry_log, tree_walker, mock_gravy_valet_get_verified_links): + mock_gravy_valet_get_verified_links.return_value = [ + {'resource_type': 'dataset'}, + {'target_url': 'invalid-url', 'resource_type': 'software'}, + {'target_url': 'https://example.com/missing-type'}, + {'target_url': 'https://example.com/valid', 'resource_type': 'text'} + ] + + parent_el = {'tag': 'resource', 'children': []} + tree_walker._visit_related_and_verified_links(parent_el) + + mock_sentry_log.assert_called_once() + call_args = mock_sentry_log.call_args[0] + log_message = call_args[0] + + assert 'Skipped items for node' in log_message + assert 'Missing data: [link=None, type=dataset]' in log_message + assert 'Invalid link: [link=invalid-url, type=software]' in log_message + assert 'Missing data: [link=https://example.com/missing-type, type=None]' in log_message + + @mock.patch('framework.sentry.log_message') + def test_empty_verified_links(self, mock_sentry_log, tree_walker, mock_gravy_valet_get_verified_links): + mock_gravy_valet_get_verified_links.return_value = [] + + parent_el = {'tag': 'resource', 'children': []} + tree_walker._visit_related_and_verified_links(parent_el) + + mock_gravy_valet_get_verified_links.assert_called_once() + + mock_sentry_log.assert_not_called() + + @mock.patch('framework.sentry.log_message') + def test_resource_type_title_case(self, mock_sentry_log, tree_walker, mock_gravy_valet_get_verified_links): + mock_gravy_valet_get_verified_links.return_value = [ + {'target_url': 'https://example.com/dataset', 'resource_type': 'dataset'}, + {'target_url': 'https://example.com/software', 'resource_type': 'software'}, + {'target_url': 'https://example.com/text', 'resource_type': 'text'} + ] + + parent_el = {'tag': 'resource', 'children': []} + + visit_calls = [] + original_visit = tree_walker.visit + + def capture_visit(*args, **kwargs): + visit_calls.append((args, kwargs)) + return original_visit(*args, **kwargs) + + tree_walker.visit = capture_visit + tree_walker._visit_related_and_verified_links(parent_el) + + related_identifier_calls = [ + call for call in visit_calls + if len(call[0]) >= 2 and call[0][1] == 'relatedIdentifier' + ] + + assert len(related_identifier_calls) == 3 + + resource_types = [call[1]['attrib']['resourceTypeGeneral'] for call in related_identifier_calls] + assert 'Dataset' in resource_types + assert 'Software' in resource_types + assert 'Text' in resource_types + + def test_non_abstract_node_skipped(self, tree_walker, mock_gravy_valet_get_verified_links): + from osf.metadata.osf_gathering import OsfFocus + user = UserFactory() + focus = OsfFocus(user) + tree_walker.basket = gather.Basket(focus) + + parent_el = {'tag': 'resource', 'children': []} + tree_walker._visit_related_and_verified_links(parent_el) + + mock_gravy_valet_get_verified_links.assert_not_called() + + @mock.patch('framework.sentry.log_message') + def test_edge_case_empty_strings(self, mock_sentry_log, tree_walker, mock_gravy_valet_get_verified_links): + mock_gravy_valet_get_verified_links.return_value = [ + {'target_url': '', 'resource_type': 'dataset'}, + {'target_url': 'https://example.com/valid', 'resource_type': ''}, + {'target_url': 'https://example.com/valid2', 'resource_type': 'software'} + ] + + parent_el = {'tag': 'resource', 'children': []} + tree_walker._visit_related_and_verified_links(parent_el) + + mock_sentry_log.assert_called_once() + call_args = mock_sentry_log.call_args[0] + log_message = call_args[0] + + assert 'Missing data: [link=, type=dataset]' in log_message + assert 'Missing data: [link=https://example.com/valid, type=]' in log_message + + @mock.patch('framework.sentry.log_message') + def test_url_validation_edge_cases(self, mock_sentry_log, tree_walker, mock_gravy_valet_get_verified_links): + mock_gravy_valet_get_verified_links.return_value = [ + {'target_url': 'http://example.com', 'resource_type': 'dataset'}, + {'target_url': 'https://example.com', 'resource_type': 'software'}, + {'target_url': 'ftp://example.com/file', 'resource_type': 'text'}, + {'target_url': 'example.com', 'resource_type': 'dataset'}, + {'target_url': 'www.example.com', 'resource_type': 'software'}, + ] + + parent_el = {'tag': 'resource', 'children': []} + tree_walker._visit_related_and_verified_links(parent_el) + + mock_sentry_log.assert_called_once() + call_args = mock_sentry_log.call_args[0] + log_message = call_args[0] + + assert 'Invalid link: [link=example.com, type=dataset]' in log_message + assert 'Invalid link: [link=www.example.com, type=software]' in log_message \ No newline at end of file diff --git a/tests/identifiers/test_tasks.py b/tests/identifiers/test_tasks.py new file mode 100644 index 00000000000..77084462a25 --- /dev/null +++ b/tests/identifiers/test_tasks.py @@ -0,0 +1,125 @@ +import pytest +from unittest import mock +from django.utils import timezone +from celery.exceptions import RetryTaskError + +from osf_tests.factories import UserFactory, NodeFactory +from website.identifiers.tasks import task__update_verified_links +from website.identifiers.clients.datacite import DataCiteClient + +pytestmark = pytest.mark.django_db + +@pytest.fixture() +def user(): + return UserFactory() + +@pytest.fixture() +def node(user): + return NodeFactory(creator=user, is_public=True) + +@pytest.fixture() +def node_with_doi(user): + node = NodeFactory(creator=user, is_public=True) + node.set_identifier_value('doi', '10.1234/test') + return node + +@pytest.fixture() +def mock_datacite_client(): + with mock.patch('website.identifiers.clients.datacite.DataCiteClient') as mock_client: + mock_instance = mock.Mock(spec=DataCiteClient) + mock_instance.update_identifier.return_value = {'doi': '10.1234/test'} + mock_client.return_value = mock_instance + yield mock_instance + +@pytest.fixture(autouse=True) +def mock_get_doi_client(mock_datacite_client): + with mock.patch('osf.models.AbstractNode.get_doi_client', return_value=mock_datacite_client): + yield + +@pytest.mark.enable_enqueue_task +class TestUpdateDOIMetadataWithVerifiedLinks: + + def test_update_doi_metadata_success(self, node_with_doi, mock_datacite_client): + verified_links = { + 'https://osf.io/': 'Text', + f'https://osf.io/{node_with_doi._id}/': 'Text' + } + node_with_doi.verified_links = verified_links + node_with_doi.save() + + task__update_verified_links.delay(node_with_doi._id) + + node_with_doi.reload() + assert node_with_doi.verified_links == verified_links + mock_datacite_client.update_identifier.assert_called_once_with(node_with_doi, 'doi') + + def test_update_doi_metadata_no_doi_no_create(self, node, mock_datacite_client): + task__update_verified_links.delay(node._id) + mock_datacite_client.update_identifier.assert_not_called() + + + @mock.patch('framework.sentry.log_message') + @mock.patch('website.identifiers.tasks.task__update_verified_links.retry') + def test_update_doi_metadata_deleted_node(self, mock_retry, mock_sentry_log, node_with_doi, mock_datacite_client): + node_with_doi.is_deleted = True + node_with_doi.deleted = timezone.now() + node_with_doi.save() + + mock_datacite_client.update_identifier.side_effect = Exception('Node is deleted') + mock_retry.side_effect = Exception('Retry prevented') + + with pytest.raises(Exception, match='Retry prevented'): + task__update_verified_links.delay(node_with_doi._id) + + mock_sentry_log.assert_called_with( + 'Failed to update DOI metadata with verified links', + extra_data={'guid': node_with_doi._id, 'error': mock.ANY}, + level=mock.ANY + ) + + @mock.patch('website.identifiers.tasks.task__update_verified_links.retry') + def test_update_doi_metadata_exception_retry(self, mock_retry, node_with_doi, mock_datacite_client): + mock_datacite_client.update_identifier.side_effect = Exception('Test error') + mock_retry.side_effect = RetryTaskError() + + with pytest.raises(RetryTaskError): + task__update_verified_links.delay(node_with_doi._id) + + mock_datacite_client.update_identifier.assert_called_once_with(node_with_doi, 'doi') + mock_retry.assert_called_once_with(exc=mock.ANY) + + @mock.patch('framework.sentry.log_message') + def test_update_doi_metadata_success_log(self, mock_log, node_with_doi, mock_datacite_client): + task__update_verified_links.delay(node_with_doi._id) + + mock_log.assert_any_call( + 'DOI metadata with verified links updated for guid', + extra_data={'guid': node_with_doi._id}, + level=mock.ANY + ) + mock_datacite_client.update_identifier.assert_called_once_with(node_with_doi, 'doi') + + def test_update_doi_metadata_with_multiple_verified_links(self, node_with_doi, mock_datacite_client): + verified_links = { + 'https://osf.io/': 'Text', + f'https://osf.io/{node_with_doi._id}/': 'Text', + 'https://example.com': 'Text', + 'https://test.org': 'Text' + } + node_with_doi.verified_links = verified_links + node_with_doi.save() + + task__update_verified_links.delay(node_with_doi._id) + + node_with_doi.reload() + assert node_with_doi.verified_links == verified_links + mock_datacite_client.update_identifier.assert_called_once_with(node_with_doi, 'doi') + + def test_update_doi_metadata_private_node(self, node_with_doi, mock_datacite_client): + node_with_doi.is_public = False + node_with_doi.save() + mock_datacite_client.reset_mock() + + task__update_verified_links.delay(node_with_doi._id) + + mock_datacite_client.update_identifier.assert_called_once_with(node_with_doi, 'doi') diff --git a/website/identifiers/tasks.py b/website/identifiers/tasks.py index 29afa9645ee..fe5c35848f5 100644 --- a/website/identifiers/tasks.py +++ b/website/identifiers/tasks.py @@ -30,8 +30,18 @@ def task__update_verified_links(self, target_guid): target_object = Guid.load(target_guid).referent try: target_object.request_identifier_update(category='doi') + sentry.log_message( + 'DOI metadata with verified links updated for guid', + extra_data={'guid': target_guid}, + level=logging.INFO + ) logger.debug(f'DOI metadata for guid with verified links updated: [guid={target_guid}]') - except GVException as e: + except Exception as e: + sentry.log_message( + 'Failed to update DOI metadata with verified links', + extra_data={'guid': target_guid, 'error': str(e)}, + level=logging.ERROR + ) logger.error(f'DOI metadata for guid with verified links failed to update: [guid={target_guid}]') raise self.retry(exc=e) From fd244a88203ea3a2913e64f4da009cc1dcb43343 Mon Sep 17 00:00:00 2001 From: Vlad0n20 Date: Fri, 13 Jun 2025 18:36:27 +0300 Subject: [PATCH 2/2] Remove sentry logs for success request_identifier_update case --- website/identifiers/tasks.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/website/identifiers/tasks.py b/website/identifiers/tasks.py index fe5c35848f5..0122a03d0b4 100644 --- a/website/identifiers/tasks.py +++ b/website/identifiers/tasks.py @@ -30,11 +30,6 @@ def task__update_verified_links(self, target_guid): target_object = Guid.load(target_guid).referent try: target_object.request_identifier_update(category='doi') - sentry.log_message( - 'DOI metadata with verified links updated for guid', - extra_data={'guid': target_guid}, - level=logging.INFO - ) logger.debug(f'DOI metadata for guid with verified links updated: [guid={target_guid}]') except Exception as e: sentry.log_message(