From 52656628c4707c06a2b35b8f2f62c262647f1699 Mon Sep 17 00:00:00 2001 From: Eric Nordlund Date: Thu, 5 Jun 2025 15:41:41 -0700 Subject: [PATCH] Fix linkcheck anchor encoding issues - Enhanced AnchorCheckParser to handle multiple anchor variations - Added comprehensive test coverage for encoded anchors - Fixed false 'Anchor not found' errors for URLs with encoded characters - Maintains full backward compatibility - All linting checks pass --- CHANGES.rst | 2 + sphinx/builders/linkcheck.py | 47 ++++- .../test-linkcheck-encoded-anchors/conf.py | 4 + .../test_builders/test_anchor_check_parser.py | 76 ++++++++ .../test_build_linkcheck_encoded_anchors.py | 169 ++++++++++++++++++ 5 files changed, 289 insertions(+), 9 deletions(-) create mode 100644 tests/roots/test-linkcheck-encoded-anchors/conf.py create mode 100644 tests/test_builders/test_anchor_check_parser.py create mode 100644 tests/test_builders/test_build_linkcheck_encoded_anchors.py diff --git a/CHANGES.rst b/CHANGES.rst index f575efab7c7..80b6efee8f8 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -37,6 +37,8 @@ Bugs fixed Patch by Alicia Garcia-Raboso. * #13528: Add tilde ``~`` prefix support for :rst:role:`py:deco`. Patch by Shengyu Zhang and Adam Turner. +* linkcheck: Fix false "Anchor not found" errors for valid URLs with encoded + characters in fragment identifiers. Testing ------- diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index ff6878f2acb..085acd5fe66 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -11,7 +11,7 @@ from html.parser import HTMLParser from queue import PriorityQueue, Queue from threading import Thread -from typing import TYPE_CHECKING, NamedTuple, cast +from typing import TYPE_CHECKING, Any, NamedTuple, cast from urllib.parse import quote, unquote, urlparse, urlsplit, urlunparse from docutils import nodes @@ -485,6 +485,9 @@ def _retrieval_methods( def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties: req_url, delimiter, anchor = uri.partition('#') + original_encoded_anchor = ( + anchor # Store the original encoded anchor before decoding + ) if delimiter and anchor: for rex in self.anchors_ignore: if rex.match(anchor): @@ -536,7 +539,9 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties: ) as response: if anchor and self.check_anchors and response.ok: try: - found = contains_anchor(response, anchor) + found = contains_anchor( + response, anchor, original_encoded_anchor + ) except UnicodeDecodeError: return ( _Status.IGNORED, @@ -686,11 +691,13 @@ def _get_request_headers( return {} -def contains_anchor(response: Response, anchor: str) -> bool: +def contains_anchor( + response: Response, anchor: str, original_encoded_anchor: str = '' +) -> bool: """Determine if an anchor is contained within an HTTP response.""" - parser = AnchorCheckParser(anchor) - # Read file in chunks. If we find a matching anchor, we break - # the loop early in hopes not to have to download the whole thing. + parser = AnchorCheckParser(anchor, original_encoded_anchor) + # Read file in chunks. If we find a matching anchor, we break the loop early + # to avoid downloading the entire response body. for chunk in response.iter_content(chunk_size=4096, decode_unicode=True): if isinstance(chunk, bytes): # requests failed to decode, manually try to decode it @@ -706,15 +713,37 @@ def contains_anchor(response: Response, anchor: str) -> bool: class AnchorCheckParser(HTMLParser): """Specialised HTML parser that looks for a specific anchor.""" - def __init__(self, search_anchor: str) -> None: + def __init__(self, search_anchor: str, original_encoded_anchor: str = '') -> None: + """Initialize the parser with multiple anchor variations. + + Args: + search_anchor: The decoded anchor to search for + (e.g., "standard-input/output-stdio") + original_encoded_anchor: The original encoded anchor + (e.g., "standard-input%2Foutput-stdio") + """ super().__init__() - self.search_anchor = search_anchor + # Create variations of the anchor to check + self.search_variations = { + search_anchor, # decoded (current behavior) + } + + # Add the original encoded version if provided + if original_encoded_anchor: + self.search_variations.add(original_encoded_anchor) + + # Add a re-encoded version if the decoded anchor contains characters + # that would be encoded + if search_anchor != quote(search_anchor, safe=''): + self.search_variations.add(quote(search_anchor, safe='')) + self.found = False def handle_starttag(self, tag: Any, attrs: Any) -> None: for key, value in attrs: - if key in {'id', 'name'} and value == self.search_anchor: + # Check if the attribute value matches any of our variations + if key in {'id', 'name'} and value in self.search_variations: self.found = True break diff --git a/tests/roots/test-linkcheck-encoded-anchors/conf.py b/tests/roots/test-linkcheck-encoded-anchors/conf.py new file mode 100644 index 00000000000..88442504003 --- /dev/null +++ b/tests/roots/test-linkcheck-encoded-anchors/conf.py @@ -0,0 +1,4 @@ +root_doc = 'encoded_anchors' +exclude_patterns = ['_build'] +linkcheck_anchors = True +linkcheck_timeout = 0.25 diff --git a/tests/test_builders/test_anchor_check_parser.py b/tests/test_builders/test_anchor_check_parser.py new file mode 100644 index 00000000000..c58a37158a7 --- /dev/null +++ b/tests/test_builders/test_anchor_check_parser.py @@ -0,0 +1,76 @@ +"""Test the AnchorCheckParser class.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING +from unittest import mock + +from sphinx.builders.linkcheck import AnchorCheckParser, contains_anchor + +if TYPE_CHECKING: + from typing import Any + + +def test_anchor_check_parser_basic() -> None: + """Test basic anchor matching functionality.""" + parser = AnchorCheckParser('test-anchor') + parser.feed('
Test
') + assert parser.found is True + + parser = AnchorCheckParser('non-existent') + parser.feed('
Test
') + assert parser.found is False + + +def test_anchor_check_parser_with_encoded_anchors() -> None: + """Test anchor matching with encoded characters.""" + # Test with encoded slash + parser = AnchorCheckParser( + 'standard-input/output-stdio', 'standard-input%2Foutput-stdio' + ) + parser.feed( + '
Test
' + ) + assert parser.found is True + + # Test with plus sign + parser = AnchorCheckParser('encoded+anchor', 'encoded%2Banchor') + parser.feed('
Test
') + assert parser.found is True + + # Test with space + parser = AnchorCheckParser('encoded space', 'encoded%20space') + parser.feed('
Test
') + assert parser.found is True + + +def test_contains_anchor_with_encoded_characters() -> None: + """Test the contains_anchor function with encoded characters.""" + mock_response = mock.MagicMock() + + # Setup a response that returns HTML with encoded anchors + def mock_iter_content(chunk_size: Any = None, decode_unicode: Any = None) -> Any: + content = '
Test
' + yield content + + mock_response.iter_content = mock_iter_content + + # Test with original encoded anchor + assert ( + contains_anchor( + mock_response, + 'standard-input/output-stdio', + 'standard-input%2Foutput-stdio', + ) + is True + ) + + # Test with decoded anchor only + mock_response2 = mock.MagicMock() + mock_response2.iter_content = mock_iter_content + assert contains_anchor(mock_response2, 'standard-input/output-stdio') is True + + # Test with non-existent anchor + mock_response3 = mock.MagicMock() + mock_response3.iter_content = mock_iter_content + assert contains_anchor(mock_response3, 'non-existent-anchor') is False diff --git a/tests/test_builders/test_build_linkcheck_encoded_anchors.py b/tests/test_builders/test_build_linkcheck_encoded_anchors.py new file mode 100644 index 00000000000..596d4e093b4 --- /dev/null +++ b/tests/test_builders/test_build_linkcheck_encoded_anchors.py @@ -0,0 +1,169 @@ +"""Test the linkcheck builder's ability to handle encoded anchors.""" + +from __future__ import annotations + +import json +import re +from http.server import BaseHTTPRequestHandler +from typing import TYPE_CHECKING + +import pytest + +from tests.utils import serve_application + +if TYPE_CHECKING: + from collections.abc import Iterable + from typing import Any + + from sphinx.testing.util import SphinxTestApp + + +class EncodedAnchorsHandler(BaseHTTPRequestHandler): + protocol_version = 'HTTP/1.1' + + def _chunk_content(self, content: str, *, max_chunk_size: int) -> Iterable[bytes]: + """Split content into chunks of a maximum size.""" + + def _encode_chunk(chunk: bytes) -> Iterable[bytes]: + """Encode a bytestring into a format suitable for HTTP chunked-transfer.""" + yield f'{len(chunk):X}'.encode('ascii') + yield b'\r\n' + yield chunk + yield b'\r\n' + + buffer = b'' + for char in content: + buffer += char.encode('utf-8') + if len(buffer) >= max_chunk_size: + chunk, buffer = buffer[:max_chunk_size], buffer[max_chunk_size:] + yield from _encode_chunk(chunk) + + # Flush remaining bytes, if any + if buffer: + yield from _encode_chunk(buffer) + + # Emit a final empty chunk to close the stream + yield from _encode_chunk(b'') + + def _send_chunked(self, content: str) -> bool: + """Send content in chunks.""" + for chunk in self._chunk_content(content, max_chunk_size=20): + try: + self.wfile.write(chunk) + except (BrokenPipeError, ConnectionResetError) as e: + self.log_message(str(e)) + return False + return True + + def do_HEAD(self) -> None: + """Handle HEAD requests.""" + print(f'HEAD request for path: {self.path}') + if self.path in {'/standard-encoded-anchors', '/various-encoded-chars'}: + self.send_response(200, 'OK') + else: + self.send_response(404, 'Not Found') + self.end_headers() + + def do_GET(self) -> None: + """Serve test pages with encoded anchors.""" + if self.path == '/standard-encoded-anchors': + self.send_response(200, 'OK') + # Note the ID has an encoded forward slash (%2F) + content = """ + + + Encoded Anchors Test + +

Standard I/O

+

Encoded Plus

+ + + """ + elif self.path == '/various-encoded-chars': + self.send_response(200, 'OK') + content = """ + + + Various Encoded Characters + +

Encoded Exclamation

+

Encoded Hash

+

Encoded Percent

+

Encoded Ampersand

+
Encoded Question
+
Encoded At
+ + + """ + else: + self.send_response(404, 'Not Found') + content = 'not found\n' + self.send_header('Transfer-Encoding', 'chunked') + self.end_headers() + self._send_chunked(content) + + +@pytest.mark.sphinx( + 'linkcheck', + testroot='linkcheck-encoded-anchors', + freshenv=True, +) +def test_encoded_anchors_handling(app: SphinxTestApp, tmp_path: Any) -> None: + """Test that linkcheck correctly handles URLs with encoded anchors.""" + with serve_application(app, EncodedAnchorsHandler) as address: + # Create test file with encoded anchor links using the server address + (app.srcdir / 'encoded_anchors.rst').write_text( + f""" +Encoded Anchors Test +==================== + +Links with encoded anchors: + +* `Standard I/O `_ +* `Encoded Plus `_ +* `Encoded Exclamation `_ +* `Encoded Hash `_ +* `Encoded Percent `_ +* `Encoded Ampersand `_ +* `Encoded Question `_ +* `Encoded At `_ +""", + encoding='utf-8', + ) + + app.build() + + # Parse the JSON output to check the results + content = (app.outdir / 'output.json').read_text(encoding='utf8') + data = [json.loads(record) for record in content.splitlines()] + + # Filter for our encoded anchor URLs + encoded_anchor_results = [ + record + for record in data + if any( + x in record['uri'] + for x in ['standard-encoded-anchors#', 'various-encoded-chars#'] + ) + ] + + # All links with encoded anchors should be working + assert all(record['status'] == 'working' for record in encoded_anchor_results) + + # Verify specific links + uri_pattern = re.compile( + f'http://{re.escape(address)}/standard-encoded-anchors#standard-input/output-stdio' + ) + stdio_link = next( + record for record in encoded_anchor_results if uri_pattern.match(record['uri']) + ) + assert stdio_link['status'] == 'working' + + # Check for encoded plus link + plus_pattern = re.compile( + f'http://{re.escape(address)}/standard-encoded-anchors#encoded\\+anchor' + ) + plus_link = next( + record for record in encoded_anchor_results if plus_pattern.match(record['uri']) + ) + assert plus_link['status'] == 'working'