diff --git a/CHANGES.rst b/CHANGES.rst
index 40d6984ca55..d7305cf71ac 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -40,6 +40,9 @@ Bugs fixed
* #13619: LaTeX: possible duplicated footnotes in PDF from object signatures
(typically if :confval:`latex_show_urls` ``= 'footnote'``).
Patch by Jean-François B.
+* #13621: linkcheck: Fix false "Anchor not found" errors for valid URLs with encoded
+ characters in fragment identifiers.
+ Patch by Eric Nordlund.
Testing
-------
diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py
index ff6878f2acb..085acd5fe66 100644
--- a/sphinx/builders/linkcheck.py
+++ b/sphinx/builders/linkcheck.py
@@ -11,7 +11,7 @@
from html.parser import HTMLParser
from queue import PriorityQueue, Queue
from threading import Thread
-from typing import TYPE_CHECKING, NamedTuple, cast
+from typing import TYPE_CHECKING, Any, NamedTuple, cast
from urllib.parse import quote, unquote, urlparse, urlsplit, urlunparse
from docutils import nodes
@@ -485,6 +485,9 @@ def _retrieval_methods(
def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
req_url, delimiter, anchor = uri.partition('#')
+ original_encoded_anchor = (
+ anchor # Store the original encoded anchor before decoding
+ )
if delimiter and anchor:
for rex in self.anchors_ignore:
if rex.match(anchor):
@@ -536,7 +539,9 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
) as response:
if anchor and self.check_anchors and response.ok:
try:
- found = contains_anchor(response, anchor)
+ found = contains_anchor(
+ response, anchor, original_encoded_anchor
+ )
except UnicodeDecodeError:
return (
_Status.IGNORED,
@@ -686,11 +691,13 @@ def _get_request_headers(
return {}
-def contains_anchor(response: Response, anchor: str) -> bool:
+def contains_anchor(
+ response: Response, anchor: str, original_encoded_anchor: str = ''
+) -> bool:
"""Determine if an anchor is contained within an HTTP response."""
- parser = AnchorCheckParser(anchor)
- # Read file in chunks. If we find a matching anchor, we break
- # the loop early in hopes not to have to download the whole thing.
+ parser = AnchorCheckParser(anchor, original_encoded_anchor)
+ # Read file in chunks. If we find a matching anchor, we break the loop early
+ # to avoid downloading the entire response body.
for chunk in response.iter_content(chunk_size=4096, decode_unicode=True):
if isinstance(chunk, bytes):
# requests failed to decode, manually try to decode it
@@ -706,15 +713,37 @@ def contains_anchor(response: Response, anchor: str) -> bool:
class AnchorCheckParser(HTMLParser):
"""Specialised HTML parser that looks for a specific anchor."""
- def __init__(self, search_anchor: str) -> None:
+ def __init__(self, search_anchor: str, original_encoded_anchor: str = '') -> None:
+ """Initialize the parser with multiple anchor variations.
+
+ Args:
+ search_anchor: The decoded anchor to search for
+ (e.g., "standard-input/output-stdio")
+ original_encoded_anchor: The original encoded anchor
+ (e.g., "standard-input%2Foutput-stdio")
+ """
super().__init__()
- self.search_anchor = search_anchor
+ # Create variations of the anchor to check
+ self.search_variations = {
+ search_anchor, # decoded (current behavior)
+ }
+
+ # Add the original encoded version if provided
+ if original_encoded_anchor:
+ self.search_variations.add(original_encoded_anchor)
+
+ # Add a re-encoded version if the decoded anchor contains characters
+ # that would be encoded
+ if search_anchor != quote(search_anchor, safe=''):
+ self.search_variations.add(quote(search_anchor, safe=''))
+
self.found = False
def handle_starttag(self, tag: Any, attrs: Any) -> None:
for key, value in attrs:
- if key in {'id', 'name'} and value == self.search_anchor:
+ # Check if the attribute value matches any of our variations
+ if key in {'id', 'name'} and value in self.search_variations:
self.found = True
break
diff --git a/tests/roots/test-linkcheck-encoded-anchors/conf.py b/tests/roots/test-linkcheck-encoded-anchors/conf.py
new file mode 100644
index 00000000000..88442504003
--- /dev/null
+++ b/tests/roots/test-linkcheck-encoded-anchors/conf.py
@@ -0,0 +1,4 @@
+root_doc = 'encoded_anchors'
+exclude_patterns = ['_build']
+linkcheck_anchors = True
+linkcheck_timeout = 0.25
diff --git a/tests/test_builders/test_anchor_check_parser.py b/tests/test_builders/test_anchor_check_parser.py
new file mode 100644
index 00000000000..c58a37158a7
--- /dev/null
+++ b/tests/test_builders/test_anchor_check_parser.py
@@ -0,0 +1,76 @@
+"""Test the AnchorCheckParser class."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from unittest import mock
+
+from sphinx.builders.linkcheck import AnchorCheckParser, contains_anchor
+
+if TYPE_CHECKING:
+ from typing import Any
+
+
+def test_anchor_check_parser_basic() -> None:
+ """Test basic anchor matching functionality."""
+ parser = AnchorCheckParser('test-anchor')
+ parser.feed('
Test
')
+ assert parser.found is True
+
+ parser = AnchorCheckParser('non-existent')
+ parser.feed('Test
')
+ assert parser.found is False
+
+
+def test_anchor_check_parser_with_encoded_anchors() -> None:
+ """Test anchor matching with encoded characters."""
+ # Test with encoded slash
+ parser = AnchorCheckParser(
+ 'standard-input/output-stdio', 'standard-input%2Foutput-stdio'
+ )
+ parser.feed(
+ 'Test
'
+ )
+ assert parser.found is True
+
+ # Test with plus sign
+ parser = AnchorCheckParser('encoded+anchor', 'encoded%2Banchor')
+ parser.feed('Test
')
+ assert parser.found is True
+
+ # Test with space
+ parser = AnchorCheckParser('encoded space', 'encoded%20space')
+ parser.feed('Test
')
+ assert parser.found is True
+
+
+def test_contains_anchor_with_encoded_characters() -> None:
+ """Test the contains_anchor function with encoded characters."""
+ mock_response = mock.MagicMock()
+
+ # Setup a response that returns HTML with encoded anchors
+ def mock_iter_content(chunk_size: Any = None, decode_unicode: Any = None) -> Any:
+ content = 'Test
'
+ yield content
+
+ mock_response.iter_content = mock_iter_content
+
+ # Test with original encoded anchor
+ assert (
+ contains_anchor(
+ mock_response,
+ 'standard-input/output-stdio',
+ 'standard-input%2Foutput-stdio',
+ )
+ is True
+ )
+
+ # Test with decoded anchor only
+ mock_response2 = mock.MagicMock()
+ mock_response2.iter_content = mock_iter_content
+ assert contains_anchor(mock_response2, 'standard-input/output-stdio') is True
+
+ # Test with non-existent anchor
+ mock_response3 = mock.MagicMock()
+ mock_response3.iter_content = mock_iter_content
+ assert contains_anchor(mock_response3, 'non-existent-anchor') is False
diff --git a/tests/test_builders/test_build_linkcheck_encoded_anchors.py b/tests/test_builders/test_build_linkcheck_encoded_anchors.py
new file mode 100644
index 00000000000..596d4e093b4
--- /dev/null
+++ b/tests/test_builders/test_build_linkcheck_encoded_anchors.py
@@ -0,0 +1,169 @@
+"""Test the linkcheck builder's ability to handle encoded anchors."""
+
+from __future__ import annotations
+
+import json
+import re
+from http.server import BaseHTTPRequestHandler
+from typing import TYPE_CHECKING
+
+import pytest
+
+from tests.utils import serve_application
+
+if TYPE_CHECKING:
+ from collections.abc import Iterable
+ from typing import Any
+
+ from sphinx.testing.util import SphinxTestApp
+
+
+class EncodedAnchorsHandler(BaseHTTPRequestHandler):
+ protocol_version = 'HTTP/1.1'
+
+ def _chunk_content(self, content: str, *, max_chunk_size: int) -> Iterable[bytes]:
+ """Split content into chunks of a maximum size."""
+
+ def _encode_chunk(chunk: bytes) -> Iterable[bytes]:
+ """Encode a bytestring into a format suitable for HTTP chunked-transfer."""
+ yield f'{len(chunk):X}'.encode('ascii')
+ yield b'\r\n'
+ yield chunk
+ yield b'\r\n'
+
+ buffer = b''
+ for char in content:
+ buffer += char.encode('utf-8')
+ if len(buffer) >= max_chunk_size:
+ chunk, buffer = buffer[:max_chunk_size], buffer[max_chunk_size:]
+ yield from _encode_chunk(chunk)
+
+ # Flush remaining bytes, if any
+ if buffer:
+ yield from _encode_chunk(buffer)
+
+ # Emit a final empty chunk to close the stream
+ yield from _encode_chunk(b'')
+
+ def _send_chunked(self, content: str) -> bool:
+ """Send content in chunks."""
+ for chunk in self._chunk_content(content, max_chunk_size=20):
+ try:
+ self.wfile.write(chunk)
+ except (BrokenPipeError, ConnectionResetError) as e:
+ self.log_message(str(e))
+ return False
+ return True
+
+ def do_HEAD(self) -> None:
+ """Handle HEAD requests."""
+ print(f'HEAD request for path: {self.path}')
+ if self.path in {'/standard-encoded-anchors', '/various-encoded-chars'}:
+ self.send_response(200, 'OK')
+ else:
+ self.send_response(404, 'Not Found')
+ self.end_headers()
+
+ def do_GET(self) -> None:
+ """Serve test pages with encoded anchors."""
+ if self.path == '/standard-encoded-anchors':
+ self.send_response(200, 'OK')
+ # Note the ID has an encoded forward slash (%2F)
+ content = """
+
+
+ Encoded Anchors Test
+
+
+ Encoded Plus
+
+
+ """
+ elif self.path == '/various-encoded-chars':
+ self.send_response(200, 'OK')
+ content = """
+
+
+ Various Encoded Characters
+
+ Encoded Exclamation
+ Encoded Hash
+ Encoded Percent
+ Encoded Ampersand
+ Encoded Question
+ Encoded At
+
+
+ """
+ else:
+ self.send_response(404, 'Not Found')
+ content = 'not found\n'
+ self.send_header('Transfer-Encoding', 'chunked')
+ self.end_headers()
+ self._send_chunked(content)
+
+
+@pytest.mark.sphinx(
+ 'linkcheck',
+ testroot='linkcheck-encoded-anchors',
+ freshenv=True,
+)
+def test_encoded_anchors_handling(app: SphinxTestApp, tmp_path: Any) -> None:
+ """Test that linkcheck correctly handles URLs with encoded anchors."""
+ with serve_application(app, EncodedAnchorsHandler) as address:
+ # Create test file with encoded anchor links using the server address
+ (app.srcdir / 'encoded_anchors.rst').write_text(
+ f"""
+Encoded Anchors Test
+====================
+
+Links with encoded anchors:
+
+* `Standard I/O `_
+* `Encoded Plus `_
+* `Encoded Exclamation `_
+* `Encoded Hash `_
+* `Encoded Percent `_
+* `Encoded Ampersand `_
+* `Encoded Question `_
+* `Encoded At `_
+""",
+ encoding='utf-8',
+ )
+
+ app.build()
+
+ # Parse the JSON output to check the results
+ content = (app.outdir / 'output.json').read_text(encoding='utf8')
+ data = [json.loads(record) for record in content.splitlines()]
+
+ # Filter for our encoded anchor URLs
+ encoded_anchor_results = [
+ record
+ for record in data
+ if any(
+ x in record['uri']
+ for x in ['standard-encoded-anchors#', 'various-encoded-chars#']
+ )
+ ]
+
+ # All links with encoded anchors should be working
+ assert all(record['status'] == 'working' for record in encoded_anchor_results)
+
+ # Verify specific links
+ uri_pattern = re.compile(
+ f'http://{re.escape(address)}/standard-encoded-anchors#standard-input/output-stdio'
+ )
+ stdio_link = next(
+ record for record in encoded_anchor_results if uri_pattern.match(record['uri'])
+ )
+ assert stdio_link['status'] == 'working'
+
+ # Check for encoded plus link
+ plus_pattern = re.compile(
+ f'http://{re.escape(address)}/standard-encoded-anchors#encoded\\+anchor'
+ )
+ plus_link = next(
+ record for record in encoded_anchor_results if plus_pattern.match(record['uri'])
+ )
+ assert plus_link['status'] == 'working'