Skip to content

Commit 98f3a53

Browse files
author
Eric Nordlund
committed
Fix linkcheck anchor encoding issues
- Enhanced AnchorCheckParser to handle multiple anchor variations
- Added comprehensive test coverage for encoded anchors
- Fixed false 'Anchor not found' errors for URLs with encoded characters
- Maintains full backward compatibility
- All linting checks pass
1 parent 2b7e3ad commit 98f3a53

File tree

5 files changed

+260
-9
lines changed

5 files changed

+260
-9
lines changed

CHANGES.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,5 +38,7 @@ Bugs fixed
3838
* #13528: Add tilde ``~`` prefix support for :rst:role:`py:deco`.
3939
Patch by Shengyu Zhang and Adam Turner.
4040

41+
* linkcheck: Fix false "Anchor not found" errors for valid URLs with percent-encoded characters in fragment identifiers.
42+
4143
Testing
4244
-------

sphinx/builders/linkcheck.py

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from html.parser import HTMLParser
1212
from queue import PriorityQueue, Queue
1313
from threading import Thread
14-
from typing import TYPE_CHECKING, NamedTuple, cast
14+
from typing import TYPE_CHECKING, Any, NamedTuple, cast
1515
from urllib.parse import quote, unquote, urlparse, urlsplit, urlunparse
1616

1717
from docutils import nodes
@@ -485,6 +485,7 @@ def _retrieval_methods(
485485

486486
def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
487487
req_url, delimiter, anchor = uri.partition('#')
488+
original_encoded_anchor = anchor # Store the original encoded anchor before decoding
488489
if delimiter and anchor:
489490
for rex in self.anchors_ignore:
490491
if rex.match(anchor):
@@ -536,7 +537,7 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
536537
) as response:
537538
if anchor and self.check_anchors and response.ok:
538539
try:
539-
found = contains_anchor(response, anchor)
540+
found = contains_anchor(response, anchor, original_encoded_anchor)
540541
except UnicodeDecodeError:
541542
return (
542543
_Status.IGNORED,
@@ -686,11 +687,13 @@ def _get_request_headers(
686687
return {}
687688

688689

689-
def contains_anchor(response: Response, anchor: str) -> bool:
690+
def contains_anchor(
691+
response: Response, anchor: str, original_encoded_anchor: str = ''
692+
) -> bool:
690693
"""Determine if an anchor is contained within an HTTP response."""
691-
parser = AnchorCheckParser(anchor)
692-
# Read file in chunks. If we find a matching anchor, we break
693-
# the loop early in hopes not to have to download the whole thing.
694+
parser = AnchorCheckParser(anchor, original_encoded_anchor)
695+
# Read file in chunks. If we find a matching anchor, we break the loop early
696+
# to avoid downloading the entire response body.
694697
for chunk in response.iter_content(chunk_size=4096, decode_unicode=True):
695698
if isinstance(chunk, bytes):
696699
# requests failed to decode, manually try to decode it
@@ -706,15 +709,37 @@ def contains_anchor(response: Response, anchor: str) -> bool:
706709
class AnchorCheckParser(HTMLParser):
    """Specialised HTML parser that looks for a specific anchor.

    ``found`` becomes ``True`` once a start tag carries an ``id`` or ``name``
    attribute equal to one of the accepted spellings of the anchor.
    """

    def __init__(self, search_anchor: str, original_encoded_anchor: str = '') -> None:
        """Prepare the set of anchor spellings that count as a match.

        :param search_anchor: The decoded anchor to search for,
            e.g. ``standard-input/output-stdio``.
        :param original_encoded_anchor: The anchor exactly as it was written
            in the URL, e.g. ``standard-input%2Foutput-stdio``.  The default
            empty string contributes no additional spelling.
        """
        super().__init__()

        # This attribute existed before the anchor variations were added;
        # keep it so code that reads ``parser.search_anchor`` still works.
        self.search_anchor = search_anchor

        # Accepted spellings: the decoded anchor and its fully percent-encoded
        # form (``safe=''`` so that ``/`` is escaped as well).  When nothing
        # needs escaping both are equal and the set collapses to one entry.
        self.search_variations = {search_anchor, quote(search_anchor, safe='')}
        if original_encoded_anchor:
            # The author's own spelling may differ from our re-encoding,
            # for instance in the case of the hex digits.
            self.search_variations.add(original_encoded_anchor)

        self.found = False

    def handle_starttag(self, tag: Any, attrs: Any) -> None:
        """Mark the anchor as found if *attrs* holds a matching id or name."""
        if any(
            key in {'id', 'name'} and value in self.search_variations
            for key, value in attrs
        ):
            self.found = True
720745

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
root_doc = 'encoded_anchors'
2+
exclude_patterns = ['_build']
3+
linkcheck_anchors = True
4+
linkcheck_timeout = 0.25
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
"""Test the AnchorCheckParser class."""
2+
from __future__ import annotations
3+
4+
from typing import TYPE_CHECKING
5+
from unittest import mock
6+
7+
from sphinx.builders.linkcheck import AnchorCheckParser, contains_anchor
8+
9+
if TYPE_CHECKING:
10+
from typing import Any
11+
12+
13+
def test_anchor_check_parser_basic() -> None:
    """A plain anchor is found only when an element carries that exact id."""
    document = '<html><body><div id="test-anchor">Test</div></body></html>'

    present = AnchorCheckParser('test-anchor')
    present.feed(document)
    assert present.found is True

    absent = AnchorCheckParser('non-existent')
    absent.feed(document)
    assert absent.found is False
22+
23+
24+
def test_anchor_check_parser_with_encoded_anchors() -> None:
    """Ids written with percent-escapes match when both spellings are given."""
    # (decoded anchor, anchor as written in the URL and in the page id)
    cases = (
        ('standard-input/output-stdio', 'standard-input%2Foutput-stdio'),  # slash
        ('encoded+anchor', 'encoded%2Banchor'),  # plus sign
        ('encoded space', 'encoded%20space'),  # space
    )
    for decoded, encoded in cases:
        parser = AnchorCheckParser(decoded, encoded)
        parser.feed(f'<html><body><div id="{encoded}">Test</div></body></html>')
        assert parser.found is True
40+
41+
42+
def test_contains_anchor_with_encoded_characters() -> None:
    """Exercise ``contains_anchor`` against a page whose id is percent-encoded."""
    page = '<html><body><div id="standard-input%2Foutput-stdio">Test</div></body></html>'

    def fake_iter_content(chunk_size: Any = None, decode_unicode: Any = None) -> Any:
        # The whole page arrives as a single, already decoded chunk.
        yield page

    def new_response() -> Any:
        # Every check gets its own mock so that no state is shared.
        response = mock.MagicMock()
        response.iter_content = fake_iter_content
        return response

    # Decoded anchor together with the original encoded spelling
    found_with_original = contains_anchor(
        new_response(), 'standard-input/output-stdio', 'standard-input%2Foutput-stdio'
    )
    assert found_with_original is True

    # Decoded anchor on its own
    found_decoded_only = contains_anchor(new_response(), 'standard-input/output-stdio')
    assert found_decoded_only is True

    # An anchor the page does not define
    found_missing = contains_anchor(new_response(), 'non-existent-anchor')
    assert found_missing is False
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
"""Test the linkcheck builder's ability to handle encoded anchors."""
2+
3+
from __future__ import annotations
4+
5+
import json
6+
import re
7+
from http.server import BaseHTTPRequestHandler
8+
from typing import TYPE_CHECKING
9+
10+
import pytest
11+
12+
from tests.utils import serve_application
13+
14+
if TYPE_CHECKING:
15+
from collections.abc import Iterable
16+
from typing import Any
17+
18+
from sphinx.testing.util import SphinxTestApp
19+
20+
21+
class EncodedAnchorsHandler(BaseHTTPRequestHandler):
    """Serve small HTML pages whose element ids contain percent-escapes.

    GET responses are sent with chunked transfer encoding in chunks of at
    most 20 bytes.  Unknown paths are answered with 404.
    """

    protocol_version = 'HTTP/1.1'

    # Request path -> HTML body.  ``do_HEAD`` and ``do_GET`` both consult this
    # table so they always agree on which paths exist.
    _PAGES = {
        # Note the first id has an encoded forward slash (%2F)
        '/standard-encoded-anchors': """
            <!DOCTYPE html>
            <html>
            <head><title>Encoded Anchors Test</title></head>
            <body>
            <h1 id="standard-input%2Foutput-stdio">Standard I/O</h1>
            <h2 id="encoded%2Banchor">Encoded Plus</h2>
            </body>
            </html>
            """,
        '/various-encoded-chars': """
            <!DOCTYPE html>
            <html>
            <head><title>Various Encoded Characters</title></head>
            <body>
            <h1 id="encoded%21exclamation">Encoded Exclamation</h1>
            <h2 id="encoded%23hash">Encoded Hash</h2>
            <h3 id="encoded%25percent">Encoded Percent</h3>
            <h4 id="encoded%26ampersand">Encoded Ampersand</h4>
            <h5 id="encoded%3Fquestion">Encoded Question</h5>
            <h6 id="encoded%40at">Encoded At</h6>
            </body>
            </html>
            """,
    }

    def _chunk_content(self, content: str, *, max_chunk_size: int) -> Iterable[bytes]:
        """Yield *content* framed for HTTP chunked transfer encoding.

        The text is UTF-8 encoded and cut into pieces of at most
        *max_chunk_size* bytes.  Each piece is emitted as four byte strings
        (hexadecimal size, CRLF, payload, CRLF), followed by the zero-length
        chunk that terminates the stream.

        :raises ValueError: if *max_chunk_size* is smaller than 1.
        """
        if max_chunk_size < 1:
            msg = 'max_chunk_size must be at least 1'
            raise ValueError(msg)

        def _encode_chunk(chunk: bytes) -> Iterable[bytes]:
            """Encode a bytestring into a format suitable for HTTP chunked-transfer."""
            yield f'{len(chunk):X}'.encode('ascii')
            yield b'\r\n'
            yield chunk
            yield b'\r\n'

        # Encode once and slice, so that no chunk can exceed the limit even
        # when a multi-byte character straddles a boundary.
        payload = content.encode('utf-8')
        for start in range(0, len(payload), max_chunk_size):
            yield from _encode_chunk(payload[start : start + max_chunk_size])

        # Emit a final empty chunk to close the stream
        yield from _encode_chunk(b'')

    def _send_chunked(self, content: str) -> bool:
        """Write *content* to the client in chunks.

        :returns: ``False`` if the client dropped the connection while
            writing (the error is logged), ``True`` otherwise.
        """
        for chunk in self._chunk_content(content, max_chunk_size=20):
            try:
                self.wfile.write(chunk)
            except (BrokenPipeError, ConnectionResetError) as e:
                self.log_message(str(e))
                return False
        return True

    def do_HEAD(self) -> None:
        """Answer 200 for a known page and 404 for anything else."""
        if self.path in self._PAGES:
            self.send_response(200, 'OK')
        else:
            self.send_response(404, 'Not Found')
        self.end_headers()

    def do_GET(self) -> None:
        """Serve test pages with encoded anchors."""
        content = self._PAGES.get(self.path)
        if content is None:
            self.send_response(404, 'Not Found')
            content = 'not found\n'
        else:
            self.send_response(200, 'OK')
        self.send_header('Transfer-Encoding', 'chunked')
        self.end_headers()
        self._send_chunked(content)
103+
104+
105+
@pytest.mark.sphinx(
    'linkcheck',
    testroot='linkcheck-encoded-anchors',
    freshenv=True,
)
def test_encoded_anchors_handling(app: SphinxTestApp, tmp_path: Any) -> None:
    """Test that linkcheck correctly handles URLs with encoded anchors.

    The served pages define ids in percent-encoded form while the document
    links to them with the characters written literally; every such link is
    expected to be reported as ``working``.
    """
    with serve_application(app, EncodedAnchorsHandler) as address:
        # The server address is only known at run time, so the source
        # document is written here instead of living in the test root.
        (app.srcdir / 'encoded_anchors.rst').write_text(f"""
Encoded Anchors Test
====================

Links with encoded anchors:

* `Standard I/O <http://{address}/standard-encoded-anchors#standard-input/output-stdio>`_
* `Encoded Plus <http://{address}/standard-encoded-anchors#encoded+anchor>`_
* `Encoded Exclamation <http://{address}/various-encoded-chars#encoded!exclamation>`_
* `Encoded Hash <http://{address}/various-encoded-chars#encoded#hash>`_
* `Encoded Percent <http://{address}/various-encoded-chars#encoded%percent>`_
* `Encoded Ampersand <http://{address}/various-encoded-chars#encoded&ampersand>`_
* `Encoded Question <http://{address}/various-encoded-chars#encoded?question>`_
* `Encoded At <http://{address}/various-encoded-chars#encoded@at>`_
""", encoding='utf-8')

        app.build()

    # Parse the JSON output (one record per line) to check the results
    content = (app.outdir / 'output.json').read_text(encoding='utf8')
    data = [json.loads(record) for record in content.splitlines()]

    # Filter for our encoded anchor URLs
    encoded_anchor_results = [
        record for record in data
        if any(x in record['uri'] for x in
               ['standard-encoded-anchors#', 'various-encoded-chars#'])
    ]

    # Guard against a vacuous pass: with no records, "all working" is
    # trivially true and the test would prove nothing.
    assert encoded_anchor_results, 'linkcheck reported no encoded-anchor links'

    # All links with encoded anchors should be working
    not_working = [
        record['uri'] for record in encoded_anchor_results
        if record['status'] != 'working'
    ]
    assert not not_working

    # Verify specific links.  ``re.escape`` covers the whole literal so that
    # regex metacharacters such as ``+`` need no hand-written escaping.
    base_url = f'http://{address}/standard-encoded-anchors'
    for fragment in ('#standard-input/output-stdio', '#encoded+anchor'):
        pattern = re.compile(re.escape(base_url + fragment))
        link = next(record for record in encoded_anchor_results
                    if pattern.match(record['uri']))
        assert link['status'] == 'working'

0 commit comments

Comments
 (0)