Skip to content

Commit 9f13923

Browse files
authored
Handle invalid Content-Type headers (#76)
When checking the `Content-Type` header (if provided) for HTML-related diffs (e.g. `html_render` and `links`), we previously treated malformed header values (e.g. `Content-Type: #<mime::nulltype:0x007f2a523499b8>`) as non-HTML content and would bail out before diffing. This wasn't intentional -- we just didn't bother to consider a completely invalid header value in the logic. Since bad values *do* occasionally occur in the wild, however, we need to handle them. This change does so by treating invalid values as if the header were not set at all. Fixes #75.
1 parent da45a98 commit 9f13923

File tree

3 files changed

+37
-1
lines changed

3 files changed

+37
-1
lines changed

docs/source/release-history.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22
Release History
33
===============
44

5+
In Development
6+
--------------
7+
8+
- Ignore invalid ``Content-Type`` headers when diffing HTML. (:issue:`75`)
9+
10+
511
Version 0.1.2 (2021-04-01)
612
-----------------------------
713

web_monitoring_diff/content_type.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,14 @@
4141
r'text/.+'
4242
)))
4343

44+
# Roughly checks whether a Content-Type header value is valid. For syntax, see:
45+
# - https://datatracker.ietf.org/doc/html/rfc2045#section-5.1
46+
# - https://datatracker.ietf.org/doc/html/rfc6838#section-4.2
47+
VALID_CONTENT_TYPE_PATTERN = re.compile(
48+
r'^[a-z0-9][a-z0-9!#$&^_.+-]*/[a-z0-9][a-z0-9!#$&^_.+-]*$',
49+
re.IGNORECASE
50+
)
51+
4452

4553
def is_not_html(text, headers=None, check_options='normal'):
4654
"""
@@ -63,7 +71,7 @@ def is_not_html(text, headers=None, check_options='normal'):
6371
"""
6472
if headers and (check_options == 'normal' or check_options == 'nosniff'):
6573
content_type = headers.get('Content-Type', '').split(';', 1)[0].strip()
66-
if content_type:
74+
if content_type and VALID_CONTENT_TYPE_PATTERN.match(content_type):
6775
if content_type in ACCEPTABLE_CONTENT_TYPES:
6876
return False
6977
elif not UNKNOWN_CONTENT_TYPE_PATTERN.match(content_type):

web_monitoring_diff/tests/test_html_diff_validity.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,28 @@ def test_html_diff_render_should_check_content_type_header():
175175
a_headers={'Content-Type': 'text/html'},
176176
b_headers={'Content-Type': 'image/jpeg'})
177177

178+
with pytest.raises(UndiffableContentError):
179+
html_diff_render(
180+
'<p>Just a little HTML</p>',
181+
'Some other text',
182+
a_headers={'Content-Type': 'image/jpeg'},
183+
b_headers={'Content-Type': 'text/html'})
184+
185+
with pytest.raises(UndiffableContentError):
186+
html_diff_render(
187+
'<p>Just a little HTML</p>',
188+
'Some other text',
189+
a_headers={'Content-Type': 'image/jpeg'},
190+
b_headers={'Content-Type': 'image/jpeg'})
191+
192+
193+
def test_html_diff_render_should_not_check_content_type_header_if_header_is_malformed():
194+
html_diff_render(
195+
'<p>Just a little HTML</p>',
196+
'<p>Just some HTML</p>',
197+
a_headers={'Content-Type': '#<mime::nulltype:0x007f2a523499b8>'},
198+
b_headers={'Content-Type': 'text/html'})
199+
178200

179201
def test_html_diff_render_should_not_check_content_type_header_if_content_type_options_is_nocheck():
180202
html_diff_render(

0 commit comments

Comments
 (0)