Skip to content

Commit 7f1f281

Browse files
committed
Check domain syntax after normalization to internationalized characters as a precaution
Out of caution that normalizing the domain part to internationalized characters could turn a valid domain string into an invalid one, the normalized domain is re-parsed at the end to ensure that it is still accepted by the idna package. I could not find any examples that were not already caught, however, since it seems that the existing IDNA calls already prevent this. Some tests are added for characters in the domain part which become invalid after Unicode NFC normalization. These were already handled. (The new code never raises an exception.) See #142.
1 parent c23c0d6 commit 7f1f281

File tree

2 files changed

+29
-6
lines changed

2 files changed

+29
-6
lines changed

email_validator/syntax.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,7 @@ class DomainNameValidationResult(TypedDict):
446446
def validate_email_domain_name(domain: str, test_environment: bool = False, globally_deliverable: bool = True) -> DomainNameValidationResult:
447447
"""Validates the syntax of the domain part of an email address."""
448448

449-
# Check for invalid characters before normalization.
449+
# Check for invalid characters.
450450
# (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses)
451451
bad_chars = {
452452
safe_character_display(c)
@@ -466,8 +466,9 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob
466466
# and converting all label separators (the period/full stop, fullwidth full stop,
467467
# ideographic full stop, and halfwidth ideographic full stop) to regular dots.
468468
# It will also raise an exception if there is an invalid character in the input,
469-
# such as "⒈" which is invalid because it would expand to include a dot.
470-
# Since several characters are normalized to a dot, this has to come before
469+
# such as "⒈", which is invalid because it would expand to include a dot, and
470+
# U+1FEF, which normalizes to a backtick, which is not an allowed hostname character.
471+
# Since several characters *are* normalized to a dot, this has to come before
471472
# checks related to dots, like check_dot_atom which comes next.
472473
original_domain = domain
473474
try:
@@ -577,14 +578,23 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob
577578
# but not be actual IDNA. For ASCII-only domains, the conversion out
578579
# of IDNA just gives the same thing back.
579580
#
580-
# This gives us the canonical internationalized form of the domain.
581+
# This gives us the canonical internationalized form of the domain,
582+
# which we return to the caller as a part of the normalized email
583+
# address.
581584
try:
582585
domain_i18n = idna.decode(ascii_domain.encode('ascii'))
583586
except idna.IDNAError as e:
584587
raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).") from e
585588

586-
# Check for invalid characters after normalization. These
587-
# should never arise. See the similar checks above.
589+
# Check that this normalized domain name has not somehow become
590+
# an invalid domain name. All of the checks before this point
591+
# using the idna package probably guarantee that we now have
592+
# a valid international domain name in most respects. But it
593+
# doesn't hurt to re-apply some tests to be sure. See the similar
594+
# tests above.
595+
596+
# Check for invalid and unsafe characters. We have no test
597+
# case for this.
588598
bad_chars = {
589599
safe_character_display(c)
590600
for c in domain
@@ -594,6 +604,13 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob
594604
raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")
595605
check_unsafe_chars(domain)
596606

607+
# Check that it can be encoded back to IDNA ASCII. We have no test
608+
# case for this.
609+
try:
610+
idna.encode(domain_i18n)
611+
except idna.IDNAError as e:
612+
raise EmailSyntaxError(f"The part after the @-sign became invalid after normalizing to international characters ({e}).") from e
613+
597614
# Return the IDNA ASCII-encoded form of the domain, which is how it
598615
# would be transmitted on the wire (except when used with SMTPUTF8
599616
# possibly), as well as the canonical Unicode form of the domain,

tests/test_syntax.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,12 @@ def test_domain_literal() -> None:
392392
('me@⒈wouldbeinvalid.com',
393393
"The part after the @-sign contains invalid characters (Codepoint U+2488 not allowed "
394394
"at position 1 in '⒈wouldbeinvalid.com')."),
395+
('me@\u037e.com',
396+
"The part after the @-sign is invalid (Codepoint U+003B at position 1 "
397+
"of ';' not allowed)."),
398+
('me@\u1fef.com',
399+
"The part after the @-sign is invalid (Codepoint U+0060 at position 1 "
400+
"of '`' not allowed)."),
395401
('@example.com', 'There must be something before the @-sign.'),
396402
('white space@test', 'The email address contains invalid characters before the @-sign: SPACE.'),
397403
('test@white space', 'The part after the @-sign contains invalid characters: SPACE.'),

0 commit comments

Comments
 (0)