Skip to content

Commit ea71dfa

Browse files
committed
Implement domain name length checks without relying on the IDNA package
1 parent 09012db commit ea71dfa

File tree

3 files changed

+62
-54
lines changed

3 files changed

+62
-54
lines changed

email_validator/rfc_constants.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,12 @@
2020
# the allowed characters of hostnames further. The hyphen cannot be at
2121
# the beginning or end of a *dot-atom component* of a hostname either.
2222
ATEXT_HOSTNAME = r'(?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]*)?[a-zA-Z0-9])'
23+
DOT_ATOM_TEXT_HOSTNAME = ATEXT_HOSTNAME + r'(?:\.' + ATEXT_HOSTNAME + r')*'
2324

2425
# Length constants
2526
# RFC 3696 + errata 1003 + errata 1690 (https://www.rfc-editor.org/errata_search.php?rfc=3696&eid=1690)
2627
# explains the maximum length of an email address is 254 octets.
2728
EMAIL_MAX_LENGTH = 254
2829
LOCAL_PART_MAX_LENGTH = 64
29-
DOMAIN_MAX_LENGTH = 255
30+
DNS_LABEL_LENGTH_LIMIT = 63 # RFC 1035 2.3.1
31+
DOMAIN_MAX_LENGTH = 255 # RFC 1035 2.3.4

email_validator/syntax.py

Lines changed: 56 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .exceptions_types import EmailSyntaxError
22
from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \
3-
DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT, ATEXT_HOSTNAME, ATEXT_INTL
3+
DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT, ATEXT_INTL, DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME
44

55
import re
66
import unicodedata
@@ -141,57 +141,52 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
141141
if ".." in domain:
142142
raise EmailSyntaxError("An email address cannot have two periods in a row.")
143143

144-
# Regardless of whether international characters are actually used,
145-
# first convert to IDNA ASCII. For ASCII-only domains, the transformation
146-
# does nothing. If internationalized characters are present, the MTA
147-
# must either support SMTPUTF8 or the mail client must convert the
148-
# domain name to IDNA before submission.
149-
#
150-
# Unfortunately this step incorrectly 'fixes' domain names with leading
151-
# periods by removing them, so we have to check for this above. It also gives
152-
# a funky error message ("No input") when there are two periods in a
153-
# row, also checked separately above.
154-
try:
155-
ascii_domain = idna.encode(domain, uts46=False).decode("ascii")
156-
except idna.IDNAError as e:
157-
if "Domain too long" in str(e):
158-
# We can't really be more specific because UTS-46 normalization means
159-
# the length check is applied to a string that is different from the
160-
# one the user supplied. Also I'm not sure if the length check applies
161-
# to the internationalized form, the IDNA ASCII form, or even both!
162-
raise EmailSyntaxError("The email address is too long after the @-sign.")
163-
raise EmailSyntaxError("The domain name %s contains invalid characters (%s)." % (domain, str(e)))
164-
165-
# We may have been given an IDNA ASCII domain to begin with. Check
166-
# that the domain actually conforms to IDNA. It could look like IDNA
167-
# but not be actual IDNA. For ASCII-only domains, the conversion out
168-
# of IDNA just gives the same thing back.
169-
#
170-
# This gives us the canonical internationalized form of the domain,
171-
# which we should use in all error messages.
172-
try:
173-
domain_i18n = idna.decode(ascii_domain.encode('ascii'))
174-
except idna.IDNAError as e:
175-
raise EmailSyntaxError("The domain name %s is not valid IDNA (%s)." % (ascii_domain, str(e)))
144+
if re.match(DOT_ATOM_TEXT_HOSTNAME + "\\Z", domain):
145+
ascii_domain = domain
146+
else:
147+
# If international characters are present in the domain name, convert
148+
# the domain to IDNA ASCII. If internationalized characters are present,
149+
# the MTA must either support SMTPUTF8 or the mail client must convert the
150+
# domain name to IDNA before submission.
151+
#
152+
# Unfortunately this step incorrectly 'fixes' domain names with leading
153+
# periods by removing them, so we have to check for this above. It also gives
154+
# a funky error message ("No input") when there are two periods in a
155+
# row, also checked separately above.
156+
#
157+
# For ASCII-only domains, the transformation does nothing and is safe to
158+
# apply. However, to ensure we don't rely on the idna library for basic
159+
# syntax checks, we don't use it if it's not needed.
160+
try:
161+
ascii_domain = idna.encode(domain, uts46=False).decode("ascii")
162+
except idna.IDNAError as e:
163+
if "Domain too long" in str(e):
164+
# We can't really be more specific because UTS-46 normalization means
165+
# the length check is applied to a string that is different from the
166+
# one the user supplied. Also I'm not sure if the length check applies
167+
# to the internationalized form, the IDNA ASCII form, or even both!
168+
raise EmailSyntaxError("The email address is too long after the @-sign.")
169+
raise EmailSyntaxError("The domain name %s contains invalid characters (%s)." % (domain, str(e)))
170+
171+
# Check the syntax of the string returned by idna.encode.
172+
# This check should never fail.
173+
m = re.match(DOT_ATOM_TEXT_HOSTNAME + "\\Z", ascii_domain)
174+
if not m:
175+
raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.")
176176

177177
# RFC 5321 4.5.3.1.2
178178
# We're checking the number of bytes (octets) here, which can be much
179179
# higher than the number of characters in internationalized domains,
180180
# on the assumption that the domain may be transmitted without SMTPUTF8
181-
# as IDNA ASCII. This is also checked by idna.encode, so this exception
182-
# is never reached.
181+
# as IDNA ASCII. (This is also checked by idna.encode, so this exception
182+
# is never reached for internationalized domains.)
183183
if len(ascii_domain) > DOMAIN_MAX_LENGTH:
184-
raise EmailSyntaxError("The email address is too long after the @-sign.")
185-
186-
# A "dot atom text", per RFC 2822 3.2.4, but using the restricted
187-
# characters allowed in a hostname (see ATEXT_HOSTNAME above).
188-
DOT_ATOM_TEXT = ATEXT_HOSTNAME + r'(?:\.' + ATEXT_HOSTNAME + r')*'
189-
190-
# Check the regular expression. This is probably entirely redundant
191-
# with idna.decode, which also checks this format.
192-
m = re.match(DOT_ATOM_TEXT + "\\Z", ascii_domain)
193-
if not m:
194-
raise EmailSyntaxError("The email address contains invalid characters after the @-sign.")
184+
reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH)
185+
raise EmailSyntaxError("The email address is too long after the @-sign {}.".format(reason))
186+
for label in ascii_domain.split("."):
187+
if len(label) > DNS_LABEL_LENGTH_LIMIT:
188+
reason = get_length_reason(label, limit=DNS_LABEL_LENGTH_LIMIT)
189+
raise EmailSyntaxError("The part of the email address \"{}\" is too long {}.".format(label, reason))
195190

196191
if globally_deliverable:
197192
# All publicly deliverable addresses have domain names with at least
@@ -200,13 +195,11 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
200195
# like. We'll skip this in test environments to allow '@test' email
201196
# addresses.
202197
if "." not in ascii_domain and not (ascii_domain == "test" and test_environment):
203-
raise EmailSyntaxError("The domain name %s is not valid. It should have a period." % domain_i18n)
198+
raise EmailSyntaxError("The part after the @-sign is not valid. It should have a period.")
204199

205200
# We also know that all TLDs currently end with a letter.
206201
if not re.search(r"[A-Za-z]\Z", ascii_domain):
207-
raise EmailSyntaxError(
208-
"The domain name %s is not valid. It is not within a valid top-level domain." % domain_i18n
209-
)
202+
raise EmailSyntaxError("The part after the @-sign is not valid. It is not within a valid top-level domain.")
210203

211204
# Check special-use and reserved domain names.
212205
# Some might fail DNS-based deliverability checks, but that
@@ -218,7 +211,19 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
218211
continue
219212

220213
if ascii_domain == d or ascii_domain.endswith("." + d):
221-
raise EmailSyntaxError("The domain name %s is a special-use or reserved name that cannot be used with email." % domain_i18n)
214+
raise EmailSyntaxError("The part after the @-sign is a special-use or reserved name that cannot be used with email.")
215+
216+
# We may have been given an IDNA ASCII domain to begin with. Check
217+
# that the domain actually conforms to IDNA. It could look like IDNA
218+
# but not be actual IDNA. For ASCII-only domains, the conversion out
219+
# of IDNA just gives the same thing back.
220+
#
221+
# This gives us the canonical internationalized form of the domain,
222+
# which we should use in any error messages raised from this point on.
223+
try:
224+
domain_i18n = idna.decode(ascii_domain.encode('ascii'))
225+
except idna.IDNAError as e:
226+
raise EmailSyntaxError("The domain name %s is not valid IDNA (%s)." % (ascii_domain, str(e)))
222227

223228
# Return the IDNA ASCII-encoded form of the domain, which is how it
224229
# would be transmitted on the wire (except when used with SMTPUTF8

tests/test_main.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ def test_email_valid(email_input, output):
212212
@pytest.mark.parametrize(
213213
'email_input,error_msg',
214214
[
215-
('my@localhost', 'The domain name localhost is not valid. It should have a period.'),
215+
('my@localhost', 'The part after the @-sign is not valid. It should have a period.'),
216216
('my@.leadingdot.com', 'An email address cannot have a period immediately after the @-sign.'),
217217
('my@..leadingfwdot.com', 'An email address cannot have a period immediately after the @-sign.'),
218218
('my@..twodots.com', 'An email address cannot have a period immediately after the @-sign.'),
@@ -241,7 +241,8 @@ def test_email_valid(email_input, output):
241241
('my\n@example.com', 'The email address contains invalid characters before the @-sign: \'\\n\'.'),
242242
('11111111112222222222333333333344444444445555555555666666666677777@example.com', 'The email address is too long before the @-sign (1 character too many).'),
243243
('111111111122222222223333333333444444444455555555556666666666777777@example.com', 'The email address is too long before the @-sign (2 characters too many).'),
244-
('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444444444455555555556.com', 'The email address is too long after the @-sign.'),
244+
('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.111111111122222222223333333333444444444455555555556.com', 'The email address is too long (4 characters too many).'),
245+
('me@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555566.com', 'The email address is too long after the @-sign (1 character too many).'),
245246
('my.long.address@1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333333344444.info', 'The email address is too long (2 characters too many).'),
246247
('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.11111111112222222222333333.info', 'The email address is too long (when converted to IDNA ASCII).'),
247248
('my.long.address@λ111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444444444555555555.6666666666777777777788888888889999999999000000000.1111111111222222222233333333334444.info', 'The email address is too long (at least 1 character too many).'),

0 commit comments

Comments
 (0)