Skip to content

Commit 00902f8

Browse files
committed
Rearrange the local part syntax checks to put the more likely success conditions first
1 parent d6a5d4b commit 00902f8

File tree

2 files changed

+32
-28
lines changed

2 files changed

+32
-28
lines changed

email_validator/rfc_constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
# characters are permitted in email addresses (not taking into
77
# account internationalization):
88
ATEXT = r'a-zA-Z0-9_!#\$%&\'\*\+\-/=\?\^`\{\|\}~'
9+
ATEXT_RE = re.compile('[.' + ATEXT + ']') # ATEXT plus dots
910

1011
# A "dot atom text", per RFC 2822 3.2.4:
1112
DOT_ATOM_TEXT = re.compile('[' + ATEXT + ']+(?:\\.[' + ATEXT + r']+)*\Z')
@@ -15,6 +16,7 @@
1516
# RFC3629 section 4, which appear to be the Unicode code points from
1617
# U+0080 to U+10FFFF.
1718
ATEXT_INTL = ATEXT + u"\u0080-\U0010FFFF"
19+
ATEXT_INTL_RE = re.compile('[.' + ATEXT_INTL + ']') # ATEXT_INTL plus dots
1820
DOT_ATOM_TEXT_INTL = re.compile('[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + r']+)*\Z')
1921

2022
# The domain part of the email address, after IDNA (ASCII) encoding,

email_validator/syntax.py

Lines changed: 30 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .exceptions_types import EmailSyntaxError
22
from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \
3-
DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT, ATEXT_INTL, ATEXT_HOSTNAME_INTL, DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX
3+
DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX
44

55
import re
66
import unicodedata
@@ -57,44 +57,25 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
5757
reason = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH)
5858
raise EmailSyntaxError("The email address is too long before the @-sign {}.".format(reason))
5959

60-
# Check for invalid characters.
61-
# (RFC 2822 Section 3.2.4 / RFC 5322 Section 3.2.3, plus RFC 6531 section 3.3
62-
# if internationalized local parts are allowed)
63-
atext_re = re.compile('[.' + (ATEXT if not allow_smtputf8 else ATEXT_INTL) + ']')
64-
bad_chars = set(
65-
safe_character_display(c)
66-
for c in local
67-
if not atext_re.match(c)
68-
)
69-
if bad_chars:
70-
raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".")
71-
72-
# Check for dot errors imposed by the dot-atom rule.
73-
# (RFC 2822 3.2.4)
74-
check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False)
75-
7660
# Check the local part against the non-internationalized regular expression.
61+
# Most email addresses match this regex so it's probably fastest to check this first.
7762
# (RFC 2822 3.2.4)
7863
m = DOT_ATOM_TEXT.match(local)
7964
if m:
65+
# It's valid.
66+
8067
# Return the local part unchanged and flag that SMTPUTF8 is not needed.
8168
return {
8269
"local_part": local,
8370
"ascii_local_part": local,
8471
"smtputf8": False,
8572
}
8673

87-
else:
88-
# The local part failed the ASCII check. Now try the extended internationalized requirements.
89-
# This should already be handled by the bad_chars and check_dot_atom tests above.
90-
# It's the same pattern but with additional characters permitted.
91-
m = DOT_ATOM_TEXT_INTL.match(local)
92-
if not m:
93-
raise EmailSyntaxError("The email address contains invalid characters before the @-sign.")
94-
# It would be valid if internationalized characters were allowed by the caller.
95-
if not allow_smtputf8:
96-
raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.")
97-
74+
# The local part failed the ASCII check. Try the extended character set
75+
# for internationalized addresses. It's the same pattern but with additional
76+
# characters permitted.
77+
m = DOT_ATOM_TEXT_INTL.match(local)
78+
if m and allow_smtputf8:
9879
# It's valid.
9980

10081
# RFC 6532 section 3.1 also says that Unicode NFC normalization should be applied,
@@ -122,6 +103,27 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
122103
"smtputf8": True,
123104
}
124105

106+
# It's not a valid local part, either non-internationalized or internationalized.
107+
# Let's find out why.
108+
109+
# Check for invalid characters.
110+
# (RFC 2822 Section 3.2.4 / RFC 5322 Section 3.2.3, plus RFC 6531 section 3.3)
111+
bad_chars = set(
112+
safe_character_display(c)
113+
for c in local
114+
if not (ATEXT_INTL_RE if allow_smtputf8 else ATEXT_RE).match(c)
115+
)
116+
if bad_chars:
117+
raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".")
118+
119+
# Check for dot errors imposed by the dot-atom rule.
120+
# (RFC 2822 3.2.4)
121+
check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False)
122+
123+
# All of the reasons should already have been checked, but just in case
124+
# we have a fallback message.
125+
raise EmailSyntaxError("The email address contains invalid characters before the @-sign.")
126+
125127

126128
def check_unsafe_chars(s):
127129
# Check for unsafe characters or characters that would make the string

0 commit comments

Comments
 (0)