Skip to content

Commit 75c2136

Browse files
committed
Compile some of the regexes at the module level
1 parent ea71dfa commit 75c2136

File tree

2 files changed: 12 additions and 9 deletions.

email_validator/rfc_constants.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,29 @@
11
# These constants are defined by the email specifications.
22

3+
import re
4+
35
# Based on RFC 2822 section 3.2.4 / RFC 5322 section 3.2.3, these
46
# characters are permitted in email addresses (not taking into
57
# account internationalization):
68
ATEXT = r'a-zA-Z0-9_!#\$%&\'\*\+\-/=\?\^`\{\|\}~'
79

810
# A "dot atom text", per RFC 2822 3.2.4:
9-
DOT_ATOM_TEXT = '[' + ATEXT + ']+(?:\\.[' + ATEXT + ']+)*'
11+
DOT_ATOM_TEXT = re.compile('[' + ATEXT + ']+(?:\\.[' + ATEXT + r']+)*\Z')
1012

1113
# RFC 6531 section 3.3 extends the allowed characters in internationalized
1214
# addresses to also include three specific ranges of UTF8 defined in
1315
# RFC3629 section 4, which appear to be the Unicode code points from
1416
# U+0080 to U+10FFFF.
1517
ATEXT_INTL = ATEXT + u"\u0080-\U0010FFFF"
16-
DOT_ATOM_TEXT_INTL = '[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + ']+)*'
18+
DOT_ATOM_TEXT_INTL = re.compile('[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + r']+)*\Z')
1719

1820
# The domain part of the email address, after IDNA (ASCII) encoding,
1921
# must also satisfy the requirements of RFC 952/RFC 1123 which restrict
2022
# the allowed characters of hostnames further. The hyphen cannot be at
2123
# the beginning or end of a *dot-atom component* of a hostname either.
2224
ATEXT_HOSTNAME = r'(?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]*)?[a-zA-Z0-9])'
23-
DOT_ATOM_TEXT_HOSTNAME = ATEXT_HOSTNAME + r'(?:\.' + ATEXT_HOSTNAME + r')*'
25+
DOT_ATOM_TEXT_HOSTNAME = re.compile(ATEXT_HOSTNAME + r'(?:\.' + ATEXT_HOSTNAME + r')*\Z')
26+
DOMAIN_NAME_REGEX = re.compile(r"[A-Za-z]\Z") # all TLDs currently end with a letter
2427

2528
# Length constants
2629
# RFC 3696 + errata 1003 + errata 1690 (https://www.rfc-editor.org/errata_search.php?rfc=3696&eid=1690)

email_validator/syntax.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .exceptions_types import EmailSyntaxError
22
from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \
3-
DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT, ATEXT_INTL, DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME
3+
DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT, ATEXT_INTL, DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX
44

55
import re
66
import unicodedata
@@ -42,7 +42,7 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
4242
raise EmailSyntaxError("The email address is too long before the @-sign {}.".format(reason))
4343

4444
# Check the local part against the regular expression for the older ASCII requirements.
45-
m = re.match(DOT_ATOM_TEXT + "\\Z", local)
45+
m = DOT_ATOM_TEXT.match(local)
4646
if m:
4747
# Return the local part unchanged and flag that SMTPUTF8 is not needed.
4848
return {
@@ -53,7 +53,7 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
5353

5454
else:
5555
# The local part failed the ASCII check. Now try the extended internationalized requirements.
56-
m = re.match(DOT_ATOM_TEXT_INTL + "\\Z", local)
56+
m = DOT_ATOM_TEXT_INTL.match(local)
5757
if not m:
5858
# It's not a valid internationalized address either. Report which characters were not valid.
5959
bad_chars = ', '.join(sorted(set(
@@ -141,7 +141,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
141141
if ".." in domain:
142142
raise EmailSyntaxError("An email address cannot have two periods in a row.")
143143

144-
if re.match(DOT_ATOM_TEXT_HOSTNAME + "\\Z", domain):
144+
if DOT_ATOM_TEXT_HOSTNAME.match(domain):
145145
ascii_domain = domain
146146
else:
147147
# If international characters are present in the domain name, convert
@@ -170,7 +170,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
170170

171171
# Check the syntax of the string returned by idna.encode.
172172
# It should never fail.
173-
m = re.match(DOT_ATOM_TEXT_HOSTNAME + "\\Z", ascii_domain)
173+
m = DOT_ATOM_TEXT_HOSTNAME.match(ascii_domain)
174174
if not m:
175175
raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.")
176176

@@ -198,7 +198,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
198198
raise EmailSyntaxError("The part after the @-sign is not valid. It should have a period.")
199199

200200
# We also know that all TLDs currently end with a letter.
201-
if not re.search(r"[A-Za-z]\Z", ascii_domain):
201+
if not DOMAIN_NAME_REGEX.search(ascii_domain):
202202
raise EmailSyntaxError("The part after the @-sign is not valid. It is not within a valid top-level domain.")
203203

204204
# Check special-use and reserved domain names.

0 commit comments

Comments
 (0)