|
1 | 1 | from .exceptions_types import EmailSyntaxError
|
2 | 2 | from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \
|
3 |
| - DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ |
4 |
| - DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS, \ |
5 |
| - QUOTED_LOCAL_PART_ADDR |
| 3 | + DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_DOT_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ |
| 4 | + DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS |
6 | 5 |
|
7 | 6 | import re
|
8 | 7 | import unicodedata
|
|
12 | 11 |
|
13 | 12 |
|
14 | 13 | def split_email(email):
|
15 |
| - # Return the local part and domain part of the address and |
16 |
| - # whether the local part was quoted as a three-tuple. |
| 14 | + # Return the display name, unescaped local part, and domain part |
| 15 | + # of the address, and whether the local part was quoted. If no |
| 16 | + # display name was present and angle brackets do not surround |
| 17 | + # the address, display name will be None; otherwise, it will be |
| 18 | + # set to the display name or the empty string if there were |
| 19 | + # angle brackets but no display name. |
| 20 | + |
| 21 | + # Typical email addresses have a single @-sign and no quote |
| 22 | + # characters, but the awkward "quoted string" local part form |
| 23 | + # (RFC 5321 4.1.2) allows @-signs and escaped quotes to appear |
| 24 | + # in the local part if the local part is quoted. |
| 25 | + |
| 26 | + # A `display name <addr>` format is also present in MIME messages |
| 27 | + # (RFC 5322 3.4) and this format is also often recognized in |
| 28 | + # mail UIs. It's not allowed in SMTP commands or in typical web |
| 29 | + # login forms, but parsing it has been requested, so it's done |
| 30 | + # here as a convenience. It's implemented in the spirit but not |
| 31 | + # the letter of RFC 5322 3.4 because MIME messages allow newlines |
| 32 | + # and comments as a part of the CFWS rule, but this is typically |
| 33 | + # not allowed in mail UIs (although comment syntax was requested |
| 34 | + # once too). |
| 35 | + # |
| 36 | + # Display names are either basic characters (the same basic characters |
| 37 | + # permitted in email addresses, but periods are not allowed and spaces |
| 38 | + # are allowed; see RFC 5322 Appendix A.1.2), or a quoted string with |
| 39 | + # the same rules as a quoted local part. (Multiple quoted strings might |
| 40 | + # be allowed? Unclear.) Optional space (RFC 5322 3.4 CFWS) and then the |
| 41 | + # email address follows in angle brackets. |
| 42 | + # |
| 43 | + # An initial quote is ambiguous between starting a display name or |
| 44 | + # a quoted local part --- fun. |
| 45 | + # |
| 46 | + # We assume the input string is already stripped of leading and |
| 47 | + # trailing CFWS. |
| 48 | + |
| 49 | + def split_string_at_unquoted_special(text, specials): |
| 50 | + # Split the string at the first character in specials (an @-sign |
| 51 | + # or left angle bracket) that does not occur within quotes. |
| 52 | + inside_quote = False |
| 53 | + escaped = False |
| 54 | + left_part = "" |
| 55 | + for c in text: |
| 56 | + if inside_quote: |
| 57 | + left_part += c |
| 58 | + if c == '\\' and not escaped: |
| 59 | + escaped = True |
| 60 | + elif c == '"' and not escaped: |
| 61 | + # The only way to exit the quote is an unescaped quote. |
| 62 | + inside_quote = False |
| 63 | + escaped = False |
| 64 | + else: |
| 65 | + escaped = False |
| 66 | + elif c == '"': |
| 67 | + left_part += c |
| 68 | + inside_quote = True |
| 69 | + elif c in specials: |
| 70 | + # When unquoted, stop before a special character. |
| 71 | + break |
| 72 | + else: |
| 73 | + left_part += c |
| 74 | + |
| 75 | + # The right part is whatever is left. |
| 76 | + right_part = text[len(left_part):] |
| 77 | + |
| 78 | + return left_part, right_part |
| 79 | + |
| 80 | + def unquote_quoted_string(text): |
| 81 | + # Remove surrounding quotes and unescape escaped backslashes |
| 82 | + # and quotes. Escapes are parsed liberally. I think only |
| 83 | + # backslashes and quotes can be escaped but we'll allow anything |
| 84 | + # to be. |
| 85 | + quoted = False |
| 86 | + escaped = False |
| 87 | + value = "" |
| 88 | + for i, c in enumerate(text): |
| 89 | + if quoted: |
| 90 | + if escaped: |
| 91 | + value += c |
| 92 | + escaped = False |
| 93 | + elif c == '\\': |
| 94 | + escaped = True |
| 95 | + elif c == '"': |
| 96 | + if i != len(text) - 1: |
| 97 | + raise EmailSyntaxError("Extra character(s) found after close quote: " |
| 98 | + + ", ".join(safe_character_display(c) for c in text[i + 1:])) |
| 99 | + break |
| 100 | + else: |
| 101 | + value += c |
| 102 | + elif i == 0 and c == '"': |
| 103 | + quoted = True |
| 104 | + else: |
| 105 | + value += c |
| 106 | + |
| 107 | + return value, quoted |
| 108 | + |
| 109 | + # Split the string at the first unquoted @-sign or left angle bracket. |
| 110 | + left_part, right_part = split_string_at_unquoted_special(email, ("@", "<")) |
| 111 | + |
| 112 | + # If the right part starts with an angle bracket, |
| 113 | + # then the left part is a display name and the rest |
| 114 | + # of the right part up to the final right angle bracket |
| 115 | + # is the email address. |
| 116 | + if right_part.startswith("<"): |
| 117 | + # Remove space between the display name and angle bracket. |
| 118 | + left_part = left_part.rstrip() |
| 119 | + |
| 120 | + # Unquote and unescape the display name. |
| 121 | + display_name, display_name_quoted = unquote_quoted_string(left_part) |
| 122 | + |
| 123 | + # Check that only basic characters are present in a |
| 124 | + # non-quoted display name. |
| 125 | + if not display_name_quoted: |
| 126 | + bad_chars = { |
| 127 | + safe_character_display(c) |
| 128 | + for c in display_name |
| 129 | + if (not ATEXT_RE.match(c) and c != ' ') or c == '.' |
| 130 | + } |
| 131 | + if bad_chars: |
| 132 | + raise EmailSyntaxError("The display name contains invalid characters when not quoted: " + ", ".join(sorted(bad_chars)) + ".") |
17 | 133 |
|
18 |
| - # Typical email addresses have a single @-sign, but the |
19 |
| - # awkward "quoted string" local part form (RFC 5321 4.1.2) |
20 |
| - # allows @-signs (and escaped quotes) to appear in the local |
21 |
| - # part if the local part is quoted. If the address is quoted, |
22 |
| - # split it at a non-escaped @-sign and unescape the escaping. |
23 |
| - if m := QUOTED_LOCAL_PART_ADDR.match(email): |
24 |
| - local_part, domain_part = m.groups() |
| 134 | + # Check for other unsafe characters. |
| 135 | + check_unsafe_chars(display_name, allow_space=True) |
25 | 136 |
|
26 |
| - # Since backslash-escaping is no longer needed because |
27 |
| - # the quotes are removed, remove backslash-escaping |
28 |
| - # to return in the normalized form. |
29 |
| - local_part = re.sub(r"\\(.)", "\\1", local_part) |
| 137 | + # Remove the initial and trailing angle brackets. |
| 138 | + addr_spec = right_part[1:].rstrip(">") |
30 | 139 |
|
31 |
| - return local_part, domain_part, True |
| 140 | + # Split the email address at the first unquoted @-sign. |
| 141 | + local_part, domain_part = split_string_at_unquoted_special(addr_spec, ("@",)) |
32 | 142 |
|
| 143 | + # Otherwise there is no display name. The left part is the local |
| 144 | + # part and the right part is the domain. |
33 | 145 | else:
|
34 |
| - # Split at the one and only at-sign. |
35 |
| - parts = email.split('@') |
36 |
| - if len(parts) != 2: |
37 |
| - raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.") |
38 |
| - local_part, domain_part = parts |
39 |
| - return local_part, domain_part, False |
| 146 | + display_name = None |
| 147 | + local_part, domain_part = left_part, right_part |
| 148 | + |
| 149 | + if domain_part.startswith("@"): |
| 150 | + domain_part = domain_part[1:] |
| 151 | + |
| 152 | + # Unquote the local part if it is quoted. |
| 153 | + local_part, is_quoted_local_part = unquote_quoted_string(local_part) |
| 154 | + |
| 155 | + return display_name, local_part, domain_part, is_quoted_local_part |
40 | 156 |
|
41 | 157 |
|
42 | 158 | def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH):
|
@@ -215,7 +331,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp
|
215 | 331 | bad_chars = {
|
216 | 332 | safe_character_display(c)
|
217 | 333 | for c in local
|
218 |
| - if not ATEXT_INTL_RE.match(c) |
| 334 | + if not ATEXT_INTL_DOT_RE.match(c) |
219 | 335 | }
|
220 | 336 | if bad_chars:
|
221 | 337 | raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".")
|
|
0 commit comments