Skip to content

Commit 9412b33

Browse files
committed
Revert "Revert "Python: switch to shared implementation of IncompleteHostnameRegExp.ql""
This reverts commit 6d24591.
1 parent 117fb5b commit 9412b33

File tree

6 files changed

+249
-30
lines changed

6 files changed

+249
-30
lines changed

config/identical-files.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,6 +518,7 @@
518518
],
519519
"Hostname Regexp queries": [
520520
"javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll",
521+
"python/ql/src/Security/CWE-020/HostnameRegexpShared.qll",
521522
"ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll"
522523
],
523524
"ApiGraphModels": [

python/ql/lib/semmle/python/RegexTreeView.qll

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import python
44
private import semmle.python.regex
5+
private import semmle.python.dataflow.new.DataFlow
56

67
/**
78
* An element containing a regular expression term, that is, either
@@ -48,6 +49,19 @@ newtype TRegExpParent =
4849
/** A back reference */
4950
TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) }
5051

52+
/**
53+
* Provides utility predicates related to regular expressions.
54+
*/
55+
module RegExpPatterns {
56+
/**
57+
* Gets a pattern that matches common top-level domain names in lower case.
58+
*/
59+
string getACommonTld() {
60+
// according to ranking by http://google.com/search?q=site:.<<TLD>>
61+
result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])"
62+
}
63+
}
64+
5165
/**
5266
* An element containing a regular expression term, that is, either
5367
* a string literal (parsed as a regular expression)
@@ -445,6 +459,8 @@ class RegExpAlt extends RegExpTerm, TRegExpAlt {
445459
override string getPrimaryQLClass() { result = "RegExpAlt" }
446460
}
447461

462+
class RegExpCharEscape = RegExpEscape;
463+
448464
/**
449465
* An escaped regular expression term, that is, a regular expression
450466
* term starting with a backslash, which is not a backreference.
@@ -751,6 +767,9 @@ class RegExpGroup extends RegExpTerm, TRegExpGroup {
751767
*/
752768
int getNumber() { result = re.getGroupNumber(start, end) }
753769

770+
/** Holds if this is a capture group. */
771+
predicate isCapture() { exists(this.getNumber()) }
772+
754773
/** Holds if this is a named capture group. */
755774
predicate isNamed() { exists(this.getName()) }
756775

@@ -1009,3 +1028,24 @@ class RegExpBackRef extends RegExpTerm, TRegExpBackRef {
10091028

10101029
/** Gets the parse tree resulting from parsing `re`, if such has been constructed. */
10111030
RegExpTerm getParsedRegExp(StrConst re) { result.getRegex() = re and result.isRootTerm() }
1031+
1032+
/**
1033+
* A node whose value may flow to a position where it is interpreted
1034+
* as a part of a regular expression.
1035+
*/
1036+
class RegExpPatternSource extends DataFlow::CfgNode {
1037+
private Regex astNode;
1038+
1039+
RegExpPatternSource() { astNode = this.asExpr() }
1040+
1041+
/**
1042+
* Gets a node where the pattern of this node is parsed as a part of
1043+
* a regular expression.
1044+
*/
1045+
DataFlow::Node getAParse() { result = this }
1046+
1047+
/**
1048+
* Gets the root term of the regular expression parsed from this pattern.
1049+
*/
1050+
RegExpTerm getRegExpTerm() { result.getRegex() = astNode }
1051+
}
Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
/**
2+
* Provides predicates for reasoning about regular expressions
3+
* that match URLs and hostname patterns.
4+
*/
5+
6+
private import HostnameRegexpSpecific
7+
8+
/**
9+
* Holds if the given constant is unlikely to occur in the origin part of a URL.
10+
*/
11+
predicate isConstantInvalidInsideOrigin(RegExpConstant term) {
12+
// Look for any of these cases:
13+
// - A character that can't occur in the origin
14+
// - Two dashes in a row
15+
// - A colon that is not part of port or scheme separator
16+
// - A slash that is not part of scheme separator
17+
term.getValue().regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(?<![/:]|^)/).*")
18+
}
19+
20+
/** Holds if `term` is a dot constant of form `\.` or `[.]`. */
21+
predicate isDotConstant(RegExpTerm term) {
22+
term.(RegExpCharEscape).getValue() = "."
23+
or
24+
exists(RegExpCharacterClass cls |
25+
term = cls and
26+
not cls.isInverted() and
27+
cls.getNumChild() = 1 and
28+
cls.getAChild().(RegExpConstant).getValue() = "."
29+
)
30+
}
31+
32+
/** Holds if `term` is a wildcard `.` or an actual `.` character. */
33+
predicate isDotLike(RegExpTerm term) {
34+
term instanceof RegExpDot
35+
or
36+
isDotConstant(term)
37+
}
38+
39+
/** Holds if `term` will only ever be matched against the beginning of the input. */
40+
predicate matchesBeginningOfString(RegExpTerm term) {
41+
term.isRootTerm()
42+
or
43+
exists(RegExpTerm parent | matchesBeginningOfString(parent) |
44+
term = parent.(RegExpSequence).getChild(0)
45+
or
46+
parent.(RegExpSequence).getChild(0) instanceof RegExpCaret and
47+
term = parent.(RegExpSequence).getChild(1)
48+
or
49+
term = parent.(RegExpAlt).getAChild()
50+
or
51+
term = parent.(RegExpGroup).getAChild()
52+
)
53+
}
54+
55+
/**
56+
* Holds if the given sequence contains top-level domain preceded by a dot, such as `.com`,
57+
* excluding cases where this is at the very beginning of the regexp.
58+
*
59+
* `i` is bound to the index of the last child in the top-level domain part.
60+
*/
61+
predicate hasTopLevelDomainEnding(RegExpSequence seq, int i) {
62+
seq.getChild(i)
63+
.(RegExpConstant)
64+
.getValue()
65+
.regexpMatch("(?i)" + RegExpPatterns::getACommonTld() + "(:\\d+)?([/?#].*)?") and
66+
isDotLike(seq.getChild(i - 1)) and
67+
not (i = 1 and matchesBeginningOfString(seq))
68+
}
69+
70+
/**
71+
* Holds if the given regular expression term contains top-level domain preceded by a dot,
72+
* such as `.com`.
73+
*/
74+
predicate hasTopLevelDomainEnding(RegExpSequence seq) { hasTopLevelDomainEnding(seq, _) }
75+
76+
/**
77+
* Holds if `term` will always match a hostname, that is, all disjunctions contain
78+
* a hostname pattern that isn't inside a quantifier.
79+
*/
80+
predicate alwaysMatchesHostname(RegExpTerm term) {
81+
hasTopLevelDomainEnding(term, _)
82+
or
83+
// `localhost` is considered a hostname pattern, but has no TLD
84+
term.(RegExpConstant).getValue().regexpMatch("\\blocalhost\\b")
85+
or
86+
not term instanceof RegExpAlt and
87+
not term instanceof RegExpQuantifier and
88+
alwaysMatchesHostname(term.getAChild())
89+
or
90+
alwaysMatchesHostnameAlt(term)
91+
}
92+
93+
/** Holds if every child of `alt` contains a hostname pattern. */
94+
predicate alwaysMatchesHostnameAlt(RegExpAlt alt) {
95+
alwaysMatchesHostnameAlt(alt, alt.getNumChild() - 1)
96+
}
97+
98+
/**
99+
* Holds if the first `i` children of `alt` contains a hostname pattern.
100+
*
101+
* This is used instead of `forall` to avoid materializing the set of alternatives
102+
* that don't contains hostnames, which is much larger.
103+
*/
104+
predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
105+
alwaysMatchesHostname(alt.getChild(0)) and i = 0
106+
or
107+
alwaysMatchesHostnameAlt(alt, i - 1) and
108+
alwaysMatchesHostname(alt.getChild(i))
109+
}
110+
111+
/**
112+
* Holds if `term` occurs inside a quantifier or alternative (and thus
113+
* can not be expected to correspond to a unique match), or as part of
114+
* a lookaround assertion (which are rarely used for capture groups).
115+
*/
116+
predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
117+
exists(RegExpParent parent | parent = term.getParent() |
118+
parent instanceof RegExpAlt
119+
or
120+
parent instanceof RegExpQuantifier
121+
or
122+
parent instanceof RegExpSubPattern
123+
or
124+
isInsideChoiceOrSubPattern(parent)
125+
)
126+
}
127+
128+
/**
129+
* Holds if `group` is likely to be used as a capture group.
130+
*/
131+
predicate isLikelyCaptureGroup(RegExpGroup group) {
132+
group.isCapture() and
133+
not isInsideChoiceOrSubPattern(group)
134+
}
135+
136+
/**
137+
* Holds if `seq` contains two consecutive dots `..` or escaped dots.
138+
*
139+
* At least one of these dots is not intended to be a subdomain separator,
140+
* so we avoid flagging the pattern in this case.
141+
*/
142+
predicate hasConsecutiveDots(RegExpSequence seq) {
143+
exists(int i |
144+
isDotLike(seq.getChild(i)) and
145+
isDotLike(seq.getChild(i + 1))
146+
)
147+
}
148+
149+
predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
150+
seq = regexp.getAChild*() and
151+
exists(RegExpDot unescapedDot, int i, string hostname |
152+
hasTopLevelDomainEnding(seq, i) and
153+
not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
154+
not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
155+
unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and
156+
unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
157+
not hasConsecutiveDots(unescapedDot.getParent()) and
158+
hostname =
159+
seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
160+
seq.getChild(i).getRawValue()
161+
|
162+
if unescapedDot.getParent() instanceof RegExpQuantifier
163+
then
164+
// `.*\.example.com` can match `evil.com/?x=.example.com`
165+
//
166+
// This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
167+
// We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
168+
// Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
169+
// and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
170+
seq.getChild(0) instanceof RegExpCaret and
171+
not seq.getAChild() instanceof RegExpDollar and
172+
seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
173+
msg =
174+
"has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue()
175+
+ "' which may cause '" + hostname +
176+
"' to be matched anywhere in the URL, outside the hostname."
177+
else
178+
msg =
179+
"has an unescaped '.' before '" + hostname +
180+
"', so it might match more hosts than expected."
181+
)
182+
}
183+
184+
predicate incompleteHostnameRegExp(
185+
RegExpSequence hostSequence, string message, DataFlow::Node aux, string label
186+
) {
187+
exists(RegExpPatternSource re, RegExpTerm regexp, string msg, string kind |
188+
regexp = re.getRegExpTerm() and
189+
isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and
190+
(
191+
if re.getAParse() != re
192+
then (
193+
kind = "string, which is used as a regular expression $@," and
194+
aux = re.getAParse()
195+
) else (
196+
kind = "regular expression" and aux = re
197+
)
198+
)
199+
|
200+
message = "This " + kind + " " + msg and label = "here"
201+
)
202+
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
import semmle.python.security.performance.RegExpTreeView
2+
import semmle.python.dataflow.new.DataFlow

python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql

Lines changed: 3 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -8,35 +8,9 @@
88
* @id py/incomplete-hostname-regexp
99
* @tags correctness
1010
* security
11-
* external/cwe/cwe-20
11+
* external/cwe/cwe-020
1212
*/
1313

14-
import python
15-
import semmle.python.regex
14+
import HostnameRegexpShared
1615

17-
private string commonTopLevelDomainRegex() { result = "com|org|edu|gov|uk|net|io" }
18-
19-
/**
20-
* Holds if `pattern` is a regular expression pattern for URLs with a host matched by `hostPart`,
21-
* and `pattern` contains a subtle mistake that allows it to match unexpected hosts.
22-
*/
23-
bindingset[pattern]
24-
predicate isIncompleteHostNameRegExpPattern(string pattern, string hostPart) {
25-
hostPart =
26-
pattern
27-
.regexpCapture("(?i).*" +
28-
// an unescaped single `.`
29-
"(?<!\\\\)[.]" +
30-
// immediately followed by a sequence of subdomains, perhaps with some regex characters mixed in, followed by a known TLD
31-
"([():|?a-z0-9-]+(\\\\)?[.](" + commonTopLevelDomainRegex() + "))" + ".*", 1)
32-
}
33-
34-
from Regex r, string pattern, string hostPart
35-
where
36-
r.getText() = pattern and
37-
isIncompleteHostNameRegExpPattern(pattern, hostPart) and
38-
// ignore patterns with capture groups after the TLD
39-
not pattern.regexpMatch("(?i).*[.](" + commonTopLevelDomainRegex() + ").*[(][?]:.*[)].*")
40-
select r,
41-
"This regular expression has an unescaped '.' before '" + hostPart +
42-
"', so it might match more hosts than expected."
16+
query predicate problems = incompleteHostnameRegExp/4;
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
| hosttest.py:6:27:6:51 | Str | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
1+
| hosttest.py:6:31:6:53 | (www\|beta).example.com/ | This regular expression has an unescaped '.' before 'example.com/', so it might match more hosts than expected. | hosttest.py:6:27:6:51 | ControlFlowNode for Str | here |

0 commit comments

Comments
 (0)