Revert "Revert "Python: switch to shared implementation of IncompleteHostnameRegExp.ql""

aibaars · aibaars · commit 9412b331dbcf · 2022-03-18T16:31:22.000+01:00
This reverts commit 6d24591.
diff --git a/config/identical-files.json b/config/identical-files.json
@@ -518,6 +518,7 @@
   ],
   "Hostname Regexp queries": [
     "javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll",
+    "python/ql/src/Security/CWE-020/HostnameRegexpShared.qll",
     "ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll"
   ],
   "ApiGraphModels": [
diff --git a/python/ql/lib/semmle/python/RegexTreeView.qll b/python/ql/lib/semmle/python/RegexTreeView.qll
@@ -2,6 +2,7 @@
 
 import python
 private import semmle.python.regex
+private import semmle.python.dataflow.new.DataFlow
 
 /**
  * An element containing a regular expression term, that is, either
@@ -48,6 +49,19 @@ newtype TRegExpParent =
   /** A back reference */
   TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) }
 
+/**
+ * Provides utility predicates related to regular expressions.
+ */
+module RegExpPatterns {
+  /**
+   * Gets a pattern that matches common top-level domain names in lower case.
+   */
+  string getACommonTld() {
+    // according to ranking by http://google.com/search?q=site:.<<TLD>>
+    result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])"
+  }
+}
+
 /**
  * An element containing a regular expression term, that is, either
  * a string literal (parsed as a regular expression)
@@ -445,6 +459,8 @@ class RegExpAlt extends RegExpTerm, TRegExpAlt {
   override string getPrimaryQLClass() { result = "RegExpAlt" }
 }
 
+class RegExpCharEscape = RegExpEscape;
+
 /**
  * An escaped regular expression term, that is, a regular expression
  * term starting with a backslash, which is not a backreference.
@@ -751,6 +767,9 @@ class RegExpGroup extends RegExpTerm, TRegExpGroup {
    */
   int getNumber() { result = re.getGroupNumber(start, end) }
 
+  /** Holds if this is a capture group. */
+  predicate isCapture() { exists(this.getNumber()) }
+
   /** Holds if this is a named capture group. */
   predicate isNamed() { exists(this.getName()) }
 
@@ -1009,3 +1028,24 @@ class RegExpBackRef extends RegExpTerm, TRegExpBackRef {
 
 /** Gets the parse tree resulting from parsing `re`, if such has been constructed. */
 RegExpTerm getParsedRegExp(StrConst re) { result.getRegex() = re and result.isRootTerm() }
+
+/**
+ * A node whose value may flow to a position where it is interpreted
+ * as a part of a regular expression.
+ */
+class RegExpPatternSource extends DataFlow::CfgNode {
+  private Regex astNode;
+
+  RegExpPatternSource() { astNode = this.asExpr() }
+
+  /**
+   * Gets a node where the pattern of this node is parsed as a part of
+   * a regular expression; in this implementation that is this node itself.
+   */
+  DataFlow::Node getAParse() { result = this }
+
+  /**
+   * Gets the root term of the regular expression parsed from this pattern.
+   */
+  RegExpTerm getRegExpTerm() { result.getRegex() = astNode and result.isRootTerm() }
+}
diff --git a/python/ql/src/Security/CWE-020/HostnameRegexpShared.qll b/python/ql/src/Security/CWE-020/HostnameRegexpShared.qll
@@ -0,0 +1,202 @@
+/**
+ * Provides predicates for reasoning about regular expressions
+ * that match URLs and hostname patterns.
+ */
+
+private import HostnameRegexpSpecific
+
+/**
+ * Holds if the given constant is unlikely to occur in the origin part of a URL.
+ */
+predicate isConstantInvalidInsideOrigin(RegExpConstant term) {
+  // Look for any of these cases:
+  // - A character that can't occur in the origin
+  // - Two dashes in a row
+  // - A colon that is not part of port or scheme separator
+  // - A slash that is not part of scheme separator
+  term.getValue().regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(?<![/:]|^)/).*")
+}
+
+/** Holds if `term` is a dot constant of form `\.` or `[.]`. */
+predicate isDotConstant(RegExpTerm term) {
+  term.(RegExpCharEscape).getValue() = "."
+  or
+  exists(RegExpCharacterClass cls |
+    term = cls and
+    not cls.isInverted() and
+    cls.getNumChild() = 1 and
+    cls.getAChild().(RegExpConstant).getValue() = "."
+  )
+}
+
+/** Holds if `term` is a wildcard `.` or an actual `.` character. */
+predicate isDotLike(RegExpTerm term) {
+  term instanceof RegExpDot
+  or
+  isDotConstant(term)
+}
+
+/** Holds if `term` will only ever be matched against the beginning of the input. */
+predicate matchesBeginningOfString(RegExpTerm term) {
+  term.isRootTerm()
+  or
+  exists(RegExpTerm parent | matchesBeginningOfString(parent) |
+    term = parent.(RegExpSequence).getChild(0)
+    or
+    parent.(RegExpSequence).getChild(0) instanceof RegExpCaret and
+    term = parent.(RegExpSequence).getChild(1)
+    or
+    term = parent.(RegExpAlt).getAChild()
+    or
+    term = parent.(RegExpGroup).getAChild()
+  )
+}
+
+/**
+ * Holds if the given sequence contains a top-level domain preceded by a dot, such as `.com`,
+ * excluding cases where this is at the very beginning of the regexp.
+ *
+ * `i` is bound to the index of the last child in the top-level domain part.
+ */
+predicate hasTopLevelDomainEnding(RegExpSequence seq, int i) {
+  seq.getChild(i)
+      .(RegExpConstant)
+      .getValue()
+      .regexpMatch("(?i)" + RegExpPatterns::getACommonTld() + "(:\\d+)?([/?#].*)?") and
+  isDotLike(seq.getChild(i - 1)) and
+  not (i = 1 and matchesBeginningOfString(seq))
+}
+
+/**
+ * Holds if the given regular expression term contains a top-level domain preceded by a dot,
+ * such as `.com`.
+ */
+predicate hasTopLevelDomainEnding(RegExpSequence seq) { hasTopLevelDomainEnding(seq, _) }
+
+/**
+ * Holds if `term` will always match a hostname, that is, all disjunctions contain
+ * a hostname pattern that isn't inside a quantifier.
+ */
+predicate alwaysMatchesHostname(RegExpTerm term) {
+  hasTopLevelDomainEnding(term, _)
+  or
+  // `localhost` is considered a hostname pattern, but has no TLD
+  term.(RegExpConstant).getValue().regexpMatch("\\blocalhost\\b")
+  or
+  not term instanceof RegExpAlt and
+  not term instanceof RegExpQuantifier and
+  alwaysMatchesHostname(term.getAChild())
+  or
+  alwaysMatchesHostnameAlt(term)
+}
+
+/** Holds if every child of `alt` contains a hostname pattern. */
+predicate alwaysMatchesHostnameAlt(RegExpAlt alt) {
+  alwaysMatchesHostnameAlt(alt, alt.getNumChild() - 1)
+}
+
+/**
+ * Holds if every child of `alt` with index in `0..i` contains a hostname pattern.
+ *
+ * This is used instead of `forall` to avoid materializing the set of alternatives
+ * that don't contain hostnames, which is much larger.
+ */
+predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
+  alwaysMatchesHostname(alt.getChild(0)) and i = 0
+  or
+  alwaysMatchesHostnameAlt(alt, i - 1) and
+  alwaysMatchesHostname(alt.getChild(i))
+}
+
+/**
+ * Holds if `term` occurs inside a quantifier or alternative (and thus
+ * can not be expected to correspond to a unique match), or as part of
+ * a lookaround assertion (which are rarely used for capture groups).
+ */
+predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
+  exists(RegExpParent parent | parent = term.getParent() |
+    parent instanceof RegExpAlt
+    or
+    parent instanceof RegExpQuantifier
+    or
+    parent instanceof RegExpSubPattern
+    or
+    isInsideChoiceOrSubPattern(parent)
+  )
+}
+
+/**
+ * Holds if `group` is likely to be used as a capture group.
+ */
+predicate isLikelyCaptureGroup(RegExpGroup group) {
+  group.isCapture() and
+  not isInsideChoiceOrSubPattern(group)
+}
+
+/**
+ * Holds if `seq` contains two consecutive dots `..` or escaped dots.
+ *
+ * At least one of these dots is not intended to be a subdomain separator,
+ * so we avoid flagging the pattern in this case.
+ */
+predicate hasConsecutiveDots(RegExpSequence seq) {
+  exists(int i |
+    isDotLike(seq.getChild(i)) and
+    isDotLike(seq.getChild(i + 1))
+  )
+}
+
+predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
+  seq = regexp.getAChild*() and
+  exists(RegExpDot unescapedDot, int i, string hostname |
+    hasTopLevelDomainEnding(seq, i) and
+    not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
+    not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
+    unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and
+    unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
+    not hasConsecutiveDots(unescapedDot.getParent()) and
+    hostname =
+      seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
+        seq.getChild(i).getRawValue()
+  |
+    if unescapedDot.getParent() instanceof RegExpQuantifier
+    then
+      // `.*\.example.com` can match `evil.com/?x=.example.com`
+      //
+      // This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
+      // We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
+      // Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
+      // and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
+      seq.getChild(0) instanceof RegExpCaret and
+      not seq.getAChild() instanceof RegExpDollar and
+      seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
+      msg =
+        "has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue()
+          + "' which may cause '" + hostname +
+          "' to be matched anywhere in the URL, outside the hostname."
+    else
+      msg =
+        "has an unescaped '.' before '" + hostname +
+          "', so it might match more hosts than expected."
+  )
+}
+
+predicate incompleteHostnameRegExp(
+  RegExpSequence hostSequence, string message, DataFlow::Node aux, string label
+) {
+  exists(RegExpPatternSource re, RegExpTerm regexp, string msg, string kind |
+    regexp = re.getRegExpTerm() and
+    isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and
+    (
+      if re.getAParse() != re
+      then (
+        kind = "string, which is used as a regular expression $@," and
+        aux = re.getAParse()
+      ) else (
+        kind = "regular expression" and aux = re
+      )
+    )
+  |
+    message = "This " + kind + " " + msg and label = "here"
+  )
+}
diff --git a/python/ql/src/Security/CWE-020/HostnameRegexpSpecific.qll b/python/ql/src/Security/CWE-020/HostnameRegexpSpecific.qll
@@ -0,0 +1,2 @@
+import semmle.python.security.performance.RegExpTreeView
+import semmle.python.dataflow.new.DataFlow
diff --git a/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql b/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
@@ -8,35 +8,9 @@
  * @id py/incomplete-hostname-regexp
  * @tags correctness
  *       security
- *       external/cwe/cwe-20
+ *       external/cwe/cwe-020
  */
 
-import python
-import semmle.python.regex
+import HostnameRegexpShared
 
-private string commonTopLevelDomainRegex() { result = "com|org|edu|gov|uk|net|io" }
-
-/**
- * Holds if `pattern` is a regular expression pattern for URLs with a host matched by `hostPart`,
- * and `pattern` contains a subtle mistake that allows it to match unexpected hosts.
- */
-bindingset[pattern]
-predicate isIncompleteHostNameRegExpPattern(string pattern, string hostPart) {
-  hostPart =
-    pattern
-        .regexpCapture("(?i).*" +
-            // an unescaped single `.`
-            "(?<!\\\\)[.]" +
-            // immediately followed by a sequence of subdomains, perhaps with some regex characters mixed in, followed by a known TLD
-            "([():|?a-z0-9-]+(\\\\)?[.](" + commonTopLevelDomainRegex() + "))" + ".*", 1)
-}
-
-from Regex r, string pattern, string hostPart
-where
-  r.getText() = pattern and
-  isIncompleteHostNameRegExpPattern(pattern, hostPart) and
-  // ignore patterns with capture groups after the TLD
-  not pattern.regexpMatch("(?i).*[.](" + commonTopLevelDomainRegex() + ").*[(][?]:.*[)].*")
-select r,
-  "This regular expression has an unescaped '.' before '" + hostPart +
-    "', so it might match more hosts than expected."
+query predicate problems = incompleteHostnameRegExp/4;
diff --git a/python/ql/test/query-tests/Security/CWE-020-IncompleteHostnameRegExp/IncompleteHostnameRegExp.expected b/python/ql/test/query-tests/Security/CWE-020-IncompleteHostnameRegExp/IncompleteHostnameRegExp.expected
@@ -1 +1 @@
-| hosttest.py:6:27:6:51 | Str | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
+| hosttest.py:6:31:6:53 | (www\|beta).example.com/ | This regular expression has an unescaped '.' before 'example.com/', so it might match more hosts than expected. | hosttest.py:6:27:6:51 | ControlFlowNode for Str | here |

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+import semmle.python.security.performance.RegExpTreeView`
	`2`	`+import semmle.python.dataflow.new.DataFlow`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-\| hosttest.py:6:27:6:51 \| Str \| This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. \|`
	`1`	`+\| hosttest.py:6:31:6:53 \| (www\\|beta).example.com/ \| This regular expression has an unescaped '.' before 'example.com/', so it might match more hosts than expected. \| hosttest.py:6:27:6:51 \| ControlFlowNode for Str \| here \|`