Merge pull request #8293 from aibaars/regex-pattern-source

aibaars · web-flow · commit b79d08523ced · 2022-03-03T17:35:40.000+01:00
Ruby: parse more string literals as regular expressions
diff --git a/.github/workflows/ruby-qltest.yml b/.github/workflows/ruby-qltest.yml
@@ -63,6 +63,7 @@ jobs:
   qltest:
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
         slice: ["1/2", "2/2"]
     steps:
diff --git a/ruby/ql/lib/change-notes/2022-02-28-regex-string-literals.md b/ruby/ql/lib/change-notes/2022-02-28-regex-string-literals.md
@@ -0,0 +1,4 @@
+---
+category: minorAnalysis
+---
+* The `RegExp` class is now an abstract class that extends `StringlikeLiteral`, with implementations for `RegExpLiteral` and for string literals that 'flow' into functions that are known to interpret string arguments as regular expressions, such as `Regexp.new` and `String.match`.
diff --git a/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll b/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll
@@ -7,8 +7,32 @@
 
 private import codeql.ruby.ast.Literal as AST
 private import codeql.Locations
+private import codeql.ruby.DataFlow
+private import codeql.ruby.TaintTracking
+private import codeql.ruby.typetracking.TypeTracker
+private import codeql.ruby.ApiGraphs
+
+/**
+ * A `StringlikeLiteral` containing a regular expression term, that is, either
+ * a regular expression literal, or a string literal used in a context where
+ * it is parsed as a regular expression.
+ */
+abstract class RegExp extends AST::StringlikeLiteral {
+  /**
+   * Holds if this `RegExp` has the multi-line flag (`m` in Ruby), which makes `.` match newlines.
+   */
+  predicate isDotAll() { none() }
+
+  /**
+   * Holds if this `RegExp` has the `i` flag for case-insensitive matching.
+   */
+  predicate isIgnoreCase() { none() }
+
+  /**
+   * Gets the flags for this `RegExp`, or the empty string if it has no flags.
+   */
+  string getFlags() { result = "" }
 
-class RegExp extends AST::RegExpLiteral {
   /**
    * Helper predicate for `charSetStart(int start, int end)`.
    *
@@ -933,3 +957,63 @@ class RegExp extends AST::RegExpLiteral {
     this.lastPart(start, end)
   }
 }
+
+private class RegExpLiteralRegExp extends RegExp, AST::RegExpLiteral {
+  override predicate isDotAll() { this.hasMultilineFlag() }
+
+  override predicate isIgnoreCase() { this.hasCaseInsensitiveFlag() }
+
+  override string getFlags() { result = this.getFlagString() }
+}
+
+private class ParsedStringRegExp extends RegExp {
+  private DataFlow::Node parse;
+
+  ParsedStringRegExp() { this = regExpSource(parse).asExpr().getExpr() }
+
+  DataFlow::Node getAParse() { result = parse }
+
+  override predicate isDotAll() { none() }
+
+  override predicate isIgnoreCase() { none() }
+
+  override string getFlags() { none() }
+}
+
+/**
+ * Holds if `source` may be interpreted as a regular expression.
+ */
+cached
+private predicate isInterpretedAsRegExp(DataFlow::Node source) {
+  // The first argument to an invocation of `Regexp.new` or `Regexp.compile`.
+  source = API::getTopLevelMember("Regexp").getAMethodCall(["compile", "new"]).getArgument(0)
+  or
+  // The first argument of any call to a method named `match` or `match?`, which may coerce it to a regular expression (the receiver is not checked).
+  exists(DataFlow::CallNode mce |
+    mce.getMethodName() = ["match", "match?"] and
+    source = mce.getArgument(0)
+  )
+}
+
+/**
+ * Gets a node whose value may flow (inter-procedurally) to `re`, where it is interpreted
+ * as a part of a regular expression.
+ */
+private DataFlow::Node regExpSource(DataFlow::Node re, TypeBackTracker t) {
+  t.start() and
+  re = result and
+  isInterpretedAsRegExp(result)
+  or
+  exists(TypeBackTracker t2, DataFlow::Node succ | succ = regExpSource(re, t2) |
+    t2 = t.smallstep(result, succ)
+    or
+    TaintTracking::localTaintStep(result, succ) and
+    t = t2
+  )
+}
+
+/**
+ * Gets a node whose value may flow (inter-procedurally) to `re`, where it is interpreted
+ * as a part of a regular expression.
+ */
+DataFlow::Node regExpSource(DataFlow::Node re) { result = regExpSource(re, TypeBackTracker::end()) }
diff --git a/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll b/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll
@@ -1,7 +1,7 @@
 private import codeql.ruby.ast.Literal as AST
-private import codeql.Locations
 private import ParseRegExp
 import codeql.Locations
+private import codeql.ruby.DataFlow
 
 /**
  * Holds if `term` is an escape class representing e.g. `\d`.
@@ -27,7 +27,7 @@ predicate isEscapeClass(RegExpTerm term, string clazz) {
  * Holds if the regular expression should not be considered.
  */
 predicate isExcluded(RegExpParent parent) {
-  parent.(RegExpTerm).getRegExp().hasFreeSpacingFlag() // exclude free-spacing mode regexes
+  parent.(RegExpTerm).getRegExp().(AST::RegExpLiteral).hasFreeSpacingFlag() // exclude free-spacing mode regexes
 }
 
 /**
@@ -93,11 +93,11 @@ class RegExpLiteral extends TRegExpLiteral, RegExpParent {
 
   override RegExpTerm getChild(int i) { i = 0 and result.getRegExp() = re and result.isRootTerm() }
 
-  predicate isDotAll() { re.hasMultilineFlag() }
+  predicate isDotAll() { re.isDotAll() }
 
-  predicate isIgnoreCase() { re.hasCaseInsensitiveFlag() }
+  predicate isIgnoreCase() { re.isIgnoreCase() }
 
-  string getFlags() { result = re.getFlagString() }
+  string getFlags() { result = re.getFlags() }
 
   override string getAPrimaryQlClass() { result = "RegExpLiteral" }
 }
@@ -795,3 +795,47 @@ class RegExpNamedCharacterProperty extends RegExpTerm, TRegExpNamedCharacterProp
 RegExpTerm getParsedRegExp(AST::RegExpLiteral re) {
   result.getRegExp() = re and result.isRootTerm()
 }
+
+/**
+ * A node whose value may flow to a position where it is interpreted
+ * as a part of a regular expression.
+ */
+abstract class RegExpPatternSource extends DataFlow::Node {
+  /**
+   * Gets a node where the pattern of this node is parsed as a part of
+   * a regular expression.
+   */
+  abstract DataFlow::Node getAParse();
+
+  /**
+   * Gets the root term of the regular expression parsed from this pattern.
+   */
+  abstract RegExpTerm getRegExpTerm();
+}
+
+/**
+ * A regular expression literal, viewed as the pattern source for itself.
+ */
+private class RegExpLiteralPatternSource extends RegExpPatternSource {
+  private AST::RegExpLiteral astNode;
+
+  RegExpLiteralPatternSource() { astNode = this.asExpr().getExpr() }
+
+  override DataFlow::Node getAParse() { result = this }
+
+  override RegExpTerm getRegExpTerm() { result = astNode.getParsed() }
+}
+
+/**
+ * A node whose string value may flow to a position where it is interpreted
+ * as a part of a regular expression.
+ */
+private class StringRegExpPatternSource extends RegExpPatternSource {
+  private DataFlow::Node parse;
+
+  StringRegExpPatternSource() { this = regExpSource(parse) }
+
+  override DataFlow::Node getAParse() { result = parse }
+
+  override RegExpTerm getRegExpTerm() { result.getRegExp() = this.asExpr().getExpr() }
+}
diff --git a/ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/ReDoS.expected b/ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/ReDoS.expected
@@ -20,6 +20,7 @@
 | tst.rb:74:10:74:17 | (b\|a?b)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'b'. |
 | tst.rb:77:10:77:17 | (a\|aa?)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
 | tst.rb:83:10:83:16 | (.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
+| tst.rb:89:21:89:28 | (a\|aa?)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
 | tst.rb:95:11:95:24 | ([\\S\\s]\|[^a])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '`'. |
 | tst.rb:101:11:101:19 | (.\|[^a])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '`'. |
 | tst.rb:107:11:107:19 | (b\|[^a])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'b'. |
diff --git a/ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/tst.rb b/ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/tst.rb
@@ -85,7 +85,7 @@
 # GOOD
 good8 = /([\w.]+)*/
 
-# BAD - we don't yet parse regexps constructed from strings
+# NOT GOOD
 bad17 = Regexp.new '(a|aa?)*b'
 
 # GOOD - not used as regexp

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +---
 +category: minorAnalysis
 +---
 +* The `RegExp` class is now an abstract class that extends `StringlikeLiteral`, with implementations for `RegExpLiteral` and for string literals that 'flow' into functions that are known to interpret string arguments as regular expressions, such as `Regexp.new` and `String.match`.