Skip to content

Commit 44cc6f7

Browse files
committed
Ruby: improve tracking of regular expressions
There are two flavours of `match?`. If the receiver of `match?` has type String then the argument to `match?` is a regular expression. However, if the receiver of `match?` has type Regexp then the argument is the text. The role of receiver and argument flips depending on the type of the receiver, this caused a lot of false positives when looking for string-like literals that are used as a regular expression. This commit attempts to improve things by trying to determine whether the type of the receiver is known to be of type Regexp. In such cases we know that the argument is unlikely to be regular expression.
1 parent 0160c37 commit 44cc6f7

File tree

4 files changed

+33
-7
lines changed

4 files changed

+33
-7
lines changed

ruby/ql/lib/codeql/ruby/Regexp.qll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ class StdLibRegExpInterpretation extends RegExpInterpretation::Range {
114114
mce.getMethodName() = ["match", "match?"] and
115115
this = mce.getArgument(0) and
116116
// exclude https://ruby-doc.org/core-2.4.0/Regexp.html#method-i-match
117-
not mce.getReceiver().asExpr().getExpr() instanceof Ast::RegExpLiteral
117+
not mce.getReceiver() = trackRegexpType()
118118
)
119119
}
120120
}

ruby/ql/lib/codeql/ruby/regexp/internal/RegExpConfiguration.qll

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ private import codeql.ruby.ast.Literal as Ast
33
private import codeql.ruby.DataFlow
44
private import codeql.ruby.controlflow.CfgNodes
55
private import codeql.ruby.dataflow.internal.tainttrackingforlibraries.TaintTrackingImpl
6+
private import codeql.ruby.typetracking.TypeTracker
7+
private import codeql.ruby.ApiGraphs
68

79
class RegExpConfiguration extends Configuration {
810
RegExpConfiguration() { this = "RegExpConfiguration" }
@@ -19,12 +21,26 @@ class RegExpConfiguration extends Configuration {
1921
override predicate isSink(DataFlow::Node sink) { sink instanceof RegExpInterpretation::Range }
2022

2123
override predicate isSanitizer(DataFlow::Node node) {
22-
// stop flow if `node` is receiver of
23-
// https://ruby-doc.org/core-2.4.0/String.html#method-i-match
24-
exists(DataFlow::CallNode mce |
25-
mce.getMethodName() = ["match", "match?"] and
24+
exists(DataFlow::CallNode mce | mce.getMethodName() = ["match", "match?"] |
25+
// receiver of https://ruby-doc.org/core-2.4.0/String.html#method-i-match
2626
node = mce.getReceiver() and
27-
mce.getArgument(0).asExpr().getExpr() instanceof Ast::RegExpLiteral
27+
mce.getArgument(0) = trackRegexpType()
28+
or
29+
// first argument of https://ruby-doc.org/core-2.4.0/Regexp.html#method-i-match
30+
node = mce.getArgument(0) and
31+
mce.getReceiver() = trackRegexpType()
2832
)
2933
}
3034
}
35+
36+
private DataFlow::LocalSourceNode trackRegexpType(TypeTracker t) {
37+
t.start() and
38+
(
39+
result.asExpr().getExpr() instanceof Ast::RegExpLiteral or
40+
result = API::getTopLevelMember("Regexp").getAMethodCall(["compile", "new"])
41+
)
42+
or
43+
exists(TypeTracker t2 | result = trackRegexpType(t2).track(t2, t))
44+
}
45+
46+
DataFlow::Node trackRegexpType() { trackRegexpType(TypeTracker::end()).flowsTo(result) }

ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,5 @@
1616
| missing_regexp_anchor.rb:50:1:50:30 | /^good\\\\\\\\.com\|better\\\\\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
1717
| missing_regexp_anchor.rb:52:1:52:15 | /^foo\|bar\|baz$/ | Misleading operator precedence. The subexpression '^foo' is anchored at the beginning, but the other parts of this regular expression are not |
1818
| missing_regexp_anchor.rb:52:1:52:15 | /^foo\|bar\|baz$/ | Misleading operator precedence. The subexpression 'baz$' is anchored at the end, but the other parts of this regular expression are not |
19+
| missing_regexp_anchor.rb:60:20:60:39 | "http://example.com" | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
20+
| missing_regexp_anchor.rb:61:19:61:38 | "http://example.com" | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |

ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,4 +50,12 @@
5050
/^good\\\\.com|better\\\\.com/ # BAD
5151

5252
/^foo|bar|baz$/ # BAD
53-
/^foo|%/ # OK
53+
/^foo|%/ # OK
54+
55+
REGEXP = /foo/
56+
REGEXP.match? "http://example.com" # GOOD: the url is the text not the regexp
57+
REGEXP.match "http://example.com" # GOOD: the url is the text not the regexp
58+
"http://example.com".match? REGEXP # GOOD: the url is the text not the regexp
59+
"http://example.com".match REGEXP # GOOD: the url is the text not the regexp
60+
"some text".match? "http://example.com" # BAD
61+
"some text".match "http://example.com" # BAD

0 commit comments

Comments
 (0)