Skip to content

Commit 74aea81

Browse files
committed
Ruby: refactor regex libraries
1 parent 496aab7 commit 74aea81

File tree

15 files changed

+1318
-879
lines changed

15 files changed

+1318
-879
lines changed

python/ql/lib/semmle/python/RegexTreeView.qll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -552,7 +552,7 @@ class RegExpWordBoundary extends RegExpSpecialChar {
552552

553553
/**
554554
* A character class escape in a regular expression.
555-
* That is, an escaped charachter that denotes multiple characters.
555+
* That is, an escaped character that denotes multiple characters.
556556
*
557557
* Examples:
558558
*

python/ql/lib/semmle/python/regex.qll

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ abstract class RegexString extends Expr {
186186
)
187187
}
188188

189-
/** Hold is a character set starts between `start` and `end`. */
189+
/** Holds if a character set starts between `start` and `end`. */
190190
predicate char_set_start(int start, int end) {
191191
this.char_set_start(start) = true and
192192
(
@@ -314,8 +314,10 @@ abstract class RegexString extends Expr {
314314
result = this.(Bytes).getS()
315315
}
316316

317+
/** Gets the `i`th character of this regex. */
317318
string getChar(int i) { result = this.getText().charAt(i) }
318319

320+
/** Gets the `i`th character of this regex, unless it is part of a character escape sequence. */
319321
string nonEscapedCharAt(int i) {
320322
result = this.getText().charAt(i) and
321323
not exists(int x, int y | this.escapedCharacter(x, y) and i in [x .. y - 1])
@@ -327,6 +329,9 @@ abstract class RegexString extends Expr {
327329

328330
private predicate isGroupStart(int i) { this.nonEscapedCharAt(i) = "(" and not this.inCharSet(i) }
329331

332+
/**
333+
* Holds if the `i`th character could not be parsed.
334+
*/
330335
predicate failedToParse(int i) {
331336
exists(this.getChar(i)) and
332337
not exists(int start, int end |
@@ -415,6 +420,9 @@ abstract class RegexString extends Expr {
415420
)
416421
}
417422

423+
/**
424+
* Holds if a simple or escaped character is found between `start` and `end`.
425+
*/
418426
predicate character(int start, int end) {
419427
(
420428
this.simpleCharacter(start, end) and
@@ -426,12 +434,18 @@ abstract class RegexString extends Expr {
426434
not exists(int x, int y | this.backreference(x, y) and x <= start and y >= end)
427435
}
428436

437+
/**
438+
* Holds if a normal character is found between `start` and `end`.
439+
*/
429440
predicate normalCharacter(int start, int end) {
430441
end = start + 1 and
431442
this.character(start, end) and
432443
not this.specialCharacter(start, end, _)
433444
}
434445

446+
/**
447+
* Holds if a special character is found between `start` and `end`.
448+
*/
435449
predicate specialCharacter(int start, int end, string char) {
436450
not this.inCharSet(start) and
437451
this.character(start, end) and
@@ -490,7 +504,7 @@ abstract class RegexString extends Expr {
490504
this.specialCharacter(start, end, _)
491505
}
492506

493-
/** Whether the text in the range start,end is a group */
507+
/** Whether the text in the range `start,end` is a group */
494508
predicate group(int start, int end) {
495509
this.groupContents(start, end, _, _)
496510
or
@@ -609,19 +623,26 @@ abstract class RegexString extends Expr {
609623
this.simple_group_start(start, end)
610624
}
611625

626+
/** Matches the start of a non-capturing group, i.e. `(?:`. */
612627
private predicate non_capturing_group_start(int start, int end) {
613628
this.isGroupStart(start) and
614629
this.getChar(start + 1) = "?" and
615630
this.getChar(start + 2) = ":" and
616631
end = start + 3
617632
}
618633

634+
/** Matches the start of a simple group, i.e. the opening `(` of a group such as `(a+)`. */
619635
private predicate simple_group_start(int start, int end) {
620636
this.isGroupStart(start) and
621637
this.getChar(start + 1) != "?" and
622638
end = start + 1
623639
}
624640

641+
/**
642+
* Matches the start of a named group, such as:
643+
* - `(?<name>\w+)`
644+
* - `(?'name'\w+)`
645+
*/
625646
private predicate named_group_start(int start, int end) {
626647
this.isGroupStart(start) and
627648
this.getChar(start + 1) = "?" and
@@ -673,20 +694,23 @@ abstract class RegexString extends Expr {
673694
)
674695
}
675696

697+
/** Matches the start of a positive lookahead assertion, i.e. `(?=`. */
676698
private predicate lookahead_assertion_start(int start, int end) {
677699
this.isGroupStart(start) and
678700
this.getChar(start + 1) = "?" and
679701
this.getChar(start + 2) = "=" and
680702
end = start + 3
681703
}
682704

705+
/** Matches the start of a negative lookahead assertion, i.e. `(?!`. */
683706
private predicate negative_lookahead_assertion_start(int start, int end) {
684707
this.isGroupStart(start) and
685708
this.getChar(start + 1) = "?" and
686709
this.getChar(start + 2) = "!" and
687710
end = start + 3
688711
}
689712

713+
/** Matches the start of a positive lookbehind assertion, i.e. `(?<=`. */
690714
private predicate lookbehind_assertion_start(int start, int end) {
691715
this.isGroupStart(start) and
692716
this.getChar(start + 1) = "?" and
@@ -695,6 +719,7 @@ abstract class RegexString extends Expr {
695719
end = start + 4
696720
}
697721

722+
/** Matches the start of a negative lookbehind assertion, i.e. `(?<!`. */
698723
private predicate negative_lookbehind_assertion_start(int start, int end) {
699724
this.isGroupStart(start) and
700725
this.getChar(start + 1) = "?" and
@@ -703,26 +728,30 @@ abstract class RegexString extends Expr {
703728
end = start + 4
704729
}
705730

731+
/** Matches the start of a comment group, i.e. `(?#`. */
706732
private predicate comment_group_start(int start, int end) {
707733
this.isGroupStart(start) and
708734
this.getChar(start + 1) = "?" and
709735
this.getChar(start + 2) = "#" and
710736
end = start + 3
711737
}
712738

739+
/** Matches the contents of a group. */
713740
predicate groupContents(int start, int end, int in_start, int in_end) {
714741
this.group_start(start, in_start) and
715742
end = in_end + 1 and
716743
this.top_level(in_start, in_end) and
717744
this.isGroupEnd(in_end)
718745
}
719746

747+
/** Matches a named backreference, e.g. `(?P=foo)`. */
720748
private predicate named_backreference(int start, int end, string name) {
721749
this.named_backreference_start(start, start + 4) and
722750
end = min(int i | i > start + 4 and this.getChar(i) = ")") + 1 and
723751
name = this.getText().substring(start + 4, end - 2)
724752
}
725753

754+
/** Matches a numbered backreference, e.g. `\1`. */
726755
private predicate numbered_backreference(int start, int end, int value) {
727756
this.escapingChar(start) and
728757
// starting with 0 makes it an octal escape
@@ -747,7 +776,7 @@ abstract class RegexString extends Expr {
747776
)
748777
}
749778

750-
/** Whether the text in the range start,end is a back reference */
779+
/** Whether the text in the range `start,end` is a back reference */
751780
predicate backreference(int start, int end) {
752781
this.numbered_backreference(start, end, _)
753782
or

ruby/ql/consistency-queries/RegExpConsistency.ql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import codeql.ruby.security.performance.RegExpTreeView
1+
import codeql.ruby.Regexp
22

33
query predicate nonUniqueChild(RegExpParent parent, int i, RegExpTerm child) {
44
child = parent.getChild(i) and

ruby/ql/lib/codeql/ruby/Regexp.qll

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
/**
2+
* Provides classes for working with regular expressions.
3+
*
4+
* Regular expression literals are represented as an abstract syntax tree of regular expression
5+
* terms.
6+
*/
7+
8+
import regexp.RegExpTreeView // re-export
9+
private import regexp.ParseRegExp
10+
private import codeql.ruby.ast.Literal as AST
11+
private import codeql.ruby.DataFlow
12+
private import codeql.ruby.controlflow.CfgNodes
13+
private import codeql.ruby.ApiGraphs
14+
private import codeql.ruby.dataflow.internal.tainttrackingforlibraries.TaintTrackingImpl
15+
16+
/**
17+
* Provides utility predicates related to regular expressions.
18+
*/
19+
module RegExpPatterns {
20+
/**
21+
* Gets a pattern that matches common top-level domain names in lower case.
22+
*/
23+
string getACommonTld() {
24+
// according to ranking by http://google.com/search?q=site:.<<TLD>>
25+
result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])"
26+
}
27+
}
28+
29+
/**
30+
* A node whose value may flow to a position where it is interpreted
31+
* as a part of a regular expression.
32+
*/
33+
abstract class RegExpPatternSource extends DataFlow::Node {
34+
/**
35+
* Gets a node where the pattern of this node is parsed as a part of
36+
* a regular expression.
37+
*/
38+
abstract DataFlow::Node getAParse();
39+
40+
/**
41+
* Gets the root term of the regular expression parsed from this pattern.
42+
*/
43+
abstract RegExpTerm getRegExpTerm();
44+
}
45+
46+
/**
47+
* A regular expression literal, viewed as the pattern source for itself.
48+
*/
49+
private class RegExpLiteralPatternSource extends RegExpPatternSource {
50+
private AST::RegExpLiteral astNode;
51+
52+
RegExpLiteralPatternSource() { astNode = this.asExpr().getExpr() }
53+
54+
override DataFlow::Node getAParse() { result = this }
55+
56+
override RegExpTerm getRegExpTerm() { result = astNode.getParsed() }
57+
}
58+
59+
/**
60+
* A node whose string value may flow to a position where it is interpreted
61+
* as a part of a regular expression.
62+
*/
63+
private class StringRegExpPatternSource extends RegExpPatternSource {
64+
private DataFlow::Node parse;
65+
66+
StringRegExpPatternSource() { this = regExpSource(parse) }
67+
68+
override DataFlow::Node getAParse() { result = parse }
69+
70+
override RegExpTerm getRegExpTerm() { result.getRegExp() = this.asExpr().getExpr() }
71+
}
72+
73+
private class RegExpLiteralRegExp extends RegExp, AST::RegExpLiteral {
74+
override predicate isDotAll() { this.hasMultilineFlag() }
75+
76+
override predicate isIgnoreCase() { this.hasCaseInsensitiveFlag() }
77+
78+
override string getFlags() { result = this.getFlagString() }
79+
}
80+
81+
private class ParsedStringRegExp extends RegExp {
82+
private DataFlow::Node parse;
83+
84+
ParsedStringRegExp() { this = regExpSource(parse).asExpr().getExpr() }
85+
86+
DataFlow::Node getAParse() { result = parse }
87+
88+
override predicate isDotAll() { none() }
89+
90+
override predicate isIgnoreCase() { none() }
91+
92+
override string getFlags() { none() }
93+
}
94+
95+
/**
96+
* Holds if `source` may be interpreted as a regular expression.
97+
*/
98+
private predicate isInterpretedAsRegExp(DataFlow::Node source) {
99+
// The first argument to an invocation of `Regexp.new` or `Regexp.compile`.
100+
source = API::getTopLevelMember("Regexp").getAMethodCall(["compile", "new"]).getArgument(0)
101+
or
102+
// The argument of a call that coerces the argument to a regular expression.
103+
exists(DataFlow::CallNode mce |
104+
mce.getMethodName() = ["match", "match?"] and
105+
source = mce.getArgument(0) and
106+
// exclude https://ruby-doc.org/core-2.4.0/Regexp.html#method-i-match
107+
not mce.getReceiver().asExpr().getExpr() instanceof AST::RegExpLiteral
108+
)
109+
}
110+
111+
private class RegExpConfiguration extends Configuration {
112+
RegExpConfiguration() { this = "RegExpConfiguration" }
113+
114+
override predicate isSource(DataFlow::Node source) {
115+
source.asExpr() =
116+
any(ExprCfgNode e |
117+
e.getConstantValue().isString(_) and
118+
not e instanceof ExprNodes::VariableReadAccessCfgNode and
119+
not e instanceof ExprNodes::ConstantReadAccessCfgNode
120+
)
121+
}
122+
123+
override predicate isSink(DataFlow::Node sink) { isInterpretedAsRegExp(sink) }
124+
125+
override predicate isSanitizer(DataFlow::Node node) {
126+
// stop flow if `node` is receiver of
127+
// https://ruby-doc.org/core-2.4.0/String.html#method-i-match
128+
exists(DataFlow::CallNode mce |
129+
mce.getMethodName() = ["match", "match?"] and
130+
node = mce.getReceiver() and
131+
mce.getArgument(0).asExpr().getExpr() instanceof AST::RegExpLiteral
132+
)
133+
}
134+
}
135+
136+
/**
137+
* Gets a node whose value may flow (inter-procedurally) to `re`, where it is interpreted
138+
* as a part of a regular expression.
139+
*/
140+
cached
141+
DataFlow::Node regExpSource(DataFlow::Node re) {
142+
exists(RegExpConfiguration c | c.hasFlow(result, re))
143+
}

ruby/ql/lib/codeql/ruby/ast/Literal.qll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
private import codeql.ruby.AST
2-
private import codeql.ruby.security.performance.RegExpTreeView as RETV
2+
private import codeql.ruby.Regexp as RE
33
private import internal.AST
44
private import internal.Constant
55
private import internal.Literal
@@ -594,7 +594,7 @@ class RegExpLiteral extends StringlikeLiteral, TRegExpLiteral {
594594
final predicate hasFreeSpacingFlag() { this.getFlagString().charAt(_) = "x" }
595595

596596
/** Returns the root node of the parse tree of this regular expression. */
597-
final RETV::RegExpTerm getParsed() { result = RETV::getParsedRegExp(this) }
597+
final RE::RegExpTerm getParsed() { result = RE::getParsedRegExp(this) }
598598
}
599599

600600
/**

ruby/ql/lib/codeql/ruby/printAst.qll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
*/
88

99
private import AST
10-
private import codeql.ruby.security.performance.RegExpTreeView as RETV
10+
private import codeql.ruby.Regexp as RE
1111
private import codeql.ruby.ast.internal.Synthesis
1212

1313
/**
@@ -37,7 +37,7 @@ private predicate shouldPrintAstEdge(AstNode parent, string edgeName, AstNode ch
3737

3838
newtype TPrintNode =
3939
TPrintRegularAstNode(AstNode n) { shouldPrintNode(n) } or
40-
TPrintRegExpNode(RETV::RegExpTerm term) {
40+
TPrintRegExpNode(RE::RegExpTerm term) {
4141
exists(RegExpLiteral literal |
4242
shouldPrintNode(literal) and
4343
term.getRootTerm() = literal.getParsed()
@@ -107,7 +107,7 @@ class PrintRegularAstNode extends PrintAstNode, TPrintRegularAstNode {
107107
or
108108
// If this AST node is a regexp literal, add the parsed regexp tree as a
109109
// child.
110-
exists(RETV::RegExpTerm t | t = astNode.(RegExpLiteral).getParsed() |
110+
exists(RE::RegExpTerm t | t = astNode.(RegExpLiteral).getParsed() |
111111
result = TPrintRegExpNode(t) and edgeName = "getParsed"
112112
)
113113
}
@@ -134,7 +134,7 @@ class PrintRegularAstNode extends PrintAstNode, TPrintRegularAstNode {
134134

135135
/** A parsed regexp node in the output tree. */
136136
class PrintRegExpNode extends PrintAstNode, TPrintRegExpNode {
137-
RETV::RegExpTerm regexNode;
137+
RE::RegExpTerm regexNode;
138138

139139
PrintRegExpNode() { this = TPrintRegExpNode(regexNode) }
140140

@@ -147,7 +147,7 @@ class PrintRegExpNode extends PrintAstNode, TPrintRegExpNode {
147147
exists(int i | result = TPrintRegExpNode(regexNode.getChild(i)) and edgeName = i.toString())
148148
}
149149

150-
override int getOrder() { exists(RETV::RegExpTerm p | p.getChild(result) = regexNode) }
150+
override int getOrder() { exists(RE::RegExpTerm p | p.getChild(result) = regexNode) }
151151

152152
override predicate hasLocationInfo(
153153
string filepath, int startline, int startcolumn, int endline, int endcolumn

0 commit comments

Comments
 (0)