Skip to content

Commit af1d949

Browse files
authored
Merge pull request #8489 from aibaars/regex-refactor
Ruby: refactor regex libraries
2 parents c98d024 + accdd94 commit af1d949

File tree

16 files changed

+1316
-879
lines changed

16 files changed

+1316
-879
lines changed

python/ql/lib/semmle/python/RegexTreeView.qll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -552,7 +552,7 @@ class RegExpWordBoundary extends RegExpSpecialChar {
552552

553553
/**
554554
* A character class escape in a regular expression.
555-
* That is, an escaped charachter that denotes multiple characters.
555+
* That is, an escaped character that denotes multiple characters.
556556
*
557557
* Examples:
558558
*

python/ql/lib/semmle/python/regex.qll

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ abstract class RegexString extends Expr {
188188
)
189189
}
190190

191-
/** Hold is a character set starts between `start` and `end`. */
191+
/** Holds if a character set starts between `start` and `end`. */
192192
predicate char_set_start(int start, int end) {
193193
this.char_set_start(start) = true and
194194
(
@@ -316,8 +316,10 @@ abstract class RegexString extends Expr {
316316
result = this.(Bytes).getS()
317317
}
318318

319+
/** Gets the `i`th character of this regex */
319320
string getChar(int i) { result = this.getText().charAt(i) }
320321

322+
/** Gets the `i`th character of this regex, unless it is part of a character escape sequence. */
321323
string nonEscapedCharAt(int i) {
322324
result = this.getText().charAt(i) and
323325
not exists(int x, int y | this.escapedCharacter(x, y) and i in [x .. y - 1])
@@ -329,6 +331,9 @@ abstract class RegexString extends Expr {
329331

330332
private predicate isGroupStart(int i) { this.nonEscapedCharAt(i) = "(" and not this.inCharSet(i) }
331333

334+
/**
335+
* Holds if the `i`th character could not be parsed.
336+
*/
332337
predicate failedToParse(int i) {
333338
exists(this.getChar(i)) and
334339
not exists(int start, int end |
@@ -417,6 +422,9 @@ abstract class RegexString extends Expr {
417422
)
418423
}
419424

425+
/**
426+
* Holds if a simple or escaped character is found between `start` and `end`.
427+
*/
420428
predicate character(int start, int end) {
421429
(
422430
this.simpleCharacter(start, end) and
@@ -428,12 +436,18 @@ abstract class RegexString extends Expr {
428436
not exists(int x, int y | this.backreference(x, y) and x <= start and y >= end)
429437
}
430438

439+
/**
440+
* Holds if a normal character is found between `start` and `end`.
441+
*/
431442
predicate normalCharacter(int start, int end) {
432443
end = start + 1 and
433444
this.character(start, end) and
434445
not this.specialCharacter(start, end, _)
435446
}
436447

448+
/**
449+
* Holds if a special character is found between `start` and `end`.
450+
*/
437451
predicate specialCharacter(int start, int end, string char) {
438452
not this.inCharSet(start) and
439453
this.character(start, end) and
@@ -492,7 +506,7 @@ abstract class RegexString extends Expr {
492506
this.specialCharacter(start, end, _)
493507
}
494508

495-
/** Whether the text in the range start,end is a group */
509+
/** Whether the text in the range `start,end` is a group */
496510
predicate group(int start, int end) {
497511
this.groupContents(start, end, _, _)
498512
or
@@ -611,19 +625,26 @@ abstract class RegexString extends Expr {
611625
this.simple_group_start(start, end)
612626
}
613627

628+
/** Matches the start of a non-capturing group, e.g. `(?:` */
614629
private predicate non_capturing_group_start(int start, int end) {
615630
this.isGroupStart(start) and
616631
this.getChar(start + 1) = "?" and
617632
this.getChar(start + 2) = ":" and
618633
end = start + 3
619634
}
620635

636+
/** Matches the start of a simple group, e.g. `(a+)`. */
621637
private predicate simple_group_start(int start, int end) {
622638
this.isGroupStart(start) and
623639
this.getChar(start + 1) != "?" and
624640
end = start + 1
625641
}
626642

643+
/**
644+
* Matches the start of a named group, such as:
645+
* - `(?<name>\w+)`
646+
* - `(?'name'\w+)`
647+
*/
627648
private predicate named_group_start(int start, int end) {
628649
this.isGroupStart(start) and
629650
this.getChar(start + 1) = "?" and
@@ -675,20 +696,23 @@ abstract class RegexString extends Expr {
675696
)
676697
}
677698

699+
/** Matches the start of a positive lookahead assertion, i.e. `(?=`. */
678700
private predicate lookahead_assertion_start(int start, int end) {
679701
this.isGroupStart(start) and
680702
this.getChar(start + 1) = "?" and
681703
this.getChar(start + 2) = "=" and
682704
end = start + 3
683705
}
684706

707+
/** Matches the start of a negative lookahead assertion, i.e. `(?!`. */
685708
private predicate negative_lookahead_assertion_start(int start, int end) {
686709
this.isGroupStart(start) and
687710
this.getChar(start + 1) = "?" and
688711
this.getChar(start + 2) = "!" and
689712
end = start + 3
690713
}
691714

715+
/** Matches the start of a positive lookbehind assertion, i.e. `(?<=`. */
692716
private predicate lookbehind_assertion_start(int start, int end) {
693717
this.isGroupStart(start) and
694718
this.getChar(start + 1) = "?" and
@@ -697,6 +721,7 @@ abstract class RegexString extends Expr {
697721
end = start + 4
698722
}
699723

724+
/** Matches the start of a negative lookbehind assertion, i.e. `(?<!`. */
700725
private predicate negative_lookbehind_assertion_start(int start, int end) {
701726
this.isGroupStart(start) and
702727
this.getChar(start + 1) = "?" and
@@ -705,26 +730,30 @@ abstract class RegexString extends Expr {
705730
end = start + 4
706731
}
707732

733+
/** Matches the start of a comment group, i.e. `(?#`. */
708734
private predicate comment_group_start(int start, int end) {
709735
this.isGroupStart(start) and
710736
this.getChar(start + 1) = "?" and
711737
this.getChar(start + 2) = "#" and
712738
end = start + 3
713739
}
714740

741+
/** Matches the contents of a group. */
715742
predicate groupContents(int start, int end, int in_start, int in_end) {
716743
this.group_start(start, in_start) and
717744
end = in_end + 1 and
718745
this.top_level(in_start, in_end) and
719746
this.isGroupEnd(in_end)
720747
}
721748

749+
/** Matches a named backreference, e.g. `\k<foo>`. */
722750
private predicate named_backreference(int start, int end, string name) {
723751
this.named_backreference_start(start, start + 4) and
724752
end = min(int i | i > start + 4 and this.getChar(i) = ")") + 1 and
725753
name = this.getText().substring(start + 4, end - 2)
726754
}
727755

756+
/** Matches a numbered backreference, e.g. `\1`. */
728757
private predicate numbered_backreference(int start, int end, int value) {
729758
this.escapingChar(start) and
730759
// starting with 0 makes it an octal escape
@@ -749,7 +778,7 @@ abstract class RegexString extends Expr {
749778
)
750779
}
751780

752-
/** Whether the text in the range start,end is a back reference */
781+
/** Whether the text in the range `start,end` is a back reference */
753782
predicate backreference(int start, int end) {
754783
this.numbered_backreference(start, end, _)
755784
or

ruby/ql/consistency-queries/RegExpConsistency.ql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import codeql.ruby.security.performance.RegExpTreeView
1+
import codeql.ruby.Regexp
22

33
query predicate nonUniqueChild(RegExpParent parent, int i, RegExpTerm child) {
44
child = parent.getChild(i) and
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
* The `ParseRegExp` and `RegExpTreeView` modules are now "internal" modules. Users should use `codeql.ruby.Regexp` instead.

ruby/ql/lib/codeql/ruby/Regexp.qll

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
/**
2+
* Provides classes for working with regular expressions.
3+
*
4+
* Regular expression literals are represented as an abstract syntax tree of regular expression
5+
* terms.
6+
*/
7+
8+
import regexp.RegExpTreeView // re-export
9+
private import regexp.internal.ParseRegExp
10+
private import codeql.ruby.ast.Literal as AST
11+
private import codeql.ruby.DataFlow
12+
private import codeql.ruby.controlflow.CfgNodes
13+
private import codeql.ruby.ApiGraphs
14+
private import codeql.ruby.dataflow.internal.tainttrackingforlibraries.TaintTrackingImpl
15+
16+
/**
17+
* Provides utility predicates related to regular expressions.
18+
*/
19+
module RegExpPatterns {
20+
/**
21+
* Gets a pattern that matches common top-level domain names in lower case.
22+
*/
23+
string getACommonTld() {
24+
// according to ranking by http://google.com/search?q=site:.<<TLD>>
25+
result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])"
26+
}
27+
}
28+
29+
/**
30+
* A node whose value may flow to a position where it is interpreted
31+
* as a part of a regular expression.
32+
*/
33+
abstract class RegExpPatternSource extends DataFlow::Node {
34+
/**
35+
* Gets a node where the pattern of this node is parsed as a part of
36+
* a regular expression.
37+
*/
38+
abstract DataFlow::Node getAParse();
39+
40+
/**
41+
* Gets the root term of the regular expression parsed from this pattern.
42+
*/
43+
abstract RegExpTerm getRegExpTerm();
44+
}
45+
46+
/**
47+
* A regular expression literal, viewed as the pattern source for itself.
48+
*/
49+
private class RegExpLiteralPatternSource extends RegExpPatternSource {
50+
private AST::RegExpLiteral astNode;
51+
52+
RegExpLiteralPatternSource() { astNode = this.asExpr().getExpr() }
53+
54+
override DataFlow::Node getAParse() { result = this }
55+
56+
override RegExpTerm getRegExpTerm() { result = astNode.getParsed() }
57+
}
58+
59+
/**
60+
* A node whose string value may flow to a position where it is interpreted
61+
* as a part of a regular expression.
62+
*/
63+
private class StringRegExpPatternSource extends RegExpPatternSource {
64+
private DataFlow::Node parse;
65+
66+
StringRegExpPatternSource() { this = regExpSource(parse) }
67+
68+
override DataFlow::Node getAParse() { result = parse }
69+
70+
override RegExpTerm getRegExpTerm() { result.getRegExp() = this.asExpr().getExpr() }
71+
}
72+
73+
private class RegExpLiteralRegExp extends RegExp, AST::RegExpLiteral {
74+
override predicate isDotAll() { this.hasMultilineFlag() }
75+
76+
override predicate isIgnoreCase() { this.hasCaseInsensitiveFlag() }
77+
78+
override string getFlags() { result = this.getFlagString() }
79+
}
80+
81+
private class ParsedStringRegExp extends RegExp {
82+
private DataFlow::Node parse;
83+
84+
ParsedStringRegExp() { this = regExpSource(parse).asExpr().getExpr() }
85+
86+
DataFlow::Node getAParse() { result = parse }
87+
88+
override predicate isDotAll() { none() }
89+
90+
override predicate isIgnoreCase() { none() }
91+
92+
override string getFlags() { none() }
93+
}
94+
95+
/**
96+
* Holds if `source` may be interpreted as a regular expression.
97+
*/
98+
private predicate isInterpretedAsRegExp(DataFlow::Node source) {
99+
// The first argument to an invocation of `Regexp.new` or `Regexp.compile`.
100+
source = API::getTopLevelMember("Regexp").getAMethodCall(["compile", "new"]).getArgument(0)
101+
or
102+
// The argument of a call that coerces the argument to a regular expression.
103+
exists(DataFlow::CallNode mce |
104+
mce.getMethodName() = ["match", "match?"] and
105+
source = mce.getArgument(0) and
106+
// exclude https://ruby-doc.org/core-2.4.0/Regexp.html#method-i-match
107+
not mce.getReceiver().asExpr().getExpr() instanceof AST::RegExpLiteral
108+
)
109+
}
110+
111+
private class RegExpConfiguration extends Configuration {
112+
RegExpConfiguration() { this = "RegExpConfiguration" }
113+
114+
override predicate isSource(DataFlow::Node source) {
115+
source.asExpr() =
116+
any(ExprCfgNode e |
117+
e.getConstantValue().isString(_) and
118+
not e instanceof ExprNodes::VariableReadAccessCfgNode and
119+
not e instanceof ExprNodes::ConstantReadAccessCfgNode
120+
)
121+
}
122+
123+
override predicate isSink(DataFlow::Node sink) { isInterpretedAsRegExp(sink) }
124+
125+
override predicate isSanitizer(DataFlow::Node node) {
126+
// stop flow if `node` is receiver of
127+
// https://ruby-doc.org/core-2.4.0/String.html#method-i-match
128+
exists(DataFlow::CallNode mce |
129+
mce.getMethodName() = ["match", "match?"] and
130+
node = mce.getReceiver() and
131+
mce.getArgument(0).asExpr().getExpr() instanceof AST::RegExpLiteral
132+
)
133+
}
134+
}
135+
136+
/**
137+
* Gets a node whose value may flow (inter-procedurally) to `re`, where it is interpreted
138+
* as a part of a regular expression.
139+
*/
140+
cached
141+
DataFlow::Node regExpSource(DataFlow::Node re) {
142+
exists(RegExpConfiguration c | c.hasFlow(result, re))
143+
}

ruby/ql/lib/codeql/ruby/ast/Literal.qll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
private import codeql.ruby.AST
2-
private import codeql.ruby.security.performance.RegExpTreeView as RETV
2+
private import codeql.ruby.Regexp as RE
33
private import internal.AST
44
private import internal.Constant
55
private import internal.Literal
@@ -393,7 +393,7 @@ class RegExpLiteral extends StringlikeLiteral instanceof RegExpLiteralImpl {
393393
final predicate hasFreeSpacingFlag() { this.getFlagString().charAt(_) = "x" }
394394

395395
/** Returns the root node of the parse tree of this regular expression. */
396-
final RETV::RegExpTerm getParsed() { result = RETV::getParsedRegExp(this) }
396+
final RE::RegExpTerm getParsed() { result = RE::getParsedRegExp(this) }
397397
}
398398

399399
/**

ruby/ql/lib/codeql/ruby/printAst.qll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
*/
88

99
private import AST
10-
private import codeql.ruby.security.performance.RegExpTreeView as RETV
10+
private import codeql.ruby.Regexp as RE
1111
private import codeql.ruby.ast.internal.Synthesis
1212

1313
/**
@@ -37,7 +37,7 @@ private predicate shouldPrintAstEdge(AstNode parent, string edgeName, AstNode ch
3737

3838
newtype TPrintNode =
3939
TPrintRegularAstNode(AstNode n) { shouldPrintNode(n) } or
40-
TPrintRegExpNode(RETV::RegExpTerm term) {
40+
TPrintRegExpNode(RE::RegExpTerm term) {
4141
exists(RegExpLiteral literal |
4242
shouldPrintNode(literal) and
4343
term.getRootTerm() = literal.getParsed()
@@ -107,7 +107,7 @@ class PrintRegularAstNode extends PrintAstNode, TPrintRegularAstNode {
107107
or
108108
// If this AST node is a regexp literal, add the parsed regexp tree as a
109109
// child.
110-
exists(RETV::RegExpTerm t | t = astNode.(RegExpLiteral).getParsed() |
110+
exists(RE::RegExpTerm t | t = astNode.(RegExpLiteral).getParsed() |
111111
result = TPrintRegExpNode(t) and edgeName = "getParsed"
112112
)
113113
}
@@ -134,7 +134,7 @@ class PrintRegularAstNode extends PrintAstNode, TPrintRegularAstNode {
134134

135135
/** A parsed regexp node in the output tree. */
136136
class PrintRegExpNode extends PrintAstNode, TPrintRegExpNode {
137-
RETV::RegExpTerm regexNode;
137+
RE::RegExpTerm regexNode;
138138

139139
PrintRegExpNode() { this = TPrintRegExpNode(regexNode) }
140140

@@ -147,7 +147,7 @@ class PrintRegExpNode extends PrintAstNode, TPrintRegExpNode {
147147
exists(int i | result = TPrintRegExpNode(regexNode.getChild(i)) and edgeName = i.toString())
148148
}
149149

150-
override int getOrder() { exists(RETV::RegExpTerm p | p.getChild(result) = regexNode) }
150+
override int getOrder() { exists(RE::RegExpTerm p | p.getChild(result) = regexNode) }
151151

152152
override predicate hasLocationInfo(
153153
string filepath, int startline, int startcolumn, int endline, int endcolumn

0 commit comments

Comments
 (0)