Skip to content

Commit 5ce6b84

Browse files
authored
Merge pull request #8166 from aibaars/regex-char-sequence-1
Ruby/Python: regex parser: group sequences of 'normal' characters
2 parents d3e3603 + 0c23f58 commit 5ce6b84

File tree

12 files changed

+188
-231
lines changed

12 files changed

+188
-231
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
* The regular expression parser now groups sequences of normal characters. This reduces the number of instances of `RegExpNormalChar`.

python/ql/lib/semmle/python/RegexTreeView.qll

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,12 @@ newtype TRegExpParent =
3939
/** A special character */
4040
TRegExpSpecialChar(Regex re, int start, int end) { re.specialCharacter(start, end, _) } or
4141
/** A normal character */
42-
TRegExpNormalChar(Regex re, int start, int end) { re.normalCharacter(start, end) } or
42+
TRegExpNormalChar(Regex re, int start, int end) {
43+
re.normalCharacterSequence(start, end)
44+
or
45+
re.escapedCharacter(start, end) and
46+
not re.specialCharacter(start, end, _)
47+
} or
4348
/** A back reference */
4449
TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) }
4550

python/ql/lib/semmle/python/regex.qll

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,7 @@ abstract class RegexString extends Expr {
427427
}
428428

429429
predicate normalCharacter(int start, int end) {
430+
end = start + 1 and
430431
this.character(start, end) and
431432
not this.specialCharacter(start, end, _)
432433
}
@@ -446,6 +447,49 @@ abstract class RegexString extends Expr {
446447
)
447448
}
448449

450+
/**
451+
* Holds if the range [start:end) consists of only 'normal' characters.
452+
*/
453+
predicate normalCharacterSequence(int start, int end) {
454+
// a normal character inside a character set is interpreted on its own
455+
this.normalCharacter(start, end) and
456+
this.inCharSet(start)
457+
or
458+
// a maximal run of normal characters is considered as one constant
459+
exists(int s, int e |
460+
e = max(int i | this.normalCharacterRun(s, i)) and
461+
not this.inCharSet(s)
462+
|
463+
// 'abc' can be considered one constant, but
464+
// 'abc+' has to be broken up into 'ab' and 'c+',
465+
// as the qualifier only applies to 'c'.
466+
if this.qualifier(e, _, _, _)
467+
then
468+
end = e and start = e - 1
469+
or
470+
end = e - 1 and start = s and start < end
471+
else (
472+
end = e and
473+
start = s
474+
)
475+
)
476+
}
477+
478+
private predicate normalCharacterRun(int start, int end) {
479+
(
480+
this.normalCharacterRun(start, end - 1)
481+
or
482+
start = end - 1 and not this.normalCharacter(start - 1, start)
483+
) and
484+
this.normalCharacter(end - 1, end)
485+
}
486+
487+
private predicate characterItem(int start, int end) {
488+
this.normalCharacterSequence(start, end) or
489+
this.escapedCharacter(start, end) or
490+
this.specialCharacter(start, end, _)
491+
}
492+
449493
/** Whether the text in the range start,end is a group */
450494
predicate group(int start, int end) {
451495
this.groupContents(start, end, _, _)
@@ -717,7 +761,7 @@ abstract class RegexString extends Expr {
717761
string getBackrefName(int start, int end) { this.named_backreference(start, end, result) }
718762

719763
private predicate baseItem(int start, int end) {
720-
this.character(start, end) and
764+
this.characterItem(start, end) and
721765
not exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
722766
or
723767
this.group(start, end)
@@ -837,14 +881,14 @@ abstract class RegexString extends Expr {
837881
}
838882

839883
private predicate item_start(int start) {
840-
this.character(start, _) or
884+
this.characterItem(start, _) or
841885
this.isGroupStart(start) or
842886
this.charSet(start, _) or
843887
this.backreference(start, _)
844888
}
845889

846890
private predicate item_end(int end) {
847-
this.character(_, end)
891+
this.characterItem(_, end)
848892
or
849893
exists(int endm1 | this.isGroupEnd(endm1) and end = endm1 + 1)
850894
or
@@ -953,7 +997,7 @@ abstract class RegexString extends Expr {
953997
*/
954998
predicate firstItem(int start, int end) {
955999
(
956-
this.character(start, end)
1000+
this.characterItem(start, end)
9571001
or
9581002
this.qualifiedItem(start, end, _, _)
9591003
or
@@ -968,7 +1012,7 @@ abstract class RegexString extends Expr {
9681012
*/
9691013
predicate lastItem(int start, int end) {
9701014
(
971-
this.character(start, end)
1015+
this.characterItem(start, end)
9721016
or
9731017
this.qualifiedItem(start, end, _, _)
9741018
or

python/ql/test/library-tests/regex/FirstLast.expected

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
| 012345678 | first | 0 | 1 |
2-
| 012345678 | last | 8 | 9 |
3-
| (?!not-this)^[A-Z_]+$ | first | 3 | 4 |
1+
| 012345678 | first | 0 | 9 |
2+
| 012345678 | last | 0 | 9 |
3+
| (?!not-this)^[A-Z_]+$ | first | 3 | 11 |
44
| (?!not-this)^[A-Z_]+$ | first | 12 | 13 |
55
| (?!not-this)^[A-Z_]+$ | first | 13 | 19 |
66
| (?!not-this)^[A-Z_]+$ | first | 13 | 20 |
@@ -27,9 +27,9 @@
2727
| (?m)^(?!$) | last | 4 | 5 |
2828
| (?m)^(?!$) | last | 8 | 9 |
2929
| (\\033\|~{) | first | 1 | 5 |
30-
| (\\033\|~{) | first | 6 | 7 |
30+
| (\\033\|~{) | first | 6 | 8 |
3131
| (\\033\|~{) | last | 1 | 5 |
32-
| (\\033\|~{) | last | 7 | 8 |
32+
| (\\033\|~{) | last | 6 | 8 |
3333
| [\ufffd-\ufffd] | first | 0 | 5 |
3434
| [\ufffd-\ufffd] | last | 0 | 5 |
3535
| [\ufffd-\ufffd][\ufffd-\ufffd] | first | 0 | 5 |
@@ -52,8 +52,8 @@
5252
| \\A[+-]?\\d+ | last | 7 | 9 |
5353
| \\A[+-]?\\d+ | last | 7 | 10 |
5454
| \\Afoo\\Z | first | 0 | 2 |
55-
| \\Afoo\\Z | first | 2 | 3 |
56-
| \\Afoo\\Z | last | 4 | 5 |
55+
| \\Afoo\\Z | first | 2 | 5 |
56+
| \\Afoo\\Z | last | 2 | 5 |
5757
| \\Afoo\\Z | last | 5 | 7 |
5858
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | first | 0 | 2 |
5959
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | last | 28 | 32 |
@@ -86,30 +86,30 @@
8686
| ^[A-Z_]+$(?<!not-this) | last | 1 | 7 |
8787
| ^[A-Z_]+$(?<!not-this) | last | 1 | 8 |
8888
| ^[A-Z_]+$(?<!not-this) | last | 8 | 9 |
89-
| ^[A-Z_]+$(?<!not-this) | last | 20 | 21 |
89+
| ^[A-Z_]+$(?<!not-this) | last | 13 | 21 |
9090
| ax{01,3} | first | 0 | 1 |
9191
| ax{01,3} | last | 1 | 2 |
9292
| ax{01,3} | last | 1 | 8 |
93-
| ax{01,3} | last | 7 | 8 |
93+
| ax{01,3} | last | 3 | 8 |
9494
| ax{3,} | first | 0 | 1 |
9595
| ax{3,} | last | 1 | 2 |
9696
| ax{3,} | last | 1 | 6 |
97-
| ax{3,} | last | 5 | 6 |
97+
| ax{3,} | last | 3 | 6 |
9898
| ax{3} | first | 0 | 1 |
9999
| ax{3} | last | 1 | 2 |
100100
| ax{3} | last | 1 | 5 |
101-
| ax{3} | last | 4 | 5 |
101+
| ax{3} | last | 3 | 5 |
102102
| ax{,3} | first | 0 | 1 |
103103
| ax{,3} | last | 0 | 1 |
104104
| ax{,3} | last | 1 | 2 |
105105
| ax{,3} | last | 1 | 6 |
106-
| ax{,3} | last | 5 | 6 |
106+
| ax{,3} | last | 3 | 6 |
107107
| x\| | first | 0 | 1 |
108108
| x\| | last | 0 | 1 |
109109
| x\|(?<!\\w)l | first | 0 | 1 |
110110
| x\|(?<!\\w)l | first | 6 | 8 |
111111
| x\|(?<!\\w)l | first | 9 | 10 |
112112
| x\|(?<!\\w)l | last | 0 | 1 |
113113
| x\|(?<!\\w)l | last | 9 | 10 |
114-
| x{Not qual} | first | 0 | 1 |
115-
| x{Not qual} | last | 10 | 11 |
114+
| x{Not qual} | first | 0 | 11 |
115+
| x{Not qual} | last | 0 | 11 |

python/ql/test/library-tests/regex/Regex.ql

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ predicate part(Regex r, int start, int end, string kind) {
66
or
77
r.normalCharacter(start, end) and kind = "char"
88
or
9+
r.escapedCharacter(start, end) and
10+
kind = "char" and
11+
not r.specialCharacter(start, end, _)
12+
or
913
r.specialCharacter(start, end, kind)
1014
or
1115
r.sequence(start, end) and kind = "sequence"

python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@
5959
| redos.py:220:25:220:29 | [^X]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'W'. |
6060
| redos.py:223:30:223:30 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'W' and containing many repetitions of 'bW'. |
6161
| redos.py:229:30:229:30 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'W' and containing many repetitions of 'bW'. |
62-
| redos.py:241:27:241:27 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'a' and containing many repetitions of 'ba'. |
62+
| redos.py:241:26:241:27 | ab | This part of the regular expression may cause exponential backtracking on strings starting with 'a' and containing many repetitions of 'ab'. |
6363
| redos.py:247:25:247:31 | [\\n\\s]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
6464
| redos.py:256:25:256:27 | \\w* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. |
6565
| redos.py:256:37:256:39 | \\w* | This part of the regular expression may cause exponential backtracking on strings starting with 'foobarbaz' and containing many repetitions of 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. |
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
* The regular expression parser now groups sequences of normal characters. This reduces the number of instances of `RegExpNormalChar`.

ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,7 @@ class RegExp extends AST::RegExpLiteral {
382382
}
383383

384384
predicate normalCharacter(int start, int end) {
385+
end = start + 1 and
385386
this.character(start, end) and
386387
not this.specialCharacter(start, end, _)
387388
}
@@ -401,6 +402,49 @@ class RegExp extends AST::RegExpLiteral {
401402
)
402403
}
403404

405+
/**
406+
* Holds if the range [start:end) consists of only 'normal' characters.
407+
*/
408+
predicate normalCharacterSequence(int start, int end) {
409+
// a normal character inside a character set is interpreted on its own
410+
this.normalCharacter(start, end) and
411+
this.inCharSet(start)
412+
or
413+
// a maximal run of normal characters is considered as one constant
414+
exists(int s, int e |
415+
e = max(int i | this.normalCharacterRun(s, i)) and
416+
not this.inCharSet(s)
417+
|
418+
// 'abc' can be considered one constant, but
419+
// 'abc+' has to be broken up into 'ab' and 'c+',
420+
// as the qualifier only applies to 'c'.
421+
if this.qualifier(e, _, _, _)
422+
then
423+
end = e and start = e - 1
424+
or
425+
end = e - 1 and start = s and start < end
426+
else (
427+
end = e and
428+
start = s
429+
)
430+
)
431+
}
432+
433+
private predicate normalCharacterRun(int start, int end) {
434+
(
435+
this.normalCharacterRun(start, end - 1)
436+
or
437+
start = end - 1 and not this.normalCharacter(start - 1, start)
438+
) and
439+
this.normalCharacter(end - 1, end)
440+
}
441+
442+
private predicate characterItem(int start, int end) {
443+
this.normalCharacterSequence(start, end) or
444+
this.escapedCharacter(start, end) or
445+
this.specialCharacter(start, end, _)
446+
}
447+
404448
/** Whether the text in the range `start,end` is a group */
405449
predicate group(int start, int end) {
406450
this.groupContents(start, end, _, _)
@@ -639,7 +683,7 @@ class RegExp extends AST::RegExpLiteral {
639683
string getBackRefName(int start, int end) { this.namedBackreference(start, end, result) }
640684

641685
private predicate baseItem(int start, int end) {
642-
this.character(start, end) and
686+
this.characterItem(start, end) and
643687
not exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
644688
or
645689
this.group(start, end)
@@ -746,15 +790,15 @@ class RegExp extends AST::RegExpLiteral {
746790
}
747791

748792
private predicate itemStart(int start) {
749-
this.character(start, _) or
793+
this.characterItem(start, _) or
750794
this.isGroupStart(start) or
751795
this.charSet(start, _) or
752796
this.backreference(start, _) or
753797
this.namedCharacterProperty(start, _, _)
754798
}
755799

756800
private predicate itemEnd(int end) {
757-
this.character(_, end)
801+
this.characterItem(_, end)
758802
or
759803
exists(int endm1 | this.isGroupEnd(endm1) and end = endm1 + 1)
760804
or
@@ -865,7 +909,7 @@ class RegExp extends AST::RegExpLiteral {
865909
*/
866910
predicate firstItem(int start, int end) {
867911
(
868-
this.character(start, end)
912+
this.characterItem(start, end)
869913
or
870914
this.qualifiedItem(start, end, _, _)
871915
or
@@ -880,7 +924,7 @@ class RegExp extends AST::RegExpLiteral {
880924
*/
881925
predicate lastItem(int start, int end) {
882926
(
883-
this.character(start, end)
927+
this.characterItem(start, end)
884928
or
885929
this.qualifiedItem(start, end, _, _)
886930
or

ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,12 @@ newtype TRegExpParent =
228228
TRegExpCharacterRange(RegExp re, int start, int end) { re.charRange(_, start, _, _, end) } or
229229
TRegExpGroup(RegExp re, int start, int end) { re.group(start, end) } or
230230
TRegExpSpecialChar(RegExp re, int start, int end) { re.specialCharacter(start, end, _) } or
231-
TRegExpNormalChar(RegExp re, int start, int end) { re.normalCharacter(start, end) } or
231+
TRegExpNormalChar(RegExp re, int start, int end) {
232+
re.normalCharacterSequence(start, end)
233+
or
234+
re.escapedCharacter(start, end) and
235+
not re.specialCharacter(start, end, _)
236+
} or
232237
TRegExpBackRef(RegExp re, int start, int end) { re.backreference(start, end) } or
233238
TRegExpNamedCharacterProperty(RegExp re, int start, int end) {
234239
re.namedCharacterProperty(start, end, _)

0 commit comments

Comments
 (0)