Skip to content

Commit 82ef2a1

Browse files
authored
Merge pull request #8164 from github/nickrolfe/escape_sequences
Ruby: interpret string escape sequences in getConstantValue()
2 parents cd9d61c + 1a85002 commit 82ef2a1

File tree

15 files changed

+2556
-1814
lines changed

15 files changed

+2556
-1814
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
* `getConstantValue()` now returns the contents of strings and symbols after escape sequences have been interpreted. For example, for the Ruby string literal `"\n"`, `getConstantValue().getString()` previously returned a QL string with two characters, a backslash followed by `n`; now it returns the single-character string "\n" (U+000A, known as newline).
5+
* `getConstantValue().getInt()` previously returned incorrect values for integers larger than 2<sup>31</sup>-1 (the largest value that can be represented by the QL `int` type). It now returns no result in those cases.

ruby/ql/lib/codeql/NumberUtils.qll

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
/**
2+
* Provides predicates for working with numeric values and their string
3+
* representations.
4+
*/
5+
6+
/**
 * Parses `binary` as a base-2 number and gets its value. `binary` is expected
 * to consist solely of the digits 0 and 1; leading zeros are ignored. There is
 * no result when the value is greater than
 * 01111111111111111111111111111111 (2^31-1, the largest value an `int` can
 * hold).
 *
 * ```
 * "0"       => 0
 * "01"      => 1
 * "1010101" => 85
 * ```
 */
bindingset[binary]
int parseBinaryInt(string binary) {
  exists(string digits, int width |
    digits = stripLeadingZeros(binary) and
    width = digits.length() and
    // At most 31 significant bits are accepted.
    width <= 31
  |
    result =
      sum(int pos, int bit |
        bit = "01".indexOf(digits.charAt(pos))
      |
        // The digit at `pos` is weighted by its distance from the last digit.
        bit * twoToThe(width - 1 - pos)
      ) and
    result >= 0
  )
}
32+
33+
/**
 * Parses `hex` as a base-16 number and gets its value. `hex` is expected to be
 * a valid hexadecimal string (upper- or lower-case letters); leading zeros are
 * ignored. There is no result when the value is greater than 7FFFFFFF (2^31-1,
 * the largest value an `int` can hold).
 *
 * ```
 * "0"    => 0
 * "FF"   => 255
 * "f00d" => 61453
 * ```
 */
bindingset[hex]
int parseHexInt(string hex) {
  exists(string digits, int width |
    digits = stripLeadingZeros(hex) and
    width = digits.length() and
    // At most 8 significant hex digits are accepted.
    width <= 8
  |
    result =
      sum(int pos, string ch |
        ch = digits.charAt(pos)
      |
        // The digit at `pos` is weighted by its distance from the last digit.
        toHex(ch) * sixteenToThe(width - 1 - pos)
      ) and
    // Rejects sums that came out negative.
    result >= 0
  )
}
57+
58+
/**
 * Gets the integer value of `octal` when interpreted as octal. `octal` must be
 * a valid octal string containing only the digits 0-7; leading zeros are
 * ignored. For values greater than 17777777777 (2^31-1, the maximum value that
 * `int` can represent), there is no result.
 *
 * ```
 * "0"           => 0
 * "77"          => 63
 * "76543210"    => 16434824
 * "17777777777" => 2147483647
 * "40000000000" => (no result)
 * ```
 */
bindingset[octal]
int parseOctalInt(string octal) {
  exists(string stripped, int len |
    stripped = stripLeadingZeros(octal) and
    len = stripped.length()
  |
    // Eleven octal digits span 33 bits, so a length check alone is not enough:
    // an 11-digit value only fits in an `int` when its leading digit is 1
    // (`stripped` never starts with 0). Without this, values of 2^32 and above
    // (leading digit 4 or 5) can wrap around to a non-negative number and slip
    // past the `result >= 0` guard below.
    (
      len <= 10
      or
      len = 11 and stripped.charAt(0) = "1"
    ) and
    result >= 0 and
    result =
      sum(int index, string c, int digit |
        c = stripped.charAt(index) and
        digit = "01234567".indexOf(c)
      |
        eightToThe(len - 1 - index) * digit
      )
  )
}
84+
85+
/**
 * Gets the numeric value (0 to 15) of the single hexadecimal digit `hex`.
 * Letters are accepted in either case.
 */
private int toHex(string hex) {
  exists(string lower |
    // The position of a digit in this table is its value.
    lower = "0123456789abcdef".charAt(result) and
    // For 0-9 the upper-case form is the digit itself.
    hex = [lower, lower.toUpperCase()]
  )
}
102+
103+
/**
 * Gets 16 raised to the power `n`. There is a result only when `n` lies in the
 * range 0..7 (inclusive).
 */
int sixteenToThe(int n) {
  // Larger powers of 16 do not fit in an int, so 7 is the upper limit.
  exists(int bits | n = [0 .. 7] and bits = 4 * n | result = 1.bitShiftLeft(bits))
}
111+
112+
/**
 * Gets 8 raised to the power `n`. There is a result only when `n` lies in the
 * range 0..10 (inclusive).
 */
int eightToThe(int n) {
  // Larger powers of 8 do not fit in an int, so 10 is the upper limit.
  exists(int bits | n = [0 .. 10] and bits = 3 * n | result = 1.bitShiftLeft(bits))
}
120+
121+
/**
 * Gets 2 raised to the power `n`. There is a result only when `n` lies in the
 * range 0..30 (inclusive).
 */
int twoToThe(int n) {
  n = [0 .. 30] and
  result = 1.bitShiftLeft(n)
}
126+
127+
/** Gets what remains of `s` once every "0" character at its start has been dropped. */
bindingset[s]
private string stripLeadingZeros(string s) {
  exists(string remainder |
    // The greedy `0*` consumes the leading zeros; group 1 captures what follows.
    remainder = s.regexpCapture("0*(.*)", 1)
  |
    result = remainder
  )
}

ruby/ql/lib/codeql/ruby/ast/Literal.qll

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -230,13 +230,18 @@ class StringTextComponent extends StringComponent, TStringTextComponentNonRegexp
230230

231231
StringTextComponent() { this = TStringTextComponentNonRegexp(g) }
232232

233-
final override string toString() { result = g.getValue() }
233+
final override string toString() { result = this.getRawText() }
234234

235235
final override ConstantValue::ConstantStringValue getConstantValue() {
236-
result.isString(g.getValue())
236+
result.isString(this.getUnescapedText())
237237
}
238238

239239
final override string getAPrimaryQlClass() { result = "StringTextComponent" }
240+
241+
/** Gets the text of this component as it appears in the source code. */
242+
final string getRawText() { result = g.getValue() }
243+
244+
final private string getUnescapedText() { result = unescapeTextComponent(this.getRawText()) }
240245
}
241246

242247
/**
@@ -247,13 +252,18 @@ class StringEscapeSequenceComponent extends StringComponent, TStringEscapeSequen
247252

248253
StringEscapeSequenceComponent() { this = TStringEscapeSequenceComponentNonRegexp(g) }
249254

250-
final override string toString() { result = g.getValue() }
255+
final override string toString() { result = this.getRawText() }
251256

252257
final override ConstantValue::ConstantStringValue getConstantValue() {
253-
result.isString(g.getValue())
258+
result.isString(this.getUnescapedText())
254259
}
255260

256261
final override string getAPrimaryQlClass() { result = "StringEscapeSequenceComponent" }
262+
263+
/** Gets the text of this component as it appears in the source code. */
264+
final string getRawText() { result = g.getValue() }
265+
266+
final private string getUnescapedText() { result = unescapeEscapeSequence(this.getRawText()) }
257267
}
258268

259269
/**

ruby/ql/lib/codeql/ruby/ast/internal/Literal.qll

Lines changed: 82 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,43 +3,23 @@ private import AST
33
private import Constant
44
private import TreeSitter
55
private import codeql.ruby.controlflow.CfgNodes
6+
private import codeql.NumberUtils
67

78
int parseInteger(Ruby::Integer i) {
89
exists(string s | s = i.getValue().toLowerCase().replaceAll("_", "") |
910
s.charAt(0) != "0" and
1011
result = s.toInt()
1112
or
12-
exists(string str, string values, int shift |
13-
s.matches("0b%") and
14-
values = "01" and
15-
str = s.suffix(2) and
16-
shift = 1
17-
or
18-
s.matches("0x%") and
19-
values = "0123456789abcdef" and
20-
str = s.suffix(2) and
21-
shift = 4
22-
or
23-
s.charAt(0) = "0" and
24-
not s.charAt(1) = ["b", "x", "o"] and
25-
values = "01234567" and
26-
str = s.suffix(1) and
27-
shift = 3
28-
or
29-
s.matches("0o%") and
30-
values = "01234567" and
31-
str = s.suffix(2) and
32-
shift = 3
33-
|
34-
result =
35-
sum(int index, string c, int v, int exp |
36-
c = str.charAt(index) and
37-
v = values.indexOf(c.toLowerCase()) and
38-
exp = str.length() - index - 1
39-
|
40-
v.bitShiftLeft((str.length() - index - 1) * shift)
41-
)
42-
)
13+
s.matches("0b%") and result = parseBinaryInt(s.suffix(2))
14+
or
15+
s.matches("0x%") and result = parseHexInt(s.suffix(2))
16+
or
17+
s.charAt(0) = "0" and
18+
not s.charAt(1) = ["b", "x", "o"] and
19+
result = parseOctalInt(s.suffix(1))
20+
or
21+
s.matches("0o%") and
22+
result = parseOctalInt(s.suffix(2))
4323
)
4424
}
4525

@@ -148,16 +128,85 @@ private class RequiredFileLiteralConstantValue extends RequiredConstantValue {
148128

149129
private class RequiredStringTextComponentConstantValue extends RequiredConstantValue {
150130
override predicate requiredString(string s) {
151-
s = any(Ruby::Token t | exists(TStringTextComponentNonRegexp(t))).getValue()
131+
s =
132+
unescapeTextComponent(any(Ruby::Token t | exists(TStringTextComponentNonRegexp(t))).getValue())
152133
}
153134
}
154135

155136
private class RequiredStringEscapeSequenceComponentConstantValue extends RequiredConstantValue {
156137
override predicate requiredString(string s) {
157-
s = any(Ruby::Token t | exists(TStringEscapeSequenceComponentNonRegexp(t))).getValue()
138+
s =
139+
unescapeEscapeSequence(any(Ruby::Token t | exists(TStringEscapeSequenceComponentNonRegexp(t)))
140+
.getValue())
158141
}
159142
}
160143

144+
/**
 * Gets the string that the escape sequence `escaped` stands for. Recognised
 * sequences are translated; for anything else the result is `escaped` with
 * its first character (the backslash) removed. For example:
 *
 * ```
 * \\     => \
 * \141   => a
 * \u0078 => x
 * ```
 */
bindingset[escaped]
string unescapeEscapeSequence(string escaped) {
  if exists(unescapeKnownEscapeSequence(escaped))
  then result = unescapeKnownEscapeSequence(escaped)
  else
    // Any other character following a backslash is just that character.
    result = escaped.suffix(1)
}
161+
162+
/**
 * Gets the string that `escaped` stands for, where `escaped` is one of the
 * escape sequences this predicate recognises (including the leading
 * backslash). There is no result for any other input.
 */
bindingset[escaped]
private string unescapeKnownEscapeSequence(string escaped) {
  // Escapes whose replacement is written directly as a string literal.
  escaped = "\\\\" and result = "\\"
  or
  escaped = "\\'" and result = "'"
  or
  escaped = "\\\"" and result = "\""
  or
  escaped = "\\t" and result = "\t"
  or
  escaped = "\\n" and result = "\n"
  or
  escaped = "\\r" and result = "\r"
  or
  escaped = "\\s" and result = " "
  or
  // Escapes whose replacement is obtained from a code point.
  exists(int codepoint | result = codepoint.toUnicode() |
    escaped = "\\a" and codepoint = 7
    or
    escaped = "\\b" and codepoint = 8
    or
    escaped = "\\v" and codepoint = 11
    or
    escaped = "\\f" and codepoint = 12
    or
    escaped = "\\e" and codepoint = 27
    or
    escaped = ["\\c?", "\\C-?"] and codepoint = 127
    or
    // One to three octal digits, e.g. `\141`.
    codepoint = parseOctalInt(escaped.regexpCapture("\\\\([0-7]{1,3})", 1))
    or
    // `\x` followed by one or two hex digits.
    codepoint = parseHexInt(escaped.regexpCapture("\\\\x([0-9a-fA-F]{1,2})", 1))
    or
    // `\u` followed by exactly four hex digits, e.g. `\u0078`.
    codepoint = parseHexInt(escaped.regexpCapture("\\\\u([0-9a-fA-F]{4})", 1))
    or
    // `\u{...}` enclosing one to six hex digits.
    codepoint = parseHexInt(escaped.regexpCapture("\\\\u\\{([0-9a-fA-F]{1,6})\\}", 1))
  )
}
198+
199+
/**
 * Gets `text` with every `\\` turned into `\` and every `\'` turned into `'`.
 * All other characters, including other backslash sequences, are kept as they
 * are.
 *
 * ```rb
 * 'foo\\bar \'baz\'' # foo\bar 'baz'
 * ```
 */
bindingset[text]
string unescapeTextComponent(string text) {
  // `$1` re-inserts the captured character, which drops the backslash before it.
  result = text.regexpReplaceAll("\\\\(['\\\\])", "$1")
}
209+
161210
class TRegExpComponent =
162211
TStringTextComponentRegexp or TStringEscapeSequenceComponentRegexp or
163212
TStringInterpolationComponentRegexp;

ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll

Lines changed: 2 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
private import codeql.ruby.ast.Literal as AST
22
private import ParseRegExp
3+
private import codeql.NumberUtils
34
import codeql.Locations
45
private import codeql.ruby.DataFlow
56

@@ -423,48 +424,15 @@ class RegExpEscape extends RegExpNormalChar {
423424
* E.g. for `\u0061` this returns "a".
424425
*/
425426
private string getUnicode() {
426-
exists(int codepoint | codepoint = sum(this.getHexValueFromUnicode(_)) |
427-
result = codepoint.toUnicode()
428-
)
429-
}
430-
431-
/**
432-
* Gets int value for the `index`th char in the hex number of the unicode escape.
433-
* E.g. for `\u0061` and `index = 2` this returns 96 (the number `6` interpreted as hex).
434-
*/
435-
private int getHexValueFromUnicode(int index) {
436427
this.isUnicode() and
437-
exists(string hex, string char | hex = this.getText().suffix(2) |
438-
char = hex.charAt(index) and
439-
result = 16.pow(hex.length() - index - 1) * toHex(char)
440-
)
428+
result = parseHexInt(this.getText().suffix(2)).toUnicode()
441429
}
442430

443431
string getUnescaped() { result = this.getText().suffix(1) }
444432

445433
override string getAPrimaryQlClass() { result = "RegExpEscape" }
446434
}
447435

448-
/**
449-
* Gets the hex number for the `hex` char.
450-
*/
451-
private int toHex(string hex) {
452-
hex = [0 .. 9].toString() and
453-
result = hex.toInt()
454-
or
455-
result = 10 and hex = ["a", "A"]
456-
or
457-
result = 11 and hex = ["b", "B"]
458-
or
459-
result = 12 and hex = ["c", "C"]
460-
or
461-
result = 13 and hex = ["d", "D"]
462-
or
463-
result = 14 and hex = ["e", "E"]
464-
or
465-
result = 15 and hex = ["f", "F"]
466-
}
467-
468436
/**
469437
* A word boundary, that is, a regular expression term of the form `\b`.
470438
*/

0 commit comments

Comments
 (0)