github
diff --git a/‎ruby/ql/lib/change-notes/2022-03-16-string-escape-sequences.md
Lines changed: 5 additions & 0 deletions b/‎ruby/ql/lib/change-notes/2022-03-16-string-escape-sequences.md
Lines changed: 5 additions & 0 deletions
diff --git a/‎ruby/ql/lib/codeql/NumberUtils.qll
Lines changed: 129 additions & 0 deletions b/‎ruby/ql/lib/codeql/NumberUtils.qll
Lines changed: 129 additions & 0 deletions
diff --git a/‎ruby/ql/lib/codeql/ruby/ast/Literal.qll
Lines changed: 14 additions & 4 deletions b/‎ruby/ql/lib/codeql/ruby/ast/Literal.qll
Lines changed: 14 additions & 4 deletions
diff --git a/‎ruby/ql/lib/codeql/ruby/ast/internal/Literal.qll
Lines changed: 82 additions & 33 deletions b/‎ruby/ql/lib/codeql/ruby/ast/internal/Literal.qll
Lines changed: 82 additions & 33 deletions
diff --git a/‎ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll
Lines changed: 2 additions & 34 deletions b/‎ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll
Lines changed: 2 additions & 34 deletions
@@ -0,0 +1,5 @@
+---
+category: minorAnalysis
+---
+* `getConstantValue()` now returns the contents of strings and symbols after escape sequences have been interpreted. For example, for the Ruby string literal `"\n"`, `getConstantValue().getString()` previously returned a QL string with two characters, a backslash followed by `n`; now it returns the single-character string "\n" (U+000A, known as newline).
+* `getConstantValue().getInt()` previously returned incorrect values for integers larger than 2<sup>31</sup>-1 (the largest value that can be represented by the QL `int` type). It now returns no result in those cases.
@@ -0,0 +1,129 @@
+/**
+ * Provides predicates for working with numeric values and their string
+ * representations.
+ */
+
+/**
+ * Gets the value of the binary numeral `binary` as an `int`. `binary` is
+ * expected to consist solely of the digits 0 and 1; leading zeros are allowed.
+ * There is no result for values above
+ * 01111111111111111111111111111111 (2^31-1, the largest value an `int` can
+ * hold).
+ *
+ * ```
+ * "0"       => 0
+ * "01"      => 1
+ * "1010101" => 85
+ * ```
+ */
+bindingset[binary]
+int parseBinaryInt(string binary) {
+  exists(string digits, int len |
+    digits = stripLeadingZeros(binary) and
+    len = digits.length()
+  |
+    // At most 31 significant bits fit in a non-negative `int`.
+    len <= 31 and
+    result =
+      sum(int pos, string ch, int bit |
+        ch = digits.charAt(pos) and
+        bit = "01".indexOf(ch)
+      |
+        // Weight each bit by its place value, most significant digit first.
+        twoToThe(len - 1 - pos) * bit
+      ) and
+    result >= 0
+  )
+}
+
+/**
+ * Gets the value of the hexadecimal numeral `hex` as an `int`. `hex` is
+ * expected to be a valid hexadecimal string (upper- or lower-case digits);
+ * leading zeros are allowed. There is no result for values above 7FFFFFFF
+ * (2^31-1, the largest value an `int` can hold).
+ *
+ * ```
+ * "0"    => 0
+ * "FF"   => 255
+ * "f00d" => 61453
+ * ```
+ */
+bindingset[hex]
+int parseHexInt(string hex) {
+  exists(string digits, int len |
+    digits = stripLeadingZeros(hex) and
+    len = digits.length()
+  |
+    // Eight hex digits cover 32 bits; the sign check below rejects the upper half.
+    len <= 8 and
+    result =
+      sum(int pos, string ch |
+        ch = digits.charAt(pos)
+      |
+        // Weight each digit by its place value, most significant digit first.
+        sixteenToThe(len - 1 - pos) * toHex(ch)
+      ) and
+    result >= 0
+  )
+}
+
+/**
+ * Gets the integer value of `octal` when interpreted as octal. `octal` must be
+ * a valid octal string containing only the digits 0-7. For values greater than
+ * 17777777777 (2^31-1, the maximum value that `int` can represent), there is no
+ * result.
+ *
+ * ```
+ * "0"           => 0
+ * "77"          => 63
+ * "76543210"    => 16434824
+ * "17777777777" => 2147483647
+ * "40000000000" => no result (2^32 does not fit in an `int`)
+ * ```
+ */
+bindingset[octal]
+int parseOctalInt(string octal) {
+  exists(string stripped | stripped = stripLeadingZeros(octal) |
+    stripped.length() <= 11 and
+    // Eleven octal digits span 33 bits, so an 11-digit value only fits in an
+    // `int` when its leading digit is 1. A leading 4 or 5 would wrap around in
+    // 32-bit arithmetic to a non-negative number (4 * 8^10 = 2^32 becomes 0),
+    // which the sign check below cannot detect, so reject such inputs up front.
+    not (stripped.length() = 11 and stripped.charAt(0) = ["2", "3", "4", "5", "6", "7"]) and
+    // Kept as a second line of defence against wrap-around.
+    result >= 0 and
+    result =
+      sum(int index, string c, int digit |
+        c = stripped.charAt(index) and
+        digit = "01234567".indexOf(c)
+      |
+        // Weight each digit by its place value, most significant digit first.
+        eightToThe(stripped.length() - 1 - index) * digit
+      )
+  )
+}
+
+/**
+ * Gets the numeric value (0 to 15) of the single hexadecimal digit `hex`.
+ * The letters `a` to `f` are accepted in either case.
+ */
+private int toHex(string hex) {
+  exists(string lower |
+    result in [0 .. 15] and
+    // The position of a digit in this table is its value.
+    lower = "0123456789abcdef".charAt(result)
+  |
+    hex = [lower, lower.toUpperCase()]
+  )
+}
+
+/**
+ * Gets 16 raised to the power `n`. Only defined for `n` between 0 and 7
+ * (inclusive).
+ */
+int sixteenToThe(int n) {
+  // The range stops at 7 because 16**7 is the largest power of 16 that fits in an int.
+  exists(int shift | n = [0 .. 7] and shift = 4 * n |
+    // Each factor of 16 corresponds to a left shift by four bits.
+    result = 1.bitShiftLeft(shift)
+  )
+}
+
+/**
+ * Gets 8 raised to the power `n`. Only defined for `n` between 0 and 10
+ * (inclusive).
+ */
+int eightToThe(int n) {
+  // The range stops at 10 because 8**10 is the largest power of 8 that fits in an int.
+  exists(int shift | n = [0 .. 10] and shift = 3 * n |
+    // Each factor of 8 corresponds to a left shift by three bits.
+    result = 1.bitShiftLeft(shift)
+  )
+}
+
+/**
+ * Gets 2 raised to the power `n`. Only defined for `n` between 0 and 30
+ * (inclusive).
+ */
+int twoToThe(int n) {
+  n = [0 .. 30] and
+  // Shifting 1 left by `n` bits yields 2**n.
+  result = 1.bitShiftLeft(n)
+}
+
+/**
+ * Gets the remainder of `s` once every "0" character at its start has been
+ * dropped. A string made up only of zeros yields the empty string.
+ */
+bindingset[s]
+private string stripLeadingZeros(string s) {
+  exists(string pattern |
+    // Group 1 captures whatever follows the (greedy) run of leading zeros.
+    pattern = "0*(.*)"
+  |
+    result = s.regexpCapture(pattern, 1)
+  )
+}
@@ -230,13 +230,18 @@ class StringTextComponent extends StringComponent, TStringTextComponentNonRegexp
 
   StringTextComponent() { this = TStringTextComponentNonRegexp(g) }
 
-  final override string toString() { result = g.getValue() }
+  final override string toString() { result = this.getRawText() }
 
   final override ConstantValue::ConstantStringValue getConstantValue() {
-    result.isString(g.getValue())
+    result.isString(this.getUnescapedText())
   }
 
   final override string getAPrimaryQlClass() { result = "StringTextComponent" }
+
+  /** Gets the text of this component as it appears in the source code. */
+  final string getRawText() { result = g.getValue() }
+
+  /**
+   * Gets the raw source text of this component after it has been passed
+   * through `unescapeTextComponent`.
+   */
+  final private string getUnescapedText() { result = unescapeTextComponent(this.getRawText()) }
 }
 
 /**
@@ -247,13 +252,18 @@ class StringEscapeSequenceComponent extends StringComponent, TStringEscapeSequen
 
   StringEscapeSequenceComponent() { this = TStringEscapeSequenceComponentNonRegexp(g) }
 
-  final override string toString() { result = g.getValue() }
+  final override string toString() { result = this.getRawText() }
 
   final override ConstantValue::ConstantStringValue getConstantValue() {
-    result.isString(g.getValue())
+    result.isString(this.getUnescapedText())
   }
 
   final override string getAPrimaryQlClass() { result = "StringEscapeSequenceComponent" }
+
+  /** Gets the text of this component as it appears in the source code. */
+  final string getRawText() { result = g.getValue() }
+
+  /**
+   * Gets the raw source text of this escape sequence after it has been passed
+   * through `unescapeEscapeSequence`.
+   */
+  final private string getUnescapedText() { result = unescapeEscapeSequence(this.getRawText()) }
 }
 
 /**
 
@@ -3,43 +3,23 @@ private import AST
 private import Constant
 private import TreeSitter
 private import codeql.ruby.controlflow.CfgNodes
+private import codeql.NumberUtils
 
+/**
+ * Gets the `int` value of the integer literal token `i`.
+ *
+ * The token text is lower-cased and its `_` separators are removed before
+ * parsing. Text that does not start with `0` is parsed as decimal; the
+ * prefixes `0b` and `0x` select binary and hexadecimal, and both `0o` and a
+ * bare leading `0` select octal.
+ *
+ * NOTE(review): text starting with `0d` is not matched by any prefix case and
+ * therefore reaches the bare-leading-`0` octal branch - confirm whether an
+ * explicit decimal prefix needs its own case.
+ */
 int parseInteger(Ruby::Integer i) {
   exists(string s | s = i.getValue().toLowerCase().replaceAll("_", "") |
     s.charAt(0) != "0" and
     result = s.toInt()
     or
-    exists(string str, string values, int shift |
-      s.matches("0b%") and
-      values = "01" and
-      str = s.suffix(2) and
-      shift = 1
-      or
-      s.matches("0x%") and
-      values = "0123456789abcdef" and
-      str = s.suffix(2) and
-      shift = 4
-      or
-      s.charAt(0) = "0" and
-      not s.charAt(1) = ["b", "x", "o"] and
-      values = "01234567" and
-      str = s.suffix(1) and
-      shift = 3
-      or
-      s.matches("0o%") and
-      values = "01234567" and
-      str = s.suffix(2) and
-      shift = 3
-    |
-      result =
-        sum(int index, string c, int v, int exp |
-          c = str.charAt(index) and
-          v = values.indexOf(c.toLowerCase()) and
-          exp = str.length() - index - 1
-        |
-          v.bitShiftLeft((str.length() - index - 1) * shift)
-        )
-    )
+    s.matches("0b%") and result = parseBinaryInt(s.suffix(2))
+    or
+    s.matches("0x%") and result = parseHexInt(s.suffix(2))
+    or
+    s.charAt(0) = "0" and
+    not s.charAt(1) = ["b", "x", "o"] and
+    result = parseOctalInt(s.suffix(1))
+    or
+    s.matches("0o%") and
+    result = parseOctalInt(s.suffix(2))
   )
 }
 
@@ -148,16 +128,85 @@ private class RequiredFileLiteralConstantValue extends RequiredConstantValue {
 
+/**
+ * Marks as a required string the value of each token that forms a non-regexp
+ * string text component, after `unescapeTextComponent` has been applied to it.
+ */
 private class RequiredStringTextComponentConstantValue extends RequiredConstantValue {
   override predicate requiredString(string s) {
-    s = any(Ruby::Token t | exists(TStringTextComponentNonRegexp(t))).getValue()
+    s =
+      unescapeTextComponent(any(Ruby::Token t | exists(TStringTextComponentNonRegexp(t))).getValue())
   }
 }
 
+/**
+ * Marks as a required string the value of each token that forms a non-regexp
+ * string escape sequence component, after `unescapeEscapeSequence` has been
+ * applied to it.
+ */
 private class RequiredStringEscapeSequenceComponentConstantValue extends RequiredConstantValue {
   override predicate requiredString(string s) {
-    s = any(Ruby::Token t | exists(TStringEscapeSequenceComponentNonRegexp(t))).getValue()
+    s =
+      unescapeEscapeSequence(any(Ruby::Token t | exists(TStringEscapeSequenceComponentNonRegexp(t)))
+            .getValue())
   }
 }
 
+/**
+ * Gets the string that the escape sequence `escaped` stands for. Sequences
+ * with a dedicated meaning are translated; for any other sequence the result
+ * is simply the text after the leading backslash. For example:
+ *
+ * ```
+ * \\     => \
+ * \141   => a
+ * \u0078 => x
+ * ```
+ */
+bindingset[escaped]
+string unescapeEscapeSequence(string escaped) {
+  if exists(unescapeKnownEscapeSequence(escaped))
+  then result = unescapeKnownEscapeSequence(escaped)
+  else
+    // No dedicated meaning: drop the backslash and keep what follows.
+    result = escaped.suffix(1)
+}
+
+/**
+ * Gets the string that `escaped` stands for, if `escaped` is one of the escape
+ * sequences handled here: a fixed single-character escape such as `\n`, the
+ * forms `\c?` and `\C-?`, an octal escape of one to three digits, a `\x`
+ * escape of one or two hex digits, or a `\u` escape in either the four-digit
+ * or the braced form. There is no result for any other input.
+ */
+bindingset[escaped]
+private string unescapeKnownEscapeSequence(string escaped) {
+  exists(int codePoint | result = codePoint.toUnicode() |
+    // Escapes that always denote one particular character.
+    escaped = "\\\\" and codePoint = 92
+    or
+    escaped = "\\'" and codePoint = 39
+    or
+    escaped = "\\\"" and codePoint = 34
+    or
+    escaped = "\\a" and codePoint = 7
+    or
+    escaped = "\\b" and codePoint = 8
+    or
+    escaped = "\\t" and codePoint = 9
+    or
+    escaped = "\\n" and codePoint = 10
+    or
+    escaped = "\\v" and codePoint = 11
+    or
+    escaped = "\\f" and codePoint = 12
+    or
+    escaped = "\\r" and codePoint = 13
+    or
+    escaped = "\\e" and codePoint = 27
+    or
+    escaped = "\\s" and codePoint = 32
+    or
+    escaped = ["\\c?", "\\C-?"] and codePoint = 127
+    or
+    // Octal escape: the digits give the code point directly.
+    codePoint = parseOctalInt(escaped.regexpCapture("\\\\([0-7]{1,3})", 1))
+    or
+    // Hexadecimal escapes: `\xH[H]`, `\uHHHH` and `\u{H...}`.
+    codePoint =
+      parseHexInt(escaped
+            .regexpCapture([
+                "\\\\x([0-9a-fA-F]{1,2})", "\\\\u([0-9a-fA-F]{4})",
+                "\\\\u\\{([0-9a-fA-F]{1,6})\\}"
+              ], 1))
+  )
+}
+
+/**
+ * Gets `text` with every occurrence of `\\` turned into `\` and every
+ * occurrence of `\'` turned into `'`. All other characters are kept as they
+ * are.
+ *
+ * ```rb
+ * 'foo\\bar \'baz\'' # foo\bar 'baz'
+ * ```
+ */
+bindingset[text]
+string unescapeTextComponent(string text) {
+  exists(string escapedChar |
+    // A backslash followed by a quote or another backslash; group 1 is the escaped character.
+    escapedChar = "\\\\(['\\\\])"
+  |
+    result = text.regexpReplaceAll(escapedChar, "$1")
+  )
+}
+
 class TRegExpComponent =
   TStringTextComponentRegexp or TStringEscapeSequenceComponentRegexp or
       TStringInterpolationComponentRegexp;
 
@@ -1,5 +1,6 @@
 private import codeql.ruby.ast.Literal as AST
 private import ParseRegExp
+private import codeql.NumberUtils
 import codeql.Locations
 private import codeql.ruby.DataFlow
 
@@ -423,48 +424,15 @@ class RegExpEscape extends RegExpNormalChar {
    * E.g. for `\u0061` this returns "a".
    */
   private string getUnicode() {
-    exists(int codepoint | codepoint = sum(this.getHexValueFromUnicode(_)) |
-      result = codepoint.toUnicode()
-    )
-  }
-
-  /**
-   * Gets int value for the `index`th char in the hex number of the unicode escape.
-   * E.g. for `\u0061` and `index = 2` this returns 96 (the number `6` interpreted as hex).
-   */
-  private int getHexValueFromUnicode(int index) {
     this.isUnicode() and
-    exists(string hex, string char | hex = this.getText().suffix(2) |
-      char = hex.charAt(index) and
-      result = 16.pow(hex.length() - index - 1) * toHex(char)
-    )
+    result = parseHexInt(this.getText().suffix(2)).toUnicode()
   }
 
   string getUnescaped() { result = this.getText().suffix(1) }
 
   override string getAPrimaryQlClass() { result = "RegExpEscape" }
 }
 
-/**
- * Gets the hex number for the `hex` char.
- */
-private int toHex(string hex) {
-  hex = [0 .. 9].toString() and
-  result = hex.toInt()
-  or
-  result = 10 and hex = ["a", "A"]
-  or
-  result = 11 and hex = ["b", "B"]
-  or
-  result = 12 and hex = ["c", "C"]
-  or
-  result = 13 and hex = ["d", "D"]
-  or
-  result = 14 and hex = ["e", "E"]
-  or
-  result = 15 and hex = ["f", "F"]
-}
-
 /**
  * A word boundary, that is, a regular expression term of the form `\b`.
  */