|
| 1 | +/** |
| 2 | + * Classes and predicates for working with suspicious character ranges. |
| 3 | + */ |
| 4 | + |
| 5 | +// We don't need the NFA utils, just the regexp tree. |
| 6 | +// but the below is a nice shared library that exposes the API we need. |
| 7 | +import performance.ReDoSUtil |
| 8 | + |
| 9 | +/** |
| 10 | + * Gets a rank for `range` that is unique for ranges in the same file. |
| 11 | + * Prioritizes ranges that match more characters. |
| 12 | + */ |
| 13 | +int rankRange(RegExpCharacterRange range) { |
| 14 | + range = |
| 15 | + rank[result](RegExpCharacterRange r, Location l, int low, int high | |
| 16 | + r.getLocation() = l and |
| 17 | + isRange(r, low, high) |
| 18 | + | |
| 19 | + r order by (high - low) desc, l.getStartLine(), l.getStartColumn() |
| 20 | + ) |
| 21 | +} |
| 22 | + |
| 23 | +/** Holds if `range` spans from the unicode code points `low` to `high` (both inclusive). */ |
| 24 | +predicate isRange(RegExpCharacterRange range, int low, int high) { |
| 25 | + exists(string lowc, string highc | |
| 26 | + range.isRange(lowc, highc) and |
| 27 | + low.toUnicode() = lowc and |
| 28 | + high.toUnicode() = highc |
| 29 | + ) |
| 30 | +} |
| 31 | + |
| 32 | +/** Holds if `char` is an alpha-numeric character. */ |
| 33 | +predicate isAlphanumeric(string char) { |
| 34 | + // written like this to avoid having a bindingset for the predicate |
| 35 | + char = [[48 .. 57], [65 .. 90], [97 .. 122]].toUnicode() // 0-9, A-Z, a-z |
| 36 | +} |
| 37 | + |
| 38 | +/** |
| 39 | + * Holds if the given ranges are from the same character class |
| 40 | + * and there exists at least one character matched by both ranges. |
| 41 | + */ |
| 42 | +predicate overlap(RegExpCharacterRange a, RegExpCharacterRange b) { |
| 43 | + exists(RegExpCharacterClass clz | |
| 44 | + a = clz.getAChild() and |
| 45 | + b = clz.getAChild() and |
| 46 | + a != b |
| 47 | + | |
| 48 | + exists(int alow, int ahigh, int blow, int bhigh | |
| 49 | + isRange(a, alow, ahigh) and |
| 50 | + isRange(b, blow, bhigh) and |
| 51 | + alow <= bhigh and |
| 52 | + blow <= ahigh |
| 53 | + ) |
| 54 | + ) |
| 55 | +} |
| 56 | + |
| 57 | +/** |
| 58 | + * Holds if `range` overlaps with the char class `escape` from the same character class. |
| 59 | + */ |
| 60 | +predicate overlapsWithCharEscape(RegExpCharacterRange range, RegExpCharacterClassEscape escape) { |
| 61 | + exists(RegExpCharacterClass clz, string low, string high | |
| 62 | + range = clz.getAChild() and |
| 63 | + escape = clz.getAChild() and |
| 64 | + range.isRange(low, high) |
| 65 | + | |
| 66 | + escape.getValue() = "w" and |
| 67 | + getInRange(low, high).regexpMatch("\\w") |
| 68 | + or |
| 69 | + escape.getValue() = "d" and |
| 70 | + getInRange(low, high).regexpMatch("\\d") |
| 71 | + or |
| 72 | + escape.getValue() = "s" and |
| 73 | + getInRange(low, high).regexpMatch("\\s") |
| 74 | + ) |
| 75 | +} |
| 76 | + |
| 77 | +/** Gets the unicode code point for a `char`. */ |
| 78 | +bindingset[char] |
| 79 | +int toCodePoint(string char) { result.toUnicode() = char } |
| 80 | + |
| 81 | +/** A character range that appears to be overly wide. */ |
| 82 | +class OverlyWideRange extends RegExpCharacterRange { |
| 83 | + OverlyWideRange() { |
| 84 | + exists(int low, int high, int numChars | |
| 85 | + isRange(this, low, high) and |
| 86 | + numChars = (1 + high - low) and |
| 87 | + this.getRootTerm().isUsedAsRegExp() and |
| 88 | + numChars >= 10 |
| 89 | + | |
| 90 | + // across the Z-a range (which includes backticks) |
| 91 | + toCodePoint("Z") >= low and |
| 92 | + toCodePoint("a") <= high |
| 93 | + or |
| 94 | + // across the 9-A range (which includes e.g. ; and ?) |
| 95 | + toCodePoint("9") >= low and |
| 96 | + toCodePoint("A") <= high |
| 97 | + or |
| 98 | + // a non-alphanumeric char as part of the range boundaries |
| 99 | + exists(int bound | bound = [low, high] | not isAlphanumeric(bound.toUnicode())) |
| 100 | + ) and |
| 101 | + // allowlist for known ranges |
| 102 | + not this = allowedWideRanges() |
| 103 | + } |
| 104 | + |
| 105 | + /** Gets a string representation of a character class that matches the same chars as this range. */ |
| 106 | + string printEquivalent() { result = RangePrinter::printEquivalentCharClass(this) } |
| 107 | +} |
| 108 | + |
| 109 | +/** Gets a range that should not be reported as an overly wide range. */ |
| 110 | +RegExpCharacterRange allowedWideRanges() { |
| 111 | + // ~ is the last printable ASCII character, it's used right in various wide ranges. |
| 112 | + result.isRange(_, "~") |
| 113 | + or |
| 114 | + // the same with " " and "!". " " is the first printable character, and "!" is the first non-white-space printable character. |
| 115 | + result.isRange([" ", "!"], _) |
| 116 | + or |
| 117 | + // the `[@-_]` range is intentional |
| 118 | + result.isRange("@", "_") |
| 119 | + or |
| 120 | + // starting from the zero byte is a good indication that it's purposely matching a large range. |
| 121 | + result.isRange(0.toUnicode(), _) |
| 122 | +} |
| 123 | + |
| 124 | +/** Gets a char between (and including) `low` and `high`. */ |
| 125 | +bindingset[low, high] |
| 126 | +private string getInRange(string low, string high) { |
| 127 | + result = [toCodePoint(low) .. toCodePoint(high)].toUnicode() |
| 128 | +} |
| 129 | + |
| 130 | +/** A module computing an equivalent character class for an overly wide range. */ |
| 131 | +module RangePrinter { |
| 132 | + bindingset[char] |
| 133 | + bindingset[result] |
| 134 | + private string next(string char) { |
| 135 | + exists(int prev, int next | |
| 136 | + prev.toUnicode() = char and |
| 137 | + next.toUnicode() = result and |
| 138 | + next = prev + 1 |
| 139 | + ) |
| 140 | + } |
| 141 | + |
| 142 | + /** Gets the points where the parts of the pretty printed range should be cut off. */ |
| 143 | + private string cutoffs() { result = ["A", "Z", "a", "z", "0", "9"] } |
| 144 | + |
| 145 | + /** Gets the char to use in the low end of a range for a given `cut` */ |
| 146 | + private string lowCut(string cut) { |
| 147 | + cut = ["A", "a", "0"] and |
| 148 | + result = cut |
| 149 | + or |
| 150 | + cut = ["Z", "z", "9"] and |
| 151 | + result = next(cut) |
| 152 | + } |
| 153 | + |
| 154 | + /** Gets the char to use in the high end of a range for a given `cut` */ |
| 155 | + private string highCut(string cut) { |
| 156 | + cut = ["Z", "z", "9"] and |
| 157 | + result = cut |
| 158 | + or |
| 159 | + cut = ["A", "a", "0"] and |
| 160 | + next(result) = cut |
| 161 | + } |
| 162 | + |
| 163 | + /** Gets the cutoff char used for a given `part` of a range when pretty-printing it. */ |
| 164 | + private string cutoff(OverlyWideRange range, int part) { |
| 165 | + exists(int low, int high | isRange(range, low, high) | |
| 166 | + result = |
| 167 | + rank[part + 1](string cut | |
| 168 | + cut = cutoffs() and low < toCodePoint(cut) and toCodePoint(cut) < high |
| 169 | + | |
| 170 | + cut order by toCodePoint(cut) |
| 171 | + ) |
| 172 | + ) |
| 173 | + } |
| 174 | + |
| 175 | + /** Gets the number of parts we should print for a given `range`. */ |
| 176 | + private int parts(OverlyWideRange range) { result = 1 + strictcount(cutoff(range, _)) } |
| 177 | + |
| 178 | + /** Holds if the given part of a range should span from `low` to `high`. */ |
| 179 | + private predicate part(OverlyWideRange range, int part, string low, string high) { |
| 180 | + // first part. |
| 181 | + part = 0 and |
| 182 | + ( |
| 183 | + range.isRange(low, high) and |
| 184 | + parts(range) = 1 |
| 185 | + or |
| 186 | + parts(range) >= 2 and |
| 187 | + range.isRange(low, _) and |
| 188 | + high = highCut(cutoff(range, part)) |
| 189 | + ) |
| 190 | + or |
| 191 | + // middle |
| 192 | + part >= 1 and |
| 193 | + part < parts(range) - 1 and |
| 194 | + low = lowCut(cutoff(range, part - 1)) and |
| 195 | + high = highCut(cutoff(range, part)) |
| 196 | + or |
| 197 | + // last. |
| 198 | + part = parts(range) - 1 and |
| 199 | + low = lowCut(cutoff(range, part - 1)) and |
| 200 | + range.isRange(_, high) |
| 201 | + } |
| 202 | + |
| 203 | + /** Gets an escaped `char` for use in a character class. */ |
| 204 | + bindingset[char] |
| 205 | + private string escape(string char) { |
| 206 | + exists(string reg | reg = "(\\[|\\]|\\\\|-|/)" | |
| 207 | + if char.regexpMatch(reg) then result = "\\" + char else result = char |
| 208 | + ) |
| 209 | + } |
| 210 | + |
| 211 | + /** Gets a part of the equivalent range. */ |
| 212 | + private string printEquivalentCharClass(OverlyWideRange range, int part) { |
| 213 | + exists(string low, string high | part(range, part, low, high) | |
| 214 | + if |
| 215 | + isAlphanumeric(low) and |
| 216 | + isAlphanumeric(high) |
| 217 | + then result = low + "-" + high |
| 218 | + else |
| 219 | + result = |
| 220 | + strictconcat(string char | char = getInRange(low, high) | escape(char) order by char) |
| 221 | + ) |
| 222 | + } |
| 223 | + |
| 224 | + /** Gets the entire pretty printed equivalent range. */ |
| 225 | + string printEquivalentCharClass(OverlyWideRange range) { |
| 226 | + result = |
| 227 | + strictconcat(string r, int part | |
| 228 | + r = "[" and part = -1 and exists(range) |
| 229 | + or |
| 230 | + r = printEquivalentCharClass(range, part) |
| 231 | + or |
| 232 | + r = "]" and part = parts(range) |
| 233 | + | |
| 234 | + r order by part |
| 235 | + ) |
| 236 | + } |
| 237 | +} |
| 238 | + |
| 239 | +/** Gets a char range that is overly large because of `reason`. */ |
| 240 | +RegExpCharacterRange getABadRange(string reason, int priority) { |
| 241 | + priority = 0 and |
| 242 | + reason = "is equivalent to " + result.(OverlyWideRange).printEquivalent() |
| 243 | + or |
| 244 | + priority = 1 and |
| 245 | + exists(RegExpCharacterRange other | |
| 246 | + reason = "overlaps with " + other + " in the same character class" and |
| 247 | + rankRange(result) < rankRange(other) and |
| 248 | + overlap(result, other) |
| 249 | + ) |
| 250 | + or |
| 251 | + priority = 2 and |
| 252 | + exists(RegExpCharacterClassEscape escape | |
| 253 | + reason = "overlaps with " + escape + " in the same character class" and |
| 254 | + overlapsWithCharEscape(result, escape) |
| 255 | + ) |
| 256 | + or |
| 257 | + reason = "is empty" and |
| 258 | + priority = 3 and |
| 259 | + exists(int low, int high | |
| 260 | + isRange(result, low, high) and |
| 261 | + low > high |
| 262 | + ) |
| 263 | +} |
| 264 | + |
| 265 | +/** Holds if `range` matches suspiciously many characters. */ |
| 266 | +predicate problem(RegExpCharacterRange range, string reason) { |
| 267 | + reason = |
| 268 | + strictconcat(string m, int priority | |
| 269 | + range = getABadRange(m, priority) |
| 270 | + | |
| 271 | + m, ", and " order by priority desc |
| 272 | + ) and |
| 273 | + // specifying a range using an escape is usually OK. |
| 274 | + not range.getAChild() instanceof RegExpEscape and |
| 275 | + // Unicode escapes in strings are interpreted before it turns into a regexp, |
| 276 | + // so e.g. [\u0001-\uFFFF] will just turn up as a range between two constants. |
| 277 | + // We therefore exclude these ranges. |
| 278 | + range.getRootTerm().getParent() instanceof RegExpLiteral and |
| 279 | + // is used as regexp (mostly for JS where regular expressions are parsed eagerly) |
| 280 | + range.getRootTerm().isUsedAsRegExp() |
| 281 | +} |
0 commit comments