|
| 1 | +/** |
| 2 | + * Provides predicates for reasoning about regular expressions |
| 3 | + * that match URLs and hostname patterns. |
| 4 | + */ |
| 5 | + |
| 6 | +private import HostnameRegexpSpecific |
| 7 | + |
/**
 * Holds if the value of `term` contains something that is unlikely to appear in the
 * origin part of a URL.
 */
predicate isConstantInvalidInsideOrigin(RegExpConstant term) {
  exists(string value | value = term.getValue() |
    // The pattern flags a value containing any of:
    // - a character outside the set `a-z A-Z 0-9 . : / -`
    // - two dashes in a row
    // - a colon followed by something other than a digit or a slash
    //   (so neither a port nor a scheme separator)
    // - a slash that is neither at the start nor preceded by `/` or `:`
    //   (so not part of a scheme separator)
    value.regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(?<![/:]|^)/).*")
  )
}
| 19 | + |
/** Holds if `term` is a literal dot, written either as `\.` or as the character class `[.]`. */
predicate isDotConstant(RegExpTerm term) {
  // A non-inverted character class whose only member is the constant `.`
  term =
    any(RegExpCharacterClass dotClass |
      dotClass.getNumChild() = 1 and
      dotClass.getAChild().(RegExpConstant).getValue() = "." and
      not dotClass.isInverted()
    )
  or
  // An escaped dot
  term.(RegExpCharEscape).getValue() = "."
}
| 31 | + |
/** Holds if `term` is either a literal `.` character or the wildcard `.`. */
predicate isDotLike(RegExpTerm term) {
  isDotConstant(term)
  or
  term instanceof RegExpDot
}
| 38 | + |
/** Holds if `term` can only ever be matched starting at the beginning of the input. */
predicate matchesBeginningOfString(RegExpTerm term) {
  term.isRootTerm()
  or
  exists(RegExpTerm enclosing | matchesBeginningOfString(enclosing) |
    // Every branch of an alternation starts where the alternation starts.
    term = enclosing.(RegExpAlt).getAChild()
    or
    // The contents of a group start where the group starts.
    term = enclosing.(RegExpGroup).getAChild()
    or
    exists(RegExpSequence seq | seq = enclosing |
      // The first element of a sequence starts where the sequence starts ...
      term = seq.getChild(0)
      or
      // ... and so does the second element when the first is just a `^` anchor.
      seq.getChild(0) instanceof RegExpCaret and
      term = seq.getChild(1)
    )
  )
}
| 54 | + |
/**
 * Holds if `seq` contains a top-level domain preceded by a dot, such as `.com`,
 * excluding the case where this occurs at the very beginning of the regexp.
 *
 * `i` is bound to the index of the child constant whose value starts with the
 * top-level domain (optionally followed by a port and a path, query or fragment);
 * the child at index `i - 1` is the dot.
 */
predicate hasTopLevelDomainEnding(RegExpSequence seq, int i) {
  exists(RegExpConstant tldTerm, string tldPattern |
    tldPattern = "(?i)" + RegExpPatterns::getACommonTld() + "(:\\d+)?([/?#].*)?" and
    tldTerm = seq.getChild(i) and
    tldTerm.getValue().regexpMatch(tldPattern)
  ) and
  isDotLike(seq.getChild(i - 1)) and
  // A leading `.com` at the start of the regexp is not a hostname ending.
  not (matchesBeginningOfString(seq) and i = 1)
}
| 69 | + |
/**
 * Holds if the sequence `seq` contains a top-level domain preceded by a dot,
 * such as `.com`, at some child index.
 */
predicate hasTopLevelDomainEnding(RegExpSequence seq) {
  exists(int i | hasTopLevelDomainEnding(seq, i))
}
| 75 | + |
/**
 * Holds if `term` will always match a hostname, that is, all disjunctions contain
 * a hostname pattern that isn't inside a quantifier.
 */
predicate alwaysMatchesHostname(RegExpTerm term) {
  hasTopLevelDomainEnding(term, _)
  or
  // `localhost` is considered a hostname pattern, but has no TLD.
  // `regexpMatch` has to match the entire value, so the pattern is wrapped in `.*`
  // in order to find `localhost` as a whole word anywhere inside the constant
  // (for example in `//localhost:8080`), not only when it is the full constant.
  term.(RegExpConstant).getValue().regexpMatch(".*\\blocalhost\\b.*")
  or
  // Any other term matches a hostname if one of its children does, except
  // quantifiers (the child may be skipped) and alternations (handled below).
  not term instanceof RegExpAlt and
  not term instanceof RegExpQuantifier and
  alwaysMatchesHostname(term.getAChild())
  or
  // An alternation matches a hostname only if every alternative does.
  alwaysMatchesHostnameAlt(term)
}
| 92 | + |
/** Holds if each alternative of `alt` contains a hostname pattern. */
predicate alwaysMatchesHostnameAlt(RegExpAlt alt) {
  exists(int last | last = alt.getNumChild() - 1 | alwaysMatchesHostnameAlt(alt, last))
}
| 97 | + |
/**
 * Holds if each of the children of `alt` at indices `0` through `i` (inclusive)
 * contains a hostname pattern.
 *
 * This is used instead of `forall` to avoid materializing the set of alternatives
 * that don't contain hostnames, which is much larger.
 */
predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
  alwaysMatchesHostname(alt.getChild(i)) and
  (
    // Base case: only the first alternative needs checking.
    i = 0
    or
    // Inductive case: all earlier alternatives have already been checked.
    alwaysMatchesHostnameAlt(alt, i - 1)
  )
}
| 110 | + |
/**
 * Holds if `term` is nested, at any depth, inside a quantifier or an alternative
 * (and thus cannot be expected to correspond to a unique match), or inside
 * a lookaround assertion (which is rarely used for capture groups).
 */
predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
  exists(RegExpParent enclosing | enclosing = term.getParent() |
    // Either the parent itself is nested in such a construct ...
    isInsideChoiceOrSubPattern(enclosing)
    or
    // ... or the parent is such a construct.
    enclosing instanceof RegExpSubPattern
    or
    enclosing instanceof RegExpQuantifier
    or
    enclosing instanceof RegExpAlt
  )
}
| 127 | + |
/**
 * Holds if `group` is a capturing group that is not nested inside a quantifier,
 * alternative or lookaround assertion, and is therefore likely to be used as a
 * capture group.
 */
predicate isLikelyCaptureGroup(RegExpGroup group) {
  not isInsideChoiceOrSubPattern(group) and
  group.isCapture()
}
| 135 | + |
/**
 * Holds if two adjacent children of `seq` are both dots, where each may be a
 * wildcard `.` or a literal dot.
 *
 * At least one of these dots is not intended to be a subdomain separator,
 * so we avoid flagging the pattern in this case.
 */
predicate hasConsecutiveDots(RegExpSequence seq) {
  exists(int first, int second |
    second = first + 1 and
    isDotLike(seq.getChild(first)) and
    isDotLike(seq.getChild(second))
  )
}
| 148 | + |
/**
 * Holds if `seq` is a sequence within `regexp` (possibly `regexp` itself) that looks
 * like a hostname pattern ending in a top-level domain, and that contains an unescaped
 * `.` before the top-level domain which makes the pattern more permissive than intended.
 *
 * `msg` is bound to a description of the problem, phrased so that it can follow the
 * subject of a sentence (for example "This regular expression " + `msg`).
 */
predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
  seq = regexp.getAChild*() and
  exists(RegExpDot unescapedDot, int i, string hostname |
    hasTopLevelDomainEnding(seq, i) and
    // The wildcard dot occurs somewhere before the TLD ...
    unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and
    unescapedDot != seq.getChild(i - 1) and // ... but is not the '.' immediately before the TLD
    not hasConsecutiveDots(unescapedDot.getParent()) and
    // Nothing before the TLD rules out that this is the origin part of a URL.
    not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
    // Nothing from the TLD onwards looks like it is there to be captured.
    not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
    // The part of the hostname shown in the message: the two terms before the TLD plus the TLD term.
    hostname =
      seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
        seq.getChild(i).getRawValue()
  |
    // Case 1: the dot is quantified, as in `.*`.
    //
    // `.*\.example.com` can match `evil.com/?x=.example.com`
    //
    // This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
    // We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
    // Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
    // and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
    exists(RegExpQuantifier wildcard | wildcard = unescapedDot.getParent() |
      seq.getChild(0) instanceof RegExpCaret and
      not seq.getAChild() instanceof RegExpDollar and
      seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
      msg =
        "has an unrestricted wildcard '" + wildcard.getRawValue() + "' which may cause '" +
          hostname + "' to be matched anywhere in the URL, outside the hostname."
    )
    or
    // Case 2: a plain unescaped dot that is not directly under a quantifier.
    not unescapedDot.getParent() instanceof RegExpQuantifier and
    msg =
      "has an unescaped '.' before '" + hostname +
        "', so it might match more hosts than expected."
  )
}
| 183 | + |
/**
 * Holds if `hostSequence` is an incomplete hostname pattern inside the regular expression
 * term of some `RegExpPatternSource`.
 *
 * `message` is the full alert text. `aux` is bound to a parse of the pattern source when
 * the source has a parse other than itself, and to the pattern source itself otherwise.
 * `label` is always `"here"`.
 */
predicate incompleteHostnameRegExp(
  RegExpSequence hostSequence, string message, DataFlow::Node aux, string label
) {
  label = "here" and
  exists(RegExpPatternSource re, string msg, string kind |
    isIncompleteHostNameRegExpPattern(re.getRegExpTerm(), hostSequence, msg) and
    message = "This " + kind + " " + msg
  |
    // The source is interpreted as a regular expression somewhere other than at the source itself.
    re.getAParse() != re and
    kind = "string, which is used as a regular expression $@," and
    aux = re.getAParse()
    or
    // The source has no parse other than itself.
    not re.getAParse() != re and
    kind = "regular expression" and
    aux = re
  )
}
0 commit comments