@@ -62,7 +62,7 @@ abstract class RegexString extends StringLiteral {
62
62
63
63
/**
64
64
* Helper predicate for `quote`.
65
- * Holds if the char at `pos` is the one-based `index`th occourence of a quote delimiter (`\Q` or `\E`)
65
+ * Holds if the char at `pos` is the one-based `index`th occurrence of a quote delimiter (`\Q` or `\E`).
66
66
* Result is `true` for `\Q` and `false` for `\E`.
67
67
*/
68
68
private boolean quoteDelimiter ( int index , int pos ) {
@@ -73,7 +73,7 @@ abstract class RegexString extends StringLiteral {
73
73
/** Holds if a quoted sequence is found between `start` and `end`, whatever the bounds of its inner content. */
74
74
predicate quote ( int start , int end ) { this .quote ( start , end , _, _) }
75
75
76
- /** Holds if a quoted sequence is found between `start` and `end`, with ontent found between `inner_start` and `inner_end`. */
76
+ /** Holds if a quoted sequence is found between `start` and `end`, with content found between `inner_start` and `inner_end`. */
77
77
predicate quote ( int start , int end , int inner_start , int inner_end ) {
78
78
exists ( int index |
79
79
this .quoteDelimiter ( index , start ) = true and
@@ -98,7 +98,7 @@ abstract class RegexString extends StringLiteral {
98
98
}
99
99
100
100
/**
101
- * A control sequence, `\cx`
101
+ * Holds if there is a control sequence, `\cx`, between `start` and `end`.
102
102
* `x` may be any ASCII character, including special characters.
103
103
*/
104
104
predicate controlEscape ( int start , int end ) {
@@ -107,6 +107,65 @@ abstract class RegexString extends StringLiteral {
107
107
end = start + 3
108
108
}
109
109
110
+ pragma [ inline]
111
+ private predicate isOctal ( int index ) { this .getChar ( index ) = [ 0 .. 7 ] .toString ( ) } // holds if the character at `index` is an octal digit (`0`-`7`)
112
+
113
+ /** Holds if an escape sequence that includes braces, such as a named character (\N{degree sign}), a named class (\p{Lower} or \P{Lower}), or a hex value (\x{h..h}), is found between `start` and `end`. */
114
+ private predicate escapedBraces ( int start , int end ) {
115
+ this .escapingChar ( start ) and
116
+ this .getChar ( start + 1 ) = [ "N" , "p" , "P" , "x" ] and
117
+ this .getChar ( start + 2 ) = "{" and
118
+ end = min ( int i | start + 2 < i and this .getChar ( i - 1 ) = "}" ) // `end` is just past the first `}` that follows the opening brace
119
+ }
120
+
121
+ /**
122
+ * Holds if an escaped character is found between `start` and `end`.
123
+ * Escaped characters include hex values, octal values, named (braced) escapes, boundary matchers, control escapes and other single-character escapes,
124
+ * but exclude backreferences.
125
+ */
126
+ predicate escapedCharacter ( int start , int end ) {
127
+ this .escapingChar ( start ) and
128
+ not this .backreference ( start , _) and
129
+ (
130
+ // hex value \xhh; the braced form \x{h..h} is handled by `escapedBraces` below
131
+ this .getChar ( start + 1 ) = "x" and
132
+ this .getChar ( start + 2 ) != "{" and
133
+ end = start + 4
134
+ or
135
+ // octal value \0o, \0oo, or \0ooo. Max of 0377, so a third digit is only consumed when the first digit is 0-3.
136
+ this .getChar ( start + 1 ) = "0" and
137
+ this .isOctal ( start + 2 ) and
138
+ (
139
+ if this .isOctal ( start + 3 )
140
+ then
141
+ if this .isOctal ( start + 4 ) and this .getChar ( start + 2 ) in [ "0" , "1" , "2" , "3" ]
142
+ then end = start + 5
143
+ else end = start + 4
144
+ else end = start + 3
145
+ )
146
+ or
147
+ // 16-bit hex value \uhhhh
148
+ this .getChar ( start + 1 ) = "u" and end = start + 6
149
+ or
150
+ this .escapedBraces ( start , end ) // braced escapes: \N{..}, \p{..}, \P{..}, \x{..}
151
+ or
152
+ // Boundary matchers \b, \b{g}
153
+ this .getChar ( start + 1 ) = "b" and
154
+ (
155
+ if this .getText ( ) .substring ( start + 2 , start + 5 ) = "{g}"
156
+ then end = start + 5
157
+ else end = start + 2
158
+ )
159
+ or
160
+ this .controlEscape ( start , end ) // control sequence \cx
161
+ or
162
+ // any other two-character escape: not handled above and not followed by a digit; update the list when adding a new case
163
+ not this .getChar ( start + 1 ) in [ "x" , "0" , "u" , "p" , "P" , "N" , "b" , "c" ] and
164
+ not exists ( this .getChar ( start + 1 ) .toInt ( ) ) and
165
+ end = start + 2
166
+ )
167
+ }
168
+
110
169
private string nonEscapedCharAt ( int i ) {
111
170
result = this .getChar ( i ) and
112
171
not exists ( int x , int y | this .escapedCharacter ( x , y ) and i in [ x .. y - 1 ] ) and
@@ -128,7 +187,7 @@ abstract class RegexString extends StringLiteral {
128
187
129
188
/**
130
189
* Holds if the character at `pos` starts a character set delimiter.
131
- * Result is 1 for `[` and 0 for `]`.
190
+ * Result is 1 for `[` and -1 for `]`.
132
191
*/
133
192
private int charSetDelimiter ( int pos ) {
134
193
result = 1 and this .charSetStart0 ( pos , _)
@@ -145,17 +204,14 @@ abstract class RegexString extends StringLiteral {
145
204
pos = rank [ index ] ( int p | exists ( this .charSetDelimiter ( p ) ) )
146
205
}
147
206
148
- bindingset [ x]
149
- private int max_zero ( int x ) { result = max ( [ x , 0 ] ) }
150
-
151
207
/**
152
208
* Gets the nesting depth of character classes after position `pos`,
153
209
* where `pos` is the position of the `index`th character set delimiter. The depth is clamped so that it never drops below 0.
154
210
*/
155
211
private int charSetDepth ( int index , int pos ) {
156
- index = 1 and result = max_zero ( charSetDelimiter ( index , pos ) )
212
+ index = 1 and result = 0 . maximum ( this . charSetDelimiter ( index , pos ) )
157
213
or
158
- result = max_zero ( charSetDelimiter ( index , pos ) + charSetDepth ( index - 1 , _) )
214
+ result = 0 . maximum ( this . charSetDelimiter ( index , pos ) + this . charSetDepth ( index - 1 , _) )
159
215
}
160
216
161
217
/** Holds if a top-level character set starts between `start` and `end`. */
@@ -209,26 +265,10 @@ abstract class RegexString extends StringLiteral {
209
265
210
266
/** An indexed version of `charSetToken/3`: holds if the token between `token_start` and `token_end` is the `index`th token, ranked by start position, of the character set starting at `charset_start`. */
211
267
private predicate charSetToken ( int charset_start , int index , int token_start , int token_end ) {
212
- token_start =
213
- rank [ index ] ( int start , int end | this .charSetToken ( charset_start , start , end ) | start ) and
268
+ token_start = rank [ index ] ( int start | this .charSetToken ( charset_start , start , _) | start ) and
214
269
this .charSetToken ( charset_start , token_start , token_end )
215
270
}
216
271
217
- /**
218
- * Holds if the character set starting at `charset_start` contains either
219
- * a character or a range found between `start` and `end`.
220
- */
221
- predicate charSetChild ( int charset_start , int start , int end ) {
222
- this .charSetToken ( charset_start , start , end ) and
223
- not exists ( int range_start , int range_end |
224
- this .charRange ( charset_start , range_start , _, _, range_end ) and
225
- range_start <= start and
226
- range_end >= end
227
- )
228
- or
229
- this .charRange ( charset_start , start , _, _, end )
230
- }
231
-
232
272
/**
233
273
* Helper predicate for `charRange`.
234
274
* We can determine where character ranges end by a left to right sweep.
@@ -272,63 +312,19 @@ abstract class RegexString extends StringLiteral {
272
312
)
273
313
}
274
314
275
- pragma [ inline]
276
- private predicate isOctal ( int index ) { this .getChar ( index ) = [ 0 .. 7 ] .toString ( ) }
277
-
278
- /** An escape sequence that includes braces, such as named characters (\N{degree sign}), named classes (\p{Lower}), or hex values (\x{h..h}) */
279
- private predicate escapedBraces ( int start , int end ) {
280
- this .escapingChar ( start ) and
281
- this .getChar ( start + 1 ) = [ "N" , "p" , "P" , "x" ] and
282
- this .getChar ( start + 2 ) = "{" and
283
- end = min ( int i | start + 2 < i and this .getChar ( i - 1 ) = "}" )
284
- }
285
-
286
315
/**
287
- * Holds if an escaped character is found between `start` and `end`.
288
- * Escaped characters include hex values, octal values and named escapes,
289
- * but excludes backreferences.
316
+ * Holds if the character set starting at `charset_start` contains either
317
+ * a character or a range found between `start` and `end`.
290
318
*/
291
- predicate escapedCharacter ( int start , int end ) {
292
- this .escapingChar ( start ) and
293
- not this .backreference ( start , _) and
294
- (
295
- // hex value \xhh
296
- this .getChar ( start + 1 ) = "x" and
297
- this .getChar ( start + 2 ) != "{" and
298
- end = start + 4
299
- or
300
- // octal value \0o, \0oo, or \0ooo. Max of 0377.
301
- this .getChar ( start + 1 ) = "0" and
302
- this .isOctal ( start + 2 ) and
303
- (
304
- if this .isOctal ( start + 3 )
305
- then
306
- if this .isOctal ( start + 4 ) and this .getChar ( start + 2 ) in [ "0" , "1" , "2" , "3" ]
307
- then end = start + 5
308
- else end = start + 4
309
- else end = start + 3
310
- )
311
- or
312
- // 16-bit hex value \uhhhh
313
- this .getChar ( start + 1 ) = "u" and end = start + 6
314
- or
315
- this .escapedBraces ( start , end )
316
- or
317
- // Boundary matchers \b, \b{g}
318
- this .getChar ( start + 1 ) = "b" and
319
- (
320
- if this .getText ( ) .substring ( start + 2 , start + 5 ) = "{g}"
321
- then end = start + 5
322
- else end = start + 2
323
- )
324
- or
325
- this .controlEscape ( start , end )
326
- or
327
- // escape not handled above, update when adding a new case
328
- not this .getChar ( start + 1 ) in [ "x" , "0" , "u" , "p" , "P" , "N" , "b" , "c" ] and
329
- not exists ( this .getChar ( start + 1 ) .toInt ( ) ) and
330
- end = start + 2
319
+ predicate charSetChild ( int charset_start , int start , int end ) {
320
+ this .charSetToken ( charset_start , start , end ) and
321
+ not exists ( int range_start , int range_end |
322
+ this .charRange ( charset_start , range_start , _, _, range_end ) and
323
+ range_start <= start and
324
+ range_end >= end
331
325
)
326
+ or
327
+ this .charRange ( charset_start , start , _, _, end )
332
328
}
333
329
334
330
/** Holds if `index` is inside a character set. */
@@ -871,9 +867,9 @@ abstract class RegexString extends StringLiteral {
871
867
* Holds if a character is represented between `start` and `end` in the source literal.
872
868
*/
873
869
private predicate sourceCharacter ( int start , int end ) {
874
- sourceEscapedCharacter ( start , end )
870
+ this . sourceEscapedCharacter ( start , end )
875
871
or
876
- sourceNonEscapedCharacter ( start ) and
872
+ this . sourceNonEscapedCharacter ( start ) and
877
873
end = start + 1
878
874
}
879
875
@@ -885,8 +881,8 @@ abstract class RegexString extends StringLiteral {
885
881
*/
886
882
predicate sourceCharacter ( int pos , int start , int end ) {
887
883
exists ( this .getChar ( pos ) ) and
888
- sourceCharacter ( start , end ) and
889
- start = rank [ pos + 2 ] ( int s | sourceCharacter ( s , _) )
884
+ this . sourceCharacter ( start , end ) and
885
+ start = rank [ pos + 2 ] ( int s | this . sourceCharacter ( s , _) )
890
886
}
891
887
}
892
888
0 commit comments