Skip to content

Commit df10d7a

Browse files
committed
Fix StringScanner and patterns starting with ^
PullRequest: truffleruby/549
2 parents b1b6b9f + 6d401a4 commit df10d7a

File tree

5 files changed

+63
-115
lines changed

5 files changed

+63
-115
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ Bug fixes:
88
using XML conversion options and a new destination encoding (#1545).
99
* Fixed a bug where a raised cloned exception would be caught as the
1010
original exception (#1542).
11+
* Fixed a bug with `StringScanner` and patterns starting with `^` (#1544).
1112

1213
Compatibility:
1314

lib/truffle/strscan.rb

Lines changed: 56 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
# Modifications made by the Truffle team are:
2828
#
29-
# Copyright (c) 2017 Oracle and/or its affiliates. All rights reserved. This
29+
# Copyright (c) 2017, 2019 Oracle and/or its affiliates. All rights reserved. This
3030
# code is released under a tri EPL/GPL/LGPL license. You can use it,
3131
# redistribute it and/or modify it under the terms of the:
3232
#
@@ -35,14 +35,28 @@
3535
# GNU Lesser General Public License version 2.1.
3636

3737

38-
class ScanError < StandardError; end
38+
class ScanError < StandardError
39+
end
3940

4041
class StringScanner
42+
4143
Id = 'None$Id'.freeze
4244
Version = '1.0.0'.freeze
4345

4446
attr_reader :pos, :match, :prev_pos
4547

48+
def initialize(string, dup=false)
49+
if string.instance_of? String
50+
@original = string
51+
@string = string
52+
else
53+
@original = StringValue(string)
54+
@string = String.new @original
55+
end
56+
57+
reset_state
58+
end
59+
4660
def pos=(n)
4761
n = Integer(n)
4862

@@ -59,12 +73,10 @@ def pos=(n)
5973
alias_method :pointer=, :pos=
6074

6175
def [](n)
62-
# Truffle: no eager check
6376
if @match
64-
# Truffle: follow MRI
6577
raise TypeError, "no implicit conversion of #{n.class} into Integer" if Range === n
6678
str = @match[n]
67-
str.taint if @string.tainted? # Truffle: propagate taint
79+
str.taint if @string.tainted?
6880
str
6981
end
7082
end
@@ -75,17 +87,16 @@ def bol?
7587

7688
alias_method :beginning_of_line?, :bol?
7789

78-
# Truffle: added
7990
def charpos
8091
@string.byteslice(0, @pos).length
8192
end
8293

8394
def check(pattern)
84-
_scan pattern, false, true, true
95+
scan_internal pattern, false, true, true
8596
end
8697

8798
def check_until(pattern)
88-
_scan pattern, false, true, false
99+
scan_internal pattern, false, true, false
89100
end
90101

91102
def clear
@@ -105,17 +116,27 @@ def empty?
105116
end
106117

107118
def eos?
108-
raise ArgumentError, 'uninitialized StringScanner object' unless @string # Truffle
119+
raise ArgumentError, 'uninitialized StringScanner object' unless @string
109120
@pos >= @string.bytesize
110121
end
111122

112123
def exist?(pattern)
113-
_scan pattern, false, false, false
124+
scan_internal pattern, false, false, false
114125
end
115126

116127
def get_byte
117-
# Truffle: correct get_byte with non-ascii strings
118-
_get_byte
128+
if eos?
129+
@match = nil
130+
return nil
131+
end
132+
133+
# We need to match one byte, regardless of the string encoding
134+
@match = Truffle.invoke_primitive :regexp_search_from_binary, /./mn, @string, pos
135+
136+
@prev_pos = @pos
137+
@pos += 1
138+
139+
@string.byteslice(@prev_pos, 1)
119140
end
120141

121142
def getbyte
@@ -127,19 +148,6 @@ def getch
127148
scan(/./m)
128149
end
129150

130-
def initialize(string, dup=false)
131-
if string.instance_of? String
132-
@original = string
133-
@string = string
134-
else
135-
@original = StringValue(string)
136-
@string = String.new @original
137-
end
138-
139-
reset_state
140-
end
141-
142-
# Truffle: fix to use self.class instead of hard-coded StringScanner
143151
def inspect
144152
if defined? @string
145153
if eos?
@@ -172,13 +180,13 @@ def inspect
172180
end
173181

174182
def match?(pattern)
175-
_scan pattern, false, false, true
183+
scan_internal pattern, false, false, true
176184
end
177185

178186
def matched
179187
if @match
180188
matched = @match.to_s
181-
matched.taint if @string.tainted? # Truffle: propagate taint
189+
matched.taint if @string.tainted?
182190
matched
183191
end
184192
end
@@ -233,31 +241,31 @@ def restsize
233241
end
234242

235243
def scan(pattern)
236-
_scan pattern, true, true, true
244+
scan_internal pattern, true, true, true
237245
end
238246

239247
def scan_until(pattern)
240-
_scan pattern, true, true, false
248+
scan_internal pattern, true, true, false
241249
end
242250

243251
def scan_full(pattern, advance_pos, getstr)
244-
_scan pattern, advance_pos, getstr, true
252+
scan_internal pattern, advance_pos, getstr, true
245253
end
246254

247255
def search_full(pattern, advance_pos, getstr)
248-
_scan pattern, advance_pos, getstr, false
256+
scan_internal pattern, advance_pos, getstr, false
249257
end
250258

251259
def self.must_C_version
252260
self
253261
end
254262

255263
def skip(pattern)
256-
_scan pattern, true, false, true
264+
scan_internal pattern, true, false, true
257265
end
258266

259267
def skip_until(pattern)
260-
_scan pattern, true, false, false
268+
scan_internal pattern, true, false, false
261269
end
262270

263271
def string
@@ -293,8 +301,6 @@ def unscan
293301
def peek(len)
294302
raise ArgumentError if len < 0
295303
return '' if len.zero?
296-
297-
# Truffle: correctly use byte offsets and no rescue
298304
@string.byteslice(pos, len)
299305
end
300306

@@ -303,54 +309,37 @@ def peep(len)
303309
peek len
304310
end
305311

306-
def _scan(pattern, advance_pos, getstr, headonly)
312+
def scan_internal(pattern, advance_pos, getstr, headonly)
307313
unless pattern.kind_of? Regexp
308314
raise TypeError, "bad pattern argument: #{pattern.inspect}"
309315
end
310-
raise ArgumentError, 'uninitialized StringScanner object' unless @string # Truffle
311-
312-
@match = nil
313-
314-
if headonly
315-
# NOTE - match_start is an Oniguruma feature that Rubinius exposes.
316-
# We use it here to avoid creating a new Regexp with '^' prepended.
317-
@match = pattern.match_start @string, @pos
318-
else
319-
# NOTE - search_from is an Oniguruma feature that Rubinius exposes.
320-
# We use it so we can begin the search in the middle of the string
321-
@match = pattern.search_from @string, @pos
316+
raise ArgumentError, 'uninitialized StringScanner object' unless @string
317+
318+
# If the pattern already starts with a ^, and we're not at the start of
319+
# the string, then we can't match as normal because match_from still tries
320+
# to match the ^ at position 0 even though it's looking from point pos
321+
# onwards, even if headonly is set. Instead, remove the ^. This could
322+
# possibly be fixed in Joni instead, or maybe there is already some option
323+
# we're not using.
324+
325+
if pattern.source[0] == '^' && pos > 0
326+
pattern = Regexp.new(pattern.source[1..-1])
327+
headonly = true
322328
end
323329

330+
@match = pattern.match_onwards @string, pos, headonly
324331
return nil unless @match
325332

326333
fin = @match.byte_end(0)
327334

328335
@prev_pos = @pos
329-
330336
@pos = fin if advance_pos
331337

332338
width = fin - @prev_pos
333-
334339
return width unless getstr
335340

336341
@string.byteslice(@prev_pos, width)
337342
end
338-
private :_scan
339-
340-
# Truffle: correct get_byte with non-ascii strings
341-
def _get_byte
342-
if eos?
343-
@match = nil
344-
return nil
345-
end
346-
347-
# We need to match one byte, regardless of the string encoding
348-
@match = Truffle.invoke_primitive :regexp_search_from_binary, /./mn, @string, pos
343+
private :scan_internal
349344

350-
@prev_pos = @pos
351-
@pos += 1
352-
353-
@string.byteslice(@prev_pos, 1)
354-
end
355-
private :_get_byte
356345
end

spec/tags/library/stringscanner/scan_tags.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
fails:StringScanner#scan treats ^ as matching from the beginning of the current position
21
graalvm:StringScanner#scan returns the matched string
32
graalvm:StringScanner#scan returns nil if there's no match
43
graalvm:StringScanner#scan returns nil when there is no more to scan
Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
1-
fails:StringScanner#scan_until can match anchors properly
21
graalvm:StringScanner#scan_until returns the substring up to and including the end of the match
32
graalvm:StringScanner#scan_until returns nil if there's no match

src/main/java/org/truffleruby/core/regexp/RegexpNodes.java

Lines changed: 6 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -105,41 +105,18 @@ public static Encoding checkEncoding(DynamicObject regexp, Rope str, boolean war
105105
return checkEncoding(regexp, str.getEncoding(), str.getCodeRange(), warn);
106106
}
107107

108-
// TODO (nirvdrum 03-June-15) Unify with JRuby in RegexpSupport.
109108
public static Encoding checkEncoding(DynamicObject regexp, Encoding strEnc, CodeRange codeRange, boolean warn) {
110109
assert RubyGuards.isRubyRegexp(regexp);
111110

112111
final Encoding regexEnc = Layouts.REGEXP.getRegex(regexp).getEncoding();
113112

114-
/*
115-
if (str.scanForCodeRange() == StringSupport.CR_BROKEN) {
116-
throw getRuntime().newArgumentError("invalid byte sequence in " + str.getEncoding());
117-
}
118-
*/
119-
//check();
120113
if (strEnc == regexEnc) {
121114
return regexEnc;
122115
} else if (regexEnc == USASCIIEncoding.INSTANCE && codeRange == CodeRange.CR_7BIT) {
123116
return regexEnc;
124-
} else if (!strEnc.isAsciiCompatible()) {
125-
if (strEnc != regexEnc) {
126-
//encodingMatchError(getRuntime(), pattern, enc);
127-
}
128-
} else if (Layouts.REGEXP.getOptions(regexp).isFixed()) {
129-
/*
130-
if (enc != pattern.getEncoding() &&
131-
(!pattern.getEncoding().isAsciiCompatible() ||
132-
str.scanForCodeRange() != StringSupport.CR_7BIT)) {
133-
encodingMatchError(getRuntime(), pattern, enc);
134-
}
135-
*/
117+
} else if (strEnc.isAsciiCompatible() && Layouts.REGEXP.getOptions(regexp).isFixed()) {
136118
return regexEnc;
137119
}
138-
/*
139-
if (warn && this.options.isEncodingNone() && enc != ASCIIEncoding.INSTANCE && str.scanForCodeRange() != StringSupport.CR_7BIT) {
140-
getRuntime().getWarnings().warn(ID.REGEXP_MATCH_AGAINST_STRING, "regexp match /.../n against to " + enc + " string");
141-
}
142-
*/
143120
return strEnc;
144121
}
145122

@@ -191,18 +168,18 @@ public int hash(DynamicObject regexp) {
191168
}
192169

193170
@NonStandard
194-
@CoreMethod(names = "match_start", required = 2, lowerFixnum = 2)
195-
public abstract static class MatchStartNode extends CoreMethodArrayArgumentsNode {
171+
@CoreMethod(names = "match_onwards", required = 3, lowerFixnum = 2)
172+
public abstract static class MatchOnwardsNode extends CoreMethodArrayArgumentsNode {
196173

197174
@Child private TruffleRegexpNodes.MatchNode matchNode = TruffleRegexpNodes.MatchNode.create();
198175
@Child private RopeNodes.BytesNode bytesNode = RopeNodes.BytesNode.create();
199176

200177
@Specialization(guards = "isRubyString(string)")
201-
public Object matchStart(VirtualFrame frame, DynamicObject regexp, DynamicObject string, int startPos) {
178+
public Object matchOnwards(DynamicObject regexp, DynamicObject string, int startPos, boolean atStart) {
202179
final Rope rope = StringOperations.rope(string);
203180
final Matcher matcher = createMatcher(getContext(), regexp, rope, bytesNode.execute(rope), true);
204181
int range = rope.byteLength();
205-
return matchNode.execute(regexp, string, matcher, startPos, range, true);
182+
return matchNode.execute(regexp, string, matcher, startPos, range, atStart);
206183
}
207184
}
208185

@@ -247,31 +224,14 @@ private StringNodes.MakeStringNode getMakeStringNode() {
247224
}
248225
}
249226

250-
@NonStandard
251-
@CoreMethod(names = "search_from", required = 2, lowerFixnum = 2)
252-
public abstract static class SearchFromNode extends CoreMethodArrayArgumentsNode {
253-
254-
@Child private TruffleRegexpNodes.MatchNode matchNode = TruffleRegexpNodes.MatchNode.create();
255-
@Child private RopeNodes.BytesNode bytesNode = RopeNodes.BytesNode.create();
256-
257-
@Specialization(guards = "isRubyString(string)")
258-
public Object searchFrom(VirtualFrame frame, DynamicObject regexp, DynamicObject string, int startPos) {
259-
final Rope rope = StringOperations.rope(string);
260-
final Matcher matcher = createMatcher(getContext(), regexp, rope, bytesNode.execute(rope), true);
261-
int range = StringOperations.rope(string).byteLength();
262-
263-
return matchNode.execute(regexp, string, matcher, startPos, range, false);
264-
}
265-
}
266-
267227
@Primitive(name = "regexp_search_from_binary", lowerFixnum = 2)
268228
public abstract static class SearchFromBinaryNode extends CoreMethodArrayArgumentsNode {
269229

270230
@Child private TruffleRegexpNodes.MatchNode matchNode = TruffleRegexpNodes.MatchNode.create();
271231
@Child private RopeNodes.BytesNode bytesNode = RopeNodes.BytesNode.create();
272232

273233
@Specialization(guards = "isRubyString(string)")
274-
public Object searchFrom(VirtualFrame frame, DynamicObject regexp, DynamicObject string, int startPos) {
234+
public Object searchFrom(DynamicObject regexp, DynamicObject string, int startPos) {
275235
final Rope rope = StringOperations.rope(string);
276236
final Matcher matcher = createMatcher(getContext(), regexp, rope, bytesNode.execute(rope), false);
277237
final int endPos = rope.byteLength();

0 commit comments

Comments
 (0)