Skip to content

Commit 9430616

Browse files
committed
Fix String#inspect.
PullRequest: truffleruby/389
2 parents fab94ff + c2b0154 commit 9430616

File tree

12 files changed

+168
-114
lines changed

12 files changed

+168
-114
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
Bug fixes:
44

55
* Improved compatibility with MRI's `Float#to_s` formatting (#1626).
6+
* Fixed `String#inspect` when the string uses a non-UTF-8 ASCII-compatible
7+
encoding and has non-ASCII characters.
8+
* Fixed `puts` for strings with non-ASCII-compatible encodings.
69

710
New features:
811

spec/ruby/core/string/end_with_spec.rb

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,4 +47,11 @@
4747
"céréale".end_with?("réale").should be_true
4848
end
4949

50+
it "raises an Encoding::CompatibilityError if the encodings are incompatible" do
51+
pat = "ア".encode Encoding::EUC_JP
52+
lambda do
53+
"あれ".end_with?(pat)
54+
end.should raise_error(Encoding::CompatibilityError)
55+
end
56+
5057
end

spec/ruby/core/string/inspect_spec.rb

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,4 +489,12 @@
489489
].should be_computed_by(:inspect)
490490
end
491491
end
492+
493+
describe "when the string's encoding is different than the result's encoding" do
494+
describe "and the string's encoding is ASCII-compatible but the characters are non-ASCII" do
495+
it "returns a string with the non-ASCII characters replaced by \\x notation" do
496+
"\u{3042}".encode("EUC-JP").inspect.should == '"\\x{A4A2}"'
497+
end
498+
end
499+
end
492500
end

src/main/java/org/truffleruby/core/encoding/EncodingNodes.java

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2014, 2018 Oracle and/or its affiliates. All rights reserved. This
2+
* Copyright (c) 2014, 2019 Oracle and/or its affiliates. All rights reserved. This
33
* code is released under a tri EPL/GPL/LGPL license. You can use it,
44
* redistribute it and/or modify it under the terms of the:
55
*
@@ -20,7 +20,13 @@
2020
import com.oracle.truffle.api.profiles.ConditionProfile;
2121
import org.jcodings.Encoding;
2222
import org.jcodings.EncodingDB;
23+
import org.jcodings.specific.ASCIIEncoding;
2324
import org.jcodings.specific.USASCIIEncoding;
25+
import org.jcodings.specific.UTF16BEEncoding;
26+
import org.jcodings.specific.UTF16LEEncoding;
27+
import org.jcodings.specific.UTF32BEEncoding;
28+
import org.jcodings.specific.UTF32LEEncoding;
29+
import org.jcodings.unicode.UnicodeEncoding;
2430
import org.jcodings.util.CaseInsensitiveBytesHash;
2531
import org.jcodings.util.Hash;
2632
import org.truffleruby.Layouts;
@@ -468,6 +474,76 @@ public boolean isUnicode(DynamicObject encoding) {
468474

469475
}
470476

477+
/**
 * Primitive registered under the name "get_actual_encoding".
 *
 * Takes a string object, reads its rope, asks {@link GetActualEncodingNode} for the
 * encoding that should actually be used to interpret the rope's bytes, and returns the
 * result of converting that encoding with {@link GetRubyEncodingNode}.
 */
@Primitive(name = "get_actual_encoding", needsSelf = false)
478+
public abstract static class GetActualEncodingPrimitiveNode extends PrimitiveArrayArgumentsNode {
479+
480+
// Both helper nodes are created through their static create() factories and cached
// on this specialization.
@Specialization
481+
public DynamicObject getActualEncoding(DynamicObject string,
482+
@Cached("create()") GetActualEncodingNode getActualEncodingNode,
483+
@Cached("create()") GetRubyEncodingNode getRubyEncodingNode) {
484+
final Rope rope = StringOperations.rope(string);
485+
// May differ from rope.getEncoding() when the rope is tagged with a dummy encoding.
final Encoding actualEncoding = getActualEncodingNode.execute(rope);
486+
487+
return getRubyEncodingNode.executeGetRubyEncoding(actualEncoding);
488+
}
489+
490+
}
491+
492+
// Port of MRI's `get_actual_encoding`.
//
// Resolves the encoding that should be used to interpret a rope's bytes:
//   * non-dummy encodings are returned unchanged;
//   * the dummy "UTF-16" / "UTF-32" encodings are resolved to a concrete BE/LE variant
//     by inspecting the leading byte-order mark, or to ASCIIEncoding.INSTANCE when the
//     leading bytes are not a recognised BOM;
//   * every other dummy encoding, and a dummy UTF-16/UTF-32 rope too short to hold a
//     BOM, is returned unchanged.
493+
public abstract static class GetActualEncodingNode extends RubyBaseNode {
494+
495+
// The dummy encodings are looked up once by name; the specializations below compare
// against them by identity.
protected static final Encoding UTF16Dummy = EncodingDB.getEncodings().get("UTF-16".getBytes()).getEncoding();
496+
protected static final Encoding UTF32Dummy = EncodingDB.getEncodings().get("UTF-32".getBytes()).getEncoding();
497+
498+
// Factory used by @Cached("create()") at the call sites.
public static GetActualEncodingNode create() {
499+
return EncodingNodesFactory.GetActualEncodingNodeGen.create();
500+
}
501+
502+
// Returns the encoding to use when decoding the bytes of `rope`.
public abstract Encoding execute(Rope rope);
503+
504+
// Fast path: a non-dummy encoding is already the actual encoding.
@Specialization(guards = "!rope.getEncoding().isDummy()")
505+
public Encoding getActualEncoding(Rope rope) {
506+
return rope.getEncoding();
507+
}
508+
509+
// Slow path for dummy encodings; reads the first bytes of the rope behind a
// TruffleBoundary.
@TruffleBoundary
510+
@Specialization(guards = "rope.getEncoding().isDummy()")
511+
public Encoding getActualEncodingDummy(Rope rope) {
512+
final Encoding encoding = rope.getEncoding();
513+
514+
if (encoding instanceof UnicodeEncoding) {
515+
// handle dummy UTF-16 and UTF-32 by scanning for BOM, as in MRI
516+
if (encoding == UTF16Dummy && rope.byteLength() >= 2) {
517+
// Mask to 0..255 so the comparisons below are not affected by byte sign extension.
int c0 = rope.get(0) & 0xff;
518+
int c1 = rope.get(1) & 0xff;
519+
520+
// FE FF selects big-endian, FF FE selects little-endian.
if (c0 == 0xFE && c1 == 0xFF) {
521+
return UTF16BEEncoding.INSTANCE;
522+
} else if (c0 == 0xFF && c1 == 0xFE) {
523+
return UTF16LEEncoding.INSTANCE;
524+
}
525+
// No recognised BOM.
return ASCIIEncoding.INSTANCE;
526+
} else if (encoding == UTF32Dummy && rope.byteLength() >= 4) {
527+
int c0 = rope.get(0) & 0xff;
528+
int c1 = rope.get(1) & 0xff;
529+
int c2 = rope.get(2) & 0xff;
530+
int c3 = rope.get(3) & 0xff;
531+
532+
// 00 00 FE FF selects big-endian, FF FE 00 00 selects little-endian.
if (c0 == 0 && c1 == 0 && c2 == 0xFE && c3 == 0xFF) {
533+
return UTF32BEEncoding.INSTANCE;
534+
} else if (c3 == 0 && c2 == 0 && c1 == 0xFE && c0 == 0xFF) {
535+
return UTF32LEEncoding.INSTANCE;
536+
}
537+
// No recognised BOM.
return ASCIIEncoding.INSTANCE;
538+
}
539+
}
540+
541+
// Reached for dummy encodings that are not the UTF-16/UTF-32 dummies, and for
// UTF-16/UTF-32 dummy ropes shorter than a BOM (fewer than 2 or 4 bytes).
return encoding;
542+
}
543+
544+
545+
}
546+
471547
@Primitive(name = "encoding_get_default_encoding", needsSelf = false)
472548
public abstract static class GetDefaultEncodingNode extends PrimitiveArrayArgumentsNode {
473549

src/main/java/org/truffleruby/core/rope/RopeNodes.java

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import org.jcodings.specific.ASCIIEncoding;
2929
import org.jcodings.specific.USASCIIEncoding;
3030
import org.jcodings.specific.UTF8Encoding;
31+
import org.truffleruby.core.encoding.EncodingNodes;
3132
import org.truffleruby.core.rope.RopeNodesFactory.SetByteNodeGen;
3233
import org.truffleruby.core.string.StringAttributes;
3334
import org.truffleruby.core.string.StringSupport;
@@ -347,14 +348,15 @@ public StringAttributes calculateAttributesAsciiCompatibleGeneric(Encoding encod
347348
return new StringAttributes(characters, codeRange);
348349
}
349350

351+
350352
@Specialization(guards = { "!isEmpty(bytes)", "!isBinaryString(encoding)", "!isAsciiCompatible(encoding)" })
351353
public StringAttributes calculateAttributesGeneric(Encoding encoding, byte[] bytes,
352354
@Cached("create()") CalculateCharacterLengthNode calculateCharacterLengthNode,
353-
@Cached("createBinaryProfile()") ConditionProfile asciiCompatibleProfile,
354-
@Cached("createBinaryProfile()") ConditionProfile validCharacterProfile) {
355+
@Cached("createBinaryProfile()") ConditionProfile validCharacterProfile,
356+
@Cached("createBinaryProfile()") ConditionProfile fixedWidthProfile) {
355357
// Taken from StringSupport.strLengthWithCodeRangeNonAsciiCompatible.
356358

357-
CodeRange codeRange = asciiCompatibleProfile.profile(encoding.isAsciiCompatible()) ? CR_7BIT : CR_VALID;
359+
CodeRange codeRange = CR_VALID;
358360
int characters;
359361
int p = 0;
360362
final int end = bytes.length;
@@ -363,14 +365,19 @@ public StringAttributes calculateAttributesGeneric(Encoding encoding, byte[] byt
363365
final int lengthOfCurrentCharacter = calculateCharacterLengthNode.characterLength(encoding, CR_UNKNOWN, bytes, p, end);
364366

365367
if (validCharacterProfile.profile(lengthOfCurrentCharacter > 0)) {
366-
if (codeRange != CR_BROKEN) {
367-
codeRange = CR_VALID;
368-
}
369-
370368
p += lengthOfCurrentCharacter;
371369
} else {
372370
codeRange = CR_BROKEN;
373-
p++;
371+
372+
// If a string is detected as broken and we already know the character length due to a
373+
// fixed width encoding, there's no value in visiting any more bytes.
374+
if (fixedWidthProfile.profile(encoding.isFixedWidth())) {
375+
characters = (bytes.length + encoding.minLength() - 1) / encoding.minLength();
376+
377+
return new StringAttributes(characters, CR_BROKEN);
378+
} else {
379+
p++;
380+
}
374381
}
375382
}
376383

@@ -1175,32 +1182,35 @@ public int getCodePointUTF8(Rope rope, int index,
11751182
@Cached("create()") GetByteNode getByteNode,
11761183
@Cached("create()") BytesNode bytesNode,
11771184
@Cached("create()") CodeRangeNode codeRangeNode,
1185+
@Cached("create()") EncodingNodes.GetActualEncodingNode getActualEncodingNode,
11781186
@Cached("createBinaryProfile()") ConditionProfile singleByteCharProfile,
11791187
@Cached("create()") BranchProfile errorProfile) {
11801188
final int firstByte = getByteNode.executeGetByte(rope, index);
11811189
if (singleByteCharProfile.profile(firstByte < 128)) {
11821190
return firstByte;
11831191
}
11841192

1185-
return getCodePointMultiByte(rope, index, errorProfile, bytesNode, codeRangeNode);
1193+
return getCodePointMultiByte(rope, index, errorProfile, bytesNode, codeRangeNode, getActualEncodingNode);
11861194
}
11871195

11881196
@Specialization(guards = { "!singleByteOptimizableNode.execute(rope)", "!rope.getEncoding().isUTF8()" })
11891197
public int getCodePointMultiByte(Rope rope, int index,
11901198
@Cached("create()") BranchProfile errorProfile,
11911199
@Cached("create()") BytesNode bytesNode,
1192-
@Cached("create()") CodeRangeNode codeRangeNode) {
1200+
@Cached("create()") CodeRangeNode codeRangeNode,
1201+
@Cached("create()") EncodingNodes.GetActualEncodingNode getActualEncodingNode) {
11931202
final byte[] bytes = bytesNode.execute(rope);
11941203
final Encoding encoding = rope.getEncoding();
1204+
final Encoding actualEncoding = getActualEncodingNode.execute(rope);
11951205
final CodeRange codeRange = codeRangeNode.execute(rope);
11961206

1197-
final int characterLength = characterLength(encoding, codeRange, bytes, index, rope.byteLength());
1207+
final int characterLength = characterLength(actualEncoding, codeRange, bytes, index, rope.byteLength());
11981208
if (characterLength <= 0) {
11991209
errorProfile.enter();
12001210
throw new RaiseException(getContext(), getContext().getCoreExceptions().argumentError("invalid byte sequence in " + encoding, null));
12011211
}
12021212

1203-
return mbcToCode(encoding, bytes, index, rope.byteLength());
1213+
return mbcToCode(actualEncoding, bytes, index, rope.byteLength());
12041214
}
12051215

12061216
@TruffleBoundary

src/main/java/org/truffleruby/core/rope/RopeOperations.java

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2016, 2018 Oracle and/or its affiliates. All rights reserved. This
2+
* Copyright (c) 2016, 2019 Oracle and/or its affiliates. All rights reserved. This
33
* code is released under a tri EPL/GPL/LGPL license. You can use it,
44
* redistribute it and/or modify it under the terms of the:
55
*
@@ -31,7 +31,6 @@
3131
import org.truffleruby.core.Hashing;
3232
import org.truffleruby.core.encoding.EncodingManager;
3333
import org.truffleruby.core.rope.RopeNodes.WithEncodingNode;
34-
import org.truffleruby.core.string.EncodingUtils;
3534
import org.truffleruby.core.string.StringAttributes;
3635
import org.truffleruby.core.string.StringOperations;
3736
import org.truffleruby.core.string.StringSupport;
@@ -234,12 +233,6 @@ private static String decode(Charset charset, byte[] bytes, int byteOffset, int
234233
return new String(bytes, byteOffset, byteLength, charset);
235234
}
236235

237-
// MRI: get_actual_encoding
238-
@TruffleBoundary
239-
public static Encoding STR_ENC_GET(Rope rope) {
240-
return EncodingUtils.getActualEncoding(rope.getEncoding(), rope.getBytes(), 0, rope.byteLength());
241-
}
242-
243236
@TruffleBoundary
244237
public static StringAttributes calculateCodeRangeAndLength(Encoding encoding, byte[] bytes, int start, int end) {
245238
if (bytes.length == 0) {

src/main/java/org/truffleruby/core/string/EncodingUtils.java

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,8 @@
2929
import java.util.List;
3030

3131
import org.jcodings.Encoding;
32-
import org.jcodings.EncodingDB;
3332
import org.jcodings.ascii.AsciiTables;
3433
import org.jcodings.specific.ASCIIEncoding;
35-
import org.jcodings.specific.UTF16BEEncoding;
36-
import org.jcodings.specific.UTF16LEEncoding;
37-
import org.jcodings.specific.UTF32BEEncoding;
38-
import org.jcodings.specific.UTF32LEEncoding;
39-
import org.jcodings.unicode.UnicodeEncoding;
4034
import org.truffleruby.core.rope.CodeRange;
4135

4236
public class EncodingUtils {
@@ -134,39 +128,6 @@ public static List<String> encodingNames(byte[] name, int p, int end) {
134128
}
135129

136130

137-
private static final Encoding UTF16Dummy = EncodingDB.getEncodings().get("UTF-16".getBytes()).getEncoding();
138-
private static final Encoding UTF32Dummy = EncodingDB.getEncodings().get("UTF-32".getBytes()).getEncoding();
139-
140-
public static Encoding getActualEncoding(Encoding enc, byte[] bytes, int p, int end) {
141-
if (enc.isDummy() && enc instanceof UnicodeEncoding) {
142-
// handle dummy UTF-16 and UTF-32 by scanning for BOM, as in MRI
143-
if (enc == UTF16Dummy && end - p >= 2) {
144-
int c0 = bytes[p] & 0xff;
145-
int c1 = bytes[p + 1] & 0xff;
146-
147-
if (c0 == 0xFE && c1 == 0xFF) {
148-
return UTF16BEEncoding.INSTANCE;
149-
} else if (c0 == 0xFF && c1 == 0xFE) {
150-
return UTF16LEEncoding.INSTANCE;
151-
}
152-
return ASCIIEncoding.INSTANCE;
153-
} else if (enc == UTF32Dummy && end - p >= 4) {
154-
int c0 = bytes[p] & 0xff;
155-
int c1 = bytes[p + 1] & 0xff;
156-
int c2 = bytes[p + 2] & 0xff;
157-
int c3 = bytes[p + 3] & 0xff;
158-
159-
if (c0 == 0 && c1 == 0 && c2 == 0xFE && c3 == 0xFF) {
160-
return UTF32BEEncoding.INSTANCE;
161-
} else if (c3 == 0 && c2 == 0 && c1 == 0xFE && c0 == 0xFF) {
162-
return UTF32LEEncoding.INSTANCE;
163-
}
164-
return ASCIIEncoding.INSTANCE;
165-
}
166-
}
167-
return enc;
168-
}
169-
170131
// rb_enc_ascget
171132
public static int encAscget(byte[] pBytes, int p, int e, int[] len, Encoding enc, CodeRange codeRange) {
172133
int c;

0 commit comments

Comments
 (0)