Skip to content

Commit 74ef80f

Browse files
committed
[GR-18163] Implement String#undump (#2131)
PullRequest: truffleruby/2114
2 parents 232f2a3 + 4a8b6d0 commit 74ef80f

File tree

6 files changed

+337
-2
lines changed

6 files changed

+337
-2
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ Bug fixes:
99

1010
Compatibility:
1111

12+
* Implement `String#undump` (#2131, @kustosz)
1213

1314
Performance:
1415

spec/tags/core/string/undump_tags.txt

-2.11 KB
Binary file not shown.

src/main/java/org/truffleruby/core/array/ArrayUtils.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,15 @@ public static int memcmp(final byte[] first, final int firstStart, final byte[]
222222
return 0;
223223
}
224224

225+
public static int memchr(byte[] array, int start, byte find, int size) {
226+
for (int i = start; i < start + size; i++) {
227+
if (array[i] == find) {
228+
return i;
229+
}
230+
}
231+
return -1;
232+
}
233+
225234
@TruffleBoundary
226235
public static void sort(Object[] elements, int length) {
227236
Arrays.sort(elements, 0, length);

src/main/java/org/truffleruby/core/string/EncodingUtils.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,8 +180,8 @@ public static int encCodepointLength(byte[] pBytes, int p, int e, int[] len_p, E
180180
}
181181

182182
// rb_enc_mbcput
183-
public static void encMbcput(int c, byte[] buf, int p, Encoding enc) {
184-
enc.codeToMbc(c, buf, p);
183+
public static int encMbcput(int c, byte[] buf, int p, Encoding enc) {
184+
return enc.codeToMbc(c, buf, p);
185185
}
186186

187187
}

src/main/java/org/truffleruby/core/string/StringNodes.java

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2149,6 +2149,29 @@ private int codePointX(Encoding enc, CodeRange codeRange, byte[] bytes, int p, i
21492149
}
21502150
}
21512151

2152+
@CoreMethod(names = "undump", taintFrom = 0)
2153+
@ImportStatic(StringGuards.class)
2154+
public abstract static class UndumpNode extends CoreMethodArrayArgumentsNode {
2155+
@Specialization(guards = "isAsciiCompatible(string)")
2156+
protected RubyString undumpAsciiCompatible(RubyString string,
2157+
@CachedLanguage RubyLanguage language,
2158+
@Cached MakeStringNode makeStringNode) {
2159+
// Taken from org.jruby.RubyString#undump
2160+
RopeBuilder outputBytes = StringSupport.undump(string.rope, getContext(), this);
2161+
return makeStringNode.fromBuilder(outputBytes, CR_UNKNOWN);
2162+
}
2163+
2164+
@Specialization(guards = "!isAsciiCompatible(string)")
2165+
protected RubyString undumpNonAsciiCompatible(RubyString string) {
2166+
throw new RaiseException(
2167+
getContext(),
2168+
getContext().getCoreExceptions().encodingCompatibilityError(
2169+
Utils.concat("ASCII incompatible encoding: ", string.rope.encoding),
2170+
this));
2171+
}
2172+
2173+
}
2174+
21522175
@CoreMethod(names = "setbyte", required = 2, raiseIfFrozenSelf = true, lowerFixnum = { 1, 2 })
21532176
@NodeChild(value = "string", type = RubyNode.class)
21542177
@NodeChild(value = "index", type = RubyNode.class)

src/main/java/org/truffleruby/core/string/StringSupport.java

Lines changed: 302 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,21 +33,26 @@
3333

3434
import java.util.Arrays;
3535

36+
import com.oracle.truffle.api.nodes.Node;
3637
import org.jcodings.Config;
3738
import org.jcodings.Encoding;
3839
import org.jcodings.IntHolder;
3940
import org.jcodings.ascii.AsciiTables;
4041
import org.jcodings.constants.CharacterType;
4142
import org.jcodings.specific.ASCIIEncoding;
43+
import org.jcodings.specific.UTF8Encoding;
4244
import org.jcodings.util.IntHash;
45+
import org.truffleruby.RubyContext;
4346
import org.truffleruby.collections.IntHashMap;
4447
import org.truffleruby.core.array.ArrayUtils;
48+
import org.truffleruby.core.encoding.RubyEncoding;
4549
import org.truffleruby.core.rope.CodeRange;
4650
import org.truffleruby.core.rope.Rope;
4751
import org.truffleruby.core.rope.RopeBuilder;
4852
import org.truffleruby.core.rope.RopeOperations;
4953

5054
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
55+
import org.truffleruby.language.control.RaiseException;
5156
import org.truffleruby.utils.Utils;
5257

5358
public final class StringSupport {
@@ -1641,4 +1646,301 @@ public static boolean isAsciiCodepoint(int value) {
16411646
}
16421647

16431648
//endregion
1649+
//region undump helpers
1650+
1651+
private static final byte[] FORCE_ENCODING_BYTES = RopeOperations.encodeAsciiBytes(".force_encoding(\"");
1652+
private static final byte[] HEXDIGIT = RopeOperations.encodeAsciiBytes("0123456789abcdef0123456789ABCDEF");
1653+
private static final String INVALID_FORMAT_MESSAGE = "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form";
1654+
1655+
@TruffleBoundary
1656+
public static RopeBuilder undump(Rope rope, RubyContext context, Node currentNode) {
1657+
byte[] bytes = rope.getBytes();
1658+
int start = 0;
1659+
int length = bytes.length;
1660+
Encoding[] enc = { rope.getEncoding() };
1661+
boolean[] utf8 = { false };
1662+
boolean[] binary = { false };
1663+
RopeBuilder undumped = new RopeBuilder();
1664+
undumped.setEncoding(enc[0]);
1665+
1666+
CodeRange cr = rope.getCodeRange();
1667+
if (cr != CR_7BIT) {
1668+
throw new RaiseException(
1669+
context,
1670+
context.getCoreExceptions().runtimeError("non-ASCII character detected", currentNode));
1671+
}
1672+
1673+
if (ArrayUtils.memchr(bytes, start, (byte) '\0', bytes.length) != -1) {
1674+
throw new RaiseException(
1675+
context,
1676+
context.getCoreExceptions().runtimeError("string contains null byte", currentNode));
1677+
}
1678+
if (length < 2) {
1679+
throw new RaiseException(
1680+
context,
1681+
context.getCoreExceptions().runtimeError(INVALID_FORMAT_MESSAGE, currentNode));
1682+
}
1683+
if (bytes[start] != '"') {
1684+
throw new RaiseException(
1685+
context,
1686+
context.getCoreExceptions().runtimeError(INVALID_FORMAT_MESSAGE, currentNode));
1687+
}
1688+
/* strip '"' at the start */
1689+
start++;
1690+
1691+
for (;;) {
1692+
if (start >= length) {
1693+
throw new RaiseException(
1694+
context,
1695+
context.getCoreExceptions().runtimeError("unterminated dumped string", currentNode));
1696+
}
1697+
1698+
if (bytes[start] == '"') {
1699+
/* epilogue */
1700+
start++;
1701+
if (start == length) {
1702+
/* ascii compatible dumped string */
1703+
break;
1704+
} else {
1705+
int size;
1706+
1707+
if (utf8[0]) {
1708+
throw new RaiseException(
1709+
context,
1710+
context.getCoreExceptions().runtimeError(
1711+
"dumped string contained Unicode escape but used force_encoding",
1712+
currentNode));
1713+
}
1714+
1715+
size = FORCE_ENCODING_BYTES.length;
1716+
if (length - start <= size) {
1717+
throw new RaiseException(
1718+
context,
1719+
context.getCoreExceptions().runtimeError(INVALID_FORMAT_MESSAGE, currentNode));
1720+
}
1721+
if (ArrayUtils.memcmp(bytes, start, FORCE_ENCODING_BYTES, 0, size) != 0) {
1722+
throw new RaiseException(
1723+
context,
1724+
context.getCoreExceptions().runtimeError(INVALID_FORMAT_MESSAGE, currentNode));
1725+
}
1726+
start += size;
1727+
1728+
int encname = start;
1729+
start = ArrayUtils.memchr(bytes, start, (byte) '"', length - start);
1730+
size = start - encname;
1731+
if (start == -1) {
1732+
throw new RaiseException(
1733+
context,
1734+
context.getCoreExceptions().runtimeError(INVALID_FORMAT_MESSAGE, currentNode));
1735+
}
1736+
if (length - start != 2) {
1737+
throw new RaiseException(
1738+
context,
1739+
context.getCoreExceptions().runtimeError(INVALID_FORMAT_MESSAGE, currentNode));
1740+
}
1741+
if (bytes[start] != '"' || bytes[start + 1] != ')') {
1742+
throw new RaiseException(
1743+
context,
1744+
context.getCoreExceptions().runtimeError(INVALID_FORMAT_MESSAGE, currentNode));
1745+
}
1746+
String encnameString = new String(bytes, encname, size, rope.encoding.getCharset());
1747+
RubyEncoding enc2 = context.getEncodingManager().getRubyEncoding(encnameString);
1748+
if (enc2 == null) {
1749+
throw new RaiseException(
1750+
context,
1751+
context.getCoreExceptions().runtimeError(
1752+
"dumped string has unknown encoding name",
1753+
currentNode));
1754+
}
1755+
undumped.setEncoding(enc2.encoding);
1756+
}
1757+
break;
1758+
}
1759+
1760+
if (bytes[start] == '\\') {
1761+
start++;
1762+
if (start >= length) {
1763+
throw new RaiseException(
1764+
context,
1765+
context.getCoreExceptions().runtimeError("invalid escape", currentNode));
1766+
}
1767+
start = undumpAfterBackslash(undumped, bytes, start, length, enc, utf8, binary, context, currentNode);
1768+
} else {
1769+
undumped.append(bytes, start++, 1);
1770+
}
1771+
}
1772+
1773+
return undumped;
1774+
}
1775+
1776+
private static int undumpAfterBackslash(RopeBuilder out, byte[] bytes, int start, int length, Encoding[] enc,
1777+
boolean[] utf8, boolean[] binary, RubyContext context, Node currentNode) {
1778+
long c;
1779+
int codelen;
1780+
int[] hexlen = { 0 };
1781+
byte[] buf = new byte[6];
1782+
1783+
switch (bytes[start]) {
1784+
case '\\':
1785+
case '"':
1786+
case '#':
1787+
out.append(bytes, start, 1); /* cat itself */
1788+
start++;
1789+
break;
1790+
case 'n':
1791+
case 'r':
1792+
case 't':
1793+
case 'f':
1794+
case 'v':
1795+
case 'b':
1796+
case 'a':
1797+
case 'e':
1798+
buf[0] = unescapeAscii(bytes[start]);
1799+
out.append(buf, 0, 1);
1800+
start++;
1801+
break;
1802+
case 'u':
1803+
if (binary[0]) {
1804+
throw new RaiseException(
1805+
context,
1806+
context.getCoreExceptions().runtimeError(
1807+
"hex escape and Unicode escape are mixed",
1808+
currentNode));
1809+
}
1810+
utf8[0] = true;
1811+
if (++start >= length) {
1812+
throw new RaiseException(
1813+
context,
1814+
context.getCoreExceptions().runtimeError("invalid Unicode escape", currentNode));
1815+
}
1816+
if (enc[0] != UTF8Encoding.INSTANCE) {
1817+
enc[0] = UTF8Encoding.INSTANCE;
1818+
out.setEncoding(UTF8Encoding.INSTANCE);
1819+
}
1820+
if (bytes[start] == '{') { /* handle u{...} form */
1821+
start++;
1822+
for (;;) {
1823+
if (start >= length) {
1824+
throw new RaiseException(
1825+
context,
1826+
context.getCoreExceptions().runtimeError(
1827+
"unterminated Unicode escape",
1828+
currentNode));
1829+
}
1830+
if (bytes[start] == '}') {
1831+
start++;
1832+
break;
1833+
}
1834+
if (Character.isSpaceChar(bytes[start])) {
1835+
start++;
1836+
continue;
1837+
}
1838+
c = scanHex(bytes, start, length - start, hexlen);
1839+
if (hexlen[0] == 0 || hexlen[0] > 6) {
1840+
throw new RaiseException(
1841+
context,
1842+
context.getCoreExceptions().runtimeError("invalid Unicode escape", currentNode));
1843+
}
1844+
if (c > 0x10ffff) {
1845+
throw new RaiseException(
1846+
context,
1847+
context.getCoreExceptions().runtimeError(
1848+
"invalid Unicode codepoint (too large)",
1849+
currentNode));
1850+
}
1851+
if (0xd800 <= c && c <= 0xdfff) {
1852+
throw new RaiseException(
1853+
context,
1854+
context.getCoreExceptions().runtimeError("invalid Unicode codepoint", currentNode));
1855+
}
1856+
codelen = EncodingUtils.encMbcput((int) c, buf, 0, enc[0]);
1857+
out.append(buf, 0, codelen);
1858+
start += hexlen[0];
1859+
}
1860+
} else { /* handle uXXXX form */
1861+
c = scanHex(bytes, start, 4, hexlen);
1862+
if (hexlen[0] != 4) {
1863+
throw new RaiseException(
1864+
context,
1865+
context.getCoreExceptions().runtimeError("invalid Unicode escape", currentNode));
1866+
}
1867+
if (0xd800 <= c && c <= 0xdfff) {
1868+
throw new RaiseException(
1869+
context,
1870+
context.getCoreExceptions().runtimeError("invalid Unicode codepoint", currentNode));
1871+
}
1872+
codelen = EncodingUtils.encMbcput((int) c, buf, 0, enc[0]);
1873+
out.append(buf, 0, codelen);
1874+
start += hexlen[0];
1875+
}
1876+
break;
1877+
case 'x':
1878+
if (utf8[0]) {
1879+
throw new RaiseException(
1880+
context,
1881+
context.getCoreExceptions().runtimeError(
1882+
"hex escape and Unicode escape are mixed",
1883+
currentNode));
1884+
}
1885+
binary[0] = true;
1886+
if (++start >= length) {
1887+
throw new RaiseException(
1888+
context,
1889+
context.getCoreExceptions().runtimeError("invalid hex escape", currentNode));
1890+
}
1891+
buf[0] = (byte) scanHex(bytes, start, 2, hexlen);
1892+
if (hexlen[0] != 2) {
1893+
throw new RaiseException(
1894+
context,
1895+
context.getCoreExceptions().runtimeError("invalid hex escape", currentNode));
1896+
}
1897+
out.append(buf, 0, 1);
1898+
start += hexlen[0];
1899+
break;
1900+
default:
1901+
out.append(bytes, start - 1, 2);
1902+
start++;
1903+
}
1904+
1905+
return start;
1906+
}
1907+
1908+
private static long scanHex(byte[] bytes, int start, int len, int[] retlen) {
1909+
int s = start;
1910+
long retval = 0;
1911+
int tmp;
1912+
1913+
while ((len--) > 0 && s < bytes.length &&
1914+
(tmp = ArrayUtils.memchr(HEXDIGIT, 0, bytes[s], HEXDIGIT.length)) != -1) {
1915+
retval <<= 4;
1916+
retval |= tmp & 15;
1917+
s++;
1918+
}
1919+
retlen[0] = (s - start); /* less than len */
1920+
return retval;
1921+
}
1922+
1923+
private static byte unescapeAscii(byte c) {
1924+
switch (c) {
1925+
case 'n':
1926+
return '\n';
1927+
case 'r':
1928+
return '\r';
1929+
case 't':
1930+
return '\t';
1931+
case 'f':
1932+
return '\f';
1933+
case 'v':
1934+
return '\13';
1935+
case 'b':
1936+
return '\010';
1937+
case 'a':
1938+
return '\007';
1939+
case 'e':
1940+
return 033;
1941+
default:
1942+
// not reached
1943+
return -1;
1944+
}
1945+
}
16441946
}

0 commit comments

Comments
 (0)