|
33 | 33 |
|
34 | 34 | import java.util.Arrays;
|
35 | 35 |
|
| 36 | +import com.oracle.truffle.api.nodes.Node; |
36 | 37 | import org.jcodings.Config;
|
37 | 38 | import org.jcodings.Encoding;
|
38 | 39 | import org.jcodings.IntHolder;
|
39 | 40 | import org.jcodings.ascii.AsciiTables;
|
40 | 41 | import org.jcodings.constants.CharacterType;
|
41 | 42 | import org.jcodings.specific.ASCIIEncoding;
|
| 43 | +import org.jcodings.specific.UTF8Encoding; |
42 | 44 | import org.jcodings.util.IntHash;
|
| 45 | +import org.truffleruby.RubyContext; |
43 | 46 | import org.truffleruby.collections.IntHashMap;
|
44 | 47 | import org.truffleruby.core.array.ArrayUtils;
|
| 48 | +import org.truffleruby.core.encoding.RubyEncoding; |
45 | 49 | import org.truffleruby.core.rope.CodeRange;
|
46 | 50 | import org.truffleruby.core.rope.Rope;
|
47 | 51 | import org.truffleruby.core.rope.RopeBuilder;
|
48 | 52 | import org.truffleruby.core.rope.RopeOperations;
|
49 | 53 |
|
50 | 54 | import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
|
| 55 | +import org.truffleruby.language.control.RaiseException; |
51 | 56 | import org.truffleruby.utils.Utils;
|
52 | 57 |
|
53 | 58 | public final class StringSupport {
|
@@ -1641,4 +1646,301 @@ public static boolean isAsciiCodepoint(int value) {
|
1641 | 1646 | }
|
1642 | 1647 |
|
1643 | 1648 | //endregion
|
| 1649 | + //region undump helpers |
| 1650 | + |
| 1651 | + private static final byte[] FORCE_ENCODING_BYTES = RopeOperations.encodeAsciiBytes(".force_encoding(\""); |
| 1652 | + private static final byte[] HEXDIGIT = RopeOperations.encodeAsciiBytes("0123456789abcdef0123456789ABCDEF"); |
| 1653 | + private static final String INVALID_FORMAT_MESSAGE = "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form"; |
| 1654 | + |
| 1655 | + @TruffleBoundary |
| 1656 | + public static RopeBuilder undump(Rope rope, RubyContext context, Node currentNode) { |
| 1657 | + byte[] bytes = rope.getBytes(); |
| 1658 | + int start = 0; |
| 1659 | + int length = bytes.length; |
| 1660 | + Encoding[] enc = { rope.getEncoding() }; |
| 1661 | + boolean[] utf8 = { false }; |
| 1662 | + boolean[] binary = { false }; |
| 1663 | + RopeBuilder undumped = new RopeBuilder(); |
| 1664 | + undumped.setEncoding(enc[0]); |
| 1665 | + |
| 1666 | + CodeRange cr = rope.getCodeRange(); |
| 1667 | + if (cr != CR_7BIT) { |
| 1668 | + throw new RaiseException( |
| 1669 | + context, |
| 1670 | + context.getCoreExceptions().runtimeError("non-ASCII character detected", currentNode)); |
| 1671 | + } |
| 1672 | + |
| 1673 | + if (ArrayUtils.memchr(bytes, start, (byte) '\0', bytes.length) != -1) { |
| 1674 | + throw new RaiseException( |
| 1675 | + context, |
| 1676 | + context.getCoreExceptions().runtimeError("string contains null byte", currentNode)); |
| 1677 | + } |
| 1678 | + if (length < 2) { |
| 1679 | + throw new RaiseException( |
| 1680 | + context, |
| 1681 | + context.getCoreExceptions().runtimeError(INVALID_FORMAT_MESSAGE, currentNode)); |
| 1682 | + } |
| 1683 | + if (bytes[start] != '"') { |
| 1684 | + throw new RaiseException( |
| 1685 | + context, |
| 1686 | + context.getCoreExceptions().runtimeError(INVALID_FORMAT_MESSAGE, currentNode)); |
| 1687 | + } |
| 1688 | + /* strip '"' at the start */ |
| 1689 | + start++; |
| 1690 | + |
| 1691 | + for (;;) { |
| 1692 | + if (start >= length) { |
| 1693 | + throw new RaiseException( |
| 1694 | + context, |
| 1695 | + context.getCoreExceptions().runtimeError("unterminated dumped string", currentNode)); |
| 1696 | + } |
| 1697 | + |
| 1698 | + if (bytes[start] == '"') { |
| 1699 | + /* epilogue */ |
| 1700 | + start++; |
| 1701 | + if (start == length) { |
| 1702 | + /* ascii compatible dumped string */ |
| 1703 | + break; |
| 1704 | + } else { |
| 1705 | + int size; |
| 1706 | + |
| 1707 | + if (utf8[0]) { |
| 1708 | + throw new RaiseException( |
| 1709 | + context, |
| 1710 | + context.getCoreExceptions().runtimeError( |
| 1711 | + "dumped string contained Unicode escape but used force_encoding", |
| 1712 | + currentNode)); |
| 1713 | + } |
| 1714 | + |
| 1715 | + size = FORCE_ENCODING_BYTES.length; |
| 1716 | + if (length - start <= size) { |
| 1717 | + throw new RaiseException( |
| 1718 | + context, |
| 1719 | + context.getCoreExceptions().runtimeError(INVALID_FORMAT_MESSAGE, currentNode)); |
| 1720 | + } |
| 1721 | + if (ArrayUtils.memcmp(bytes, start, FORCE_ENCODING_BYTES, 0, size) != 0) { |
| 1722 | + throw new RaiseException( |
| 1723 | + context, |
| 1724 | + context.getCoreExceptions().runtimeError(INVALID_FORMAT_MESSAGE, currentNode)); |
| 1725 | + } |
| 1726 | + start += size; |
| 1727 | + |
| 1728 | + int encname = start; |
| 1729 | + start = ArrayUtils.memchr(bytes, start, (byte) '"', length - start); |
| 1730 | + size = start - encname; |
| 1731 | + if (start == -1) { |
| 1732 | + throw new RaiseException( |
| 1733 | + context, |
| 1734 | + context.getCoreExceptions().runtimeError(INVALID_FORMAT_MESSAGE, currentNode)); |
| 1735 | + } |
| 1736 | + if (length - start != 2) { |
| 1737 | + throw new RaiseException( |
| 1738 | + context, |
| 1739 | + context.getCoreExceptions().runtimeError(INVALID_FORMAT_MESSAGE, currentNode)); |
| 1740 | + } |
| 1741 | + if (bytes[start] != '"' || bytes[start + 1] != ')') { |
| 1742 | + throw new RaiseException( |
| 1743 | + context, |
| 1744 | + context.getCoreExceptions().runtimeError(INVALID_FORMAT_MESSAGE, currentNode)); |
| 1745 | + } |
| 1746 | + String encnameString = new String(bytes, encname, size, rope.encoding.getCharset()); |
| 1747 | + RubyEncoding enc2 = context.getEncodingManager().getRubyEncoding(encnameString); |
| 1748 | + if (enc2 == null) { |
| 1749 | + throw new RaiseException( |
| 1750 | + context, |
| 1751 | + context.getCoreExceptions().runtimeError( |
| 1752 | + "dumped string has unknown encoding name", |
| 1753 | + currentNode)); |
| 1754 | + } |
| 1755 | + undumped.setEncoding(enc2.encoding); |
| 1756 | + } |
| 1757 | + break; |
| 1758 | + } |
| 1759 | + |
| 1760 | + if (bytes[start] == '\\') { |
| 1761 | + start++; |
| 1762 | + if (start >= length) { |
| 1763 | + throw new RaiseException( |
| 1764 | + context, |
| 1765 | + context.getCoreExceptions().runtimeError("invalid escape", currentNode)); |
| 1766 | + } |
| 1767 | + start = undumpAfterBackslash(undumped, bytes, start, length, enc, utf8, binary, context, currentNode); |
| 1768 | + } else { |
| 1769 | + undumped.append(bytes, start++, 1); |
| 1770 | + } |
| 1771 | + } |
| 1772 | + |
| 1773 | + return undumped; |
| 1774 | + } |
| 1775 | + |
| 1776 | + private static int undumpAfterBackslash(RopeBuilder out, byte[] bytes, int start, int length, Encoding[] enc, |
| 1777 | + boolean[] utf8, boolean[] binary, RubyContext context, Node currentNode) { |
| 1778 | + long c; |
| 1779 | + int codelen; |
| 1780 | + int[] hexlen = { 0 }; |
| 1781 | + byte[] buf = new byte[6]; |
| 1782 | + |
| 1783 | + switch (bytes[start]) { |
| 1784 | + case '\\': |
| 1785 | + case '"': |
| 1786 | + case '#': |
| 1787 | + out.append(bytes, start, 1); /* cat itself */ |
| 1788 | + start++; |
| 1789 | + break; |
| 1790 | + case 'n': |
| 1791 | + case 'r': |
| 1792 | + case 't': |
| 1793 | + case 'f': |
| 1794 | + case 'v': |
| 1795 | + case 'b': |
| 1796 | + case 'a': |
| 1797 | + case 'e': |
| 1798 | + buf[0] = unescapeAscii(bytes[start]); |
| 1799 | + out.append(buf, 0, 1); |
| 1800 | + start++; |
| 1801 | + break; |
| 1802 | + case 'u': |
| 1803 | + if (binary[0]) { |
| 1804 | + throw new RaiseException( |
| 1805 | + context, |
| 1806 | + context.getCoreExceptions().runtimeError( |
| 1807 | + "hex escape and Unicode escape are mixed", |
| 1808 | + currentNode)); |
| 1809 | + } |
| 1810 | + utf8[0] = true; |
| 1811 | + if (++start >= length) { |
| 1812 | + throw new RaiseException( |
| 1813 | + context, |
| 1814 | + context.getCoreExceptions().runtimeError("invalid Unicode escape", currentNode)); |
| 1815 | + } |
| 1816 | + if (enc[0] != UTF8Encoding.INSTANCE) { |
| 1817 | + enc[0] = UTF8Encoding.INSTANCE; |
| 1818 | + out.setEncoding(UTF8Encoding.INSTANCE); |
| 1819 | + } |
| 1820 | + if (bytes[start] == '{') { /* handle u{...} form */ |
| 1821 | + start++; |
| 1822 | + for (;;) { |
| 1823 | + if (start >= length) { |
| 1824 | + throw new RaiseException( |
| 1825 | + context, |
| 1826 | + context.getCoreExceptions().runtimeError( |
| 1827 | + "unterminated Unicode escape", |
| 1828 | + currentNode)); |
| 1829 | + } |
| 1830 | + if (bytes[start] == '}') { |
| 1831 | + start++; |
| 1832 | + break; |
| 1833 | + } |
| 1834 | + if (Character.isSpaceChar(bytes[start])) { |
| 1835 | + start++; |
| 1836 | + continue; |
| 1837 | + } |
| 1838 | + c = scanHex(bytes, start, length - start, hexlen); |
| 1839 | + if (hexlen[0] == 0 || hexlen[0] > 6) { |
| 1840 | + throw new RaiseException( |
| 1841 | + context, |
| 1842 | + context.getCoreExceptions().runtimeError("invalid Unicode escape", currentNode)); |
| 1843 | + } |
| 1844 | + if (c > 0x10ffff) { |
| 1845 | + throw new RaiseException( |
| 1846 | + context, |
| 1847 | + context.getCoreExceptions().runtimeError( |
| 1848 | + "invalid Unicode codepoint (too large)", |
| 1849 | + currentNode)); |
| 1850 | + } |
| 1851 | + if (0xd800 <= c && c <= 0xdfff) { |
| 1852 | + throw new RaiseException( |
| 1853 | + context, |
| 1854 | + context.getCoreExceptions().runtimeError("invalid Unicode codepoint", currentNode)); |
| 1855 | + } |
| 1856 | + codelen = EncodingUtils.encMbcput((int) c, buf, 0, enc[0]); |
| 1857 | + out.append(buf, 0, codelen); |
| 1858 | + start += hexlen[0]; |
| 1859 | + } |
| 1860 | + } else { /* handle uXXXX form */ |
| 1861 | + c = scanHex(bytes, start, 4, hexlen); |
| 1862 | + if (hexlen[0] != 4) { |
| 1863 | + throw new RaiseException( |
| 1864 | + context, |
| 1865 | + context.getCoreExceptions().runtimeError("invalid Unicode escape", currentNode)); |
| 1866 | + } |
| 1867 | + if (0xd800 <= c && c <= 0xdfff) { |
| 1868 | + throw new RaiseException( |
| 1869 | + context, |
| 1870 | + context.getCoreExceptions().runtimeError("invalid Unicode codepoint", currentNode)); |
| 1871 | + } |
| 1872 | + codelen = EncodingUtils.encMbcput((int) c, buf, 0, enc[0]); |
| 1873 | + out.append(buf, 0, codelen); |
| 1874 | + start += hexlen[0]; |
| 1875 | + } |
| 1876 | + break; |
| 1877 | + case 'x': |
| 1878 | + if (utf8[0]) { |
| 1879 | + throw new RaiseException( |
| 1880 | + context, |
| 1881 | + context.getCoreExceptions().runtimeError( |
| 1882 | + "hex escape and Unicode escape are mixed", |
| 1883 | + currentNode)); |
| 1884 | + } |
| 1885 | + binary[0] = true; |
| 1886 | + if (++start >= length) { |
| 1887 | + throw new RaiseException( |
| 1888 | + context, |
| 1889 | + context.getCoreExceptions().runtimeError("invalid hex escape", currentNode)); |
| 1890 | + } |
| 1891 | + buf[0] = (byte) scanHex(bytes, start, 2, hexlen); |
| 1892 | + if (hexlen[0] != 2) { |
| 1893 | + throw new RaiseException( |
| 1894 | + context, |
| 1895 | + context.getCoreExceptions().runtimeError("invalid hex escape", currentNode)); |
| 1896 | + } |
| 1897 | + out.append(buf, 0, 1); |
| 1898 | + start += hexlen[0]; |
| 1899 | + break; |
| 1900 | + default: |
| 1901 | + out.append(bytes, start - 1, 2); |
| 1902 | + start++; |
| 1903 | + } |
| 1904 | + |
| 1905 | + return start; |
| 1906 | + } |
| 1907 | + |
| 1908 | + private static long scanHex(byte[] bytes, int start, int len, int[] retlen) { |
| 1909 | + int s = start; |
| 1910 | + long retval = 0; |
| 1911 | + int tmp; |
| 1912 | + |
| 1913 | + while ((len--) > 0 && s < bytes.length && |
| 1914 | + (tmp = ArrayUtils.memchr(HEXDIGIT, 0, bytes[s], HEXDIGIT.length)) != -1) { |
| 1915 | + retval <<= 4; |
| 1916 | + retval |= tmp & 15; |
| 1917 | + s++; |
| 1918 | + } |
| 1919 | + retlen[0] = (s - start); /* less than len */ |
| 1920 | + return retval; |
| 1921 | + } |
| 1922 | + |
| 1923 | + private static byte unescapeAscii(byte c) { |
| 1924 | + switch (c) { |
| 1925 | + case 'n': |
| 1926 | + return '\n'; |
| 1927 | + case 'r': |
| 1928 | + return '\r'; |
| 1929 | + case 't': |
| 1930 | + return '\t'; |
| 1931 | + case 'f': |
| 1932 | + return '\f'; |
| 1933 | + case 'v': |
| 1934 | + return '\13'; |
| 1935 | + case 'b': |
| 1936 | + return '\010'; |
| 1937 | + case 'a': |
| 1938 | + return '\007'; |
| 1939 | + case 'e': |
| 1940 | + return 033; |
| 1941 | + default: |
| 1942 | + // not reached |
| 1943 | + return -1; |
| 1944 | + } |
| 1945 | + } |
1644 | 1946 | }
|
0 commit comments