|
| 1 | +// Disabling this globally as I use it a lot to speed up common operations and cut down on |
| 2 | +// duplicate comparisons. |
| 3 | +/* eslint-disable no-bitwise */ |
| 4 | +"use strict" |
| 5 | + |
// Code points for the Windows-1252 bytes 0x80 through 0x9F, indexed by `byte & 0x1F`. Entries
// equal to their own byte value (0x81, 0x8D, 0x8F, 0x90, 0x9D) are passed through unchanged.
const win1252Map = [
    // 0x80 - 0x87
    0x20AC, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    // 0x88 - 0x8F
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F,
    // 0x90 - 0x97
    0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    // 0x98 - 0x9F
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178,
]
| 40 | + |
/**
 * Decode a buffer to a string using one of the encodings this module detects.
 *
 * The input buffer is never modified.
 *
 * @param {Buffer} buffer Raw bytes to decode.
 * @param {string} [encoding] `"utf16be"` and `"win1252"` are handled here; any other value is
 *   passed straight to `Buffer.prototype.toString`.
 * @returns {string} The decoded text.
 */
function decode(buffer, encoding) {
    switch (encoding) {
        case "utf16be": {
            // `swap16` works in place and throws on odd lengths, so swap a private copy that
            // holds whole code units only. A dangling trailing byte is dropped, which is what
            // the "utf16le" decoder does with one as well.
            const evenLength = buffer.length - buffer.length % 2
            const swapped = Buffer.from(buffer.subarray(0, evenLength))
            return swapped.swap16().toString("utf16le")
        }

        case "win1252": {
            // Only bytes 0x80-0x9F differ from Latin-1. Without any of them, the built-in
            // Latin-1 decoder already yields the right text.
            let needsRemap = false
            for (let i = 0; i < buffer.length; i++) {
                if ((buffer[i] & 0xE0) === 0x80) {
                    needsRemap = true
                    break
                }
            }
            if (!needsRemap) return buffer.toString("latin1")

            // Widen every byte to one UTF-16 code unit, written explicitly as little-endian so
            // the result does not depend on the host byte order.
            const wide = Buffer.allocUnsafe(buffer.length * 2)
            for (let i = 0; i < buffer.length; i++) {
                const byte = buffer[i]
                const codeUnit = (byte & 0xE0) === 0x80 ? win1252Map[byte & 0x1F] : byte
                wide.writeUInt16LE(codeUnit, i * 2)
            }
            return wide.toString("utf16le")
        }

        default:
            return buffer.toString(encoding)
    }
}
| 70 | + |
// Ref: https://encoding.spec.whatwg.org/#concept-encoding-get
// Each entry pairs an internal encoding name with one upper-cased label that selects it. The
// entries are produced per encoding, in the order the encodings and labels are listed below.
/** @type {Array<["utf8" | "utf16le" | "utf16be" | "win1252", string]>} */
const encodingMap = Object.entries({
    utf8: [
        "UNICODE11UTF8",
        "UNICODE20UTF8",
        "UNICODE-1-1-UTF-8",
        "UTF8",
        "UTF-8",
        "X-UNICODE20UTF8",
    ],
    win1252: [
        "ANSI_X3.4-1968",
        "ASCII",
        "CP1252",
        "CP819",
        "CSISOLATIN1",
        "IBM819",
        "ISO-8859-1",
        "ISO-IR-100",
        "ISO8859-1",
        "ISO88591",
        "ISO_8859-1",
        "ISO_8859-1:1987",
        "L1",
        "LATIN1",
        "US-ASCII",
        "WINDOWS-1252",
        "X-CP1252",
    ],
    utf16be: [
        "UNICODEFFFE",
        "UTF-16BE",
    ],
    utf16le: [
        "CSUNICODE",
        "ISO-10646-UCS-2",
        "UCS-2",
        "UNICODE",
        "UNICODEFEFF",
        "UTF-16",
        "UTF-16LE",
    ],
}).flatMap(([encoding, labels]) => labels.map((label) => [encoding, label]))
| 107 | + |
/**
 * Resolve a charset label to the internal encoding name used by this module.
 *
 * Matching is ASCII case-insensitive: letters in `name` are upper-cased before being compared
 * with the (already upper-case) labels in `encodingMap`. Nothing else is normalized.
 *
 * @param {string} name Charset label to look up.
 * @returns {"utf8" | "utf16le" | "utf16be" | "win1252" | undefined} The matching encoding, or
 *   `undefined` when the label is not in `encodingMap`.
 */
function extractNamedEncoding(name) {
    outer:
    for (const [encoding, label] of encodingMap) {
        if (label.length !== name.length) continue
        for (let i = 0; i < name.length; i++) {
            // Fold the *input* character; the table side is stored upper-cased.
            let ch = name.charCodeAt(i)
            const upper = ch & ~0x20
            if (upper >= 0x41 && upper <= 0x5A) ch = upper
            if (ch !== label.charCodeAt(i)) continue outer
        }
        return encoding
    }
    return undefined
}
| 123 | + |
/**
 * Test whether a byte is ASCII whitespace: TAB (0x09), LF (0x0A), FF (0x0C), CR (0x0D) or
 * SPACE (0x20).
 *
 * @param {number} ch Byte value to test. It is coerced to a 32-bit integer first, so
 *   `undefined` is treated as 0.
 * @returns {boolean} `true` only for the five bytes listed above.
 */
function isAsciiWhitespace(ch) {
    // Bit `n - 1` is set for every whitespace byte `n`, so a single shift-and-test replaces
    // five separate comparisons.
    const mask = (
        1 << (0x09 - 1) |
        1 << (0x0A - 1) |
        1 << (0x0C - 1) |
        1 << (0x0D - 1) |
        1 << (0x20 - 1)
    )

    const code = ch | 0
    // The range check has to come first: shift counts wrap modulo 32, so without the lower
    // bound 0x00 would alias the SPACE bit, and the upper bound must include 0x20 itself.
    return code >= 0x09 && code <= 0x20 && (mask >>> (code - 1) & 1) !== 0
}
| 136 | + |
/**
 * Test whether the bytes of `buffer` starting at offset `i` match `sequence`.
 *
 * Matching rules:
 * - ASCII letters compare case-insensitively; every other character must equal the byte.
 * - A space (0x20) in `sequence` matches one or more ASCII whitespace bytes.
 * - The match must be complete before `end`; running out of input is a mismatch.
 *
 * @param {Buffer} buffer Bytes to inspect.
 * @param {number} i Offset at which the match must start.
 * @param {number} end Exclusive upper bound for the match. A negative or otherwise
 *   out-of-range value means "up to the end of the buffer".
 * @param {string} sequence Pattern, one character per byte (code units 0x00-0xFF).
 * @returns {boolean} Whether the whole pattern matched.
 */
function startsWith(buffer, i, end, sequence) {
    const limit = end >= 0 && end <= buffer.length ? end : buffer.length

    // Every pattern character consumes at least one byte, so this is a cheap early exit.
    if (limit - i < sequence.length) return false

    // Upper-case ASCII letters and leave everything else (including BOM bytes) untouched.
    const fold = (code) => {
        const upper = code & ~0x20
        return upper >= 0x41 && upper <= 0x5A ? upper : code
    }

    let pos = i
    for (let j = 0; j < sequence.length; j++) {
        if (pos >= limit) return false

        const expected = sequence.charCodeAt(j)
        if (expected === 0x20) {
            if (!isAsciiWhitespace(buffer[pos])) return false
            pos++
            // Swallow the rest of the whitespace run, stopping on the first other byte so
            // the next pattern character is compared against it.
            while (pos < limit && isAsciiWhitespace(buffer[pos])) pos++
        } else {
            if (fold(expected) !== fold(buffer[pos])) return false
            pos++
        }
    }

    return true
}
| 154 | + |
// Every attribute spelling recognized inside a `<meta>` tag, paired with the encoding it
// selects. For each label this yields 6 `charset=` forms followed by 18 `http-equiv` forms:
// unquoted, double-quoted and single-quoted values, closed by either `>` or `/>`.
// Note that the `content` value in these patterns is the bare label.
const metasToCheck = encodingMap.flatMap(([encoding, label]) => {
    const closers = [">", "/>"]
    const quotes = ["", "\"", "'"]
    const patterns = []

    for (const closer of closers) {
        for (const quote of quotes) {
            patterns.push([encoding, `charset=${quote}${label}${quote}${closer}`])
        }
    }

    for (const closer of closers) {
        for (const valueQuote of quotes) {
            for (const nameQuote of quotes) {
                const httpEquiv = `http-equiv=${nameQuote}content-type${nameQuote}`
                const content = `content=${valueQuote}${label}${valueQuote}`
                patterns.push([encoding, `${httpEquiv} ${content}${closer}`])
            }
        }
    }

    return patterns
})
| 181 | + |
/**
 * Look for an encoding declaration in a `<meta>` tag.
 *
 * Exceptionally lazy and not quite fully correct: only the exact attribute spellings listed in
 * `metasToCheck` are recognized, and they must directly follow the tag name.
 *
 * @param {Buffer} buffer Bytes being scanned.
 * @param {number} i Offset of the tag name, i.e. of the `m` in `meta`.
 * @param {number} end Exclusive upper bound for the match; negative or out-of-range values
 *   mean "up to the end of the buffer".
 * @returns {"utf8" | "utf16le" | "utf16be" | "win1252" | undefined} The declared encoding, or
 *   `undefined` when no known pattern matches.
 */
function extractMetaEncoding(buffer, i, end) {
    const limit = end >= 0 && end <= buffer.length ? end : buffer.length

    // The patterns describe attributes only, so step over the four-byte tag name and the
    // whitespace after it before trying them.
    let attribute = i + 4
    while (attribute < limit && isAsciiWhitespace(buffer[attribute])) attribute++

    for (const [encoding, meta] of metasToCheck) {
        if (startsWith(buffer, attribute, end, meta)) return encoding
    }
    return undefined
}
| 189 | + |
/**
 * Work out which encoding a response body uses.
 *
 * Sources are consulted in this order: a byte order mark, the `charset` parameter of
 * `headers["content-type"]`, a UTF-16 encoded `<?x` at the very start, and finally `<meta>`
 * tags found outside of comments. If none of them gives an answer, `"win1252"` is returned.
 *
 * @param {Object<string, string>} headers Response headers; only `content-type` is read.
 * @param {Buffer} prefix Leading bytes of the body to sniff.
 * @returns {"utf8" | "utf16le" | "utf16be" | "win1252"}
 */
function detectEncoding(headers, prefix) {
    // This follows the HTML spec to the extent Node supports the various encodings. I'm *not*,
    // however, going to bend over backwards to support obscure encodings.
    // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding

    const length = prefix.length

    // FE FF is the big-endian byte order mark, FF FE the little-endian one.
    if (startsWith(prefix, 0, length, "\xEF\xBB\xBF")) return "utf8"
    if (startsWith(prefix, 0, length, "\xFE\xFF")) return "utf16be"
    if (startsWith(prefix, 0, length, "\xFF\xFE")) return "utf16le"

    const contentType = headers["content-type"]
    if (contentType) {
        const result = (/;\s*charset="?([\w-]+)"?/i).exec(contentType)
        if (result) {
            const encoding = extractNamedEncoding(result[1])
            if (encoding) return encoding
        }
    }

    if (startsWith(prefix, 0, length, "\x3c\x00\x3F\x00\x78\x00")) return "utf16le"
    if (startsWith(prefix, 0, length, "\x00\x3c\x00\x3F\x00\x78")) return "utf16be"

    // Offset of the next comment opener at or after `from`, or `length` when there is none, so
    // the result can always be used as an upper bound for tag matching.
    const nextComment = (from) => {
        const index = prefix.indexOf("<!--", from, "latin1")
        return index < 0 ? length : index
    }

    let i = 0
    let end = nextComment(0)
    while (i < length) {
        // Skipping to a `>` can jump past the comment opener that was found earlier.
        if (end < i) end = nextComment(i)

        if (i === end) {
            // Per the spec, the two dashes of `<!--` may double as those of the closing `-->`.
            const close = prefix.indexOf("-->", i + 2, "latin1")
            if (close < 0) return "win1252"
            i = close + 3
            end = nextComment(i)
        } else if (prefix[i] === 0x3C) {
            i++
            if (i === length) return "win1252"

            if (startsWith(prefix, i, end, "meta ")) {
                const encoding = extractMetaEncoding(prefix, i, end)
                if (encoding) return encoding
            } else if (prefix[i] === 0x21 || prefix[i] === 0x2F || prefix[i] === 0x3F) {
                // `<!...>`, `</...>` and `<?...>` are skipped wholesale.
                const close = prefix.indexOf(0x3E, i)
                if (close < 0) return "win1252"
                i = close + 1
            }
        } else {
            // Anything that does not open a tag is of no interest; keep scanning.
            i++
        }
    }

    return "win1252"
}
| 237 | + |
/**
 * Decode a response body to a string, picking the encoding from the headers and the first
 * 1024 bytes of the body.
 *
 * @param {Object} headers Response headers, handed to `detectEncoding` as-is.
 * @param {Buffer} body Raw response body.
 * @returns {string} The decoded body.
 */
function decodeResponse(headers, body) {
    const sniffed = body.subarray(0, 1024)
    const encoding = detectEncoding(headers, sniffed)
    return decode(body, encoding)
}
| 241 | + |
| 242 | +module.exports = { |
| 243 | + decodeResponse, |
| 244 | +} |
0 commit comments