Skip to content

Commit a7d93ef

Browse files
authored
[Strings] Handle encoding in JSON parsing so StringLifting can handle arbitrary custom section content (#7414)
Rather than encode to WTF8 and re-encode, instead make the unescaping logic go from UTF8 straight to WTF16. That makes it simpler and more efficient. Make the JSON parser get a parameter for which encoding to use for strings, so we can use ascii in old places.
1 parent f77a69d commit a7d93ef

File tree

7 files changed

+75
-48
lines changed

7 files changed

+75
-48
lines changed

src/passes/StringLifting.cpp

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,14 @@ struct StringLifting : public Pass {
6666
continue;
6767
}
6868
if (global->module == stringConstsModule) {
69-
importedStrings[global->name] = global->base;
69+
// Encode from WTF-8 to WTF-16.
70+
auto wtf8 = global->base;
71+
std::stringstream wtf16;
72+
bool valid = String::convertWTF8ToWTF16(wtf16, wtf8.str);
73+
if (!valid) {
74+
Fatal() << "Bad string to lift: " << wtf8;
75+
}
76+
importedStrings[global->name] = wtf16.str();
7077
found = true;
7178
}
7279
}
@@ -77,7 +84,7 @@ struct StringLifting : public Pass {
7784
// We found the string consts section. Parse it.
7885
auto copy = section.data;
7986
json::Value array;
80-
array.parse(copy.data());
87+
array.parse(copy.data(), json::Value::WTF16);
8188
if (!array.isArray()) {
8289
Fatal()
8390
<< "StringLifting: string.const section should be a JSON array";
@@ -203,15 +210,8 @@ struct StringLifting : public Pass {
203210
// Replace global.gets of imported strings with string.const.
204211
auto iter = parent.importedStrings.find(curr->name);
205212
if (iter != parent.importedStrings.end()) {
206-
// Encode from WTF-8 to WTF-16.
207-
auto wtf8 = iter->second;
208-
std::stringstream wtf16;
209-
bool valid = String::convertWTF8ToWTF16(wtf16, wtf8.str);
210-
if (!valid) {
211-
Fatal() << "Bad string to lift: " << wtf8;
212-
}
213-
214-
replaceCurrent(Builder(*getModule()).makeStringConst(wtf16.str()));
213+
auto wtf16 = iter->second;
214+
replaceCurrent(Builder(*getModule()).makeStringConst(wtf16.str));
215215
modified = true;
216216
}
217217
}

src/support/json.h

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,16 @@ struct Value {
249249
return true;
250250
}
251251

252-
char* parse(char* curr) {
252+
// The encoding into which we parse strings. The input encoding is always
253+
// UTF8, but we can parse into ASCII (very quickly, and without many small
254+
// allocations), or we can parse into WTF16 (which is the format used by
255+
// StringConst).
256+
enum StringEncoding {
257+
ASCII,
258+
WTF16,
259+
};
260+
261+
char* parse(char* curr, StringEncoding stringEncoding) {
253262
#define is_json_space(x) \
254263
(x == 32 || x == 9 || x == 10 || \
255264
x == 13) /* space, tab, linefeed/newline, or return */
@@ -271,7 +280,13 @@ struct Value {
271280
assert(close);
272281
*close = 0; // end this string, and reuse it straight from the input
273282
char* raw = curr + 1;
274-
unescapeAndSetString(raw);
283+
if (stringEncoding == ASCII) {
284+
// Just use the current string.
285+
setString(raw);
286+
} else {
287+
assert(stringEncoding == WTF16);
288+
unescapeIntoWTF16(raw);
289+
}
275290
curr = close + 1;
276291
} else if (*curr == '[') {
277292
// Array
@@ -281,7 +296,7 @@ struct Value {
281296
while (*curr != ']') {
282297
Ref temp = Ref(new Value());
283298
arr->push_back(temp);
284-
curr = temp->parse(curr);
299+
curr = temp->parse(curr, stringEncoding);
285300
skip();
286301
if (*curr == ']') {
287302
break;
@@ -324,7 +339,7 @@ struct Value {
324339
curr++;
325340
skip();
326341
Ref value = Ref(new Value());
327-
curr = value->parse(curr);
342+
curr = value->parse(curr, stringEncoding);
328343
(*obj)[key] = value;
329344
skip();
330345
if (*curr == '}') {
@@ -412,20 +427,14 @@ struct Value {
412427
}
413428

414429
private:
415-
// If the string has no escaped characters, setString() the char* directly. If
416-
// it does require escaping, do that and intern a new string with those
417-
// contents.
418-
void unescapeAndSetString(char* str) {
419-
if (!strchr(str, '\\')) {
420-
// No escaping slash.
421-
setString(str);
422-
return;
423-
}
424-
425-
auto unescaped = wasm::String::unescapeJSONToWTF8(str);
426-
427-
setString(
428-
IString(std::string_view(unescaped.data(), unescaped.size()), false));
430+
// Unescape the input (UTF8) string into one of our internal strings (WTF16).
431+
void unescapeIntoWTF16(char* str) {
432+
// TODO: Optimize the unescaped path? But it is impossible to avoid an
433+
// allocation here.
434+
std::stringstream ss;
435+
wasm::String::unescapeUTF8JSONtoWTF16(ss, str);
436+
// TODO: Use ss.view() once we have C++20.
437+
setString(ss.str());
429438
}
430439
};
431440

src/support/string.cpp

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -432,13 +432,12 @@ bool isUTF8(std::string_view str) {
432432
return true;
433433
}
434434

435-
std::vector<char> unescapeJSONToWTF8(const char* str) {
436-
std::vector<char> unescaped;
435+
std::ostream& unescapeUTF8JSONtoWTF16(std::ostream& os, const char* str) {
437436
size_t i = 0;
438437
while (str[i]) {
439438
if (str[i] != '\\') {
440439
// Normal character.
441-
unescaped.push_back(str[i]);
440+
writeWTF16CodePoint(os, str[i]);
442441
i++;
443442
continue;
444443
}
@@ -465,7 +464,7 @@ std::vector<char> unescapeJSONToWTF8(const char* str) {
465464
case 0:
466465
Fatal() << "Invalid escaped JSON ends in slash";
467466
}
468-
unescaped.push_back(c);
467+
writeWTF16CodePoint(os, c);
469468
i += 2;
470469
continue;
471470
}
@@ -480,17 +479,12 @@ std::vector<char> unescapeJSONToWTF8(const char* str) {
480479
unhex >> x;
481480

482481
// Write out the results.
483-
unescaped.push_back(x & 0xff);
484-
x >>= 8;
485-
if (x) {
486-
unescaped.push_back(x);
487-
}
488-
// TODO UTF stuff
482+
writeWTF16CodePoint(os, x);
489483

490484
i += 6;
491485
}
492486

493-
return unescaped;
487+
return os;
494488
}
495489

496490
} // namespace wasm::String

src/support/string.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,8 @@ bool convertUTF16ToUTF8(std::ostream& os, std::string_view str);
102102
// Whether the string is valid UTF-8.
103103
bool isUTF8(std::string_view str);
104104

105-
// Given a string of properly-escaped JSON, unescape it.
106-
std::vector<char> unescapeJSONToWTF8(const char* str);
105+
// Given a string of properly-escaped JSON in UTF8, unescape it into WTF16.
106+
std::ostream& unescapeUTF8JSONtoWTF16(std::ostream& os, const char* str);
107107

108108
} // namespace wasm::String
109109

src/tools/wasm-metadce.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -518,7 +518,7 @@ int main(int argc, const char* argv[]) {
518518
auto graphInput(read_file<std::string>(graphFile, Flags::Text));
519519
auto* copy = strdup(graphInput.c_str());
520520
json::Value outside;
521-
outside.parse(copy);
521+
outside.parse(copy, json::Value::ASCII);
522522

523523
// parse the JSON into our graph, doing all the JSON parsing here, leaving
524524
// the abstract computation for the class itself

test/gtest/json.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ TEST_F(JSONTest, Stringify) {
88
auto input = "[\"hello\",\"world\"]";
99
auto* copy = strdup(input);
1010
json::Value value;
11-
value.parse(copy);
11+
value.parse(copy, json::Value::ASCII);
1212
std::stringstream ss;
1313
value.stringify(ss);
1414
EXPECT_EQ(ss.str(), input);

test/lit/passes/string-lifting-section.wast

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,13 @@
2929

3030
;; CHECK: (import "string.const" "1" (global $"string.const_\"foo\"" (ref extern)))
3131

32-
;; CHECK: (import "string.const" "2" (global $"string.const_\"needs\\tescaping\\00.\\\'#%\\\"\"" (ref extern)))
32+
;; CHECK: (import "string.const" "2" (global $"string.const_\"needs\\tescaping\\00.\\\'#%\\\"- .\\r\\n\\\\08\\0c\\n\\r\\t.\\ea\\99\\ae\"" (ref extern)))
33+
34+
;; CHECK: (import "string.const" "3" (global $"string.const_\"surrogate pair \\f0\\90\\8d\\88 \"" (ref extern)))
35+
36+
;; CHECK: (import "string.const" "4" (global $"string.const_\"unpaired high surrogate \\ed\\a0\\80 \"" (ref extern)))
37+
38+
;; CHECK: (import "string.const" "5" (global $"string.const_\"unpaired low surrogate \\ed\\bd\\88 \"" (ref extern)))
3339

3440
;; CHECK: (import "wasm:js-string" "fromCharCodeArray" (func $fromCharCodeArray (type $3) (param (ref null $0) i32 i32) (result (ref extern))))
3541

@@ -77,14 +83,32 @@
7783

7884
;; CHECK: (func $tricky-consts (type $1)
7985
;; CHECK-NEXT: (drop
80-
;; CHECK-NEXT: (string.const "needs\tescaping\00.\'#%\"")
86+
;; CHECK-NEXT: (string.const "needs\tescaping\00.\'#%\"- .\r\n\\08\0c\n\r\t.\ea\99\ae")
87+
;; CHECK-NEXT: )
88+
;; CHECK-NEXT: (drop
89+
;; CHECK-NEXT: (string.const "surrogate pair \f0\90\8d\88 ")
90+
;; CHECK-NEXT: )
91+
;; CHECK-NEXT: (drop
92+
;; CHECK-NEXT: (string.const "unpaired high surrogate \ed\a0\80 ")
93+
;; CHECK-NEXT: )
94+
;; CHECK-NEXT: (drop
95+
;; CHECK-NEXT: (string.const "unpaired low surrogate \ed\bd\88 ")
8196
;; CHECK-NEXT: )
8297
;; CHECK-NEXT: )
8398
(func $tricky-consts
84-
;; This tricky string should remain exactly the same after lowering and
99+
;; These tricky strings should remain exactly the same after lowering and
85100
;; lifting.
86101
(drop
87-
(string.const "needs\tescaping\00.'#%\"")
102+
(string.const "needs\tescaping\00.'#%\"- .\r\n\\08\0C\0A\0D\09.ꙮ")
103+
)
104+
(drop
105+
(string.const "surrogate pair \F0\90\8D\88 ")
106+
)
107+
(drop
108+
(string.const "unpaired high surrogate \ED\A0\80 ")
109+
)
110+
(drop
111+
(string.const "unpaired low surrogate \ED\BD\88 ")
88112
)
89113
)
90114
)

0 commit comments

Comments
 (0)