[Strings] Handle encoding in JSON parsing so StringLifting can handle arbitrary custom section content (#7414)

kripken · web-flow · commit a7d93efd82d3 · 2025-04-01T16:07:14.000-07:00
Rather than encoding to WTF8 and then re-encoding, make the unescaping logic go
straight from UTF8 to WTF16. That is both simpler and more efficient.

Make the JSON parser take a parameter specifying which encoding to parse strings
into, so that existing callers (e.g. wasm-metadce) can keep using ASCII.
diff --git a/src/passes/StringLifting.cpp b/src/passes/StringLifting.cpp
@@ -66,7 +66,14 @@ struct StringLifting : public Pass {
         continue;
       }
       if (global->module == stringConstsModule) {
-        importedStrings[global->name] = global->base;
+        // Encode from WTF-8 to WTF-16.
+        auto wtf8 = global->base;
+        std::stringstream wtf16;
+        bool valid = String::convertWTF8ToWTF16(wtf16, wtf8.str);
+        if (!valid) {
+          Fatal() << "Bad string to lift: " << wtf8;
+        }
+        importedStrings[global->name] = wtf16.str();
         found = true;
       }
     }
@@ -77,7 +84,7 @@ struct StringLifting : public Pass {
         // We found the string consts section. Parse it.
         auto copy = section.data;
         json::Value array;
-        array.parse(copy.data());
+        array.parse(copy.data(), json::Value::WTF16);
         if (!array.isArray()) {
           Fatal()
             << "StringLifting: string.const section should be a JSON array";
@@ -203,15 +210,8 @@ struct StringLifting : public Pass {
         // Replace global.gets of imported strings with string.const.
         auto iter = parent.importedStrings.find(curr->name);
         if (iter != parent.importedStrings.end()) {
-          // Encode from WTF-8 to WTF-16.
-          auto wtf8 = iter->second;
-          std::stringstream wtf16;
-          bool valid = String::convertWTF8ToWTF16(wtf16, wtf8.str);
-          if (!valid) {
-            Fatal() << "Bad string to lift: " << wtf8;
-          }
-
-          replaceCurrent(Builder(*getModule()).makeStringConst(wtf16.str()));
+          auto wtf16 = iter->second;
+          replaceCurrent(Builder(*getModule()).makeStringConst(wtf16.str));
           modified = true;
         }
       }
diff --git a/src/support/json.h b/src/support/json.h
@@ -249,7 +249,16 @@ struct Value {
     return true;
   }
 
-  char* parse(char* curr) {
+  // The encoding into which we parse strings. The input encoding is always
+  // UTF8, but we can parse into ASCII (very quickly, and without many small
+  // allocations), or we can parse into WTF16 (which is the format used by
+  // StringConst).
+  enum StringEncoding {
+    ASCII,
+    WTF16,
+  };
+
+  char* parse(char* curr, StringEncoding stringEncoding) {
 #define is_json_space(x)                                                       \
   (x == 32 || x == 9 || x == 10 ||                                             \
    x == 13) /* space, tab, linefeed/newline, or return */
@@ -271,7 +280,13 @@ struct Value {
       assert(close);
       *close = 0; // end this string, and reuse it straight from the input
       char* raw = curr + 1;
-      unescapeAndSetString(raw);
+      if (stringEncoding == ASCII) {
+        // Just use the current string.
+        setString(raw);
+      } else {
+        assert(stringEncoding == WTF16);
+        unescapeIntoWTF16(raw);
+      }
       curr = close + 1;
     } else if (*curr == '[') {
       // Array
@@ -281,7 +296,7 @@ struct Value {
       while (*curr != ']') {
         Ref temp = Ref(new Value());
         arr->push_back(temp);
-        curr = temp->parse(curr);
+        curr = temp->parse(curr, stringEncoding);
         skip();
         if (*curr == ']') {
           break;
@@ -324,7 +339,7 @@ struct Value {
         curr++;
         skip();
         Ref value = Ref(new Value());
-        curr = value->parse(curr);
+        curr = value->parse(curr, stringEncoding);
         (*obj)[key] = value;
         skip();
         if (*curr == '}') {
@@ -412,20 +427,14 @@ struct Value {
   }
 
 private:
-  // If the string has no escaped characters, setString() the char* directly. If
-  // it does require escaping, do that and intern a new string with those
-  // contents.
-  void unescapeAndSetString(char* str) {
-    if (!strchr(str, '\\')) {
-      // No escaping slash.
-      setString(str);
-      return;
-    }
-
-    auto unescaped = wasm::String::unescapeJSONToWTF8(str);
-
-    setString(
-      IString(std::string_view(unescaped.data(), unescaped.size()), false));
+  // Unescape the input (UTF8) string into one of our internal strings (WTF16).
+  void unescapeIntoWTF16(char* str) {
+    // TODO: Optimize the unescaped path? But it is impossible to avoid an
+    //       allocation here.
+    std::stringstream ss;
+    wasm::String::unescapeUTF8JSONtoWTF16(ss, str);
+    // TODO: Use ss.view() once we have C++20.
+    setString(ss.str());
   }
 };
 
diff --git a/src/support/string.cpp b/src/support/string.cpp
@@ -432,13 +432,12 @@ bool isUTF8(std::string_view str) {
   return true;
 }
 
-std::vector<char> unescapeJSONToWTF8(const char* str) {
-  std::vector<char> unescaped;
+std::ostream& unescapeUTF8JSONtoWTF16(std::ostream& os, const char* str) {
   size_t i = 0;
   while (str[i]) {
     if (str[i] != '\\') {
       // Normal character.
-      unescaped.push_back(str[i]);
+      writeWTF16CodePoint(os, str[i]);
       i++;
       continue;
     }
@@ -465,7 +464,7 @@ std::vector<char> unescapeJSONToWTF8(const char* str) {
         case 0:
           Fatal() << "Invalid escaped JSON ends in slash";
       }
-      unescaped.push_back(c);
+      writeWTF16CodePoint(os, c);
       i += 2;
       continue;
     }
@@ -480,17 +479,12 @@ std::vector<char> unescapeJSONToWTF8(const char* str) {
     unhex >> x;
 
     // Write out the results.
-    unescaped.push_back(x & 0xff);
-    x >>= 8;
-    if (x) {
-      unescaped.push_back(x);
-    }
-    // TODO UTF stuff
+    writeWTF16CodePoint(os, x);
 
     i += 6;
   }
 
-  return unescaped;
+  return os;
 }
 
 } // namespace wasm::String
diff --git a/src/support/string.h b/src/support/string.h
@@ -102,8 +102,8 @@ bool convertUTF16ToUTF8(std::ostream& os, std::string_view str);
 // Whether the string is valid UTF-8.
 bool isUTF8(std::string_view str);
 
-// Given a string of properly-escaped JSON, unescape it.
-std::vector<char> unescapeJSONToWTF8(const char* str);
+// Given a string of properly-escaped JSON in UTF8, unescape it into WTF16.
+std::ostream& unescapeUTF8JSONtoWTF16(std::ostream& os, const char* str);
 
 } // namespace wasm::String
 
diff --git a/src/tools/wasm-metadce.cpp b/src/tools/wasm-metadce.cpp
@@ -518,7 +518,7 @@ int main(int argc, const char* argv[]) {
   auto graphInput(read_file<std::string>(graphFile, Flags::Text));
   auto* copy = strdup(graphInput.c_str());
   json::Value outside;
-  outside.parse(copy);
+  outside.parse(copy, json::Value::ASCII);
 
   // parse the JSON into our graph, doing all the JSON parsing here, leaving
   // the abstract computation for the class itself
diff --git a/test/gtest/json.cpp b/test/gtest/json.cpp
@@ -8,7 +8,7 @@ TEST_F(JSONTest, Stringify) {
   auto input = "[\"hello\",\"world\"]";
   auto* copy = strdup(input);
   json::Value value;
-  value.parse(copy);
+  value.parse(copy, json::Value::ASCII);
   std::stringstream ss;
   value.stringify(ss);
   EXPECT_EQ(ss.str(), input);
diff --git a/test/lit/passes/string-lifting-section.wast b/test/lit/passes/string-lifting-section.wast
@@ -29,7 +29,13 @@
 
   ;; CHECK:      (import "string.const" "1" (global $"string.const_\"foo\"" (ref extern)))
 
-  ;; CHECK:      (import "string.const" "2" (global $"string.const_\"needs\\tescaping\\00.\\\'#%\\\"\"" (ref extern)))
+  ;; CHECK:      (import "string.const" "2" (global $"string.const_\"needs\\tescaping\\00.\\\'#%\\\"- .\\r\\n\\\\08\\0c\\n\\r\\t.\\ea\\99\\ae\"" (ref extern)))
+
+  ;; CHECK:      (import "string.const" "3" (global $"string.const_\"surrogate pair \\f0\\90\\8d\\88 \"" (ref extern)))
+
+  ;; CHECK:      (import "string.const" "4" (global $"string.const_\"unpaired high surrogate \\ed\\a0\\80 \"" (ref extern)))
+
+  ;; CHECK:      (import "string.const" "5" (global $"string.const_\"unpaired low surrogate \\ed\\bd\\88 \"" (ref extern)))
 
   ;; CHECK:      (import "wasm:js-string" "fromCharCodeArray" (func $fromCharCodeArray (type $3) (param (ref null $0) i32 i32) (result (ref extern))))
 
@@ -77,14 +83,32 @@
 
   ;; CHECK:      (func $tricky-consts (type $1)
   ;; CHECK-NEXT:  (drop
-  ;; CHECK-NEXT:   (string.const "needs\tescaping\00.\'#%\"")
+  ;; CHECK-NEXT:   (string.const "needs\tescaping\00.\'#%\"- .\r\n\\08\0c\n\r\t.\ea\99\ae")
+  ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT:  (drop
+  ;; CHECK-NEXT:   (string.const "surrogate pair \f0\90\8d\88 ")
+  ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT:  (drop
+  ;; CHECK-NEXT:   (string.const "unpaired high surrogate \ed\a0\80 ")
+  ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT:  (drop
+  ;; CHECK-NEXT:   (string.const "unpaired low surrogate \ed\bd\88 ")
   ;; CHECK-NEXT:  )
   ;; CHECK-NEXT: )
   (func $tricky-consts
-    ;; This tricky string should remain exactly the same after lowering and
+    ;; These tricky strings should remain exactly the same after lowering and
     ;; lifting.
     (drop
-      (string.const "needs\tescaping\00.'#%\"")
+      (string.const "needs\tescaping\00.'#%\"- .\r\n\\08\0C\0A\0D\09.ꙮ")
+    )
+    (drop
+      (string.const "surrogate pair \F0\90\8D\88 ")
+    )
+    (drop
+      (string.const "unpaired high surrogate \ED\A0\80 ")
+    )
+    (drop
+      (string.const "unpaired low surrogate \ED\BD\88 ")
     )
   )
 )