From 16a9c32b499372d20dc06efcdd97e999e9e68959 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Tue, 3 Dec 2024 18:29:31 -0800 Subject: [PATCH] Parallelize the binary parsing of function bodies After a linear scan through the code section and input source map to find the start locations corresponding to each function body, parse the locals and instructions for each function in parallel. This speeds up binary parsing with a sourcemap by about 20% with 8 cores on my machine, but only by about 2% with all 128 cores, so this parallelization has potential but suffers from scaling overhead[^1]. When running a full -O3 optimization pipeline, the parallel parsing slightly reduces the number of minor page faults, presumably by better allocating the original instructions in separate thread-local arenas. It also slightly reduces the max RSS, but these improvements do not translate into better overall performance. In fact, overall performance is slightly lower with this change (at least on my machine.) [^1]: FWIW the full -O3 pipeline also performs significantly better with 8 cores than with 128 cores on my machine. --- src/pass.h | 2 +- src/wasm-binary.h | 15 ++- src/wasm/wasm-binary.cpp | 136 ++++++++++++++++---------- test/lit/binary/debug-bad-binary.test | 5 + 4 files changed, 98 insertions(+), 60 deletions(-) diff --git a/src/pass.h b/src/pass.h index daed8b6abc6..9e518c42dbe 100644 --- a/src/pass.h +++ b/src/pass.h @@ -452,7 +452,7 @@ class Pass { // This method is used to create instances per function for a // function-parallel pass. You may need to override this if you subclass a // Walker, as otherwise this will create the parent class. - virtual std::unique_ptr create() { WASM_UNREACHABLE("unimplenented"); } + virtual std::unique_ptr create() { WASM_UNREACHABLE("unimplemented"); } // Whether this pass modifies the Binaryen IR in the module. This is true for // most passes, except for passes that have no side effects, or passes that diff --git a/src/wasm-binary.h b/src/wasm-binary.h index 4bd6ff6184a..7f94ad6b47c 100644 --- a/src/wasm-binary.h +++ b/src/wasm-binary.h @@ -1524,16 +1524,15 @@ class WasmBinaryReader { std::unordered_map dataNames; std::unordered_map elemNames; - Function* currFunction = nullptr; - // before we see a function (like global init expressions), there is no end of - // function to check - Index endOfFunction = -1; - void readFunctions(size_t& pos); - void readVars(size_t& pos); - void setLocalNames(Function& func, Index i); + void readVars(size_t& pos, Function* func); + void setLocalNames(Function* func, Index i); - Result<> readInst(size_t& pos); + Result<> + readInst(size_t& pos, IRBuilder& builder, SourceMapReader& sourceMapReader); + Result<> readInst(size_t& pos) { + return readInst(pos, builder, sourceMapReader); + } void readExports(size_t& pos); diff --git a/src/wasm/wasm-binary.cpp b/src/wasm/wasm-binary.cpp index 89fe4fc60cc..d374a6e88b0 100644 --- a/src/wasm/wasm-binary.cpp +++ b/src/wasm/wasm-binary.cpp @@ -2583,7 +2583,7 @@ void WasmBinaryReader::readImports(size_t& pos) { curr->hasExplicitName = isExplicit; curr->module = module; curr->base = base; - setLocalNames(*curr, wasm.functions.size()); + setLocalNames(curr.get(), wasm.functions.size()); wasm.addFunction(std::move(curr)); break; } @@ -2676,16 +2676,16 @@ void WasmBinaryReader::readImports(size_t& pos) { numFuncImports = wasm.functions.size(); } -void WasmBinaryReader::setLocalNames(Function& func, Index i) { +void WasmBinaryReader::setLocalNames(Function* func, Index i) { if (auto it = localNames.find(i); it != localNames.end()) { for (auto& [local, name] : it->second) { - if (local >= func.getNumLocals()) { + if (local >= func->getNumLocals()) { std::cerr << "warning: local index out of bounds in name section: " << name << " at index " << local << " in function " << i << '\n'; continue; } - func.setLocalName(local, name); + func->setLocalName(local, name); } } } @@ -2762,74 +2762,106 @@ void WasmBinaryReader::readFunctions(size_t& pos) { if (numFuncBodies + numFuncImports != wasm.functions.size()) { throwError(pos, "invalid function section size, must equal types"); } - if (DWARF) { - builder.setBinaryLocation(&pos, codeSectionLocation); - } - for (size_t i = 0; i < numFuncBodies; i++) { - auto sizePos = pos; + + using FuncPositions = + std::unordered_map>; + FuncPositions funcPositions; + + for (Index i = 0; i < numFuncBodies; i++) { + auto funcIndex = numFuncImports + i; + auto start = pos; size_t size = getU32LEB(pos); if (size == 0) { throwError(pos, "empty function size"); } - Index endOfFunction = pos + size; + auto end = pos + size; - auto& func = wasm.functions[numFuncImports + i]; - currFunction = func.get(); + auto* func = wasm.functions[funcIndex].get(); if (DWARF) { func->funcLocation = BinaryLocations::FunctionLocations{ - BinaryLocation(sizePos - codeSectionLocation), + BinaryLocation(start - codeSectionLocation), BinaryLocation(pos - codeSectionLocation), BinaryLocation(pos - codeSectionLocation + size)}; } func->prologLocation = sourceMapReader.readDebugLocationAt(pos); - readVars(pos); - setLocalNames(*func, numFuncImports + i); - { - // Process the function body. Even if we are skipping function bodies we - // need to not skip the start function. That contains important code for - // wasm-emscripten-finalize in the form of pthread-related segment - // initializations. As this is just one function, it doesn't add - // significant time, so the optimization of skipping bodies is still very - // useful. - auto currFunctionIndex = wasm.functions.size(); - bool isStart = startIndex == currFunctionIndex; - if (skipFunctionBodies && !isStart) { - // When skipping the function body we need to put something valid in - // their place so we validate. An unreachable is always acceptable - // there. - func->body = Builder(wasm).makeUnreachable(); - // Skip reading the contents. - pos = endOfFunction; - } else { - auto start = builder.visitFunctionStart(func.get()); + funcPositions.insert({func, {funcIndex, pos, end, sourceMapReader}}); + pos = end; + sourceMapReader.finishFunction(); + } + + // Parse the function bodies in parallel. This obviously modifies the IR, but + // we say it's Immutable because we don't need to perform any cleanup actions + // after parsing the IR. + ModuleUtils::ParallelFunctionAnalysis, + Immutable, + InsertOrderedMap> + bodyParser(wasm, [&](Function* func, std::optional& error) { + if (func->imported()) { + return; + } + auto [funcIndex, pos, end, sourceMapReader] = + funcPositions.find(func)->second; + + IRBuilder builder(wasm); + if (DWARF) { + builder.setBinaryLocation(&pos, codeSectionLocation); + } + + try { + readVars(pos, func); + setLocalNames(func, funcIndex); + + // Process the function body. Even if we are skipping function bodies we + // need to not skip the start function. That contains important code for + // wasm-emscripten-finalize in the form of pthread-related segment + // initializations. As this is just one function, it doesn't add + // significant time, so the optimization of skipping bodies is still + // very useful. + if (skipFunctionBodies && startIndex != funcIndex) { + func->body = Builder(wasm).makeUnreachable(); + return; + } + + auto start = builder.visitFunctionStart(func); if (auto* err = start.getErr()) { - throwError(pos, err->msg); + error = ParseException(err->msg, 0, pos); + return; } - while (pos < endOfFunction) { - auto inst = readInst(pos); + while (pos < end) { + auto inst = readInst(pos, builder, sourceMapReader); if (auto* err = inst.getErr()) { - throwError(pos, err->msg); + error = ParseException(err->msg, 0, pos); + return; } } - if (pos != endOfFunction) { - throwError(pos, "function overflowed its bounds"); - } - if (!builder.empty()) { - throwError(pos, "expected function end"); - } + } catch (ParseException e) { + error = e; + return; } - } - sourceMapReader.finishFunction(); - TypeUpdating::handleNonDefaultableLocals(func.get(), wasm); - currFunction = nullptr; + if (pos != end) { + error = ParseException("function overflowed its bounds", 0, pos); + return; + } + if (!builder.empty()) { + error = ParseException("expected function end", 0, pos); + return; + } + TypeUpdating::handleNonDefaultableLocals(func, wasm); + }); + + for (auto& [_, err] : bodyParser.map) { + if (err) { + throw *err; + } } } -void WasmBinaryReader::readVars(size_t& pos) { +void WasmBinaryReader::readVars(size_t& pos, Function* func) { uint32_t totalVars = 0; size_t numLocalTypes = getU32LEB(pos); // Use a SmallVector as in the common (MVP) case there are only 4 possible @@ -2844,16 +2876,18 @@ void WasmBinaryReader::readVars(size_t& pos) { auto type = getConcreteType(pos); decodedVars.emplace_back(num, type); } - currFunction->vars.reserve(totalVars); + func->vars.reserve(totalVars); for (auto [num, type] : decodedVars) { while (num > 0) { - currFunction->vars.push_back(type); + func->vars.push_back(type); num--; } } } -Result<> WasmBinaryReader::readInst(size_t& pos) { +Result<> WasmBinaryReader::readInst(size_t& pos, + IRBuilder& builder, + SourceMapReader& sourceMapReader) { if (auto loc = sourceMapReader.readDebugLocationAt(pos)) { builder.setDebugLocation(loc); } diff --git a/test/lit/binary/debug-bad-binary.test b/test/lit/binary/debug-bad-binary.test index 63a2ed2ccac..a49f21b13ce 100644 --- a/test/lit/binary/debug-bad-binary.test +++ b/test/lit/binary/debug-bad-binary.test @@ -38,4 +38,9 @@ RUN: not wasm-opt --debug %s.wasm 2>&1 | filecheck %s ;; CHECK-NEXT: (i32.const 10) ;; CHECK-NEXT: ) ;; CHECK-NEXT: ) +;; CHECK-NEXT: (func $2 +;; CHECK-NEXT: (drop +;; CHECK-NEXT: (i32.const 30) +;; CHECK-NEXT: ) +;; CHECK-NEXT: ) ;; CHECK-NEXT: )