From 287b66a15e37a35e859f93018b7d0a87dd45d1c7 Mon Sep 17 00:00:00 2001 From: clonker <1685266+clonker@users.noreply.github.com> Date: Wed, 9 Apr 2025 12:18:53 +0200 Subject: [PATCH 1/2] Add instruction location info to eof assembly --- libevmasm/Assembly.cpp | 64 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/libevmasm/Assembly.cpp b/libevmasm/Assembly.cpp index 9dd203a0f802..670165a0bc5d 100644 --- a/libevmasm/Assembly.cpp +++ b/libevmasm/Assembly.cpp @@ -53,6 +53,54 @@ using namespace solidity::evmasm; using namespace solidity::langutil; using namespace solidity::util; +namespace +{ + +/// Produces instruction location info in RAII style. When an assembly instruction is added to the bytecode, +/// this class can be instantiated in that scope. It will record the current bytecode size (before addition) +/// and, at destruction time, record the new bytecode size. This information is then added to an external +/// instruction locations vector. +/// If the instruction decomposes into multiple individual evm instructions, `emit` can be +/// called for all but the last one (which will be emitted by the destructor). +class InstructionLocationEmitter +{ +public: + InstructionLocationEmitter( + std::vector& _instructionLocations, + bytes const& _bytecode, + size_t const _assemblyItemIndex + ): + m_instructionLocations(_instructionLocations), + m_bytecode(_bytecode), + m_assemblyItemIndex(_assemblyItemIndex), + m_instructionLocationStart(_bytecode.size()) + {} + + ~InstructionLocationEmitter() + { + emit(); + } + + void emit() + { + auto const end = m_bytecode.size(); + m_instructionLocations.push_back(LinkerObject::InstructionLocation{ + .start = m_instructionLocationStart, + .end = end, + .assemblyItemIndex = m_assemblyItemIndex + }); + m_instructionLocationStart = end; + } + +private: + std::vector& m_instructionLocations; + bytes const& m_bytecode; + size_t const m_assemblyItemIndex{}; + size_t m_instructionLocationStart{}; +}; + +} + std::map> Assembly::s_sharedSourceNames; AssemblyItem const& Assembly::append(AssemblyItem _i) @@ -1606,9 +1654,17 @@ LinkerObject const& Assembly::assembleEOF() const for (auto&& [codeSectionIndex, codeSection]: m_codeSections | ranges::views::enumerate) { auto const sectionStart = ret.bytecode.size(); + + std::vector instructionLocations; + instructionLocations.reserve(codeSection.items.size()); + solAssert(!codeSection.items.empty(), "Empty code section."); - for (AssemblyItem const& item: codeSection.items) + + for (auto const& [assemblyItemIndex, item]: codeSection.items | ranges::views::enumerate) { + // collect instruction locations via side effects + InstructionLocationEmitter instructionLocationEmitter {instructionLocations, ret.bytecode, assemblyItemIndex}; + // store position of the invalid jump destination if (item.type() != Tag && m_tagPositionsInBytecode[0] == std::numeric_limits::max()) m_tagPositionsInBytecode[0] = ret.bytecode.size(); @@ -1724,6 +1780,12 @@ LinkerObject const& Assembly::assembleEOF() const "Code section too large for EOF." ); setBigEndianUint16(ret.bytecode, codeSectionSizePositions[codeSectionIndex], ret.bytecode.size() - sectionStart); + + ret.codeSectionLocations.push_back(LinkerObject::CodeSectionLocation{ + .start = sectionStart, + .end = ret.bytecode.size(), + .instructionLocations = std::move(instructionLocations) + }); } for (auto const& [refPos, tagId]: tagRef) From a5e20d2dc05b42f27bb0fe1092502b1a799a014c Mon Sep 17 00:00:00 2001 From: clonker <1685266+clonker@users.noreply.github.com> Date: Thu, 10 Apr 2025 10:54:11 +0200 Subject: [PATCH 2/2] Refactor legacy assemble to also use RAII instruction location construction --- libevmasm/Assembly.cpp | 158 +++++++++++++++-------------------------- 1 file changed, 57 insertions(+), 101 deletions(-) diff --git a/libevmasm/Assembly.cpp b/libevmasm/Assembly.cpp index 670165a0bc5d..019683c9d4fd 100644 --- a/libevmasm/Assembly.cpp +++ b/libevmasm/Assembly.cpp @@ -1329,22 +1329,12 @@ LinkerObject const& Assembly::assembleLegacy() const uint8_t dataRefPush = static_cast(pushInstruction(bytesPerDataRef)); LinkerObject::CodeSectionLocation codeSectionLocation; + codeSectionLocation.instructionLocations.reserve(items.size()); codeSectionLocation.start = 0; - size_t assemblyItemIndex = 0; - auto assembleInstruction = [&](auto&& _addInstruction) { - size_t start = ret.bytecode.size(); - _addInstruction(); - size_t end = ret.bytecode.size(); - codeSectionLocation.instructionLocations.emplace_back( - LinkerObject::InstructionLocation{ - .start = start, - .end = end, - .assemblyItemIndex = assemblyItemIndex - } - ); - }; - for (AssemblyItem const& item: items) + for (auto const& [assemblyItemIndex, item]: items | ranges::views::enumerate) { + // collect instruction locations via side effects + InstructionLocationEmitter instructionLocationEmitter(codeSectionLocation.instructionLocations, ret.bytecode, assemblyItemIndex); // store position of the invalid jump destination if (item.type() != Tag && m_tagPositionsInBytecode[0] == std::numeric_limits::max()) m_tagPositionsInBytecode[0] = ret.bytecode.size(); @@ -1352,81 +1342,59 @@ LinkerObject const& Assembly::assembleLegacy() const switch (item.type()) { case Operation: - assembleInstruction([&](){ - ret.bytecode += assembleOperation(item); - }); + ret.bytecode += assembleOperation(item); break; case Push: - assembleInstruction([&](){ - ret.bytecode += assemblePush(item); - }); + ret.bytecode += assemblePush(item); break; case PushTag: - { - assembleInstruction([&](){ - ret.bytecode.push_back(tagPush); - tagRefs[ret.bytecode.size()] = item.splitForeignPushTag(); - ret.bytecode.resize(ret.bytecode.size() + bytesPerTag); - }); + ret.bytecode.push_back(tagPush); + tagRefs[ret.bytecode.size()] = item.splitForeignPushTag(); + ret.bytecode.resize(ret.bytecode.size() + bytesPerTag); break; - } case PushData: - assembleInstruction([&]() { - ret.bytecode.push_back(dataRefPush); - dataRefs.insert(std::make_pair(h256(item.data()), ret.bytecode.size())); - ret.bytecode.resize(ret.bytecode.size() + bytesPerDataRef); - }); + ret.bytecode.push_back(dataRefPush); + dataRefs.insert(std::make_pair(h256(item.data()), ret.bytecode.size())); + ret.bytecode.resize(ret.bytecode.size() + bytesPerDataRef); break; case PushSub: - assembleInstruction([&]() { - assertThrow(item.data() <= std::numeric_limits::max(), AssemblyException, ""); - ret.bytecode.push_back(dataRefPush); - subRefs.insert(std::make_pair(static_cast(item.data()), ret.bytecode.size())); - ret.bytecode.resize(ret.bytecode.size() + bytesPerDataRef); - }); + assertThrow(item.data() <= std::numeric_limits::max(), AssemblyException, ""); + ret.bytecode.push_back(dataRefPush); + subRefs.insert(std::make_pair(static_cast(item.data()), ret.bytecode.size())); + ret.bytecode.resize(ret.bytecode.size() + bytesPerDataRef); break; case PushSubSize: { - assembleInstruction([&](){ - assertThrow(item.data() <= std::numeric_limits::max(), AssemblyException, ""); - auto s = subAssemblyById(static_cast(item.data()))->assemble().bytecode.size(); - item.setPushedValue(u256(s)); - unsigned b = std::max(1, numberEncodingSize(s)); - ret.bytecode.push_back(static_cast(pushInstruction(b))); - ret.bytecode.resize(ret.bytecode.size() + b); - bytesRef byr(&ret.bytecode.back() + 1 - b, b); - toBigEndian(s, byr); - }); + assertThrow(item.data() <= std::numeric_limits::max(), AssemblyException, ""); + auto s = subAssemblyById(static_cast(item.data()))->assemble().bytecode.size(); + item.setPushedValue(u256(s)); + unsigned b = std::max(1, numberEncodingSize(s)); + ret.bytecode.push_back(static_cast(pushInstruction(b))); + ret.bytecode.resize(ret.bytecode.size() + b); + bytesRef byr(&ret.bytecode.back() + 1 - b, b); + toBigEndian(s, byr); break; } case PushProgramSize: - { - assembleInstruction([&](){ - ret.bytecode.push_back(dataRefPush); - sizeRefs.push_back(static_cast(ret.bytecode.size())); - ret.bytecode.resize(ret.bytecode.size() + bytesPerDataRef); - }); + ret.bytecode.push_back(dataRefPush); + sizeRefs.push_back(static_cast(ret.bytecode.size())); + ret.bytecode.resize(ret.bytecode.size() + bytesPerDataRef); break; - } case PushLibraryAddress: { - assembleInstruction([&]() { - auto const [bytecode, linkRef] = assemblePushLibraryAddress(item, ret.bytecode.size()); - ret.bytecode += bytecode; - ret.linkReferences.insert(linkRef); - }); + auto const [bytecode, linkRef] = assemblePushLibraryAddress(item, ret.bytecode.size()); + ret.bytecode += bytecode; + ret.linkReferences.insert(linkRef); break; } case PushImmutable: - assembleInstruction([&]() { - ret.bytecode.push_back(static_cast(Instruction::PUSH32)); - // Maps keccak back to the "identifier" std::string of that immutable. - ret.immutableReferences[item.data()].first = m_immutables.at(item.data()); - // Record the bytecode offset of the PUSH32 argument. - ret.immutableReferences[item.data()].second.emplace_back(ret.bytecode.size()); - // Advance bytecode by 32 bytes (default initialized). - ret.bytecode.resize(ret.bytecode.size() + 32); - }); + ret.bytecode.push_back(static_cast(Instruction::PUSH32)); + // Maps keccak back to the "identifier" std::string of that immutable. + ret.immutableReferences[item.data()].first = m_immutables.at(item.data()); + // Record the bytecode offset of the PUSH32 argument. + ret.immutableReferences[item.data()].second.emplace_back(ret.bytecode.size()); + // Advance bytecode by 32 bytes (default initialized). + ret.bytecode.resize(ret.bytecode.size() + 32); break; case VerbatimBytecode: ret.bytecode += assembleVerbatimBytecode(item); @@ -1439,53 +1407,41 @@ LinkerObject const& Assembly::assembleLegacy() const { if (i != offsets.size() - 1) { - assembleInstruction([&]() { - ret.bytecode.push_back(uint8_t(Instruction::DUP2)); - }); - assembleInstruction([&]() { - ret.bytecode.push_back(uint8_t(Instruction::DUP2)); - }); + ret.bytecode.push_back(static_cast(Instruction::DUP2)); + // This item type decomposes into multiple evm instructions, so we manually call emit() + instructionLocationEmitter.emit(); + ret.bytecode.push_back(static_cast(Instruction::DUP2)); + instructionLocationEmitter.emit(); } - assembleInstruction([&]() { - // TODO: should we make use of the constant optimizer methods for pushing the offsets? - bytes offsetBytes = toCompactBigEndian(u256(offsets[i])); - ret.bytecode.push_back(static_cast(pushInstruction(static_cast(offsetBytes.size())))); - ret.bytecode += offsetBytes; - }); - assembleInstruction([&]() { - ret.bytecode.push_back(uint8_t(Instruction::ADD)); - }); - assembleInstruction([&]() { - ret.bytecode.push_back(uint8_t(Instruction::MSTORE)); - }); + // TODO: should we make use of the constant optimizer methods for pushing the offsets? + bytes offsetBytes = toCompactBigEndian(u256(offsets[i])); + ret.bytecode.push_back(static_cast(pushInstruction(static_cast(offsetBytes.size())))); + ret.bytecode += offsetBytes; + instructionLocationEmitter.emit(); + ret.bytecode.push_back(static_cast(Instruction::ADD)); + instructionLocationEmitter.emit(); + ret.bytecode.push_back(static_cast(Instruction::MSTORE)); + // No emit needed here, it's taken care of by the destructor of instructionLocationEmitter. } if (offsets.empty()) { - assembleInstruction([&]() { - ret.bytecode.push_back(uint8_t(Instruction::POP)); - }); - assembleInstruction([&]() { - ret.bytecode.push_back(uint8_t(Instruction::POP)); - }); + ret.bytecode.push_back(static_cast(Instruction::POP)); + instructionLocationEmitter.emit(); + ret.bytecode.push_back(static_cast(Instruction::POP)); + // no emit needed here, it's taken care of by the destructor of instructionLocationEmitter } immutableReferencesBySub.erase(item.data()); break; } case PushDeployTimeAddress: - assembleInstruction([&]() { - ret.bytecode += assemblePushDeployTimeAddress(); - }); + ret.bytecode += assemblePushDeployTimeAddress(); break; case Tag: - assembleInstruction([&](){ - ret.bytecode += assembleTag(item, ret.bytecode.size(), true); - }); + ret.bytecode += assembleTag(item, ret.bytecode.size(), true); break; default: solAssert(false, "Unexpected opcode while assembling."); } - - ++assemblyItemIndex; } codeSectionLocation.end = ret.bytecode.size();