From 3a056c7bcac7feca54680679f7d9462299c9ca01 Mon Sep 17 00:00:00 2001 From: Nuno Cruces Date: Fri, 11 Apr 2025 11:37:34 +0100 Subject: [PATCH 1/2] Simplify fill. Signed-off-by: Nuno Cruces --- internal/engine/wazevo/frontend/lower.go | 60 +++++++++--------------- 1 file changed, 22 insertions(+), 38 deletions(-) diff --git a/internal/engine/wazevo/frontend/lower.go b/internal/engine/wazevo/frontend/lower.go index e73debbd1a..c365414f92 100644 --- a/internal/engine/wazevo/frontend/lower.go +++ b/internal/engine/wazevo/frontend/lower.go @@ -665,12 +665,6 @@ func (c *Compiler) lowerCurrentOpcode() { tableBaseAddr := c.loadTableBaseAddr(tableInstancePtr) addr := builder.AllocateInstruction().AsIadd(tableBaseAddr, offsetInBytes).Insert(builder).Return() - // Prepare the loop and following block. - beforeLoop := builder.AllocateBasicBlock() - loopBlk := builder.AllocateBasicBlock() - loopVar := loopBlk.AddParam(builder, ssa.TypeI64) - followingBlk := builder.AllocateBasicBlock() - // Uses the copy trick for faster filling buffer like memory.fill, but in this case we copy 8 bytes at a time. // buf := memoryInst.Buffer[offset : offset+fillSize] // buf[0:8] = value @@ -678,6 +672,12 @@ func (c *Compiler) lowerCurrentOpcode() { // copy(buf[i:], buf[:i]) // } + // Prepare the loop and following block. + beforeLoop := builder.AllocateBasicBlock() + loopBlk := builder.AllocateBasicBlock() + loopVar := loopBlk.AddParam(builder, ssa.TypeI64) + followingBlk := builder.AllocateBasicBlock() + // Insert the jump to the beforeLoop block; If the fillSize is zero, then jump to the following block to skip entire logics. zero := builder.AllocateInstruction().AsIconst64(0).Insert(builder).Return() ifFillSizeZero := builder.AllocateInstruction().AsIcmp(fillSizeExt, zero, ssa.IntegerCmpCondEqual). 
@@ -694,26 +694,18 @@ func (c *Compiler) lowerCurrentOpcode() { builder.SetCurrentBlock(loopBlk) dstAddr := builder.AllocateInstruction().AsIadd(addr, loopVar).Insert(builder).Return() - // If loopVar*2 > fillSizeInBytes, then count must be fillSizeInBytes-loopVar. - var count ssa.Value - { - loopVarDoubled := builder.AllocateInstruction().AsIadd(loopVar, loopVar).Insert(builder).Return() - loopVarDoubledLargerThanFillSize := builder. - AllocateInstruction().AsIcmp(loopVarDoubled, fillSizeInBytes, ssa.IntegerCmpCondUnsignedGreaterThanOrEqual). - Insert(builder).Return() - diff := builder.AllocateInstruction().AsIsub(fillSizeInBytes, loopVar).Insert(builder).Return() - count = builder.AllocateInstruction().AsSelect(loopVarDoubledLargerThanFillSize, diff, loopVar).Insert(builder).Return() - } + newLoopVar := builder.AllocateInstruction().AsIadd(loopVar, loopVar).Insert(builder).Return() + newLoopVarLessThanFillSize := builder.AllocateInstruction(). + AsIcmp(newLoopVar, fillSizeInBytes, ssa.IntegerCmpCondUnsignedLessThan).Insert(builder).Return() - c.callMemmove(dstAddr, addr, count) + // On the last iteration, count must be fillSizeInBytes-loopVar. + diff := builder.AllocateInstruction().AsIsub(fillSizeInBytes, loopVar).Insert(builder).Return() + count := builder.AllocateInstruction().AsSelect(newLoopVarLessThanFillSize, loopVar, diff).Insert(builder).Return() - shiftAmount := builder.AllocateInstruction().AsIconst64(1).Insert(builder).Return() - newLoopVar := builder.AllocateInstruction().AsIshl(loopVar, shiftAmount).Insert(builder).Return() - loopVarLessThanFillSize := builder.AllocateInstruction(). - AsIcmp(newLoopVar, fillSizeInBytes, ssa.IntegerCmpCondUnsignedLessThan).Insert(builder).Return() + c.callMemmove(dstAddr, addr, count) builder.AllocateInstruction(). - AsBrnz(loopVarLessThanFillSize, c.allocateVarLengthValues(1, newLoopVar), loopBlk). + AsBrnz(newLoopVarLessThanFillSize, c.allocateVarLengthValues(1, newLoopVar), loopBlk). 
Insert(builder) c.insertJumpToBlock(ssa.ValuesNil, followingBlk) @@ -770,26 +762,18 @@ func (c *Compiler) lowerCurrentOpcode() { builder.SetCurrentBlock(loopBlk) dstAddr := builder.AllocateInstruction().AsIadd(addr, loopVar).Insert(builder).Return() - // If loopVar*2 > fillSizeExt, then count must be fillSizeExt-loopVar. - var count ssa.Value - { - loopVarDoubled := builder.AllocateInstruction().AsIadd(loopVar, loopVar).Insert(builder).Return() - loopVarDoubledLargerThanFillSize := builder. - AllocateInstruction().AsIcmp(loopVarDoubled, fillSize, ssa.IntegerCmpCondUnsignedGreaterThanOrEqual). - Insert(builder).Return() - diff := builder.AllocateInstruction().AsIsub(fillSize, loopVar).Insert(builder).Return() - count = builder.AllocateInstruction().AsSelect(loopVarDoubledLargerThanFillSize, diff, loopVar).Insert(builder).Return() - } + newLoopVar := builder.AllocateInstruction().AsIadd(loopVar, loopVar).Insert(builder).Return() + newLoopVarLessThanFillSize := builder.AllocateInstruction(). + AsIcmp(newLoopVar, fillSize, ssa.IntegerCmpCondUnsignedLessThan).Insert(builder).Return() - c.callMemmove(dstAddr, addr, count) + // On the last iteration, count must be fillSize-loopVar. + diff := builder.AllocateInstruction().AsIsub(fillSize, loopVar).Insert(builder).Return() + count := builder.AllocateInstruction().AsSelect(newLoopVarLessThanFillSize, loopVar, diff).Insert(builder).Return() - shiftAmount := builder.AllocateInstruction().AsIconst64(1).Insert(builder).Return() - newLoopVar := builder.AllocateInstruction().AsIshl(loopVar, shiftAmount).Insert(builder).Return() - loopVarLessThanFillSize := builder.AllocateInstruction(). - AsIcmp(newLoopVar, fillSize, ssa.IntegerCmpCondUnsignedLessThan).Insert(builder).Return() + c.callMemmove(dstAddr, addr, count) builder.AllocateInstruction(). - AsBrnz(loopVarLessThanFillSize, c.allocateVarLengthValues(1, newLoopVar), loopBlk). + AsBrnz(newLoopVarLessThanFillSize, c.allocateVarLengthValues(1, newLoopVar), loopBlk). 
Insert(builder) c.insertJumpToBlock(ssa.ValuesNil, followingBlk) From 2584cce44bf24dcf96598aee0eee85a08e379187 Mon Sep 17 00:00:00 2001 From: Nuno Cruces Date: Fri, 11 Apr 2025 14:01:41 +0100 Subject: [PATCH 2/2] Faster fill. Signed-off-by: Nuno Cruces --- internal/engine/interpreter/interpreter.go | 17 +++++++---- internal/engine/wazevo/frontend/lower.go | 34 +++++++++++++++------- internal/wasm/table.go | 6 +++- 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/internal/engine/interpreter/interpreter.go b/internal/engine/interpreter/interpreter.go index 5b5e6e9d08..00d25b3e55 100644 --- a/internal/engine/interpreter/interpreter.go +++ b/internal/engine/interpreter/interpreter.go @@ -1725,12 +1725,17 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, m *wasm.ModuleInstance if fillSize+offset > uint64(len(memoryInst.Buffer)) { panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) } else if fillSize != 0 { - // Uses the copy trick for faster filling buffer. - // https://gist.github.com/taylorza/df2f89d5f9ab3ffd06865062a4cf015d + // Uses the copy trick for faster filling the buffer with the value. + // https://github.com/golang/go/blob/go1.24.0/src/bytes/bytes.go#L664-L673 buf := memoryInst.Buffer[offset : offset+fillSize] - buf[0] = value - for i := 1; i < len(buf); i *= 2 { - copy(buf[i:], buf[:i]) + if value == 0 { + clear(buf) + } else { + buf[0] = value + for i := 1; i < len(buf); { + chunk := min(i, 8192) + i += copy(buf[i:], buf[:chunk]) + } } } frame.pc++ @@ -1804,7 +1809,7 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, m *wasm.ModuleInstance panic(wasmruntime.ErrRuntimeInvalidTableAccess) } else if num > 0 { // Uses the copy trick for faster filling the region with the value. 
- // https://gist.github.com/taylorza/df2f89d5f9ab3ffd06865062a4cf015d + // https://github.com/golang/go/blob/go1.24.0/src/slices/slices.go#L514-L517 targetRegion := table.References[offset : offset+num] targetRegion[0] = ref for i := 1; i < len(targetRegion); i *= 2 { diff --git a/internal/engine/wazevo/frontend/lower.go b/internal/engine/wazevo/frontend/lower.go index c365414f92..b31e1881a2 100644 --- a/internal/engine/wazevo/frontend/lower.go +++ b/internal/engine/wazevo/frontend/lower.go @@ -666,6 +666,9 @@ func (c *Compiler) lowerCurrentOpcode() { addr := builder.AllocateInstruction().AsIadd(tableBaseAddr, offsetInBytes).Insert(builder).Return() // Uses the copy trick for faster filling buffer like memory.fill, but in this case we copy 8 bytes at a time. + // Tables are rarely huge, so ignore the 8KB maximum. + // https://github.com/golang/go/blob/go1.24.0/src/slices/slices.go#L514-L517 + // // buf := memoryInst.Buffer[offset : offset+fillSize] // buf[0:8] = value // for i := 8; i < fillSize; i *= 2 { Begin with 8 bytes. 
@@ -688,8 +691,8 @@ func (c *Compiler) lowerCurrentOpcode() { // buf[0:8] = value builder.SetCurrentBlock(beforeLoop) builder.AllocateInstruction().AsStore(ssa.OpcodeStore, value, addr, 0).Insert(builder) - initValue := builder.AllocateInstruction().AsIconst64(8).Insert(builder).Return() - c.insertJumpToBlock(c.allocateVarLengthValues(1, initValue), loopBlk) + eight := builder.AllocateInstruction().AsIconst64(8).Insert(builder).Return() + c.insertJumpToBlock(c.allocateVarLengthValues(1, eight), loopBlk) builder.SetCurrentBlock(loopBlk) dstAddr := builder.AllocateInstruction().AsIadd(addr, loopVar).Insert(builder).Return() @@ -733,11 +736,15 @@ func (c *Compiler) lowerCurrentOpcode() { // Calculate the base address: addr := builder.AllocateInstruction().AsIadd(c.getMemoryBaseValue(false), offset).Insert(builder).Return() - // Uses the copy trick for faster filling buffer: https://gist.github.com/taylorza/df2f89d5f9ab3ffd06865062a4cf015d + // Uses the copy trick for faster filling buffer, with a maximum chunk size of 8KB. + // https://github.com/golang/go/blob/go1.24.0/src/bytes/bytes.go#L664-L673 + // // buf := memoryInst.Buffer[offset : offset+fillSize] // buf[0] = value - // for i := 1; i < fillSize; i *= 2 { - // copy(buf[i:], buf[:i]) + // for i := 1; i < fillSize; { + // chunk := ((i - 1) & 8191) + 1 + // copy(buf[i:], buf[:chunk]) + // i += chunk // } // Prepare the loop and following block. 
@@ -756,19 +763,26 @@ func (c *Compiler) lowerCurrentOpcode() { // buf[0] = value builder.SetCurrentBlock(beforeLoop) builder.AllocateInstruction().AsStore(ssa.OpcodeIstore8, value, addr, 0).Insert(builder) - initValue := builder.AllocateInstruction().AsIconst64(1).Insert(builder).Return() - c.insertJumpToBlock(c.allocateVarLengthValues(1, initValue), loopBlk) + one := builder.AllocateInstruction().AsIconst64(1).Insert(builder).Return() + c.insertJumpToBlock(c.allocateVarLengthValues(1, one), loopBlk) builder.SetCurrentBlock(loopBlk) dstAddr := builder.AllocateInstruction().AsIadd(addr, loopVar).Insert(builder).Return() - newLoopVar := builder.AllocateInstruction().AsIadd(loopVar, loopVar).Insert(builder).Return() + // chunk := ((i - 1) & 8191) + 1 + mask := builder.AllocateInstruction().AsIconst64(8191).Insert(builder).Return() + tmp1 := builder.AllocateInstruction().AsIsub(loopVar, one).Insert(builder).Return() + tmp2 := builder.AllocateInstruction().AsBand(tmp1, mask).Insert(builder).Return() + chunk := builder.AllocateInstruction().AsIadd(tmp2, one).Insert(builder).Return() + + // i += chunk + newLoopVar := builder.AllocateInstruction().AsIadd(loopVar, chunk).Insert(builder).Return() newLoopVarLessThanFillSize := builder.AllocateInstruction(). AsIcmp(newLoopVar, fillSize, ssa.IntegerCmpCondUnsignedLessThan).Insert(builder).Return() - // On the last iteration, count must be fillSize-loopVar. 
+ // count = min(chunk, fillSize-loopVar) diff := builder.AllocateInstruction().AsIsub(fillSize, loopVar).Insert(builder).Return() - count := builder.AllocateInstruction().AsSelect(newLoopVarLessThanFillSize, loopVar, diff).Insert(builder).Return() + count := builder.AllocateInstruction().AsSelect(newLoopVarLessThanFillSize, chunk, diff).Insert(builder).Return() c.callMemmove(dstAddr, addr, count) diff --git a/internal/wasm/table.go b/internal/wasm/table.go index 2123693c6b..1df1764df4 100644 --- a/internal/wasm/table.go +++ b/internal/wasm/table.go @@ -326,10 +326,14 @@ func (t *TableInstance) Grow(delta uint32, initialRef Reference) (currentLen uin newLen >= math.MaxUint32 || (t.Max != nil && newLen > int64(*t.Max)) { return 0xffffffff // = -1 in signed 32-bit integer. } + t.References = append(t.References, make([]uintptr, delta)...) + if initialRef == 0 { + return + } // Uses the copy trick for faster filling the new region with the initial value. - // https://gist.github.com/taylorza/df2f89d5f9ab3ffd06865062a4cf015d + // https://github.com/golang/go/blob/go1.24.0/src/slices/slices.go#L514-L517 newRegion := t.References[currentLen:] newRegion[0] = initialRef for i := 1; i < len(newRegion); i *= 2 {