diff --git a/internal/engine/wazevo/backend/isa/arm64/abi.go b/internal/engine/wazevo/backend/isa/arm64/abi.go index 6615471c6a..4eaa13ce1c 100644 --- a/internal/engine/wazevo/backend/isa/arm64/abi.go +++ b/internal/engine/wazevo/backend/isa/arm64/abi.go @@ -101,13 +101,14 @@ func (m *machine) LowerParams(args []ssa.Value) { bits := arg.Type.Bits() // At this point of compilation, we don't yet know how much space exist below the return address. // So we instruct the address mode to add the `argStackOffset` to the offset at the later phase of compilation. - amode := addressMode{imm: arg.Offset, rn: spVReg, kind: addressModeKindArgStackSpace} + amode := m.amodePool.Allocate() + *amode = addressMode{imm: arg.Offset, rn: spVReg, kind: addressModeKindArgStackSpace} load := m.allocateInstr() switch arg.Type { case ssa.TypeI32, ssa.TypeI64: - load.asULoad(operandNR(reg), amode, bits) + load.asULoad(reg, amode, bits) case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: - load.asFpuLoad(operandNR(reg), amode, bits) + load.asFpuLoad(reg, amode, bits) default: panic("BUG") } @@ -169,7 +170,8 @@ func (m *machine) LowerReturns(rets []ssa.Value) { // At this point of compilation, we don't yet know how much space exist below the return address. // So we instruct the address mode to add the `retStackOffset` to the offset at the later phase of compilation. - amode := addressMode{imm: r.Offset, rn: spVReg, kind: addressModeKindResultStackSpace} + amode := m.amodePool.Allocate() + *amode = addressMode{imm: r.Offset, rn: spVReg, kind: addressModeKindResultStackSpace} store := m.allocateInstr() store.asStore(operandNR(reg), amode, bits) m.insert(store) @@ -215,9 +217,9 @@ func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex i ldr := m.allocateInstr() switch r.Type { case ssa.TypeI32, ssa.TypeI64: - ldr.asULoad(operandNR(reg), amode, r.Type.Bits()) + ldr.asULoad(reg, amode, r.Type.Bits()) case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: - ldr.asFpuLoad(operandNR(reg), amode, r.Type.Bits()) + ldr.asFpuLoad(reg, amode, r.Type.Bits()) default: panic("BUG") } @@ -225,7 +227,7 @@ func (m *machine) callerGenFunctionReturnVReg(a *backend.FunctionABI, retIndex i } } -func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) (*instruction, addressMode) { +func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) (*instruction, *addressMode) { exct := m.executableContext exct.PendingInstructions = exct.PendingInstructions[:0] mode := m.resolveAddressModeForOffset(offset, dstBits, rn, allowTmpRegUse) @@ -235,15 +237,15 @@ func (m *machine) resolveAddressModeForOffsetAndInsert(cur *instruction, offset return cur, mode } -func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) addressMode { +func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn regalloc.VReg, allowTmpRegUse bool) *addressMode { if rn.RegType() != regalloc.RegTypeInt { panic("BUG: rn should be a pointer: " + formatVRegSized(rn, 64)) } - var amode addressMode + amode := m.amodePool.Allocate() if offsetFitsInAddressModeKindRegUnsignedImm12(dstBits, offset) { - amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: rn, imm: offset} + *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: rn, imm: offset} } else if offsetFitsInAddressModeKindRegSignedImm9(offset) { - amode = addressMode{kind: 
addressModeKindRegSignedImm9, rn: rn, imm: offset} + *amode = addressMode{kind: addressModeKindRegSignedImm9, rn: rn, imm: offset} } else { var indexReg regalloc.VReg if allowTmpRegUse { @@ -253,7 +255,7 @@ func (m *machine) resolveAddressModeForOffset(offset int64, dstBits byte, rn reg indexReg = m.compiler.AllocateVReg(ssa.TypeI64) m.lowerConstantI64(indexReg, offset) } - amode = addressMode{kind: addressModeKindRegReg, rn: rn, rm: indexReg, extOp: extendOpUXTX /* indicates index rm is 64-bit */} + *amode = addressMode{kind: addressModeKindRegReg, rn: rn, rm: indexReg, extOp: extendOpUXTX /* indicates index rm is 64-bit */} } return amode } @@ -315,7 +317,7 @@ func (m *machine) insertAddOrSubStackPointer(rd regalloc.VReg, diff int64, add b } else { ao = aluOpSub } - alu.asALU(ao, operandNR(rd), operandNR(spVReg), imm12Operand, true) + alu.asALU(ao, rd, operandNR(spVReg), imm12Operand, true) m.insert(alu) } else { m.lowerConstantI64(tmpRegVReg, diff) @@ -326,7 +328,7 @@ func (m *machine) insertAddOrSubStackPointer(rd regalloc.VReg, diff int64, add b } else { ao = aluOpSub } - alu.asALU(ao, operandNR(rd), operandNR(spVReg), operandNR(tmpRegVReg), true) + alu.asALU(ao, rd, operandNR(spVReg), operandNR(tmpRegVReg), true) m.insert(alu) } } diff --git a/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go b/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go index 7a9cceb332..f8b5d97ac7 100644 --- a/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go +++ b/internal/engine/wazevo/backend/isa/arm64/abi_entry_preamble.go @@ -59,25 +59,26 @@ func (m *machine) goEntryPreamblePassArg(cur *instruction, paramSlicePtr regallo } else { postIndexImm = 8 } - loadMode := addressMode{kind: addressModeKindPostIndex, rn: paramSlicePtr, imm: postIndexImm} + loadMode := m.amodePool.Allocate() + *loadMode = addressMode{kind: addressModeKindPostIndex, rn: paramSlicePtr, imm: postIndexImm} instr := m.allocateInstr() switch typ { case ssa.TypeI32: - instr.asULoad(loadTargetReg, loadMode, 32) + instr.asULoad(loadTargetReg.reg(), loadMode, 32) case ssa.TypeI64: - instr.asULoad(loadTargetReg, loadMode, 64) + instr.asULoad(loadTargetReg.reg(), loadMode, 64) case ssa.TypeF32: - instr.asFpuLoad(loadTargetReg, loadMode, 32) + instr.asFpuLoad(loadTargetReg.reg(), loadMode, 32) case ssa.TypeF64: - instr.asFpuLoad(loadTargetReg, loadMode, 64) + instr.asFpuLoad(loadTargetReg.reg(), loadMode, 64) case ssa.TypeV128: - instr.asFpuLoad(loadTargetReg, loadMode, 128) + instr.asFpuLoad(loadTargetReg.reg(), loadMode, 128) } cur = linkInstr(cur, instr) if isStackArg { - var storeMode addressMode + var storeMode *addressMode cur, storeMode = m.resolveAddressModeForOffsetAndInsert(cur, argStartOffsetFromSP+arg.Offset, bits, spVReg, true) toStack := m.allocateInstr() toStack.asStore(loadTargetReg, storeMode, bits) @@ -113,21 +114,22 @@ func (m *machine) goEntryPreamblePassResult(cur *instruction, resultSlicePtr reg } if isStackArg { - var loadMode addressMode + var loadMode *addressMode cur, loadMode = m.resolveAddressModeForOffsetAndInsert(cur, resultStartOffsetFromSP+result.Offset, bits, spVReg, true) toReg := m.allocateInstr() switch typ { case ssa.TypeI32, ssa.TypeI64: - toReg.asULoad(storeTargetReg, loadMode, bits) + toReg.asULoad(storeTargetReg.reg(), loadMode, bits) case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128: - toReg.asFpuLoad(storeTargetReg, loadMode, bits) + toReg.asFpuLoad(storeTargetReg.reg(), loadMode, bits) default: panic("TODO?") } cur = linkInstr(cur, toReg) } - mode := addressMode{kind: 
addressModeKindPostIndex, rn: resultSlicePtr, imm: postIndexImm} + mode := m.amodePool.Allocate() + *mode = addressMode{kind: addressModeKindPostIndex, rn: resultSlicePtr, imm: postIndexImm} instr := m.allocateInstr() instr.asStore(storeTargetReg, mode, bits) cur = linkInstr(cur, instr) @@ -214,11 +216,12 @@ func (m *machine) move64(dst, src regalloc.VReg, prev *instruction) *instruction func (m *machine) loadOrStoreAtExecutionContext(d regalloc.VReg, offset wazevoapi.Offset, store bool, prev *instruction) *instruction { instr := m.allocateInstr() - mode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: savedExecutionContextPtr, imm: offset.I64()} + mode := m.amodePool.Allocate() + *mode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: savedExecutionContextPtr, imm: offset.I64()} if store { instr.asStore(operandNR(d), mode, 64) } else { - instr.asULoad(operandNR(d), mode, 64) + instr.asULoad(d, mode, 64) } return linkInstr(prev, instr) } diff --git a/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go b/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go index 466b1f9609..99e6bb482d 100644 --- a/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go +++ b/internal/engine/wazevo/backend/isa/arm64/abi_go_call.go @@ -87,7 +87,8 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig * // Module context is always the second argument. moduleCtrPtr := x1VReg store := m.allocateInstr() - amode := addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtrPtr, imm: offset} + amode := m.amodePool.Allocate() + *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtrPtr, imm: offset} store.asStore(operandNR(moduleCtrPtr), amode, 64) cur = linkInstr(cur, store) } @@ -120,11 +121,9 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig * } else { sizeInBits = 64 } - store.asStore(operandNR(v), - addressMode{ - kind: addressModeKindPostIndex, - rn: arg0ret0AddrReg, imm: int64(sizeInBits / 8), - }, sizeInBits) + amode := m.amodePool.Allocate() + *amode = addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg, imm: int64(sizeInBits / 8)} + store.asStore(operandNR(v), amode, sizeInBits) cur = linkInstr(cur, store) } @@ -139,7 +138,7 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig * frameSizeReg = xzrVReg sliceSizeReg = xzrVReg } - _amode := addressModePreOrPostIndex(spVReg, -16, true) + _amode := addressModePreOrPostIndex(m, spVReg, -16, true) storeP := m.allocateInstr() storeP.asStorePair64(frameSizeReg, sliceSizeReg, _amode) cur = linkInstr(cur, storeP) @@ -165,8 +164,8 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig * cur = m.addsAddOrSubStackPointer(cur, spVReg, frameInfoSize+goCallStackSize, true) ldr := m.allocateInstr() // And load the return address. - ldr.asULoad(operandNR(lrVReg), - addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64) + amode := addressModePreOrPostIndex(m, spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */) + ldr.asULoad(lrVReg, amode, 64) cur = linkInstr(cur, ldr) originalRet0Reg := x17VReg // Caller save, so we can use it for whatever we want. 
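Reviewer note on the `m.amodePool.Allocate()` / `*amode = addressMode{...}` pattern that recurs throughout this change: address modes used to be embedded by value in each instruction and are now pointers handed out by a per-machine pool that is recycled between compilations. The sketch below shows a minimal pool of that shape; it is illustrative only (the real `wazevoapi` pool is not part of this diff), and `chunkSize` and the field names are made up.

```go
package main

import "fmt"

// pool hands out *T values backed by reusable chunks. Reset rewinds the
// allocation index but keeps the backing memory, so the next compilation
// overwrites the same slots instead of heap-allocating one addressMode
// per load/store instruction.
type pool[T any] struct {
	chunks [][]T
	idx    int // next free slot in the last chunk
}

const chunkSize = 128 // illustrative

func (p *pool[T]) Allocate() *T {
	if len(p.chunks) == 0 || p.idx == chunkSize {
		p.chunks = append(p.chunks, make([]T, chunkSize))
		p.idx = 0
	}
	v := &p.chunks[len(p.chunks)-1][p.idx]
	p.idx++
	return v
}

func (p *pool[T]) Reset() {
	if len(p.chunks) > 1 {
		p.chunks = p.chunks[:1]
	}
	p.idx = 0
}

type addressMode struct {
	kind int
	imm  int64
}

func main() {
	var amodePool pool[addressMode]
	amode := amodePool.Allocate()
	*amode = addressMode{kind: 1, imm: 16} // full overwrite, as in the diff
	fmt.Printf("%+v\n", *amode)
	amodePool.Reset() // once per function compilation
}
```

Assigning the whole struct after Allocate (rather than writing `&addressMode{...}`) avoids a per-instruction heap allocation and gives the pool ownership of every address mode's lifetime, which is also what makes storing a raw pointer inside the instruction (later in this diff) safe.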
@@ -183,23 +182,24 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig * r := &abi.Rets[i] if r.Kind == backend.ABIArgKindReg { loadIntoReg := m.allocateInstr() - mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg} + mode := m.amodePool.Allocate() + *mode = addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg} switch r.Type { case ssa.TypeI32: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoReg.asULoad(operandNR(r.Reg), mode, 32) + loadIntoReg.asULoad(r.Reg, mode, 32) case ssa.TypeI64: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoReg.asULoad(operandNR(r.Reg), mode, 64) + loadIntoReg.asULoad(r.Reg, mode, 64) case ssa.TypeF32: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 32) + loadIntoReg.asFpuLoad(r.Reg, mode, 32) case ssa.TypeF64: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 64) + loadIntoReg.asFpuLoad(r.Reg, mode, 64) case ssa.TypeV128: mode.imm = 16 - loadIntoReg.asFpuLoad(operandNR(r.Reg), mode, 128) + loadIntoReg.asFpuLoad(r.Reg, mode, 128) default: panic("TODO") } @@ -208,28 +208,29 @@ func (m *machine) CompileGoFunctionTrampoline(exitCode wazevoapi.ExitCode, sig * // First we need to load the value to a temporary just like ^^. intTmp, floatTmp := x11VReg, v11VReg loadIntoTmpReg := m.allocateInstr() - mode := addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg} + mode := m.amodePool.Allocate() + *mode = addressMode{kind: addressModeKindPostIndex, rn: arg0ret0AddrReg} var resultReg regalloc.VReg switch r.Type { case ssa.TypeI32: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 32) + loadIntoTmpReg.asULoad(intTmp, mode, 32) resultReg = intTmp case ssa.TypeI64: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoTmpReg.asULoad(operandNR(intTmp), mode, 64) + loadIntoTmpReg.asULoad(intTmp, mode, 64) resultReg = intTmp case ssa.TypeF32: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 32) + loadIntoTmpReg.asFpuLoad(floatTmp, mode, 32) resultReg = floatTmp case ssa.TypeF64: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 64) + loadIntoTmpReg.asFpuLoad(floatTmp, mode, 64) resultReg = floatTmp case ssa.TypeV128: mode.imm = 16 - loadIntoTmpReg.asFpuLoad(operandNR(floatTmp), mode, 128) + loadIntoTmpReg.asFpuLoad(floatTmp, mode, 128) resultReg = floatTmp default: panic("TODO") @@ -258,12 +259,13 @@ func (m *machine) saveRegistersInExecutionContext(cur *instruction, regs []regal case regalloc.RegTypeFloat: sizeInBits = 128 } - store.asStore(operandNR(v), - addressMode{ - kind: addressModeKindRegUnsignedImm12, - // Execution context is always the first argument. - rn: x0VReg, imm: offset, - }, sizeInBits) + mode := m.amodePool.Allocate() + *mode = addressMode{ + kind: addressModeKindRegUnsignedImm12, + // Execution context is always the first argument. 
+ rn: x0VReg, imm: offset, + } + store.asStore(operandNR(v), mode, sizeInBits) store.prev = cur cur.next = store cur = store @@ -276,7 +278,7 @@ func (m *machine) restoreRegistersInExecutionContext(cur *instruction, regs []re offset := wazevoapi.ExecutionContextOffsetSavedRegistersBegin.I64() for _, v := range regs { load := m.allocateInstr() - var as func(dst operand, amode addressMode, sizeInBits byte) + var as func(dst regalloc.VReg, amode *addressMode, sizeInBits byte) var sizeInBits byte switch v.RegType() { case regalloc.RegTypeInt: @@ -286,12 +288,13 @@ func (m *machine) restoreRegistersInExecutionContext(cur *instruction, regs []re as = load.asFpuLoad sizeInBits = 128 } - as(operandNR(v), - addressMode{ - kind: addressModeKindRegUnsignedImm12, - // Execution context is always the first argument. - rn: x0VReg, imm: offset, - }, sizeInBits) + mode := m.amodePool.Allocate() + *mode = addressMode{ + kind: addressModeKindRegUnsignedImm12, + // Execution context is always the first argument. + rn: x0VReg, imm: offset, + } + as(v, mode, sizeInBits) cur = linkInstr(cur, load) offset += 16 // Imm12 must be aligned 16 for vector regs, so we unconditionally load regs at the offset of multiple of 16. } @@ -324,11 +327,9 @@ func (m *machine) setExitCode(cur *instruction, execCtr regalloc.VReg, exitCode // Set the exit status on the execution context. setExistStatus := m.allocateInstr() - setExistStatus.asStore(operandNR(constReg), - addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: execCtr, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(), - }, 32) + mode := m.amodePool.Allocate() + *mode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: execCtr, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64()} + setExistStatus.asStore(operandNR(constReg), mode, 32) cur = linkInstr(cur, setExistStatus) return cur } @@ -340,12 +341,13 @@ func (m *machine) storeReturnAddressAndExit(cur *instruction) *instruction { cur = linkInstr(cur, adr) storeReturnAddr := m.allocateInstr() - storeReturnAddr.asStore(operandNR(tmpRegVReg), - addressMode{ - kind: addressModeKindRegUnsignedImm12, - // Execution context is always the first argument. - rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(), - }, 64) + mode := m.amodePool.Allocate() + *mode = addressMode{ + kind: addressModeKindRegUnsignedImm12, + // Execution context is always the first argument. + rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(), + } + storeReturnAddr.asStore(operandNR(tmpRegVReg), mode, 64) cur = linkInstr(cur, storeReturnAddr) // Exit the execution. 
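The `var as func(dst regalloc.VReg, amode *addressMode, sizeInBits byte)` rewrite in restoreRegistersInExecutionContext leans on Go method values: `load.asULoad` and `load.asFpuLoad` now share a signature, so either can be captured with its receiver already bound and invoked later. A tiny self-contained illustration follows; the types and names here are stand-ins, not the backend's real ones.

```go
package main

import "fmt"

type instr struct{ text string }

// Both setters have the same signature, so either can be stored in the
// same func-typed variable; the receiver is captured at assignment time.
func (i *instr) asULoad(dst string, sizeInBits byte) {
	i.text = fmt.Sprintf("ldr(u) %s, #%d-bit", dst, sizeInBits)
}

func (i *instr) asFpuLoad(dst string, sizeInBits byte) {
	i.text = fmt.Sprintf("ldr(fpu) %s, #%d-bit", dst, sizeInBits)
}

func main() {
	load := &instr{}
	var as func(dst string, sizeInBits byte)
	if isFloat := true; isFloat {
		as = load.asFpuLoad // method value: `load` is bound here
	} else {
		as = load.asULoad
	}
	as("v11", 128) // equivalent to load.asFpuLoad("v11", 128)
	fmt.Println(load.text)
}
```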
@@ -364,11 +366,12 @@ func (m *machine) saveCurrentStackPointer(cur *instruction, execCtr regalloc.VRe cur = linkInstr(cur, movSp) strSp := m.allocateInstr() - strSp.asStore(operandNR(tmpRegVReg), - addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: execCtr, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(), - }, 64) + mode := m.amodePool.Allocate() + *mode = addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtr, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(), + } + strSp.asStore(operandNR(tmpRegVReg), mode, 64) cur = linkInstr(cur, strSp) return cur } @@ -376,27 +379,28 @@ func (m *machine) saveCurrentStackPointer(cur *instruction, execCtr regalloc.VRe func (m *machine) goFunctionCallLoadStackArg(cur *instruction, originalArg0Reg regalloc.VReg, arg *backend.ABIArg, intVReg, floatVReg regalloc.VReg) (*instruction, regalloc.VReg) { load := m.allocateInstr() var result regalloc.VReg - mode := addressMode{kind: addressModeKindPostIndex, rn: originalArg0Reg} + mode := m.amodePool.Allocate() + *mode = addressMode{kind: addressModeKindPostIndex, rn: originalArg0Reg} switch arg.Type { case ssa.TypeI32: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - load.asULoad(operandNR(intVReg), mode, 32) + load.asULoad(intVReg, mode, 32) result = intVReg case ssa.TypeI64: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - load.asULoad(operandNR(intVReg), mode, 64) + load.asULoad(intVReg, mode, 64) result = intVReg case ssa.TypeF32: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - load.asFpuLoad(operandNR(floatVReg), mode, 32) + load.asFpuLoad(floatVReg, mode, 32) result = floatVReg case ssa.TypeF64: mode.imm = 8 // We use uint64 for all basic types, except SIMD v128. - load.asFpuLoad(operandNR(floatVReg), mode, 64) + load.asFpuLoad(floatVReg, mode, 64) result = floatVReg case ssa.TypeV128: mode.imm = 16 - load.asFpuLoad(operandNR(floatVReg), mode, 128) + load.asFpuLoad(floatVReg, mode, 128) result = floatVReg default: panic("TODO") @@ -408,7 +412,8 @@ func (m *machine) goFunctionCallLoadStackArg(cur *instruction, originalArg0Reg r func (m *machine) goFunctionCallStoreStackResult(cur *instruction, originalRet0Reg regalloc.VReg, result *backend.ABIArg, resultVReg regalloc.VReg) *instruction { store := m.allocateInstr() - mode := addressMode{kind: addressModeKindPostIndex, rn: originalRet0Reg} + mode := m.amodePool.Allocate() + *mode = addressMode{kind: addressModeKindPostIndex, rn: originalRet0Reg} var sizeInBits byte switch result.Type { case ssa.TypeI32, ssa.TypeF32: diff --git a/internal/engine/wazevo/backend/isa/arm64/instr.go b/internal/engine/wazevo/backend/isa/arm64/instr.go index 8aabc5997b..7121cb5382 100644 --- a/internal/engine/wazevo/backend/isa/arm64/instr.go +++ b/internal/engine/wazevo/backend/isa/arm64/instr.go @@ -3,10 +3,12 @@ package arm64 import ( "fmt" "math" + "unsafe" "github.com/tetratelabs/wazero/internal/engine/wazevo/backend" "github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc" "github.com/tetratelabs/wazero/internal/engine/wazevo/ssa" + "github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi" ) type ( @@ -22,9 +24,9 @@ type ( // TODO: optimize the layout later once the impl settles. 
instruction struct { prev, next *instruction - u1, u2, u3 uint64 - rd, rm, rn, ra operand - amode addressMode + u1, u2 uint64 + rd regalloc.VReg + rm, rn operand kind instructionKind addedBeforeRegAlloc bool } @@ -174,7 +176,7 @@ func (i *instruction) Defs(regs *[]regalloc.VReg) []regalloc.VReg { switch defKinds[i.kind] { case defKindNone: case defKindRD: - *regs = append(*regs, i.rd.nr()) + *regs = append(*regs, i.rd) case defKindCall: _, _, retIntRealRegs, retFloatRealRegs, _ := backend.ABIInfoFromUint64(i.u2) for i := byte(0); i < retIntRealRegs; i++ { @@ -194,7 +196,7 @@ func (i *instruction) AssignDef(reg regalloc.VReg) { switch defKinds[i.kind] { case defKindNone: case defKindRD: - i.rd = i.rd.assignReg(reg) + i.rd = reg case defKindCall: panic("BUG: call instructions shouldn't be assigned") default: @@ -329,7 +331,7 @@ func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg { if rm := i.rm.reg(); rm.Valid() { *regs = append(*regs, rm) } - if ra := i.ra.reg(); ra.Valid() { + if ra := regalloc.VReg(i.u2); ra.Valid() { *regs = append(*regs, ra) } case useKindRNRN1RM: @@ -341,18 +343,20 @@ func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg { *regs = append(*regs, rm) } case useKindAMode: - if amodeRN := i.amode.rn; amodeRN.Valid() { + amode := i.getAmode() + if amodeRN := amode.rn; amodeRN.Valid() { *regs = append(*regs, amodeRN) } - if amodeRM := i.amode.rm; amodeRM.Valid() { + if amodeRM := amode.rm; amodeRM.Valid() { *regs = append(*regs, amodeRM) } case useKindRNAMode: *regs = append(*regs, i.rn.reg()) - if amodeRN := i.amode.rn; amodeRN.Valid() { + amode := i.getAmode() + if amodeRN := amode.rn; amodeRN.Valid() { *regs = append(*regs, amodeRN) } - if amodeRM := i.amode.rm; amodeRM.Valid() { + if amodeRM := amode.rm; amodeRM.Valid() { *regs = append(*regs, amodeRM) } case useKindCond: @@ -374,7 +378,7 @@ func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg { case useKindRDRewrite: *regs = append(*regs, i.rn.reg()) *regs = append(*regs, i.rm.reg()) - *regs = append(*regs, i.rd.reg()) + *regs = append(*regs, i.rd) default: panic(fmt.Sprintf("useKind for %v not defined", i)) } @@ -408,8 +412,8 @@ func (i *instruction) AssignUse(index int, reg regalloc.VReg) { i.rm = i.rm.assignReg(reg) } } else { - if rd := i.rd.reg(); rd.Valid() { - i.rd = i.rd.assignReg(reg) + if rd := i.rd; rd.Valid() { + i.rd = reg } } case useKindRNRN1RM: @@ -435,32 +439,36 @@ func (i *instruction) AssignUse(index int, reg regalloc.VReg) { i.rm = i.rm.assignReg(reg) } } else { - if ra := i.ra.reg(); ra.Valid() { - i.ra = i.ra.assignReg(reg) + if ra := regalloc.VReg(i.u2); ra.Valid() { + i.u2 = uint64(reg) } } case useKindAMode: if index == 0 { - if amodeRN := i.amode.rn; amodeRN.Valid() { - i.amode.rn = reg + amode := i.getAmode() + if amodeRN := amode.rn; amodeRN.Valid() { + amode.rn = reg } } else { - if amodeRM := i.amode.rm; amodeRM.Valid() { - i.amode.rm = reg + amode := i.getAmode() + if amodeRM := amode.rm; amodeRM.Valid() { + amode.rm = reg } } case useKindRNAMode: if index == 0 { i.rn = i.rn.assignReg(reg) } else if index == 1 { - if amodeRN := i.amode.rn; amodeRN.Valid() { - i.amode.rn = reg + amode := i.getAmode() + if amodeRN := amode.rn; amodeRN.Valid() { + amode.rn = reg } else { panic("BUG") } } else { - if amodeRM := i.amode.rm; amodeRM.Valid() { - i.amode.rm = reg + amode := i.getAmode() + if amodeRM := amode.rm; amodeRM.Valid() { + amode.rm = reg } else { panic("BUG") } @@ -503,35 +511,35 @@ func (i *instruction) callFuncRef() ssa.FuncRef { } // shift must be 
divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false) -func (i *instruction) asMOVZ(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) { +func (i *instruction) asMOVZ(dst regalloc.VReg, imm uint64, shift uint32, dst64bit bool) { i.kind = movZ - i.rd = operandNR(dst) + i.rd = dst i.u1 = imm - i.u2 = shift + i.u2 = uint64(shift) if dst64bit { - i.u3 = 1 + i.u2 |= 1 << 32 } } // shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false) -func (i *instruction) asMOVK(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) { +func (i *instruction) asMOVK(dst regalloc.VReg, imm uint64, shift uint32, dst64bit bool) { i.kind = movK - i.rd = operandNR(dst) + i.rd = dst i.u1 = imm - i.u2 = shift + i.u2 = uint64(shift) if dst64bit { - i.u3 = 1 + i.u2 |= 1 << 32 } } // shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false) -func (i *instruction) asMOVN(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) { +func (i *instruction) asMOVN(dst regalloc.VReg, imm uint64, shift uint32, dst64bit bool) { i.kind = movN - i.rd = operandNR(dst) + i.rd = dst i.u1 = imm - i.u2 = shift + i.u2 = uint64(shift) if dst64bit { - i.u3 = 1 + i.u2 |= 1 << 32 } } @@ -553,21 +561,21 @@ func (i *instruction) asRet() { i.kind = ret } -func (i *instruction) asStorePair64(src1, src2 regalloc.VReg, amode addressMode) { +func (i *instruction) asStorePair64(src1, src2 regalloc.VReg, amode *addressMode) { i.kind = storeP64 i.rn = operandNR(src1) i.rm = operandNR(src2) - i.amode = amode + i.setAmode(amode) } -func (i *instruction) asLoadPair64(src1, src2 regalloc.VReg, amode addressMode) { +func (i *instruction) asLoadPair64(src1, src2 regalloc.VReg, amode *addressMode) { i.kind = loadP64 i.rn = operandNR(src1) i.rm = operandNR(src2) - i.amode = amode + i.setAmode(amode) } -func (i *instruction) asStore(src operand, amode addressMode, sizeInBits byte) { +func (i *instruction) asStore(src operand, amode *addressMode, sizeInBits byte) { switch sizeInBits { case 8: i.kind = store8 @@ -589,10 +597,10 @@ func (i *instruction) asStore(src operand, amode addressMode, sizeInBits byte) { i.kind = fpuStore128 } i.rn = src - i.amode = amode + i.setAmode(amode) } -func (i *instruction) asSLoad(dst operand, amode addressMode, sizeInBits byte) { +func (i *instruction) asSLoad(dst regalloc.VReg, amode *addressMode, sizeInBits byte) { switch sizeInBits { case 8: i.kind = sLoad8 @@ -604,10 +612,10 @@ func (i *instruction) asSLoad(dst operand, amode addressMode, sizeInBits byte) { panic("BUG") } i.rd = dst - i.amode = amode + i.setAmode(amode) } -func (i *instruction) asULoad(dst operand, amode addressMode, sizeInBits byte) { +func (i *instruction) asULoad(dst regalloc.VReg, amode *addressMode, sizeInBits byte) { switch sizeInBits { case 8: i.kind = uLoad8 @@ -619,10 +627,10 @@ func (i *instruction) asULoad(dst operand, amode addressMode, sizeInBits byte) { i.kind = uLoad64 } i.rd = dst - i.amode = amode + i.setAmode(amode) } -func (i *instruction) asFpuLoad(dst operand, amode addressMode, sizeInBits byte) { +func (i *instruction) asFpuLoad(dst regalloc.VReg, amode *addressMode, sizeInBits byte) { switch sizeInBits { case 32: i.kind = fpuLoad32 @@ -632,10 +640,18 @@ func (i *instruction) asFpuLoad(dst operand, amode addressMode, sizeInBits byte) i.kind = fpuLoad128 } i.rd = dst - i.amode = amode + i.setAmode(amode) } -func (i *instruction) asVecLoad1R(rd, rn operand, arr vecArrangement) { +func (i 
*instruction) getAmode() *addressMode { + return wazevoapi.PtrFromUintptr[addressMode](uintptr(i.u1)) +} + +func (i *instruction) setAmode(a *addressMode) { + i.u1 = uint64(uintptr(unsafe.Pointer(a))) +} + +func (i *instruction) asVecLoad1R(rd regalloc.VReg, rn operand, arr vecArrangement) { // NOTE: currently only has support for no-offset loads, though it is suspicious that // we would need to support offset load (that is only available for post-index). i.kind = vecLoad1R @@ -646,32 +662,32 @@ func (i *instruction) asVecLoad1R(rd, rn operand, arr vecArrangement) { func (i *instruction) asCSet(rd regalloc.VReg, mask bool, c condFlag) { i.kind = cSet - i.rd = operandNR(rd) + i.rd = rd i.u1 = uint64(c) if mask { i.u2 = 1 } } -func (i *instruction) asCSel(rd, rn, rm operand, c condFlag, _64bit bool) { +func (i *instruction) asCSel(rd regalloc.VReg, rn, rm operand, c condFlag, _64bit bool) { i.kind = cSel i.rd = rd i.rn = rn i.rm = rm i.u1 = uint64(c) if _64bit { - i.u3 = 1 + i.u2 = 1 } } -func (i *instruction) asFpuCSel(rd, rn, rm operand, c condFlag, _64bit bool) { +func (i *instruction) asFpuCSel(rd regalloc.VReg, rn, rm operand, c condFlag, _64bit bool) { i.kind = fpuCSel i.rd = rd i.rn = rn i.rm = rm i.u1 = uint64(c) if _64bit { - i.u3 = 1 + i.u2 = 1 } } @@ -691,7 +707,7 @@ func (i *instruction) asBrTableSequence(indexReg regalloc.VReg, targetIndex, tar } func (i *instruction) brTableSequenceOffsetsResolved() { - i.u3 = 1 // indicate that the offsets are resolved, for debugging. + i.rm.data = 1 // indicate that the offsets are resolved, for debugging. } func (i *instruction) brLabel() label { @@ -701,7 +717,7 @@ func (i *instruction) brLabel() label { // brOffsetResolved is called when the target label is resolved. func (i *instruction) brOffsetResolve(offset int64) { i.u2 = uint64(offset) - i.u3 = 1 // indicate that the offset is resolved, for debugging. + i.rm.data = 1 // indicate that the offset is resolved, for debugging. } func (i *instruction) brOffset() int64 { @@ -714,7 +730,7 @@ func (i *instruction) asCondBr(c cond, target label, is64bit bool) { i.u1 = c.asUint64() i.u2 = uint64(target) if is64bit { - i.u3 = 1 + i.u2 |= 1 << 32 } } @@ -728,17 +744,17 @@ func (i *instruction) condBrLabel() label { // condBrOffsetResolve is called when the target label is resolved. func (i *instruction) condBrOffsetResolve(offset int64) { - i.rd.data = uint64(offset) - i.rd.data2 = 1 // indicate that the offset is resolved, for debugging. + i.rn.data = uint64(offset) + i.rn.data2 = 1 // indicate that the offset is resolved, for debugging. } // condBrOffsetResolved returns true if condBrOffsetResolve is already called. 
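The new getAmode/setAmode pair stores the pooled *addressMode inside the instruction's u1 field as an integer and converts it back with wazevoapi.PtrFromUintptr. The sketch below assumes that helper is essentially an unsafe.Pointer round-trip (its definition is not part of this diff); the safety of the scheme rests on the pool, not the instruction, owning the addressMode, so the stored address can never outlive its target.

```go
package main

import (
	"fmt"
	"unsafe"
)

type addressMode struct{ imm int64 }

// Assumed shape of wazevoapi.PtrFromUintptr: a typed wrapper around the
// uintptr -> unsafe.Pointer conversion. `go vet` flags this conversion,
// which is one reason to keep it in a single audited helper.
func ptrFromUintptr[T any](u uintptr) *T {
	return (*T)(unsafe.Pointer(u))
}

type instruction struct{ u1 uint64 }

func (i *instruction) setAmode(a *addressMode) {
	i.u1 = uint64(uintptr(unsafe.Pointer(a)))
}

func (i *instruction) getAmode() *addressMode {
	return ptrFromUintptr[addressMode](uintptr(i.u1))
}

func main() {
	pool := []addressMode{{imm: 16}} // stand-in for amodePool keeping the value alive
	var ins instruction
	ins.setAmode(&pool[0])
	fmt.Println(ins.getAmode().imm) // 16
}
```

Dropping the dedicated amode/ra/u3 fields is the point of the surrounding instruction-struct changes: ra now travels as `uint64(ra)` in u2, and the former u3 flags are packed into the high 32 bits of u1 or u2 (the `|= 1 << 32` writes above), shrinking every instruction.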
func (i *instruction) condBrOffsetResolved() bool { - return i.rd.data2 == 1 + return i.rn.data2 == 1 } func (i *instruction) condBrOffset() int64 { - return int64(i.rd.data) + return int64(i.rn.data) } func (i *instruction) condBrCond() cond { @@ -746,33 +762,33 @@ func (i *instruction) condBrCond() cond { } func (i *instruction) condBr64bit() bool { - return i.u3 == 1 + return i.u2&(1<<32) != 0 } func (i *instruction) asLoadFpuConst32(rd regalloc.VReg, raw uint64) { i.kind = loadFpuConst32 i.u1 = raw - i.rd = operandNR(rd) + i.rd = rd } func (i *instruction) asLoadFpuConst64(rd regalloc.VReg, raw uint64) { i.kind = loadFpuConst64 i.u1 = raw - i.rd = operandNR(rd) + i.rd = rd } func (i *instruction) asLoadFpuConst128(rd regalloc.VReg, lo, hi uint64) { i.kind = loadFpuConst128 i.u1 = lo i.u2 = hi - i.rd = operandNR(rd) + i.rd = rd } func (i *instruction) asFpuCmp(rn, rm operand, is64bit bool) { i.kind = fpuCmp i.rn, i.rm = rn, rm if is64bit { - i.u3 = 1 + i.u1 = 1 } } @@ -783,12 +799,12 @@ func (i *instruction) asCCmpImm(rn operand, imm uint64, c condFlag, flag byte, i i.u1 = uint64(c) i.u2 = uint64(flag) if is64bit { - i.u3 = 1 + i.u2 |= 1 << 32 } } // asALU setups a basic ALU instruction. -func (i *instruction) asALU(aluOp aluOp, rd, rn, rm operand, dst64bit bool) { +func (i *instruction) asALU(aluOp aluOp, rd regalloc.VReg, rn, rm operand, dst64bit bool) { switch rm.kind { case operandKindNR: i.kind = aluRRR @@ -804,22 +820,22 @@ func (i *instruction) asALU(aluOp aluOp, rd, rn, rm operand, dst64bit bool) { i.u1 = uint64(aluOp) i.rd, i.rn, i.rm = rd, rn, rm if dst64bit { - i.u3 = 1 + i.u2 |= 1 << 32 } } // asALU setups a basic ALU instruction. -func (i *instruction) asALURRRR(aluOp aluOp, rd, rn, rm, ra operand, dst64bit bool) { +func (i *instruction) asALURRRR(aluOp aluOp, rd regalloc.VReg, rn, rm operand, ra regalloc.VReg, dst64bit bool) { i.kind = aluRRRR i.u1 = uint64(aluOp) - i.rd, i.rn, i.rm, i.ra = rd, rn, rm, ra + i.rd, i.rn, i.rm, i.u2 = rd, rn, rm, uint64(ra) if dst64bit { - i.u3 = 1 + i.u1 |= 1 << 32 } } // asALUShift setups a shift based ALU instruction. -func (i *instruction) asALUShift(aluOp aluOp, rd, rn, rm operand, dst64bit bool) { +func (i *instruction) asALUShift(aluOp aluOp, rd regalloc.VReg, rn, rm operand, dst64bit bool) { switch rm.kind { case operandKindNR: i.kind = aluRRR // If the shift amount op is a register, then the instruction is encoded as a normal ALU instruction with two register operands. 
@@ -831,17 +847,17 @@ func (i *instruction) asALUShift(aluOp aluOp, rd, rn, rm operand, dst64bit bool) i.u1 = uint64(aluOp) i.rd, i.rn, i.rm = rd, rn, rm if dst64bit { - i.u3 = 1 + i.u2 |= 1 << 32 } } func (i *instruction) asALUBitmaskImm(aluOp aluOp, rd, rn regalloc.VReg, imm uint64, dst64bit bool) { i.kind = aluRRBitmaskImm i.u1 = uint64(aluOp) - i.rn, i.rd = operandNR(rn), operandNR(rd) + i.rn, i.rd = operandNR(rn), rd i.u2 = imm if dst64bit { - i.u3 = 1 + i.u1 |= 1 << 32 } } @@ -852,76 +868,76 @@ func (i *instruction) asMovToFPSR(rn regalloc.VReg) { func (i *instruction) asMovFromFPSR(rd regalloc.VReg) { i.kind = movFromFPSR - i.rd = operandNR(rd) + i.rd = rd } func (i *instruction) asBitRR(bitOp bitOp, rd, rn regalloc.VReg, is64bit bool) { i.kind = bitRR - i.rn, i.rd = operandNR(rn), operandNR(rd) + i.rn, i.rd = operandNR(rn), rd i.u1 = uint64(bitOp) if is64bit { i.u2 = 1 } } -func (i *instruction) asFpuRRR(op fpuBinOp, rd, rn, rm operand, dst64bit bool) { +func (i *instruction) asFpuRRR(op fpuBinOp, rd regalloc.VReg, rn, rm operand, dst64bit bool) { i.kind = fpuRRR i.u1 = uint64(op) i.rd, i.rn, i.rm = rd, rn, rm if dst64bit { - i.u3 = 1 + i.u2 = 1 } } -func (i *instruction) asFpuRR(op fpuUniOp, rd, rn operand, dst64bit bool) { +func (i *instruction) asFpuRR(op fpuUniOp, rd regalloc.VReg, rn operand, dst64bit bool) { i.kind = fpuRR i.u1 = uint64(op) i.rd, i.rn = rd, rn if dst64bit { - i.u3 = 1 + i.u2 = 1 } } func (i *instruction) asExtend(rd, rn regalloc.VReg, fromBits, toBits byte, signed bool) { i.kind = extend - i.rn, i.rd = operandNR(rn), operandNR(rd) + i.rn, i.rd = operandNR(rn), rd i.u1 = uint64(fromBits) i.u2 = uint64(toBits) if signed { - i.u3 = 1 + i.u2 |= 1 << 32 } } func (i *instruction) asMove32(rd, rn regalloc.VReg) { i.kind = mov32 - i.rn, i.rd = operandNR(rn), operandNR(rd) + i.rn, i.rd = operandNR(rn), rd } func (i *instruction) asMove64(rd, rn regalloc.VReg) *instruction { i.kind = mov64 - i.rn, i.rd = operandNR(rn), operandNR(rd) + i.rn, i.rd = operandNR(rn), rd return i } func (i *instruction) asFpuMov64(rd, rn regalloc.VReg) { i.kind = fpuMov64 - i.rn, i.rd = operandNR(rn), operandNR(rd) + i.rn, i.rd = operandNR(rn), rd } func (i *instruction) asFpuMov128(rd, rn regalloc.VReg) *instruction { i.kind = fpuMov128 - i.rn, i.rd = operandNR(rn), operandNR(rd) + i.rn, i.rd = operandNR(rn), rd return i } -func (i *instruction) asMovToVec(rd, rn operand, arr vecArrangement, index vecIndex) { +func (i *instruction) asMovToVec(rd regalloc.VReg, rn operand, arr vecArrangement, index vecIndex) { i.kind = movToVec i.rd = rd i.rn = rn i.u1, i.u2 = uint64(arr), uint64(index) } -func (i *instruction) asMovFromVec(rd, rn operand, arr vecArrangement, index vecIndex, signed bool) { +func (i *instruction) asMovFromVec(rd regalloc.VReg, rn operand, arr vecArrangement, index vecIndex, signed bool) { if signed { i.kind = movFromVecSigned } else { @@ -932,48 +948,48 @@ func (i *instruction) asMovFromVec(rd, rn operand, arr vecArrangement, index vec i.u1, i.u2 = uint64(arr), uint64(index) } -func (i *instruction) asVecDup(rd, rn operand, arr vecArrangement) { +func (i *instruction) asVecDup(rd regalloc.VReg, rn operand, arr vecArrangement) { i.kind = vecDup i.u1 = uint64(arr) i.rn, i.rd = rn, rd } -func (i *instruction) asVecDupElement(rd, rn operand, arr vecArrangement, index vecIndex) { +func (i *instruction) asVecDupElement(rd regalloc.VReg, rn operand, arr vecArrangement, index vecIndex) { i.kind = vecDupElement i.u1 = uint64(arr) i.rn, i.rd = rn, rd i.u2 = uint64(index) } -func (i 
*instruction) asVecExtract(rd, rn, rm operand, arr vecArrangement, index uint32) { +func (i *instruction) asVecExtract(rd regalloc.VReg, rn, rm operand, arr vecArrangement, index uint32) { i.kind = vecExtract i.u1 = uint64(arr) i.rn, i.rm, i.rd = rn, rm, rd i.u2 = uint64(index) } -func (i *instruction) asVecMovElement(rd, rn operand, arr vecArrangement, rdIndex, rnIndex vecIndex) { +func (i *instruction) asVecMovElement(rd regalloc.VReg, rn operand, arr vecArrangement, rdIndex, rnIndex vecIndex) { i.kind = vecMovElement i.u1 = uint64(arr) - i.u2, i.u3 = uint64(rdIndex), uint64(rnIndex) + i.u2 = uint64(rdIndex) | uint64(rnIndex)<<32 i.rn, i.rd = rn, rd } -func (i *instruction) asVecMisc(op vecOp, rd, rn operand, arr vecArrangement) { +func (i *instruction) asVecMisc(op vecOp, rd regalloc.VReg, rn operand, arr vecArrangement) { i.kind = vecMisc i.u1 = uint64(op) i.rn, i.rd = rn, rd i.u2 = uint64(arr) } -func (i *instruction) asVecLanes(op vecOp, rd, rn operand, arr vecArrangement) { +func (i *instruction) asVecLanes(op vecOp, rd regalloc.VReg, rn operand, arr vecArrangement) { i.kind = vecLanes i.u1 = uint64(op) i.rn, i.rd = rn, rd i.u2 = uint64(arr) } -func (i *instruction) asVecShiftImm(op vecOp, rd, rn, rm operand, arr vecArrangement) *instruction { +func (i *instruction) asVecShiftImm(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) *instruction { i.kind = vecShiftImm i.u1 = uint64(op) i.rn, i.rm, i.rd = rn, rm, rd @@ -981,7 +997,7 @@ func (i *instruction) asVecShiftImm(op vecOp, rd, rn, rm operand, arr vecArrange return i } -func (i *instruction) asVecTbl(nregs byte, rd, rn, rm operand, arr vecArrangement) { +func (i *instruction) asVecTbl(nregs byte, rd regalloc.VReg, rn, rm operand, arr vecArrangement) { switch nregs { case 0, 1: i.kind = vecTbl @@ -1000,14 +1016,14 @@ func (i *instruction) asVecTbl(nregs byte, rd, rn, rm operand, arr vecArrangemen i.u2 = uint64(arr) } -func (i *instruction) asVecPermute(op vecOp, rd, rn, rm operand, arr vecArrangement) { +func (i *instruction) asVecPermute(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) { i.kind = vecPermute i.u1 = uint64(op) i.rn, i.rm, i.rd = rn, rm, rd i.u2 = uint64(arr) } -func (i *instruction) asVecRRR(op vecOp, rd, rn, rm operand, arr vecArrangement) *instruction { +func (i *instruction) asVecRRR(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) *instruction { i.kind = vecRRR i.u1 = uint64(op) i.rn, i.rd, i.rm = rn, rd, rm @@ -1017,7 +1033,7 @@ func (i *instruction) asVecRRR(op vecOp, rd, rn, rm operand, arr vecArrangement) // asVecRRRRewrite encodes a vector instruction that rewrites the destination register. // IMPORTANT: the destination register must be already defined before this instruction. -func (i *instruction) asVecRRRRewrite(op vecOp, rd, rn, rm operand, arr vecArrangement) { +func (i *instruction) asVecRRRRewrite(op vecOp, rd regalloc.VReg, rn, rm operand, arr vecArrangement) { i.kind = vecRRRRewrite i.u1 = uint64(op) i.rn, i.rd, i.rm = rn, rd, rm @@ -1033,8 +1049,8 @@ func (i *instruction) IsCopy() bool { // String implements fmt.Stringer. 
func (i *instruction) String() (str string) { - is64SizeBitToSize := func(u3 uint64) byte { - if u3 == 0 { + is64SizeBitToSize := func(v uint64) byte { + if v == 0 { return 32 } return 64 @@ -1049,46 +1065,46 @@ func (i *instruction) String() (str string) { str = "nop0" } case aluRRR: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2 >> 32) str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), - formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), i.rm.format(size)) case aluRRRR: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u1 >> 32) str = fmt.Sprintf("%s %s, %s, %s, %s", aluOp(i.u1).String(), - formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.ra.nr(), size)) + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(regalloc.VReg(i.u2), size)) case aluRRImm12: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2 >> 32) str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), - formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), i.rm.format(size)) + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), i.rm.format(size)) case aluRRBitmaskImm: - size := is64SizeBitToSize(i.u3) - rd, rn := formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size) + size := is64SizeBitToSize(i.u1 >> 32) + rd, rn := formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size) if size == 32 { str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, uint32(i.u2)) } else { str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, i.u2) } case aluRRImmShift: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2 >> 32) str = fmt.Sprintf("%s %s, %s, %#x", aluOp(i.u1).String(), - formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), i.rm.shiftImm(), ) case aluRRRShift: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2 >> 32) str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), - formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), i.rm.format(size), ) case aluRRRExtend: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2 >> 32) str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(), - formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), // Regardless of the source size, the register is formatted in 32-bit. 
i.rm.format(32), @@ -1097,57 +1113,57 @@ func (i *instruction) String() (str string) { size := is64SizeBitToSize(i.u2) str = fmt.Sprintf("%s %s, %s", bitOp(i.u1), - formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), ) case uLoad8: - str = fmt.Sprintf("ldrb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + str = fmt.Sprintf("ldrb %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) case sLoad8: - str = fmt.Sprintf("ldrsb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + str = fmt.Sprintf("ldrsb %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) case uLoad16: - str = fmt.Sprintf("ldrh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + str = fmt.Sprintf("ldrh %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) case sLoad16: - str = fmt.Sprintf("ldrsh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + str = fmt.Sprintf("ldrsh %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) case uLoad32: - str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) case sLoad32: - str = fmt.Sprintf("ldrs %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + str = fmt.Sprintf("ldrs %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) case uLoad64: - str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64)) + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 64), i.getAmode().format(64)) case store8: - str = fmt.Sprintf("strb %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(8)) + str = fmt.Sprintf("strb %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(8)) case store16: - str = fmt.Sprintf("strh %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(16)) + str = fmt.Sprintf("strh %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(16)) case store32: - str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(32)) + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(32)) case store64: - str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64)) + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.getAmode().format(64)) case storeP64: str = fmt.Sprintf("stp %s, %s, %s", - formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64)) + formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.getAmode().format(64)) case loadP64: str = fmt.Sprintf("ldp %s, %s, %s", - formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64)) + formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.getAmode().format(64)) case mov64: str = fmt.Sprintf("mov %s, %s", - formatVRegSized(i.rd.nr(), 64), + formatVRegSized(i.rd, 64), formatVRegSized(i.rn.nr(), 64)) case mov32: - str = fmt.Sprintf("mov %s, %s", formatVRegSized(i.rd.nr(), 32), formatVRegSized(i.rn.nr(), 32)) + str = fmt.Sprintf("mov %s, %s", formatVRegSized(i.rd, 32), formatVRegSized(i.rn.nr(), 32)) case movZ: - size := is64SizeBitToSize(i.u3) - str = fmt.Sprintf("movz %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16) + size := is64SizeBitToSize(i.u2 >> 32) + str = fmt.Sprintf("movz %s, #%#x, lsl %d", formatVRegSized(i.rd, size), uint16(i.u1), uint32(i.u2)*16) case movN: - size := is64SizeBitToSize(i.u3) - str = fmt.Sprintf("movn %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), 
size), uint16(i.u1), i.u2*16) + size := is64SizeBitToSize(i.u2 >> 32) + str = fmt.Sprintf("movn %s, #%#x, lsl %d", formatVRegSized(i.rd, size), uint16(i.u1), uint32(i.u2)*16) case movK: - size := is64SizeBitToSize(i.u3) - str = fmt.Sprintf("movk %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16) + size := is64SizeBitToSize(i.u2 >> 32) + str = fmt.Sprintf("movk %s, #%#x, lsl %d", formatVRegSized(i.rd, size), uint16(i.u1), uint32(i.u2)*16) case extend: fromBits, toBits := byte(i.u1), byte(i.u2) var signedStr string - if i.u3 == 1 { + if i.u2>>32 == 1 { signedStr = "s" } else { signedStr = "u" @@ -1161,39 +1177,39 @@ func (i *instruction) String() (str string) { case 32: fromStr = "w" } - str = fmt.Sprintf("%sxt%s %s, %s", signedStr, fromStr, formatVRegSized(i.rd.nr(), toBits), formatVRegSized(i.rn.nr(), 32)) + str = fmt.Sprintf("%sxt%s %s, %s", signedStr, fromStr, formatVRegSized(i.rd, toBits), formatVRegSized(i.rn.nr(), 32)) case cSel: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2) str = fmt.Sprintf("csel %s, %s, %s, %s", - formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), condFlag(i.u1), ) case cSet: if i.u2 != 0 { - str = fmt.Sprintf("csetm %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1)) + str = fmt.Sprintf("csetm %s, %s", formatVRegSized(i.rd, 64), condFlag(i.u1)) } else { - str = fmt.Sprintf("cset %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1)) + str = fmt.Sprintf("cset %s, %s", formatVRegSized(i.rd, 64), condFlag(i.u1)) } case cCmpImm: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2 >> 32) str = fmt.Sprintf("ccmp %s, #%#x, #%#x, %s", formatVRegSized(i.rn.nr(), size), i.rm.data, i.u2&0b1111, condFlag(i.u1)) case fpuMov64: str = fmt.Sprintf("mov %s, %s", - formatVRegVec(i.rd.nr(), vecArrangement8B, vecIndexNone), + formatVRegVec(i.rd, vecArrangement8B, vecIndexNone), formatVRegVec(i.rn.nr(), vecArrangement8B, vecIndexNone)) case fpuMov128: str = fmt.Sprintf("mov %s, %s", - formatVRegVec(i.rd.nr(), vecArrangement16B, vecIndexNone), + formatVRegVec(i.rd, vecArrangement16B, vecIndexNone), formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone)) case fpuMovFromVec: panic("TODO") case fpuRR: - dstSz := is64SizeBitToSize(i.u3) + dstSz := is64SizeBitToSize(i.u2) srcSz := dstSz op := fpuUniOp(i.u1) switch op { @@ -1203,38 +1219,38 @@ func (i *instruction) String() (str string) { srcSz = 64 } str = fmt.Sprintf("%s %s, %s", op.String(), - formatVRegSized(i.rd.nr(), dstSz), formatVRegSized(i.rn.nr(), srcSz)) + formatVRegSized(i.rd, dstSz), formatVRegSized(i.rn.nr(), srcSz)) case fpuRRR: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2) str = fmt.Sprintf("%s %s, %s, %s", fpuBinOp(i.u1).String(), - formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size)) + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size)) case fpuRRI: panic("TODO") case fpuRRRR: panic("TODO") case fpuCmp: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u1) str = fmt.Sprintf("fcmp %s, %s", formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size)) case fpuLoad32: - str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32)) + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 32), i.getAmode().format(32)) case fpuStore32: - str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(64)) + 
str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.getAmode().format(64)) case fpuLoad64: - str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64)) + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 64), i.getAmode().format(64)) case fpuStore64: - str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64)) + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.getAmode().format(64)) case fpuLoad128: - str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 128), i.amode.format(64)) + str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd, 128), i.getAmode().format(64)) case fpuStore128: - str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 128), i.amode.format(64)) + str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 128), i.getAmode().format(64)) case loadFpuConst32: - str = fmt.Sprintf("ldr %s, #8; b 8; data.f32 %f", formatVRegSized(i.rd.nr(), 32), math.Float32frombits(uint32(i.u1))) + str = fmt.Sprintf("ldr %s, #8; b 8; data.f32 %f", formatVRegSized(i.rd, 32), math.Float32frombits(uint32(i.u1))) case loadFpuConst64: - str = fmt.Sprintf("ldr %s, #8; b 16; data.f64 %f", formatVRegSized(i.rd.nr(), 64), math.Float64frombits(i.u1)) + str = fmt.Sprintf("ldr %s, #8; b 16; data.f64 %f", formatVRegSized(i.rd, 64), math.Float64frombits(i.u1)) case loadFpuConst128: str = fmt.Sprintf("ldr %s, #8; b 32; data.v128 %016x %016x", - formatVRegSized(i.rd.nr(), 128), i.u1, i.u2) + formatVRegSized(i.rd, 128), i.u1, i.u2) case fpuToInt: var op, src, dst string if signed := i.u1 == 1; signed { @@ -1242,15 +1258,15 @@ func (i *instruction) String() (str string) { } else { op = "fcvtzu" } - if src64 := i.u2 == 1; src64 { + if src64 := i.u2&1 != 0; src64 { src = formatVRegWidthVec(i.rn.nr(), vecArrangementD) } else { src = formatVRegWidthVec(i.rn.nr(), vecArrangementS) } - if dst64 := i.u3 == 1; dst64 { - dst = formatVRegSized(i.rd.nr(), 64) + if dst64 := i.u2&2 != 0; dst64 { + dst = formatVRegSized(i.rd, 64) } else { - dst = formatVRegSized(i.rd.nr(), 32) + dst = formatVRegSized(i.rd, 32) } str = fmt.Sprintf("%s %s, %s", op, dst, src) @@ -1261,21 +1277,21 @@ func (i *instruction) String() (str string) { } else { op = "ucvtf" } - if src64 := i.u2 == 1; src64 { + if src64 := i.u2&1 != 0; src64 { src = formatVRegSized(i.rn.nr(), 64) } else { src = formatVRegSized(i.rn.nr(), 32) } - if dst64 := i.u3 == 1; dst64 { - dst = formatVRegWidthVec(i.rd.nr(), vecArrangementD) + if dst64 := i.u2&2 != 0; dst64 { + dst = formatVRegWidthVec(i.rd, vecArrangementD) } else { - dst = formatVRegWidthVec(i.rd.nr(), vecArrangementS) + dst = formatVRegWidthVec(i.rd, vecArrangementS) } str = fmt.Sprintf("%s %s, %s", op, dst, src) case fpuCSel: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2) str = fmt.Sprintf("fcsel %s, %s, %s, %s", - formatVRegSized(i.rd.nr(), size), + formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), condFlag(i.u1), @@ -1291,7 +1307,7 @@ func (i *instruction) String() (str string) { default: panic("unsupported arrangement " + arr.String()) } - str = fmt.Sprintf("ins %s, %s", formatVRegVec(i.rd.nr(), arr, vecIndex(i.u2)), formatVRegSized(i.rn.nr(), size)) + str = fmt.Sprintf("ins %s, %s", formatVRegVec(i.rd, arr, vecIndex(i.u2)), formatVRegSized(i.rn.nr(), size)) case movFromVec, movFromVecSigned: var size byte var opcode string @@ -1315,23 +1331,23 @@ func (i *instruction) String() (str string) { default: panic("unsupported arrangement " + arr.String()) 
} - str = fmt.Sprintf("%s %s, %s", opcode, formatVRegSized(i.rd.nr(), size), formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2))) + str = fmt.Sprintf("%s %s, %s", opcode, formatVRegSized(i.rd, size), formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2))) case vecDup: str = fmt.Sprintf("dup %s, %s", - formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), + formatVRegVec(i.rd, vecArrangement(i.u1), vecIndexNone), formatVRegSized(i.rn.nr(), 64), ) case vecDupElement: arr := vecArrangement(i.u1) str = fmt.Sprintf("dup %s, %s", - formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rd, arr, vecIndexNone), formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2)), ) case vecDupFromFpu: panic("TODO") case vecExtract: str = fmt.Sprintf("ext %s, %s, %s, #%d", - formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), + formatVRegVec(i.rd, vecArrangement(i.u1), vecIndexNone), formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndexNone), formatVRegVec(i.rm.nr(), vecArrangement(i.u1), vecIndexNone), uint32(i.u2), @@ -1340,15 +1356,15 @@ func (i *instruction) String() (str string) { panic("TODO") case vecMovElement: str = fmt.Sprintf("mov %s, %s", - formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndex(i.u2)), - formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndex(i.u3)), + formatVRegVec(i.rd, vecArrangement(i.u1), vecIndex(i.u2&0xffffffff)), + formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndex(i.u2>>32)), ) case vecMiscNarrow: panic("TODO") case vecRRR, vecRRRRewrite: str = fmt.Sprintf("%s %s, %s, %s", vecOp(i.u1), - formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone), + formatVRegVec(i.rd, vecArrangement(i.u2), vecIndexNone), formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone), formatVRegVec(i.rm.nr(), vecArrangement(i.u2), vecIndexNone), ) @@ -1356,12 +1372,12 @@ func (i *instruction) String() (str string) { vop := vecOp(i.u1) if vop == vecOpCmeq0 { str = fmt.Sprintf("cmeq %s, %s, #0", - formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone), + formatVRegVec(i.rd, vecArrangement(i.u2), vecIndexNone), formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone)) } else { str = fmt.Sprintf("%s %s, %s", vop, - formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone), + formatVRegVec(i.rd, vecArrangement(i.u2), vecIndexNone), formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone)) } case vecLanes: @@ -1379,24 +1395,24 @@ func (i *instruction) String() (str string) { } str = fmt.Sprintf("%s %s, %s", vecOp(i.u1), - formatVRegWidthVec(i.rd.nr(), destArr), + formatVRegWidthVec(i.rd, destArr), formatVRegVec(i.rn.nr(), arr, vecIndexNone)) case vecShiftImm: arr := vecArrangement(i.u2) str = fmt.Sprintf("%s %s, %s, #%d", vecOp(i.u1), - formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rd, arr, vecIndexNone), formatVRegVec(i.rn.nr(), arr, vecIndexNone), i.rm.shiftImm()) case vecTbl: arr := vecArrangement(i.u2) str = fmt.Sprintf("tbl %s, { %s }, %s", - formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rd, arr, vecIndexNone), formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone), formatVRegVec(i.rm.nr(), arr, vecIndexNone)) case vecTbl2: arr := vecArrangement(i.u2) - rd, rn, rm := i.rd.nr(), i.rn.nr(), i.rm.nr() + rd, rn, rm := i.rd, i.rn.nr(), i.rm.nr() rn1 := regalloc.FromRealReg(rn.RealReg()+1, rn.RegType()) str = fmt.Sprintf("tbl %s, { %s, %s }, %s", formatVRegVec(rd, arr, vecIndexNone), @@ -1407,13 +1423,13 @@ func (i *instruction) String() (str string) { arr := vecArrangement(i.u2) str = fmt.Sprintf("%s %s, %s, %s", vecOp(i.u1), - 
formatVRegVec(i.rd.nr(), arr, vecIndexNone), + formatVRegVec(i.rd, arr, vecIndexNone), formatVRegVec(i.rn.nr(), arr, vecIndexNone), formatVRegVec(i.rm.nr(), arr, vecIndexNone)) case movToFPSR: str = fmt.Sprintf("msr fpsr, %s", formatVRegSized(i.rn.nr(), 64)) case movFromFPSR: - str = fmt.Sprintf("mrs %s fpsr", formatVRegSized(i.rd.nr(), 64)) + str = fmt.Sprintf("mrs %s fpsr", formatVRegSized(i.rd, 64)) case call: str = fmt.Sprintf("bl %s", ssa.FuncRef(i.u1)) case callInd: @@ -1422,15 +1438,15 @@ func (i *instruction) String() (str string) { str = "ret" case br: target := label(i.u1) - if i.u3 != 0 { + if i.rm.data != 0 { str = fmt.Sprintf("b #%#x (%s)", i.brOffset(), target.String()) } else { str = fmt.Sprintf("b %s", target.String()) } case condBr: - size := is64SizeBitToSize(i.u3) + size := is64SizeBitToSize(i.u2 >> 32) c := cond(i.u1) - target := label(i.u2) + target := label(i.u2 & 0xffffffff) switch c.kind() { case condKindRegisterZero: if !i.condBrOffsetResolved() { @@ -1456,7 +1472,7 @@ func (i *instruction) String() (str string) { } } case adr: - str = fmt.Sprintf("adr %s, #%#x", formatVRegSized(i.rd.nr(), 64), int64(i.u1)) + str = fmt.Sprintf("adr %s, #%#x", formatVRegSized(i.rd, 64), int64(i.u1)) case brTableSequence: targetIndex := i.u1 str = fmt.Sprintf("br_table_sequence %s, table_index=%d", formatVRegSized(i.rn.nr(), 64), targetIndex) @@ -1473,7 +1489,7 @@ func (i *instruction) String() (str string) { case 1: m = m + "b" } - str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), 64)) + str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), 64)) case atomicCas: m := "casal" size := byte(32) @@ -1485,7 +1501,7 @@ func (i *instruction) String() (str string) { case 1: m = m + "b" } - str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64)) + str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rd, size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64)) case atomicLoad: m := "ldar" size := byte(32) @@ -1497,7 +1513,7 @@ func (i *instruction) String() (str string) { case 1: m = m + "b" } - str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), 64)) + str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rd, size), formatVRegSized(i.rn.nr(), 64)) case atomicStore: m := "stlr" size := byte(32) @@ -1517,9 +1533,9 @@ func (i *instruction) String() (str string) { case emitSourceOffsetInfo: str = fmt.Sprintf("source_offset_info %d", ssa.SourceOffset(i.u1)) case vecLoad1R: - str = fmt.Sprintf("ld1r {%s}, [%s]", formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), formatVRegSized(i.rn.nr(), 64)) + str = fmt.Sprintf("ld1r {%s}, [%s]", formatVRegVec(i.rd, vecArrangement(i.u1), vecIndexNone), formatVRegSized(i.rn.nr(), 64)) case loadConstBlockArg: - str = fmt.Sprintf("load_const_block_arg %s, %#x", formatVRegSized(i.rd.nr(), 64), i.u1) + str = fmt.Sprintf("load_const_block_arg %s, %#x", formatVRegSized(i.rd, 64), i.u1) default: panic(i.kind) } @@ -1528,26 +1544,26 @@ func (i *instruction) String() (str string) { func (i *instruction) asAdr(rd regalloc.VReg, offset int64) { i.kind = adr - i.rd = operandNR(rd) + i.rd = rd i.u1 = uint64(offset) } -func (i *instruction) asAtomicRmw(op atomicRmwOp, rn, rs, rt operand, size uint64) { +func (i *instruction) asAtomicRmw(op atomicRmwOp, rn, rs, rt 
regalloc.VReg, size uint64) { i.kind = atomicRmw - i.rd, i.rn, i.rm = rt, rn, rs + i.rd, i.rn, i.rm = rt, operandNR(rn), operandNR(rs) i.u1 = uint64(op) i.u2 = size } -func (i *instruction) asAtomicCas(rn, rs, rt operand, size uint64) { +func (i *instruction) asAtomicCas(rn, rs, rt regalloc.VReg, size uint64) { i.kind = atomicCas - i.rm, i.rn, i.rd = rt, rn, rs + i.rm, i.rn, i.rd = operandNR(rt), operandNR(rn), rs i.u2 = size } -func (i *instruction) asAtomicLoad(rn, rt operand, size uint64) { +func (i *instruction) asAtomicLoad(rn, rt regalloc.VReg, size uint64) { i.kind = atomicLoad - i.rn, i.rd = rn, rt + i.rn, i.rd = operandNR(rn), rt i.u2 = size } @@ -1755,12 +1771,12 @@ func (i *instruction) asLoadConstBlockArg(v uint64, typ ssa.Type, dst regalloc.V i.kind = loadConstBlockArg i.u1 = v i.u2 = uint64(typ) - i.rd = operandNR(dst) + i.rd = dst return i } func (i *instruction) loadConstBlockArgData() (v uint64, typ ssa.Type, dst regalloc.VReg) { - return i.u1, ssa.Type(i.u2), i.rd.nr() + return i.u1, ssa.Type(i.u2), i.rd } func (i *instruction) asEmitSourceOffsetInfo(l ssa.SourceOffset) *instruction { @@ -1778,7 +1794,7 @@ func (i *instruction) asUDF() *instruction { return i } -func (i *instruction) asFpuToInt(rd, rn operand, rdSigned, src64bit, dst64bit bool) { +func (i *instruction) asFpuToInt(rd regalloc.VReg, rn operand, rdSigned, src64bit, dst64bit bool) { i.kind = fpuToInt i.rn = rn i.rd = rd @@ -1789,11 +1805,11 @@ func (i *instruction) asFpuToInt(rd, rn operand, rdSigned, src64bit, dst64bit bo i.u2 = 1 } if dst64bit { - i.u3 = 1 + i.u2 |= 2 } } -func (i *instruction) asIntToFpu(rd, rn operand, rnSigned, src64bit, dst64bit bool) { +func (i *instruction) asIntToFpu(rd regalloc.VReg, rn operand, rnSigned, src64bit, dst64bit bool) { i.kind = intToFpu i.rn = rn i.rd = rd @@ -1804,7 +1820,7 @@ func (i *instruction) asIntToFpu(rd, rn operand, rnSigned, src64bit, dst64bit bo i.u2 = 1 } if dst64bit { - i.u3 = 1 + i.u2 |= 2 } } @@ -1817,7 +1833,7 @@ func (i *instruction) asExitSequence(ctx regalloc.VReg) *instruction { // aluOp determines the type of ALU operation. Instructions whose kind is one of // aluRRR, aluRRRR, aluRRImm12, aluRRBitmaskImm, aluRRImmShift, aluRRRShift and aluRRRExtend // would use this type. 
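(Illustrative note: a recurring pattern in the hunks above is that the former u3 payload word is folded into spare bits of u1/u2 — asFpuToInt/asIntToFpu now keep src64bit in bit 0 and dst64bit in bit 1 of u2 (decoded later as u2&1 and u2&2), condBr keeps the target label in the low 32 bits of u2 and the 64-bit-size flag in the high half, and destinations become bare regalloc.VReg values instead of operands. The Go sketch below only mirrors that packing convention under hypothetical names; it is not code from this package.)

package main

import "fmt"

// payload mimics the two 64-bit scratch words an instruction keeps once the
// separate u3 field is removed. Field and method names here are hypothetical.
type payload struct {
	u1, u2 uint64
}

// setCnvFlags follows the asFpuToInt/asIntToFpu scheme: src64bit in bit 0 of
// u2, dst64bit in bit 1 (read back in encodeCnvBetweenFloatInt as u2&1, u2&2).
func (p *payload) setCnvFlags(src64bit, dst64bit bool) {
	p.u2 = 0
	if src64bit {
		p.u2 |= 1
	}
	if dst64bit {
		p.u2 |= 2
	}
}

// setCondBr packs the branch target into the low 32 bits of u2 and the
// 64-bit-size flag into the high 32 bits, matching the new condBr case.
func (p *payload) setCondBr(target uint32, is64 bool) {
	p.u2 = uint64(target)
	if is64 {
		p.u2 |= 1 << 32
	}
}

// condBr decodes what setCondBr packed.
func (p *payload) condBr() (target uint32, is64 bool) {
	return uint32(p.u2 & 0xffffffff), p.u2>>32 == 1
}

func main() {
	var p payload
	p.setCondBr(42, true)
	t, is64 := p.condBr()
	fmt.Println(t, is64) // prints: 42 true
	p.setCnvFlags(true, false)
	fmt.Println(p.u2&1 != 0, p.u2&2 != 0) // prints: true false
}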
-type aluOp int +type aluOp uint32 func (a aluOp) String() string { switch a { diff --git a/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go b/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go index 227a964741..f0ede2d6aa 100644 --- a/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go +++ b/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go @@ -44,12 +44,12 @@ func (i *instruction) encode(m *machine) { case callInd: c.Emit4Bytes(encodeUnconditionalBranchReg(regNumberInEncoding[i.rn.realReg()], true)) case store8, store16, store32, store64, fpuStore32, fpuStore64, fpuStore128: - c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rn.realReg()], i.amode)) + c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rn.realReg()], *i.getAmode())) case uLoad8, uLoad16, uLoad32, uLoad64, sLoad8, sLoad16, sLoad32, fpuLoad32, fpuLoad64, fpuLoad128: - c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rd.realReg()], i.amode)) + c.Emit4Bytes(encodeLoadOrStore(i.kind, regNumberInEncoding[i.rd.RealReg()], *i.getAmode())) case vecLoad1R: c.Emit4Bytes(encodeVecLoad1R( - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], vecArrangement(i.u1))) case condBr: @@ -75,22 +75,22 @@ func (i *instruction) encode(m *machine) { panic("BUG") } case movN: - c.Emit4Bytes(encodeMoveWideImmediate(0b00, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3)) + c.Emit4Bytes(encodeMoveWideImmediate(0b00, regNumberInEncoding[i.rd.RealReg()], i.u1, uint32(i.u2), uint32(i.u2>>32))) case movZ: - c.Emit4Bytes(encodeMoveWideImmediate(0b10, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3)) + c.Emit4Bytes(encodeMoveWideImmediate(0b10, regNumberInEncoding[i.rd.RealReg()], i.u1, uint32(i.u2), uint32(i.u2>>32))) case movK: - c.Emit4Bytes(encodeMoveWideImmediate(0b11, regNumberInEncoding[i.rd.realReg()], i.u1, i.u2, i.u3)) + c.Emit4Bytes(encodeMoveWideImmediate(0b11, regNumberInEncoding[i.rd.RealReg()], i.u1, uint32(i.u2), uint32(i.u2>>32))) case mov32: - to, from := i.rd.realReg(), i.rn.realReg() + to, from := i.rd.RealReg(), i.rn.realReg() c.Emit4Bytes(encodeAsMov32(regNumberInEncoding[from], regNumberInEncoding[to])) case mov64: - to, from := i.rd.realReg(), i.rn.realReg() + to, from := i.rd.RealReg(), i.rn.realReg() toIsSp := to == sp fromIsSp := from == sp c.Emit4Bytes(encodeMov64(regNumberInEncoding[to], regNumberInEncoding[from], toIsSp, fromIsSp)) case loadP64, storeP64: rt, rt2 := regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()] - amode := i.amode + amode := i.getAmode() rn := regNumberInEncoding[amode.rn.RealReg()] var pre bool switch amode.kind { @@ -102,21 +102,21 @@ func (i *instruction) encode(m *machine) { } c.Emit4Bytes(encodePreOrPostIndexLoadStorePair64(pre, kind == loadP64, rn, rt, rt2, amode.imm)) case loadFpuConst32: - rd := regNumberInEncoding[i.rd.realReg()] + rd := regNumberInEncoding[i.rd.RealReg()] if i.u1 == 0 { c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement8B)) } else { encodeLoadFpuConst32(c, rd, i.u1) } case loadFpuConst64: - rd := regNumberInEncoding[i.rd.realReg()] + rd := regNumberInEncoding[i.rd.RealReg()] if i.u1 == 0 { c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement8B)) } else { - encodeLoadFpuConst64(c, regNumberInEncoding[i.rd.realReg()], i.u1) + encodeLoadFpuConst64(c, regNumberInEncoding[i.rd.RealReg()], i.u1) } case loadFpuConst128: - rd := regNumberInEncoding[i.rd.realReg()] + rd := 
regNumberInEncoding[i.rd.RealReg()] lo, hi := i.u1, i.u2 if lo == 0 && hi == 0 { c.Emit4Bytes(encodeVecRRR(vecOpEOR, rd, rd, rd, vecArrangement16B)) @@ -126,35 +126,35 @@ func (i *instruction) encode(m *machine) { case aluRRRR: c.Emit4Bytes(encodeAluRRRR( aluOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()], - regNumberInEncoding[i.ra.realReg()], - uint32(i.u3), + regNumberInEncoding[regalloc.VReg(i.u2).RealReg()], + uint32(i.u1>>32), )) case aluRRImmShift: c.Emit4Bytes(encodeAluRRImm( aluOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], uint32(i.rm.shiftImm()), - uint32(i.u3), + uint32(i.u2>>32), )) case aluRRR: rn := i.rn.realReg() c.Emit4Bytes(encodeAluRRR( aluOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[rn], regNumberInEncoding[i.rm.realReg()], - i.u3 == 1, + i.u2>>32 == 1, rn == sp, )) case aluRRRExtend: rm, exo, to := i.rm.er() c.Emit4Bytes(encodeAluRRRExtend( aluOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[rm.RealReg()], exo, @@ -164,25 +164,25 @@ func (i *instruction) encode(m *machine) { r, amt, sop := i.rm.sr() c.Emit4Bytes(encodeAluRRRShift( aluOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[r.RealReg()], uint32(amt), sop, - i.u3 == 1, + i.u2>>32 == 1, )) case aluRRBitmaskImm: c.Emit4Bytes(encodeAluBitmaskImmediate( aluOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], i.u2, - i.u3 == 1, + i.u1>>32 == 1, )) case bitRR: c.Emit4Bytes(encodeBitRR( bitOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], uint32(i.u2)), ) @@ -190,22 +190,22 @@ func (i *instruction) encode(m *machine) { imm12, shift := i.rm.imm12() c.Emit4Bytes(encodeAluRRImm12( aluOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], imm12, shift, - i.u3 == 1, + i.u2>>32 == 1, )) case fpuRRR: c.Emit4Bytes(encodeFpuRRR( fpuBinOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()], - i.u3 == 1, + i.u2 == 1, )) case fpuMov64, fpuMov128: // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/MOV--vector---Move-vector--an-alias-of-ORR--vector--register-- - rd := regNumberInEncoding[i.rd.realReg()] + rd := regNumberInEncoding[i.rd.RealReg()] rn := regNumberInEncoding[i.rn.realReg()] var q uint32 if kind == fpuMov128 { @@ -213,7 +213,7 @@ func (i *instruction) encode(m *machine) { } c.Emit4Bytes(q<<30 | 0b1110101<<21 | rn<<16 | 0b000111<<10 | rn<<5 | rd) case cSet: - rd := regNumberInEncoding[i.rd.realReg()] + rd := regNumberInEncoding[i.rd.RealReg()] cf := condFlag(i.u1) if i.u2 == 1 { // https://developer.arm.com/documentation/ddi0602/2022-03/Base-Instructions/CSETM--Conditional-Set-Mask--an-alias-of-CSINV- @@ -225,12 +225,12 @@ func (i *instruction) encode(m *machine) { c.Emit4Bytes(0b1001101010011111<<16 | uint32(cf.invert())<<12 | 0b111111<<5 | rd) } case extend: - c.Emit4Bytes(encodeExtend(i.u3 == 1, byte(i.u1), byte(i.u2), 
regNumberInEncoding[i.rd.realReg()], regNumberInEncoding[i.rn.realReg()])) + c.Emit4Bytes(encodeExtend((i.u2>>32) == 1, byte(i.u1), byte(i.u2), regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()])) case fpuCmp: // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/FCMP--Floating-point-quiet-Compare--scalar--?lang=en rn, rm := regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()] var ftype uint32 - if i.u3 == 1 { + if i.u1 == 1 { ftype = 0b01 // double precision. } c.Emit4Bytes(0b1111<<25 | ftype<<22 | 1<<21 | rm<<16 | 0b1<<13 | rn<<5) @@ -242,34 +242,34 @@ func (i *instruction) encode(m *machine) { c.Emit4Bytes(0) } case adr: - c.Emit4Bytes(encodeAdr(regNumberInEncoding[i.rd.realReg()], uint32(i.u1))) + c.Emit4Bytes(encodeAdr(regNumberInEncoding[i.rd.RealReg()], uint32(i.u1))) case cSel: c.Emit4Bytes(encodeConditionalSelect( kind, - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()], condFlag(i.u1), - i.u3 == 1, + i.u2 == 1, )) case fpuCSel: c.Emit4Bytes(encodeFpuCSel( - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()], condFlag(i.u1), - i.u3 == 1, + i.u2 == 1, )) case movToVec: c.Emit4Bytes(encodeMoveToVec( - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], vecArrangement(byte(i.u1)), vecIndex(i.u2), )) case movFromVec, movFromVecSigned: c.Emit4Bytes(encodeMoveFromVec( - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], vecArrangement(byte(i.u1)), vecIndex(i.u2), @@ -277,18 +277,18 @@ func (i *instruction) encode(m *machine) { )) case vecDup: c.Emit4Bytes(encodeVecDup( - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], vecArrangement(byte(i.u1)))) case vecDupElement: c.Emit4Bytes(encodeVecDupElement( - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], vecArrangement(byte(i.u1)), vecIndex(i.u2))) case vecExtract: c.Emit4Bytes(encodeVecExtract( - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()], vecArrangement(byte(i.u1)), @@ -296,35 +296,35 @@ func (i *instruction) encode(m *machine) { case vecPermute: c.Emit4Bytes(encodeVecPermute( vecOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()], vecArrangement(byte(i.u2)))) case vecMovElement: c.Emit4Bytes(encodeVecMovElement( - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], vecArrangement(i.u1), - uint32(i.u2), uint32(i.u3), + uint32(i.u2), uint32(i.u2>>32), )) case vecMisc: c.Emit4Bytes(encodeAdvancedSIMDTwoMisc( vecOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], vecArrangement(i.u2), )) case vecLanes: c.Emit4Bytes(encodeVecLanes( vecOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], vecArrangement(i.u2), )) case vecShiftImm: c.Emit4Bytes(encodeVecShiftImm( vecOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + 
regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], uint32(i.rm.shiftImm()), vecArrangement(i.u2), @@ -332,7 +332,7 @@ func (i *instruction) encode(m *machine) { case vecTbl: c.Emit4Bytes(encodeVecTbl( 1, - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()], vecArrangement(i.u2)), @@ -340,7 +340,7 @@ func (i *instruction) encode(m *machine) { case vecTbl2: c.Emit4Bytes(encodeVecTbl( 2, - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()], vecArrangement(i.u2)), @@ -353,9 +353,9 @@ func (i *instruction) encode(m *machine) { case fpuRR: c.Emit4Bytes(encodeFloatDataOneSource( fpuUniOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], - i.u3 == 1, + i.u2 == 1, )) case vecRRR: if op := vecOp(i.u1); op == vecOpBsl || op == vecOpBit || op == vecOpUmlal { @@ -365,14 +365,14 @@ func (i *instruction) encode(m *machine) { case vecRRRRewrite: c.Emit4Bytes(encodeVecRRR( vecOp(i.u1), - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], regNumberInEncoding[i.rm.realReg()], vecArrangement(i.u2), )) case cCmpImm: // Conditional compare (immediate) in https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en - sf := uint32(i.u3 & 0b1) + sf := uint32((i.u2 >> 32) & 0b1) nzcv := uint32(i.u2 & 0b1111) cond := uint32(condFlag(i.u1)) imm := uint32(i.rm.data & 0b11111) @@ -381,7 +381,7 @@ func (i *instruction) encode(m *machine) { sf<<31 | 0b111101001<<22 | imm<<16 | cond<<12 | 0b1<<11 | rn<<5 | nzcv, ) case movFromFPSR: - rt := regNumberInEncoding[i.rd.realReg()] + rt := regNumberInEncoding[i.rd.RealReg()] c.Emit4Bytes(encodeSystemRegisterMove(rt, true)) case movToFPSR: rt := regNumberInEncoding[i.rn.realReg()] @@ -390,13 +390,13 @@ func (i *instruction) encode(m *machine) { c.Emit4Bytes(encodeAtomicRmw( atomicRmwOp(i.u1), regNumberInEncoding[i.rm.realReg()], - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rn.realReg()], uint32(i.u2), )) case atomicCas: c.Emit4Bytes(encodeAtomicCas( - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], regNumberInEncoding[i.rm.realReg()], regNumberInEncoding[i.rn.realReg()], uint32(i.u2), @@ -404,7 +404,7 @@ func (i *instruction) encode(m *machine) { case atomicLoad: c.Emit4Bytes(encodeAtomicLoadStore( regNumberInEncoding[i.rn.realReg()], - regNumberInEncoding[i.rd.realReg()], + regNumberInEncoding[i.rd.RealReg()], uint32(i.u2), 1, )) @@ -810,7 +810,7 @@ func encodeFloatDataOneSource(op fpuUniOp, rd, rn uint32, dst64bit bool) uint32 // encodeCnvBetweenFloatInt encodes as "Conversion between floating-point and integer" in // https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en func encodeCnvBetweenFloatInt(i *instruction) uint32 { - rd := regNumberInEncoding[i.rd.realReg()] + rd := regNumberInEncoding[i.rd.RealReg()] rn := regNumberInEncoding[i.rn.realReg()] var opcode uint32 @@ -822,8 +822,8 @@ func encodeCnvBetweenFloatInt(i *instruction) uint32 { rmode = 0b00 signed := i.u1 == 1 - src64bit := i.u2 == 1 - dst64bit := i.u3 == 1 + src64bit := i.u2&1 != 0 + dst64bit := i.u2&2 != 0 if signed { opcode = 0b010 } else { @@ 
-841,8 +841,8 @@ func encodeCnvBetweenFloatInt(i *instruction) uint32 { rmode = 0b11 signed := i.u1 == 1 - src64bit := i.u2 == 1 - dst64bit := i.u3 == 1 + src64bit := i.u2&1 != 0 + dst64bit := i.u2&2 != 0 if signed { opcode = 0b000 @@ -1787,13 +1787,13 @@ func encodeCBZCBNZ(rt uint32, nz bool, imm19 uint32, _64bit bool) (ret uint32) { // https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Immediate?lang=en // // "shift" must have been divided by 16 at this point. -func encodeMoveWideImmediate(opc uint32, rd uint32, imm, shift, _64bit uint64) (ret uint32) { +func encodeMoveWideImmediate(opc uint32, rd uint32, imm uint64, shift, _64bit uint32) (ret uint32) { ret = rd ret |= uint32(imm&0xffff) << 5 - ret |= (uint32(shift)) << 21 + ret |= (shift) << 21 ret |= 0b100101 << 23 ret |= opc << 29 - ret |= uint32(_64bit) << 31 + ret |= _64bit << 31 return } diff --git a/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go b/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go index e60a469cdb..d0c171f623 100644 --- a/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go +++ b/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go @@ -17,6 +17,7 @@ func Test_dummy(t *testing.T) { } func TestInstruction_encode(t *testing.T) { + m := NewBackend().(*machine) dummyLabel := label(1) for _, tc := range []struct { setup func(*instruction) @@ -27,1058 +28,1058 @@ func TestInstruction_encode(t *testing.T) { {want: "21443bd5", setup: func(i *instruction) { i.asMovFromFPSR(x1VReg) }}, {want: "2f08417a", setup: func(i *instruction) { i.asCCmpImm(operandNR(x1VReg), 1, eq, 0b1111, false) }}, {want: "201841fa", setup: func(i *instruction) { i.asCCmpImm(operandNR(x1VReg), 1, ne, 0, true) }}, - {want: "410c010e", setup: func(i *instruction) { i.asVecDup(operandNR(v1VReg), operandNR(v2VReg), vecArrangement8B) }}, - {want: "410c014e", setup: func(i *instruction) { i.asVecDup(operandNR(v1VReg), operandNR(v2VReg), vecArrangement16B) }}, - {want: "410c020e", setup: func(i *instruction) { i.asVecDup(operandNR(v1VReg), operandNR(v2VReg), vecArrangement4H) }}, - {want: "410c024e", setup: func(i *instruction) { i.asVecDup(operandNR(v1VReg), operandNR(v2VReg), vecArrangement8H) }}, - {want: "410c040e", setup: func(i *instruction) { i.asVecDup(operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) }}, - {want: "410c044e", setup: func(i *instruction) { i.asVecDup(operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) }}, - {want: "410c084e", setup: func(i *instruction) { i.asVecDup(operandNR(v1VReg), operandNR(v2VReg), vecArrangement2D) }}, - {want: "4104034e", setup: func(i *instruction) { i.asVecDupElement(operandNR(v1VReg), operandNR(v2VReg), vecArrangementB, 1) }}, - {want: "4104064e", setup: func(i *instruction) { i.asVecDupElement(operandNR(v1VReg), operandNR(v2VReg), vecArrangementH, 1) }}, - {want: "41040c4e", setup: func(i *instruction) { i.asVecDupElement(operandNR(v1VReg), operandNR(v2VReg), vecArrangementS, 1) }}, - {want: "4104184e", setup: func(i *instruction) { i.asVecDupElement(operandNR(v1VReg), operandNR(v2VReg), vecArrangementD, 1) }}, + {want: "410c010e", setup: func(i *instruction) { i.asVecDup(v1VReg, operandNR(v2VReg), vecArrangement8B) }}, + {want: "410c014e", setup: func(i *instruction) { i.asVecDup(v1VReg, operandNR(v2VReg), vecArrangement16B) }}, + {want: "410c020e", setup: func(i *instruction) { i.asVecDup(v1VReg, operandNR(v2VReg), vecArrangement4H) }}, + {want: "410c024e", setup: func(i *instruction) { 
i.asVecDup(v1VReg, operandNR(v2VReg), vecArrangement8H) }}, + {want: "410c040e", setup: func(i *instruction) { i.asVecDup(v1VReg, operandNR(v2VReg), vecArrangement2S) }}, + {want: "410c044e", setup: func(i *instruction) { i.asVecDup(v1VReg, operandNR(v2VReg), vecArrangement4S) }}, + {want: "410c084e", setup: func(i *instruction) { i.asVecDup(v1VReg, operandNR(v2VReg), vecArrangement2D) }}, + {want: "4104034e", setup: func(i *instruction) { i.asVecDupElement(v1VReg, operandNR(v2VReg), vecArrangementB, 1) }}, + {want: "4104064e", setup: func(i *instruction) { i.asVecDupElement(v1VReg, operandNR(v2VReg), vecArrangementH, 1) }}, + {want: "41040c4e", setup: func(i *instruction) { i.asVecDupElement(v1VReg, operandNR(v2VReg), vecArrangementS, 1) }}, + {want: "4104184e", setup: func(i *instruction) { i.asVecDupElement(v1VReg, operandNR(v2VReg), vecArrangementD, 1) }}, {want: "4138032e", setup: func(i *instruction) { - i.asVecExtract(operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B, 7) + i.asVecExtract(v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B, 7) }}, {want: "4138036e", setup: func(i *instruction) { - i.asVecExtract(operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B, 7) + i.asVecExtract(v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B, 7) }}, - {want: "410c036e", setup: func(i *instruction) { i.asVecMovElement(operandNR(v1VReg), operandNR(v2VReg), vecArrangementB, 1, 1) }}, - {want: "4114066e", setup: func(i *instruction) { i.asVecMovElement(operandNR(v1VReg), operandNR(v2VReg), vecArrangementH, 1, 1) }}, - {want: "41240c6e", setup: func(i *instruction) { i.asVecMovElement(operandNR(v1VReg), operandNR(v2VReg), vecArrangementS, 1, 1) }}, - {want: "4144186e", setup: func(i *instruction) { i.asVecMovElement(operandNR(v1VReg), operandNR(v2VReg), vecArrangementD, 1, 1) }}, + {want: "410c036e", setup: func(i *instruction) { i.asVecMovElement(v1VReg, operandNR(v2VReg), vecArrangementB, 1, 1) }}, + {want: "4114066e", setup: func(i *instruction) { i.asVecMovElement(v1VReg, operandNR(v2VReg), vecArrangementH, 1, 1) }}, + {want: "41240c6e", setup: func(i *instruction) { i.asVecMovElement(v1VReg, operandNR(v2VReg), vecArrangementS, 1, 1) }}, + {want: "4144186e", setup: func(i *instruction) { i.asVecMovElement(v1VReg, operandNR(v2VReg), vecArrangementD, 1, 1) }}, {want: "4104090f", setup: func(i *instruction) { - i.asVecShiftImm(vecOpSshr, operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(7), vecArrangement8B) + i.asVecShiftImm(vecOpSshr, v1VReg, operandNR(v2VReg), operandShiftImm(7), vecArrangement8B) }}, {want: "4104094f", setup: func(i *instruction) { - i.asVecShiftImm(vecOpSshr, operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(7), vecArrangement16B) + i.asVecShiftImm(vecOpSshr, v1VReg, operandNR(v2VReg), operandShiftImm(7), vecArrangement16B) }}, {want: "4104190f", setup: func(i *instruction) { - i.asVecShiftImm(vecOpSshr, operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(7), vecArrangement4H) + i.asVecShiftImm(vecOpSshr, v1VReg, operandNR(v2VReg), operandShiftImm(7), vecArrangement4H) }}, {want: "4104194f", setup: func(i *instruction) { - i.asVecShiftImm(vecOpSshr, operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(7), vecArrangement8H) + i.asVecShiftImm(vecOpSshr, v1VReg, operandNR(v2VReg), operandShiftImm(7), vecArrangement8H) }}, {want: "4104390f", setup: func(i *instruction) { - i.asVecShiftImm(vecOpSshr, operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(7), vecArrangement2S) + 
i.asVecShiftImm(vecOpSshr, v1VReg, operandNR(v2VReg), operandShiftImm(7), vecArrangement2S) }}, {want: "4104394f", setup: func(i *instruction) { - i.asVecShiftImm(vecOpSshr, operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(7), vecArrangement4S) + i.asVecShiftImm(vecOpSshr, v1VReg, operandNR(v2VReg), operandShiftImm(7), vecArrangement4S) }}, {want: "4104794f", setup: func(i *instruction) { - i.asVecShiftImm(vecOpSshr, operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(7), vecArrangement2D) + i.asVecShiftImm(vecOpSshr, v1VReg, operandNR(v2VReg), operandShiftImm(7), vecArrangement2D) }}, {want: "41a40d0f", setup: func(i *instruction) { - i.asVecShiftImm(vecOpSshll, operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(3), vecArrangement8B) + i.asVecShiftImm(vecOpSshll, v1VReg, operandNR(v2VReg), operandShiftImm(3), vecArrangement8B) }}, {want: "41a40d4f", setup: func(i *instruction) { // sshll2 - i.asVecShiftImm(vecOpSshll, operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(3), vecArrangement16B) + i.asVecShiftImm(vecOpSshll, v1VReg, operandNR(v2VReg), operandShiftImm(3), vecArrangement16B) }}, {want: "41a41d0f", setup: func(i *instruction) { - i.asVecShiftImm(vecOpSshll, operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(3), vecArrangement4H) + i.asVecShiftImm(vecOpSshll, v1VReg, operandNR(v2VReg), operandShiftImm(3), vecArrangement4H) }}, {want: "41a41d4f", setup: func(i *instruction) { // sshll2 - i.asVecShiftImm(vecOpSshll, operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(3), vecArrangement8H) + i.asVecShiftImm(vecOpSshll, v1VReg, operandNR(v2VReg), operandShiftImm(3), vecArrangement8H) }}, {want: "41a43d0f", setup: func(i *instruction) { - i.asVecShiftImm(vecOpSshll, operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(3), vecArrangement2S) + i.asVecShiftImm(vecOpSshll, v1VReg, operandNR(v2VReg), operandShiftImm(3), vecArrangement2S) }}, {want: "41a43d4f", setup: func(i *instruction) { // sshll2 - i.asVecShiftImm(vecOpSshll, operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(3), vecArrangement4S) + i.asVecShiftImm(vecOpSshll, v1VReg, operandNR(v2VReg), operandShiftImm(3), vecArrangement4S) }}, {want: "41a40d2f", setup: func(i *instruction) { - i.asVecShiftImm(vecOpUshll, operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(3), vecArrangement8B) + i.asVecShiftImm(vecOpUshll, v1VReg, operandNR(v2VReg), operandShiftImm(3), vecArrangement8B) }}, {want: "41a40d6f", setup: func(i *instruction) { // ushll2 - i.asVecShiftImm(vecOpUshll, operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(3), vecArrangement16B) + i.asVecShiftImm(vecOpUshll, v1VReg, operandNR(v2VReg), operandShiftImm(3), vecArrangement16B) }}, {want: "41a41d2f", setup: func(i *instruction) { - i.asVecShiftImm(vecOpUshll, operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(3), vecArrangement4H) + i.asVecShiftImm(vecOpUshll, v1VReg, operandNR(v2VReg), operandShiftImm(3), vecArrangement4H) }}, {want: "41a41d6f", setup: func(i *instruction) { // ushll2 - i.asVecShiftImm(vecOpUshll, operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(3), vecArrangement8H) + i.asVecShiftImm(vecOpUshll, v1VReg, operandNR(v2VReg), operandShiftImm(3), vecArrangement8H) }}, {want: "41a43d2f", setup: func(i *instruction) { - i.asVecShiftImm(vecOpUshll, operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(3), vecArrangement2S) + i.asVecShiftImm(vecOpUshll, v1VReg, operandNR(v2VReg), operandShiftImm(3), vecArrangement2S) }}, {want: "41a43d6f", setup: func(i *instruction) { // ushll2 - i.asVecShiftImm(vecOpUshll, 
operandNR(v1VReg), operandNR(v2VReg), operandShiftImm(3), vecArrangement4S) + i.asVecShiftImm(vecOpUshll, v1VReg, operandNR(v2VReg), operandShiftImm(3), vecArrangement4S) }}, {want: "4100030e", setup: func(i *instruction) { - i.asVecTbl(1, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecTbl(1, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "4100034e", setup: func(i *instruction) { - i.asVecTbl(1, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecTbl(1, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "4120040e", setup: func(i *instruction) { - i.asVecTbl(2, operandNR(v1VReg), operandNR(v2VReg), operandNR(v4VReg), vecArrangement8B) + i.asVecTbl(2, v1VReg, operandNR(v2VReg), operandNR(v4VReg), vecArrangement8B) }}, {want: "4120044e", setup: func(i *instruction) { - i.asVecTbl(2, operandNR(v1VReg), operandNR(v2VReg), operandNR(v4VReg), vecArrangement16B) + i.asVecTbl(2, v1VReg, operandNR(v2VReg), operandNR(v4VReg), vecArrangement16B) }}, {want: "4138030e", setup: func(i *instruction) { - i.asVecPermute(vecOpZip1, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecPermute(vecOpZip1, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "4138034e", setup: func(i *instruction) { - i.asVecPermute(vecOpZip1, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecPermute(vecOpZip1, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "4138430e", setup: func(i *instruction) { - i.asVecPermute(vecOpZip1, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecPermute(vecOpZip1, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "4138434e", setup: func(i *instruction) { - i.asVecPermute(vecOpZip1, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecPermute(vecOpZip1, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "4138830e", setup: func(i *instruction) { - i.asVecPermute(vecOpZip1, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecPermute(vecOpZip1, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "4138834e", setup: func(i *instruction) { - i.asVecPermute(vecOpZip1, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecPermute(vecOpZip1, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "4138c34e", setup: func(i *instruction) { - i.asVecPermute(vecOpZip1, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecPermute(vecOpZip1, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "411ca32e", setup: func(i *instruction) { - i.asVecRRRRewrite(vecOpBit, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRRRewrite(vecOpBit, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "411ca36e", setup: func(i *instruction) { - i.asVecRRRRewrite(vecOpBit, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRRRewrite(vecOpBit, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "411c236e", setup: func(i *instruction) { - i.asVecRRR(vecOpEOR, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpEOR, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) 
}}, {want: "411c232e", setup: func(i *instruction) { - i.asVecRRR(vecOpEOR, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpEOR, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "4184234e", setup: func(i *instruction) { - i.asVecRRR(vecOpAdd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpAdd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "4184a34e", setup: func(i *instruction) { - i.asVecRRR(vecOpAdd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpAdd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "4184e34e", setup: func(i *instruction) { - i.asVecRRR(vecOpAdd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpAdd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "410c230e", setup: func(i *instruction) { - i.asVecRRR(vecOpSqadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpSqadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "410c234e", setup: func(i *instruction) { - i.asVecRRR(vecOpSqadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpSqadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "410c630e", setup: func(i *instruction) { - i.asVecRRR(vecOpSqadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpSqadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "410c634e", setup: func(i *instruction) { - i.asVecRRR(vecOpSqadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpSqadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "410ca30e", setup: func(i *instruction) { - i.asVecRRR(vecOpSqadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpSqadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "410ca34e", setup: func(i *instruction) { - i.asVecRRR(vecOpSqadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpSqadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "410ce34e", setup: func(i *instruction) { - i.asVecRRR(vecOpSqadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpSqadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "410c232e", setup: func(i *instruction) { - i.asVecRRR(vecOpUqadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpUqadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "410c236e", setup: func(i *instruction) { - i.asVecRRR(vecOpUqadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpUqadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "410c632e", setup: func(i *instruction) { - i.asVecRRR(vecOpUqadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpUqadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "410c636e", setup: func(i *instruction) { - i.asVecRRR(vecOpUqadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + 
i.asVecRRR(vecOpUqadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "410ca32e", setup: func(i *instruction) { - i.asVecRRR(vecOpUqadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpUqadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "410ca36e", setup: func(i *instruction) { - i.asVecRRR(vecOpUqadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpUqadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "410ce36e", setup: func(i *instruction) { - i.asVecRRR(vecOpUqadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpUqadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "412c230e", setup: func(i *instruction) { - i.asVecRRR(vecOpSqsub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpSqsub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "412c234e", setup: func(i *instruction) { - i.asVecRRR(vecOpSqsub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpSqsub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "412c630e", setup: func(i *instruction) { - i.asVecRRR(vecOpSqsub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpSqsub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "412c634e", setup: func(i *instruction) { - i.asVecRRR(vecOpSqsub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpSqsub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "412ca30e", setup: func(i *instruction) { - i.asVecRRR(vecOpSqsub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpSqsub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "412ca34e", setup: func(i *instruction) { - i.asVecRRR(vecOpSqsub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpSqsub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "412ce34e", setup: func(i *instruction) { - i.asVecRRR(vecOpSqsub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpSqsub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "412c232e", setup: func(i *instruction) { - i.asVecRRR(vecOpUqsub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpUqsub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "412c236e", setup: func(i *instruction) { - i.asVecRRR(vecOpUqsub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpUqsub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "412c632e", setup: func(i *instruction) { - i.asVecRRR(vecOpUqsub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpUqsub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "412c636e", setup: func(i *instruction) { - i.asVecRRR(vecOpUqsub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpUqsub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "412ca32e", setup: func(i *instruction) { - 
i.asVecRRR(vecOpUqsub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpUqsub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "412ca36e", setup: func(i *instruction) { - i.asVecRRR(vecOpUqsub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpUqsub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "412ce36e", setup: func(i *instruction) { - i.asVecRRR(vecOpUqsub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpUqsub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "4184232e", setup: func(i *instruction) { - i.asVecRRR(vecOpSub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpSub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "4184236e", setup: func(i *instruction) { - i.asVecRRR(vecOpSub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpSub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "4184632e", setup: func(i *instruction) { - i.asVecRRR(vecOpSub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpSub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "4184636e", setup: func(i *instruction) { - i.asVecRRR(vecOpSub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpSub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "4184a32e", setup: func(i *instruction) { - i.asVecRRR(vecOpSub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpSub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "4184a36e", setup: func(i *instruction) { - i.asVecRRR(vecOpSub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpSub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "4184e36e", setup: func(i *instruction) { - i.asVecRRR(vecOpSub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpSub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "41bc230e", setup: func(i *instruction) { - i.asVecRRR(vecOpAddp, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpAddp, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "41bc234e", setup: func(i *instruction) { - i.asVecRRR(vecOpAddp, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpAddp, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "41bc630e", setup: func(i *instruction) { - i.asVecRRR(vecOpAddp, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpAddp, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "41bc634e", setup: func(i *instruction) { - i.asVecRRR(vecOpAddp, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpAddp, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "41bca30e", setup: func(i *instruction) { - i.asVecRRR(vecOpAddp, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpAddp, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: 
"41bca34e", setup: func(i *instruction) { - i.asVecRRR(vecOpAddp, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpAddp, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "41bce34e", setup: func(i *instruction) { - i.asVecRRR(vecOpAddp, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpAddp, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "41bc230e", setup: func(i *instruction) { - i.asVecRRR(vecOpAddp, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpAddp, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "41b8314e", setup: func(i *instruction) { - i.asVecLanes(vecOpAddv, operandNR(v1VReg), operandNR(v2VReg), vecArrangement16B) + i.asVecLanes(vecOpAddv, v1VReg, operandNR(v2VReg), vecArrangement16B) }}, {want: "41b8710e", setup: func(i *instruction) { - i.asVecLanes(vecOpAddv, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4H) + i.asVecLanes(vecOpAddv, v1VReg, operandNR(v2VReg), vecArrangement4H) }}, {want: "41b8714e", setup: func(i *instruction) { - i.asVecLanes(vecOpAddv, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8H) + i.asVecLanes(vecOpAddv, v1VReg, operandNR(v2VReg), vecArrangement8H) }}, {want: "41b8b14e", setup: func(i *instruction) { - i.asVecLanes(vecOpAddv, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecLanes(vecOpAddv, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "416c230e", setup: func(i *instruction) { - i.asVecRRR(vecOpSmin, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpSmin, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "416c234e", setup: func(i *instruction) { - i.asVecRRR(vecOpSmin, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpSmin, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "416c630e", setup: func(i *instruction) { - i.asVecRRR(vecOpSmin, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpSmin, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "416c634e", setup: func(i *instruction) { - i.asVecRRR(vecOpSmin, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpSmin, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "416ca30e", setup: func(i *instruction) { - i.asVecRRR(vecOpSmin, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpSmin, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "416ca34e", setup: func(i *instruction) { - i.asVecRRR(vecOpSmin, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpSmin, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "416c232e", setup: func(i *instruction) { - i.asVecRRR(vecOpUmin, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpUmin, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "416c236e", setup: func(i *instruction) { - i.asVecRRR(vecOpUmin, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpUmin, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "416c632e", setup: func(i *instruction) { - i.asVecRRR(vecOpUmin, operandNR(v1VReg), 
operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpUmin, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "416c636e", setup: func(i *instruction) { - i.asVecRRR(vecOpUmin, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpUmin, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "416ca32e", setup: func(i *instruction) { - i.asVecRRR(vecOpUmin, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpUmin, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "416ca36e", setup: func(i *instruction) { - i.asVecRRR(vecOpUmin, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpUmin, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "4164230e", setup: func(i *instruction) { - i.asVecRRR(vecOpSmax, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpSmax, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "4164234e", setup: func(i *instruction) { - i.asVecRRR(vecOpSmax, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpSmax, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "4164630e", setup: func(i *instruction) { - i.asVecRRR(vecOpSmax, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpSmax, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "4164634e", setup: func(i *instruction) { - i.asVecRRR(vecOpSmax, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpSmax, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "4164a30e", setup: func(i *instruction) { - i.asVecRRR(vecOpSmax, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpSmax, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "4164a34e", setup: func(i *instruction) { - i.asVecRRR(vecOpSmax, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpSmax, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "4164232e", setup: func(i *instruction) { - i.asVecRRR(vecOpUmax, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpUmax, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "4164236e", setup: func(i *instruction) { - i.asVecRRR(vecOpUmax, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpUmax, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "4164632e", setup: func(i *instruction) { - i.asVecRRR(vecOpUmax, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpUmax, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "4164636e", setup: func(i *instruction) { - i.asVecRRR(vecOpUmax, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpUmax, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "4164a32e", setup: func(i *instruction) { - i.asVecRRR(vecOpUmax, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpUmax, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "4164a36e", setup: func(i 
*instruction) { - i.asVecRRR(vecOpUmax, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpUmax, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "41a4232e", setup: func(i *instruction) { - i.asVecRRR(vecOpUmaxp, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpUmaxp, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "41a4236e", setup: func(i *instruction) { - i.asVecRRR(vecOpUmaxp, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpUmaxp, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "41a4632e", setup: func(i *instruction) { - i.asVecRRR(vecOpUmaxp, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpUmaxp, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "41a4636e", setup: func(i *instruction) { - i.asVecRRR(vecOpUmaxp, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpUmaxp, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "41a4a32e", setup: func(i *instruction) { - i.asVecRRR(vecOpUmaxp, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpUmaxp, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "41a8312e", setup: func(i *instruction) { - i.asVecLanes(vecOpUminv, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8B) + i.asVecLanes(vecOpUminv, v1VReg, operandNR(v2VReg), vecArrangement8B) }}, {want: "41a8316e", setup: func(i *instruction) { - i.asVecLanes(vecOpUminv, operandNR(v1VReg), operandNR(v2VReg), vecArrangement16B) + i.asVecLanes(vecOpUminv, v1VReg, operandNR(v2VReg), vecArrangement16B) }}, {want: "41a8712e", setup: func(i *instruction) { - i.asVecLanes(vecOpUminv, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4H) + i.asVecLanes(vecOpUminv, v1VReg, operandNR(v2VReg), vecArrangement4H) }}, {want: "41a8716e", setup: func(i *instruction) { - i.asVecLanes(vecOpUminv, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8H) + i.asVecLanes(vecOpUminv, v1VReg, operandNR(v2VReg), vecArrangement8H) }}, {want: "41a8b16e", setup: func(i *instruction) { - i.asVecLanes(vecOpUminv, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecLanes(vecOpUminv, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "4114232e", setup: func(i *instruction) { - i.asVecRRR(vecOpUrhadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpUrhadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "4114236e", setup: func(i *instruction) { - i.asVecRRR(vecOpUrhadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpUrhadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "4114632e", setup: func(i *instruction) { - i.asVecRRR(vecOpUrhadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpUrhadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "4114636e", setup: func(i *instruction) { - i.asVecRRR(vecOpUrhadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpUrhadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "4114a32e", setup: func(i *instruction) { - i.asVecRRR(vecOpUrhadd, operandNR(v1VReg), operandNR(v2VReg), 
operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpUrhadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "4114a36e", setup: func(i *instruction) { - i.asVecRRR(vecOpUrhadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpUrhadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "419c230e", setup: func(i *instruction) { - i.asVecRRR(vecOpMul, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpMul, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "419c234e", setup: func(i *instruction) { - i.asVecRRR(vecOpMul, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpMul, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "419c630e", setup: func(i *instruction) { - i.asVecRRR(vecOpMul, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpMul, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "419c634e", setup: func(i *instruction) { - i.asVecRRR(vecOpMul, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpMul, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "419ca30e", setup: func(i *instruction) { - i.asVecRRR(vecOpMul, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpMul, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "419ca34e", setup: func(i *instruction) { - i.asVecRRR(vecOpMul, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpMul, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "4198200e", setup: func(i *instruction) { - i.asVecMisc(vecOpCmeq0, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8B) + i.asVecMisc(vecOpCmeq0, v1VReg, operandNR(v2VReg), vecArrangement8B) }}, {want: "4198204e", setup: func(i *instruction) { - i.asVecMisc(vecOpCmeq0, operandNR(v1VReg), operandNR(v2VReg), vecArrangement16B) + i.asVecMisc(vecOpCmeq0, v1VReg, operandNR(v2VReg), vecArrangement16B) }}, {want: "4198600e", setup: func(i *instruction) { - i.asVecMisc(vecOpCmeq0, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4H) + i.asVecMisc(vecOpCmeq0, v1VReg, operandNR(v2VReg), vecArrangement4H) }}, {want: "4198604e", setup: func(i *instruction) { - i.asVecMisc(vecOpCmeq0, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8H) + i.asVecMisc(vecOpCmeq0, v1VReg, operandNR(v2VReg), vecArrangement8H) }}, {want: "4198a00e", setup: func(i *instruction) { - i.asVecMisc(vecOpCmeq0, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpCmeq0, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "4198a04e", setup: func(i *instruction) { - i.asVecMisc(vecOpCmeq0, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecMisc(vecOpCmeq0, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "4198e04e", setup: func(i *instruction) { - i.asVecMisc(vecOpCmeq0, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2D) + i.asVecMisc(vecOpCmeq0, v1VReg, operandNR(v2VReg), vecArrangement2D) }}, {want: "418c232e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmeq, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpCmeq, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "418c236e", setup: func(i *instruction) { - 
i.asVecRRR(vecOpCmeq, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpCmeq, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "418c632e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmeq, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpCmeq, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "418c636e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmeq, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpCmeq, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "418ca32e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmeq, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpCmeq, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "418ca36e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmeq, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpCmeq, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "418ce36e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmeq, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpCmeq, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "4134230e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmgt, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpCmgt, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "4134234e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmgt, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpCmgt, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "4134630e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmgt, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpCmgt, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "4134634e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmgt, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpCmgt, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "4134a30e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmgt, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpCmgt, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "4134a34e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmgt, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpCmgt, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "4134e34e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmgt, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpCmgt, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "4134232e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhi, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpCmhi, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "4134236e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhi, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpCmhi, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, 
{want: "4134632e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhi, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpCmhi, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "4134636e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhi, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpCmhi, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "4134a32e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhi, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpCmhi, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "4134a36e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhi, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpCmhi, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "4134e36e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhi, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpCmhi, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "413c230e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmge, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpCmge, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "413c234e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmge, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpCmge, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "413c630e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmge, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpCmge, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "413c634e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmge, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpCmge, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "413ca30e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmge, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpCmge, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "413ca34e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmge, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpCmge, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "413ce34e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmge, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpCmge, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "4134230e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmgt, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpCmgt, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "4134234e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmgt, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpCmgt, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "4134630e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmgt, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpCmgt, v1VReg, 
operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "4134634e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmgt, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpCmgt, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "4134a30e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmgt, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpCmgt, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "4134a34e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmgt, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpCmgt, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "4134e34e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmgt, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpCmgt, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "4134232e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhi, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpCmhi, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "4134236e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhi, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpCmhi, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "4134632e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhi, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpCmhi, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "4134636e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhi, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpCmhi, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "4134a32e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhi, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpCmhi, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "4134a36e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhi, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpCmhi, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "4134e36e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhi, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpCmhi, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "413c232e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhs, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpCmhs, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "413c236e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhs, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpCmhs, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "413c632e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhs, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpCmhs, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "413c636e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhs, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), 
vecArrangement8H) + i.asVecRRR(vecOpCmhs, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "413ca32e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhs, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpCmhs, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "413ca36e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhs, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpCmhs, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "413ce36e", setup: func(i *instruction) { - i.asVecRRR(vecOpCmhs, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpCmhs, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "41f4230e", setup: func(i *instruction) { - i.asVecRRR(vecOpFmax, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpFmax, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "41f4234e", setup: func(i *instruction) { - i.asVecRRR(vecOpFmax, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpFmax, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "41f4634e", setup: func(i *instruction) { - i.asVecRRR(vecOpFmax, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpFmax, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "41f4a30e", setup: func(i *instruction) { - i.asVecRRR(vecOpFmin, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpFmin, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "41f4a34e", setup: func(i *instruction) { - i.asVecRRR(vecOpFmin, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpFmin, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "41f4e34e", setup: func(i *instruction) { - i.asVecRRR(vecOpFmin, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpFmin, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "41d4230e", setup: func(i *instruction) { - i.asVecRRR(vecOpFadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpFadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "41d4234e", setup: func(i *instruction) { - i.asVecRRR(vecOpFadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpFadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "41d4634e", setup: func(i *instruction) { - i.asVecRRR(vecOpFadd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpFadd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "41d4a30e", setup: func(i *instruction) { - i.asVecRRR(vecOpFsub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpFsub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "41d4a34e", setup: func(i *instruction) { - i.asVecRRR(vecOpFsub, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpFsub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "41d4e34e", setup: func(i *instruction) { - i.asVecRRR(vecOpFsub, 
operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpFsub, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "41dc232e", setup: func(i *instruction) { - i.asVecRRR(vecOpFmul, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpFmul, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "41dc236e", setup: func(i *instruction) { - i.asVecRRR(vecOpFmul, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpFmul, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "41dc636e", setup: func(i *instruction) { - i.asVecRRR(vecOpFmul, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpFmul, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "41b4636e", setup: func(i *instruction) { - i.asVecRRR(vecOpSqrdmulh, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpSqrdmulh, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "41b4632e", setup: func(i *instruction) { - i.asVecRRR(vecOpSqrdmulh, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpSqrdmulh, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "41b4a32e", setup: func(i *instruction) { - i.asVecRRR(vecOpSqrdmulh, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpSqrdmulh, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "41b4a36e", setup: func(i *instruction) { - i.asVecRRR(vecOpSqrdmulh, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpSqrdmulh, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "41fc232e", setup: func(i *instruction) { - i.asVecRRR(vecOpFdiv, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpFdiv, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "41fc236e", setup: func(i *instruction) { - i.asVecRRR(vecOpFdiv, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpFdiv, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "41fc636e", setup: func(i *instruction) { - i.asVecRRR(vecOpFdiv, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpFdiv, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "41e4230e", setup: func(i *instruction) { - i.asVecRRR(vecOpFcmeq, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpFcmeq, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "41e4234e", setup: func(i *instruction) { - i.asVecRRR(vecOpFcmeq, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpFcmeq, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "41e4634e", setup: func(i *instruction) { - i.asVecRRR(vecOpFcmeq, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpFcmeq, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "41e4a32e", setup: func(i *instruction) { - i.asVecRRR(vecOpFcmgt, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpFcmgt, v1VReg, operandNR(v2VReg), operandNR(v3VReg), 
vecArrangement2S) }}, {want: "41e4a36e", setup: func(i *instruction) { - i.asVecRRR(vecOpFcmgt, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpFcmgt, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "41e4e36e", setup: func(i *instruction) { - i.asVecRRR(vecOpFcmgt, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpFcmgt, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "41e4232e", setup: func(i *instruction) { - i.asVecRRR(vecOpFcmge, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpFcmge, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "41e4236e", setup: func(i *instruction) { - i.asVecRRR(vecOpFcmge, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpFcmge, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "41e4636e", setup: func(i *instruction) { - i.asVecRRR(vecOpFcmge, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpFcmge, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "4198210e", setup: func(i *instruction) { - i.asVecMisc(vecOpFrintm, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpFrintm, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "4198214e", setup: func(i *instruction) { - i.asVecMisc(vecOpFrintm, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecMisc(vecOpFrintm, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "4198614e", setup: func(i *instruction) { - i.asVecMisc(vecOpFrintm, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2D) + i.asVecMisc(vecOpFrintm, v1VReg, operandNR(v2VReg), vecArrangement2D) }}, {want: "4188210e", setup: func(i *instruction) { - i.asVecMisc(vecOpFrintn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpFrintn, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "4188214e", setup: func(i *instruction) { - i.asVecMisc(vecOpFrintn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecMisc(vecOpFrintn, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "4188614e", setup: func(i *instruction) { - i.asVecMisc(vecOpFrintn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2D) + i.asVecMisc(vecOpFrintn, v1VReg, operandNR(v2VReg), vecArrangement2D) }}, {want: "4188a10e", setup: func(i *instruction) { - i.asVecMisc(vecOpFrintp, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpFrintp, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "4188a14e", setup: func(i *instruction) { - i.asVecMisc(vecOpFrintp, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecMisc(vecOpFrintp, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "4188e14e", setup: func(i *instruction) { - i.asVecMisc(vecOpFrintp, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2D) + i.asVecMisc(vecOpFrintp, v1VReg, operandNR(v2VReg), vecArrangement2D) }}, {want: "4198a10e", setup: func(i *instruction) { - i.asVecMisc(vecOpFrintz, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpFrintz, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "4198a14e", setup: func(i *instruction) { - i.asVecMisc(vecOpFrintz, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecMisc(vecOpFrintz, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "4198e14e", setup: func(i 
*instruction) { - i.asVecMisc(vecOpFrintz, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2D) + i.asVecMisc(vecOpFrintz, v1VReg, operandNR(v2VReg), vecArrangement2D) }}, {want: "4178610e", setup: func(i *instruction) { - i.asVecMisc(vecOpFcvtl, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpFcvtl, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "4178210e", setup: func(i *instruction) { - i.asVecMisc(vecOpFcvtl, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4H) + i.asVecMisc(vecOpFcvtl, v1VReg, operandNR(v2VReg), vecArrangement4H) }}, {want: "4168610e", setup: func(i *instruction) { - i.asVecMisc(vecOpFcvtn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpFcvtn, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "4168210e", setup: func(i *instruction) { - i.asVecMisc(vecOpFcvtn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4H) + i.asVecMisc(vecOpFcvtn, v1VReg, operandNR(v2VReg), vecArrangement4H) }}, {want: "41b8a10e", setup: func(i *instruction) { - i.asVecMisc(vecOpFcvtzs, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpFcvtzs, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "41b8a14e", setup: func(i *instruction) { - i.asVecMisc(vecOpFcvtzs, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecMisc(vecOpFcvtzs, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "41b8e14e", setup: func(i *instruction) { - i.asVecMisc(vecOpFcvtzs, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2D) + i.asVecMisc(vecOpFcvtzs, v1VReg, operandNR(v2VReg), vecArrangement2D) }}, {want: "41b8a12e", setup: func(i *instruction) { - i.asVecMisc(vecOpFcvtzu, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpFcvtzu, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "41b8a16e", setup: func(i *instruction) { - i.asVecMisc(vecOpFcvtzu, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecMisc(vecOpFcvtzu, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "41b8e16e", setup: func(i *instruction) { - i.asVecMisc(vecOpFcvtzu, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2D) + i.asVecMisc(vecOpFcvtzu, v1VReg, operandNR(v2VReg), vecArrangement2D) }}, {want: "41d8210e", setup: func(i *instruction) { - i.asVecMisc(vecOpScvtf, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpScvtf, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "41d8214e", setup: func(i *instruction) { - i.asVecMisc(vecOpScvtf, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecMisc(vecOpScvtf, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "41d8614e", setup: func(i *instruction) { - i.asVecMisc(vecOpScvtf, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2D) + i.asVecMisc(vecOpScvtf, v1VReg, operandNR(v2VReg), vecArrangement2D) }}, {want: "41d8212e", setup: func(i *instruction) { - i.asVecMisc(vecOpUcvtf, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpUcvtf, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "41d8216e", setup: func(i *instruction) { - i.asVecMisc(vecOpUcvtf, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecMisc(vecOpUcvtf, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "41d8616e", setup: func(i *instruction) { - i.asVecMisc(vecOpUcvtf, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2D) + i.asVecMisc(vecOpUcvtf, v1VReg, operandNR(v2VReg), vecArrangement2D) }}, {want: "4148210e", setup: func(i *instruction) { - 
i.asVecMisc(vecOpSqxtn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8B) + i.asVecMisc(vecOpSqxtn, v1VReg, operandNR(v2VReg), vecArrangement8B) }}, {want: "4148214e", setup: func(i *instruction) { // sqxtn2 - i.asVecMisc(vecOpSqxtn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement16B) + i.asVecMisc(vecOpSqxtn, v1VReg, operandNR(v2VReg), vecArrangement16B) }}, {want: "4148610e", setup: func(i *instruction) { - i.asVecMisc(vecOpSqxtn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4H) + i.asVecMisc(vecOpSqxtn, v1VReg, operandNR(v2VReg), vecArrangement4H) }}, {want: "4148614e", setup: func(i *instruction) { // sqxtn2 - i.asVecMisc(vecOpSqxtn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8H) + i.asVecMisc(vecOpSqxtn, v1VReg, operandNR(v2VReg), vecArrangement8H) }}, {want: "4148a10e", setup: func(i *instruction) { - i.asVecMisc(vecOpSqxtn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpSqxtn, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "4148a14e", setup: func(i *instruction) { // sqxtun2 - i.asVecMisc(vecOpSqxtn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecMisc(vecOpSqxtn, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "4128212e", setup: func(i *instruction) { - i.asVecMisc(vecOpSqxtun, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8B) + i.asVecMisc(vecOpSqxtun, v1VReg, operandNR(v2VReg), vecArrangement8B) }}, {want: "4128216e", setup: func(i *instruction) { // uqxtun2 - i.asVecMisc(vecOpSqxtun, operandNR(v1VReg), operandNR(v2VReg), vecArrangement16B) + i.asVecMisc(vecOpSqxtun, v1VReg, operandNR(v2VReg), vecArrangement16B) }}, {want: "4128612e", setup: func(i *instruction) { - i.asVecMisc(vecOpSqxtun, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4H) + i.asVecMisc(vecOpSqxtun, v1VReg, operandNR(v2VReg), vecArrangement4H) }}, {want: "4128616e", setup: func(i *instruction) { // sqxtun2 - i.asVecMisc(vecOpSqxtun, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8H) + i.asVecMisc(vecOpSqxtun, v1VReg, operandNR(v2VReg), vecArrangement8H) }}, {want: "4128a12e", setup: func(i *instruction) { - i.asVecMisc(vecOpSqxtun, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpSqxtun, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "4128a16e", setup: func(i *instruction) { // sqxtun2 - i.asVecMisc(vecOpSqxtun, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecMisc(vecOpSqxtun, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "4148212e", setup: func(i *instruction) { - i.asVecMisc(vecOpUqxtn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8B) + i.asVecMisc(vecOpUqxtn, v1VReg, operandNR(v2VReg), vecArrangement8B) }}, {want: "4148216e", setup: func(i *instruction) { // uqxtn2 - i.asVecMisc(vecOpUqxtn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement16B) + i.asVecMisc(vecOpUqxtn, v1VReg, operandNR(v2VReg), vecArrangement16B) }}, {want: "4148612e", setup: func(i *instruction) { - i.asVecMisc(vecOpUqxtn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4H) + i.asVecMisc(vecOpUqxtn, v1VReg, operandNR(v2VReg), vecArrangement4H) }}, {want: "4148616e", setup: func(i *instruction) { // sqxtn2 - i.asVecMisc(vecOpUqxtn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8H) + i.asVecMisc(vecOpUqxtn, v1VReg, operandNR(v2VReg), vecArrangement8H) }}, {want: "4148a12e", setup: func(i *instruction) { - i.asVecMisc(vecOpUqxtn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpUqxtn, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, 
{want: "4148a16e", setup: func(i *instruction) { // sqxtn2 - i.asVecMisc(vecOpUqxtn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecMisc(vecOpUqxtn, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "41b8200e", setup: func(i *instruction) { - i.asVecMisc(vecOpAbs, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8B) + i.asVecMisc(vecOpAbs, v1VReg, operandNR(v2VReg), vecArrangement8B) }}, {want: "41b8204e", setup: func(i *instruction) { - i.asVecMisc(vecOpAbs, operandNR(v1VReg), operandNR(v2VReg), vecArrangement16B) + i.asVecMisc(vecOpAbs, v1VReg, operandNR(v2VReg), vecArrangement16B) }}, {want: "41b8600e", setup: func(i *instruction) { - i.asVecMisc(vecOpAbs, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4H) + i.asVecMisc(vecOpAbs, v1VReg, operandNR(v2VReg), vecArrangement4H) }}, {want: "41b8604e", setup: func(i *instruction) { - i.asVecMisc(vecOpAbs, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8H) + i.asVecMisc(vecOpAbs, v1VReg, operandNR(v2VReg), vecArrangement8H) }}, {want: "41b8a00e", setup: func(i *instruction) { - i.asVecMisc(vecOpAbs, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpAbs, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "41b8a04e", setup: func(i *instruction) { - i.asVecMisc(vecOpAbs, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecMisc(vecOpAbs, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "41b8e04e", setup: func(i *instruction) { - i.asVecMisc(vecOpAbs, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2D) + i.asVecMisc(vecOpAbs, v1VReg, operandNR(v2VReg), vecArrangement2D) }}, {want: "41f8a00e", setup: func(i *instruction) { - i.asVecMisc(vecOpFabs, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpFabs, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "41f8a04e", setup: func(i *instruction) { - i.asVecMisc(vecOpFabs, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecMisc(vecOpFabs, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "41f8e04e", setup: func(i *instruction) { - i.asVecMisc(vecOpFabs, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2D) + i.asVecMisc(vecOpFabs, v1VReg, operandNR(v2VReg), vecArrangement2D) }}, {want: "41b8202e", setup: func(i *instruction) { - i.asVecMisc(vecOpNeg, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8B) + i.asVecMisc(vecOpNeg, v1VReg, operandNR(v2VReg), vecArrangement8B) }}, {want: "41b8206e", setup: func(i *instruction) { - i.asVecMisc(vecOpNeg, operandNR(v1VReg), operandNR(v2VReg), vecArrangement16B) + i.asVecMisc(vecOpNeg, v1VReg, operandNR(v2VReg), vecArrangement16B) }}, {want: "41b8602e", setup: func(i *instruction) { - i.asVecMisc(vecOpNeg, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4H) + i.asVecMisc(vecOpNeg, v1VReg, operandNR(v2VReg), vecArrangement4H) }}, {want: "41b8606e", setup: func(i *instruction) { - i.asVecMisc(vecOpNeg, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8H) + i.asVecMisc(vecOpNeg, v1VReg, operandNR(v2VReg), vecArrangement8H) }}, {want: "41b8a02e", setup: func(i *instruction) { - i.asVecMisc(vecOpNeg, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpNeg, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "41b8a06e", setup: func(i *instruction) { - i.asVecMisc(vecOpNeg, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecMisc(vecOpNeg, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "41b8e06e", setup: func(i *instruction) { - i.asVecMisc(vecOpNeg, operandNR(v1VReg), 
operandNR(v2VReg), vecArrangement2D) + i.asVecMisc(vecOpNeg, v1VReg, operandNR(v2VReg), vecArrangement2D) }}, {want: "4128a10e", setup: func(i *instruction) { - i.asVecMisc(vecOpXtn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpXtn, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "4128a10e", setup: func(i *instruction) { - i.asVecMisc(vecOpXtn, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpXtn, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "41f8a02e", setup: func(i *instruction) { - i.asVecMisc(vecOpFneg, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpFneg, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "41f8a06e", setup: func(i *instruction) { - i.asVecMisc(vecOpFneg, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecMisc(vecOpFneg, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "41f8e06e", setup: func(i *instruction) { - i.asVecMisc(vecOpFneg, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2D) + i.asVecMisc(vecOpFneg, v1VReg, operandNR(v2VReg), vecArrangement2D) }}, {want: "41f8a12e", setup: func(i *instruction) { - i.asVecMisc(vecOpFsqrt, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2S) + i.asVecMisc(vecOpFsqrt, v1VReg, operandNR(v2VReg), vecArrangement2S) }}, {want: "41f8a16e", setup: func(i *instruction) { - i.asVecMisc(vecOpFsqrt, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecMisc(vecOpFsqrt, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "41f8e16e", setup: func(i *instruction) { - i.asVecMisc(vecOpFsqrt, operandNR(v1VReg), operandNR(v2VReg), vecArrangement2D) - }}, - {want: "4100839a", setup: func(i *instruction) { i.asCSel(operandNR(x1VReg), operandNR(x2VReg), operandNR(x3VReg), eq, true) }}, - {want: "4110839a", setup: func(i *instruction) { i.asCSel(operandNR(x1VReg), operandNR(x2VReg), operandNR(x3VReg), ne, true) }}, - {want: "4100831a", setup: func(i *instruction) { i.asCSel(operandNR(x1VReg), operandNR(x2VReg), operandNR(x3VReg), eq, false) }}, - {want: "4110831a", setup: func(i *instruction) { i.asCSel(operandNR(x1VReg), operandNR(x2VReg), operandNR(x3VReg), ne, false) }}, - {want: "41cc631e", setup: func(i *instruction) { i.asFpuCSel(operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), gt, true) }}, - {want: "41bc631e", setup: func(i *instruction) { i.asFpuCSel(operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), lt, true) }}, - {want: "41cc231e", setup: func(i *instruction) { i.asFpuCSel(operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), gt, false) }}, - {want: "41bc231e", setup: func(i *instruction) { i.asFpuCSel(operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), lt, false) }}, - {want: "411c014e", setup: func(i *instruction) { i.asMovToVec(operandNR(v1VReg), operandNR(x2VReg), vecArrangementB, 0) }}, - {want: "411c024e", setup: func(i *instruction) { i.asMovToVec(operandNR(v1VReg), operandNR(x2VReg), vecArrangementH, 0) }}, - {want: "411c044e", setup: func(i *instruction) { i.asMovToVec(operandNR(v1VReg), operandNR(x2VReg), vecArrangementS, 0) }}, - {want: "411c084e", setup: func(i *instruction) { i.asMovToVec(operandNR(v1VReg), operandNR(x2VReg), vecArrangementD, 0) }}, - {want: "413c010e", setup: func(i *instruction) { i.asMovFromVec(operandNR(x1VReg), operandNR(v2VReg), vecArrangementB, 0, false) }}, - {want: "413c020e", setup: func(i *instruction) { i.asMovFromVec(operandNR(x1VReg), operandNR(v2VReg), vecArrangementH, 0, false) }}, - {want: "413c040e", setup: func(i 
*instruction) { i.asMovFromVec(operandNR(x1VReg), operandNR(v2VReg), vecArrangementS, 0, false) }}, - {want: "413c084e", setup: func(i *instruction) { i.asMovFromVec(operandNR(x1VReg), operandNR(v2VReg), vecArrangementD, 0, false) }}, - {want: "412c030e", setup: func(i *instruction) { i.asMovFromVec(operandNR(x1VReg), operandNR(v2VReg), vecArrangementB, 1, true) }}, - {want: "412c060e", setup: func(i *instruction) { i.asMovFromVec(operandNR(x1VReg), operandNR(v2VReg), vecArrangementH, 1, true) }}, - {want: "412c0c4e", setup: func(i *instruction) { i.asMovFromVec(operandNR(x1VReg), operandNR(v2VReg), vecArrangementS, 1, true) }}, - {want: "410c084e", setup: func(i *instruction) { i.asVecDup(operandNR(x1VReg), operandNR(v2VReg), vecArrangement2D) }}, + i.asVecMisc(vecOpFsqrt, v1VReg, operandNR(v2VReg), vecArrangement2D) + }}, + {want: "4100839a", setup: func(i *instruction) { i.asCSel(x1VReg, operandNR(x2VReg), operandNR(x3VReg), eq, true) }}, + {want: "4110839a", setup: func(i *instruction) { i.asCSel(x1VReg, operandNR(x2VReg), operandNR(x3VReg), ne, true) }}, + {want: "4100831a", setup: func(i *instruction) { i.asCSel(x1VReg, operandNR(x2VReg), operandNR(x3VReg), eq, false) }}, + {want: "4110831a", setup: func(i *instruction) { i.asCSel(x1VReg, operandNR(x2VReg), operandNR(x3VReg), ne, false) }}, + {want: "41cc631e", setup: func(i *instruction) { i.asFpuCSel(v1VReg, operandNR(v2VReg), operandNR(v3VReg), gt, true) }}, + {want: "41bc631e", setup: func(i *instruction) { i.asFpuCSel(v1VReg, operandNR(v2VReg), operandNR(v3VReg), lt, true) }}, + {want: "41cc231e", setup: func(i *instruction) { i.asFpuCSel(v1VReg, operandNR(v2VReg), operandNR(v3VReg), gt, false) }}, + {want: "41bc231e", setup: func(i *instruction) { i.asFpuCSel(v1VReg, operandNR(v2VReg), operandNR(v3VReg), lt, false) }}, + {want: "411c014e", setup: func(i *instruction) { i.asMovToVec(v1VReg, operandNR(x2VReg), vecArrangementB, 0) }}, + {want: "411c024e", setup: func(i *instruction) { i.asMovToVec(v1VReg, operandNR(x2VReg), vecArrangementH, 0) }}, + {want: "411c044e", setup: func(i *instruction) { i.asMovToVec(v1VReg, operandNR(x2VReg), vecArrangementS, 0) }}, + {want: "411c084e", setup: func(i *instruction) { i.asMovToVec(v1VReg, operandNR(x2VReg), vecArrangementD, 0) }}, + {want: "413c010e", setup: func(i *instruction) { i.asMovFromVec(x1VReg, operandNR(v2VReg), vecArrangementB, 0, false) }}, + {want: "413c020e", setup: func(i *instruction) { i.asMovFromVec(x1VReg, operandNR(v2VReg), vecArrangementH, 0, false) }}, + {want: "413c040e", setup: func(i *instruction) { i.asMovFromVec(x1VReg, operandNR(v2VReg), vecArrangementS, 0, false) }}, + {want: "413c084e", setup: func(i *instruction) { i.asMovFromVec(x1VReg, operandNR(v2VReg), vecArrangementD, 0, false) }}, + {want: "412c030e", setup: func(i *instruction) { i.asMovFromVec(x1VReg, operandNR(v2VReg), vecArrangementB, 1, true) }}, + {want: "412c060e", setup: func(i *instruction) { i.asMovFromVec(x1VReg, operandNR(v2VReg), vecArrangementH, 1, true) }}, + {want: "412c0c4e", setup: func(i *instruction) { i.asMovFromVec(x1VReg, operandNR(v2VReg), vecArrangementS, 1, true) }}, + {want: "410c084e", setup: func(i *instruction) { i.asVecDup(x1VReg, operandNR(v2VReg), vecArrangement2D) }}, {want: "4140036e", setup: func(i *instruction) { // 4140036e - i.asVecExtract(operandNR(x1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B, 8) + i.asVecExtract(x1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B, 8) }}, {want: "4138034e", setup: func(i *instruction) { - 
i.asVecPermute(vecOpZip1, operandNR(x1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecPermute(vecOpZip1, x1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "4104214f", setup: func(i *instruction) { - i.asVecShiftImm(vecOpSshr, operandNR(x1VReg), operandNR(x2VReg), operandShiftImm(31), vecArrangement4S) + i.asVecShiftImm(vecOpSshr, x1VReg, operandNR(x2VReg), operandShiftImm(31), vecArrangement4S) }}, {want: "5b28030b", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSL), false) + i.asALU(aluOpAdd, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSL), false) }}, {want: "5b28038b", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSL), true) + i.asALU(aluOpAdd, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSL), true) }}, {want: "5b28032b", setup: func(i *instruction) { - i.asALU(aluOpAddS, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSL), false) + i.asALU(aluOpAddS, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSL), false) }}, {want: "5b2803ab", setup: func(i *instruction) { - i.asALU(aluOpAddS, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSL), true) + i.asALU(aluOpAddS, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSL), true) }}, {want: "5b28430b", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSR), false) + i.asALU(aluOpAdd, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSR), false) }}, {want: "5b28438b", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSR), true) + i.asALU(aluOpAdd, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSR), true) }}, {want: "5b28432b", setup: func(i *instruction) { - i.asALU(aluOpAddS, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSR), false) + i.asALU(aluOpAddS, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSR), false) }}, {want: "5b2843ab", setup: func(i *instruction) { - i.asALU(aluOpAddS, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSR), true) + i.asALU(aluOpAddS, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSR), true) }}, {want: "5b28830b", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpASR), false) + i.asALU(aluOpAdd, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpASR), false) }}, {want: "5b28838b", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpASR), true) + i.asALU(aluOpAdd, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpASR), true) }}, {want: "5b28832b", setup: func(i *instruction) { - i.asALU(aluOpAddS, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpASR), false) + i.asALU(aluOpAddS, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpASR), false) }}, {want: "5b2883ab", setup: func(i *instruction) { - i.asALU(aluOpAddS, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpASR), true) + i.asALU(aluOpAddS, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpASR), true) }}, {want: "5b28034b", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(tmpRegVReg), 
operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSL), false) + i.asALU(aluOpSub, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSL), false) }}, {want: "5b2803cb", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSL), true) + i.asALU(aluOpSub, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSL), true) }}, {want: "5b28036b", setup: func(i *instruction) { - i.asALU(aluOpSubS, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSL), false) + i.asALU(aluOpSubS, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSL), false) }}, {want: "5b2803eb", setup: func(i *instruction) { - i.asALU(aluOpSubS, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSL), true) + i.asALU(aluOpSubS, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSL), true) }}, {want: "5b28434b", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSR), false) + i.asALU(aluOpSub, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSR), false) }}, {want: "5b2843cb", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSR), true) + i.asALU(aluOpSub, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSR), true) }}, {want: "5b28436b", setup: func(i *instruction) { - i.asALU(aluOpSubS, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSR), false) + i.asALU(aluOpSubS, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSR), false) }}, {want: "5b2843eb", setup: func(i *instruction) { - i.asALU(aluOpSubS, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSR), true) + i.asALU(aluOpSubS, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSR), true) }}, {want: "5b28834b", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpASR), false) + i.asALU(aluOpSub, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpASR), false) }}, {want: "5b2883cb", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpASR), true) + i.asALU(aluOpSub, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpASR), true) }}, {want: "5b28836b", setup: func(i *instruction) { - i.asALU(aluOpSubS, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpASR), false) + i.asALU(aluOpSubS, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpASR), false) }}, {want: "5b2883eb", setup: func(i *instruction) { - i.asALU(aluOpSubS, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpASR), true) + i.asALU(aluOpSubS, tmpRegVReg, operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpASR), true) }}, {want: "60033fd6", setup: func(i *instruction) { i.asCallIndirect(tmpRegVReg, nil) }}, {want: "fb633bcb", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), operandNR(tmpRegVReg), true) + i.asALU(aluOpSub, tmpRegVReg, operandNR(spVReg), operandNR(tmpRegVReg), true) }}, {want: "fb633b8b", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(tmpRegVReg), operandNR(spVReg), operandNR(tmpRegVReg), true) + i.asALU(aluOpAdd, tmpRegVReg, operandNR(spVReg), operandNR(tmpRegVReg), true) }}, {want: "2000020a", setup: func(i *instruction) { - i.asALU(aluOpAnd, operandNR(x0VReg), operandNR(x1VReg), 
operandNR(x2VReg), false) + i.asALU(aluOpAnd, x0VReg, operandNR(x1VReg), operandNR(x2VReg), false) }}, {want: "2000028a", setup: func(i *instruction) { - i.asALU(aluOpAnd, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), true) + i.asALU(aluOpAnd, x0VReg, operandNR(x1VReg), operandNR(x2VReg), true) }}, {want: "2010028a", setup: func(i *instruction) { - i.asALU(aluOpAnd, operandNR(x0VReg), operandNR(x1VReg), operandSR(x2VReg, 4, shiftOpLSL), true) + i.asALU(aluOpAnd, x0VReg, operandNR(x1VReg), operandSR(x2VReg, 4, shiftOpLSL), true) }}, {want: "2030428a", setup: func(i *instruction) { - i.asALU(aluOpAnd, operandNR(x0VReg), operandNR(x1VReg), operandSR(x2VReg, 12, shiftOpLSR), true) + i.asALU(aluOpAnd, x0VReg, operandNR(x1VReg), operandSR(x2VReg, 12, shiftOpLSR), true) }}, {want: "2000026a", setup: func(i *instruction) { - i.asALU(aluOpAnds, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), false) + i.asALU(aluOpAnds, x0VReg, operandNR(x1VReg), operandNR(x2VReg), false) }}, {want: "200002ea", setup: func(i *instruction) { - i.asALU(aluOpAnds, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), true) + i.asALU(aluOpAnds, x0VReg, operandNR(x1VReg), operandNR(x2VReg), true) }}, {want: "201002ea", setup: func(i *instruction) { - i.asALU(aluOpAnds, operandNR(x0VReg), operandNR(x1VReg), operandSR(x2VReg, 4, shiftOpLSL), true) + i.asALU(aluOpAnds, x0VReg, operandNR(x1VReg), operandSR(x2VReg, 4, shiftOpLSL), true) }}, {want: "203042ea", setup: func(i *instruction) { - i.asALU(aluOpAnds, operandNR(x0VReg), operandNR(x1VReg), operandSR(x2VReg, 12, shiftOpLSR), true) + i.asALU(aluOpAnds, x0VReg, operandNR(x1VReg), operandSR(x2VReg, 12, shiftOpLSR), true) }}, {want: "2000022a", setup: func(i *instruction) { - i.asALU(aluOpOrr, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), false) + i.asALU(aluOpOrr, x0VReg, operandNR(x1VReg), operandNR(x2VReg), false) }}, {want: "200002aa", setup: func(i *instruction) { - i.asALU(aluOpOrr, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), true) + i.asALU(aluOpOrr, x0VReg, operandNR(x1VReg), operandNR(x2VReg), true) }}, {want: "201002aa", setup: func(i *instruction) { - i.asALU(aluOpOrr, operandNR(x0VReg), operandNR(x1VReg), operandSR(x2VReg, 4, shiftOpLSL), true) + i.asALU(aluOpOrr, x0VReg, operandNR(x1VReg), operandSR(x2VReg, 4, shiftOpLSL), true) }}, {want: "201082aa", setup: func(i *instruction) { - i.asALU(aluOpOrr, operandNR(x0VReg), operandNR(x1VReg), operandSR(x2VReg, 4, shiftOpASR), true) + i.asALU(aluOpOrr, x0VReg, operandNR(x1VReg), operandSR(x2VReg, 4, shiftOpASR), true) }}, {want: "2000024a", setup: func(i *instruction) { - i.asALU(aluOpEor, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), false) + i.asALU(aluOpEor, x0VReg, operandNR(x1VReg), operandNR(x2VReg), false) }}, {want: "200002ca", setup: func(i *instruction) { - i.asALU(aluOpEor, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), true) + i.asALU(aluOpEor, x0VReg, operandNR(x1VReg), operandNR(x2VReg), true) }}, {want: "201002ca", setup: func(i *instruction) { - i.asALU(aluOpEor, operandNR(x0VReg), operandNR(x1VReg), operandSR(x2VReg, 4, shiftOpLSL), true) + i.asALU(aluOpEor, x0VReg, operandNR(x1VReg), operandSR(x2VReg, 4, shiftOpLSL), true) }}, {want: "202cc21a", setup: func(i *instruction) { - i.asALU(aluOpRotR, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), false) + i.asALU(aluOpRotR, x0VReg, operandNR(x1VReg), operandNR(x2VReg), false) }}, {want: "202cc29a", setup: func(i *instruction) { - i.asALU(aluOpRotR, operandNR(x0VReg), 
operandNR(x1VReg), operandNR(x2VReg), true) + i.asALU(aluOpRotR, x0VReg, operandNR(x1VReg), operandNR(x2VReg), true) }}, {want: "2000222a", setup: func(i *instruction) { - i.asALU(aluOpOrn, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), false) + i.asALU(aluOpOrn, x0VReg, operandNR(x1VReg), operandNR(x2VReg), false) }}, {want: "200022aa", setup: func(i *instruction) { - i.asALU(aluOpOrn, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), true) + i.asALU(aluOpOrn, x0VReg, operandNR(x1VReg), operandNR(x2VReg), true) }}, {want: "30000010", setup: func(i *instruction) { i.asAdr(v16VReg, 4) }}, {want: "50050030", setup: func(i *instruction) { i.asAdr(v16VReg, 169) }}, @@ -1093,16 +1094,16 @@ func TestInstruction_encode(t *testing.T) { {want: "101e306e", setup: func(i *instruction) { i.asLoadFpuConst128(v16VReg, 0, 0) }}, {want: "5000009c05000014ffffffffffffffffaaaaaaaaaaaaaaaa", setup: func(i *instruction) { i.asLoadFpuConst128(v16VReg, 0xffffffff_ffffffff, 0xaaaaaaaa_aaaaaaaa) }}, {want: "8220061b", setup: func(i *instruction) { - i.asALURRRR(aluOpMAdd, operandNR(x2VReg), operandNR(x4VReg), operandNR(x6VReg), operandNR(x8VReg), false) + i.asALURRRR(aluOpMAdd, x2VReg, operandNR(x4VReg), operandNR(x6VReg), x8VReg, false) }}, {want: "8220069b", setup: func(i *instruction) { - i.asALURRRR(aluOpMAdd, operandNR(x2VReg), operandNR(x4VReg), operandNR(x6VReg), operandNR(x8VReg), true) + i.asALURRRR(aluOpMAdd, x2VReg, operandNR(x4VReg), operandNR(x6VReg), x8VReg, true) }}, {want: "82a0061b", setup: func(i *instruction) { - i.asALURRRR(aluOpMSub, operandNR(x2VReg), operandNR(x4VReg), operandNR(x6VReg), operandNR(x8VReg), false) + i.asALURRRR(aluOpMSub, x2VReg, operandNR(x4VReg), operandNR(x6VReg), x8VReg, false) }}, {want: "82a0069b", setup: func(i *instruction) { - i.asALURRRR(aluOpMSub, operandNR(x2VReg), operandNR(x4VReg), operandNR(x6VReg), operandNR(x8VReg), true) + i.asALURRRR(aluOpMSub, x2VReg, operandNR(x4VReg), operandNR(x6VReg), x8VReg, true) }}, {want: "00213f1e", setup: func(i *instruction) { i.asFpuCmp(operandNR(v8VReg), operandNR(v31VReg), false) }}, {want: "00217f1e", setup: func(i *instruction) { i.asFpuCmp(operandNR(v8VReg), operandNR(v31VReg), true) }}, @@ -1200,76 +1201,76 @@ func TestInstruction_encode(t *testing.T) { {want: "f21fbf0e", setup: func(i *instruction) { i.asFpuMov64(v18VReg, v31VReg) }}, {want: "f21fbf4e", setup: func(i *instruction) { i.asFpuMov128(v18VReg, v31VReg) }}, {want: "40a034ab", setup: func(i *instruction) { - i.asALU(aluOpAddS, operandNR(x0VReg), operandNR(x2VReg), operandER(x20VReg, extendOpSXTH, 64), false) + i.asALU(aluOpAddS, x0VReg, operandNR(x2VReg), operandER(x20VReg, extendOpSXTH, 64), false) }}, {want: "4080348b", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(x0VReg), operandNR(x2VReg), operandER(x20VReg, extendOpSXTB, 64), false) + i.asALU(aluOpAdd, x0VReg, operandNR(x2VReg), operandER(x20VReg, extendOpSXTB, 64), false) }}, {want: "40a0348b", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(x0VReg), operandNR(x2VReg), operandER(x20VReg, extendOpSXTH, 64), false) + i.asALU(aluOpAdd, x0VReg, operandNR(x2VReg), operandER(x20VReg, extendOpSXTH, 64), false) }}, {want: "40c0348b", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(x0VReg), operandNR(x2VReg), operandER(x20VReg, extendOpSXTW, 64), false) + i.asALU(aluOpAdd, x0VReg, operandNR(x2VReg), operandER(x20VReg, extendOpSXTW, 64), false) }}, {want: "4080340b", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(x0VReg), operandNR(x2VReg), 
operandER(x20VReg, extendOpSXTB, 32), false) + i.asALU(aluOpAdd, x0VReg, operandNR(x2VReg), operandER(x20VReg, extendOpSXTB, 32), false) }}, {want: "40a0340b", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(x0VReg), operandNR(x2VReg), operandER(x20VReg, extendOpSXTH, 32), false) + i.asALU(aluOpAdd, x0VReg, operandNR(x2VReg), operandER(x20VReg, extendOpSXTH, 32), false) }}, {want: "40c0340b", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(x0VReg), operandNR(x2VReg), operandER(x20VReg, extendOpSXTW, 32), false) + i.asALU(aluOpAdd, x0VReg, operandNR(x2VReg), operandER(x20VReg, extendOpSXTW, 32), false) }}, {want: "400034eb", setup: func(i *instruction) { - i.asALU(aluOpSubS, operandNR(x0VReg), operandNR(x2VReg), operandER(x20VReg, extendOpUXTB, 64), false) + i.asALU(aluOpSubS, x0VReg, operandNR(x2VReg), operandER(x20VReg, extendOpUXTB, 64), false) }}, {want: "400034cb", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(x0VReg), operandNR(x2VReg), operandER(x20VReg, extendOpUXTB, 64), false) + i.asALU(aluOpSub, x0VReg, operandNR(x2VReg), operandER(x20VReg, extendOpUXTB, 64), false) }}, {want: "402034cb", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(x0VReg), operandNR(x2VReg), operandER(x20VReg, extendOpUXTH, 64), false) + i.asALU(aluOpSub, x0VReg, operandNR(x2VReg), operandER(x20VReg, extendOpUXTH, 64), false) }}, {want: "404034cb", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(x0VReg), operandNR(x2VReg), operandER(x20VReg, extendOpUXTW, 64), false) + i.asALU(aluOpSub, x0VReg, operandNR(x2VReg), operandER(x20VReg, extendOpUXTW, 64), false) }}, {want: "4000344b", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(x0VReg), operandNR(x2VReg), operandER(x20VReg, extendOpUXTB, 32), false) + i.asALU(aluOpSub, x0VReg, operandNR(x2VReg), operandER(x20VReg, extendOpUXTB, 32), false) }}, {want: "4020344b", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(x0VReg), operandNR(x2VReg), operandER(x20VReg, extendOpUXTH, 32), false) + i.asALU(aluOpSub, x0VReg, operandNR(x2VReg), operandER(x20VReg, extendOpUXTH, 32), false) }}, {want: "4040344b", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(x0VReg), operandNR(x2VReg), operandER(x20VReg, extendOpUXTW, 32), false) + i.asALU(aluOpSub, x0VReg, operandNR(x2VReg), operandER(x20VReg, extendOpUXTW, 32), false) }}, {want: "4000140b", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(x0VReg), operandNR(x2VReg), operandNR(x20VReg), false) + i.asALU(aluOpAdd, x0VReg, operandNR(x2VReg), operandNR(x20VReg), false) }}, {want: "4000148b", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(x0VReg), operandNR(x2VReg), operandNR(x20VReg), true) + i.asALU(aluOpAdd, x0VReg, operandNR(x2VReg), operandNR(x20VReg), true) }}, {want: "40001f8b", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(x0VReg), operandNR(x2VReg), operandNR(xzrVReg), true) + i.asALU(aluOpAdd, x0VReg, operandNR(x2VReg), operandNR(xzrVReg), true) }}, {want: "4000142b", setup: func(i *instruction) { - i.asALU(aluOpAddS, operandNR(x0VReg), operandNR(x2VReg), operandNR(x20VReg), false) + i.asALU(aluOpAddS, x0VReg, operandNR(x2VReg), operandNR(x20VReg), false) }}, {want: "400014ab", setup: func(i *instruction) { - i.asALU(aluOpAddS, operandNR(x0VReg), operandNR(x2VReg), operandNR(x20VReg), true) + i.asALU(aluOpAddS, x0VReg, operandNR(x2VReg), operandNR(x20VReg), true) }}, {want: "4000144b", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(x0VReg), operandNR(x2VReg), 
operandNR(x20VReg), false) + i.asALU(aluOpSub, x0VReg, operandNR(x2VReg), operandNR(x20VReg), false) }}, {want: "400014cb", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(x0VReg), operandNR(x2VReg), operandNR(x20VReg), true) + i.asALU(aluOpSub, x0VReg, operandNR(x2VReg), operandNR(x20VReg), true) }}, {want: "40001fcb", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(x0VReg), operandNR(x2VReg), operandNR(xzrVReg), true) + i.asALU(aluOpSub, x0VReg, operandNR(x2VReg), operandNR(xzrVReg), true) }}, {want: "400014eb", setup: func(i *instruction) { - i.asALU(aluOpSubS, operandNR(x0VReg), operandNR(x2VReg), operandNR(x20VReg), true) + i.asALU(aluOpSubS, x0VReg, operandNR(x2VReg), operandNR(x20VReg), true) }}, {want: "40001feb", setup: func(i *instruction) { - i.asALU(aluOpSubS, operandNR(x0VReg), operandNR(x2VReg), operandNR(xzrVReg), true) + i.asALU(aluOpSubS, x0VReg, operandNR(x2VReg), operandNR(xzrVReg), true) }}, {want: "c0035fd6", setup: func(i *instruction) { i.asRet() }}, {want: "e303042a", setup: func(i *instruction) { i.asMove32(x3VReg, x4VReg) }}, @@ -1279,22 +1280,22 @@ func TestInstruction_encode(t *testing.T) { {want: "9f000091", setup: func(i *instruction) { i.asMove64(spVReg, x4VReg) }}, {want: "e0030091", setup: func(i *instruction) { i.asMove64(x0VReg, spVReg) }}, {want: "e17bc1a8", setup: func(i *instruction) { - i.asLoadPair64(x1VReg, x30VReg, addressModePreOrPostIndex(spVReg, 16, false)) + i.asLoadPair64(x1VReg, x30VReg, addressModePreOrPostIndex(m, spVReg, 16, false)) }}, {want: "e17bc1a9", setup: func(i *instruction) { - i.asLoadPair64(x1VReg, x30VReg, addressModePreOrPostIndex(spVReg, 16, true)) + i.asLoadPair64(x1VReg, x30VReg, addressModePreOrPostIndex(m, spVReg, 16, true)) }}, {want: "e17b81a8", setup: func(i *instruction) { - i.asStorePair64(x1VReg, x30VReg, addressModePreOrPostIndex(spVReg, 16, false)) + i.asStorePair64(x1VReg, x30VReg, addressModePreOrPostIndex(m, spVReg, 16, false)) }}, {want: "e17b81a9", setup: func(i *instruction) { - i.asStorePair64(x1VReg, x30VReg, addressModePreOrPostIndex(spVReg, 16, true)) + i.asStorePair64(x1VReg, x30VReg, addressModePreOrPostIndex(m, spVReg, 16, true)) }}, {want: "e17f81a9", setup: func(i *instruction) { - i.asStorePair64(x1VReg, xzrVReg, addressModePreOrPostIndex(spVReg, 16, true)) + i.asStorePair64(x1VReg, xzrVReg, addressModePreOrPostIndex(m, spVReg, 16, true)) }}, {want: "ff7f81a9", setup: func(i *instruction) { - i.asStorePair64(xzrVReg, xzrVReg, addressModePreOrPostIndex(spVReg, 16, true)) + i.asStorePair64(xzrVReg, xzrVReg, addressModePreOrPostIndex(m, spVReg, 16, true)) }}, {want: "20000014", setup: func(i *instruction) { i.asBr(dummyLabel) @@ -1317,478 +1318,478 @@ func TestInstruction_encode(t *testing.T) { i.condBrOffsetResolve(0x80) }}, {want: "8328321e", setup: func(i *instruction) { - i.asFpuRRR(fpuBinOpAdd, operandNR(v3VReg), operandNR(v4VReg), operandNR(v18VReg), false) + i.asFpuRRR(fpuBinOpAdd, v3VReg, operandNR(v4VReg), operandNR(v18VReg), false) }}, {want: "8328721e", setup: func(i *instruction) { - i.asFpuRRR(fpuBinOpAdd, operandNR(v3VReg), operandNR(v4VReg), operandNR(v18VReg), true) + i.asFpuRRR(fpuBinOpAdd, v3VReg, operandNR(v4VReg), operandNR(v18VReg), true) }}, {want: "8338321e", setup: func(i *instruction) { - i.asFpuRRR(fpuBinOpSub, operandNR(v3VReg), operandNR(v4VReg), operandNR(v18VReg), false) + i.asFpuRRR(fpuBinOpSub, v3VReg, operandNR(v4VReg), operandNR(v18VReg), false) }}, {want: "8338721e", setup: func(i *instruction) { - i.asFpuRRR(fpuBinOpSub, operandNR(v3VReg), 
operandNR(v4VReg), operandNR(v18VReg), true) + i.asFpuRRR(fpuBinOpSub, v3VReg, operandNR(v4VReg), operandNR(v18VReg), true) }}, {want: "8308321e", setup: func(i *instruction) { - i.asFpuRRR(fpuBinOpMul, operandNR(v3VReg), operandNR(v4VReg), operandNR(v18VReg), false) + i.asFpuRRR(fpuBinOpMul, v3VReg, operandNR(v4VReg), operandNR(v18VReg), false) }}, {want: "8308721e", setup: func(i *instruction) { - i.asFpuRRR(fpuBinOpMul, operandNR(v3VReg), operandNR(v4VReg), operandNR(v18VReg), true) + i.asFpuRRR(fpuBinOpMul, v3VReg, operandNR(v4VReg), operandNR(v18VReg), true) }}, {want: "8318321e", setup: func(i *instruction) { - i.asFpuRRR(fpuBinOpDiv, operandNR(v3VReg), operandNR(v4VReg), operandNR(v18VReg), false) + i.asFpuRRR(fpuBinOpDiv, v3VReg, operandNR(v4VReg), operandNR(v18VReg), false) }}, {want: "8318721e", setup: func(i *instruction) { - i.asFpuRRR(fpuBinOpDiv, operandNR(v3VReg), operandNR(v4VReg), operandNR(v18VReg), true) + i.asFpuRRR(fpuBinOpDiv, v3VReg, operandNR(v4VReg), operandNR(v18VReg), true) }}, {want: "8348321e", setup: func(i *instruction) { - i.asFpuRRR(fpuBinOpMax, operandNR(v3VReg), operandNR(v4VReg), operandNR(v18VReg), false) + i.asFpuRRR(fpuBinOpMax, v3VReg, operandNR(v4VReg), operandNR(v18VReg), false) }}, {want: "8348721e", setup: func(i *instruction) { - i.asFpuRRR(fpuBinOpMax, operandNR(v3VReg), operandNR(v4VReg), operandNR(v18VReg), true) + i.asFpuRRR(fpuBinOpMax, v3VReg, operandNR(v4VReg), operandNR(v18VReg), true) }}, {want: "8358321e", setup: func(i *instruction) { - i.asFpuRRR(fpuBinOpMin, operandNR(v3VReg), operandNR(v4VReg), operandNR(v18VReg), false) + i.asFpuRRR(fpuBinOpMin, v3VReg, operandNR(v4VReg), operandNR(v18VReg), false) }}, {want: "8358721e", setup: func(i *instruction) { - i.asFpuRRR(fpuBinOpMin, operandNR(v3VReg), operandNR(v4VReg), operandNR(v18VReg), true) + i.asFpuRRR(fpuBinOpMin, v3VReg, operandNR(v4VReg), operandNR(v18VReg), true) }}, {want: "49fd7f11", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(x9VReg), operandNR(x10VReg), operandImm12(0b111111111111, 0b1), false) + i.asALU(aluOpAdd, x9VReg, operandNR(x10VReg), operandImm12(0b111111111111, 0b1), false) }}, {want: "e9ff7f91", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(x9VReg), operandNR(spVReg), operandImm12(0b111111111111, 0b1), true) + i.asALU(aluOpAdd, x9VReg, operandNR(spVReg), operandImm12(0b111111111111, 0b1), true) }}, {want: "49fd3f11", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(x9VReg), operandNR(x10VReg), operandImm12(0b111111111111, 0b0), false) + i.asALU(aluOpAdd, x9VReg, operandNR(x10VReg), operandImm12(0b111111111111, 0b0), false) }}, {want: "5ffd3f91", setup: func(i *instruction) { - i.asALU(aluOpAdd, operandNR(spVReg), operandNR(x10VReg), operandImm12(0b111111111111, 0b0), true) + i.asALU(aluOpAdd, spVReg, operandNR(x10VReg), operandImm12(0b111111111111, 0b0), true) }}, {want: "49fd7f31", setup: func(i *instruction) { - i.asALU(aluOpAddS, operandNR(x9VReg), operandNR(x10VReg), operandImm12(0b111111111111, 0b1), false) + i.asALU(aluOpAddS, x9VReg, operandNR(x10VReg), operandImm12(0b111111111111, 0b1), false) }}, {want: "49fd7fb1", setup: func(i *instruction) { - i.asALU(aluOpAddS, operandNR(x9VReg), operandNR(x10VReg), operandImm12(0b111111111111, 0b1), true) + i.asALU(aluOpAddS, x9VReg, operandNR(x10VReg), operandImm12(0b111111111111, 0b1), true) }}, {want: "49fd3f31", setup: func(i *instruction) { - i.asALU(aluOpAddS, operandNR(x9VReg), operandNR(x10VReg), operandImm12(0b111111111111, 0b0), false) + i.asALU(aluOpAddS, x9VReg, 
operandNR(x10VReg), operandImm12(0b111111111111, 0b0), false) }}, {want: "49fd3fb1", setup: func(i *instruction) { - i.asALU(aluOpAddS, operandNR(x9VReg), operandNR(x10VReg), operandImm12(0b111111111111, 0b0), true) + i.asALU(aluOpAddS, x9VReg, operandNR(x10VReg), operandImm12(0b111111111111, 0b0), true) }}, {want: "49fd7f51", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(x9VReg), operandNR(x10VReg), operandImm12(0b111111111111, 0b1), false) + i.asALU(aluOpSub, x9VReg, operandNR(x10VReg), operandImm12(0b111111111111, 0b1), false) }}, {want: "e9ff7fd1", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(x9VReg), operandNR(spVReg), operandImm12(0b111111111111, 0b1), true) + i.asALU(aluOpSub, x9VReg, operandNR(spVReg), operandImm12(0b111111111111, 0b1), true) }}, {want: "49fd3f51", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(x9VReg), operandNR(x10VReg), operandImm12(0b111111111111, 0b0), false) + i.asALU(aluOpSub, x9VReg, operandNR(x10VReg), operandImm12(0b111111111111, 0b0), false) }}, {want: "5ffd3fd1", setup: func(i *instruction) { - i.asALU(aluOpSub, operandNR(spVReg), operandNR(x10VReg), operandImm12(0b111111111111, 0b0), true) + i.asALU(aluOpSub, spVReg, operandNR(x10VReg), operandImm12(0b111111111111, 0b0), true) }}, {want: "49fd7f71", setup: func(i *instruction) { - i.asALU(aluOpSubS, operandNR(x9VReg), operandNR(x10VReg), operandImm12(0b111111111111, 0b1), false) + i.asALU(aluOpSubS, x9VReg, operandNR(x10VReg), operandImm12(0b111111111111, 0b1), false) }}, {want: "49fd7ff1", setup: func(i *instruction) { - i.asALU(aluOpSubS, operandNR(x9VReg), operandNR(x10VReg), operandImm12(0b111111111111, 0b1), true) + i.asALU(aluOpSubS, x9VReg, operandNR(x10VReg), operandImm12(0b111111111111, 0b1), true) }}, {want: "49fd3f71", setup: func(i *instruction) { - i.asALU(aluOpSubS, operandNR(x9VReg), operandNR(x10VReg), operandImm12(0b111111111111, 0b0), false) + i.asALU(aluOpSubS, x9VReg, operandNR(x10VReg), operandImm12(0b111111111111, 0b0), false) }}, {want: "49fd3ff1", setup: func(i *instruction) { - i.asALU(aluOpSubS, operandNR(x9VReg), operandNR(x10VReg), operandImm12(0b111111111111, 0b0), true) + i.asALU(aluOpSubS, x9VReg, operandNR(x10VReg), operandImm12(0b111111111111, 0b0), true) }}, {want: "4020d41a", setup: func(i *instruction) { - i.asALU(aluOpLsl, operandNR(x0VReg), operandNR(x2VReg), operandNR(x20VReg), false) + i.asALU(aluOpLsl, x0VReg, operandNR(x2VReg), operandNR(x20VReg), false) }}, {want: "4020d49a", setup: func(i *instruction) { - i.asALU(aluOpLsl, operandNR(x0VReg), operandNR(x2VReg), operandNR(x20VReg), true) + i.asALU(aluOpLsl, x0VReg, operandNR(x2VReg), operandNR(x20VReg), true) }}, {want: "4024d41a", setup: func(i *instruction) { - i.asALU(aluOpLsr, operandNR(x0VReg), operandNR(x2VReg), operandNR(x20VReg), false) + i.asALU(aluOpLsr, x0VReg, operandNR(x2VReg), operandNR(x20VReg), false) }}, {want: "4024d49a", setup: func(i *instruction) { - i.asALU(aluOpLsr, operandNR(x0VReg), operandNR(x2VReg), operandNR(x20VReg), true) + i.asALU(aluOpLsr, x0VReg, operandNR(x2VReg), operandNR(x20VReg), true) }}, {want: "4028d41a", setup: func(i *instruction) { - i.asALU(aluOpAsr, operandNR(x0VReg), operandNR(x2VReg), operandNR(x20VReg), false) + i.asALU(aluOpAsr, x0VReg, operandNR(x2VReg), operandNR(x20VReg), false) }}, {want: "4028d49a", setup: func(i *instruction) { - i.asALU(aluOpAsr, operandNR(x0VReg), operandNR(x2VReg), operandNR(x20VReg), true) + i.asALU(aluOpAsr, x0VReg, operandNR(x2VReg), operandNR(x20VReg), true) }}, {want: "400cd49a", setup: 
func(i *instruction) { - i.asALU(aluOpSDiv, operandNR(x0VReg), operandNR(x2VReg), operandNR(x20VReg), true) + i.asALU(aluOpSDiv, x0VReg, operandNR(x2VReg), operandNR(x20VReg), true) }}, {want: "400cd41a", setup: func(i *instruction) { - i.asALU(aluOpSDiv, operandNR(x0VReg), operandNR(x2VReg), operandNR(x20VReg), false) + i.asALU(aluOpSDiv, x0VReg, operandNR(x2VReg), operandNR(x20VReg), false) }}, {want: "4008d49a", setup: func(i *instruction) { - i.asALU(aluOpUDiv, operandNR(x0VReg), operandNR(x2VReg), operandNR(x20VReg), true) + i.asALU(aluOpUDiv, x0VReg, operandNR(x2VReg), operandNR(x20VReg), true) }}, {want: "4008d41a", setup: func(i *instruction) { - i.asALU(aluOpUDiv, operandNR(x0VReg), operandNR(x2VReg), operandNR(x20VReg), false) + i.asALU(aluOpUDiv, x0VReg, operandNR(x2VReg), operandNR(x20VReg), false) }}, {want: "407c0013", setup: func(i *instruction) { - i.asALUShift(aluOpAsr, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(0), false) + i.asALUShift(aluOpAsr, x0VReg, operandNR(x2VReg), operandShiftImm(0), false) }}, {want: "40fc4093", setup: func(i *instruction) { - i.asALUShift(aluOpAsr, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(0), true) + i.asALUShift(aluOpAsr, x0VReg, operandNR(x2VReg), operandShiftImm(0), true) }}, {want: "407c0113", setup: func(i *instruction) { - i.asALUShift(aluOpAsr, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(1), false) + i.asALUShift(aluOpAsr, x0VReg, operandNR(x2VReg), operandShiftImm(1), false) }}, {want: "407c1f13", setup: func(i *instruction) { - i.asALUShift(aluOpAsr, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(31), false) + i.asALUShift(aluOpAsr, x0VReg, operandNR(x2VReg), operandShiftImm(31), false) }}, {want: "40fc4193", setup: func(i *instruction) { - i.asALUShift(aluOpAsr, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(1), true) + i.asALUShift(aluOpAsr, x0VReg, operandNR(x2VReg), operandShiftImm(1), true) }}, {want: "40fc5f93", setup: func(i *instruction) { - i.asALUShift(aluOpAsr, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(31), true) + i.asALUShift(aluOpAsr, x0VReg, operandNR(x2VReg), operandShiftImm(31), true) }}, {want: "40fc7f93", setup: func(i *instruction) { - i.asALUShift(aluOpAsr, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(63), true) + i.asALUShift(aluOpAsr, x0VReg, operandNR(x2VReg), operandShiftImm(63), true) }}, {want: "407c0153", setup: func(i *instruction) { - i.asALUShift(aluOpLsr, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(1), false) + i.asALUShift(aluOpLsr, x0VReg, operandNR(x2VReg), operandShiftImm(1), false) }}, {want: "407c1f53", setup: func(i *instruction) { - i.asALUShift(aluOpLsr, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(31), false) + i.asALUShift(aluOpLsr, x0VReg, operandNR(x2VReg), operandShiftImm(31), false) }}, {want: "40fc41d3", setup: func(i *instruction) { - i.asALUShift(aluOpLsr, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(1), true) + i.asALUShift(aluOpLsr, x0VReg, operandNR(x2VReg), operandShiftImm(1), true) }}, {want: "40fc5fd3", setup: func(i *instruction) { - i.asALUShift(aluOpLsr, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(31), true) + i.asALUShift(aluOpLsr, x0VReg, operandNR(x2VReg), operandShiftImm(31), true) }}, {want: "40fc7fd3", setup: func(i *instruction) { - i.asALUShift(aluOpLsr, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(63), true) + i.asALUShift(aluOpLsr, x0VReg, operandNR(x2VReg), operandShiftImm(63), true) }}, {want: "407c0053", setup: func(i *instruction) { - 
i.asALUShift(aluOpLsl, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(0), false) + i.asALUShift(aluOpLsl, x0VReg, operandNR(x2VReg), operandShiftImm(0), false) }}, {want: "40fc40d3", setup: func(i *instruction) { - i.asALUShift(aluOpLsl, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(0), true) + i.asALUShift(aluOpLsl, x0VReg, operandNR(x2VReg), operandShiftImm(0), true) }}, {want: "40781f53", setup: func(i *instruction) { - i.asALUShift(aluOpLsl, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(1), false) + i.asALUShift(aluOpLsl, x0VReg, operandNR(x2VReg), operandShiftImm(1), false) }}, {want: "40000153", setup: func(i *instruction) { - i.asALUShift(aluOpLsl, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(31), false) + i.asALUShift(aluOpLsl, x0VReg, operandNR(x2VReg), operandShiftImm(31), false) }}, {want: "40f87fd3", setup: func(i *instruction) { - i.asALUShift(aluOpLsl, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(1), true) + i.asALUShift(aluOpLsl, x0VReg, operandNR(x2VReg), operandShiftImm(1), true) }}, {want: "408061d3", setup: func(i *instruction) { - i.asALUShift(aluOpLsl, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(31), true) + i.asALUShift(aluOpLsl, x0VReg, operandNR(x2VReg), operandShiftImm(31), true) }}, {want: "400041d3", setup: func(i *instruction) { - i.asALUShift(aluOpLsl, operandNR(x0VReg), operandNR(x2VReg), operandShiftImm(63), true) + i.asALUShift(aluOpLsl, x0VReg, operandNR(x2VReg), operandShiftImm(63), true) }}, {want: "4000c05a", setup: func(i *instruction) { i.asBitRR(bitOpRbit, x0VReg, x2VReg, false) }}, {want: "4000c0da", setup: func(i *instruction) { i.asBitRR(bitOpRbit, x0VReg, x2VReg, true) }}, {want: "4010c05a", setup: func(i *instruction) { i.asBitRR(bitOpClz, x0VReg, x2VReg, false) }}, {want: "4010c0da", setup: func(i *instruction) { i.asBitRR(bitOpClz, x0VReg, x2VReg, true) }}, {want: "4138302e", setup: func(i *instruction) { - i.asVecLanes(vecOpUaddlv, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8B) + i.asVecLanes(vecOpUaddlv, v1VReg, operandNR(v2VReg), vecArrangement8B) }}, {want: "4138306e", setup: func(i *instruction) { - i.asVecLanes(vecOpUaddlv, operandNR(v1VReg), operandNR(v2VReg), vecArrangement16B) + i.asVecLanes(vecOpUaddlv, v1VReg, operandNR(v2VReg), vecArrangement16B) }}, {want: "4138702e", setup: func(i *instruction) { - i.asVecLanes(vecOpUaddlv, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4H) + i.asVecLanes(vecOpUaddlv, v1VReg, operandNR(v2VReg), vecArrangement4H) }}, {want: "4138706e", setup: func(i *instruction) { - i.asVecLanes(vecOpUaddlv, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8H) + i.asVecLanes(vecOpUaddlv, v1VReg, operandNR(v2VReg), vecArrangement8H) }}, {want: "4138b06e", setup: func(i *instruction) { - i.asVecLanes(vecOpUaddlv, operandNR(v1VReg), operandNR(v2VReg), vecArrangement4S) + i.asVecLanes(vecOpUaddlv, v1VReg, operandNR(v2VReg), vecArrangement4S) }}, {want: "41c0230e", setup: func(i *instruction) { - i.asVecRRR(vecOpSmull, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpSmull, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "41c0630e", setup: func(i *instruction) { - i.asVecRRR(vecOpSmull, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpSmull, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "41c0a30e", setup: func(i *instruction) { - i.asVecRRR(vecOpSmull, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), 
vecArrangement4S) + i.asVecRRR(vecOpSmull, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "41c0234e", setup: func(i *instruction) { - i.asVecRRR(vecOpSmull2, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpSmull2, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "41c0634e", setup: func(i *instruction) { - i.asVecRRR(vecOpSmull2, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpSmull2, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "41c0a34e", setup: func(i *instruction) { - i.asVecRRR(vecOpSmull2, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) + i.asVecRRR(vecOpSmull2, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4S) }}, {want: "411c630e", setup: func(i *instruction) { - i.asVecRRR(vecOpBic, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpBic, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "411c634e", setup: func(i *instruction) { - i.asVecRRR(vecOpBic, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpBic, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "411c632e", setup: func(i *instruction) { - i.asVecRRRRewrite(vecOpBsl, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRRRewrite(vecOpBsl, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "411c636e", setup: func(i *instruction) { - i.asVecRRRRewrite(vecOpBsl, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRRRewrite(vecOpBsl, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "4158202e", setup: func(i *instruction) { - i.asVecMisc(vecOpNot, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8B) + i.asVecMisc(vecOpNot, v1VReg, operandNR(v2VReg), vecArrangement8B) }}, {want: "4158206e", setup: func(i *instruction) { - i.asVecMisc(vecOpNot, operandNR(v1VReg), operandNR(v2VReg), vecArrangement16B) + i.asVecMisc(vecOpNot, v1VReg, operandNR(v2VReg), vecArrangement16B) }}, {want: "411c230e", setup: func(i *instruction) { - i.asVecRRR(vecOpAnd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpAnd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "411c234e", setup: func(i *instruction) { - i.asVecRRR(vecOpAnd, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpAnd, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "411ca30e", setup: func(i *instruction) { - i.asVecRRR(vecOpOrr, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpOrr, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "411ca34e", setup: func(i *instruction) { - i.asVecRRR(vecOpOrr, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpOrr, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "4144230e", setup: func(i *instruction) { - i.asVecRRR(vecOpSshl, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpSshl, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "4144234e", setup: func(i *instruction) { - i.asVecRRR(vecOpSshl, operandNR(v1VReg), operandNR(v2VReg), 
operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpSshl, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "4144630e", setup: func(i *instruction) { - i.asVecRRR(vecOpSshl, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpSshl, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "4144634e", setup: func(i *instruction) { - i.asVecRRR(vecOpSshl, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpSshl, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "4144a30e", setup: func(i *instruction) { - i.asVecRRR(vecOpSshl, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpSshl, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "4144e34e", setup: func(i *instruction) { - i.asVecRRR(vecOpSshl, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpSshl, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "4144a30e", setup: func(i *instruction) { - i.asVecRRR(vecOpSshl, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpSshl, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "4144232e", setup: func(i *instruction) { - i.asVecRRR(vecOpUshl, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) + i.asVecRRR(vecOpUshl, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8B) }}, {want: "4144236e", setup: func(i *instruction) { - i.asVecRRR(vecOpUshl, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) + i.asVecRRR(vecOpUshl, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement16B) }}, {want: "4144632e", setup: func(i *instruction) { - i.asVecRRR(vecOpUshl, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) + i.asVecRRR(vecOpUshl, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement4H) }}, {want: "4144636e", setup: func(i *instruction) { - i.asVecRRR(vecOpUshl, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) + i.asVecRRR(vecOpUshl, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement8H) }}, {want: "4144a32e", setup: func(i *instruction) { - i.asVecRRR(vecOpUshl, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpUshl, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, {want: "4144e36e", setup: func(i *instruction) { - i.asVecRRR(vecOpUshl, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) + i.asVecRRR(vecOpUshl, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2D) }}, {want: "4144a30e", setup: func(i *instruction) { - i.asVecRRR(vecOpSshl, operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) + i.asVecRRR(vecOpSshl, v1VReg, operandNR(v2VReg), operandNR(v3VReg), vecArrangement2S) }}, - {want: "4158200e", setup: func(i *instruction) { i.asVecMisc(vecOpCnt, operandNR(v1VReg), operandNR(v2VReg), vecArrangement8B) }}, - {want: "4158204e", setup: func(i *instruction) { i.asVecMisc(vecOpCnt, operandNR(v1VReg), operandNR(v2VReg), vecArrangement16B) }}, + {want: "4158200e", setup: func(i *instruction) { i.asVecMisc(vecOpCnt, v1VReg, operandNR(v2VReg), vecArrangement8B) }}, + {want: "4158204e", setup: func(i *instruction) { i.asVecMisc(vecOpCnt, v1VReg, operandNR(v2VReg), vecArrangement16B) }}, {want: "41c0221e", setup: 
func(i *instruction) { - i.asFpuRR(fpuUniOpCvt32To64, operandNR(v1VReg), operandNR(v2VReg), true) + i.asFpuRR(fpuUniOpCvt32To64, v1VReg, operandNR(v2VReg), true) }}, {want: "4140621e", setup: func(i *instruction) { - i.asFpuRR(fpuUniOpCvt64To32, operandNR(v1VReg), operandNR(v2VReg), true) + i.asFpuRR(fpuUniOpCvt64To32, v1VReg, operandNR(v2VReg), true) }}, {want: "4140211e", setup: func(i *instruction) { - i.asFpuRR(fpuUniOpNeg, operandNR(v1VReg), operandNR(v2VReg), false) + i.asFpuRR(fpuUniOpNeg, v1VReg, operandNR(v2VReg), false) }}, {want: "41c0211e", setup: func(i *instruction) { - i.asFpuRR(fpuUniOpSqrt, operandNR(v1VReg), operandNR(v2VReg), false) + i.asFpuRR(fpuUniOpSqrt, v1VReg, operandNR(v2VReg), false) }}, {want: "41c0611e", setup: func(i *instruction) { - i.asFpuRR(fpuUniOpSqrt, operandNR(v1VReg), operandNR(v2VReg), true) + i.asFpuRR(fpuUniOpSqrt, v1VReg, operandNR(v2VReg), true) }}, {want: "41c0241e", setup: func(i *instruction) { - i.asFpuRR(fpuUniOpRoundPlus, operandNR(v1VReg), operandNR(v2VReg), false) + i.asFpuRR(fpuUniOpRoundPlus, v1VReg, operandNR(v2VReg), false) }}, {want: "41c0641e", setup: func(i *instruction) { - i.asFpuRR(fpuUniOpRoundPlus, operandNR(v1VReg), operandNR(v2VReg), true) + i.asFpuRR(fpuUniOpRoundPlus, v1VReg, operandNR(v2VReg), true) }}, {want: "4140251e", setup: func(i *instruction) { - i.asFpuRR(fpuUniOpRoundMinus, operandNR(v1VReg), operandNR(v2VReg), false) + i.asFpuRR(fpuUniOpRoundMinus, v1VReg, operandNR(v2VReg), false) }}, {want: "4140651e", setup: func(i *instruction) { - i.asFpuRR(fpuUniOpRoundMinus, operandNR(v1VReg), operandNR(v2VReg), true) + i.asFpuRR(fpuUniOpRoundMinus, v1VReg, operandNR(v2VReg), true) }}, {want: "41c0251e", setup: func(i *instruction) { - i.asFpuRR(fpuUniOpRoundZero, operandNR(v1VReg), operandNR(v2VReg), false) + i.asFpuRR(fpuUniOpRoundZero, v1VReg, operandNR(v2VReg), false) }}, {want: "41c0651e", setup: func(i *instruction) { - i.asFpuRR(fpuUniOpRoundZero, operandNR(v1VReg), operandNR(v2VReg), true) + i.asFpuRR(fpuUniOpRoundZero, v1VReg, operandNR(v2VReg), true) }}, {want: "4140241e", setup: func(i *instruction) { - i.asFpuRR(fpuUniOpRoundNearest, operandNR(v1VReg), operandNR(v2VReg), false) + i.asFpuRR(fpuUniOpRoundNearest, v1VReg, operandNR(v2VReg), false) }}, {want: "4140641e", setup: func(i *instruction) { - i.asFpuRR(fpuUniOpRoundNearest, operandNR(v1VReg), operandNR(v2VReg), true) + i.asFpuRR(fpuUniOpRoundNearest, v1VReg, operandNR(v2VReg), true) }}, - {want: "4140611e", setup: func(i *instruction) { i.asFpuRR(fpuUniOpNeg, operandNR(v1VReg), operandNR(v2VReg), true) }}, - {want: "41c0404d", setup: func(i *instruction) { i.asVecLoad1R(operandNR(v1VReg), operandNR(x2VReg), vecArrangement16B) }}, - {want: "41c4404d", setup: func(i *instruction) { i.asVecLoad1R(operandNR(v1VReg), operandNR(x2VReg), vecArrangement8H) }}, - {want: "41c8404d", setup: func(i *instruction) { i.asVecLoad1R(operandNR(v1VReg), operandNR(x2VReg), vecArrangement4S) }}, - {want: "41cc404d", setup: func(i *instruction) { i.asVecLoad1R(operandNR(v1VReg), operandNR(x2VReg), vecArrangement2D) }}, + {want: "4140611e", setup: func(i *instruction) { i.asFpuRR(fpuUniOpNeg, v1VReg, operandNR(v2VReg), true) }}, + {want: "41c0404d", setup: func(i *instruction) { i.asVecLoad1R(v1VReg, operandNR(x2VReg), vecArrangement16B) }}, + {want: "41c4404d", setup: func(i *instruction) { i.asVecLoad1R(v1VReg, operandNR(x2VReg), vecArrangement8H) }}, + {want: "41c8404d", setup: func(i *instruction) { i.asVecLoad1R(v1VReg, operandNR(x2VReg), vecArrangement4S) }}, + {want: 
"41cc404d", setup: func(i *instruction) { i.asVecLoad1R(v1VReg, operandNR(x2VReg), vecArrangement2D) }}, {want: "0200e1b8", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpAdd, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 4) + i.asAtomicRmw(atomicRmwOpAdd, x0VReg, x1VReg, x2VReg, 4) }}, {want: "0200e178", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpAdd, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 2) + i.asAtomicRmw(atomicRmwOpAdd, x0VReg, x1VReg, x2VReg, 2) }}, {want: "0200e138", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpAdd, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 1) + i.asAtomicRmw(atomicRmwOpAdd, x0VReg, x1VReg, x2VReg, 1) }}, {want: "0200e1f8", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpAdd, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 8) + i.asAtomicRmw(atomicRmwOpAdd, x0VReg, x1VReg, x2VReg, 8) }}, {want: "0200e1b8", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpAdd, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 4) + i.asAtomicRmw(atomicRmwOpAdd, x0VReg, x1VReg, x2VReg, 4) }}, {want: "0200e178", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpAdd, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 2) + i.asAtomicRmw(atomicRmwOpAdd, x0VReg, x1VReg, x2VReg, 2) }}, {want: "0200e138", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpAdd, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 1) + i.asAtomicRmw(atomicRmwOpAdd, x0VReg, x1VReg, x2VReg, 1) }}, {want: "0210e1b8", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpClr, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 4) + i.asAtomicRmw(atomicRmwOpClr, x0VReg, x1VReg, x2VReg, 4) }}, {want: "0210e178", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpClr, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 2) + i.asAtomicRmw(atomicRmwOpClr, x0VReg, x1VReg, x2VReg, 2) }}, {want: "0210e138", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpClr, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 1) + i.asAtomicRmw(atomicRmwOpClr, x0VReg, x1VReg, x2VReg, 1) }}, {want: "0210e1f8", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpClr, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 8) + i.asAtomicRmw(atomicRmwOpClr, x0VReg, x1VReg, x2VReg, 8) }}, {want: "0210e1b8", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpClr, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 4) + i.asAtomicRmw(atomicRmwOpClr, x0VReg, x1VReg, x2VReg, 4) }}, {want: "0210e178", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpClr, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 2) + i.asAtomicRmw(atomicRmwOpClr, x0VReg, x1VReg, x2VReg, 2) }}, {want: "0210e138", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpClr, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 1) + i.asAtomicRmw(atomicRmwOpClr, x0VReg, x1VReg, x2VReg, 1) }}, {want: "0230e1b8", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpSet, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 4) + i.asAtomicRmw(atomicRmwOpSet, x0VReg, x1VReg, x2VReg, 4) }}, {want: "0230e178", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpSet, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 2) + i.asAtomicRmw(atomicRmwOpSet, x0VReg, x1VReg, x2VReg, 2) }}, {want: "0230e138", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpSet, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 1) + i.asAtomicRmw(atomicRmwOpSet, x0VReg, 
x1VReg, x2VReg, 1) }}, {want: "0230e1f8", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpSet, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 8) + i.asAtomicRmw(atomicRmwOpSet, x0VReg, x1VReg, x2VReg, 8) }}, {want: "0230e1b8", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpSet, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 4) + i.asAtomicRmw(atomicRmwOpSet, x0VReg, x1VReg, x2VReg, 4) }}, {want: "0230e178", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpSet, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 2) + i.asAtomicRmw(atomicRmwOpSet, x0VReg, x1VReg, x2VReg, 2) }}, {want: "0230e138", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpSet, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 1) + i.asAtomicRmw(atomicRmwOpSet, x0VReg, x1VReg, x2VReg, 1) }}, {want: "0220e1b8", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpEor, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 4) + i.asAtomicRmw(atomicRmwOpEor, x0VReg, x1VReg, x2VReg, 4) }}, {want: "0220e178", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpEor, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 2) + i.asAtomicRmw(atomicRmwOpEor, x0VReg, x1VReg, x2VReg, 2) }}, {want: "0220e138", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpEor, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 1) + i.asAtomicRmw(atomicRmwOpEor, x0VReg, x1VReg, x2VReg, 1) }}, {want: "0220e1f8", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpEor, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 8) + i.asAtomicRmw(atomicRmwOpEor, x0VReg, x1VReg, x2VReg, 8) }}, {want: "0220e1b8", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpEor, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 4) + i.asAtomicRmw(atomicRmwOpEor, x0VReg, x1VReg, x2VReg, 4) }}, {want: "0220e178", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpEor, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 2) + i.asAtomicRmw(atomicRmwOpEor, x0VReg, x1VReg, x2VReg, 2) }}, {want: "0220e138", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpEor, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 1) + i.asAtomicRmw(atomicRmwOpEor, x0VReg, x1VReg, x2VReg, 1) }}, {want: "0280e1b8", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpSwp, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 4) + i.asAtomicRmw(atomicRmwOpSwp, x0VReg, x1VReg, x2VReg, 4) }}, {want: "0280e178", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpSwp, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 2) + i.asAtomicRmw(atomicRmwOpSwp, x0VReg, x1VReg, x2VReg, 2) }}, {want: "0280e138", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpSwp, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 1) + i.asAtomicRmw(atomicRmwOpSwp, x0VReg, x1VReg, x2VReg, 1) }}, {want: "0280e1f8", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpSwp, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 8) + i.asAtomicRmw(atomicRmwOpSwp, x0VReg, x1VReg, x2VReg, 8) }}, {want: "0280e1b8", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpSwp, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 4) + i.asAtomicRmw(atomicRmwOpSwp, x0VReg, x1VReg, x2VReg, 4) }}, {want: "0280e178", setup: func(i *instruction) { - i.asAtomicRmw(atomicRmwOpSwp, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 2) + i.asAtomicRmw(atomicRmwOpSwp, x0VReg, x1VReg, x2VReg, 2) }}, {want: "0280e138", setup: func(i *instruction) { - 
i.asAtomicRmw(atomicRmwOpSwp, operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 1) + i.asAtomicRmw(atomicRmwOpSwp, x0VReg, x1VReg, x2VReg, 1) }}, {want: "02fce188", setup: func(i *instruction) { - i.asAtomicCas(operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 4) + i.asAtomicCas(x0VReg, x1VReg, x2VReg, 4) }}, {want: "02fce148", setup: func(i *instruction) { - i.asAtomicCas(operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 2) + i.asAtomicCas(x0VReg, x1VReg, x2VReg, 2) }}, {want: "02fce108", setup: func(i *instruction) { - i.asAtomicCas(operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 1) + i.asAtomicCas(x0VReg, x1VReg, x2VReg, 1) }}, {want: "02fce1c8", setup: func(i *instruction) { - i.asAtomicCas(operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 8) + i.asAtomicCas(x0VReg, x1VReg, x2VReg, 8) }}, {want: "02fce188", setup: func(i *instruction) { - i.asAtomicCas(operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 4) + i.asAtomicCas(x0VReg, x1VReg, x2VReg, 4) }}, {want: "02fce148", setup: func(i *instruction) { - i.asAtomicCas(operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 2) + i.asAtomicCas(x0VReg, x1VReg, x2VReg, 2) }}, {want: "02fce108", setup: func(i *instruction) { - i.asAtomicCas(operandNR(x0VReg), operandNR(x1VReg), operandNR(x2VReg), 1) + i.asAtomicCas(x0VReg, x1VReg, x2VReg, 1) }}, {want: "01fcdf88", setup: func(i *instruction) { - i.asAtomicLoad(operandNR(x0VReg), operandNR(x1VReg), 4) + i.asAtomicLoad(x0VReg, x1VReg, 4) }}, {want: "01fcdf48", setup: func(i *instruction) { - i.asAtomicLoad(operandNR(x0VReg), operandNR(x1VReg), 2) + i.asAtomicLoad(x0VReg, x1VReg, 2) }}, {want: "01fcdf08", setup: func(i *instruction) { - i.asAtomicLoad(operandNR(x0VReg), operandNR(x1VReg), 1) + i.asAtomicLoad(x0VReg, x1VReg, 1) }}, {want: "01fcdfc8", setup: func(i *instruction) { - i.asAtomicLoad(operandNR(x0VReg), operandNR(x1VReg), 8) + i.asAtomicLoad(x0VReg, x1VReg, 8) }}, {want: "01fcdf88", setup: func(i *instruction) { - i.asAtomicLoad(operandNR(x0VReg), operandNR(x1VReg), 4) + i.asAtomicLoad(x0VReg, x1VReg, 4) }}, {want: "01fcdf48", setup: func(i *instruction) { - i.asAtomicLoad(operandNR(x0VReg), operandNR(x1VReg), 2) + i.asAtomicLoad(x0VReg, x1VReg, 2) }}, {want: "01fcdf08", setup: func(i *instruction) { - i.asAtomicLoad(operandNR(x0VReg), operandNR(x1VReg), 1) + i.asAtomicLoad(x0VReg, x1VReg, 1) }}, {want: "01fc9f88", setup: func(i *instruction) { i.asAtomicStore(operandNR(x0VReg), operandNR(x1VReg), 4) @@ -1823,7 +1824,7 @@ func TestInstruction_encode(t *testing.T) { for _, dst64bit := range trueFalse { i := &instruction{prev: cur} cur.next = i - i.asIntToFpu(operandNR(v2VReg), operandNR(x10VReg), rnSigned, src64bit, dst64bit) + i.asIntToFpu(v2VReg, operandNR(x10VReg), rnSigned, src64bit, dst64bit) cur = i } } @@ -1838,7 +1839,7 @@ func TestInstruction_encode(t *testing.T) { for _, dst64bit := range trueFalse { i := &instruction{prev: cur} cur.next = i - i.asFpuToInt(operandNR(v2VReg), operandNR(x10VReg), rnSigned, src64bit, dst64bit) + i.asFpuToInt(v2VReg, operandNR(x10VReg), rnSigned, src64bit, dst64bit) cur = i } } @@ -2232,12 +2233,13 @@ func TestInstruction_encoding_store_encoding(t *testing.T) { var i *instruction switch tc.k { case store8, store16, store32, store64, fpuStore32, fpuStore64, fpuStore128: - i = &instruction{kind: tc.k, amode: tc.amode, rn: operandNR(tc.rn)} + i = &instruction{kind: tc.k, rn: operandNR(tc.rn)} case uLoad8, uLoad16, uLoad32, uLoad64, sLoad8, sLoad16, sLoad32, fpuLoad32, fpuLoad64, fpuLoad128: - i = 
&instruction{kind: tc.k, amode: tc.amode, rd: operandNR(tc.rn)} + i = &instruction{kind: tc.k, rd: tc.rn} default: t.Fatalf("unknown kind: %v", tc.k) } + i.setAmode(&tc.amode) _, _, m := newSetupWithMockContext() i.encode(m) // Note: for quick iteration we can use golang.org/x/arch package to verify the encoding. diff --git a/internal/engine/wazevo/backend/isa/arm64/instr_test.go b/internal/engine/wazevo/backend/isa/arm64/instr_test.go index 6e4ec7ce2c..3b5b964eaf 100644 --- a/internal/engine/wazevo/backend/isa/arm64/instr_test.go +++ b/internal/engine/wazevo/backend/isa/arm64/instr_test.go @@ -49,7 +49,7 @@ func TestInstruction_String(t *testing.T) { i: &instruction{ kind: loadFpuConst32, u1: uint64(math.Float32bits(3.0)), - rd: operandNR(regalloc.VReg(0).SetRegType(regalloc.RegTypeFloat)), + rd: regalloc.VReg(0).SetRegType(regalloc.RegTypeFloat), }, exp: "ldr s0?, #8; b 8; data.f32 3.000000", }, @@ -57,7 +57,7 @@ func TestInstruction_String(t *testing.T) { i: &instruction{ kind: loadFpuConst64, u1: math.Float64bits(12345.987491), - rd: operandNR(regalloc.VReg(0).SetRegType(regalloc.RegTypeFloat)), + rd: regalloc.VReg(0).SetRegType(regalloc.RegTypeFloat), }, exp: "ldr d0?, #8; b 16; data.f64 12345.987491", }, diff --git a/internal/engine/wazevo/backend/isa/arm64/lower_constant.go b/internal/engine/wazevo/backend/isa/arm64/lower_constant.go index 698b382d46..6c6824fb0a 100644 --- a/internal/engine/wazevo/backend/isa/arm64/lower_constant.go +++ b/internal/engine/wazevo/backend/isa/arm64/lower_constant.go @@ -284,18 +284,18 @@ func (m *machine) load64bitConst(c int64, dst regalloc.VReg) { func (m *machine) insertMOVZ(dst regalloc.VReg, v uint64, shift int, dst64 bool) { instr := m.allocateInstr() - instr.asMOVZ(dst, v, uint64(shift), dst64) + instr.asMOVZ(dst, v, uint32(shift), dst64) m.insert(instr) } func (m *machine) insertMOVK(dst regalloc.VReg, v uint64, shift int, dst64 bool) { instr := m.allocateInstr() - instr.asMOVK(dst, v, uint64(shift), dst64) + instr.asMOVK(dst, v, uint32(shift), dst64) m.insert(instr) } func (m *machine) insertMOVN(dst regalloc.VReg, v uint64, shift int, dst64 bool) { instr := m.allocateInstr() - instr.asMOVN(dst, v, uint64(shift), dst64) + instr.asMOVN(dst, v, uint32(shift), dst64) m.insert(instr) } diff --git a/internal/engine/wazevo/backend/isa/arm64/lower_instr.go b/internal/engine/wazevo/backend/isa/arm64/lower_instr.go index 2bb234e8c1..048bf32040 100644 --- a/internal/engine/wazevo/backend/isa/arm64/lower_instr.go +++ b/internal/engine/wazevo/backend/isa/arm64/lower_instr.go @@ -52,11 +52,11 @@ func (m *machine) lowerBrTable(i *ssa.Instruction) { maxIndexReg := m.compiler.AllocateVReg(ssa.TypeI32) m.lowerConstantI32(maxIndexReg, int32(len(targets)-1)) subs := m.allocateInstr() - subs.asALU(aluOpSubS, operandNR(xzrVReg), indexOperand, operandNR(maxIndexReg), false) + subs.asALU(aluOpSubS, xzrVReg, indexOperand, operandNR(maxIndexReg), false) m.insert(subs) csel := m.allocateInstr() adjustedIndex := m.compiler.AllocateVReg(ssa.TypeI32) - csel.asCSel(operandNR(adjustedIndex), operandNR(maxIndexReg), indexOperand, hs, false) + csel.asCSel(adjustedIndex, operandNR(maxIndexReg), indexOperand, hs, false) m.insert(csel) brSequence := m.allocateInstr() @@ -249,7 +249,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { rc := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone) rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := 
operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) m.lowerSelectVec(rc, rn, rm, rd) } else { m.lowerSelect(c, x, y, instr.Return()) @@ -270,7 +270,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { x, ctx := instr.Arg2() result := instr.Return() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(result)) + rd := m.compiler.VRegOf(result) ctxVReg := m.compiler.VRegOf(ctx) m.lowerFpuToInt(rd, rn, ctxVReg, true, x.Type() == ssa.TypeF64, result.Type().Bits() == 64, op == ssa.OpcodeFcvtToSintSat) @@ -278,7 +278,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { x, ctx := instr.Arg2() result := instr.Return() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(result)) + rd := m.compiler.VRegOf(result) ctxVReg := m.compiler.VRegOf(ctx) m.lowerFpuToInt(rd, rn, ctxVReg, false, x.Type() == ssa.TypeF64, result.Type().Bits() == 64, op == ssa.OpcodeFcvtToUintSat) @@ -286,25 +286,25 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { x := instr.Arg() result := instr.Return() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(result)) + rd := m.compiler.VRegOf(result) m.lowerIntToFpu(rd, rn, true, x.Type() == ssa.TypeI64, result.Type().Bits() == 64) case ssa.OpcodeFcvtFromUint: x := instr.Arg() result := instr.Return() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(result)) + rd := m.compiler.VRegOf(result) m.lowerIntToFpu(rd, rn, false, x.Type() == ssa.TypeI64, result.Type().Bits() == 64) case ssa.OpcodeFdemote: v := instr.Arg() rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) cnt := m.allocateInstr() cnt.asFpuRR(fpuUniOpCvt64To32, rd, rn, false) m.insert(cnt) case ssa.OpcodeFpromote: v := instr.Arg() rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) cnt := m.allocateInstr() cnt.asFpuRR(fpuUniOpCvt32To64, rd, rn, true) m.insert(cnt) @@ -343,15 +343,15 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { ctxVReg := m.compiler.VRegOf(ctx) rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) m.lowerIDiv(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSdiv) case ssa.OpcodeSrem, ssa.OpcodeUrem: x, y, ctx := instr.Arg3() ctxVReg := m.compiler.VRegOf(ctx) rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) - m.lowerIRem(ctxVReg, rd, rn, rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem) + rd := m.compiler.VRegOf(instr.Return()) + m.lowerIRem(ctxVReg, rd, rn.nr(), rm, x.Type() == ssa.TypeI64, op == ssa.OpcodeSrem) case ssa.OpcodeVconst: result := m.compiler.VRegOf(instr.Return()) lo, hi := instr.VconstData() @@ -362,7 +362,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { x := instr.Arg() ins := m.allocateInstr() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) ins.asVecMisc(vecOpNot, rd, 
rn, vecArrangement16B) m.insert(ins) case ssa.OpcodeVbxor: @@ -382,12 +382,12 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) creg := m.getOperand_NR(m.compiler.ValueDefinition(c), extModeNone) - tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + tmp := m.compiler.AllocateVReg(ssa.TypeV128) // creg is overwritten by BSL, so we need to move it to the result register before the instruction // in case when it is used somewhere else. mov := m.allocateInstr() - mov.asFpuMov128(tmp.nr(), creg.nr()) + mov.asFpuMov128(tmp, creg.nr()) m.insert(mov) ins := m.allocateInstr() @@ -396,7 +396,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { mov2 := m.allocateInstr() rd := m.compiler.VRegOf(instr.Return()) - mov2.asFpuMov128(rd, tmp.nr()) + mov2.asFpuMov128(rd, tmp) m.insert(mov2) case ssa.OpcodeVanyTrue, ssa.OpcodeVallTrue: x, lane := instr.ArgWithLane() @@ -405,12 +405,12 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { arr = ssaLaneToArrangement(lane) } rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) m.lowerVcheckTrue(op, rm, rd, arr) case ssa.OpcodeVhighBits: x, lane := instr.ArgWithLane() rm := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) arr := ssaLaneToArrangement(lane) m.lowerVhighBits(rm, rd, arr) case ssa.OpcodeVIadd: @@ -441,9 +441,9 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { panic("unsupported lane " + lane.String()) } - widenLo := m.allocateInstr().asVecShiftImm(widen, tmpLo, vv, operandShiftImm(0), loArr) - widenHi := m.allocateInstr().asVecShiftImm(widen, tmpHi, vv, operandShiftImm(0), hiArr) - addp := m.allocateInstr().asVecRRR(vecOpAddp, operandNR(m.compiler.VRegOf(instr.Return())), tmpLo, tmpHi, dstArr) + widenLo := m.allocateInstr().asVecShiftImm(widen, tmpLo.nr(), vv, operandShiftImm(0), loArr) + widenHi := m.allocateInstr().asVecShiftImm(widen, tmpHi.nr(), vv, operandShiftImm(0), hiArr) + addp := m.allocateInstr().asVecRRR(vecOpAddp, m.compiler.VRegOf(instr.Return()), tmpLo, tmpHi, dstArr) m.insert(widenLo) m.insert(widenHi) m.insert(addp) @@ -493,7 +493,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { arr := ssaLaneToArrangement(lane) rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) m.lowerVIMul(rd, rn, rm, arr) case ssa.OpcodeVIabs: m.lowerVecMisc(vecOpAbs, instr) @@ -507,7 +507,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { arr := ssaLaneToArrangement(lane) rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) m.lowerVShift(op, rd, rn, rm, arr) case ssa.OpcodeVSqrt: m.lowerVecMisc(vecOpFsqrt, instr) @@ -547,18 +547,18 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { x, lane := instr.ArgWithLane() arr := ssaLaneToArrangement(lane) rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) 
m.lowerVfpuToInt(rd, rn, arr, op == ssa.OpcodeVFcvtToSintSat) case ssa.OpcodeVFcvtFromSint, ssa.OpcodeVFcvtFromUint: x, lane := instr.ArgWithLane() arr := ssaLaneToArrangement(lane) rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) m.lowerVfpuFromInt(rd, rn, arr, op == ssa.OpcodeVFcvtFromSint) case ssa.OpcodeSwidenLow, ssa.OpcodeUwidenLow: x, lane := instr.ArgWithLane() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) var arr vecArrangement switch lane { @@ -580,7 +580,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { case ssa.OpcodeSwidenHigh, ssa.OpcodeUwidenHigh: x, lane := instr.ArgWithLane() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) arr := ssaLaneToArrangement(lane) @@ -607,9 +607,9 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { } rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) - tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + tmp := m.compiler.AllocateVReg(ssa.TypeV128) loQxtn := m.allocateInstr() hiQxtn := m.allocateInstr() @@ -628,7 +628,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { m.insert(hiQxtn) mov := m.allocateInstr() - mov.asFpuMov128(rd.nr(), tmp.nr()) + mov.asFpuMov128(rd, tmp) m.insert(mov) case ssa.OpcodeFvpromoteLow: x, lane := instr.ArgWithLane() @@ -637,7 +637,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { } ins := m.allocateInstr() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) ins.asVecMisc(vecOpFcvtl, rd, rn, vecArrangement2S) m.insert(ins) case ssa.OpcodeFvdemote: @@ -647,14 +647,14 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { } ins := m.allocateInstr() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) ins.asVecMisc(vecOpFcvtn, rd, rn, vecArrangement2S) m.insert(ins) case ssa.OpcodeExtractlane: x, index, signed, lane := instr.ExtractlaneData() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) mov := m.allocateInstr() switch lane { @@ -680,12 +680,12 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { x, y, index, lane := instr.InsertlaneData() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) - tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + rd := m.compiler.VRegOf(instr.Return()) + tmpReg := m.compiler.AllocateVReg(ssa.TypeV128) // Initially mov rn to tmp. mov1 := m.allocateInstr() - mov1.asFpuMov128(tmpReg.nr(), rn.nr()) + mov1.asFpuMov128(tmpReg, rn.nr()) m.insert(mov1) // movToVec and vecMovElement do not clear the remaining bits to zero, @@ -709,14 +709,14 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { // Finally mov tmp to rd. 
mov3 := m.allocateInstr() - mov3.asFpuMov128(rd.nr(), tmpReg.nr()) + mov3.asFpuMov128(rd, tmpReg) m.insert(mov3) case ssa.OpcodeSwizzle: x, y, lane := instr.Arg2WithLane() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) arr := ssaLaneToArrangement(lane) @@ -729,14 +729,14 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { x, y, lane1, lane2 := instr.ShuffleData() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) m.lowerShuffle(rd, rn, rm, lane1, lane2) case ssa.OpcodeSplat: x, lane := instr.ArgWithLane() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) dup := m.allocateInstr() switch lane { @@ -760,12 +760,12 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { xx, yy := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone), m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) tmp, tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)), operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) - m.insert(m.allocateInstr().asVecRRR(vecOpSmull, tmp, xx, yy, vecArrangement8H)) - m.insert(m.allocateInstr().asVecRRR(vecOpSmull2, tmp2, xx, yy, vecArrangement8H)) - m.insert(m.allocateInstr().asVecRRR(vecOpAddp, tmp, tmp, tmp2, vecArrangement4S)) + m.insert(m.allocateInstr().asVecRRR(vecOpSmull, tmp.nr(), xx, yy, vecArrangement8H)) + m.insert(m.allocateInstr().asVecRRR(vecOpSmull2, tmp2.nr(), xx, yy, vecArrangement8H)) + m.insert(m.allocateInstr().asVecRRR(vecOpAddp, tmp.nr(), tmp, tmp2, vecArrangement4S)) - rd := operandNR(m.compiler.VRegOf(instr.Return())) - m.insert(m.allocateInstr().asFpuMov128(rd.nr(), tmp.nr())) + rd := m.compiler.VRegOf(instr.Return()) + m.insert(m.allocateInstr().asFpuMov128(rd, tmp.nr())) case ssa.OpcodeLoadSplat: ptr, offset, lane := instr.LoadSplatData() @@ -794,7 +794,7 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) { m.executableContext.FlushPendingInstructions() } -func (m *machine) lowerShuffle(rd, rn, rm operand, lane1, lane2 uint64) { +func (m *machine) lowerShuffle(rd regalloc.VReg, rn, rm operand, lane1, lane2 uint64) { // `tbl2` requires 2 consecutive registers, so we arbitrarily pick v29, v30. vReg, wReg := v29VReg, v30VReg @@ -822,7 +822,7 @@ func (m *machine) lowerShuffle(rd, rn, rm operand, lane1, lane2 uint64) { m.insert(tbl2) } -func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangement) { +func (m *machine) lowerVShift(op ssa.Opcode, rd regalloc.VReg, rn, rm operand, arr vecArrangement) { var modulo byte switch arr { case vecArrangement16B: @@ -847,13 +847,13 @@ func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangem if op != ssa.OpcodeVIshl { // Negate the amount to make this as right shift. neg := m.allocateInstr() - neg.asALU(aluOpSub, rtmp, operandNR(xzrVReg), rtmp, true) + neg.asALU(aluOpSub, rtmp.nr(), operandNR(xzrVReg), rtmp, true) m.insert(neg) } // Copy the shift amount into a vector register as sshl/ushl requires it to be there. 
dup := m.allocateInstr() - dup.asVecDup(vtmp, rtmp, arr) + dup.asVecDup(vtmp.nr(), rtmp, arr) m.insert(dup) if op == ssa.OpcodeVIshl || op == ssa.OpcodeVSshr { @@ -867,7 +867,7 @@ func (m *machine) lowerVShift(op ssa.Opcode, rd, rn, rm operand, arr vecArrangem } } -func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangement) { +func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm operand, rd regalloc.VReg, arr vecArrangement) { tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) // Special case VallTrue for i64x2. @@ -878,11 +878,11 @@ func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangem // cset dst, eq ins := m.allocateInstr() - ins.asVecMisc(vecOpCmeq0, tmp, rm, vecArrangement2D) + ins.asVecMisc(vecOpCmeq0, tmp.nr(), rm, vecArrangement2D) m.insert(ins) addp := m.allocateInstr() - addp.asVecRRR(vecOpAddp, tmp, tmp, tmp, vecArrangement2D) + addp.asVecRRR(vecOpAddp, tmp.nr(), tmp, tmp, vecArrangement2D) m.insert(addp) fcmp := m.allocateInstr() @@ -890,7 +890,7 @@ func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangem m.insert(fcmp) cset := m.allocateInstr() - cset.asCSet(rd.nr(), false, eq) + cset.asCSet(rd, false, eq) m.insert(cset) return @@ -900,10 +900,10 @@ func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangem ins := m.allocateInstr() if op == ssa.OpcodeVanyTrue { // umaxp v4?.16b, v2?.16b, v2?.16b - ins.asVecRRR(vecOpUmaxp, tmp, rm, rm, vecArrangement16B) + ins.asVecRRR(vecOpUmaxp, tmp.nr(), rm, rm, vecArrangement16B) } else { // uminv d4?, v2?.4s - ins.asVecLanes(vecOpUminv, tmp, rm, arr) + ins.asVecLanes(vecOpUminv, tmp.nr(), rm, arr) } m.insert(ins) @@ -917,15 +917,15 @@ func (m *machine) lowerVcheckTrue(op ssa.Opcode, rm, rd operand, arr vecArrangem m.insert(movv) fc := m.allocateInstr() - fc.asCCmpImm(rd, uint64(0), al, 0, true) + fc.asCCmpImm(operandNR(rd), uint64(0), al, 0, true) m.insert(fc) cset := m.allocateInstr() - cset.asCSet(rd.nr(), false, ne) + cset.asCSet(rd, false, ne) m.insert(cset) } -func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { +func (m *machine) lowerVhighBits(rm operand, rd regalloc.VReg, arr vecArrangement) { r0 := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) v0 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) v1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) @@ -947,7 +947,7 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { // Right arithmetic shift on the original vector and store the result into v1. So we have: // v1[i] = 0xff if vi<0, 0 otherwise. sshr := m.allocateInstr() - sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(7), vecArrangement16B) + sshr.asVecShiftImm(vecOpSshr, v1.nr(), rm, operandShiftImm(7), vecArrangement16B) m.insert(sshr) // Load the bit mask into r0. @@ -958,7 +958,7 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { // dup r0 to v0. dup := m.allocateInstr() - dup.asVecDup(v0, r0, vecArrangement2D) + dup.asVecDup(v0.nr(), r0, vecArrangement2D) m.insert(dup) // Lane-wise logical AND with the bit mask, meaning that we have @@ -967,23 +967,23 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { // Below, we use the following notation: // wi := (1 << i) if vi<0, 0 otherwise. 
and := m.allocateInstr() - and.asVecRRR(vecOpAnd, v1, v1, v0, vecArrangement16B) + and.asVecRRR(vecOpAnd, v1.nr(), v1, v0, vecArrangement16B) m.insert(and) // Swap the lower and higher 8 byte elements, and write it into v0, meaning that we have // v0[i] = w(i+8) if i < 8, w(i-8) otherwise. ext := m.allocateInstr() - ext.asVecExtract(v0, v1, v1, vecArrangement16B, uint32(8)) + ext.asVecExtract(v0.nr(), v1, v1, vecArrangement16B, uint32(8)) m.insert(ext) // v = [w0, w8, ..., w7, w15] zip1 := m.allocateInstr() - zip1.asVecPermute(vecOpZip1, v0, v1, v0, vecArrangement16B) + zip1.asVecPermute(vecOpZip1, v0.nr(), v1, v0, vecArrangement16B) m.insert(zip1) // v.h[0] = w0 + ... + w15 addv := m.allocateInstr() - addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H) + addv.asVecLanes(vecOpAddv, v0.nr(), v0, vecArrangement8H) m.insert(addv) // Extract the v.h[0] as the result. @@ -1006,7 +1006,7 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { // Right arithmetic shift on the original vector and store the result into v1. So we have: // v[i] = 0xffff if vi<0, 0 otherwise. sshr := m.allocateInstr() - sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(15), vecArrangement8H) + sshr.asVecShiftImm(vecOpSshr, v1.nr(), rm, operandShiftImm(15), vecArrangement8H) m.insert(sshr) // Load the bit mask into r0. @@ -1014,26 +1014,26 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { // dup r0 to vector v0. dup := m.allocateInstr() - dup.asVecDup(v0, r0, vecArrangement2D) + dup.asVecDup(v0.nr(), r0, vecArrangement2D) m.insert(dup) lsl := m.allocateInstr() - lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(4), true) + lsl.asALUShift(aluOpLsl, r0.nr(), r0, operandShiftImm(4), true) m.insert(lsl) movv := m.allocateInstr() - movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1)) + movv.asMovToVec(v0.nr(), r0, vecArrangementD, vecIndex(1)) m.insert(movv) // Lane-wise logical AND with the bitmask, meaning that we have // v[i] = (1 << i) if vi<0, 0 otherwise for i=0..3 // = (1 << (i+4)) if vi<0, 0 otherwise for i=3..7 and := m.allocateInstr() - and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B) + and.asVecRRR(vecOpAnd, v0.nr(), v1, v0, vecArrangement16B) m.insert(and) addv := m.allocateInstr() - addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement8H) + addv.asVecLanes(vecOpAddv, v0.nr(), v0, vecArrangement8H) m.insert(addv) movfv := m.allocateInstr() @@ -1055,7 +1055,7 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { // Right arithmetic shift on the original vector and store the result into v1. So we have: // v[i] = 0xffffffff if vi<0, 0 otherwise. sshr := m.allocateInstr() - sshr.asVecShiftImm(vecOpSshr, v1, rm, operandShiftImm(31), vecArrangement4S) + sshr.asVecShiftImm(vecOpSshr, v1.nr(), rm, operandShiftImm(31), vecArrangement4S) m.insert(sshr) // Load the bit mask into r0. @@ -1063,26 +1063,26 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { // dup r0 to vector v0. 
dup := m.allocateInstr() - dup.asVecDup(v0, r0, vecArrangement2D) + dup.asVecDup(v0.nr(), r0, vecArrangement2D) m.insert(dup) lsl := m.allocateInstr() - lsl.asALUShift(aluOpLsl, r0, r0, operandShiftImm(2), true) + lsl.asALUShift(aluOpLsl, r0.nr(), r0, operandShiftImm(2), true) m.insert(lsl) movv := m.allocateInstr() - movv.asMovToVec(v0, r0, vecArrangementD, vecIndex(1)) + movv.asMovToVec(v0.nr(), r0, vecArrangementD, vecIndex(1)) m.insert(movv) // Lane-wise logical AND with the bitmask, meaning that we have // v[i] = (1 << i) if vi<0, 0 otherwise for i in [0, 1] // = (1 << (i+4)) if vi<0, 0 otherwise for i in [2, 3] and := m.allocateInstr() - and.asVecRRR(vecOpAnd, v0, v1, v0, vecArrangement16B) + and.asVecRRR(vecOpAnd, v0.nr(), v1, v0, vecArrangement16B) m.insert(and) addv := m.allocateInstr() - addv.asVecLanes(vecOpAddv, v0, v0, vecArrangement4S) + addv.asVecLanes(vecOpAddv, v0.nr(), v0, vecArrangement4S) m.insert(addv) movfv := m.allocateInstr() @@ -1102,21 +1102,21 @@ func (m *machine) lowerVhighBits(rm, rd operand, arr vecArrangement) { // Move the higher 64-bit int into r0. movv1 := m.allocateInstr() - movv1.asMovFromVec(r0, rm, vecArrangementD, vecIndex(1), false) + movv1.asMovFromVec(r0.nr(), rm, vecArrangementD, vecIndex(1), false) m.insert(movv1) // Move the sign bit into the least significant bit. lsr1 := m.allocateInstr() - lsr1.asALUShift(aluOpLsr, r0, r0, operandShiftImm(63), true) + lsr1.asALUShift(aluOpLsr, r0.nr(), r0, operandShiftImm(63), true) m.insert(lsr1) lsr2 := m.allocateInstr() - lsr2.asALUShift(aluOpLsr, rd, rd, operandShiftImm(63), true) + lsr2.asALUShift(aluOpLsr, rd, operandNR(rd), operandShiftImm(63), true) m.insert(lsr2) // rd = (r0<<1) | rd lsl := m.allocateInstr() - lsl.asALU(aluOpAdd, rd, rd, operandSR(r0.nr(), 1, shiftOpLSL), false) + lsl.asALU(aluOpAdd, rd, operandNR(rd), operandSR(r0.nr(), 1, shiftOpLSL), false) m.insert(lsl) default: panic("Unsupported " + arr.String()) @@ -1128,7 +1128,7 @@ func (m *machine) lowerVecMisc(op vecOp, instr *ssa.Instruction) { arr := ssaLaneToArrangement(lane) ins := m.allocateInstr() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) ins.asVecMisc(op, rd, rn, arr) m.insert(ins) } @@ -1137,22 +1137,22 @@ func (m *machine) lowerVecRRR(op vecOp, x, y, ret ssa.Value, arr vecArrangement) ins := m.allocateInstr() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(ret)) + rd := m.compiler.VRegOf(ret) ins.asVecRRR(op, rd, rn, rm, arr) m.insert(ins) } -func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) { +func (m *machine) lowerVIMul(rd regalloc.VReg, rn, rm operand, arr vecArrangement) { if arr != vecArrangement2D { mul := m.allocateInstr() mul.asVecRRR(vecOpMul, rd, rn, rm, arr) m.insert(mul) } else { - tmp1 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) - tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) - tmp3 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + tmp1 := m.compiler.AllocateVReg(ssa.TypeV128) + tmp2 := m.compiler.AllocateVReg(ssa.TypeV128) + tmp3 := m.compiler.AllocateVReg(ssa.TypeV128) - tmpRes := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + tmpRes := m.compiler.AllocateVReg(ssa.TypeV128) // Following the algorithm in https://chromium-review.googlesource.com/c/v8/v8/+/1781696 rev64 := m.allocateInstr() @@ -1160,7 +1160,7 @@ func (m *machine) 
lowerVIMul(rd, rn, rm operand, arr vecArrangement) { m.insert(rev64) mul := m.allocateInstr() - mul.asVecRRR(vecOpMul, tmp2, tmp2, rn, vecArrangement4S) + mul.asVecRRR(vecOpMul, tmp2, operandNR(tmp2), rn, vecArrangement4S) m.insert(mul) xtn1 := m.allocateInstr() @@ -1168,7 +1168,7 @@ func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) { m.insert(xtn1) addp := m.allocateInstr() - addp.asVecRRR(vecOpAddp, tmp2, tmp2, tmp2, vecArrangement4S) + addp.asVecRRR(vecOpAddp, tmp2, operandNR(tmp2), operandNR(tmp2), vecArrangement4S) m.insert(addp) xtn2 := m.allocateInstr() @@ -1179,15 +1179,15 @@ func (m *machine) lowerVIMul(rd, rn, rm operand, arr vecArrangement) { // In short, in UMLAL instruction, the result register is also one of the source register, and // the value on the result register is significant. shll := m.allocateInstr() - shll.asVecMisc(vecOpShll, tmpRes, tmp2, vecArrangement2S) + shll.asVecMisc(vecOpShll, tmpRes, operandNR(tmp2), vecArrangement2S) m.insert(shll) umlal := m.allocateInstr() - umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, tmp3, tmp1, vecArrangement2S) + umlal.asVecRRRRewrite(vecOpUmlal, tmpRes, operandNR(tmp3), operandNR(tmp1), vecArrangement2S) m.insert(umlal) mov := m.allocateInstr() - mov.asFpuMov128(rd.nr(), tmpRes.nr()) + mov.asFpuMov128(rd, tmpRes) m.insert(mov) } } @@ -1203,7 +1203,7 @@ func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) { // BSL modifies the destination register, so we need to use a temporary register so that // the actual definition of the destination register happens *after* the BSL instruction. // That way, we can force the spill instruction to be inserted after the BSL instruction. - tmp := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + tmp := m.compiler.AllocateVReg(ssa.TypeV128) fcmgt := m.allocateInstr() if max { @@ -1220,17 +1220,17 @@ func (m *machine) lowerVMinMaxPseudo(instr *ssa.Instruction, max bool) { res := operandNR(m.compiler.VRegOf(instr.Return())) mov2 := m.allocateInstr() - mov2.asFpuMov128(res.nr(), tmp.nr()) + mov2.asFpuMov128(res.nr(), tmp) m.insert(mov2) } -func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) { +func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn regalloc.VReg, rm operand, _64bit, signed bool) { div := m.allocateInstr() if signed { - div.asALU(aluOpSDiv, rd, rn, rm, _64bit) + div.asALU(aluOpSDiv, rd, operandNR(rn), rm, _64bit) } else { - div.asALU(aluOpUDiv, rd, rn, rm, _64bit) + div.asALU(aluOpUDiv, rd, operandNR(rn), rm, _64bit) } m.insert(div) @@ -1239,11 +1239,11 @@ func (m *machine) lowerIRem(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bi // rd = rn-rd*rm by MSUB instruction. msub := m.allocateInstr() - msub.asALURRRR(aluOpMSub, rd, rd, rm, rn, _64bit) + msub.asALURRRR(aluOpMSub, rd, operandNR(rd), rm, rn, _64bit) m.insert(msub) } -func (m *machine) lowerIDiv(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bit, signed bool) { +func (m *machine) lowerIDiv(execCtxVReg, rd regalloc.VReg, rn, rm operand, _64bit, signed bool) { div := m.allocateInstr() if signed { @@ -1260,7 +1260,7 @@ func (m *machine) lowerIDiv(execCtxVReg regalloc.VReg, rd, rn, rm operand, _64bi // We need to check the signed overflow which happens iff "math.MinInt{32,64} / -1" minusOneCheck := m.allocateInstr() // Sets eq condition if rm == -1. 
- minusOneCheck.asALU(aluOpAddS, operandNR(xzrVReg), rm, operandImm12(1, 0), _64bit) + minusOneCheck.asALU(aluOpAddS, xzrVReg, rm, operandImm12(1, 0), _64bit) m.insert(minusOneCheck) ccmp := m.allocateInstr() @@ -1290,20 +1290,20 @@ func (m *machine) exitIfNot(execCtxVReg regalloc.VReg, c cond, cond64bit bool, c func (m *machine) lowerFcopysign(x, y, ret ssa.Value) { rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - var tmpI, tmpF operand + var tmpI, tmpF regalloc.VReg _64 := x.Type() == ssa.TypeF64 if _64 { - tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) - tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + tmpF = m.compiler.AllocateVReg(ssa.TypeF64) + tmpI = m.compiler.AllocateVReg(ssa.TypeI64) } else { - tmpF = operandNR(m.compiler.AllocateVReg(ssa.TypeF32)) - tmpI = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) + tmpF = m.compiler.AllocateVReg(ssa.TypeF32) + tmpI = m.compiler.AllocateVReg(ssa.TypeI32) } rd := m.compiler.VRegOf(ret) - m.lowerFcopysignImpl(operandNR(rd), rn, rm, tmpI, tmpF, _64) + m.lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF, _64) } -func (m *machine) lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF operand, _64bit bool) { +func (m *machine) lowerFcopysignImpl(rd regalloc.VReg, rn, rm operand, tmpI, tmpF regalloc.VReg, _64bit bool) { // This is exactly the same code emitted by GCC for "__builtin_copysign": // // mov x0, -9223372036854775808 @@ -1313,26 +1313,26 @@ func (m *machine) lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF operand, _64bit bool setMSB := m.allocateInstr() if _64bit { - m.lowerConstantI64(tmpI.nr(), math.MinInt64) - setMSB.asMovToVec(tmpF, tmpI, vecArrangementD, vecIndex(0)) + m.lowerConstantI64(tmpI, math.MinInt64) + setMSB.asMovToVec(tmpF, operandNR(tmpI), vecArrangementD, vecIndex(0)) } else { - m.lowerConstantI32(tmpI.nr(), math.MinInt32) - setMSB.asMovToVec(tmpF, tmpI, vecArrangementS, vecIndex(0)) + m.lowerConstantI32(tmpI, math.MinInt32) + setMSB.asMovToVec(tmpF, operandNR(tmpI), vecArrangementS, vecIndex(0)) } m.insert(setMSB) - tmpReg := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) + tmpReg := m.compiler.AllocateVReg(ssa.TypeF64) mov := m.allocateInstr() - mov.asFpuMov64(tmpReg.nr(), rn.nr()) + mov.asFpuMov64(tmpReg, rn.nr()) m.insert(mov) vbit := m.allocateInstr() - vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, tmpF, vecArrangement8B) + vbit.asVecRRRRewrite(vecOpBit, tmpReg, rm, operandNR(tmpF), vecArrangement8B) m.insert(vbit) movDst := m.allocateInstr() - movDst.asFpuMov64(rd.nr(), tmpReg.nr()) + movDst.asFpuMov64(rd, tmpReg) m.insert(movDst) } @@ -1340,7 +1340,7 @@ func (m *machine) lowerBitcast(instr *ssa.Instruction) { v, dstType := instr.BitcastData() srcType := v.Type() rn := m.getOperand_NR(m.compiler.ValueDefinition(v), extModeNone) - rd := operandNR(m.compiler.VRegOf(instr.Return())) + rd := m.compiler.VRegOf(instr.Return()) srcInt := srcType.IsInt() dstInt := dstType.IsInt() switch { @@ -1371,14 +1371,14 @@ func (m *machine) lowerBitcast(instr *ssa.Instruction) { func (m *machine) lowerFpuUniOp(op fpuUniOp, in, out ssa.Value) { rn := m.getOperand_NR(m.compiler.ValueDefinition(in), extModeNone) - rd := operandNR(m.compiler.VRegOf(out)) + rd := m.compiler.VRegOf(out) neg := m.allocateInstr() neg.asFpuRR(op, rd, rn, in.Type().Bits() == 64) m.insert(neg) } -func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) { +func (m *machine) lowerFpuToInt(rd regalloc.VReg, rn operand, ctx 
regalloc.VReg, signed, src64bit, dst64bit, nonTrapping bool) { if !nonTrapping { // First of all, we have to clear the FPU flags. flagClear := m.allocateInstr() @@ -1405,7 +1405,7 @@ func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64 // Check if the conversion was undefined by comparing the status with 1. // See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register alu := m.allocateInstr() - alu.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpReg), operandImm12(1, 0), true) + alu.asALU(aluOpSubS, xzrVReg, operandNR(tmpReg), operandImm12(1, 0), true) m.insert(alu) // If it is not undefined, we can return the result. @@ -1429,7 +1429,7 @@ func (m *machine) lowerFpuToInt(rd, rn operand, ctx regalloc.VReg, signed, src64 } } -func (m *machine) lowerIntToFpu(rd, rn operand, signed, src64bit, dst64bit bool) { +func (m *machine) lowerIntToFpu(rd regalloc.VReg, rn operand, signed, src64bit, dst64bit bool) { cvt := m.allocateInstr() cvt.asIntToFpu(rd, rn, signed, src64bit, dst64bit) m.insert(cvt) @@ -1456,7 +1456,7 @@ func (m *machine) lowerFpuBinOp(si *ssa.Instruction) { xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) rn := m.getOperand_NR(xDef, extModeNone) rm := m.getOperand_NR(yDef, extModeNone) - rd := operandNR(m.compiler.VRegOf(si.Return())) + rd := m.compiler.VRegOf(si.Return()) instr.asFpuRRR(op, rd, rn, rm, x.Type().Bits() == 64) m.insert(instr) } @@ -1482,7 +1482,7 @@ func (m *machine) lowerSubOrAdd(si *ssa.Instruction, add bool) { case !add && yNegated: // rn+rm = x-(-y) = x-y aop = aluOpAdd } - rd := operandNR(m.compiler.VRegOf(si.Return())) + rd := m.compiler.VRegOf(si.Return()) alu := m.allocateInstr() alu.asALU(aop, rd, rn, rm, x.Type().Bits() == 64) m.insert(alu) @@ -1527,7 +1527,7 @@ func (m *machine) lowerIcmp(si *ssa.Instruction) { rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext) rm := m.getOperand_Imm12_ER_SR_NR(m.compiler.ValueDefinition(y), ext) alu := m.allocateInstr() - alu.asALU(aluOpSubS, operandNR(xzrVReg), rn, rm, in64bit) + alu.asALU(aluOpSubS, xzrVReg, rn, rm, in64bit) m.insert(alu) cset := m.allocateInstr() @@ -1542,7 +1542,7 @@ func (m *machine) lowerVIcmp(si *ssa.Instruction) { rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(si.Return())) + rd := m.compiler.VRegOf(si.Return()) switch flag { case eq: @@ -1554,7 +1554,7 @@ func (m *machine) lowerVIcmp(si *ssa.Instruction) { cmp.asVecRRR(vecOpCmeq, rd, rn, rm, arr) m.insert(cmp) not := m.allocateInstr() - not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B) + not.asVecMisc(vecOpNot, rd, operandNR(rd), vecArrangement16B) m.insert(not) case ge: cmp := m.allocateInstr() @@ -1598,7 +1598,7 @@ func (m *machine) lowerVFcmp(si *ssa.Instruction) { rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(si.Return())) + rd := m.compiler.VRegOf(si.Return()) switch flag { case eq: @@ -1610,7 +1610,7 @@ func (m *machine) lowerVFcmp(si *ssa.Instruction) { cmp.asVecRRR(vecOpFcmeq, rd, rn, rm, arr) m.insert(cmp) not := m.allocateInstr() - not.asVecMisc(vecOpNot, rd, rd, vecArrangement16B) + not.asVecMisc(vecOpNot, rd, operandNR(rd), vecArrangement16B) m.insert(not) case ge: cmp := m.allocateInstr() @@ -1631,7 +1631,7 @@ func (m *machine) lowerVFcmp(si *ssa.Instruction) { } } -func (m 
*machine) lowerVfpuToInt(rd, rn operand, arr vecArrangement, signed bool) { +func (m *machine) lowerVfpuToInt(rd regalloc.VReg, rn operand, arr vecArrangement, signed bool) { cvt := m.allocateInstr() if signed { cvt.asVecMisc(vecOpFcvtzs, rd, rn, arr) @@ -1643,15 +1643,15 @@ func (m *machine) lowerVfpuToInt(rd, rn operand, arr vecArrangement, signed bool if arr == vecArrangement2D { narrow := m.allocateInstr() if signed { - narrow.asVecMisc(vecOpSqxtn, rd, rd, vecArrangement2S) + narrow.asVecMisc(vecOpSqxtn, rd, operandNR(rd), vecArrangement2S) } else { - narrow.asVecMisc(vecOpUqxtn, rd, rd, vecArrangement2S) + narrow.asVecMisc(vecOpUqxtn, rd, operandNR(rd), vecArrangement2S) } m.insert(narrow) } } -func (m *machine) lowerVfpuFromInt(rd, rn operand, arr vecArrangement, signed bool) { +func (m *machine) lowerVfpuFromInt(rd regalloc.VReg, rn operand, arr vecArrangement, signed bool) { cvt := m.allocateInstr() if signed { cvt.asVecMisc(vecOpScvtf, rd, rn, arr) @@ -1665,7 +1665,7 @@ func (m *machine) lowerShifts(si *ssa.Instruction, ext extMode, aluOp aluOp) { x, amount := si.Arg2() rn := m.getOperand_NR(m.compiler.ValueDefinition(x), ext) rm := m.getOperand_ShiftImm_NR(m.compiler.ValueDefinition(amount), ext, x.Type().Bits()) - rd := operandNR(m.compiler.VRegOf(si.Return())) + rd := m.compiler.VRegOf(si.Return()) alu := m.allocateInstr() alu.asALUShift(aluOp, rd, rn, rm, x.Type().Bits() == 64) @@ -1678,11 +1678,11 @@ func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp, ignoreResult xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) rn := m.getOperand_NR(xDef, extModeNone) - var rd operand + var rd regalloc.VReg if ignoreResult { - rd = operandNR(xzrVReg) + rd = xzrVReg } else { - rd = operandNR(m.compiler.VRegOf(si.Return())) + rd = m.compiler.VRegOf(si.Return()) } _64 := x.Type().Bits() == 64 @@ -1691,7 +1691,7 @@ func (m *machine) lowerBitwiseAluOp(si *ssa.Instruction, op aluOp, ignoreResult c := instr.ConstantVal() if isBitMaskImmediate(c, _64) { // Constant bit wise operations can be lowered to a single instruction. - alu.asALUBitmaskImm(op, rd.nr(), rn.nr(), c, _64) + alu.asALUBitmaskImm(op, rd, rn.nr(), c, _64) m.insert(alu) return } @@ -1709,25 +1709,25 @@ func (m *machine) lowerRotl(si *ssa.Instruction) { rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - var tmp operand + var tmp regalloc.VReg if _64 { - tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + tmp = m.compiler.AllocateVReg(ssa.TypeI64) } else { - tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) + tmp = m.compiler.AllocateVReg(ssa.TypeI32) } - rd := operandNR(m.compiler.VRegOf(r)) + rd := m.compiler.VRegOf(r) // Encode rotl as neg + rotr: neg is a sub against the zero-reg. m.lowerRotlImpl(rd, rn, rm, tmp, _64) } -func (m *machine) lowerRotlImpl(rd, rn, rm, tmp operand, is64bit bool) { +func (m *machine) lowerRotlImpl(rd regalloc.VReg, rn, rm operand, tmp regalloc.VReg, is64bit bool) { // Encode rotl as neg + rotr: neg is a sub against the zero-reg. 
neg := m.allocateInstr() neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rm, is64bit) m.insert(neg) alu := m.allocateInstr() - alu.asALU(aluOpRotR, rd, rn, tmp, is64bit) + alu.asALU(aluOpRotR, rd, rn, operandNR(tmp), is64bit) m.insert(alu) } @@ -1737,7 +1737,7 @@ func (m *machine) lowerRotr(si *ssa.Instruction) { xDef, yDef := m.compiler.ValueDefinition(x), m.compiler.ValueDefinition(y) rn := m.getOperand_NR(xDef, extModeNone) rm := m.getOperand_NR(yDef, extModeNone) - rd := operandNR(m.compiler.VRegOf(si.Return())) + rd := m.compiler.VRegOf(si.Return()) alu := m.allocateInstr() alu.asALU(aluOpRotR, rd, rn, rm, si.Return().Type().Bits() == 64) @@ -1797,7 +1797,7 @@ func (m *machine) lowerImul(x, y, result ssa.Value) { // TODO: if this comes before Add/Sub, we could merge it by putting it into the place of xzrVReg. mul := m.allocateInstr() - mul.asALURRRR(aluOpMAdd, operandNR(rd), rn, rm, operandNR(xzrVReg), x.Type().Bits() == 64) + mul.asALURRRR(aluOpMAdd, rd, rn, rm, xzrVReg, x.Type().Bits() == 64) m.insert(mul) } @@ -1849,22 +1849,22 @@ func (m *machine) lowerPopcnt(x, result ssa.Value) { // mov x5, v0.d[0] ;; finally we mov the result back to a GPR // - rd := operandNR(m.compiler.VRegOf(result)) + rd := m.compiler.VRegOf(result) rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rf1 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) ins := m.allocateInstr() - ins.asMovToVec(rf1, rn, vecArrangementD, vecIndex(0)) + ins.asMovToVec(rf1.nr(), rn, vecArrangementD, vecIndex(0)) m.insert(ins) rf2 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) cnt := m.allocateInstr() - cnt.asVecMisc(vecOpCnt, rf2, rf1, vecArrangement16B) + cnt.asVecMisc(vecOpCnt, rf2.nr(), rf1, vecArrangement16B) m.insert(cnt) rf3 := operandNR(m.compiler.AllocateVReg(ssa.TypeF64)) uaddlv := m.allocateInstr() - uaddlv.asVecLanes(vecOpUaddlv, rf3, rf2, vecArrangement8B) + uaddlv.asVecLanes(vecOpUaddlv, rf3.nr(), rf2, vecArrangement8B) m.insert(uaddlv) mov := m.allocateInstr() @@ -1879,32 +1879,35 @@ func (m *machine) lowerExitWithCode(execCtxVReg regalloc.VReg, code wazevoapi.Ex loadExitCodeConst.asMOVZ(tmpReg1, uint64(code), 0, true) setExitCode := m.allocateInstr() - setExitCode.asStore(operandNR(tmpReg1), - addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(), - }, 32) + mode := m.amodePool.Allocate() + *mode = addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetExitCodeOffset.I64(), + } + setExitCode.asStore(operandNR(tmpReg1), mode, 32) // In order to unwind the stack, we also need to push the current stack pointer: tmp2 := m.compiler.AllocateVReg(ssa.TypeI64) movSpToTmp := m.allocateInstr() movSpToTmp.asMove64(tmp2, spVReg) strSpToExecCtx := m.allocateInstr() - strSpToExecCtx.asStore(operandNR(tmp2), - addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(), - }, 64) + mode2 := m.amodePool.Allocate() + *mode2 = addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.I64(), + } + strSpToExecCtx.asStore(operandNR(tmp2), mode2, 64) // Also the address of this exit. 
tmp3 := m.compiler.AllocateVReg(ssa.TypeI64) currentAddrToTmp := m.allocateInstr() currentAddrToTmp.asAdr(tmp3, 0) storeCurrentAddrToExecCtx := m.allocateInstr() - storeCurrentAddrToExecCtx.asStore(operandNR(tmp3), - addressMode{ - kind: addressModeKindRegUnsignedImm12, - rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(), - }, 64) + mode3 := m.amodePool.Allocate() + *mode3 = addressMode{ + kind: addressModeKindRegUnsignedImm12, + rn: execCtxVReg, imm: wazevoapi.ExecutionContextOffsetGoCallReturnAddress.I64(), + } + storeCurrentAddrToExecCtx.asStore(operandNR(tmp3), mode3, 64) exitSeq := m.allocateInstr() exitSeq.asExitSequence(execCtxVReg) @@ -1937,7 +1940,7 @@ func (m *machine) lowerIcmpToFlag(x, y ssa.Value, signed bool) { alu.asALU( aluOpSubS, // We don't need the result, just need to set flags. - operandNR(xzrVReg), + xzrVReg, rn, rm, x.Type().Bits() == 64, @@ -2012,7 +2015,7 @@ func (m *machine) lowerSelect(c, x, y, result ssa.Value) { alu.asALU( aluOpSubS, // We don't need the result, just need to set flags. - operandNR(xzrVReg), + xzrVReg, rn, operandNR(xzrVReg), c.Type().Bits() == 64, @@ -2024,7 +2027,7 @@ func (m *machine) lowerSelect(c, x, y, result ssa.Value) { rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone) rm := m.getOperand_NR(m.compiler.ValueDefinition(y), extModeNone) - rd := operandNR(m.compiler.VRegOf(result)) + rd := m.compiler.VRegOf(result) switch x.Type() { case ssa.TypeI32, ssa.TypeI64: // csel rd, rn, rm, cc @@ -2041,10 +2044,10 @@ func (m *machine) lowerSelect(c, x, y, result ssa.Value) { } } -func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) { +func (m *machine) lowerSelectVec(rc, rn, rm operand, rd regalloc.VReg) { // First check if `rc` is zero or not. checkZero := m.allocateInstr() - checkZero.asALU(aluOpSubS, operandNR(xzrVReg), rc, operandNR(xzrVReg), false) + checkZero.asALU(aluOpSubS, xzrVReg, rc, operandNR(xzrVReg), false) m.insert(checkZero) // Then use CSETM to set all bits to one if `rc` is zero. @@ -2054,7 +2057,7 @@ func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) { m.insert(cset) // Then move the bits to the result vector register. - tmp2 := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + tmp2 := m.compiler.AllocateVReg(ssa.TypeV128) dup := m.allocateInstr() dup.asVecDup(tmp2, operandNR(allOnesOrZero), vecArrangement2D) m.insert(dup) @@ -2067,7 +2070,7 @@ func (m *machine) lowerSelectVec(rc, rn, rm, rd operand) { // Finally, move the result to the destination register. 
mov2 := m.allocateInstr() - mov2.asFpuMov128(rd.nr(), tmp2.nr()) + mov2.asFpuMov128(rd, tmp2) m.insert(mov2) } @@ -2099,28 +2102,28 @@ func (m *machine) lowerAtomicRmw(si *ssa.Instruction) { addr, val := si.Arg2() addrDef, valDef := m.compiler.ValueDefinition(addr), m.compiler.ValueDefinition(val) rn := m.getOperand_NR(addrDef, extModeNone) - rt := operandNR(m.compiler.VRegOf(si.Return())) + rt := m.compiler.VRegOf(si.Return()) rs := m.getOperand_NR(valDef, extModeNone) _64 := si.Return().Type().Bits() == 64 - var tmp operand + var tmp regalloc.VReg if _64 { - tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) + tmp = m.compiler.AllocateVReg(ssa.TypeI64) } else { - tmp = operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) + tmp = m.compiler.AllocateVReg(ssa.TypeI32) } - m.lowerAtomicRmwImpl(op, rn, rs, rt, tmp, size, negateArg, flipArg, _64) + m.lowerAtomicRmwImpl(op, rn.nr(), rs.nr(), rt, tmp, size, negateArg, flipArg, _64) } -func (m *machine) lowerAtomicRmwImpl(op atomicRmwOp, rn, rs, rt, tmp operand, size uint64, negateArg, flipArg, dst64bit bool) { +func (m *machine) lowerAtomicRmwImpl(op atomicRmwOp, rn, rs, rt, tmp regalloc.VReg, size uint64, negateArg, flipArg, dst64bit bool) { switch { case negateArg: neg := m.allocateInstr() - neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), rs, dst64bit) + neg.asALU(aluOpSub, tmp, operandNR(xzrVReg), operandNR(rs), dst64bit) m.insert(neg) case flipArg: flip := m.allocateInstr() - flip.asALU(aluOpOrn, tmp, operandNR(xzrVReg), rs, dst64bit) + flip.asALU(aluOpOrn, tmp, operandNR(xzrVReg), operandNR(rs), dst64bit) m.insert(flip) default: tmp = rs @@ -2139,32 +2142,32 @@ func (m *machine) lowerAtomicCas(si *ssa.Instruction) { rn := m.getOperand_NR(addrDef, extModeNone) rt := m.getOperand_NR(replDef, extModeNone) rs := m.getOperand_NR(expDef, extModeNone) - tmp := operandNR(m.compiler.AllocateVReg(si.Return().Type())) + tmp := m.compiler.AllocateVReg(si.Return().Type()) _64 := si.Return().Type().Bits() == 64 // rs is overwritten by CAS, so we need to move it to the result register before the instruction // in case when it is used somewhere else. 
mov := m.allocateInstr() if _64 { - mov.asMove64(tmp.nr(), rs.nr()) + mov.asMove64(tmp, rs.nr()) } else { - mov.asMove32(tmp.nr(), rs.nr()) + mov.asMove32(tmp, rs.nr()) } m.insert(mov) - m.lowerAtomicCasImpl(rn, tmp, rt, size) + m.lowerAtomicCasImpl(rn.nr(), tmp, rt.nr(), size) mov2 := m.allocateInstr() rd := m.compiler.VRegOf(si.Return()) if _64 { - mov2.asMove64(rd, tmp.nr()) + mov2.asMove64(rd, tmp) } else { - mov2.asMove32(rd, tmp.nr()) + mov2.asMove32(rd, tmp) } m.insert(mov2) } -func (m *machine) lowerAtomicCasImpl(rn, rs, rt operand, size uint64) { +func (m *machine) lowerAtomicCasImpl(rn, rs, rt regalloc.VReg, size uint64) { cas := m.allocateInstr() cas.asAtomicCas(rn, rs, rt, size) m.insert(cas) @@ -2176,12 +2179,12 @@ func (m *machine) lowerAtomicLoad(si *ssa.Instruction) { addrDef := m.compiler.ValueDefinition(addr) rn := m.getOperand_NR(addrDef, extModeNone) - rt := operandNR(m.compiler.VRegOf(si.Return())) + rt := m.compiler.VRegOf(si.Return()) - m.lowerAtomicLoadImpl(rn, rt, size) + m.lowerAtomicLoadImpl(rn.nr(), rt, size) } -func (m *machine) lowerAtomicLoadImpl(rn, rt operand, size uint64) { +func (m *machine) lowerAtomicLoadImpl(rn, rt regalloc.VReg, size uint64) { ld := m.allocateInstr() ld.asAtomicLoad(rn, rt, size) m.insert(ld) diff --git a/internal/engine/wazevo/backend/isa/arm64/lower_instr_test.go b/internal/engine/wazevo/backend/isa/arm64/lower_instr_test.go index e87e27606f..6a367944e4 100644 --- a/internal/engine/wazevo/backend/isa/arm64/lower_instr_test.go +++ b/internal/engine/wazevo/backend/isa/arm64/lower_instr_test.go @@ -380,7 +380,7 @@ L2: regalloc.VReg(3).SetRegType(regalloc.RegTypeInt) mc, _, m := newSetupWithMockContext() mc.typeOf = map[regalloc.VRegID]ssa.Type{execCtx.ID(): ssa.TypeI64, 2: ssa.TypeI64, 3: ssa.TypeI64} - m.lowerIDiv(execCtx, operandNR(rd), operandNR(rn), operandNR(rm), tc._64bit, tc.signed) + m.lowerIDiv(execCtx, rd, operandNR(rn), operandNR(rm), tc._64bit, tc.signed) require.Equal(t, tc.exp, "\n"+formatEmittedInstructionsInCurrentBlock(m)+"\n") }) } @@ -451,7 +451,7 @@ fcvtzu w1, s2 t.Run(tc.name, func(t *testing.T) { mc, _, m := newSetupWithMockContext() mc.typeOf = map[regalloc.VRegID]ssa.Type{v2VReg.ID(): ssa.TypeI64, x15VReg.ID(): ssa.TypeI64} - m.lowerFpuToInt(operandNR(x1VReg), operandNR(v2VReg), x15VReg, false, false, false, tc.nontrapping) + m.lowerFpuToInt(x1VReg, operandNR(v2VReg), x15VReg, false, false, false, tc.nontrapping) require.Equal(t, tc.expectedAsm, "\n"+formatEmittedInstructionsInCurrentBlock(m)+"\n") m.executableContext.FlushPendingInstructions() @@ -533,7 +533,7 @@ mul x1.4s, x2.4s, x15.4s } { t.Run(tc.name, func(t *testing.T) { _, _, m := newSetupWithMockContext() - m.lowerVIMul(operandNR(x1VReg), operandNR(x2VReg), operandNR(x15VReg), tc.arrangement) + m.lowerVIMul(x1VReg, operandNR(x2VReg), operandNR(x15VReg), tc.arrangement) require.Equal(t, tc.expectedAsm, "\n"+formatEmittedInstructionsInCurrentBlock(m)+"\n") m.executableContext.FlushPendingInstructions() @@ -638,7 +638,7 @@ cset x15, ne } { t.Run(tc.name, func(t *testing.T) { _, _, m := newSetupWithMockContext() - m.lowerVcheckTrue(tc.op, operandNR(x1VReg), operandNR(x15VReg), tc.arrangement) + m.lowerVcheckTrue(tc.op, operandNR(x1VReg), x15VReg, tc.arrangement) require.Equal(t, tc.expectedAsm, "\n"+formatEmittedInstructionsInCurrentBlock(m)+"\n") m.executableContext.FlushPendingInstructions() @@ -723,7 +723,7 @@ add w15, w15, w1?, lsl #1 } { t.Run(tc.name, func(t *testing.T) { _, _, m := newSetupWithMockContext() - m.lowerVhighBits(operandNR(x1VReg), 
operandNR(x15VReg), tc.arrangement) + m.lowerVhighBits(operandNR(x1VReg), x15VReg, tc.arrangement) require.Equal(t, tc.expectedAsm, "\n"+formatEmittedInstructionsInCurrentBlock(m)+"\n") m.executableContext.FlushPendingInstructions() @@ -772,7 +772,7 @@ tbl x1.16b, { v29.16b, v30.16b }, v1?.16b lane1 := lanes[7]<<56 | lanes[6]<<48 | lanes[5]<<40 | lanes[4]<<32 | lanes[3]<<24 | lanes[2]<<16 | lanes[1]<<8 | lanes[0] lane2 := lanes[15]<<56 | lanes[14]<<48 | lanes[13]<<40 | lanes[12]<<32 | lanes[11]<<24 | lanes[10]<<16 | lanes[9]<<8 | lanes[8] - m.lowerShuffle(operandNR(x1VReg), operandNR(x2VReg), operandNR(x15VReg), lane1, lane2) + m.lowerShuffle(x1VReg, operandNR(x2VReg), operandNR(x15VReg), lane1, lane2) require.Equal(t, tc.expectedAsm, "\n"+formatEmittedInstructionsInCurrentBlock(m)+"\n") m.executableContext.FlushPendingInstructions() @@ -829,7 +829,7 @@ ushl x1.16b, x2.16b, v2?.16b } { t.Run(tc.name, func(t *testing.T) { _, _, m := newSetupWithMockContext() - m.lowerVShift(tc.op, operandNR(x1VReg), operandNR(x2VReg), operandNR(x15VReg), tc.arrangement) + m.lowerVShift(tc.op, x1VReg, operandNR(x2VReg), operandNR(x15VReg), tc.arrangement) require.Equal(t, tc.expectedAsm, "\n"+formatEmittedInstructionsInCurrentBlock(m)+"\n") m.executableContext.FlushPendingInstructions() @@ -845,12 +845,12 @@ func TestMachine_lowerSelectVec(t *testing.T) { c := operandNR(m.compiler.AllocateVReg(ssa.TypeI32)) rn := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) rm := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) - rd := operandNR(m.compiler.AllocateVReg(ssa.TypeV128)) + rd := m.compiler.AllocateVReg(ssa.TypeV128) require.Equal(t, 1, int(c.reg().ID())) require.Equal(t, 2, int(rn.reg().ID())) require.Equal(t, 3, int(rm.reg().ID())) - require.Equal(t, 4, int(rd.reg().ID())) + require.Equal(t, 4, int(rd.ID())) m.lowerSelectVec(c, rn, rm, rd) require.Equal(t, ` @@ -898,17 +898,17 @@ mov v5?.8b, v6?.8b typ = ssa.TypeI32 ftyp = ssa.TypeF32 } - tmpI := operandNR(m.compiler.AllocateVReg(typ)) - tmpF := operandNR(m.compiler.AllocateVReg(ftyp)) + tmpI := m.compiler.AllocateVReg(typ) + tmpF := m.compiler.AllocateVReg(ftyp) rn := operandNR(m.compiler.AllocateVReg(ftyp)) rm := operandNR(m.compiler.AllocateVReg(ftyp)) - rd := operandNR(m.compiler.AllocateVReg(ftyp)) + rd := m.compiler.AllocateVReg(ftyp) - require.Equal(t, 1, int(tmpI.reg().ID())) - require.Equal(t, 2, int(tmpF.reg().ID())) + require.Equal(t, 1, int(tmpI.ID())) + require.Equal(t, 2, int(tmpF.ID())) require.Equal(t, 3, int(rn.reg().ID())) require.Equal(t, 4, int(rm.reg().ID())) - require.Equal(t, 5, int(rd.reg().ID())) + require.Equal(t, 5, int(rd.ID())) m.lowerFcopysignImpl(rd, rn, rm, tmpI, tmpF, tc._64bit) require.Equal(t, tc.exp, "\n"+formatEmittedInstructionsInCurrentBlock(m)+"\n") @@ -944,15 +944,15 @@ ror x4?, x2?, x1? } else { typ = ssa.TypeI32 } - tmpI := operandNR(m.compiler.AllocateVReg(typ)) + tmpI := m.compiler.AllocateVReg(typ) rn := operandNR(m.compiler.AllocateVReg(typ)) rm := operandNR(m.compiler.AllocateVReg(typ)) - rd := operandNR(m.compiler.AllocateVReg(typ)) + rd := m.compiler.AllocateVReg(typ) - require.Equal(t, 1, int(tmpI.reg().ID())) + require.Equal(t, 1, int(tmpI.ID())) require.Equal(t, 2, int(rn.reg().ID())) require.Equal(t, 3, int(rm.reg().ID())) - require.Equal(t, 4, int(rd.reg().ID())) + require.Equal(t, 4, int(rd.ID())) m.lowerRotlImpl(rd, rn, rm, tmpI, tc._64bit) require.Equal(t, tc.exp, "\n"+formatEmittedInstructionsInCurrentBlock(m)+"\n") @@ -1370,15 +1370,15 @@ swpalb w3?, w4?, x2? 
} else { typ = ssa.TypeI32 } - tmp := operandNR(m.compiler.AllocateVReg(typ)) - rn := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) - rs := operandNR(m.compiler.AllocateVReg(typ)) - rt := operandNR(m.compiler.AllocateVReg(typ)) + tmp := m.compiler.AllocateVReg(typ) + rn := m.compiler.AllocateVReg(ssa.TypeI64) + rs := m.compiler.AllocateVReg(typ) + rt := m.compiler.AllocateVReg(typ) - require.Equal(t, 1, int(tmp.reg().ID())) - require.Equal(t, 2, int(rn.reg().ID())) - require.Equal(t, 3, int(rs.reg().ID())) - require.Equal(t, 4, int(rt.reg().ID())) + require.Equal(t, 1, int(tmp.ID())) + require.Equal(t, 2, int(rn.ID())) + require.Equal(t, 3, int(rs.ID())) + require.Equal(t, 4, int(rt.ID())) m.lowerAtomicRmwImpl(tc.op, rn, rs, rt, tmp, tc.size, tc.negateArg, tc.flipArg, tc._64bit) require.Equal(t, tc.exp, "\n"+formatEmittedInstructionsInCurrentBlock(m)+"\n") @@ -1458,13 +1458,13 @@ casalb w2?, w3?, x1? } else { typ = ssa.TypeI32 } - rn := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) - rs := operandNR(m.compiler.AllocateVReg(typ)) - rt := operandNR(m.compiler.AllocateVReg(typ)) + rn := m.compiler.AllocateVReg(ssa.TypeI64) + rs := m.compiler.AllocateVReg(typ) + rt := m.compiler.AllocateVReg(typ) - require.Equal(t, 1, int(rn.reg().ID())) - require.Equal(t, 2, int(rs.reg().ID())) - require.Equal(t, 3, int(rt.reg().ID())) + require.Equal(t, 1, int(rn.ID())) + require.Equal(t, 2, int(rs.ID())) + require.Equal(t, 3, int(rt.ID())) m.lowerAtomicCasImpl(rn, rs, rt, tc.size) require.Equal(t, tc.exp, "\n"+formatEmittedInstructionsInCurrentBlock(m)+"\n") @@ -1544,11 +1544,11 @@ ldarb w2?, x1? } else { typ = ssa.TypeI32 } - rn := operandNR(m.compiler.AllocateVReg(ssa.TypeI64)) - rt := operandNR(m.compiler.AllocateVReg(typ)) + rn := m.compiler.AllocateVReg(ssa.TypeI64) + rt := m.compiler.AllocateVReg(typ) - require.Equal(t, 1, int(rn.reg().ID())) - require.Equal(t, 2, int(rt.reg().ID())) + require.Equal(t, 1, int(rn.ID())) + require.Equal(t, 2, int(rt.ID())) m.lowerAtomicLoadImpl(rn, rt, tc.size) require.Equal(t, tc.exp, "\n"+formatEmittedInstructionsInCurrentBlock(m)+"\n") diff --git a/internal/engine/wazevo/backend/isa/arm64/lower_mem.go b/internal/engine/wazevo/backend/isa/arm64/lower_mem.go index 4842eaa382..fd0760d723 100644 --- a/internal/engine/wazevo/backend/isa/arm64/lower_mem.go +++ b/internal/engine/wazevo/backend/isa/arm64/lower_mem.go @@ -24,6 +24,14 @@ type ( addressModeKind byte ) +func resetAddressMode(a *addressMode) { + a.kind = 0 + a.rn = 0 + a.rm = 0 + a.extOp = 0 + a.imm = 0 +} + const ( // addressModeKindRegExtended takes a base register and an index register. The index register is sign/zero-extended, // and then scaled by bits(type)/8. 
@@ -140,15 +148,17 @@ func (a addressMode) format(dstSizeBits byte) (ret string) {
 	return
 }
 
-func addressModePreOrPostIndex(rn regalloc.VReg, imm int64, preIndex bool) addressMode {
+func addressModePreOrPostIndex(m *machine, rn regalloc.VReg, imm int64, preIndex bool) *addressMode {
 	if !offsetFitsInAddressModeKindRegSignedImm9(imm) {
 		panic(fmt.Sprintf("BUG: offset %#x does not fit in addressModeKindRegSignedImm9", imm))
 	}
+	mode := m.amodePool.Allocate()
 	if preIndex {
-		return addressMode{kind: addressModeKindPreIndex, rn: rn, imm: imm}
+		*mode = addressMode{kind: addressModeKindPreIndex, rn: rn, imm: imm}
 	} else {
-		return addressMode{kind: addressModeKindPostIndex, rn: rn, imm: imm}
+		*mode = addressMode{kind: addressModeKindPostIndex, rn: rn, imm: imm}
 	}
+	return mode
 }
 
 func offsetFitsInAddressModeKindRegUnsignedImm12(dstSizeInBits byte, offset int64) bool {
@@ -207,9 +217,9 @@ func (m *machine) lowerExtLoad(op ssa.Opcode, ptr ssa.Value, offset uint32, ret
 	amode := m.lowerToAddressMode(ptr, offset, size)
 	load := m.allocateInstr()
 	if signed {
-		load.asSLoad(operandNR(ret), amode, size)
+		load.asSLoad(ret, amode, size)
 	} else {
-		load.asULoad(operandNR(ret), amode, size)
+		load.asULoad(ret, amode, size)
 	}
 	m.insert(load)
 }
@@ -221,11 +231,11 @@ func (m *machine) lowerLoad(ptr ssa.Value, offset uint32, typ ssa.Type, ret ssa.
 	load := m.allocateInstr()
 	switch typ {
 	case ssa.TypeI32, ssa.TypeI64:
-		load.asULoad(operandNR(dst), amode, typ.Bits())
+		load.asULoad(dst, amode, typ.Bits())
 	case ssa.TypeF32, ssa.TypeF64:
-		load.asFpuLoad(operandNR(dst), amode, typ.Bits())
+		load.asFpuLoad(dst, amode, typ.Bits())
 	case ssa.TypeV128:
-		load.asFpuLoad(operandNR(dst), amode, 128)
+		load.asFpuLoad(dst, amode, 128)
 	default:
 		panic("TODO")
 	}
@@ -239,7 +249,7 @@ func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, lane ssa.VecLane,
 	m.lowerConstantI64(offsetReg, int64(offset))
 	addedBase := m.addReg64ToReg64(base, offsetReg)
 
-	rd := operandNR(m.compiler.VRegOf(ret))
+	rd := m.compiler.VRegOf(ret)
 
 	ld1r := m.allocateInstr()
 	ld1r.asVecLoad1R(rd, operandNR(addedBase), ssaLaneToArrangement(lane))
@@ -258,7 +268,7 @@ func (m *machine) lowerStore(si *ssa.Instruction) {
 }
 
 // lowerToAddressMode converts a pointer to an addressMode that can be used as an operand for load/store instructions.
-func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte) (amode addressMode) {
+func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte) (amode *addressMode) {
 	// TODO: currently the instruction selection logic doesn't support addressModeKindRegScaledExtended and
 	// addressModeKindRegScaled since collectAddends doesn't take ssa.OpcodeIshl into account. This should be fixed
 	// to support more efficient address resolution.
@@ -272,32 +282,33 @@ func (m *machine) lowerToAddressMode(ptr ssa.Value, offsetBase uint32, size byte
 // During the construction, this might emit additional instructions.
 //
 // Extracted as a separate function for easy testing.
-func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32], a64s *wazevoapi.Queue[regalloc.VReg], size byte, offset int64) (amode addressMode) { +func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32], a64s *wazevoapi.Queue[regalloc.VReg], size byte, offset int64) (amode *addressMode) { + amode = m.amodePool.Allocate() switch a64sExist, a32sExist := !a64s.Empty(), !a32s.Empty(); { case a64sExist && a32sExist: var base regalloc.VReg base = a64s.Dequeue() var a32 addend32 a32 = a32s.Dequeue() - amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext} + *amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: a32.r, extOp: a32.ext} case a64sExist && offsetFitsInAddressModeKindRegUnsignedImm12(size, offset): var base regalloc.VReg base = a64s.Dequeue() - amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset} + *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: offset} offset = 0 case a64sExist && offsetFitsInAddressModeKindRegSignedImm9(offset): var base regalloc.VReg base = a64s.Dequeue() - amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset} + *amode = addressMode{kind: addressModeKindRegSignedImm9, rn: base, imm: offset} offset = 0 case a64sExist: var base regalloc.VReg base = a64s.Dequeue() if !a64s.Empty() { index := a64s.Dequeue() - amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */} + *amode = addressMode{kind: addressModeKindRegReg, rn: base, rm: index, extOp: extendOpUXTX /* indicates index reg is 64-bit */} } else { - amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0} + *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0} } case a32sExist: base32 := a32s.Dequeue() @@ -314,14 +325,14 @@ func (m *machine) lowerToAddressModeFromAddends(a32s *wazevoapi.Queue[addend32], if !a32s.Empty() { index := a32s.Dequeue() - amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext} + *amode = addressMode{kind: addressModeKindRegExtended, rn: base, rm: index.r, extOp: index.ext} } else { - amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0} + *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: base, imm: 0} } default: // Only static offsets. 
tmpReg := m.compiler.AllocateVReg(ssa.TypeI64) m.lowerConstantI64(tmpReg, offset) - amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpReg, imm: 0} + *amode = addressMode{kind: addressModeKindRegUnsignedImm12, rn: tmpReg, imm: 0} offset = 0 } @@ -411,13 +422,13 @@ func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) { rd = m.compiler.AllocateVReg(ssa.TypeI64) alu := m.allocateInstr() if imm12Op, ok := asImm12Operand(uint64(c)); ok { - alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), imm12Op, true) + alu.asALU(aluOpAdd, rd, operandNR(r), imm12Op, true) } else if imm12Op, ok = asImm12Operand(uint64(-c)); ok { - alu.asALU(aluOpSub, operandNR(rd), operandNR(r), imm12Op, true) + alu.asALU(aluOpSub, rd, operandNR(r), imm12Op, true) } else { tmp := m.compiler.AllocateVReg(ssa.TypeI64) m.load64bitConst(c, tmp) - alu.asALU(aluOpAdd, operandNR(rd), operandNR(r), operandNR(tmp), true) + alu.asALU(aluOpAdd, rd, operandNR(r), operandNR(tmp), true) } m.insert(alu) return @@ -426,7 +437,7 @@ func (m *machine) addConstToReg64(r regalloc.VReg, c int64) (rd regalloc.VReg) { func (m *machine) addReg64ToReg64(rn, rm regalloc.VReg) (rd regalloc.VReg) { rd = m.compiler.AllocateVReg(ssa.TypeI64) alu := m.allocateInstr() - alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandNR(rm), true) + alu.asALU(aluOpAdd, rd, operandNR(rn), operandNR(rm), true) m.insert(alu) return } @@ -434,7 +445,7 @@ func (m *machine) addReg64ToReg64(rn, rm regalloc.VReg) (rd regalloc.VReg) { func (m *machine) addRegToReg64Ext(rn, rm regalloc.VReg, ext extendOp) (rd regalloc.VReg) { rd = m.compiler.AllocateVReg(ssa.TypeI64) alu := m.allocateInstr() - alu.asALU(aluOpAdd, operandNR(rd), operandNR(rn), operandER(rm, ext, 64), true) + alu.asALU(aluOpAdd, rd, operandNR(rn), operandER(rm, ext, 64), true) m.insert(alu) return } diff --git a/internal/engine/wazevo/backend/isa/arm64/lower_mem_test.go b/internal/engine/wazevo/backend/isa/arm64/lower_mem_test.go index 82ab297f3a..e7a054d75a 100644 --- a/internal/engine/wazevo/backend/isa/arm64/lower_mem_test.go +++ b/internal/engine/wazevo/backend/isa/arm64/lower_mem_test.go @@ -807,7 +807,7 @@ func TestMachine_lowerToAddressModeFromAddends(t *testing.T) { } actual := m.lowerToAddressModeFromAddends(&a32s, &a64s, tc.dstSizeInBits, tc.offset) require.Equal(t, strings.Join(tc.insts, "\n"), formatEmittedInstructionsInCurrentBlock(m)) - require.Equal(t, tc.exp, actual, actual.format(tc.dstSizeInBits)) + require.Equal(t, &tc.exp, actual, actual.format(tc.dstSizeInBits)) }) } } diff --git a/internal/engine/wazevo/backend/isa/arm64/machine.go b/internal/engine/wazevo/backend/isa/arm64/machine.go index b435d9ba96..311b34f4a1 100644 --- a/internal/engine/wazevo/backend/isa/arm64/machine.go +++ b/internal/engine/wazevo/backend/isa/arm64/machine.go @@ -21,6 +21,8 @@ type ( regAlloc regalloc.Allocator regAllocFn *backend.RegAllocFunction[*instruction, *machine] + amodePool wazevoapi.Pool[addressMode] + // addendsWorkQueue is used during address lowering, defined here for reuse. 
addendsWorkQueue wazevoapi.Queue[ssa.Value] addends32 wazevoapi.Queue[addend32] @@ -105,6 +107,7 @@ func NewBackend() backend.Machine { spillSlots: make(map[regalloc.VRegID]int64), executableContext: newExecutableContext(), regAlloc: regalloc.NewAllocator(regInfo), + amodePool: wazevoapi.NewPool[addressMode](resetAddressMode), } return m } @@ -149,6 +152,7 @@ func (m *machine) Reset() { m.maxRequiredStackSizeForCalls = 0 m.executableContext.Reset() m.jmpTableTargets = m.jmpTableTargets[:0] + m.amodePool.Reset() } // SetCurrentABI implements backend.Machine SetCurrentABI. @@ -209,7 +213,7 @@ func (m *machine) allocateNop() *instruction { } func (m *machine) resolveAddressingMode(arg0offset, ret0offset int64, i *instruction) { - amode := &i.amode + amode := i.getAmode() switch amode.kind { case addressModeKindResultStackSpace: amode.imm += ret0offset diff --git a/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go b/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go index 466fac4640..d9032f9218 100644 --- a/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go +++ b/internal/engine/wazevo/backend/isa/arm64/machine_pro_epi_logue.go @@ -70,7 +70,7 @@ func (m *machine) setupPrologue() { // +-----------------+ <----- SP // (low address) // - _amode := addressModePreOrPostIndex(spVReg, + _amode := addressModePreOrPostIndex(m, spVReg, -16, // stack pointer must be 16-byte aligned. true, // Decrement before store. ) @@ -159,7 +159,7 @@ func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruc sizeOfArgRetReg = tmpRegVReg subSp := m.allocateInstr() - subSp.asALU(aluOpSub, operandNR(spVReg), operandNR(spVReg), operandNR(sizeOfArgRetReg), true) + subSp.asALU(aluOpSub, spVReg, operandNR(spVReg), operandNR(sizeOfArgRetReg), true) cur = linkInstr(cur, subSp) } else { sizeOfArgRetReg = xzrVReg @@ -168,7 +168,7 @@ func (m *machine) createReturnAddrAndSizeOfArgRetSlot(cur *instruction) *instruc // Saves the return address (lr) and the size_of_arg_ret below the SP. // size_of_arg_ret is used for stack unwinding. pstr := m.allocateInstr() - amode := addressModePreOrPostIndex(spVReg, -16, true /* decrement before store */) + amode := addressModePreOrPostIndex(m, spVReg, -16, true /* decrement before store */) pstr.asStorePair64(lrVReg, sizeOfArgRetReg, amode) cur = linkInstr(cur, pstr) return cur @@ -182,7 +182,7 @@ func (m *machine) createFrameSizeSlot(cur *instruction, s int64) *instruction { } else { frameSizeReg = xzrVReg } - _amode := addressModePreOrPostIndex(spVReg, + _amode := addressModePreOrPostIndex(m, spVReg, -16, // stack pointer must be 16-byte aligned. true, // Decrement before store. ) @@ -213,7 +213,7 @@ func (m *machine) postRegAlloc() { m.executableContext.PendingInstructions = m.executableContext.PendingInstructions[:0] default: // Removes the redundant copy instruction. - if cur.IsCopy() && cur.rn.realReg() == cur.rd.realReg() { + if cur.IsCopy() && cur.rn.realReg() == cur.rd.RealReg() { prev, next := cur.prev, cur.next // Remove the copy instruction. prev.next = next @@ -286,16 +286,16 @@ func (m *machine) setupEpilogueAfter(cur *instruction) { for i := range m.clobberedRegs { vr := m.clobberedRegs[l-i] // reverse order to restore. load := m.allocateInstr() - amode := addressModePreOrPostIndex(spVReg, + amode := addressModePreOrPostIndex(m, spVReg, 16, // stack pointer must be 16-byte aligned. false, // Increment after store. ) // TODO: pair loads to reduce the number of instructions. 
switch regTypeToRegisterSizeInBits(vr.RegType()) { case 64: // save int reg. - load.asULoad(operandNR(vr), amode, 64) + load.asULoad(vr, amode, 64) case 128: // save vector reg. - load.asFpuLoad(operandNR(vr), amode, 128) + load.asFpuLoad(vr, amode, 128) } cur = linkInstr(cur, load) } @@ -317,8 +317,8 @@ func (m *machine) setupEpilogueAfter(cur *instruction) { // SP----> +-----------------+ ldr := m.allocateInstr() - ldr.asULoad(operandNR(lrVReg), - addressModePreOrPostIndex(spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64) + ldr.asULoad(lrVReg, + addressModePreOrPostIndex(m, spVReg, 16 /* stack pointer must be 16-byte aligned. */, false /* increment after loads */), 64) cur = linkInstr(cur, ldr) if s := int64(m.currentABI.AlignedArgResultStackSlotSize()); s > 0 { @@ -351,14 +351,14 @@ func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instructi if immm12op, ok := asImm12Operand(uint64(requiredStackSize)); ok { // sub tmp, sp, #requiredStackSize sub := m.allocateInstr() - sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), immm12op, true) + sub.asALU(aluOpSub, tmpRegVReg, operandNR(spVReg), immm12op, true) cur = linkInstr(cur, sub) } else { // This case, we first load the requiredStackSize into the temporary register, cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize) // Then subtract it. sub := m.allocateInstr() - sub.asALU(aluOpSub, operandNR(tmpRegVReg), operandNR(spVReg), operandNR(tmpRegVReg), true) + sub.asALU(aluOpSub, tmpRegVReg, operandNR(spVReg), operandNR(tmpRegVReg), true) cur = linkInstr(cur, sub) } @@ -366,16 +366,18 @@ func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instructi // ldr tmp2, [executionContext #StackBottomPtr] ldr := m.allocateInstr() - ldr.asULoad(operandNR(tmp2), addressMode{ + amode := m.amodePool.Allocate() + *amode = addressMode{ kind: addressModeKindRegUnsignedImm12, rn: x0VReg, // execution context is always the first argument. imm: wazevoapi.ExecutionContextOffsetStackBottomPtr.I64(), - }, 64) + } + ldr.asULoad(tmp2, amode, 64) cur = linkInstr(cur, ldr) // subs xzr, tmp, tmp2 subs := m.allocateInstr() - subs.asALU(aluOpSubS, operandNR(xzrVReg), operandNR(tmpRegVReg), operandNR(tmp2), true) + subs.asALU(aluOpSubS, xzrVReg, operandNR(tmpRegVReg), operandNR(tmp2), true) cur = linkInstr(cur, subs) // b.ge #imm @@ -388,22 +390,25 @@ func (m *machine) insertStackBoundsCheck(requiredStackSize int64, cur *instructi // First load the requiredStackSize into the temporary register, cur = m.lowerConstantI64AndInsert(cur, tmpRegVReg, requiredStackSize) setRequiredStackSize := m.allocateInstr() - setRequiredStackSize.asStore(operandNR(tmpRegVReg), - addressMode{ - kind: addressModeKindRegUnsignedImm12, - // Execution context is always the first argument. - rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(), - }, 64) + amode := m.amodePool.Allocate() + *amode = addressMode{ + kind: addressModeKindRegUnsignedImm12, + // Execution context is always the first argument. 
+			rn: x0VReg, imm: wazevoapi.ExecutionContextOffsetStackGrowRequiredSize.I64(),
+		}
+		setRequiredStackSize.asStore(operandNR(tmpRegVReg), amode, 64)
 		cur = linkInstr(cur, setRequiredStackSize)
 	}
 
 	ldrAddress := m.allocateInstr()
-	ldrAddress.asULoad(operandNR(tmpRegVReg), addressMode{
+	amode2 := m.amodePool.Allocate()
+	*amode2 = addressMode{
 		kind: addressModeKindRegUnsignedImm12,
 		rn:   x0VReg, // execution context is always the first argument
 		imm:  wazevoapi.ExecutionContextOffsetStackGrowCallTrampolineAddress.I64(),
-	}, 64)
+	}
+	ldrAddress.asULoad(tmpRegVReg, amode2, 64)
 	cur = linkInstr(cur, ldrAddress)
 
 	// Then jumps to the stack grow call sequence's address, meaning
diff --git a/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go b/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go
index 1c8793b73d..c7eb92cc20 100644
--- a/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go
+++ b/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go
@@ -91,7 +91,7 @@ func (m *machine) InsertStoreRegisterAt(v regalloc.VReg, instr *instruction, aft
 	}
 
 	offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
-	var amode addressMode
+	var amode *addressMode
 	cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true)
 	store := m.allocateInstr()
 	store.asStore(operandNR(v), amode, typ.Bits())
@@ -116,16 +116,16 @@ func (m *machine) InsertReloadRegisterAt(v regalloc.VReg, instr *instruction, af
 	}
 
 	offsetFromSP := m.getVRegSpillSlotOffsetFromSP(v.ID(), typ.Size())
-	var amode addressMode
+	var amode *addressMode
 	cur, amode = m.resolveAddressModeForOffsetAndInsert(cur, offsetFromSP, typ.Bits(), spVReg, true)
 	load := m.allocateInstr()
 	switch typ {
 	case ssa.TypeI32, ssa.TypeI64:
-		load.asULoad(operandNR(v), amode, typ.Bits())
+		load.asULoad(v, amode, typ.Bits())
 	case ssa.TypeF32, ssa.TypeF64:
-		load.asFpuLoad(operandNR(v), amode, typ.Bits())
+		load.asFpuLoad(v, amode, typ.Bits())
 	case ssa.TypeV128:
-		load.asFpuLoad(operandNR(v), amode, 128)
+		load.asFpuLoad(v, amode, 128)
 	default:
 		panic("TODO")
 	}
diff --git a/internal/engine/wazevo/backend/isa/arm64/machine_test.go b/internal/engine/wazevo/backend/isa/arm64/machine_test.go
index a585f1fba7..882b7384ed 100644
--- a/internal/engine/wazevo/backend/isa/arm64/machine_test.go
+++ b/internal/engine/wazevo/backend/isa/arm64/machine_test.go
@@ -9,39 +9,43 @@ import (
 )
 
 func TestMachine_resolveAddressingMode(t *testing.T) {
+	m := NewBackend().(*machine)
 	t.Run("imm12/arg", func(t *testing.T) {
-		m := &machine{}
 		i := &instruction{}
-		i.asULoad(operandNR(x17VReg), addressMode{
+		amode := m.amodePool.Allocate()
+		*amode = addressMode{
 			kind: addressModeKindArgStackSpace,
 			rn:   spVReg,
 			imm:  128,
-		}, 64)
+		}
+		i.asULoad(x17VReg, amode, 64)
 		m.resolveAddressingMode(1024, 0, i)
-		require.Equal(t, addressModeKindRegUnsignedImm12, i.amode.kind)
-		require.Equal(t, int64(128+1024), i.amode.imm)
+		require.Equal(t, addressModeKindRegUnsignedImm12, i.getAmode().kind)
+		require.Equal(t, int64(128+1024), i.getAmode().imm)
 	})
 	t.Run("imm12/result", func(t *testing.T) {
-		m := &machine{}
 		i := &instruction{}
-		i.asULoad(operandNR(x17VReg), addressMode{
+		amode := m.amodePool.Allocate()
+		*amode = addressMode{
 			kind: addressModeKindResultStackSpace,
 			rn:   spVReg,
 			imm:  128,
-		}, 64)
+		}
+		i.asULoad(x17VReg, amode, 64)
 		m.resolveAddressingMode(0, 256, i)
-		require.Equal(t, addressModeKindRegUnsignedImm12, i.amode.kind)
-		require.Equal(t, int64(128+256), i.amode.imm)
+		require.Equal(t, addressModeKindRegUnsignedImm12, i.getAmode().kind)
+		require.Equal(t, int64(128+256), i.getAmode().imm)
 	})
 	t.Run("tmp reg", func(t *testing.T) {
-		m := &machine{executableContext: newExecutableContext()}
 		root := &instruction{kind: udf}
 		i := &instruction{prev: root}
-		i.asULoad(operandNR(x17VReg), addressMode{
+		amode := m.amodePool.Allocate()
+		*amode = addressMode{
 			kind: addressModeKindResultStackSpace,
 			rn:   spVReg,
-		}, 64)
+		}
+		i.asULoad(x17VReg, amode, 64)
 		m.resolveAddressingMode(0, 0x40000001, i)
 		m.executableContext.RootInstr = root
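
The pattern adopted throughout this patch is to allocate addressMode objects from m.amodePool (a wazevoapi.Pool[addressMode] constructed with the resetAddressMode callback and cleared in machine.Reset) and then overwrite the pooled element in place with *amode = addressMode{...}, so instructions can hold a stable *addressMode without a per-instruction heap allocation. Below is a minimal sketch of that allocate/overwrite/Reset contract; the pool, newPool, and addrMode names are invented for illustration, and the real wazevoapi.Pool may be implemented differently (for example, it could grow in fixed-size chunks).

package main

import "fmt"

// pool is a simplified stand-in for a generic object pool: Allocate returns a
// pointer to a reset element that stays valid until Reset, and Reset recycles
// every element at once for the next compilation pass.
type pool[T any] struct {
	allocated []*T
	resetFn   func(*T)
	next      int
}

func newPool[T any](resetFn func(*T)) pool[T] {
	return pool[T]{resetFn: resetFn}
}

// Allocate hands out a pointer whose contents have been reset; the caller is
// expected to overwrite it in place, e.g. *amode = addrMode{...}.
func (p *pool[T]) Allocate() *T {
	if p.next == len(p.allocated) {
		p.allocated = append(p.allocated, new(T))
	}
	v := p.allocated[p.next]
	p.next++
	p.resetFn(v)
	return v
}

// Reset marks every previously allocated element as reusable.
func (p *pool[T]) Reset() { p.next = 0 }

// addrMode mirrors only the shape of the backend's addressMode, for illustration.
type addrMode struct {
	kind byte
	imm  int64
}

func main() {
	p := newPool[addrMode](func(a *addrMode) { *a = addrMode{} })

	// Allocate, then overwrite in place: the same shape used at every former
	// addressMode{...} literal in the patch above.
	amode := p.Allocate()
	*amode = addrMode{kind: 1, imm: 128}
	fmt.Println(amode.kind, amode.imm) // 1 128

	// Between compilations the whole pool is recycled rather than freed.
	p.Reset()
}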