Skip to content

Commit 22ddb8a

Browse files
authored
pulley: Implement full 128-bit multiplication (bytecodealliance#10062)
* pulley: Implement full 128-bit multiplication

  While Pulley has lowering rules for widening multiplication, it didn't have a rule for a full 128-bit multiplication, which is possible to generate through CLIF optimizations given wasm input. This commit adds such a lowering to the Cranelift backend but doesn't add any new instructions yet, under the assumption that this probably isn't perf-critical at this time.

* Don't use a fallible `amode_add`
1 parent 362568b commit 22ddb8a

File tree

2 files changed

+57
-2
lines changed

2 files changed

+57
-2
lines changed

cranelift/codegen/src/isa/pulley_shared/lower.isle

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -331,11 +331,36 @@
331331
(pulley_xmul64_s8 a b))
332332

333333
;; 128-bit (or wide) multiplication
334-
(rule (lower (has_type $I128 (imul (uextend a) (uextend b))))
334+
(rule 4 (lower (has_type $I128 (imul (uextend a) (uextend b))))
335335
(pulley_xwidemul64_u (zext64 a) (zext64 b)))
336-
(rule (lower (has_type $I128 (imul (sextend a) (sextend b))))
336+
(rule 4 (lower (has_type $I128 (imul (sextend a) (sextend b))))
337337
(pulley_xwidemul64_s (sext64 a) (sext64 b)))
338338

339+
;; Full 128-bit multiplication of two i128 operands. This rule has no explicit
;; priority, whereas the `uextend`/`sextend` widening-multiply rules above are
;; declared with priority 4.
340+
(rule (lower (has_type $I128 (imul x y)))
341+
(let
342+
;; Get the high/low registers for `x`.
((x_regs ValueRegs x)
343+
(x_lo XReg (value_regs_get x_regs 0))
344+
(x_hi XReg (value_regs_get x_regs 1))
345+
346+
;; Get the high/low registers for `y`.
347+
(y_regs ValueRegs y)
348+
(y_lo XReg (value_regs_get y_regs 0))
349+
(y_hi XReg (value_regs_get y_regs 1))
350+
351+
;; 128-bit multiplication formula. The result is truncated to 128 bits, so
;; the `x_hi * y_hi` term (scaled by 2^128) never contributes:
352+
;;   dst_lo = x_lo * y_lo
353+
;;   dst_hi = mul_high(x_lo, y_lo) + (x_lo * y_hi) + (x_hi * y_lo)
354+
(wide_regs ValueRegs (pulley_xwidemul64_u x_lo y_lo))
355+
(wide_lo XReg (value_regs_get wide_regs 0))
356+
(wide_hi XReg (value_regs_get wide_regs 1))
357+
(tmp_hi1 XReg (pulley_xmul64 x_lo y_hi))
358+
(tmp_hi2 XReg (pulley_xmul64 x_hi y_lo))
359+
(tmp_add XReg (pulley_xadd64 wide_hi tmp_hi1))
360+
(result_hi XReg (pulley_xadd64 tmp_add tmp_hi2))
361+
)
362+
;; Register 0 carries the low half of the product and register 1 the high half.
(value_regs wide_lo result_hi)))
363+
339364
;; vector multiplication
340365
(rule (lower (has_type $I8X16 (imul a b))) (pulley_vmuli8x16 a b))
341366
(rule (lower (has_type $I16X8 (imul a b))) (pulley_vmuli16x8 a b))
@@ -1054,6 +1079,29 @@
10541079
(rule 2 (lower (store flags src @ (value_type (ty_vec128 ty)) addr offset))
10551080
(side_effect (pulley_vstore (amode addr offset) src ty flags)))
10561081

1082+
;; i128 stores
;;
;; The 128-bit source is split into its two 64-bit registers, and two address
;; modes are built from `addr`: one at byte offset 0 and one at byte offset 8.
;; `emit_store_i128` then picks which half goes to which address.
;;
;; NOTE(review): `offset` is bound by the pattern below but is not referenced
;; when building either `Amode`; both addresses use the constants 0 and 8
;; relative to `addr`. Confirm that stores with a non-zero `offset` are handled
;; correctly (or cannot reach this rule).
1083+
1084+
(rule 3 (lower (store flags src @ (value_type $I128) addr offset))
1085+
(let
1086+
((src_regs ValueRegs src)
1087+
(src_lo XReg (value_regs_get src_regs 0))
1088+
(src_hi XReg (value_regs_get src_regs 1))
1089+
(addrp0 XReg addr)
1090+
(addrp8 Amode (Amode.RegOffset addrp0 8)))
1091+
(side_effect (emit_store_i128 flags src_lo src_hi (Amode.RegOffset addrp0 0) addrp8))))
1092+
1093+
;; Helper to handle big/little endian to determine which order the lo/hi
1094+
;; halves of the i128 are stored.
;;
;; Arguments, in order: the memory flags, the low 64-bit half, the high 64-bit
;; half, the address mode for the first 8 bytes (`addrp0`), and the address
;; mode for the second 8 bytes (`addrp8`). Each rule emits two 64-bit
;; `pulley_xstore`s: the first is wrapped in `side_effect` and bound to `_`,
;; the second is the rule's result.
1095+
(decl emit_store_i128 (MemFlags XReg XReg Amode Amode) SideEffectNoResult)
1096+
;; Little-endian: `lo` is stored at `addrp0`, `hi` at `addrp8`.
(rule 0 (emit_store_i128 flags lo hi addrp0 addrp8)
1097+
(if-let (Endianness.Little) (endianness flags))
1098+
(let ((_ InstOutput (side_effect (pulley_xstore addrp0 lo $I64 flags))))
1099+
(pulley_xstore addrp8 hi $I64 flags)))
1100+
;; Big-endian: the halves are swapped, so `hi` is stored at `addrp0` and `lo`
;; at `addrp8`.
(rule 1 (emit_store_i128 flags lo hi addrp0 addrp8)
1101+
(if-let (Endianness.Big) (endianness flags))
1102+
(let ((_ InstOutput (side_effect (pulley_xstore addrp0 hi $I64 flags))))
1103+
(pulley_xstore addrp8 lo $I64 flags)))
1104+
10571105
;; Equivalent of `gen_xload` but for stores.
10581106
(decl gen_xstore (Value Value Offset32 MemFlags Type) SideEffectNoResult)
10591107

@@ -1092,6 +1140,9 @@
10921140
(rule 1 (lower (has_type $I64 (uextend val)))
10931141
(zext64 val))
10941142

1143+
;; Zero-extension to i128: the first register is `val` zero-extended to 64 bits
;; via `zext64`, and the second register is produced by `pulley_xzero`.
(rule 1 (lower (has_type $I128 (uextend val)))
1144+
(value_regs (zext64 val) (pulley_xzero)))
1145+
10951146
;;;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
10961147

10971148
(rule 0 (lower (has_type (fits_in_32 _) (sextend val)))

cranelift/filetests/filetests/runtests/i128-arithmetic.clif

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@ target riscv64
88
target riscv64 has_c has_zcb
99
set enable_multi_ret_implicit_sret
1010
target s390x
11+
target pulley32
12+
target pulley32be
13+
target pulley64
14+
target pulley64be
1115

1216
function %add_i128(i128, i128) -> i128 {
1317
block0(v0: i128,v1: i128):

0 commit comments

Comments
 (0)