|
331 | 331 | (pulley_xmul64_s8 a b))
|
332 | 332 |
|
333 | 333 | ;; 128-bit (or wide) multiplication: when both operands are zero-extended (or both sign-extended), a single widening multiply helper produces the whole i128 result. Priority 4 places these rules ahead of the unprioritized general i128 `imul` rule.
|
334 |
| -(rule (lower (has_type $I128 (imul (uextend a) (uextend b)))) |
| 334 | +(rule 4 (lower (has_type $I128 (imul (uextend a) (uextend b)))) |
335 | 335 | (pulley_xwidemul64_u (zext64 a) (zext64 b)))
|
336 |
| -(rule (lower (has_type $I128 (imul (sextend a) (sextend b)))) |
| 336 | +(rule 4 (lower (has_type $I128 (imul (sextend a) (sextend b)))) |
337 | 337 | (pulley_xwidemul64_s (sext64 a) (sext64 b)))
|
338 | 338 |
|
| 339 | +;; General i128 multiplication built from the two 64-bit halves of each operand; applies when the extend-specific rules above do not match. |
| 340 | +(rule (lower (has_type $I128 (imul x y))) |
| 341 | + (let |
| 342 | + ((x_regs ValueRegs x) |
| 343 | + (x_lo XReg (value_regs_get x_regs 0)) |
| 344 | + (x_hi XReg (value_regs_get x_regs 1)) |
| 345 | + |
| 346 | + ;; Split `y` into its low (index 0) and high (index 1) registers, the same way `x` is split above. |
| 347 | + (y_regs ValueRegs y) |
| 348 | + (y_lo XReg (value_regs_get y_regs 0)) |
| 349 | + (y_hi XReg (value_regs_get y_regs 1)) |
| 350 | + |
| 351 | + ;; 128-bit multiply keeping only the low 128 bits of the product (x_hi * y_hi would land entirely above bit 127, so it is not computed): |
| 352 | + ;; dst_lo = low 64 bits of (x_lo * y_lo) |
| 353 | + ;; dst_hi = mul_high(x_lo, y_lo) + (x_lo * y_hi) + (x_hi * y_lo) |
| 354 | + (wide_regs ValueRegs (pulley_xwidemul64_u x_lo y_lo)) |
| 355 | + (wide_lo XReg (value_regs_get wide_regs 0)) |
| 356 | + (wide_hi XReg (value_regs_get wide_regs 1)) |
| 357 | + (tmp_hi1 XReg (pulley_xmul64 x_lo y_hi)) |
| 358 | + (tmp_hi2 XReg (pulley_xmul64 x_hi y_lo)) |
| 359 | + (tmp_add XReg (pulley_xadd64 wide_hi tmp_hi1)) |
| 360 | + (result_hi XReg (pulley_xadd64 tmp_add tmp_hi2)) |
| 361 | + ) |
| 362 | + (value_regs wide_lo result_hi))) |
| 363 | + |
339 | 364 | ;; vector multiplication: each vector type lowers directly to its matching pulley_vmuli* helper
|
340 | 365 | (rule (lower (has_type $I8X16 (imul a b))) (pulley_vmuli8x16 a b))
|
341 | 366 | (rule (lower (has_type $I16X8 (imul a b))) (pulley_vmuli16x8 a b))
|
|
1054 | 1079 | (rule 2 (lower (store flags src @ (value_type (ty_vec128 ty)) addr offset))
|
1055 | 1080 | (side_effect (pulley_vstore (amode addr offset) src ty flags))) ;; the address operand is built by `amode` from both `addr` and `offset`
|
1056 | 1081 |
|
| 1082 | +;; i128 stores: the value is split into lo/hi registers and handed to emit_store_i128 with two addresses, `addr`+0 and `addr`+8. |
| 1083 | +;; NOTE(review): `offset` is bound by the pattern below but never used in this rule; confirm that stores with a nonzero offset are handled. |
| 1084 | +(rule 3 (lower (store flags src @ (value_type $I128) addr offset)) |
| 1085 | + (let |
| 1086 | + ((src_regs ValueRegs src) |
| 1087 | + (src_lo XReg (value_regs_get src_regs 0)) |
| 1088 | + (src_hi XReg (value_regs_get src_regs 1)) |
| 1089 | + (addrp0 XReg addr) |
| 1090 | + (addrp8 Amode (Amode.RegOffset addrp0 8))) |
| 1091 | + (side_effect (emit_store_i128 flags src_lo src_hi (Amode.RegOffset addrp0 0) addrp8)))) |
| 1092 | + |
| 1093 | +;; Helper that uses the endianness of `flags` to decide which half of the i128 is written to which address, |
| 1094 | +;; as two $I64 stores: little-endian writes lo to addrp0 and hi to addrp8; big-endian writes hi to addrp0 and lo to addrp8. |
| 1095 | +(decl emit_store_i128 (MemFlags XReg XReg Amode Amode) SideEffectNoResult) |
| 1096 | +(rule 0 (emit_store_i128 flags lo hi addrp0 addrp8) |
| 1097 | + (if-let (Endianness.Little) (endianness flags)) |
| 1098 | + (let ((_ InstOutput (side_effect (pulley_xstore addrp0 lo $I64 flags)))) |
| 1099 | + (pulley_xstore addrp8 hi $I64 flags))) |
| 1100 | +(rule 1 (emit_store_i128 flags lo hi addrp0 addrp8) |
| 1101 | + (if-let (Endianness.Big) (endianness flags)) |
| 1102 | + (let ((_ InstOutput (side_effect (pulley_xstore addrp0 hi $I64 flags)))) |
| 1103 | + (pulley_xstore addrp8 lo $I64 flags))) |
| 1104 | + |
1057 | 1105 | ;; Equivalent of `gen_xload` but for stores.
|
1058 | 1106 | (decl gen_xstore (Value Value Offset32 MemFlags Type) SideEffectNoResult)
|
1059 | 1107 |
|
|
1092 | 1140 | (rule 1 (lower (has_type $I64 (uextend val)))
|
1093 | 1141 | (zext64 val))
|
1094 | 1142 |
|
| 1143 | +(rule 1 (lower (has_type $I128 (uextend val))) |
| 1144 | + (value_regs (zext64 val) (pulley_xzero))) ;; register pair: low half is `zext64 val`, high half comes from pulley_xzero (presumably a zero value; confirm) |
| 1145 | + |
| 1145 | + |
1095 | 1146 | ;;;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
1096 | 1147 |
|
1097 | 1148 | (rule 0 (lower (has_type (fits_in_32 _) (sextend val)))
|
|
0 commit comments