[SIMD] Add i32x4.dot_i8x16_i7x16_add_s in interpreter (#7527)

kripken · web-flow · commit 90ad796de7cb · 2025-04-21T08:37:44.000-07:00
diff --git a/scripts/test/fuzzing.py b/scripts/test/fuzzing.py
@@ -19,8 +19,6 @@
 unfuzzable = [
     # Float16 is still experimental.
     'f16.wast',
-    # not all relaxed SIMD instructions are implemented in the interpreter
-    'relaxed-simd.wast',
     # TODO: fuzzer and interpreter support for strings
     'strings.wast',
     'simplify-locals-strings.wast',
diff --git a/src/literal.h b/src/literal.h
@@ -607,6 +607,7 @@ class Literal {
   Literal dotSI8x16toI16x8(const Literal& other) const;
   Literal dotUI8x16toI16x8(const Literal& other) const;
   Literal dotSI16x8toI32x4(const Literal& other) const;
+  Literal dotSI8x16toI16x8Add(const Literal& left, const Literal& right) const;
   Literal extMulLowSI32x4(const Literal& other) const;
   Literal extMulHighSI32x4(const Literal& other) const;
   Literal extMulLowUI32x4(const Literal& other) const;
diff --git a/src/wasm-interpreter.h b/src/wasm-interpreter.h
@@ -1306,10 +1306,13 @@ class ExpressionRunner : public OverriddenVisitor<SubType, Flow> {
           return NONCONSTANT_FLOW;
         }
         return a.relaxedNmaddF64x2(b, c);
-      default:
-        // TODO: implement signselect and dot_add
-        WASM_UNREACHABLE("not implemented");
+      case DotI8x16I7x16AddSToVecI32x4:
+        if (relaxedBehavior == RelaxedBehavior::NonConstant) {
+          return NONCONSTANT_FLOW;
+        }
+        return a.dotSI8x16toI16x8Add(b, c);
     }
+    WASM_UNREACHABLE("invalid op");
   }
   Flow visitSIMDShift(SIMDShift* curr) {
     NOTE_ENTER("SIMDShift");
diff --git a/src/wasm/literal.cpp b/src/wasm/literal.cpp
@@ -2604,6 +2604,21 @@ Literal Literal::dotSI16x8toI32x4(const Literal& other) const {
   return dot<4, 2, &Literal::getLanesSI16x8>(*this, other);
 }
 
+Literal Literal::dotSI8x16toI16x8Add(const Literal& left,
+                                     const Literal& right) const {
+  // Dot |this| with |left|, read the result as eight signed 16-bit lanes,
+  // fold each adjacent pair into one of four lanes, then add |right|.
+  auto products = dotSI8x16toI16x8(left).getLanesSI16x8();
+  LaneArray<4> sums;
+  // TODO: the index on sums may be wrong, see
+  //       https://github.com/WebAssembly/relaxed-simd/issues/162
+  for (size_t lane = 0; lane < 4; ++lane) {
+    const size_t first = 2 * lane;
+    sums[lane] = products[first].add(products[first + 1]);
+  }
+  return Literal(sums).addI32x4(right);
+}
+
 Literal Literal::bitselectV128(const Literal& left,
                                const Literal& right) const {
   return andV128(left).orV128(notV128().andV128(right));
diff --git a/test/lit/exec/relaxed.wast b/test/lit/exec/relaxed.wast
@@ -0,0 +1,44 @@
+;; NOTE: Assertions have been generated by update_lit_checks.py --all-items --output=fuzz-exec and should not be edited.
+
+;; RUN: wasm-opt %s -all --fuzz-exec-before -q -o /dev/null 2>&1 | filecheck %s
+
+(module
+ (import "fuzzing-support" "log-i32" (func $log (param i32)))
+
+ ;; CHECK:      [fuzz-exec] calling i32x4.dot_i8x16_i7x16_add_s
+ ;; CHECK-NEXT: [LoggingExternalInterface logging 8]
+ ;; CHECK-NEXT: [LoggingExternalInterface logging 14]
+ ;; CHECK-NEXT: [LoggingExternalInterface logging 22]
+ ;; CHECK-NEXT: [LoggingExternalInterface logging 32]
+ (func $i32x4.dot_i8x16_i7x16_add_s (export "i32x4.dot_i8x16_i7x16_add_s")
+  (local $v v128)
+  (local.set $v
+   (i32x4.dot_i8x16_i7x16_add_s
+    (v128.const i32x4 0 1 2 3)
+    (v128.const i32x4 4 5 6 7)
+    (v128.const i32x4 8 9 10 11)
+   )
+  )
+  (call $log
+   (i32x4.extract_lane 0
+    (local.get $v)
+   )
+  )
+  (call $log
+   (i32x4.extract_lane 1
+    (local.get $v)
+   )
+  )
+  (call $log
+   (i32x4.extract_lane 2
+    (local.get $v)
+   )
+  )
+  (call $log
+   (i32x4.extract_lane 3
+    (local.get $v)
+   )
+  )
+ )
+)
+
diff --git a/test/spec/dot_product.wast b/test/spec/dot_product.wast
@@ -0,0 +1,104 @@
+;; Tests for dot products.
+;;
+;; This is the same as the upstream relaxed_dot_product.wast test in
+;; relaxed-simd, but with the non-relaxed versions, and with picking the proper
+;; outcome in the multiple-choice questions (which use either() in the original
+;; test).
+
+(module
+    (func (export "i16x8.dot_i8x16_i7x16_s") (param v128 v128) (result v128) (i16x8.dot_i8x16_i7x16_s (local.get 0) (local.get 1)))
+    (func (export "i32x4.dot_i8x16_i7x16_add_s") (param v128 v128 v128) (result v128) (i32x4.dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2)))
+
+    (func (export "i16x8.dot_i8x16_i7x16_s_cmp") (param v128 v128) (result v128)
+          (i16x8.eq
+            (i16x8.dot_i8x16_i7x16_s (local.get 0) (local.get 1))
+            (i16x8.dot_i8x16_i7x16_s (local.get 0) (local.get 1))))
+    (func (export "i32x4.dot_i8x16_i7x16_add_s_cmp") (param v128 v128 v128) (result v128)
+          (i16x8.eq
+            (i32x4.dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2))
+            (i32x4.dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2))))
+)
+
+;; Simple values to ensure things are functional.
+(assert_return (invoke "i16x8.dot_i8x16_i7x16_s"
+                       (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+                       (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))
+               (v128.const i16x8 1 13 41 85 145 221 313 421))
+
+;; Test max and min i8 values;
+(assert_return (invoke "i16x8.dot_i8x16_i7x16_s"
+                       (v128.const i8x16 -128 -128 127 127 0 0 0 0 0 0 0 0 0 0 0 0)
+                       (v128.const i8x16 127 127 127 127 0 0 0 0 0 0 0 0 0 0 0 0))
+               (v128.const i16x8 -32512 32258 0 0 0 0 0 0))
+
+;; signed * unsigned   : -128 *  129 * 2 = -33,024 saturated to -32,768
+;; signed * signed     : -128 * -127 * 2 =  32,512
+;; unsigned * unsigned :  128 *  129 * 2 =  33,024
+(assert_return (invoke "i16x8.dot_i8x16_i7x16_s"
+                       (v128.const i8x16 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0 0 0)
+                       (v128.const i8x16 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0 0 0))
+               (v128.const i16x8  32512 0 0 0 0 0 0 0))
+
+;; Simple values to ensure things are functional.
+(assert_return (invoke "i32x4.dot_i8x16_i7x16_add_s"
+                       (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+                       (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+                       (v128.const i32x4 0 1 2 3))
+               ;; intermediate result is [14, 126, 366, 734]
+               (v128.const i32x4 14 127 368 737))
+
+;; Test max and min i8 values;
+(assert_return (invoke "i32x4.dot_i8x16_i7x16_add_s"
+                       (v128.const i8x16 -128 -128 -128 -128 127 127 127 127 0 0 0 0 0 0 0 0)
+                       (v128.const i8x16 127 127 127 127 127 127 127 127 0 0 0 0 0 0 0 0)
+                       (v128.const i32x4 1 2 3 4))
+               ;; intermediate result is [-65024, 64516, 0, 0]
+               (v128.const i32x4 -65023 64518 3 4))
+
+;; signed * unsigned   : -128 *  129 * 4 = -66,048 (+ 1) VPDPBUSD AVX2-VNNI or AVX512-VNNI
+;; signed * unsigned with intermediate saturation :
+;;   (-128 * 129) + (-128 * 129) = -33024 saturated to -32768 (PMADDUBSW)
+;;   -32768 + -32768 = -65536 (+ 1)
+;; signed * signed     : -128 * -127 * 4 =  65,024 (+ 1)
+;; unsigned * unsigned :  128 *  129 * 4 =  66,048 (+ 1)
+(assert_return (invoke "i32x4.dot_i8x16_i7x16_add_s"
+                       (v128.const i8x16 -128 -128 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0)
+                       (v128.const i8x16 -127 -127 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0)
+                       (v128.const i32x4 1 2 3 4))
+               (v128.const i32x4  65025 2 3 4))
+
+;; Check that multiple calls to the relaxed instruction with same inputs returns same results.
+
+;; Test max and min i8 values;
+(assert_return (invoke "i16x8.dot_i8x16_i7x16_s_cmp"
+                       (v128.const i8x16 -128 -128 127 127 0 0 0 0 0 0 0 0 0 0 0 0)
+                       (v128.const i8x16 127 127 127 127 0 0 0 0 0 0 0 0 0 0 0 0))
+               (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1))
+
+;; Test max and min i8 values;
+(assert_return (invoke "i32x4.dot_i8x16_i7x16_add_s_cmp"
+                       (v128.const i8x16 -128 -128 -128 -128 127 127 127 127 0 0 0 0 0 0 0 0)
+                       (v128.const i8x16 127 127 127 127 127 127 127 127 0 0 0 0 0 0 0 0)
+                       (v128.const i32x4 1 2 3 4))
+               ;; intermediate result is [-65024, 64516, 0, 0]
+               (v128.const i32x4 -1 -1 -1 -1))
+
+;; signed * unsigned   : -128 *  129 * 2 = -33,024 saturated to -32,768
+;; signed * signed     : -128 * -127 * 2 =  32,512
+;; unsigned * unsigned :  128 *  129 * 2 =  33,024
+(assert_return (invoke "i16x8.dot_i8x16_i7x16_s_cmp"
+                       (v128.const i8x16 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0 0 0)
+                       (v128.const i8x16 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0 0 0))
+               (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1))
+
+;; signed * unsigned   : -128 *  129 * 4 = -66,048 (+ 1) VPDPBUSD AVX2-VNNI or AVX512-VNNI
+;; signed * unsigned with intermediate saturation :
+;;   (-128 * 129) + (-128 * 129) = -33024 saturated to -32768 (PMADDUBSW)
+;;   -32768 + -32768 = -65536 (+ 1)
+;; signed * signed     : -128 * -127 * 4 =  65,024 (+ 1)
+;; unsigned * unsigned :  128 *  129 * 4 =  66,048 (+ 1)
+(assert_return (invoke "i32x4.dot_i8x16_i7x16_add_s_cmp"
+                       (v128.const i8x16 -128 -128 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0)
+                       (v128.const i8x16 -127 -127 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0)
+                       (v128.const i32x4 1 2 3 4))
+               (v128.const i32x4 -1 -1 -1 -1))

Original file line number	Diff line number	Diff line change
`@@ -1306,10 +1306,13 @@ class ExpressionRunner : public OverriddenVisitor<SubType, Flow> {`
`1306`	`1306`	`return NONCONSTANT_FLOW;`
`1307`	`1307`	`}`
`1308`	`1308`	`return a.relaxedNmaddF64x2(b, c);`
`1309`		`- default:`
`1310`		`- // TODO: implement signselect and dot_add`
`1311`		`- WASM_UNREACHABLE("not implemented");`
	`1309`	`+ case DotI8x16I7x16AddSToVecI32x4:`
	`1310`	`+ if (relaxedBehavior == RelaxedBehavior::NonConstant) {`
	`1311`	`+ return NONCONSTANT_FLOW;`
	`1312`	`+ }`
	`1313`	`+ return a.dotSI8x16toI16x8Add(b, c);`
`1312`	`1314`	`}`
	`1315`	`+ WASM_UNREACHABLE("invalid op");`
`1313`	`1316`	`}`
`1314`	`1317`	`Flow visitSIMDShift(SIMDShift* curr) {`
`1315`	`1318`	`NOTE_ENTER("SIMDShift");`