[ONNX] Simplify Onnx.MatMulInteger op lowering (llvm#4163)

vivekkhandelwal1 · web-flow · commit 26d2a098e230 · 2025-04-28T14:30:24.000+05:30
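This commit simplifies the Onnx.MatMulInteger op lowering: instead of wrapping the operands in per-tensor/per-channel quantized tensors, it casts both operands to si32, subtracts their zero points, and emits a regular matmul. This fixes the Torch->Linalg lowering path of the op. Fixes nod-ai/SHARK-ModelDev#906. Signed-off-by: Vivek Khandelwal <vivekkhandelwal1424@gmail.com>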
diff --git a/lib/Conversion/TorchOnnxToTorch/DefaultDomainGtoP.cpp b/lib/Conversion/TorchOnnxToTorch/DefaultDomainGtoP.cpp
@@ -551,16 +551,14 @@ void mlir::torch::onnx_c::populateDefaultDomainGtoP(
   patterns.onOp(
       "MatMulInteger", 10,
       [](OpBinder binder, ConversionPatternRewriter &rewriter) {
+        Location loc = binder.getLoc();
         Torch::ValueTensorType resultType;
         Value lhs, rhs, lhsZp, rhsZp;
         if (binder.tensorOperandAtIndex(lhs, 0) ||
             binder.tensorOperandAtIndex(rhs, 1) ||
             binder.tensorResultType(resultType))
           return failure();
 
-        auto lhsTy = dyn_cast<Torch::ValueTensorType>(lhs.getType());
-        auto rhsTy = dyn_cast<Torch::ValueTensorType>(rhs.getType());
-
         if (binder.tensorOperandAtIndex(lhsZp, 2)) {
           lhsZp = rewriter.create<Torch::ConstantIntOp>(
               binder.getLoc(), rewriter.getType<Torch::IntType>(),
@@ -573,92 +571,39 @@ void mlir::torch::onnx_c::populateDefaultDomainGtoP(
               rewriter.getIntegerAttr(rewriter.getIntegerType(64), 0));
         }
 
-        bool isChannelQuantizationForLhs = false;
-        if (auto zpTy = dyn_cast<Torch::ValueTensorType>(lhsZp.getType())) {
-          auto lhsZpSize = zpTy.getSizes();
-          if (lhsZpSize.size() == 0 ||
-              llvm::all_of(lhsZpSize, [](int64_t d) { return d == 1; })) {
-            lhsZp = rewriter.create<Torch::AtenItemOp>(
-                binder.getLoc(), rewriter.getType<Torch::IntType>(), lhsZp);
-          } else if (lhsZpSize.size() == 1) {
-            auto lhsSize = lhsTy.getSizes();
-            if (lhsSize.size() != 2 || lhsSize[0] != lhsZpSize[0])
-              return failure();
-            isChannelQuantizationForLhs = true;
-          } else {
-            return failure();
-          }
-        }
-
-        bool isChannelQuantizationForRhs = false;
-        if (auto zpTy = dyn_cast<Torch::ValueTensorType>(rhsZp.getType())) {
-          auto rhsZpSize = zpTy.getSizes();
-          if (rhsZpSize.size() == 0 ||
-              llvm::all_of(rhsZpSize, [](int64_t d) { return d == 1; })) {
-            rhsZp = rewriter.create<Torch::AtenItemOp>(
-                binder.getLoc(), rewriter.getType<Torch::IntType>(), rhsZp);
-          } else if (rhsZpSize.size() == 1) {
-            auto rhsSize = rhsTy.getSizes();
-            if (rhsSize.size() != 2 || rhsSize[1] != rhsZpSize[0])
-              return failure();
-            isChannelQuantizationForRhs = true;
-          } else {
-            return failure();
-          }
-        }
-
-        auto lhsQTy = getQTorchTypeFromTorchIntType(lhsTy);
-        auto rhsQTy = getQTorchTypeFromTorchIntType(rhsTy);
-
-        if (!lhsQTy || !rhsQTy)
-          return rewriter.notifyMatchFailure(binder.op, "failed to get qtype");
-
-        Value f32Ty = rewriter.create<Torch::ConstantIntOp>(
-            binder.getLoc(), rewriter.getI64IntegerAttr(
-                                 (int64_t)torch_upstream::ScalarType::Float));
-        Value none = rewriter.create<Torch::ConstantNoneOp>(binder.getLoc());
-
-        if (isChannelQuantizationForLhs) {
-          Value axis = rewriter.create<Torch::ConstantIntOp>(
-              binder.getLoc(), rewriter.getType<Torch::IntType>(),
-              rewriter.getI64IntegerAttr(0));
-          Torch::ValueTensorType lhsZpTy =
-              dyn_cast<Torch::ValueTensorType>(lhsZp.getType());
-          Type scaleTy = lhsZpTy.getWithSizesAndDtype(lhsZpTy.getSizes(),
-                                                      rewriter.getF32Type());
-          Value scale = rewriter.create<Torch::AtenOnesLikeOp>(
-              binder.getLoc(), scaleTy, /*self=*/lhsZp, f32Ty, /*layout=*/none,
-              /*device=*/none, /*pin_memory=*/none, /*memory_format=*/none);
-          lhs = rewriter.create<Torch::Aten_MakePerChannelQuantizedTensorOp>(
-              binder.getLoc(), lhsQTy, lhs, scale, lhsZp, axis);
-        } else {
-          Value scale = rewriter.create<Torch::ConstantFloatOp>(
-              binder.getLoc(), rewriter.getType<Torch::FloatType>(),
-              rewriter.getF64FloatAttr(1.0));
-          lhs = rewriter.create<Torch::Aten_MakePerTensorQuantizedTensorOp>(
-              binder.getLoc(), lhsQTy, lhs, scale, lhsZp);
-        }
+        // This op is lowered as follows:
+        // lhs = lhs.to(dtype=torch.int32)
+        // rhs = rhs.to(dtype=torch.int32)
+        // lhs = lhs - lhsZp
+        // rhs = rhs - rhsZp
+        // res = torch.matmul(lhs, rhs)
+
+        // Converting lhs and rhs tensor to `si32` type.
+        lhs = Torch::convertTensorToDtype(
+            rewriter, loc, lhs,
+            mlir::IntegerType::get(binder.op->getContext(), 32,
+                                   mlir::IntegerType::Signed));
+        rhs = Torch::convertTensorToDtype(
+            rewriter, loc, rhs,
+            mlir::IntegerType::get(binder.op->getContext(), 32,
+                                   mlir::IntegerType::Signed));
+
+        // Subtracting the zero_point values from lhs and rhs.
+        Value alpha = rewriter.create<Torch::ConstantIntOp>(
+            loc, rewriter.getI64IntegerAttr(1));
+        if (auto lhsZpTy = dyn_cast<Torch::ValueTensorType>(lhsZp.getType()))
+          lhs = rewriter.create<Torch::AtenSubTensorOp>(loc, lhs.getType(), lhs,
+                                                        lhsZp, alpha);
+        else
+          lhs = rewriter.create<Torch::AtenSubScalarOp>(loc, lhs.getType(), lhs,
+                                                        lhsZp, alpha);
 
-        if (isChannelQuantizationForRhs) {
-          Value axis = rewriter.create<Torch::ConstantIntOp>(
-              binder.getLoc(), rewriter.getType<Torch::IntType>(),
-              rewriter.getI64IntegerAttr(1));
-          Torch::ValueTensorType rhsZpTy =
-              dyn_cast<Torch::ValueTensorType>(rhsZp.getType());
-          Type scaleTy = rhsZpTy.getWithSizesAndDtype(rhsZpTy.getSizes(),
-                                                      rewriter.getF32Type());
-          Value scale = rewriter.create<Torch::AtenOnesLikeOp>(
-              binder.getLoc(), scaleTy, /*self=*/rhsZp, f32Ty, /*layout=*/none,
-              /*device=*/none, /*pin_memory=*/none, /*memory_format=*/none);
-          rhs = rewriter.create<Torch::Aten_MakePerChannelQuantizedTensorOp>(
-              binder.getLoc(), rhsQTy, rhs, scale, rhsZp, axis);
-        } else {
-          Value scale = rewriter.create<Torch::ConstantFloatOp>(
-              binder.getLoc(), rewriter.getType<Torch::FloatType>(),
-              rewriter.getF64FloatAttr(1.0));
-          rhs = rewriter.create<Torch::Aten_MakePerTensorQuantizedTensorOp>(
-              binder.getLoc(), rhsQTy, rhs, scale, rhsZp);
-        }
+        if (auto rhsZpTy = dyn_cast<Torch::ValueTensorType>(rhsZp.getType()))
+          rhs = rewriter.create<Torch::AtenSubTensorOp>(loc, rhs.getType(), rhs,
+                                                        rhsZp, alpha);
+        else
+          rhs = rewriter.create<Torch::AtenSubScalarOp>(loc, rhs.getType(), rhs,
+                                                        rhsZp, alpha);
 
         rewriter.replaceOpWithNewOp<Torch::AtenMatmulOp>(binder.op, resultType,
                                                          lhs, rhs);
diff --git a/test/Conversion/TorchOnnxToTorch/simple_ops_g_to_p.mlir b/test/Conversion/TorchOnnxToTorch/simple_ops_g_to_p.mlir
@@ -567,14 +567,12 @@ func.func @test_matmul_4d(%arg0: !torch.vtensor<[1,2,3,4],f32>, %arg1: !torch.vt
 // CHECK-LABEL: @test_matmulinteger
 func.func @test_matmulinteger(%arg0: !torch.vtensor<[4,3],ui8>, %arg1: !torch.vtensor<[3,2],ui8>, %arg2: !torch.vtensor<[1],ui8>, %arg3: !torch.vtensor<[1],ui8>) -> !torch.vtensor<[4,2],si32> attributes {torch.onnx_meta.ir_version = 5 : si64, torch.onnx_meta.opset_version = 10 : si64, torch.onnx_meta.producer_name = "backend-test", torch.onnx_meta.producer_version = ""} {
   %0 = torch.operator "onnx.MatMulInteger"(%arg0, %arg1, %arg2, %arg3) : (!torch.vtensor<[4,3],ui8>, !torch.vtensor<[3,2],ui8>, !torch.vtensor<[1],ui8>, !torch.vtensor<[1],ui8>) -> !torch.vtensor<[4,2],si32>
-  // CHECK: %[[LITEM:.+]] = torch.aten.item %arg2
-  // CHECK: %[[RITEM:.+]] = torch.aten.item %arg3
-  // CHECK: %[[L_SCALE:.+]] = torch.constant.float 1.000000e+00
-  // CHECK: %[[LMAKE:.+]] = torch.aten._make_per_tensor_quantized_tensor %arg0, %[[L_SCALE]], %[[LITEM]] : !torch.vtensor<[4,3],ui8>, !torch.float, !torch.int -> !torch.vtensor<[4,3],!torch.quint8>
-  // CHECK: %[[R_SCALE:.+]] = torch.constant.float 1.000000e+00
-  // CHECK: %[[RMAKE:.+]] = torch.aten._make_per_tensor_quantized_tensor %arg1, %[[R_SCALE]], %[[RITEM]] : !torch.vtensor<[3,2],ui8>, !torch.float, !torch.int -> !torch.vtensor<[3,2],!torch.quint8>
-  // CHECK: %[[MM:.+]] = torch.aten.matmul %[[LMAKE]], %[[RMAKE]]
-  // CHECK: return %[[MM]]
+  // CHECK: %[[LHS:.*]] = torch.aten.to.dtype %arg0, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}} : !torch.vtensor<[4,3],ui8>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4,3],si32>
+  // CHECK: %[[RHS:.*]] = torch.aten.to.dtype %arg1, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}} : !torch.vtensor<[3,2],ui8>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[3,2],si32>
+  // CHECK: %[[LHS_MINUS_ZP:.*]] = torch.aten.sub.Tensor %[[LHS]], %arg2, %{{.+}} : !torch.vtensor<[4,3],si32>, !torch.vtensor<[1],ui8>, !torch.int -> !torch.vtensor<[4,3],si32>
+  // CHECK: %[[RHS_MINUS_ZP:.*]] = torch.aten.sub.Tensor %[[RHS]], %arg3, %{{.+}} : !torch.vtensor<[3,2],si32>, !torch.vtensor<[1],ui8>, !torch.int -> !torch.vtensor<[3,2],si32>
+  // CHECK: %[[MM:.+]] = torch.aten.matmul %[[LHS_MINUS_ZP]], %[[RHS_MINUS_ZP]] : !torch.vtensor<[4,3],si32>, !torch.vtensor<[3,2],si32> -> !torch.vtensor<[4,2],si32>
+  // CHECK: return %[[MM]] : !torch.vtensor<[4,2],si32>
   return %0 : !torch.vtensor<[4,2],si32>
 }
 
@@ -583,57 +581,39 @@ func.func @test_matmulinteger(%arg0: !torch.vtensor<[4,3],ui8>, %arg1: !torch.vt
 // CHECK-LABEL: @test_matmulinteger_batched
 func.func @test_matmulinteger_batched(%arg0: !torch.vtensor<[7,4,3],ui8>, %arg1: !torch.vtensor<[3,2],ui8>, %arg2: !torch.vtensor<[1],ui8>, %arg3: !torch.vtensor<[1],ui8>) -> !torch.vtensor<[7,4,2],si32> attributes {torch.onnx_meta.ir_version = 5 : si64, torch.onnx_meta.opset_version = 10 : si64, torch.onnx_meta.producer_name = "backend-test", torch.onnx_meta.producer_version = ""} {
   %0 = torch.operator "onnx.MatMulInteger"(%arg0, %arg1, %arg2, %arg3) : (!torch.vtensor<[7,4,3],ui8>, !torch.vtensor<[3,2],ui8>, !torch.vtensor<[1],ui8>, !torch.vtensor<[1],ui8>) -> !torch.vtensor<[7,4,2],si32>
-  // CHECK: %[[LITEM:.+]] = torch.aten.item %arg2
-  // CHECK: %[[RITEM:.+]] = torch.aten.item %arg3
-  // CHECK: %[[L_SCALE:.+]] = torch.constant.float 1.000000e+00
-  // CHECK: %[[LMAKE:.+]] = torch.aten._make_per_tensor_quantized_tensor %arg0, %[[L_SCALE]], %[[LITEM]] : !torch.vtensor<[7,4,3],ui8>, !torch.float, !torch.int -> !torch.vtensor<[7,4,3],!torch.quint8>
-  // CHECK: %[[R_SCALE:.+]] = torch.constant.float 1.000000e+00
-  // CHECK: %[[RMAKE:.+]] = torch.aten._make_per_tensor_quantized_tensor %arg1, %[[R_SCALE]], %[[RITEM]] : !torch.vtensor<[3,2],ui8>, !torch.float, !torch.int -> !torch.vtensor<[3,2],!torch.quint8>
-  // CHECK: %[[MM:.+]] = torch.aten.matmul %[[LMAKE]], %[[RMAKE]]
-  // CHECK: return %[[MM]]
+  // CHECK: %[[LHS:.*]] = torch.aten.to.dtype %arg0, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}} : !torch.vtensor<[7,4,3],ui8>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[7,4,3],si32>
+  // CHECK: %[[RHS:.*]] = torch.aten.to.dtype %arg1, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}} : !torch.vtensor<[3,2],ui8>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[3,2],si32>
+  // CHECK: %[[LHS_MINUS_ZP:.*]] = torch.aten.sub.Tensor %[[LHS]], %arg2, %{{.+}} : !torch.vtensor<[7,4,3],si32>, !torch.vtensor<[1],ui8>, !torch.int -> !torch.vtensor<[7,4,3],si32>
+  // CHECK: %[[RHS_MINUS_ZP:.*]] = torch.aten.sub.Tensor %[[RHS]], %arg3, %{{.+}} : !torch.vtensor<[3,2],si32>, !torch.vtensor<[1],ui8>, !torch.int -> !torch.vtensor<[3,2],si32>
+  // CHECK: %[[MM:.+]] = torch.aten.matmul %[[LHS_MINUS_ZP]], %[[RHS_MINUS_ZP]] : !torch.vtensor<[7,4,3],si32>, !torch.vtensor<[3,2],si32> -> !torch.vtensor<[7,4,2],si32>
+  // CHECK: return %[[MM]] : !torch.vtensor<[7,4,2],si32>
   return %0 : !torch.vtensor<[7,4,2],si32>
 }
 
 // -----
 
 // CHECK-LABEL:   func.func @test_matmulinteger_non_scalar_lhsZp(
-// CHECK-SAME:                                                   %[[VAL_0:.*]]: !torch.vtensor<[16,2],ui8>,
-// CHECK-SAME:                                                   %[[VAL_1:.*]]: !torch.vtensor<[2,768],si8>,
-// CHECK-SAME:                                                   %[[VAL_2:.*]]: !torch.vtensor<[16],ui8>,
-// CHECK-SAME:                                                   %[[VAL_3:.*]]: !torch.vtensor<[],si8>) -> !torch.vtensor<[16,768],si32> attributes {torch.onnx_meta.ir_version = 7 : si64, torch.onnx_meta.opset_version = 21 : si64, torch.onnx_meta.producer_name = "pytorch", torch.onnx_meta.producer_version = "0.1.0"} {
-func.func @test_matmulinteger_non_scalar_lhsZp(%arg0: !torch.vtensor<[16, 2],ui8>, %arg1: !torch.vtensor<[2,768],si8>,  %arg2: !torch.vtensor<[16],ui8>, %arg3: !torch.vtensor<[],si8>) ->  !torch.vtensor<[16,768],si32> attributes {torch.onnx_meta.ir_version = 7 : si64, torch.onnx_meta.opset_version = 21 : si64, torch.onnx_meta.producer_name = "pytorch", torch.onnx_meta.producer_version = "0.1.0"} {
-  // CHECK:           %[[VAL_4:.*]] = torch.aten.item %[[VAL_3]] : !torch.vtensor<[],si8> -> !torch.int
-  // CHECK:           %[[VAL_5:.*]] = torch.constant.int 6
-  // CHECK:           %[[VAL_6:.*]] = torch.constant.none
-  // CHECK:           %[[VAL_7:.*]] = torch.constant.int 0
-  // CHECK:           %[[VAL_8:.*]] = torch.aten.ones_like %[[VAL_2]], %[[VAL_5]], %[[VAL_6]], %[[VAL_6]], %[[VAL_6]], %[[VAL_6]] : !torch.vtensor<[16],ui8>, !torch.int, !torch.none, !torch.none, !torch.none, !torch.none -> !torch.vtensor<[16],f32>
-  // CHECK:           %[[VAL_9:.*]] = torch.aten._make_per_channel_quantized_tensor %[[VAL_0]], %[[VAL_8]], %[[VAL_2]], %[[VAL_7]] : !torch.vtensor<[16,2],ui8>, !torch.vtensor<[16],f32>, !torch.vtensor<[16],ui8>, !torch.int -> !torch.vtensor<[16,2],!torch.quint8>
-  // CHECK:           %[[VAL_10:.*]] = torch.constant.float 1.000000e+00
-  // CHECK:           %[[VAL_11:.*]] = torch.aten._make_per_tensor_quantized_tensor %[[VAL_1]], %[[VAL_10]], %[[VAL_4]] : !torch.vtensor<[2,768],si8>, !torch.float, !torch.int -> !torch.vtensor<[2,768],!torch.qint8>
-  // CHECK:           %[[VAL_12:.*]] = torch.aten.matmul %[[VAL_9]], %[[VAL_11]] : !torch.vtensor<[16,2],!torch.quint8>, !torch.vtensor<[2,768],!torch.qint8> -> !torch.vtensor<[16,768],si32>
-  // CHECK:           return %[[VAL_12]] : !torch.vtensor<[16,768],si32>
-  %0 = torch.operator "onnx.MatMulInteger"(%arg0, %arg1, %arg2, %arg3) : (!torch.vtensor<[16,2],ui8>, !torch.vtensor<[2,768],si8>, !torch.vtensor<[16],ui8>, !torch.vtensor<[],si8>) -> !torch.vtensor<[16,768],si32>
+func.func @test_matmulinteger_non_scalar_lhsZp(%arg0: !torch.vtensor<[16, 2],ui8>, %arg1: !torch.vtensor<[2,768],si8>,  %arg2: !torch.vtensor<[16,1],ui8>, %arg3: !torch.vtensor<[],si8>) ->  !torch.vtensor<[16,768],si32> attributes {torch.onnx_meta.ir_version = 7 : si64, torch.onnx_meta.opset_version = 21 : si64, torch.onnx_meta.producer_name = "pytorch", torch.onnx_meta.producer_version = "0.1.0"} {
+  // CHECK: %[[LHS:.*]] = torch.aten.to.dtype %arg0, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}} : !torch.vtensor<[16,2],ui8>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[16,2],si32>
+  // CHECK: %[[RHS:.*]] = torch.aten.to.dtype %arg1, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}} : !torch.vtensor<[2,768],si8>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,768],si32>
+  // CHECK: %[[LHS_MINUS_ZP:.*]] = torch.aten.sub.Tensor %[[LHS]], %arg2, %{{.+}} : !torch.vtensor<[16,2],si32>, !torch.vtensor<[16,1],ui8>, !torch.int -> !torch.vtensor<[16,2],si32>
+  // CHECK: %[[RHS_MINUS_ZP:.*]] = torch.aten.sub.Tensor %[[RHS]], %arg3, %{{.+}} : !torch.vtensor<[2,768],si32>, !torch.vtensor<[],si8>, !torch.int -> !torch.vtensor<[2,768],si32>
+  // CHECK: %[[MM:.+]] = torch.aten.matmul %[[LHS_MINUS_ZP]], %[[RHS_MINUS_ZP]] : !torch.vtensor<[16,2],si32>, !torch.vtensor<[2,768],si32> -> !torch.vtensor<[16,768],si32>
+  // CHECK: return %[[MM]] : !torch.vtensor<[16,768],si32>
+  %0 = torch.operator "onnx.MatMulInteger"(%arg0, %arg1, %arg2, %arg3) : (!torch.vtensor<[16,2],ui8>, !torch.vtensor<[2,768],si8>, !torch.vtensor<[16,1],ui8>, !torch.vtensor<[],si8>) -> !torch.vtensor<[16,768],si32>
   return %0 : !torch.vtensor<[16,768],si32>
 }
 
 // -----
 
 // CHECK-LABEL:   func.func @test_matmulinteger_non_scalar_rhsZp(
-// CHECK-SAME:                                                   %[[VAL_0:.*]]: !torch.vtensor<[?,?],ui8>,
-// CHECK-SAME:                                                   %[[VAL_1:.*]]: !torch.vtensor<[2,768],si8>,
-// CHECK-SAME:                                                   %[[VAL_2:.*]]: !torch.vtensor<[],ui8>,
-// CHECK-SAME:                                                   %[[VAL_3:.*]]: !torch.vtensor<[768],si8>) -> !torch.vtensor<[?,768],si32> attributes {torch.onnx_meta.ir_version = 7 : si64, torch.onnx_meta.opset_version = 21 : si64, torch.onnx_met
 func.func @test_matmulinteger_non_scalar_rhsZp(%arg0: !torch.vtensor<[?,?],ui8>, %arg1: !torch.vtensor<[2,768],si8>,  %arg2: !torch.vtensor<[],ui8>, %arg3: !torch.vtensor<[768],si8>) ->  !torch.vtensor<[?,768],si32> attributes {torch.onnx_meta.ir_version = 7 : si64, torch.onnx_meta.opset_version = 21 : si64, torch.onnx_meta.producer_name = "pytorch", torch.onnx_meta.producer_version = "0.1.0"} {
-  // CHECK:           %[[VAL_4:.*]] = torch.aten.item %[[VAL_2]] : !torch.vtensor<[],ui8> -> !torch.int
-  // CHECK:           %[[VAL_5:.*]] = torch.constant.int 6
-  // CHECK:           %[[VAL_6:.*]] = torch.constant.none
-  // CHECK:           %[[VAL_7:.*]] = torch.constant.float 1.000000e+00
-  // CHECK:           %[[VAL_8:.*]] = torch.aten._make_per_tensor_quantized_tensor %[[VAL_0]], %[[VAL_7]], %[[VAL_4]] : !torch.vtensor<[?,?],ui8>, !torch.float, !torch.int -> !torch.vtensor<[?,?],!torch.quint8>
-  // CHECK:           %[[VAL_9:.*]] = torch.constant.int 1
-  // CHECK:           %[[VAL_10:.*]] = torch.aten.ones_like %[[VAL_3]], %[[VAL_5]], %[[VAL_6]], %[[VAL_6]], %[[VAL_6]], %[[VAL_6]] : !torch.vtensor<[768],si8>, !torch.int, !torch.none, !torch.none, !torch.none, !torch.none -> !torch.vtensor<[768],f32>
-  // CHECK:           %[[VAL_11:.*]] = torch.aten._make_per_channel_quantized_tensor %[[VAL_1]], %[[VAL_10]], %[[VAL_3]], %[[VAL_9]] : !torch.vtensor<[2,768],si8>, !torch.vtensor<[768],f32>, !torch.vtensor<[768],si8>, !torch.int -> !torch.vtensor<[2,768],!torch.qint8>
-  // CHECK:           %[[VAL_12:.*]] = torch.aten.matmul %[[VAL_8]], %[[VAL_11]] : !torch.vtensor<[?,?],!torch.quint8>, !torch.vtensor<[2,768],!torch.qint8> -> !torch.vtensor<[?,768],si32>
-  // CHECK:           return %[[VAL_12]] : !torch.vtensor<[?,768],si32>
+  // CHECK: %[[LHS:.*]] = torch.aten.to.dtype %arg0, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}} : !torch.vtensor<[?,?],ui8>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[?,?],si32>
+  // CHECK: %[[RHS:.*]] = torch.aten.to.dtype %arg1, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}} : !torch.vtensor<[2,768],si8>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[2,768],si32>
+  // CHECK: %[[LHS_MINUS_ZP:.*]] = torch.aten.sub.Tensor %[[LHS]], %arg2, %{{.+}} : !torch.vtensor<[?,?],si32>, !torch.vtensor<[],ui8>, !torch.int -> !torch.vtensor<[?,?],si32>
+  // CHECK: %[[RHS_MINUS_ZP:.*]] = torch.aten.sub.Tensor %[[RHS]], %arg3, %{{.+}} : !torch.vtensor<[2,768],si32>, !torch.vtensor<[768],si8>, !torch.int -> !torch.vtensor<[2,768],si32>
+  // CHECK: %[[MM:.+]] = torch.aten.matmul %[[LHS_MINUS_ZP]], %[[RHS_MINUS_ZP]] : !torch.vtensor<[?,?],si32>, !torch.vtensor<[2,768],si32> -> !torch.vtensor<[?,768],si32>
+  // CHECK: return %[[MM]] : !torch.vtensor<[?,768],si32>
   %0 = torch.operator "onnx.MatMulInteger"(%arg0, %arg1, %arg2, %arg3) : (!torch.vtensor<[?,?],ui8>, !torch.vtensor<[2,768],si8>, !torch.vtensor<[],ui8>, !torch.vtensor<[768],si8>) -> !torch.vtensor<[?,768],si32>
   return %0 : !torch.vtensor<[?,768],si32>
 }