Commit 80a3dfd
[ONNX] Add support for Onnx.QLinearMul op (#4159)
This commit adds the lowering for the [Onnx.QLinearMul](https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftqlinearmul) op.

Signed-off-by: Vivek Khandelwal <vivekkhandelwal1424@gmail.com>
1 parent f659a0b commit 80a3dfd
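
For context, Onnx.QLinearMul multiplies two quantized tensors, and this lowering realizes it with the standard dequantize-multiply-requantize decomposition. Below is a minimal reference sketch of the per-tensor semantics, assuming ui8 storage and f32 intermediates; the helper name qlinearMulRef and the exact rounding/saturation behavior are illustrative assumptions, not taken from the commit.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Reference semantics: C = quantize(dequantize(A) * dequantize(B), cScale, cZp).
std::vector<uint8_t> qlinearMulRef(const std::vector<uint8_t> &a, float aScale,
                                   uint8_t aZp, const std::vector<uint8_t> &b,
                                   float bScale, uint8_t bZp, float cScale,
                                   uint8_t cZp) {
  std::vector<uint8_t> c(a.size());
  for (size_t i = 0; i < a.size(); ++i) {
    // Dequantize each element: (q - zero_point) * scale.
    float af = (static_cast<int>(a[i]) - aZp) * aScale;
    float bf = (static_cast<int>(b[i]) - bZp) * bScale;
    // Multiply in f32, then requantize and saturate to the ui8 range.
    // Rounding mode here is illustrative; ONNX Runtime may differ.
    int q = static_cast<int>(std::lround(af * bf / cScale)) + cZp;
    c[i] = static_cast<uint8_t>(std::clamp(q, 0, 255));
  }
  return c;
}

In the diffs below, these same three steps appear as torch.aten.dequantize.self on each input, torch.aten.mul.Tensor in f32, and torch.aten.quantize_per_tensor followed by torch.aten.int_repr on the result.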

File tree

4 files changed: +125 -0 lines changed


include/torch-mlir/Conversion/TorchOnnxToTorch/Utils.h

Lines changed: 5 additions & 0 deletions
@@ -125,6 +125,11 @@ LogicalResult extractPerTensorQuantizationArguments(
     ConversionPatternRewriter &rewriter, Location loc, Value inScale,
     Value inZeroPoint, Value &outScale, Value &outZeroPoint);
 
+/// This utility takes as input a quantized tensor and dequantizes it.
+LogicalResult createDequantizeTensor(ConversionPatternRewriter &rewriter,
+                                     Location loc, Value input, Value scale,
+                                     Value zeroPoint, Value &output);
+
 } // namespace mlir::torch::onnx_c
 
 #endif // TORCHMLIR_CONVERSION_TORCHONNXTOTORCH_UTILS_H

lib/Conversion/TorchOnnxToTorch/ComMicrosoftDomain.cpp

Lines changed: 69 additions & 0 deletions
@@ -1077,4 +1077,73 @@ void mlir::torch::onnx_c::populateComMicrosoftDomain(
           binder.op, resultType, transposedLhs, transposedRhs);
       return success();
     });
+  patterns.onOp(
+      "QLinearMul", 1,
+      [](OpBinder binder, ConversionPatternRewriter &rewriter) {
+        Location loc = binder.getLoc();
+        Torch::ValueTensorType resultType;
+        llvm::SmallVector<Value> operands;
+        if (binder.tensorOperandsList(operands) ||
+            binder.tensorResultType(resultType))
+          return failure();
+
+        if (operands.size() != 8)
+          return rewriter.notifyMatchFailure(
+              binder.op, "Unimplemented: expected 8 input operands");
+
+        Value a, b, aScale, aZp, bScale, bZp, cScale, cZp;
+
+        if (failed(extractPerTensorQuantizationArguments(
+                rewriter, loc, /*scale=*/operands[1],
+                /*zero_point=*/operands[2], aScale, aZp)))
+          return rewriter.notifyMatchFailure(
+              binder.op, "Incompatible arguments for per-tensor quantization");
+
+        if (failed(extractPerTensorQuantizationArguments(
+                rewriter, loc, /*scale=*/operands[4],
+                /*zero_point=*/operands[5], bScale, bZp)))
+          return rewriter.notifyMatchFailure(
+              binder.op, "Incompatible arguments for per-tensor quantization");
+
+        if (failed(extractPerTensorQuantizationArguments(
+                rewriter, loc, /*scale=*/operands[6],
+                /*zero_point=*/operands[7], cScale, cZp)))
+          return rewriter.notifyMatchFailure(
+              binder.op, "Incompatible arguments for per-tensor quantization");
+
+        if (failed(createDequantizeTensor(rewriter, loc, /*input=*/operands[0],
+                                          /*scale=*/aScale, /*zero_point=*/aZp,
+                                          /*output=*/a)))
+          return rewriter.notifyMatchFailure(
+              binder.op, "Failed to dequantize the input tensor `a` because of "
+                         "missing sizes");
+
+        if (failed(createDequantizeTensor(rewriter, loc, /*input=*/operands[3],
+                                          /*scale=*/bScale, /*zero_point=*/bZp,
+                                          /*output=*/b)))
+          return rewriter.notifyMatchFailure(
+              binder.op, "Failed to dequantize the input tensor `b` because of "
+                         "missing sizes");
+
+        // Computing the Mul result.
+        auto cTy = rewriter.getType<Torch::ValueTensorType>(
+            resultType.getOptionalSizes(), rewriter.getF32Type());
+        Value c =
+            rewriter.create<Torch::AtenMulTensorOp>(binder.getLoc(), cTy, a, b);
+
+        // Quantizing the result of Mul operation.
+        cTy = dyn_cast<Torch::ValueTensorType>(
+            getQTorchTypeFromTorchIntType(resultType));
+        Value dtyVal = rewriter.create<Torch::ConstantIntOp>(
+            binder.getLoc(), rewriter.getType<Torch::IntType>(),
+            rewriter.getIntegerAttr(
+                rewriter.getIntegerType(64),
+                static_cast<int64_t>(
+                    Torch::getScalarTypeForType(cTy.getDtype()))));
+        c = rewriter.create<Torch::AtenQuantizePerTensorOp>(
+            binder.getLoc(), cTy, c, cScale, cZp, dtyVal);
+        rewriter.replaceOpWithNewOp<Torch::AtenIntReprOp>(binder.op, resultType,
+                                                          c);
+        return success();
+      });
 }

lib/Conversion/TorchOnnxToTorch/Utils.cpp

Lines changed: 20 additions & 0 deletions
@@ -182,3 +182,23 @@ LogicalResult mlir::torch::onnx_c::extractPerTensorQuantizationArguments(
 
   return success();
 }
+
+LogicalResult mlir::torch::onnx_c::createDequantizeTensor(
+    ConversionPatternRewriter &rewriter, Location loc, Value input, Value scale,
+    Value zeroPoint, Value &output) {
+  auto inputTy = dyn_cast<Torch::ValueTensorType>(input.getType());
+  if (!inputTy || !inputTy.hasSizes())
+    return failure();
+
+  Torch::ValueTensorType makeTensorTy = getQTorchTypeFromTorchIntType(inputTy);
+  Value quantizedInput =
+      rewriter.create<Torch::Aten_MakePerTensorQuantizedTensorOp>(
+          loc, makeTensorTy, input, scale, zeroPoint);
+
+  Torch::ValueTensorType resultTy = rewriter.getType<Torch::ValueTensorType>(
+      inputTy.getSizes(), rewriter.getF32Type());
+  output = rewriter.create<Torch::AtenDequantizeSelfOp>(loc, resultTy,
+                                                        quantizedInput);
+
+  return success();
+}

test/Conversion/TorchOnnxToTorch/simple_ops_q_to_z.mlir

Lines changed: 31 additions & 0 deletions
@@ -3849,3 +3849,34 @@ func.func @test_qlinearAveragePool(%arg0: !torch.vtensor<[1,128,56,56],ui8>, %ar
   // CHECK: return %[[OUT]]
   return %0 : !torch.vtensor<[1,128,28,28],ui8>
 }
+
+// -----
+
+// CHECK-LABEL: @test_qlinearmul(
+// CHECK-SAME: %[[A:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !torch.vtensor<[1,4096],ui8>,
+// CHECK-SAME: %[[A_SCALE:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !torch.vtensor<[],f32>,
+// CHECK-SAME: %[[A_ZERO_POINT:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !torch.vtensor<[],ui8>,
+// CHECK-SAME: %[[B:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !torch.vtensor<[1,4096],ui8>,
+// CHECK-SAME: %[[B_SCALE:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !torch.vtensor<[],f32>,
+// CHECK-SAME: %[[B_ZERO_POINT:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !torch.vtensor<[],ui8>,
+// CHECK-SAME: %[[C_SCALE:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !torch.vtensor<[],f32>,
+// CHECK-SAME: %[[C_ZERO_POINT:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !torch.vtensor<[],ui8>) -> !torch.vtensor<[1,4096],ui8>
+func.func @test_qlinearmul(%arg0: !torch.vtensor<[1,4096],ui8>, %arg1: !torch.vtensor<[],f32>, %arg2: !torch.vtensor<[],ui8>, %arg3: !torch.vtensor<[1,4096],ui8>, %arg4: !torch.vtensor<[],f32>, %arg5: !torch.vtensor<[],ui8>, %arg6: !torch.vtensor<[],f32>, %arg7: !torch.vtensor<[],ui8>) -> !torch.vtensor<[1,4096],ui8> attributes {torch.onnx_meta.opset_version = 10 : si64} {
+  %0 = torch.operator "onnx.QLinearMul"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7) : (!torch.vtensor<[1,4096],ui8>, !torch.vtensor<[],f32>, !torch.vtensor<[],ui8>, !torch.vtensor<[1,4096],ui8>, !torch.vtensor<[],f32>, !torch.vtensor<[],ui8>, !torch.vtensor<[],f32>, !torch.vtensor<[],ui8>) -> !torch.vtensor<[1,4096],ui8>
+  // CHECK-DAG: %[[AZP:.+]] = torch.aten.item %[[A_ZERO_POINT]] : !torch.vtensor<[],ui8> -> !torch.int
+  // CHECK-DAG: %[[ASCALE:.+]] = torch.aten.item %[[A_SCALE]] : !torch.vtensor<[],f32> -> !torch.float
+  // CHECK-DAG: %[[BZP:.+]] = torch.aten.item %[[B_ZERO_POINT]] : !torch.vtensor<[],ui8> -> !torch.int
+  // CHECK-DAG: %[[BSCALE:.+]] = torch.aten.item %[[B_SCALE]] : !torch.vtensor<[],f32> -> !torch.float
+  // CHECK-DAG: %[[CZP:.+]] = torch.aten.item %[[C_ZERO_POINT]] : !torch.vtensor<[],ui8> -> !torch.int
+  // CHECK-DAG: %[[CSCALE:.+]] = torch.aten.item %[[C_SCALE]] : !torch.vtensor<[],f32> -> !torch.float
+  // CHECK-DAG: %[[A_QUANT:.+]] = torch.aten._make_per_tensor_quantized_tensor %[[A]], %[[ASCALE]], %[[AZP]] : !torch.vtensor<[1,4096],ui8>, !torch.float, !torch.int -> !torch.vtensor<[1,4096],!torch.quint8>
+  // CHECK: %[[A_F32:.+]] = torch.aten.dequantize.self %[[A_QUANT]] : !torch.vtensor<[1,4096],!torch.quint8> -> !torch.vtensor<[1,4096],f32>
+  // CHECK-DAG: %[[B_QUANT:.+]] = torch.aten._make_per_tensor_quantized_tensor %[[B]], %[[BSCALE]], %[[BZP]] : !torch.vtensor<[1,4096],ui8>, !torch.float, !torch.int -> !torch.vtensor<[1,4096],!torch.quint8>
+  // CHECK: %[[B_F32:.+]] = torch.aten.dequantize.self %[[B_QUANT]] : !torch.vtensor<[1,4096],!torch.quint8> -> !torch.vtensor<[1,4096],f32>
+  // CHECK: %[[ADD:.+]] = torch.aten.mul.Tensor %[[A_F32]], %[[B_F32]] : !torch.vtensor<[1,4096],f32>, !torch.vtensor<[1,4096],f32> -> !torch.vtensor<[1,4096],f32>
+  // CHECK: %[[DTY:.+]] = torch.constant.int 13
+  // CHECK: %[[QO:.+]] = torch.aten.quantize_per_tensor %[[ADD]], %[[CSCALE]], %[[CZP]], %[[DTY]] : !torch.vtensor<[1,4096],f32>, !torch.float, !torch.int, !torch.int -> !torch.vtensor<[1,4096],!torch.quint8>
+  // CHECK: %[[OUT:.+]] = torch.aten.int_repr %[[QO]] : !torch.vtensor<[1,4096],!torch.quint8> -> !torch.vtensor<[1,4096],ui8>
+  // CHECK: return %[[OUT]]
+  return %0 : !torch.vtensor<[1,4096],ui8>
+}
