@@ -32,7 +32,10 @@
     is_quantized_tensor,
     quantize_tensor_multiplier,
 )
-from executorch.backends.cadence.aot.fuse_ops import FuseCascadedViewOps
+from executorch.backends.cadence.aot.fuse_ops import (
+    FuseCascadedTransposeOrPermuteOps,
+    FuseCascadedViewOps,
+)
 from executorch.backends.cadence.aot.pass_utils import (
     CadencePassAttribute,
     register_cadence_pass,
@@ -2290,6 +2293,101 @@ def call_operator(
         )


+@register_cadence_pass(CadencePassAttribute(opt_level=0))
+class ReplaceMatmulWithTransposedMatmulPass(ExportPass):
+    """
+    For certain backends, we have efficient kernels for transposed matmul. We
+    replace AxB with AxB' for such backends.
+    """
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op != exir_ops.edge.cadence.quantized_matmul.default or args[-1] is True:
+            return super().call_operator(op, args, kwargs, meta)
+
+        # Get the args
+        if len(args) == 9:
+            (
+                X_arg,
+                X_zero_point,
+                Y_arg,
+                Y_zero_point,
+                bias,
+                out_multiplier,
+                out_shift,
+                out_zero_point,
+                transposed,
+            ) = args
+        elif len(args) == 8:
+            (
+                X_arg,
+                X_zero_point,
+                Y_arg,
+                Y_zero_point,
+                bias,
+                out_multiplier,
+                out_shift,
+                out_zero_point,
+            ) = args
+            transposed = False
+        else:
+            raise AssertionError(
+                f"Unexpected number of args for quantized_matmul: {len(args)}"
+            )
+
+        # If the matmul is already transposed, bail
+        if transposed:
+            return super().call_operator(op, args, kwargs, meta)
+
+        # Get the second tensor
+        Y_tensor = Y_arg.to_tensor() if isinstance(Y_arg, ProxyValue) else Y_arg
+        # Concretize the bias
+        zero_bias = super().call_operator(
+            exir_ops.edge.aten.full.default,
+            ([Y_tensor.size(-1)], 0),
+            {"dtype": torch.int32},
+            meta,
+        )
+
+        # If the arg was a ProxyValue, insert a transpose node. Otherwise we
+        # can simply transpose the tensor directly.
+        if isinstance(Y_arg, ProxyValue):
+            transpose_args = (Y_arg, -1, -2)
+            transpose_node = super().call_operator(
+                exir_ops.edge.aten.transpose_copy.int,
+                transpose_args,
+                {},
+                meta,
+            )
+            Y_arg_t = transpose_node
+        else:
+            Y_arg_t = Y_tensor.transpose(-1, -2)
+
+        # Construct the new args, and return the transposed matmul op
+        new_args = (
+            X_arg,
+            X_zero_point,
+            Y_arg_t,
+            Y_zero_point,
+            zero_bias,
+            out_multiplier,
+            out_shift,
+            out_zero_point,
+            True,
+        )
+        return super().call_operator(op, new_args, kwargs, meta)
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        result = super().call(graph_module)
+        # Fuse any inserted transpose node with transpose/permute nodes
+        # surrounding it.
+        result = FuseCascadedTransposeOrPermuteOps()(result.graph_module)
+        assert result is not None
+        # Replace permute with transpose.
+        result = ReplacePermuteWithTransposePass()(result.graph_module)
+        assert result is not None
+        return result
+
+
 # This class encapsulates all the functions that replace/switch one op in the
 # graph with another.
 class CadenceReplaceOpsInGraph:
@@ -2317,6 +2415,7 @@ class CadenceReplaceOpsInGraph:
         # This pass should be after passes that replace conv -> im2row + linear.
         ReplaceIm2RowWithViewPass,
         MakeSliceAndCatDimOutermostPass,
+        ReplaceMatmulWithTransposedMatmulPass,
         ReplaceNopTransposeOrPermuteWithViewPass,
         ReplaceLinearWithFullyConnectedOpPass,
         ReplaceScalarTensorWithFullPass,
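
For intuition about the rewrite itself: the pass turns quantized_matmul(A, B) into quantized_matmul(A, B', transposed=True), which is the same computation because a transposed matmul undoes the transpose of its second operand. A minimal sketch of that identity in plain PyTorch (the name transposed_matmul_ref is hypothetical and stands in for the Cadence kernel, which additionally applies zero points, out_multiplier, and out_shift):

import torch

def transposed_matmul_ref(A: torch.Tensor, B_t: torch.Tensor) -> torch.Tensor:
    # A "transposed matmul" takes the second operand pre-transposed and
    # undoes the transpose internally: A @ (B_t)' == A @ B.
    return A @ B_t.transpose(-1, -2)

A = torch.randn(4, 8)
B = torch.randn(8, 16)
# The rewrite AxB -> transposed_matmul(A, B') preserves the result,
# since (B')' == B.
assert torch.allclose(A @ B, transposed_matmul_ref(A, B.transpose(-1, -2)))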
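
The cleanup in call() matters because the freshly inserted transpose_copy node often lands next to an existing transpose or permute in the graph; two transposes over the same dimension pair cancel, so fusing cascaded ones can eliminate the inserted node entirely. A small sketch of the cancellation the fusion pass relies on:

import torch

Y = torch.randn(3, 5, 7)
# Back-to-back transposes of the same dims are the identity, which is
# what fusing cascaded transpose/permute ops exploits to remove the pair.
assert torch.equal(Y.transpose(-1, -2).transpose(-1, -2), Y)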