
Commit dd6caa3

Milestone2.1: Partition to_dim_order_copy op in XNN delegate (#12220)
### Summary

This PR adds support for the `to_dim_order_copy` operation in the XNNPACK delegate partitioner, enabling direct handling of memory format conversions that users initiate via `.to(memory_format=...)` calls. Delegating these conversions produces more compact graphs by avoiding unnecessary partition boundaries at memory format conversion points, and it eliminates the overhead of switching between the runtime and the delegate, reducing both execution time and memory footprint. The implementation leverages XNNPACK's optimized memory format conversion routines, which are designed for efficient tensor layout transformations across hardware targets.

### Test plan

Confirmed the expected output for user-specified dim order conversions, as well as correct partitioning. Added individual tests for the `to_copy` op verifying that it changes dim order and dtype where appropriate, and added a test module confirming that the `to_copy` nodes are absorbed into the XNNPACK partition rather than left in a separate one.
1 parent c2de265 commit dd6caa3
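
For context, a minimal sketch of the user-facing pattern this change targets: a model that calls `.to(memory_format=...)` before a delegated op, lowered through the standard ExecuTorch flow. The export calls (`torch.export.export`, `to_edge_transform_and_lower`, `XnnpackPartitioner`) and the shapes below are illustrative assumptions, not part of this commit:

```python
import torch

from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.exir import to_edge_transform_and_lower


class ConvWithDimOrderSwap(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 3, 3)

    def forward(self, x):
        # User-initiated memory format conversion; in the edge dialect this
        # becomes a _to_dim_order_copy node, which the partitioner can now
        # place in the same XNNPACK partition as the convolution.
        y = x.to(memory_format=torch.channels_last)
        return self.conv(y)


ep = torch.export.export(ConvWithDimOrderSwap().eval(), (torch.randn(1, 3, 8, 8),))
edge = to_edge_transform_and_lower(ep, partitioner=[XnnpackPartitioner()])
# Expectation: one delegated call containing both the copy and the conv,
# rather than a partition boundary at the memory format conversion.
print(edge.exported_program().graph_module.code)
```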

File tree

4 files changed: +164 -1 lines changed


backends/xnnpack/partition/config/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -50,6 +50,7 @@
     SquareRootConfig,
     SubConfig,
     TanhConfig,
+    ToDimOrderCopyConfig,
     UpsampleBilinear2dConfig,
 )
 from executorch.backends.xnnpack.partition.config.node_configs import (
@@ -102,6 +103,7 @@
     ReciprocalSquareRootConfig,
     ReLUConfig,
     TanhConfig,
+    ToDimOrderCopyConfig,
     SigmoidConfig,
     SliceCopyConfig,
     SoftmaxConfig,

backends/xnnpack/partition/config/generic_node_configs.py

Lines changed: 29 additions & 0 deletions
@@ -425,6 +425,35 @@ def supported_precision_types(self) -> List[ConfigPrecisionType]:
         return [ConfigPrecisionType.FP32]


+class ToDimOrderCopyConfig(GenericNodePartitionerConfig):
+    target_name = "_to_dim_order_copy.default"
+
+    def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool:
+        """
+        Only support dim order conversion partitioning, not DType conversions
+        """
+        if not self.check_common_constraints(node, ep):
+            return False
+
+        # Get input node and compare dtypes
+        input_node = get_input_node(node, 0)
+        input_dtype = input_node.meta["val"].dtype
+        output_dtype = node.meta["val"].dtype
+
+        # Return False if doing dtype conversion
+        if input_dtype != output_dtype:
+            why(
+                node,
+                reason=f"dtype conversion from {input_dtype} to {output_dtype} is not supported",
+            )
+            return False
+
+        return True
+
+    def supported_precision_types(self) -> List[ConfigPrecisionType]:
+        return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT]
+
+
 class MeanDimConfig(GenericNodePartitionerConfig):
     target_name = "mean.dim"
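
The constraint in `ToDimOrderCopyConfig.check_constraints` reduces to a single dtype comparison. Below is a toy illustration of the rule it enforces; the helper function is hypothetical and exists only to show which conversions the config accepts:

```python
import torch


def is_partitionable_to_dim_order_copy(
    input_dtype: torch.dtype, output_dtype: torch.dtype
) -> bool:
    # Mirrors the gate above: layout-only (dim order) conversions are
    # delegated to XNNPACK; conversions that also change dtype are not.
    return input_dtype == output_dtype


# x.to(memory_format=torch.channels_last): layout changes, dtype does not -> delegated
assert is_partitionable_to_dim_order_copy(torch.float32, torch.float32)
# x.to(torch.float, memory_format=torch.channels_last): dtype changes -> left to the runtime
assert not is_partitionable_to_dim_order_copy(torch.int32, torch.float32)
```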

Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+
+from executorch.backends.xnnpack.test.tester import Tester
+
+
+class TestChannelsLastTaggedReshapePass(unittest.TestCase):
+    def setUp(self):
+        torch._dynamo.reset()
+
+    def run_tester(self, module, inputs):
+        tester = Tester(
+            module.eval(),
+            inputs,
+        )
+        tester.export().to_edge_transform_and_lower().check_not(
+            ["executorch_exir_dialects_edge__ops_aten__to_copy_default"]
+        ).to_executorch().serialize().run_method_and_compare_outputs()
+
+    class ChannelLastBeforeLinear(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.linear = torch.nn.Linear(3, 3)
+
+        def forward(self, x):
+            y = x.to(memory_format=torch.channels_last)
+            return self.linear(y)
+
+    ChannelLastBeforeLinearModule = ChannelLastBeforeLinear()
+
+    def test_channel_last_before_linear(self):
+        self.run_tester(self.ChannelLastBeforeLinearModule, (torch.randn(1, 3, 3, 3),))
+
+    class ContiguousBeforeConv(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv = torch.nn.Conv2d(3, 3, 3)
+
+        def forward(self, x):
+            y = x.to(memory_format=torch.contiguous_format)
+            return self.conv(y)
+
+    ContiguousBeforeConvModule = ContiguousBeforeConv()
+
+    def test_contiguous_before_conv(self):
+        self.run_tester(self.ContiguousBeforeConvModule, (torch.randn(1, 3, 6, 6),))
+
+    class DtypeAndMemoryFormatConversion(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv = torch.nn.Conv2d(3, 3, 3)
+
+        def forward(self, x):
+            y = x.to(torch.float, memory_format=torch.channels_last)
+            return self.conv(y)
+
+    DtypeAndMemoryFormatConversionModule = DtypeAndMemoryFormatConversion()
+
+    def test_dtype_and_memory_format_conversion(self):
+        self.run_tester(
+            self.DtypeAndMemoryFormatConversionModule,
+            (torch.randint(0, 10, (1, 3, 6, 6), dtype=torch.int32),),
+        )
+
+    class DtypeAndMemoryFormatWithLinear(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.linear = torch.nn.Linear(3, 3)
+
+        def forward(self, x):
+            y = x.to(torch.float, memory_format=torch.channels_last)
+            return self.linear(y)
+
+    DtypeAndMemoryFormatWithLinearModule = DtypeAndMemoryFormatWithLinear()
+
+    def test_dtype_and_memory_format_with_linear(self):
+        self.run_tester(
+            self.DtypeAndMemoryFormatWithLinearModule,
+            (torch.randint(0, 10, (1, 3, 3, 3), dtype=torch.int16),),
+        )
+
+    class QuantizedToCopy(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv = torch.nn.Conv2d(3, 3, 3)
+            self.conv2 = torch.nn.Conv2d(3, 3, 3)
+
+        def forward(self, x):
+            y = self.conv(x)
+            y = y.to(memory_format=torch.contiguous_format)
+            return self.conv2(y)
+
+    QuantizedToCopyModule = QuantizedToCopy()
+
+    def test_quantized_to_copy(self):
+        tester = Tester(
+            self.QuantizedToCopyModule.eval(),
+            (torch.randn(1, 3, 9, 9),),
+        )
+
+        tester.quantize().export().to_edge_transform_and_lower().check_not(
+            [
+                "executorch_exir_dialects_edge__ops_aten__to_copy_default",
+                "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default",
+            ]
+        ).to_executorch().serialize().run_method_and_compare_outputs(qtol=0.01)

backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py

Lines changed: 20 additions & 1 deletion
@@ -54,7 +54,9 @@ def run_tester(self, module, inputs):
             module.eval(),
             inputs,
         )
-        tester.export().to_edge_transform_and_lower().to_executorch().serialize().run_method_and_compare_outputs()
+        tester.export().to_edge_transform_and_lower().check_not(
+            ["executorch_exir_dialects_edge__ops_aten__to_copy_default"]
+        ).to_executorch().serialize().run_method_and_compare_outputs()

     class LinearConv(torch.nn.Module):
         def __init__(self):
@@ -179,6 +181,23 @@ def test_fp32_channels_last_tagged_reshape_pass(self):
             .run_method_and_compare_outputs()
         )

+    class LinearConvDimSwap(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = torch.nn.Conv2d(3, 3, 3)
+            self.linear1 = torch.nn.Linear(4, 3)
+
+        def forward(self, x):
+            y = self.linear1(x)
+            y = y.to(memory_format=torch.channels_last)
+            y = y.to(memory_format=torch.contiguous_format)
+            return self.conv1(y)
+
+    LinearConvDimSwapModule = LinearConvDimSwap()
+
+    def test_conv_linear_dim_order_swap_partitioner(self):
+        self.run_tester(self.LinearConvDimSwapModule, (torch.randn(1, 3, 6, 4),))
+
     def test_qs8_channels_last_tagged_reshape_pass(self):
         for module, num_reshape in self.modules.items():
             (
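
As a usage note, the `check_not` assertions above confirm that no standalone copy op is left in the edge graph; a complementary check, only sketched here, is that a delegate call is actually present. This sketch assumes the Tester `check` stage and the `executorch_call_delegate` call target commonly asserted in ExecuTorch delegate tests, and it redefines the dim-swap module at top level purely for illustration:

```python
import torch

from executorch.backends.xnnpack.test.tester import Tester


class LinearConvDimSwap(torch.nn.Module):
    # Same shape as the test module above: linear -> channels_last -> contiguous -> conv.
    def __init__(self):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(3, 3, 3)
        self.linear1 = torch.nn.Linear(4, 3)

    def forward(self, x):
        y = self.linear1(x)
        y = y.to(memory_format=torch.channels_last)
        y = y.to(memory_format=torch.contiguous_format)
        return self.conv1(y)


(
    Tester(LinearConvDimSwap().eval(), (torch.randn(1, 3, 6, 4),))
    .export()
    .to_edge_transform_and_lower()
    # Delegate call present: the copy ops were absorbed into the XNNPACK partition.
    .check(["torch.ops.higher_order.executorch_call_delegate"])
    # No standalone copy op left in the edge graph.
    .check_not(["executorch_exir_dialects_edge__ops_aten__to_copy_default"])
    .to_executorch()
    .serialize()
    .run_method_and_compare_outputs()
)
```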
