Commit 38525d6

Add decompositions to import flow (#27)
Adds default decompositions from SHARK to the Turbine import flow, with tests for importing `aten.chunk` and `nn.BatchNorm2d`, both of which previously failed.
1 parent a2a58bd commit 38525d6
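
The change centers on one pattern, shown in full in python/shark_turbine/dynamo/passes.py below: before import, the FX graph is retraced through a table of ATen decompositions so that ops the importer cannot yet handle (e.g. aten.split.Tensor, which torch.chunk lowers to) are rewritten into primitives it can. A minimal standalone sketch of that pattern, using only the stock PyTorch APIs the commit itself uses (the helper name `decompose` is illustrative):

import torch
from torch._decomp import get_decompositions
from torch.fx.experimental.proxy_tensor import make_fx
from torch.func import functionalize


def decompose(gm: torch.fx.GraphModule, example_inputs, ops):
    # Retrace gm, rewriting every op in `ops` via its registered decomposition.
    table = get_decompositions(ops)
    return make_fx(functionalize(gm), decomposition_table=table)(*example_inputs)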

File tree

9 files changed: +210 -73 lines changed

python/shark_turbine/dynamo/backends/cpu.py
Lines changed: 4 additions & 0 deletions

@@ -37,6 +37,7 @@

 import torch
 from torch._dynamo.backends.common import aot_autograd
+from ..passes import turbine_cpu_pass_pipeline

 DEFAULT_COMPILER_FLAGS = (
     # Enable asynchronous calling convention.
@@ -65,6 +66,9 @@ def _base_backend(gm: torch.fx.GraphModule, example_inputs):
     inv.enable_console_diagnostics()
     inv.import_module(module.operation)

+    # Apply decompositions.
+    gm = turbine_cpu_pass_pipeline(gm, example_inputs)
+
     # Import phase.
     importer.import_graph_module(gm)
     print(module, file=sys.stderr)
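
Note the ordering the hunk establishes: the graph is decomposed first, then handed to the importer. A minimal import-only backend following the same shape — a sketch, assuming the public pieces the diff itself touches; the helper name and the eager fallback are illustrative, not part of the module:

import torch
from torch._dynamo.backends.common import aot_autograd
from shark_turbine.dynamo.importer import FxImporter
from shark_turbine.dynamo.passes import turbine_cpu_pass_pipeline


def _import_only_backend(gm: torch.fx.GraphModule, example_inputs):
    # Decompose first, then import, mirroring _base_backend above.
    gm = turbine_cpu_pass_pipeline(gm, example_inputs)
    importer = FxImporter()
    importer.import_graph_module(gm)
    importer.module.operation.verify()
    return gm  # run the decomposed graph eagerly


# Usage: torch.compile(fn, backend=aot_autograd(fw_compiler=_import_only_backend))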

python/shark_turbine/dynamo/importer.py
Lines changed: 16 additions & 29 deletions

@@ -136,10 +136,10 @@ class FxImporter:
     ]

     def __init__(
-            self,
-            module: Optional[Module] = None,
-            context: Optional[Context] = None,
-            config_check: bool = True,
+        self,
+        module: Optional[Module] = None,
+        context: Optional[Context] = None,
+        config_check: bool = True,
     ):
         if module is not None:
             assert context is None, "If configuring with a Module, context must be None"
@@ -214,7 +214,9 @@ def _graph_to_function_meta(self, g: Graph) -> Tuple[FunctionType, Location]:
                 # always be "boxed" as a tuple, which we emit as multi-results.
                 for result_node in node.args[0]:
                     if result_node is None:
-                        result_types.append(MlirType.parse("!torch.none", context=self._c))
+                        result_types.append(
+                            MlirType.parse("!torch.none", context=self._c)
+                        )
                     else:
                         result_types.append(self._cc.node_val_to_type(result_node))
         return (
@@ -390,7 +392,7 @@ def import_nodes(self, nodes: Sequence[torch_fx.Node]):
                 func_dialect.ReturnOp(operands, loc=loc)

     def _import_torch_op_overload(
-            self, loc: Location, node: torch_fx.Node, target: TorchOpOverload
+        self, loc: Location, node: torch_fx.Node, target: TorchOpOverload
     ):
         schema = target._schema
         assert isinstance(schema, FunctionSchema)
@@ -404,7 +406,7 @@ def _import_torch_op_overload(

         # Intervening to use Scalar ops due to incorrect ops from AOT-autograd with scalar arguments.
         if mlir_op_name in TENSOR_SCALAR_OP_CONVERTER and (
-                isinstance(node.args[1], float) or isinstance(node.args[1], int)
+            isinstance(node.args[1], float) or isinstance(node.args[1], int)
         ):
             mlir_op_name = TENSOR_SCALAR_OP_CONVERTER[mlir_op_name]

@@ -487,9 +489,7 @@ def _import_list_argument(self, loc: Location, arg):
         result_type = SCALAR_TYPE_TO_TORCH_LIST_TYPE.get(arg_type, None)

         if result_type is not None:
-            result_type = MlirType.parse(
-                result_type, context=self._c
-            )
+            result_type = MlirType.parse(result_type, context=self._c)

         for operand in arg:
             operand_type = type(operand)
@@ -498,21 +498,6 @@ def _import_list_argument(self, loc: Location, arg):
                     f"Lists with multiple types are not supported, got: {arg_type}, {operand_type}"
                 )

-            if isinstance(operand, torch.fx.Node):
-                if operand in self._multi_result_nodes:
-                    raise RuntimeError(f"Attempt to de-reference a multi-result node")
-                val = self._v[(operand, 0)]
-                if result_type is None:
-                    list_type: str = str(val.type)
-                    begin_index = 7 if list_type.startswith("!torch.") else None
-                    end_index = list_type.find("<")
-                    end_index = end_index if end_index != -1 else None
-                    list_type = list_type[begin_index:end_index]
-                    result_type = MlirType.parse(f"!torch.list<{list_type}>")
-            else:
-                val = self._import_default_value(
-                    loc, operand, SCALAR_TYPE_TO_TORCH_TYPE[type(operand)]
-                )
             if isinstance(operand, torch.fx.Node):
                 if operand in self._multi_result_nodes:
                     raise RuntimeError(f"Attempt to de-reference a multi-result node")
@@ -522,7 +507,9 @@ def _import_list_argument(self, loc: Location, arg):
                     pattern = r"^!torch\.(.*?)(?:<.*>)?$"
                     val_type = str(val.type)
                     match = re.match(pattern, val_type)
-                    assert match is not None, f"Unexpected MlirType in list: \'{val_type}\'"
+                    assert (
+                        match is not None
+                    ), f"Unexpected MlirType in list: '{val_type}'"
                     list_type = match.group(1)
                     result_type = MlirType.parse(f"!torch.list<{list_type}>")
             else:
@@ -595,7 +582,7 @@ def lookup(self, t: type) -> Any:


 def _make_constant_op(
-        op_name: str, value_attr: MlirAttribute, result_type: Optional[MlirType] = None
+    op_name: str, value_attr: MlirAttribute, result_type: Optional[MlirType] = None
 ) -> Operation:
     return Operation.create(
         op_name,
@@ -664,14 +651,14 @@ def _make_constant_op(
     int: "!torch.list<int>",
     float: "!torch.list<float>",
     str: "!torch.list<str>",
-    bool: "!torch.list<bool>"
+    bool: "!torch.list<bool>",
 }

 SCALAR_TYPE_TO_TORCH_TYPE = {
     int: "!torch.int",
     float: "!torch.float",
     str: "!torch.str",
-    bool: "!torch.bool"
+    bool: "!torch.bool",
 }

 # AOT-autograd sometimes falsely emit tensor version op with scalar arguments.
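
The retained branch relies on the regex shown in the context lines above rather than the removed index arithmetic; a quick sketch of what that pattern extracts (inputs here are illustrative):

import re

# Pattern from _import_list_argument: capture the base !torch type name,
# dropping any <...> parameter list.
pattern = r"^!torch\.(.*?)(?:<.*>)?$"

for val_type in ["!torch.int", "!torch.vtensor<[4],f32>"]:
    match = re.match(pattern, val_type)
    assert match is not None, f"Unexpected MlirType in list: '{val_type}'"
    print(f"!torch.list<{match.group(1)}>")
# prints: !torch.list<int>
# prints: !torch.list<vtensor>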

python/shark_turbine/dynamo/passes.py
Lines changed: 49 additions & 0 deletions

@@ -0,0 +1,49 @@
+import torch
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch._decomp import get_decompositions
+from torch.func import functionalize
+from typing import List
+
+# default decompositions pulled from SHARK
+DEFAULT_DECOMPOSITIONS = [
+    torch.ops.aten.embedding_dense_backward,
+    torch.ops.aten.native_layer_norm_backward,
+    torch.ops.aten.slice_backward,
+    torch.ops.aten.select_backward,
+    torch.ops.aten.norm.ScalarOpt_dim,
+    torch.ops.aten.native_group_norm,
+    torch.ops.aten.upsample_bilinear2d.vec,
+    torch.ops.aten.split.Tensor,
+    torch.ops.aten.split_with_sizes,
+    torch.ops.aten.native_layer_norm,
+    torch.ops.aten.masked_fill.Tensor,
+    torch.ops.aten.masked_fill.Scalar,
+]
+
+# decompositions that aid us in handling nn.BatchNorm2d
+BATCHNORM_DECOMPOSITIONS = [
+    torch.ops.aten._native_batch_norm_legit_functional,
+    torch.ops.aten.squeeze.dims,
+]
+
+
+def apply_decompositions(
+    gm: torch.fx.GraphModule,
+    example_inputs,
+    decompose_ops: List[torch._ops.OpOverload] = None,
+):
+    if decompose_ops is None:
+        return gm
+
+    decompositions = get_decompositions(decompose_ops)
+    gm = make_fx(
+        functionalize(gm),
+        decomposition_table=decompositions,
+    )(*example_inputs)
+
+    return gm
+
+
+def turbine_cpu_pass_pipeline(gm: torch.fx.GraphModule, example_inputs):
+    decompose_ops = DEFAULT_DECOMPOSITIONS + BATCHNORM_DECOMPOSITIONS
+    return apply_decompositions(gm, example_inputs, decompose_ops)
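
For reference, the new pass can be exercised outside the backend; a minimal sketch, assuming a graph obtained via torch.fx.symbolic_trace stands in for the Dynamo-produced one:

import torch
from shark_turbine.dynamo.passes import apply_decompositions


def foo(x):
    return torch.chunk(x, 2, dim=-1)


gm = torch.fx.symbolic_trace(foo)
example_inputs = [torch.randn(4, 4)]
# torch.chunk lowers to aten.split.Tensor; retracing with its
# decomposition leaves only primitive slicing ops in the graph.
gm = apply_decompositions(
    gm, example_inputs, decompose_ops=[torch.ops.aten.split.Tensor]
)
gm.print_readable()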

python/test/dynamo/importer_basic_test.py
Lines changed: 37 additions & 9 deletions

@@ -6,21 +6,31 @@

 import logging
 import unittest
+from typing import List

 from shark_turbine.dynamo.importer import FxImporter
 import torch
 import torch._dynamo as dynamo
 from torch._dynamo.backends.common import aot_autograd
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch._decomp import get_decompositions
+from torch.func import functionalize
 from torch.fx import (
     GraphModule,
 )


 class ImportTests(unittest.TestCase):
-    def create_backend(self):
+    def create_backend(self, decompose_ops: List[torch._ops.OpOverloadPacket] = None):
         imp = FxImporter()

         def import_compiler(gm: GraphModule, example_inputs):
+            if decompose_ops is not None:
+                gm = make_fx(
+                    functionalize(gm),
+                    decomposition_table=get_decompositions(decompose_ops),
+                )(*example_inputs)
+
             gm.print_readable()
             try:
                 imp.import_graph_module(gm)
@@ -107,17 +117,35 @@ def foo(x, y):
         opt_foo = torch.compile(foo, backend=self.create_backend())
         opt_foo(torch.randn(10), torch.randn(10))

-    @unittest.expectedFailure
-    def testImportChunk(self):
-        """
-        Marked as XFail due to Unsupported placeholder node, where FX graph does not return meta_data["tensor_meta"]
-        to create Ops. Same problem occurs with split.Tensor and unbind.int. Needs to identify the root cause.
-        """
-
+    def testImportDecomposeChunk(self):
         def foo_chunk(x):
             return torch.chunk(x, 2, dim=-1)

-        opt = torch.compile(foo_chunk, backend=self.create_backend())
+        opt = torch.compile(
+            foo_chunk,
+            backend=self.create_backend(
+                decompose_ops=[
+                    torch.ops.aten.split.Tensor,
+                    torch.ops.aten.split_with_sizes,
+                ]
+            ),
+        )
+        t = torch.randn([4, 4, 4, 4])
+        opt(t)
+
+    def testImportDecomposeBatchNorm2D(self):
+        def foo_chunk(x):
+            return torch.nn.BatchNorm2d(4)(x)
+
+        opt = torch.compile(
+            foo_chunk,
+            backend=self.create_backend(
+                decompose_ops=[
+                    torch.ops.aten._native_batch_norm_legit_functional,
+                    torch.ops.aten.squeeze.dims,
+                ]
+            ),
+        )
         t = torch.randn([4, 4, 4, 4])
         opt(t)

python/test/dynamo/multiple_aten_results_test.py
Lines changed: 0 additions & 1 deletion

@@ -34,7 +34,6 @@ def import_compiler(gm: GraphModule, example_inputs):
 import torch.nn.functional as F

 class Scaled_Dot_Product_Attention(nn.Module):
-
     def __init__(self):
         super(Scaled_Dot_Product_Attention, self).__init__()

python/test/generated/evaluate.py
Lines changed: 33 additions & 2 deletions

@@ -9,23 +9,54 @@
     GraphModule,
 )

+from torch.fx.experimental.proxy_tensor import make_fx
+from torch._decomp import get_decompositions
+from torch.func import functionalize
+from typing import List
+
+
+def default_decompositions():
+    return get_decompositions(
+        [
+            torch.ops.aten.embedding_dense_backward,
+            torch.ops.aten.native_layer_norm_backward,
+            torch.ops.aten.slice_backward,
+            torch.ops.aten.select_backward,
+            torch.ops.aten.norm.ScalarOpt_dim,
+            torch.ops.aten.native_group_norm,
+            torch.ops.aten.upsample_bilinear2d.vec,
+            torch.ops.aten.split.Tensor,
+            torch.ops.aten.split_with_sizes,
+            torch.ops.aten.native_layer_norm,
+            torch.ops.aten.masked_fill.Tensor,
+            torch.ops.aten.masked_fill.Scalar,
+            torch.ops.aten._native_batch_norm_legit_functional,
+            torch.ops.aten.squeeze.dims,
+        ]
+    )
+
+
 def create_backend():
     imp = FxImporter()

     def import_compiler(gm: GraphModule, example_inputs):
-        # gm.print_readable()
+        gm = make_fx(
+            functionalize(gm),
+            decomposition_table=default_decompositions(),
+        )(*example_inputs)
+
         try:
             imp.import_graph_module(gm)
         finally:
             pass
-            # print(imp.module)
         imp.module.operation.verify()
         return gm

     backend = import_compiler
     backend = aot_autograd(fw_compiler=backend)
     return backend

+
 def evaluate_importer(nn_cls, get_init_args, get_forward_args, test_identifier):
     log = logging.getLogger("turbine-test")
     try:

python/test/generated/main.py
Lines changed: 30 additions & 6 deletions

@@ -8,45 +8,69 @@
 import torch._inductor.config

 import logging
+
 log = logging.getLogger("turbine-test")
 logging.basicConfig(level=logging.INFO)

 ENV_FILE = "JITPARITYBENCH_PATH.txt"

+
 def get_args(raw_args=None):
     parser = argparse.ArgumentParser()
-    parser.add_argument("--jobs", "-j", type=int, default=4, help="Number of threads in our threadpool, jobs=1 is essentially sequential execution")
-    parser.add_argument("--offset", type=int, default=0, help="Pick files starting from this offset. Together with --limit, we can run through all files in multiple separate runs")
+    parser.add_argument(
+        "--jobs",
+        "-j",
+        type=int,
+        default=4,
+        help="Number of threads in our threadpool, jobs=1 is essentially sequential execution",
+    )
+    parser.add_argument(
+        "--offset",
+        type=int,
+        default=0,
+        help="Pick files starting from this offset. Together with --limit, we can run through all files in multiple separate runs",
+    )
     parser.add_argument("--limit", "-l", type=int, help="only run the first N files")
-    parser.add_argument("--filter", "-f", "-k", help="only run module containing given name")
+    parser.add_argument(
+        "--filter", "-f", "-k", help="only run module containing given name"
+    )
     parser.add_argument("--skips", type=str)
-    parser.add_argument("--tests-dir", default=None, help="jit-paritybench location (i.e. /path/to/pytorch-jit-paritybench)")
+    parser.add_argument(
+        "--tests-dir",
+        default=None,
+        help="jit-paritybench location (i.e. /path/to/pytorch-jit-paritybench)",
+    )
     # parser.add_argument("--device", default="cuda", type=str, help="evaluate modules using cuda or cpu") # excluded for now as we only have turbine-cpu, can use this later

     args = parser.parse_args(raw_args)
     return args

+
 def write_path(path: str):
     with open(ENV_FILE, "w") as f:
         f.write(path)

+
 def read_path() -> str:
     with open(ENV_FILE, "r") as f:
         path = f.read()
     return path

+
 if __name__ == "__main__":
     args = get_args()

     if args.tests_dir is not None:
         pb = args.tests_dir
-        write_path(pb) # store this path for next time
+        write_path(pb)  # store this path for next time
         log.info(f"Using test directory from CLI: {pb}")
     elif os.path.exists(ENV_FILE):
         pb = read_path()
         log.info(f"Using test directory from {ENV_FILE}: {pb}")
     else:
-        raise RuntimeError(f"Must either pass 'tests-dir' or set {ENV_FILE} in order to run tests")
+        raise RuntimeError(
+            f"Must either pass 'tests-dir' or set {ENV_FILE} in order to run tests"
+        )

     # enables finding necessary modules in jit-paritybench
     pb_gen = pb + "/generated"
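
Given the flags defined in get_args, a typical invocation would be `python main.py --tests-dir /path/to/pytorch-jit-paritybench --limit 10 -j 4` (path illustrative); subsequent runs can omit `--tests-dir`, since the path is cached in JITPARITYBENCH_PATH.txt by write_path and recovered by read_path.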
