
Commit 2feb0ac

[scan] Make sure inputs into fn are not device_data IR nodes (#8769)
1 parent 2675e68 commit 2feb0ac

File tree: 4 files changed (+163, -99 lines)


test/scan/test_scan_pallas.py

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
+import logging
+import sys
+import unittest
+from absl.testing import parameterized
+
+import torch
+from torch import nn as nn
+
+import torch_xla
+import torch_xla.core.xla_model as xm
+from torch_xla import runtime as xr
+from torch_xla.experimental.scan_layers import scan_layers
+import torch_xla.distributed.spmd as xs
+from torch_xla.experimental.custom_kernel import flash_attention
+
+
+class AttentionModule(torch.nn.Module):
+
+  def __init__(self, has_model_weight=True, num_head=4, hidden_dim=256):
+    super(AttentionModule, self).__init__()
+    self.has_model_weight = has_model_weight
+    if has_model_weight:
+      self.num_head = num_head
+      self.hidden_dim = hidden_dim
+      self.fc = nn.Linear(hidden_dim, hidden_dim)
+
+  def forward(self, input):
+    # query_states: [B, NUM_HEAD, SEQ_LEN, d_k]
+    # attn_output: [B, SEQ_LEN, d_m], dm = dk * NUM_HEAD
+    query_states = input.clone()
+    key_states = input.clone()
+    value_states = input.clone()
+    attn_output = flash_attention(
+        query_states,
+        key_states,
+        value_states,
+        causal=True,
+        partition_spec=("fsdp", None, None, None),
+    )
+    if self.has_model_weight:
+      attn_output = self.fc(attn_output)
+    return attn_output
+
+
+class AttentionLayers(torch.nn.Module):
+
+  def __init__(self, has_model_weight=True, num_layer=3, use_scan=False):
+    super(AttentionLayers, self).__init__()
+    self.num_layer = num_layer
+    self.use_scan = use_scan
+    self.has_model_weight = has_model_weight
+    self.layers = nn.ModuleList([
+        AttentionModule(has_model_weight=has_model_weight)
+        for i in range(self.num_layer)
+    ])
+
+  def forward(self, input):
+    hidden_states = input
+    xs.mark_sharding(hidden_states, xs.get_global_mesh(),
+                     ("fsdp", None, None, None))
+    if not self.use_scan:
+      for layer in self.layers:
+        hidden_states = layer(hidden_states)
+    else:
+      hidden_states = scan_layers(self.layers, input_data=hidden_states)
+    return hidden_states
+
+
+class ScanFlashAttentionTest(parameterized.TestCase):
+
+  def fake_fa_wrapper(self, has_model_weight, use_scan):
+    torch.manual_seed(12)
+    torch_xla.manual_seed(12)
+    hidden_states = torch.randn((2, 4, 256, 256)).requires_grad_().to('xla')
+    with xm.xla_device():
+      attention_layers = AttentionLayers(
+          has_model_weight, num_layer=3, use_scan=use_scan)
+    hidden_states.retain_grad()
+    output = attention_layers(hidden_states)
+    return output
+
+  @unittest.skipIf(xr.device_type() != 'TPU', "This test only works on TPU")
+  def test_scan_flash_attention_against_for_loop(self):
+    for_loop_output = self.fake_fa_wrapper(
+        has_model_weight=True, use_scan=False)
+    torch_xla.sync()
+    scan_output = self.fake_fa_wrapper(has_model_weight=True, use_scan=True)
+    torch_xla.sync()
+    torch.testing.assert_close(
+        for_loop_output.cpu(), scan_output.cpu(), atol=1e-3, rtol=1e-3)
+
+  @unittest.skipIf(xr.device_type() != 'TPU', "This test only works on TPU")
+  @parameterized.named_parameters(("has_model_weight_True", True),
+                                  ("has_model_weight_False", False))
+  def test_scan_weight_layer_aot(self, has_model_weight_scan):
+    output = self.fake_fa_wrapper(
+        has_model_weight=has_model_weight_scan, use_scan=False)
+    torch_xla.sync()
+    # TODO(https://github.com/pytorch/xla/issues/8753): Fix assertion
+    # torch.manual_seed(12)
+    # torch_xla.manual_seed(12)
+    # scan_output = self.fake_fa_wrapper(
+    #     has_model_weight=has_model_weight_scan, use_scan=True)
+    # torch_xla.sync()
+    # torch.testing.assert_close(output.cpu(), scan_output.cpu())
+
+
+if __name__ == '__main__':
+  logging.getLogger().setLevel(logging.INFO)
+
+  xr.use_spmd()
+  n_devices = xr.global_runtime_device_count()
+  xs.set_global_mesh(xs.get_1d_mesh("fsdp"))
+
+  test = unittest.main()
+  sys.exit(0 if test.result.wasSuccessful() else 1)

test/test_as_stride_use_slice.py

Lines changed: 0 additions & 98 deletions
@@ -9,67 +9,12 @@
 import torch_xla
 import torch_xla.core.xla_model as xm
 from torch_xla import runtime as xr
-from torch_xla._internal import tpu
-from torch_xla.experimental.scan_layers import scan_layers
 import torch_xla.distributed.spmd as xs
-from torch_xla.experimental.custom_kernel import flash_attention
 
 from functorch.compile import aot_function, make_boxed_func
 from torch.library import custom_op
 
 
-class AttentionModule(torch.nn.Module):
-
-  def __init__(self, has_model_weight=True, num_head=4, hidden_dim=256):
-    super(AttentionModule, self).__init__()
-    self.has_model_weight = has_model_weight
-    if has_model_weight:
-      self.num_head = num_head
-      self.hidden_dim = hidden_dim
-      self.fc = nn.Linear(hidden_dim, hidden_dim)
-
-  def forward(self, input):
-    # query_states: [B, NUM_HEAD, SEQ_LEN, d_k]
-    # attn_output: [B, SEQ_LEN, d_m], dm = dk * NUM_HEAD
-    query_states = input.clone()
-    key_states = input.clone()
-    value_states = input.clone()
-    attn_output = flash_attention(
-        query_states,
-        key_states,
-        value_states,
-        causal=True,
-        partition_spec=("fsdp", None, None, None),
-    )
-    if self.has_model_weight:
-      attn_output = self.fc(attn_output)
-    return attn_output
-
-
-class AttentionLayers(torch.nn.Module):
-
-  def __init__(self, has_model_weight=True, num_layer=3, use_scan=False):
-    super(AttentionLayers, self).__init__()
-    self.num_layer = num_layer
-    self.use_scan = use_scan
-    self.has_model_weight = has_model_weight
-    self.layers = nn.ModuleList([
-        AttentionModule(has_model_weight=has_model_weight)
-        for i in range(self.num_layer)
-    ])
-
-  def forward(self, input):
-    hidden_states = input
-    xs.mark_sharding(hidden_states, xs.get_global_mesh(),
-                     ("fsdp", None, None, None))
-    if not self.use_scan:
-      for layer in self.layers:
-        hidden_states = layer(hidden_states)
-    else:
-      hidden_states = scan_layers(self.layers, input_data=hidden_states)
-    return hidden_states
-
-
 class StridedAndSlice(torch.nn.Module):
 
   def __init__(self):
@@ -198,50 +143,7 @@ def compiler(gm, _):
     torch.testing.assert_close(cpu_output, xla_output.cpu())
 
 
-class ScanFlashAttentionTest(parameterized.TestCase):
-
-  def fake_fa_wrapper(self, has_model_weight, use_scan):
-    with xm.xla_device():
-      dm = AttentionLayers(has_model_weight, 3, use_scan)
-      hidden_states = torch.randn((2, 4, 256, 256)).requires_grad_()
-      hidden_states.retain_grad()
-      output = dm(hidden_states)
-    return output
-
-  @unittest.skipIf(xr.device_type() != 'TPU', "This test only works on TPU")
-  @parameterized.named_parameters(("use_scan_True", True),
-                                  ("use_scan_False", False))
-  def test_scan_layer_aot(self, use_scan):
-    output = self.fake_fa_wrapper(has_model_weight=True, use_scan=use_scan)
-    torch_xla.sync()
-    # TODO(https://github.com/pytorch/xla/issues/8742): Fix NaN
-    # self.assertFalse(torch.isnan(output).any())
-
-  @unittest.skipIf(xr.device_type() != 'TPU', "This test only works on TPU")
-  @parameterized.named_parameters(("has_model_weight_True", True),
-                                  ("has_model_weight_False", False))
-  def test_scan_weight_layer_aot(self, has_model_weight_scan):
-    torch.manual_seed(12)
-    torch_xla.manual_seed(12)
-    output = self.fake_fa_wrapper(
-        has_model_weight=has_model_weight_scan, use_scan=False)
-    torch_xla.sync()
-    # TODO(https://github.com/pytorch/xla/issues/8742): Fix NaN
-    # TODO(https://github.com/pytorch/xla/issues/8753): Fix assertion
-    # torch.manual_seed(12)
-    # torch_xla.manual_seed(12)
-    # scan_output = self.fake_fa_wrapper(
-    #     has_model_weight=has_model_weight_scan, use_scan=True)
-    # torch_xla.sync()
-    # torch.testing.assert_close(output.cpu(), scan_output.cpu())
-
-
 if __name__ == '__main__':
   logging.getLogger().setLevel(logging.INFO)
-
-  xr.use_spmd()
-  n_devices = xr.global_runtime_device_count()
-  xs.set_global_mesh(xs.get_1d_mesh("fsdp"))
-
   test = unittest.main()
   sys.exit(0 if test.result.wasSuccessful() else 1)

test/tpu/run_tests.sh

Lines changed: 1 addition & 0 deletions
@@ -32,6 +32,7 @@ python3 "$TEST_CDIR/pjrt/test_dynamic_plugin_tpu.py"
 python3 "$TEST_CDIR/test_while_loop.py"
 python3 "$TEST_CDIR/scan/test_scan.py"
 python3 "$TEST_CDIR/scan/test_scan_spmd.py"
+python3 "$TEST_CDIR/scan/test_scan_pallas.py"
 python3 "$TEST_CDIR/scan/test_scan_layers.py"
 python3 "$TEST_CDIR/test_as_stride_use_slice.py"
 run_xla_hlo_debug python3 "$TEST_CDIR/scan/test_scan_debug.py"

torch_xla/experimental/scan.py

Lines changed: 46 additions & 1 deletion
@@ -564,7 +564,52 @@ def make_fake_tensor(v: torch.Tensor) -> torch.Tensor:
   device = torch_xla.device()
   fake_carry = tree_map(make_fake_tensor, init)
   fake_x = tree_map(lambda v: make_fake_tensor(v[0]), xs)
-  fake_output_carry, fake_output_y = fn(fake_carry, fake_x)
+
+  def defeat_device_data(v: torch.Tensor) -> torch.Tensor:
+    """
+    Make sure inputs into `fn` are not `device_data` IR nodes.
+
+    This is to work around a limitation of `mark_sharding`, which replaces
+    the innards of the tensors it operates on. In other words, `mark_sharding`
+    is an in-place operation as opposed to a transform like those found in JAX.
+
+    When `fn` contains a `mark_sharding` and the `mark_sharding` operates on one
+    of the carry or xs fake tensors, the original device data will be discarded
+    and a new one will be created in its place. That's because `mark_sharding`
+    has different code paths depending on whether the IR holds device data.
+    If the IR is an intermediate operation like add or matmul, `mark_sharding`
+    will update the sharding annotation. If the IR holds data, `mark_sharding`
+    will transfer the data to the TPU in a sharded manner, and update the data
+    object in the IR to point to a sharded data object, as can be seen in [2].
+
+    When lowering a graph to HLO, tensors that hold the same data object will
+    map to the same HLO parameter. Changing the data object in the tensor will
+    cause it to map to a different HLO parameter. As a result, `fn` will appear
+    to create a few empty tensors internally that are unrelated to the carry and
+    xs fake tensors, and the carry and xs will appear completely unused.
+
+    See https://github.com/pytorch/xla/issues/8742 for the bug. In short,
+    if an input into the layer to be scanned is a device data, and that layer
+    does a `mark_sharding` on said input, then the graph capturing in `scan`
+    will fail.
+
+    The workaround here is simple and cursed: multiply any `device_data` by 1.
+    This makes sure these tensors don't hold device data IR nodes, and so
+    defeats the device data replacement of `mark_sharding`.
+
+    Fortunately, XLA simplifies away the multiplication (see [1]) so this should
+    become a no-op by the time it hits the TPU.
+
+    [1]: https://github.com/openxla/xla/blob/869f57d0082d7adbb9efc10cc18f51a562fc7bf3/xla/hlo/transforms/simplifiers/algebraic_simplifier.cc#L4755-L4770
+    [2]: https://github.com/pytorch/xla/blob/2675e6892c6f955fc2baf88d85dfdfa72062273c/torch_xla/csrc/xla_sharding_util.cpp#L799-L846
+
+    """
+    return v * 1
+
+  # Trace `fn` in order to stage out its HLO.
+  fake_output_carry, fake_output_y = fn(
+      tree_map(defeat_device_data, fake_carry),
+      tree_map(defeat_device_data, fake_x))
 
   y_len = len(fake_output_y)
   fn_outputs = fake_output_carry + fake_output_y
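For readers who want to try the workaround described in the docstring above outside of `scan`, here is a minimal, hypothetical sketch (not part of this commit) of the same pattern: apply a no-op multiply to every tensor leaf of the inputs before tracing a function, so none of the traced inputs is a bare device-data IR node that an in-place `mark_sharding` could swap out. It assumes only `torch` and `torch.utils._pytree.tree_map`; the helper name `trace_with_protected_inputs` is illustrative.

import torch
from torch.utils._pytree import tree_map


def defeat_device_data(v: torch.Tensor) -> torch.Tensor:
  # Multiplying by 1 turns a bare device-data node into an intermediate IR
  # node; XLA's algebraic simplifier later folds the multiply away.
  return v * 1


def trace_with_protected_inputs(fn, carry, x):
  # Hypothetical helper: shield every tensor leaf of the carry/x pytrees
  # before handing them to the function being traced.
  return fn(tree_map(defeat_device_data, carry),
            tree_map(defeat_device_data, x))

The design choice is the same as in the diff: the extra multiply changes how the lazy IR classifies the input, not what the compiled program computes.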
