
Commit 8e682cd

update yapf
1 parent 97dc6c3 commit 8e682cd

13 files changed: +50 -47 lines changed

CONTRIBUTING.md

+1 -1

@@ -169,7 +169,7 @@ find -name '*.cpp' -o -name '*.h' -o -name '*.cc' | xargs clang-format-11 -i -st
 If your PR touches the Python source files, please run the following command before submitting a PR.
 
 ```Shell
-# How to install: pip install yapf==0.30.0
+# How to install: pip install yapf==0.40.2
 yapf --recursive -i *.py test/ scripts/ torch_xla/ benchmarks/
 ```
 

benchmarks/benchmark_model.py

+2 -3

@@ -227,9 +227,8 @@ def is_compatible(self, dummy_benchmark_model: BenchmarkModel,
 def get_benchmark_indices(self, length: int):
 start = self._args.partition_id * (length // self._args.total_partitions)
 end = ((self._args.partition_id + 1) *
-(length // self._args.total_partitions)
-if self._args.partition_id < self._args.total_partitions - 1 else
-length)
+(length // self._args.total_partitions) if self._args.partition_id
+< self._args.total_partitions - 1 else length)
 return start, end
 
 def skip_model(self, model_name: str):

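The hunk above is whitespace-only; the partition arithmetic is unchanged. As a reading aid, here is a minimal standalone sketch of what `get_benchmark_indices` computes (rewritten as a free function with explicit arguments in place of `self._args`; the last partition absorbs the remainder):

```python
def get_benchmark_indices(partition_id: int, total_partitions: int,
                          length: int):
  # Every partition gets length // total_partitions benchmarks; the last
  # partition also takes the remainder so all indices are covered.
  start = partition_id * (length // total_partitions)
  end = ((partition_id + 1) * (length // total_partitions)
         if partition_id < total_partitions - 1 else length)
  return start, end


# Example: 10 benchmarks over 3 partitions -> (0, 3), (3, 6), (6, 10).
print([get_benchmark_indices(i, 3, 10) for i in range(3)])
```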
infra/ansible/config/pip.yaml

+1 -1

@@ -28,7 +28,7 @@ pip:
 - tqdm
 - typing_extensions
 - sympy
-- yapf==0.30.0
+- yapf==0.40.2
 
 build_amd64:
 - mkl

test/pytorch_test_base.py

+2 -2

@@ -619,8 +619,8 @@ def skipped_test(self, *args, reason=reason, **kwargs):
 setattr(cls, dtype_test_name, disallowed_test)
 if not skipped:
 xla_dtypes.append(
-dtype_combination
-if len(dtype_combination) > 1 else dtype_combination[0])
+dtype_combination if len(dtype_combination) >
+1 else dtype_combination[0])
 if len(xla_dtypes) != 0:
 test.dtypes[cls.device_type] = xla_dtypes
 super().instantiate_test(name, test, generic_cls=generic_cls)

test/spmd/test_xla_sharding.py

+15 -12

@@ -618,9 +618,9 @@ def test_inplace_add_with_sharding(self):
 
 # avoid calling xr.addressable_device_count here otherwise it will init the test
 # in non-spmd mode.
-@unittest.skipIf(xr.device_type() == 'CPU',
-"sharding will be the same for both tensors on single device"
-)
+@unittest.skipIf(
+xr.device_type() == 'CPU',
+"sharding will be the same for both tensors on single device")
 def test_shard_hashing(self):
 xt1 = torch.ones(2, 2).to(xm.xla_device())
 xt2 = torch.ones(2, 2).to(xm.xla_device())

@@ -1383,8 +1383,9 @@ def test_get_1d_mesh(self):
 self.assertEqual(mesh_without_name.mesh_shape,
 (xr.global_runtime_device_count(),))
 
-@unittest.skipUnless(xr.global_runtime_device_count() > 1,
-"Multiple devices required for dataloader sharding test")
+@unittest.skipUnless(
+xr.global_runtime_device_count() > 1,
+"Multiple devices required for dataloader sharding test")
 def test_data_loader_with_sharding(self):
 device = torch_xla.device()
 mesh = xs.get_1d_mesh("data")

@@ -1405,8 +1406,9 @@ def test_data_loader_with_sharding(self):
 f"{{devices=[{mesh.size()},1,1,1]{','.join([str(i) for i in range(mesh.size())])}}}"
 )
 
-@unittest.skipUnless(xr.global_runtime_device_count() > 1,
-"Multiple devices required for dataloader sharding test")
+@unittest.skipUnless(
+xr.global_runtime_device_count() > 1,
+"Multiple devices required for dataloader sharding test")
 def test_data_loader_with_non_batch_size(self):
 device = torch_xla.device()
 mesh = xs.get_1d_mesh("data")

@@ -1427,8 +1429,9 @@ def test_data_loader_with_non_batch_size(self):
 f"{{devices=[{mesh.size()},1,1,1]{','.join([str(i) for i in range(mesh.size())])}}}"
 )
 
-@unittest.skipUnless(xr.global_runtime_device_count() > 1,
-"Multiple devices required for dataloader sharding test")
+@unittest.skipUnless(
+xr.global_runtime_device_count() > 1,
+"Multiple devices required for dataloader sharding test")
 def test_data_loader_with_non_batch_size_and_mini_batch(self):
 device = torch_xla.device()
 mesh = xs.get_1d_mesh("data")

@@ -1660,9 +1663,9 @@ def test_get_logical_mesh(self):
 self.assertEqual(logical_mesh.shape, mesh_shape)
 np.testing.assert_array_equal(np.sort(logical_mesh.flatten()), device_ids)
 
-@unittest.skipIf(xr.device_type() == 'CPU',
-"sharding will be the same for both tensors on single device"
-)
+@unittest.skipIf(
+xr.device_type() == 'CPU',
+"sharding will be the same for both tensors on single device")
 def test_shard_as(self):
 mesh = self._get_mesh((self.n_devices,))
 partition_spec = (0,)

test/test_operations.py

+3 -5

@@ -2959,11 +2959,9 @@ def test_dlpack_roundtrip_tensor(self, dtype):
 
 @onlyIfTorchSupportsCUDA
 @onlyIfPJRTDeviceIsCUDA
-@parameterized.parameters(*all_types_and_complex_and(torch.half,
-torch.bfloat16,
-torch.bool, torch.uint16,
-torch.uint32,
-torch.uint64))
+@parameterized.parameters(
+*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool,
+torch.uint16, torch.uint32, torch.uint64))
 def test_dlpack_roundtrip_scalar(self, dtype):
 xla_device = xm.xla_device()
 xla_tensor_0 = torch.tensor(42, dtype=dtype).to(xla_device)

test/test_pallas.py

+4 -4

@@ -41,10 +41,10 @@ class PallasTest(parameterized.TestCase):
 # therefore we use != instead of ==.
 def _make_attention_mask_from_segment_ids(self, q_segment_ids,
 kv_segment_ids):
-return q_segment_ids.view(q_segment_ids.shape[0], 1,
-q_segment_ids.shape[1], 1) != kv_segment_ids.view(
-kv_segment_ids.shape[0], 1, 1,
-kv_segment_ids.shape[1])
+return q_segment_ids.view(q_segment_ids.shape[0], 1, q_segment_ids.shape[1],
+1) != kv_segment_ids.view(kv_segment_ids.shape[0],
+1, 1,
+kv_segment_ids.shape[1])
 
 def _attention(self, q, k, v, *, attn_mask=None, ab=None):
 attn_weight = q @ k.transpose(-2, -1)

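`_make_attention_mask_from_segment_ids` is only re-wrapped by yapf here. What it does: reshape a `[batch, q_len]` and a `[batch, kv_len]` tensor of segment ids so they broadcast into a `[batch, 1, q_len, kv_len]` boolean mask that is True wherever query and key fall in different segments. A standalone sketch with toy inputs (same expression as the hunk, invented shapes):

```python
import torch


def make_attention_mask_from_segment_ids(q_segment_ids, kv_segment_ids):
  # [B, Q] -> [B, 1, Q, 1] and [B, K] -> [B, 1, 1, K]; broadcasting the
  # inequality produces [B, 1, Q, K], True where the pair crosses segments.
  return q_segment_ids.view(q_segment_ids.shape[0], 1, q_segment_ids.shape[1],
                            1) != kv_segment_ids.view(kv_segment_ids.shape[0],
                                                      1, 1,
                                                      kv_segment_ids.shape[1])


q_ids = torch.tensor([[0, 0, 1, 1]])  # one batch, four query positions
kv_ids = torch.tensor([[0, 1, 1]])  # three key/value positions
mask = make_attention_mask_from_segment_ids(q_ids, kv_ids)
print(mask.shape)  # torch.Size([1, 1, 4, 3])
```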
test/test_pallas_spmd.py

+4 -4

@@ -41,10 +41,10 @@ class PallasTest(unittest.TestCase):
 # therefore we use != instead of ==.
 def _make_attention_mask_from_segment_ids(self, q_segment_ids,
 kv_segment_ids):
-return q_segment_ids.view(q_segment_ids.shape[0], 1,
-q_segment_ids.shape[1], 1) != kv_segment_ids.view(
-kv_segment_ids.shape[0], 1, 1,
-kv_segment_ids.shape[1])
+return q_segment_ids.view(q_segment_ids.shape[0], 1, q_segment_ids.shape[1],
+1) != kv_segment_ids.view(kv_segment_ids.shape[0],
+1, 1,
+kv_segment_ids.shape[1])
 
 def _attention(self, q, k, v, *, attn_mask=None, ab=None):
 attn_weight = q @ k.transpose(-2, -1)

test/test_splash_attention.py

+4 -4

@@ -62,10 +62,10 @@ def setUp(self):
 
 def _make_attention_mask_from_segment_ids(self, q_segment_ids,
 kv_segment_ids):
-return q_segment_ids.view(q_segment_ids.shape[0], 1,
-q_segment_ids.shape[1], 1) != kv_segment_ids.view(
-kv_segment_ids.shape[0], 1, 1,
-kv_segment_ids.shape[1])
+return q_segment_ids.view(q_segment_ids.shape[0], 1, q_segment_ids.shape[1],
+1) != kv_segment_ids.view(kv_segment_ids.shape[0],
+1, 1,
+kv_segment_ids.shape[1])
 
 def maybe_repeat_kv(self, hidden_state):
 if hidden_state.size(1) == self.NUM_Q_HEADS:

torch_xla/distributed/xla_multiprocessing.py

+4 -1

@@ -174,7 +174,10 @@ def _v6e_create_replica_groups() -> List | None:
 return None
 
 
-device_kind_handler_dict: dict[str, Callable[..., List | None],] = {
+device_kind_handler_dict: dict[
+str,
+Callable[..., List | None],
+] = {
 _TPU_V5P: _v5p_create_replica_groups,
 _TPU_V6E: _v6e_create_replica_groups
 }

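The reformatting above splits the annotation of `device_kind_handler_dict`, a dispatch table from a TPU device-kind string to a handler that returns replica groups (or None). The call site is not shown in this diff; below is a hedged sketch of how such a table is typically consulted, with hypothetical key strings and no-op handlers standing in for the real `_TPU_V5P`/`_TPU_V6E` constants and functions:

```python
from typing import Callable, List


def _v5p_create_replica_groups() -> List | None:  # placeholder handler
  return None


def _v6e_create_replica_groups() -> List | None:  # placeholder handler
  return None


device_kind_handler_dict: dict[
    str,
    Callable[..., List | None],
] = {
    'TPU v5p': _v5p_create_replica_groups,  # hypothetical key strings
    'TPU v6e': _v6e_create_replica_groups,
}


def create_replica_groups(device_kind: str) -> List | None:
  # Unknown device kinds fall through to None, i.e. default replica groups.
  handler = device_kind_handler_dict.get(device_kind)
  return handler() if handler is not None else None
```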
torch_xla/experimental/gradient_accumulation.py

+2 -2

@@ -288,8 +288,8 @@ def add_to_mapping(val: torch.Tensor,
 iterable_tensors, fake_iterable_tensors, carried_tensors,
 fake_carried_tensors, params, grads)
 
-def _body_fn_wrapper(curr_iter: xb.Op, curr_loss: xb.Op,
-*while_params: xb.Op):
+def _body_fn_wrapper(curr_iter: xb.Op, curr_loss: xb.Op, *while_params:
+xb.Op):
 
 def dynamic_slice(xs: xb.Op, idx: xb.Op) -> xb.Op:
 indices = [idx] + [idx.zeros_like() for _ in range(xs.shape().rank - 1)]

torch_xla/experimental/pallas_kernels/ragged_paged_attention_kernel.py

+6 -6

@@ -243,8 +243,8 @@ def make_sequence_metadata(
 #
 # Remove tile visits that belong to a sequence not in our shard.
 iota = jnp.arange(num_sequences, dtype=jnp.int32)
-active_sequence_mask = jnp.logical_and(iota <= end_sequence,
-iota >= start_sequence)
+active_sequence_mask = jnp.logical_and(iota <= end_sequence, iota
+>= start_sequence)
 sequence_tiles = jnp.where(active_sequence_mask,
 sequence_tiles[:num_sequences], 0)
 num_tiles = sequence_tiles.sum()

@@ -375,8 +375,8 @@ def _flash_attention(
 logical_q_blk_idx - 1, 0)
 is_first_processed_logical_q_blk = logical_q_blk_idx == 0
 physical_q_blk_changed = (
-physical_q_tile_ids[logical_q_blk_idx] !=
-physical_q_tile_ids[prev_logical_q_blk_idx])
+physical_q_tile_ids[logical_q_blk_idx]
+!= physical_q_tile_ids[prev_logical_q_blk_idx])
 first_time_seeing_physical_q_blk = jnp.logical_or(
 is_first_processed_logical_q_blk, physical_q_blk_changed)
 is_first_kv_blk = (kv_blk_idx == 0)

@@ -509,8 +509,8 @@ def init_scratch_ref(): # pylint: disable=unused-variable
 logical_q_blk_idx + 1)
 is_last_logical_q_blk = (logical_q_blk_idx == num_logical_q_blks - 1)
 physical_q_blk_will_change = (
-physical_q_tile_ids[logical_q_blk_idx] !=
-physical_q_tile_ids[next_logical_q_blk_idx])
+physical_q_tile_ids[logical_q_blk_idx]
+!= physical_q_tile_ids[next_logical_q_blk_idx])
 last_time_seeing_cur_physical_q_blk = jnp.logical_or(
 is_last_logical_q_blk, physical_q_blk_will_change)
 should_store_to_output = jnp.logical_and(is_last_kv_blk_idx,

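The first hunk in this file only re-breaks a `jnp.logical_and` call. The underlying step zeroes out tile counts for sequences that fall outside the shard's `[start_sequence, end_sequence]` range before summing the remaining work. A rough numpy sketch of that masking step (invented sizes and values, numpy in place of jax.numpy):

```python
import numpy as np

# Tiles of work per sequence, for 6 sequences (made-up numbers).
sequence_tiles = np.array([2, 1, 3, 2, 4, 1], dtype=np.int32)
start_sequence, end_sequence = 2, 4  # this shard owns sequences 2..4

iota = np.arange(len(sequence_tiles), dtype=np.int32)
active_sequence_mask = np.logical_and(iota <= end_sequence,
                                      iota >= start_sequence)
sequence_tiles = np.where(active_sequence_mask, sequence_tiles, 0)

print(sequence_tiles)  # [0 0 3 2 4 0]
print(sequence_tiles.sum())  # 9 tiles to process in this shard
```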
torch_xla/experimental/pallas_kernels/ragged_paged_attention_v2.py

+2 -2

@@ -421,8 +421,8 @@ def init_scratch_ref():
 )
 causal_mask = row_ids < col_ids
 if sliding_window is not None:
-causal_mask = jnp.logical_or(causal_mask,
-row_ids - sliding_window >= col_ids)
+causal_mask = jnp.logical_or(causal_mask, row_ids - sliding_window
+>= col_ids)
 if soft_cap is not None:
 qk = soft_cap * jnp.tanh(qk / soft_cap)
 qk += jnp.where(causal_mask, mask_value, 0.0)

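Only the wrapping of the `jnp.logical_or` call changes in this hunk. The mask it builds flags (query, key) pairs to exclude from attention: keys in the future of the query (`row_ids < col_ids`) and, when `sliding_window` is set, keys at least `sliding_window` positions in the past. A small numpy sketch of the combined mask (toy sizes, numpy in place of jax.numpy):

```python
import numpy as np

seq_len, sliding_window = 6, 3
row_ids = np.arange(seq_len)[:, None]  # query positions
col_ids = np.arange(seq_len)[None, :]  # key positions

# True means "mask this (query, key) pair out of attention".
causal_mask = row_ids < col_ids  # no attending to future keys
causal_mask = np.logical_or(causal_mask, row_ids - sliding_window
                            >= col_ids)  # and none too far in the past

# With sliding_window=3 each query sees itself and at most the 2 previous keys.
print((~causal_mask).astype(int))
```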