Commit 6b11290

Only calls destroy_process_group if the trainer exits successfully (#1342)
If we call destroy_process_group when some trainers have hit exceptions while others are still running collectives, the cleanup itself deadlocks.

stacktrace:

```
Thread 0x7F81445A8440 (active): "MainThread"
    destroy_process_group (torch/distributed/distributed_c10d.py:2184)
    <module> (torchtitan/train.py:554)
    _run_code (runpy.py:86)
    _run_module_as_main (runpy.py:196)
Thread 0x7F7E83CFF640 (active): "Thread-1 (_read_thread)"
    _recv_msg (torch/_inductor/compile_worker/subproc_pool.py:61)
    _read_thread (torch/_inductor/compile_worker/subproc_pool.py:195)
    run (threading.py:953)
    _bootstrap_inner (threading.py:1016)
    _bootstrap (threading.py:973)
Thread 0x7F7D9CFF9640 (idle): "Thread-2"
    wait (threading.py:324)
    wait (threading.py:607)
    run (tqdm/_monitor.py:60)
    _bootstrap_inner (threading.py:1016)
    _bootstrap (threading.py:973)
```
1 parent 5d4cc9a · commit 6b11290
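For context, below is a minimal runnable sketch of the control flow this commit moves to. The stub `Trainer` and the module-level `logger` are stand-ins for torchtitan's real objects, which are not part of this hunk. The key point is that `destroy_process_group()` now lives in the `else:` branch: a rank that raised does only local cleanup and re-raises, instead of blocking in collective teardown while its peers may still be mid-collective.

```python
import logging

import torch.distributed as dist

logger = logging.getLogger(__name__)


class Trainer:
    """Minimal stub standing in for torchtitan's Trainer; not part of this commit."""

    def train(self) -> None: ...

    def close(self) -> None: ...


trainer = None
try:
    trainer = Trainer()
    trainer.train()
except Exception:
    # Local cleanup only, then re-raise: a failing rank must not enter
    # destroy_process_group while other ranks may still be inside collectives.
    if trainer:
        trainer.close()
    raise
else:
    # Reached only when training finished cleanly on this rank, so the
    # process-group teardown cannot hang waiting on a crashed peer.
    trainer.close()
    # Guard kept only so this standalone sketch runs without distributed init;
    # the commit drops it, presumably because the trainer always initializes
    # the process group before reaching this point.
    if dist.is_initialized():
        dist.destroy_process_group()
    logger.info("Process group destroyed.")
```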

2 files changed: +12 -10 lines changed


torchtitan/experiments/flux/train.py

Lines changed: 6 additions & 5 deletions
```diff
@@ -225,10 +225,11 @@ def train_step(
             logger.info("Created seed checkpoint")
         else:
             trainer.train()
-    finally:
+    except Exception:
         if trainer:
             trainer.close()
-
-        if torch.distributed.is_initialized():
-            torch.distributed.destroy_process_group()
-            logger.info("Process group destroyed.")
+        raise
+    else:
+        trainer.close()
+        torch.distributed.destroy_process_group()
+        logger.info("Process group destroyed.")
```

torchtitan/train.py

Lines changed: 6 additions & 5 deletions
```diff
@@ -556,10 +556,11 @@ def close(self) -> None:
             logger.info("Created seed checkpoint")
         else:
             trainer.train()
-    finally:
+    except Exception:
         if trainer:
             trainer.close()
-
-        if torch.distributed.is_initialized():
-            torch.distributed.destroy_process_group()
-            logger.info("Process group destroyed.")
+        raise
+    else:
+        trainer.close()
+        torch.distributed.destroy_process_group()
+        logger.info("Process group destroyed.")
```
