Skip to content

Commit 74d3fbd

Browse files
committed
add TP v1
1 parent 4244c16 commit 74d3fbd

File tree

4 files changed

+8
-9
lines changed

4 files changed

+8
-9
lines changed

torchtitan/models/deepseek_v3/infra/parallelize.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,10 @@ def parallelize_deepseekv3(
3232

3333
if parallel_dims.tp_enabled:
3434
if job_config.parallelism.enable_async_tensor_parallel:
35+
# TODO(jianiw): Test and enable async TP for deepseekv3 once torch.compile, which async TP requires, is supported.
3536
raise NotImplementedError(
36-
"Currently, async TP is not tested for deepseekv3"
37+
"Currently, async TP is not tested for deepseekv3. \
38+
torch.compile is not supported yet, which is required for async TP."
3739
)
3840

3941
enable_float8_linear = "float8" in job_config.model.converters
@@ -44,15 +46,11 @@ def parallelize_deepseekv3(
4446

4547
enable_float8_tensorwise_tp = enable_float8_linear and not float8_is_rowwise
4648
if enable_float8_tensorwise_tp:
49+
# TODO(jianiw): Test float8 tensorwise TP for deepseekv3, then enable this branch.
4750
raise NotImplementedError(
4851
"Currently, float8 tensorwise TP is not tested for deepseekv3"
4952
)
5053

51-
if parallel_dims.loss_parallel_enabled:
52-
raise NotImplementedError(
53-
"Currently, loss parallel is not tested for deepseekv3"
54-
)
55-
5654
apply_tp(
5755
model,
5856
world_mesh["tp"],

torchtitan/models/deepseek_v3/model/model.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
# LICENSE file in the root directory of this source tree.
66

77
import math
8-
from re import I
98
from typing import Tuple
109

1110
import torch

torchtitan/models/deepseek_v3/train_configs/debug_model.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M)
5050
data_parallel_replicate_degree = 1
5151
data_parallel_shard_degree = -1
5252
fsdp_reshard_after_forward = "default" # default / never / always
53+
tensor_parallel_degree = 1
54+
enable_async_tensor_parallel = false
5355

5456
[checkpoint]
5557
enable_checkpoint = false

torchtitan/models/deepseek_v3/train_configs/deepseek_v3_16b.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ dataset = "c4" # supported datasets: c4_test (2K), c4 (177M)
4949
data_parallel_replicate_degree = 1
5050
data_parallel_shard_degree = -1
5151
fsdp_reshard_after_forward = "default" # default / never / always
52-
tensor_parallel_degree = 2
53-
disable_loss_parallel = true
52+
tensor_parallel_degree = 1
53+
enable_async_tensor_parallel = false
5454

5555
[checkpoint]
5656
enable_checkpoint = false

0 commit comments

Comments
 (0)