4 files changed (+45 -7 lines)

CODEOWNERS (new file)

+ # This is a CODEOWNERS file.
+ # Each line is a file pattern followed by one or more owners.
+
+ # These owners will be the default owners for everything in
+ # the repo. Unless a later match takes precedence,
+ # they will be requested for review when someone opens a pull request.
+ * @tianyu-l @fegin @wwwjn @wconstab
+
+ # Exclude the experiments directory by adding a pattern without owners
+ /torchtitan/experiments/
torchtitan/models/deepseek_v3/README.md

- # DeepSeek-V3 in torchtitan
+ # DeepSeek-V3 in TorchTitan

- Download tokenizer:
+ DeepSeek-V3 is a Mixture-of-Experts (MoE) transformer model with Multi-head Latent Attention (MLA) architecture.

- ```
+ ## Setup
+
+ ### Download Tokenizer
+
+ ```bash
# DeepSeek tokenizer (automatically downloads tokenizer.json and tokenizer_config.json)
python scripts/download_tokenizer.py --repo_id deepseek-ai/DeepSeek-V3
```
+
+ ## Training
+
+ ### Debug Training
+
+ ```bash
+ # Quick debug run with small model
+ CONFIG_FILE="./torchtitan/models/deepseek_v3/train_configs/debug_model.toml" ./run_train.sh
+ ```
+
+ ### Full Model Training
+
+ ```bash
+ # 16B parameter model
+ CONFIG_FILE="./torchtitan/models/deepseek_v3/train_configs/deepseek_v3_16b.toml" ./run_train.sh
+ ```
+
+
+ ## Supported Features
+ - FSDP, HSDP
+ - Activation checkpointing
+ - Tensor Parallel (TP)
+ - Expert Parallel (EP)
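These features map onto a handful of knobs in the train configs changed below. As a rough sketch (the `[parallelism]` section name, the `data_parallel_replicate_degree` key, and the `[activation_checkpoint]` table are assumptions not shown in this diff; the remaining keys appear in the hunks below):

```toml
[parallelism]                           # section name assumed; not visible in the diff hunks
data_parallel_shard_degree = -1         # FSDP sharding; -1 = shard over all remaining GPUs
data_parallel_replicate_degree = 1      # assumed key; a value > 1 combines with sharding for HSDP
fsdp_reshard_after_forward = "default"  # default / never / always
tensor_parallel_degree = 1              # Tensor Parallel (TP) degree
expert_parallel_degree = 1              # Expert Parallel (EP) degree for the MoE experts

[activation_checkpoint]                 # assumed table controlling activation checkpointing
mode = "selective"                      # e.g. "none" / "selective" / "full"
```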
torchtitan/models/deepseek_v3/train_configs/debug_model.toml

@@ -40,7 +40,7 @@ lr_min = 0.0

[training]
local_batch_size = 8
- seq_len = 2048
+ seq_len = 4096
max_norm = 1.0 # grad norm clipping
steps = 10
compile = false
@@ -52,6 +52,7 @@ data_parallel_shard_degree = -1
fsdp_reshard_after_forward = "default" # default / never / always
tensor_parallel_degree = 1
enable_async_tensor_parallel = false
+ expert_parallel_degree = 1

[checkpoint]
enable_checkpoint = false
torchtitan/models/deepseek_v3/train_configs/deepseek_v3_16b.toml

@@ -38,8 +38,8 @@ decay_type = "linear"
lr_min = 0.0

[training]
- local_batch_size = 16
- seq_len = 2048
+ local_batch_size = 8
+ seq_len = 4096
max_norm = 1.0 # grad norm clipping
steps = 100
compile = false
@@ -51,7 +51,7 @@ data_parallel_shard_degree = -1
fsdp_reshard_after_forward = "default" # default / never / always
tensor_parallel_degree = 1
enable_async_tensor_parallel = false
- expert_parallel_degree = 2
+ expert_parallel_degree = 1

[checkpoint]
enable_checkpoint = false
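Read together, the 16B config's updated settings look roughly like this (a sketch assembled from the hunks above; the section name for the parallel-degree keys is not shown in the diff and is assumed):

```toml
[training]
local_batch_size = 8     # batch size per data-parallel rank
seq_len = 4096           # training sequence length
max_norm = 1.0           # grad norm clipping
steps = 100

[parallelism]                    # assumed section name
data_parallel_shard_degree = -1  # FSDP over all remaining GPUs
tensor_parallel_degree = 1
expert_parallel_degree = 1       # degree 1 means expert parallelism is off by default
```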