
Commit 2d7f2ac

add README

1 parent 9a66467

4 files changed (+45, -7 lines)

.github/CODEOWNERS

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+# This is a CODEOWNERS file.
+# Each line is a file pattern followed by one or more owners.
+
+# These owners will be the default owners for everything in
+# the repo. Unless a later match takes precedence,
+# they will be requested for review when someone opens a pull request.
+* @tianyu-l @fegin @wwwjn @wconstab
+
+# Exclude the experiments directory by adding a pattern without owners
+/torchtitan/experiments/
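
A CODEOWNERS syntax error can silently disable matching rules, so it is worth validating the file after a change like this. A minimal sketch, assuming the `gh` CLI is installed and authenticated; GitHub's REST API exposes a "list CODEOWNERS errors" endpoint:

```bash
# Sketch: list CODEOWNERS syntax errors via the GitHub REST API.
# Assumes the `gh` CLI is authenticated and the target repo is
# pytorch/torchtitan (repo name is an assumption, not in this commit).
gh api repos/pytorch/torchtitan/codeowners/errors
```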
torchtitan/models/deepseek_v3/README.md

Lines changed: 30 additions & 3 deletions
@@ -1,8 +1,35 @@
-# DeepSeek-V3 in torchtitan
+# DeepSeek-V3 in TorchTitan
 
-Download tokenizer:
+DeepSeek-V3 is a Mixture-of-Experts (MoE) transformer model with Multi-head Latent Attention (MLA) architecture.
 
-```
+## Setup
+
+### Download Tokenizer
+
+```bash
 # DeepSeek tokenizer (automatically downloads tokenizer.json and tokenizer_config.json)
 python scripts/download_tokenizer.py --repo_id deepseek-ai/DeepSeek-V3
 ```
+
+## Training
+
+### Debug Training
+
+```bash
+# Quick debug run with small model
+CONFIG_FILE="./torchtitan/models/deepseek_v3/train_configs/debug_model.toml" ./run_train.sh
+```
+
+### Full Model Training
+
+```bash
+# 16B parameter model
+CONFIG_FILE="./torchtitan/models/deepseek_v3/train_configs/deepseek_v3_16b.toml" ./run_train.sh
+```
+
+
+## Supported Features
+- FSDP, HSDP
+- Activation checkpointing
+- Tensor Parallel (TP)
+- Expert Parallel (EP)
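
The training commands in the new README use the config files as-is. torchtitan configs can generally also be adjusted from the command line; a minimal sketch, assuming run_train.sh forwards extra arguments as `--<section>.<option>` overrides matching the TOML sections (the override convention is an assumption, not shown in this commit):

```bash
# Sketch: shorten the debug run and use a smaller sequence length,
# assuming run_train.sh forwards --<section>.<option> overrides.
CONFIG_FILE="./torchtitan/models/deepseek_v3/train_configs/debug_model.toml" \
  ./run_train.sh --training.steps 20 --training.seq_len 2048
```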

torchtitan/models/deepseek_v3/train_configs/debug_model.toml

Lines changed: 2 additions & 1 deletion
@@ -40,7 +40,7 @@ lr_min = 0.0
 
 [training]
 local_batch_size = 8
-seq_len = 2048
+seq_len = 4096
 max_norm = 1.0 # grad norm clipping
 steps = 10
 compile = false
@@ -52,6 +52,7 @@ data_parallel_shard_degree = -1
 fsdp_reshard_after_forward = "default" # default / never / always
 tensor_parallel_degree = 1
 enable_async_tensor_parallel = false
+expert_parallel_degree = 1
 
 [checkpoint]
 enable_checkpoint = false
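
The newly added expert_parallel_degree knob defaults to 1 here, i.e. expert parallelism disabled for the debug config. A sketch of turning it on for a multi-GPU debug run, assuming run_train.sh honors an NGPU environment variable and the flag name mirrors the [parallelism] section (both launcher conventions are assumptions, not shown in this diff):

```bash
# Sketch: 4-GPU debug run with 2-way expert parallelism.
# NGPU and --parallelism.expert_parallel_degree are assumed launcher
# conventions; adjust to your torchtitan version.
NGPU=4 CONFIG_FILE="./torchtitan/models/deepseek_v3/train_configs/debug_model.toml" \
  ./run_train.sh --parallelism.expert_parallel_degree 2
```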

torchtitan/models/deepseek_v3/train_configs/deepseek_v3_16b.toml

Lines changed: 3 additions & 3 deletions
@@ -38,8 +38,8 @@ decay_type = "linear"
 lr_min = 0.0
 
 [training]
-local_batch_size = 16
-seq_len = 2048
+local_batch_size = 8
+seq_len = 4096
 max_norm = 1.0 # grad norm clipping
 steps = 100
 compile = false
@@ -51,7 +51,7 @@ data_parallel_shard_degree = -1
 fsdp_reshard_after_forward = "default" # default / never / always
 tensor_parallel_degree = 1
 enable_async_tensor_parallel = false
-expert_parallel_degree = 2
+expert_parallel_degree = 1
 
 [checkpoint]
 enable_checkpoint = false
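
Both configs ship with enable_checkpoint = false. A sketch of enabling checkpointing for the 16B run without editing the TOML, under the same assumed `--<section>.<option>` override convention (the boolean flag form is an assumption about the CLI, not confirmed by this commit):

```bash
# Sketch: enable checkpointing for the 16B run via a CLI override.
# The boolean --checkpoint.enable_checkpoint flag form is assumed.
CONFIG_FILE="./torchtitan/models/deepseek_v3/train_configs/deepseek_v3_16b.toml" \
  ./run_train.sh --checkpoint.enable_checkpoint
```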
