4 changes: 0 additions & 4 deletions .github/workflows/lint_check.yaml
@@ -19,24 +19,20 @@ jobs:
pip install flake8==v3.8.4
FLAKE_DISABLE_LIST="F403,F405,W504,W503,E203"
flake8 --max-line-length=120 --ignore=$FLAKE_DISABLE_LIST --exclude=./internlm/model/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py ./internlm/*
-flake8 --max-line-length=120 --ignore=$FLAKE_DISABLE_LIST ./train.py

- name: lint-isort
run: |
pip install isort==5.12.0
isort --check --profile=black ./internlm/*
-isort --check --profile=black ./train.py

- name: lint-black
run: |
pip install black==22.8.0
BLACK_EXCLUDE_SETTINGS='\.venv/|\.local/|\.cache/|\.git/'
black --line-length=120 --check --exclude $BLACK_EXCLUDE_SETTINGS ./internlm/*
-black --line-length=120 --check --exclude $BLACK_EXCLUDE_SETTINGS ./train.py

- name: lint-pylint
run: |
pip install pylint==v2.17.2
PYLINT_DISABLE_LIST="C0114,C0415,W0212,W0235,W0238,W0621,C0103,R1735,C2801,E0402,C0412,W0719,R1728,W1514,W0718,W0105,W0707,C0209,W0703,W1203"
pylint --rcfile .pylintrc --disable=$PYLINT_DISABLE_LIST --ignore=./internlm/model/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py ./internlm/*
-pylint --rcfile .pylintrc --disable=$PYLINT_DISABLE_LIST ./train.py
2 changes: 1 addition & 1 deletion ci_scripts/train/generate_config.py
@@ -5,7 +5,7 @@
import os

from ci_scripts.common import com_func
-from internlm.core.context import Config
+from internlm.core.context.parallel_context import Config


def generate_new_config(config_py_file, test_config_json, case_name):
3 changes: 1 addition & 2 deletions configs/1.8B_MoE16_sft.py
@@ -170,7 +170,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
@@ -197,7 +196,7 @@
2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
"""
parallel = dict(
-zero1=dict(size=-1, fsdp=False),
+zero1=dict(size=-1),
tensor=dict(size=1, mode="mtp"),
pipeline=dict(size=1, interleaved_overlap=True),
weight=dict(size=1, overlap=True),
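A quick recap, in code, of how the `parallel` block reads once the `fsdp` flag is gone: a minimal sketch restating the zero1 semantics from the docstring above. The values are the defaults shown in this diff; nothing here is new API.

```python
# Sketch of the updated parallel config for configs/1.8B_MoE16_sft.py (fsdp dropped from zero1).
# zero1.size semantics, restated from the docstring above:
#   size <= 0                  -> the zero1 group spans the whole data-parallel world size
#   size == 1                  -> zero is not used; every dp group keeps the full model parameters
#   1 < size <= dp world size  -> the zero1 group is a subset of the dp group
parallel = dict(
    zero1=dict(size=-1),
    tensor=dict(size=1, mode="mtp"),
    pipeline=dict(size=1, interleaved_overlap=True),
    weight=dict(size=1, overlap=True),
)
```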
3 changes: 1 addition & 2 deletions configs/57B_qwen2_MoE.py
@@ -175,7 +175,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
@@ -202,7 +201,7 @@
2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
"""
parallel = dict(
-zero1=dict(size=-1, fsdp=False),
+zero1=dict(size=-1),
tensor=dict(size=1, mode="mtp"),
pipeline=dict(size=1, interleaved_overlap=True),
weight=dict(size=1, overlap=True),
3 changes: 1 addition & 2 deletions configs/7B_MoE4_sft.py
@@ -182,7 +182,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
@@ -217,7 +216,7 @@
4. forward_overlap_per: str, all gather prefetch granularity, per 'module' or per 'layer', defaults to 'layer'.
"""
parallel = dict(
-zero1=dict(size=-1, fsdp=False),
+zero1=dict(size=-1),
tensor=dict(size=1, mode="mtp"),
pipeline=dict(size=1, interleaved_overlap=True),
weight=dict(size=1, overlap=True, launch_allgather_before="wo", forward_overlap_per="layer"),
1 change: 0 additions & 1 deletion configs/7B_baichuan2.py
@@ -165,7 +165,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
1 change: 0 additions & 1 deletion configs/7B_gemma.py
@@ -172,7 +172,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
1 change: 0 additions & 1 deletion configs/7B_internlm2.py
@@ -174,7 +174,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
1 change: 0 additions & 1 deletion configs/7B_isp_sft.py
@@ -187,7 +187,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
1 change: 0 additions & 1 deletion configs/7B_llama2.py
@@ -164,7 +164,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
1 change: 0 additions & 1 deletion configs/7B_qwen2.py
@@ -172,7 +172,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
3 changes: 1 addition & 2 deletions configs/7B_sft.py
@@ -155,7 +155,7 @@
dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
-use_flash_attn=True,
+use_flash_attn=False,
# Whether the odd and even columns of the query and key in the model are normally interleaved.
# If it's True, the model's odd and even columns are normally ordered; if it's False,
# it means that the model has prematurely concatenated all odd columns and even columns in front
@@ -174,7 +174,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
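Because this hunk flips the `use_flash_attn` default in configs/7B_sft.py from True to False, a user who still wants flash attention has to opt back in. A hedged sketch, showing only the model fields visible in this hunk (the rest of the dict is unchanged):

```python
# Sketch: opting back into flash attention after the default change above.
# Only the fields shown in this hunk appear here; keep the remaining model fields as in the config.
model = dict(
    dtype="torch.bfloat16",
    norm_type="rmsnorm",
    layer_norm_epsilon=1e-5,
    use_flash_attn=True,  # the shipped default in configs/7B_sft.py is now False
)
```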
3 changes: 1 addition & 2 deletions configs/8x22B_mixtral.py
@@ -176,7 +176,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
@@ -203,7 +202,7 @@
2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
"""
parallel = dict(
-zero1=dict(size=-1, fsdp=False),
+zero1=dict(size=-1),
tensor=dict(size=1, mode="mtp"),
pipeline=dict(size=1, interleaved_overlap=True),
weight=dict(size=1, overlap=True),
3 changes: 1 addition & 2 deletions configs/8x7B_mixtral.py
@@ -176,7 +176,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
@@ -203,7 +202,7 @@
2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
"""
parallel = dict(
-zero1=dict(size=-1, fsdp=False),
+zero1=dict(size=-1),
tensor=dict(size=1, mode="mtp"),
pipeline=dict(size=1, interleaved_overlap=True),
weight=dict(size=1, overlap=True),
1 change: 0 additions & 1 deletion configs/_base_/models/internlm2_1B.py
@@ -51,7 +51,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
1 change: 0 additions & 1 deletion configs/_base_/models/internlm2_20B.py
@@ -48,7 +48,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
1 change: 0 additions & 1 deletion configs/_base_/models/internlm2_7B.py
@@ -48,7 +48,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
1 change: 0 additions & 1 deletion configs/_base_/models/internlm_20B.py
@@ -43,7 +43,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
1 change: 0 additions & 1 deletion configs/_base_/models/internlm_7B.py
@@ -43,7 +43,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
2 changes: 1 addition & 1 deletion doc/code-docs/source/initialize.rst
@@ -43,7 +43,7 @@ InternEvo uses `argparse <https://docs.python.org/3/library/argparse.html>`_
Model initialization
-------------------------

-.. autofunction:: internlm.train.initialize_model
+.. autofunction:: internlm.train.initialize_model_and_parallel_communicator

InternEvo uses the fields ``model_type`` and ``model`` in the configuration file to control the model initialization process. An example model initialization configuration is defined as follows:

2 changes: 1 addition & 1 deletion doc/code-docs/source/training.rst
@@ -27,7 +27,7 @@
- Initialize the model
.. code-block:: python

-model = initialize_model()
+model = initialize_model_and_parallel_communicator()

For a detailed introduction, please refer to: `Model initialization <https://internevo.readthedocs.io/zh-cn/latest/initialize.html#internlm-model-init>`_

2 changes: 1 addition & 1 deletion doc/en/train_performance.md
@@ -121,7 +121,7 @@ model = dict(
)

parallel = dict(
-zero1=dict(size=8, fsdp=False),
+zero1=dict(size=8),
tensor=1,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
2 changes: 1 addition & 1 deletion doc/train_performance.md
@@ -117,7 +117,7 @@ model = dict(
)

parallel = dict(
-zero1=dict(size=8, fsdp=False),
+zero1=dict(size=8),
tensor=1,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
2 changes: 0 additions & 2 deletions doc/usage.md
@@ -268,7 +268,6 @@ zero1 parallel (dict):
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
@@ -432,7 +431,6 @@ parallel = dict(
- When `zero1 <= 0`, the size of the zero1 process group equals that of the data parallel process group, so the optimizer state parameters are split within the data parallel range
- When `zero1 == 1`, zero1 is not used, and all data parallel groups keep the full optimizer state parameters
- When `zero1 > 1` and `zero1 <= data_parallel_world_size`, the zero1 process group is a subset of the data parallel process group
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
- tensor (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be one of ['mtp', 'msp', 'fsp', 'isp'],
7 changes: 3 additions & 4 deletions generate.py
@@ -16,12 +16,12 @@

from internlm.accelerator import get_accelerator
from internlm.apis.inference import SequenceGenerator
-from internlm.core.context import global_context as gpc
+from internlm.core.context.parallel_context import global_context as gpc
from internlm.data import build_generation_loader_with_data_type
from internlm.initialize import initialize_distributed_env
from internlm.monitor import initialize_monitor_manager
from internlm.monitor.monitor import monitor_manager as mm
-from internlm.train import initialize_model, initialize_parallel_communicator
+from internlm.train import initialize_model_and_parallel_communicator
from internlm.utils.common import (
enable_pytorch_expandable_segments,
launch_time,
@@ -106,8 +106,7 @@ def main():
raise e

# initialize model
-model = initialize_model()
-_ = initialize_parallel_communicator(model)
+model, _ = initialize_model_and_parallel_communicator()
model = model.model

state_dict = merge_pp_within_tp(generation_config.ckpt_folder, del_model_prefix=True)
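Together with the doc updates above, the caller-side migration is small. A sketch, assuming the new function returns the model and the communicator in that order, as the `model, _ = ...` usage in generate.py suggests; the variable name `communicator` is just a placeholder:

```python
# Before this PR (the removed lines above):
#   from internlm.train import initialize_model, initialize_parallel_communicator
#   model = initialize_model()
#   _ = initialize_parallel_communicator(model)

# After this PR: a single combined call, per the added lines above.
from internlm.train import initialize_model_and_parallel_communicator

model, communicator = initialize_model_and_parallel_communicator()
```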
9 changes: 0 additions & 9 deletions internlm/__init__.py
@@ -1,9 +0,0 @@
-from .initialize.initialize_trainer import initialize_trainer
-from .initialize.launch import get_default_parser, launch_from_slurm, launch_from_torch
-
-__all__ = [
-"get_default_parser",
-"initialize_trainer",
-"launch_from_slurm",
-"launch_from_torch",
-]
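With these package-level re-exports deleted, code that relied on `from internlm import launch_from_torch` and friends presumably has to import from the defining modules instead. A sketch using the module paths visible in the deleted lines (the replacement paths are an assumption, not something this diff shows):

```python
# Assumed replacement imports, taken from the module paths in the deleted __init__.py above.
from internlm.initialize.initialize_trainer import initialize_trainer
from internlm.initialize.launch import (
    get_default_parser,
    launch_from_slurm,
    launch_from_torch,
)
```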
2 changes: 1 addition & 1 deletion internlm/apis/inference.py
@@ -7,7 +7,7 @@

from internlm.apis import InferenceParams, process_parallel_output
from internlm.core.context import ParallelMode # noqa: E402
-from internlm.core.context import global_context as gpc  # noqa: E402
+from internlm.core.context.parallel_context import global_context as gpc  # noqa: E402
from internlm.core.trainer import Trainer

__all__ = ["SequenceGenerator"]
2 changes: 1 addition & 1 deletion internlm/apis/inference_utils.py
@@ -1,7 +1,7 @@
import torch

from internlm.core.context import ParallelMode # noqa: E402
-from internlm.core.context import global_context as gpc  # noqa: E402
+from internlm.core.context.parallel_context import global_context as gpc  # noqa: E402
from internlm.core.parallel.comm.utils import _gather as gather


7 changes: 5 additions & 2 deletions internlm/checkpoint/checkpoint_manager.py
@@ -9,7 +9,7 @@

from internlm.accelerator import get_accelerator
from internlm.core.context import ParallelMode
-from internlm.core.context import global_context as gpc
+from internlm.core.context.parallel_context import global_context as gpc
from internlm.core.trainer import TrainState
from internlm.initialize.launch import get_config_value
from internlm.initialize.legacy.launch import (
@@ -23,6 +23,7 @@
from internlm.utils.common import get_current_device
from internlm.utils.logger import get_logger
from internlm.utils.megatron_timers import megatron_timer as timer
+from internlm.utils.parallel import is_using_fsdp, is_using_hf
from internlm.utils.storage_manager import (
get_storage_manager,
init_storage_manager,
@@ -271,7 +272,7 @@ def __init__(
self.storage_manager = get_storage_manager()
self.snapshot_counter = -1

-if hasattr(model, "model"):
+if hasattr(model, "model") and not is_using_fsdp():
model = model.model

self.model = model
@@ -575,6 +576,8 @@ def try_resume_training(self, train_state: TrainState, current_time=""):
f"tp={gpc.get_local_rank(ParallelMode.TENSOR)},pp={gpc.get_local_rank(ParallelMode.PIPELINE)},"
f"dp={gpc.get_local_rank(ParallelMode.DATA)}==========="
)
+elif is_using_fsdp() and is_using_hf() and not self.auto_resume:
+    pass
else:
load_path = self.load_ckpt_info["path"]
load_content = self.load_ckpt_info["content"]
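The new `elif` branch in try_resume_training deliberately does nothing: under FSDP with a HuggingFace-style model and auto-resume disabled, the explicit checkpoint-load path is skipped. A reduced sketch of that gate, assuming `is_using_fsdp` and `is_using_hf` are the boolean helpers imported in the hunk at `@@ -23,6 +23,7 @@`; the helper function below is hypothetical, not part of the diff:

```python
from internlm.utils.parallel import is_using_fsdp, is_using_hf


def skips_explicit_ckpt_load(auto_resume: bool) -> bool:
    """Hypothetical restatement of the resume gate added to try_resume_training above."""
    # FSDP + HF-style model with auto-resume off: fall through without loading a checkpoint.
    return is_using_fsdp() and is_using_hf() and not auto_resume
```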