4 changes: 0 additions & 4 deletions .github/workflows/lint_check.yaml
@@ -19,24 +19,20 @@ jobs:
pip install flake8==v3.8.4
FLAKE_DISABLE_LIST="F403,F405,W504,W503,E203"
flake8 --max-line-length=120 --ignore=$FLAKE_DISABLE_LIST --exclude=./internlm/model/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py ./internlm/*
-flake8 --max-line-length=120 --ignore=$FLAKE_DISABLE_LIST ./train.py

- name: lint-isort
run: |
pip install isort==5.12.0
isort --check --profile=black ./internlm/*
-isort --check --profile=black ./train.py

- name: lint-black
run: |
pip install black==22.8.0
BLACK_EXCLUDE_SETTINGS='\.venv/|\.local/|\.cache/|\.git/'
black --line-length=120 --check --exclude $BLACK_EXCLUDE_SETTINGS ./internlm/*
-black --line-length=120 --check --exclude $BLACK_EXCLUDE_SETTINGS ./train.py

- name: lint-pylint
run: |
pip install pylint==v2.17.2
PYLINT_DISABLE_LIST="C0114,C0415,W0212,W0235,W0238,W0621,C0103,R1735,C2801,E0402,C0412,W0719,R1728,W1514,W0718,W0105,W0707,C0209,W0703,W1203"
pylint --rcfile .pylintrc --disable=$PYLINT_DISABLE_LIST --ignore=./internlm/model/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py ./internlm/*
-pylint --rcfile .pylintrc --disable=$PYLINT_DISABLE_LIST ./train.py
2 changes: 1 addition & 1 deletion ci_scripts/train/generate_config.py
@@ -5,7 +5,7 @@
import os

from ci_scripts.common import com_func
-from internlm.core.context import Config
+from internlm.core.context.parallel_context import Config


def generate_new_config(config_py_file, test_config_json, case_name):
3 changes: 1 addition & 2 deletions configs/1.8B_MoE16_sft.py
@@ -170,7 +170,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
@@ -197,7 +196,7 @@
2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
"""
parallel = dict(
-zero1=dict(size=-1, fsdp=False),
+zero1=dict(size=-1),
tensor=dict(size=1, mode="mtp"),
pipeline=dict(size=1, interleaved_overlap=True),
weight=dict(size=1, overlap=True),
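A quick recap, in code, of how the `parallel` block reads once the `fsdp` flag is gone: a minimal sketch restating the zero1 semantics from the docstring above. The values are the defaults shown in this diff; nothing here is new API.

```python
# Sketch of the updated parallel config for configs/1.8B_MoE16_sft.py (fsdp dropped from zero1).
# zero1.size semantics, restated from the docstring above:
#   size <= 0                  -> the zero1 group spans the whole data-parallel world size
#   size == 1                  -> zero is not used; every dp group keeps the full model parameters
#   1 < size <= dp world size  -> the zero1 group is a subset of the dp group
parallel = dict(
    zero1=dict(size=-1),
    tensor=dict(size=1, mode="mtp"),
    pipeline=dict(size=1, interleaved_overlap=True),
    weight=dict(size=1, overlap=True),
)
```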
3 changes: 1 addition & 2 deletions configs/57B_qwen2_MoE.py
@@ -175,7 +175,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
@@ -202,7 +201,7 @@
2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
"""
parallel = dict(
-zero1=dict(size=-1, fsdp=False),
+zero1=dict(size=-1),
tensor=dict(size=1, mode="mtp"),
pipeline=dict(size=1, interleaved_overlap=True),
weight=dict(size=1, overlap=True),
3 changes: 1 addition & 2 deletions configs/7B_MoE4_sft.py
@@ -182,7 +182,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
@@ -217,7 +216,7 @@
4. forward_overlap_per: str, all gather prefetch granularity, per 'module' or per 'layer', defaults to 'layer'.
"""
parallel = dict(
-zero1=dict(size=-1, fsdp=False),
+zero1=dict(size=-1),
tensor=dict(size=1, mode="mtp"),
pipeline=dict(size=1, interleaved_overlap=True),
weight=dict(size=1, overlap=True, launch_allgather_before="wo", forward_overlap_per="layer"),
1 change: 0 additions & 1 deletion configs/7B_baichuan2.py
@@ -165,7 +165,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
1 change: 0 additions & 1 deletion configs/7B_gemma.py
@@ -172,7 +172,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
1 change: 0 additions & 1 deletion configs/7B_internlm2.py
@@ -174,7 +174,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
1 change: 0 additions & 1 deletion configs/7B_isp_sft.py
@@ -187,7 +187,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
1 change: 0 additions & 1 deletion configs/7B_llama2.py
@@ -164,7 +164,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
1 change: 0 additions & 1 deletion configs/7B_qwen2.py
@@ -172,7 +172,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
3 changes: 1 addition & 2 deletions configs/7B_sft.py
@@ -155,7 +155,7 @@
dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
-use_flash_attn=True,
+use_flash_attn=False,
# Whether the odd and even columns of the query and key in the model are normally interleaved.
# If it's True, the model's odd and even columns are normally ordered; if it's False,
# it means that the model has prematurely concatenated all odd columns and even columns in front
@@ -174,7 +174,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
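Because this hunk flips the `use_flash_attn` default in configs/7B_sft.py from True to False, a user who still wants flash attention has to opt back in. A hedged sketch, showing only the model fields visible in this hunk (the rest of the dict is unchanged):

```python
# Sketch: opting back into flash attention after the default change above.
# Only the fields shown in this hunk appear here; keep the remaining model fields as in the config.
model = dict(
    dtype="torch.bfloat16",
    norm_type="rmsnorm",
    layer_norm_epsilon=1e-5,
    use_flash_attn=True,  # the shipped default in configs/7B_sft.py is now False
)
```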
3 changes: 1 addition & 2 deletions configs/8x22B_mixtral.py
@@ -176,7 +176,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
@@ -203,7 +202,7 @@
2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
"""
parallel = dict(
-zero1=dict(size=-1, fsdp=False),
+zero1=dict(size=-1),
tensor=dict(size=1, mode="mtp"),
pipeline=dict(size=1, interleaved_overlap=True),
weight=dict(size=1, overlap=True),
3 changes: 1 addition & 2 deletions configs/8x7B_mixtral.py
@@ -176,7 +176,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
@@ -203,7 +202,7 @@
2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
"""
parallel = dict(
-zero1=dict(size=-1, fsdp=False),
+zero1=dict(size=-1),
tensor=dict(size=1, mode="mtp"),
pipeline=dict(size=1, interleaved_overlap=True),
weight=dict(size=1, overlap=True),
1 change: 0 additions & 1 deletion configs/_base_/models/internlm2_1B.py
@@ -51,7 +51,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
1 change: 0 additions & 1 deletion configs/_base_/models/internlm2_20B.py
@@ -48,7 +48,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
1 change: 0 additions & 1 deletion configs/_base_/models/internlm2_7B.py
@@ -48,7 +48,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
1 change: 0 additions & 1 deletion configs/_base_/models/internlm_20B.py
@@ -43,7 +43,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
1 change: 0 additions & 1 deletion configs/_base_/models/internlm_7B.py
@@ -43,7 +43,6 @@
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
2 changes: 1 addition & 1 deletion doc/code-docs/source/initialize.rst
@@ -43,7 +43,7 @@ InternEvo uses `argparse <https://docs.python.org/3/library/argparse.html>`_
Model initialization
-------------------------

-.. autofunction:: internlm.train.initialize_model
+.. autofunction:: internlm.train.initialize_model_and_parallel_communicator

InternEvo uses the fields ``model_type`` and ``model`` in the configuration file to control the model initialization process. An example model initialization configuration is defined as follows:

2 changes: 1 addition & 1 deletion doc/code-docs/source/training.rst
@@ -27,7 +27,7 @@
- Initialize the model
.. code-block:: python

-model = initialize_model()
+model = initialize_model_and_parallel_communicator()

For a detailed introduction, please refer to: `Model initialization <https://internevo.readthedocs.io/zh-cn/latest/initialize.html#internlm-model-init>`_

2 changes: 1 addition & 1 deletion doc/en/train_performance.md
@@ -121,7 +121,7 @@ model = dict(
)

parallel = dict(
-zero1=dict(size=8, fsdp=False),
+zero1=dict(size=8),
tensor=1,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
2 changes: 1 addition & 1 deletion doc/train_performance.md
@@ -117,7 +117,7 @@ model = dict(
)

parallel = dict(
-zero1=dict(size=8, fsdp=False),
+zero1=dict(size=8),
tensor=1,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
2 changes: 0 additions & 2 deletions doc/usage.md
@@ -268,7 +268,6 @@ zero1 parallel (dict):
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
@@ -432,7 +431,6 @@ parallel = dict(
- When `zero1 <= 0`, the size of the zero1 process group equals that of the data parallel process group, so the optimizer state parameters are split within the data parallel range
- When `zero1 == 1`, zero1 is not used, and all data parallel groups keep the full optimizer state parameters
- When `zero1 > 1` and `zero1 <= data_parallel_world_size`, the zero1 process group is a subset of the data parallel process group
-2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
- tensor (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be one of ['mtp', 'msp', 'fsp', 'isp'],
7 changes: 3 additions & 4 deletions generate.py
@@ -16,12 +16,12 @@

from internlm.accelerator import get_accelerator
from internlm.apis.inference import SequenceGenerator
-from internlm.core.context import global_context as gpc
+from internlm.core.context.parallel_context import global_context as gpc
from internlm.data import build_generation_loader_with_data_type
from internlm.initialize import initialize_distributed_env
from internlm.monitor import initialize_monitor_manager
from internlm.monitor.monitor import monitor_manager as mm
-from internlm.train import initialize_model, initialize_parallel_communicator
+from internlm.train import initialize_model_and_parallel_communicator
from internlm.utils.common import (
enable_pytorch_expandable_segments,
launch_time,
@@ -106,8 +106,7 @@ def main():
raise e

# initialize model
-model = initialize_model()
-_ = initialize_parallel_communicator(model)
+model, _ = initialize_model_and_parallel_communicator()
model = model.model

state_dict = merge_pp_within_tp(generation_config.ckpt_folder, del_model_prefix=True)
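Together with the doc updates above, the caller-side migration is small. A sketch, assuming the new function returns the model and the communicator in that order, as the `model, _ = ...` usage in generate.py suggests; the variable name `communicator` is just a placeholder:

```python
# Before this PR (the removed lines above):
#   from internlm.train import initialize_model, initialize_parallel_communicator
#   model = initialize_model()
#   _ = initialize_parallel_communicator(model)

# After this PR: a single combined call, per the added lines above.
from internlm.train import initialize_model_and_parallel_communicator

model, communicator = initialize_model_and_parallel_communicator()
```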
9 changes: 0 additions & 9 deletions internlm/__init__.py
@@ -1,9 +0,0 @@
-from .initialize.initialize_trainer import initialize_trainer
-from .initialize.launch import get_default_parser, launch_from_slurm, launch_from_torch
-
-__all__ = [
-"get_default_parser",
-"initialize_trainer",
-"launch_from_slurm",
-"launch_from_torch",
-]
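With these package-level re-exports deleted, code that relied on `from internlm import launch_from_torch` and friends presumably has to import from the defining modules instead. A sketch using the module paths visible in the deleted lines (the replacement paths are an assumption, not something this diff shows):

```python
# Assumed replacement imports, taken from the module paths in the deleted __init__.py above.
from internlm.initialize.initialize_trainer import initialize_trainer
from internlm.initialize.launch import (
    get_default_parser,
    launch_from_slurm,
    launch_from_torch,
)
```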
2 changes: 1 addition & 1 deletion internlm/apis/inference.py
@@ -7,7 +7,7 @@

from internlm.apis import InferenceParams, process_parallel_output
from internlm.core.context import ParallelMode # noqa: E402
-from internlm.core.context import global_context as gpc  # noqa: E402
+from internlm.core.context.parallel_context import global_context as gpc  # noqa: E402
from internlm.core.trainer import Trainer

__all__ = ["SequenceGenerator"]
2 changes: 1 addition & 1 deletion internlm/apis/inference_utils.py
@@ -1,7 +1,7 @@
import torch

from internlm.core.context import ParallelMode # noqa: E402
-from internlm.core.context import global_context as gpc  # noqa: E402
+from internlm.core.context.parallel_context import global_context as gpc  # noqa: E402
from internlm.core.parallel.comm.utils import _gather as gather


7 changes: 5 additions & 2 deletions internlm/checkpoint/checkpoint_manager.py
@@ -9,7 +9,7 @@

from internlm.accelerator import get_accelerator
from internlm.core.context import ParallelMode
-from internlm.core.context import global_context as gpc
+from internlm.core.context.parallel_context import global_context as gpc
from internlm.core.trainer import TrainState
from internlm.initialize.launch import get_config_value
from internlm.initialize.legacy.launch import (
@@ -23,6 +23,7 @@
from internlm.utils.common import get_current_device
from internlm.utils.logger import get_logger
from internlm.utils.megatron_timers import megatron_timer as timer
+from internlm.utils.parallel import is_using_fsdp, is_using_hf
from internlm.utils.storage_manager import (
get_storage_manager,
init_storage_manager,
@@ -271,7 +272,7 @@ def __init__(
self.storage_manager = get_storage_manager()
self.snapshot_counter = -1

-if hasattr(model, "model"):
+if hasattr(model, "model") and not is_using_fsdp():
model = model.model

self.model = model
@@ -575,6 +576,8 @@ def try_resume_training(self, train_state: TrainState, current_time=""):
f"tp={gpc.get_local_rank(ParallelMode.TENSOR)},pp={gpc.get_local_rank(ParallelMode.PIPELINE)},"
f"dp={gpc.get_local_rank(ParallelMode.DATA)}==========="
)
+elif is_using_fsdp() and is_using_hf() and not self.auto_resume:
+    pass
else:
load_path = self.load_ckpt_info["path"]
load_content = self.load_ckpt_info["content"]
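The new `elif` branch in try_resume_training deliberately does nothing: under FSDP with a HuggingFace-style model and auto-resume disabled, the explicit checkpoint-load path is skipped. A reduced sketch of that gate, assuming `is_using_fsdp` and `is_using_hf` are the boolean helpers imported in the hunk at `@@ -23,6 +23,7 @@`; the helper function below is hypothetical, not part of the diff:

```python
from internlm.utils.parallel import is_using_fsdp, is_using_hf


def skips_explicit_ckpt_load(auto_resume: bool) -> bool:
    """Hypothetical restatement of the resume gate added to try_resume_training above."""
    # FSDP + HF-style model with auto-resume off: fall through without loading a checkpoint.
    return is_using_fsdp() and is_using_hf() and not auto_resume
```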