
Commit d9906fa

fix sync parallel and support low_cpu_mem_usage (#1814)
1 parent e6924bb commit d9906fa

File tree

21 files changed: +490 -25 lines

.github/pylint.conf

Lines changed: 2 additions & 1 deletion
```diff
@@ -216,7 +216,8 @@ disable=raw-checker-failed,
         consider-using-generator,
         fixme,
         use-a-generator,
-        nested-min-max
+        nested-min-max,
+        method-hidden

 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
```

llm/inference/chatglm3/cli_demo.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,9 +1,9 @@
 import os
 import platform
-from mindnlp.transformers import ChatGLM3Tokenizer, ChatGLM3ForConditionalGeneration
+from mindnlp.transformers import ChatGLM3Tokenizer, AutoModelForCausalLM

 tokenizer = ChatGLM3Tokenizer.from_pretrained("ZhipuAI/chatglm3-6b", mirror='modelscope', revision='master')
-model = ChatGLM3ForConditionalGeneration.from_pretrained("ZhipuAI/chatglm3-6b", mirror='modelscope', revision='master')
+model = AutoModelForCausalLM.from_pretrained("ZhipuAI/chatglm3-6b", mirror='modelscope', revision='master')
 model = model.set_train(False)

 os_name = platform.system()
```
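The demo now loads the checkpoint through the generic `AutoModelForCausalLM` entry point instead of the model-specific class. A minimal, hedged usage sketch of the loaded objects follows (assumptions: mindnlp mirrors the HF-style `generate`/`decode` API used elsewhere in this commit; the real cli_demo.py drives an interactive chat loop instead):

```python
# Hedged sketch only: single-turn generation with the tokenizer/model loaded above.
inputs = tokenizer("Hello, please introduce yourself.", return_tensors="ms")
outputs = model.generate(**inputs, max_new_tokens=64)  # assumed HF-style generate()
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```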
Lines changed: 60 additions & 0 deletions
```python
import mindspore
from mindnlp.transformers import LlamaTokenizer, LlamaForCausalLM, StaticCache
from mindnlp.core import ops
import time

prompts = [
    "Simply put, the theory of relativity states that ",
    "My favorite all time favorite condiment is ketchup.",
]

NUM_TOKENS_TO_GENERATE = 40

model_id = 'shakechen/llama-2-7b-hf'
tokenizer = LlamaTokenizer.from_pretrained(model_id, mirror='modelscope', pad_token="</s>", padding_side="right")
model = LlamaForCausalLM.from_pretrained(model_id, mirror='modelscope', use_safetensors=False, ms_dtype=mindspore.float16)

inputs = tokenizer(prompts, return_tensors="ms", padding=True)

def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_values):
    logits = model(
        cur_token,
        position_ids=input_pos,
        cache_position=cache_position,
        past_key_values=past_key_values,
        return_dict=False,
        use_cache=True
    )[0]
    new_token = ops.argmax(logits[:, -1], dim=-1)[:, None]
    return new_token

batch_size, seq_length = inputs["input_ids"].shape
# with no_grad():
past_key_values = StaticCache(
    config=model.config, max_batch_size=2, max_cache_len=1024, dtype=model.dtype
)
cache_position = ops.arange(seq_length)
generated_ids = ops.zeros(
    batch_size, seq_length + NUM_TOKENS_TO_GENERATE + 1, dtype=mindspore.int32
)
generated_ids[:, cache_position] = inputs["input_ids"].to(mindspore.int32)

logits = model(
    **inputs, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=True
)[0]
next_token = ops.argmax(logits[:, -1], dim=-1)[:, None]
generated_ids[:, seq_length] = next_token[:, 0]

model.compile(jit_config=mindspore.JitConfig(jit_syntax_level='STRICT'))

cache_position = mindspore.tensor([seq_length + 1])
for _ in range(1, NUM_TOKENS_TO_GENERATE):
    s = time.time()
    next_token = decode_one_tokens(model, next_token, None, cache_position, past_key_values)
    t = time.time()
    print(t - s)
    generated_ids[:, cache_position] = next_token.int()
    cache_position += 1

text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
print(text)
```

llm/inference/llama3/readme.md

Lines changed: 30 additions & 0 deletions
## Run distributed (pipeline parallel)

### use msrun (recommended)

`msrun` is MindSpore's launcher for multi-process parallel execution and usually gives the best performance. You can use it with the command below:

```bash
msrun --worker_num=2 --local_worker_num=2 --master_port=8118 --join=True run_llama3_distributed.py
```

If you use an Ascend NPU with a Kunpeng CPU, bind cores to get better performance:

```bash
msrun --worker_num=2 --local_worker_num=2 --master_port=8118 --join=True --bind_core=True run_llama3_distributed.py
```

### use mpirun

`mpirun` controls several aspects of program execution in Open MPI. You can use it with the command below:

```bash
mpirun -n 2 python run_llama3_distributed.py
```

If you use an Ascend NPU with a Kunpeng CPU, bind cores to get better performance:

```bash
mpirun --bind-to numa -n 2 python run_llama3_distributed.py
```
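For reference, a minimal, hedged sketch of what a `run_llama3_distributed.py` driver might look like (assumptions: the `device_map="auto"` pipeline dispatch flag and the ModelScope checkpoint id are illustrative only; the actual script shipped with this commit may differ):

```python
# Hypothetical sketch of a pipeline-parallel inference driver for two workers.
import mindspore
from mindspore.communication import init
from mindnlp.transformers import AutoTokenizer, AutoModelForCausalLM

init()  # each worker launched by msrun/mpirun joins the communication group

model_id = "LLM-Research/Meta-Llama-3-8B-Instruct"  # assumed checkpoint id
tokenizer = AutoTokenizer.from_pretrained(model_id, mirror="modelscope")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    ms_dtype=mindspore.float16,
    mirror="modelscope",
    device_map="auto",  # assumed flag for splitting layers across the workers
)

inputs = tokenizer("Simply put, the theory of relativity states that", return_tensors="ms")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```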
Lines changed: 27 additions & 0 deletions
```python
import os
import psutil
import gc
from memory_profiler import profile
import mindspore
from mindnlp.transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "LLM-Research/Meta-Llama-3-8B-Instruct"

@profile
def test():
    tokenizer = AutoTokenizer.from_pretrained(model_id, mirror='modelscope')
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        ms_dtype=mindspore.float16,
        mirror='modelscope',
        low_cpu_mem_usage=True
    )

if __name__ == '__main__':
    a = test()
    print('A:%.2f MB' % (psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024))
    del a
    gc.collect()
    print('B:%.2f MB' % (psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024))
```
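This script profiles the new `low_cpu_mem_usage` path added by this commit: the model is first built on the meta device (see `init_empty_weights` in `mindnlp/accelerate/big_modeling.py` below) and the checkpoint weights are then loaded into it directly, so the process never holds a second, randomly initialized copy of the parameters. For comparison, a hedged sketch of the default load path (same call with the flag omitted; exact peak RSS numbers will vary by machine):

```python
# Baseline for comparison: without low_cpu_mem_usage the model is first
# materialized with freshly initialized weights and then overwritten by the
# checkpoint, so peak host memory is expected to be noticeably higher.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    ms_dtype=mindspore.float16,
    mirror='modelscope',
)
```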

mindnlp/accelerate/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,13 @@
1616
# load_checkpoint_in_model,
1717
# synchronize_rng_states,
1818
)
19+
20+
from .big_modeling import (
21+
# cpu_offload,
22+
# cpu_offload_with_hook,
23+
# disk_offload,
24+
# dispatch_model,
25+
init_empty_weights,
26+
init_on_empty,
27+
# load_checkpoint_and_dispatch,
28+
)

mindnlp/accelerate/big_modeling.py

Lines changed: 93 additions & 0 deletions
````python
"""big modeling"""
from contextlib import contextmanager
from mindspore._c_expression import Tensor as Tensor_  # pylint: disable=no-name-in-module
from mindnlp.utils.testing_utils import parse_flag_from_env
from mindnlp.core import nn

@contextmanager
def init_empty_weights(include_buffers: bool = None):
    """
    A context manager under which models are initialized with all parameters on the meta device, therefore creating an
    empty model. Useful when just initializing the model would blow the available RAM.

    Args:
        include_buffers (`bool`, *optional*):
            Whether or not to also put all buffers on the meta device while initializing.

    Example:

    ```python
    import torch.nn as nn
    from accelerate import init_empty_weights

    # Initialize a model with 100 billion parameters in no time and without using any RAM.
    with init_empty_weights():
        tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
    ```

    <Tip warning={true}>

    Any model created under this context manager has no weights. As such you can't do something like
    `model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`].
    Make sure to overwrite the default device_map param for [`load_checkpoint_and_dispatch`], otherwise dispatch is not
    called.

    </Tip>
    """
    if include_buffers is None:
        include_buffers = parse_flag_from_env("ACCELERATE_INIT_INCLUDE_BUFFERS", False)
    with init_on_empty(include_buffers=include_buffers) as f:
        yield f


@contextmanager
def init_on_empty(include_buffers: bool = None):
    """
    A context manager under which models are initialized with all parameters on the meta device.

    Args:
        include_buffers (`bool`, *optional*):
            Whether or not to also put all buffers on the meta device while initializing.

    Example:

    ```python
    from mindnlp.core import nn
    from mindnlp.accelerate import init_on_empty

    with init_on_empty():
        tst = nn.Linear(100, 100)  # parameters live on the meta device
    ```
    """
    if include_buffers is None:
        include_buffers = parse_flag_from_env("ACCELERATE_INIT_INCLUDE_BUFFERS", False)

    old_register_parameter = nn.Module.register_parameter
    if include_buffers:
        old_register_buffer = nn.Module.register_buffer

    def register_empty_parameter(module, name, param):
        old_register_parameter(module, name, param)
        if param is not None:
            kwargs = module._parameters[name].__dict__
            kwargs["requires_grad"] = param.requires_grad
            module._parameters[name].assign_value(Tensor_(shape=(), dtype=module._parameters[name].dtype))
            module._parameters[name].meta = True

    def register_empty_buffer(module, name, buffer, persistent=True):
        old_register_buffer(module, name, buffer, persistent=persistent)
        if buffer is not None:
            module._buffers[name].assign_value(Tensor_(shape=(), dtype=module._buffers[name].dtype))
            module._buffers[name].meta = True

    try:
        nn.Module.register_parameter = register_empty_parameter
        if include_buffers:
            nn.Module.register_buffer = register_empty_buffer
        yield
    finally:
        nn.Module.register_parameter = old_register_parameter
        if include_buffers:
            nn.Module.register_buffer = old_register_buffer
````
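The `low_cpu_mem_usage` path and the new `mindnlp.accelerate` exports build on this context manager. A minimal, hedged usage sketch (assumptions: `mindnlp.core.nn` provides `Sequential`/`Linear` and a `parameters()` iterator mirroring the torch API, as the docstring example suggests; the `.meta` check is illustrative):

```python
# Hedged sketch: build a large module without allocating real weight storage.
from mindnlp.accelerate import init_empty_weights
from mindnlp.core import nn

with init_empty_weights():
    # register_parameter is patched while inside the context, so every
    # parameter is replaced by an empty placeholder tensor flagged as meta.
    big = nn.Sequential(*[nn.Linear(4096, 4096) for _ in range(8)])

# Outside the context the patch is removed; the module still has no real
# weights, so a checkpoint loader must fill them before the module can run.
print(all(p.meta for p in big.parameters()))  # expected: True (assumption)
```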

mindnlp/accelerate/utils/modeling.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -10,8 +10,12 @@
 from typing import Optional, Dict, Union, List, Tuple, Set
 import mindspore
 from mindspore.communication import get_group_size, get_rank
+from mindnlp.configs import SUPPORT_ASYNC_DIST_OP
 try:
-    from mindspore.communication.comm_func import isend, irecv, broadcast
+    if SUPPORT_ASYNC_DIST_OP:
+        from mindspore.communication.comm_func import send as isend, recv as irecv, broadcast
+    else:
+        from mindspore.communication.comm_func import isend, irecv, broadcast
 except:
     from mindnlp.parallel.comm_func import isend, irecv, broadcast

```

File renamed without changes.
