Commit ac841c9

fix low_cpu_mem_usage(contguous) (#1832)
1 parent 965d658 commit ac841c9

4 files changed: +23 -5 lines changed

llm/inference/tinyllama/app_jit.py

Lines changed: 3 additions & 2 deletions
@@ -7,7 +7,7 @@
 
 if ON_ORANGE_PI:
     mindspore.set_context(
-        enable_compile_cache=True,
+        enable_graph_kernel=True,
         mode=mindspore.GRAPH_MODE,
         jit_config={
            "jit_level": "O2",
@@ -23,7 +23,8 @@
 
 model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 tokenizer = LlamaTokenizer.from_pretrained(model_id)
-model = LlamaForCausalLM.from_pretrained(model_id, ms_dtype=mindspore.float16)
+model = LlamaForCausalLM.from_pretrained(model_id, ms_dtype=mindspore.float16, low_cpu_mem_usage=True)
+model = model.npu()
 
 # quantize_cfg = w8x8(model.model.config)
 # quantize(model, cfg=quantize_cfg)
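
For context, here is the load path this diff produces, pulled out as a runnable sketch. The `mindnlp.transformers` import location is an assumption about where the example gets `LlamaForCausalLM` and `LlamaTokenizer`; everything else comes straight from the hunk above.

```python
# Sketch of app_jit.py's load path after this commit (import location assumed).
import mindspore
from mindnlp.transformers import LlamaForCausalLM, LlamaTokenizer

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = LlamaTokenizer.from_pretrained(model_id)

# low_cpu_mem_usage=True builds the model on a meta device and streams the
# checkpoint in, so host RAM holds roughly one copy of the fp16 weights
# instead of two during loading.
model = LlamaForCausalLM.from_pretrained(
    model_id,
    ms_dtype=mindspore.float16,
    low_cpu_mem_usage=True,
)

# On this path the weights land on CPU first, hence the explicit move to the
# Ascend NPU that the diff adds.
model = model.npu()
```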

llm/inference/tinyllama/readme.md

Lines changed: 9 additions & 0 deletions
@@ -23,4 +23,13 @@ We offer an easy way to interact with Tinyllama. This guide explains how to set
 
 ```bash
 sudo sync && echo 3 | sudo tee /proc/sys/vm/drop_caches
+```
+
+```bash
+export TE_PARALLEL_COMPILER=1
+export MAX_COMPILE_CORE_NUMBER=1
+export MS_BUILD_PROCESS_NUM=1
+export MAX_RUNTIME_CORE_NUMBER=1
+# if use O2
+export MS_ENABLE_IO_REUSE=1
 ```
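
The readme additions are shell exports; here is a hedged sketch of the same setup done from Python instead, under the assumption that MindSpore reads these variables at import time, so they have to be set before `import mindspore`.

```python
# Same environment setup as the readme, from Python. Assumption: these
# variables are consumed when mindspore is imported, so set them first.
import os

os.environ["TE_PARALLEL_COMPILER"] = "1"
os.environ["MAX_COMPILE_CORE_NUMBER"] = "1"
os.environ["MS_BUILD_PROCESS_NUM"] = "1"
os.environ["MAX_RUNTIME_CORE_NUMBER"] = "1"
os.environ["MS_ENABLE_IO_REUSE"] = "1"  # per the readme, only needed with jit_level "O2"

import mindspore  # noqa: E402  # imported after the env setup on purpose
```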

mindnlp/core/nn/modules/module.py

Lines changed: 1 addition & 1 deletion
@@ -689,7 +689,7 @@ def __dir__(self):
 
     def cuda(self):
        return self._apply(lambda t: t.move_to('GPU'))
-
+
    def npu(self):
        return self._apply(lambda t: t.move_to('Ascend'))
 

(The change on line 692 is whitespace-only.)
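
`cuda()` and `npu()` both delegate to `Module._apply`, which rewrites every tensor in the module tree. A minimal sketch of that pattern follows, assuming a simplified module structure; the real mindnlp `_apply` and `move_to` handle more (buffers, shared storage, etc.).

```python
# Illustrative-only sketch of the _apply pattern behind cuda()/npu();
# not mindnlp's actual implementation.
class Module:
    def __init__(self):
        self._parameters = {}  # name -> tensor-like object with move_to()
        self._modules = {}     # name -> child Module

    def _apply(self, fn):
        # Depth-first: rewrite children first, then this module's own tensors.
        for child in self._modules.values():
            child._apply(fn)
        for name, param in self._parameters.items():
            self._parameters[name] = fn(param)
        return self

    def npu(self):
        # Mirrors module.py: move every tensor to the Ascend device.
        return self._apply(lambda t: t.move_to('Ascend'))
```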

mindnlp/transformers/modeling_utils.py

Lines changed: 10 additions & 2 deletions
@@ -733,8 +733,8 @@ def _load_state_dict_into_meta_model(
         if dtype is None:
             param = param.to(old_param.dtype)
 
-        if old_param.is_contiguous():
-            param = param.contiguous()
+        # if old_param.is_contiguous():
+        #     param = param.contiguous()
 
         set_module_kwargs["value"] = param
 
@@ -2658,6 +2658,7 @@ def from_pretrained(
         use_flash_attention_2 = kwargs.pop("use_flash_attention_2", False)
 
         gguf_file = kwargs.pop("gguf_file", None)
+        gguf_path = None
 
         if token is not None and adapter_kwargs is not None and "token" not in adapter_kwargs:
             adapter_kwargs["token"] = token
@@ -3100,6 +3101,13 @@ def from_pretrained(
         else:
             loaded_state_dict_keys = list(state_dict.keys())
 
+        if gguf_path is None and (low_cpu_mem_usage or use_keep_in_fp32_modules):
+            # In case some weights need to be kept in float32 and accelerate is not installed,
+            # we later on want to take the path where state_dict is not None, that is the one
+            # that do not require accelerate.
+            state_dict = None
+
         config.name_or_path = pretrained_model_name_or_path
 
         # Instantiate model.
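
Read together, the three hunks unblock `low_cpu_mem_usage=True`: the eager `contiguous()` copy is skipped while checkpoint weights are written into the meta-initialized model, `gguf_path` is pre-initialized so the new guard is well-defined when no `gguf_file` is passed, and the guard drops the eagerly loaded `state_dict` so the loader takes the meta-model path. A hedged sketch of that dispatch follows; `choose_load_path` is a hypothetical name and the branch bodies are placeholders, not the real `modeling_utils.py` logic.

```python
# Hedged sketch of the state_dict handoff in from_pretrained; variable names
# follow the diff, the function name and branch bodies are placeholders.
def choose_load_path(state_dict, gguf_path, low_cpu_mem_usage, use_keep_in_fp32_modules):
    if gguf_path is None and (low_cpu_mem_usage or use_keep_in_fp32_modules):
        # Discard the eagerly loaded dict so weights are re-read shard by
        # shard into the meta model instead of kept as a second full copy.
        state_dict = None
    if state_dict is None:
        return "meta-model / low-memory loading path"
    return "plain in-memory state_dict path"
```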
