4 files changed, +23 −5 lines changed
@@ -7,7 +7,7 @@
 
 if ON_ORANGE_PI:
     mindspore.set_context(
-        enable_compile_cache=True,
+        enable_graph_kernel=True,
         mode=mindspore.GRAPH_MODE,
         jit_config={
             "jit_level": "O2",
@@ -23,7 +23,8 @@
 
 model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 tokenizer = LlamaTokenizer.from_pretrained(model_id)
-model = LlamaForCausalLM.from_pretrained(model_id, ms_dtype=mindspore.float16)
+model = LlamaForCausalLM.from_pretrained(model_id, ms_dtype=mindspore.float16, low_cpu_mem_usage=True)
+model = model.npu()
 
 # quantize_cfg = w8x8(model.model.config)
 # quantize(model, cfg=quantize_cfg)
@@ -23,4 +23,13 @@ We offer an easy way to interact with Tinyllama. This guide explains how to set
 
 ```bash
 sudo sync && echo 3 | sudo tee /proc/sys/vm/drop_caches
+```
+
+```bash
+export TE_PARALLEL_COMPILER=1
+export MAX_COMPILE_CORE_NUMBER=1
+export MS_BUILD_PROCESS_NUM=1
+export MAX_RUNTIME_CORE_NUMBER=1
+# if using the O2 jit level
+export MS_ENABLE_IO_REUSE=1
 ```
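The added exports cap MindSpore's operator-compilation and runtime parallelism, plausibly to keep peak memory within the Orange Pi's budget, and `MS_ENABLE_IO_REUSE` only matters when the `O2` jit level from the first file is active. If you would rather keep everything in one script, here is a sketch that sets the same variables from Python before MindSpore is imported:

```python
# Sketch: same environment as the bash block above, set before
# `import mindspore` so the compile workers inherit it.
import os

_ENV = {
    "TE_PARALLEL_COMPILER": "1",
    "MAX_COMPILE_CORE_NUMBER": "1",
    "MS_BUILD_PROCESS_NUM": "1",
    "MAX_RUNTIME_CORE_NUMBER": "1",
    "MS_ENABLE_IO_REUSE": "1",  # only needed with jit_level "O2"
}
for name, value in _ENV.items():
    os.environ.setdefault(name, value)  # don't override values set in the shell

import mindspore  # noqa: E402 -- deliberately imported after the env setup
```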
@@ -689,7 +689,7 @@ def __dir__(self):
 
     def cuda(self):
         return self._apply(lambda t: t.move_to('GPU'))
-
+
     def npu(self):
         return self._apply(lambda t: t.move_to('Ascend'))
 
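The hunk above is a whitespace-only touch-up between two sibling device helpers; `npu()` is the method the patched TinyLlama example calls. A usage sketch, assuming the `_apply`/`move_to` machinery shown here and the same assumed import path as before:

```python
import mindspore
from mindnlp.transformers import LlamaForCausalLM  # assumed import path

model = LlamaForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0", ms_dtype=mindspore.float16
)
model = model.npu()    # each parameter is remapped via t.move_to('Ascend')
# model = model.cuda() # same pattern through t.move_to('GPU')
```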
@@ -733,8 +733,8 @@ def _load_state_dict_into_meta_model(
             if dtype is None:
                 param = param.to(old_param.dtype)
 
-            if old_param.is_contiguous():
-                param = param.contiguous()
+            # if old_param.is_contiguous():
+            #     param = param.contiguous()
 
             set_module_kwargs["value"] = param
 
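Commenting out the contiguity fix-up sidesteps tensor backends that do not implement `is_contiguous()`/`contiguous()`. If the behavior should be kept where the backend does support it, a guarded variant is possible; the `hasattr` check below is a suggestion, not part of this patch:

```python
# Guarded alternative: preserve contiguity only when the tensor type
# actually exposes the torch-style methods.
if hasattr(old_param, "is_contiguous") and old_param.is_contiguous():
    param = param.contiguous()
```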
@@ -2658,6 +2658,7 @@ def from_pretrained(
         use_flash_attention_2 = kwargs.pop("use_flash_attention_2", False)
 
         gguf_file = kwargs.pop("gguf_file", None)
+        gguf_path = None
 
         if token is not None and adapter_kwargs is not None and "token" not in adapter_kwargs:
             adapter_kwargs["token"] = token
@@ -3100,6 +3101,13 @@ def from_pretrained(
         else:
             loaded_state_dict_keys = list(state_dict.keys())
 
+        if gguf_path is None and (low_cpu_mem_usage or use_keep_in_fp32_modules):
+            # In case some weights need to be kept in float32 and accelerate is not installed,
+            # we later on want to take the path where state_dict is not None, that is the one
+            # that does not require accelerate.
+            state_dict = None
+
+
         config.name_or_path = pretrained_model_name_or_path
 
         # Instantiate model.
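Initializing `gguf_path = None` up front makes the guard added here safe to evaluate when no `gguf_file` was passed; discarding the eagerly loaded `state_dict` then forces `from_pretrained` onto the checkpoint-file loading path that `low_cpu_mem_usage` expects. A condensed, illustrative reduction of that branch (not the real method signature):

```python
def _resolve_state_dict(state_dict, gguf_path, low_cpu_mem_usage, use_keep_in_fp32_modules):
    """Illustrative reduction of the patched branch: with no GGUF checkpoint
    involved, low-memory loading (or fp32-pinned modules) discards the eager
    state_dict so weights are loaded from the checkpoint files instead."""
    if gguf_path is None and (low_cpu_mem_usage or use_keep_in_fp32_modules):
        return None
    return state_dict
```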