4 files changed, +23 −5 lines changed
@@ -7,7 +7,7 @@
 
 if ON_ORANGE_PI:
     mindspore.set_context(
-        enable_compile_cache=True,
+        enable_graph_kernel=True,
         mode=mindspore.GRAPH_MODE,
         jit_config={
             "jit_level": "O2",
@@ -23,7 +23,8 @@
 
 model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 tokenizer = LlamaTokenizer.from_pretrained(model_id)
-model = LlamaForCausalLM.from_pretrained(model_id, ms_dtype=mindspore.float16)
+model = LlamaForCausalLM.from_pretrained(model_id, ms_dtype=mindspore.float16, low_cpu_mem_usage=True)
+model = model.npu()
 
 # quantize_cfg = w8x8(model.model.config)
 # quantize(model, cfg=quantize_cfg)
@@ -23,4 +23,13 @@ We offer an easy way to interact with Tinyllama. This guide explains how to set
 
 ```bash
 sudo sync && echo 3 | sudo tee /proc/sys/vm/drop_caches
+```
+
+```bash
+export TE_PARALLEL_COMPILER=1
+export MAX_COMPILE_CORE_NUMBER=1
+export MS_BUILD_PROCESS_NUM=1
+export MAX_RUNTIME_CORE_NUMBER=1
+# if using the O2 jit level
+export MS_ENABLE_IO_REUSE=1
 ```
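The added exports cap MindSpore's operator-compilation and runtime parallelism, plausibly to keep peak memory within the Orange Pi's budget, and `MS_ENABLE_IO_REUSE` only matters when the `O2` jit level from the first file is active. If you would rather keep everything in one script, here is a sketch that sets the same variables from Python before MindSpore is imported:

```python
# Sketch: same environment as the bash block above, set before
# `import mindspore` so the compile workers inherit it.
import os

_ENV = {
    "TE_PARALLEL_COMPILER": "1",
    "MAX_COMPILE_CORE_NUMBER": "1",
    "MS_BUILD_PROCESS_NUM": "1",
    "MAX_RUNTIME_CORE_NUMBER": "1",
    "MS_ENABLE_IO_REUSE": "1",  # only needed with jit_level "O2"
}
for name, value in _ENV.items():
    os.environ.setdefault(name, value)  # don't override values set in the shell

import mindspore  # noqa: E402 -- deliberately imported after the env setup
```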
@@ -689,7 +689,7 @@ def __dir__(self):
 
     def cuda(self):
         return self._apply(lambda t: t.move_to('GPU'))
-
+
     def npu(self):
         return self._apply(lambda t: t.move_to('Ascend'))
 
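The hunk above is a whitespace-only touch-up between two sibling device helpers; `npu()` is the method the patched TinyLlama example calls. A usage sketch, assuming the `_apply`/`move_to` machinery shown here and the same assumed import path as before:

```python
import mindspore
from mindnlp.transformers import LlamaForCausalLM  # assumed import path

model = LlamaForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0", ms_dtype=mindspore.float16
)
model = model.npu()    # each parameter is remapped via t.move_to('Ascend')
# model = model.cuda() # same pattern through t.move_to('GPU')
```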
@@ -733,8 +733,8 @@ def _load_state_dict_into_meta_model(
             if dtype is None:
                 param = param.to(old_param.dtype)
 
-            if old_param.is_contiguous():
-                param = param.contiguous()
+            # if old_param.is_contiguous():
+            #     param = param.contiguous()
 
             set_module_kwargs["value"] = param
 
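Commenting out the contiguity fix-up sidesteps tensor backends that do not implement `is_contiguous()`/`contiguous()`. If the behavior should be kept where the backend does support it, a guarded variant is possible; the `hasattr` check below is a suggestion, not part of this patch:

```python
# Guarded alternative: preserve contiguity only when the tensor type
# actually exposes the torch-style methods.
if hasattr(old_param, "is_contiguous") and old_param.is_contiguous():
    param = param.contiguous()
```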
@@ -2658,6 +2658,7 @@ def from_pretrained(
         use_flash_attention_2 = kwargs.pop("use_flash_attention_2", False)
 
         gguf_file = kwargs.pop("gguf_file", None)
+        gguf_path = None
 
         if token is not None and adapter_kwargs is not None and "token" not in adapter_kwargs:
             adapter_kwargs["token"] = token
@@ -3100,6 +3101,13 @@ def from_pretrained(
         else:
             loaded_state_dict_keys = list(state_dict.keys())
 
+        if gguf_path is None and (low_cpu_mem_usage or use_keep_in_fp32_modules):
+            # In case some weights need to be kept in float32 and accelerate is not installed,
+            # we later on want to take the path where state_dict is not None, that is the one
+            # that does not require accelerate.
+            state_dict = None
+
+
         config.name_or_path = pretrained_model_name_or_path
 
         # Instantiate model.
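Initializing `gguf_path = None` up front makes the guard added here safe to evaluate when no `gguf_file` was passed; discarding the eagerly loaded `state_dict` then forces `from_pretrained` onto the checkpoint-file loading path that `low_cpu_mem_usage` expects. A condensed, illustrative reduction of that branch (not the real method signature):

```python
def _resolve_state_dict(state_dict, gguf_path, low_cpu_mem_usage, use_keep_in_fp32_modules):
    """Illustrative reduction of the patched branch: with no GGUF checkpoint
    involved, low-memory loading (or fp32-pinned modules) discards the eager
    state_dict so weights are loaded from the checkpoint files instead."""
    if gguf_path is None and (low_cpu_mem_usage or use_keep_in_fp32_modules):
        return None
    return state_dict
```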