
Commit 93d7746

Support safetensors loading for layerwise (#2047)

Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
1 parent 30e803d

File tree: 4 files changed (+94 / -67 lines)

- neural_compressor/torch/algorithms/layer_wise/utils.py
- neural_compressor/torch/utils/utility.py
- neural_compressor/transformers/models/modeling_auto.py
- test/3x/torch/quantization/weight_only/test_transfomers.py


neural_compressor/torch/algorithms/layer_wise/utils.py

Lines changed: 33 additions & 59 deletions
```diff
@@ -21,13 +21,12 @@
 import os
 
 import torch
-from accelerate import init_empty_weights
 from accelerate.utils import set_module_tensor_to_device
-from transformers import AutoConfig, AutoModelForCausalLM
-from transformers.models.auto.auto_factory import _BaseAutoModelClass
+from safetensors import safe_open
 
 from neural_compressor.common import options
 from neural_compressor.torch.algorithms.weight_only.modules import INCWeightOnlyLinear
+from neural_compressor.torch.utils.utility import dowload_hf_model, load_empty_model
 
 from .load import load
 
@@ -94,59 +93,6 @@ def get_named_children(model, pre=[]):
     return module_list
 
 
-def dowload_hf_model(repo_id, cache_dir=None, repo_type=None, revision=None):  # pragma: no cover
-    """Download hugging face model from hf hub."""
-    from huggingface_hub.constants import DEFAULT_REVISION, HUGGINGFACE_HUB_CACHE
-    from huggingface_hub.file_download import REGEX_COMMIT_HASH, repo_folder_name
-    from huggingface_hub.utils import EntryNotFoundError
-
-    if cache_dir is None:
-        cache_dir = HUGGINGFACE_HUB_CACHE
-    if revision is None:
-        revision = DEFAULT_REVISION
-    if repo_type is None:
-        repo_type = "model"
-    storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
-    commit_hash = None
-    if REGEX_COMMIT_HASH.match(revision):
-        commit_hash = revision
-    else:
-        ref_path = os.path.join(storage_folder, "refs", revision)
-        if os.path.exists(ref_path):
-            with open(ref_path) as f:
-                commit_hash = f.read()
-    if storage_folder and commit_hash:
-        pointer_path = os.path.join(storage_folder, "snapshots", commit_hash)
-        if os.path.isdir(pointer_path):
-            return pointer_path
-    else:  # pragma: no cover
-        from huggingface_hub import snapshot_download
-
-        file_path = snapshot_download(repo_id)
-        return file_path
-
-
-def load_empty_model(pretrained_model_name_or_path, cls=AutoModelForCausalLM, **kwargs):  # pragma: no cover
-    """Load a empty model."""
-    is_local = os.path.isdir(pretrained_model_name_or_path)
-    if is_local:  # pragma: no cover
-        path = pretrained_model_name_or_path
-    else:
-        path = dowload_hf_model(pretrained_model_name_or_path)
-    if cls.__base__ == _BaseAutoModelClass:
-        config = AutoConfig.from_pretrained(path, **kwargs)
-        with init_empty_weights():
-            model = cls.from_config(config)
-    else:  # pragma: no cover
-        config = cls.config_class.from_pretrained(path, **kwargs)
-        with init_empty_weights():
-            model = cls(config)
-    model.tie_weights()
-    model.eval()
-    model.path = pretrained_model_name_or_path
-    return model
-
-
 def get_super_module_by_name(model, module_name):
     """Get the father module with given name of child module."""
     name_list = module_name.split(".")
```
```diff
@@ -211,6 +157,27 @@ def load_tensor(path, tensor_name=None, prefix=None):
     return state_dict
 
 
+def load_tensor_from_safetensors(path, tensor_name=None, device="cpu"):
+    """Load a tensor from safetensors file with given tensor name."""
+    with safe_open(path, framework="pt", device=device) as f:
+        value = f.get_tensor(tensor_name)
+    return value
+
+
+def load_tensor_from_safetensors_shard(
+    pretrained_model_name_or_path, tensor_name, prefix=None, device="cpu"
+):  # pragma: no cover
+    """Load tensor from shard."""
+    path = _get_path(pretrained_model_name_or_path)
+    idx_dict = json.load(open(os.path.join(path, "model.safetensors.index.json"), "r"))["weight_map"]
+    if tensor_name not in idx_dict.keys():
+        if tensor_name.replace(f"{prefix}.", "") in idx_dict.keys():
+            tensor_name = tensor_name.replace(f"{prefix}.", "")
+        else:
+            assert False, "{} not in the index.json".format(tensor_name)
+    return load_tensor_from_safetensors(os.path.join(path, idx_dict[tensor_name]), tensor_name, device)
+
+
 def _get_path(pretrained_model_name_or_path):
     is_local = os.path.isdir(pretrained_model_name_or_path)
     if is_local:  # pragma: no cover
```
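The two new helpers above are the core of the change: `safe_open` memory-maps a safetensors file and `get_tensor` materializes only the requested tensor, which is what keeps layer-wise loading cheap. A minimal standalone sketch of the same pattern (the file path and tensor name below are illustrative, not from the commit):

```python
# Minimal sketch of per-tensor safetensors loading; path and tensor
# name are illustrative examples.
from safetensors import safe_open

def read_one_tensor(path, tensor_name, device="cpu"):
    # safe_open memory-maps the file; get_tensor materializes only the
    # named tensor, so peak memory stays near a single tensor's size.
    with safe_open(path, framework="pt", device=device) as f:
        return f.get_tensor(tensor_name)

weight = read_one_tensor("model.safetensors", "model.embed_tokens.weight")
print(weight.shape)
```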
```diff
@@ -223,13 +190,14 @@ def _get_path(pretrained_model_name_or_path):
 get_path = _get_path
 
 
-def load_value(model, param_name, path):
+def load_value(model, param_name, path, device="cpu"):
     """Load the module value.
 
     Args:
         model (torch.nn.module): torch model.
         param_name (str): module name.
         path (str): path to load state_dict per layer.
+        device (str, optional): module device. Defaults to "cpu".
 
     Returns:
         tensor: the module value.
@@ -241,7 +209,13 @@ def load_value(model, param_name, path):
             if module == input_embeddings:
                 param_name = name + "." + param_name.split(".")[-1]
     prefix = model.base_model_prefix
-    if "pytorch_model.bin.index.json" in os.listdir(path):
+    files = os.listdir(path)
+    safetensors_files = [filename for filename in files if filename.endswith(".safetensors")]
+    if len(safetensors_files) == 1:
+        value = load_tensor_from_safetensors(os.path.join(path, "model.safetensors"), param_name, device=device)
+    elif len(safetensors_files) >= 2:
+        value = load_tensor_from_safetensors_shard(path, param_name, device=device)
+    elif "pytorch_model.bin.index.json" in files:
         value = load_tensor_from_shard(path, param_name, prefix)
     else:
         value = load_tensor(os.path.join(path, "pytorch_model.bin"), param_name, prefix)
```
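The dispatch order added to `load_value` prefers safetensors: exactly one `.safetensors` file is read directly as `model.safetensors`, two or more are resolved through `model.safetensors.index.json`, and otherwise the existing `pytorch_model.bin` paths are used. A standalone sketch of that decision tree (the directory layout is assumed; helper below is hypothetical):

```python
import os

def checkpoint_format(path):
    # Mirrors the branch order added to load_value: safetensors first,
    # then the legacy .bin layouts.
    files = os.listdir(path)
    safetensors_files = [f for f in files if f.endswith(".safetensors")]
    if len(safetensors_files) == 1:
        return "single safetensors file"
    if len(safetensors_files) >= 2:
        return "sharded safetensors (model.safetensors.index.json)"
    if "pytorch_model.bin.index.json" in files:
        return "sharded pytorch_model.bin"
    return "single pytorch_model.bin"
```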
```diff
@@ -260,7 +234,7 @@ def load_module(model, module_name, path, device="cpu"):
     module = get_module(model, module_name)
     for n, p in module.named_parameters():
         param_name = module_name + "." + n
-        value = load_value(model, param_name, path)
+        value = load_value(model, param_name, path, device)
         set_module_tensor_to_device(model, param_name, device, value)
```
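Threading `device` from `load_module` into `load_value` lets each parameter be materialized on the target device before `set_module_tensor_to_device` attaches it. A hypothetical end-to-end call, assuming an OPT-style checkpoint (the model id and module name are illustrative):

```python
# Hypothetical usage; "facebook/opt-125m" and the module name are examples.
from neural_compressor.torch.algorithms.layer_wise.utils import get_path, load_module
from neural_compressor.torch.utils.utility import load_empty_model

model = load_empty_model("facebook/opt-125m")  # weights stay on the meta device
path = get_path("facebook/opt-125m")           # local snapshot directory
# Materialize a single layer's parameters from the checkpoint on CPU:
load_module(model, "model.decoder.layers.0.self_attn.k_proj", path, device="cpu")
```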

neural_compressor/torch/utils/utility.py

Lines changed: 16 additions & 5 deletions
```diff
@@ -331,13 +331,24 @@ def dowload_hf_model(repo_id, cache_dir=None, repo_type=None, revision=None):
                 commit_hash = f.read()
     if storage_folder and commit_hash:
         pointer_path = os.path.join(storage_folder, "snapshots", commit_hash)
-        if os.path.isdir(pointer_path):
+        if os.path.isdir(pointer_path) and any(
+            file.endswith(".bin") or file.endswith(".safetensors") for file in os.listdir(pointer_path)
+        ):
             return pointer_path
-    else:  # pragma: no cover
-        from huggingface_hub import snapshot_download
+    from huggingface_hub import list_repo_files, snapshot_download
+
+    files_info = list_repo_files(repo_id)
+    ignore_patterns = (
+        ["*.bin", "*.bin.index.json"]
+        if (
+            any(file for file in files_info if file.endswith(".bin"))
+            and any(file for file in files_info if file.endswith(".safetensors"))
+        )
+        else None
+    )
 
-        file_path = snapshot_download(repo_id)
-        return file_path
+    file_path = snapshot_download(repo_id, ignore_patterns=ignore_patterns)
+    return file_path
 
 
 def load_empty_model(pretrained_model_name_or_path, cls=None, **kwargs):
```
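When a repo ships both `.bin` and `.safetensors` weights, the new `ignore_patterns` skips the redundant `.bin` shards so `snapshot_download` fetches roughly half the bytes. The equivalent standalone call (the repo id is illustrative):

```python
from huggingface_hub import list_repo_files, snapshot_download

repo_id = "facebook/opt-125m"  # illustrative repo id
files_info = list_repo_files(repo_id)
has_bin = any(f.endswith(".bin") for f in files_info)
has_safetensors = any(f.endswith(".safetensors") for f in files_info)
# Only skip the .bin weights when safetensors duplicates are available.
ignore_patterns = ["*.bin", "*.bin.index.json"] if (has_bin and has_safetensors) else None
local_dir = snapshot_download(repo_id, ignore_patterns=ignore_patterns)
```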

neural_compressor/transformers/models/modeling_auto.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -157,7 +157,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
             has_remote_code,
         )
 
-        model = load_empty_model(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
+        model = load_empty_model(
+            pretrained_model_name_or_path,
+            trust_remote_code=trust_remote_code,
+        )
         if use_cpu:
             quantization_config.post_init_cpu()
         elif use_xpu:
```

test/3x/torch/quantization/weight_only/test_transfomers.py

Lines changed: 41 additions & 2 deletions
```diff
@@ -122,7 +122,7 @@ def test_use_layer_wise(self):
         dummy_input = fp32_model.dummy_inputs["input_ids"]
 
         # RTN
-        # use_layer_wise=True
+        # Case1: use_layer_wise=True
         woq_config = RtnConfig(bits=4, group_size=16, use_layer_wise=True)
         woq_model = AutoModelForCausalLM.from_pretrained(
             model_name_or_path,
@@ -139,7 +139,7 @@ def test_use_layer_wise(self):
         loaded_output = loaded_model(dummy_input)[0]
         assert torch.equal(woq_output, loaded_output), "loaded output should be same. Please double check."
 
-        # use_layer_wise=False
+        # Case2: use_layer_wise=False
         woq_config = RtnConfig(bits=4, group_size=16, use_layer_wise=False)
         woq_model = AutoModelForCausalLM.from_pretrained(
             model_name_or_path,
@@ -148,6 +148,45 @@ def test_use_layer_wise(self):
         woq_output2 = woq_model(dummy_input)[0]
         assert torch.equal(woq_output, woq_output2), "use_layer_wise output should be same. Please double check."
 
+        # Case3: test safetensors model file
+        from neural_compressor.torch.algorithms.layer_wise.utils import get_path
+
+        model_path = get_path(model_name_or_path)
+        from transformers import AutoModelForCausalLM as RawAutoModelForCausalLM
+
+        ori_model = RawAutoModelForCausalLM.from_pretrained(model_name_or_path)
+        # test 1 safetensors file
+        ori_model.save_pretrained(model_path, safe_serialization=True)
+        woq_config = RtnConfig(bits=4, group_size=16, use_layer_wise=True)
+
+        woq_model = AutoModelForCausalLM.from_pretrained(
+            model_name_or_path,
+            quantization_config=woq_config,
+        )
+        woq_output_1_safetensors = woq_model(dummy_input)[0]
+        assert torch.equal(woq_output, woq_output_1_safetensors)
+
+        # test 3 safetensors files
+        ori_model.save_pretrained(model_path, safe_serialization=True, max_shard_size="250KB")
+        woq_config = RtnConfig(bits=4, group_size=16, use_layer_wise=True)
+        woq_model = AutoModelForCausalLM.from_pretrained(
+            model_name_or_path,
+            quantization_config=woq_config,
+        )
+        woq_output_3_safetensors = woq_model(dummy_input)[0]
+        assert torch.equal(woq_output, woq_output_3_safetensors)
+
+        # case4: test dowload_hf_model
+        shutil.rmtree(model_path, ignore_errors=True)
+        woq_config = RtnConfig(bits=4, group_size=16, use_layer_wise=True)
+
+        woq_model = AutoModelForCausalLM.from_pretrained(
+            model_name_or_path,
+            quantization_config=woq_config,
+        )
+        woq_output_download = woq_model(dummy_input)[0]
+        assert torch.equal(woq_output_download, woq_output)
+
     def test_loading_autoawq_model(self):
         user_model = AutoModelForCausalLM.from_pretrained(self.autoawq_model)
         tokenizer = AutoTokenizer.from_pretrained(self.autoawq_model)
```
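Case3's sharded fixture comes from `save_pretrained` with a `max_shard_size` below the total weight size, which emits several `model-0000X-of-0000N.safetensors` shards plus a `model.safetensors.index.json` weight map. A sketch of that setup in isolation (the tiny model id and output directory are illustrative):

```python
# Illustrative shard-fixture setup; the model id and output dir are examples.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-OPTForCausalLM")
# A max_shard_size below the total weight size forces multiple shards plus
# a model.safetensors.index.json mapping each tensor to its shard file.
model.save_pretrained("./sharded_ckpt", safe_serialization=True, max_shard_size="250KB")
```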
