@@ -33,7 +33,7 @@
 
 from contextlib import closing, contextmanager
 from enum import Enum
-from typing import Dict, Union, Optional, Any, OrderedDict
+from typing import Dict, Union, Optional, Any, OrderedDict, Tuple, List
 from functools import reduce
 from dataclasses import dataclass
 
@@ -46,6 +46,7 @@
 import safetensors.numpy
 from safetensors import deserialize
 
+from mindnlp.core import nn
 from mindnlp.core.nn import Parameter
 from mindnlp.configs import SUPPORT_BF16
 from .nn import Module
@@ -1548,3 +1549,95 @@ def load_checkpoint(ckpt_file_name):
                          "'filter_prefix' or 'specify_prefix' are set correctly.")
 
     return parameter_dict
+
+
+def save_model(
+    model: nn.Module, filename: str, metadata: Optional[Dict[str, str]] = None, force_contiguous: bool = True
+):
+    """
+    Saves a given model to the specified filename.
+    This method exists specifically to avoid tensor sharing issues, which are
+    not allowed in `safetensors`. [More information on tensor sharing](../torch_shared_tensors)
+
+    Args:
+        model (`nn.Module`):
+            The model to save on disk.
+        filename (`str`):
+            The filename location to save the file.
+        metadata (`Dict[str, str]`, *optional*):
+            Extra information to save along with the file.
+            Some metadata will be added for each dropped tensor.
+            This information will not be enough to recover the entire
+            shared structure, but it might help in understanding it.
+        force_contiguous (`bool`, *optional*, defaults to `True`):
+            Whether to force the state_dict to be saved with contiguous tensors.
+            This has no effect on the correctness of the model, but it
+            could change performance if a tensor's layout was chosen
+            specifically for performance reasons.
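+
+    Example (a minimal sketch; `MyModel` stands in for any `nn.Module`
+    subclass and is not part of this module):
+
+        ```python
+        model = MyModel()
+        save_model(model, "model.safetensors")
+        ```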
+    """
+    state_dict = model.state_dict()
+
+    if force_contiguous:
+        state_dict = {k: v.contiguous() for k, v in state_dict.items()}
+    try:
+        safe_save_file(state_dict, filename, metadata=metadata)
+    except ValueError as e:
+        msg = str(e)
+        msg += " Or use save_model(..., force_contiguous=True), read the docs for potential caveats."
+        raise ValueError(msg) from e
+
+
+def load_model(
+    model: nn.Module, filename: Union[str, os.PathLike], strict: bool = True
+) -> Tuple[List[str], List[str]]:
+    """
+    Loads a given file onto a model.
+    This method exists specifically to avoid tensor sharing issues, which are
+    not allowed in `safetensors`. [More information on tensor sharing](../torch_shared_tensors)
+
+    Args:
+        model (`nn.Module`):
+            The model to load onto.
+        filename (`str` or `os.PathLike`):
+            The filename location to load the file from.
+        strict (`bool`, *optional*, defaults to `True`):
+            Whether to fail if the checkpoint has missing or unexpected keys.
+            When `False`, the function simply returns the missing and unexpected names.
+
+    Returns:
+        `(missing, unexpected)`: `(List[str], List[str])`
+            `missing` are names of parameters in the model that were not
+            modified during loading.
+            `unexpected` are names of tensors present in the file but not
+            used during loading.
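+
+    Example (a minimal sketch; `MyModel` stands in for any `nn.Module`
+    subclass and is not part of this module):
+
+        ```python
+        model = MyModel()
+        missing, unexpected = load_model(model, "model.safetensors", strict=False)
+        ```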
+    """
+    state_dict = safe_load_file(filename)
+
+    missing, unexpected = model.load_state_dict(state_dict, strict=False)
+
+    if strict and (missing or unexpected):
+        missing_keys = ", ".join([f'"{k}"' for k in sorted(missing)])
+        unexpected_keys = ", ".join([f'"{k}"' for k in sorted(unexpected)])
+        error = f"Error(s) in loading state_dict for {model.__class__.__name__}:"
+        if missing:
+            error += f"\n    Missing key(s) in state_dict: {missing_keys}"
+        if unexpected:
+            error += f"\n    Unexpected key(s) in state_dict: {unexpected_keys}"
+        raise RuntimeError(error)
+    return missing, unexpected