
Commit a72d54c

fix transformers p class ut (#2082)
1 parent a1bdf39 commit a72d54c

File tree

10 files changed: +149 -13 lines changed

mindnlp/core/_tensor.py

Lines changed: 3 additions & 0 deletions
@@ -655,6 +655,9 @@ def __contains__(self, item):
 Tensor.scatter_reduce_ = ops.inplace_scatter_reduce
 StubTensor.scatter_reduce_ = ops.inplace_scatter_reduce

+Tensor.exponential_ = ops.inplace_exponential
+StubTensor.exponential_ = ops.inplace_exponential
+
 def _rebuild_from_type_v2(func, new_type, args, state):
     ret = func(*args)
     return ret
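
For reference, a minimal usage sketch of the newly bound method, assuming mindnlp.core exposes a torch-style `empty` constructor; the binding forwards to `ops.inplace_exponential` (see mindnlp/core/ops/inplace.py below).

    from mindnlp import core

    t = core.empty(4, 4)           # uninitialized tensor, shape (4, 4)
    t.exponential_(lambd=0.5)      # filled in place with Exponential(rate=0.5) samples
    print(t.mean())                # tends toward 1 / 0.5 = 2 as the tensor grows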

mindnlp/core/backends/cuda/__init__.py

Lines changed: 46 additions & 1 deletion
@@ -1,3 +1,6 @@
+import contextlib
+from typing_extensions import deprecated
+
 class cuBLASModule:
     # def __getattr__(self, name):
     #     if name == "allow_tf32":
@@ -26,4 +29,46 @@ class cuBLASModule:
     #     raise AttributeError("Unknown attribute " + name)
     pass

-matmul = cuBLASModule()
+matmul = cuBLASModule()
+
+@contextlib.contextmanager
+@deprecated(
+    (
+        "`torch.backends.cuda.sdp_kernel()` is deprecated. "
+        "In the future, this context manager will be removed. "
+        "Please see `torch.nn.attention.sdpa_kernel()` for the new context manager, "
+        "with updated signature."
+    ),
+    category=FutureWarning,
+)
+def sdp_kernel(
+    enable_flash: bool = True,
+    enable_math: bool = True,
+    enable_mem_efficient: bool = True,
+    enable_cudnn: bool = True,
+):
+    r"""
+    .. warning:: This flag is beta and subject to change.
+
+    This context manager can be used to temporarily enable or disable any of the three backends for scaled dot product attention.
+    Upon exiting the context manager, the previous state of the flags will be restored.
+    """
+    # from torch.nn.attention import sdpa_kernel
+
+    # backend_list = []
+    # if enable_flash:
+    #     backend_list.append(SDPBackend.FLASH_ATTENTION)
+    # if enable_mem_efficient:
+    #     backend_list.append(SDPBackend.EFFICIENT_ATTENTION)
+    # if enable_math:
+    #     backend_list.append(SDPBackend.MATH)
+    # if enable_cudnn:
+    #     backend_list.append(SDPBackend.CUDNN_ATTENTION)
+
+    # with sdpa_kernel(backend_list) as context:
+    #     try:
+    #         yield context
+    #     finally:
+    #         pass
+
+    pass
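
As background, `sdp_kernel` follows the standard `contextlib.contextmanager` pattern for a deprecated, no-op compatibility shim. The sketch below is illustrative only (hypothetical names, not the repository's code): a generator-based context manager must yield exactly once, and in this sketch the deprecation warning fires when the manager is entered.

    import contextlib
    import warnings

    @contextlib.contextmanager
    def legacy_flag(enabled=True):     # hypothetical stand-in for sdp_kernel
        warnings.warn("legacy_flag() is deprecated", FutureWarning)
        try:
            yield                      # hand control to the with-block body
        finally:
            pass                       # previous state would be restored here

    with legacy_flag():
        pass                           # body runs with the (no-op) flag applied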

mindnlp/core/compiler/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -10,4 +10,6 @@ def staging_specialize(*args, **kwargs):

     if fn is not None:
         return wrap_func(fn)
-    return wrap_func
+    return wrap_func
+
+def reset(): pass
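
A quick usage sketch for the new stub, assuming the module is imported via the path shown above; it is presumably kept for parity with torch.compiler.reset().

    from mindnlp.core import compiler

    compiler.reset()    # no-op stub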

mindnlp/core/nn/utils/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -2,3 +2,4 @@
 from . import parametrizations
 from .weight_norm import *
 from .clip_grad import *
+from .init import skip_init

mindnlp/core/nn/utils/init.py

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+# mypy: allow-untyped-defs
+import inspect
+
+from mindnlp import core
+
+def skip_init(module_cls, *args, **kwargs):
+    r"""
+    Given a module class object and args / kwargs, instantiate the module without initializing parameters / buffers.
+
+    This can be useful if initialization is slow or if custom initialization will
+    be performed, making the default initialization unnecessary. There are some caveats to this, due to
+    the way this function is implemented:
+
+    1. The module must accept a `device` arg in its constructor that is passed to any parameters
+       or buffers created during construction.
+
+    2. The module must not perform any computation on parameters in its constructor except
+       initialization (i.e. functions from :mod:`torch.nn.init`).
+
+    If these conditions are satisfied, the module can be instantiated with parameter / buffer values
+    uninitialized, as if having been created using :func:`torch.empty`.
+
+    Args:
+        module_cls: Class object; should be a subclass of :class:`torch.nn.Module`
+        args: args to pass to the module's constructor
+        kwargs: kwargs to pass to the module's constructor
+
+    Returns:
+        Instantiated module with uninitialized parameters / buffers
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> import torch
+        >>> m = torch.nn.utils.skip_init(torch.nn.Linear, 5, 1)
+        >>> m.weight
+        Parameter containing:
+        tensor([[0.0000e+00, 1.5846e+29, 7.8307e+00, 2.5250e-29, 1.1210e-44]],
+               requires_grad=True)
+        >>> m2 = torch.nn.utils.skip_init(torch.nn.Linear, in_features=6, out_features=1)
+        >>> m2.weight
+        Parameter containing:
+        tensor([[-1.4677e+24,  4.5915e-41,  1.4013e-45,  0.0000e+00, -1.4677e+24,
+                  4.5915e-41]], requires_grad=True)
+
+    """
+    if not issubclass(module_cls, core.nn.Module):
+        raise RuntimeError(f"Expected a Module; got {module_cls}")
+    if "device" not in inspect.signature(module_cls).parameters:
+        raise RuntimeError("Module must support a 'device' arg to skip initialization")
+
+    final_device = kwargs.pop("device", "cpu")
+    kwargs["device"] = "meta"
+    return module_cls(*args, **kwargs).to_empty(device=final_device)
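
A hedged usage sketch in the mindnlp namespace, assuming `core.nn.Linear` mirrors torch.nn.Linear closely enough to accept a `device` constructor argument (the requirement the function enforces via inspect.signature):

    from mindnlp import core

    m = core.nn.utils.skip_init(core.nn.Linear, 5, 1)
    print(m.weight.shape)    # (1, 5); values are uninitialized, as if created by core.empty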

mindnlp/core/ops/array.py

Lines changed: 2 additions & 2 deletions
@@ -36,8 +36,8 @@ def concat(tensors, dim=0, *, out=None, **kwargs):
     return cat(tensors, dim, out=out, **kwargs)

 # concatenate
-def concatenate(tensors, dim=0, out=None):
-    return cat(tensors, dim, out=out)
+def concatenate(tensors, dim=0, out=None, **kwargs):
+    return cat(tensors, dim, out=out, **kwargs)

 # conj
 def conj(input):
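
A small sketch of what the widened signature allows: extra keyword arguments are now forwarded to `cat` instead of raising a TypeError. This assumes `concatenate` is re-exported from mindnlp.core.ops and that `ones`/`zeros` follow the torch-style constructors.

    from mindnlp import core
    from mindnlp.core import ops

    a = core.ones(2, 3)
    b = core.zeros(2, 3)
    c = ops.concatenate([a, b], dim=0)    # shape (4, 3); kwargs pass straight through to cat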

mindnlp/core/ops/creation.py

Lines changed: 3 additions & 3 deletions
@@ -104,9 +104,9 @@ def arange(start=0, end=None, step=1, *, dtype=None, device=None):
     if ON_ORANGE_PI and dtype in (None, mindspore.int64):
         dtype = mindspore.int32
     if use_pyboost() and has_arange:
-        start = start.item() if isinstance(start, mindspore.Tensor) else start
-        end = end.item() if isinstance(end, mindspore.Tensor) else end
-        step = step.item() if isinstance(step, mindspore.Tensor) else step
+        start = start.item() if isinstance(start, (mindspore.Tensor, np.integer)) else start
+        end = end.item() if isinstance(end, (mindspore.Tensor, np.integer)) else end
+        step = step.item() if isinstance(step, (mindspore.Tensor, np.integer)) else step
         return mindspore.mint.arange(start, end, step, dtype=dtype)

     start = mindspore.Tensor(start) if not isinstance(start, mindspore.Tensor) else start
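
A short sketch of the case this change covers: numpy integer scalars (e.g. from shape arithmetic) are now unwrapped with .item() on the pyboost path instead of being passed through as np.integer objects. It assumes `arange` is exposed on mindnlp.core with torch-style semantics.

    import numpy as np
    from mindnlp import core

    n = np.int64(5)            # a numpy integer bound, not a Python int
    x = core.arange(n)         # now handled the same as a Python int or a Tensor bound
    print(x)                   # [0 1 2 3 4]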

mindnlp/core/ops/inplace.py

Lines changed: 22 additions & 1 deletion
@@ -137,6 +137,26 @@ def inplace_scatter_reduce(input, dim, index, src, reduce, *, include_self=True)
         reduce = "add"
     return inplace_scatter_src_reduce_op(input, dim, index, src, reduce)

+def inplace_exponential(tensor, lambd=1.0):
+    """
+    In-place exponential-distribution sampling (similar to Tensor.exponential_).
+    :param tensor: target tensor to fill
+    :param lambd: rate parameter (λ > 0)
+    :return: the modified tensor (the original tensor is overwritten)
+    """
+    assert lambd > 0, "lambd must be greater than 0"
+
+    # Draw uniform random numbers with the same shape as the target tensor
+    u = core.rand_like(tensor)
+
+    # Numerical safeguard
+    u = u.clamp(min=core.finfo(u.dtype).eps, max=1.0)
+
+    # Assign using the inverse-transform method
+    tensor.data = -core.log(1 - u) / lambd
+
+    return tensor
+
 __all__ = [
     'inplace_copy',
     'inplace_zero',
@@ -152,5 +172,6 @@ def inplace_scatter_reduce(input, dim, index, src, reduce, *, include_self=True)
     'inplace_fill_diagonal',
     'inplace_triu',
     'inplace_round',
-    'inplace_scatter_reduce'
+    'inplace_scatter_reduce',
+    'inplace_exponential'
 ]
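
The new helper is inverse-transform sampling: if U is uniform on (0, 1), then -log(1 - U) / lambd follows an Exponential(lambd) distribution with mean 1 / lambd. A quick numpy check of that identity (a standalone sketch, not repository code):

    import numpy as np

    lambd = 2.0
    u = np.random.rand(1_000_000)
    x = -np.log(1 - u) / lambd
    print(x.mean())            # approximately 1 / lambd = 0.5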

mindnlp/core/ops/other.py

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ def manual_expand(tensor, shape):


 def broadcast_to(input, *shape):
-    if isinstance(shape[0], tuple):
+    if isinstance(shape[0], (list, tuple)):
         shape = shape[0]
     if ON_ORANGE_PI and not use_pyboost():
         # return input.expand(mindspore.tensor(shape))
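
A small sketch of what the relaxed check enables: the target shape may now be passed as a Python list rather than only as a tuple or unpacked integers. It assumes `broadcast_to` and `ones` are exposed on mindnlp.core with torch-style names.

    from mindnlp import core

    x = core.ones(1, 3)
    y = core.broadcast_to(x, [4, 3])    # a list shape is normalized just like a tuple
    print(y.shape)                      # (4, 3)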

mindnlp/core/ops/random.py

Lines changed: 14 additions & 4 deletions
@@ -47,14 +47,16 @@ def multinomial(input, num_samples, replacement=False, *, generator=None):

     vals = div(log(random_uniform), input + 1e-10)
     _, samples = topk(vals, num_samples)
-
+
     return samples.astype(mindspore.int64)

 # normal
 has_normal = hasattr(mindspore.mint, 'normal')
 def normal(mean=0.0, std=1.0, size=None, *, generator=None, out=None):
     if use_pyboost() and has_normal:
-        return call_ms_func(mindspore.mint.normal, float(mean), float(std), size, generator, out=out)
+        mean = float(mean) if isinstance(mean, int) else mean
+        mean = float(std) if isinstance(std, int) else std
+        return call_ms_func(mindspore.mint.normal, mean, std, size, generator, out=out)
     if size is None:
         if isinstance(mean, mindspore.Tensor):
             size = mean.shape
@@ -90,8 +92,11 @@ def rand_like(input, *, dtype=None):
 has_randint = hasattr(mindspore.mint, 'randint')
 def randint(*args, **kwargs):
     device = kwargs.pop('device', None)
+    low = kwargs.pop('low', None)
     high = kwargs.pop('high', None)
     size = kwargs.pop('size', None)
+    if low is not None:
+        args += (low,)
     if high is not None:
         args += (high,)

@@ -112,11 +117,16 @@ def randint_like(*args, **kwargs):
 has_randn = hasattr(mindspore.mint, 'randn')
 def randn(*size, generator=None, dtype=None, **kwargs):
     size = kwargs.pop('size', size)
+    new_size = ()
+    for s in size:
+        if isinstance(s, np.integer):
+            s = s.item()
+        new_size += (s,)
     if dtype is None:
         dtype = get_default_dtype()
     if use_pyboost() and has_randn:
-        return mindspore.mint.randn(*size, generator=generator, dtype=dtype)
-    return ops.randn(*size, dtype=dtype)
+        return mindspore.mint.randn(*new_size, generator=generator, dtype=dtype)
+    return ops.randn(*new_size, dtype=dtype)

 # randn_like
 has_randn_like = hasattr(mindspore.mint, 'randn_like')
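
A usage sketch of the call patterns the randint and randn changes cover, assuming these functions are exposed on mindnlp.core with torch-style names:

    import numpy as np
    from mindnlp import core

    r = core.randint(low=0, high=10, size=(2, 2))   # `low` passed as a keyword is now forwarded
    z = core.randn(np.int64(2), np.int64(3))        # numpy integer dims are unwrapped to Python ints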
