2 changes: 1 addition & 1 deletion mindnlp/__init__.py
@@ -38,7 +38,7 @@
mindspore.set_device(os.environ.get('DEVICE_TARGET'))

# for different ascend devices
if platform.system().lower() == 'linux':
if platform.system().lower() == 'linux' and mindspore.get_context('device_target') == 'Ascend':
SOC = MSContext.get_instance().get_ascend_soc_version()
# enable vmm since only vmm can release device memory when del tensor.
if SOC != 'ascend310b':
1 change: 1 addition & 0 deletions mindnlp/core/__init__.py
@@ -36,6 +36,7 @@
preserve_format = None
legacy_contiguous_format = None
channels_last_3d = None
channels_last = None
memory_format = None

inf = float("inf")
9 changes: 9 additions & 0 deletions mindnlp/core/_apis/cpu.py
@@ -1221,3 +1221,12 @@ def logsumexp(input, dim, keepdim=False):

def bernoulli(input, generator):
return legacy.bernoulli(input, seed, offset)

def right_shift(input, other):
return legacy.right_shift(input, other)

def histc(input, bins=100, min=0, max=0):
return legacy.histogram(input, bins, float(min), float(max))

def search_sorted(sorted_sequence, values, sorter, dtype, right):
return legacy.search_sorted(sorted_sequence, values, sorter, dtype, right)
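Reviewer note: the three new CPU entries only forward to the legacy primitives. Below is a minimal pure-Python sketch of the searchsorted contract these wrappers are presumably expected to satisfy (mirroring torch.searchsorted, with `right` controlling tie handling); `search_sorted_ref` is a stand-in, not the legacy kernel.

```python
# Reference sketch only: insertion indices that keep the sequence sorted,
# with right=True placing equal values after existing entries.
import bisect

def search_sorted_ref(sorted_sequence, values, right=False):
    insert = bisect.bisect_right if right else bisect.bisect_left
    return [insert(sorted_sequence, v) for v in values]

print(search_sorted_ref([1, 3, 5, 7], [0, 3, 8]))              # [0, 1, 4]
print(search_sorted_ref([1, 3, 5, 7], [0, 3, 8], right=True))  # [0, 2, 4]
```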
58 changes: 37 additions & 21 deletions mindnlp/core/_apis/gpu.py
@@ -4,7 +4,7 @@
import mindspore
from mindspore._c_expression import _empty_instance
from mindnlp import core
from .._op_prim.cpu import legacy
from .._op_prim.gpu import legacy

try:
from mindspore._c_expression import TensorPy as Tensor_
@@ -34,6 +34,8 @@ def fill_scalar(size, fill_value, dtype):
return legacy.cast(legacy.fill_v2(size, mindspore.Tensor(fill_value)), dtype)

def fill_tensor(size, fill_value, dtype):
if dtype is None:
return legacy.fill_v2(size, mindspore.Tensor(fill_value))
return legacy.cast(legacy.fill_v2(size, fill_value), dtype)

def zeros_like(input, dtype):
@@ -123,6 +125,9 @@ def div(input, other):
return legacy.div(input, other)

def mul(input, other):
if input.dtype == core.bool:
if isinstance(other, bool) or (not isinstance(other, numbers.Number) and other.dtype == core.bool):
return bitwise_and_scalar(input, other)
return legacy.mul(input, other)
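Reviewer note: the new bool branch reroutes multiplication to bitwise AND; as I read it, this relies on the fact that for values in {False, True} the two operations agree. A quick exhaustive check of that identity:

```python
# bool * bool degenerates to logical/bitwise AND, which is why the bool
# dtype can be dispatched to bitwise_and_scalar; exhaustive check:
for a in (False, True):
    for b in (False, True):
        assert bool(a * b) == bool(a & b) == (a and b)
print("multiplication and AND agree on all boolean inputs")
```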

def reduce_all(input, axis, keepdims):
@@ -253,6 +258,11 @@ def less(input, other):
return legacy.less(input, other)

def select(condition, x, y):
if isinstance(x, numbers.Number) or x.ndim == 0:
x = fill_scalar(condition.shape, x, None)
if isinstance(y, numbers.Number) or y.ndim == 0:
y = fill_scalar(condition.shape, y, None)

return legacy.select(condition, x, y)
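Reviewer note: the new guards materialize Python scalars and 0-d tensors to the condition's shape before the kernel call; presumably the legacy GPU select requires full-shape tensor operands. A pure-Python sketch of the resulting where-style semantics, with lists standing in for tensors:

```python
# Sketch of select/where semantics once scalar branches have been broadcast
# to the condition's shape; lists stand in for tensors here.
def select_ref(condition, x, y):
    if not isinstance(x, list):          # emulate fill_scalar(condition.shape, x)
        x = [x] * len(condition)
    if not isinstance(y, list):
        y = [y] * len(condition)
    return [xv if c else yv for c, xv, yv in zip(condition, x, y)]

print(select_ref([True, False, True], 1.0, 0.0))  # [1.0, 0.0, 1.0]
```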

def round(input, decimals):
@@ -317,16 +327,15 @@ def ones_like(input, dtype):
return legacy.ones_like(input)

def embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq):
return cast(legacy.gather(weight, input, 0, 0), weight.dtype)
return legacy.gather(weight, input, 0, 0)

def linspace(start, end, steps, dtype):
start = float(start)
end = float(end)
return legacy.lin_space(mindspore.Tensor(start), mindspore.Tensor(end), steps)

def masked_fill(input, mask, value):
if input.dtype.is_floating_point and isinstance(value, numbers.Number):
value = float(value)
value = fill_scalar((), value, input.dtype)
return legacy.masked_fill(input, mask, value)

def sum(input, dim, keepdim, dtype):
@@ -388,9 +397,14 @@ def layer_norm(input, normalized_shape, weight, bias, eps=1e-5):
return legacy.layer_norm(input, weight, bias, begin_axis, begin_axis, eps)

def argmin_with_value(input, axis, keep_dims):
if axis is None:
axis = -1
return legacy.arg_min_with_value(input, axis, keep_dims)

def argmax_with_value(input, axis, keep_dims):
if axis is None:
axis = -1

return legacy.arg_max_with_value(input, axis, keep_dims)

def silu(input):
@@ -425,9 +439,13 @@ def eye(n, m, dtype):
return legacy.eye(n, m, dtype)

def argmax(input, axis, keep_dims):
if axis is None:
axis = -1
return legacy.arg_max_with_value(input, axis, keep_dims)[0]

def argmin(input, axis, keep_dims):
if axis is None:
axis = -1
return legacy.arg_min_with_value(input, axis, keep_dims)[0]

def exp(input):
@@ -489,18 +507,7 @@ def scatter(input, dim, index, src):
return legacy.tensor_scatter_elements(input, index, src, dim, "none")

def batch_norm(input, weight, bias, running_mean=None, runnning_var=None, training=False, momentum=0.1, epsilon=1e-5):
input_ndim = input.ndim
if input_ndim == 2:
return legacy.batch_norm(input, weight, bias, running_mean, runnning_var, training, epsilon, momentum, 'NCHW')
else:
input = transpose_view(input, 1, -1)
input_shape = input.shape
input = reshape(input, (-1, input.shape[-1]))
outs = legacy.batch_norm(input, weight, bias, running_mean, runnning_var, training, epsilon, momentum, 'NCHW')
out = reshape(outs[0], (*input_shape[:-1], -1))
out = transpose_view(out, 1, -1)

return out, outs[1], outs[2]
return legacy.batch_norm(input, weight, bias, running_mean, runnning_var, training, epsilon, momentum, 'NCHW')

def tanh(input):
return legacy.tanh(input)
@@ -797,25 +804,22 @@ def max_pool2d(input, kernel_size, stride=1, padding=0, dilation=1, ceil_mode=Fa
return out

def baddbmm(input, batch1, batch2, alpha=1, beta=1):
return add(mul(beta, input), mul(alpha, bmm(batch1, batch2)))
return add(mul(input, beta), mul(bmm(batch1, batch2), alpha))
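Reviewer note: swapping the operands keeps the computed value `beta * input + alpha * (batch1 @ batch2)` unchanged; the point, as I read it, is that `mul` now inspects its first argument's dtype, so the tensor has to come first rather than the Python scalar (same for `addmm` below). A tiny pure-Python check of the identity, with 2x2 matrices standing in for batched tensors:

```python
# baddbmm / addmm still compute beta * input + alpha * (a @ b); only the
# argument order handed to mul() changed. Tiny 2x2 reference check:
def matmul2x2(a, b):
    return [[sum(a[i][k] * b[k][j] for k in range(2)) for j in range(2)] for i in range(2)]

def baddbmm_ref(inp, a, b, alpha=1, beta=1):
    prod = matmul2x2(a, b)
    return [[beta * inp[i][j] + alpha * prod[i][j] for j in range(2)] for i in range(2)]

print(baddbmm_ref([[1, 1], [1, 1]], [[1, 2], [3, 4]], [[5, 6], [7, 8]], alpha=2, beta=3))
```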

def softplus(input, beta=1, threshold=20):
return legacy.softplus(input)

def gather_nd(input, indices):
return legacy.gather_nd(input, indices)

def unique_consecutive(input, return_inverse, return_counts, dim):
return legacy.unique_consecutive(input, return_inverse, return_counts, dim)

def meshgrid(input, lambd):
return legacy.meshgrid(input, lambd)

def addcmul(input, tensor1, tensor2, value=1.0):
return legacy.addcmul(input, tensor1, tensor2, mindspore.Tensor(value))

def addmm(input, mat1, mat2, alpha=1.0, beta=1.0):
return add(mul(beta, input), mul(alpha, bmm(mat1, mat2)))
return add(mul(input, beta), mul(bmm(mat1, mat2), alpha))

def im2col(input, kernel_size, dilation=1, padding=0, stride=1):
out = legacy.im2_col(input, kernel_size, stride, dilation, padding)
@@ -1101,6 +1105,8 @@ def bernoulli(input, generator):
return legacy.bernoulli(input, seed, offset)

def arange(start, end, step, dtype):
if dtype is not None:
return cast(legacy.range(start, end, step, 100000), dtype)
return legacy.range(start, end, step, 100000)

def inplace_fill_scalar(input, value):
@@ -1121,3 +1127,13 @@ def inplace_uniform(input, from_, to_, generator_):
mindspore.tensor(from_, dtype=mindspore.int32),
mindspore.tensor(to_, dtype=mindspore.int32), 0, 0)
return input.assign_value(value)

def right_shift(input, other):
return legacy.right_shift(input, other)

def inplace_fill_tensor(input, value):
input.assign_value(fill_tensor(input.shape, value, None))
return input

def search_sorted(sorted_sequence, values, sorter, dtype, right):
return legacy.search_sorted(sorted_sequence, values, sorter, dtype, right)
10 changes: 10 additions & 0 deletions mindnlp/core/_apis/npu.py
@@ -1594,3 +1594,13 @@ def bernoulli(input, generator):
def multinomial(input, num_samples, replacement, generator):
seed, offset = generator._step(12) # pylint: disable=protected-access
return pyboost.multinomial_ext_op(input, num_samples, replacement, seed, offset)

def right_shift(input, other):
if use_pyboost():
return pyboost.right_shift_op(input, other)
return legacy.right_shift(input, other)

def histc(input, bins=100, min=0, max=0):
if use_pyboost():
return pyboost.histc_ext_op(input, bins, float(min), float(max))
return legacy.histogram(input, bins, float(min), float(max))
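Reviewer note: both NPU additions follow the existing pattern in this file, preferring the pyboost kernel when available and falling back to the legacy primitive otherwise. A minimal sketch of that dispatch shape, using placeholder callables rather than the real kernels:

```python
# Stand-in sketch of the pyboost-first dispatch used throughout this file;
# pyboost_impl and legacy_impl are placeholders, not real kernels.
def dispatch(use_pyboost, pyboost_impl, legacy_impl, *args):
    return pyboost_impl(*args) if use_pyboost else legacy_impl(*args)

print(dispatch(True,  lambda x, n: f"pyboost right_shift({x}, {n})",
               lambda x, n: f"legacy right_shift({x}, {n})", 8, 1))
print(dispatch(False, lambda x, n: f"pyboost right_shift({x}, {n})",
               lambda x, n: f"legacy right_shift({x}, {n})", 8, 1))
```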
8 changes: 5 additions & 3 deletions mindnlp/core/_tensor.py
@@ -110,6 +110,7 @@ def __init__(self, *args, **kwargs):

Tensor.__init__ = __init__
origin_setitem = Tensor.__setitem__
origin_is_contiguous = Tensor.is_contiguous
Tensor._requires_grad = False

def tensor(data, *, dtype=None, device=None, requires_grad=False):
@@ -1253,7 +1254,8 @@ def hardshrink(self, lambd=0.5):


# Tensor.histc

def histc(self, bins=100, min=0, max=0):
return ops.histc(self, bins, min, max)
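Reviewer note: the new method just defers to `ops.histc`. For reference, a pure-Python sketch of the torch-style histc contract it is presumably meant to match: `bins` equal-width buckets over `[min, max]`, out-of-range values ignored, and `min == max == 0` falling back to the data range (parameters renamed `lo`/`hi` here only to avoid shadowing builtins):

```python
# Reference sketch of torch-style histc semantics; not the mindnlp implementation.
def histc_ref(data, bins=100, lo=0.0, hi=0.0):
    if lo == 0.0 and hi == 0.0:
        lo, hi = min(data), max(data)
    width = (hi - lo) / bins or 1.0   # avoid division by zero for degenerate ranges
    counts = [0] * bins
    for v in data:
        if lo <= v <= hi:
            counts[bins - 1 if v == hi else int((v - lo) / width)] += 1
    return counts

print(histc_ref([1.0, 2.0, 1.0], bins=4, lo=0.0, hi=3.0))  # [0, 2, 1, 0]
```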

# Tensor.histogram

@@ -1364,8 +1366,8 @@ def isnan(self):
return ops.isnan(self)

# Tensor.is_contiguous
# def is_contiguous(self):
# return self.is_contiguous()
def is_contiguous(self, memory_format=None):
return origin_is_contiguous(self)

# Tensor.is_complex
def is_complex(self):
15 changes: 13 additions & 2 deletions mindnlp/core/cuda/__init__.py
@@ -60,8 +60,19 @@ def __exit__(self, type: Any, value: Any, traceback: Any):
def is_bf16_supported():
return False

def mem_get_info(index):
return (1024, 1024)
def mem_get_info(device=None):
if not isinstance(device, int):
device = mindspore.context.get_context("device_id")

res = mindspore.hal.get_device_properties(device)
return (res.total_memory, res.total_memory)

def get_device_capability(device=None):
if not isinstance(device, int):
device = mindspore.context.get_context("device_id")

res = mindspore.hal.get_device_properties(device)
return (res.major, res.minor)
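Reviewer note: a hedged usage sketch, assuming a CUDA-enabled MindSpore build so `mindspore.hal.get_device_properties` is available; note that this shim reports `(total_memory, total_memory)` rather than torch's `(free, total)`:

```python
# Usage sketch; requires a CUDA-enabled MindSpore build.
from mindnlp.core import cuda

free_or_total, total = cuda.mem_get_info()    # device defaults to the current device_id
major, minor = cuda.get_device_capability(0)
print(f"total memory: {total} bytes, compute capability: {major}.{minor}")
```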

def memory_reserved(device=None):
return ms_memory_reserved()
59 changes: 46 additions & 13 deletions mindnlp/core/nn/functional.py
@@ -274,7 +274,7 @@ def pad(input, pad, mode='constant', value=None):
if isinstance(pad, tuple):
pad = tuple(p if isinstance(p, int) else p.item() for p in pad)

if input.device.type in ['cpu', 'meta'] or ON_A1:
if input.device.type in ['cpu', 'meta', 'cuda'] or ON_A1:
new_pad = ()
for idx, pad_v in enumerate(pad):
if not isinstance(pad_v, int):
@@ -301,6 +301,8 @@
value = bool(value)
elif input.dtype in [core.int32, core.int64]:
value = int(value)
if input.device.type == 'cuda' and len(new_pad) == 8:
return execute('pad_v3', input, new_pad[:-2], mode, value)
return execute('pad_v3', input, new_pad, mode, value)
out = input
if (isinstance(pad, tuple) and not pad):
@@ -324,9 +326,9 @@ def pad(input, pad, mode='constant', value=None):
return out

def nll_loss(input, target, weight=None, ignore_index=-100, reduction='mean'):
# if input.device.type == 'npu':
return _nllloss_nd(input, target, weight, ignore_index, reduction)
# return _inner_nll_loss(input, target, weight, ignore_index, reduction)
if input.device.type in ['npu', 'cpu']:
return _nllloss_nd(input, target, weight, ignore_index, reduction)
return _inner_nll_loss(input, target, weight, ignore_index, reduction)

def _inner_nll_loss(inputs, target, weight=None, ignore_index=-100, reduction='mean', label_smoothing=0.0):
ndim = inputs.ndim
@@ -352,7 +354,7 @@ def _nll_loss(inputs, target, target_dim=-1, weight=None, ignore_index=None, red
def _nll_loss(inputs, target, target_dim=-1, weight=None, ignore_index=None, reduction='none', label_smoothing=0.0):
"""nll loss inner function"""
if target.ndim == inputs.ndim - 1:
target = target.expand_dims(target_dim)
target = target.unsqueeze(target_dim)
if ignore_index is not None:
non_pad_mask = core.eq(target, ignore_index)
target = target.masked_fill(non_pad_mask, core.cast(0, target.dtype))
@@ -366,10 +368,10 @@ def _nll_loss(inputs, target, target_dim=-1, weight=None, ignore_index=None, red
weight = weight.view(weight.shape + (1,))
weighted_inputs = inputs * weight
weighted_inputs = weighted_inputs.view(orig_shape)
loss = core.neg(core.gather_d(weighted_inputs, target_dim, target))
loss = core.neg(core.gather(weighted_inputs, target_dim, target))
smooth_loss = core.neg(weighted_inputs.sum(axis=target_dim, keepdims=True))
else:
loss = core.neg(core.gather_d(inputs, target_dim, target))
loss = core.neg(core.gather(inputs, target_dim, target))
smooth_loss = core.neg(inputs.sum(axis=target_dim, keepdims=True))
loss_weights = core.ones_like(loss)

@@ -427,11 +429,42 @@ def _nllloss_nd(input, target, weight=None, ingore_index=-100, reduction='mean')
ret = execute('nllloss_2d', input, target, weight, reduction, ingore_index)[0]
return ret.view(out_size)


def cross_entropy_gpu(input, target, weight=None, ignore_index=-100, reduction='mean', label_smoothing=0.0):
class_dim = 0 if input.ndim == 1 else 1
if target.dtype.is_floating_point:
return _cross_entropy(input, target, class_dim, weight, reduction, label_smoothing)
return nll_loss(log_softmax(input, class_dim), target, weight, ignore_index, reduction)

def _cross_entropy(inputs, target, target_dim, weight=None, reduction='mean', label_smoothing=0.0):
"""cross entropy inner function"""
class_dim = 0 if inputs.ndim == 1 else 1
n_classes = inputs.shape[class_dim]
inputs = log_softmax(inputs, class_dim)
if label_smoothing > 0.0:
target = target * (1 - label_smoothing) + label_smoothing / n_classes

if weight is None:
weight = core.ones_like(inputs)
elif inputs.ndim != 1:
broadcast_shape = [1 for _ in range(inputs.ndim)]
broadcast_shape[1] = weight.shape[0]
weight = weight.reshape(broadcast_shape)

if reduction == 'mean':
return -(inputs * target * weight).sum() / (inputs.nel / n_classes)
if reduction == 'sum':
return -(inputs * target * weight).sum()
return -(inputs * target * weight).sum(class_dim)
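Reviewer note: the new GPU path dispatches on the target dtype: integer class indices go through `log_softmax` + `nll_loss`, while floating-point (soft) targets take `_cross_entropy` with optional label smoothing. A pure-Python sketch of the soft-target formula, loss = -sum((target*(1-eps) + eps/C) * log_softmax(logits)), for a single unbatched sample:

```python
# Reference sketch of the soft-target branch with label smoothing;
# one unbatched sample, not the mindnlp implementation.
import math

def log_softmax_ref(logits):
    m = max(logits)
    z = m + math.log(sum(math.exp(v - m) for v in logits))
    return [v - z for v in logits]

def soft_cross_entropy_ref(logits, target, label_smoothing=0.0):
    n_classes = len(logits)
    smoothed = [t * (1 - label_smoothing) + label_smoothing / n_classes for t in target]
    return -sum(t * lp for t, lp in zip(smoothed, log_softmax_ref(logits)))

print(soft_cross_entropy_ref([2.0, 0.5, 0.1], [1.0, 0.0, 0.0], label_smoothing=0.1))
```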


def cross_entropy(input, target, weight=None, ignore_index=-100, reduction='mean', label_smoothing=0.0):
if label_smoothing < 0.0 or label_smoothing > 1.0:
raise ValueError(f"For cross_entropy, label_smoothing must in [0, 1]")
if input.ndim == 0 or input.shape[0] == 0:
raise ValueError(f"For cross_entropy, input don't support 0-dim and shape[0].")
if input.device.type == 'cuda':
return cross_entropy_gpu(input, target, weight, ignore_index, reduction, label_smoothing)
class_dim = 0 if input.ndim == 1 else 1
n_classes = input.shape[class_dim]
input = log_softmax(input, class_dim, dtype=input.dtype)
@@ -675,10 +708,10 @@ def interpolate(input, size=None, scale_factor=None, mode='nearest', align_corne
)
if input.dim() == 4 and mode == "bicubic":
assert align_corners is not None
if antialias:
return torch._C._nn._upsample_bicubic2d_aa(
input, output_size, align_corners, scale_factors
)
# if antialias:
# return torch._C._nn._upsample_bicubic2d_aa(
# input, output_size, align_corners, scale_factors
# )
return execute(
'upsample_bicubic2d', input, output_size, scale_factors, align_corners
)
@@ -1146,8 +1179,8 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.
else:
attn_bias = attn_mask + attn_bias

attn_weight = query.float() @ key.transpose(-2, -1).float() * scale_factor
attn_weight += attn_bias.float()
attn_weight = query @ key.transpose(-2, -1) * scale_factor
attn_weight += attn_bias
attn_weight = softmax(attn_weight, dim=-1, dtype=core.float32).to(query.dtype)
attn_weight = dropout(attn_weight, dropout_p, training=True)
return attn_weight @ value
7 changes: 7 additions & 0 deletions mindnlp/core/ops/_inner.py
@@ -16,7 +16,14 @@ def npu_clear_float_status_v2(status):
def all_finite(inputs):
return execute('all_finite', inputs)

def custom_masked_scatter_vec(input, mask, source):
output = input.clone()
output[mask] = source.flatten() # the key line: vectorized assignment
return output

def masked_scatter(input, mask, source):
if input.device.type == 'cuda':
return custom_masked_scatter_vec(input, mask, source)
return execute('masked_scatter', input, mask, source)
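Reviewer note: the CUDA fallback clones the input and writes the flattened source into the positions where the mask is True, consuming source values in order. A pure-Python sketch of that contract, with lists standing in for tensors:

```python
# Sketch of masked_scatter semantics: source values are consumed in order
# and placed wherever mask is True; lists stand in for tensors.
def masked_scatter_ref(values, mask, source):
    out, it = list(values), iter(source)
    for i, m in enumerate(mask):
        if m:
            out[i] = next(it)
    return out

print(masked_scatter_ref([0, 0, 0, 0], [True, False, True, True], [7, 8, 9]))  # [7, 0, 8, 9]
```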

__all__ = [