
Commit aaaaaaa

support triton self-defined op (#1990)
1 parent 898797a commit aaaaaaa

5 files changed: 121 additions, 4 deletions


mindnlp/core/ops/creation.py

Lines changed: 11 additions & 3 deletions
```diff
@@ -118,16 +118,24 @@ def eye(n, m=None, *, dtype=None):
     return ops.eye(n, m, dtype)
 
 # empty
-def empty(*size, dtype=None):
+has_empty = hasattr(mindspore.mint, 'empty')
+def empty(*size, dtype=None, device=None):
     if isinstance(size[0], (tuple, list)):
         size = size[0]
     if dtype is None:
         dtype = get_default_dtype()
-    out = CTensor(dtype=dtype, shape=size)
+    if has_empty:
+        out = mindspore._c_expression.pyboost_empty([size, dtype, device])
+    else:
+        out = CTensor(dtype=dtype, shape=size)
     return mindspore.Tensor(out)
 
 # empty_like
-
+has_empty_like = hasattr(mindspore.mint, 'empty_like')
+def empty_like(input, *, dtype=None, device=None):
+    if has_empty_like:
+        return mindspore.mint.empty_like(input, dtype=dtype, device=device)
+    return empty(input.shape, dtype=input.dtype, device=device)
 
 # empty_strided
 
```
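With this change, `empty` routes through MindSpore's pyboost kernel whenever `mindspore.mint.empty` is available (falling back to the old `CTensor` path otherwise), and a matching `empty_like` is introduced. Below is a minimal usage sketch of the new signatures, not part of the commit; exact device handling of the pyboost path depends on the installed MindSpore build.

```python
# Minimal usage sketch for the new creation ops (assumes mindnlp.core.ops exposes
# empty/empty_like as added above; device handling depends on the MindSpore build).
import mindspore
from mindnlp.core import ops

buf = ops.empty(2, 3, dtype=mindspore.float32)  # uninitialized 2x3 tensor
like = ops.empty_like(buf)                      # same shape and dtype as buf
print(buf.shape, like.shape)                    # (2, 3) (2, 3)
```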

mindnlp/patch.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -21,3 +21,11 @@ def none_in_tuple_or_list(x):
 
 if GENERATOR_SEED:
     mindspore.ops.operations.manually_defined.ops_def.infer_value_for_BroadcastTo = infer_value_for_BroadcastTo
+
+def data_ptr(self):
+    return self._data_ptr()
+
+mindspore.Tensor.data_ptr = data_ptr
+mindspore.common._stub_tensor.StubTensor.data_ptr = data_ptr
+mindspore.common.dtype.Float.__str__ = mindspore.common.dtype.Float.__repr__
+mindspore.common.dtype.Int.__str__ = mindspore.common.dtype.Int.__repr__
```
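These patches give MindSpore tensors and dtypes a more torch-like surface: Triton's CUDA launcher reads raw device pointers off its tensor arguments via `data_ptr()`, and the dtype `__str__` change presumably helps Triton derive readable type strings from MindSpore dtype objects. A hedged sketch of the effect, assuming the patch module runs when `mindnlp` is imported:

```python
# Hedged sketch: after the patch is applied (importing mindnlp is assumed to run patch.py),
# MindSpore tensors expose a torch-style data_ptr() and dtypes stringify like their repr.
import mindspore
import mindnlp  # noqa: F401  -- assumed to apply the monkey patches on import

x = mindspore.ops.ones((4,), dtype=mindspore.float32)
print(x.data_ptr())            # raw address as an int, via the underlying _data_ptr()
print(str(mindspore.float32))  # same text as repr(mindspore.float32), e.g. 'Float32'
```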

mindnlp/triton/__init__.py

Lines changed: 54 additions & 0 deletions
```python
"""triton adapter for mindspore"""
from functools import lru_cache
import mindspore
from triton.backends.driver import DriverBase
from triton.backends.nvidia.driver import CudaUtils, CudaLauncher
from triton.backends.compiler import GPUTarget
from mindnlp.core import ops

class MSDriver(DriverBase):

    def __init__(self):
        self.utils = CudaUtils()  # TODO: make static
        self.launcher_cls = CudaLauncher
        super().__init__()

    def get_current_device(self):
        return 0

    def set_current_device(self):
        pass

    @lru_cache
    def get_current_stream(self, device=None):
        return mindspore.hal.current_stream().id

    @lru_cache
    def get_device_capability(self, device=0):
        return mindspore.hal.get_device_capability(0)

    @lru_cache
    def get_current_target(self):
        device = self.get_current_device()
        capability = self.get_device_capability(device)
        capability = capability[0] * 10 + capability[1]
        warp_size = 32
        return GPUTarget("cuda", capability, warp_size)

    def get_device_interface(self):
        return mindspore.hal

    @staticmethod
    def is_active():
        return True

    def get_benchmarker(self):
        from triton.testing import do_bench
        return do_bench

    def get_empty_cache_for_benchmark(self):
        # We maintain a buffer of 256 MB that we clear
        # before each kernel call to make sure that the L2 cache
        # doesn't contain any input data before the run
        cache_size = 256 * 1024 * 1024
        return ops.empty(int(cache_size // 4), dtype=mindspore.int32, device='GPU')
```
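`MSDriver` plugs MindSpore into Triton's pluggable driver interface: it reuses the stock CUDA utils and launcher, answers device and stream queries through `mindspore.hal`, and reports a CUDA `GPUTarget` built from the device's compute capability (major * 10 + minor, e.g. 80 on an A100). The benchmark scratch buffer is 256 MB of int32s (256 * 1024 * 1024 / 4 = 67,108,864 elements), allocated with the new `ops.empty`. A hedged sketch, not part of the commit, of activating the driver and inspecting the target it reports:

```python
# Hedged sketch: register the MindSpore-backed driver, then ask Triton which target
# it will compile for (values shown are illustrative and depend on the local GPU).
import triton
from mindnlp.triton import MSDriver

triton.runtime.driver.set_active(MSDriver())
target = triton.runtime.driver.active.get_current_target()
print(target.backend, target.arch, target.warp_size)  # e.g. cuda 80 32
```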

requirements/requirements.txt

Lines changed: 2 additions & 1 deletion
```diff
@@ -33,4 +33,5 @@ tiktoken
 faiss_cpu
 phonemizer
 datamodel_code_generator
-git+https://github.com/lvyufeng/einops
+git+https://github.com/lvyufeng/einops
+triton
```

tests/triton/test_add.py

Lines changed: 46 additions & 0 deletions
```python
import mindspore
import triton
import triton.language as tl

from mindnlp.triton import MSDriver
from mindnlp.core import ops

mindspore.set_context(device_target='GPU')

@triton.jit
def add_kernel(x_ptr,  # *Pointer* to first input vector.
               y_ptr,  # *Pointer* to second input vector.
               output_ptr,  # *Pointer* to output vector.
               n_elements,  # Size of the vector.
               BLOCK_SIZE: tl.constexpr,  # Number of elements each program should process.
               # NOTE: `constexpr` so it can be used as a shape value.
               ):

    pid = tl.program_id(axis=0)  # We use a 1D launch grid so axis is 0.
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements

    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    output = x + y
    tl.store(output_ptr + offsets, output, mask=mask)

def add(x: mindspore.Tensor, y: mindspore.Tensor):
    # We need to preallocate the output.
    output = ops.empty_like(x)
    n_elements = output.numel()
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']), )

    add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=512)

    return output

def test_add():
    triton.runtime.driver.set_active(MSDriver())

    size = 98432
    x = mindspore.ops.ones((size,), dtype=mindspore.float32)
    y = mindspore.ops.ones((size,), dtype=mindspore.float32)
    z = add(x, y)
    print(z)
```
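For this launch, size = 98432 and BLOCK_SIZE = 512, so the grid lambda yields cdiv(98432, 512) = 193 program instances and the mask keeps the last, partially filled block from writing past the end of the buffers. Since both inputs are all ones, every element of z should be 2.0. A hedged sketch of an explicit check the test could add (the `check` helper is illustrative, not part of the commit):

```python
# Hedged sketch of an explicit assertion on the kernel output.
import numpy as np

def check(z, size=98432):
    assert z.shape == (size,)
    assert np.allclose(z.asnumpy(), 2.0)  # ones + ones == 2.0 everywhere
```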
