Skip to content

Commit 7c58089

Browse files
[NVPTX] Add Intrinsics for discard.* (#128404)
[NVPTX] Add Intrinsics for discard.* This PR adds intrinsics for all variations of discard.* * These intrinsics supports generic or global for all variations. * The lowering is handled from nvvm to nvptx tablegen directly. * Lit tests are added as part of discard.ll * The generated PTX is verified with a 12.3 ptxas executable. * Added docs for these intrinsics in NVPTXUsage.rst. For more information, refer to the PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-discard>_. --------- Co-authored-by: abmajumder <abmajumder@nvidia.com> Co-authored-by: gonzalobg <65027571+gonzalobg@users.noreply.github.com>
1 parent 3919793 commit 7c58089

File tree

4 files changed

+93
-0
lines changed

4 files changed

+93
-0
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -671,6 +671,45 @@ level on which the priority is to be applied. The only supported value for the s
671671
For more information, refer to the PTX ISA
672672
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-applypriority>`_.
673673

674+
``llvm.nvvm.discard.*``'
675+
^^^^^^^^^^^^^^^^^^^^^^^^
676+
677+
Syntax:
678+
"""""""
679+
680+
.. code-block:: llvm
681+
682+
declare void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 immarg)
683+
declare void @llvm.nvvm.discard.L2(ptr %ptr, i64 immarg)
684+
685+
Overview:
686+
"""""""""
687+
688+
The *effects* of the ``@llvm.nvvm.discard.L2*`` intrinsics are those of a non-atomic
689+
non-volatile ``llvm.memset`` that writes ``undef`` to the destination
690+
address range ``[%ptr, %ptr + immarg)``. The ``%ptr`` must be aligned by 128 bytes.
691+
Subsequent reads from the address range may read ``undef`` until the memory is overwritten
692+
with a different value.
693+
These operations *hint* the implementation that data in the L2 cache can be destructively
694+
discarded without writing it back to memory.
695+
The operand ``immarg`` is an integer constant that specifies the length in bytes of the
696+
address range ``[%ptr, %ptr + immarg)`` to write ``undef`` into.
697+
The only supported value for the ``immarg`` operand is ``128``.
698+
If generic addressing is used and the specified address does not fall within the
699+
address window of global memory (``addrspace(1)``) the behavior is undefined.
700+
701+
.. code-block:: llvm
702+
703+
call void @llvm.nvvm.discard.L2(ptr %p, i64 128) ;; writes `undef` to [p, p+128)
704+
%a = load i64, ptr %p. ;; loads 8 bytes containing undef
705+
%b = load i64, ptr %p ;; loads 8 bytes containing undef
706+
;; comparing %a and %b compares `undef` values!
707+
%fa = freeze i64 %a ;; freezes undef to stable bit-pattern
708+
%fb = freeze i64 %b ;; freezes undef to stable bit-pattern
709+
;; %fa may compare different to %fb!
710+
711+
For more information, refer to the `CUDA C++ discard documentation <https://nvidia.github.io/cccl/libcudacxx/extended_api/memory_access_properties/discard_memory.html>`__ and to the `PTX ISA discard documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-discard>`__ .
712+
674713
'``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
675714
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
676715

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5082,6 +5082,14 @@ def int_nvvm_applypriority_L2_evict_normal
50825082
[IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
50835083
ImmArg<ArgIndex<1>>]>;
50845084

5085+
// Intrinsics for discard
5086+
def int_nvvm_discard_global_L2 : DefaultAttrsIntrinsic<[],
5087+
[llvm_global_ptr_ty, llvm_i64_ty], [NoCapture<ArgIndex<0>>,
5088+
ImmArg<ArgIndex<1>>, IntrHasSideEffects]>;
5089+
5090+
def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[],
5091+
[llvm_ptr_ty, llvm_i64_ty], [NoCapture<ArgIndex<0>>,
5092+
ImmArg<ArgIndex<1>>, IntrHasSideEffects]>;
50855093

50865094
// Intrinsics for Bulk Copy using TMA (non-tensor)
50875095
// From Global to Shared Cluster

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -800,6 +800,17 @@ class APPLYPRIORITY_L2_INTRS<string addr> :
800800
def APPLYPRIORITY_L2_EVICT_NORMAL : APPLYPRIORITY_L2_INTRS<"">;
801801
def APPLYPRIORITY_GLOBAL_L2_EVICT_NORMAL : APPLYPRIORITY_L2_INTRS<"global">;
802802

803+
//Discard Intrinsics
804+
class DISCARD_L2_INTRS<string Addr> :
805+
NVPTXInst<(outs), (ins Int64Regs:$addr),
806+
StrJoin<".", ["discard", Addr , "L2"]>.ret # " [$addr], 128;",
807+
[(!cast<Intrinsic>(StrJoin<"_", ["int_nvvm_discard", Addr , "L2"]>.ret)
808+
i64:$addr, (i64 128))]>,
809+
Requires<[hasPTX<74>, hasSM<80>]>;
810+
811+
def DISCARD_L2 : DISCARD_L2_INTRS<"">;
812+
def DISCARD_GLOBAL_L2 : DISCARD_L2_INTRS<"global">;
813+
803814
//-----------------------------------
804815
// MBarrier Functions
805816
//-----------------------------------

llvm/test/CodeGen/NVPTX/discard.ll

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| FileCheck --check-prefixes=CHECK-PTX64 %s
3+
; RUN: %if ptxas-11.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| %ptxas-verify -arch=sm_80 %}
4+
5+
target triple = "nvptx64-nvidia-cuda"
6+
7+
declare void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 immarg %size)
8+
declare void @llvm.nvvm.discard.L2(ptr %ptr, i64 immarg %size)
9+
10+
define void @discard_global_L2(ptr addrspace(1) %global_ptr) {
11+
; CHECK-PTX64-LABEL: discard_global_L2(
12+
; CHECK-PTX64: {
13+
; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
14+
; CHECK-PTX64-EMPTY:
15+
; CHECK-PTX64-NEXT: // %bb.0:
16+
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [discard_global_L2_param_0];
17+
; CHECK-PTX64-NEXT: discard.global.L2 [%rd1], 128;
18+
; CHECK-PTX64-NEXT: ret;
19+
tail call void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 128)
20+
ret void
21+
}
22+
23+
define void @discard_L2(ptr %ptr) {
24+
; CHECK-PTX64-LABEL: discard_L2(
25+
; CHECK-PTX64: {
26+
; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
27+
; CHECK-PTX64-EMPTY:
28+
; CHECK-PTX64-NEXT: // %bb.0:
29+
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [discard_L2_param_0];
30+
; CHECK-PTX64-NEXT: discard.L2 [%rd1], 128;
31+
; CHECK-PTX64-NEXT: ret;
32+
tail call void @llvm.nvvm.discard.L2(ptr %ptr, i64 128)
33+
ret void
34+
}
35+

0 commit comments

Comments
 (0)