Skip to content

Commit 459fc12

Browse files
authored
Merge branch 'main' into main
2 parents db20cd3 + 827027b commit 459fc12

File tree

26 files changed

+268
-427
lines changed

26 files changed

+268
-427
lines changed

clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1160,22 +1160,6 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
11601160
case NVPTX::BI__nvvm_fence_sc_cluster:
11611161
return Builder.CreateCall(
11621162
CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
1163-
case NVPTX::BI__nvvm_bar_sync:
1164-
return Builder.CreateCall(
1165-
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
1166-
EmitScalarExpr(E->getArg(0)));
1167-
case NVPTX::BI__syncthreads:
1168-
return Builder.CreateCall(
1169-
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
1170-
Builder.getInt32(0));
1171-
case NVPTX::BI__nvvm_barrier_sync:
1172-
return Builder.CreateCall(
1173-
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_all),
1174-
EmitScalarExpr(E->getArg(0)));
1175-
case NVPTX::BI__nvvm_barrier_sync_cnt:
1176-
return Builder.CreateCall(
1177-
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync),
1178-
{EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
11791163
default:
11801164
return nullptr;
11811165
}

clang/lib/Sema/SemaChecking.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11745,10 +11745,6 @@ static void CheckImplicitArgumentConversions(Sema &S, CallExpr *TheCall,
1174511745

1174611746
static void DiagnoseNullConversion(Sema &S, Expr *E, QualType T,
1174711747
SourceLocation CC) {
11748-
if (S.Diags.isIgnored(diag::warn_impcast_null_pointer_to_integer,
11749-
E->getExprLoc()))
11750-
return;
11751-
1175211748
// Don't warn on functions which have return type nullptr_t.
1175311749
if (isa<CallExpr>(E))
1175411750
return;
@@ -11765,6 +11761,10 @@ static void DiagnoseNullConversion(Sema &S, Expr *E, QualType T,
1176511761
T->isMemberPointerType() || !T->isScalarType() || T->isNullPtrType())
1176611762
return;
1176711763

11764+
if (S.Diags.isIgnored(diag::warn_impcast_null_pointer_to_integer,
11765+
E->getExprLoc()))
11766+
return;
11767+
1176811768
SourceLocation Loc = E->getSourceRange().getBegin();
1176911769

1177011770
// Venture through the macro stacks to get to the source of macro arguments.

clang/test/CodeGen/builtins-nvptx-ptx60.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,10 @@ __device__ void nvvm_sync(unsigned mask, int i, float f, int a, int b,
3232
// CHECK: call void @llvm.nvvm.bar.warp.sync(i32
3333
// expected-error@+1 {{'__nvvm_bar_warp_sync' needs target feature ptx60}}
3434
__nvvm_bar_warp_sync(mask);
35-
// CHECK: call void @llvm.nvvm.barrier.cta.sync.all(i32
35+
// CHECK: call void @llvm.nvvm.barrier.sync(i32
3636
// expected-error@+1 {{'__nvvm_barrier_sync' needs target feature ptx60}}
3737
__nvvm_barrier_sync(mask);
38-
// CHECK: call void @llvm.nvvm.barrier.cta.sync(i32
38+
// CHECK: call void @llvm.nvvm.barrier.sync.cnt(i32
3939
// expected-error@+1 {{'__nvvm_barrier_sync_cnt' needs target feature ptx60}}
4040
__nvvm_barrier_sync_cnt(mask, i);
4141

clang/test/CodeGen/builtins-nvptx.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ __device__ int read_pms() {
198198

199199
__device__ void sync() {
200200

201-
// CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
201+
// CHECK: call void @llvm.nvvm.bar.sync(i32 0)
202202

203203
__nvvm_bar_sync(0);
204204

@@ -259,7 +259,7 @@ __device__ void nvvm_math(float f1, float f2, double d1, double d2) {
259259
__nvvm_membar_gl();
260260
// CHECK: call void @llvm.nvvm.membar.sys()
261261
__nvvm_membar_sys();
262-
// CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
262+
// CHECK: call void @llvm.nvvm.barrier0()
263263
__syncthreads();
264264
}
265265

clang/test/Headers/gpuintrin.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -887,7 +887,7 @@ __gpu_kernel void foo() {
887887
// NVPTX-LABEL: define internal void @__gpu_sync_threads(
888888
// NVPTX-SAME: ) #[[ATTR0]] {
889889
// NVPTX-NEXT: [[ENTRY:.*:]]
890-
// NVPTX-NEXT: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
890+
// NVPTX-NEXT: call void @llvm.nvvm.barrier0()
891891
// NVPTX-NEXT: ret void
892892
//
893893
//

lldb/source/Commands/CommandObjectThreadUtil.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ void CommandObjectIterateOverThreads::DoExecute(Args &command,
3737
result.SetStatus(m_success_return);
3838

3939
bool all_threads = false;
40+
m_unique_stacks = false;
41+
4042
if (command.GetArgumentCount() == 0) {
4143
Thread *thread = m_exe_ctx.GetThreadPtr();
4244
if (thread)

lldb/test/API/functionalities/thread/num_threads/TestNumThreads.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,10 +132,32 @@ def is_thread3(thread):
132132
# Construct our expected back trace string
133133
expect_string = "10 thread(s)%s" % (expect_threads)
134134

135+
# There was a bug where if you used 'thread backtrace unique'
136+
# we would switch all future backtraces to use the
137+
# "frame-format-unique" not the "frame-format". Make
138+
# sure we don't do that...
139+
setting_data = self.dbg.GetSetting("frame-format-unique")
140+
setting_str = setting_data.GetStringValue(1000)
141+
setting_str = "UNIQUE: " + setting_str
142+
lldb.SBDebugger.SetInternalVariable(
143+
"frame-format-unique", setting_str, self.dbg.GetInstanceName()
144+
)
135145
# Now that we are stopped, we should have 10 threads waiting in the
136146
# thread3 function. All of these threads should show as one stack.
137147
self.expect(
138148
"thread backtrace unique",
139149
"Backtrace with unique stack shown correctly",
140-
substrs=[expect_string, "main.cpp:%d" % self.thread3_before_lock_line],
150+
substrs=[
151+
expect_string,
152+
"UNIQUE:",
153+
"main.cpp:%d" % self.thread3_before_lock_line,
154+
],
155+
)
156+
# Make sure setting the unique flag in the command isn't
157+
# persistent:
158+
self.expect(
159+
"thread backtrace",
160+
"Backtrace unique is not sticky",
161+
substrs=["UNIQUE:"],
162+
matching=False,
141163
)

llvm/docs/NVPTXUsage.rst

Lines changed: 5 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -199,58 +199,21 @@ map in the following way to CUDA builtins:
199199
Barriers
200200
--------
201201

202-
'``llvm.nvvm.barrier.cta.*``'
203-
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
202+
'``llvm.nvvm.barrier0``'
203+
^^^^^^^^^^^^^^^^^^^^^^^^^^^
204204

205205
Syntax:
206206
"""""""
207207

208208
.. code-block:: llvm
209209
210-
declare void @llvm.nvvm.barrier.cta.sync(i32 %id, i32 %n)
211-
declare void @llvm.nvvm.barrier.cta.sync.all(i32 %id)
212-
declare void @llvm.nvvm.barrier.cta.arrive(i32 %id, i32 %n)
213-
214-
declare void @llvm.nvvm.barrier.cta.sync.aligned(i32 %id, i32 %n)
215-
declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %id)
216-
declare void @llvm.nvvm.barrier.cta.arrive.aligned(i32 %id, i32 %n)
210+
declare void @llvm.nvvm.barrier0()
217211
218212
Overview:
219213
"""""""""
220214

221-
The '``@llvm.nvvm.barrier.cta.*``' family of intrinsics perform barrier
222-
synchronization and communication within a CTA. They can be used by the threads
223-
within the CTA for synchronization and communication.
224-
225-
Semantics:
226-
""""""""""
227-
228-
Operand %id specifies a logical barrier resource and must fall within the range
229-
0 through 15. When present, operand %n specifies the number of threads
230-
participating in the barrier. When specifying a thread count, the value must be
231-
a multiple of the warp size. With the '``@llvm.nvvm.barrier.cta.sync.*``'
232-
variants, the '``.all``' suffix indicates that all threads in the CTA should
233-
participate in the barrier and the %n operand is not present.
234-
235-
All forms of the '``@llvm.nvvm.barrier.cta.*``' intrinsic cause the executing
236-
thread to wait for all non-exited threads from its warp and then marks the
237-
warp's arrival at the barrier. In addition to signaling its arrival at the
238-
barrier, the '``@llvm.nvvm.barrier.cta.sync.*``' intrinsics cause the executing
239-
thread to wait for non-exited threads of all other warps participating in the
240-
barrier to arrive. On the other hand, the '``@llvm.nvvm.barrier.cta.arrive.*``'
241-
intrinsic does not cause the executing thread to wait for threads of other
242-
participating warps.
243-
244-
When a barrier completes, the waiting threads are restarted without delay,
245-
and the barrier is reinitialized so that it can be immediately reused.
246-
247-
The '``@llvm.nvvm.barrier.cta.*``' intrinsic has an optional '``.aligned``'
248-
modifier to indicate textual alignment of the barrier. When specified, it
249-
indicates that all threads in the CTA will execute the same
250-
'``@llvm.nvvm.barrier.cta.*``' instruction. In conditionally executed code, an
251-
aligned '``@llvm.nvvm.barrier.cta.*``' instruction should only be used if it is
252-
known that all threads in the CTA evaluate the condition identically, otherwise
253-
behavior is undefined.
215+
The '``@llvm.nvvm.barrier0()``' intrinsic emits a PTX ``bar.sync 0``
216+
instruction, equivalent to the ``__syncthreads()`` call in CUDA.
254217

255218
Electing a thread
256219
-----------------

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -128,12 +128,6 @@
128128
// * llvm.nvvm.swap.lo.hi.b64 --> llvm.fshl(x, x, 32)
129129
// * llvm.nvvm.atomic.load.inc.32 --> atomicrmw uinc_wrap
130130
// * llvm.nvvm.atomic.load.dec.32 --> atomicrmw udec_wrap
131-
// * llvm.nvvm.barrier0 --> llvm.nvvm.barrier.cta.sync.aligned.all(0)
132-
// * llvm.nvvm.barrier.n --> llvm.nvvm.barrier.cta.sync.aligned.all(x)
133-
// * llvm.nvvm.bar.sync --> llvm.nvvm.barrier.cta.sync.aligned.all(x)
134-
// * llvm.nvvm.barrier --> llvm.nvvm.barrier.cta.sync.aligned(x, y)
135-
// * llvm.nvvm.barrier.sync --> llvm.nvvm.barrier.cta.sync.all(x)
136-
// * llvm.nvvm.barrier.sync.cnt --> llvm.nvvm.barrier.cta.sync(x, y)
137131

138132
def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
139133
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
@@ -1269,28 +1263,35 @@ let TargetPrefix = "nvvm" in {
12691263
defm int_nvvm_atomic_cas_gen_i : PTXAtomicWithScope3<llvm_anyint_ty>;
12701264

12711265
// Bar.Sync
1266+
1267+
// The builtin for "bar.sync 0" is called __syncthreads. Unlike most of the
1268+
// intrinsics in this file, this one is a user-facing API.
1269+
def int_nvvm_barrier0 : ClangBuiltin<"__syncthreads">,
1270+
Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
1271+
// Synchronize all threads in the CTA at barrier 'n'.
1272+
def int_nvvm_barrier_n : ClangBuiltin<"__nvvm_bar_n">,
1273+
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
1274+
// Synchronize 'm', a multiple of warp size, (arg 2) threads in
1275+
// the CTA at barrier 'n' (arg 1).
1276+
def int_nvvm_barrier : ClangBuiltin<"__nvvm_bar">,
1277+
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12721278
def int_nvvm_barrier0_popc : ClangBuiltin<"__nvvm_bar0_popc">,
12731279
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12741280
def int_nvvm_barrier0_and : ClangBuiltin<"__nvvm_bar0_and">,
12751281
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12761282
def int_nvvm_barrier0_or : ClangBuiltin<"__nvvm_bar0_or">,
12771283
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12781284

1285+
def int_nvvm_bar_sync : NVVMBuiltin,
1286+
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12791287
def int_nvvm_bar_warp_sync : NVVMBuiltin,
12801288
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12811289

1282-
// barrier{.cta}.sync{.aligned} a{, b};
1283-
// barrier{.cta}.arrive{.aligned} a, b;
1284-
let IntrProperties = [IntrConvergent, IntrNoCallback] in {
1285-
foreach align = ["", "_aligned"] in {
1286-
def int_nvvm_barrier_cta_sync # align # _all :
1287-
Intrinsic<[], [llvm_i32_ty]>;
1288-
def int_nvvm_barrier_cta_sync # align :
1289-
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty]>;
1290-
def int_nvvm_barrier_cta_arrive # align :
1291-
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty]>;
1292-
}
1293-
}
1290+
// barrier.sync id[, cnt]
1291+
def int_nvvm_barrier_sync : NVVMBuiltin,
1292+
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
1293+
def int_nvvm_barrier_sync_cnt : NVVMBuiltin,
1294+
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
12941295

12951296
// barrier.cluster.[wait, arrive, arrive.relaxed]
12961297
def int_nvvm_barrier_cluster_arrive :

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 6 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1343,9 +1343,12 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
13431343
// nvvm.abs.{i,ii}
13441344
Expand =
13451345
Name == "i" || Name == "ll" || Name == "bf16" || Name == "bf16x2";
1346-
else if (Name.consume_front("fabs."))
1346+
else if (Name == "fabs.f" || Name == "fabs.ftz.f" || Name == "fabs.d")
13471347
// nvvm.fabs.{f,ftz.f,d}
1348-
Expand = Name == "f" || Name == "ftz.f" || Name == "d";
1348+
Expand = true;
1349+
else if (Name == "clz.ll" || Name == "popc.ll" || Name == "h2f" ||
1350+
Name == "swap.lo.hi.b64")
1351+
Expand = true;
13491352
else if (Name.consume_front("max.") || Name.consume_front("min."))
13501353
// nvvm.{min,max}.{i,ii,ui,ull}
13511354
Expand = Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
@@ -1377,18 +1380,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
13771380
Expand = (Name.starts_with("i.") || Name.starts_with("f.") ||
13781381
Name.starts_with("p."));
13791382
else
1380-
Expand = StringSwitch<bool>(Name)
1381-
.Case("barrier0", true)
1382-
.Case("barrier.n", true)
1383-
.Case("barrier.sync.cnt", true)
1384-
.Case("barrier.sync", true)
1385-
.Case("barrier", true)
1386-
.Case("bar.sync", true)
1387-
.Case("clz.ll", true)
1388-
.Case("popc.ll", true)
1389-
.Case("h2f", true)
1390-
.Case("swap.lo.hi.b64", true)
1391-
.Default(false);
1383+
Expand = false;
13921384

13931385
if (Expand) {
13941386
NewFn = nullptr;
@@ -2486,20 +2478,6 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
24862478
MDNode *MD = MDNode::get(Builder.getContext(), {});
24872479
LD->setMetadata(LLVMContext::MD_invariant_load, MD);
24882480
return LD;
2489-
} else if (Name == "barrier0" || Name == "barrier.n" || Name == "bar.sync") {
2490-
Value *Arg =
2491-
Name.ends_with('0') ? Builder.getInt32(0) : CI->getArgOperand(0);
2492-
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all,
2493-
{}, {Arg});
2494-
} else if (Name == "barrier") {
2495-
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned, {},
2496-
{CI->getArgOperand(0), CI->getArgOperand(1)});
2497-
} else if (Name == "barrier.sync") {
2498-
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_all, {},
2499-
{CI->getArgOperand(0)});
2500-
} else if (Name == "barrier.sync.cnt") {
2501-
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync, {},
2502-
{CI->getArgOperand(0), CI->getArgOperand(1)});
25032481
} else {
25042482
Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name);
25052483
if (IID != Intrinsic::not_intrinsic &&

0 commit comments

Comments
 (0)