From 91eeaf02336e539f14dcb0a79ff15dbe8befe6f1 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 2 Apr 2025 02:47:42 +0100 Subject: [PATCH 01/38] Add the functional identity and feature queries. --- clang/docs/LanguageExtensions.rst | 110 ++++++ clang/include/clang/Basic/BuiltinsAMDGPU.def | 5 + .../clang/Basic/DiagnosticSemaKinds.td | 10 + clang/lib/Basic/Targets/SPIR.cpp | 4 + clang/lib/Basic/Targets/SPIR.h | 4 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 29 ++ clang/lib/Sema/SemaExpr.cpp | 157 ++++++++ clang/test/CodeGen/amdgpu-builtin-cpu-is.c | 65 ++++ .../CodeGen/amdgpu-builtin-is-invocable.c | 64 ++++ .../amdgpu-feature-builtins-invalid-use.cpp | 43 +++ llvm/lib/Target/AMDGPU/AMDGPU.h | 9 + .../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 207 ++++++++++ llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 +- llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 + ...pu-expand-feature-predicates-unfoldable.ll | 28 ++ .../amdgpu-expand-feature-predicates.ll | 359 ++++++++++++++++++ 17 files changed, 1099 insertions(+), 1 deletion(-) create mode 100644 clang/test/CodeGen/amdgpu-builtin-cpu-is.c create mode 100644 clang/test/CodeGen/amdgpu-builtin-is-invocable.c create mode 100644 clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 3b8a9cac6587a..8a7cb75af13e5 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -4920,6 +4920,116 @@ If no address spaces names are provided, all address spaces are fenced. __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local") __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local", "global") +__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``__builtin_amdgcn_processor_is`` and ``__builtin_amdgcn_is_invocable`` provide +a functional mechanism for programatically querying: + +* the identity of the current target processor; +* the capability of the current target processor to invoke a particular builtin. + +**Syntax**: + +.. code-block:: c + + // When used as the predicate for a control structure + bool __builtin_amdgcn_processor_is(const char*); + bool __builtin_amdgcn_is_invocable(builtin_name); + // Otherwise + void __builtin_amdgcn_processor_is(const char*); + void __builtin_amdgcn_is_invocable(void); + +**Example of use**: + +.. code-block:: c++ + + if (__builtin_amdgcn_processor_is("gfx1201") || + __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) + __builtin_amdgcn_s_sleep_var(x); + + if (!__builtin_amdgcn_processor_is("gfx906")) + __builtin_amdgcn_s_wait_event_export_ready(); + else if (__builtin_amdgcn_processor_is("gfx1010") || + __builtin_amdgcn_processor_is("gfx1101")) + __builtin_amdgcn_s_ttracedata_imm(1); + + while (__builtin_amdgcn_processor_is("gfx1101")) *p += x; + + do { *p -= x; } while (__builtin_amdgcn_processor_is("gfx1010")); + + for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break; + + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)) + __builtin_amdgcn_s_wait_event_export_ready(); + else if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm)) + __builtin_amdgcn_s_ttracedata_imm(1); + + do { + *p -= x; + } while (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32)); + + for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) break; + +**Description**: + +When used as the predicate value of the following control structures: + +.. code-block:: c++ + + if (...) + while (...) + do { } while (...) + for (...) + +be it directly, or as arguments to logical operators such as ``!, ||, &&``, the +builtins return a boolean value that: + +* indicates whether the current target matches the argument; the argument MUST + be a string literal and a valid AMDGPU target +* indicates whether the builtin function passed as the argument can be invoked + by the current target; the argument MUST be either a generic or AMDGPU + specific builtin name + +Outside of these contexts, the builtins have a ``void`` returning signature +which prevents their misuse. + +**Example of invalid use**: + +.. code-block:: c++ + + void kernel(int* p, int x, bool (*pfn)(bool), const char* str) { + if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return; + else if (__builtin_amdgcn_processor_is(str)) __builtin_trap(); + + bool a = __builtin_amdgcn_processor_is("gfx906"); + const bool b = !__builtin_amdgcn_processor_is("gfx906"); + const bool c = !__builtin_amdgcn_processor_is("gfx906"); + bool d = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); + bool e = !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); + const auto f = + !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready) + || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); + const auto g = + !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready) + || !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); + __builtin_amdgcn_processor_is("gfx1201") + ? __builtin_amdgcn_s_sleep_var(x) : __builtin_amdgcn_s_sleep(42); + if (pfn(__builtin_amdgcn_processor_is("gfx1200"))) + __builtin_amdgcn_s_sleep_var(x); + + if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return; + else if (__builtin_amdgcn_is_invocable(x)) __builtin_trap(); + } + +When invoked while compiling for a concrete target, the builtins are evaluated +early by Clang, and never produce any CodeGen effects / have no observable +side-effects in IR. Conversely, when compiling for AMDGCN flavoured SPIR-v, +which is an abstract target, a series of predicate values are implicitly +created. These predicates get resolved when finalizing the compilation process +for a concrete target, and shall reflect the latter's identity and features. +Thus, it is possible to author high-level code, in e.g. HIP, that is target +adaptive in a dynamic fashion, contrary to macro based mechanisms. ARM/AArch64 Language Extensions ------------------------------- diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 44ef404aee72f..5d01a7e75f7e7 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -346,6 +346,11 @@ BUILTIN(__builtin_amdgcn_endpgm, "v", "nr") BUILTIN(__builtin_amdgcn_get_fpenv, "WUi", "n") BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n") +// These are special FE only builtins intended for forwarding the requirements +// to the ME. +BUILTIN(__builtin_amdgcn_processor_is, "vcC*", "nctu") +BUILTIN(__builtin_amdgcn_is_invocable, "v", "nctu") + //===----------------------------------------------------------------------===// // R600-NI only builtins. //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 5e45482584946..45f0f9eb88e55 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -13054,4 +13054,14 @@ def err_acc_decl_for_routine // AMDGCN builtins diagnostics def err_amdgcn_global_load_lds_size_invalid_value : Error<"invalid size value">; def note_amdgcn_global_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">; +def err_amdgcn_processor_is_arg_not_literal + : Error<"the argument to __builtin_amdgcn_processor_is must be a string " + "literal">; +def err_amdgcn_processor_is_arg_invalid_value + : Error<"the argument to __builtin_amdgcn_processor_is must be a valid " + "AMDGCN processor identifier; '%0' is not valid">; +def err_amdgcn_is_invocable_arg_invalid_value + : Error<"the argument to __builtin_amdgcn_is_invocable must be either a " + "target agnostic builtin or an AMDGCN target specific builtin; `%0`" + " is not valid">; } // end of sema component. diff --git a/clang/lib/Basic/Targets/SPIR.cpp b/clang/lib/Basic/Targets/SPIR.cpp index 5b5f47f9647a2..eb43d9b0be283 100644 --- a/clang/lib/Basic/Targets/SPIR.cpp +++ b/clang/lib/Basic/Targets/SPIR.cpp @@ -152,3 +152,7 @@ void SPIRV64AMDGCNTargetInfo::setAuxTarget(const TargetInfo *Aux) { Float128Format = DoubleFormat; } } + +bool SPIRV64AMDGCNTargetInfo::isValidCPUName(StringRef CPU) const { + return AMDGPUTI.isValidCPUName(CPU); +} diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index 78505d66d6f2f..7aa13cbeb89fd 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -432,6 +432,10 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64AMDGCNTargetInfo final } bool hasInt128Type() const override { return TargetInfo::hasInt128Type(); } + + // This is only needed for validating arguments passed to + // __builtin_amdgcn_processor_is + bool isValidCPUName(StringRef Name) const override; }; } // namespace targets diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index b56b739094ff3..7b1a3815144b4 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -284,6 +284,18 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst, Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs)); } +static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) { + auto PTy = IntegerType::getInt1Ty(CGF.getLLVMContext()); + + auto P = cast( + CGF.CGM.getModule().getOrInsertGlobal(Name.str(), PTy)); + P->setConstant(true); + P->setExternallyInitialized(true); + + return CGF.Builder.CreateLoad(RawAddress(P, PTy, CharUnits::One(), + KnownNonNull)); +} + Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent; @@ -585,6 +597,23 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Value *Env = EmitScalarExpr(E->getArg(0)); return Builder.CreateCall(F, {Env}); } + case AMDGPU::BI__builtin_amdgcn_processor_is: { + assert(CGM.getTriple().isSPIRV() && + "__builtin_amdgcn_processor_is should never reach CodeGen for " + "concrete targets!"); + StringRef Proc = cast(E->getArg(0))->getString(); + return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.is." + Proc); + } + case AMDGPU::BI__builtin_amdgcn_is_invocable: { + assert(CGM.getTriple().isSPIRV() && + "__builtin_amdgcn_is_invocable should never reach CodeGen for " + "concrete targets!"); + auto FD = cast( + cast(E->getArg(0))->getReferencedDeclOfCallee()); + StringRef RF = + getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID()); + return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.has." + RF); + } case AMDGPU::BI__builtin_amdgcn_read_exec: return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false); case AMDGPU::BI__builtin_amdgcn_read_exec_lo: diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 7cc8374e69d73..24f5262ab3cf4 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -6541,6 +6541,22 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, if (Result.isInvalid()) return ExprError(); Fn = Result.get(); + // The __builtin_amdgcn_is_invocable builtin is special, and will be resolved + // later, when we check boolean conditions, for now we merely forward it + // without any additional checking. + if (Fn->getType() == Context.BuiltinFnTy && ArgExprs.size() == 1 && + ArgExprs[0]->getType() == Context.BuiltinFnTy) { + auto FD = cast(Fn->getReferencedDeclOfCallee()); + + if (FD->getName() == "__builtin_amdgcn_is_invocable") { + auto FnPtrTy = Context.getPointerType(FD->getType()); + auto R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get(); + return CallExpr::Create(Context, R, ArgExprs, Context.VoidTy, + ExprValueKind::VK_PRValue, RParenLoc, + FPOptionsOverride()); + } + } + if (CheckArgsForPlaceholders(ArgExprs)) return ExprError(); @@ -13234,6 +13250,20 @@ inline QualType Sema::CheckBitwiseOperands(ExprResult &LHS, ExprResult &RHS, return InvalidOperands(Loc, LHS, RHS); } +static inline bool IsAMDGPUPredicateBI(Expr *E) { + if (!E->getType()->isVoidType()) + return false; + + if (auto CE = dyn_cast(E)) { + if (auto BI = CE->getDirectCallee()) + if (BI->getName() == "__builtin_amdgcn_processor_is" || + BI->getName() == "__builtin_amdgcn_is_invocable") + return true; + } + + return false; +} + // C99 6.5.[13,14] inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS, SourceLocation Loc, @@ -13329,6 +13359,9 @@ inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS, // The following is safe because we only use this method for // non-overloadable operands. + if (IsAMDGPUPredicateBI(LHS.get()) && IsAMDGPUPredicateBI(RHS.get())) + return Context.VoidTy; + // C++ [expr.log.and]p1 // C++ [expr.log.or]p1 // The operands are both contextually converted to type bool. @@ -15576,6 +15609,38 @@ static bool isOverflowingIntegerType(ASTContext &Ctx, QualType T) { return Ctx.getIntWidth(T) >= Ctx.getIntWidth(Ctx.IntTy); } +static Expr *ExpandAMDGPUPredicateBI(ASTContext &Ctx, CallExpr *CE) { + if (!CE->getBuiltinCallee()) + return CXXBoolLiteralExpr::Create(Ctx, false, Ctx.BoolTy, CE->getExprLoc()); + + if (Ctx.getTargetInfo().getTriple().isSPIRV()) { + CE->setType(Ctx.getLogicalOperationType()); + return CE; + } + + bool P = false; + auto &TI = Ctx.getTargetInfo(); + + if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") { + auto GFX = dyn_cast(CE->getArg(0)->IgnoreParenCasts()); + auto TID = TI.getTargetID(); + if (GFX && TID) { + auto N = GFX->getString(); + P = TI.isValidCPUName(GFX->getString()) && TID->find(N) == 0; + } + } else { + auto FD = cast(CE->getArg(0)->getReferencedDeclOfCallee()); + + StringRef RF = Ctx.BuiltinInfo.getRequiredFeatures(FD->getBuiltinID()); + llvm::StringMap CF; + Ctx.getFunctionFeatureMap(CF, FD); + + P = Builtin::evaluateRequiredTargetFeatures(RF, CF); + } + + return CXXBoolLiteralExpr::Create(Ctx, P, Ctx.BoolTy, CE->getExprLoc()); +} + ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc, UnaryOperatorKind Opc, Expr *InputExpr, bool IsAfterAmp) { @@ -15753,6 +15818,8 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc, // Vector logical not returns the signed variant of the operand type. resultType = GetSignedVectorType(resultType); break; + } else if (IsAMDGPUPredicateBI(InputExpr)) { + break; } else { return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr) << resultType << Input.get()->getSourceRange()); @@ -20469,6 +20536,88 @@ void Sema::DiagnoseEqualityWithExtraParens(ParenExpr *ParenE) { } } +static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) { + if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") { + auto GFX = dyn_cast(CE->getArg(0)->IgnoreParenCasts()); + if (!GFX) { + Sema.Diag(CE->getExprLoc(), + diag::err_amdgcn_processor_is_arg_not_literal); + return false; + } + auto N = GFX->getString(); + if (!Sema.getASTContext().getTargetInfo().isValidCPUName(N) && + (!Sema.getASTContext().getAuxTargetInfo() || + !Sema.getASTContext().getAuxTargetInfo()->isValidCPUName(N))) { + Sema.Diag(CE->getExprLoc(), + diag::err_amdgcn_processor_is_arg_invalid_value) << N; + return false; + } + } else { + auto Arg = CE->getArg(0); + if (!Arg || Arg->getType() != Sema.getASTContext().BuiltinFnTy) { + Sema.Diag(CE->getExprLoc(), + diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg; + return false; + } + } + + return true; +} + +static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) { + if (auto UO = dyn_cast(E)) { + auto SE = dyn_cast(UO->getSubExpr()); + if (IsAMDGPUPredicateBI(SE)) { + assert( + UO->getOpcode() == UnaryOperator::Opcode::UO_LNot && + "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable " + "can only be used as operands of logical ops!"); + + if (!ValidateAMDGPUPredicateBI(Sema, SE)) { + Invalid = true; + return nullptr; + } + + UO->setSubExpr(ExpandAMDGPUPredicateBI(Sema.getASTContext(), SE)); + UO->setType(Sema.getASTContext().getLogicalOperationType()); + + return UO; + } + } + if (auto BO = dyn_cast(E)) { + auto LHS = dyn_cast(BO->getLHS()); + auto RHS = dyn_cast(BO->getRHS()); + if (IsAMDGPUPredicateBI(LHS) && IsAMDGPUPredicateBI(RHS)) { + assert( + BO->isLogicalOp() && + "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable " + "can only be used as operands of logical ops!"); + + if (!ValidateAMDGPUPredicateBI(Sema, LHS) || + !ValidateAMDGPUPredicateBI(Sema, RHS)) { + Invalid = true; + return nullptr; + } + + BO->setLHS(ExpandAMDGPUPredicateBI(Sema.getASTContext(), LHS)); + BO->setRHS(ExpandAMDGPUPredicateBI(Sema.getASTContext(), RHS)); + BO->setType(Sema.getASTContext().getLogicalOperationType()); + + return BO; + } + } + if (auto CE = dyn_cast(E)) + if (IsAMDGPUPredicateBI(CE)) { + if (!ValidateAMDGPUPredicateBI(Sema, CE)) { + Invalid = true; + return nullptr; + } + return ExpandAMDGPUPredicateBI(Sema.getASTContext(), CE); + } + + return nullptr; +} + ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E, bool IsConstexpr) { DiagnoseAssignmentAsCondition(E); @@ -20480,6 +20629,14 @@ ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E, E = result.get(); if (!E->isTypeDependent()) { + if (E->getType()->isVoidType()) { + bool IsInvalidPredicate = false; + if (auto BIC = MaybeHandleAMDGPUPredicateBI(*this, E, IsInvalidPredicate)) + return BIC; + else if (IsInvalidPredicate) + return ExprError(); + } + if (getLangOpts().CPlusPlus) return CheckCXXBooleanCondition(E, IsConstexpr); // C++ 6.4p4 diff --git a/clang/test/CodeGen/amdgpu-builtin-cpu-is.c b/clang/test/CodeGen/amdgpu-builtin-cpu-is.c new file mode 100644 index 0000000000000..6e261d9f5d239 --- /dev/null +++ b/clang/test/CodeGen/amdgpu-builtin-cpu-is.c @@ -0,0 +1,65 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5 +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX900 %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1010 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX1010 %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCNSPIRV %s + +// Test that, depending on triple and, if applicable, target-cpu, one of three +// things happens: +// 1) for gfx900 we emit a call to trap (concrete target, matches) +// 2) for gfx1010 we emit an empty kernel (concrete target, does not match) +// 3) for AMDGCNSPIRV we emit llvm.amdgcn.is.gfx900 as a bool global, and +// load from it to provide the condition a br (abstract target) +//. +// AMDGCN-GFX900: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600 +//. +// AMDGCN-GFX1010: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600 +//. +// AMDGCNSPIRV: @llvm.amdgcn.is.gfx900 = external addrspace(1) externally_initialized constant i1 +//. +// AMDGCN-GFX900-LABEL: define dso_local void @foo( +// AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX900-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX900-NEXT: call void @llvm.trap() +// AMDGCN-GFX900-NEXT: ret void +// +// AMDGCN-GFX1010-LABEL: define dso_local void @foo( +// AMDGCN-GFX1010-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX1010-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX1010-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: define spir_func void @foo( +// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx900, align 1 +// AMDGCNSPIRV-NEXT: br i1 [[TMP0]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// AMDGCNSPIRV: [[IF_THEN]]: +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.trap() +// AMDGCNSPIRV-NEXT: br label %[[IF_END]] +// AMDGCNSPIRV: [[IF_END]]: +// AMDGCNSPIRV-NEXT: ret void +// +void foo() { + if (__builtin_cpu_is("gfx900")) + return __builtin_trap(); +} +//. +// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +// AMDGCN-GFX900: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. +// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" } +//. +// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" } +// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. +// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX900: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// AMDGCN-GFX1010: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX1010: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX1010: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCNSPIRV: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCNSPIRV: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c new file mode 100644 index 0000000000000..6d2690cb75b7c --- /dev/null +++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c @@ -0,0 +1,64 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5 +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX900 %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1010 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX1010 %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCNSPIRV %s + +// Test that, depending on triple and, if applicable, target-cpu, one of three +// things happens: +// 1) for gfx900 we emit an empty kernel (concrete target, lacks feature) +// 2) for gfx1010 we emit a call to trap (concrete target, has feature) +// 3) for AMDGCNSPIRV we emit llvm.amdgcn.has.gfx10-insts as a constant +// externally initialised bool global, and load from it to provide the +// condition to a br (abstract target) + +//. +// AMDGCNSPIRV: @llvm.amdgcn.has.gfx10-insts = external addrspace(1) externally_initialized constant i1 +//. +// AMDGCN-GFX900-LABEL: define dso_local void @foo( +// AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX900-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX900-NEXT: ret void +// +// AMDGCN-GFX1010-LABEL: define dso_local void @foo( +// AMDGCN-GFX1010-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX1010-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX1010-NEXT: call void @llvm.trap() +// AMDGCN-GFX1010-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: define spir_func void @foo( +// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1 +// AMDGCNSPIRV-NEXT: [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// AMDGCNSPIRV: [[IF_THEN]]: +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.trap() +// AMDGCNSPIRV-NEXT: br label %[[IF_END]] +// AMDGCNSPIRV: [[IF_END]]: +// AMDGCNSPIRV-NEXT: ret void +// +void foo() { + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16)) + return __builtin_trap(); +} +//. +// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +//. +// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" } +// AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. +// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" } +// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. +// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX900: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// AMDGCN-GFX1010: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX1010: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX1010: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCNSPIRV: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCNSPIRV: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. diff --git a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp new file mode 100644 index 0000000000000..f618f54909b00 --- /dev/null +++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp @@ -0,0 +1,43 @@ +// RUN: not %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - 2>&1 | FileCheck %s +// RUN: not %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - 2>&1 | FileCheck %s + +bool predicate(bool x) { return x; } + +void invalid_uses(int* p, int x, bool (*pfn)(bool)) { + // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void' + bool invalid_use_in_init_0 = __builtin_amdgcn_processor_is("gfx906"); + // CHECK: error: cannot initialize a variable of type 'const bool' with an rvalue of type 'void' + const bool invalid_use_in_init_1 = !__builtin_amdgcn_processor_is("gfx906"); + // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void' + bool invalid_use_in_init_2 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); + // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void' + bool invalid_use_in_init_3 = !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); + // CHECK: error: variable has incomplete type 'const void' + const auto invalid_use_in_init_4 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready) || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); + // CHECK: error: variable has incomplete type 'const void' + const auto invalid_use_in_init_5 = __builtin_amdgcn_processor_is("gfx906") || __builtin_amdgcn_processor_is("gfx900"); + // CHECK: error: variable has incomplete type 'const void' + const auto invalid_use_in_init_6 = __builtin_amdgcn_processor_is("gfx906") || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep); + // CHECK: error: value of type 'void' is not contextually convertible to 'bool' + __builtin_amdgcn_processor_is("gfx1201") + ? __builtin_amdgcn_s_sleep_var(x) : __builtin_amdgcn_s_sleep(42); + // CHECK: error: no matching function for call to 'predicate' + if (predicate(__builtin_amdgcn_processor_is("gfx1200"))) __builtin_amdgcn_s_sleep_var(x); + // CHECK: note: candidate function not viable: cannot convert argument of incomplete type 'void' to 'bool' for 1st argument +} + +void invalid_invocations(int x, const char* str) { + // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a valid AMDGCN processor identifier; 'not_an_amdgcn_gfx_id' is not valid + if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return; + // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a string literal + if (__builtin_amdgcn_processor_is(str)) return; + + // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `"__builtin_amdgcn_s_sleep_var"` is not valid + if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return; + // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `str` is not valid + else if (__builtin_amdgcn_is_invocable(str)) return; + // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `x` is not valid + else if (__builtin_amdgcn_is_invocable(x)) return; + // CHECK: error: use of undeclared identifier '__builtin_ia32_pause' + else if (__builtin_amdgcn_is_invocable(__builtin_ia32_pause)) return; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index a8e4ea9429f50..1fe0016723a30 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -408,6 +408,15 @@ extern char &AMDGPUPrintfRuntimeBindingID; void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &); extern char &AMDGPUResourceUsageAnalysisID; +struct AMDGPUExpandFeaturePredicatesPass + : PassInfoMixin { + const AMDGPUTargetMachine &TM; + AMDGPUExpandFeaturePredicatesPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + + static bool isRequired() { return true; } +}; + struct AMDGPUPrintfRuntimeBindingPass : PassInfoMixin { PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp new file mode 100644 index 0000000000000..125051c6aa0cf --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp @@ -0,0 +1,207 @@ +//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file implements a pass that deals with expanding AMDGCN generic pseudo- +// intrinsics into target specific quantities / sequences. In this context, a +// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a +// specific instruction, but rather is intended as a mechanism for abstractly +// conveying target specific info to a HLL / the FE, without concretely +// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize. +// This pass should run as early as possible / immediately after Clang CodeGen, +// so that the optimisation pipeline and the BE operate with concrete target +// data. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetIntrinsicInfo.h" +#include "llvm/Transforms/IPO/AlwaysInliner.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" + +#include +#include + +using namespace llvm; + +namespace { +inline Function *getCloneForInlining(Function *OldF) { + assert(OldF && "Must pass an existing Function!"); + + // TODO - Alias Value to clone arg. + ValueToValueMapTy VMap; + + auto NewF = CloneFunction(OldF, VMap); + + NewF->removeFnAttr(Attribute::OptimizeNone); + NewF->removeFnAttr(Attribute::NoInline); + NewF->addFnAttr(Attribute::AlwaysInline); + + return NewF; +} + +template +inline void collectUsers(Value *V, ModulePassManager &AlwaysInliner, + ModuleAnalysisManager &MAM, + SmallDenseMap &InlinableClones, + C &Container) { + assert(V && "Must pass an existing Value!"); + + auto A = PreservedAnalyses::all(); + + constexpr auto IsValidCall = [](auto &&U) { + if (auto CB = dyn_cast(U)) + if (auto F = CB->getCalledFunction()) + if (!F->isIntrinsic() && !F->isDeclaration()) + return true; + return false; + }; + + SmallVector Calls{}; + copy_if(V->users(), std::back_inserter(Calls), IsValidCall); + + while (!Calls.empty()) { + for (auto &&Call : Calls) { + auto CB = cast(Call); + auto &TempF = InlinableClones[CB->getCalledFunction()]; + + if (!TempF) + TempF = getCloneForInlining(CB->getCalledFunction()); + + CB->setCalledFunction(TempF); + CB->removeFnAttr(Attribute::NoInline); + CB->addFnAttr(Attribute::AlwaysInline); + + AlwaysInliner.run(*TempF->getParent(), MAM); + } + + Calls.clear(); + + copy_if(V->users(), std::back_inserter(Calls), IsValidCall); + } + + for (auto &&U : V->users()) + if (auto I = dyn_cast(U)) { + if (auto CB = dyn_cast(I)) { + if (CB->getCalledFunction() && !CB->getCalledFunction()->isIntrinsic()) + Container.insert(Container.end(), I); + } else { + Container.insert(Container.end(), I); + } + } +} + +std::pair +handlePredicate(const GCNSubtarget &ST, ModuleAnalysisManager &MAM, + SmallDenseMap& InlinableClones, + GlobalVariable *P) { + auto PV = P->getName().substr(P->getName().rfind('.') + 1).str(); + auto Dx = PV.find(','); + while (Dx != std::string::npos) { + PV.insert(++Dx, {'+'}); + + Dx = PV.find(',', Dx); + } + + auto PTy = P->getValueType(); + P->setLinkage(GlobalValue::PrivateLinkage); + P->setExternallyInitialized(false); + + if (P->getName().starts_with("llvm.amdgcn.is")) + P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU())); + else + P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures('+' + PV))); + + ModulePassManager MPM; + MPM.addPass(AlwaysInlinerPass()); + + SmallPtrSet ToFold; + collectUsers(P, MPM, MAM, InlinableClones, ToFold); + + if (ToFold.empty()) + return {PreservedAnalyses::all(), true}; + + do { + auto I = *ToFold.begin(); + ToFold.erase(I); + + if (auto C = ConstantFoldInstruction(I, P->getDataLayout())) { + collectUsers(I, MPM, MAM, InlinableClones, ToFold); + I->replaceAllUsesWith(C); + I->eraseFromParent(); + continue; + } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) { + continue; + } else if (I->users().empty()) { + continue; + } + + std::string W; + raw_string_ostream OS(W); + + auto Caller = I->getParent()->getParent(); + + OS << "Impossible to constant fold feature predicate: " << P->getName() + << ", please simplify.\n"; + + Caller->getContext().diagnose( + DiagnosticInfoUnsupported(*Caller, W, I->getDebugLoc(), DS_Error)); + + return {PreservedAnalyses::none(), false}; + } while (!ToFold.empty()); + + return {PreservedAnalyses::none(), true}; +} +} // Unnamed namespace. + +PreservedAnalyses +AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &MAM) { + if (M.empty()) + return PreservedAnalyses::all(); + + SmallVector Predicates; + for (auto &&G : M.globals()) { + if (!G.isDeclaration() || !G.hasName()) + continue; + if (G.getName().starts_with("llvm.amdgcn.")) + Predicates.push_back(&G); + } + + if (Predicates.empty()) + return PreservedAnalyses::all(); + + PreservedAnalyses Ret = PreservedAnalyses::all(); + + SmallDenseMap InlinableClones; + const auto &ST = TM.getSubtarget( + *find_if(M, [](auto &&F) { return !F.isIntrinsic(); })); + + for (auto &&P : Predicates) { + auto R = handlePredicate(ST, MAM, InlinableClones, P); + + if (!R.second) + return PreservedAnalyses::none(); + + Ret.intersect(R.first); + } + + for (auto &&C : InlinableClones) + C.second->eraseFromParent(); + + return Ret; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 6a45392b5f099..c3c9e24c2efa4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -29,6 +29,8 @@ MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass()) +MODULE_PASS("amdgpu-expand-feature-predicates", + AMDGPUExpandFeaturePredicatesPass(*this)) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 4937b434bc955..8e8a6e1eda437 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -803,7 +803,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { #include "llvm/Passes/TargetPassRegistry.inc" PB.registerPipelineStartEPCallback( - [](ModulePassManager &PM, OptimizationLevel Level) { + [this](ModulePassManager &PM, OptimizationLevel Level) { + PM.addPass(AMDGPUExpandFeaturePredicatesPass(*this)); if (EnableHipStdPar) PM.addPass(HipStdParAcceleratorCodeSelectionPass()); }); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 09a3096602fc3..a389200f0db8e 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -54,6 +54,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUCodeGenPrepare.cpp AMDGPUCombinerHelper.cpp AMDGPUCtorDtorLowering.cpp + AMDGPUExpandFeaturePredicates.cpp AMDGPUExportClustering.cpp AMDGPUExportKernelRuntimeHandles.cpp AMDGPUFrameLowering.cpp diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll new file mode 100644 index 0000000000000..bfc35d8c76e37 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll @@ -0,0 +1,28 @@ +; REQUIRES: amdgpu-registered-target + +; RUN: not opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' < %s 2>&1 | FileCheck %s + +; CHECK: error:{{.*}}in function kernel void (ptr addrspace(1), i32, ptr addrspace(1)): Impossible to constant fold feature predicate: @llvm.amdgcn.is.gfx803 = private addrspace(1) constant i1 false used by %call = call i1 %1(i1 zeroext false), please simplify. + +@llvm.amdgcn.is.gfx803 = external addrspace(1) externally_initialized constant i1 + +declare void @llvm.amdgcn.s.sleep(i32 immarg) #1 + +define amdgpu_kernel void @kernel(ptr addrspace(1) readnone captures(none) %p.coerce, i32 %x, ptr addrspace(1) %pfn.coerce) { +entry: + %0 = ptrtoint ptr addrspace(1) %pfn.coerce to i64 + %1 = inttoptr i64 %0 to ptr + %2 = ptrtoint ptr addrspace(1) %pfn.coerce to i64 + %3 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx803, align 1 + %call = call i1 %1(i1 zeroext %3) + br i1 %call, label %if.gfx803, label %if.end + +if.gfx803: + call void @llvm.amdgcn.s.sleep(i32 0) + br label %if.end + +if.end: + ret void +} + +attributes #1 = { nocallback nofree nosync nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll new file mode 100644 index 0000000000000..277323c353260 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll @@ -0,0 +1,359 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; REQUIRES: amdgpu-registered-target + +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1010 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1101 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1101 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -mattr=+wavefrontsize64 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201-W64 %s + +;; The IR was derived from the following source: +;; extern "C" __global__ void kernel(int* p, int x) +;; { +;; if (__builtin_amdgcn_processor_is("gfx1201") || +;; __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) +;; __builtin_amdgcn_s_sleep_var(x); +;; if (!__builtin_amdgcn_processor_is("gfx906")) +;; __builtin_amdgcn_s_wait_event_export_ready(); +;; else if (__builtin_amdgcn_processor_is("gfx1010") || +;; __builtin_amdgcn_processor_is("gfx1101")) +;; __builtin_amdgcn_s_ttracedata_imm(1); +;; while (__builtin_amdgcn_processor_is("gfx1101")) *p += x; +;; do { +;; *p -= x; +;; } while (__builtin_amdgcn_processor_is("gfx1010")); +;; for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break; +;; +;; if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)) +;; __builtin_amdgcn_s_wait_event_export_ready(); +;; else if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm)) +;; __builtin_amdgcn_s_ttracedata_imm(1); +;; +;; do { +;; *p -= x; +;; } while (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32)); +;; for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) break; +;; } + +@llvm.amdgcn.is.gfx1201 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.has.gfx12-insts = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.is.gfx906 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.is.gfx1010 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.is.gfx1101 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.has.gfx11-insts = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.has.gfx10-insts = external addrspace(1) externally_initialized constant i1 +@"llvm.amdgcn.has.gfx12-insts,wavefrontsize64" = external addrspace(1) externally_initialized constant i1 + +declare void @llvm.amdgcn.s.sleep.var(i32) +declare void @llvm.amdgcn.s.wait.event.export.ready() +declare void @llvm.amdgcn.s.ttracedata.imm(i16 immarg) + +define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) { +; GFX906-LABEL: define amdgpu_kernel void @kernel( +; GFX906-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX906-NEXT: [[ENTRY:.*:]] +; GFX906-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX906-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX906-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]] +; GFX906: [[IF_GFX1201_OR_GFX12_INSTS1]]: +; GFX906-NEXT: br label %[[IF_NOT_GFX906:.*]] +; GFX906: [[IF_GFX1201_OR_GFX12_INSTS:.*:]] +; GFX906-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) +; GFX906-NEXT: br label %[[IF_NOT_GFX906]] +; GFX906: [[IF_NOT_GFX906]]: +; GFX906-NEXT: br label %[[IF_GFX1010_OR_GFX1102:.*]] +; GFX906: [[IF_NOT_GFX907:.*:]] +; GFX906-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX906-NEXT: br label %[[IF_END6:.*]] +; GFX906: [[IF_GFX1010_OR_GFX1102]]: +; GFX906-NEXT: br label %[[LOR_NOT_GFX1010:.*]] +; GFX906: [[LOR_NOT_GFX1010]]: +; GFX906-NEXT: br label %[[FOR_COND:.*]] +; GFX906: [[IF_GFX1010_OR_GFX1101:.*:]] +; GFX906-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX906-NEXT: br label %[[IF_END6]] +; GFX906: [[IF_END6]]: +; GFX906-NEXT: call void @llvm.assume(i1 true) +; GFX906-NEXT: call void @llvm.assume(i1 true) +; GFX906-NEXT: br label %[[FOR_COND]] +; GFX906: [[FOR_COND]]: +; GFX906-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX906-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX906-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX906-NEXT: br label %[[IF_GFX10_INSTS1:.*]] +; GFX906: [[IF_GFX11_INSTS:.*:]] +; GFX906-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX906-NEXT: br label %[[IF_END11:.*]] +; GFX906: [[IF_GFX10_INSTS1]]: +; GFX906-NEXT: br label %[[IF_END11]] +; GFX906: [[IF_GFX10_INSTS:.*:]] +; GFX906-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX906-NEXT: br label %[[IF_END11]] +; GFX906: [[IF_END11]]: +; GFX906-NEXT: call void @llvm.assume(i1 true) +; GFX906-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX906-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX906-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX906-NEXT: ret void +; +; GFX1010-LABEL: define amdgpu_kernel void @kernel( +; GFX1010-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1010-NEXT: [[ENTRY:.*:]] +; GFX1010-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1010-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1010-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]] +; GFX1010: [[IF_GFX1201_OR_GFX12_INSTS1]]: +; GFX1010-NEXT: br label %[[IF_END:.*]] +; GFX1010: [[IF_GFX1201_OR_GFX12_INSTS:.*:]] +; GFX1010-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) +; GFX1010-NEXT: br label %[[IF_END]] +; GFX1010: [[IF_END]]: +; GFX1010-NEXT: br label %[[IF_NOT_GFX907:.*]] +; GFX1010: [[IF_NOT_GFX907]]: +; GFX1010-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1010-NEXT: br label %[[IF_END6:.*]] +; GFX1010: [[IF_NOT_GFX906:.*:]] +; GFX1010-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]] +; GFX1010: [[LOR_NOT_GFX1010:.*:]] +; GFX1010-NEXT: br label %[[FOR_COND:.*]] +; GFX1010: [[IF_GFX1010_OR_GFX1101]]: +; GFX1010-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1010-NEXT: br label %[[IF_END6]] +; GFX1010: [[IF_END6]]: +; GFX1010-NEXT: call void @llvm.assume(i1 true) +; GFX1010-NEXT: call void @llvm.assume(i1 false) +; GFX1010-NEXT: br label %[[FOR_COND]] +; GFX1010: [[FOR_COND]]: +; GFX1010-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1010-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1010-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1010-NEXT: br label %[[IF_ELSE8:.*]] +; GFX1010: [[IF_GFX11_INSTS:.*:]] +; GFX1010-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1010-NEXT: br label %[[IF_END11:.*]] +; GFX1010: [[IF_ELSE8]]: +; GFX1010-NEXT: br label %[[IF_GFX10_INSTS:.*]] +; GFX1010: [[IF_GFX10_INSTS]]: +; GFX1010-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1010-NEXT: br label %[[IF_END11]] +; GFX1010: [[IF_END11]]: +; GFX1010-NEXT: call void @llvm.assume(i1 true) +; GFX1010-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1010-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1010-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1010-NEXT: ret void +; +; GFX1101-LABEL: define amdgpu_kernel void @kernel( +; GFX1101-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1101-NEXT: [[ENTRY:.*:]] +; GFX1101-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1101-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1101-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]] +; GFX1101: [[IF_GFX1201_OR_GFX12_INSTS1]]: +; GFX1101-NEXT: br label %[[IF_END:.*]] +; GFX1101: [[IF_GFX1201_OR_GFX12_INSTS:.*:]] +; GFX1101-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) +; GFX1101-NEXT: br label %[[IF_END]] +; GFX1101: [[IF_END]]: +; GFX1101-NEXT: br label %[[IF_NOT_GFX907:.*]] +; GFX1101: [[IF_NOT_GFX907]]: +; GFX1101-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1101-NEXT: br label %[[IF_END6:.*]] +; GFX1101: [[IF_NOT_GFX906:.*:]] +; GFX1101-NEXT: br label %[[LOR_NOT_GFX1010:.*]] +; GFX1101: [[LOR_NOT_GFX1010]]: +; GFX1101-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]] +; GFX1101: [[IF_GFX1010_OR_GFX1101]]: +; GFX1101-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1101-NEXT: br label %[[IF_END6]] +; GFX1101: [[IF_END6]]: +; GFX1101-NEXT: call void @llvm.assume(i1 false) +; GFX1101-NEXT: call void @llvm.assume(i1 true) +; GFX1101-NEXT: br label %[[FOR_COND:.*]] +; GFX1101: [[FOR_COND]]: +; GFX1101-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1101-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1101-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1101-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1101: [[IF_GFX11_INSTS]]: +; GFX1101-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1101-NEXT: br label %[[IF_END11:.*]] +; GFX1101: [[IF_ELSE8:.*:]] +; GFX1101-NEXT: br label %[[IF_GFX10_INSTS:.*]] +; GFX1101: [[IF_GFX10_INSTS]]: +; GFX1101-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1101-NEXT: br label %[[IF_END11]] +; GFX1101: [[IF_END11]]: +; GFX1101-NEXT: call void @llvm.assume(i1 true) +; GFX1101-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1101-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1101-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1101-NEXT: ret void +; +; GFX1201-LABEL: define amdgpu_kernel void @kernel( +; GFX1201-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1201-NEXT: [[ENTRY:.*:]] +; GFX1201-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1201-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1201-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]] +; GFX1201: [[LOR_NOT_GFX1201:.*:]] +; GFX1201-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS]] +; GFX1201: [[IF_GFX1201_OR_GFX12_INSTS]]: +; GFX1201-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) +; GFX1201-NEXT: br label %[[IF_END:.*]] +; GFX1201: [[IF_END]]: +; GFX1201-NEXT: br label %[[IF_NOT_GFX907:.*]] +; GFX1201: [[IF_NOT_GFX907]]: +; GFX1201-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-NEXT: br label %[[IF_END6:.*]] +; GFX1201: [[IF_NOT_GFX906:.*:]] +; GFX1201-NEXT: br label %[[IF_GFX1010_OR_GFX1102:.*]] +; GFX1201: [[IF_GFX1010_OR_GFX1102]]: +; GFX1201-NEXT: br label %[[FOR_COND:.*]] +; GFX1201: [[IF_GFX1010_OR_GFX1101:.*:]] +; GFX1201-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1201-NEXT: br label %[[IF_END6]] +; GFX1201: [[IF_END6]]: +; GFX1201-NEXT: call void @llvm.assume(i1 true) +; GFX1201-NEXT: call void @llvm.assume(i1 true) +; GFX1201-NEXT: br label %[[FOR_COND]] +; GFX1201: [[FOR_COND]]: +; GFX1201-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1201-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1201: [[IF_GFX11_INSTS]]: +; GFX1201-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-NEXT: br label %[[IF_END11:.*]] +; GFX1201: [[IF_ELSE8:.*:]] +; GFX1201-NEXT: br label %[[IF_GFX10_INSTS:.*]] +; GFX1201: [[IF_GFX10_INSTS]]: +; GFX1201-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1201-NEXT: br label %[[IF_END11]] +; GFX1201: [[IF_END11]]: +; GFX1201-NEXT: call void @llvm.assume(i1 true) +; GFX1201-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1201-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-NEXT: ret void +; +; GFX1201-W64-LABEL: define amdgpu_kernel void @kernel( +; GFX1201-W64-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1201-W64-NEXT: [[ENTRY:.*:]] +; GFX1201-W64-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1201-W64-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1201-W64-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]] +; GFX1201-W64: [[LOR_NOT_GFX1201:.*:]] +; GFX1201-W64-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS]] +; GFX1201-W64: [[IF_GFX1201_OR_GFX12_INSTS]]: +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) +; GFX1201-W64-NEXT: br label %[[IF_END:.*]] +; GFX1201-W64: [[IF_END]]: +; GFX1201-W64-NEXT: br label %[[IF_NOT_GFX907:.*]] +; GFX1201-W64: [[IF_NOT_GFX907]]: +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-W64-NEXT: br label %[[IF_END6:.*]] +; GFX1201-W64: [[IF_NOT_GFX906:.*:]] +; GFX1201-W64-NEXT: br label %[[IF_GFX1010_OR_GFX1102:.*]] +; GFX1201-W64: [[IF_GFX1010_OR_GFX1102]]: +; GFX1201-W64-NEXT: br label %[[FOR_COND:.*]] +; GFX1201-W64: [[IF_GFX1010_OR_GFX1101:.*:]] +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1201-W64-NEXT: br label %[[IF_END6]] +; GFX1201-W64: [[IF_END6]]: +; GFX1201-W64-NEXT: call void @llvm.assume(i1 true) +; GFX1201-W64-NEXT: call void @llvm.assume(i1 true) +; GFX1201-W64-NEXT: br label %[[FOR_COND]] +; GFX1201-W64: [[FOR_COND]]: +; GFX1201-W64-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1201-W64-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1201-W64: [[IF_GFX11_INSTS]]: +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-W64-NEXT: br label %[[IF_END11:.*]] +; GFX1201-W64: [[IF_ELSE8:.*:]] +; GFX1201-W64-NEXT: br label %[[IF_GFX10_INSTS:.*]] +; GFX1201-W64: [[IF_GFX10_INSTS]]: +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1201-W64-NEXT: br label %[[IF_END11]] +; GFX1201-W64: [[IF_END11]]: +; GFX1201-W64-NEXT: call void @llvm.assume(i1 false) +; GFX1201-W64-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1201-W64-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: ret void +; +entry: + %0 = ptrtoint ptr addrspace(1) %p.coerce to i64 + %1 = inttoptr i64 %0 to ptr + %2 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1201, align 1 + br i1 %2, label %if.gfx1201.or.gfx12-insts, label %lor.not.gfx1201 + +lor.not.gfx1201: + %3 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx12-insts, align 1 + br i1 %3, label %if.gfx1201.or.gfx12-insts, label %if.end + +if.gfx1201.or.gfx12-insts: + call void @llvm.amdgcn.s.sleep.var(i32 %x) + br label %if.end + +if.end: + %4 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx906, align 1 + br i1 %4, label %if.gfx906, label %if.not.gfx906 + +if.not.gfx906: + call void @llvm.amdgcn.s.wait.event.export.ready() + br label %if.end6 + +if.gfx906: + %5 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1010, align 1 + br i1 %5, label %if.gfx1010.or.gfx1101, label %lor.not.gfx1010 + +lor.not.gfx1010: + %6 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1101, align 1 + br i1 %6, label %if.gfx1010.or.gfx1101, label %for.cond + +if.gfx1010.or.gfx1101: + call void @llvm.amdgcn.s.ttracedata.imm(i16 1) + br label %if.end6 + +if.end6: + %.pr.pr = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1101, align 1 + %7 = icmp ne i1 %.pr.pr, true + call void @llvm.assume(i1 %7) + %.pr6.pr = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1010, align 1 + %8 = icmp ne i1 %.pr6.pr, true + call void @llvm.assume(i1 %8) + br label %for.cond + +for.cond: + %.promoted = load i32, ptr %1, align 4 + %sub.peel = sub nsw i32 %.promoted, %x + store i32 %sub.peel, ptr %1, align 4 + %9 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx11-insts, align 1 + br i1 %9, label %if.gfx11-insts, label %if.else8 + +if.gfx11-insts: + call void @llvm.amdgcn.s.wait.event.export.ready() + br label %if.end11 + +if.else8: + %10 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1 + br i1 %10, label %if.gfx10-insts, label %if.end11 + +if.gfx10-insts: + call void @llvm.amdgcn.s.ttracedata.imm(i16 1) + br label %if.end11 + +if.end11: + %.pr7 = load i1, ptr addrspace(1) @"llvm.amdgcn.has.gfx12-insts,wavefrontsize64", align 1 + %11 = icmp ne i1 %.pr7, true + call void @llvm.assume(i1 %11) + %.promoted9 = load i32, ptr %1, align 4 + %sub13.peel = sub nsw i32 %.promoted9, %x + store i32 %sub13.peel, ptr %1, align 4 + ret void +} + +declare void @llvm.assume(i1 noundef) From 8bf116837e2bd77ff5906d025fdb80bfa5507382 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 2 Apr 2025 03:39:32 +0100 Subject: [PATCH 02/38] Fix format. --- clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 8 ++++---- clang/lib/Sema/SemaExpr.cpp | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 7b1a3815144b4..8ad1ab74f221d 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -292,8 +292,8 @@ static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) { P->setConstant(true); P->setExternallyInitialized(true); - return CGF.Builder.CreateLoad(RawAddress(P, PTy, CharUnits::One(), - KnownNonNull)); + return CGF.Builder.CreateLoad( + RawAddress(P, PTy, CharUnits::One(), KnownNonNull)); } Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, @@ -600,7 +600,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_processor_is: { assert(CGM.getTriple().isSPIRV() && "__builtin_amdgcn_processor_is should never reach CodeGen for " - "concrete targets!"); + "concrete targets!"); StringRef Proc = cast(E->getArg(0))->getString(); return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.is." + Proc); } @@ -609,7 +609,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, "__builtin_amdgcn_is_invocable should never reach CodeGen for " "concrete targets!"); auto FD = cast( - cast(E->getArg(0))->getReferencedDeclOfCallee()); + cast(E->getArg(0))->getReferencedDeclOfCallee()); StringRef RF = getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID()); return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.has." + RF); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 24f5262ab3cf4..bd0183ae4fb82 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -20549,14 +20549,16 @@ static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) { (!Sema.getASTContext().getAuxTargetInfo() || !Sema.getASTContext().getAuxTargetInfo()->isValidCPUName(N))) { Sema.Diag(CE->getExprLoc(), - diag::err_amdgcn_processor_is_arg_invalid_value) << N; + diag::err_amdgcn_processor_is_arg_invalid_value) + << N; return false; } } else { auto Arg = CE->getArg(0); if (!Arg || Arg->getType() != Sema.getASTContext().BuiltinFnTy) { Sema.Diag(CE->getExprLoc(), - diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg; + diag::err_amdgcn_is_invocable_arg_invalid_value) + << Arg; return false; } } @@ -20568,10 +20570,9 @@ static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) { if (auto UO = dyn_cast(E)) { auto SE = dyn_cast(UO->getSubExpr()); if (IsAMDGPUPredicateBI(SE)) { - assert( - UO->getOpcode() == UnaryOperator::Opcode::UO_LNot && - "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable " - "can only be used as operands of logical ops!"); + assert(UO->getOpcode() == UnaryOperator::Opcode::UO_LNot && + "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable " + "can only be used as operands of logical ops!"); if (!ValidateAMDGPUPredicateBI(Sema, SE)) { Invalid = true; @@ -20588,10 +20589,9 @@ static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) { auto LHS = dyn_cast(BO->getLHS()); auto RHS = dyn_cast(BO->getRHS()); if (IsAMDGPUPredicateBI(LHS) && IsAMDGPUPredicateBI(RHS)) { - assert( - BO->isLogicalOp() && - "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable " - "can only be used as operands of logical ops!"); + assert(BO->isLogicalOp() && + "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable " + "can only be used as operands of logical ops!"); if (!ValidateAMDGPUPredicateBI(Sema, LHS) || !ValidateAMDGPUPredicateBI(Sema, RHS)) { From 3421292b6e3261410734fb5a324f7dec79080fc1 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 2 Apr 2025 03:42:24 +0100 Subject: [PATCH 03/38] Fix broken patch merge. --- .../AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 159 ++++++++++++++ .../AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp | 207 ------------------ 2 files changed, 159 insertions(+), 207 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp new file mode 100644 index 0000000000000..17357c452b6d3 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp @@ -0,0 +1,159 @@ +//===- AMDGPUExpandFeaturePredicates.cpp - Feature Predicate Expander Pass ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file implements a pass that deals with expanding AMDGCN generic feature +// predicates into target specific quantities / sequences. In this context, a +// generic feature predicate is an implementation detail global variable that +// is inserted by the FE as a consequence of using either the __builtin_cpu_is +// or the __builtin_amdgcn_is_invocable special builtins on an abstract target +// (AMDGCNSPIRV). These placeholder globals are used to guide target specific +// lowering, once the concrete target is known, by way of constant folding their +// value all the way into a terminator (i.e. a controlled block) or into a no +// live use scenario. The pass makes a best effort attempt to look through +// calls, i.e. a constant evaluatable passthrough of a predicate value will +// generally work, however we hard fail if the folding fails, to avoid obtuse +// BE errors or opaque run time errors. This pass should run as early as +// possible / immediately after Clang CodeGen, so that the optimisation pipeline +// and the BE operate with concrete target data. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" + +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetIntrinsicInfo.h" +#include "llvm/Transforms/Utils/Local.h" + +#include +#include + +using namespace llvm; + +namespace { +template +void collectUsers(Value *V, C &Container) { + assert(V && "Must pass an existing Value!"); + + for (auto &&U : V->users()) + if (auto I = dyn_cast(U)) + Container.insert(Container.end(), I); +} + +inline void setPredicate(const GCNSubtarget &ST, GlobalVariable *P) { + const auto IsFeature = P->getName().starts_with("llvm.amdgcn.has"); + const auto Offset = + IsFeature ? sizeof("llvm.amdgcn.has") : sizeof("llvm.amdgcn.is"); + + auto PV = P->getName().substr(Offset).str(); + if (IsFeature) { + auto Dx = PV.find(','); + while (Dx != std::string::npos) { + PV.insert(++Dx, {'+'}); + + Dx = PV.find(',', Dx); + } + PV.insert(PV.cbegin(), '+'); + } + + auto PTy = P->getValueType(); + P->setLinkage(GlobalValue::PrivateLinkage); + P->setExternallyInitialized(false); + + if (IsFeature) + P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures(PV))); + else + P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU())); +} + +std::pair +unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) { + std::string W; + raw_string_ostream OS(W); + + OS << "Impossible to constant fold feature predicate: " << *P + << " used by " << *NoFold << ", please simplify.\n"; + + Caller->getContext().diagnose( + DiagnosticInfoUnsupported(*Caller, W, NoFold->getDebugLoc(), DS_Error)); + + return {PreservedAnalyses::none(), false}; +} + +std::pair +handlePredicate(const GCNSubtarget &ST, GlobalVariable *P) { + setPredicate(ST, P); + + SmallPtrSet ToFold; + collectUsers(P, ToFold); + + if (ToFold.empty()) + return {PreservedAnalyses::all(), true}; + + do { + auto I = *ToFold.begin(); + ToFold.erase(I); + + if (auto C = ConstantFoldInstruction(I, P->getDataLayout())) { + collectUsers(I, ToFold); + I->replaceAllUsesWith(C); + I->eraseFromParent(); + continue; + } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) { + continue; + } else if (I->users().empty()) { + continue; + } + + return unfoldableFound(I->getParent()->getParent(), P, I); + } while (!ToFold.empty()); + + return {PreservedAnalyses::none(), true}; +} +} // Unnamed namespace. + +PreservedAnalyses +AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) { + if (M.empty()) + return PreservedAnalyses::all(); + + SmallVector Predicates; + for (auto &&G : M.globals()) { + if (!G.isDeclaration() || !G.hasName()) + continue; + if (G.getName().starts_with("llvm.amdgcn.")) + Predicates.push_back(&G); + } + + if (Predicates.empty()) + return PreservedAnalyses::all(); + + const auto &ST = TM.getSubtarget( + *find_if(M, [](auto &&F) { return !F.isIntrinsic(); })); + + auto Ret = PreservedAnalyses::all(); + for (auto &&P : Predicates) { + auto R = handlePredicate(ST, P); + + if (!R.second) + break; + + Ret.intersect(R.first); + } + + for (auto &&P : Predicates) + P->eraseFromParent(); + + return Ret; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp deleted file mode 100644 index 125051c6aa0cf..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUExpandPseudoIntrinsics.cpp +++ /dev/null @@ -1,207 +0,0 @@ -//===- AMDGPUExpandPseudoIntrinsics.cpp - Pseudo Intrinsic Expander Pass --===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// This file implements a pass that deals with expanding AMDGCN generic pseudo- -// intrinsics into target specific quantities / sequences. In this context, a -// pseudo-intrinsic is an AMDGCN intrinsic that does not directly map to a -// specific instruction, but rather is intended as a mechanism for abstractly -// conveying target specific info to a HLL / the FE, without concretely -// impacting the AST. An example of such an intrinsic is amdgcn.wavefrontsize. -// This pass should run as early as possible / immediately after Clang CodeGen, -// so that the optimisation pipeline and the BE operate with concrete target -// data. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUTargetMachine.h" -#include "GCNSubtarget.h" - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/ConstantFolding.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Target/TargetIntrinsicInfo.h" -#include "llvm/Transforms/IPO/AlwaysInliner.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/Local.h" - -#include -#include - -using namespace llvm; - -namespace { -inline Function *getCloneForInlining(Function *OldF) { - assert(OldF && "Must pass an existing Function!"); - - // TODO - Alias Value to clone arg. - ValueToValueMapTy VMap; - - auto NewF = CloneFunction(OldF, VMap); - - NewF->removeFnAttr(Attribute::OptimizeNone); - NewF->removeFnAttr(Attribute::NoInline); - NewF->addFnAttr(Attribute::AlwaysInline); - - return NewF; -} - -template -inline void collectUsers(Value *V, ModulePassManager &AlwaysInliner, - ModuleAnalysisManager &MAM, - SmallDenseMap &InlinableClones, - C &Container) { - assert(V && "Must pass an existing Value!"); - - auto A = PreservedAnalyses::all(); - - constexpr auto IsValidCall = [](auto &&U) { - if (auto CB = dyn_cast(U)) - if (auto F = CB->getCalledFunction()) - if (!F->isIntrinsic() && !F->isDeclaration()) - return true; - return false; - }; - - SmallVector Calls{}; - copy_if(V->users(), std::back_inserter(Calls), IsValidCall); - - while (!Calls.empty()) { - for (auto &&Call : Calls) { - auto CB = cast(Call); - auto &TempF = InlinableClones[CB->getCalledFunction()]; - - if (!TempF) - TempF = getCloneForInlining(CB->getCalledFunction()); - - CB->setCalledFunction(TempF); - CB->removeFnAttr(Attribute::NoInline); - CB->addFnAttr(Attribute::AlwaysInline); - - AlwaysInliner.run(*TempF->getParent(), MAM); - } - - Calls.clear(); - - copy_if(V->users(), std::back_inserter(Calls), IsValidCall); - } - - for (auto &&U : V->users()) - if (auto I = dyn_cast(U)) { - if (auto CB = dyn_cast(I)) { - if (CB->getCalledFunction() && !CB->getCalledFunction()->isIntrinsic()) - Container.insert(Container.end(), I); - } else { - Container.insert(Container.end(), I); - } - } -} - -std::pair -handlePredicate(const GCNSubtarget &ST, ModuleAnalysisManager &MAM, - SmallDenseMap& InlinableClones, - GlobalVariable *P) { - auto PV = P->getName().substr(P->getName().rfind('.') + 1).str(); - auto Dx = PV.find(','); - while (Dx != std::string::npos) { - PV.insert(++Dx, {'+'}); - - Dx = PV.find(',', Dx); - } - - auto PTy = P->getValueType(); - P->setLinkage(GlobalValue::PrivateLinkage); - P->setExternallyInitialized(false); - - if (P->getName().starts_with("llvm.amdgcn.is")) - P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU())); - else - P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures('+' + PV))); - - ModulePassManager MPM; - MPM.addPass(AlwaysInlinerPass()); - - SmallPtrSet ToFold; - collectUsers(P, MPM, MAM, InlinableClones, ToFold); - - if (ToFold.empty()) - return {PreservedAnalyses::all(), true}; - - do { - auto I = *ToFold.begin(); - ToFold.erase(I); - - if (auto C = ConstantFoldInstruction(I, P->getDataLayout())) { - collectUsers(I, MPM, MAM, InlinableClones, ToFold); - I->replaceAllUsesWith(C); - I->eraseFromParent(); - continue; - } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) { - continue; - } else if (I->users().empty()) { - continue; - } - - std::string W; - raw_string_ostream OS(W); - - auto Caller = I->getParent()->getParent(); - - OS << "Impossible to constant fold feature predicate: " << P->getName() - << ", please simplify.\n"; - - Caller->getContext().diagnose( - DiagnosticInfoUnsupported(*Caller, W, I->getDebugLoc(), DS_Error)); - - return {PreservedAnalyses::none(), false}; - } while (!ToFold.empty()); - - return {PreservedAnalyses::none(), true}; -} -} // Unnamed namespace. - -PreservedAnalyses -AMDGPUExpandPseudoIntrinsicsPass::run(Module &M, ModuleAnalysisManager &MAM) { - if (M.empty()) - return PreservedAnalyses::all(); - - SmallVector Predicates; - for (auto &&G : M.globals()) { - if (!G.isDeclaration() || !G.hasName()) - continue; - if (G.getName().starts_with("llvm.amdgcn.")) - Predicates.push_back(&G); - } - - if (Predicates.empty()) - return PreservedAnalyses::all(); - - PreservedAnalyses Ret = PreservedAnalyses::all(); - - SmallDenseMap InlinableClones; - const auto &ST = TM.getSubtarget( - *find_if(M, [](auto &&F) { return !F.isIntrinsic(); })); - - for (auto &&P : Predicates) { - auto R = handlePredicate(ST, MAM, InlinableClones, P); - - if (!R.second) - return PreservedAnalyses::none(); - - Ret.intersect(R.first); - } - - for (auto &&C : InlinableClones) - C.second->eraseFromParent(); - - return Ret; -} From 539c7e6c6357fa7330de9e23fa13cf795061b85b Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 2 Apr 2025 03:51:08 +0100 Subject: [PATCH 04/38] Add release notes. --- clang/docs/ReleaseNotes.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c4e82678949ff..005b33da29d2d 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -418,6 +418,10 @@ AMDGPU Support ^^^^^^^^^^^^^^ - Bump the default code object version to 6. ROCm 6.3 is required to run any program compiled with COV6. +- Introduced a new target specific builtin ``__builtin_amdgcn_processor_is``, + a late / deferred query for the current target processor +- Introduced a new target specific builtin ``__builtin_amdgcn_is_invocable``, + which enables fine-grained, per-builtin, feature availability NVPTX Support ^^^^^^^^^^^^^^ From 5926b9f715fce59e753756f5330f311e3f916667 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 2 Apr 2025 03:55:39 +0100 Subject: [PATCH 05/38] (Hopefully) Final format fix. --- .../Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp index 17357c452b6d3..8d38508eda74b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp @@ -42,8 +42,7 @@ using namespace llvm; namespace { -template -void collectUsers(Value *V, C &Container) { +template void collectUsers(Value *V, C &Container) { assert(V && "Must pass an existing Value!"); for (auto &&U : V->users()) @@ -82,8 +81,8 @@ unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) { std::string W; raw_string_ostream OS(W); - OS << "Impossible to constant fold feature predicate: " << *P - << " used by " << *NoFold << ", please simplify.\n"; + OS << "Impossible to constant fold feature predicate: " << *P << " used by " + << *NoFold << ", please simplify.\n"; Caller->getContext().diagnose( DiagnosticInfoUnsupported(*Caller, W, NoFold->getDebugLoc(), DS_Error)); @@ -91,8 +90,8 @@ unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) { return {PreservedAnalyses::none(), false}; } -std::pair -handlePredicate(const GCNSubtarget &ST, GlobalVariable *P) { +std::pair handlePredicate(const GCNSubtarget &ST, + GlobalVariable *P) { setPredicate(ST, P); SmallPtrSet ToFold; From 4381d930084f38d9e4099d8c8fbea0e4267556a9 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 2 Apr 2025 04:01:27 +0100 Subject: [PATCH 06/38] Remove stray space. --- llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp index 8d38508eda74b..6d6c457170f8c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp @@ -81,7 +81,7 @@ unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) { std::string W; raw_string_ostream OS(W); - OS << "Impossible to constant fold feature predicate: " << *P << " used by " + OS << "Impossible to constant fold feature predicate: " << *P << " used by " << *NoFold << ", please simplify.\n"; Caller->getContext().diagnose( From d18f64e455f0d3b91c013bd0d99e895adc57fcad Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 2 Apr 2025 11:01:59 +0100 Subject: [PATCH 07/38] Remove unused header, fix borked test. --- ...pu-builtin-cpu-is.c => amdgpu-builtin-processor-is.c} | 9 +++------ llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 1 - 2 files changed, 3 insertions(+), 7 deletions(-) rename clang/test/CodeGen/{amdgpu-builtin-cpu-is.c => amdgpu-builtin-processor-is.c} (92%) diff --git a/clang/test/CodeGen/amdgpu-builtin-cpu-is.c b/clang/test/CodeGen/amdgpu-builtin-processor-is.c similarity index 92% rename from clang/test/CodeGen/amdgpu-builtin-cpu-is.c rename to clang/test/CodeGen/amdgpu-builtin-processor-is.c index 6e261d9f5d239..f5d80bff1c51e 100644 --- a/clang/test/CodeGen/amdgpu-builtin-cpu-is.c +++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c @@ -10,10 +10,6 @@ // 3) for AMDGCNSPIRV we emit llvm.amdgcn.is.gfx900 as a bool global, and // load from it to provide the condition a br (abstract target) //. -// AMDGCN-GFX900: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600 -//. -// AMDGCN-GFX1010: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600 -//. // AMDGCNSPIRV: @llvm.amdgcn.is.gfx900 = external addrspace(1) externally_initialized constant i1 //. // AMDGCN-GFX900-LABEL: define dso_local void @foo( @@ -31,7 +27,8 @@ // AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { // AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] // AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx900, align 1 -// AMDGCNSPIRV-NEXT: br i1 [[TMP0]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// AMDGCNSPIRV-NEXT: [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] // AMDGCNSPIRV: [[IF_THEN]]: // AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.trap() // AMDGCNSPIRV-NEXT: br label %[[IF_END]] @@ -39,7 +36,7 @@ // AMDGCNSPIRV-NEXT: ret void // void foo() { - if (__builtin_cpu_is("gfx900")) + if (__builtin_amdgcn_processor_is("gfx900")) return __builtin_trap(); } //. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp index 6d6c457170f8c..ae100e2f5b213 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp @@ -33,7 +33,6 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" -#include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Transforms/Utils/Local.h" #include From 7880ff498495511c70952c0a135b5e9f9b837889 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 2 Apr 2025 15:09:48 +0100 Subject: [PATCH 08/38] Stars everywhere. --- clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 4 +-- clang/lib/Sema/SemaExpr.cpp | 30 +++++++++---------- .../AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 8 ++--- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 8ad1ab74f221d..179b9ad02177b 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -287,7 +287,7 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst, static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) { auto PTy = IntegerType::getInt1Ty(CGF.getLLVMContext()); - auto P = cast( + auto *P = cast( CGF.CGM.getModule().getOrInsertGlobal(Name.str(), PTy)); P->setConstant(true); P->setExternallyInitialized(true); @@ -608,7 +608,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, assert(CGM.getTriple().isSPIRV() && "__builtin_amdgcn_is_invocable should never reach CodeGen for " "concrete targets!"); - auto FD = cast( + auto *FD = cast( cast(E->getArg(0))->getReferencedDeclOfCallee()); StringRef RF = getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID()); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index bd0183ae4fb82..44fd9aa1f1834 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -6546,11 +6546,11 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, // without any additional checking. if (Fn->getType() == Context.BuiltinFnTy && ArgExprs.size() == 1 && ArgExprs[0]->getType() == Context.BuiltinFnTy) { - auto FD = cast(Fn->getReferencedDeclOfCallee()); + auto *FD = cast(Fn->getReferencedDeclOfCallee()); if (FD->getName() == "__builtin_amdgcn_is_invocable") { auto FnPtrTy = Context.getPointerType(FD->getType()); - auto R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get(); + auto *R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get(); return CallExpr::Create(Context, R, ArgExprs, Context.VoidTy, ExprValueKind::VK_PRValue, RParenLoc, FPOptionsOverride()); @@ -13254,8 +13254,8 @@ static inline bool IsAMDGPUPredicateBI(Expr *E) { if (!E->getType()->isVoidType()) return false; - if (auto CE = dyn_cast(E)) { - if (auto BI = CE->getDirectCallee()) + if (auto *CE = dyn_cast(E)) { + if (auto *BI = CE->getDirectCallee()) if (BI->getName() == "__builtin_amdgcn_processor_is" || BI->getName() == "__builtin_amdgcn_is_invocable") return true; @@ -15622,14 +15622,14 @@ static Expr *ExpandAMDGPUPredicateBI(ASTContext &Ctx, CallExpr *CE) { auto &TI = Ctx.getTargetInfo(); if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") { - auto GFX = dyn_cast(CE->getArg(0)->IgnoreParenCasts()); + auto *GFX = dyn_cast(CE->getArg(0)->IgnoreParenCasts()); auto TID = TI.getTargetID(); if (GFX && TID) { auto N = GFX->getString(); P = TI.isValidCPUName(GFX->getString()) && TID->find(N) == 0; } } else { - auto FD = cast(CE->getArg(0)->getReferencedDeclOfCallee()); + auto *FD = cast(CE->getArg(0)->getReferencedDeclOfCallee()); StringRef RF = Ctx.BuiltinInfo.getRequiredFeatures(FD->getBuiltinID()); llvm::StringMap CF; @@ -20538,7 +20538,7 @@ void Sema::DiagnoseEqualityWithExtraParens(ParenExpr *ParenE) { static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) { if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") { - auto GFX = dyn_cast(CE->getArg(0)->IgnoreParenCasts()); + auto *GFX = dyn_cast(CE->getArg(0)->IgnoreParenCasts()); if (!GFX) { Sema.Diag(CE->getExprLoc(), diag::err_amdgcn_processor_is_arg_not_literal); @@ -20554,7 +20554,7 @@ static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) { return false; } } else { - auto Arg = CE->getArg(0); + auto *Arg = CE->getArg(0); if (!Arg || Arg->getType() != Sema.getASTContext().BuiltinFnTy) { Sema.Diag(CE->getExprLoc(), diag::err_amdgcn_is_invocable_arg_invalid_value) @@ -20567,8 +20567,8 @@ static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) { } static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) { - if (auto UO = dyn_cast(E)) { - auto SE = dyn_cast(UO->getSubExpr()); + if (auto *UO = dyn_cast(E)) { + auto *SE = dyn_cast(UO->getSubExpr()); if (IsAMDGPUPredicateBI(SE)) { assert(UO->getOpcode() == UnaryOperator::Opcode::UO_LNot && "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable " @@ -20585,9 +20585,9 @@ static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) { return UO; } } - if (auto BO = dyn_cast(E)) { - auto LHS = dyn_cast(BO->getLHS()); - auto RHS = dyn_cast(BO->getRHS()); + if (auto *BO = dyn_cast(E)) { + auto *LHS = dyn_cast(BO->getLHS()); + auto *RHS = dyn_cast(BO->getRHS()); if (IsAMDGPUPredicateBI(LHS) && IsAMDGPUPredicateBI(RHS)) { assert(BO->isLogicalOp() && "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable " @@ -20606,7 +20606,7 @@ static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) { return BO; } } - if (auto CE = dyn_cast(E)) + if (auto *CE = dyn_cast(E)) if (IsAMDGPUPredicateBI(CE)) { if (!ValidateAMDGPUPredicateBI(Sema, CE)) { Invalid = true; @@ -20631,7 +20631,7 @@ ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E, if (!E->isTypeDependent()) { if (E->getType()->isVoidType()) { bool IsInvalidPredicate = false; - if (auto BIC = MaybeHandleAMDGPUPredicateBI(*this, E, IsInvalidPredicate)) + if (auto *BIC = MaybeHandleAMDGPUPredicateBI(*this, E, IsInvalidPredicate)) return BIC; else if (IsInvalidPredicate) return ExprError(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp index ae100e2f5b213..f1c73e86fb4a0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp @@ -45,7 +45,7 @@ template void collectUsers(Value *V, C &Container) { assert(V && "Must pass an existing Value!"); for (auto &&U : V->users()) - if (auto I = dyn_cast(U)) + if (auto *I = dyn_cast(U)) Container.insert(Container.end(), I); } @@ -65,7 +65,7 @@ inline void setPredicate(const GCNSubtarget &ST, GlobalVariable *P) { PV.insert(PV.cbegin(), '+'); } - auto PTy = P->getValueType(); + auto *PTy = P->getValueType(); P->setLinkage(GlobalValue::PrivateLinkage); P->setExternallyInitialized(false); @@ -100,10 +100,10 @@ std::pair handlePredicate(const GCNSubtarget &ST, return {PreservedAnalyses::all(), true}; do { - auto I = *ToFold.begin(); + auto *I = *ToFold.begin(); ToFold.erase(I); - if (auto C = ConstantFoldInstruction(I, P->getDataLayout())) { + if (auto *C = ConstantFoldInstruction(I, P->getDataLayout())) { collectUsers(I, ToFold); I->replaceAllUsesWith(C); I->eraseFromParent(); From 719dfdea50ae31ac54040a95d499dae98f714a52 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 2 Apr 2025 15:33:31 +0100 Subject: [PATCH 09/38] Fix format without line break. --- clang/lib/Sema/SemaExpr.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 44fd9aa1f1834..889d54be8d91b 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -20630,10 +20630,10 @@ ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E, if (!E->isTypeDependent()) { if (E->getType()->isVoidType()) { - bool IsInvalidPredicate = false; - if (auto *BIC = MaybeHandleAMDGPUPredicateBI(*this, E, IsInvalidPredicate)) + bool InvalidPredicate = false; + if (auto *BIC = MaybeHandleAMDGPUPredicateBI(*this, E, InvalidPredicate)) return BIC; - else if (IsInvalidPredicate) + else if (InvalidPredicate) return ExprError(); } From 36b69b41f9d92901b1799bd8515ef4d8c9a41f51 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 2 Apr 2025 15:40:37 +0100 Subject: [PATCH 10/38] Add host tests. --- clang/test/SemaHIP/amdgpu-is-invocable.hip | 21 +++++++++++++++++++++ clang/test/SemaHIP/amdgpu-processor-is.hip | 21 +++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 clang/test/SemaHIP/amdgpu-is-invocable.hip create mode 100644 clang/test/SemaHIP/amdgpu-processor-is.hip diff --git a/clang/test/SemaHIP/amdgpu-is-invocable.hip b/clang/test/SemaHIP/amdgpu-is-invocable.hip new file mode 100644 index 0000000000000..214d7769a595f --- /dev/null +++ b/clang/test/SemaHIP/amdgpu-is-invocable.hip @@ -0,0 +1,21 @@ +// REQUIRES: amdgpu-registered-target +// REQUIRES: spirv-registered-target +// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s + +// expected-no-diagnostics + +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) + +__device__ void foo() { + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16)) + return __builtin_trap(); +} + +__global__ void bar() { + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16)) + return __builtin_trap(); +} diff --git a/clang/test/SemaHIP/amdgpu-processor-is.hip b/clang/test/SemaHIP/amdgpu-processor-is.hip new file mode 100644 index 0000000000000..0f7211fd75d90 --- /dev/null +++ b/clang/test/SemaHIP/amdgpu-processor-is.hip @@ -0,0 +1,21 @@ +// REQUIRES: amdgpu-registered-target +// REQUIRES: spirv-registered-target +// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s + +// expected-no-diagnostics + +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) + +__device__ void foo() { + if (__builtin_amdgcn_processor_is("gfx900")) + return __builtin_trap(); +} + +__global__ void bar() { + if (__builtin_amdgcn_processor_is("gfx900")) + return __builtin_trap(); +} From e327e1520b2453e69d888d1be3d5c68c40a0456a Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 2 Apr 2025 16:48:04 +0100 Subject: [PATCH 11/38] Fit code examples within 80-char limit. --- clang/docs/LanguageExtensions.rst | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 8a7cb75af13e5..817f6a62f6a41 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -4956,7 +4956,9 @@ a functional mechanism for programatically querying: while (__builtin_amdgcn_processor_is("gfx1101")) *p += x; - do { *p -= x; } while (__builtin_amdgcn_processor_is("gfx1010")); + do { + *p -= x; + } while (__builtin_amdgcn_processor_is("gfx1010")); for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break; @@ -4967,9 +4969,11 @@ a functional mechanism for programatically querying: do { *p -= x; - } while (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32)); + } while ( + __builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32)); - for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) break; + for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) + break; **Description**: From 888a0803db90e38a6d912b7d019b27196eee3bf3 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 16 Apr 2025 03:35:14 +0300 Subject: [PATCH 12/38] Fix tests. --- clang/test/CodeGen/amdgpu-builtin-is-invocable.c | 2 +- clang/test/CodeGen/amdgpu-builtin-processor-is.c | 2 +- clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c index 6d2690cb75b7c..12f283707308e 100644 --- a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c +++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c @@ -47,7 +47,7 @@ void foo() { // AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" } // AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } //. -// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" } +// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" } // AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } //. // AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} diff --git a/clang/test/CodeGen/amdgpu-builtin-processor-is.c b/clang/test/CodeGen/amdgpu-builtin-processor-is.c index f5d80bff1c51e..76dead8ebbe89 100644 --- a/clang/test/CodeGen/amdgpu-builtin-processor-is.c +++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c @@ -45,7 +45,7 @@ void foo() { //. // AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" } //. -// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize32,+wavefrontsize64" } +// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" } // AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } //. // AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} diff --git a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp index f618f54909b00..26cc8b4c7631d 100644 --- a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp +++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp @@ -32,11 +32,11 @@ void invalid_invocations(int x, const char* str) { // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a string literal if (__builtin_amdgcn_processor_is(str)) return; - // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `"__builtin_amdgcn_s_sleep_var"` is not valid + // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}__builtin_amdgcn_s_sleep_var{{.*}} is not valid if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return; - // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `str` is not valid + // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}str{{.*}} is not valid else if (__builtin_amdgcn_is_invocable(str)) return; - // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; `x` is not valid + // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}x{{.*}} is not valid else if (__builtin_amdgcn_is_invocable(x)) return; // CHECK: error: use of undeclared identifier '__builtin_ia32_pause' else if (__builtin_amdgcn_is_invocable(__builtin_ia32_pause)) return; From e35ac6281f1b22539e4771dfd2893bdabeb452b6 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 16 Apr 2025 15:31:30 +0300 Subject: [PATCH 13/38] Fix test. --- .../CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp b/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp index 44557284fc581..cffd3c7a5fb1f 100644 --- a/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp +++ b/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp @@ -7,7 +7,7 @@ // HIPSTDPAR-PRE: Running pass: EntryExitInstrumenterPass // HIPSTDPAR-PRE-NEXT: Running pass: EntryExitInstrumenterPass // HIPSTDPAR-PRE-NOT: Running pass: HipStdParAcceleratorCodeSelectionPass -// HIPSTDPAR-PRE-NEXT: Running pass: AlwaysInlinerPass +// HIPSTDPAR-PRE-NEXT: Running pass: AMDGPUExpandFeaturePredicatesPass // Ensure Pass HipStdParAcceleratorCodeSelectionPass is invoked in PostLink. // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -mllvm -amdgpu-enable-hipstdpar -fcuda-is-device -fdebug-pass-manager -emit-llvm \ From a8bca2fe2c054187981afcfca155e95efde26447 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 6 May 2025 01:47:53 +0100 Subject: [PATCH 14/38] Re-work implementation to return a target specific type. --- clang/docs/LanguageExtensions.rst | 61 ++------ clang/include/clang/Basic/AMDGPUTypes.def | 8 + clang/include/clang/Basic/Builtins.def | 1 + clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 +- .../clang/Basic/DiagnosticSemaKinds.td | 9 +- clang/include/clang/Sema/SemaAMDGPU.h | 4 + clang/lib/AST/ASTContext.cpp | 11 +- clang/lib/CodeGen/CGDebugInfo.cpp | 7 + clang/lib/CodeGen/CGExprScalar.cpp | 4 + clang/lib/CodeGen/CodeGenTypes.cpp | 3 + clang/lib/Sema/Sema.cpp | 7 +- clang/lib/Sema/SemaAMDGPU.cpp | 60 +++++++ clang/lib/Sema/SemaCast.cpp | 9 ++ clang/lib/Sema/SemaDecl.cpp | 15 ++ clang/lib/Sema/SemaExpr.cpp | 147 +----------------- clang/lib/Sema/SemaInit.cpp | 16 ++ clang/lib/Sema/SemaOverload.cpp | 14 +- .../amdgpu-feature-builtins-invalid-use.cpp | 41 +++-- 18 files changed, 209 insertions(+), 212 deletions(-) diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 0c8dd564aed4a..da8b16501d00a 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -4950,12 +4950,8 @@ a functional mechanism for programatically querying: .. code-block:: c - // When used as the predicate for a control structure - bool __builtin_amdgcn_processor_is(const char*); - bool __builtin_amdgcn_is_invocable(builtin_name); - // Otherwise - void __builtin_amdgcn_processor_is(const char*); - void __builtin_amdgcn_is_invocable(void); + __amdgpu_feature_predicate_t __builtin_amdgcn_processor_is(const char*); + __amdgpu_feature_predicate_t __builtin_amdgcn_is_invocable(builtin_name); **Example of use**: @@ -4974,7 +4970,7 @@ a functional mechanism for programatically querying: while (__builtin_amdgcn_processor_is("gfx1101")) *p += x; do { - *p -= x; + break; } while (__builtin_amdgcn_processor_is("gfx1010")); for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break; @@ -4985,7 +4981,7 @@ a functional mechanism for programatically querying: __builtin_amdgcn_s_ttracedata_imm(1); do { - *p -= x; + break; } while ( __builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32)); @@ -4994,17 +4990,21 @@ a functional mechanism for programatically querying: **Description**: -When used as the predicate value of the following control structures: +The builtins return a value of type ``__amdgpu_feature_predicate_t``, which is a +target specific type that behaves as if its C++ definition was the following: .. code-block:: c++ - if (...) - while (...) - do { } while (...) - for (...) + struct __amdgpu_feature_predicate_t { + __amdgpu_feature_predicate_t() = delete; + __amdgpu_feature_predicate_t(const __amdgpu_feature_predicate_t&) = delete; + __amdgpu_feature_predicate_t(__amdgpu_feature_predicate_t&&) = delete; + + explicit + operator bool() const noexcept; + }; -be it directly, or as arguments to logical operators such as ``!, ||, &&``, the -builtins return a boolean value that: +The boolean interpretation of the predicate values returned by the builtins: * indicates whether the current target matches the argument; the argument MUST be a string literal and a valid AMDGPU target @@ -5012,37 +5012,6 @@ builtins return a boolean value that: by the current target; the argument MUST be either a generic or AMDGPU specific builtin name -Outside of these contexts, the builtins have a ``void`` returning signature -which prevents their misuse. - -**Example of invalid use**: - -.. code-block:: c++ - - void kernel(int* p, int x, bool (*pfn)(bool), const char* str) { - if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return; - else if (__builtin_amdgcn_processor_is(str)) __builtin_trap(); - - bool a = __builtin_amdgcn_processor_is("gfx906"); - const bool b = !__builtin_amdgcn_processor_is("gfx906"); - const bool c = !__builtin_amdgcn_processor_is("gfx906"); - bool d = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); - bool e = !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); - const auto f = - !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready) - || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); - const auto g = - !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready) - || !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); - __builtin_amdgcn_processor_is("gfx1201") - ? __builtin_amdgcn_s_sleep_var(x) : __builtin_amdgcn_s_sleep(42); - if (pfn(__builtin_amdgcn_processor_is("gfx1200"))) - __builtin_amdgcn_s_sleep_var(x); - - if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return; - else if (__builtin_amdgcn_is_invocable(x)) __builtin_trap(); - } - When invoked while compiling for a concrete target, the builtins are evaluated early by Clang, and never produce any CodeGen effects / have no observable side-effects in IR. Conversely, when compiling for AMDGCN flavoured SPIR-v, diff --git a/clang/include/clang/Basic/AMDGPUTypes.def b/clang/include/clang/Basic/AMDGPUTypes.def index d3dff446f9edf..a0574c640184b 100644 --- a/clang/include/clang/Basic/AMDGPUTypes.def +++ b/clang/include/clang/Basic/AMDGPUTypes.def @@ -20,10 +20,18 @@ AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) #endif +#ifndef AMDGPU_FEATURE_PREDICATE_TYPE +#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align) \ + AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) +#endif + AMDGPU_OPAQUE_PTR_TYPE("__amdgpu_buffer_rsrc_t", AMDGPUBufferRsrc, AMDGPUBufferRsrcTy, 128, 128, 8) AMDGPU_NAMED_BARRIER_TYPE("__amdgpu_named_workgroup_barrier_t", AMDGPUNamedWorkgroupBarrier, AMDGPUNamedWorkgroupBarrierTy, 128, 32, 0) +AMDGPU_FEATURE_PREDICATE_TYPE("__amdgpu_feature_predicate_t", AMDGPUFeaturePredicate, AMDGPUFeaturePredicateTy, 1, 1) + #undef AMDGPU_TYPE #undef AMDGPU_OPAQUE_PTR_TYPE #undef AMDGPU_NAMED_BARRIER_TYPE +#undef AMDGPU_FEATURE_PREDICATE_TYPE diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def index 48437c9397570..27f78af16fe06 100644 --- a/clang/include/clang/Basic/Builtins.def +++ b/clang/include/clang/Basic/Builtins.def @@ -34,6 +34,7 @@ // Q -> target builtin type, followed by a character to distinguish the builtin type // Qa -> AArch64 svcount_t builtin type. // Qb -> AMDGPU __amdgpu_buffer_rsrc_t builtin type. +// Qc -> AMDGPU __amdgpu_feature_predicate_t builtin type. // E -> ext_vector, followed by the number of elements and the base type. // X -> _Complex, followed by the base type. // Y -> ptrdiff_t diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 3d53223e3a5a4..b57b315b87790 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -351,8 +351,8 @@ BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n") // These are special FE only builtins intended for forwarding the requirements // to the ME. -BUILTIN(__builtin_amdgcn_processor_is, "vcC*", "nctu") -BUILTIN(__builtin_amdgcn_is_invocable, "v", "nctu") +BUILTIN(__builtin_amdgcn_processor_is, "QccC*", "nctu") +BUILTIN(__builtin_amdgcn_is_invocable, "Qc", "nctu") //===----------------------------------------------------------------------===// // R600-NI only builtins. diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 5f118d744a6cf..e92e8cdee4b63 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -6820,7 +6820,7 @@ def err_counted_by_on_incomplete_type_on_use : Error < def note_counted_by_consider_completing_pointee_ty : Note< "consider providing a complete definition for %0">; - + def note_counted_by_consider_using_sized_by : Note< "consider using '__sized_by%select{|_or_null}0' instead of " "'__counted_by%select{|_or_null}0'">; @@ -13292,4 +13292,11 @@ def err_amdgcn_is_invocable_arg_invalid_value : Error<"the argument to __builtin_amdgcn_is_invocable must be either a " "target agnostic builtin or an AMDGCN target specific builtin; `%0`" " is not valid">; +def err_amdgcn_predicate_type_is_not_constructible + : Error<"%0 has type __amdgpu_feature_predicate_t, which is not" + " constructible">; +def err_amdgcn_predicate_type_needs_explicit_bool_cast + : Error<"%0 must be explicitly cast to %1; however, please note that this " + "is almost always an error and that it prevents the effective " + "guarding of target dependent code, and thus should be avoided">; } // end of sema component. diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h index d62c9bb65fadb..843a146243eae 100644 --- a/clang/include/clang/Sema/SemaAMDGPU.h +++ b/clang/include/clang/Sema/SemaAMDGPU.h @@ -64,6 +64,10 @@ class SemaAMDGPU : public SemaBase { void handleAMDGPUNumVGPRAttr(Decl *D, const ParsedAttr &AL); void handleAMDGPUMaxNumWorkGroupsAttr(Decl *D, const ParsedAttr &AL); void handleAMDGPUFlatWorkGroupSizeAttr(Decl *D, const ParsedAttr &AL); + + /// Expand a valid use of the feature identification builtins into its + /// corresponding sequence of instructions. + Expr *ExpandAMDGPUPredicateBI(CallExpr *CE); }; } // namespace clang diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index ae136ae271882..28bdb1d90bbbd 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -1477,7 +1477,12 @@ void ASTContext::InitBuiltinTypes(const TargetInfo &Target, } if (Target.getTriple().isAMDGPU() || - (AuxTarget && AuxTarget->getTriple().isAMDGPU())) { + (Target.getTriple().isSPIRV() && + Target.getTriple().getVendor() == llvm::Triple::AMD) || + (AuxTarget && + (AuxTarget->getTriple().isAMDGPU() || + ((AuxTarget->getTriple().isSPIRV() && + AuxTarget->getTriple().getVendor() == llvm::Triple::AMD))))) { #define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ InitBuiltinType(SingletonId, BuiltinType::Id); #include "clang/Basic/AMDGPUTypes.def" @@ -12379,6 +12384,10 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context, Type = Context.AMDGPUBufferRsrcTy; break; } + case 'c': { + Type = Context.AMDGPUFeaturePredicateTy; + break; + } default: llvm_unreachable("Unexpected target builtin type"); } diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index f3ec498d4064b..c68b1ce1f643d 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -919,6 +919,13 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) { DBuilder.createBasicType(Name, Width, llvm::dwarf::DW_ATE_unsigned); \ return SingletonId; \ } +#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align) \ + case BuiltinType::Id: { \ + if (!SingletonId) \ + SingletonId = \ + DBuilder.createBasicType(Name, Width, llvm::dwarf::DW_ATE_boolean); \ + return SingletonId; \ + } #include "clang/Basic/AMDGPUTypes.def" case BuiltinType::UChar: case BuiltinType::Char_U: diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 15a6177746403..ad543b8f713b4 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -980,6 +980,10 @@ Value *ScalarExprEmitter::EmitConversionToBool(Value *Src, QualType SrcType) { if (const MemberPointerType *MPT = dyn_cast(SrcType)) return CGF.CGM.getCXXABI().EmitMemberPointerIsNotNull(CGF, Src, MPT); + // The conversion is a NOP, and will be done when CodeGening the builtin. + if (SrcType == CGF.getContext().AMDGPUFeaturePredicateTy) + return Src; + assert((SrcType->isIntegerType() || isa(Src->getType())) && "Unknown scalar type to convert"); diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index d1b292f23c2d2..61013242d3a08 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -584,6 +584,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { case BuiltinType::Id: \ return llvm::TargetExtType::get(getLLVMContext(), "amdgcn.named.barrier", \ {}, {Scope}); +#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align) \ + case BuiltinType::Id: \ + return llvm::IntegerType::getInt1Ty(getLLVMContext()); #include "clang/Basic/AMDGPUTypes.def" #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/HLSLIntangibleTypes.def" diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 1901d19b14dfc..c4ed83cc8d50a 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -546,8 +546,13 @@ void Sema::Initialize() { } if (Context.getTargetInfo().getTriple().isAMDGPU() || + (Context.getTargetInfo().getTriple().isSPIRV() && + Context.getTargetInfo().getTriple().getVendor() == llvm::Triple::AMD) || (Context.getAuxTargetInfo() && - Context.getAuxTargetInfo()->getTriple().isAMDGPU())) { + (Context.getAuxTargetInfo()->getTriple().isAMDGPU() || + (Context.getAuxTargetInfo()->getTriple().isSPIRV() && + Context.getAuxTargetInfo()->getTriple().getVendor() == + llvm::Triple::AMD)))) { #define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ addImplicitTypedef(Name, Context.SingletonId); #include "clang/Basic/AMDGPUTypes.def" diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index a6366aceec2a6..7bf88c5c6a9a0 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -12,6 +12,7 @@ #include "clang/Sema/SemaAMDGPU.h" #include "clang/Basic/DiagnosticSema.h" +#include "clang/Basic/TargetInfo.h" #include "clang/Basic/TargetBuiltins.h" #include "clang/Sema/Ownership.h" #include "clang/Sema/Sema.h" @@ -366,4 +367,63 @@ void SemaAMDGPU::handleAMDGPUMaxNumWorkGroupsAttr(Decl *D, addAMDGPUMaxNumWorkGroupsAttr(D, AL, AL.getArgAsExpr(0), YExpr, ZExpr); } +Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) { + auto &Ctx = getASTContext(); + auto BoolTy = Ctx.getLogicalOperationType(); + auto False = llvm::APInt::getZero(Ctx.getIntWidth(BoolTy)); + auto True = llvm::APInt::getAllOnes(Ctx.getIntWidth(BoolTy)); + auto Loc = CE->getExprLoc(); + + if (!CE->getBuiltinCallee()) + return IntegerLiteral::Create(Ctx, False, BoolTy, Loc); + + auto P = false; + auto BI = CE->getBuiltinCallee(); + if (Ctx.BuiltinInfo.isAuxBuiltinID(BI)) + BI = Ctx.BuiltinInfo.getAuxBuiltinID(BI); + + if (BI == AMDGPU::BI__builtin_amdgcn_processor_is) { + auto *GFX = dyn_cast(CE->getArg(0)->IgnoreParenCasts()); + if (!GFX) { + Diag(Loc, diag::err_amdgcn_processor_is_arg_not_literal); + return nullptr; + } + + auto N = GFX->getString(); + if (!Ctx.getTargetInfo().isValidCPUName(N) && + (!Ctx.getAuxTargetInfo() || + !Ctx.getAuxTargetInfo()->isValidCPUName(N))) { + Diag(Loc, diag::err_amdgcn_processor_is_arg_invalid_value) << N; + return nullptr; + } + if (Ctx.getTargetInfo().getTriple().isSPIRV()) { + CE->setType(BoolTy); + return CE; + } + + if (auto TID = Ctx.getTargetInfo().getTargetID()) + P = TID->find(N) == 0; + } else { + auto *Arg = CE->getArg(0); + if (!Arg || Arg->getType() != Ctx.BuiltinFnTy) { + Diag(Loc, diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg; + return nullptr; + } + + if (Ctx.getTargetInfo().getTriple().isSPIRV()) { + CE->setType(BoolTy); + return CE; + } + + auto *FD = cast(Arg->getReferencedDeclOfCallee()); + + StringRef RF = Ctx.BuiltinInfo.getRequiredFeatures(FD->getBuiltinID()); + llvm::StringMap CF; + Ctx.getFunctionFeatureMap(CF, FD); + + P = Builtin::evaluateRequiredTargetFeatures(RF, CF); + } + + return IntegerLiteral::Create(Ctx, P ? True : False, BoolTy, Loc); +} } // namespace clang diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index 14e16bc39eb3a..2a6f167296239 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -23,6 +23,7 @@ #include "clang/Basic/TargetInfo.h" #include "clang/Lex/Preprocessor.h" #include "clang/Sema/Initialization.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaHLSL.h" #include "clang/Sema/SemaObjC.h" #include "clang/Sema/SemaRISCV.h" @@ -1563,6 +1564,14 @@ static TryCastResult TryStaticCast(Sema &Self, ExprResult &SrcExpr, return TC_Success; } + if (SrcType == Self.Context.AMDGPUFeaturePredicateTy && + DestType == Self.Context.getLogicalOperationType()) { + SrcExpr = + Self.AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast(SrcExpr.get())); + Kind = CK_NoOp; + return TC_Success; + } + // We tried everything. Everything! Nothing works! :-( return TC_NotApplicable; } diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 63937ddc3e386..89e49645863c9 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -13617,6 +13617,15 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { return; } + // __amdgpu_feature_predicate_t cannot be initialised + if (VDecl->getType().getDesugaredType(Context) == + Context.AMDGPUFeaturePredicateTy) { + Diag(VDecl->getLocation(), + diag::err_amdgcn_predicate_type_is_not_constructible) << VDecl; + VDecl->setInvalidDecl(); + return; + } + // WebAssembly tables can't be used to initialise a variable. if (!Init->getType().isNull() && Init->getType()->isWebAssemblyTableType()) { Diag(Init->getExprLoc(), diag::err_wasm_table_art) << 0; @@ -14151,6 +14160,12 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) { if (VarDecl *Var = dyn_cast(RealDecl)) { QualType Type = Var->getType(); + if (Type.getDesugaredType(Context) == Context.AMDGPUFeaturePredicateTy) { + Diag(Var->getLocation(), + diag::err_amdgcn_predicate_type_is_not_constructible) << Var; + Var->setInvalidDecl(); + return; + } // C++1z [dcl.dcl]p1 grammar implies that an initializer is mandatory. if (isa(RealDecl)) { Diag(Var->getLocation(), diag::err_decomp_decl_requires_init) << Var; diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 7e36efa727072..99fdcc89429a5 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -50,6 +50,7 @@ #include "clang/Sema/ParsedTemplate.h" #include "clang/Sema/Scope.h" #include "clang/Sema/ScopeInfo.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaFixItUtils.h" #include "clang/Sema/SemaHLSL.h" @@ -6556,7 +6557,8 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, if (FD->getName() == "__builtin_amdgcn_is_invocable") { auto FnPtrTy = Context.getPointerType(FD->getType()); auto *R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get(); - return CallExpr::Create(Context, R, ArgExprs, Context.VoidTy, + return CallExpr::Create(Context, R, ArgExprs, + Context.AMDGPUFeaturePredicateTy, ExprValueKind::VK_PRValue, RParenLoc, FPOptionsOverride()); } @@ -13365,20 +13367,6 @@ inline QualType Sema::CheckBitwiseOperands(ExprResult &LHS, ExprResult &RHS, return InvalidOperands(Loc, LHS, RHS); } -static inline bool IsAMDGPUPredicateBI(Expr *E) { - if (!E->getType()->isVoidType()) - return false; - - if (auto *CE = dyn_cast(E)) { - if (auto *BI = CE->getDirectCallee()) - if (BI->getName() == "__builtin_amdgcn_processor_is" || - BI->getName() == "__builtin_amdgcn_is_invocable") - return true; - } - - return false; -} - // C99 6.5.[13,14] inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS, SourceLocation Loc, @@ -13474,9 +13462,6 @@ inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS, // The following is safe because we only use this method for // non-overloadable operands. - if (IsAMDGPUPredicateBI(LHS.get()) && IsAMDGPUPredicateBI(RHS.get())) - return Context.VoidTy; - // C++ [expr.log.and]p1 // C++ [expr.log.or]p1 // The operands are both contextually converted to type bool. @@ -15706,37 +15691,6 @@ static bool isOverflowingIntegerType(ASTContext &Ctx, QualType T) { return Ctx.getIntWidth(T) >= Ctx.getIntWidth(Ctx.IntTy); } -static Expr *ExpandAMDGPUPredicateBI(ASTContext &Ctx, CallExpr *CE) { - if (!CE->getBuiltinCallee()) - return CXXBoolLiteralExpr::Create(Ctx, false, Ctx.BoolTy, CE->getExprLoc()); - - if (Ctx.getTargetInfo().getTriple().isSPIRV()) { - CE->setType(Ctx.getLogicalOperationType()); - return CE; - } - - bool P = false; - auto &TI = Ctx.getTargetInfo(); - - if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") { - auto *GFX = dyn_cast(CE->getArg(0)->IgnoreParenCasts()); - auto TID = TI.getTargetID(); - if (GFX && TID) { - auto N = GFX->getString(); - P = TI.isValidCPUName(GFX->getString()) && TID->find(N) == 0; - } - } else { - auto *FD = cast(CE->getArg(0)->getReferencedDeclOfCallee()); - - StringRef RF = Ctx.BuiltinInfo.getRequiredFeatures(FD->getBuiltinID()); - llvm::StringMap CF; - Ctx.getFunctionFeatureMap(CF, FD); - - P = Builtin::evaluateRequiredTargetFeatures(RF, CF); - } - - return CXXBoolLiteralExpr::Create(Ctx, P, Ctx.BoolTy, CE->getExprLoc()); -} ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc, UnaryOperatorKind Opc, Expr *InputExpr, @@ -15915,7 +15869,9 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc, // Vector logical not returns the signed variant of the operand type. resultType = GetSignedVectorType(resultType); break; - } else if (IsAMDGPUPredicateBI(InputExpr)) { + } else if (resultType == Context.AMDGPUFeaturePredicateTy) { + resultType = Context.getLogicalOperationType(); + Input = AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast(InputExpr)); break; } else { return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr) @@ -20661,88 +20617,6 @@ void Sema::DiagnoseEqualityWithExtraParens(ParenExpr *ParenE) { } } -static bool ValidateAMDGPUPredicateBI(Sema &Sema, CallExpr *CE) { - if (CE->getDirectCallee()->getName() == "__builtin_amdgcn_processor_is") { - auto *GFX = dyn_cast(CE->getArg(0)->IgnoreParenCasts()); - if (!GFX) { - Sema.Diag(CE->getExprLoc(), - diag::err_amdgcn_processor_is_arg_not_literal); - return false; - } - auto N = GFX->getString(); - if (!Sema.getASTContext().getTargetInfo().isValidCPUName(N) && - (!Sema.getASTContext().getAuxTargetInfo() || - !Sema.getASTContext().getAuxTargetInfo()->isValidCPUName(N))) { - Sema.Diag(CE->getExprLoc(), - diag::err_amdgcn_processor_is_arg_invalid_value) - << N; - return false; - } - } else { - auto *Arg = CE->getArg(0); - if (!Arg || Arg->getType() != Sema.getASTContext().BuiltinFnTy) { - Sema.Diag(CE->getExprLoc(), - diag::err_amdgcn_is_invocable_arg_invalid_value) - << Arg; - return false; - } - } - - return true; -} - -static Expr *MaybeHandleAMDGPUPredicateBI(Sema &Sema, Expr *E, bool &Invalid) { - if (auto *UO = dyn_cast(E)) { - auto *SE = dyn_cast(UO->getSubExpr()); - if (IsAMDGPUPredicateBI(SE)) { - assert(UO->getOpcode() == UnaryOperator::Opcode::UO_LNot && - "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable " - "can only be used as operands of logical ops!"); - - if (!ValidateAMDGPUPredicateBI(Sema, SE)) { - Invalid = true; - return nullptr; - } - - UO->setSubExpr(ExpandAMDGPUPredicateBI(Sema.getASTContext(), SE)); - UO->setType(Sema.getASTContext().getLogicalOperationType()); - - return UO; - } - } - if (auto *BO = dyn_cast(E)) { - auto *LHS = dyn_cast(BO->getLHS()); - auto *RHS = dyn_cast(BO->getRHS()); - if (IsAMDGPUPredicateBI(LHS) && IsAMDGPUPredicateBI(RHS)) { - assert(BO->isLogicalOp() && - "__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable " - "can only be used as operands of logical ops!"); - - if (!ValidateAMDGPUPredicateBI(Sema, LHS) || - !ValidateAMDGPUPredicateBI(Sema, RHS)) { - Invalid = true; - return nullptr; - } - - BO->setLHS(ExpandAMDGPUPredicateBI(Sema.getASTContext(), LHS)); - BO->setRHS(ExpandAMDGPUPredicateBI(Sema.getASTContext(), RHS)); - BO->setType(Sema.getASTContext().getLogicalOperationType()); - - return BO; - } - } - if (auto *CE = dyn_cast(E)) - if (IsAMDGPUPredicateBI(CE)) { - if (!ValidateAMDGPUPredicateBI(Sema, CE)) { - Invalid = true; - return nullptr; - } - return ExpandAMDGPUPredicateBI(Sema.getASTContext(), CE); - } - - return nullptr; -} - ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E, bool IsConstexpr) { DiagnoseAssignmentAsCondition(E); @@ -20754,13 +20628,8 @@ ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E, E = result.get(); if (!E->isTypeDependent()) { - if (E->getType()->isVoidType()) { - bool InvalidPredicate = false; - if (auto *BIC = MaybeHandleAMDGPUPredicateBI(*this, E, InvalidPredicate)) - return BIC; - else if (InvalidPredicate) - return ExprError(); - } + if (E->getType() == Context.AMDGPUFeaturePredicateTy) + return AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast_or_null(E)); if (getLangOpts().CPlusPlus) return CheckCXXBooleanCondition(E, IsConstexpr); // C++ 6.4p4 diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index e5670dab03cb0..4e6feb871b725 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -9103,6 +9103,15 @@ bool InitializationSequence::Diagnose(Sema &S, case FK_ConversionFailed: { QualType FromType = OnlyArg->getType(); + // __amdgpu_feature_predicate_t can be explicitly cast to the logical op + // type, although this is almost always an error and we advise against it + if (FromType == S.Context.AMDGPUFeaturePredicateTy && + DestType == S.Context.getLogicalOperationType()) { + S.Diag(OnlyArg->getExprLoc(), + diag::err_amdgcn_predicate_type_needs_explicit_bool_cast) + << OnlyArg << DestType; + break; + } PartialDiagnostic PDiag = S.PDiag(diag::err_init_conversion_failed) << (int)Entity.getKind() << DestType @@ -9907,6 +9916,13 @@ Sema::PerformCopyInitialization(const InitializedEntity &Entity, if (EqualLoc.isInvalid()) EqualLoc = InitE->getBeginLoc(); + if (Entity.getType().getDesugaredType(Context) == + Context.AMDGPUFeaturePredicateTy) { + Diag(EqualLoc, diag::err_amdgcn_predicate_type_is_not_constructible) + << Entity.getDecl(); + return ExprError(); + } + InitializationKind Kind = InitializationKind::CreateCopy( InitE->getBeginLoc(), EqualLoc, AllowExplicit); InitializationSequence Seq(*this, Entity, Kind, InitE, TopLevelOfInitList); diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index d3ee9989c73ed..39693055c2106 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -30,6 +30,7 @@ #include "clang/Sema/Initialization.h" #include "clang/Sema/Lookup.h" #include "clang/Sema/Overload.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaObjC.h" #include "clang/Sema/Template.h" @@ -6137,12 +6138,13 @@ TryContextuallyConvertToBool(Sema &S, Expr *From) { ExprResult Sema::PerformContextuallyConvertToBool(Expr *From) { if (checkPlaceholderForOverload(*this, From)) return ExprError(); + if (From->getType() == Context.AMDGPUFeaturePredicateTy) + return AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast(From)); ImplicitConversionSequence ICS = TryContextuallyConvertToBool(*this, From); if (!ICS.isBad()) return PerformImplicitConversion(From, Context.BoolTy, ICS, AssignmentAction::Converting); - if (!DiagnoseMultipleUserDefinedConversion(From, Context.BoolTy)) return Diag(From->getBeginLoc(), diag::err_typecheck_bool_condition) << From->getType() << From->getSourceRange(); @@ -11921,6 +11923,16 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand, if (TakingCandidateAddress && !checkAddressOfCandidateIsAvailable(S, Fn)) return; + // __amdgpu_feature_predicate_t can be explicitly cast to the logical op type, + // although this is almost always an error and we advise against it. + if (FromTy == S.Context.AMDGPUFeaturePredicateTy && + ToTy == S.Context.getLogicalOperationType()) { + S.Diag(Conv.Bad.FromExpr->getExprLoc(), + diag::err_amdgcn_predicate_type_needs_explicit_bool_cast) + << Conv.Bad.FromExpr << ToTy; + return; + } + // Emit the generic diagnostic and, optionally, add the hints to it. PartialDiagnostic FDiag = S.PDiag(diag::note_ovl_candidate_bad_conv); FDiag << (unsigned)FnKindPair.first << (unsigned)FnKindPair.second << FnDesc diff --git a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp index 26cc8b4c7631d..43d657d25d013 100644 --- a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp +++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp @@ -1,29 +1,29 @@ // RUN: not %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - 2>&1 | FileCheck %s // RUN: not %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - 2>&1 | FileCheck %s -bool predicate(bool x) { return x; } +bool predicate(bool x); +void pass_by_value(__amdgpu_feature_predicate_t x); -void invalid_uses(int* p, int x, bool (*pfn)(bool)) { - // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void' +void invalid_uses(int *p, int x, const __amdgpu_feature_predicate_t &lv, + __amdgpu_feature_predicate_t &&rv) { + // CHECK: error: 'a' has type __amdgpu_feature_predicate_t, which is not constructible + __amdgpu_feature_predicate_t a; + // CHECK: error: 'b' has type __amdgpu_feature_predicate_t, which is not constructible + __amdgpu_feature_predicate_t b = __builtin_amdgcn_processor_is("gfx906"); + // CHECK: error: 'c' has type __amdgpu_feature_predicate_t, which is not constructible + __amdgpu_feature_predicate_t c = lv; + // CHECK: error: 'd' has type __amdgpu_feature_predicate_t, which is not constructible + __amdgpu_feature_predicate_t d = rv; + // CHECK: error: '__builtin_amdgcn_processor_is("gfx906")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided bool invalid_use_in_init_0 = __builtin_amdgcn_processor_is("gfx906"); - // CHECK: error: cannot initialize a variable of type 'const bool' with an rvalue of type 'void' - const bool invalid_use_in_init_1 = !__builtin_amdgcn_processor_is("gfx906"); - // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void' - bool invalid_use_in_init_2 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); - // CHECK: error: cannot initialize a variable of type 'bool' with an rvalue of type 'void' - bool invalid_use_in_init_3 = !__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); - // CHECK: error: variable has incomplete type 'const void' - const auto invalid_use_in_init_4 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready) || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); - // CHECK: error: variable has incomplete type 'const void' - const auto invalid_use_in_init_5 = __builtin_amdgcn_processor_is("gfx906") || __builtin_amdgcn_processor_is("gfx900"); - // CHECK: error: variable has incomplete type 'const void' - const auto invalid_use_in_init_6 = __builtin_amdgcn_processor_is("gfx906") || __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep); - // CHECK: error: value of type 'void' is not contextually convertible to 'bool' - __builtin_amdgcn_processor_is("gfx1201") - ? __builtin_amdgcn_s_sleep_var(x) : __builtin_amdgcn_s_sleep(42); - // CHECK: error: no matching function for call to 'predicate' + // CHECK: error: 'x' has type __amdgpu_feature_predicate_t, which is not constructible + pass_by_value(__builtin_amdgcn_processor_is("gfx906")); + // CHECK: error: '__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided + bool invalid_use_in_init_1 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); + // CHECK: error: '__builtin_amdgcn_processor_is("gfx906")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided + if (bool invalid_use_in_init_2 = __builtin_amdgcn_processor_is("gfx906")) return; + // CHECK: error: '__builtin_amdgcn_processor_is("gfx1200")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided if (predicate(__builtin_amdgcn_processor_is("gfx1200"))) __builtin_amdgcn_s_sleep_var(x); - // CHECK: note: candidate function not viable: cannot convert argument of incomplete type 'void' to 'bool' for 1st argument } void invalid_invocations(int x, const char* str) { @@ -31,7 +31,6 @@ void invalid_invocations(int x, const char* str) { if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return; // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a string literal if (__builtin_amdgcn_processor_is(str)) return; - // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}__builtin_amdgcn_s_sleep_var{{.*}} is not valid if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return; // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}str{{.*}} is not valid From 716cc1fe760b9a56655a3334c333876dc2b0bfb3 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 6 May 2025 13:02:25 +0100 Subject: [PATCH 15/38] Fix formatting. --- clang/lib/Sema/Sema.cpp | 2 +- clang/lib/Sema/SemaAMDGPU.cpp | 2 +- clang/lib/Sema/SemaCast.cpp | 4 ++-- clang/lib/Sema/SemaDecl.cpp | 8 +++++--- clang/lib/Sema/SemaExpr.cpp | 7 +++---- clang/lib/Sema/SemaInit.cpp | 2 +- clang/lib/Sema/SemaOverload.cpp | 2 +- 7 files changed, 14 insertions(+), 13 deletions(-) diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index c4ed83cc8d50a..3e55b5da3c027 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -552,7 +552,7 @@ void Sema::Initialize() { (Context.getAuxTargetInfo()->getTriple().isAMDGPU() || (Context.getAuxTargetInfo()->getTriple().isSPIRV() && Context.getAuxTargetInfo()->getTriple().getVendor() == - llvm::Triple::AMD)))) { + llvm::Triple::AMD)))) { #define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ addImplicitTypedef(Name, Context.SingletonId); #include "clang/Basic/AMDGPUTypes.def" diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index 7bf88c5c6a9a0..df4b3237a7844 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -12,8 +12,8 @@ #include "clang/Sema/SemaAMDGPU.h" #include "clang/Basic/DiagnosticSema.h" -#include "clang/Basic/TargetInfo.h" #include "clang/Basic/TargetBuiltins.h" +#include "clang/Basic/TargetInfo.h" #include "clang/Sema/Ownership.h" #include "clang/Sema/Sema.h" #include "llvm/Support/AtomicOrdering.h" diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index 2a6f167296239..8d47b2747f47d 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -1566,8 +1566,8 @@ static TryCastResult TryStaticCast(Sema &Self, ExprResult &SrcExpr, if (SrcType == Self.Context.AMDGPUFeaturePredicateTy && DestType == Self.Context.getLogicalOperationType()) { - SrcExpr = - Self.AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast(SrcExpr.get())); + SrcExpr = Self.AMDGPU().ExpandAMDGPUPredicateBI( + dyn_cast(SrcExpr.get())); Kind = CK_NoOp; return TC_Success; } diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 89e49645863c9..f932b069479c7 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -13619,9 +13619,10 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { // __amdgpu_feature_predicate_t cannot be initialised if (VDecl->getType().getDesugaredType(Context) == - Context.AMDGPUFeaturePredicateTy) { + Context.AMDGPUFeaturePredicateTy) { Diag(VDecl->getLocation(), - diag::err_amdgcn_predicate_type_is_not_constructible) << VDecl; + diag::err_amdgcn_predicate_type_is_not_constructible) + << VDecl; VDecl->setInvalidDecl(); return; } @@ -14162,7 +14163,8 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) { if (Type.getDesugaredType(Context) == Context.AMDGPUFeaturePredicateTy) { Diag(Var->getLocation(), - diag::err_amdgcn_predicate_type_is_not_constructible) << Var; + diag::err_amdgcn_predicate_type_is_not_constructible) + << Var; Var->setInvalidDecl(); return; } diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 99fdcc89429a5..8247f3da58280 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -6557,10 +6557,9 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, if (FD->getName() == "__builtin_amdgcn_is_invocable") { auto FnPtrTy = Context.getPointerType(FD->getType()); auto *R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get(); - return CallExpr::Create(Context, R, ArgExprs, - Context.AMDGPUFeaturePredicateTy, - ExprValueKind::VK_PRValue, RParenLoc, - FPOptionsOverride()); + return CallExpr::Create( + Context, R, ArgExprs, Context.AMDGPUFeaturePredicateTy, + ExprValueKind::VK_PRValue, RParenLoc, FPOptionsOverride()); } } diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 4e6feb871b725..dafd1eee196e8 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -9109,7 +9109,7 @@ bool InitializationSequence::Diagnose(Sema &S, DestType == S.Context.getLogicalOperationType()) { S.Diag(OnlyArg->getExprLoc(), diag::err_amdgcn_predicate_type_needs_explicit_bool_cast) - << OnlyArg << DestType; + << OnlyArg << DestType; break; } PartialDiagnostic PDiag = S.PDiag(diag::err_init_conversion_failed) diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 39693055c2106..92e7d76d064c3 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -11929,7 +11929,7 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand, ToTy == S.Context.getLogicalOperationType()) { S.Diag(Conv.Bad.FromExpr->getExprLoc(), diag::err_amdgcn_predicate_type_needs_explicit_bool_cast) - << Conv.Bad.FromExpr << ToTy; + << Conv.Bad.FromExpr << ToTy; return; } From 79035a9624ae3d769adb5eeb91f00081021f51cd Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 6 May 2025 20:09:38 +0100 Subject: [PATCH 16/38] Delete spurious whitespace. --- clang/lib/Sema/SemaExpr.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 8247f3da58280..85a924f5b5805 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -15690,7 +15690,6 @@ static bool isOverflowingIntegerType(ASTContext &Ctx, QualType T) { return Ctx.getIntWidth(T) >= Ctx.getIntWidth(Ctx.IntTy); } - ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc, UnaryOperatorKind Opc, Expr *InputExpr, bool IsAfterAmp) { From 0f04dbc4ca49a627290b758db34654a0ad62601e Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Thu, 8 May 2025 00:53:21 +0100 Subject: [PATCH 17/38] Handle jumps into controlled sequences. --- .../clang/Basic/DiagnosticSemaKinds.td | 2 + clang/include/clang/Sema/SemaAMDGPU.h | 4 ++ clang/lib/Sema/JumpDiagnostics.cpp | 7 ++- clang/lib/Sema/SemaAMDGPU.cpp | 14 +++-- .../amdgpu-feature-builtins-cant-jump.hip | 62 +++++++++++++++++++ 5 files changed, 84 insertions(+), 5 deletions(-) create mode 100644 clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index f2604f052512f..14880adf8e4ad 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -13312,4 +13312,6 @@ def err_amdgcn_predicate_type_needs_explicit_bool_cast : Error<"%0 must be explicitly cast to %1; however, please note that this " "is almost always an error and that it prevents the effective " "guarding of target dependent code, and thus should be avoided">; +def note_amdgcn_protected_by_predicate + : Note<"jump enters statement controlled by AMDGPU feature predicate">; } // end of sema component. diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h index 843a146243eae..0d11d799946b5 100644 --- a/clang/include/clang/Sema/SemaAMDGPU.h +++ b/clang/include/clang/Sema/SemaAMDGPU.h @@ -15,12 +15,15 @@ #include "clang/AST/ASTFwd.h" #include "clang/Sema/SemaBase.h" +#include "llvm/ADT/SmallPtrSet.h" namespace clang { class AttributeCommonInfo; +class Expr; class ParsedAttr; class SemaAMDGPU : public SemaBase { + llvm::SmallPtrSet ExpandedPredicates; public: SemaAMDGPU(Sema &S); @@ -68,6 +71,7 @@ class SemaAMDGPU : public SemaBase { /// Expand a valid use of the feature identification builtins into its /// corresponding sequence of instructions. Expr *ExpandAMDGPUPredicateBI(CallExpr *CE); + bool IsPredicate(Expr *E) const; }; } // namespace clang diff --git a/clang/lib/Sema/JumpDiagnostics.cpp b/clang/lib/Sema/JumpDiagnostics.cpp index a852a950b47f4..718d8b461805c 100644 --- a/clang/lib/Sema/JumpDiagnostics.cpp +++ b/clang/lib/Sema/JumpDiagnostics.cpp @@ -19,6 +19,7 @@ #include "clang/AST/StmtOpenACC.h" #include "clang/AST/StmtOpenMP.h" #include "clang/Basic/SourceLocation.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaInternal.h" #include "llvm/ADT/BitVector.h" using namespace clang; @@ -367,8 +368,10 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S, case Stmt::IfStmtClass: { IfStmt *IS = cast(S); + bool AMDGPUPredicate = false; if (!(IS->isConstexpr() || IS->isConsteval() || - IS->isObjCAvailabilityCheck())) + IS->isObjCAvailabilityCheck() || + (AMDGPUPredicate = this->S.AMDGPU().IsPredicate(IS->getCond())))) break; unsigned Diag = diag::note_protected_by_if_available; @@ -376,6 +379,8 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S, Diag = diag::note_protected_by_constexpr_if; else if (IS->isConsteval()) Diag = diag::note_protected_by_consteval_if; + else if (AMDGPUPredicate) + Diag = diag::note_amdgcn_protected_by_predicate; if (VarDecl *Var = IS->getConditionVariable()) BuildScopeInformation(Var, ParentScope); diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index df4b3237a7844..6833a2678c791 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -375,7 +375,8 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) { auto Loc = CE->getExprLoc(); if (!CE->getBuiltinCallee()) - return IntegerLiteral::Create(Ctx, False, BoolTy, Loc); + return *ExpandedPredicates.insert( + IntegerLiteral::Create(Ctx, False, BoolTy, Loc)).first; auto P = false; auto BI = CE->getBuiltinCallee(); @@ -398,7 +399,7 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) { } if (Ctx.getTargetInfo().getTriple().isSPIRV()) { CE->setType(BoolTy); - return CE; + return *ExpandedPredicates.insert(CE).first; } if (auto TID = Ctx.getTargetInfo().getTargetID()) @@ -412,7 +413,7 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) { if (Ctx.getTargetInfo().getTriple().isSPIRV()) { CE->setType(BoolTy); - return CE; + return *ExpandedPredicates.insert(CE).first; } auto *FD = cast(Arg->getReferencedDeclOfCallee()); @@ -424,6 +425,11 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) { P = Builtin::evaluateRequiredTargetFeatures(RF, CF); } - return IntegerLiteral::Create(Ctx, P ? True : False, BoolTy, Loc); + return *ExpandedPredicates.insert( + IntegerLiteral::Create(Ctx, P ? True : False, BoolTy, Loc)).first; +} + +bool SemaAMDGPU::IsPredicate(Expr *E) const { + return ExpandedPredicates.contains(E); } } // namespace clang diff --git a/clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip b/clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip new file mode 100644 index 0000000000000..a7f1abcdcd8fe --- /dev/null +++ b/clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip @@ -0,0 +1,62 @@ +// REQUIRES: amdgpu-registered-target +// REQUIRES: spirv-registered-target +// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -target-cpu gfx900 -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -target-cpu gfx1201 -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s + +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) + +__device__ void f(int *ptr, int size, bool f) { + int i = 0; + if (f) + goto label; // expected-error {{cannot jump from this goto statement to its label}} + + if (__builtin_amdgcn_processor_is("gfx900")) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}} + for (i = 0; i < size; ++i) { + label: + ptr[i] = i; + } + } +} + +__device__ void g(int *ptr, int size, bool f) { + int i = 0; + if (f) + goto label; // expected-error {{cannot jump from this goto statement to its label}} + + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}} + for (i = 0; i < size; ++i) { + label: + ptr[i] = i; + } + } +} + +__global__ void h(int *ptr, int size, bool f) { + int i = 0; + if (f) + goto label; // expected-error {{cannot jump from this goto statement to its label}} + + if (__builtin_amdgcn_processor_is("gfx900")) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}} + for (i = 0; i < size; ++i) { + label: + ptr[i] = i; + } + } +} + +__global__ void i(int *ptr, int size, bool f) { + int i = 0; + if (f) + goto label; // expected-error {{cannot jump from this goto statement to its label}} + + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}} + for (i = 0; i < size; ++i) { + label: + ptr[i] = i; + } + } +} From 39a9d55c704f729f299d4ac12ffad5127757d65e Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Thu, 8 May 2025 00:57:15 +0100 Subject: [PATCH 18/38] Fix formatting. --- clang/include/clang/Sema/SemaAMDGPU.h | 1 + clang/lib/Sema/SemaAMDGPU.cpp | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h index 0d11d799946b5..f72e1c53d2c92 100644 --- a/clang/include/clang/Sema/SemaAMDGPU.h +++ b/clang/include/clang/Sema/SemaAMDGPU.h @@ -24,6 +24,7 @@ class ParsedAttr; class SemaAMDGPU : public SemaBase { llvm::SmallPtrSet ExpandedPredicates; + public: SemaAMDGPU(Sema &S); diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index 6833a2678c791..39d0f2b70d157 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -375,8 +375,9 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) { auto Loc = CE->getExprLoc(); if (!CE->getBuiltinCallee()) - return *ExpandedPredicates.insert( - IntegerLiteral::Create(Ctx, False, BoolTy, Loc)).first; + return *ExpandedPredicates + .insert(IntegerLiteral::Create(Ctx, False, BoolTy, Loc)) + .first; auto P = false; auto BI = CE->getBuiltinCallee(); @@ -425,8 +426,10 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) { P = Builtin::evaluateRequiredTargetFeatures(RF, CF); } - return *ExpandedPredicates.insert( - IntegerLiteral::Create(Ctx, P ? True : False, BoolTy, Loc)).first; + return *ExpandedPredicates + .insert( + IntegerLiteral::Create(Ctx, P ? True : False, BoolTy, Loc)) + .first; } bool SemaAMDGPU::IsPredicate(Expr *E) const { From ebde49b3190beaf41625e8953c0b72594f8cf5d4 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sat, 17 May 2025 00:49:49 +0100 Subject: [PATCH 19/38] Start incorporating review feedback. --- clang/docs/ReleaseNotes.rst | 4 ++-- clang/lib/Sema/SemaAMDGPU.cpp | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index b86057bff7043..487e2516ea878 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -768,9 +768,9 @@ AMDGPU Support - Bump the default code object version to 6. ROCm 6.3 is required to run any program compiled with COV6. - Introduced a new target specific builtin ``__builtin_amdgcn_processor_is``, - a late / deferred query for the current target processor + a late / deferred query for the current target processor. - Introduced a new target specific builtin ``__builtin_amdgcn_is_invocable``, - which enables fine-grained, per-builtin, feature availability + which enables fine-grained, per-builtin, feature availability. NVPTX Support ^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index 39d0f2b70d157..55ff489aed702 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -368,19 +368,19 @@ void SemaAMDGPU::handleAMDGPUMaxNumWorkGroupsAttr(Decl *D, } Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) { - auto &Ctx = getASTContext(); - auto BoolTy = Ctx.getLogicalOperationType(); - auto False = llvm::APInt::getZero(Ctx.getIntWidth(BoolTy)); - auto True = llvm::APInt::getAllOnes(Ctx.getIntWidth(BoolTy)); - auto Loc = CE->getExprLoc(); + ASTContext &Ctx = getASTContext(); + QualType BoolTy = Ctx.getLogicalOperationType(); + llvm::APInt False = llvm::APInt::getZero(Ctx.getIntWidth(BoolTy)); + llvm::APInt True = llvm::APInt::getAllOnes(Ctx.getIntWidth(BoolTy)); + SourceLocation Loc = CE->getExprLoc(); if (!CE->getBuiltinCallee()) return *ExpandedPredicates .insert(IntegerLiteral::Create(Ctx, False, BoolTy, Loc)) .first; - auto P = false; - auto BI = CE->getBuiltinCallee(); + bool P = false; + unsigned BI = CE->getBuiltinCallee(); if (Ctx.BuiltinInfo.isAuxBuiltinID(BI)) BI = Ctx.BuiltinInfo.getAuxBuiltinID(BI); @@ -391,7 +391,7 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) { return nullptr; } - auto N = GFX->getString(); + StringRef N = GFX->getString(); if (!Ctx.getTargetInfo().isValidCPUName(N) && (!Ctx.getAuxTargetInfo() || !Ctx.getAuxTargetInfo()->isValidCPUName(N))) { @@ -406,7 +406,7 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) { if (auto TID = Ctx.getTargetInfo().getTargetID()) P = TID->find(N) == 0; } else { - auto *Arg = CE->getArg(0); + Expr *Arg = CE->getArg(0); if (!Arg || Arg->getType() != Ctx.BuiltinFnTy) { Diag(Loc, diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg; return nullptr; From 4bdd30e64f5e139a101f7570a36174f539827d22 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sat, 17 May 2025 00:50:27 +0100 Subject: [PATCH 20/38] Less `auto`. --- clang/lib/Sema/SemaExpr.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index f75af55923779..e2c109d0b667e 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -6662,8 +6662,8 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, auto *FD = cast(Fn->getReferencedDeclOfCallee()); if (FD->getName() == "__builtin_amdgcn_is_invocable") { - auto FnPtrTy = Context.getPointerType(FD->getType()); - auto *R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get(); + QualType FnPtrTy = Context.getPointerType(FD->getType()); + Expr *R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get(); return CallExpr::Create( Context, R, ArgExprs, Context.AMDGPUFeaturePredicateTy, ExprValueKind::VK_PRValue, RParenLoc, FPOptionsOverride()); From 76848d5fac7eee879e0a0c07b5441b9267cbd897 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Thu, 22 May 2025 17:28:44 +0300 Subject: [PATCH 21/38] Print out valid AMDGCN processor identifiers. --- clang/include/clang/Basic/DiagnosticSemaKinds.td | 4 +++- clang/lib/Basic/Targets/SPIR.cpp | 5 +++++ clang/lib/Basic/Targets/SPIR.h | 1 + clang/lib/Sema/SemaAMDGPU.cpp | 14 +++++++++++--- .../amdgpu-feature-builtins-invalid-use.cpp | 1 + 5 files changed, 21 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 8ef5e0a5a1bc8..8aebd64cb1f16 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12445,7 +12445,7 @@ def warn_zero_as_null_pointer_constant : Warning< InGroup>, DefaultIgnore; def warn_not_eliding_copy_on_return : Warning< - "not eliding copy on return">, + "not eliding copy on return">, InGroup>, DefaultIgnore; def err_nullability_cs_multilevel : Error< @@ -13347,6 +13347,8 @@ def err_amdgcn_processor_is_arg_not_literal def err_amdgcn_processor_is_arg_invalid_value : Error<"the argument to __builtin_amdgcn_processor_is must be a valid " "AMDGCN processor identifier; '%0' is not valid">; +def note_amdgcn_processor_is_valid_options + : Note<"valid AMDGCN processor identifiers are: %0">; def err_amdgcn_is_invocable_arg_invalid_value : Error<"the argument to __builtin_amdgcn_is_invocable must be either a " "target agnostic builtin or an AMDGCN target specific builtin; `%0`" diff --git a/clang/lib/Basic/Targets/SPIR.cpp b/clang/lib/Basic/Targets/SPIR.cpp index eb43d9b0be283..8056b124d5fc5 100644 --- a/clang/lib/Basic/Targets/SPIR.cpp +++ b/clang/lib/Basic/Targets/SPIR.cpp @@ -156,3 +156,8 @@ void SPIRV64AMDGCNTargetInfo::setAuxTarget(const TargetInfo *Aux) { bool SPIRV64AMDGCNTargetInfo::isValidCPUName(StringRef CPU) const { return AMDGPUTI.isValidCPUName(CPU); } + +void SPIRV64AMDGCNTargetInfo::fillValidCPUList( + SmallVectorImpl &Values) const { + return AMDGPUTI.fillValidCPUList(Values); +} diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index df8dab591bf70..27b93744bb8f8 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -453,6 +453,7 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64AMDGCNTargetInfo final // This is only needed for validating arguments passed to // __builtin_amdgcn_processor_is bool isValidCPUName(StringRef Name) const override; + void fillValidCPUList(SmallVectorImpl &Values) const override; }; } // namespace targets diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index 1927b7d103a88..5d381229f63c7 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -393,10 +393,18 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) { } StringRef N = GFX->getString(); - if (!Ctx.getTargetInfo().isValidCPUName(N) && - (!Ctx.getAuxTargetInfo() || - !Ctx.getAuxTargetInfo()->isValidCPUName(N))) { + const TargetInfo &TI = Ctx.getTargetInfo(); + const TargetInfo *AuxTI = Ctx.getAuxTargetInfo(); + if (!TI.isValidCPUName(N) && (!AuxTI || !AuxTI->isValidCPUName(N))) { Diag(Loc, diag::err_amdgcn_processor_is_arg_invalid_value) << N; + SmallVector ValidList; + if (TI.getTriple().getVendor() == llvm::Triple::VendorType::AMD) + TI.fillValidCPUList(ValidList); + else if (AuxTI) // Since the BI is present it must be and AMDGPU triple. + AuxTI->fillValidCPUList(ValidList); + if (!ValidList.empty()) + Diag(Loc, diag::note_amdgcn_processor_is_valid_options) + << llvm::join(ValidList, ", "); return nullptr; } if (Ctx.getTargetInfo().getTriple().isSPIRV()) { diff --git a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp index 43d657d25d013..9e50f9493977f 100644 --- a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp +++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp @@ -28,6 +28,7 @@ void invalid_uses(int *p, int x, const __amdgpu_feature_predicate_t &lv, void invalid_invocations(int x, const char* str) { // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a valid AMDGCN processor identifier; 'not_an_amdgcn_gfx_id' is not valid + // CHECK-DAG: note: valid AMDGCN processor identifiers are: {{.*}}gfx{{.*}} if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return; // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a string literal if (__builtin_amdgcn_processor_is(str)) return; From e1bfdf3580451b0c0a97475a29d6bb6c2b5bbdf0 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Thu, 22 May 2025 19:31:39 +0300 Subject: [PATCH 22/38] Use boolean type for the predicate, even though it should never get emitted. --- clang/lib/CodeGen/CodeGenTypes.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 45b510f9aaba5..7c237e8ea8b1d 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -583,7 +583,7 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { {}, {Scope}); #define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align) \ case BuiltinType::Id: \ - return llvm::IntegerType::getInt1Ty(getLLVMContext()); + return ConvertType(getContext().getLogicalOperationType()); #include "clang/Basic/AMDGPUTypes.def" #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/HLSLIntangibleTypes.def" From 4f6546813c922f9614b2d66b3df25001ec474b06 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Thu, 22 May 2025 22:17:31 +0300 Subject: [PATCH 23/38] Register pass early. --- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 5c92c4eb411eb..fa34d67703e57 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -16,6 +16,8 @@ #ifndef MODULE_PASS #define MODULE_PASS(NAME, CREATE_PASS) #endif +MODULE_PASS("amdgpu-expand-feature-predicates", + AMDGPUExpandFeaturePredicatesPass(*this)) MODULE_PASS("amdgpu-always-inline", AMDGPUAlwaysInlinePass()) MODULE_PASS("amdgpu-export-kernel-runtime-handles", AMDGPUExportKernelRuntimeHandlesPass()) MODULE_PASS("amdgpu-lower-buffer-fat-pointers", @@ -30,8 +32,6 @@ MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass()) -MODULE_PASS("amdgpu-expand-feature-predicates", - AMDGPUExpandFeaturePredicatesPass(*this)) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS From e940d4213714957039e3b30aa384b4bb7ee3ba3c Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Thu, 22 May 2025 22:56:19 +0300 Subject: [PATCH 24/38] Clarify builtins are also available in C. --- clang/docs/LanguageExtensions.rst | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 95a9116926ec8..1373438423aab 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -5033,6 +5033,35 @@ target specific type that behaves as if its C++ definition was the following: operator bool() const noexcept; }; +The builtins can be used in C as well, wherein the +``__amdgpu_feature_predicate_t`` type behaves as an opaque, forward declared +type with conditional automated conversion to ``_Bool`` when used as the +predicate argument to a control structure: + +.. code-block:: c + + struct __amdgpu_feature_predicate_t ret(); // Error + void arg(struct __amdgpu_feature_predicate_t); // Error + void local() { + struct __amdgpu_feature_predicate_t x; // Error + struct __amdgpu_feature_predicate_t y = + __builtin_amdgcn_processor_is("gfx900"); // Error + } + void valid_use() { + _Bool x = (_Bool)__builtin_amdgcn_processor_is("gfx900"); // OK + if (__builtin_amdgcn_processor_is("gfx900")) // Implicit cast to _Bool + return; + for (; __builtin_amdgcn_processor_is("gfx900");) // Implicit cast to _Bool + break; + while (__builtin_amdgcn_processor_is("gfx900")) // Implicit cast to _Bool + break; + do { + break; + } while (__builtin_amdgcn_processor_is("gfx900")); // Implicit cast to _Bool + + __builtin_amdgcn_processor_is("gfx900") ? x : !x; + } + The boolean interpretation of the predicate values returned by the builtins: * indicates whether the current target matches the argument; the argument MUST From 11dd5709644a2e5d887f83e8f35945986de133da Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Mon, 2 Jun 2025 18:12:31 +0100 Subject: [PATCH 25/38] Try to fix potentially erroneous indentation in note. --- clang/include/clang/Basic/DiagnosticSemaKinds.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 4d49c24d5d948..02190b5d544e4 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -13404,6 +13404,6 @@ def err_amdgcn_predicate_type_needs_explicit_bool_cast : Error<"%0 must be explicitly cast to %1; however, please note that this " "is almost always an error and that it prevents the effective " "guarding of target dependent code, and thus should be avoided">; -def note_amdgcn_protected_by_predicate - : Note<"jump enters statement controlled by AMDGPU feature predicate">; +def note_amdgcn_protected_by_predicate : Note<"jump enters statement controlled" + " by AMDGPU feature predicate">; } // end of sema component. From 03b029f3f400eba5dc165a31e7401a0862f06a91 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Mon, 2 Jun 2025 22:35:32 +0100 Subject: [PATCH 26/38] Add test for returning a predicate. --- clang/lib/Sema/SemaInit.cpp | 2 +- .../amdgpu-feature-builtins-invalid-use.cpp | 5 +++ ...feature-builtins-return-type-deduction.hip | 31 +++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 clang/test/SemaHIP/amdgpu-feature-builtins-return-type-deduction.hip diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 6e3660bea9d06..5fc270681683a 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -9915,7 +9915,7 @@ Sema::PerformCopyInitialization(const InitializedEntity &Entity, EqualLoc = InitE->getBeginLoc(); if (Entity.getType().getDesugaredType(Context) == - Context.AMDGPUFeaturePredicateTy) { + Context.AMDGPUFeaturePredicateTy && Entity.getDecl()) { Diag(EqualLoc, diag::err_amdgcn_predicate_type_is_not_constructible) << Entity.getDecl(); return ExprError(); diff --git a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp index 9e50f9493977f..78f18d3a37b46 100644 --- a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp +++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp @@ -41,3 +41,8 @@ void invalid_invocations(int x, const char* str) { // CHECK: error: use of undeclared identifier '__builtin_ia32_pause' else if (__builtin_amdgcn_is_invocable(__builtin_ia32_pause)) return; } + +bool return_needs_cast() { + // CHECK: error: '__builtin_amdgcn_processor_is("gfx900")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided + return __builtin_amdgcn_processor_is("gfx900"); +} diff --git a/clang/test/SemaHIP/amdgpu-feature-builtins-return-type-deduction.hip b/clang/test/SemaHIP/amdgpu-feature-builtins-return-type-deduction.hip new file mode 100644 index 0000000000000..27bbb3f2f3d07 --- /dev/null +++ b/clang/test/SemaHIP/amdgpu-feature-builtins-return-type-deduction.hip @@ -0,0 +1,31 @@ +// REQUIRES: amdgpu-registered-target +// REQUIRES: spirv-registered-target +// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx900 -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace +// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1201 -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace +// RUN: %clang_cc1 -triple x86_64 -aux-triple amdgcn -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace +// RUN: %clang_cc1 -triple x86_64 -aux-triple spirv64-amd-amdhsa -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace + +__attribute__((device)) auto foo() { + return __builtin_amdgcn_processor_is("gfx900"); +} + +__attribute__((device)) decltype(auto) bar() { + return __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep); +} + +// CHECK: |-TypedefDecl {{.*}} implicit __amdgpu_feature_predicate_t '__amdgpu_feature_predicate_t' +// CHECK-NEXT: | `-BuiltinType {{.*}} '__amdgpu_feature_predicate_t' +// CHECK-DAG: |-FunctionDecl {{.*}} foo '__amdgpu_feature_predicate_t ()' +// CHECK-NEXT: |-CompoundStmt {{.*}} +// CHECK-NEXT: | `-ReturnStmt {{.*}} +// CHECK-NEXT: | `-CallExpr {{.*}} '__amdgpu_feature_predicate_t' +// CHECK-NEXT: | |-ImplicitCastExpr {{.*}} '__amdgpu_feature_predicate_t (*)(const char *) noexcept' +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} Function {{.*}} '__builtin_amdgcn_processor_is' '__amdgpu_feature_predicate_t (const char *) noexcept' +// CHECK-NEXT: | `-StringLiteral {{.*}} "gfx900" +// CHECK-DAG: |-FunctionDecl {{.*}} bar '__amdgpu_feature_predicate_t ()' +// CHECK-NEXT: |-CompoundStmt {{.*}} +// CHECK-NEXT: | `-ReturnStmt {{.*}} +// CHECK-NEXT: | `-CallExpr {{.*}} '__amdgpu_feature_predicate_t' +// CHECK-NEXT: | |-ImplicitCastExpr {{.*}} '__amdgpu_feature_predicate_t (*)() noexcept' +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} Function {{.*}} '__builtin_amdgcn_is_invocable' '__amdgpu_feature_predicate_t () noexcept' From 012f74d7418f6cbdacf22e694beaf6cecebd6c81 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Mon, 2 Jun 2025 22:40:49 +0100 Subject: [PATCH 27/38] Fix formatting. --- clang/lib/Sema/SemaInit.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 5fc270681683a..e0a5b6f2ac1a5 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -9915,7 +9915,8 @@ Sema::PerformCopyInitialization(const InitializedEntity &Entity, EqualLoc = InitE->getBeginLoc(); if (Entity.getType().getDesugaredType(Context) == - Context.AMDGPUFeaturePredicateTy && Entity.getDecl()) { + Context.AMDGPUFeaturePredicateTy && + Entity.getDecl()) { Diag(EqualLoc, diag::err_amdgcn_predicate_type_is_not_constructible) << Entity.getDecl(); return ExprError(); From 33bbe3566986f9acc64ca082ba6c67d85cf4067e Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 10 Jun 2025 16:51:26 +0100 Subject: [PATCH 28/38] Add predicate expansion pass to LTO pipeline. --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 +++++ llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 3405020467336..fb6099d6cd380 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -816,6 +816,11 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(AMDGPUExpandFeaturePredicatesPass(*this)); }); + PB.registerFullLinkTimeOptimizationEarlyEPCallback( + [this](ModulePassManager &PM, OptimizationLevel) { + PM.addPass(AMDGPUExpandFeaturePredicatesPass(*this)); + }); + PB.registerScalarOptimizerLateEPCallback( [this](FunctionPassManager &FPM, OptimizationLevel Level) { if (Level == OptimizationLevel::O0) diff --git a/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll b/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll index b1fc76f457ece..93f43b274e28d 100644 --- a/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll +++ b/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll @@ -2,16 +2,22 @@ ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto" -print-pipeline-passes %s -o - | FileCheck %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto" -print-pipeline-passes %s -o - | FileCheck %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto" -print-pipeline-passes %s -o - | FileCheck %s +; RUN: opt -mtriple=amdgcn--amdhsa -S -O0 -print-pipeline-passes %s -o - | FileCheck --check-prefix=O0 %s +; RUN: opt -mtriple=amdgcn--amdhsa -S -O1 -print-pipeline-passes %s -o - | FileCheck %s +; RUN: opt -mtriple=amdgcn--amdhsa -S -O2 -print-pipeline-passes %s -o - | FileCheck %s +; RUN: opt -mtriple=amdgcn--amdhsa -S -O3 -print-pipeline-passes %s -o - | FileCheck %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s - +; CHECK: amdgpu-expand-feature-predicates ; CHECK: amdgpu-attributor +; O0: amdgpu-expand-feature-predicates ; O0-NOT: amdgpu-attributor +; PRE: amdgpu-expand-feature-predicates ; PRE-NOT: internalize ; PRE-NOT: amdgpu-attributor ; PRE-NOT: printfToRuntime From 81a55d8eb76513d56159f72218bd26aff362e30b Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sat, 14 Jun 2025 00:45:55 +0100 Subject: [PATCH 29/38] Try to fix odd but persistent doc generation error. --- clang/include/clang/Basic/DiagnosticGroups.td | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 38b4f581fa5c9..8e9621259e295 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -827,8 +827,7 @@ The warning can be resolved by removing one of the conditions above. In rough order of preference, this may be done by: 1. Marking the object ``const`` (if possible) 2. Moving the object's definition to a source file -3. Making the object visible using ``__attribute((visibility("default")))``, - ``__declspec(dllimport)``, or ``__declspec(dllexport)``. +3. Making the object visible using ``__attribute((visibility("default")))``, ``__declspec(dllimport)``, or ``__declspec(dllexport)``. When annotating an object with ``__declspec(dllimport)`` or ``__declspec(dllexport)``, take care to ensure that the object is only exported from one dll, and is imported From c495630dde49f7662308a4687be2d5edbe46b182 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 18 Jun 2025 18:18:01 +0100 Subject: [PATCH 30/38] Adopt suggestions. --- clang/lib/Sema/SemaExpr.cpp | 2 +- clang/lib/Sema/SemaInit.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 79577fca323c6..b029036b49a0e 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -6576,7 +6576,7 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, // without any additional checking. if (Fn->getType() == Context.BuiltinFnTy && ArgExprs.size() == 1 && ArgExprs[0]->getType() == Context.BuiltinFnTy) { - auto *FD = cast(Fn->getReferencedDeclOfCallee()); + const auto *FD = cast(Fn->getReferencedDeclOfCallee()); if (FD->getName() == "__builtin_amdgcn_is_invocable") { QualType FnPtrTy = Context.getPointerType(FD->getType()); diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 60110165e08f0..8f1a935559b19 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -9103,7 +9103,7 @@ bool InitializationSequence::Diagnose(Sema &S, case FK_ConversionFailed: { QualType FromType = OnlyArg->getType(); // __amdgpu_feature_predicate_t can be explicitly cast to the logical op - // type, although this is almost always an error and we advise against it + // type, although this is almost always an error and we advise against it. if (FromType == S.Context.AMDGPUFeaturePredicateTy && DestType == S.Context.getLogicalOperationType()) { S.Diag(OnlyArg->getExprLoc(), From dc0221e07b2e2e76ded776b488fda67a7ac2f835 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 24 Jun 2025 00:41:44 +0100 Subject: [PATCH 31/38] Implement some of the review suggestions. --- .../AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 24 +++++++++---------- .../amdgpu-expand-feature-predicates.ll | 6 ++--- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp index f1c73e86fb4a0..06cd2d474df87 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp @@ -13,12 +13,10 @@ // (AMDGCNSPIRV). These placeholder globals are used to guide target specific // lowering, once the concrete target is known, by way of constant folding their // value all the way into a terminator (i.e. a controlled block) or into a no -// live use scenario. The pass makes a best effort attempt to look through -// calls, i.e. a constant evaluatable passthrough of a predicate value will -// generally work, however we hard fail if the folding fails, to avoid obtuse -// BE errors or opaque run time errors. This pass should run as early as -// possible / immediately after Clang CodeGen, so that the optimisation pipeline -// and the BE operate with concrete target data. +// live use scenario. We hard fail if the folding fails, to avoid obtuse BE +// errors or opaque run time errors. This pass should run as early as possible / +// immediately after Clang CodeGen, so that the optimisation pipeline and the BE +// operate with concrete target data. //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -50,13 +48,13 @@ template void collectUsers(Value *V, C &Container) { } inline void setPredicate(const GCNSubtarget &ST, GlobalVariable *P) { - const auto IsFeature = P->getName().starts_with("llvm.amdgcn.has"); - const auto Offset = + const bool IsFeature = P->getName().starts_with("llvm.amdgcn.has"); + const size_t Offset = IsFeature ? sizeof("llvm.amdgcn.has") : sizeof("llvm.amdgcn.is"); - auto PV = P->getName().substr(Offset).str(); + std::string PV = P->getName().substr(Offset).str(); if (IsFeature) { - auto Dx = PV.find(','); + size_t Dx = PV.find(','); while (Dx != std::string::npos) { PV.insert(++Dx, {'+'}); @@ -65,7 +63,7 @@ inline void setPredicate(const GCNSubtarget &ST, GlobalVariable *P) { PV.insert(PV.cbegin(), '+'); } - auto *PTy = P->getValueType(); + Type *PTy = P->getValueType(); P->setLinkage(GlobalValue::PrivateLinkage); P->setExternallyInitialized(false); @@ -103,6 +101,8 @@ std::pair handlePredicate(const GCNSubtarget &ST, auto *I = *ToFold.begin(); ToFold.erase(I); + I->dropDroppableUses(); + if (auto *C = ConstantFoldInstruction(I, P->getDataLayout())) { collectUsers(I, ToFold); I->replaceAllUsesWith(C); @@ -110,8 +110,6 @@ std::pair handlePredicate(const GCNSubtarget &ST, continue; } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) { continue; - } else if (I->users().empty()) { - continue; } return unfoldableFound(I->getParent()->getParent(), P, I); diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll index 277323c353260..60e1954220738 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll @@ -121,7 +121,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) { ; GFX1010-NEXT: br label %[[IF_END6]] ; GFX1010: [[IF_END6]]: ; GFX1010-NEXT: call void @llvm.assume(i1 true) -; GFX1010-NEXT: call void @llvm.assume(i1 false) +; GFX1010-NEXT: call void @llvm.assume(i1 true) ; GFX1010-NEXT: br label %[[FOR_COND]] ; GFX1010: [[FOR_COND]]: ; GFX1010-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 @@ -167,7 +167,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) { ; GFX1101-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) ; GFX1101-NEXT: br label %[[IF_END6]] ; GFX1101: [[IF_END6]]: -; GFX1101-NEXT: call void @llvm.assume(i1 false) +; GFX1101-NEXT: call void @llvm.assume(i1 true) ; GFX1101-NEXT: call void @llvm.assume(i1 true) ; GFX1101-NEXT: br label %[[FOR_COND:.*]] ; GFX1101: [[FOR_COND]]: @@ -278,7 +278,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) { ; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) ; GFX1201-W64-NEXT: br label %[[IF_END11]] ; GFX1201-W64: [[IF_END11]]: -; GFX1201-W64-NEXT: call void @llvm.assume(i1 false) +; GFX1201-W64-NEXT: call void @llvm.assume(i1 true) ; GFX1201-W64-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 ; GFX1201-W64-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] ; GFX1201-W64-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 From 3b727b9c75180079e11783bd52a1578f02d30fb4 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 24 Jun 2025 23:10:51 +0100 Subject: [PATCH 32/38] Clean up unreachable BBs. --- .../AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 24 ++- .../amdgpu-expand-feature-predicates.ll | 163 +++++------------- 2 files changed, 62 insertions(+), 125 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp index 06cd2d474df87..cd9e29a4e7d67 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp @@ -87,8 +87,9 @@ unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) { return {PreservedAnalyses::none(), false}; } -std::pair handlePredicate(const GCNSubtarget &ST, - GlobalVariable *P) { +std::pair +handlePredicate(const GCNSubtarget &ST, FunctionAnalysisManager &FAM, + SmallPtrSet &Predicated, GlobalVariable *P) { setPredicate(ST, P); SmallPtrSet ToFold; @@ -98,18 +99,25 @@ std::pair handlePredicate(const GCNSubtarget &ST, return {PreservedAnalyses::all(), true}; do { - auto *I = *ToFold.begin(); + Instruction *I = *ToFold.begin(); ToFold.erase(I); I->dropDroppableUses(); + Function *F = I->getParent()->getParent(); + auto &DT = FAM.getResult(*F); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + if (auto *C = ConstantFoldInstruction(I, P->getDataLayout())) { collectUsers(I, ToFold); I->replaceAllUsesWith(C); I->eraseFromParent(); continue; - } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent())) { - continue; + } else if (I->isTerminator() && + ConstantFoldTerminator(I->getParent(), true, nullptr, &DTU)) { + Predicated.insert(F); + + continue; } return unfoldableFound(I->getParent()->getParent(), P, I); @@ -138,9 +146,11 @@ AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) { const auto &ST = TM.getSubtarget( *find_if(M, [](auto &&F) { return !F.isIntrinsic(); })); + auto &FAM = MAM.getResult(M).getManager(); + SmallPtrSet Predicated; auto Ret = PreservedAnalyses::all(); for (auto &&P : Predicates) { - auto R = handlePredicate(ST, P); + auto R = handlePredicate(ST, FAM, Predicated, P); if (!R.second) break; @@ -150,6 +160,8 @@ AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) { for (auto &&P : Predicates) P->eraseFromParent(); + for (auto &&F : Predicated) + removeUnreachableBlocks(*F); return Ret; } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll index 60e1954220738..a16a7fc31da22 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll @@ -54,42 +54,23 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) { ; GFX906-NEXT: [[ENTRY:.*:]] ; GFX906-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 ; GFX906-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr -; GFX906-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]] -; GFX906: [[IF_GFX1201_OR_GFX12_INSTS1]]: -; GFX906-NEXT: br label %[[IF_NOT_GFX906:.*]] -; GFX906: [[IF_GFX1201_OR_GFX12_INSTS:.*:]] -; GFX906-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) -; GFX906-NEXT: br label %[[IF_NOT_GFX906]] -; GFX906: [[IF_NOT_GFX906]]: -; GFX906-NEXT: br label %[[IF_GFX1010_OR_GFX1102:.*]] -; GFX906: [[IF_NOT_GFX907:.*:]] -; GFX906-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() -; GFX906-NEXT: br label %[[IF_END6:.*]] -; GFX906: [[IF_GFX1010_OR_GFX1102]]: +; GFX906-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]] +; GFX906: [[IF_GFX1201_OR_GFX12_INSTS]]: +; GFX906-NEXT: br label %[[IF_NOT_GFX907:.*]] +; GFX906: [[IF_NOT_GFX907]]: +; GFX906-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]] +; GFX906: [[IF_GFX1010_OR_GFX1101]]: ; GFX906-NEXT: br label %[[LOR_NOT_GFX1010:.*]] ; GFX906: [[LOR_NOT_GFX1010]]: ; GFX906-NEXT: br label %[[FOR_COND:.*]] -; GFX906: [[IF_GFX1010_OR_GFX1101:.*:]] -; GFX906-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) -; GFX906-NEXT: br label %[[IF_END6]] -; GFX906: [[IF_END6]]: -; GFX906-NEXT: call void @llvm.assume(i1 true) -; GFX906-NEXT: call void @llvm.assume(i1 true) -; GFX906-NEXT: br label %[[FOR_COND]] ; GFX906: [[FOR_COND]]: ; GFX906-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 ; GFX906-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] ; GFX906-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 -; GFX906-NEXT: br label %[[IF_GFX10_INSTS1:.*]] -; GFX906: [[IF_GFX11_INSTS:.*:]] -; GFX906-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() -; GFX906-NEXT: br label %[[IF_END11:.*]] -; GFX906: [[IF_GFX10_INSTS1]]: -; GFX906-NEXT: br label %[[IF_END11]] -; GFX906: [[IF_GFX10_INSTS:.*:]] -; GFX906-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) -; GFX906-NEXT: br label %[[IF_END11]] -; GFX906: [[IF_END11]]: +; GFX906-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX906: [[IF_GFX11_INSTS]]: +; GFX906-NEXT: br label %[[IF_GFX10_INSTS:.*]] +; GFX906: [[IF_GFX10_INSTS]]: ; GFX906-NEXT: call void @llvm.assume(i1 true) ; GFX906-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 ; GFX906-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] @@ -101,41 +82,28 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) { ; GFX1010-NEXT: [[ENTRY:.*:]] ; GFX1010-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 ; GFX1010-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr -; GFX1010-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]] -; GFX1010: [[IF_GFX1201_OR_GFX12_INSTS1]]: -; GFX1010-NEXT: br label %[[IF_END:.*]] -; GFX1010: [[IF_GFX1201_OR_GFX12_INSTS:.*:]] -; GFX1010-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) -; GFX1010-NEXT: br label %[[IF_END]] -; GFX1010: [[IF_END]]: -; GFX1010-NEXT: br label %[[IF_NOT_GFX907:.*]] -; GFX1010: [[IF_NOT_GFX907]]: +; GFX1010-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]] +; GFX1010: [[IF_GFX1201_OR_GFX12_INSTS]]: +; GFX1010-NEXT: br label %[[IF_NOT_GFX906:.*]] +; GFX1010: [[IF_NOT_GFX906]]: +; GFX1010-NEXT: br label %[[LOR_NOT_GFX1010:.*]] +; GFX1010: [[LOR_NOT_GFX1010]]: ; GFX1010-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() ; GFX1010-NEXT: br label %[[IF_END6:.*]] -; GFX1010: [[IF_NOT_GFX906:.*:]] -; GFX1010-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]] -; GFX1010: [[LOR_NOT_GFX1010:.*:]] -; GFX1010-NEXT: br label %[[FOR_COND:.*]] -; GFX1010: [[IF_GFX1010_OR_GFX1101]]: -; GFX1010-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) -; GFX1010-NEXT: br label %[[IF_END6]] ; GFX1010: [[IF_END6]]: ; GFX1010-NEXT: call void @llvm.assume(i1 true) ; GFX1010-NEXT: call void @llvm.assume(i1 true) -; GFX1010-NEXT: br label %[[FOR_COND]] +; GFX1010-NEXT: br label %[[FOR_COND:.*]] ; GFX1010: [[FOR_COND]]: ; GFX1010-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 ; GFX1010-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] ; GFX1010-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 -; GFX1010-NEXT: br label %[[IF_ELSE8:.*]] -; GFX1010: [[IF_GFX11_INSTS:.*:]] -; GFX1010-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() -; GFX1010-NEXT: br label %[[IF_END11:.*]] -; GFX1010: [[IF_ELSE8]]: +; GFX1010-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1010: [[IF_GFX11_INSTS]]: ; GFX1010-NEXT: br label %[[IF_GFX10_INSTS:.*]] ; GFX1010: [[IF_GFX10_INSTS]]: ; GFX1010-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) -; GFX1010-NEXT: br label %[[IF_END11]] +; GFX1010-NEXT: br label %[[IF_END11:.*]] ; GFX1010: [[IF_END11]]: ; GFX1010-NEXT: call void @llvm.assume(i1 true) ; GFX1010-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 @@ -148,25 +116,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) { ; GFX1101-NEXT: [[ENTRY:.*:]] ; GFX1101-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 ; GFX1101-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr -; GFX1101-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS1:.*]] -; GFX1101: [[IF_GFX1201_OR_GFX12_INSTS1]]: +; GFX1101-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]] +; GFX1101: [[IF_GFX1201_OR_GFX12_INSTS]]: ; GFX1101-NEXT: br label %[[IF_END:.*]] -; GFX1101: [[IF_GFX1201_OR_GFX12_INSTS:.*:]] -; GFX1101-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) -; GFX1101-NEXT: br label %[[IF_END]] ; GFX1101: [[IF_END]]: ; GFX1101-NEXT: br label %[[IF_NOT_GFX907:.*]] ; GFX1101: [[IF_NOT_GFX907]]: ; GFX1101-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() -; GFX1101-NEXT: br label %[[IF_END6:.*]] -; GFX1101: [[IF_NOT_GFX906:.*:]] -; GFX1101-NEXT: br label %[[LOR_NOT_GFX1010:.*]] -; GFX1101: [[LOR_NOT_GFX1010]]: -; GFX1101-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]] -; GFX1101: [[IF_GFX1010_OR_GFX1101]]: -; GFX1101-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) -; GFX1101-NEXT: br label %[[IF_END6]] -; GFX1101: [[IF_END6]]: +; GFX1101-NEXT: br label %[[IF_NOT_GFX906:.*]] +; GFX1101: [[IF_NOT_GFX906]]: ; GFX1101-NEXT: call void @llvm.assume(i1 true) ; GFX1101-NEXT: call void @llvm.assume(i1 true) ; GFX1101-NEXT: br label %[[FOR_COND:.*]] @@ -177,13 +135,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) { ; GFX1101-NEXT: br label %[[IF_GFX11_INSTS:.*]] ; GFX1101: [[IF_GFX11_INSTS]]: ; GFX1101-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() -; GFX1101-NEXT: br label %[[IF_END11:.*]] -; GFX1101: [[IF_ELSE8:.*:]] -; GFX1101-NEXT: br label %[[IF_GFX10_INSTS:.*]] -; GFX1101: [[IF_GFX10_INSTS]]: -; GFX1101-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) -; GFX1101-NEXT: br label %[[IF_END11]] -; GFX1101: [[IF_END11]]: +; GFX1101-NEXT: br label %[[IF_ELSE8:.*]] +; GFX1101: [[IF_ELSE8]]: ; GFX1101-NEXT: call void @llvm.assume(i1 true) ; GFX1101-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 ; GFX1101-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] @@ -195,28 +148,19 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) { ; GFX1201-NEXT: [[ENTRY:.*:]] ; GFX1201-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 ; GFX1201-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr -; GFX1201-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]] -; GFX1201: [[LOR_NOT_GFX1201:.*:]] -; GFX1201-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS]] -; GFX1201: [[IF_GFX1201_OR_GFX12_INSTS]]: +; GFX1201-NEXT: br label %[[LOR_NOT_GFX1201:.*]] +; GFX1201: [[LOR_NOT_GFX1201]]: ; GFX1201-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) -; GFX1201-NEXT: br label %[[IF_END:.*]] -; GFX1201: [[IF_END]]: -; GFX1201-NEXT: br label %[[IF_NOT_GFX907:.*]] -; GFX1201: [[IF_NOT_GFX907]]: +; GFX1201-NEXT: br label %[[IF_NOT_GFX906:.*]] +; GFX1201: [[IF_NOT_GFX906]]: +; GFX1201-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]] +; GFX1201: [[IF_GFX1010_OR_GFX1101]]: ; GFX1201-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() ; GFX1201-NEXT: br label %[[IF_END6:.*]] -; GFX1201: [[IF_NOT_GFX906:.*:]] -; GFX1201-NEXT: br label %[[IF_GFX1010_OR_GFX1102:.*]] -; GFX1201: [[IF_GFX1010_OR_GFX1102]]: -; GFX1201-NEXT: br label %[[FOR_COND:.*]] -; GFX1201: [[IF_GFX1010_OR_GFX1101:.*:]] -; GFX1201-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) -; GFX1201-NEXT: br label %[[IF_END6]] ; GFX1201: [[IF_END6]]: ; GFX1201-NEXT: call void @llvm.assume(i1 true) ; GFX1201-NEXT: call void @llvm.assume(i1 true) -; GFX1201-NEXT: br label %[[FOR_COND]] +; GFX1201-NEXT: br label %[[FOR_COND:.*]] ; GFX1201: [[FOR_COND]]: ; GFX1201-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 ; GFX1201-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] @@ -224,13 +168,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) { ; GFX1201-NEXT: br label %[[IF_GFX11_INSTS:.*]] ; GFX1201: [[IF_GFX11_INSTS]]: ; GFX1201-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() -; GFX1201-NEXT: br label %[[IF_END11:.*]] -; GFX1201: [[IF_ELSE8:.*:]] -; GFX1201-NEXT: br label %[[IF_GFX10_INSTS:.*]] -; GFX1201: [[IF_GFX10_INSTS]]: -; GFX1201-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) -; GFX1201-NEXT: br label %[[IF_END11]] -; GFX1201: [[IF_END11]]: +; GFX1201-NEXT: br label %[[IF_ELSE8:.*]] +; GFX1201: [[IF_ELSE8]]: ; GFX1201-NEXT: call void @llvm.assume(i1 true) ; GFX1201-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 ; GFX1201-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] @@ -242,28 +181,19 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) { ; GFX1201-W64-NEXT: [[ENTRY:.*:]] ; GFX1201-W64-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 ; GFX1201-W64-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr -; GFX1201-W64-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]] -; GFX1201-W64: [[LOR_NOT_GFX1201:.*:]] -; GFX1201-W64-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS]] -; GFX1201-W64: [[IF_GFX1201_OR_GFX12_INSTS]]: +; GFX1201-W64-NEXT: br label %[[LOR_NOT_GFX1201:.*]] +; GFX1201-W64: [[LOR_NOT_GFX1201]]: ; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) -; GFX1201-W64-NEXT: br label %[[IF_END:.*]] -; GFX1201-W64: [[IF_END]]: -; GFX1201-W64-NEXT: br label %[[IF_NOT_GFX907:.*]] -; GFX1201-W64: [[IF_NOT_GFX907]]: +; GFX1201-W64-NEXT: br label %[[IF_NOT_GFX906:.*]] +; GFX1201-W64: [[IF_NOT_GFX906]]: +; GFX1201-W64-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]] +; GFX1201-W64: [[IF_GFX1010_OR_GFX1101]]: ; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() ; GFX1201-W64-NEXT: br label %[[IF_END6:.*]] -; GFX1201-W64: [[IF_NOT_GFX906:.*:]] -; GFX1201-W64-NEXT: br label %[[IF_GFX1010_OR_GFX1102:.*]] -; GFX1201-W64: [[IF_GFX1010_OR_GFX1102]]: -; GFX1201-W64-NEXT: br label %[[FOR_COND:.*]] -; GFX1201-W64: [[IF_GFX1010_OR_GFX1101:.*:]] -; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) -; GFX1201-W64-NEXT: br label %[[IF_END6]] ; GFX1201-W64: [[IF_END6]]: ; GFX1201-W64-NEXT: call void @llvm.assume(i1 true) ; GFX1201-W64-NEXT: call void @llvm.assume(i1 true) -; GFX1201-W64-NEXT: br label %[[FOR_COND]] +; GFX1201-W64-NEXT: br label %[[FOR_COND:.*]] ; GFX1201-W64: [[FOR_COND]]: ; GFX1201-W64-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 ; GFX1201-W64-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] @@ -271,13 +201,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) { ; GFX1201-W64-NEXT: br label %[[IF_GFX11_INSTS:.*]] ; GFX1201-W64: [[IF_GFX11_INSTS]]: ; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() -; GFX1201-W64-NEXT: br label %[[IF_END11:.*]] -; GFX1201-W64: [[IF_ELSE8:.*:]] -; GFX1201-W64-NEXT: br label %[[IF_GFX10_INSTS:.*]] -; GFX1201-W64: [[IF_GFX10_INSTS]]: -; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) -; GFX1201-W64-NEXT: br label %[[IF_END11]] -; GFX1201-W64: [[IF_END11]]: +; GFX1201-W64-NEXT: br label %[[IF_ELSE8:.*]] +; GFX1201-W64: [[IF_ELSE8]]: ; GFX1201-W64-NEXT: call void @llvm.assume(i1 true) ; GFX1201-W64-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 ; GFX1201-W64-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] From 246ff3809552af6c14d0953f8a4352de32659513 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Thu, 3 Jul 2025 03:06:57 +0100 Subject: [PATCH 33/38] Fix formatting. --- llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp index cd9e29a4e7d67..cf95171df55c2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp @@ -115,9 +115,9 @@ handlePredicate(const GCNSubtarget &ST, FunctionAnalysisManager &FAM, continue; } else if (I->isTerminator() && ConstantFoldTerminator(I->getParent(), true, nullptr, &DTU)) { - Predicated.insert(F); + Predicated.insert(F); - continue; + continue; } return unfoldableFound(I->getParent()->getParent(), P, I); From 6b368d593156ced102e265f99ea6e057e7599274 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 4 Jul 2025 19:15:24 +0100 Subject: [PATCH 34/38] Remove internal functions made unreachable by predicate expansion. --- .../AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 30 ++++- ...predicates-remove-unreachable-functions.ll | 104 ++++++++++++++++++ 2 files changed, 130 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-remove-unreachable-functions.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp index cf95171df55c2..40ab71b609bc8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp @@ -127,6 +127,30 @@ handlePredicate(const GCNSubtarget &ST, FunctionAnalysisManager &FAM, } } // Unnamed namespace. +static inline SmallVector collectUsedFunctions(Module &M) { + SmallVector Ret; + for (auto &&F : M) { + if (F.isIntrinsic() || F.isDeclaration()) + continue; + if (!F.hasInternalLinkage() && !F.hasPrivateLinkage()) + continue; + if (F.hasNUndroppableUsesOrMore(1)) + Ret.push_back(&F); + } + + return Ret; +} + +template +static inline void removeUnreachable(const Container0 &Predicates, + const Container1 &PredicatedFns, + const Container2 &UnreachableFns) { + for_each(Predicates, [](auto &&P) { P->eraseFromParent(); }); + for_each(PredicatedFns, [](auto &&F) { removeUnreachableBlocks(*F); }); + for_each(UnreachableFns, + [](auto &&F) { if (F->getNumUses() == 0) F->eraseFromParent(); }); +} + PreservedAnalyses AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) { if (M.empty()) @@ -148,6 +172,7 @@ AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) { auto &FAM = MAM.getResult(M).getManager(); SmallPtrSet Predicated; + SmallVector MaybeUnreachable = collectUsedFunctions(M); auto Ret = PreservedAnalyses::all(); for (auto &&P : Predicates) { auto R = handlePredicate(ST, FAM, Predicated, P); @@ -158,10 +183,7 @@ AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) { Ret.intersect(R.first); } - for (auto &&P : Predicates) - P->eraseFromParent(); - for (auto &&F : Predicated) - removeUnreachableBlocks(*F); + removeUnreachable(Predicates, Predicated, MaybeUnreachable); return Ret; } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-remove-unreachable-functions.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-remove-unreachable-functions.ll new file mode 100644 index 0000000000000..c5089de333849 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-remove-unreachable-functions.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --version 5 +; REQUIRES: amdgpu-registered-target + +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201 %s + +@llvm.amdgcn.is.gfx906 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.is.gfx1201 = external addrspace(1) externally_initialized constant i1 + +define external void @extern_linkage() { +; GFX906-LABEL: define void @extern_linkage( +; GFX906-SAME: ) #[[ATTR0:[0-9]+]] { +; GFX906-NEXT: [[ENTRY:.*:]] +; GFX906-NEXT: ret void +; +; GFX1201-LABEL: define void @extern_linkage( +; GFX1201-SAME: ) #[[ATTR0:[0-9]+]] { +; GFX1201-NEXT: [[ENTRY:.*:]] +; GFX1201-NEXT: ret void +; +entry: + ret void +} + +define private void @non_predicated_uses() { +; GFX906-LABEL: define private void @non_predicated_uses( +; GFX906-SAME: ) #[[ATTR0]] { +; GFX906-NEXT: [[ENTRY:.*:]] +; GFX906-NEXT: ret void +; +; GFX1201-LABEL: define private void @non_predicated_uses( +; GFX1201-SAME: ) #[[ATTR0]] { +; GFX1201-NEXT: [[ENTRY:.*:]] +; GFX1201-NEXT: ret void +; +entry: + ret void +} + +define internal void @remove_on_906() { +; GFX1201-LABEL: define internal void @remove_on_906( +; GFX1201-SAME: ) #[[ATTR0]] { +; GFX1201-NEXT: [[ENTRY:.*:]] +; GFX1201-NEXT: ret void +; +entry: + ret void +} + +define internal void @remove_on_1201() { +; GFX906-LABEL: define internal void @remove_on_1201( +; GFX906-SAME: ) #[[ATTR0]] { +; GFX906-NEXT: [[ENTRY:.*:]] +; GFX906-NEXT: ret void +; +entry: + ret void +} + +define void @foo() { +; GFX906-LABEL: define void @foo( +; GFX906-SAME: ) #[[ATTR0]] { +; GFX906-NEXT: [[ENTRY:.*:]] +; GFX906-NEXT: call void @non_predicated_uses() +; GFX906-NEXT: br label %[[NOT_GFX1201:.*]] +; GFX906: [[NOT_GFX1201]]: +; GFX906-NEXT: br label %[[GFX906:.*]] +; GFX906: [[GFX906]]: +; GFX906-NEXT: call void @remove_on_1201() +; GFX906-NEXT: br label %[[END:.*]] +; GFX906: [[END]]: +; GFX906-NEXT: ret void +; +; GFX1201-LABEL: define void @foo( +; GFX1201-SAME: ) #[[ATTR0]] { +; GFX1201-NEXT: [[ENTRY:.*:]] +; GFX1201-NEXT: call void @non_predicated_uses() +; GFX1201-NEXT: br label %[[GFX1201:.*]] +; GFX1201: [[GFX1201]]: +; GFX1201-NEXT: call void @remove_on_906() +; GFX1201-NEXT: br label %[[END:.*]] +; GFX1201: [[END]]: +; GFX1201-NEXT: ret void +; +entry: + call void @non_predicated_uses() + %0 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1201, align 1 + br i1 %0, label %gfx1201, label %not.gfx1201 + +gfx1201: + call void @remove_on_906() + br label %end + +not.gfx1201: + %1 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx906, align 1 + br i1 %1, label %gfx906, label %end + +gfx906: + call void @remove_on_1201() + br label %end + +end: + ret void +} From 435ce05571bc599eb493cf9508b6c320b69f2fa5 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 4 Jul 2025 19:30:34 +0100 Subject: [PATCH 35/38] Fix formatting, tweak use count. --- .../Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp index 40ab71b609bc8..d83c305fb0404 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp @@ -134,8 +134,9 @@ static inline SmallVector collectUsedFunctions(Module &M) { continue; if (!F.hasInternalLinkage() && !F.hasPrivateLinkage()) continue; - if (F.hasNUndroppableUsesOrMore(1)) - Ret.push_back(&F); + if (F.hasNUndroppableUses(0)) + continue; + Ret.push_back(&F); } return Ret; @@ -147,8 +148,12 @@ static inline void removeUnreachable(const Container0 &Predicates, const Container2 &UnreachableFns) { for_each(Predicates, [](auto &&P) { P->eraseFromParent(); }); for_each(PredicatedFns, [](auto &&F) { removeUnreachableBlocks(*F); }); - for_each(UnreachableFns, - [](auto &&F) { if (F->getNumUses() == 0) F->eraseFromParent(); }); + for_each(UnreachableFns, [](auto &&F) { + if (!F->hasNUndroppableUses(0)) + return; + F->dropDroppableUses(); + F->eraseFromParent(); + }); } PreservedAnalyses From 2c2f78bd3f189a2a93b19160cb974a50279326a2 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 4 Jul 2025 19:34:52 +0100 Subject: [PATCH 36/38] Fix formatting, again. --- llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp index d83c305fb0404..fc0d3e378044d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp @@ -142,7 +142,7 @@ static inline SmallVector collectUsedFunctions(Module &M) { return Ret; } -template +template static inline void removeUnreachable(const Container0 &Predicates, const Container1 &PredicatedFns, const Container2 &UnreachableFns) { From 81778760b4022296ca9387a034067830317387e1 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sat, 12 Jul 2025 02:59:50 +0100 Subject: [PATCH 37/38] Add warnings around unguarded builtin usage, suggesting `__builtin_amdgcn_is_invocable` as the solution. --- clang/include/clang/Basic/DiagnosticGroups.td | 3 + .../clang/Basic/DiagnosticSemaKinds.td | 6 + clang/include/clang/Sema/SemaAMDGPU.h | 6 + clang/lib/Sema/SemaAMDGPU.cpp | 111 ++++++++++++++++++ clang/lib/Sema/SemaDecl.cpp | 9 +- clang/lib/Sema/SemaExpr.cpp | 7 ++ ...amdgpu-is-invocable-guards-builtin-use.hip | 47 ++++++++ 7 files changed, 187 insertions(+), 2 deletions(-) create mode 100644 clang/test/SemaHIP/amdgpu-is-invocable-guards-builtin-use.hip diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 9a7a308600763..bcfc0d99a8bfd 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1755,3 +1755,6 @@ def ExplicitSpecializationStorageClass : DiagGroup<"explicit-specialization-stor // A warning for options that enable a feature that is not yet complete def ExperimentalOption : DiagGroup<"experimental-option">; + +// Warnings about unguarded usages of AMDGPU target specific constructs +def UnguardedBuiltinUsageAMDGPU : DiagGroup<"amdgpu-unguarded-builtin-usage">; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 0481503fe8de6..16cb7814d0626 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -13485,4 +13485,10 @@ def err_amdgcn_predicate_type_needs_explicit_bool_cast "guarding of target dependent code, and thus should be avoided">; def note_amdgcn_protected_by_predicate : Note<"jump enters statement controlled" " by AMDGPU feature predicate">; +def warn_amdgcn_unguarded_builtin : + Warning<"%0 might be unavailable on some AMDGPU targets">, + InGroup, DefaultIgnore; +def note_amdgcn_unguarded_builtin_silence + : Note<"enclose %0 in a __builtin_amdgcn_is_invocable check to silence " + "this warning">; } // end of sema component. diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h index f72e1c53d2c92..1a6752d7ec0d6 100644 --- a/clang/include/clang/Sema/SemaAMDGPU.h +++ b/clang/include/clang/Sema/SemaAMDGPU.h @@ -24,6 +24,7 @@ class ParsedAttr; class SemaAMDGPU : public SemaBase { llvm::SmallPtrSet ExpandedPredicates; + llvm::SmallPtrSet PotentiallyUnguardedBuiltinUsers; public: SemaAMDGPU(Sema &S); @@ -73,6 +74,11 @@ class SemaAMDGPU : public SemaBase { /// corresponding sequence of instructions. Expr *ExpandAMDGPUPredicateBI(CallExpr *CE); bool IsPredicate(Expr *E) const; + /// Diagnose unguarded usages of AMDGPU builtins and recommend guarding with + /// __builtin_amdgcn_is_invocable + void AddPotentiallyUnguardedBuiltinUser(FunctionDecl *FD); + bool HasPotentiallyUnguardedBuiltinUsage(FunctionDecl *FD) const; + void DiagnoseUnguardedBuiltinUsage(FunctionDecl *FD); }; } // namespace clang diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index 5d381229f63c7..aed734ef94465 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -11,10 +11,14 @@ //===----------------------------------------------------------------------===// #include "clang/Sema/SemaAMDGPU.h" +#include "clang/AST/Decl.h" +#include "clang/AST/DynamicRecursiveASTVisitor.h" +#include "clang/AST/Expr.h" #include "clang/Basic/DiagnosticSema.h" #include "clang/Basic/TargetBuiltins.h" #include "clang/Basic/TargetInfo.h" #include "clang/Sema/Ownership.h" +#include "clang/Sema/Scope.h" #include "clang/Sema/Sema.h" #include "llvm/Support/AtomicOrdering.h" #include @@ -444,4 +448,111 @@ Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) { bool SemaAMDGPU::IsPredicate(Expr *E) const { return ExpandedPredicates.contains(E); } + +void SemaAMDGPU::AddPotentiallyUnguardedBuiltinUser(FunctionDecl *FD) { + PotentiallyUnguardedBuiltinUsers.insert(FD); +} + +bool SemaAMDGPU::HasPotentiallyUnguardedBuiltinUsage(FunctionDecl *FD) const { + return PotentiallyUnguardedBuiltinUsers.contains(FD); +} + +namespace { + /// This class implements -Wamdgpu-unguarded-builtin-usage. + /// + /// This is done with a traversal of the AST of a function that includes a + /// call to a target specific builtin. Whenever we encounter an \c if of the + /// form: \c if(__builtin_amdgcn_is_invocable), we consider the then statement + /// guarded. +class DiagnoseUnguardedBuiltins : public DynamicRecursiveASTVisitor { + // TODO: this is conservative, and should be extended to: + // - warn on unguarded ASM usage (__builtin_amdgcn_processor_is as the + // guard); + // - build sets of builtins which are invocable from nested + // if (__builtin_amdgcn_is_invocable) calls, rather than assume + // sanity / that the existence of a guard implies its correctness; + // - derive the set of available builtins / valid ASM constraints from + // the target architecture passed to __builtin_amdgcn_processor_is; + // - consider attributes such as target. + Sema &SemaRef; + + unsigned Guards; +public: + DiagnoseUnguardedBuiltins(Sema &SemaRef) : SemaRef(SemaRef), Guards(0u) {} + + bool TraverseLambdaExpr(LambdaExpr *LE) override { + if (SemaRef.AMDGPU().HasPotentiallyUnguardedBuiltinUsage(LE->getCallOperator())) + return true; // We have already handled this. + return DynamicRecursiveASTVisitor::TraverseLambdaExpr(LE); + } + + bool TraverseStmt(Stmt *S) override { + if (!S) + return true; + return DynamicRecursiveASTVisitor::TraverseStmt(S); + } + + void IssueDiagnostics(Stmt *S) { TraverseStmt(S); } + + bool TraverseIfStmt(IfStmt *If) override; + + bool TraverseCaseStmt(CaseStmt *CS) override { + return TraverseStmt(CS->getSubStmt()); + } + + bool VisitCallExpr(CallExpr *CE) override; +}; + +inline Expr *FindPredicate(Expr *Cond) { + if (auto *CE = dyn_cast(Cond)) { + if (CE->getBuiltinCallee() == AMDGPU::BI__builtin_amdgcn_is_invocable) + return Cond; + } else if (auto *UO = dyn_cast(Cond)) { + return FindPredicate(UO->getSubExpr()); + } else if (auto *BO = dyn_cast(Cond)) { + if ((Cond = FindPredicate(BO->getLHS()))) + return Cond; + return FindPredicate(BO->getRHS()); + } + return nullptr; +} + +bool DiagnoseUnguardedBuiltins::TraverseIfStmt(IfStmt *If) { + if (FindPredicate(If->getCond())) { + ++Guards; + bool Continue = TraverseStmt(If->getThen()); + --Guards; + + return Continue && TraverseStmt(If->getElse()); + } + + return DynamicRecursiveASTVisitor::TraverseIfStmt(If); +} + +bool DiagnoseUnguardedBuiltins::VisitCallExpr(CallExpr *CE) { + if (Guards) + return true; + + unsigned ID = CE->getBuiltinCallee(); + + if (!ID) + return true; + if (!SemaRef.getASTContext().BuiltinInfo.isTSBuiltin(ID)) + return true; + if (ID == AMDGPU::BI__builtin_amdgcn_processor_is || + ID == AMDGPU::BI__builtin_amdgcn_is_invocable) + return true; + + SemaRef.Diag(CE->getExprLoc(), diag::warn_amdgcn_unguarded_builtin) + << CE->getDirectCallee(); + SemaRef.Diag(CE->getExprLoc(), diag::note_amdgcn_unguarded_builtin_silence) + << CE->getDirectCallee(); + + return true; + } +} // Unnamed namespace + +void SemaAMDGPU::DiagnoseUnguardedBuiltinUsage(FunctionDecl *FD) { + DiagnoseUnguardedBuiltins(SemaRef).IssueDiagnostics(FD->getBody()); +} } // namespace clang diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index e405cfc0a5d17..f586f09bce19a 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -45,6 +45,7 @@ #include "clang/Sema/ParsedTemplate.h" #include "clang/Sema/Scope.h" #include "clang/Sema/ScopeInfo.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaARM.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaHLSL.h" @@ -16602,8 +16603,12 @@ Decl *Sema::ActOnFinishFunctionBody(Decl *dcl, Stmt *Body, return nullptr; } - if (Body && FSI->HasPotentialAvailabilityViolations) - DiagnoseUnguardedAvailabilityViolations(dcl); + if (Body) { + if (FSI->HasPotentialAvailabilityViolations) + DiagnoseUnguardedAvailabilityViolations(dcl); + else if (AMDGPU().HasPotentiallyUnguardedBuiltinUsage(FD)) + AMDGPU().DiagnoseUnguardedBuiltinUsage(FD); + } assert(!FSI->ObjCShouldCallSuper && "This should only be set for ObjC methods, which should have been " diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 0aa0ccf89d0f9..ef2059d05a8d3 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -6705,6 +6705,13 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, FunctionDecl *FDecl = dyn_cast(NDecl); if (FDecl && FDecl->getBuiltinID()) { + if (Context.BuiltinInfo.isTSBuiltin(FDecl->getBuiltinID())) { + const llvm::Triple &Triple = Context.getTargetInfo().getTriple(); + if (Triple.isSPIRV() && Triple.getVendor() == llvm::Triple::AMD) + AMDGPU().AddPotentiallyUnguardedBuiltinUser(cast( + getFunctionLevelDeclContext(/*AllowLambda=*/ true))); + } + // Rewrite the function decl for this builtin by replacing parameters // with no explicit address space with the address space of the arguments // in ArgExprs. diff --git a/clang/test/SemaHIP/amdgpu-is-invocable-guards-builtin-use.hip b/clang/test/SemaHIP/amdgpu-is-invocable-guards-builtin-use.hip new file mode 100644 index 0000000000000..26544590f7536 --- /dev/null +++ b/clang/test/SemaHIP/amdgpu-is-invocable-guards-builtin-use.hip @@ -0,0 +1,47 @@ +// REQUIRES: amdgpu-registered-target +// REQUIRES: spirv-registered-target +// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wamdgpu-unguarded-builtin-usage %s + +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) + +__device__ void g(); + +__device__ void f(int x, bool b) { + const auto lambda = [=] __device__ () { + __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}} + // expected-note@-1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}} + + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) + __builtin_amdgcn_s_sleep_var(x); + }; + + const auto generic_lambda = [] __device__ (auto&& y) { + __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}} + // expected-note@-1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}} + + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) + __builtin_amdgcn_s_sleep_var(y); + }; + + __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}} + // expected-note@-1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}} + + // processor_is does not (yet) guard + if (__builtin_amdgcn_processor_is("gfx900")) + __builtin_amdgcn_s_sleep_var(x); // expected-warning {{'__builtin_amdgcn_s_sleep_var' might be unavailable on some AMDGPU targets}} + // expected-note@-1 {{enclose '__builtin_amdgcn_s_sleep_var' in a __builtin_amdgcn_is_invocable check to silence this warning}} + + // Direct guard + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep)) + __builtin_amdgcn_s_sleep(42); + + // Guarded scope + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) { + if (b) { + g(); + while (--x > 42) + __builtin_amdgcn_s_sleep_var(x); + } + } +} From b4decc204a48d1e7d06db5dae4b9704fbac5ab3c Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sat, 12 Jul 2025 03:10:17 +0100 Subject: [PATCH 38/38] Fix formatting. --- clang/lib/Sema/SemaAMDGPU.cpp | 48 ++++++++++++++++++----------------- clang/lib/Sema/SemaExpr.cpp | 2 +- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index aed734ef94465..af4668e17ed68 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -458,12 +458,12 @@ bool SemaAMDGPU::HasPotentiallyUnguardedBuiltinUsage(FunctionDecl *FD) const { } namespace { - /// This class implements -Wamdgpu-unguarded-builtin-usage. - /// - /// This is done with a traversal of the AST of a function that includes a - /// call to a target specific builtin. Whenever we encounter an \c if of the - /// form: \c if(__builtin_amdgcn_is_invocable), we consider the then statement - /// guarded. +/// This class implements -Wamdgpu-unguarded-builtin-usage. +/// +/// This is done with a traversal of the AST of a function that includes a +/// call to a target specific builtin. Whenever we encounter an \c if of the +/// form: \c if(__builtin_amdgcn_is_invocable), we consider the then statement +/// guarded. class DiagnoseUnguardedBuiltins : public DynamicRecursiveASTVisitor { // TODO: this is conservative, and should be extended to: // - warn on unguarded ASM usage (__builtin_amdgcn_processor_is as the @@ -477,11 +477,13 @@ class DiagnoseUnguardedBuiltins : public DynamicRecursiveASTVisitor { Sema &SemaRef; unsigned Guards; + public: DiagnoseUnguardedBuiltins(Sema &SemaRef) : SemaRef(SemaRef), Guards(0u) {} bool TraverseLambdaExpr(LambdaExpr *LE) override { - if (SemaRef.AMDGPU().HasPotentiallyUnguardedBuiltinUsage(LE->getCallOperator())) + if (SemaRef.AMDGPU().HasPotentiallyUnguardedBuiltinUsage( + LE->getCallOperator())) return true; // We have already handled this. return DynamicRecursiveASTVisitor::TraverseLambdaExpr(LE); } @@ -530,26 +532,26 @@ bool DiagnoseUnguardedBuiltins::TraverseIfStmt(IfStmt *If) { } bool DiagnoseUnguardedBuiltins::VisitCallExpr(CallExpr *CE) { - if (Guards) - return true; + if (Guards) + return true; - unsigned ID = CE->getBuiltinCallee(); + unsigned ID = CE->getBuiltinCallee(); - if (!ID) - return true; - if (!SemaRef.getASTContext().BuiltinInfo.isTSBuiltin(ID)) - return true; - if (ID == AMDGPU::BI__builtin_amdgcn_processor_is || - ID == AMDGPU::BI__builtin_amdgcn_is_invocable) - return true; + if (!ID) + return true; + if (!SemaRef.getASTContext().BuiltinInfo.isTSBuiltin(ID)) + return true; + if (ID == AMDGPU::BI__builtin_amdgcn_processor_is || + ID == AMDGPU::BI__builtin_amdgcn_is_invocable) + return true; - SemaRef.Diag(CE->getExprLoc(), diag::warn_amdgcn_unguarded_builtin) - << CE->getDirectCallee(); - SemaRef.Diag(CE->getExprLoc(), diag::note_amdgcn_unguarded_builtin_silence) - << CE->getDirectCallee(); + SemaRef.Diag(CE->getExprLoc(), diag::warn_amdgcn_unguarded_builtin) + << CE->getDirectCallee(); + SemaRef.Diag(CE->getExprLoc(), diag::note_amdgcn_unguarded_builtin_silence) + << CE->getDirectCallee(); - return true; - } + return true; +} } // Unnamed namespace void SemaAMDGPU::DiagnoseUnguardedBuiltinUsage(FunctionDecl *FD) { diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index ef2059d05a8d3..0eb95826342ff 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -6709,7 +6709,7 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, const llvm::Triple &Triple = Context.getTargetInfo().getTriple(); if (Triple.isSPIRV() && Triple.getVendor() == llvm::Triple::AMD) AMDGPU().AddPotentiallyUnguardedBuiltinUser(cast( - getFunctionLevelDeclContext(/*AllowLambda=*/ true))); + getFunctionLevelDeclContext(/*AllowLambda=*/true))); } // Rewrite the function decl for this builtin by replacing parameters