diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index a42a546555716..0f38510880290 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -5025,6 +5025,118 @@ If no address spaces names are provided, all address spaces are fenced. __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local") __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local", "global") +__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``__builtin_amdgcn_processor_is`` and ``__builtin_amdgcn_is_invocable`` provide +a functional mechanism for programmatically querying: + +* the identity of the current target processor; +* the capability of the current target processor to invoke a particular builtin. + +**Syntax**: + +.. code-block:: c + + __amdgpu_feature_predicate_t __builtin_amdgcn_processor_is(const char*); + __amdgpu_feature_predicate_t __builtin_amdgcn_is_invocable(builtin_name); + +**Example of use**: + +.. code-block:: c++ + + if (__builtin_amdgcn_processor_is("gfx1201") || + __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) + __builtin_amdgcn_s_sleep_var(x); + + if (!__builtin_amdgcn_processor_is("gfx906")) + __builtin_amdgcn_s_wait_event_export_ready(); + else if (__builtin_amdgcn_processor_is("gfx1010") || + __builtin_amdgcn_processor_is("gfx1101")) + __builtin_amdgcn_s_ttracedata_imm(1); + + while (__builtin_amdgcn_processor_is("gfx1101")) *p += x; + + do { + break; + } while (__builtin_amdgcn_processor_is("gfx1010")); + + for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break; + + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)) + __builtin_amdgcn_s_wait_event_export_ready(); + else if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm)) + __builtin_amdgcn_s_ttracedata_imm(1); + + do { + break; + } while ( + __builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32)); + + for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) + break; + +**Description**: + +The builtins return a value of type ``__amdgpu_feature_predicate_t``, which is a +target-specific type that behaves as if its C++ definition were the following: + +.. code-block:: c++ + + struct __amdgpu_feature_predicate_t { + __amdgpu_feature_predicate_t() = delete; + __amdgpu_feature_predicate_t(const __amdgpu_feature_predicate_t&) = delete; + __amdgpu_feature_predicate_t(__amdgpu_feature_predicate_t&&) = delete; + + explicit + operator bool() const noexcept; + }; + +The builtins can be used in C as well, wherein the +``__amdgpu_feature_predicate_t`` type behaves as an opaque, forward-declared +type with conditional automatic conversion to ``_Bool`` when used as the +predicate argument to a control structure: + +.. 
code-block:: c + + struct __amdgpu_feature_predicate_t ret(); // Error + void arg(struct __amdgpu_feature_predicate_t); // Error + void local() { + struct __amdgpu_feature_predicate_t x; // Error + struct __amdgpu_feature_predicate_t y = + __builtin_amdgcn_processor_is("gfx900"); // Error + } + void valid_use() { + _Bool x = (_Bool)__builtin_amdgcn_processor_is("gfx900"); // OK + if (__builtin_amdgcn_processor_is("gfx900")) // Implicit cast to _Bool + return; + for (; __builtin_amdgcn_processor_is("gfx900");) // Implicit cast to _Bool + break; + while (__builtin_amdgcn_processor_is("gfx900")) // Implicit cast to _Bool + break; + do { + break; + } while (__builtin_amdgcn_processor_is("gfx900")); // Implicit cast to _Bool + + __builtin_amdgcn_processor_is("gfx900") ? x : !x; + } + +The boolean interpretation of the predicate values returned by the builtins: + +* indicates whether the current target matches the argument; the argument MUST + be a string literal and a valid AMDGPU target +* indicates whether the builtin function passed as the argument can be invoked + by the current target; the argument MUST be either a generic or an + AMDGPU-specific builtin name + +When invoked while compiling for a concrete target, the builtins are evaluated +early by Clang and produce no CodeGen effects, i.e. they have no observable +side-effects in IR. Conversely, when compiling for AMDGCN flavoured SPIR-V, +which is an abstract target, a series of predicate values is implicitly +created. These predicates are resolved when the compilation is finalized for a +concrete target, and reflect that target's identity and features. Thus, it is +possible to author high-level code, e.g. in HIP, that is target adaptive in a +dynamic fashion, unlike macro-based mechanisms. ARM/AArch64 Language Extensions ------------------------------- diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 9e16613826a77..3ec0dd4276735 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -975,6 +975,10 @@ AMDGPU Support ^^^^^^^^^^^^^^ - Bump the default code object version to 6. ROCm 6.3 is required to run any program compiled with COV6. +- Introduced a new target-specific builtin ``__builtin_amdgcn_processor_is``, + a late / deferred query for the current target processor. +- Introduced a new target-specific builtin ``__builtin_amdgcn_is_invocable``, + which enables fine-grained, per-builtin feature availability queries. 
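To make the adaptive-dispatch idiom these builtins enable concrete, here is a minimal sketch, in HIP-flavoured C++, of a device function that guards a target-specific builtin and falls back to portable code; the ``slow_wait`` helper is hypothetical and stands in for whatever generic fallback an application provides:

.. code-block:: c++

   #define __device__ __attribute__((device))

   // Hypothetical portable fallback, for illustration only.
   __device__ void slow_wait(int cycles) {
     while (cycles-- > 0)
       ; // busy-wait stand-in
   }

   __device__ void adaptive_sleep(int cycles) {
     // On a concrete target Clang folds this predicate early; on AMDGCN
     // flavoured SPIR-V it becomes a placeholder global that is resolved
     // once the concrete target is known.
     if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
       __builtin_amdgcn_s_sleep_var(cycles);
     else
       slow_wait(cycles);
   }
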
NVPTX Support ^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/AMDGPUTypes.def b/clang/include/clang/Basic/AMDGPUTypes.def index d3dff446f9edf..a0574c640184b 100644 --- a/clang/include/clang/Basic/AMDGPUTypes.def +++ b/clang/include/clang/Basic/AMDGPUTypes.def @@ -20,10 +20,18 @@ AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) #endif +#ifndef AMDGPU_FEATURE_PREDICATE_TYPE +#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align) \ + AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) +#endif + AMDGPU_OPAQUE_PTR_TYPE("__amdgpu_buffer_rsrc_t", AMDGPUBufferRsrc, AMDGPUBufferRsrcTy, 128, 128, 8) AMDGPU_NAMED_BARRIER_TYPE("__amdgpu_named_workgroup_barrier_t", AMDGPUNamedWorkgroupBarrier, AMDGPUNamedWorkgroupBarrierTy, 128, 32, 0) +AMDGPU_FEATURE_PREDICATE_TYPE("__amdgpu_feature_predicate_t", AMDGPUFeaturePredicate, AMDGPUFeaturePredicateTy, 1, 1) + #undef AMDGPU_TYPE #undef AMDGPU_OPAQUE_PTR_TYPE #undef AMDGPU_NAMED_BARRIER_TYPE +#undef AMDGPU_FEATURE_PREDICATE_TYPE diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def index 48437c9397570..27f78af16fe06 100644 --- a/clang/include/clang/Basic/Builtins.def +++ b/clang/include/clang/Basic/Builtins.def @@ -34,6 +34,7 @@ // Q -> target builtin type, followed by a character to distinguish the builtin type // Qa -> AArch64 svcount_t builtin type. // Qb -> AMDGPU __amdgpu_buffer_rsrc_t builtin type. +// Qc -> AMDGPU __amdgpu_feature_predicate_t builtin type. // E -> ext_vector, followed by the number of elements and the base type. // X -> _Complex, followed by the base type. // Y -> ptrdiff_t diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index a5ee8013adff6..c0f30a718998f 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -350,6 +350,11 @@ BUILTIN(__builtin_amdgcn_endpgm, "v", "nr") BUILTIN(__builtin_amdgcn_get_fpenv, "WUi", "n") BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n") +// These are special FE-only builtins intended for forwarding the requirements +// to the ME. +BUILTIN(__builtin_amdgcn_processor_is, "QccC*", "nctu") +BUILTIN(__builtin_amdgcn_is_invocable, "Qc", "nctu") + //===----------------------------------------------------------------------===// // R600-NI only builtins. 
//===----------------------------------------------------------------------===// diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 9a7a308600763..bcfc0d99a8bfd 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1755,3 +1755,6 @@ def ExplicitSpecializationStorageClass : DiagGroup<"explicit-specialization-stor // A warning for options that enable a feature that is not yet complete def ExperimentalOption : DiagGroup<"experimental-option">; + +// Warnings about unguarded usages of AMDGPU target-specific constructs +def UnguardedBuiltinUsageAMDGPU : DiagGroup<"amdgpu-unguarded-builtin-usage">; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 09ba796b22765..16cb7814d0626 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12526,7 +12526,7 @@ def warn_zero_as_null_pointer_constant : Warning< InGroup<DiagGroup<"zero-as-null-pointer-constant">>, DefaultIgnore; def warn_not_eliding_copy_on_return : Warning< - "not eliding copy on return">, + "not eliding copy on return">, InGroup<DiagGroup<"nrvo">>, DefaultIgnore; def err_nullability_cs_multilevel : Error< @@ -13464,4 +13464,31 @@ def err_acc_device_type_multiple_archs // AMDGCN builtins diagnostics def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">; def note_amdgcn_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">; +def err_amdgcn_processor_is_arg_not_literal + : Error<"the argument to __builtin_amdgcn_processor_is must be a string " + "literal">; +def err_amdgcn_processor_is_arg_invalid_value + : Error<"the argument to __builtin_amdgcn_processor_is must be a valid " + "AMDGCN processor identifier; '%0' is not valid">; +def note_amdgcn_processor_is_valid_options + : Note<"valid AMDGCN processor identifiers are: %0">; +def err_amdgcn_is_invocable_arg_invalid_value + : Error<"the argument to __builtin_amdgcn_is_invocable must be either a " + "target agnostic builtin or an AMDGCN target specific builtin; `%0`" + " is not valid">; +def err_amdgcn_predicate_type_is_not_constructible + : Error<"%0 has type __amdgpu_feature_predicate_t, which is not" + " constructible">; +def err_amdgcn_predicate_type_needs_explicit_bool_cast + : Error<"%0 must be explicitly cast to %1; however, please note that this " + "is almost always an error and that it prevents the effective " + "guarding of target dependent code, and thus should be avoided">; +def note_amdgcn_protected_by_predicate : Note<"jump enters statement controlled" + " by AMDGPU feature predicate">; +def warn_amdgcn_unguarded_builtin : + Warning<"%0 might be unavailable on some AMDGPU targets">, + InGroup<UnguardedBuiltinUsageAMDGPU>, DefaultIgnore; +def note_amdgcn_unguarded_builtin_silence + : Note<"enclose %0 in a __builtin_amdgcn_is_invocable check to silence " + "this warning">; } // end of sema component. 
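The diagnostics above map onto user code as follows; this is a minimal sketch (names are illustrative) of code that would trigger each of them when compiling for AMDGCN flavoured SPIR-V with ``-Wamdgpu-unguarded-builtin-usage`` enabled:

.. code-block:: c++

   void diagnostics_demo(int x) {
     // warn_amdgcn_unguarded_builtin + note_amdgcn_unguarded_builtin_silence
     // (the group is DefaultIgnore, so these only fire with the flag above).
     __builtin_amdgcn_s_sleep(42);

     // err_amdgcn_predicate_type_is_not_constructible.
     __amdgpu_feature_predicate_t p =
         __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep);

     // err_amdgcn_predicate_type_needs_explicit_bool_cast.
     bool b = __builtin_amdgcn_processor_is("gfx900");

     // Guarded use: no diagnostic.
     if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
       __builtin_amdgcn_s_sleep_var(x);
   }
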
diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h index d62c9bb65fadb..1a6752d7ec0d6 100644 --- a/clang/include/clang/Sema/SemaAMDGPU.h +++ b/clang/include/clang/Sema/SemaAMDGPU.h @@ -15,12 +15,17 @@ #include "clang/AST/ASTFwd.h" #include "clang/Sema/SemaBase.h" +#include "llvm/ADT/SmallPtrSet.h" namespace clang { class AttributeCommonInfo; +class Expr; class ParsedAttr; class SemaAMDGPU : public SemaBase { + llvm::SmallPtrSet<Expr *, 4> ExpandedPredicates; + llvm::SmallPtrSet<FunctionDecl *, 4> PotentiallyUnguardedBuiltinUsers; + public: SemaAMDGPU(Sema &S); @@ -64,6 +69,16 @@ class SemaAMDGPU : public SemaBase { void handleAMDGPUNumVGPRAttr(Decl *D, const ParsedAttr &AL); void handleAMDGPUMaxNumWorkGroupsAttr(Decl *D, const ParsedAttr &AL); void handleAMDGPUFlatWorkGroupSizeAttr(Decl *D, const ParsedAttr &AL); + + /// Expand a valid use of the feature identification builtins into its + /// corresponding sequence of instructions. + Expr *ExpandAMDGPUPredicateBI(CallExpr *CE); + bool IsPredicate(Expr *E) const; + /// Diagnose unguarded usages of AMDGPU builtins and recommend guarding with + /// __builtin_amdgcn_is_invocable + void AddPotentiallyUnguardedBuiltinUser(FunctionDecl *FD); + bool HasPotentiallyUnguardedBuiltinUsage(FunctionDecl *FD) const; + void DiagnoseUnguardedBuiltinUsage(FunctionDecl *FD); }; } // namespace clang diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 679812adcdf12..dedfb396200fd 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -1476,7 +1476,12 @@ void ASTContext::InitBuiltinTypes(const TargetInfo &Target, } if (Target.getTriple().isAMDGPU() || - (AuxTarget && AuxTarget->getTriple().isAMDGPU())) { + (Target.getTriple().isSPIRV() && + Target.getTriple().getVendor() == llvm::Triple::AMD) || + (AuxTarget && + (AuxTarget->getTriple().isAMDGPU() || + ((AuxTarget->getTriple().isSPIRV() && + AuxTarget->getTriple().getVendor() == llvm::Triple::AMD))))) { #define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ InitBuiltinType(SingletonId, BuiltinType::Id); #include "clang/Basic/AMDGPUTypes.def" @@ -12499,6 +12504,10 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context, Type = Context.AMDGPUBufferRsrcTy; break; } + case 'c': { + Type = Context.AMDGPUFeaturePredicateTy; + break; + } default: llvm_unreachable("Unexpected target builtin type"); } diff --git a/clang/lib/Basic/Targets/SPIR.cpp b/clang/lib/Basic/Targets/SPIR.cpp index 2336fb3ef0495..19f160e279b21 100644 --- a/clang/lib/Basic/Targets/SPIR.cpp +++ b/clang/lib/Basic/Targets/SPIR.cpp @@ -181,3 +181,12 @@ void SPIRV64AMDGCNTargetInfo::setAuxTarget(const TargetInfo *Aux) { Float128Format = DoubleFormat; } } + +bool SPIRV64AMDGCNTargetInfo::isValidCPUName(StringRef CPU) const { + return AMDGPUTI.isValidCPUName(CPU); +} + +void SPIRV64AMDGCNTargetInfo::fillValidCPUList( + SmallVectorImpl<StringRef> &Values) const { + return AMDGPUTI.fillValidCPUList(Values); +} diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index 1abf798d93129..1b4b6332eebb6 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -454,6 +454,11 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64AMDGCNTargetInfo final } bool hasInt128Type() const override { return TargetInfo::hasInt128Type(); } + + // This is only needed for validating arguments passed to + // __builtin_amdgcn_processor_is + bool isValidCPUName(StringRef Name) const override; + void fillValidCPUList(SmallVectorImpl<StringRef> &Values) const override; }; } // 
namespace targets diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 8bf7d24d8551d..2936bd34b7ad5 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -1029,6 +1029,13 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) { DBuilder.createBasicType(Name, Width, llvm::dwarf::DW_ATE_unsigned); \ return SingletonId; \ } +#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align) \ + case BuiltinType::Id: { \ + if (!SingletonId) \ + SingletonId = \ + DBuilder.createBasicType(Name, Width, llvm::dwarf::DW_ATE_boolean); \ + return SingletonId; \ + } #include "clang/Basic/AMDGPUTypes.def" case BuiltinType::UChar: case BuiltinType::Char_U: diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 44931d0481e26..c996f6b781ac6 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -982,6 +982,10 @@ Value *ScalarExprEmitter::EmitConversionToBool(Value *Src, QualType SrcType) { if (const MemberPointerType *MPT = dyn_cast<MemberPointerType>(SrcType)) return CGF.CGM.getCXXABI().EmitMemberPointerIsNotNull(CGF, Src, MPT); + // The conversion is a NOP, and will be done when CodeGening the builtin. + if (SrcType == CGF.getContext().AMDGPUFeaturePredicateTy) + return Src; + assert((SrcType->isIntegerType() || isa<llvm::PointerType>(Src->getType())) && "Unknown scalar type to convert"); diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index c98503e4bbd26..53bb6e7b85142 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -580,6 +580,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { case BuiltinType::Id: \ return llvm::TargetExtType::get(getLLVMContext(), "amdgcn.named.barrier", \ {}, {Scope}); +#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align) \ + case BuiltinType::Id: \ + return ConvertType(getContext().getLogicalOperationType()); #include "clang/Basic/AMDGPUTypes.def" #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/HLSLIntangibleTypes.def" diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index f09b3b92c4ea0..10a992144b5a9 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -295,6 +295,18 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst, Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs)); } +static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) { + auto PTy = IntegerType::getInt1Ty(CGF.getLLVMContext()); + + auto *P = cast<GlobalVariable>( + CGF.CGM.getModule().getOrInsertGlobal(Name.str(), PTy)); + P->setConstant(true); + P->setExternallyInitialized(true); + + return CGF.Builder.CreateLoad( + RawAddress(P, PTy, CharUnits::One(), KnownNonNull)); +} + Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent; @@ -637,6 +649,23 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Value *Env = EmitScalarExpr(E->getArg(0)); return Builder.CreateCall(F, {Env}); } + case AMDGPU::BI__builtin_amdgcn_processor_is: { + assert(CGM.getTriple().isSPIRV() && + "__builtin_amdgcn_processor_is should never reach CodeGen for " + "concrete targets!"); + StringRef Proc = cast<StringLiteral>(E->getArg(0))->getString(); + return GetOrInsertAMDGPUPredicate(*this, 
"llvm.amdgcn.is." + Proc); + } + case AMDGPU::BI__builtin_amdgcn_is_invocable: { + assert(CGM.getTriple().isSPIRV() && + "__builtin_amdgcn_is_invocable should never reach CodeGen for " + "concrete targets!"); + auto *FD = cast( + cast(E->getArg(0))->getReferencedDeclOfCallee()); + StringRef RF = + getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID()); + return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.has." + RF); + } case AMDGPU::BI__builtin_amdgcn_read_exec: return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false); case AMDGPU::BI__builtin_amdgcn_read_exec_lo: diff --git a/clang/lib/Sema/JumpDiagnostics.cpp b/clang/lib/Sema/JumpDiagnostics.cpp index 36704c3826dfd..2c6ae89513241 100644 --- a/clang/lib/Sema/JumpDiagnostics.cpp +++ b/clang/lib/Sema/JumpDiagnostics.cpp @@ -19,6 +19,7 @@ #include "clang/AST/StmtOpenACC.h" #include "clang/AST/StmtOpenMP.h" #include "clang/Basic/SourceLocation.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaInternal.h" #include "llvm/ADT/BitVector.h" using namespace clang; @@ -367,8 +368,10 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S, case Stmt::IfStmtClass: { IfStmt *IS = cast(S); + bool AMDGPUPredicate = false; if (!(IS->isConstexpr() || IS->isConsteval() || - IS->isObjCAvailabilityCheck())) + IS->isObjCAvailabilityCheck() || + (AMDGPUPredicate = this->S.AMDGPU().IsPredicate(IS->getCond())))) break; unsigned Diag = diag::note_protected_by_if_available; @@ -376,6 +379,8 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S, Diag = diag::note_protected_by_constexpr_if; else if (IS->isConsteval()) Diag = diag::note_protected_by_consteval_if; + else if (AMDGPUPredicate) + Diag = diag::note_amdgcn_protected_by_predicate; if (VarDecl *Var = IS->getConditionVariable()) BuildScopeInformation(Var, ParentScope); diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 56608e990fd50..6ccd22126581d 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -571,8 +571,13 @@ void Sema::Initialize() { } if (Context.getTargetInfo().getTriple().isAMDGPU() || + (Context.getTargetInfo().getTriple().isSPIRV() && + Context.getTargetInfo().getTriple().getVendor() == llvm::Triple::AMD) || (Context.getAuxTargetInfo() && - Context.getAuxTargetInfo()->getTriple().isAMDGPU())) { + (Context.getAuxTargetInfo()->getTriple().isAMDGPU() || + (Context.getAuxTargetInfo()->getTriple().isSPIRV() && + Context.getAuxTargetInfo()->getTriple().getVendor() == + llvm::Triple::AMD)))) { #define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ addImplicitTypedef(Name, Context.SingletonId); #include "clang/Basic/AMDGPUTypes.def" diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index e6414a623b929..af4668e17ed68 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -11,9 +11,14 @@ //===----------------------------------------------------------------------===// #include "clang/Sema/SemaAMDGPU.h" +#include "clang/AST/Decl.h" +#include "clang/AST/DynamicRecursiveASTVisitor.h" +#include "clang/AST/Expr.h" #include "clang/Basic/DiagnosticSema.h" #include "clang/Basic/TargetBuiltins.h" +#include "clang/Basic/TargetInfo.h" #include "clang/Sema/Ownership.h" +#include "clang/Sema/Scope.h" #include "clang/Sema/Sema.h" #include "llvm/Support/AtomicOrdering.h" #include @@ -367,4 +372,189 @@ void SemaAMDGPU::handleAMDGPUMaxNumWorkGroupsAttr(Decl *D, addAMDGPUMaxNumWorkGroupsAttr(D, AL, AL.getArgAsExpr(0), YExpr, ZExpr); } +Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) { + 
ASTContext &Ctx = getASTContext(); + QualType BoolTy = Ctx.getLogicalOperationType(); + llvm::APInt False = llvm::APInt::getZero(Ctx.getIntWidth(BoolTy)); + llvm::APInt True = llvm::APInt::getAllOnes(Ctx.getIntWidth(BoolTy)); + SourceLocation Loc = CE->getExprLoc(); + + if (!CE->getBuiltinCallee()) + return *ExpandedPredicates + .insert(IntegerLiteral::Create(Ctx, False, BoolTy, Loc)) + .first; + + bool P = false; + unsigned BI = CE->getBuiltinCallee(); + if (Ctx.BuiltinInfo.isAuxBuiltinID(BI)) + BI = Ctx.BuiltinInfo.getAuxBuiltinID(BI); + + if (BI == AMDGPU::BI__builtin_amdgcn_processor_is) { + auto *GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts()); + if (!GFX) { + Diag(Loc, diag::err_amdgcn_processor_is_arg_not_literal); + return nullptr; + } + + StringRef N = GFX->getString(); + const TargetInfo &TI = Ctx.getTargetInfo(); + const TargetInfo *AuxTI = Ctx.getAuxTargetInfo(); + if (!TI.isValidCPUName(N) && (!AuxTI || !AuxTI->isValidCPUName(N))) { + Diag(Loc, diag::err_amdgcn_processor_is_arg_invalid_value) << N; + SmallVector<StringRef> ValidList; + if (TI.getTriple().getVendor() == llvm::Triple::VendorType::AMD) + TI.fillValidCPUList(ValidList); + else if (AuxTI) // Since the BI is present it must be an AMDGPU triple. + AuxTI->fillValidCPUList(ValidList); + if (!ValidList.empty()) + Diag(Loc, diag::note_amdgcn_processor_is_valid_options) + << llvm::join(ValidList, ", "); + return nullptr; + } + if (Ctx.getTargetInfo().getTriple().isSPIRV()) { + CE->setType(BoolTy); + return *ExpandedPredicates.insert(CE).first; + } + + if (auto TID = Ctx.getTargetInfo().getTargetID()) + P = TID->find(N) == 0; + } else { + Expr *Arg = CE->getArg(0); + if (!Arg || Arg->getType() != Ctx.BuiltinFnTy) { + Diag(Loc, diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg; + return nullptr; + } + + if (Ctx.getTargetInfo().getTriple().isSPIRV()) { + CE->setType(BoolTy); + return *ExpandedPredicates.insert(CE).first; + } + + auto *FD = cast<FunctionDecl>(Arg->getReferencedDeclOfCallee()); + + StringRef RF = Ctx.BuiltinInfo.getRequiredFeatures(FD->getBuiltinID()); + llvm::StringMap<bool> CF; + Ctx.getFunctionFeatureMap(CF, FD); + + P = Builtin::evaluateRequiredTargetFeatures(RF, CF); + } + + return *ExpandedPredicates + .insert( + IntegerLiteral::Create(Ctx, P ? True : False, BoolTy, Loc)) + .first; +} + +bool SemaAMDGPU::IsPredicate(Expr *E) const { + return ExpandedPredicates.contains(E); +} + +void SemaAMDGPU::AddPotentiallyUnguardedBuiltinUser(FunctionDecl *FD) { + PotentiallyUnguardedBuiltinUsers.insert(FD); +} + +bool SemaAMDGPU::HasPotentiallyUnguardedBuiltinUsage(FunctionDecl *FD) const { + return PotentiallyUnguardedBuiltinUsers.contains(FD); +} + +namespace { +/// This class implements -Wamdgpu-unguarded-builtin-usage. +/// +/// This is done with a traversal of the AST of a function that includes a +/// call to a target-specific builtin. Whenever we encounter an \c if of the +/// form: \c if(__builtin_amdgcn_is_invocable), we consider the then statement +/// guarded. 
+class DiagnoseUnguardedBuiltins : public DynamicRecursiveASTVisitor { + // TODO: this is conservative, and should be extended to: + // - warn on unguarded ASM usage (__builtin_amdgcn_processor_is as the + // guard); + // - build sets of builtins which are invocable from nested + // if (__builtin_amdgcn_is_invocable) calls, rather than assume + // sanity / that the existence of a guard implies its correctness; + // - derive the set of available builtins / valid ASM constraints from + // the target architecture passed to __builtin_amdgcn_processor_is; + // - consider attributes such as target. + Sema &SemaRef; + + unsigned Guards; + +public: + DiagnoseUnguardedBuiltins(Sema &SemaRef) : SemaRef(SemaRef), Guards(0u) {} + + bool TraverseLambdaExpr(LambdaExpr *LE) override { + if (SemaRef.AMDGPU().HasPotentiallyUnguardedBuiltinUsage( + LE->getCallOperator())) + return true; // We have already handled this. + return DynamicRecursiveASTVisitor::TraverseLambdaExpr(LE); + } + + bool TraverseStmt(Stmt *S) override { + if (!S) + return true; + return DynamicRecursiveASTVisitor::TraverseStmt(S); + } + + void IssueDiagnostics(Stmt *S) { TraverseStmt(S); } + + bool TraverseIfStmt(IfStmt *If) override; + + bool TraverseCaseStmt(CaseStmt *CS) override { + return TraverseStmt(CS->getSubStmt()); + } + + bool VisitCallExpr(CallExpr *CE) override; +}; + +inline Expr *FindPredicate(Expr *Cond) { + if (auto *CE = dyn_cast<CallExpr>(Cond)) { + if (CE->getBuiltinCallee() == AMDGPU::BI__builtin_amdgcn_is_invocable) + return Cond; + } else if (auto *UO = dyn_cast<UnaryOperator>(Cond)) { + return FindPredicate(UO->getSubExpr()); + } else if (auto *BO = dyn_cast<BinaryOperator>(Cond)) { + if ((Cond = FindPredicate(BO->getLHS()))) + return Cond; + return FindPredicate(BO->getRHS()); + } + return nullptr; +} + +bool DiagnoseUnguardedBuiltins::TraverseIfStmt(IfStmt *If) { + if (FindPredicate(If->getCond())) { + ++Guards; + bool Continue = TraverseStmt(If->getThen()); + --Guards; + + return Continue && TraverseStmt(If->getElse()); + } + + return DynamicRecursiveASTVisitor::TraverseIfStmt(If); +} + +bool DiagnoseUnguardedBuiltins::VisitCallExpr(CallExpr *CE) { + if (Guards) + return true; + + unsigned ID = CE->getBuiltinCallee(); + + if (!ID) + return true; + if (!SemaRef.getASTContext().BuiltinInfo.isTSBuiltin(ID)) + return true; + if (ID == AMDGPU::BI__builtin_amdgcn_processor_is || + ID == AMDGPU::BI__builtin_amdgcn_is_invocable) + return true; + + SemaRef.Diag(CE->getExprLoc(), diag::warn_amdgcn_unguarded_builtin) + << CE->getDirectCallee(); + SemaRef.Diag(CE->getExprLoc(), diag::note_amdgcn_unguarded_builtin_silence) + << CE->getDirectCallee(); + + return true; +} +} // Unnamed namespace + +void SemaAMDGPU::DiagnoseUnguardedBuiltinUsage(FunctionDecl *FD) { + DiagnoseUnguardedBuiltins(SemaRef).IssueDiagnostics(FD->getBody()); +} } // namespace clang diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index e27ed8fd4de14..f0ab9eb2fec36 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -23,6 +23,7 @@ #include "clang/Basic/TargetInfo.h" #include "clang/Lex/Preprocessor.h" #include "clang/Sema/Initialization.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaHLSL.h" #include "clang/Sema/SemaObjC.h" #include "clang/Sema/SemaRISCV.h" @@ -1595,6 +1596,14 @@ static TryCastResult TryStaticCast(Sema &Self, ExprResult &SrcExpr, return TC_Success; } + + if (SrcType == Self.Context.AMDGPUFeaturePredicateTy && + DestType == Self.Context.getLogicalOperationType()) { + SrcExpr = Self.AMDGPU().ExpandAMDGPUPredicateBI( 
+ dyn_cast<CallExpr>(SrcExpr.get())); + Kind = CK_NoOp; + return TC_Success; + } + // We tried everything. Everything! Nothing works! :-( return TC_NotApplicable; } diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 11cbda412667f..f586f09bce19a 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -45,6 +45,7 @@ #include "clang/Sema/ParsedTemplate.h" #include "clang/Sema/Scope.h" #include "clang/Sema/ScopeInfo.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaARM.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaHLSL.h" @@ -13636,6 +13637,16 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { return; } + + // __amdgpu_feature_predicate_t cannot be initialised + if (VDecl->getType().getDesugaredType(Context) == + Context.AMDGPUFeaturePredicateTy) { + Diag(VDecl->getLocation(), + diag::err_amdgcn_predicate_type_is_not_constructible) + << VDecl; + VDecl->setInvalidDecl(); + return; + } + // WebAssembly tables can't be used to initialise a variable. if (!Init->getType().isNull() && Init->getType()->isWebAssemblyTableType()) { Diag(Init->getExprLoc(), diag::err_wasm_table_art) << 0; @@ -14129,6 +14140,13 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) { if (VarDecl *Var = dyn_cast<VarDecl>(RealDecl)) { QualType Type = Var->getType(); + if (Type.getDesugaredType(Context) == Context.AMDGPUFeaturePredicateTy) { + Diag(Var->getLocation(), + diag::err_amdgcn_predicate_type_is_not_constructible) + << Var; + Var->setInvalidDecl(); + return; + } // C++1z [dcl.dcl]p1 grammar implies that an initializer is mandatory. if (isa<DecompositionDecl>(RealDecl)) { Diag(Var->getLocation(), diag::err_decomp_decl_requires_init) << Var; @@ -16585,8 +16603,12 @@ Decl *Sema::ActOnFinishFunctionBody(Decl *dcl, Stmt *Body, return nullptr; } - if (Body && FSI->HasPotentialAvailabilityViolations) - DiagnoseUnguardedAvailabilityViolations(dcl); + if (Body) { + if (FSI->HasPotentialAvailabilityViolations) + DiagnoseUnguardedAvailabilityViolations(dcl); + else if (AMDGPU().HasPotentiallyUnguardedBuiltinUsage(FD)) + AMDGPU().DiagnoseUnguardedBuiltinUsage(FD); + } assert(!FSI->ObjCShouldCallSuper && "This should only be set for ObjC methods, which should have been " diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 728ada33e2e63..0eb95826342ff 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -51,6 +51,7 @@ #include "clang/Sema/ParsedTemplate.h" #include "clang/Sema/Scope.h" #include "clang/Sema/ScopeInfo.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaARM.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaFixItUtils.h" @@ -6570,6 +6571,22 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, if (Result.isInvalid()) return ExprError(); Fn = Result.get(); + // The __builtin_amdgcn_is_invocable builtin is special, and will be resolved + // later, when we check boolean conditions; for now we merely forward it + // without any additional checking. 
+ if (Fn->getType() == Context.BuiltinFnTy && ArgExprs.size() == 1 && + ArgExprs[0]->getType() == Context.BuiltinFnTy) { + const auto *FD = cast<FunctionDecl>(Fn->getReferencedDeclOfCallee()); + + if (FD->getName() == "__builtin_amdgcn_is_invocable") { + QualType FnPtrTy = Context.getPointerType(FD->getType()); + Expr *R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get(); + return CallExpr::Create( + Context, R, ArgExprs, Context.AMDGPUFeaturePredicateTy, + ExprValueKind::VK_PRValue, RParenLoc, FPOptionsOverride()); + } + } + if (CheckArgsForPlaceholders(ArgExprs)) return ExprError(); @@ -6688,6 +6705,13 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, FunctionDecl *FDecl = dyn_cast<FunctionDecl>(NDecl); if (FDecl && FDecl->getBuiltinID()) { + if (Context.BuiltinInfo.isTSBuiltin(FDecl->getBuiltinID())) { + const llvm::Triple &Triple = Context.getTargetInfo().getTriple(); + if (Triple.isSPIRV() && Triple.getVendor() == llvm::Triple::AMD) + AMDGPU().AddPotentiallyUnguardedBuiltinUser(cast<FunctionDecl>( + getFunctionLevelDeclContext(/*AllowLambda=*/true))); + } + // Rewrite the function decl for this builtin by replacing parameters // with no explicit address space with the address space of the arguments // in ArgExprs. @@ -15817,6 +15841,10 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc, // Vector logical not returns the signed variant of the operand type. resultType = GetSignedVectorType(resultType); break; + } else if (resultType == Context.AMDGPUFeaturePredicateTy) { + resultType = Context.getLogicalOperationType(); + Input = AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast<CallExpr>(InputExpr)); + break; } else { return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr) << resultType << Input.get()->getSourceRange()); @@ -20622,6 +20650,9 @@ ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E, E = result.get(); if (!E->isTypeDependent()) { + if (E->getType() == Context.AMDGPUFeaturePredicateTy) + return AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast_or_null<CallExpr>(E)); + if (getLangOpts().CPlusPlus) return CheckCXXBooleanCondition(E, IsConstexpr); // C++ 6.4p4 diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 95746b35f71ef..754f97e5bc6a5 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -9114,6 +9114,15 @@ bool InitializationSequence::Diagnose(Sema &S, case FK_ConversionFailed: { QualType FromType = OnlyArg->getType(); + // __amdgpu_feature_predicate_t can be explicitly cast to the logical op + // type, although this is almost always an error and we advise against it. 
+ if (FromType == S.Context.AMDGPUFeaturePredicateTy && + DestType == S.Context.getLogicalOperationType()) { + S.Diag(OnlyArg->getExprLoc(), + diag::err_amdgcn_predicate_type_needs_explicit_bool_cast) + << OnlyArg << DestType; + break; + } PartialDiagnostic PDiag = S.PDiag(diag::err_init_conversion_failed) << (int)Entity.getKind() << DestType @@ -9917,6 +9926,14 @@ Sema::PerformCopyInitialization(const InitializedEntity &Entity, if (EqualLoc.isInvalid()) EqualLoc = InitE->getBeginLoc(); + if (Entity.getType().getDesugaredType(Context) == + Context.AMDGPUFeaturePredicateTy && + Entity.getDecl()) { + Diag(EqualLoc, diag::err_amdgcn_predicate_type_is_not_constructible) + << Entity.getDecl(); + return ExprError(); + } + InitializationKind Kind = InitializationKind::CreateCopy( InitE->getBeginLoc(), EqualLoc, AllowExplicit); InitializationSequence Seq(*this, Entity, Kind, InitE, TopLevelOfInitList); diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 1b54628c5e564..ba7c7e1bc2dbf 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -30,6 +30,7 @@ #include "clang/Sema/Initialization.h" #include "clang/Sema/Lookup.h" #include "clang/Sema/Overload.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaARM.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaObjC.h" @@ -6177,12 +6178,13 @@ TryContextuallyConvertToBool(Sema &S, Expr *From) { ExprResult Sema::PerformContextuallyConvertToBool(Expr *From) { if (checkPlaceholderForOverload(*this, From)) return ExprError(); + if (From->getType() == Context.AMDGPUFeaturePredicateTy) + return AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast<CallExpr>(From)); ImplicitConversionSequence ICS = TryContextuallyConvertToBool(*this, From); if (!ICS.isBad()) return PerformImplicitConversion(From, Context.BoolTy, ICS, AssignmentAction::Converting); - if (!DiagnoseMultipleUserDefinedConversion(From, Context.BoolTy)) return Diag(From->getBeginLoc(), diag::err_typecheck_bool_condition) << From->getType() << From->getSourceRange(); @@ -12041,6 +12043,16 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand, if (TakingCandidateAddress && !checkAddressOfCandidateIsAvailable(S, Fn)) return; + // __amdgpu_feature_predicate_t can be explicitly cast to the logical op type, + // although this is almost always an error and we advise against it. + if (FromTy == S.Context.AMDGPUFeaturePredicateTy && + ToTy == S.Context.getLogicalOperationType()) { + S.Diag(Conv.Bad.FromExpr->getExprLoc(), + diag::err_amdgcn_predicate_type_needs_explicit_bool_cast) + << Conv.Bad.FromExpr << ToTy; + return; + } + // Emit the generic diagnostic and, optionally, add the hints to it. 
PartialDiagnostic FDiag = S.PDiag(diag::note_ovl_candidate_bad_conv); FDiag << (unsigned)FnKindPair.first << (unsigned)FnKindPair.second << FnDesc diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c new file mode 100644 index 0000000000000..12f283707308e --- /dev/null +++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c @@ -0,0 +1,64 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5 +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX900 %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1010 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX1010 %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCNSPIRV %s + +// Test that, depending on triple and, if applicable, target-cpu, one of three +// things happens: +// 1) for gfx900 we emit an empty kernel (concrete target, lacks feature) +// 2) for gfx1010 we emit a call to trap (concrete target, has feature) +// 3) for AMDGCNSPIRV we emit llvm.amdgcn.has.gfx10-insts as a constant +// externally initialised bool global, and load from it to provide the +// condition to a br (abstract target) + +//. +// AMDGCNSPIRV: @llvm.amdgcn.has.gfx10-insts = external addrspace(1) externally_initialized constant i1 +//. +// AMDGCN-GFX900-LABEL: define dso_local void @foo( +// AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX900-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX900-NEXT: ret void +// +// AMDGCN-GFX1010-LABEL: define dso_local void @foo( +// AMDGCN-GFX1010-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX1010-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX1010-NEXT: call void @llvm.trap() +// AMDGCN-GFX1010-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: define spir_func void @foo( +// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1 +// AMDGCNSPIRV-NEXT: [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// AMDGCNSPIRV: [[IF_THEN]]: +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.trap() +// AMDGCNSPIRV-NEXT: br label %[[IF_END]] +// AMDGCNSPIRV: [[IF_END]]: +// AMDGCNSPIRV-NEXT: ret void +// +void foo() { + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16)) + return __builtin_trap(); +} +//. +// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +//. +// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" } +// AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. 
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" } +// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. +// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX900: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// AMDGCN-GFX1010: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX1010: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX1010: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCNSPIRV: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCNSPIRV: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. diff --git a/clang/test/CodeGen/amdgpu-builtin-processor-is.c b/clang/test/CodeGen/amdgpu-builtin-processor-is.c new file mode 100644 index 0000000000000..76dead8ebbe89 --- /dev/null +++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c @@ -0,0 +1,62 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5 +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX900 %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1010 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX1010 %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCNSPIRV %s + +// Test that, depending on triple and, if applicable, target-cpu, one of three +// things happens: +// 1) for gfx900 we emit a call to trap (concrete target, matches) +// 2) for gfx1010 we emit an empty kernel (concrete target, does not match) +// 3) for AMDGCNSPIRV we emit llvm.amdgcn.is.gfx900 as a bool global, and +// load from it to provide the condition to a br (abstract target) +//. +// AMDGCNSPIRV: @llvm.amdgcn.is.gfx900 = external addrspace(1) externally_initialized constant i1 +//. 
+// AMDGCN-GFX900-LABEL: define dso_local void @foo( +// AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX900-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX900-NEXT: call void @llvm.trap() +// AMDGCN-GFX900-NEXT: ret void +// +// AMDGCN-GFX1010-LABEL: define dso_local void @foo( +// AMDGCN-GFX1010-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX1010-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX1010-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: define spir_func void @foo( +// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx900, align 1 +// AMDGCNSPIRV-NEXT: [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// AMDGCNSPIRV: [[IF_THEN]]: +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.trap() +// AMDGCNSPIRV-NEXT: br label %[[IF_END]] +// AMDGCNSPIRV: [[IF_END]]: +// AMDGCNSPIRV-NEXT: ret void +// +void foo() { + if (__builtin_amdgcn_processor_is("gfx900")) + return __builtin_trap(); +} +//. +// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +// AMDGCN-GFX900: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. +// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" } +//. +// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" } +// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. +// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX900: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// AMDGCN-GFX1010: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX1010: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX1010: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. 
+// AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCNSPIRV: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCNSPIRV: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. diff --git a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp new file mode 100644 index 0000000000000..78f18d3a37b46 --- /dev/null +++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp @@ -0,0 +1,48 @@ +// RUN: not %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - 2>&1 | FileCheck %s +// RUN: not %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - 2>&1 | FileCheck %s + +bool predicate(bool x); +void pass_by_value(__amdgpu_feature_predicate_t x); + +void invalid_uses(int *p, int x, const __amdgpu_feature_predicate_t &lv, + __amdgpu_feature_predicate_t &&rv) { + // CHECK: error: 'a' has type __amdgpu_feature_predicate_t, which is not constructible + __amdgpu_feature_predicate_t a; + // CHECK: error: 'b' has type __amdgpu_feature_predicate_t, which is not constructible + __amdgpu_feature_predicate_t b = __builtin_amdgcn_processor_is("gfx906"); + // CHECK: error: 'c' has type __amdgpu_feature_predicate_t, which is not constructible + __amdgpu_feature_predicate_t c = lv; + // CHECK: error: 'd' has type __amdgpu_feature_predicate_t, which is not constructible + __amdgpu_feature_predicate_t d = rv; + // CHECK: error: '__builtin_amdgcn_processor_is("gfx906")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided + bool invalid_use_in_init_0 = __builtin_amdgcn_processor_is("gfx906"); + // CHECK: error: 'x' has type __amdgpu_feature_predicate_t, which is not constructible + pass_by_value(__builtin_amdgcn_processor_is("gfx906")); + // CHECK: error: '__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided + bool invalid_use_in_init_1 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); + // CHECK: error: '__builtin_amdgcn_processor_is("gfx906")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided + if (bool invalid_use_in_init_2 = __builtin_amdgcn_processor_is("gfx906")) return; + // CHECK: error: '__builtin_amdgcn_processor_is("gfx1200")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided + if (predicate(__builtin_amdgcn_processor_is("gfx1200"))) __builtin_amdgcn_s_sleep_var(x); +} + +void invalid_invocations(int x, const char* str) { + // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a valid AMDGCN processor identifier; 'not_an_amdgcn_gfx_id' is not valid + // CHECK-DAG: note: valid AMDGCN processor identifiers are: {{.*}}gfx{{.*}} + if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return; + // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a string literal + if (__builtin_amdgcn_processor_is(str)) return; + // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin 
or an AMDGCN target specific builtin; {{.*}}__builtin_amdgcn_s_sleep_var{{.*}} is not valid + if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return; + // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}str{{.*}} is not valid + else if (__builtin_amdgcn_is_invocable(str)) return; + // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}x{{.*}} is not valid + else if (__builtin_amdgcn_is_invocable(x)) return; + // CHECK: error: use of undeclared identifier '__builtin_ia32_pause' + else if (__builtin_amdgcn_is_invocable(__builtin_ia32_pause)) return; +} + +bool return_needs_cast() { + // CHECK: error: '__builtin_amdgcn_processor_is("gfx900")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided + return __builtin_amdgcn_processor_is("gfx900"); +} diff --git a/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp b/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp index 44557284fc581..cffd3c7a5fb1f 100644 --- a/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp +++ b/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp @@ -7,7 +7,7 @@ // HIPSTDPAR-PRE: Running pass: EntryExitInstrumenterPass // HIPSTDPAR-PRE-NEXT: Running pass: EntryExitInstrumenterPass // HIPSTDPAR-PRE-NOT: Running pass: HipStdParAcceleratorCodeSelectionPass -// HIPSTDPAR-PRE-NEXT: Running pass: AlwaysInlinerPass +// HIPSTDPAR-PRE-NEXT: Running pass: AMDGPUExpandFeaturePredicatesPass // Ensure Pass HipStdParAcceleratorCodeSelectionPass is invoked in PostLink. 
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -mllvm -amdgpu-enable-hipstdpar -fcuda-is-device -fdebug-pass-manager -emit-llvm \ diff --git a/clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip b/clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip new file mode 100644 index 0000000000000..a7f1abcdcd8fe --- /dev/null +++ b/clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip @@ -0,0 +1,62 @@ +// REQUIRES: amdgpu-registered-target +// REQUIRES: spirv-registered-target +// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -target-cpu gfx900 -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -target-cpu gfx1201 -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s + +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) + +__device__ void f(int *ptr, int size, bool f) { + int i = 0; + if (f) + goto label; // expected-error {{cannot jump from this goto statement to its label}} + + if (__builtin_amdgcn_processor_is("gfx900")) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}} + for (i = 0; i < size; ++i) { + label: + ptr[i] = i; + } + } +} + +__device__ void g(int *ptr, int size, bool f) { + int i = 0; + if (f) + goto label; // expected-error {{cannot jump from this goto statement to its label}} + + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}} + for (i = 0; i < size; ++i) { + label: + ptr[i] = i; + } + } +} + +__global__ void h(int *ptr, int size, bool f) { + int i = 0; + if (f) + goto label; // expected-error {{cannot jump from this goto statement to its label}} + + if (__builtin_amdgcn_processor_is("gfx900")) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}} + for (i = 0; i < size; ++i) { + label: + ptr[i] = i; + } + } +} + +__global__ void i(int *ptr, int size, bool f) { + int i = 0; + if (f) + goto label; // expected-error {{cannot jump from this goto statement to its label}} + + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}} + for (i = 0; i < size; ++i) { + label: + ptr[i] = i; + } + } +} diff --git a/clang/test/SemaHIP/amdgpu-feature-builtins-return-type-deduction.hip b/clang/test/SemaHIP/amdgpu-feature-builtins-return-type-deduction.hip new file mode 100644 index 0000000000000..27bbb3f2f3d07 --- /dev/null +++ b/clang/test/SemaHIP/amdgpu-feature-builtins-return-type-deduction.hip @@ -0,0 +1,31 @@ +// REQUIRES: amdgpu-registered-target +// REQUIRES: spirv-registered-target +// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx900 -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace +// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1201 -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace +// RUN: %clang_cc1 -triple x86_64 -aux-triple amdgcn -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace +// RUN: %clang_cc1 -triple x86_64 -aux-triple spirv64-amd-amdhsa -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace + 
+__attribute__((device)) auto foo() { + return __builtin_amdgcn_processor_is("gfx900"); +} + +__attribute__((device)) decltype(auto) bar() { + return __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep); +} + +// CHECK: |-TypedefDecl {{.*}} implicit __amdgpu_feature_predicate_t '__amdgpu_feature_predicate_t' +// CHECK-NEXT: | `-BuiltinType {{.*}} '__amdgpu_feature_predicate_t' +// CHECK-DAG: |-FunctionDecl {{.*}} foo '__amdgpu_feature_predicate_t ()' +// CHECK-NEXT: |-CompoundStmt {{.*}} +// CHECK-NEXT: | `-ReturnStmt {{.*}} +// CHECK-NEXT: | `-CallExpr {{.*}} '__amdgpu_feature_predicate_t' +// CHECK-NEXT: | |-ImplicitCastExpr {{.*}} '__amdgpu_feature_predicate_t (*)(const char *) noexcept' +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} Function {{.*}} '__builtin_amdgcn_processor_is' '__amdgpu_feature_predicate_t (const char *) noexcept' +// CHECK-NEXT: | `-StringLiteral {{.*}} "gfx900" +// CHECK-DAG: |-FunctionDecl {{.*}} bar '__amdgpu_feature_predicate_t ()' +// CHECK-NEXT: |-CompoundStmt {{.*}} +// CHECK-NEXT: | `-ReturnStmt {{.*}} +// CHECK-NEXT: | `-CallExpr {{.*}} '__amdgpu_feature_predicate_t' +// CHECK-NEXT: | |-ImplicitCastExpr {{.*}} '__amdgpu_feature_predicate_t (*)() noexcept' +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} Function {{.*}} '__builtin_amdgcn_is_invocable' '__amdgpu_feature_predicate_t () noexcept' diff --git a/clang/test/SemaHIP/amdgpu-is-invocable-guards-builtin-use.hip b/clang/test/SemaHIP/amdgpu-is-invocable-guards-builtin-use.hip new file mode 100644 index 0000000000000..26544590f7536 --- /dev/null +++ b/clang/test/SemaHIP/amdgpu-is-invocable-guards-builtin-use.hip @@ -0,0 +1,47 @@ +// REQUIRES: amdgpu-registered-target +// REQUIRES: spirv-registered-target +// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wamdgpu-unguarded-builtin-usage %s + +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) + +__device__ void g(); + +__device__ void f(int x, bool b) { + const auto lambda = [=] __device__ () { + __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}} + // expected-note@-1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}} + + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) + __builtin_amdgcn_s_sleep_var(x); + }; + + const auto generic_lambda = [] __device__ (auto&& y) { + __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}} + // expected-note@-1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}} + + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) + __builtin_amdgcn_s_sleep_var(y); + }; + + __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}} + // expected-note@-1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}} + + // processor_is does not (yet) guard + if (__builtin_amdgcn_processor_is("gfx900")) + __builtin_amdgcn_s_sleep_var(x); // expected-warning {{'__builtin_amdgcn_s_sleep_var' might be unavailable on some AMDGPU targets}} + // expected-note@-1 {{enclose '__builtin_amdgcn_s_sleep_var' in a __builtin_amdgcn_is_invocable check to silence this warning}} + + // Direct guard + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep)) + __builtin_amdgcn_s_sleep(42); + + // Guarded scope + if 
(__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) {
+    if (b) {
+      g();
+      while (--x > 42)
+        __builtin_amdgcn_s_sleep_var(x);
+    }
+  }
+}
diff --git a/clang/test/SemaHIP/amdgpu-is-invocable.hip b/clang/test/SemaHIP/amdgpu-is-invocable.hip
new file mode 100644
index 0000000000000..214d7769a595f
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-is-invocable.hip
@@ -0,0 +1,21 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s
+
+// expected-no-diagnostics
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ void foo() {
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16))
+    return __builtin_trap();
+}
+
+__global__ void bar() {
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16))
+    return __builtin_trap();
+}
diff --git a/clang/test/SemaHIP/amdgpu-processor-is.hip b/clang/test/SemaHIP/amdgpu-processor-is.hip
new file mode 100644
index 0000000000000..0f7211fd75d90
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-processor-is.hip
@@ -0,0 +1,21 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s
+
+// expected-no-diagnostics
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ void foo() {
+  if (__builtin_amdgcn_processor_is("gfx900"))
+    return __builtin_trap();
+}
+
+__global__ void bar() {
+  if (__builtin_amdgcn_processor_is("gfx900"))
+    return __builtin_trap();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 23f106a9c1d4d..46563bc2fb495 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -431,6 +431,15 @@ extern char &AMDGPUPrintfRuntimeBindingID;
 void initializeAMDGPUResourceUsageAnalysisWrapperPassPass(PassRegistry &);
 extern char &AMDGPUResourceUsageAnalysisID;
 
+struct AMDGPUExpandFeaturePredicatesPass
+    : PassInfoMixin<AMDGPUExpandFeaturePredicatesPass> {
+  const AMDGPUTargetMachine &TM;
+  AMDGPUExpandFeaturePredicatesPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+  static bool isRequired() { return true; }
+};
+
 struct AMDGPUPrintfRuntimeBindingPass
     : PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
new file mode 100644
index 0000000000000..fc0d3e378044d
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -0,0 +1,194 @@
+//===- AMDGPUExpandFeaturePredicates.cpp - Feature Predicate Expander Pass ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file implements a pass that expands AMDGCN generic feature predicates
+// into target specific quantities / sequences. In this context, a generic
+// feature predicate is an implementation detail global variable that is
+// inserted by the FE as a consequence of using either the
+// __builtin_amdgcn_processor_is or the __builtin_amdgcn_is_invocable special
+// builtins on an abstract target (AMDGCNSPIRV). These placeholder globals are
+// used to guide target specific lowering, once the concrete target is known,
+// by way of constant folding their value all the way into a terminator (i.e. a
+// controlled block) or into a no-live-use scenario. We hard fail if the
+// folding fails, to avoid obtuse BE errors or opaque run time errors. This
+// pass should run as early as possible / immediately after Clang CodeGen, so
+// that the optimisation pipeline and the BE operate with concrete target data.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#include <cassert>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+
+namespace {
+template <typename C> void collectUsers(Value *V, C &Container) {
+  assert(V && "Must pass an existing Value!");
+
+  for (auto &&U : V->users())
+    if (auto *I = dyn_cast<Instruction>(U))
+      Container.insert(Container.end(), I);
+}
+
+// Resolve a predicate global against the concrete subtarget: "llvm.amdgcn.is"
+// globals are compared against the processor name, whilst "llvm.amdgcn.has"
+// globals are rewritten into a '+'-prefixed feature string and checked as
+// subtarget features.
+inline void setPredicate(const GCNSubtarget &ST, GlobalVariable *P) {
+  const bool IsFeature = P->getName().starts_with("llvm.amdgcn.has");
+  // sizeof counts the terminating NUL, so this also skips the '.' separator.
+  const size_t Offset =
+      IsFeature ? sizeof("llvm.amdgcn.has") : sizeof("llvm.amdgcn.is");
+
+  std::string PV = P->getName().substr(Offset).str();
+  if (IsFeature) {
+    size_t Dx = PV.find(',');
+    while (Dx != std::string::npos) {
+      PV.insert(++Dx, {'+'});
+
+      Dx = PV.find(',', Dx);
+    }
+    PV.insert(PV.cbegin(), '+');
+  }
+
+  Type *PTy = P->getValueType();
+  P->setLinkage(GlobalValue::PrivateLinkage);
+  P->setExternallyInitialized(false);
+
+  if (IsFeature)
+    P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures(PV)));
+  else
+    P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU()));
+}
+
+std::pair<PreservedAnalyses, bool>
+unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) {
+  std::string W;
+  raw_string_ostream OS(W);
+
+  OS << "Impossible to constant fold feature predicate: " << *P << " used by "
+     << *NoFold << ", please simplify.\n";
+
+  Caller->getContext().diagnose(
+      DiagnosticInfoUnsupported(*Caller, W, NoFold->getDebugLoc(), DS_Error));
+
+  return {PreservedAnalyses::none(), false};
+}
+
+std::pair<PreservedAnalyses, bool>
+handlePredicate(const GCNSubtarget &ST, FunctionAnalysisManager &FAM,
+                SmallPtrSet<Function *, 32> &Predicated, GlobalVariable *P) {
+  setPredicate(ST, P);
+
+  SmallPtrSet<Instruction *, 32> ToFold;
+  collectUsers(P, ToFold);
+
+  if (ToFold.empty())
+    return {PreservedAnalyses::all(), true};
+
+  do {
+    Instruction *I = *ToFold.begin();
+    ToFold.erase(I);
+
+    I->dropDroppableUses();
+
+    Function *F = I->getParent()->getParent();
+    auto &DT = FAM.getResult<DominatorTreeAnalysis>(*F);
+    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+
+    if (auto *C = ConstantFoldInstruction(I, P->getDataLayout())) {
+      collectUsers(I, ToFold);
+      I->replaceAllUsesWith(C);
+      I->eraseFromParent();
+      continue;
+    } else if (I->isTerminator() &&
+               ConstantFoldTerminator(I->getParent(), true, nullptr, &DTU)) {
+      Predicated.insert(F);
+
+      continue;
+    }
+
+    return unfoldableFound(I->getParent()->getParent(), P, I);
+  } while (!ToFold.empty());
+
+  return {PreservedAnalyses::none(), true};
+}
+} // Unnamed namespace.
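+
+// An illustrative sketch of the expansion (not part of the pass), using the
+// processor predicate as it appears in the CodeGen tests below: given
+//
+//   @llvm.amdgcn.is.gfx906 = external addrspace(1) externally_initialized constant i1
+//   ...
+//   %0 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx906
+//   br i1 %0, label %if.gfx906, label %if.end
+//
+// compiling for gfx906 turns the global into a private constant initialised
+// to true, the load folds to true, and ConstantFoldTerminator turns the
+// conditional branch into an unconditional one to %if.gfx906; for any other
+// processor the branch goes to %if.end instead, and %if.gfx906 becomes
+// unreachable.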
+
+// Collect the internal / private functions that currently have undroppable
+// uses; after predicate expansion some of them might lose all such uses and
+// become removable.
+static inline SmallVector<Function *> collectUsedFunctions(Module &M) {
+  SmallVector<Function *> Ret;
+  for (auto &&F : M) {
+    if (F.isIntrinsic() || F.isDeclaration())
+      continue;
+    if (!F.hasInternalLinkage() && !F.hasPrivateLinkage())
+      continue;
+    if (F.hasNUndroppableUses(0))
+      continue;
+    Ret.push_back(&F);
+  }
+
+  return Ret;
+}
+
+// Erase the (now constant) predicate globals, clean up blocks made
+// unreachable by folded terminators, and delete functions whose last
+// undroppable use went away with them.
+template <typename Container0, typename Container1, typename Container2>
+static inline void removeUnreachable(const Container0 &Predicates,
+                                     const Container1 &PredicatedFns,
+                                     const Container2 &UnreachableFns) {
+  for_each(Predicates, [](auto &&P) { P->eraseFromParent(); });
+  for_each(PredicatedFns, [](auto &&F) { removeUnreachableBlocks(*F); });
+  for_each(UnreachableFns, [](auto &&F) {
+    if (!F->hasNUndroppableUses(0))
+      return;
+    F->dropDroppableUses();
+    F->eraseFromParent();
+  });
+}
+
+PreservedAnalyses
+AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) {
+  if (M.empty())
+    return PreservedAnalyses::all();
+
+  SmallVector<GlobalVariable *> Predicates;
+  for (auto &&G : M.globals()) {
+    if (!G.isDeclaration() || !G.hasName())
+      continue;
+    if (G.getName().starts_with("llvm.amdgcn."))
+      Predicates.push_back(&G);
+  }
+
+  if (Predicates.empty())
+    return PreservedAnalyses::all();
+
+  const auto &ST = TM.getSubtarget<GCNSubtarget>(
+      *find_if(M, [](auto &&F) { return !F.isIntrinsic(); }));
+
+  auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  SmallPtrSet<Function *, 32> Predicated;
+  SmallVector<Function *> MaybeUnreachable = collectUsedFunctions(M);
+  auto Ret = PreservedAnalyses::all();
+  for (auto &&P : Predicates) {
+    auto R = handlePredicate(ST, FAM, Predicated, P);
+
+    if (!R.second)
+      break;
+
+    Ret.intersect(R.first);
+  }
+
+  removeUnreachable(Predicates, Predicated, MaybeUnreachable);
+
+  return Ret;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 5d298304c27f5..9e962b35308de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -16,6 +16,8 @@
 #ifndef MODULE_PASS
 #define MODULE_PASS(NAME, CREATE_PASS)
 #endif
+MODULE_PASS("amdgpu-expand-feature-predicates",
+            AMDGPUExpandFeaturePredicatesPass(*this))
 MODULE_PASS("amdgpu-always-inline", AMDGPUAlwaysInlinePass())
 MODULE_PASS("amdgpu-export-kernel-runtime-handles",
             AMDGPUExportKernelRuntimeHandlesPass())
 MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f4dc4a483181c..f92f1518ae25e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -811,8 +811,18 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 #define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
 #include "llvm/Passes/TargetPassRegistry.inc"
 
+  PB.registerPipelineStartEPCallback(
+      [this](ModulePassManager &PM, OptimizationLevel Level) {
+        PM.addPass(AMDGPUExpandFeaturePredicatesPass(*this));
+      });
+
+  PB.registerFullLinkTimeOptimizationEarlyEPCallback(
+      [this](ModulePassManager &PM, OptimizationLevel) {
+        PM.addPass(AMDGPUExpandFeaturePredicatesPass(*this));
+      });
+
   PB.registerScalarOptimizerLateEPCallback(
-      [](FunctionPassManager &FPM, OptimizationLevel Level) {
+      [this](FunctionPassManager &FPM, OptimizationLevel Level) {
         if (Level == OptimizationLevel::O0)
           return;
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index e3519f192137c..26b6e4f18ad30 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -53,6 +53,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUCodeGenPrepare.cpp AMDGPUCombinerHelper.cpp AMDGPUCtorDtorLowering.cpp + AMDGPUExpandFeaturePredicates.cpp AMDGPUExportClustering.cpp AMDGPUExportKernelRuntimeHandles.cpp AMDGPUFrameLowering.cpp diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-remove-unreachable-functions.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-remove-unreachable-functions.ll new file mode 100644 index 0000000000000..c5089de333849 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-remove-unreachable-functions.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --version 5 +; REQUIRES: amdgpu-registered-target + +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201 %s + +@llvm.amdgcn.is.gfx906 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.is.gfx1201 = external addrspace(1) externally_initialized constant i1 + +define external void @extern_linkage() { +; GFX906-LABEL: define void @extern_linkage( +; GFX906-SAME: ) #[[ATTR0:[0-9]+]] { +; GFX906-NEXT: [[ENTRY:.*:]] +; GFX906-NEXT: ret void +; +; GFX1201-LABEL: define void @extern_linkage( +; GFX1201-SAME: ) #[[ATTR0:[0-9]+]] { +; GFX1201-NEXT: [[ENTRY:.*:]] +; GFX1201-NEXT: ret void +; +entry: + ret void +} + +define private void @non_predicated_uses() { +; GFX906-LABEL: define private void @non_predicated_uses( +; GFX906-SAME: ) #[[ATTR0]] { +; GFX906-NEXT: [[ENTRY:.*:]] +; GFX906-NEXT: ret void +; +; GFX1201-LABEL: define private void @non_predicated_uses( +; GFX1201-SAME: ) #[[ATTR0]] { +; GFX1201-NEXT: [[ENTRY:.*:]] +; GFX1201-NEXT: ret void +; +entry: + ret void +} + +define internal void @remove_on_906() { +; GFX1201-LABEL: define internal void @remove_on_906( +; GFX1201-SAME: ) #[[ATTR0]] { +; GFX1201-NEXT: [[ENTRY:.*:]] +; GFX1201-NEXT: ret void +; +entry: + ret void +} + +define internal void @remove_on_1201() { +; GFX906-LABEL: define internal void @remove_on_1201( +; GFX906-SAME: ) #[[ATTR0]] { +; GFX906-NEXT: [[ENTRY:.*:]] +; GFX906-NEXT: ret void +; +entry: + ret void +} + +define void @foo() { +; GFX906-LABEL: define void @foo( +; GFX906-SAME: ) #[[ATTR0]] { +; GFX906-NEXT: [[ENTRY:.*:]] +; GFX906-NEXT: call void @non_predicated_uses() +; GFX906-NEXT: br label %[[NOT_GFX1201:.*]] +; GFX906: [[NOT_GFX1201]]: +; GFX906-NEXT: br label %[[GFX906:.*]] +; GFX906: [[GFX906]]: +; GFX906-NEXT: call void @remove_on_1201() +; GFX906-NEXT: br label %[[END:.*]] +; GFX906: [[END]]: +; GFX906-NEXT: ret void +; +; GFX1201-LABEL: define void @foo( +; GFX1201-SAME: ) #[[ATTR0]] { +; GFX1201-NEXT: [[ENTRY:.*:]] +; GFX1201-NEXT: call void @non_predicated_uses() +; GFX1201-NEXT: br label %[[GFX1201:.*]] +; GFX1201: [[GFX1201]]: +; GFX1201-NEXT: call void @remove_on_906() +; GFX1201-NEXT: br label %[[END:.*]] +; GFX1201: [[END]]: +; GFX1201-NEXT: ret void +; +entry: + call void @non_predicated_uses() + %0 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1201, align 1 + br i1 %0, label %gfx1201, label %not.gfx1201 + +gfx1201: + call void @remove_on_906() + br label %end + +not.gfx1201: + %1 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx906, align 1 + br i1 %1, label %gfx906, label %end + +gfx906: + call void @remove_on_1201() + br label %end + +end: + ret void +} diff --git 
a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll new file mode 100644 index 0000000000000..bfc35d8c76e37 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll @@ -0,0 +1,28 @@ +; REQUIRES: amdgpu-registered-target + +; RUN: not opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' < %s 2>&1 | FileCheck %s + +; CHECK: error:{{.*}}in function kernel void (ptr addrspace(1), i32, ptr addrspace(1)): Impossible to constant fold feature predicate: @llvm.amdgcn.is.gfx803 = private addrspace(1) constant i1 false used by %call = call i1 %1(i1 zeroext false), please simplify. + +@llvm.amdgcn.is.gfx803 = external addrspace(1) externally_initialized constant i1 + +declare void @llvm.amdgcn.s.sleep(i32 immarg) #1 + +define amdgpu_kernel void @kernel(ptr addrspace(1) readnone captures(none) %p.coerce, i32 %x, ptr addrspace(1) %pfn.coerce) { +entry: + %0 = ptrtoint ptr addrspace(1) %pfn.coerce to i64 + %1 = inttoptr i64 %0 to ptr + %2 = ptrtoint ptr addrspace(1) %pfn.coerce to i64 + %3 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx803, align 1 + %call = call i1 %1(i1 zeroext %3) + br i1 %call, label %if.gfx803, label %if.end + +if.gfx803: + call void @llvm.amdgcn.s.sleep(i32 0) + br label %if.end + +if.end: + ret void +} + +attributes #1 = { nocallback nofree nosync nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll new file mode 100644 index 0000000000000..a16a7fc31da22 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll @@ -0,0 +1,284 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; REQUIRES: amdgpu-registered-target + +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1010 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1101 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1101 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -mattr=+wavefrontsize64 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201-W64 %s + +;; The IR was derived from the following source: +;; extern "C" __global__ void kernel(int* p, int x) +;; { +;; if (__builtin_amdgcn_processor_is("gfx1201") || +;; __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) +;; __builtin_amdgcn_s_sleep_var(x); +;; if (!__builtin_amdgcn_processor_is("gfx906")) +;; __builtin_amdgcn_s_wait_event_export_ready(); +;; else if (__builtin_amdgcn_processor_is("gfx1010") || +;; __builtin_amdgcn_processor_is("gfx1101")) +;; __builtin_amdgcn_s_ttracedata_imm(1); +;; while (__builtin_amdgcn_processor_is("gfx1101")) *p += x; +;; do { +;; *p -= x; +;; } while (__builtin_amdgcn_processor_is("gfx1010")); +;; for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break; +;; +;; if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)) +;; __builtin_amdgcn_s_wait_event_export_ready(); +;; else if 
(__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm)) +;; __builtin_amdgcn_s_ttracedata_imm(1); +;; +;; do { +;; *p -= x; +;; } while (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32)); +;; for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) break; +;; } + +@llvm.amdgcn.is.gfx1201 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.has.gfx12-insts = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.is.gfx906 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.is.gfx1010 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.is.gfx1101 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.has.gfx11-insts = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.has.gfx10-insts = external addrspace(1) externally_initialized constant i1 +@"llvm.amdgcn.has.gfx12-insts,wavefrontsize64" = external addrspace(1) externally_initialized constant i1 + +declare void @llvm.amdgcn.s.sleep.var(i32) +declare void @llvm.amdgcn.s.wait.event.export.ready() +declare void @llvm.amdgcn.s.ttracedata.imm(i16 immarg) + +define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) { +; GFX906-LABEL: define amdgpu_kernel void @kernel( +; GFX906-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX906-NEXT: [[ENTRY:.*:]] +; GFX906-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX906-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX906-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]] +; GFX906: [[IF_GFX1201_OR_GFX12_INSTS]]: +; GFX906-NEXT: br label %[[IF_NOT_GFX907:.*]] +; GFX906: [[IF_NOT_GFX907]]: +; GFX906-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]] +; GFX906: [[IF_GFX1010_OR_GFX1101]]: +; GFX906-NEXT: br label %[[LOR_NOT_GFX1010:.*]] +; GFX906: [[LOR_NOT_GFX1010]]: +; GFX906-NEXT: br label %[[FOR_COND:.*]] +; GFX906: [[FOR_COND]]: +; GFX906-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX906-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX906-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX906-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX906: [[IF_GFX11_INSTS]]: +; GFX906-NEXT: br label %[[IF_GFX10_INSTS:.*]] +; GFX906: [[IF_GFX10_INSTS]]: +; GFX906-NEXT: call void @llvm.assume(i1 true) +; GFX906-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX906-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX906-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX906-NEXT: ret void +; +; GFX1010-LABEL: define amdgpu_kernel void @kernel( +; GFX1010-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1010-NEXT: [[ENTRY:.*:]] +; GFX1010-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1010-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1010-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]] +; GFX1010: [[IF_GFX1201_OR_GFX12_INSTS]]: +; GFX1010-NEXT: br label %[[IF_NOT_GFX906:.*]] +; GFX1010: [[IF_NOT_GFX906]]: +; GFX1010-NEXT: br label %[[LOR_NOT_GFX1010:.*]] +; GFX1010: [[LOR_NOT_GFX1010]]: +; GFX1010-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1010-NEXT: br label %[[IF_END6:.*]] +; GFX1010: [[IF_END6]]: +; GFX1010-NEXT: call void @llvm.assume(i1 true) +; GFX1010-NEXT: call void @llvm.assume(i1 true) +; GFX1010-NEXT: br label %[[FOR_COND:.*]] +; GFX1010: [[FOR_COND]]: +; GFX1010-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr 
[[TMP1]], align 4 +; GFX1010-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1010-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1010-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1010: [[IF_GFX11_INSTS]]: +; GFX1010-NEXT: br label %[[IF_GFX10_INSTS:.*]] +; GFX1010: [[IF_GFX10_INSTS]]: +; GFX1010-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1010-NEXT: br label %[[IF_END11:.*]] +; GFX1010: [[IF_END11]]: +; GFX1010-NEXT: call void @llvm.assume(i1 true) +; GFX1010-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1010-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1010-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1010-NEXT: ret void +; +; GFX1101-LABEL: define amdgpu_kernel void @kernel( +; GFX1101-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1101-NEXT: [[ENTRY:.*:]] +; GFX1101-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1101-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1101-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]] +; GFX1101: [[IF_GFX1201_OR_GFX12_INSTS]]: +; GFX1101-NEXT: br label %[[IF_END:.*]] +; GFX1101: [[IF_END]]: +; GFX1101-NEXT: br label %[[IF_NOT_GFX907:.*]] +; GFX1101: [[IF_NOT_GFX907]]: +; GFX1101-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1101-NEXT: br label %[[IF_NOT_GFX906:.*]] +; GFX1101: [[IF_NOT_GFX906]]: +; GFX1101-NEXT: call void @llvm.assume(i1 true) +; GFX1101-NEXT: call void @llvm.assume(i1 true) +; GFX1101-NEXT: br label %[[FOR_COND:.*]] +; GFX1101: [[FOR_COND]]: +; GFX1101-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1101-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1101-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1101-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1101: [[IF_GFX11_INSTS]]: +; GFX1101-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1101-NEXT: br label %[[IF_ELSE8:.*]] +; GFX1101: [[IF_ELSE8]]: +; GFX1101-NEXT: call void @llvm.assume(i1 true) +; GFX1101-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1101-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1101-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1101-NEXT: ret void +; +; GFX1201-LABEL: define amdgpu_kernel void @kernel( +; GFX1201-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1201-NEXT: [[ENTRY:.*:]] +; GFX1201-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1201-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1201-NEXT: br label %[[LOR_NOT_GFX1201:.*]] +; GFX1201: [[LOR_NOT_GFX1201]]: +; GFX1201-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) +; GFX1201-NEXT: br label %[[IF_NOT_GFX906:.*]] +; GFX1201: [[IF_NOT_GFX906]]: +; GFX1201-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]] +; GFX1201: [[IF_GFX1010_OR_GFX1101]]: +; GFX1201-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-NEXT: br label %[[IF_END6:.*]] +; GFX1201: [[IF_END6]]: +; GFX1201-NEXT: call void @llvm.assume(i1 true) +; GFX1201-NEXT: call void @llvm.assume(i1 true) +; GFX1201-NEXT: br label %[[FOR_COND:.*]] +; GFX1201: [[FOR_COND]]: +; GFX1201-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1201-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1201: [[IF_GFX11_INSTS]]: +; GFX1201-NEXT: 
call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-NEXT: br label %[[IF_ELSE8:.*]] +; GFX1201: [[IF_ELSE8]]: +; GFX1201-NEXT: call void @llvm.assume(i1 true) +; GFX1201-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1201-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-NEXT: ret void +; +; GFX1201-W64-LABEL: define amdgpu_kernel void @kernel( +; GFX1201-W64-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1201-W64-NEXT: [[ENTRY:.*:]] +; GFX1201-W64-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1201-W64-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1201-W64-NEXT: br label %[[LOR_NOT_GFX1201:.*]] +; GFX1201-W64: [[LOR_NOT_GFX1201]]: +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) +; GFX1201-W64-NEXT: br label %[[IF_NOT_GFX906:.*]] +; GFX1201-W64: [[IF_NOT_GFX906]]: +; GFX1201-W64-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]] +; GFX1201-W64: [[IF_GFX1010_OR_GFX1101]]: +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-W64-NEXT: br label %[[IF_END6:.*]] +; GFX1201-W64: [[IF_END6]]: +; GFX1201-W64-NEXT: call void @llvm.assume(i1 true) +; GFX1201-W64-NEXT: call void @llvm.assume(i1 true) +; GFX1201-W64-NEXT: br label %[[FOR_COND:.*]] +; GFX1201-W64: [[FOR_COND]]: +; GFX1201-W64-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1201-W64-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1201-W64: [[IF_GFX11_INSTS]]: +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-W64-NEXT: br label %[[IF_ELSE8:.*]] +; GFX1201-W64: [[IF_ELSE8]]: +; GFX1201-W64-NEXT: call void @llvm.assume(i1 true) +; GFX1201-W64-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1201-W64-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: ret void +; +entry: + %0 = ptrtoint ptr addrspace(1) %p.coerce to i64 + %1 = inttoptr i64 %0 to ptr + %2 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1201, align 1 + br i1 %2, label %if.gfx1201.or.gfx12-insts, label %lor.not.gfx1201 + +lor.not.gfx1201: + %3 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx12-insts, align 1 + br i1 %3, label %if.gfx1201.or.gfx12-insts, label %if.end + +if.gfx1201.or.gfx12-insts: + call void @llvm.amdgcn.s.sleep.var(i32 %x) + br label %if.end + +if.end: + %4 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx906, align 1 + br i1 %4, label %if.gfx906, label %if.not.gfx906 + +if.not.gfx906: + call void @llvm.amdgcn.s.wait.event.export.ready() + br label %if.end6 + +if.gfx906: + %5 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1010, align 1 + br i1 %5, label %if.gfx1010.or.gfx1101, label %lor.not.gfx1010 + +lor.not.gfx1010: + %6 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1101, align 1 + br i1 %6, label %if.gfx1010.or.gfx1101, label %for.cond + +if.gfx1010.or.gfx1101: + call void @llvm.amdgcn.s.ttracedata.imm(i16 1) + br label %if.end6 + +if.end6: + %.pr.pr = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1101, align 1 + %7 = icmp ne i1 %.pr.pr, true + call void @llvm.assume(i1 %7) + %.pr6.pr = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1010, align 1 + %8 = icmp ne i1 %.pr6.pr, true + call void @llvm.assume(i1 %8) + br label 
%for.cond + +for.cond: + %.promoted = load i32, ptr %1, align 4 + %sub.peel = sub nsw i32 %.promoted, %x + store i32 %sub.peel, ptr %1, align 4 + %9 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx11-insts, align 1 + br i1 %9, label %if.gfx11-insts, label %if.else8 + +if.gfx11-insts: + call void @llvm.amdgcn.s.wait.event.export.ready() + br label %if.end11 + +if.else8: + %10 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1 + br i1 %10, label %if.gfx10-insts, label %if.end11 + +if.gfx10-insts: + call void @llvm.amdgcn.s.ttracedata.imm(i16 1) + br label %if.end11 + +if.end11: + %.pr7 = load i1, ptr addrspace(1) @"llvm.amdgcn.has.gfx12-insts,wavefrontsize64", align 1 + %11 = icmp ne i1 %.pr7, true + call void @llvm.assume(i1 %11) + %.promoted9 = load i32, ptr %1, align 4 + %sub13.peel = sub nsw i32 %.promoted9, %x + store i32 %sub13.peel, ptr %1, align 4 + ret void +} + +declare void @llvm.assume(i1 noundef) diff --git a/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll b/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll index b1fc76f457ece..93f43b274e28d 100644 --- a/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll +++ b/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll @@ -2,16 +2,22 @@ ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto" -print-pipeline-passes %s -o - | FileCheck %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto" -print-pipeline-passes %s -o - | FileCheck %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto" -print-pipeline-passes %s -o - | FileCheck %s +; RUN: opt -mtriple=amdgcn--amdhsa -S -O0 -print-pipeline-passes %s -o - | FileCheck --check-prefix=O0 %s +; RUN: opt -mtriple=amdgcn--amdhsa -S -O1 -print-pipeline-passes %s -o - | FileCheck %s +; RUN: opt -mtriple=amdgcn--amdhsa -S -O2 -print-pipeline-passes %s -o - | FileCheck %s +; RUN: opt -mtriple=amdgcn--amdhsa -S -O3 -print-pipeline-passes %s -o - | FileCheck %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s - +; CHECK: amdgpu-expand-feature-predicates ; CHECK: amdgpu-attributor +; O0: amdgpu-expand-feature-predicates ; O0-NOT: amdgpu-attributor +; PRE: amdgpu-expand-feature-predicates ; PRE-NOT: internalize ; PRE-NOT: amdgpu-attributor ; PRE-NOT: printfToRuntime