diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index a42a546555716..0f38510880290 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -5025,6 +5025,118 @@ If no address spaces names are provided, all address spaces are fenced. __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local") __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local", "global") +__builtin_amdgcn_processor_is and __builtin_amdgcn_is_invocable +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``__builtin_amdgcn_processor_is`` and ``__builtin_amdgcn_is_invocable`` provide +a functional mechanism for programmatically querying: + +* the identity of the current target processor; +* the capability of the current target processor to invoke a particular builtin. + +**Syntax**: + +.. code-block:: c + + __amdgpu_feature_predicate_t __builtin_amdgcn_processor_is(const char*); + __amdgpu_feature_predicate_t __builtin_amdgcn_is_invocable(builtin_name); + +**Example of use**: + +.. code-block:: c++ + + if (__builtin_amdgcn_processor_is("gfx1201") || + __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) + __builtin_amdgcn_s_sleep_var(x); + + if (!__builtin_amdgcn_processor_is("gfx906")) + __builtin_amdgcn_s_wait_event_export_ready(); + else if (__builtin_amdgcn_processor_is("gfx1010") || + __builtin_amdgcn_processor_is("gfx1101")) + __builtin_amdgcn_s_ttracedata_imm(1); + + while (__builtin_amdgcn_processor_is("gfx1101")) *p += x; + + do { + break; + } while (__builtin_amdgcn_processor_is("gfx1010")); + + for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break; + + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)) + __builtin_amdgcn_s_wait_event_export_ready(); + else if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm)) + __builtin_amdgcn_s_ttracedata_imm(1); + + do { + break; + } while ( + __builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32)); + + for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) + break; + +**Description**: + +The builtins return a value of type ``__amdgpu_feature_predicate_t``, which is a +target-specific type that behaves as if its C++ definition were the following: + +.. code-block:: c++ + + struct __amdgpu_feature_predicate_t { + __amdgpu_feature_predicate_t() = delete; + __amdgpu_feature_predicate_t(const __amdgpu_feature_predicate_t&) = delete; + __amdgpu_feature_predicate_t(__amdgpu_feature_predicate_t&&) = delete; + + explicit + operator bool() const noexcept; + }; + +The builtins can be used in C as well, wherein the +``__amdgpu_feature_predicate_t`` type behaves as an opaque, forward-declared +type with conditional automatic conversion to ``_Bool`` when used as the +predicate argument to a control structure: + +.. 
code-block:: c + + struct __amdgpu_feature_predicate_t ret(); // Error + void arg(struct __amdgpu_feature_predicate_t); // Error + void local() { + struct __amdgpu_feature_predicate_t x; // Error + struct __amdgpu_feature_predicate_t y = + __builtin_amdgcn_processor_is("gfx900"); // Error + } + void valid_use() { + _Bool x = (_Bool)__builtin_amdgcn_processor_is("gfx900"); // OK + if (__builtin_amdgcn_processor_is("gfx900")) // Implicit cast to _Bool + return; + for (; __builtin_amdgcn_processor_is("gfx900");) // Implicit cast to _Bool + break; + while (__builtin_amdgcn_processor_is("gfx900")) // Implicit cast to _Bool + break; + do { + break; + } while (__builtin_amdgcn_processor_is("gfx900")); // Implicit cast to _Bool + + __builtin_amdgcn_processor_is("gfx900") ? x : !x; + } + +The boolean interpretation of the predicate values returned by the builtins: + +* indicates whether the current target matches the argument; the argument MUST + be a string literal and a valid AMDGPU target +* indicates whether the builtin function passed as the argument can be invoked + by the current target; the argument MUST be either a generic or an + AMDGPU-specific builtin name + +When invoked while compiling for a concrete target, the builtins are evaluated +early by Clang and produce no CodeGen effects, i.e. they have no observable +side-effects in IR. Conversely, when compiling for AMDGCN flavoured SPIR-V, +which is an abstract target, a series of predicate values is implicitly +created. These predicates are resolved when the compilation is finalized for a +concrete target, and reflect that target's identity and features. Thus, it is +possible to author high-level code, e.g. in HIP, that is target adaptive in a +dynamic fashion, unlike macro-based mechanisms. ARM/AArch64 Language Extensions ------------------------------- diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 9e16613826a77..3ec0dd4276735 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -975,6 +975,10 @@ AMDGPU Support ^^^^^^^^^^^^^^ - Bump the default code object version to 6. ROCm 6.3 is required to run any program compiled with COV6. +- Introduced a new target-specific builtin ``__builtin_amdgcn_processor_is``, + a late / deferred query for the current target processor. +- Introduced a new target-specific builtin ``__builtin_amdgcn_is_invocable``, + which enables fine-grained, per-builtin feature availability queries. 
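To make the adaptive-dispatch idiom these builtins enable concrete, here is a minimal sketch, in HIP-flavoured C++, of a device function that guards a target-specific builtin and falls back to portable code; the ``slow_wait`` helper is hypothetical and stands in for whatever generic fallback an application provides:

.. code-block:: c++

   #define __device__ __attribute__((device))

   // Hypothetical portable fallback, for illustration only.
   __device__ void slow_wait(int cycles) {
     while (cycles-- > 0)
       ; // busy-wait stand-in
   }

   __device__ void adaptive_sleep(int cycles) {
     // On a concrete target Clang folds this predicate early; on AMDGCN
     // flavoured SPIR-V it becomes a placeholder global that is resolved
     // once the concrete target is known.
     if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
       __builtin_amdgcn_s_sleep_var(cycles);
     else
       slow_wait(cycles);
   }
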
NVPTX Support ^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/AMDGPUTypes.def b/clang/include/clang/Basic/AMDGPUTypes.def index d3dff446f9edf..a0574c640184b 100644 --- a/clang/include/clang/Basic/AMDGPUTypes.def +++ b/clang/include/clang/Basic/AMDGPUTypes.def @@ -20,10 +20,18 @@ AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) #endif +#ifndef AMDGPU_FEATURE_PREDICATE_TYPE +#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align) \ + AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) +#endif + AMDGPU_OPAQUE_PTR_TYPE("__amdgpu_buffer_rsrc_t", AMDGPUBufferRsrc, AMDGPUBufferRsrcTy, 128, 128, 8) AMDGPU_NAMED_BARRIER_TYPE("__amdgpu_named_workgroup_barrier_t", AMDGPUNamedWorkgroupBarrier, AMDGPUNamedWorkgroupBarrierTy, 128, 32, 0) +AMDGPU_FEATURE_PREDICATE_TYPE("__amdgpu_feature_predicate_t", AMDGPUFeaturePredicate, AMDGPUFeaturePredicateTy, 1, 1) + #undef AMDGPU_TYPE #undef AMDGPU_OPAQUE_PTR_TYPE #undef AMDGPU_NAMED_BARRIER_TYPE +#undef AMDGPU_FEATURE_PREDICATE_TYPE diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def index 48437c9397570..27f78af16fe06 100644 --- a/clang/include/clang/Basic/Builtins.def +++ b/clang/include/clang/Basic/Builtins.def @@ -34,6 +34,7 @@ // Q -> target builtin type, followed by a character to distinguish the builtin type // Qa -> AArch64 svcount_t builtin type. // Qb -> AMDGPU __amdgpu_buffer_rsrc_t builtin type. +// Qc -> AMDGPU __amdgpu_feature_predicate_t builtin type. // E -> ext_vector, followed by the number of elements and the base type. // X -> _Complex, followed by the base type. // Y -> ptrdiff_t diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index a5ee8013adff6..c0f30a718998f 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -350,6 +350,11 @@ BUILTIN(__builtin_amdgcn_endpgm, "v", "nr") BUILTIN(__builtin_amdgcn_get_fpenv, "WUi", "n") BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n") +// These are special FE-only builtins intended for forwarding the requirements +// to the ME. +BUILTIN(__builtin_amdgcn_processor_is, "QccC*", "nctu") +BUILTIN(__builtin_amdgcn_is_invocable, "Qc", "nctu") + //===----------------------------------------------------------------------===// // R600-NI only builtins. 
//===----------------------------------------------------------------------===// diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 9a7a308600763..bcfc0d99a8bfd 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1755,3 +1755,6 @@ def ExplicitSpecializationStorageClass : DiagGroup<"explicit-specialization-stor // A warning for options that enable a feature that is not yet complete def ExperimentalOption : DiagGroup<"experimental-option">; + +// Warnings about unguarded usages of AMDGPU target-specific constructs +def UnguardedBuiltinUsageAMDGPU : DiagGroup<"amdgpu-unguarded-builtin-usage">; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 09ba796b22765..16cb7814d0626 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12526,7 +12526,7 @@ def warn_zero_as_null_pointer_constant : Warning< InGroup<DiagGroup<"zero-as-null-pointer-constant">>, DefaultIgnore; def warn_not_eliding_copy_on_return : Warning< - "not eliding copy on return">, + "not eliding copy on return">, InGroup<DiagGroup<"nrvo">>, DefaultIgnore; def err_nullability_cs_multilevel : Error< @@ -13464,4 +13464,31 @@ def err_acc_device_type_multiple_archs // AMDGCN builtins diagnostics def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">; def note_amdgcn_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">; +def err_amdgcn_processor_is_arg_not_literal + : Error<"the argument to __builtin_amdgcn_processor_is must be a string " + "literal">; +def err_amdgcn_processor_is_arg_invalid_value + : Error<"the argument to __builtin_amdgcn_processor_is must be a valid " + "AMDGCN processor identifier; '%0' is not valid">; +def note_amdgcn_processor_is_valid_options + : Note<"valid AMDGCN processor identifiers are: %0">; +def err_amdgcn_is_invocable_arg_invalid_value + : Error<"the argument to __builtin_amdgcn_is_invocable must be either a " + "target agnostic builtin or an AMDGCN target specific builtin; `%0`" + " is not valid">; +def err_amdgcn_predicate_type_is_not_constructible + : Error<"%0 has type __amdgpu_feature_predicate_t, which is not" + " constructible">; +def err_amdgcn_predicate_type_needs_explicit_bool_cast + : Error<"%0 must be explicitly cast to %1; however, please note that this " + "is almost always an error and that it prevents the effective " + "guarding of target dependent code, and thus should be avoided">; +def note_amdgcn_protected_by_predicate : Note<"jump enters statement controlled" + " by AMDGPU feature predicate">; +def warn_amdgcn_unguarded_builtin : + Warning<"%0 might be unavailable on some AMDGPU targets">, + InGroup<UnguardedBuiltinUsageAMDGPU>, DefaultIgnore; +def note_amdgcn_unguarded_builtin_silence + : Note<"enclose %0 in a __builtin_amdgcn_is_invocable check to silence " + "this warning">; } // end of sema component. 
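The diagnostics above map onto user code as follows; this is a minimal sketch (names are illustrative) of code that would trigger each of them when compiling for AMDGCN flavoured SPIR-V with ``-Wamdgpu-unguarded-builtin-usage`` enabled:

.. code-block:: c++

   void diagnostics_demo(int x) {
     // warn_amdgcn_unguarded_builtin + note_amdgcn_unguarded_builtin_silence
     // (the group is DefaultIgnore, so these only fire with the flag above).
     __builtin_amdgcn_s_sleep(42);

     // err_amdgcn_predicate_type_is_not_constructible.
     __amdgpu_feature_predicate_t p =
         __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep);

     // err_amdgcn_predicate_type_needs_explicit_bool_cast.
     bool b = __builtin_amdgcn_processor_is("gfx900");

     // Guarded use: no diagnostic.
     if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var))
       __builtin_amdgcn_s_sleep_var(x);
   }
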
diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h index d62c9bb65fadb..1a6752d7ec0d6 100644 --- a/clang/include/clang/Sema/SemaAMDGPU.h +++ b/clang/include/clang/Sema/SemaAMDGPU.h @@ -15,12 +15,17 @@ #include "clang/AST/ASTFwd.h" #include "clang/Sema/SemaBase.h" +#include "llvm/ADT/SmallPtrSet.h" namespace clang { class AttributeCommonInfo; +class Expr; class ParsedAttr; class SemaAMDGPU : public SemaBase { + llvm::SmallPtrSet<Expr *, 4> ExpandedPredicates; + llvm::SmallPtrSet<FunctionDecl *, 4> PotentiallyUnguardedBuiltinUsers; + public: SemaAMDGPU(Sema &S); @@ -64,6 +69,16 @@ class SemaAMDGPU : public SemaBase { void handleAMDGPUNumVGPRAttr(Decl *D, const ParsedAttr &AL); void handleAMDGPUMaxNumWorkGroupsAttr(Decl *D, const ParsedAttr &AL); void handleAMDGPUFlatWorkGroupSizeAttr(Decl *D, const ParsedAttr &AL); + + /// Expand a valid use of the feature identification builtins into its + /// corresponding sequence of instructions. + Expr *ExpandAMDGPUPredicateBI(CallExpr *CE); + bool IsPredicate(Expr *E) const; + /// Diagnose unguarded usages of AMDGPU builtins and recommend guarding with + /// __builtin_amdgcn_is_invocable + void AddPotentiallyUnguardedBuiltinUser(FunctionDecl *FD); + bool HasPotentiallyUnguardedBuiltinUsage(FunctionDecl *FD) const; + void DiagnoseUnguardedBuiltinUsage(FunctionDecl *FD); }; } // namespace clang diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 679812adcdf12..dedfb396200fd 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -1476,7 +1476,12 @@ void ASTContext::InitBuiltinTypes(const TargetInfo &Target, } if (Target.getTriple().isAMDGPU() || - (AuxTarget && AuxTarget->getTriple().isAMDGPU())) { + (Target.getTriple().isSPIRV() && + Target.getTriple().getVendor() == llvm::Triple::AMD) || + (AuxTarget && + (AuxTarget->getTriple().isAMDGPU() || + ((AuxTarget->getTriple().isSPIRV() && + AuxTarget->getTriple().getVendor() == llvm::Triple::AMD))))) { #define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ InitBuiltinType(SingletonId, BuiltinType::Id); #include "clang/Basic/AMDGPUTypes.def" @@ -12499,6 +12504,10 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context, Type = Context.AMDGPUBufferRsrcTy; break; } + case 'c': { + Type = Context.AMDGPUFeaturePredicateTy; + break; + } default: llvm_unreachable("Unexpected target builtin type"); } diff --git a/clang/lib/Basic/Targets/SPIR.cpp b/clang/lib/Basic/Targets/SPIR.cpp index 2336fb3ef0495..19f160e279b21 100644 --- a/clang/lib/Basic/Targets/SPIR.cpp +++ b/clang/lib/Basic/Targets/SPIR.cpp @@ -181,3 +181,12 @@ void SPIRV64AMDGCNTargetInfo::setAuxTarget(const TargetInfo *Aux) { Float128Format = DoubleFormat; } } + +bool SPIRV64AMDGCNTargetInfo::isValidCPUName(StringRef CPU) const { + return AMDGPUTI.isValidCPUName(CPU); +} + +void SPIRV64AMDGCNTargetInfo::fillValidCPUList( + SmallVectorImpl<StringRef> &Values) const { + return AMDGPUTI.fillValidCPUList(Values); +} diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index 1abf798d93129..1b4b6332eebb6 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -454,6 +454,11 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64AMDGCNTargetInfo final } bool hasInt128Type() const override { return TargetInfo::hasInt128Type(); } + + // This is only needed for validating arguments passed to + // __builtin_amdgcn_processor_is + bool isValidCPUName(StringRef Name) const override; + void fillValidCPUList(SmallVectorImpl<StringRef> &Values) const override; }; } // 
namespace targets diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 8bf7d24d8551d..2936bd34b7ad5 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -1029,6 +1029,13 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) { DBuilder.createBasicType(Name, Width, llvm::dwarf::DW_ATE_unsigned); \ return SingletonId; \ } +#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align) \ + case BuiltinType::Id: { \ + if (!SingletonId) \ + SingletonId = \ + DBuilder.createBasicType(Name, Width, llvm::dwarf::DW_ATE_boolean); \ + return SingletonId; \ + } #include "clang/Basic/AMDGPUTypes.def" case BuiltinType::UChar: case BuiltinType::Char_U: diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 44931d0481e26..c996f6b781ac6 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -982,6 +982,10 @@ Value *ScalarExprEmitter::EmitConversionToBool(Value *Src, QualType SrcType) { if (const MemberPointerType *MPT = dyn_cast<MemberPointerType>(SrcType)) return CGF.CGM.getCXXABI().EmitMemberPointerIsNotNull(CGF, Src, MPT); + // The conversion is a NOP, and will be done when CodeGening the builtin. + if (SrcType == CGF.getContext().AMDGPUFeaturePredicateTy) + return Src; + assert((SrcType->isIntegerType() || isa<llvm::PointerType>(Src->getType())) && "Unknown scalar type to convert"); diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index c98503e4bbd26..53bb6e7b85142 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -580,6 +580,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { case BuiltinType::Id: \ return llvm::TargetExtType::get(getLLVMContext(), "amdgcn.named.barrier", \ {}, {Scope}); +#define AMDGPU_FEATURE_PREDICATE_TYPE(Name, Id, SingletonId, Width, Align) \ + case BuiltinType::Id: \ + return ConvertType(getContext().getLogicalOperationType()); #include "clang/Basic/AMDGPUTypes.def" #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id: #include "clang/Basic/HLSLIntangibleTypes.def" diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index f09b3b92c4ea0..10a992144b5a9 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -295,6 +295,18 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst, Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs)); } +static Value *GetOrInsertAMDGPUPredicate(CodeGenFunction &CGF, Twine Name) { + auto PTy = IntegerType::getInt1Ty(CGF.getLLVMContext()); + + auto *P = cast<GlobalVariable>( + CGF.CGM.getModule().getOrInsertGlobal(Name.str(), PTy)); + P->setConstant(true); + P->setExternallyInitialized(true); + + return CGF.Builder.CreateLoad( + RawAddress(P, PTy, CharUnits::One(), KnownNonNull)); +} + Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent; @@ -637,6 +649,23 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Value *Env = EmitScalarExpr(E->getArg(0)); return Builder.CreateCall(F, {Env}); } + case AMDGPU::BI__builtin_amdgcn_processor_is: { + assert(CGM.getTriple().isSPIRV() && + "__builtin_amdgcn_processor_is should never reach CodeGen for " + "concrete targets!"); + StringRef Proc = cast<StringLiteral>(E->getArg(0))->getString(); + return GetOrInsertAMDGPUPredicate(*this, 
"llvm.amdgcn.is." + Proc); + } + case AMDGPU::BI__builtin_amdgcn_is_invocable: { + assert(CGM.getTriple().isSPIRV() && + "__builtin_amdgcn_is_invocable should never reach CodeGen for " + "concrete targets!"); + auto *FD = cast( + cast(E->getArg(0))->getReferencedDeclOfCallee()); + StringRef RF = + getContext().BuiltinInfo.getRequiredFeatures(FD->getBuiltinID()); + return GetOrInsertAMDGPUPredicate(*this, "llvm.amdgcn.has." + RF); + } case AMDGPU::BI__builtin_amdgcn_read_exec: return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false); case AMDGPU::BI__builtin_amdgcn_read_exec_lo: diff --git a/clang/lib/Sema/JumpDiagnostics.cpp b/clang/lib/Sema/JumpDiagnostics.cpp index 36704c3826dfd..2c6ae89513241 100644 --- a/clang/lib/Sema/JumpDiagnostics.cpp +++ b/clang/lib/Sema/JumpDiagnostics.cpp @@ -19,6 +19,7 @@ #include "clang/AST/StmtOpenACC.h" #include "clang/AST/StmtOpenMP.h" #include "clang/Basic/SourceLocation.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaInternal.h" #include "llvm/ADT/BitVector.h" using namespace clang; @@ -367,8 +368,10 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S, case Stmt::IfStmtClass: { IfStmt *IS = cast(S); + bool AMDGPUPredicate = false; if (!(IS->isConstexpr() || IS->isConsteval() || - IS->isObjCAvailabilityCheck())) + IS->isObjCAvailabilityCheck() || + (AMDGPUPredicate = this->S.AMDGPU().IsPredicate(IS->getCond())))) break; unsigned Diag = diag::note_protected_by_if_available; @@ -376,6 +379,8 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S, Diag = diag::note_protected_by_constexpr_if; else if (IS->isConsteval()) Diag = diag::note_protected_by_consteval_if; + else if (AMDGPUPredicate) + Diag = diag::note_amdgcn_protected_by_predicate; if (VarDecl *Var = IS->getConditionVariable()) BuildScopeInformation(Var, ParentScope); diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 56608e990fd50..6ccd22126581d 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -571,8 +571,13 @@ void Sema::Initialize() { } if (Context.getTargetInfo().getTriple().isAMDGPU() || + (Context.getTargetInfo().getTriple().isSPIRV() && + Context.getTargetInfo().getTriple().getVendor() == llvm::Triple::AMD) || (Context.getAuxTargetInfo() && - Context.getAuxTargetInfo()->getTriple().isAMDGPU())) { + (Context.getAuxTargetInfo()->getTriple().isAMDGPU() || + (Context.getAuxTargetInfo()->getTriple().isSPIRV() && + Context.getAuxTargetInfo()->getTriple().getVendor() == + llvm::Triple::AMD)))) { #define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ addImplicitTypedef(Name, Context.SingletonId); #include "clang/Basic/AMDGPUTypes.def" diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index e6414a623b929..af4668e17ed68 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -11,9 +11,14 @@ //===----------------------------------------------------------------------===// #include "clang/Sema/SemaAMDGPU.h" +#include "clang/AST/Decl.h" +#include "clang/AST/DynamicRecursiveASTVisitor.h" +#include "clang/AST/Expr.h" #include "clang/Basic/DiagnosticSema.h" #include "clang/Basic/TargetBuiltins.h" +#include "clang/Basic/TargetInfo.h" #include "clang/Sema/Ownership.h" +#include "clang/Sema/Scope.h" #include "clang/Sema/Sema.h" #include "llvm/Support/AtomicOrdering.h" #include @@ -367,4 +372,189 @@ void SemaAMDGPU::handleAMDGPUMaxNumWorkGroupsAttr(Decl *D, addAMDGPUMaxNumWorkGroupsAttr(D, AL, AL.getArgAsExpr(0), YExpr, ZExpr); } +Expr *SemaAMDGPU::ExpandAMDGPUPredicateBI(CallExpr *CE) { + 
ASTContext &Ctx = getASTContext(); + QualType BoolTy = Ctx.getLogicalOperationType(); + llvm::APInt False = llvm::APInt::getZero(Ctx.getIntWidth(BoolTy)); + llvm::APInt True = llvm::APInt::getAllOnes(Ctx.getIntWidth(BoolTy)); + SourceLocation Loc = CE->getExprLoc(); + + if (!CE->getBuiltinCallee()) + return *ExpandedPredicates + .insert(IntegerLiteral::Create(Ctx, False, BoolTy, Loc)) + .first; + + bool P = false; + unsigned BI = CE->getBuiltinCallee(); + if (Ctx.BuiltinInfo.isAuxBuiltinID(BI)) + BI = Ctx.BuiltinInfo.getAuxBuiltinID(BI); + + if (BI == AMDGPU::BI__builtin_amdgcn_processor_is) { + auto *GFX = dyn_cast<StringLiteral>(CE->getArg(0)->IgnoreParenCasts()); + if (!GFX) { + Diag(Loc, diag::err_amdgcn_processor_is_arg_not_literal); + return nullptr; + } + + StringRef N = GFX->getString(); + const TargetInfo &TI = Ctx.getTargetInfo(); + const TargetInfo *AuxTI = Ctx.getAuxTargetInfo(); + if (!TI.isValidCPUName(N) && (!AuxTI || !AuxTI->isValidCPUName(N))) { + Diag(Loc, diag::err_amdgcn_processor_is_arg_invalid_value) << N; + SmallVector<StringRef> ValidList; + if (TI.getTriple().getVendor() == llvm::Triple::VendorType::AMD) + TI.fillValidCPUList(ValidList); + else if (AuxTI) // Since the BI is present it must be an AMDGPU triple. + AuxTI->fillValidCPUList(ValidList); + if (!ValidList.empty()) + Diag(Loc, diag::note_amdgcn_processor_is_valid_options) + << llvm::join(ValidList, ", "); + return nullptr; + } + if (Ctx.getTargetInfo().getTriple().isSPIRV()) { + CE->setType(BoolTy); + return *ExpandedPredicates.insert(CE).first; + } + + if (auto TID = Ctx.getTargetInfo().getTargetID()) + P = TID->find(N) == 0; + } else { + Expr *Arg = CE->getArg(0); + if (!Arg || Arg->getType() != Ctx.BuiltinFnTy) { + Diag(Loc, diag::err_amdgcn_is_invocable_arg_invalid_value) << Arg; + return nullptr; + } + + if (Ctx.getTargetInfo().getTriple().isSPIRV()) { + CE->setType(BoolTy); + return *ExpandedPredicates.insert(CE).first; + } + + auto *FD = cast<FunctionDecl>(Arg->getReferencedDeclOfCallee()); + + StringRef RF = Ctx.BuiltinInfo.getRequiredFeatures(FD->getBuiltinID()); + llvm::StringMap<bool> CF; + Ctx.getFunctionFeatureMap(CF, FD); + + P = Builtin::evaluateRequiredTargetFeatures(RF, CF); + } + + return *ExpandedPredicates + .insert( + IntegerLiteral::Create(Ctx, P ? True : False, BoolTy, Loc)) + .first; +} + +bool SemaAMDGPU::IsPredicate(Expr *E) const { + return ExpandedPredicates.contains(E); +} + +void SemaAMDGPU::AddPotentiallyUnguardedBuiltinUser(FunctionDecl *FD) { + PotentiallyUnguardedBuiltinUsers.insert(FD); +} + +bool SemaAMDGPU::HasPotentiallyUnguardedBuiltinUsage(FunctionDecl *FD) const { + return PotentiallyUnguardedBuiltinUsers.contains(FD); +} + +namespace { +/// This class implements -Wamdgpu-unguarded-builtin-usage. +/// +/// This is done with a traversal of the AST of a function that includes a +/// call to a target-specific builtin. Whenever we encounter an \c if of the +/// form: \c if(__builtin_amdgcn_is_invocable), we consider the then statement +/// guarded. 
+class DiagnoseUnguardedBuiltins : public DynamicRecursiveASTVisitor { + // TODO: this is conservative, and should be extended to: + // - warn on unguarded ASM usage (__builtin_amdgcn_processor_is as the + // guard); + // - build sets of builtins which are invocable from nested + // if (__builtin_amdgcn_is_invocable) calls, rather than assume + // sanity / that the existence of a guard implies its correctness; + // - derive the set of available builtins / valid ASM constraints from + // the target architecture passed to __builtin_amdgcn_processor_is; + // - consider attributes such as target. + Sema &SemaRef; + + unsigned Guards; + +public: + DiagnoseUnguardedBuiltins(Sema &SemaRef) : SemaRef(SemaRef), Guards(0u) {} + + bool TraverseLambdaExpr(LambdaExpr *LE) override { + if (SemaRef.AMDGPU().HasPotentiallyUnguardedBuiltinUsage( + LE->getCallOperator())) + return true; // We have already handled this. + return DynamicRecursiveASTVisitor::TraverseLambdaExpr(LE); + } + + bool TraverseStmt(Stmt *S) override { + if (!S) + return true; + return DynamicRecursiveASTVisitor::TraverseStmt(S); + } + + void IssueDiagnostics(Stmt *S) { TraverseStmt(S); } + + bool TraverseIfStmt(IfStmt *If) override; + + bool TraverseCaseStmt(CaseStmt *CS) override { + return TraverseStmt(CS->getSubStmt()); + } + + bool VisitCallExpr(CallExpr *CE) override; +}; + +inline Expr *FindPredicate(Expr *Cond) { + if (auto *CE = dyn_cast<CallExpr>(Cond)) { + if (CE->getBuiltinCallee() == AMDGPU::BI__builtin_amdgcn_is_invocable) + return Cond; + } else if (auto *UO = dyn_cast<UnaryOperator>(Cond)) { + return FindPredicate(UO->getSubExpr()); + } else if (auto *BO = dyn_cast<BinaryOperator>(Cond)) { + if ((Cond = FindPredicate(BO->getLHS()))) + return Cond; + return FindPredicate(BO->getRHS()); + } + return nullptr; +} + +bool DiagnoseUnguardedBuiltins::TraverseIfStmt(IfStmt *If) { + if (FindPredicate(If->getCond())) { + ++Guards; + bool Continue = TraverseStmt(If->getThen()); + --Guards; + + return Continue && TraverseStmt(If->getElse()); + } + + return DynamicRecursiveASTVisitor::TraverseIfStmt(If); +} + +bool DiagnoseUnguardedBuiltins::VisitCallExpr(CallExpr *CE) { + if (Guards) + return true; + + unsigned ID = CE->getBuiltinCallee(); + + if (!ID) + return true; + if (!SemaRef.getASTContext().BuiltinInfo.isTSBuiltin(ID)) + return true; + if (ID == AMDGPU::BI__builtin_amdgcn_processor_is || + ID == AMDGPU::BI__builtin_amdgcn_is_invocable) + return true; + + SemaRef.Diag(CE->getExprLoc(), diag::warn_amdgcn_unguarded_builtin) + << CE->getDirectCallee(); + SemaRef.Diag(CE->getExprLoc(), diag::note_amdgcn_unguarded_builtin_silence) + << CE->getDirectCallee(); + + return true; +} +} // Unnamed namespace + +void SemaAMDGPU::DiagnoseUnguardedBuiltinUsage(FunctionDecl *FD) { + DiagnoseUnguardedBuiltins(SemaRef).IssueDiagnostics(FD->getBody()); +} } // namespace clang diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index e27ed8fd4de14..f0ab9eb2fec36 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -23,6 +23,7 @@ #include "clang/Basic/TargetInfo.h" #include "clang/Lex/Preprocessor.h" #include "clang/Sema/Initialization.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaHLSL.h" #include "clang/Sema/SemaObjC.h" #include "clang/Sema/SemaRISCV.h" @@ -1595,6 +1596,14 @@ static TryCastResult TryStaticCast(Sema &Self, ExprResult &SrcExpr, return TC_Success; } + + if (SrcType == Self.Context.AMDGPUFeaturePredicateTy && + DestType == Self.Context.getLogicalOperationType()) { + SrcExpr = Self.AMDGPU().ExpandAMDGPUPredicateBI( 
+ dyn_cast<CallExpr>(SrcExpr.get())); + Kind = CK_NoOp; + return TC_Success; + } + // We tried everything. Everything! Nothing works! :-( return TC_NotApplicable; } diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 11cbda412667f..f586f09bce19a 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -45,6 +45,7 @@ #include "clang/Sema/ParsedTemplate.h" #include "clang/Sema/Scope.h" #include "clang/Sema/ScopeInfo.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaARM.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaHLSL.h" @@ -13636,6 +13637,16 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { return; } + + // __amdgpu_feature_predicate_t cannot be initialised + if (VDecl->getType().getDesugaredType(Context) == + Context.AMDGPUFeaturePredicateTy) { + Diag(VDecl->getLocation(), + diag::err_amdgcn_predicate_type_is_not_constructible) + << VDecl; + VDecl->setInvalidDecl(); + return; + } + // WebAssembly tables can't be used to initialise a variable. if (!Init->getType().isNull() && Init->getType()->isWebAssemblyTableType()) { Diag(Init->getExprLoc(), diag::err_wasm_table_art) << 0; @@ -14129,6 +14140,13 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) { if (VarDecl *Var = dyn_cast<VarDecl>(RealDecl)) { QualType Type = Var->getType(); + if (Type.getDesugaredType(Context) == Context.AMDGPUFeaturePredicateTy) { + Diag(Var->getLocation(), + diag::err_amdgcn_predicate_type_is_not_constructible) + << Var; + Var->setInvalidDecl(); + return; + } // C++1z [dcl.dcl]p1 grammar implies that an initializer is mandatory. if (isa<DecompositionDecl>(RealDecl)) { Diag(Var->getLocation(), diag::err_decomp_decl_requires_init) << Var; @@ -16585,8 +16603,12 @@ Decl *Sema::ActOnFinishFunctionBody(Decl *dcl, Stmt *Body, return nullptr; } - if (Body && FSI->HasPotentialAvailabilityViolations) - DiagnoseUnguardedAvailabilityViolations(dcl); + if (Body) { + if (FSI->HasPotentialAvailabilityViolations) + DiagnoseUnguardedAvailabilityViolations(dcl); + else if (AMDGPU().HasPotentiallyUnguardedBuiltinUsage(FD)) + AMDGPU().DiagnoseUnguardedBuiltinUsage(FD); + } assert(!FSI->ObjCShouldCallSuper && "This should only be set for ObjC methods, which should have been " diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 728ada33e2e63..0eb95826342ff 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -51,6 +51,7 @@ #include "clang/Sema/ParsedTemplate.h" #include "clang/Sema/Scope.h" #include "clang/Sema/ScopeInfo.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaARM.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaFixItUtils.h" @@ -6570,6 +6571,22 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, if (Result.isInvalid()) return ExprError(); Fn = Result.get(); + // The __builtin_amdgcn_is_invocable builtin is special, and will be resolved + // later, when we check boolean conditions; for now we merely forward it + // without any additional checking. 
+ if (Fn->getType() == Context.BuiltinFnTy && ArgExprs.size() == 1 && + ArgExprs[0]->getType() == Context.BuiltinFnTy) { + const auto *FD = cast<FunctionDecl>(Fn->getReferencedDeclOfCallee()); + + if (FD->getName() == "__builtin_amdgcn_is_invocable") { + QualType FnPtrTy = Context.getPointerType(FD->getType()); + Expr *R = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get(); + return CallExpr::Create( + Context, R, ArgExprs, Context.AMDGPUFeaturePredicateTy, + ExprValueKind::VK_PRValue, RParenLoc, FPOptionsOverride()); + } + } + if (CheckArgsForPlaceholders(ArgExprs)) return ExprError(); @@ -6688,6 +6705,13 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, FunctionDecl *FDecl = dyn_cast<FunctionDecl>(NDecl); if (FDecl && FDecl->getBuiltinID()) { + if (Context.BuiltinInfo.isTSBuiltin(FDecl->getBuiltinID())) { + const llvm::Triple &Triple = Context.getTargetInfo().getTriple(); + if (Triple.isSPIRV() && Triple.getVendor() == llvm::Triple::AMD) + AMDGPU().AddPotentiallyUnguardedBuiltinUser(cast<FunctionDecl>( + getFunctionLevelDeclContext(/*AllowLambda=*/true))); + } + // Rewrite the function decl for this builtin by replacing parameters // with no explicit address space with the address space of the arguments // in ArgExprs. @@ -15817,6 +15841,10 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc, // Vector logical not returns the signed variant of the operand type. resultType = GetSignedVectorType(resultType); break; + } else if (resultType == Context.AMDGPUFeaturePredicateTy) { + resultType = Context.getLogicalOperationType(); + Input = AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast<CallExpr>(InputExpr)); + break; } else { return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr) << resultType << Input.get()->getSourceRange()); @@ -20622,6 +20650,9 @@ ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E, E = result.get(); if (!E->isTypeDependent()) { + if (E->getType() == Context.AMDGPUFeaturePredicateTy) + return AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast_or_null<CallExpr>(E)); + if (getLangOpts().CPlusPlus) return CheckCXXBooleanCondition(E, IsConstexpr); // C++ 6.4p4 diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 95746b35f71ef..754f97e5bc6a5 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -9114,6 +9114,15 @@ bool InitializationSequence::Diagnose(Sema &S, case FK_ConversionFailed: { QualType FromType = OnlyArg->getType(); + // __amdgpu_feature_predicate_t can be explicitly cast to the logical op + // type, although this is almost always an error and we advise against it. 
+ if (FromType == S.Context.AMDGPUFeaturePredicateTy && + DestType == S.Context.getLogicalOperationType()) { + S.Diag(OnlyArg->getExprLoc(), + diag::err_amdgcn_predicate_type_needs_explicit_bool_cast) + << OnlyArg << DestType; + break; + } PartialDiagnostic PDiag = S.PDiag(diag::err_init_conversion_failed) << (int)Entity.getKind() << DestType @@ -9917,6 +9926,14 @@ Sema::PerformCopyInitialization(const InitializedEntity &Entity, if (EqualLoc.isInvalid()) EqualLoc = InitE->getBeginLoc(); + if (Entity.getType().getDesugaredType(Context) == + Context.AMDGPUFeaturePredicateTy && + Entity.getDecl()) { + Diag(EqualLoc, diag::err_amdgcn_predicate_type_is_not_constructible) + << Entity.getDecl(); + return ExprError(); + } + InitializationKind Kind = InitializationKind::CreateCopy( InitE->getBeginLoc(), EqualLoc, AllowExplicit); InitializationSequence Seq(*this, Entity, Kind, InitE, TopLevelOfInitList); diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 1b54628c5e564..ba7c7e1bc2dbf 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -30,6 +30,7 @@ #include "clang/Sema/Initialization.h" #include "clang/Sema/Lookup.h" #include "clang/Sema/Overload.h" +#include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaARM.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaObjC.h" @@ -6177,12 +6178,13 @@ TryContextuallyConvertToBool(Sema &S, Expr *From) { ExprResult Sema::PerformContextuallyConvertToBool(Expr *From) { if (checkPlaceholderForOverload(*this, From)) return ExprError(); + if (From->getType() == Context.AMDGPUFeaturePredicateTy) + return AMDGPU().ExpandAMDGPUPredicateBI(dyn_cast<CallExpr>(From)); ImplicitConversionSequence ICS = TryContextuallyConvertToBool(*this, From); if (!ICS.isBad()) return PerformImplicitConversion(From, Context.BoolTy, ICS, AssignmentAction::Converting); - if (!DiagnoseMultipleUserDefinedConversion(From, Context.BoolTy)) return Diag(From->getBeginLoc(), diag::err_typecheck_bool_condition) << From->getType() << From->getSourceRange(); @@ -12041,6 +12043,16 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand, if (TakingCandidateAddress && !checkAddressOfCandidateIsAvailable(S, Fn)) return; + // __amdgpu_feature_predicate_t can be explicitly cast to the logical op type, + // although this is almost always an error and we advise against it. + if (FromTy == S.Context.AMDGPUFeaturePredicateTy && + ToTy == S.Context.getLogicalOperationType()) { + S.Diag(Conv.Bad.FromExpr->getExprLoc(), + diag::err_amdgcn_predicate_type_needs_explicit_bool_cast) + << Conv.Bad.FromExpr << ToTy; + return; + } + // Emit the generic diagnostic and, optionally, add the hints to it. 
PartialDiagnostic FDiag = S.PDiag(diag::note_ovl_candidate_bad_conv); FDiag << (unsigned)FnKindPair.first << (unsigned)FnKindPair.second << FnDesc diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c new file mode 100644 index 0000000000000..12f283707308e --- /dev/null +++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c @@ -0,0 +1,64 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5 +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX900 %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1010 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX1010 %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCNSPIRV %s + +// Test that, depending on triple and, if applicable, target-cpu, one of three +// things happens: +// 1) for gfx900 we emit an empty kernel (concrete target, lacks feature) +// 2) for gfx1010 we emit a call to trap (concrete target, has feature) +// 3) for AMDGCNSPIRV we emit llvm.amdgcn.has.gfx10-insts as a constant +// externally initialised bool global, and load from it to provide the +// condition to a br (abstract target) + +//. +// AMDGCNSPIRV: @llvm.amdgcn.has.gfx10-insts = external addrspace(1) externally_initialized constant i1 +//. +// AMDGCN-GFX900-LABEL: define dso_local void @foo( +// AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX900-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX900-NEXT: ret void +// +// AMDGCN-GFX1010-LABEL: define dso_local void @foo( +// AMDGCN-GFX1010-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX1010-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX1010-NEXT: call void @llvm.trap() +// AMDGCN-GFX1010-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: define spir_func void @foo( +// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1 +// AMDGCNSPIRV-NEXT: [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// AMDGCNSPIRV: [[IF_THEN]]: +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.trap() +// AMDGCNSPIRV-NEXT: br label %[[IF_END]] +// AMDGCNSPIRV: [[IF_END]]: +// AMDGCNSPIRV-NEXT: ret void +// +void foo() { + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16)) + return __builtin_trap(); +} +//. +// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +//. +// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" } +// AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. 
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" } +// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. +// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX900: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// AMDGCN-GFX1010: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX1010: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX1010: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCNSPIRV: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCNSPIRV: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. diff --git a/clang/test/CodeGen/amdgpu-builtin-processor-is.c b/clang/test/CodeGen/amdgpu-builtin-processor-is.c new file mode 100644 index 0000000000000..76dead8ebbe89 --- /dev/null +++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c @@ -0,0 +1,62 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5 +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX900 %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1010 -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCN-GFX1010 %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - | FileCheck --check-prefix=AMDGCNSPIRV %s + +// Test that, depending on triple and, if applicable, target-cpu, one of three +// things happens: +// 1) for gfx900 we emit a call to trap (concrete target, matches) +// 2) for gfx1010 we emit an empty kernel (concrete target, does not match) +// 3) for AMDGCNSPIRV we emit llvm.amdgcn.is.gfx900 as a bool global, and +// load from it to provide the condition to a br (abstract target) +//. +// AMDGCNSPIRV: @llvm.amdgcn.is.gfx900 = external addrspace(1) externally_initialized constant i1 +//. 
+// AMDGCN-GFX900-LABEL: define dso_local void @foo( +// AMDGCN-GFX900-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX900-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX900-NEXT: call void @llvm.trap() +// AMDGCN-GFX900-NEXT: ret void +// +// AMDGCN-GFX1010-LABEL: define dso_local void @foo( +// AMDGCN-GFX1010-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-GFX1010-NEXT: [[ENTRY:.*:]] +// AMDGCN-GFX1010-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: define spir_func void @foo( +// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx900, align 1 +// AMDGCNSPIRV-NEXT: [[TOBOOL:%.*]] = icmp ne i1 [[TMP0]], false +// AMDGCNSPIRV-NEXT: br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// AMDGCNSPIRV: [[IF_THEN]]: +// AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.trap() +// AMDGCNSPIRV-NEXT: br label %[[IF_END]] +// AMDGCNSPIRV: [[IF_END]]: +// AMDGCNSPIRV-NEXT: ret void +// +void foo() { + if (__builtin_amdgcn_processor_is("gfx900")) + return __builtin_trap(); +} +//. +// AMDGCN-GFX900: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +// AMDGCN-GFX900: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. +// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" } +//. +// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64" } +// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) } +//. +// AMDGCN-GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX900: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// AMDGCN-GFX1010: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCN-GFX1010: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCN-GFX1010: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. 
+// AMDGCNSPIRV: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// AMDGCNSPIRV: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// AMDGCNSPIRV: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. diff --git a/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp new file mode 100644 index 0000000000000..78f18d3a37b46 --- /dev/null +++ b/clang/test/CodeGen/amdgpu-feature-builtins-invalid-use.cpp @@ -0,0 +1,48 @@ +// RUN: not %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm %s -o - 2>&1 | FileCheck %s +// RUN: not %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm %s -o - 2>&1 | FileCheck %s + +bool predicate(bool x); +void pass_by_value(__amdgpu_feature_predicate_t x); + +void invalid_uses(int *p, int x, const __amdgpu_feature_predicate_t &lv, + __amdgpu_feature_predicate_t &&rv) { + // CHECK: error: 'a' has type __amdgpu_feature_predicate_t, which is not constructible + __amdgpu_feature_predicate_t a; + // CHECK: error: 'b' has type __amdgpu_feature_predicate_t, which is not constructible + __amdgpu_feature_predicate_t b = __builtin_amdgcn_processor_is("gfx906"); + // CHECK: error: 'c' has type __amdgpu_feature_predicate_t, which is not constructible + __amdgpu_feature_predicate_t c = lv; + // CHECK: error: 'd' has type __amdgpu_feature_predicate_t, which is not constructible + __amdgpu_feature_predicate_t d = rv; + // CHECK: error: '__builtin_amdgcn_processor_is("gfx906")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided + bool invalid_use_in_init_0 = __builtin_amdgcn_processor_is("gfx906"); + // CHECK: error: 'x' has type __amdgpu_feature_predicate_t, which is not constructible + pass_by_value(__builtin_amdgcn_processor_is("gfx906")); + // CHECK: error: '__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided + bool invalid_use_in_init_1 = __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var); + // CHECK: error: '__builtin_amdgcn_processor_is("gfx906")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided + if (bool invalid_use_in_init_2 = __builtin_amdgcn_processor_is("gfx906")) return; + // CHECK: error: '__builtin_amdgcn_processor_is("gfx1200")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided + if (predicate(__builtin_amdgcn_processor_is("gfx1200"))) __builtin_amdgcn_s_sleep_var(x); +} + +void invalid_invocations(int x, const char* str) { + // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a valid AMDGCN processor identifier; 'not_an_amdgcn_gfx_id' is not valid + // CHECK-DAG: note: valid AMDGCN processor identifiers are: {{.*}}gfx{{.*}} + if (__builtin_amdgcn_processor_is("not_an_amdgcn_gfx_id")) return; + // CHECK: error: the argument to __builtin_amdgcn_processor_is must be a string literal + if (__builtin_amdgcn_processor_is(str)) return; + // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin 
or an AMDGCN target specific builtin; {{.*}}__builtin_amdgcn_s_sleep_var{{.*}} is not valid + if (__builtin_amdgcn_is_invocable("__builtin_amdgcn_s_sleep_var")) return; + // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}str{{.*}} is not valid + else if (__builtin_amdgcn_is_invocable(str)) return; + // CHECK: error: the argument to __builtin_amdgcn_is_invocable must be either a target agnostic builtin or an AMDGCN target specific builtin; {{.*}}x{{.*}} is not valid + else if (__builtin_amdgcn_is_invocable(x)) return; + // CHECK: error: use of undeclared identifier '__builtin_ia32_pause' + else if (__builtin_amdgcn_is_invocable(__builtin_ia32_pause)) return; +} + +bool return_needs_cast() { + // CHECK: error: '__builtin_amdgcn_processor_is("gfx900")' must be explicitly cast to 'bool'; however, please note that this is almost always an error and that it prevents the effective guarding of target dependent code, and thus should be avoided + return __builtin_amdgcn_processor_is("gfx900"); +} diff --git a/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp b/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp index 44557284fc581..cffd3c7a5fb1f 100644 --- a/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp +++ b/clang/test/CodeGenHipStdPar/select-accelerator-code-pass-ordering.cpp @@ -7,7 +7,7 @@ // HIPSTDPAR-PRE: Running pass: EntryExitInstrumenterPass // HIPSTDPAR-PRE-NEXT: Running pass: EntryExitInstrumenterPass // HIPSTDPAR-PRE-NOT: Running pass: HipStdParAcceleratorCodeSelectionPass -// HIPSTDPAR-PRE-NEXT: Running pass: AlwaysInlinerPass +// HIPSTDPAR-PRE-NEXT: Running pass: AMDGPUExpandFeaturePredicatesPass // Ensure Pass HipStdParAcceleratorCodeSelectionPass is invoked in PostLink. 
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -mllvm -amdgpu-enable-hipstdpar -fcuda-is-device -fdebug-pass-manager -emit-llvm \ diff --git a/clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip b/clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip new file mode 100644 index 0000000000000..a7f1abcdcd8fe --- /dev/null +++ b/clang/test/SemaHIP/amdgpu-feature-builtins-cant-jump.hip @@ -0,0 +1,62 @@ +// REQUIRES: amdgpu-registered-target +// REQUIRES: spirv-registered-target +// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -target-cpu gfx900 -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -target-cpu gfx1201 -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s + +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) + +__device__ void f(int *ptr, int size, bool f) { + int i = 0; + if (f) + goto label; // expected-error {{cannot jump from this goto statement to its label}} + + if (__builtin_amdgcn_processor_is("gfx900")) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}} + for (i = 0; i < size; ++i) { + label: + ptr[i] = i; + } + } +} + +__device__ void g(int *ptr, int size, bool f) { + int i = 0; + if (f) + goto label; // expected-error {{cannot jump from this goto statement to its label}} + + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}} + for (i = 0; i < size; ++i) { + label: + ptr[i] = i; + } + } +} + +__global__ void h(int *ptr, int size, bool f) { + int i = 0; + if (f) + goto label; // expected-error {{cannot jump from this goto statement to its label}} + + if (__builtin_amdgcn_processor_is("gfx900")) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}} + for (i = 0; i < size; ++i) { + label: + ptr[i] = i; + } + } +} + +__global__ void i(int *ptr, int size, bool f) { + int i = 0; + if (f) + goto label; // expected-error {{cannot jump from this goto statement to its label}} + + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) { // expected-note {{jump enters statement controlled by AMDGPU feature predicate}} + for (i = 0; i < size; ++i) { + label: + ptr[i] = i; + } + } +} diff --git a/clang/test/SemaHIP/amdgpu-feature-builtins-return-type-deduction.hip b/clang/test/SemaHIP/amdgpu-feature-builtins-return-type-deduction.hip new file mode 100644 index 0000000000000..27bbb3f2f3d07 --- /dev/null +++ b/clang/test/SemaHIP/amdgpu-feature-builtins-return-type-deduction.hip @@ -0,0 +1,31 @@ +// REQUIRES: amdgpu-registered-target +// REQUIRES: spirv-registered-target +// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx900 -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace +// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1201 -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace +// RUN: %clang_cc1 -triple x86_64 -aux-triple amdgcn -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace +// RUN: %clang_cc1 -triple x86_64 -aux-triple spirv64-amd-amdhsa -ast-dump -ast-dump-decl-types %s | FileCheck %s --strict-whitespace + 
+__attribute__((device)) auto foo() { + return __builtin_amdgcn_processor_is("gfx900"); +} + +__attribute__((device)) decltype(auto) bar() { + return __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep); +} + +// CHECK: |-TypedefDecl {{.*}} implicit __amdgpu_feature_predicate_t '__amdgpu_feature_predicate_t' +// CHECK-NEXT: | `-BuiltinType {{.*}} '__amdgpu_feature_predicate_t' +// CHECK-DAG: |-FunctionDecl {{.*}} foo '__amdgpu_feature_predicate_t ()' +// CHECK-NEXT: |-CompoundStmt {{.*}} +// CHECK-NEXT: | `-ReturnStmt {{.*}} +// CHECK-NEXT: | `-CallExpr {{.*}} '__amdgpu_feature_predicate_t' +// CHECK-NEXT: | |-ImplicitCastExpr {{.*}} '__amdgpu_feature_predicate_t (*)(const char *) noexcept' +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} Function {{.*}} '__builtin_amdgcn_processor_is' '__amdgpu_feature_predicate_t (const char *) noexcept' +// CHECK-NEXT: | `-StringLiteral {{.*}} "gfx900" +// CHECK-DAG: |-FunctionDecl {{.*}} bar '__amdgpu_feature_predicate_t ()' +// CHECK-NEXT: |-CompoundStmt {{.*}} +// CHECK-NEXT: | `-ReturnStmt {{.*}} +// CHECK-NEXT: | `-CallExpr {{.*}} '__amdgpu_feature_predicate_t' +// CHECK-NEXT: | |-ImplicitCastExpr {{.*}} '__amdgpu_feature_predicate_t (*)() noexcept' +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} Function {{.*}} '__builtin_amdgcn_is_invocable' '__amdgpu_feature_predicate_t () noexcept' diff --git a/clang/test/SemaHIP/amdgpu-is-invocable-guards-builtin-use.hip b/clang/test/SemaHIP/amdgpu-is-invocable-guards-builtin-use.hip new file mode 100644 index 0000000000000..26544590f7536 --- /dev/null +++ b/clang/test/SemaHIP/amdgpu-is-invocable-guards-builtin-use.hip @@ -0,0 +1,47 @@ +// REQUIRES: amdgpu-registered-target +// REQUIRES: spirv-registered-target +// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wamdgpu-unguarded-builtin-usage %s + +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) + +__device__ void g(); + +__device__ void f(int x, bool b) { + const auto lambda = [=] __device__ () { + __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}} + // expected-note@-1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}} + + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) + __builtin_amdgcn_s_sleep_var(x); + }; + + const auto generic_lambda = [] __device__ (auto&& y) { + __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}} + // expected-note@-1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}} + + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) + __builtin_amdgcn_s_sleep_var(y); + }; + + __builtin_amdgcn_s_sleep(42); // expected-warning {{'__builtin_amdgcn_s_sleep' might be unavailable on some AMDGPU targets}} + // expected-note@-1 {{enclose '__builtin_amdgcn_s_sleep' in a __builtin_amdgcn_is_invocable check to silence this warning}} + + // processor_is does not (yet) guard + if (__builtin_amdgcn_processor_is("gfx900")) + __builtin_amdgcn_s_sleep_var(x); // expected-warning {{'__builtin_amdgcn_s_sleep_var' might be unavailable on some AMDGPU targets}} + // expected-note@-1 {{enclose '__builtin_amdgcn_s_sleep_var' in a __builtin_amdgcn_is_invocable check to silence this warning}} + + // Direct guard + if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep)) + __builtin_amdgcn_s_sleep(42); + + // Guarded scope + if 
(__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) {
+    if (b) {
+      g();
+      while (--x > 42)
+        __builtin_amdgcn_s_sleep_var(x);
+    }
+  }
+}
diff --git a/clang/test/SemaHIP/amdgpu-is-invocable.hip b/clang/test/SemaHIP/amdgpu-is-invocable.hip
new file mode 100644
index 0000000000000..214d7769a595f
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-is-invocable.hip
@@ -0,0 +1,21 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s
+
+// expected-no-diagnostics
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ void foo() {
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16))
+    return __builtin_trap();
+}
+
+__global__ void bar() {
+  if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_permlanex16))
+    return __builtin_trap();
+}
diff --git a/clang/test/SemaHIP/amdgpu-processor-is.hip b/clang/test/SemaHIP/amdgpu-processor-is.hip
new file mode 100644
index 0000000000000..0f7211fd75d90
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-processor-is.hip
@@ -0,0 +1,21 @@
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-amd-amdhsa -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple spirv64-amd-amdhsa -Wno-unused-value %s
+
+// expected-no-diagnostics
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__device__ void foo() {
+  if (__builtin_amdgcn_processor_is("gfx900"))
+    return __builtin_trap();
+}
+
+__global__ void bar() {
+  if (__builtin_amdgcn_processor_is("gfx900"))
+    return __builtin_trap();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 23f106a9c1d4d..46563bc2fb495 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -431,6 +431,15 @@ extern char &AMDGPUPrintfRuntimeBindingID;
 void initializeAMDGPUResourceUsageAnalysisWrapperPassPass(PassRegistry &);
 extern char &AMDGPUResourceUsageAnalysisID;
 
+struct AMDGPUExpandFeaturePredicatesPass
+    : PassInfoMixin<AMDGPUExpandFeaturePredicatesPass> {
+  const AMDGPUTargetMachine &TM;
+  AMDGPUExpandFeaturePredicatesPass(const AMDGPUTargetMachine &ATM) : TM(ATM) {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+  static bool isRequired() { return true; }
+};
+
 struct AMDGPUPrintfRuntimeBindingPass
     : PassInfoMixin<AMDGPUPrintfRuntimeBindingPass> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
new file mode 100644
index 0000000000000..fc0d3e378044d
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandFeaturePredicates.cpp
@@ -0,0 +1,194 @@
+//===- AMDGPUExpandFeaturePredicates.cpp - Feature Predicate Expander Pass ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file implements a pass that expands AMDGCN generic feature predicates
+// into target specific quantities / sequences. In this context, a generic
+// feature predicate is an implementation detail global variable that is
+// inserted by the FE as a consequence of using either the
+// __builtin_amdgcn_processor_is or the __builtin_amdgcn_is_invocable special
+// builtins on an abstract target (AMDGCNSPIRV). These placeholder globals are
+// used to guide target specific lowering, once the concrete target is known,
+// by way of constant folding their value all the way into a terminator (i.e. a
+// controlled block) or into a no-live-use scenario. We hard fail if the
+// folding fails, to avoid obtuse BE errors or opaque run time errors. This
+// pass should run as early as possible / immediately after Clang CodeGen, so
+// that the optimisation pipeline and the BE operate with concrete target data.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#include <cassert>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+
+namespace {
+template <typename C> void collectUsers(Value *V, C &Container) {
+  assert(V && "Must pass an existing Value!");
+
+  for (auto &&U : V->users())
+    if (auto *I = dyn_cast<Instruction>(U))
+      Container.insert(Container.end(), I);
+}
+
+// Resolve a predicate global against the concrete subtarget: "llvm.amdgcn.is"
+// globals are compared against the processor name, whilst "llvm.amdgcn.has"
+// globals are rewritten into a '+'-prefixed feature string and checked as
+// subtarget features.
+inline void setPredicate(const GCNSubtarget &ST, GlobalVariable *P) {
+  const bool IsFeature = P->getName().starts_with("llvm.amdgcn.has");
+  // sizeof counts the terminating NUL, so this also skips the '.' separator.
+  const size_t Offset =
+      IsFeature ? sizeof("llvm.amdgcn.has") : sizeof("llvm.amdgcn.is");
+
+  std::string PV = P->getName().substr(Offset).str();
+  if (IsFeature) {
+    size_t Dx = PV.find(',');
+    while (Dx != std::string::npos) {
+      PV.insert(++Dx, {'+'});
+
+      Dx = PV.find(',', Dx);
+    }
+    PV.insert(PV.cbegin(), '+');
+  }
+
+  Type *PTy = P->getValueType();
+  P->setLinkage(GlobalValue::PrivateLinkage);
+  P->setExternallyInitialized(false);
+
+  if (IsFeature)
+    P->setInitializer(ConstantInt::getBool(PTy, ST.checkFeatures(PV)));
+  else
+    P->setInitializer(ConstantInt::getBool(PTy, PV == ST.getCPU()));
+}
+
+std::pair<PreservedAnalyses, bool>
+unfoldableFound(Function *Caller, GlobalVariable *P, Instruction *NoFold) {
+  std::string W;
+  raw_string_ostream OS(W);
+
+  OS << "Impossible to constant fold feature predicate: " << *P << " used by "
+     << *NoFold << ", please simplify.\n";
+
+  Caller->getContext().diagnose(
+      DiagnosticInfoUnsupported(*Caller, W, NoFold->getDebugLoc(), DS_Error));
+
+  return {PreservedAnalyses::none(), false};
+}
+
+std::pair<PreservedAnalyses, bool>
+handlePredicate(const GCNSubtarget &ST, FunctionAnalysisManager &FAM,
+                SmallPtrSet<Function *, 32> &Predicated, GlobalVariable *P) {
+  setPredicate(ST, P);
+
+  SmallPtrSet<Instruction *, 32> ToFold;
+  collectUsers(P, ToFold);
+
+  if (ToFold.empty())
+    return {PreservedAnalyses::all(), true};
+
+  do {
+    Instruction *I = *ToFold.begin();
+    ToFold.erase(I);
+
+    I->dropDroppableUses();
+
+    Function *F = I->getParent()->getParent();
+    auto &DT = FAM.getResult<DominatorTreeAnalysis>(*F);
+    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+
+    if (auto *C = ConstantFoldInstruction(I, P->getDataLayout())) {
+      collectUsers(I, ToFold);
+      I->replaceAllUsesWith(C);
+      I->eraseFromParent();
+      continue;
+    } else if (I->isTerminator() &&
+               ConstantFoldTerminator(I->getParent(), true, nullptr, &DTU)) {
+      Predicated.insert(F);
+
+      continue;
+    }
+
+    return unfoldableFound(I->getParent()->getParent(), P, I);
+  } while (!ToFold.empty());
+
+  return {PreservedAnalyses::none(), true};
+}
+} // Unnamed namespace.
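+
+// An illustrative sketch of the expansion (not part of the pass), using the
+// processor predicate as it appears in the CodeGen tests below: given
+//
+//   @llvm.amdgcn.is.gfx906 = external addrspace(1) externally_initialized constant i1
+//   ...
+//   %0 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx906
+//   br i1 %0, label %if.gfx906, label %if.end
+//
+// compiling for gfx906 turns the global into a private constant initialised
+// to true, the load folds to true, and ConstantFoldTerminator turns the
+// conditional branch into an unconditional one to %if.gfx906; for any other
+// processor the branch goes to %if.end instead, and %if.gfx906 becomes
+// unreachable.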
+
+// Collect the internal / private functions that currently have undroppable
+// uses; after predicate expansion some of them might lose all such uses and
+// become removable.
+static inline SmallVector<Function *> collectUsedFunctions(Module &M) {
+  SmallVector<Function *> Ret;
+  for (auto &&F : M) {
+    if (F.isIntrinsic() || F.isDeclaration())
+      continue;
+    if (!F.hasInternalLinkage() && !F.hasPrivateLinkage())
+      continue;
+    if (F.hasNUndroppableUses(0))
+      continue;
+    Ret.push_back(&F);
+  }
+
+  return Ret;
+}
+
+// Erase the (now constant) predicate globals, clean up blocks made
+// unreachable by folded terminators, and delete functions whose last
+// undroppable use went away with them.
+template <typename Container0, typename Container1, typename Container2>
+static inline void removeUnreachable(const Container0 &Predicates,
+                                     const Container1 &PredicatedFns,
+                                     const Container2 &UnreachableFns) {
+  for_each(Predicates, [](auto &&P) { P->eraseFromParent(); });
+  for_each(PredicatedFns, [](auto &&F) { removeUnreachableBlocks(*F); });
+  for_each(UnreachableFns, [](auto &&F) {
+    if (!F->hasNUndroppableUses(0))
+      return;
+    F->dropDroppableUses();
+    F->eraseFromParent();
+  });
+}
+
+PreservedAnalyses
+AMDGPUExpandFeaturePredicatesPass::run(Module &M, ModuleAnalysisManager &MAM) {
+  if (M.empty())
+    return PreservedAnalyses::all();
+
+  SmallVector<GlobalVariable *> Predicates;
+  for (auto &&G : M.globals()) {
+    if (!G.isDeclaration() || !G.hasName())
+      continue;
+    if (G.getName().starts_with("llvm.amdgcn."))
+      Predicates.push_back(&G);
+  }
+
+  if (Predicates.empty())
+    return PreservedAnalyses::all();
+
+  const auto &ST = TM.getSubtarget<GCNSubtarget>(
+      *find_if(M, [](auto &&F) { return !F.isIntrinsic(); }));
+
+  auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  SmallPtrSet<Function *, 32> Predicated;
+  SmallVector<Function *> MaybeUnreachable = collectUsedFunctions(M);
+  auto Ret = PreservedAnalyses::all();
+  for (auto &&P : Predicates) {
+    auto R = handlePredicate(ST, FAM, Predicated, P);
+
+    if (!R.second)
+      break;
+
+    Ret.intersect(R.first);
+  }
+
+  removeUnreachable(Predicates, Predicated, MaybeUnreachable);
+
+  return Ret;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 5d298304c27f5..9e962b35308de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -16,6 +16,8 @@
 #ifndef MODULE_PASS
 #define MODULE_PASS(NAME, CREATE_PASS)
 #endif
+MODULE_PASS("amdgpu-expand-feature-predicates",
+            AMDGPUExpandFeaturePredicatesPass(*this))
 MODULE_PASS("amdgpu-always-inline", AMDGPUAlwaysInlinePass())
 MODULE_PASS("amdgpu-export-kernel-runtime-handles",
             AMDGPUExportKernelRuntimeHandlesPass())
 MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f4dc4a483181c..f92f1518ae25e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -811,8 +811,18 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 #define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
 #include "llvm/Passes/TargetPassRegistry.inc"
 
+  PB.registerPipelineStartEPCallback(
+      [this](ModulePassManager &PM, OptimizationLevel Level) {
+        PM.addPass(AMDGPUExpandFeaturePredicatesPass(*this));
+      });
+
+  PB.registerFullLinkTimeOptimizationEarlyEPCallback(
+      [this](ModulePassManager &PM, OptimizationLevel) {
+        PM.addPass(AMDGPUExpandFeaturePredicatesPass(*this));
+      });
+
   PB.registerScalarOptimizerLateEPCallback(
-      [](FunctionPassManager &FPM, OptimizationLevel Level) {
+      [this](FunctionPassManager &FPM, OptimizationLevel Level) {
         if (Level == OptimizationLevel::O0)
           return;
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index e3519f192137c..26b6e4f18ad30 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -53,6 +53,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUCodeGenPrepare.cpp AMDGPUCombinerHelper.cpp AMDGPUCtorDtorLowering.cpp + AMDGPUExpandFeaturePredicates.cpp AMDGPUExportClustering.cpp AMDGPUExportKernelRuntimeHandles.cpp AMDGPUFrameLowering.cpp diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-remove-unreachable-functions.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-remove-unreachable-functions.ll new file mode 100644 index 0000000000000..c5089de333849 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-remove-unreachable-functions.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --version 5 +; REQUIRES: amdgpu-registered-target + +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201 %s + +@llvm.amdgcn.is.gfx906 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.is.gfx1201 = external addrspace(1) externally_initialized constant i1 + +define external void @extern_linkage() { +; GFX906-LABEL: define void @extern_linkage( +; GFX906-SAME: ) #[[ATTR0:[0-9]+]] { +; GFX906-NEXT: [[ENTRY:.*:]] +; GFX906-NEXT: ret void +; +; GFX1201-LABEL: define void @extern_linkage( +; GFX1201-SAME: ) #[[ATTR0:[0-9]+]] { +; GFX1201-NEXT: [[ENTRY:.*:]] +; GFX1201-NEXT: ret void +; +entry: + ret void +} + +define private void @non_predicated_uses() { +; GFX906-LABEL: define private void @non_predicated_uses( +; GFX906-SAME: ) #[[ATTR0]] { +; GFX906-NEXT: [[ENTRY:.*:]] +; GFX906-NEXT: ret void +; +; GFX1201-LABEL: define private void @non_predicated_uses( +; GFX1201-SAME: ) #[[ATTR0]] { +; GFX1201-NEXT: [[ENTRY:.*:]] +; GFX1201-NEXT: ret void +; +entry: + ret void +} + +define internal void @remove_on_906() { +; GFX1201-LABEL: define internal void @remove_on_906( +; GFX1201-SAME: ) #[[ATTR0]] { +; GFX1201-NEXT: [[ENTRY:.*:]] +; GFX1201-NEXT: ret void +; +entry: + ret void +} + +define internal void @remove_on_1201() { +; GFX906-LABEL: define internal void @remove_on_1201( +; GFX906-SAME: ) #[[ATTR0]] { +; GFX906-NEXT: [[ENTRY:.*:]] +; GFX906-NEXT: ret void +; +entry: + ret void +} + +define void @foo() { +; GFX906-LABEL: define void @foo( +; GFX906-SAME: ) #[[ATTR0]] { +; GFX906-NEXT: [[ENTRY:.*:]] +; GFX906-NEXT: call void @non_predicated_uses() +; GFX906-NEXT: br label %[[NOT_GFX1201:.*]] +; GFX906: [[NOT_GFX1201]]: +; GFX906-NEXT: br label %[[GFX906:.*]] +; GFX906: [[GFX906]]: +; GFX906-NEXT: call void @remove_on_1201() +; GFX906-NEXT: br label %[[END:.*]] +; GFX906: [[END]]: +; GFX906-NEXT: ret void +; +; GFX1201-LABEL: define void @foo( +; GFX1201-SAME: ) #[[ATTR0]] { +; GFX1201-NEXT: [[ENTRY:.*:]] +; GFX1201-NEXT: call void @non_predicated_uses() +; GFX1201-NEXT: br label %[[GFX1201:.*]] +; GFX1201: [[GFX1201]]: +; GFX1201-NEXT: call void @remove_on_906() +; GFX1201-NEXT: br label %[[END:.*]] +; GFX1201: [[END]]: +; GFX1201-NEXT: ret void +; +entry: + call void @non_predicated_uses() + %0 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1201, align 1 + br i1 %0, label %gfx1201, label %not.gfx1201 + +gfx1201: + call void @remove_on_906() + br label %end + +not.gfx1201: + %1 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx906, align 1 + br i1 %1, label %gfx906, label %end + +gfx906: + call void @remove_on_1201() + br label %end + +end: + ret void +} diff --git 
a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll new file mode 100644 index 0000000000000..bfc35d8c76e37 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates-unfoldable.ll @@ -0,0 +1,28 @@ +; REQUIRES: amdgpu-registered-target + +; RUN: not opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' < %s 2>&1 | FileCheck %s + +; CHECK: error:{{.*}}in function kernel void (ptr addrspace(1), i32, ptr addrspace(1)): Impossible to constant fold feature predicate: @llvm.amdgcn.is.gfx803 = private addrspace(1) constant i1 false used by %call = call i1 %1(i1 zeroext false), please simplify. + +@llvm.amdgcn.is.gfx803 = external addrspace(1) externally_initialized constant i1 + +declare void @llvm.amdgcn.s.sleep(i32 immarg) #1 + +define amdgpu_kernel void @kernel(ptr addrspace(1) readnone captures(none) %p.coerce, i32 %x, ptr addrspace(1) %pfn.coerce) { +entry: + %0 = ptrtoint ptr addrspace(1) %pfn.coerce to i64 + %1 = inttoptr i64 %0 to ptr + %2 = ptrtoint ptr addrspace(1) %pfn.coerce to i64 + %3 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx803, align 1 + %call = call i1 %1(i1 zeroext %3) + br i1 %call, label %if.gfx803, label %if.end + +if.gfx803: + call void @llvm.amdgcn.s.sleep(i32 0) + br label %if.end + +if.end: + ret void +} + +attributes #1 = { nocallback nofree nosync nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll new file mode 100644 index 0000000000000..a16a7fc31da22 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-expand-feature-predicates.ll @@ -0,0 +1,284 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; REQUIRES: amdgpu-registered-target + +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1010 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1101 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1101 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 -mattr=+wavefrontsize64 -passes='amdgpu-expand-feature-predicates' %s -o - | FileCheck --check-prefix=GFX1201-W64 %s + +;; The IR was derived from the following source: +;; extern "C" __global__ void kernel(int* p, int x) +;; { +;; if (__builtin_amdgcn_processor_is("gfx1201") || +;; __builtin_amdgcn_is_invocable(__builtin_amdgcn_s_sleep_var)) +;; __builtin_amdgcn_s_sleep_var(x); +;; if (!__builtin_amdgcn_processor_is("gfx906")) +;; __builtin_amdgcn_s_wait_event_export_ready(); +;; else if (__builtin_amdgcn_processor_is("gfx1010") || +;; __builtin_amdgcn_processor_is("gfx1101")) +;; __builtin_amdgcn_s_ttracedata_imm(1); +;; while (__builtin_amdgcn_processor_is("gfx1101")) *p += x; +;; do { +;; *p -= x; +;; } while (__builtin_amdgcn_processor_is("gfx1010")); +;; for (; __builtin_amdgcn_processor_is("gfx1201"); ++*p) break; +;; +;; if (__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_wait_event_export_ready)) +;; __builtin_amdgcn_s_wait_event_export_ready(); +;; else if 
(__builtin_amdgcn_is_invocable(__builtin_amdgcn_s_ttracedata_imm)) +;; __builtin_amdgcn_s_ttracedata_imm(1); +;; +;; do { +;; *p -= x; +;; } while (__builtin_amdgcn_is_invocable(__builtin_amdgcn_global_load_tr_b64_i32)); +;; for (; __builtin_amdgcn_is_invocable(__builtin_amdgcn_permlane64); ++*p) break; +;; } + +@llvm.amdgcn.is.gfx1201 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.has.gfx12-insts = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.is.gfx906 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.is.gfx1010 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.is.gfx1101 = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.has.gfx11-insts = external addrspace(1) externally_initialized constant i1 +@llvm.amdgcn.has.gfx10-insts = external addrspace(1) externally_initialized constant i1 +@"llvm.amdgcn.has.gfx12-insts,wavefrontsize64" = external addrspace(1) externally_initialized constant i1 + +declare void @llvm.amdgcn.s.sleep.var(i32) +declare void @llvm.amdgcn.s.wait.event.export.ready() +declare void @llvm.amdgcn.s.ttracedata.imm(i16 immarg) + +define amdgpu_kernel void @kernel(ptr addrspace(1) %p.coerce, i32 %x) { +; GFX906-LABEL: define amdgpu_kernel void @kernel( +; GFX906-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX906-NEXT: [[ENTRY:.*:]] +; GFX906-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX906-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX906-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]] +; GFX906: [[IF_GFX1201_OR_GFX12_INSTS]]: +; GFX906-NEXT: br label %[[IF_NOT_GFX907:.*]] +; GFX906: [[IF_NOT_GFX907]]: +; GFX906-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]] +; GFX906: [[IF_GFX1010_OR_GFX1101]]: +; GFX906-NEXT: br label %[[LOR_NOT_GFX1010:.*]] +; GFX906: [[LOR_NOT_GFX1010]]: +; GFX906-NEXT: br label %[[FOR_COND:.*]] +; GFX906: [[FOR_COND]]: +; GFX906-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX906-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX906-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX906-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX906: [[IF_GFX11_INSTS]]: +; GFX906-NEXT: br label %[[IF_GFX10_INSTS:.*]] +; GFX906: [[IF_GFX10_INSTS]]: +; GFX906-NEXT: call void @llvm.assume(i1 true) +; GFX906-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX906-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX906-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX906-NEXT: ret void +; +; GFX1010-LABEL: define amdgpu_kernel void @kernel( +; GFX1010-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1010-NEXT: [[ENTRY:.*:]] +; GFX1010-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1010-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1010-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]] +; GFX1010: [[IF_GFX1201_OR_GFX12_INSTS]]: +; GFX1010-NEXT: br label %[[IF_NOT_GFX906:.*]] +; GFX1010: [[IF_NOT_GFX906]]: +; GFX1010-NEXT: br label %[[LOR_NOT_GFX1010:.*]] +; GFX1010: [[LOR_NOT_GFX1010]]: +; GFX1010-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1010-NEXT: br label %[[IF_END6:.*]] +; GFX1010: [[IF_END6]]: +; GFX1010-NEXT: call void @llvm.assume(i1 true) +; GFX1010-NEXT: call void @llvm.assume(i1 true) +; GFX1010-NEXT: br label %[[FOR_COND:.*]] +; GFX1010: [[FOR_COND]]: +; GFX1010-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr 
[[TMP1]], align 4 +; GFX1010-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1010-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1010-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1010: [[IF_GFX11_INSTS]]: +; GFX1010-NEXT: br label %[[IF_GFX10_INSTS:.*]] +; GFX1010: [[IF_GFX10_INSTS]]: +; GFX1010-NEXT: call void @llvm.amdgcn.s.ttracedata.imm(i16 1) +; GFX1010-NEXT: br label %[[IF_END11:.*]] +; GFX1010: [[IF_END11]]: +; GFX1010-NEXT: call void @llvm.assume(i1 true) +; GFX1010-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1010-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1010-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1010-NEXT: ret void +; +; GFX1101-LABEL: define amdgpu_kernel void @kernel( +; GFX1101-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1101-NEXT: [[ENTRY:.*:]] +; GFX1101-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1101-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1101-NEXT: br label %[[IF_GFX1201_OR_GFX12_INSTS:.*]] +; GFX1101: [[IF_GFX1201_OR_GFX12_INSTS]]: +; GFX1101-NEXT: br label %[[IF_END:.*]] +; GFX1101: [[IF_END]]: +; GFX1101-NEXT: br label %[[IF_NOT_GFX907:.*]] +; GFX1101: [[IF_NOT_GFX907]]: +; GFX1101-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1101-NEXT: br label %[[IF_NOT_GFX906:.*]] +; GFX1101: [[IF_NOT_GFX906]]: +; GFX1101-NEXT: call void @llvm.assume(i1 true) +; GFX1101-NEXT: call void @llvm.assume(i1 true) +; GFX1101-NEXT: br label %[[FOR_COND:.*]] +; GFX1101: [[FOR_COND]]: +; GFX1101-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1101-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1101-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1101-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1101: [[IF_GFX11_INSTS]]: +; GFX1101-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1101-NEXT: br label %[[IF_ELSE8:.*]] +; GFX1101: [[IF_ELSE8]]: +; GFX1101-NEXT: call void @llvm.assume(i1 true) +; GFX1101-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1101-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1101-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1101-NEXT: ret void +; +; GFX1201-LABEL: define amdgpu_kernel void @kernel( +; GFX1201-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1201-NEXT: [[ENTRY:.*:]] +; GFX1201-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1201-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1201-NEXT: br label %[[LOR_NOT_GFX1201:.*]] +; GFX1201: [[LOR_NOT_GFX1201]]: +; GFX1201-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) +; GFX1201-NEXT: br label %[[IF_NOT_GFX906:.*]] +; GFX1201: [[IF_NOT_GFX906]]: +; GFX1201-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]] +; GFX1201: [[IF_GFX1010_OR_GFX1101]]: +; GFX1201-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-NEXT: br label %[[IF_END6:.*]] +; GFX1201: [[IF_END6]]: +; GFX1201-NEXT: call void @llvm.assume(i1 true) +; GFX1201-NEXT: call void @llvm.assume(i1 true) +; GFX1201-NEXT: br label %[[FOR_COND:.*]] +; GFX1201: [[FOR_COND]]: +; GFX1201-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1201-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1201: [[IF_GFX11_INSTS]]: +; GFX1201-NEXT: 
call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-NEXT: br label %[[IF_ELSE8:.*]] +; GFX1201: [[IF_ELSE8]]: +; GFX1201-NEXT: call void @llvm.assume(i1 true) +; GFX1201-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1201-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-NEXT: ret void +; +; GFX1201-W64-LABEL: define amdgpu_kernel void @kernel( +; GFX1201-W64-SAME: ptr addrspace(1) [[P_COERCE:%.*]], i32 [[X:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX1201-W64-NEXT: [[ENTRY:.*:]] +; GFX1201-W64-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P_COERCE]] to i64 +; GFX1201-W64-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr +; GFX1201-W64-NEXT: br label %[[LOR_NOT_GFX1201:.*]] +; GFX1201-W64: [[LOR_NOT_GFX1201]]: +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.sleep.var(i32 [[X]]) +; GFX1201-W64-NEXT: br label %[[IF_NOT_GFX906:.*]] +; GFX1201-W64: [[IF_NOT_GFX906]]: +; GFX1201-W64-NEXT: br label %[[IF_GFX1010_OR_GFX1101:.*]] +; GFX1201-W64: [[IF_GFX1010_OR_GFX1101]]: +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-W64-NEXT: br label %[[IF_END6:.*]] +; GFX1201-W64: [[IF_END6]]: +; GFX1201-W64-NEXT: call void @llvm.assume(i1 true) +; GFX1201-W64-NEXT: call void @llvm.assume(i1 true) +; GFX1201-W64-NEXT: br label %[[FOR_COND:.*]] +; GFX1201-W64: [[FOR_COND]]: +; GFX1201-W64-NEXT: [[DOTPROMOTED:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: [[SUB_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED]], [[X]] +; GFX1201-W64-NEXT: store i32 [[SUB_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: br label %[[IF_GFX11_INSTS:.*]] +; GFX1201-W64: [[IF_GFX11_INSTS]]: +; GFX1201-W64-NEXT: call void @llvm.amdgcn.s.wait.event.export.ready() +; GFX1201-W64-NEXT: br label %[[IF_ELSE8:.*]] +; GFX1201-W64: [[IF_ELSE8]]: +; GFX1201-W64-NEXT: call void @llvm.assume(i1 true) +; GFX1201-W64-NEXT: [[DOTPROMOTED9:%.*]] = load i32, ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: [[SUB13_PEEL:%.*]] = sub nsw i32 [[DOTPROMOTED9]], [[X]] +; GFX1201-W64-NEXT: store i32 [[SUB13_PEEL]], ptr [[TMP1]], align 4 +; GFX1201-W64-NEXT: ret void +; +entry: + %0 = ptrtoint ptr addrspace(1) %p.coerce to i64 + %1 = inttoptr i64 %0 to ptr + %2 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1201, align 1 + br i1 %2, label %if.gfx1201.or.gfx12-insts, label %lor.not.gfx1201 + +lor.not.gfx1201: + %3 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx12-insts, align 1 + br i1 %3, label %if.gfx1201.or.gfx12-insts, label %if.end + +if.gfx1201.or.gfx12-insts: + call void @llvm.amdgcn.s.sleep.var(i32 %x) + br label %if.end + +if.end: + %4 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx906, align 1 + br i1 %4, label %if.gfx906, label %if.not.gfx906 + +if.not.gfx906: + call void @llvm.amdgcn.s.wait.event.export.ready() + br label %if.end6 + +if.gfx906: + %5 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1010, align 1 + br i1 %5, label %if.gfx1010.or.gfx1101, label %lor.not.gfx1010 + +lor.not.gfx1010: + %6 = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1101, align 1 + br i1 %6, label %if.gfx1010.or.gfx1101, label %for.cond + +if.gfx1010.or.gfx1101: + call void @llvm.amdgcn.s.ttracedata.imm(i16 1) + br label %if.end6 + +if.end6: + %.pr.pr = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1101, align 1 + %7 = icmp ne i1 %.pr.pr, true + call void @llvm.assume(i1 %7) + %.pr6.pr = load i1, ptr addrspace(1) @llvm.amdgcn.is.gfx1010, align 1 + %8 = icmp ne i1 %.pr6.pr, true + call void @llvm.assume(i1 %8) + br label 
%for.cond + +for.cond: + %.promoted = load i32, ptr %1, align 4 + %sub.peel = sub nsw i32 %.promoted, %x + store i32 %sub.peel, ptr %1, align 4 + %9 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx11-insts, align 1 + br i1 %9, label %if.gfx11-insts, label %if.else8 + +if.gfx11-insts: + call void @llvm.amdgcn.s.wait.event.export.ready() + br label %if.end11 + +if.else8: + %10 = load i1, ptr addrspace(1) @llvm.amdgcn.has.gfx10-insts, align 1 + br i1 %10, label %if.gfx10-insts, label %if.end11 + +if.gfx10-insts: + call void @llvm.amdgcn.s.ttracedata.imm(i16 1) + br label %if.end11 + +if.end11: + %.pr7 = load i1, ptr addrspace(1) @"llvm.amdgcn.has.gfx12-insts,wavefrontsize64", align 1 + %11 = icmp ne i1 %.pr7, true + call void @llvm.assume(i1 %11) + %.promoted9 = load i32, ptr %1, align 4 + %sub13.peel = sub nsw i32 %.promoted9, %x + store i32 %sub13.peel, ptr %1, align 4 + ret void +} + +declare void @llvm.assume(i1 noundef) diff --git a/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll b/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll index b1fc76f457ece..93f43b274e28d 100644 --- a/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll +++ b/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll @@ -2,16 +2,22 @@ ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto" -print-pipeline-passes %s -o - | FileCheck %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto" -print-pipeline-passes %s -o - | FileCheck %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto" -print-pipeline-passes %s -o - | FileCheck %s +; RUN: opt -mtriple=amdgcn--amdhsa -S -O0 -print-pipeline-passes %s -o - | FileCheck --check-prefix=O0 %s +; RUN: opt -mtriple=amdgcn--amdhsa -S -O1 -print-pipeline-passes %s -o - | FileCheck %s +; RUN: opt -mtriple=amdgcn--amdhsa -S -O2 -print-pipeline-passes %s -o - | FileCheck %s +; RUN: opt -mtriple=amdgcn--amdhsa -S -O3 -print-pipeline-passes %s -o - | FileCheck %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s ; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto-pre-link" -print-pipeline-passes -amdgpu-internalize-symbols %s -o - | FileCheck --check-prefix=PRE %s - +; CHECK: amdgpu-expand-feature-predicates ; CHECK: amdgpu-attributor +; O0: amdgpu-expand-feature-predicates ; O0-NOT: amdgpu-attributor +; PRE: amdgpu-expand-feature-predicates ; PRE-NOT: internalize ; PRE-NOT: amdgpu-attributor ; PRE-NOT: printfToRuntime