ROCm
diff --git a/‎clang/lib/AST/ByteCode/Interp.cpp
Lines changed: 2 additions & 1 deletion b/‎clang/lib/AST/ByteCode/Interp.cpp
Lines changed: 2 additions & 1 deletion
diff --git a/‎clang/lib/AST/ByteCode/Interp.h
Lines changed: 11 additions & 2 deletions b/‎clang/lib/AST/ByteCode/Interp.h
Lines changed: 11 additions & 2 deletions
diff --git a/‎clang/lib/CodeGen/CGCall.cpp
Lines changed: 0 additions & 16 deletions b/‎clang/lib/CodeGen/CGCall.cpp
Lines changed: 0 additions & 16 deletions
diff --git a/‎clang/lib/Headers/amdgpuintrin.h
Lines changed: 56 additions & 0 deletions b/‎clang/lib/Headers/amdgpuintrin.h
Lines changed: 56 additions & 0 deletions
diff --git a/‎clang/lib/Headers/nvptxintrin.h
Lines changed: 74 additions & 0 deletions b/‎clang/lib/Headers/nvptxintrin.h
Lines changed: 74 additions & 0 deletions
diff --git a/‎clang/test/AST/ByteCode/new-delete.cpp
Lines changed: 14 additions & 0 deletions b/‎clang/test/AST/ByteCode/new-delete.cpp
Lines changed: 14 additions & 0 deletions
diff --git a/‎libc/include/fenv.h.def
Lines changed: 0 additions & 17 deletions b/‎libc/include/fenv.h.def
Lines changed: 0 additions & 17 deletions
diff --git a/‎libc/include/fenv.yaml
Lines changed: 28 additions & 7 deletions b/‎libc/include/fenv.yaml
Lines changed: 28 additions & 7 deletions
diff --git a/‎libc/src/__support/GPU/utils.h
Lines changed: 8 additions & 0 deletions b/‎libc/src/__support/GPU/utils.h
Lines changed: 8 additions & 0 deletions
diff --git a/‎libc/test/integration/src/__support/GPU/CMakeLists.txt
Lines changed: 9 additions & 0 deletions b/‎libc/test/integration/src/__support/GPU/CMakeLists.txt
Lines changed: 9 additions & 0 deletions
@@ -1063,7 +1063,8 @@ bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm,
       return false;
     }
 
-    if (!Ptr.isRoot() || Ptr.isOnePastEnd() || Ptr.isArrayElement()) {
+    if (!Ptr.isRoot() || Ptr.isOnePastEnd() ||
+        (Ptr.isArrayElement() && Ptr.getIndex() != 0)) {
       const SourceInfo &Loc = S.Current->getSource(OpPC);
       S.FFDiag(Loc, diag::note_constexpr_delete_subobject)
           << Ptr.toDiagnosticString(S.getASTContext()) << Ptr.isOnePastEnd();
 
@@ -2915,13 +2915,17 @@ inline bool AllocN(InterpState &S, CodePtr OpPC, PrimType T, const Expr *Source,
     S.Stk.push<Pointer>(0, nullptr);
     return true;
   }
+  assert(NumElements.isPositive());
 
   DynamicAllocator &Allocator = S.getAllocator();
   Block *B =
       Allocator.allocate(Source, T, static_cast<size_t>(NumElements),
                          S.Ctx.getEvalID(), DynamicAllocator::Form::Array);
   assert(B);
-  S.Stk.push<Pointer>(B);
+  if (NumElements.isZero())
+    S.Stk.push<Pointer>(B);
+  else
+    S.Stk.push<Pointer>(Pointer(B).atIndex(0));
   return true;
 }
 
@@ -2941,13 +2945,18 @@ inline bool AllocCN(InterpState &S, CodePtr OpPC, const Descriptor *ElementDesc,
     S.Stk.push<Pointer>(0, ElementDesc);
     return true;
   }
+  assert(NumElements.isPositive());
 
   DynamicAllocator &Allocator = S.getAllocator();
   Block *B =
       Allocator.allocate(ElementDesc, static_cast<size_t>(NumElements),
                          S.Ctx.getEvalID(), DynamicAllocator::Form::Array);
   assert(B);
-  S.Stk.push<Pointer>(B);
+  if (NumElements.isZero())
+    S.Stk.push<Pointer>(B);
+  else
+    S.Stk.push<Pointer>(Pointer(B).atIndex(0));
+
   return true;
 }
 
 
@@ -5640,22 +5640,6 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   if (!CallArgs.getCleanupsToDeactivate().empty())
     deactivateArgCleanupsBeforeCall(*this, CallArgs);
 
-  // Assert that the arguments we computed match up.  The IR verifier
-  // will catch this, but this is a common enough source of problems
-  // during IRGen changes that it's way better for debugging to catch
-  // it ourselves here.
-#ifndef NDEBUG
-  assert(IRCallArgs.size() == IRFuncTy->getNumParams() || IRFuncTy->isVarArg());
-  for (unsigned i = 0; i < IRCallArgs.size(); ++i) {
-    // Inalloca argument can have different type.
-    if (IRFunctionArgs.hasInallocaArg() &&
-        i == IRFunctionArgs.getInallocaArgNo())
-      continue;
-    if (i < IRFuncTy->getNumParams())
-      assert(IRCallArgs[i]->getType() == IRFuncTy->getParamType(i));
-  }
-#endif
-
   // Update the largest vector width if any arguments have vector types.
   for (unsigned i = 0; i < IRCallArgs.size(); ++i)
     LargestVectorWidth = std::max(LargestVectorWidth,
 
@@ -162,6 +162,62 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
          ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
 }
 
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+  uint32_t __match_mask = 0;
+
+  bool __done = 0;
+  while (__gpu_ballot(__lane_mask, !__done)) {
+    if (!__done) {
+      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+      if (__first == __x) {
+        __match_mask = __gpu_lane_mask();
+        __done = 1;
+      }
+    }
+  }
+  __gpu_sync_lane(__lane_mask);
+  return __match_mask;
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+  uint64_t __match_mask = 0;
+
+  bool __done = 0;
+  while (__gpu_ballot(__lane_mask, __done)) {
+    if (!__done) {
+      uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+      if (__first == __x) {
+        __match_mask = __gpu_lane_mask();
+        __done = 1;
+      }
+    }
+  }
+  __gpu_sync_lane(__lane_mask);
+  return __match_mask;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+  uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+  __gpu_sync_lane(__lane_mask);
+  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+  __gpu_sync_lane(__lane_mask);
+  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
 // Returns true if the flat pointer points to AMDGPU 'shared' memory.
 _DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
   return __builtin_amdgcn_is_shared((void [[clang::address_space(0)]] *)((
 
@@ -13,6 +13,10 @@
 #error "This file is intended for NVPTX targets or offloading to NVPTX"
 #endif
 
+#ifndef __CUDA_ARCH__
+#define __CUDA_ARCH__ 0
+#endif
+
 #include <stdint.h>
 
 #if !defined(__cplusplus)
@@ -168,6 +172,76 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
          ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
 }
 
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+  // Newer targets can use the dedicated CUDA support.
+  if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+    return __nvvm_match_any_sync_i32(__lane_mask, __x);
+
+  uint32_t __match_mask = 0;
+  bool __done = 0;
+  while (__gpu_ballot(__lane_mask, !__done)) {
+    if (!__done) {
+      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+      if (__first == __x) {
+        __match_mask = __gpu_lane_mask();
+        __done = 1;
+      }
+    }
+  }
+  return __match_mask;
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+  // Newer targets can use the dedicated CUDA support.
+  if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+    return __nvvm_match_any_sync_i64(__lane_mask, __x);
+
+  uint64_t __match_mask = 0;
+
+  bool __done = 0;
+  while (__gpu_ballot(__lane_mask, __done)) {
+    if (!__done) {
+      uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+      if (__first == __x) {
+        __match_mask = __gpu_lane_mask();
+        __done = 1;
+      }
+    }
+  }
+  __gpu_sync_lane(__lane_mask);
+  return __match_mask;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+  // Newer targets can use the dedicated CUDA support.
+  int predicate;
+  if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+    return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
+
+  uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+  // Newer targets can use the dedicated CUDA support.
+  int predicate;
+  if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+    return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
+
+  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
 // Returns true if the flat pointer points to CUDA 'shared' memory.
 _DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
   return __nvvm_isspacep_shared(ptr);
 
@@ -922,6 +922,20 @@ namespace NonConstexprArrayCtor {
                          // both-note {{in call to}}
 }
 
+namespace ArrayBaseCast {
+  struct A {};
+  struct B : A {};
+  constexpr bool test() {
+    B *b = new B[2];
+
+    A* a = b;
+
+    delete[] b;
+    return true;
+  }
+  static_assert(test());
+}
+
 #else
 /// Make sure we reject this prior to C++20
 constexpr int a() { // both-error {{never produces a constant expression}}
 
@@ -1,11 +1,32 @@
 header: fenv.h
-header_template: fenv.h.def
-macros: []
+standards:
+  - stdc
+macros:
+  - macro_name: FE_ALL_EXCEPT
+    macro_header: fenv-macros.h
+  - macro_name: FE_DIVBYZERO
+    macro_header: fenv-macros.h
+  - macro_name: FE_INEXACT
+    macro_header: fenv-macros.h
+  - macro_name: FE_INVALID
+    macro_header: fenv-macros.h
+  - macro_name: FE_OVERFLOW
+    macro_header: fenv-macros.h
+  - macro_name: FE_UNDERFLOW
+    macro_header: fenv-macros.h
+  - macro_name: FE_DOWNWARD
+    macro_header: fenv-macros.h
+  - macro_name: FE_TONEAREST
+    macro_header: fenv-macros.h
+  - macro_name: FE_TOWARDZERO
+    macro_header: fenv-macros.h
+  - macro_name: FE_UPWARD
+    macro_header: fenv-macros.h
+  - macro_name: FE_DFL_ENV
+    macro_header: fenv-macros.h
 types:
   - type_name: fenv_t
   - type_name: fexcept_t
-enums: []
-objects: []
 functions:
   - name: feclearexcept
     standards:
@@ -15,14 +36,14 @@ functions:
       - type: int
   - name: fedisableexcept
     standards:
-      - GNUExtensions
+      - gnu
     return_type: int
     arguments:
       - type: int
     guard: null
   - name: feenableexcept
     standards:
-      - GNUExtensions
+      - gnu
     return_type: int
     arguments:
       - type: int
@@ -35,7 +56,7 @@ functions:
       - type: fenv_t *
   - name: fegetexcept
     standards:
-      - GNUExtensions
+      - gnu
     return_type: int
     arguments: []
   - name: fegetexceptflag
 
@@ -92,6 +92,14 @@ LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x,
   return __gpu_shuffle_idx_u32(lane_mask, idx, x, width);
 }
 
+LIBC_INLINE uint64_t match_any(uint64_t lane_mask, uint32_t x) {
+  return __gpu_match_any_u32(lane_mask, x);
+}
+
+LIBC_INLINE uint64_t match_all(uint64_t lane_mask, uint32_t x) {
+  return __gpu_match_all_u32(lane_mask, x);
+}
+
 [[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
 
 LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
 
@@ -18,3 +18,12 @@ add_integration_test(
   LOADER_ARGS
     --threads 64
 )
+
+add_integration_test(
+  match_test
+  SUITE libc-support-gpu-tests
+  SRCS
+    match.cpp
+  LOADER_ARGS
+    --threads 64
+)
Original file line number	Diff line number	Diff line change
`@@ -1063,7 +1063,8 @@ bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm,`
`1063`	`1063`	`return false;`
`1064`	`1064`	`}`
`1065`	`1065`
`1066`		`- if (!Ptr.isRoot() \|\| Ptr.isOnePastEnd() \|\| Ptr.isArrayElement()) {`
	`1066`	`+ if (!Ptr.isRoot() \|\| Ptr.isOnePastEnd() \|\|`
	`1067`	`+ (Ptr.isArrayElement() && Ptr.getIndex() != 0)) {`
`1067`	`1068`	`const SourceInfo &Loc = S.Current->getSource(OpPC);`
`1068`	`1069`	`S.FFDiag(Loc, diag::note_constexpr_delete_subobject)`
`1069`	`1070`	`<< Ptr.toDiagnosticString(S.getASTContext()) << Ptr.isOnePastEnd();`