From 8142cf37d0285b879f3560fa42f64dc0eca59327 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Tue, 8 Jul 2025 15:30:04 +0200 Subject: [PATCH 1/2] [AMDGPU] Visit all PHIs in each call to optimizeLiveType Make the Visited set a local variable; otherwise we can reject a PHI (one that does not have a zeroinitializer constant) but still mark it as visited, and the rest of the function then thinks the PHI is OK when it isn't. This is a bit crude but it's the only fix that consistently worked in my testing. Fixes SWDEV-541767 --- .../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 3 +- .../amdgpu-late-codegenprepare-crash-splat.ll | 94 +++++++++++++++++++ 2 files changed, 95 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare-crash-splat.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index df976cf3f7fdb..523c66c72273c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -79,8 +79,6 @@ class LiveRegOptimizer { /// The scalar type to convert to Type *const ConvertToScalar; - /// The set of visited Instructions - SmallPtrSet Visited; /// Map of Value -> Converted Value ValueToValueMap ValMap; /// Map of containing conversions from Optimal Type -> Original Type per BB. 
@@ -288,6 +286,7 @@ bool LiveRegOptimizer::optimizeLiveType( SmallPtrSet PhiNodes; SmallPtrSet Defs; SmallPtrSet Uses; + SmallPtrSet Visited; Worklist.push_back(cast(I)); while (!Worklist.empty()) { diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare-crash-splat.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare-crash-splat.ll new file mode 100644 index 0000000000000..837b5e0ab5833 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare-crash-splat.ll @@ -0,0 +1,94 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes="amdgpu-late-codegenprepare,verify" %s | FileCheck %s + +; This crashed because the PHI with a splat was rejected, but then we marked the PHI +; as visited and tried to convert one of its user afterwards. + +define amdgpu_kernel void @widget(i1 %arg, <4 x i8> %arg1, i64 %arg2) { +; CHECK-LABEL: define amdgpu_kernel void @widget( +; CHECK-SAME: i1 [[ARG:%.*]], <4 x i8> [[ARG1:%.*]], i64 [[ARG2:%.*]]) { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[WIDGET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[WIDGET_KERNARG_SEGMENT]], i64 36 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i1 +; CHECK-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[WIDGET_KERNARG_SEGMENT]], i64 40 +; CHECK-NEXT: [[ARG1_LOAD:%.*]] = load <4 x i8>, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 8 +; CHECK-NEXT: [[ARG2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[WIDGET_KERNARG_SEGMENT]], i64 44 +; CHECK-NEXT: [[ARG2_LOAD:%.*]] = load i64, ptr addrspace(4) [[ARG2_KERNARG_OFFSET]], align 4 +; CHECK-NEXT: br label %[[BB_3:.*]] 
+; CHECK: [[BB_3]]: +; CHECK-NEXT: [[PHI:%.*]] = phi ptr addrspace(1) [ null, %[[BB]] ], [ [[GETELEMENTPTR:%.*]], %[[BB_14:.*]] ] +; CHECK-NEXT: [[PHI4:%.*]] = phi <4 x i8> [ splat (i8 1), %[[BB]] ], [ [[PHI15:%.*]], %[[BB_14]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_6_PREHEADER:.*]], label %[[BB_5:.*]] +; CHECK: [[BB_5]]: +; CHECK-NEXT: br label %[[BB_14]] +; CHECK: [[BB_6_PREHEADER]]: +; CHECK-NEXT: br label %[[BB_6:.*]] +; CHECK: [[BB_6]]: +; CHECK-NEXT: [[PHI7:%.*]] = phi <4 x i8> [ [[PHI13:%.*]], %[[BB_12:.*]] ], [ [[PHI4]], %[[BB_6_PREHEADER]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_8:.*]], label %[[BB_12]] +; CHECK: [[BB_8]]: +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_10:.*]], label %[[BB_9:.*]] +; CHECK: [[BB_9]]: +; CHECK-NEXT: br label %[[BB_10]] +; CHECK: [[BB_10]]: +; CHECK-NEXT: [[PHI11:%.*]] = phi <4 x i8> [ [[PHI7]], %[[BB_9]] ], [ zeroinitializer, %[[BB_8]] ] +; CHECK-NEXT: [[EXTRACTELEMENT:%.*]] = extractelement <4 x i8> [[PHI11]], i64 0 +; CHECK-NEXT: store i8 [[EXTRACTELEMENT]], ptr addrspace(1) [[PHI]], align 1 +; CHECK-NEXT: br label %[[BB_12]] +; CHECK: [[BB_12]]: +; CHECK-NEXT: [[PHI13]] = phi <4 x i8> [ zeroinitializer, %[[BB_10]] ], [ [[PHI7]], %[[BB_6]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_6]], label %[[BB_14]] +; CHECK: [[BB_14]]: +; CHECK-NEXT: [[PHI15]] = phi <4 x i8> [ [[ARG1_LOAD]], %[[BB_5]] ], [ zeroinitializer, %[[BB_12]] ] +; CHECK-NEXT: [[GETELEMENTPTR]] = getelementptr i8, ptr addrspace(1) [[PHI]], i64 [[ARG2_LOAD]] +; CHECK-NEXT: br label %[[BB_3]] +; +bb: + %widget.kernarg.segment = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() + %arg.kernarg.offset.align.down = getelementptr inbounds i8, ptr addrspace(4) %widget.kernarg.segment, i64 36 + %0 = load i32, ptr addrspace(4) %arg.kernarg.offset.align.down, align 4 + %1 = trunc i32 %0 to i1 + %arg1.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %widget.kernarg.segment, i64 40 + %arg1.load = load <4 x 
i8>, ptr addrspace(4) %arg1.kernarg.offset, align 8 + %arg2.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %widget.kernarg.segment, i64 44 + %arg2.load = load i64, ptr addrspace(4) %arg2.kernarg.offset, align 4 + br label %bb.3 + +bb.3: ; preds = %bb.14, %bb + %phi = phi ptr addrspace(1) [ null, %bb ], [ %getelementptr, %bb.14 ] + %phi4 = phi <4 x i8> [ splat (i8 1), %bb ], [ %phi15, %bb.14 ] + br i1 %1, label %bb.6.preheader, label %bb.5 + +bb.5: ; preds = %bb.3 + br label %bb.14 + +bb.6.preheader: ; preds = %bb.3 + br label %bb.6 + +bb.6: ; preds = %bb.6.preheader, %bb.12 + %phi7 = phi <4 x i8> [ %phi13, %bb.12 ], [ %phi4, %bb.6.preheader ] + br i1 %1, label %bb.8, label %bb.12 + +bb.8: ; preds = %bb.6 + br i1 %1, label %bb.10, label %bb.9 + +bb.9: ; preds = %bb.8 + br label %bb.10 + +bb.10: ; preds = %bb.9, %bb.8 + %phi11 = phi <4 x i8> [ %phi7, %bb.9 ], [ zeroinitializer, %bb.8 ] + %extractelement = extractelement <4 x i8> %phi11, i64 0 + store i8 %extractelement, ptr addrspace(1) %phi, align 1 + br label %bb.12 + +bb.12: ; preds = %bb.10, %bb.6 + %phi13 = phi <4 x i8> [ zeroinitializer, %bb.10 ], [ %phi7, %bb.6 ] + br i1 %1, label %bb.6, label %bb.14 + +bb.14: ; preds = %bb.5, %bb.12 + %phi15 = phi <4 x i8> [ %arg1.load, %bb.5 ], [ zeroinitializer, %bb.12 ] + %getelementptr = getelementptr i8, ptr addrspace(1) %phi, i64 %arg2.load + br label %bb.3 +} From 142707bb2e28ccdc7df9257f8787995626841700 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Wed, 9 Jul 2025 10:34:42 +0200 Subject: [PATCH 2/2] Further reduce test --- .../amdgpu-late-codegenprepare-crash-splat.ll | 126 ++++++++---------- 1 file changed, 52 insertions(+), 74 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare-crash-splat.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare-crash-splat.ll index 837b5e0ab5833..b8464c37a5dc2 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare-crash-splat.ll +++ 
b/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare-crash-splat.ll @@ -1,94 +1,72 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes="amdgpu-late-codegenprepare,verify" %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-late-codegenprepare %s | FileCheck %s ; This crashed because the PHI with a splat was rejected, but then we marked the PHI ; as visited and tried to convert one of its user afterwards. -define amdgpu_kernel void @widget(i1 %arg, <4 x i8> %arg1, i64 %arg2) { +define amdgpu_kernel void @widget(ptr %arg, ptr %arg1, ptr %arg2) { ; CHECK-LABEL: define amdgpu_kernel void @widget( -; CHECK-SAME: i1 [[ARG:%.*]], <4 x i8> [[ARG1:%.*]], i64 [[ARG2:%.*]]) { +; CHECK-SAME: ptr [[ARG:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]]) { ; CHECK-NEXT: [[BB:.*]]: -; CHECK-NEXT: [[WIDGET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; CHECK-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[WIDGET_KERNARG_SEGMENT]], i64 36 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARG]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i1 -; CHECK-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[WIDGET_KERNARG_SEGMENT]], i64 40 -; CHECK-NEXT: [[ARG1_LOAD:%.*]] = load <4 x i8>, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 8 -; CHECK-NEXT: [[ARG2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[WIDGET_KERNARG_SEGMENT]], i64 44 -; CHECK-NEXT: [[ARG2_LOAD:%.*]] = load i64, ptr addrspace(4) [[ARG2_KERNARG_OFFSET]], align 4 -; CHECK-NEXT: br label %[[BB_3:.*]] -; CHECK: [[BB_3]]: -; CHECK-NEXT: [[PHI:%.*]] = phi ptr addrspace(1) [ null, %[[BB]] ], [ [[GETELEMENTPTR:%.*]], %[[BB_14:.*]] ] -; 
CHECK-NEXT: [[PHI4:%.*]] = phi <4 x i8> [ splat (i8 1), %[[BB]] ], [ [[PHI15:%.*]], %[[BB_14]] ] -; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_6_PREHEADER:.*]], label %[[BB_5:.*]] +; CHECK-NEXT: [[ARG1_LOAD:%.*]] = load <4 x i8>, ptr [[ARG1]], align 4 +; CHECK-NEXT: [[ARG2_LOAD:%.*]] = load i64, ptr [[ARG2]], align 4 +; CHECK-NEXT: br label %[[BB_1:.*]] +; CHECK: [[BB_1]]: +; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ null, %[[BB]] ], [ [[ARG1]], %[[BB_6:.*]] ] +; CHECK-NEXT: [[PHI4:%.*]] = phi <4 x i8> [ splat (i8 1), %[[BB]] ], [ [[PHI15:%.*]], %[[BB_6]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_2:.*]], label %[[BB_6]] +; CHECK: [[BB_2]]: +; CHECK-NEXT: [[PHI7:%.*]] = phi <4 x i8> [ [[PHI13:%.*]], %[[BB_5:.*]] ], [ [[PHI4]], %[[BB_1]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_4:.*]], label %[[BB_5]] +; CHECK: [[BB_3:.*]]: +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_4]], label %[[BB_EXIT:.*]] +; CHECK: [[BB_4]]: +; CHECK-NEXT: [[PHI11:%.*]] = phi <4 x i8> [ [[PHI7]], %[[BB_3]] ], [ zeroinitializer, %[[BB_2]] ] +; CHECK-NEXT: store <4 x i8> [[PHI11]], ptr [[PHI]], align 1 +; CHECK-NEXT: br label %[[BB_5]] ; CHECK: [[BB_5]]: -; CHECK-NEXT: br label %[[BB_14]] -; CHECK: [[BB_6_PREHEADER]]: -; CHECK-NEXT: br label %[[BB_6:.*]] +; CHECK-NEXT: [[PHI13]] = phi <4 x i8> [ zeroinitializer, %[[BB_4]] ], [ [[PHI7]], %[[BB_2]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_2]], label %[[BB_6]] ; CHECK: [[BB_6]]: -; CHECK-NEXT: [[PHI7:%.*]] = phi <4 x i8> [ [[PHI13:%.*]], %[[BB_12:.*]] ], [ [[PHI4]], %[[BB_6_PREHEADER]] ] -; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_8:.*]], label %[[BB_12]] -; CHECK: [[BB_8]]: -; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_10:.*]], label %[[BB_9:.*]] -; CHECK: [[BB_9]]: -; CHECK-NEXT: br label %[[BB_10]] -; CHECK: [[BB_10]]: -; CHECK-NEXT: [[PHI11:%.*]] = phi <4 x i8> [ [[PHI7]], %[[BB_9]] ], [ zeroinitializer, %[[BB_8]] ] -; CHECK-NEXT: [[EXTRACTELEMENT:%.*]] = extractelement <4 x i8> [[PHI11]], i64 0 -; CHECK-NEXT: store i8 [[EXTRACTELEMENT]], ptr 
addrspace(1) [[PHI]], align 1 -; CHECK-NEXT: br label %[[BB_12]] -; CHECK: [[BB_12]]: -; CHECK-NEXT: [[PHI13]] = phi <4 x i8> [ zeroinitializer, %[[BB_10]] ], [ [[PHI7]], %[[BB_6]] ] -; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_6]], label %[[BB_14]] -; CHECK: [[BB_14]]: -; CHECK-NEXT: [[PHI15]] = phi <4 x i8> [ [[ARG1_LOAD]], %[[BB_5]] ], [ zeroinitializer, %[[BB_12]] ] -; CHECK-NEXT: [[GETELEMENTPTR]] = getelementptr i8, ptr addrspace(1) [[PHI]], i64 [[ARG2_LOAD]] -; CHECK-NEXT: br label %[[BB_3]] +; CHECK-NEXT: [[PHI15]] = phi <4 x i8> [ [[ARG1_LOAD]], %[[BB_1]] ], [ zeroinitializer, %[[BB_5]] ] +; CHECK-NEXT: br label %[[BB_1]] +; CHECK: [[BB_EXIT]]: +; CHECK-NEXT: ret void ; bb: - %widget.kernarg.segment = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() - %arg.kernarg.offset.align.down = getelementptr inbounds i8, ptr addrspace(4) %widget.kernarg.segment, i64 36 - %0 = load i32, ptr addrspace(4) %arg.kernarg.offset.align.down, align 4 - %1 = trunc i32 %0 to i1 - %arg1.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %widget.kernarg.segment, i64 40 - %arg1.load = load <4 x i8>, ptr addrspace(4) %arg1.kernarg.offset, align 8 - %arg2.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %widget.kernarg.segment, i64 44 - %arg2.load = load i64, ptr addrspace(4) %arg2.kernarg.offset, align 4 - br label %bb.3 + %ld = load i32, ptr %arg, align 4 + %ld.trunc = trunc i32 %ld to i1 + %arg1.load = load <4 x i8>, ptr %arg1, align 4 + %arg2.load = load i64, ptr %arg2, align 4 + br label %bb.1 -bb.3: ; preds = %bb.14, %bb - %phi = phi ptr addrspace(1) [ null, %bb ], [ %getelementptr, %bb.14 ] - %phi4 = phi <4 x i8> [ splat (i8 1), %bb ], [ %phi15, %bb.14 ] - br i1 %1, label %bb.6.preheader, label %bb.5 +bb.1: + %phi = phi ptr [ null, %bb ], [ %arg1, %bb.6 ] + %phi4 = phi <4 x i8> [ splat (i8 1), %bb ], [ %phi15, %bb.6 ] + br i1 %ld.trunc, label %bb.2, label %bb.6 -bb.5: ; preds = %bb.3 - br label %bb.14 +bb.2: + 
%phi7 = phi <4 x i8> [ %phi13, %bb.5 ], [ %phi4, %bb.1 ] + br i1 %ld.trunc, label %bb.4, label %bb.5 -bb.6.preheader: ; preds = %bb.3 - br label %bb.6 +bb.3: + br i1 %ld.trunc, label %bb.4, label %bb.exit -bb.6: ; preds = %bb.6.preheader, %bb.12 - %phi7 = phi <4 x i8> [ %phi13, %bb.12 ], [ %phi4, %bb.6.preheader ] - br i1 %1, label %bb.8, label %bb.12 +bb.4: + %phi11 = phi <4 x i8> [ %phi7, %bb.3 ], [ zeroinitializer, %bb.2 ] + store <4 x i8> %phi11, ptr %phi, align 1 + br label %bb.5 -bb.8: ; preds = %bb.6 - br i1 %1, label %bb.10, label %bb.9 +bb.5: + %phi13 = phi <4 x i8> [ zeroinitializer, %bb.4 ], [ %phi7, %bb.2 ] + br i1 %ld.trunc, label %bb.2, label %bb.6 -bb.9: ; preds = %bb.8 - br label %bb.10 +bb.6: + %phi15 = phi <4 x i8> [ %arg1.load, %bb.1 ], [ zeroinitializer, %bb.5 ] + br label %bb.1 -bb.10: ; preds = %bb.9, %bb.8 - %phi11 = phi <4 x i8> [ %phi7, %bb.9 ], [ zeroinitializer, %bb.8 ] - %extractelement = extractelement <4 x i8> %phi11, i64 0 - store i8 %extractelement, ptr addrspace(1) %phi, align 1 - br label %bb.12 - -bb.12: ; preds = %bb.10, %bb.6 - %phi13 = phi <4 x i8> [ zeroinitializer, %bb.10 ], [ %phi7, %bb.6 ] - br i1 %1, label %bb.6, label %bb.14 - -bb.14: ; preds = %bb.5, %bb.12 - %phi15 = phi <4 x i8> [ %arg1.load, %bb.5 ], [ zeroinitializer, %bb.12 ] - %getelementptr = getelementptr i8, ptr addrspace(1) %phi, i64 %arg2.load - br label %bb.3 +bb.exit: + ret void }