@@ -1115,4 +1115,141 @@ tail:
unreachable
}

+ ; Since functions that contain amdgcn.init.whole.wave do not preserve the inactive
+ ; lanes of any VGPRs, the middle end will explicitly preserve them if needed by adding
+ ; dummy VGPR arguments. Since only the inactive lanes are important, we need to make
+ ; it clear to the backend that it's safe to allocate v9's active lanes inside the
+ ; shader block. This is achieved by using the llvm.amdgcn.dead intrinsic.
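+ ;
+ ; A minimal sketch of that pattern (the block and value names here are
+ ; illustrative, not taken from the test below):
+ ;
+ ;   %dead = call i32 @llvm.amdgcn.dead()            ; value that only matters in the inactive lanes
+ ;   %orig = call i1 @llvm.amdgcn.init.whole.wave()  ; true in the originally active lanes
+ ;   br i1 %orig, label %shader, label %tail
+ ; shader:
+ ;   ... ; %dead is unused here, so the allocator may reuse its active lanes
+ ;   br label %tail
+ ; tail:
+ ;   %inactive = phi i32 [ %dummy.vgpr.arg, %entry ], [ %dead, %shader ]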
+ define amdgpu_cs_chain void @with_inactive_vgprs(ptr inreg %callee, i32 inreg %exec, i32 inreg %sgpr, i32 %active.vgpr, i32 %inactive.vgpr) {
+ ; GISEL12-LABEL: with_inactive_vgprs:
+ ; GISEL12: ; %bb.0: ; %entry
+ ; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+ ; GISEL12-NEXT: s_wait_expcnt 0x0
+ ; GISEL12-NEXT: s_wait_samplecnt 0x0
+ ; GISEL12-NEXT: s_wait_bvhcnt 0x0
+ ; GISEL12-NEXT: s_wait_kmcnt 0x0
+ ; GISEL12-NEXT: s_or_saveexec_b32 s6, -1
+ ; GISEL12-NEXT: s_mov_b32 s4, s0
+ ; GISEL12-NEXT: s_mov_b32 s5, s1
+ ; GISEL12-NEXT: s_mov_b32 s0, s3
+ ; GISEL12-NEXT: s_wait_alu 0xfffe
+ ; GISEL12-NEXT: s_and_saveexec_b32 s1, s6
+ ; GISEL12-NEXT: s_cbranch_execz .LBB6_2
+ ; GISEL12-NEXT: ; %bb.1: ; %shader
+ ; GISEL12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
+ ; GISEL12-NEXT: flat_load_b32 v11, v[9:10]
+ ; GISEL12-NEXT: ;;#ASMSTART
+ ; GISEL12-NEXT: ; use v0-7
+ ; GISEL12-NEXT: ;;#ASMEND
+ ; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+ ; GISEL12-NEXT: v_add_nc_u32_e32 v8, v8, v11
+ ; GISEL12-NEXT: flat_store_b32 v[9:10], v11
+ ; GISEL12-NEXT: ; implicit-def: $vgpr9
+ ; GISEL12-NEXT: .LBB6_2: ; %tail.block
+ ; GISEL12-NEXT: s_wait_alu 0xfffe
+ ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+ ; GISEL12-NEXT: s_mov_b32 exec_lo, s2
+ ; GISEL12-NEXT: s_setpc_b64 s[4:5]
+ ;
+ ; DAGISEL12-LABEL: with_inactive_vgprs:
+ ; DAGISEL12: ; %bb.0: ; %entry
+ ; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+ ; DAGISEL12-NEXT: s_wait_expcnt 0x0
+ ; DAGISEL12-NEXT: s_wait_samplecnt 0x0
+ ; DAGISEL12-NEXT: s_wait_bvhcnt 0x0
+ ; DAGISEL12-NEXT: s_wait_kmcnt 0x0
+ ; DAGISEL12-NEXT: s_or_saveexec_b32 s6, -1
+ ; DAGISEL12-NEXT: s_mov_b32 s5, s1
+ ; DAGISEL12-NEXT: s_mov_b32 s4, s0
+ ; DAGISEL12-NEXT: s_wait_alu 0xfffe
+ ; DAGISEL12-NEXT: s_and_saveexec_b32 s0, s6
+ ; DAGISEL12-NEXT: s_cbranch_execz .LBB6_2
+ ; DAGISEL12-NEXT: ; %bb.1: ; %shader
+ ; DAGISEL12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
+ ; DAGISEL12-NEXT: flat_load_b32 v11, v[9:10]
+ ; DAGISEL12-NEXT: ;;#ASMSTART
+ ; DAGISEL12-NEXT: ; use v0-7
+ ; DAGISEL12-NEXT: ;;#ASMEND
+ ; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+ ; DAGISEL12-NEXT: v_add_nc_u32_e32 v8, v8, v11
+ ; DAGISEL12-NEXT: flat_store_b32 v[9:10], v11
+ ; DAGISEL12-NEXT: ; implicit-def: $vgpr9
+ ; DAGISEL12-NEXT: .LBB6_2: ; %tail.block
+ ; DAGISEL12-NEXT: s_wait_alu 0xfffe
+ ; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+ ; DAGISEL12-NEXT: s_mov_b32 s0, s3
+ ; DAGISEL12-NEXT: s_mov_b32 exec_lo, s2
+ ; DAGISEL12-NEXT: s_wait_alu 0xfffe
+ ; DAGISEL12-NEXT: s_setpc_b64 s[4:5]
+ ;
+ ; GISEL10-LABEL: with_inactive_vgprs:
+ ; GISEL10: ; %bb.0: ; %entry
+ ; GISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; GISEL10-NEXT: s_or_saveexec_b32 s6, -1
+ ; GISEL10-NEXT: s_mov_b32 s4, s0
+ ; GISEL10-NEXT: s_mov_b32 s5, s1
+ ; GISEL10-NEXT: s_mov_b32 s0, s3
+ ; GISEL10-NEXT: s_and_saveexec_b32 s1, s6
+ ; GISEL10-NEXT: s_cbranch_execz .LBB6_2
+ ; GISEL10-NEXT: ; %bb.1: ; %shader
+ ; GISEL10-NEXT: v_mov_b32_e32 v10, s5
+ ; GISEL10-NEXT: v_mov_b32_e32 v9, s4
+ ; GISEL10-NEXT: flat_load_dword v11, v[9:10]
+ ; GISEL10-NEXT: ;;#ASMSTART
+ ; GISEL10-NEXT: ; use v0-7
+ ; GISEL10-NEXT: ;;#ASMEND
+ ; GISEL10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+ ; GISEL10-NEXT: v_add_nc_u32_e32 v8, v8, v11
+ ; GISEL10-NEXT: flat_store_dword v[9:10], v11
+ ; GISEL10-NEXT: ; implicit-def: $vgpr9
+ ; GISEL10-NEXT: .LBB6_2: ; %tail.block
+ ; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+ ; GISEL10-NEXT: s_mov_b32 exec_lo, s2
+ ; GISEL10-NEXT: s_setpc_b64 s[4:5]
+ ;
+ ; DAGISEL10-LABEL: with_inactive_vgprs:
+ ; DAGISEL10: ; %bb.0: ; %entry
+ ; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; DAGISEL10-NEXT: s_or_saveexec_b32 s6, -1
+ ; DAGISEL10-NEXT: s_mov_b32 s5, s1
+ ; DAGISEL10-NEXT: s_mov_b32 s4, s0
+ ; DAGISEL10-NEXT: s_and_saveexec_b32 s0, s6
+ ; DAGISEL10-NEXT: s_cbranch_execz .LBB6_2
+ ; DAGISEL10-NEXT: ; %bb.1: ; %shader
+ ; DAGISEL10-NEXT: v_mov_b32_e32 v10, s5
+ ; DAGISEL10-NEXT: v_mov_b32_e32 v9, s4
+ ; DAGISEL10-NEXT: flat_load_dword v11, v[9:10]
+ ; DAGISEL10-NEXT: ;;#ASMSTART
+ ; DAGISEL10-NEXT: ; use v0-7
+ ; DAGISEL10-NEXT: ;;#ASMEND
+ ; DAGISEL10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+ ; DAGISEL10-NEXT: v_add_nc_u32_e32 v8, v8, v11
+ ; DAGISEL10-NEXT: flat_store_dword v[9:10], v11
+ ; DAGISEL10-NEXT: ; implicit-def: $vgpr9
+ ; DAGISEL10-NEXT: .LBB6_2: ; %tail.block
+ ; DAGISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s0
+ ; DAGISEL10-NEXT: s_mov_b32 s0, s3
+ ; DAGISEL10-NEXT: s_mov_b32 exec_lo, s2
+ ; DAGISEL10-NEXT: s_setpc_b64 s[4:5]
+ entry:
+ %imp.def = call i32 @llvm.amdgcn.dead()
+ %initial.exec = call i1 @llvm.amdgcn.init.whole.wave()
+ br i1 %initial.exec, label %shader, label %tail.block
+
+ shader: ; preds = %entry
+ %use.another.vgpr = load i32, ptr %callee ; something that won't be moved past the inline asm
+ call void asm sideeffect "; use v0-7", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"()
+ store i32 %use.another.vgpr, ptr %callee
+ %active.vgpr.new = add i32 %active.vgpr, %use.another.vgpr
+ br label %tail.block
+
+ tail.block: ; preds = %entry, %shader
+ %active.vgpr.arg = phi i32 [ %active.vgpr, %entry ], [ %active.vgpr.new, %shader ]
+ %inactive.vgpr.arg = phi i32 [ %inactive.vgpr, %entry ], [ %imp.def, %shader ]
+ %vgprs.0 = insertvalue { i32, i32 } poison, i32 %active.vgpr.arg, 0
+ %vgprs = insertvalue { i32, i32 } %vgprs.0, i32 %inactive.vgpr.arg, 1
+ call void (ptr, i32, i32, { i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.i32.sl_i32i32(ptr inreg %callee, i32 inreg %exec, i32 inreg %sgpr, { i32, i32 } %vgprs, i32 0)
+ unreachable
+ }
+
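+ ; In the checks above, the effect is visible in two places: the shader block ends
+ ; with a bare "; implicit-def: $vgpr9" instead of a reload of the dummy VGPR, and
+ ; exec is restored from s2 before the s_setpc_b64 tail call, so the inactive lanes
+ ; of v9 reach the chained callee untouched.
+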
declare amdgpu_gfx <16 x i32> @write_v0_v15(<16 x i32>)