Skip to content

Commit d8b6904

Browse files
author
Abinav Puthan Purayil
committed
[AMDGPU] Set MemoryVT for truncstores in tblgen.
GlobalISelEmitter was skipping these patterns when their predicates were checked. This patch sets MemoryVT on the StoreHi16 pattern fragments, which should allow us to select d16_hi stores in GlobalISel.

Differential Revision: https://reviews.llvm.org/D117762
1 parent 69ecd24 commit d8b6904

File tree

4 files changed

+121
-139
lines changed

4 files changed

+121
-139
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -406,9 +406,10 @@ class Aligned<int Bytes> {
406406
int MinAlignment = Bytes;
407407
}
408408

409-
class StoreHi16<SDPatternOperator op> : PatFrag <
409+
class StoreHi16<SDPatternOperator op, ValueType vt> : PatFrag <
410410
(ops node:$value, node:$ptr), (op (srl node:$value, (i32 16)), node:$ptr)> {
411411
let IsStore = 1;
412+
let MemoryVT = vt;
412413
}
413414

414415
def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant,
@@ -527,9 +528,9 @@ def truncstorei16_#as : PatFrag<(ops node:$val, node:$ptr),
527528
let MemoryVT = i16;
528529
}
529530

530-
def store_hi16_#as : StoreHi16 <truncstorei16>;
531-
def truncstorei8_hi16_#as : StoreHi16<truncstorei8>;
532-
def truncstorei16_hi16_#as : StoreHi16<truncstorei16>;
531+
def store_hi16_#as : StoreHi16 <truncstorei16, i16>;
532+
def truncstorei8_hi16_#as : StoreHi16<truncstorei8, i8>;
533+
def truncstorei16_hi16_#as : StoreHi16<truncstorei16, i16>;
533534

534535
defm atomic_store_#as : binary_atomic_op<atomic_store>;
535536

llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll

Lines changed: 28 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -268,30 +268,30 @@ define void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x)
268268
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
269269
; GFX10-NEXT: v_lshrrev_b16 v6, 8, v1
270270
; GFX10-NEXT: ds_write_b8 v0, v1
271-
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2
272-
; GFX10-NEXT: v_lshrrev_b16 v8, 8, v2
273-
; GFX10-NEXT: v_lshrrev_b16 v7, 8, v5
271+
; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:2
272+
; GFX10-NEXT: v_lshrrev_b16 v7, 8, v2
273+
; GFX10-NEXT: v_lshrrev_b16 v1, 8, v5
274+
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v2
274275
; GFX10-NEXT: ds_write_b8 v0, v2 offset:4
275276
; GFX10-NEXT: ds_write_b8 v0, v6 offset:1
276-
; GFX10-NEXT: ds_write_b8 v0, v5 offset:2
277-
; GFX10-NEXT: ds_write_b8 v0, v7 offset:3
278-
; GFX10-NEXT: v_lshrrev_b16 v2, 8, v1
279-
; GFX10-NEXT: ds_write_b8 v0, v8 offset:5
280-
; GFX10-NEXT: ds_write_b8 v0, v1 offset:6
281-
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v3
277+
; GFX10-NEXT: ds_write_b8 v0, v1 offset:3
278+
; GFX10-NEXT: ds_write_b8 v0, v7 offset:5
279+
; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:6
280+
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v3
281+
; GFX10-NEXT: v_lshrrev_b16 v1, 8, v5
282282
; GFX10-NEXT: v_lshrrev_b16 v5, 8, v3
283-
; GFX10-NEXT: ds_write_b8 v0, v2 offset:7
283+
; GFX10-NEXT: ds_write_b8 v0, v1 offset:7
284284
; GFX10-NEXT: ds_write_b8 v0, v3 offset:8
285-
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4
286-
; GFX10-NEXT: v_lshrrev_b16 v3, 8, v1
285+
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4
286+
; GFX10-NEXT: v_lshrrev_b16 v2, 8, v2
287287
; GFX10-NEXT: ds_write_b8 v0, v5 offset:9
288288
; GFX10-NEXT: v_lshrrev_b16 v5, 8, v4
289-
; GFX10-NEXT: ds_write_b8 v0, v1 offset:10
290-
; GFX10-NEXT: v_lshrrev_b16 v1, 8, v2
291-
; GFX10-NEXT: ds_write_b8 v0, v3 offset:11
289+
; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:10
290+
; GFX10-NEXT: v_lshrrev_b16 v1, 8, v1
291+
; GFX10-NEXT: ds_write_b8 v0, v2 offset:11
292292
; GFX10-NEXT: ds_write_b8 v0, v4 offset:12
293293
; GFX10-NEXT: ds_write_b8 v0, v5 offset:13
294-
; GFX10-NEXT: ds_write_b8 v0, v2 offset:14
294+
; GFX10-NEXT: ds_write_b8_d16_hi v0, v4 offset:14
295295
; GFX10-NEXT: ds_write_b8 v0, v1 offset:15
296296
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
297297
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -342,24 +342,24 @@ define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x)
342342
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v1
343343
; GFX10-NEXT: v_lshrrev_b16 v5, 8, v1
344344
; GFX10-NEXT: ds_write_b8 v0, v1
345+
; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:2
345346
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2
347+
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3
348+
; GFX10-NEXT: v_lshrrev_b16 v4, 8, v4
346349
; GFX10-NEXT: v_lshrrev_b16 v6, 8, v2
347-
; GFX10-NEXT: v_lshrrev_b16 v7, 8, v4
348350
; GFX10-NEXT: ds_write_b8 v0, v2 offset:4
349-
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v3
351+
; GFX10-NEXT: v_lshrrev_b16 v1, 8, v1
350352
; GFX10-NEXT: ds_write_b8 v0, v5 offset:1
351-
; GFX10-NEXT: ds_write_b8 v0, v4 offset:2
352-
; GFX10-NEXT: ds_write_b8 v0, v7 offset:3
353-
; GFX10-NEXT: v_lshrrev_b16 v4, 8, v1
354-
; GFX10-NEXT: v_lshrrev_b16 v5, 8, v3
355-
; GFX10-NEXT: ds_write_b8 v0, v1 offset:6
356-
; GFX10-NEXT: v_lshrrev_b16 v1, 8, v2
353+
; GFX10-NEXT: ds_write_b8 v0, v4 offset:3
354+
; GFX10-NEXT: v_lshrrev_b16 v4, 8, v3
355+
; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:6
356+
; GFX10-NEXT: v_lshrrev_b16 v2, 8, v7
357357
; GFX10-NEXT: ds_write_b8 v0, v6 offset:5
358-
; GFX10-NEXT: ds_write_b8 v0, v4 offset:7
358+
; GFX10-NEXT: ds_write_b8 v0, v1 offset:7
359359
; GFX10-NEXT: ds_write_b8 v0, v3 offset:8
360-
; GFX10-NEXT: ds_write_b8 v0, v5 offset:9
361-
; GFX10-NEXT: ds_write_b8 v0, v2 offset:10
362-
; GFX10-NEXT: ds_write_b8 v0, v1 offset:11
360+
; GFX10-NEXT: ds_write_b8 v0, v4 offset:9
361+
; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:10
362+
; GFX10-NEXT: ds_write_b8 v0, v2 offset:11
363363
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
364364
; GFX10-NEXT: s_setpc_b64 s[30:31]
365365
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1

llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll

Lines changed: 4 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -356,9 +356,8 @@ define amdgpu_kernel void @constant_sextload_i8_align2(i32 addrspace(1)* %out, i
356356
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
357357
; GFX9-NEXT: global_load_sbyte v1, v0, s[2:3]
358358
; GFX9-NEXT: s_waitcnt vmcnt(0)
359-
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
360359
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
361-
; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:2
360+
; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
362361
; GFX9-NEXT: s_endpgm
363362
;
364363
; GFX10-LABEL: constant_sextload_i8_align2:
@@ -368,9 +367,8 @@ define amdgpu_kernel void @constant_sextload_i8_align2(i32 addrspace(1)* %out, i
368367
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
369368
; GFX10-NEXT: global_load_sbyte v1, v0, s[2:3]
370369
; GFX10-NEXT: s_waitcnt vmcnt(0)
371-
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
372370
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
373-
; GFX10-NEXT: global_store_short v0, v2, s[0:1] offset:2
371+
; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
374372
; GFX10-NEXT: s_endpgm
375373
%load = load i8, i8 addrspace(1)* %in, align 2
376374
%sextload = sext i8 %load to i32
@@ -405,9 +403,8 @@ define amdgpu_kernel void @constant_zextload_i8_align2(i32 addrspace(1)* %out, i
405403
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
406404
; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
407405
; GFX9-NEXT: s_waitcnt vmcnt(0)
408-
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
409406
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
410-
; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:2
407+
; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
411408
; GFX9-NEXT: s_endpgm
412409
;
413410
; GFX10-LABEL: constant_zextload_i8_align2:
@@ -417,9 +414,8 @@ define amdgpu_kernel void @constant_zextload_i8_align2(i32 addrspace(1)* %out, i
417414
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
418415
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
419416
; GFX10-NEXT: s_waitcnt vmcnt(0)
420-
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
421417
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
422-
; GFX10-NEXT: global_store_short v0, v2, s[0:1] offset:2
418+
; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
423419
; GFX10-NEXT: s_endpgm
424420
%load = load i8, i8 addrspace(1)* %in, align 2
425421
%zextload = zext i8 %load to i32

0 commit comments

Comments
 (0)