 declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata)
 declare void @g()
 
+; TODO: Merging scalars into vectors is unprofitable because we have no
+; vector callee-saved registers, which creates additional spills around the call.
 define void @f(ptr %m, ptr %n, ptr %p, ptr %q, ptr %r, ptr %s, double %t) {
 ; CHECK-LABEL: f:
 ; CHECK: # %bb.0:
@@ -93,3 +95,148 @@ define void @f1(ptr %m, ptr %n, ptr %p, ptr %q, ptr %r, ptr %s, double %t) {
 
   ret void
 }
+
+; Merging scalars is profitable; it reduces pressure within a single
+; register class.
+define void @i8_i16(ptr %p, ptr %q) {
+; CHECK-LABEL: i8_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -32
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset ra, -8
+; CHECK-NEXT: .cfi_offset s0, -16
+; CHECK-NEXT: .cfi_offset s1, -24
+; CHECK-NEXT: lh s1, 0(a0)
+; CHECK-NEXT: mv s0, a1
+; CHECK-NEXT: call g
+; CHECK-NEXT: sh s1, 0(s0)
+; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
+; CHECK-NEXT: .cfi_restore ra
+; CHECK-NEXT: .cfi_restore s0
+; CHECK-NEXT: .cfi_restore s1
+; CHECK-NEXT: addi sp, sp, 32
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+  %p0 = getelementptr i8, ptr %p, i64 0
+  %p1 = getelementptr i8, ptr %p, i64 1
+  %x0 = load i8, ptr %p0, align 2
+  %x1 = load i8, ptr %p1
+  call void @g()
+  %q0 = getelementptr i8, ptr %q, i64 0
+  %q1 = getelementptr i8, ptr %q, i64 1
+  store i8 %x0, ptr %q0, align 2
+  store i8 %x1, ptr %q1
+  ret void
+}
+
+; Merging vectors is profitable; it reduces pressure within a single
+; register class.
+define void @v2i8_v4i8(ptr %p, ptr %q) {
+; CHECK-LABEL: v2i8_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -32
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset ra, -8
+; CHECK-NEXT: .cfi_offset s0, -16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: sub sp, sp, a2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 1 * vlenb
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: mv s0, a1
+; CHECK-NEXT: call g
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vse8.v v8, (s0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 32
+; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; CHECK-NEXT: .cfi_restore ra
+; CHECK-NEXT: .cfi_restore s0
+; CHECK-NEXT: addi sp, sp, 32
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+  %p0 = getelementptr i8, ptr %p, i64 0
+  %p1 = getelementptr i8, ptr %p, i64 2
+  %x0 = load <2 x i8>, ptr %p0, align 2
+  %x1 = load <2 x i8>, ptr %p1
+  call void @g()
+  %q0 = getelementptr i8, ptr %q, i64 0
+  %q1 = getelementptr i8, ptr %q, i64 2
+  store <2 x i8> %x0, ptr %q0, align 2
+  store <2 x i8> %x1, ptr %q1
+  ret void
+}
+
+; Merging two 16 x i8 into one 32 x i8 (on zvl128b) will require the same
+; number of registers to be spilled, but it can be done with fewer
+; instructions.
+define void @v16i8_v32i8(ptr %p, ptr %q) {
+; CHECK-LABEL: v16i8_v32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -32
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset ra, -8
+; CHECK-NEXT: .cfi_offset s0, -16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 1
+; CHECK-NEXT: sub sp, sp, a2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 2 * vlenb
+; CHECK-NEXT: addi a2, a0, 16
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vle8.v v8, (a2)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: mv s0, a1
+; CHECK-NEXT: call g
+; CHECK-NEXT: addi a0, s0, 2
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vl1r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vse8.v v8, (s0)
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl1r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vse8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 32
+; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; CHECK-NEXT: .cfi_restore ra
+; CHECK-NEXT: .cfi_restore s0
+; CHECK-NEXT: addi sp, sp, 32
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+  %p0 = getelementptr i8, ptr %p, i64 0
+  %p1 = getelementptr i8, ptr %p, i64 16
+  %x0 = load <16 x i8>, ptr %p0, align 2
+  %x1 = load <16 x i8>, ptr %p1
+  call void @g()
+  %q0 = getelementptr i8, ptr %q, i64 0
+  %q1 = getelementptr i8, ptr %q, i64 2
+  store <16 x i8> %x0, ptr %q0, align 16
+  store <16 x i8> %x1, ptr %q1
+  ret void
+}
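
For reference, the lh/sh pair in the @i8_i16 CHECK lines above corresponds to treating the two adjacent i8 accesses as a single i16 access. A minimal hand-written sketch of that merged form, not part of the test itself and using a hypothetical function name, is:

; Illustrative sketch only: the adjacent i8 loads/stores from @i8_i16
; expressed as one i16 access; this is the merged form that the backend's
; lh/sh pair implements.
declare void @g()

define void @i8_i16_merged(ptr %p, ptr %q) {
  %x = load i16, ptr %p, align 2    ; covers the i8 loads at offsets 0 and 1
  call void @g()
  store i16 %x, ptr %q, align 2     ; covers the i8 stores at offsets 0 and 1
  ret void
}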