Skip to content

Commit 0ba5958

Browse files
authored
[flang] Optimize assignments of multidimensional arrays (#146408)
Assignments of n-dimensional arrays with a trivial RHS were always converted to n nested loops. For contiguous arrays, it is possible to flatten them and use a single loop, which LLVM can usually optimize better. In a test program using a 3-dimensional array of varying size, the resulting speedups (measured on Graviton4) were, listed as array size: speedup — 16K: 1.09, 64K: 1.40, 128K: 1.90, 256K: 1.91, 512K: 1.00. For sizes greater than or equal to 512K, no improvement was observed. It looks like LLVM stops trying to perform aggressive loop unrolling at a certain threshold and just uses nested loops instead. Larger sizes also won't fit in the L1 and L2 caches. This was noticed while profiling 527.cam4_r. This optimization makes aer_rad_props_sw slightly faster, but unfortunately it leaves the total execution time of 527.cam4_r practically unchanged.
1 parent d17a248 commit 0ba5958

File tree

2 files changed

+88
-19
lines changed

2 files changed

+88
-19
lines changed

flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "flang/Optimizer/HLFIR/HLFIROps.h"
2222
#include "flang/Optimizer/HLFIR/Passes.h"
2323
#include "flang/Optimizer/OpenMP/Passes.h"
24+
#include "flang/Optimizer/Support/Utils.h"
2425
#include "flang/Optimizer/Transforms/Utils.h"
2526
#include "mlir/Dialect/Func/IR/FuncOps.h"
2627
#include "mlir/IR/Dominance.h"
@@ -786,13 +787,55 @@ llvm::LogicalResult BroadcastAssignBufferization::matchAndRewrite(
786787
mlir::Value shape = hlfir::genShape(loc, builder, lhs);
787788
llvm::SmallVector<mlir::Value> extents =
788789
hlfir::getIndexExtents(loc, builder, shape);
789-
hlfir::LoopNest loopNest =
790-
hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true,
791-
flangomp::shouldUseWorkshareLowering(assign));
792-
builder.setInsertionPointToStart(loopNest.body);
793-
auto arrayElement =
794-
hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices);
795-
builder.create<hlfir::AssignOp>(loc, rhs, arrayElement);
790+
791+
if (lhs.isSimplyContiguous() && extents.size() > 1) {
792+
// Flatten the array to use a single assign loop, that can be better
793+
// optimized.
794+
mlir::Value n = extents[0];
795+
for (size_t i = 1; i < extents.size(); ++i)
796+
n = builder.create<mlir::arith::MulIOp>(loc, n, extents[i]);
797+
llvm::SmallVector<mlir::Value> flatExtents = {n};
798+
799+
mlir::Type flatArrayType;
800+
mlir::Value flatArray = lhs.getBase();
801+
if (mlir::isa<fir::BoxType>(lhs.getType())) {
802+
shape = builder.genShape(loc, flatExtents);
803+
flatArrayType = fir::BoxType::get(fir::SequenceType::get(eleTy, 1));
804+
flatArray = builder.create<fir::ReboxOp>(loc, flatArrayType, flatArray,
805+
shape, /*slice=*/mlir::Value{});
806+
} else {
807+
// Array references must have fixed shape, when used in assignments.
808+
int64_t flatExtent = 1;
809+
for (const mlir::Value &extent : extents) {
810+
mlir::Operation *op = extent.getDefiningOp();
811+
assert(op && "no defining operation for constant array extent");
812+
flatExtent *= fir::toInt(mlir::cast<mlir::arith::ConstantOp>(*op));
813+
}
814+
815+
flatArrayType =
816+
fir::ReferenceType::get(fir::SequenceType::get({flatExtent}, eleTy));
817+
flatArray = builder.createConvert(loc, flatArrayType, flatArray);
818+
}
819+
820+
hlfir::LoopNest loopNest =
821+
hlfir::genLoopNest(loc, builder, flatExtents, /*isUnordered=*/true,
822+
flangomp::shouldUseWorkshareLowering(assign));
823+
builder.setInsertionPointToStart(loopNest.body);
824+
825+
mlir::Value arrayElement =
826+
builder.create<hlfir::DesignateOp>(loc, fir::ReferenceType::get(eleTy),
827+
flatArray, loopNest.oneBasedIndices);
828+
builder.create<hlfir::AssignOp>(loc, rhs, arrayElement);
829+
} else {
830+
hlfir::LoopNest loopNest =
831+
hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true,
832+
flangomp::shouldUseWorkshareLowering(assign));
833+
builder.setInsertionPointToStart(loopNest.body);
834+
auto arrayElement =
835+
hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices);
836+
builder.create<hlfir::AssignOp>(loc, rhs, arrayElement);
837+
}
838+
796839
rewriter.eraseOp(assign);
797840
return mlir::success();
798841
}

flang/test/HLFIR/opt-scalar-assign.fir

Lines changed: 38 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,18 @@ func.func @_QPtest1() {
1212
return
1313
}
1414
// CHECK-LABEL: func.func @_QPtest1() {
15-
// CHECK: %[[VAL_0:.*]] = arith.constant 1 : index
16-
// CHECK: %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f32
17-
// CHECK: %[[VAL_2:.*]] = arith.constant 11 : index
18-
// CHECK: %[[VAL_3:.*]] = arith.constant 13 : index
19-
// CHECK: %[[VAL_4:.*]] = fir.alloca !fir.array<11x13xf32> {bindc_name = "x", uniq_name = "_QFtest1Ex"}
20-
// CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_2]], %[[VAL_3]] : (index, index) -> !fir.shape<2>
21-
// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_4]](%[[VAL_5]]) {uniq_name = "_QFtest1Ex"} : (!fir.ref<!fir.array<11x13xf32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<11x13xf32>>, !fir.ref<!fir.array<11x13xf32>>)
22-
// CHECK: fir.do_loop %[[VAL_7:.*]] = %[[VAL_0]] to %[[VAL_3]] step %[[VAL_0]] unordered {
23-
// CHECK: fir.do_loop %[[VAL_8:.*]] = %[[VAL_0]] to %[[VAL_2]] step %[[VAL_0]] unordered {
24-
// CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_8]], %[[VAL_7]]) : (!fir.ref<!fir.array<11x13xf32>>, index, index) -> !fir.ref<f32>
25-
// CHECK: hlfir.assign %[[VAL_1]] to %[[VAL_9]] : f32, !fir.ref<f32>
26-
// CHECK: }
15+
// CHECK: %[[VAL_0:.*]] = arith.constant 143 : index
16+
// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index
17+
// CHECK: %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
18+
// CHECK: %[[VAL_3:.*]] = arith.constant 11 : index
19+
// CHECK: %[[VAL_4:.*]] = arith.constant 13 : index
20+
// CHECK: %[[VAL_5:.*]] = fir.alloca !fir.array<11x13xf32> {bindc_name = "x", uniq_name = "_QFtest1Ex"}
21+
// CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_3]], %[[VAL_4]] : (index, index) -> !fir.shape<2>
22+
// CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_6]]) {uniq_name = "_QFtest1Ex"} : (!fir.ref<!fir.array<11x13xf32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<11x13xf32>>, !fir.ref<!fir.array<11x13xf32>>)
23+
// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.ref<!fir.array<11x13xf32>>) -> !fir.ref<!fir.array<143xf32>>
24+
// CHECK: fir.do_loop %[[VAL_9:.*]] = %[[VAL_1]] to %[[VAL_0]] step %[[VAL_1]] unordered {
25+
// CHECK: %[[VAL_10:.*]] = hlfir.designate %[[VAL_8]] (%[[VAL_9]]) : (!fir.ref<!fir.array<143xf32>>, index) -> !fir.ref<f32>
26+
// CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_10]] : f32, !fir.ref<f32>
2727
// CHECK: }
2828
// CHECK: return
2929
// CHECK: }
@@ -129,3 +129,29 @@ func.func @_QPtest5(%arg0: !fir.ref<!fir.array<77xcomplex<f32>>> {fir.bindc_name
129129
// CHECK: }
130130
// CHECK: return
131131
// CHECK: }
132+
133+
func.func @_QPtest6(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> {fir.bindc_name = "x"}) {
134+
%c0_i32 = arith.constant 0 : i32
135+
%0:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest6Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
136+
hlfir.assign %c0_i32 to %0#0 realloc : i32, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>
137+
return
138+
}
139+
140+
// CHECK-LABEL: func.func @_QPtest6(
141+
// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> {fir.bindc_name = "x"}) {
142+
// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index
143+
// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index
144+
// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i32
145+
// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest6Ex"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
146+
// CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>
147+
// CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_5]], %[[VAL_2]] : (!fir.box<!fir.heap<!fir.array<?x?xi32>>>, index) -> (index, index, index)
148+
// CHECK: %[[VAL_7:.*]]:3 = fir.box_dims %[[VAL_5]], %[[VAL_1]] : (!fir.box<!fir.heap<!fir.array<?x?xi32>>>, index) -> (index, index, index)
149+
// CHECK: %[[VAL_8:.*]] = arith.muli %[[VAL_6]]#1, %[[VAL_7]]#1 : index
150+
// CHECK: %[[VAL_9:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1>
151+
// CHECK: %[[VAL_10:.*]] = fir.rebox %[[VAL_5]](%[[VAL_9]]) : (!fir.box<!fir.heap<!fir.array<?x?xi32>>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
152+
// CHECK: fir.do_loop %[[VAL_11:.*]] = %[[VAL_1]] to %[[VAL_8]] step %[[VAL_1]] unordered {
153+
// CHECK: %[[VAL_12:.*]] = hlfir.designate %[[VAL_10]] (%[[VAL_11]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
154+
// CHECK: hlfir.assign %[[VAL_3]] to %[[VAL_12]] : i32, !fir.ref<i32>
155+
// CHECK: }
156+
// CHECK: return
157+
// CHECK: }

0 commit comments

Comments
 (0)