Skip to content

Commit 904b850

Browse files
Update Arm Compute Library Execution Provider (microsoft#22032)
### Description

This PR makes the following updates to the Arm Compute Library execution provider:

- Target Arm Compute Library 24.07
- Add support for the following operators:
  - Conv (FP16)
  - NhwcConv
  - QLinearConv
  - MatMul
  - FusedMatMul
  - MatMulIntegerToFloat
- Optimize memory usage and performance
- Expose the enable_fast_math setting
- Use the main runtime thread pool

### Motivation and Context

These updates improve performance and memory usage, and enable use of a more recent version of Arm Compute Library.

@microsoft-github-policy-service agree company="Arm Ltd"

---------

Signed-off-by: Michael Tyler <michael.tyler@arm.com>
1 parent 22437b5 commit 904b850

34 files changed

+1396
-426
lines changed

cmake/CMakeLists.txt

Lines changed: 4 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# Copyright (c) Microsoft Corporation. All rights reserved.
2+
# SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
23
# Licensed under the MIT License.
34

45
# Minimum CMake required
@@ -132,11 +133,6 @@ option(onnxruntime_USE_DML "Build with DirectML support" OFF)
132133
option(onnxruntime_USE_MIGRAPHX "Build with AMDMIGraphX support" OFF)
133134
option(onnxruntime_USE_WINML "Build with WinML support" OFF)
134135
option(onnxruntime_USE_ACL "Build with ACL support" OFF)
135-
option(onnxruntime_USE_ACL_1902 "Build with ACL version 1902 support" OFF)
136-
option(onnxruntime_USE_ACL_1905 "Build with ACL version 1905 support" OFF)
137-
option(onnxruntime_USE_ACL_1908 "Build with ACL version 1908 support" OFF)
138-
option(onnxruntime_USE_ACL_2002 "Build with ACL version 2002 support" OFF)
139-
option(onnxruntime_USE_ACL_2308 "Build with ACL version 2308 support" OFF)
140136
option(onnxruntime_USE_ARMNN "Build with ArmNN support" OFF)
141137
option(onnxruntime_ARMNN_RELU_USE_CPU "Use the CPU implementation for the Relu operator for the ArmNN EP" ON)
142138
option(onnxruntime_ARMNN_BN_USE_CPU "Use the CPU implementation for the Batch Normalization operator for the ArmNN EP" ON)
@@ -1207,44 +1203,22 @@ function(onnxruntime_add_include_to_target dst_target)
12071203
endfunction()
12081204

12091205
# ACL
1210-
if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 OR onnxruntime_USE_ACL_1908 OR onnxruntime_USE_ACL_2002 OR onnxruntime_USE_ACL_2308)
1206+
if (onnxruntime_USE_ACL)
12111207
set(onnxruntime_USE_ACL ON)
1212-
if (onnxruntime_USE_ACL_1902)
1213-
add_definitions(-DACL_1902=1)
1214-
else()
1215-
if (onnxruntime_USE_ACL_1908)
1216-
add_definitions(-DACL_1908=1)
1217-
else()
1218-
if (onnxruntime_USE_ACL_2002)
1219-
add_definitions(-DACL_2002=1)
1220-
else()
1221-
if (onnxruntime_USE_ACL_2308)
1222-
add_definitions(-DACL_2308=1)
1223-
else()
1224-
add_definitions(-DACL_1905=1)
1225-
endif()
1226-
endif()
1227-
endif()
1228-
endif()
12291208

12301209
if (NOT ${onnxruntime_ACL_LIBS} STREQUAL "")
12311210
add_library(arm_compute SHARED IMPORTED)
12321211
set_target_properties(arm_compute PROPERTIES
12331212
IMPORTED_NO_SONAME 1
12341213
IMPORTED_LOCATION "${onnxruntime_ACL_LIBS}/libarm_compute.so")
12351214

1236-
add_library(arm_compute_core SHARED IMPORTED)
1237-
set_target_properties(arm_compute_core PROPERTIES
1238-
IMPORTED_NO_SONAME 1
1239-
IMPORTED_LOCATION "${onnxruntime_ACL_LIBS}/libarm_compute_core.so")
1240-
12411215
add_library(arm_compute_graph SHARED IMPORTED)
12421216
set_target_properties(arm_compute_graph PROPERTIES
12431217
IMPORTED_NO_SONAME 1
12441218
IMPORTED_LOCATION "${onnxruntime_ACL_LIBS}/libarm_compute_graph.so")
12451219
endif()
12461220

1247-
list(APPEND onnxruntime_EXTERNAL_LIBRARIES arm_compute arm_compute_core arm_compute_graph)
1221+
list(APPEND onnxruntime_EXTERNAL_LIBRARIES arm_compute arm_compute_graph)
12481222

12491223
endif()
12501224

@@ -1263,11 +1237,6 @@ if (onnxruntime_USE_ARMNN)
12631237
IMPORTED_NO_SONAME 1
12641238
IMPORTED_LOCATION "${onnxruntime_ACL_LIBS}/libarm_compute.so")
12651239

1266-
add_library(arm_compute_core SHARED IMPORTED)
1267-
set_target_properties(arm_compute_core PROPERTIES
1268-
IMPORTED_NO_SONAME 1
1269-
IMPORTED_LOCATION "${onnxruntime_ACL_LIBS}/libarm_compute_core.so")
1270-
12711240
add_library(arm_compute_graph SHARED IMPORTED)
12721241
set_target_properties(arm_compute_graph PROPERTIES
12731242
IMPORTED_NO_SONAME 1
@@ -1281,7 +1250,7 @@ if (onnxruntime_USE_ARMNN)
12811250
IMPORTED_LOCATION "${onnxruntime_ARMNN_LIBS}/libarmnn.so")
12821251
endif()
12831252

1284-
list(APPEND onnxruntime_EXTERNAL_LIBRARIES armnn arm_compute arm_compute_core arm_compute_graph)
1253+
list(APPEND onnxruntime_EXTERNAL_LIBRARIES armnn arm_compute arm_compute_graph)
12851254
endif()
12861255

12871256
if (onnxruntime_USE_DNNL)

include/onnxruntime/core/providers/acl/acl_provider_factory.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
23
// Licensed under the MIT License.
34

45
#include "onnxruntime_c_api.h"
@@ -10,7 +11,8 @@ extern "C" {
1011
/**
1112
* \param enable_fast_math Enable fast math mode in ACL.
1213
*/
13-
ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_ACL, _In_ OrtSessionOptions* options, int use_arena)
14+
ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_ACL, _In_ OrtSessionOptions* options,
15+
bool enable_fast_math)
1416
ORT_ALL_ARGS_NONNULL;
1517

1618
#ifdef __cplusplus

java/src/main/java/ai/onnxruntime/OrtSession.java

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*
22
* Copyright (c) 2019, 2024, Oracle and/or its affiliates. All rights reserved.
3+
* SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
34
* Licensed under the MIT License.
45
*/
56
package ai.onnxruntime;
@@ -1181,12 +1182,12 @@ public void addDirectML(int deviceId) throws OrtException {
11811182
/**
11821183
* Adds the ARM Compute Library as an execution backend.
11831184
*
1184-
* @param useArena If true use the arena memory allocator.
1185+
* @param enableFastMath Enable fast math mode in ACL.
11851186
* @throws OrtException If there was an error in native code.
11861187
*/
1187-
public void addACL(boolean useArena) throws OrtException {
1188+
public void addACL(boolean enableFastMath) throws OrtException {
11881189
checkClosed();
1189-
addACL(OnnxRuntime.ortApiHandle, nativeHandle, useArena ? 1 : 0);
1190+
addACL(OnnxRuntime.ortApiHandle, nativeHandle, enableFastMath);
11901191
}
11911192

11921193
/**
@@ -1354,7 +1355,8 @@ private native void addTvm(long apiHandle, long nativeHandle, String settings)
13541355
private native void addDirectML(long apiHandle, long nativeHandle, int deviceId)
13551356
throws OrtException;
13561357

1357-
private native void addACL(long apiHandle, long nativeHandle, int useArena) throws OrtException;
1358+
private native void addACL(long apiHandle, long nativeHandle, boolean enableFastMath)
1359+
throws OrtException;
13581360

13591361
private native void addArmNN(long apiHandle, long nativeHandle, int useArena)
13601362
throws OrtException;

java/src/main/native/ai_onnxruntime_OrtSession_SessionOptions.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*
22
* Copyright (c) 2019, 2023 Oracle and/or its affiliates. All rights reserved.
3+
* SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
34
* Licensed under the MIT License.
45
*/
56
#include <jni.h>
@@ -644,12 +645,13 @@ JNIEXPORT void JNICALL Java_ai_onnxruntime_OrtSession_00024SessionOptions_addDir
644645
* Signature: (JJZ)V
645646
*/
646647
JNIEXPORT void JNICALL Java_ai_onnxruntime_OrtSession_00024SessionOptions_addACL
647-
(JNIEnv * jniEnv, jobject jobj, jlong apiHandle, jlong handle, jint useArena) {
648+
(JNIEnv * jniEnv, jobject jobj, jlong apiHandle, jlong handle, jboolean enableFastMath) {
648649
(void)jobj;
649650
#ifdef USE_ACL
650-
checkOrtStatus(jniEnv,(const OrtApi*)apiHandle,OrtSessionOptionsAppendExecutionProvider_ACL((OrtSessionOptions*) handle,useArena));
651+
checkOrtStatus(jniEnv,(const OrtApi*)apiHandle,
652+
OrtSessionOptionsAppendExecutionProvider_ACL((OrtSessionOptions*) handle, enableFastMath));
651653
#else
652-
(void)apiHandle;(void)handle;(void)useArena; // Parameters used when ACL is defined.
654+
(void)apiHandle;(void)handle;(void)enableFastMath; // Parameters used when ACL is defined.
653655
throwOrtException(jniEnv,convertErrorCode(ORT_INVALID_ARGUMENT),"This binary was not compiled with ACL support.");
654656
#endif
655657
}

onnxruntime/core/optimizer/graph_transformer_utils.cc

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
23
// Licensed under the MIT License.
34

45
#include "core/optimizer/graph_transformer_utils.h"
@@ -196,6 +197,8 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
196197
session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsDisableQuantQDQ, "0") == "1";
197198
#ifndef DISABLE_CONTRIB_OPS
198199
const InlinedHashSet<std::string_view> cpu_ep = {onnxruntime::kCpuExecutionProvider};
200+
const InlinedHashSet<std::string_view> cpu_acl_eps = {onnxruntime::kCpuExecutionProvider,
201+
onnxruntime::kAclExecutionProvider};
199202
#endif
200203
const InlinedHashSet<std::string_view> dml_ep = {onnxruntime::kDmlExecutionProvider};
201204
AllocatorPtr cpu_allocator = std::make_shared<CPUAllocator>();
@@ -285,6 +288,11 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
285288
onnxruntime::kCudaExecutionProvider,
286289
onnxruntime::kRocmExecutionProvider,
287290
onnxruntime::kDmlExecutionProvider};
291+
const InlinedHashSet<std::string_view> cpu_acl_cuda_dml_rocm_eps = {onnxruntime::kCpuExecutionProvider,
292+
onnxruntime::kAclExecutionProvider,
293+
onnxruntime::kCudaExecutionProvider,
294+
onnxruntime::kRocmExecutionProvider,
295+
onnxruntime::kDmlExecutionProvider};
288296
const InlinedHashSet<std::string_view> cpu_rocm_acl_armnn_js_eps = {onnxruntime::kCpuExecutionProvider,
289297
onnxruntime::kRocmExecutionProvider,
290298
onnxruntime::kAclExecutionProvider,
@@ -296,8 +304,9 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
296304
onnxruntime::kAclExecutionProvider,
297305
onnxruntime::kArmNNExecutionProvider,
298306
onnxruntime::kJsExecutionProvider};
299-
const InlinedHashSet<std::string_view> cpu_dml_eps = {onnxruntime::kCpuExecutionProvider,
300-
onnxruntime::kDmlExecutionProvider};
307+
const InlinedHashSet<std::string_view> cpu_dml_acl_eps = {onnxruntime::kCpuExecutionProvider,
308+
onnxruntime::kDmlExecutionProvider,
309+
onnxruntime::kAclExecutionProvider};
301310
const int64_t qdq_matmulnbits_accuracy_level =
302311
ParseStringWithClassicLocale<int64_t>(
303312
session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel,
@@ -323,26 +332,26 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
323332
}
324333

325334
transformers.emplace_back(std::make_unique<GemmActivationFusion>(cpu_ep));
326-
transformers.emplace_back(std::make_unique<MatMulIntegerToFloatFusion>(cpu_dml_eps));
327-
transformers.emplace_back(std::make_unique<DynamicQuantizeMatMulFusion>(cpu_ep));
335+
transformers.emplace_back(std::make_unique<MatMulIntegerToFloatFusion>(cpu_dml_acl_eps));
336+
transformers.emplace_back(std::make_unique<DynamicQuantizeMatMulFusion>(cpu_acl_eps));
328337

329338
transformers.emplace_back(std::make_unique<ConvActivationFusion>(cpu_rocm_acl_armnn_js_eps));
330339

331-
transformers.emplace_back(std::make_unique<GeluFusion>(cpu_cuda_dml_rocm_eps, level));
332-
transformers.emplace_back(std::make_unique<LayerNormFusion>(cpu_cuda_dml_rocm_eps, level));
340+
transformers.emplace_back(std::make_unique<GeluFusion>(cpu_acl_cuda_dml_rocm_eps, level));
341+
transformers.emplace_back(std::make_unique<LayerNormFusion>(cpu_acl_cuda_dml_rocm_eps, level));
333342
transformers.emplace_back(std::make_unique<SimplifiedLayerNormFusion>(cpu_cuda_rocm_eps));
334-
transformers.emplace_back(std::make_unique<AttentionFusion>(cpu_cuda_dml_rocm_eps));
335-
transformers.emplace_back(std::make_unique<EmbedLayerNormFusion>(cpu_cuda_dml_rocm_eps));
343+
transformers.emplace_back(std::make_unique<AttentionFusion>(cpu_acl_cuda_dml_rocm_eps));
344+
transformers.emplace_back(std::make_unique<EmbedLayerNormFusion>(cpu_acl_cuda_dml_rocm_eps));
336345
transformers.emplace_back(std::make_unique<GatherSliceToSplitFusion>(cpu_cuda_rocm_eps));
337346
transformers.emplace_back(std::make_unique<GatherToSliceFusion>(cpu_cuda_rocm_eps));
338347

339348
transformers.emplace_back(std::make_unique<MatmulTransposeFusion>(cpu_cuda_dml_rocm_eps));
340-
transformers.emplace_back(std::make_unique<BiasGeluFusion>(cpu_cuda_dml_rocm_eps));
349+
transformers.emplace_back(std::make_unique<BiasGeluFusion>(cpu_acl_cuda_dml_rocm_eps));
341350

342-
transformers.emplace_back(std::make_unique<SkipLayerNormFusion>(cpu_cuda_dml_rocm_eps));
351+
transformers.emplace_back(std::make_unique<SkipLayerNormFusion>(cpu_acl_cuda_dml_rocm_eps));
343352

344353
transformers.emplace_back(std::make_unique<FastGeluFusion>(cpu_cuda_dml_rocm_eps));
345-
transformers.emplace_back(std::make_unique<QuickGeluFusion>(cpu_cuda_dml_rocm_eps));
354+
transformers.emplace_back(std::make_unique<QuickGeluFusion>(cpu_acl_cuda_dml_rocm_eps));
346355

347356
// GeluApproximation has side effects which may change results. It needs to be manually enabled,
348357
// or alternatively the model can be updated offline using a model conversion script
@@ -367,7 +376,7 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
367376
transformers.emplace_back(std::make_unique<SceLossGradBiasFusion>(cpu_cuda_rocm_eps));
368377
#endif
369378

370-
transformers.emplace_back(std::make_unique<MatMulScaleFusion>(cpu_cuda_dml_rocm_eps));
379+
transformers.emplace_back(std::make_unique<MatMulScaleFusion>(cpu_acl_cuda_dml_rocm_eps));
371380
transformers.emplace_back(std::make_unique<MatMulActivationFusion>(dml_ep));
372381

373382
#ifdef MLAS_TARGET_AMD64_IX86

onnxruntime/core/optimizer/nhwc_transformer.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
23
// Licensed under the MIT License.
34

45
#include <deque>
@@ -183,7 +184,8 @@ Status NhwcTransformer::ApplyImpl(Graph& graph, bool& modified, int graph_level,
183184
modified = false;
184185
for (std::unique_ptr<api::NodeRef>& node : api_graph->Nodes()) {
185186
// If the node is not assigned to the CPU or ACL EP, skip it
186-
if (node->GetExecutionProviderType() != kCpuExecutionProvider) {
187+
const auto ep = node->GetExecutionProviderType();
188+
if ((ep != kCpuExecutionProvider) && (ep != kAclExecutionProvider)) {
187189
continue;
188190
}
189191

onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
23
// Licensed under the MIT License.
34

45
#include <memory>
@@ -381,9 +382,9 @@ QDQSelectorActionTransformer::QDQSelectorActionTransformer(
381382
CreateSelectorActionRegistry(is_int8_allowed, qdq_matmulnbits_accuracy_level,
382383
intra_op_thread_pool, p_buffered_tensors),
383384
apply_context,
384-
// this transformer is compatible with CPU, DML and CUDA EP.
385+
// this transformer is compatible with CPU, DML, ACL and CUDA EP.
385386
// There is further EP control on the rule level.
386-
{kCpuExecutionProvider, kDmlExecutionProvider, kCudaExecutionProvider}} {
387+
{kCpuExecutionProvider, kDmlExecutionProvider, kAclExecutionProvider, kCudaExecutionProvider}} {
387388
}
388389

389390
} // namespace onnxruntime

0 commit comments

Comments
 (0)