Skip to content

Commit 35d7d29

Browse files
authored
Merge branch 'microsoft:main' into img_patch1
2 parents cb50485 + 983c4d5 commit 35d7d29

File tree

136 files changed

+3699
-1839
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

136 files changed

+3699
-1839
lines changed

.github/workflows/publish-python-apidocs.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ permissions:
2222
jobs:
2323
build:
2424
name: Generate Python API docs
25-
runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-ubuntu-CPU"]
25+
runs-on: ubuntu-latest
2626
steps:
2727
- uses: actions/checkout@v4
2828
- name: Install tools

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@
77
[submodule "cmake/external/emsdk"]
88
path = cmake/external/emsdk
99
url = https://github.com/emscripten-core/emsdk.git
10-
branch = 3.1.62
10+
branch = 3.1.59

cgmanifests/generated/cgmanifest.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"component": {
77
"type": "git",
88
"git": {
9-
"commitHash": "0fde04880048f743056bed17cb0543a42e040fae",
9+
"commitHash": "d52c46520124845b1e0e0525f2759299d840143f",
1010
"repositoryUrl": "https://github.com/emscripten-core/emsdk.git"
1111
},
1212
"comments": "git submodule at cmake/external/emsdk"

cmake/external/cuDNN.cmake

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,5 +107,3 @@ elseif(CUDNN_MAJOR_VERSION EQUAL 9)
107107
CUDNN::cudnn_heuristic
108108
)
109109
endif()
110-
111-
mark_as_advanced(CUDNN_INCLUDE_DIR)

cmake/onnxruntime_providers_vitisai.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
)
2020
source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_vitisai_cc_srcs})
2121
onnxruntime_add_shared_library(onnxruntime_providers_vitisai ${onnxruntime_providers_vitisai_cc_srcs})
22-
onnxruntime_add_include_to_target(onnxruntime_providers_vitisai ${ONNXRUNTIME_PROVIDERS_SHARED} ${GSL_TARGET} nlohmann_json::nlohmann_json safeint_interface flatbuffers::flatbuffers)
22+
onnxruntime_add_include_to_target(onnxruntime_providers_vitisai ${ONNXRUNTIME_PROVIDERS_SHARED} ${GSL_TARGET} safeint_interface flatbuffers::flatbuffers)
2323
target_link_libraries(onnxruntime_providers_vitisai PRIVATE ${ONNXRUNTIME_PROVIDERS_SHARED})
2424
if(MSVC)
2525
onnxruntime_add_include_to_target(onnxruntime_providers_vitisai dbghelp)

cmake/onnxruntime_rocm_hipify.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ find_package(Python3 COMPONENTS Interpreter REQUIRED)
55

66
# GLOB pattern of file to be excluded
77
set(contrib_ops_excluded_files
8+
"bert/cudnn_fmha/*"
89
"bert/cutlass_fmha/*"
910
"bert/fastertransformer_decoder_attention/*"
1011
"bert/flash_attention/*"

cmake/onnxruntime_unittests.cmake

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -887,9 +887,10 @@ if (MSVC)
887887
target_compile_options(onnxruntime_test_all PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /wd4244>"
888888
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd4244>")
889889

890-
# Avoid this compile error in graph_transform_test.cc:
890+
# Avoid this compile error in graph_transform_test.cc and qdq_transformer_test.cc:
891891
# fatal error C1128: number of sections exceeded object file format limit: compile with /bigobj
892892
set_property(SOURCE "${TEST_SRC_DIR}/optimizer/graph_transform_test.cc"
893+
"${TEST_SRC_DIR}/optimizer/qdq_transformer_test.cc"
893894
APPEND PROPERTY COMPILE_OPTIONS "/bigobj")
894895
else()
895896
target_compile_options(onnxruntime_test_all PRIVATE "-Wno-parentheses")

docs/OperatorKernels.md

Lines changed: 22 additions & 15 deletions
Large diffs are not rendered by default.

docs/python/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ pandas
1313
pydot
1414
coloredlogs
1515
flatbuffers
16-
numpy
16+
numpy<2.0.0
1717
packaging
1818
protobuf
1919
sympy

include/onnxruntime/core/graph/graph.h

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1139,16 +1139,48 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
11391139
const ONNX_NAMESPACE::GraphProto& ToGraphProto();
11401140
ONNX_NAMESPACE::GraphProto ToGraphProto() const;
11411141

1142+
// Options to align external initializer offset.
1143+
// For models running on CPU, ORT will try to use mmap to load external initializers.
1144+
// To use mmap, external initializer need to be offset aligned.
1145+
// ORT saves external initializers into a single data file; each initializer is accessed with
1146+
// offset(start position of initializer) and length(byte length of initializer) of the data file.
1147+
// To use mmap, each offset needs to be aligned, which means the offset must be divisible by the
1148+
// allocation granularity(64KB for windows and 4K for other OSes).
1149+
// With align_offset set to true, ORT will align the offset for large initializers when
1150+
// save ONNX model with external data file.
1151+
struct OffsetAlignmentInfo {
1152+
// Offset will always be page aligned and allocation granularity aligned for mmap support.
1153+
// This is done by padding previous tensor data with zeros keeping same length.
1154+
bool align_offset = false;
1155+
// Alignment threshold for size of data.
1156+
// Having a low threshold will waste file space for small initializers.
1157+
// Only when tensor's data size is > the page_align_threshold it will be force aligned.
1158+
// Default to 1MB.
1159+
int64_t align_threshold = 1048576;
1160+
// The allocation Granularity for mmap() support.
1161+
// Typically 64KB for Windows & 4KB for other OSes. Default to 64KB.
1162+
int64_t allocation_granularity = 65536;
1163+
};
1164+
11421165
/** Gets the GraphProto representation of this Graph
11431166
@param external_file_path File path of the binary file to use for initializers.
11441167
@param model_file_path path of the model file.
11451168
@param initializer_size_threshold initializers larger or equal to this threshold (in bytes) are saved
11461169
in the external file. Initializer smaller than this threshold are included in the onnx file.
1170+
@param align_info offset alignment info.
11471171
@returns GraphProto serialization of the graph.
11481172
*/
11491173
ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
11501174
const std::filesystem::path& model_file_path,
1151-
size_t initializer_size_threshold) const;
1175+
size_t initializer_size_threshold,
1176+
const OffsetAlignmentInfo& align_info) const;
1177+
1178+
ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
1179+
const std::filesystem::path& model_file_path,
1180+
size_t initializer_size_threshold) const {
1181+
OffsetAlignmentInfo default_options;
1182+
return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options);
1183+
}
11521184

11531185
/** Gets the ISchemaRegistry instances being used with this Graph. */
11541186
IOnnxRuntimeOpSchemaCollectionPtr GetSchemaRegistry() const;

include/onnxruntime/core/graph/graph_nodes.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,13 +117,14 @@ class ValidNodes {
117117
return (current_ != other.current_);
118118
}
119119

120-
void operator++() {
120+
NodeIterator<TIterator>& operator++() {
121121
if (current_ < end_) {
122122
while (++current_ != end_) {
123123
if (*current_ != nullptr && (!apply_filter_ || (*filter_func_)((*current_)->Index()) == false))
124124
break;
125125
}
126126
}
127+
return *this;
127128
}
128129

129130
NodeIterator<TIterator> operator++(int) {

include/onnxruntime/core/session/onnxruntime_c_api.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -621,6 +621,7 @@ typedef struct OrtMIGraphXProviderOptions {
621621
const char* migraphx_save_model_path; // migraphx model path name
622622
int migraphx_load_compiled_model; // migraphx int8 cal table. Default 0 = false, nonzero = true
623623
const char* migraphx_load_model_path; // migraphx model path name
624+
bool migraphx_exhaustive_tune; // migraphx tuned compile Default = false
624625
} OrtMIGraphXProviderOptions;
625626

626627
/** \brief OpenVINO Provider Options

js/web/docs/webnn-operators.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim
2828
| Dropout | ai.onnx(7-9, 10-11, 12, 13-21, 22+) | identity ||| Only supports test mode |
2929
| Elu | ai.onnx(7+) | elu ||| WebNN CPU backend only supports 'alpha' value is 1.0 |
3030
| Equal | ai.onnx(7-10, 11-12, 13-18, 19+) | equal ||| |
31-
| Erf | ai.onnx(7-9, 10-12, 13+) | erf | || |
31+
| Erf | ai.onnx(7-9, 10-12, 13+) | erf | || |
3232
| Exp | ai.onnx(7-12, 13+) | exp ||| |
3333
| Expand | ai.onnx(8-12, 13+) | expand ||| 'shape' input should be a constant |
3434
| Flatten | ai.onnx(7-8, 9-10, 11-12, 13-20, 21+) | reshape ||| |
@@ -89,6 +89,6 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim
8989
| Tan | ai.onnx(7+) | tan ||| |
9090
| Tanh | ai.onnx(7-12, 13+) | tanh ||| |
9191
| Transpose | ai.onnx(7-12, 13-20, 21+) | transpose ||| |
92-
| Trilu | ai.onnx(14+) | triangular | || Input 'k' (option 'diagonal' for WebNN) if present should be a constant |
92+
| Trilu | ai.onnx(14+) | triangular | || Input 'k' (option 'diagonal' for WebNN) if present should be a constant |
9393
| Unsqueeze | ai.onnx(7-10, 11-12, 13-20, 21+) | reshape ||| |
9494
| Where | ai.onnx(7-8, 9-15, 16+) | where ||| |

js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ export const createConv2DMatMulProgramInfo = (
182182
dimInner: number,
183183
hasBias: boolean,
184184
sequentialAccessByThreads: boolean,
185+
squeezeOutputShapeFunction?: (shape: readonly number[]) => number[],
185186
): ProgramInfo => {
186187
const isChannelsLast = attributes.format === 'NHWC';
187188
const inChannels = isChannelsLast ? inputs[0].dims[3] : inputs[0].dims[1];
@@ -309,13 +310,16 @@ export const createConv2DMatMulProgramInfo = (
309310
return {
310311
name: 'Conv2DMatMul',
311312
shaderCache: {
312-
hint: `${attributes.cacheKey};${innerElementSize};${isVec4};${fitAOuter};${fitBOuter};${fitInner};${
313-
tileAOuter
314-
};${tileBOuter};${tileInner}`,
313+
hint: `${attributes.cacheKey};${innerElementSize};${isVec4};${fitAOuter};${fitBOuter};${fitInner};${tileAOuter};${tileBOuter};${tileInner}`,
315314
inputDependencies,
316315
},
317316
getRunData: () => ({
318-
outputs: [{ dims: outputShape, dataType: inputs[0].dataType }],
317+
outputs: [
318+
{
319+
dims: squeezeOutputShapeFunction ? squeezeOutputShapeFunction(outputShape) : outputShape,
320+
dataType: inputs[0].dataType,
321+
},
322+
],
319323
dispatchGroup: { x: dispatch[0], y: dispatch[1], z: dispatch[2] },
320324
programUniforms,
321325
}),

js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -110,13 +110,9 @@ export const makeMatMulPackedVec4Source = (
110110
workPerThread[0] === 4
111111
)
112112
) {
113-
throw new Error(`If transposeA ${transposeA} is true, innerElementSize ${
114-
innerElementSize
115-
} and workPerThread[1] ${workPerThread[1]} must be 4.
113+
throw new Error(`If transposeA ${transposeA} is true, innerElementSize ${innerElementSize} and workPerThread[1] ${workPerThread[1]} must be 4.
116114
Otherwise, innerElementSize ${innerElementSize} must be 3 or 4.
117-
tileAWidth ${tileAWidth} must be divisible by workgroupSize[0]${workgroupSize[0]}. tileInner ${
118-
tileInner
119-
} must be divisible by workgroupSize[1] ${workgroupSize[1]}. colPerThread ${workPerThread[0]} must be 4.`);
115+
tileAWidth ${tileAWidth} must be divisible by workgroupSize[0]${workgroupSize[0]}. tileInner ${tileInner} must be divisible by workgroupSize[1] ${workgroupSize[1]}. colPerThread ${workPerThread[0]} must be 4.`);
120116
}
121117
return `
122118
var<workgroup> mm_Asub: array<array<vec${innerElementSize}<${type}>, ${tileAWidth / innerElementSize}>, ${tileAHight}>;
@@ -227,11 +223,7 @@ export const makeMatMulPackedSource = (
227223
!(tileAHight % workgroupSize[1] === 0 && tileAWidth % workgroupSize[0] === 0 && tileInner % workgroupSize[1] === 0)
228224
) {
229225
throw new Error(
230-
`tileAHight ${tileAHight} must be divisible by workgroupSize[1]${
231-
workgroupSize[1]
232-
}, tileAWidth ${tileAWidth} must be divisible by workgroupSize[0]${
233-
workgroupSize[0]
234-
}, tileInner ${tileInner} must be divisible by workgroupSize[1]${workgroupSize[1]}`,
226+
`tileAHight ${tileAHight} must be divisible by workgroupSize[1]${workgroupSize[1]}, tileAWidth ${tileAWidth} must be divisible by workgroupSize[0]${workgroupSize[0]}, tileInner ${tileInner} must be divisible by workgroupSize[1]${workgroupSize[1]}`,
235227
);
236228
}
237229
const rowPerThreadA = tileAHight / workgroupSize[1];
@@ -470,6 +462,7 @@ export const createMatmulProgramInfo = (
470462
outputShape: readonly number[],
471463
reshapedOutputShape?: readonly number[],
472464
isChannelsLast = false /* only used for conv2dByMatMul*/,
465+
squeezeOutputShapeFunction?: (shape: readonly number[]) => number[],
473466
): ProgramInfo => {
474467
const aShape = inputs[0].dims;
475468
const bShape = inputs[1].dims;
@@ -562,7 +555,12 @@ export const createMatmulProgramInfo = (
562555
inputDependencies,
563556
},
564557
getRunData: () => ({
565-
outputs: [{ dims: outputShape, dataType: inputs[0].dataType }],
558+
outputs: [
559+
{
560+
dims: squeezeOutputShapeFunction ? squeezeOutputShapeFunction(outputShape) : outputShape,
561+
dataType: inputs[0].dataType,
562+
},
563+
],
566564
dispatchGroup: { x: dispatch[0], y: dispatch[1], z: dispatch[2] },
567565
programUniforms,
568566
}),

0 commit comments

Comments
 (0)