Skip to content

Commit 35d7d29

Browse files
authored
Merge branch 'microsoft:main' into img_patch1
2 parents cb50485 + 983c4d5 commit 35d7d29

File tree

136 files changed

+3699
-1839
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

136 files changed

+3699
-1839
lines changed

.github/workflows/publish-python-apidocs.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ permissions:
2222
jobs:
2323
build:
2424
name: Generate Python API docs
25-
runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-ubuntu-CPU"]
25+
runs-on: ubuntu-latest
2626
steps:
2727
- uses: actions/checkout@v4
2828
- name: Install tools

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@
77
[submodule "cmake/external/emsdk"]
88
path = cmake/external/emsdk
99
url = https://github.com/emscripten-core/emsdk.git
10-
branch = 3.1.62
10+
branch = 3.1.59

cgmanifests/generated/cgmanifest.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"component": {
77
"type": "git",
88
"git": {
9-
"commitHash": "0fde04880048f743056bed17cb0543a42e040fae",
9+
"commitHash": "d52c46520124845b1e0e0525f2759299d840143f",
1010
"repositoryUrl": "https://github.com/emscripten-core/emsdk.git"
1111
},
1212
"comments": "git submodule at cmake/external/emsdk"

cmake/external/cuDNN.cmake

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,5 +107,3 @@ elseif(CUDNN_MAJOR_VERSION EQUAL 9)
107107
CUDNN::cudnn_heuristic
108108
)
109109
endif()
110-
111-
mark_as_advanced(CUDNN_INCLUDE_DIR)

cmake/onnxruntime_providers_vitisai.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
)
2020
source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_vitisai_cc_srcs})
2121
onnxruntime_add_shared_library(onnxruntime_providers_vitisai ${onnxruntime_providers_vitisai_cc_srcs})
22-
onnxruntime_add_include_to_target(onnxruntime_providers_vitisai ${ONNXRUNTIME_PROVIDERS_SHARED} ${GSL_TARGET} nlohmann_json::nlohmann_json safeint_interface flatbuffers::flatbuffers)
22+
onnxruntime_add_include_to_target(onnxruntime_providers_vitisai ${ONNXRUNTIME_PROVIDERS_SHARED} ${GSL_TARGET} safeint_interface flatbuffers::flatbuffers)
2323
target_link_libraries(onnxruntime_providers_vitisai PRIVATE ${ONNXRUNTIME_PROVIDERS_SHARED})
2424
if(MSVC)
2525
onnxruntime_add_include_to_target(onnxruntime_providers_vitisai dbghelp)

cmake/onnxruntime_rocm_hipify.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ find_package(Python3 COMPONENTS Interpreter REQUIRED)
55

66
# GLOB pattern of file to be excluded
77
set(contrib_ops_excluded_files
8+
"bert/cudnn_fmha/*"
89
"bert/cutlass_fmha/*"
910
"bert/fastertransformer_decoder_attention/*"
1011
"bert/flash_attention/*"

cmake/onnxruntime_unittests.cmake

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -887,9 +887,10 @@ if (MSVC)
887887
target_compile_options(onnxruntime_test_all PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /wd4244>"
888888
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd4244>")
889889

890-
# Avoid this compile error in graph_transform_test.cc:
890+
# Avoid this compile error in graph_transform_test.cc and qdq_transformer_test.cc:
891891
# fatal error C1128: number of sections exceeded object file format limit: compile with /bigobj
892892
set_property(SOURCE "${TEST_SRC_DIR}/optimizer/graph_transform_test.cc"
893+
"${TEST_SRC_DIR}/optimizer/qdq_transformer_test.cc"
893894
APPEND PROPERTY COMPILE_OPTIONS "/bigobj")
894895
else()
895896
target_compile_options(onnxruntime_test_all PRIVATE "-Wno-parentheses")

docs/OperatorKernels.md

Lines changed: 22 additions & 15 deletions
Large diffs are not rendered by default.

docs/python/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ pandas
1313
pydot
1414
coloredlogs
1515
flatbuffers
16-
numpy
16+
numpy<2.0.0
1717
packaging
1818
protobuf
1919
sympy

include/onnxruntime/core/graph/graph.h

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1139,16 +1139,48 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
11391139
const ONNX_NAMESPACE::GraphProto& ToGraphProto();
11401140
ONNX_NAMESPACE::GraphProto ToGraphProto() const;
11411141

1142+
// Options to align external initializer offset.
1143+
// For models running on CPU, ORT will try to use mmap to load external initializers.
1144+
// To use mmap, external initializer need to be offset aligned.
1145+
// ORT saves external initializers into a single data file; each initializer is accessed with
1146+
// offset(start position of initializer) and length(byte length of initializer) of the data file.
1147+
// To use mmap, each offset needs to be aligned, which means the offset must be divisible by the
1148+
// allocation granularity(64KB for windows and 4K for other OSes).
1149+
// With align_offset set to true, ORT will align the offset for large initializers when
1150+
// save ONNX model with external data file.
1151+
struct OffsetAlignmentInfo {
1152+
// Offset will always be page aligned and allocation granularity aligned for mmap support.
1153+
// This is done by padding previous tensor data with zeros keeping same length.
1154+
bool align_offset = false;
1155+
// Alignment threshold for size of data.
1156+
// Having a low threshold will waste file space for small initializers.
1157+
// Only when tensor's data size is > the page_align_threshold it will be force aligned.
1158+
// Default to 1MB.
1159+
int64_t align_threshold = 1048576;
1160+
// The allocation Granularity for mmap() support.
1161+
// Typically 64KB for Windows & 4KB for other OSes. Default to 64KB.
1162+
int64_t allocation_granularity = 65536;
1163+
};
1164+
11421165
/** Gets the GraphProto representation of this Graph
11431166
@param external_file_path File path of the binary file to use for initializers.
11441167
@param model_file_path path of the model file.
11451168
@param initializer_size_threshold initializers larger or equal to this threshold (in bytes) are saved
11461169
in the external file. Initializer smaller than this threshold are included in the onnx file.
1170+
@param align_info offset alignment info.
11471171
@returns GraphProto serialization of the graph.
11481172
*/
11491173
ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
11501174
const std::filesystem::path& model_file_path,
1151-
size_t initializer_size_threshold) const;
1175+
size_t initializer_size_threshold,
1176+
const OffsetAlignmentInfo& align_info) const;
1177+
1178+
ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
1179+
const std::filesystem::path& model_file_path,
1180+
size_t initializer_size_threshold) const {
1181+
OffsetAlignmentInfo default_options;
1182+
return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options);
1183+
}
11521184

11531185
/** Gets the ISchemaRegistry instances being used with this Graph. */
11541186
IOnnxRuntimeOpSchemaCollectionPtr GetSchemaRegistry() const;

include/onnxruntime/core/graph/graph_nodes.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,13 +117,14 @@ class ValidNodes {
117117
return (current_ != other.current_);
118118
}
119119

120-
void operator++() {
120+
NodeIterator<TIterator>& operator++() {
121121
if (current_ < end_) {
122122
while (++current_ != end_) {
123123
if (*current_ != nullptr && (!apply_filter_ || (*filter_func_)((*current_)->Index()) == false))
124124
break;
125125
}
126126
}
127+
return *this;
127128
}
128129

129130
NodeIterator<TIterator> operator++(int) {

include/onnxruntime/core/session/onnxruntime_c_api.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -621,6 +621,7 @@ typedef struct OrtMIGraphXProviderOptions {
621621
const char* migraphx_save_model_path; // migraphx model path name
622622
int migraphx_load_compiled_model; // migraphx int8 cal table. Default 0 = false, nonzero = true
623623
const char* migraphx_load_model_path; // migraphx model path name
624+
bool migraphx_exhaustive_tune; // migraphx tuned compile Default = false
624625
} OrtMIGraphXProviderOptions;
625626

626627
/** \brief OpenVINO Provider Options

js/web/docs/webnn-operators.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim
2828
| Dropout | ai.onnx(7-9, 10-11, 12, 13-21, 22+) | identity ||| Only supports test mode |
2929
| Elu | ai.onnx(7+) | elu ||| WebNN CPU backend only supports 'alpha' value is 1.0 |
3030
| Equal | ai.onnx(7-10, 11-12, 13-18, 19+) | equal ||| |
31-
| Erf | ai.onnx(7-9, 10-12, 13+) | erf | || |
31+
| Erf | ai.onnx(7-9, 10-12, 13+) | erf | || |
3232
| Exp | ai.onnx(7-12, 13+) | exp ||| |
3333
| Expand | ai.onnx(8-12, 13+) | expand ||| 'shape' input should be a constant |
3434
| Flatten | ai.onnx(7-8, 9-10, 11-12, 13-20, 21+) | reshape ||| |
@@ -89,6 +89,6 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim
8989
| Tan | ai.onnx(7+) | tan ||| |
9090
| Tanh | ai.onnx(7-12, 13+) | tanh ||| |
9191
| Transpose | ai.onnx(7-12, 13-20, 21+) | transpose ||| |
92-
| Trilu | ai.onnx(14+) | triangular | || Input 'k' (option 'diagonal' for WebNN) if present should be a constant |
92+
| Trilu | ai.onnx(14+) | triangular | || Input 'k' (option 'diagonal' for WebNN) if present should be a constant |
9393
| Unsqueeze | ai.onnx(7-10, 11-12, 13-20, 21+) | reshape ||| |
9494
| Where | ai.onnx(7-8, 9-15, 16+) | where ||| |

js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ export const createConv2DMatMulProgramInfo = (
182182
dimInner: number,
183183
hasBias: boolean,
184184
sequentialAccessByThreads: boolean,
185+
squeezeOutputShapeFunction?: (shape: readonly number[]) => number[],
185186
): ProgramInfo => {
186187
const isChannelsLast = attributes.format === 'NHWC';
187188
const inChannels = isChannelsLast ? inputs[0].dims[3] : inputs[0].dims[1];
@@ -309,13 +310,16 @@ export const createConv2DMatMulProgramInfo = (
309310
return {
310311
name: 'Conv2DMatMul',
311312
shaderCache: {
312-
hint: `${attributes.cacheKey};${innerElementSize};${isVec4};${fitAOuter};${fitBOuter};${fitInner};${
313-
tileAOuter
314-
};${tileBOuter};${tileInner}`,
313+
hint: `${attributes.cacheKey};${innerElementSize};${isVec4};${fitAOuter};${fitBOuter};${fitInner};${tileAOuter};${tileBOuter};${tileInner}`,
315314
inputDependencies,
316315
},
317316
getRunData: () => ({
318-
outputs: [{ dims: outputShape, dataType: inputs[0].dataType }],
317+
outputs: [
318+
{
319+
dims: squeezeOutputShapeFunction ? squeezeOutputShapeFunction(outputShape) : outputShape,
320+
dataType: inputs[0].dataType,
321+
},
322+
],
319323
dispatchGroup: { x: dispatch[0], y: dispatch[1], z: dispatch[2] },
320324
programUniforms,
321325
}),

js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -110,13 +110,9 @@ export const makeMatMulPackedVec4Source = (
110110
workPerThread[0] === 4
111111
)
112112
) {
113-
throw new Error(`If transposeA ${transposeA} is true, innerElementSize ${
114-
innerElementSize
115-
} and workPerThread[1] ${workPerThread[1]} must be 4.
113+
throw new Error(`If transposeA ${transposeA} is true, innerElementSize ${innerElementSize} and workPerThread[1] ${workPerThread[1]} must be 4.
116114
Otherwise, innerElementSize ${innerElementSize} must be 3 or 4.
117-
tileAWidth ${tileAWidth} must be divisible by workgroupSize[0]${workgroupSize[0]}. tileInner ${
118-
tileInner
119-
} must be divisible by workgroupSize[1] ${workgroupSize[1]}. colPerThread ${workPerThread[0]} must be 4.`);
115+
tileAWidth ${tileAWidth} must be divisible by workgroupSize[0]${workgroupSize[0]}. tileInner ${tileInner} must be divisible by workgroupSize[1] ${workgroupSize[1]}. colPerThread ${workPerThread[0]} must be 4.`);
120116
}
121117
return `
122118
var<workgroup> mm_Asub: array<array<vec${innerElementSize}<${type}>, ${tileAWidth / innerElementSize}>, ${tileAHight}>;
@@ -227,11 +223,7 @@ export const makeMatMulPackedSource = (
227223
!(tileAHight % workgroupSize[1] === 0 && tileAWidth % workgroupSize[0] === 0 && tileInner % workgroupSize[1] === 0)
228224
) {
229225
throw new Error(
230-
`tileAHight ${tileAHight} must be divisible by workgroupSize[1]${
231-
workgroupSize[1]
232-
}, tileAWidth ${tileAWidth} must be divisible by workgroupSize[0]${
233-
workgroupSize[0]
234-
}, tileInner ${tileInner} must be divisible by workgroupSize[1]${workgroupSize[1]}`,
226+
`tileAHight ${tileAHight} must be divisible by workgroupSize[1]${workgroupSize[1]}, tileAWidth ${tileAWidth} must be divisible by workgroupSize[0]${workgroupSize[0]}, tileInner ${tileInner} must be divisible by workgroupSize[1]${workgroupSize[1]}`,
235227
);
236228
}
237229
const rowPerThreadA = tileAHight / workgroupSize[1];
@@ -470,6 +462,7 @@ export const createMatmulProgramInfo = (
470462
outputShape: readonly number[],
471463
reshapedOutputShape?: readonly number[],
472464
isChannelsLast = false /* only used for conv2dByMatMul*/,
465+
squeezeOutputShapeFunction?: (shape: readonly number[]) => number[],
473466
): ProgramInfo => {
474467
const aShape = inputs[0].dims;
475468
const bShape = inputs[1].dims;
@@ -562,7 +555,12 @@ export const createMatmulProgramInfo = (
562555
inputDependencies,
563556
},
564557
getRunData: () => ({
565-
outputs: [{ dims: outputShape, dataType: inputs[0].dataType }],
558+
outputs: [
559+
{
560+
dims: squeezeOutputShapeFunction ? squeezeOutputShapeFunction(outputShape) : outputShape,
561+
dataType: inputs[0].dataType,
562+
},
563+
],
566564
dispatchGroup: { x: dispatch[0], y: dispatch[1], z: dispatch[2] },
567565
programUniforms,
568566
}),

0 commit comments

Comments
 (0)