From b177b637325d358c5b2127950b4a7f25ed8cd36e Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Fri, 10 Oct 2025 17:01:53 -0400 Subject: [PATCH 1/5] Update [ghstack-poisoned] --- CMakeLists.txt | 12 +- backends/aoti/CMakeLists.txt | 6 +- backends/apple/metal/CMakeLists.txt | 107 ++++ .../apple/metal/runtime/metal_backend.cpp | 484 ++++++++++++++++++ tools/cmake/preset/default.cmake | 7 + 5 files changed, 613 insertions(+), 3 deletions(-) create mode 100644 backends/apple/metal/CMakeLists.txt create mode 100644 backends/apple/metal/runtime/metal_backend.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ad08c72d1ae..33ae1f2533b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -602,15 +602,23 @@ if(EXECUTORCH_BUILD_CORTEX_M) list(APPEND _executorch_backends coretex_m_backend) endif() -if(EXECUTORCH_BUILD_CUDA) - # Build common AOTI functionality (required for CUDA) +# Build common AOTI functionality if needed by CUDA or Metal backends +if(EXECUTORCH_BUILD_CUDA OR EXECUTORCH_BUILD_METAL) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti) +endif() + +if(EXECUTORCH_BUILD_CUDA) # Build CUDA-specific AOTI functionality add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cuda) # Add aoti_cuda to backends - it already depends on aoti_common list(APPEND _executorch_backends aoti_cuda) endif() +if(EXECUTORCH_BUILD_METAL) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/metal) + list(APPEND _executorch_backends metal_backend) +endif() + if(EXECUTORCH_BUILD_EXTENSION_APPLE) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple) endif() diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt index 845144af50f..5306c8ad74b 100644 --- a/backends/aoti/CMakeLists.txt +++ b/backends/aoti/CMakeLists.txt @@ -38,7 +38,11 @@ target_include_directories( ) target_compile_options(aoti_common PUBLIC -fexceptions -frtti -fPIC) # Ensure symbols are exported properly -target_link_options(aoti_common PUBLIC -Wl,--export-dynamic) +if(APPLE) + target_link_options(aoti_common PUBLIC -Wl,-export_dynamic) +else() + target_link_options(aoti_common PUBLIC -Wl,--export-dynamic) +endif() # Link against ExecuTorch libraries and standard libraries target_link_libraries(aoti_common PUBLIC extension_tensor ${CMAKE_DL_LIBS}) diff --git a/backends/apple/metal/CMakeLists.txt b/backends/apple/metal/CMakeLists.txt new file mode 100644 index 00000000000..c26d247d6be --- /dev/null +++ b/backends/apple/metal/CMakeLists.txt @@ -0,0 +1,107 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Build AOTI Metal backend for runtime. +# +# ### Editing this file ### +# +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +# Use full torch package to get library paths, but only link specific libraries +find_package_torch() + +set(_aoti_metal_sources + runtime/metal_backend.cpp + runtime/shims/memory.cpp + runtime/shims/et_metal.mm + runtime/shims/et_metal_ops.mm + runtime/shims/shim_mps.mm + runtime/shims/tensor_attribute.cpp + runtime/shims/utils.cpp + ) + +add_library(metal_backend STATIC ${_aoti_metal_sources}) +target_include_directories( + metal_backend + PUBLIC + $ + $ + # PyTorch AOTI headers from ExecutorTorch's torch detection + ${TORCH_INCLUDE_DIRS} +) + +# Link Metal framework +find_library(METAL_LIBRARY Metal REQUIRED) +find_library(FOUNDATION_LIBRARY Foundation REQUIRED) +find_library(METALPERFORMANCESHADERS_LIBRARY MetalPerformanceShaders REQUIRED) +find_library(METALPERFORMANCESHADERSGRAPH_LIBRARY MetalPerformanceShadersGraph REQUIRED) +target_link_libraries(metal_backend PUBLIC ${METAL_LIBRARY} ${FOUNDATION_LIBRARY} ${METALPERFORMANCESHADERS_LIBRARY} ${METALPERFORMANCESHADERSGRAPH_LIBRARY}) + +target_compile_options(metal_backend PUBLIC -fexceptions -frtti -fPIC) + +target_link_options(metal_backend PUBLIC -Wl,-export_dynamic) + +# Find PyTorch's OpenMP library specifically for libtorch-less AOTI +get_torch_base_path(TORCH_BASE_PATH) +find_library(TORCH_OMP_LIBRARY + NAMES omp libomp + PATHS "${TORCH_BASE_PATH}/lib" + NO_DEFAULT_PATH +) + +if(TORCH_OMP_LIBRARY) + message(STATUS "Found PyTorch OpenMP library: ${TORCH_OMP_LIBRARY}") + # Get the directory containing the OpenMP library for rpath + get_filename_component(TORCH_OMP_LIB_DIR ${TORCH_OMP_LIBRARY} DIRECTORY) + message(STATUS "OpenMP library directory: ${TORCH_OMP_LIB_DIR}") +else() + message(WARNING "PyTorch OpenMP library not found, may cause runtime linking issues") +endif() + +# Link against appropriate backends and standard libraries +target_link_libraries( + metal_backend + PUBLIC + aoti_common + extension_tensor + ${CMAKE_DL_LIBS} + ${TORCH_OMP_LIBRARY} +) + +# Set rpath for OpenMP library to avoid runtime linking issues +if(TORCH_OMP_LIBRARY AND TORCH_OMP_LIB_DIR) + # Add the OpenMP library directory to the rpath + set_target_properties(metal_backend PROPERTIES + BUILD_RPATH "${TORCH_OMP_LIB_DIR}" + INSTALL_RPATH "${TORCH_OMP_LIB_DIR}" + ) + # Also try common OpenMP library locations + target_link_options(metal_backend PUBLIC + -Wl,-rpath,${TORCH_OMP_LIB_DIR} + -Wl,-rpath,/usr/local/opt/libomp/lib + -Wl,-rpath,/opt/homebrew/opt/libomp/lib + ) + message(STATUS "Added rpath for OpenMP library: ${TORCH_OMP_LIB_DIR}") +endif() + +executorch_target_link_options_shared_lib(metal_backend) +install( + TARGETS metal_backend + EXPORT ExecuTorchTargets + DESTINATION lib +) diff --git a/backends/apple/metal/runtime/metal_backend.cpp b/backends/apple/metal/runtime/metal_backend.cpp new file mode 100644 index 00000000000..4ce3ea7f7da --- /dev/null +++ b/backends/apple/metal/runtime/metal_backend.cpp @@ -0,0 +1,484 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +// Include AOTI common headers (from aoti_common library) +#include +#include + +// Include our Metal-specific shim layer headers +#include +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace metal { + +using namespace std; +using namespace aoti; + +using executorch::aten::ScalarType; +using executorch::runtime::ArrayRef; +using executorch::runtime::Backend; +using executorch::runtime::BackendExecutionContext; +using executorch::runtime::BackendInitContext; +using executorch::runtime::CompileSpec; +using executorch::runtime::DelegateHandle; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::NamedDataMap; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::etensor::Tensor; + +class MetalBackend final : public ::executorch::runtime::BackendInterface { + public: + // Once in program + MetalBackend() { + ET_LOG(Info, "MetalBackend ctor"); + } + + bool is_available() const override { + return 1; + } + + // Once per loaded binary blob + Result init( + BackendInitContext& context, + FreeableBuffer* processed, // This will be a empty buffer + ArrayRef compile_specs // This will be my empty list + ) const override { + ET_LOG(Info, "MetalBackend::init - Starting initialization"); + + std::string method_name; + for (const CompileSpec& spec : compile_specs) { + if (std::strcmp(spec.key, "method_name") == 0) { + method_name.assign( + static_cast(spec.value.buffer), + spec.value.nbytes); // no nullptr guarantee, so pass size + break; + } + } + + std::string so_blob_key = + method_name.empty() ? "so_blob" : method_name + "_so_blob"; + ET_LOG(Info, "MetalBackend::init - so_blob_key: %s", so_blob_key.c_str()); + + const NamedDataMap* named_data_map = context.get_named_data_map(); + ET_LOG(Info, "MetalBackend::init - Got named data map: %p", named_data_map); + + ET_LOG(Info, "MetalBackend::init - Looking for blob key: %s", so_blob_key.c_str()); + + Result aoti_metal_buffer = + named_data_map->get_data(so_blob_key.c_str()); + ET_LOG(Info, "MetalBackend::init - Got buffer result"); + + if (!aoti_metal_buffer.ok()) { + ET_LOG( + Error, + "MetalBackend::init - Failed to get buffer for key %s: 0x%x", + so_blob_key.c_str(), + aoti_metal_buffer.error()); + return Error::InvalidArgument; + } + + ET_LOG(Info, "MetalBackend::init - Buffer is OK, size: %zu", aoti_metal_buffer->size()); + + if (aoti_metal_buffer->data() == nullptr) { + ET_LOG(Error, "MetalBackend::init - Buffer data is null"); + return Error::InvalidArgument; + } + + ET_LOG(Info, "MetalBackend::init - Buffer data pointer: %p", aoti_metal_buffer->data()); + + // Generate dynamic temporary file path + filesystem::path temp_dir = filesystem::temp_directory_path(); + filesystem::path so_path = + temp_dir / (so_blob_key + to_string(getpid()) + ".so"); + + // Create a temporary file + ET_LOG(Info, "MetalBackend::init - Creating temp file: %s", so_path.c_str()); + std::ofstream outfile(so_path.c_str(), std::ios::binary); + + if (!outfile.is_open()) { + ET_LOG(Error, "MetalBackend::init - Failed to create temp file"); + return Error::AccessFailed; + } + ET_LOG(Info, "MetalBackend::init - Temp file created successfully"); + + // Write the ELF buffer to the temporary file + size_t buffer_size = aoti_metal_buffer->size(); + ET_LOG(Info, "MetalBackend::init - About to write %zu bytes to file", buffer_size); + + // Write the buffer directly, not using sizeof(void*) multiplication + outfile.write((char*)aoti_metal_buffer->data(), buffer_size); + + if (outfile.bad()) { + ET_LOG(Error, "MetalBackend::init - File write failed"); + return Error::AccessFailed; + } + ET_LOG(Info, "MetalBackend::init - Buffer written successfully"); + + // Finish writing the file to disk + outfile.close(); + ET_LOG(Info, "MetalBackend::init - File closed successfully"); + + // Load the ELF using dlopen + void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL); + if (so_handle == nullptr) { + std::cout << dlerror() << std::endl; + return Error::AccessFailed; + } + + processed->Free(); + + AOTInductorModelContainerCreateWithDevice = + reinterpret_cast( + dlsym(so_handle, "AOTInductorModelContainerCreateWithDevice")); + if (AOTInductorModelContainerCreateWithDevice == nullptr) { + perror("dlsym1"); + return Error::AccessFailed; + } + AOTInductorModelContainerDelete = + reinterpret_cast( + dlsym(so_handle, "AOTInductorModelContainerDelete")); + if (AOTInductorModelContainerDelete == nullptr) { + perror("dlsym2"); + return Error::AccessFailed; + } + AOTInductorModelContainerGetNumInputs = + reinterpret_cast( + dlsym(so_handle, "AOTInductorModelContainerGetNumInputs")); + if (AOTInductorModelContainerGetNumInputs == nullptr) { + perror("dlsym3"); + return Error::AccessFailed; + } + + AOTInductorModelContainerGetNumConstants = + reinterpret_cast( + dlsym(so_handle, "AOTInductorModelContainerGetNumConstants")); + if (AOTInductorModelContainerGetNumConstants == nullptr) { + perror("dlsym AOTInductorModelContainerGetNumConstants"); + return Error::AccessFailed; + } + + AOTInductorModelContainerGetInputName = + reinterpret_cast( + dlsym(so_handle, "AOTInductorModelContainerGetInputName")); + if (AOTInductorModelContainerGetInputName == nullptr) { + perror("dlsym AOTInductorModelContainerGetInputName"); + return Error::AccessFailed; + } + + AOTInductorModelContainerGetNumOutputs = + reinterpret_cast( + dlsym(so_handle, "AOTInductorModelContainerGetNumOutputs")); + if (AOTInductorModelContainerGetNumOutputs == nullptr) { + perror("dlsym4"); + return Error::AccessFailed; + } + AOTInductorModelContainerRun = + reinterpret_cast( + dlsym(so_handle, "AOTInductorModelContainerRun")); + if (AOTInductorModelContainerRun == nullptr) { + perror("dlsym5"); + return Error::AccessFailed; + } + + AOTInductorModelContainerHandle container_handle = nullptr; + ET_LOG(Info, "MetalBackend::init - About to create AOTI container with device='mps'"); + + AOTIRuntimeError err = AOTInductorModelContainerCreateWithDevice( + &container_handle, 1, "mps", nullptr); + ET_LOG(Info, "MetalBackend::init - AOTInductorModelContainerCreateWithDevice returned err=%d", err); + + if (err != Error::Ok) { + ET_LOG(Error, "Failed to initialize AOTInductorModelContainer with error %d", err); + return err; + } + ET_LOG(Info, "Successfully initialized container_handle = %p", container_handle); + + AOTIDelegateHandle* handle = new AOTIDelegateHandle(); + handle->so_handle = so_handle; + handle->container_handle = container_handle; + return (DelegateHandle*)handle; // Return the handle post-processing + } + + // Once per execution + Error execute( + BackendExecutionContext& context, + DelegateHandle* handle_, + Span args) const override { + ET_LOG(Debug, "MetalBackend execute"); + + AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + + ET_LOG(Debug, "MetalBackend Handle generated"); + + size_t n_inputs; + AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs); + + size_t n_outputs; + AOTInductorModelContainerGetNumOutputs( + handle->container_handle, &n_outputs); + + ET_LOG(Debug, "MetalBackend n_outputs %zd generated", n_outputs); + + if (n_inputs + n_outputs != args.size()) { + ET_LOG( + Error, + "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.", + n_inputs, + n_outputs, + args.size()); + return Error::InvalidArgument; + } + + ET_LOG( + Debug, + "number of user input %zd and output %zd generated from AOT Inductor matches ET runner's %zd.", + n_inputs, + n_outputs, + args.size()); + + // NOTE: ExecutorTorch tensors are always on CPU/host memory + // We need to create GPU copies for Metal kernel execution + std::vector gpu_inputs( + n_inputs); // GPU copies for kernel execution + std::vector gpu_outputs( + n_outputs); // GPU tensors for kernel output + + ET_LOG(Debug, "MetalBackend input/output vectors generated"); + + // Process input tensors: ExecutorTorch provides CPU tensors, create GPU + // copies + for (int i = 0; i < n_inputs; i++) { + ET_LOG(Debug, "Processing input %d from args to inputs vector", i); + ET_LOG( + Debug, "is %d input a tensor input? %d", i, int(args[i]->isTensor())); + + // Get tensor dimensions and properties from ExecutorTorch CPU tensor + auto cpu_tensor = &(args[i]->toTensor()); + auto sizes = cpu_tensor->sizes(); + auto scalar_type = cpu_tensor->scalar_type(); + ET_LOG(Debug, "MetalBackend input %d scalar_type=%d", i, static_cast(scalar_type)); + + // Create GPU tensor with same shape + std::vector sizes_vec(sizes.begin(), sizes.end()); + + AOTITensorHandle gpu_input_handle; + Error create_err = aoti_torch_empty_strided( + sizes_vec.size(), + sizes_vec.data(), + nullptr, // use default strides + static_cast(scalar_type), + 2, // device_type = mps + 0, // device_index = 0 + &gpu_input_handle); + + if (create_err != Error::Ok) { + ET_LOG(Error, "Failed to create GPU tensor for input %d", i); + return Error::Internal; + } + + // Log the created GPU tensor scalar type + auto gpu_tensor = reinterpret_cast(gpu_input_handle); + ET_LOG(Debug, "MetalBackend created GPU tensor %d scalar_type=%d", i, static_cast(gpu_tensor->scalar_type())); + + gpu_inputs[i] = gpu_input_handle; + + // Log the CPU tensor data before copying to GPU + void* cpu_data = cpu_tensor->mutable_data_ptr(); + if (cpu_data && cpu_tensor->numel() > 0) { + float* cpu_float_data = (float*)cpu_data; + ET_LOG(Debug, "CPU input %d data before copy: [%.3f, %.3f, %.3f, ...] (numel=%zd)", + i, cpu_float_data[0], cpu_float_data[1], cpu_float_data[2], cpu_tensor->numel()); + } + + // Copy data from CPU to GPU + Error copy_err = aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0); + if (copy_err != Error::Ok) { + ET_LOG(Error, "Failed to copy input %d from CPU to GPU", i); + return Error::Internal; + } + + // Log the GPU tensor scalar type after copy + auto gpu_tensor_after = reinterpret_cast(gpu_inputs[i]); + ET_LOG(Debug, "MetalBackend GPU tensor %d scalar_type after copy=%d", i, static_cast(gpu_tensor_after->scalar_type())); + + ET_LOG(Debug, "Successfully copied input %d from CPU to GPU", i); + } + + ET_LOG(Debug, "MetalBackend GPU inputs generated"); + + // Process output tensors: create GPU counterparts for ExecutorTorch CPU + // tensors + for (int i = 0; i < n_outputs; i++) { + // Get output tensor dimensions from ExecutorTorch CPU tensor + auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); + auto sizes = cpu_output_tensor->sizes(); + auto scalar_type = cpu_output_tensor->scalar_type(); + ET_LOG(Debug, "MetalBackend output %d scalar_type=%d", i, static_cast(scalar_type)); + + // Create GPU tensor with same shape for kernel output + std::vector sizes_vec(sizes.begin(), sizes.end()); + + AOTITensorHandle gpu_output_handle; + Error create_err = aoti_torch_empty_strided( + sizes_vec.size(), + sizes_vec.data(), + nullptr, // use default strides + static_cast(scalar_type), + 2, // device_type = mps + 0, // device_index = 0 + &gpu_output_handle); + + if (create_err != Error::Ok) { + ET_LOG(Error, "Failed to create GPU tensor for output %d", i); + return Error::Internal; + } + + gpu_outputs[i] = gpu_output_handle; + ET_LOG(Debug, "Created GPU output tensor %d", i); + } + + ET_LOG(Debug, "MetalBackend output generated"); + + // Log tensor handles before passing to AOTI container + ET_LOG(Debug, "Passing to AOTInductorModelContainerRun:"); + for (int i = 0; i < n_inputs; i++) { + void* gpu_input_data = gpu_inputs[i]->mutable_data_ptr(); + ET_LOG(Debug, " gpu_inputs[%d] = %p, data_ptr = %p", + i, gpu_inputs[i], gpu_input_data); + } + for (int i = 0; i < n_outputs; i++) { + void* gpu_output_data = gpu_outputs[i]->mutable_data_ptr(); + ET_LOG(Debug, " gpu_outputs[%d] = %p, data_ptr = %p", + i, gpu_outputs[i], gpu_output_data); + } + + // Run AOTI container with GPU tensors + AOTIRuntimeError error = AOTInductorModelContainerRun( + handle->container_handle, + gpu_inputs.data(), // Use GPU input tensors + n_inputs, + gpu_outputs.data(), // Use GPU output tensors + n_outputs, + nullptr, // Pass the actual Metal stream! + nullptr); // proxy_executor_handle can remain nullptr + + if (error != Error::Ok) { + ET_LOG( + Error, + "AOTInductorModelContainerRun failed with error code %d", + error); + return Error::Internal; + } + + // Ensure all GPU work is completed before reading results + try { + synchronize_metal_stream(); + } catch (const std::exception& e) { + ET_LOG(Error, "Failed to synchronize Metal stream after kernel execution: %s", e.what()); + return Error::Internal; + } catch (...) { + ET_LOG(Error, "Failed to synchronize Metal stream after kernel execution: unknown exception"); + return Error::Internal; + } + + ET_LOG(Debug, "MetalBackend running done and synchronized"); + + // Copy GPU output results back to CPU output tensors + for (int i = 0; i < n_outputs; i++) { + auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); + Error copy_err = aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0); + if (copy_err != Error::Ok) { + ET_LOG(Error, "Failed to copy GPU output %d back to CPU", i); + return Error::Internal; + } + ET_LOG(Debug, "Copied GPU output %d back to CPU", i); + } + + // Clean up GPU tensors that we created (ExecutorTorch tensors are always + // CPU, so all GPU tensors are our copies) + for (int i = 0; i < n_inputs; i++) { + // All GPU input tensors were created by us, delete them + aoti_torch_delete_tensor_object(gpu_inputs[i]); + } + + for (int i = 0; i < n_outputs; i++) { + // All GPU output tensors were created by us, delete them + aoti_torch_delete_tensor_object(gpu_outputs[i]); + } + + ET_LOG(Debug, "MetalBackend execution completed successfully"); + + return Error::Ok; + } + + void destroy(DelegateHandle* handle_) const override { + AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + + // Delete the container BEFORE closing the shared library + if (handle->container_handle != nullptr) { + AOTIRuntimeError delete_result = + AOTInductorModelContainerDelete(handle->container_handle); + if (delete_result != Error::Ok) { + ET_LOG( + Error, + "AOTInductorModelContainerDelete failed with error code %d", + delete_result); + } + } + + // Now close the shared library + if (handle->so_handle != nullptr) { + dlclose(handle->so_handle); + } + + free(handle); + cleanup_memory(); + executorch::backends::aoti::cleanup_tensor_metadata(); + ET_LOG(Debug, "MetalBackend handle %p destroy", handle_); + } +}; + +} // namespace metal + +namespace { +auto cls = metal::MetalBackend(); +executorch::runtime::Backend backend{"MetalBackend", &cls}; +static executorch::runtime::Error success_with_compiler = + register_backend(backend); +} // namespace + +} // namespace backends +} // namespace executorch diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake index 04e84622589..b2b09091014 100644 --- a/tools/cmake/preset/default.cmake +++ b/tools/cmake/preset/default.cmake @@ -152,6 +152,9 @@ define_overridable_option( define_overridable_option( EXECUTORCH_BUILD_CUDA "Build the CUDA backend" BOOL OFF ) +define_overridable_option( + EXECUTORCH_BUILD_METAL "Build the AOTI Metal backend" BOOL OFF +) define_overridable_option( EXECUTORCH_BUILD_VGF "Build the Arm VGF backend" BOOL OFF ) @@ -389,6 +392,10 @@ check_required_options_on( IF_ON EXECUTORCH_BUILD_CUDA REQUIRES EXECUTORCH_BUILD_EXTENSION_TENSOR ) +check_required_options_on( + IF_ON EXECUTORCH_BUILD_METAL REQUIRES EXECUTORCH_BUILD_EXTENSION_TENSOR +) + if(NOT EXISTS ${EXECUTORCH_PAL_DEFAULT_FILE_PATH}) message( FATAL_ERROR From 9779d54e5da568e50aa5b21ebbc765be400aa8b1 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Fri, 10 Oct 2025 17:11:42 -0400 Subject: [PATCH 2/5] Update [ghstack-poisoned] --- backends/apple/metal/CMakeLists.txt | 74 ++++++------ .../apple/metal/runtime/metal_backend.cpp | 112 ++++++++++++++---- 2 files changed, 126 insertions(+), 60 deletions(-) diff --git a/backends/apple/metal/CMakeLists.txt b/backends/apple/metal/CMakeLists.txt index c26d247d6be..3d1ec39801a 100644 --- a/backends/apple/metal/CMakeLists.txt +++ b/backends/apple/metal/CMakeLists.txt @@ -33,24 +33,29 @@ set(_aoti_metal_sources runtime/shims/shim_mps.mm runtime/shims/tensor_attribute.cpp runtime/shims/utils.cpp - ) +) add_library(metal_backend STATIC ${_aoti_metal_sources}) target_include_directories( metal_backend - PUBLIC - $ - $ - # PyTorch AOTI headers from ExecutorTorch's torch detection - ${TORCH_INCLUDE_DIRS} + PUBLIC $ $ + # PyTorch AOTI headers from ExecutorTorch's torch detection + ${TORCH_INCLUDE_DIRS} ) # Link Metal framework find_library(METAL_LIBRARY Metal REQUIRED) find_library(FOUNDATION_LIBRARY Foundation REQUIRED) find_library(METALPERFORMANCESHADERS_LIBRARY MetalPerformanceShaders REQUIRED) -find_library(METALPERFORMANCESHADERSGRAPH_LIBRARY MetalPerformanceShadersGraph REQUIRED) -target_link_libraries(metal_backend PUBLIC ${METAL_LIBRARY} ${FOUNDATION_LIBRARY} ${METALPERFORMANCESHADERS_LIBRARY} ${METALPERFORMANCESHADERSGRAPH_LIBRARY}) +find_library( + METALPERFORMANCESHADERSGRAPH_LIBRARY MetalPerformanceShadersGraph REQUIRED +) +target_link_libraries( + metal_backend + PUBLIC ${METAL_LIBRARY} ${FOUNDATION_LIBRARY} + ${METALPERFORMANCESHADERS_LIBRARY} + ${METALPERFORMANCESHADERSGRAPH_LIBRARY} +) target_compile_options(metal_backend PUBLIC -fexceptions -frtti -fPIC) @@ -58,45 +63,44 @@ target_link_options(metal_backend PUBLIC -Wl,-export_dynamic) # Find PyTorch's OpenMP library specifically for libtorch-less AOTI get_torch_base_path(TORCH_BASE_PATH) -find_library(TORCH_OMP_LIBRARY - NAMES omp libomp - PATHS "${TORCH_BASE_PATH}/lib" - NO_DEFAULT_PATH +find_library( + TORCH_OMP_LIBRARY + NAMES omp libomp + PATHS "${TORCH_BASE_PATH}/lib" + NO_DEFAULT_PATH ) if(TORCH_OMP_LIBRARY) - message(STATUS "Found PyTorch OpenMP library: ${TORCH_OMP_LIBRARY}") - # Get the directory containing the OpenMP library for rpath - get_filename_component(TORCH_OMP_LIB_DIR ${TORCH_OMP_LIBRARY} DIRECTORY) - message(STATUS "OpenMP library directory: ${TORCH_OMP_LIB_DIR}") + message(STATUS "Found PyTorch OpenMP library: ${TORCH_OMP_LIBRARY}") + # Get the directory containing the OpenMP library for rpath + get_filename_component(TORCH_OMP_LIB_DIR ${TORCH_OMP_LIBRARY} DIRECTORY) + message(STATUS "OpenMP library directory: ${TORCH_OMP_LIB_DIR}") else() - message(WARNING "PyTorch OpenMP library not found, may cause runtime linking issues") + message( + WARNING "PyTorch OpenMP library not found, may cause runtime linking issues" + ) endif() # Link against appropriate backends and standard libraries target_link_libraries( - metal_backend - PUBLIC - aoti_common - extension_tensor - ${CMAKE_DL_LIBS} - ${TORCH_OMP_LIBRARY} + metal_backend PUBLIC aoti_common extension_tensor ${CMAKE_DL_LIBS} + ${TORCH_OMP_LIBRARY} ) # Set rpath for OpenMP library to avoid runtime linking issues if(TORCH_OMP_LIBRARY AND TORCH_OMP_LIB_DIR) - # Add the OpenMP library directory to the rpath - set_target_properties(metal_backend PROPERTIES - BUILD_RPATH "${TORCH_OMP_LIB_DIR}" - INSTALL_RPATH "${TORCH_OMP_LIB_DIR}" - ) - # Also try common OpenMP library locations - target_link_options(metal_backend PUBLIC - -Wl,-rpath,${TORCH_OMP_LIB_DIR} - -Wl,-rpath,/usr/local/opt/libomp/lib - -Wl,-rpath,/opt/homebrew/opt/libomp/lib - ) - message(STATUS "Added rpath for OpenMP library: ${TORCH_OMP_LIB_DIR}") + # Add the OpenMP library directory to the rpath + set_target_properties( + metal_backend PROPERTIES BUILD_RPATH "${TORCH_OMP_LIB_DIR}" + INSTALL_RPATH "${TORCH_OMP_LIB_DIR}" + ) + # Also try common OpenMP library locations + target_link_options( + metal_backend PUBLIC -Wl,-rpath,${TORCH_OMP_LIB_DIR} + -Wl,-rpath,/usr/local/opt/libomp/lib + -Wl,-rpath,/opt/homebrew/opt/libomp/lib + ) + message(STATUS "Added rpath for OpenMP library: ${TORCH_OMP_LIB_DIR}") endif() executorch_target_link_options_shared_lib(metal_backend) diff --git a/backends/apple/metal/runtime/metal_backend.cpp b/backends/apple/metal/runtime/metal_backend.cpp index 4ce3ea7f7da..33699c51c45 100644 --- a/backends/apple/metal/runtime/metal_backend.cpp +++ b/backends/apple/metal/runtime/metal_backend.cpp @@ -30,11 +30,11 @@ #include // Include our Metal-specific shim layer headers +#include #include +#include #include #include -#include -#include namespace executorch { namespace backends { @@ -95,7 +95,10 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { const NamedDataMap* named_data_map = context.get_named_data_map(); ET_LOG(Info, "MetalBackend::init - Got named data map: %p", named_data_map); - ET_LOG(Info, "MetalBackend::init - Looking for blob key: %s", so_blob_key.c_str()); + ET_LOG( + Info, + "MetalBackend::init - Looking for blob key: %s", + so_blob_key.c_str()); Result aoti_metal_buffer = named_data_map->get_data(so_blob_key.c_str()); @@ -110,14 +113,20 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { return Error::InvalidArgument; } - ET_LOG(Info, "MetalBackend::init - Buffer is OK, size: %zu", aoti_metal_buffer->size()); + ET_LOG( + Info, + "MetalBackend::init - Buffer is OK, size: %zu", + aoti_metal_buffer->size()); if (aoti_metal_buffer->data() == nullptr) { ET_LOG(Error, "MetalBackend::init - Buffer data is null"); return Error::InvalidArgument; } - ET_LOG(Info, "MetalBackend::init - Buffer data pointer: %p", aoti_metal_buffer->data()); + ET_LOG( + Info, + "MetalBackend::init - Buffer data pointer: %p", + aoti_metal_buffer->data()); // Generate dynamic temporary file path filesystem::path temp_dir = filesystem::temp_directory_path(); @@ -125,7 +134,8 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { temp_dir / (so_blob_key + to_string(getpid()) + ".so"); // Create a temporary file - ET_LOG(Info, "MetalBackend::init - Creating temp file: %s", so_path.c_str()); + ET_LOG( + Info, "MetalBackend::init - Creating temp file: %s", so_path.c_str()); std::ofstream outfile(so_path.c_str(), std::ios::binary); if (!outfile.is_open()) { @@ -136,7 +146,10 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { // Write the ELF buffer to the temporary file size_t buffer_size = aoti_metal_buffer->size(); - ET_LOG(Info, "MetalBackend::init - About to write %zu bytes to file", buffer_size); + ET_LOG( + Info, + "MetalBackend::init - About to write %zu bytes to file", + buffer_size); // Write the buffer directly, not using sizeof(void*) multiplication outfile.write((char*)aoti_metal_buffer->data(), buffer_size); @@ -214,17 +227,28 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { } AOTInductorModelContainerHandle container_handle = nullptr; - ET_LOG(Info, "MetalBackend::init - About to create AOTI container with device='mps'"); + ET_LOG( + Info, + "MetalBackend::init - About to create AOTI container with device='mps'"); AOTIRuntimeError err = AOTInductorModelContainerCreateWithDevice( &container_handle, 1, "mps", nullptr); - ET_LOG(Info, "MetalBackend::init - AOTInductorModelContainerCreateWithDevice returned err=%d", err); + ET_LOG( + Info, + "MetalBackend::init - AOTInductorModelContainerCreateWithDevice returned err=%d", + err); if (err != Error::Ok) { - ET_LOG(Error, "Failed to initialize AOTInductorModelContainer with error %d", err); + ET_LOG( + Error, + "Failed to initialize AOTInductorModelContainer with error %d", + err); return err; } - ET_LOG(Info, "Successfully initialized container_handle = %p", container_handle); + ET_LOG( + Info, + "Successfully initialized container_handle = %p", + container_handle); AOTIDelegateHandle* handle = new AOTIDelegateHandle(); handle->so_handle = so_handle; @@ -289,7 +313,11 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { auto cpu_tensor = &(args[i]->toTensor()); auto sizes = cpu_tensor->sizes(); auto scalar_type = cpu_tensor->scalar_type(); - ET_LOG(Debug, "MetalBackend input %d scalar_type=%d", i, static_cast(scalar_type)); + ET_LOG( + Debug, + "MetalBackend input %d scalar_type=%d", + i, + static_cast(scalar_type)); // Create GPU tensor with same shape std::vector sizes_vec(sizes.begin(), sizes.end()); @@ -310,8 +338,13 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { } // Log the created GPU tensor scalar type - auto gpu_tensor = reinterpret_cast(gpu_input_handle); - ET_LOG(Debug, "MetalBackend created GPU tensor %d scalar_type=%d", i, static_cast(gpu_tensor->scalar_type())); + auto gpu_tensor = reinterpret_cast( + gpu_input_handle); + ET_LOG( + Debug, + "MetalBackend created GPU tensor %d scalar_type=%d", + i, + static_cast(gpu_tensor->scalar_type())); gpu_inputs[i] = gpu_input_handle; @@ -319,8 +352,14 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { void* cpu_data = cpu_tensor->mutable_data_ptr(); if (cpu_data && cpu_tensor->numel() > 0) { float* cpu_float_data = (float*)cpu_data; - ET_LOG(Debug, "CPU input %d data before copy: [%.3f, %.3f, %.3f, ...] (numel=%zd)", - i, cpu_float_data[0], cpu_float_data[1], cpu_float_data[2], cpu_tensor->numel()); + ET_LOG( + Debug, + "CPU input %d data before copy: [%.3f, %.3f, %.3f, ...] (numel=%zd)", + i, + cpu_float_data[0], + cpu_float_data[1], + cpu_float_data[2], + cpu_tensor->numel()); } // Copy data from CPU to GPU @@ -331,8 +370,14 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { } // Log the GPU tensor scalar type after copy - auto gpu_tensor_after = reinterpret_cast(gpu_inputs[i]); - ET_LOG(Debug, "MetalBackend GPU tensor %d scalar_type after copy=%d", i, static_cast(gpu_tensor_after->scalar_type())); + auto gpu_tensor_after = + reinterpret_cast( + gpu_inputs[i]); + ET_LOG( + Debug, + "MetalBackend GPU tensor %d scalar_type after copy=%d", + i, + static_cast(gpu_tensor_after->scalar_type())); ET_LOG(Debug, "Successfully copied input %d from CPU to GPU", i); } @@ -346,7 +391,11 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); auto sizes = cpu_output_tensor->sizes(); auto scalar_type = cpu_output_tensor->scalar_type(); - ET_LOG(Debug, "MetalBackend output %d scalar_type=%d", i, static_cast(scalar_type)); + ET_LOG( + Debug, + "MetalBackend output %d scalar_type=%d", + i, + static_cast(scalar_type)); // Create GPU tensor with same shape for kernel output std::vector sizes_vec(sizes.begin(), sizes.end()); @@ -376,13 +425,21 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { ET_LOG(Debug, "Passing to AOTInductorModelContainerRun:"); for (int i = 0; i < n_inputs; i++) { void* gpu_input_data = gpu_inputs[i]->mutable_data_ptr(); - ET_LOG(Debug, " gpu_inputs[%d] = %p, data_ptr = %p", - i, gpu_inputs[i], gpu_input_data); + ET_LOG( + Debug, + " gpu_inputs[%d] = %p, data_ptr = %p", + i, + gpu_inputs[i], + gpu_input_data); } for (int i = 0; i < n_outputs; i++) { void* gpu_output_data = gpu_outputs[i]->mutable_data_ptr(); - ET_LOG(Debug, " gpu_outputs[%d] = %p, data_ptr = %p", - i, gpu_outputs[i], gpu_output_data); + ET_LOG( + Debug, + " gpu_outputs[%d] = %p, data_ptr = %p", + i, + gpu_outputs[i], + gpu_output_data); } // Run AOTI container with GPU tensors @@ -407,10 +464,15 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { try { synchronize_metal_stream(); } catch (const std::exception& e) { - ET_LOG(Error, "Failed to synchronize Metal stream after kernel execution: %s", e.what()); + ET_LOG( + Error, + "Failed to synchronize Metal stream after kernel execution: %s", + e.what()); return Error::Internal; } catch (...) { - ET_LOG(Error, "Failed to synchronize Metal stream after kernel execution: unknown exception"); + ET_LOG( + Error, + "Failed to synchronize Metal stream after kernel execution: unknown exception"); return Error::Internal; } From f9c8989ad0498111c268b8feb9f953aabb4dfe4a Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Mon, 13 Oct 2025 18:21:29 -0400 Subject: [PATCH 3/5] Update [ghstack-poisoned] --- .../apple/metal/runtime/metal_backend.cpp | 352 +++++++----------- examples/models/voxtral/CMakeLists.txt | 5 + tools/cmake/executorch-config.cmake | 1 + tools/cmake/preset/default.cmake | 2 +- 4 files changed, 145 insertions(+), 215 deletions(-) diff --git a/backends/apple/metal/runtime/metal_backend.cpp b/backends/apple/metal/runtime/metal_backend.cpp index 33699c51c45..50c89cd66e4 100644 --- a/backends/apple/metal/runtime/metal_backend.cpp +++ b/backends/apple/metal/runtime/metal_backend.cpp @@ -6,23 +6,17 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include #include #include #include - -#include -#include -#include +#include #include #include -#include -#include #include #include -#include -#include +#include #include // Include AOTI common headers (from aoti_common library) @@ -36,9 +30,14 @@ #include #include -namespace executorch { -namespace backends { -namespace metal { +namespace executorch::backends::metal { + +#define LOAD_SYMBOL(name, handle) \ + do { \ + name = reinterpret_cast(dlsym(handle, #name)); \ + ET_CHECK_OR_RETURN_ERROR( \ + name != nullptr, AccessFailed, "Failed to load " #name); \ + } while (0) using namespace std; using namespace aoti; @@ -59,7 +58,36 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::runtime::etensor::Tensor; -class MetalBackend final : public ::executorch::runtime::BackendInterface { +class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::BackendInterface { + private: + Error register_shared_library_functions(void* so_handle) const { + ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loading symbols"); + + LOAD_SYMBOL(AOTInductorModelContainerCreateWithDevice, so_handle); + ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerCreateWithDevice"); + + LOAD_SYMBOL(AOTInductorModelContainerDelete, so_handle); + ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerDelete"); + + LOAD_SYMBOL(AOTInductorModelContainerGetNumInputs, so_handle); + ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetNumInputs"); + + LOAD_SYMBOL(AOTInductorModelContainerGetNumConstants, so_handle); + ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetNumConstants"); + + LOAD_SYMBOL(AOTInductorModelContainerGetInputName, so_handle); + ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetInputName"); + + LOAD_SYMBOL(AOTInductorModelContainerGetNumOutputs, so_handle); + ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetNumOutputs"); + + LOAD_SYMBOL(AOTInductorModelContainerRun, so_handle); + ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerRun"); + + ET_LOG(Info, "MetalBackend::register_shared_library_functions - All symbols loaded successfully"); + return Error::Ok; + } + public: // Once in program MetalBackend() { @@ -95,38 +123,24 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { const NamedDataMap* named_data_map = context.get_named_data_map(); ET_LOG(Info, "MetalBackend::init - Got named data map: %p", named_data_map); - ET_LOG( - Info, - "MetalBackend::init - Looking for blob key: %s", - so_blob_key.c_str()); - - Result aoti_metal_buffer = - named_data_map->get_data(so_blob_key.c_str()); - ET_LOG(Info, "MetalBackend::init - Got buffer result"); + ET_LOG(Info, "MetalBackend::init - Looking for blob key: %s", so_blob_key.c_str()); - if (!aoti_metal_buffer.ok()) { - ET_LOG( - Error, - "MetalBackend::init - Failed to get buffer for key %s: 0x%x", - so_blob_key.c_str(), - aoti_metal_buffer.error()); - return Error::InvalidArgument; - } + auto aoti_metal_buffer = named_data_map->get_data(so_blob_key.c_str()); + ET_CHECK_OR_RETURN_ERROR( + aoti_metal_buffer.ok(), + Internal, + "Failed to get data for key %s: 0x%x", + so_blob_key.c_str(), + static_cast(aoti_metal_buffer.error())); - ET_LOG( - Info, - "MetalBackend::init - Buffer is OK, size: %zu", - aoti_metal_buffer->size()); + ET_LOG(Info, "MetalBackend::init - Buffer is OK, size: %zu", aoti_metal_buffer->size()); if (aoti_metal_buffer->data() == nullptr) { ET_LOG(Error, "MetalBackend::init - Buffer data is null"); return Error::InvalidArgument; } - ET_LOG( - Info, - "MetalBackend::init - Buffer data pointer: %p", - aoti_metal_buffer->data()); + ET_LOG(Info, "MetalBackend::init - Buffer data pointer: %p", aoti_metal_buffer->data()); // Generate dynamic temporary file path filesystem::path temp_dir = filesystem::temp_directory_path(); @@ -134,31 +148,22 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { temp_dir / (so_blob_key + to_string(getpid()) + ".so"); // Create a temporary file - ET_LOG( - Info, "MetalBackend::init - Creating temp file: %s", so_path.c_str()); - std::ofstream outfile(so_path.c_str(), std::ios::binary); - - if (!outfile.is_open()) { - ET_LOG(Error, "MetalBackend::init - Failed to create temp file"); - return Error::AccessFailed; - } - ET_LOG(Info, "MetalBackend::init - Temp file created successfully"); + ET_LOG(Info, "MetalBackend::init - Creating temp file: %s", so_path.c_str()); + ofstream outfile(so_path.c_str(), ios::binary); // Write the ELF buffer to the temporary file - size_t buffer_size = aoti_metal_buffer->size(); ET_LOG( Info, - "MetalBackend::init - About to write %zu bytes to file", - buffer_size); + "Writing %zu bytes to %s", + aoti_metal_buffer->size(), + so_path.c_str()); - // Write the buffer directly, not using sizeof(void*) multiplication - outfile.write((char*)aoti_metal_buffer->data(), buffer_size); + outfile.write( + static_cast(aoti_metal_buffer->data()), + aoti_metal_buffer->size()); - if (outfile.bad()) { - ET_LOG(Error, "MetalBackend::init - File write failed"); - return Error::AccessFailed; - } - ET_LOG(Info, "MetalBackend::init - Buffer written successfully"); + ET_CHECK_OR_RETURN_ERROR( + outfile, AccessFailed, "Failed to write to file %s", so_path.c_str()); // Finish writing the file to disk outfile.close(); @@ -166,93 +171,31 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { // Load the ELF using dlopen void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL); - if (so_handle == nullptr) { - std::cout << dlerror() << std::endl; - return Error::AccessFailed; - } + ET_CHECK_OR_RETURN_ERROR( + so_handle != nullptr, + AccessFailed, + "Failed to load shared library: %s", + dlerror()); processed->Free(); - AOTInductorModelContainerCreateWithDevice = - reinterpret_cast( - dlsym(so_handle, "AOTInductorModelContainerCreateWithDevice")); - if (AOTInductorModelContainerCreateWithDevice == nullptr) { - perror("dlsym1"); - return Error::AccessFailed; - } - AOTInductorModelContainerDelete = - reinterpret_cast( - dlsym(so_handle, "AOTInductorModelContainerDelete")); - if (AOTInductorModelContainerDelete == nullptr) { - perror("dlsym2"); - return Error::AccessFailed; - } - AOTInductorModelContainerGetNumInputs = - reinterpret_cast( - dlsym(so_handle, "AOTInductorModelContainerGetNumInputs")); - if (AOTInductorModelContainerGetNumInputs == nullptr) { - perror("dlsym3"); - return Error::AccessFailed; - } - - AOTInductorModelContainerGetNumConstants = - reinterpret_cast( - dlsym(so_handle, "AOTInductorModelContainerGetNumConstants")); - if (AOTInductorModelContainerGetNumConstants == nullptr) { - perror("dlsym AOTInductorModelContainerGetNumConstants"); - return Error::AccessFailed; - } - - AOTInductorModelContainerGetInputName = - reinterpret_cast( - dlsym(so_handle, "AOTInductorModelContainerGetInputName")); - if (AOTInductorModelContainerGetInputName == nullptr) { - perror("dlsym AOTInductorModelContainerGetInputName"); - return Error::AccessFailed; - } - - AOTInductorModelContainerGetNumOutputs = - reinterpret_cast( - dlsym(so_handle, "AOTInductorModelContainerGetNumOutputs")); - if (AOTInductorModelContainerGetNumOutputs == nullptr) { - perror("dlsym4"); - return Error::AccessFailed; - } - AOTInductorModelContainerRun = - reinterpret_cast( - dlsym(so_handle, "AOTInductorModelContainerRun")); - if (AOTInductorModelContainerRun == nullptr) { - perror("dlsym5"); - return Error::AccessFailed; - } + // Register all shared library functions + ET_CHECK_OK_OR_RETURN_ERROR(register_shared_library_functions(so_handle)); AOTInductorModelContainerHandle container_handle = nullptr; - ET_LOG( - Info, - "MetalBackend::init - About to create AOTI container with device='mps'"); + ET_LOG(Info, "MetalBackend::init - About to create AOTI container with device='mps'"); - AOTIRuntimeError err = AOTInductorModelContainerCreateWithDevice( - &container_handle, 1, "mps", nullptr); - ET_LOG( - Info, - "MetalBackend::init - AOTInductorModelContainerCreateWithDevice returned err=%d", - err); + ET_CHECK_OK_OR_RETURN_ERROR(AOTInductorModelContainerCreateWithDevice( + &container_handle, 1, "mps", nullptr)); - if (err != Error::Ok) { - ET_LOG( - Error, - "Failed to initialize AOTInductorModelContainer with error %d", - err); - return err; - } - ET_LOG( - Info, - "Successfully initialized container_handle = %p", - container_handle); + ET_LOG(Info, "container_handle = %p", container_handle); AOTIDelegateHandle* handle = new AOTIDelegateHandle(); handle->so_handle = so_handle; + handle->so_path = so_path.string(); handle->container_handle = container_handle; + + ET_LOG(Info, "MetalBackend::init - Initialization completed successfully"); return (DelegateHandle*)handle; // Return the handle post-processing } @@ -265,6 +208,14 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + // Need to re-register all the symbols from the so_handle hosted by this + // MetalBackend instance. The reason is that these symbols are + // static/singleton across the whole process. When we share multiple methods + // (meaning multiple so_handle) in the same process, we need to re-register + // the symbols from the so_handle that is being used in this execution. + ET_CHECK_OK_OR_RETURN_ERROR( + register_shared_library_functions(handle->so_handle)); + ET_LOG(Debug, "MetalBackend Handle generated"); size_t n_inputs; @@ -276,15 +227,13 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { ET_LOG(Debug, "MetalBackend n_outputs %zd generated", n_outputs); - if (n_inputs + n_outputs != args.size()) { - ET_LOG( - Error, - "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.", - n_inputs, - n_outputs, - args.size()); - return Error::InvalidArgument; - } + ET_CHECK_OR_RETURN_ERROR( + n_inputs + n_outputs == args.size(), + InvalidArgument, + "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.", + n_inputs, + n_outputs, + args.size()) ET_LOG( Debug, @@ -313,11 +262,7 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { auto cpu_tensor = &(args[i]->toTensor()); auto sizes = cpu_tensor->sizes(); auto scalar_type = cpu_tensor->scalar_type(); - ET_LOG( - Debug, - "MetalBackend input %d scalar_type=%d", - i, - static_cast(scalar_type)); + ET_LOG(Debug, "MetalBackend input %d scalar_type=%d", i, static_cast(scalar_type)); // Create GPU tensor with same shape std::vector sizes_vec(sizes.begin(), sizes.end()); @@ -338,13 +283,8 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { } // Log the created GPU tensor scalar type - auto gpu_tensor = reinterpret_cast( - gpu_input_handle); - ET_LOG( - Debug, - "MetalBackend created GPU tensor %d scalar_type=%d", - i, - static_cast(gpu_tensor->scalar_type())); + auto gpu_tensor = reinterpret_cast(gpu_input_handle); + ET_LOG(Debug, "MetalBackend created GPU tensor %d scalar_type=%d", i, static_cast(gpu_tensor->scalar_type())); gpu_inputs[i] = gpu_input_handle; @@ -352,14 +292,8 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { void* cpu_data = cpu_tensor->mutable_data_ptr(); if (cpu_data && cpu_tensor->numel() > 0) { float* cpu_float_data = (float*)cpu_data; - ET_LOG( - Debug, - "CPU input %d data before copy: [%.3f, %.3f, %.3f, ...] (numel=%zd)", - i, - cpu_float_data[0], - cpu_float_data[1], - cpu_float_data[2], - cpu_tensor->numel()); + ET_LOG(Debug, "CPU input %d data before copy: [%.3f, %.3f, %.3f, ...] (numel=%zd)", + i, cpu_float_data[0], cpu_float_data[1], cpu_float_data[2], cpu_tensor->numel()); } // Copy data from CPU to GPU @@ -370,14 +304,8 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { } // Log the GPU tensor scalar type after copy - auto gpu_tensor_after = - reinterpret_cast( - gpu_inputs[i]); - ET_LOG( - Debug, - "MetalBackend GPU tensor %d scalar_type after copy=%d", - i, - static_cast(gpu_tensor_after->scalar_type())); + auto gpu_tensor_after = reinterpret_cast(gpu_inputs[i]); + ET_LOG(Debug, "MetalBackend GPU tensor %d scalar_type after copy=%d", i, static_cast(gpu_tensor_after->scalar_type())); ET_LOG(Debug, "Successfully copied input %d from CPU to GPU", i); } @@ -391,11 +319,7 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); auto sizes = cpu_output_tensor->sizes(); auto scalar_type = cpu_output_tensor->scalar_type(); - ET_LOG( - Debug, - "MetalBackend output %d scalar_type=%d", - i, - static_cast(scalar_type)); + ET_LOG(Debug, "MetalBackend output %d scalar_type=%d", i, static_cast(scalar_type)); // Create GPU tensor with same shape for kernel output std::vector sizes_vec(sizes.begin(), sizes.end()); @@ -425,21 +349,13 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { ET_LOG(Debug, "Passing to AOTInductorModelContainerRun:"); for (int i = 0; i < n_inputs; i++) { void* gpu_input_data = gpu_inputs[i]->mutable_data_ptr(); - ET_LOG( - Debug, - " gpu_inputs[%d] = %p, data_ptr = %p", - i, - gpu_inputs[i], - gpu_input_data); + ET_LOG(Debug, " gpu_inputs[%d] = %p, data_ptr = %p", + i, gpu_inputs[i], gpu_input_data); } for (int i = 0; i < n_outputs; i++) { void* gpu_output_data = gpu_outputs[i]->mutable_data_ptr(); - ET_LOG( - Debug, - " gpu_outputs[%d] = %p, data_ptr = %p", - i, - gpu_outputs[i], - gpu_output_data); + ET_LOG(Debug, " gpu_outputs[%d] = %p, data_ptr = %p", + i, gpu_outputs[i], gpu_output_data); } // Run AOTI container with GPU tensors @@ -464,15 +380,10 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { try { synchronize_metal_stream(); } catch (const std::exception& e) { - ET_LOG( - Error, - "Failed to synchronize Metal stream after kernel execution: %s", - e.what()); + ET_LOG(Error, "Failed to synchronize Metal stream after kernel execution: %s", e.what()); return Error::Internal; } catch (...) { - ET_LOG( - Error, - "Failed to synchronize Metal stream after kernel execution: unknown exception"); + ET_LOG(Error, "Failed to synchronize Metal stream after kernel execution: unknown exception"); return Error::Internal; } @@ -481,11 +392,15 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { // Copy GPU output results back to CPU output tensors for (int i = 0; i < n_outputs; i++) { auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); - Error copy_err = aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0); - if (copy_err != Error::Ok) { - ET_LOG(Error, "Failed to copy GPU output %d back to CPU", i); - return Error::Internal; - } + // For DYNAMIC_BOUND tensors we try to resize + ET_CHECK_OK_OR_RETURN_ERROR( + resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()), + "Error resizing tensor at output index %d", + i); + ET_CHECK_OK_OR_RETURN_ERROR( + aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0), + "Failed to copy GPU output %d back to CPU", + i); ET_LOG(Debug, "Copied GPU output %d back to CPU", i); } @@ -507,40 +422,49 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface { } void destroy(DelegateHandle* handle_) const override { + if (handle_ == nullptr) { + return; + } AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; - // Delete the container BEFORE closing the shared library - if (handle->container_handle != nullptr) { - AOTIRuntimeError delete_result = - AOTInductorModelContainerDelete(handle->container_handle); - if (delete_result != Error::Ok) { - ET_LOG( - Error, - "AOTInductorModelContainerDelete failed with error code %d", - delete_result); - } - } + // NOTE: AOTInductorModelContainerDelete does not work correctly with + // multiple .so files. Deleting one container frees shared resources, + // which causes segmentation faults when attempting to delete other + // containers. As a workaround, we skip explicit container deletion + // and defer cleanup to the OS. + // TODO(gasoonjia): Find a proper solution for safe container deletion. + // AOTInductorModelContainerDelete(handle->container_handle); // Now close the shared library if (handle->so_handle != nullptr) { dlclose(handle->so_handle); } - free(handle); + // Remove the temporary shared library file + if (!handle->so_path.empty()) { + std::error_code remove_error; + std::filesystem::remove(handle->so_path, remove_error); + ET_CHECK_OR_LOG_ERROR( + !remove_error, + "Failed to remove temporary shared library %s: %s", + handle->so_path.c_str(), + remove_error.message().c_str()); + } + + delete handle; cleanup_memory(); executorch::backends::aoti::cleanup_tensor_metadata(); ET_LOG(Debug, "MetalBackend handle %p destroy", handle_); } }; -} // namespace metal +} // namespace executorch::backends::metal +namespace executorch::backends { namespace { auto cls = metal::MetalBackend(); executorch::runtime::Backend backend{"MetalBackend", &cls}; static executorch::runtime::Error success_with_compiler = register_backend(backend); } // namespace - -} // namespace backends -} // namespace executorch +} // namespace executorch::backends diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt index 3995f5533e6..866d17160ba 100644 --- a/examples/models/voxtral/CMakeLists.txt +++ b/examples/models/voxtral/CMakeLists.txt @@ -93,6 +93,11 @@ if(EXECUTORCH_BUILD_CUDA) executorch_target_link_options_shared_lib(aoti_cuda) endif() +if(EXECUTORCH_BUILD_METAL) + list(APPEND link_libraries metal_backend) + executorch_target_link_options_shared_lib(metal_backend) +endif() + # Add tokenizers list(APPEND link_libraries tokenizers::tokenizers) diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake index 3df8e947459..21bbbe26350 100644 --- a/tools/cmake/executorch-config.cmake +++ b/tools/cmake/executorch-config.cmake @@ -63,6 +63,7 @@ set(optional_lib_list coreml_inmemoryfs coremldelegate mpsdelegate + metal_backend neuron_backend qnn_executorch_backend portable_ops_lib diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake index b2b09091014..861e41e4a63 100644 --- a/tools/cmake/preset/default.cmake +++ b/tools/cmake/preset/default.cmake @@ -153,7 +153,7 @@ define_overridable_option( EXECUTORCH_BUILD_CUDA "Build the CUDA backend" BOOL OFF ) define_overridable_option( - EXECUTORCH_BUILD_METAL "Build the AOTI Metal backend" BOOL OFF + EXECUTORCH_BUILD_METAL "Build the Metal backend" BOOL OFF ) define_overridable_option( EXECUTORCH_BUILD_VGF "Build the Arm VGF backend" BOOL OFF From b87d5dee3eb02e1e8f9187d6fef9570cbc1b2649 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Mon, 13 Oct 2025 18:23:13 -0400 Subject: [PATCH 4/5] Update [ghstack-poisoned] --- .../apple/metal/runtime/metal_backend.cpp | 129 ++++++++++++++---- 1 file changed, 99 insertions(+), 30 deletions(-) diff --git a/backends/apple/metal/runtime/metal_backend.cpp b/backends/apple/metal/runtime/metal_backend.cpp index 50c89cd66e4..03673f2f1dd 100644 --- a/backends/apple/metal/runtime/metal_backend.cpp +++ b/backends/apple/metal/runtime/metal_backend.cpp @@ -58,33 +58,52 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::runtime::etensor::Tensor; -class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::BackendInterface { +class ET_EXPERIMENTAL MetalBackend final + : public ::executorch::runtime::BackendInterface { private: Error register_shared_library_functions(void* so_handle) const { - ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loading symbols"); + ET_LOG( + Info, + "MetalBackend::register_shared_library_functions - Loading symbols"); LOAD_SYMBOL(AOTInductorModelContainerCreateWithDevice, so_handle); - ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerCreateWithDevice"); + ET_LOG( + Info, + "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerCreateWithDevice"); LOAD_SYMBOL(AOTInductorModelContainerDelete, so_handle); - ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerDelete"); + ET_LOG( + Info, + "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerDelete"); LOAD_SYMBOL(AOTInductorModelContainerGetNumInputs, so_handle); - ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetNumInputs"); + ET_LOG( + Info, + "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetNumInputs"); LOAD_SYMBOL(AOTInductorModelContainerGetNumConstants, so_handle); - ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetNumConstants"); + ET_LOG( + Info, + "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetNumConstants"); LOAD_SYMBOL(AOTInductorModelContainerGetInputName, so_handle); - ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetInputName"); + ET_LOG( + Info, + "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetInputName"); LOAD_SYMBOL(AOTInductorModelContainerGetNumOutputs, so_handle); - ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetNumOutputs"); + ET_LOG( + Info, + "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetNumOutputs"); LOAD_SYMBOL(AOTInductorModelContainerRun, so_handle); - ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerRun"); + ET_LOG( + Info, + "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerRun"); - ET_LOG(Info, "MetalBackend::register_shared_library_functions - All symbols loaded successfully"); + ET_LOG( + Info, + "MetalBackend::register_shared_library_functions - All symbols loaded successfully"); return Error::Ok; } @@ -123,7 +142,10 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend const NamedDataMap* named_data_map = context.get_named_data_map(); ET_LOG(Info, "MetalBackend::init - Got named data map: %p", named_data_map); - ET_LOG(Info, "MetalBackend::init - Looking for blob key: %s", so_blob_key.c_str()); + ET_LOG( + Info, + "MetalBackend::init - Looking for blob key: %s", + so_blob_key.c_str()); auto aoti_metal_buffer = named_data_map->get_data(so_blob_key.c_str()); ET_CHECK_OR_RETURN_ERROR( @@ -133,14 +155,20 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend so_blob_key.c_str(), static_cast(aoti_metal_buffer.error())); - ET_LOG(Info, "MetalBackend::init - Buffer is OK, size: %zu", aoti_metal_buffer->size()); + ET_LOG( + Info, + "MetalBackend::init - Buffer is OK, size: %zu", + aoti_metal_buffer->size()); if (aoti_metal_buffer->data() == nullptr) { ET_LOG(Error, "MetalBackend::init - Buffer data is null"); return Error::InvalidArgument; } - ET_LOG(Info, "MetalBackend::init - Buffer data pointer: %p", aoti_metal_buffer->data()); + ET_LOG( + Info, + "MetalBackend::init - Buffer data pointer: %p", + aoti_metal_buffer->data()); // Generate dynamic temporary file path filesystem::path temp_dir = filesystem::temp_directory_path(); @@ -148,7 +176,8 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend temp_dir / (so_blob_key + to_string(getpid()) + ".so"); // Create a temporary file - ET_LOG(Info, "MetalBackend::init - Creating temp file: %s", so_path.c_str()); + ET_LOG( + Info, "MetalBackend::init - Creating temp file: %s", so_path.c_str()); ofstream outfile(so_path.c_str(), ios::binary); // Write the ELF buffer to the temporary file @@ -183,7 +212,9 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend ET_CHECK_OK_OR_RETURN_ERROR(register_shared_library_functions(so_handle)); AOTInductorModelContainerHandle container_handle = nullptr; - ET_LOG(Info, "MetalBackend::init - About to create AOTI container with device='mps'"); + ET_LOG( + Info, + "MetalBackend::init - About to create AOTI container with device='mps'"); ET_CHECK_OK_OR_RETURN_ERROR(AOTInductorModelContainerCreateWithDevice( &container_handle, 1, "mps", nullptr)); @@ -262,7 +293,11 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend auto cpu_tensor = &(args[i]->toTensor()); auto sizes = cpu_tensor->sizes(); auto scalar_type = cpu_tensor->scalar_type(); - ET_LOG(Debug, "MetalBackend input %d scalar_type=%d", i, static_cast(scalar_type)); + ET_LOG( + Debug, + "MetalBackend input %d scalar_type=%d", + i, + static_cast(scalar_type)); // Create GPU tensor with same shape std::vector sizes_vec(sizes.begin(), sizes.end()); @@ -283,8 +318,13 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend } // Log the created GPU tensor scalar type - auto gpu_tensor = reinterpret_cast(gpu_input_handle); - ET_LOG(Debug, "MetalBackend created GPU tensor %d scalar_type=%d", i, static_cast(gpu_tensor->scalar_type())); + auto gpu_tensor = reinterpret_cast( + gpu_input_handle); + ET_LOG( + Debug, + "MetalBackend created GPU tensor %d scalar_type=%d", + i, + static_cast(gpu_tensor->scalar_type())); gpu_inputs[i] = gpu_input_handle; @@ -292,8 +332,14 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend void* cpu_data = cpu_tensor->mutable_data_ptr(); if (cpu_data && cpu_tensor->numel() > 0) { float* cpu_float_data = (float*)cpu_data; - ET_LOG(Debug, "CPU input %d data before copy: [%.3f, %.3f, %.3f, ...] (numel=%zd)", - i, cpu_float_data[0], cpu_float_data[1], cpu_float_data[2], cpu_tensor->numel()); + ET_LOG( + Debug, + "CPU input %d data before copy: [%.3f, %.3f, %.3f, ...] (numel=%zd)", + i, + cpu_float_data[0], + cpu_float_data[1], + cpu_float_data[2], + cpu_tensor->numel()); } // Copy data from CPU to GPU @@ -304,8 +350,14 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend } // Log the GPU tensor scalar type after copy - auto gpu_tensor_after = reinterpret_cast(gpu_inputs[i]); - ET_LOG(Debug, "MetalBackend GPU tensor %d scalar_type after copy=%d", i, static_cast(gpu_tensor_after->scalar_type())); + auto gpu_tensor_after = + reinterpret_cast( + gpu_inputs[i]); + ET_LOG( + Debug, + "MetalBackend GPU tensor %d scalar_type after copy=%d", + i, + static_cast(gpu_tensor_after->scalar_type())); ET_LOG(Debug, "Successfully copied input %d from CPU to GPU", i); } @@ -319,7 +371,11 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); auto sizes = cpu_output_tensor->sizes(); auto scalar_type = cpu_output_tensor->scalar_type(); - ET_LOG(Debug, "MetalBackend output %d scalar_type=%d", i, static_cast(scalar_type)); + ET_LOG( + Debug, + "MetalBackend output %d scalar_type=%d", + i, + static_cast(scalar_type)); // Create GPU tensor with same shape for kernel output std::vector sizes_vec(sizes.begin(), sizes.end()); @@ -349,13 +405,21 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend ET_LOG(Debug, "Passing to AOTInductorModelContainerRun:"); for (int i = 0; i < n_inputs; i++) { void* gpu_input_data = gpu_inputs[i]->mutable_data_ptr(); - ET_LOG(Debug, " gpu_inputs[%d] = %p, data_ptr = %p", - i, gpu_inputs[i], gpu_input_data); + ET_LOG( + Debug, + " gpu_inputs[%d] = %p, data_ptr = %p", + i, + gpu_inputs[i], + gpu_input_data); } for (int i = 0; i < n_outputs; i++) { void* gpu_output_data = gpu_outputs[i]->mutable_data_ptr(); - ET_LOG(Debug, " gpu_outputs[%d] = %p, data_ptr = %p", - i, gpu_outputs[i], gpu_output_data); + ET_LOG( + Debug, + " gpu_outputs[%d] = %p, data_ptr = %p", + i, + gpu_outputs[i], + gpu_output_data); } // Run AOTI container with GPU tensors @@ -380,10 +444,15 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend try { synchronize_metal_stream(); } catch (const std::exception& e) { - ET_LOG(Error, "Failed to synchronize Metal stream after kernel execution: %s", e.what()); + ET_LOG( + Error, + "Failed to synchronize Metal stream after kernel execution: %s", + e.what()); return Error::Internal; } catch (...) { - ET_LOG(Error, "Failed to synchronize Metal stream after kernel execution: unknown exception"); + ET_LOG( + Error, + "Failed to synchronize Metal stream after kernel execution: unknown exception"); return Error::Internal; } @@ -432,7 +501,7 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend // which causes segmentation faults when attempting to delete other // containers. As a workaround, we skip explicit container deletion // and defer cleanup to the OS. - // TODO(gasoonjia): Find a proper solution for safe container deletion. + // TODO: Find a proper solution for safe container deletion. // AOTInductorModelContainerDelete(handle->container_handle); // Now close the shared library From 76e5a45c80fe97ee1f69c62d79fede7518864216 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Wed, 15 Oct 2025 13:59:08 -0400 Subject: [PATCH 5/5] Update [ghstack-poisoned] --- backends/apple/metal/CMakeLists.txt | 9 +++++++++ backends/apple/metal/runtime/metal_backend.cpp | 6 ++++++ 2 files changed, 15 insertions(+) diff --git a/backends/apple/metal/CMakeLists.txt b/backends/apple/metal/CMakeLists.txt index 3d1ec39801a..7bdf142041d 100644 --- a/backends/apple/metal/CMakeLists.txt +++ b/backends/apple/metal/CMakeLists.txt @@ -14,8 +14,17 @@ # ~~~ # It should also be cmake-lint clean. # +cmake_minimum_required(VERSION 3.29) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if(NOT APPLE) + message(FATAL_ERROR "Metal backend requires macOS") +endif() + # Source root directory for executorch. if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) diff --git a/backends/apple/metal/runtime/metal_backend.cpp b/backends/apple/metal/runtime/metal_backend.cpp index 9daccc1cf7a..1ef365a9332 100644 --- a/backends/apple/metal/runtime/metal_backend.cpp +++ b/backends/apple/metal/runtime/metal_backend.cpp @@ -520,6 +520,12 @@ class ET_EXPERIMENTAL MetalBackend final "Failed to remove temporary shared library %s: %s", handle->so_path.c_str(), remove_error.message().c_str()); + if (!remove_error) { + ET_LOG( + Info, + "Removed temporary shared library file: %s", + handle->so_path.c_str()); + } } delete handle;