From b177b637325d358c5b2127950b4a7f25ed8cd36e Mon Sep 17 00:00:00 2001
From: Manuel Candales <mcandales@meta.com>
Date: Fri, 10 Oct 2025 17:01:53 -0400
Subject: [PATCH 1/5] Update

[ghstack-poisoned]
---
 CMakeLists.txt                                |  12 +-
 backends/aoti/CMakeLists.txt                  |   6 +-
 backends/apple/metal/CMakeLists.txt           | 107 ++++
 .../apple/metal/runtime/metal_backend.cpp     | 484 ++++++++++++++++++
 tools/cmake/preset/default.cmake              |   7 +
 5 files changed, 613 insertions(+), 3 deletions(-)
 create mode 100644 backends/apple/metal/CMakeLists.txt
 create mode 100644 backends/apple/metal/runtime/metal_backend.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ad08c72d1ae..33ae1f2533b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -602,15 +602,23 @@ if(EXECUTORCH_BUILD_CORTEX_M)
   list(APPEND _executorch_backends coretex_m_backend)
 endif()
 
-if(EXECUTORCH_BUILD_CUDA)
-  # Build common AOTI functionality (required for CUDA)
+# Build common AOTI functionality if needed by CUDA or Metal backends
+if(EXECUTORCH_BUILD_CUDA OR EXECUTORCH_BUILD_METAL)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti)
+endif()
+
+if(EXECUTORCH_BUILD_CUDA)
   # Build CUDA-specific AOTI functionality
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cuda)
   # Add aoti_cuda to backends - it already depends on aoti_common
   list(APPEND _executorch_backends aoti_cuda)
 endif()
 
+if(EXECUTORCH_BUILD_METAL)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/metal)
+  list(APPEND _executorch_backends metal_backend)
+endif()
+
 if(EXECUTORCH_BUILD_EXTENSION_APPLE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple)
 endif()
diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt
index 845144af50f..5306c8ad74b 100644
--- a/backends/aoti/CMakeLists.txt
+++ b/backends/aoti/CMakeLists.txt
@@ -38,7 +38,11 @@ target_include_directories(
 )
 target_compile_options(aoti_common PUBLIC -fexceptions -frtti -fPIC)
 # Ensure symbols are exported properly
-target_link_options(aoti_common PUBLIC -Wl,--export-dynamic)
+if(APPLE)
+  target_link_options(aoti_common PUBLIC -Wl,-export_dynamic)
+else()
+  target_link_options(aoti_common PUBLIC -Wl,--export-dynamic)
+endif()
 
 # Link against ExecuTorch libraries and standard libraries
 target_link_libraries(aoti_common PUBLIC extension_tensor ${CMAKE_DL_LIBS})
diff --git a/backends/apple/metal/CMakeLists.txt b/backends/apple/metal/CMakeLists.txt
new file mode 100644
index 00000000000..c26d247d6be
--- /dev/null
+++ b/backends/apple/metal/CMakeLists.txt
@@ -0,0 +1,107 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Build AOTI Metal backend for runtime.
+#
+# ### Editing this file ###
+#
+# This file should be formatted with
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+# It should also be cmake-lint clean.
+#
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+# Use full torch package to get library paths, but only link specific libraries
+find_package_torch()
+
+set(_aoti_metal_sources
+    runtime/metal_backend.cpp
+    runtime/shims/memory.cpp
+    runtime/shims/et_metal.mm
+    runtime/shims/et_metal_ops.mm
+    runtime/shims/shim_mps.mm
+    runtime/shims/tensor_attribute.cpp
+    runtime/shims/utils.cpp
+    )
+
+add_library(metal_backend STATIC ${_aoti_metal_sources})
+target_include_directories(
+  metal_backend
+  PUBLIC
+    $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
+    $<INSTALL_INTERFACE:include>
+    # PyTorch AOTI headers from ExecutorTorch's torch detection
+    ${TORCH_INCLUDE_DIRS}
+)
+
+# Link Metal framework
+find_library(METAL_LIBRARY Metal REQUIRED)
+find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+find_library(METALPERFORMANCESHADERS_LIBRARY MetalPerformanceShaders REQUIRED)
+find_library(METALPERFORMANCESHADERSGRAPH_LIBRARY MetalPerformanceShadersGraph REQUIRED)
+target_link_libraries(metal_backend PUBLIC ${METAL_LIBRARY} ${FOUNDATION_LIBRARY} ${METALPERFORMANCESHADERS_LIBRARY} ${METALPERFORMANCESHADERSGRAPH_LIBRARY})
+
+target_compile_options(metal_backend PUBLIC -fexceptions -frtti -fPIC)
+
+target_link_options(metal_backend PUBLIC -Wl,-export_dynamic)
+
+# Find PyTorch's OpenMP library specifically for libtorch-less AOTI
+get_torch_base_path(TORCH_BASE_PATH)
+find_library(TORCH_OMP_LIBRARY
+    NAMES omp libomp
+    PATHS "${TORCH_BASE_PATH}/lib"
+    NO_DEFAULT_PATH
+)
+
+if(TORCH_OMP_LIBRARY)
+    message(STATUS "Found PyTorch OpenMP library: ${TORCH_OMP_LIBRARY}")
+    # Get the directory containing the OpenMP library for rpath
+    get_filename_component(TORCH_OMP_LIB_DIR ${TORCH_OMP_LIBRARY} DIRECTORY)
+    message(STATUS "OpenMP library directory: ${TORCH_OMP_LIB_DIR}")
+else()
+    message(WARNING "PyTorch OpenMP library not found, may cause runtime linking issues")
+endif()
+
+# Link against appropriate backends and standard libraries
+target_link_libraries(
+  metal_backend
+  PUBLIC
+    aoti_common
+    extension_tensor
+    ${CMAKE_DL_LIBS}
+    ${TORCH_OMP_LIBRARY}
+)
+
+# Set rpath for OpenMP library to avoid runtime linking issues
+if(TORCH_OMP_LIBRARY AND TORCH_OMP_LIB_DIR)
+    # Add the OpenMP library directory to the rpath
+    set_target_properties(metal_backend PROPERTIES
+        BUILD_RPATH "${TORCH_OMP_LIB_DIR}"
+        INSTALL_RPATH "${TORCH_OMP_LIB_DIR}"
+    )
+    # Also try common OpenMP library locations
+    target_link_options(metal_backend PUBLIC
+        -Wl,-rpath,${TORCH_OMP_LIB_DIR}
+        -Wl,-rpath,/usr/local/opt/libomp/lib
+        -Wl,-rpath,/opt/homebrew/opt/libomp/lib
+    )
+    message(STATUS "Added rpath for OpenMP library: ${TORCH_OMP_LIB_DIR}")
+endif()
+
+executorch_target_link_options_shared_lib(metal_backend)
+install(
+  TARGETS metal_backend
+  EXPORT ExecuTorchTargets
+  DESTINATION lib
+)
diff --git a/backends/apple/metal/runtime/metal_backend.cpp b/backends/apple/metal/runtime/metal_backend.cpp
new file mode 100644
index 00000000000..4ce3ea7f7da
--- /dev/null
+++ b/backends/apple/metal/runtime/metal_backend.cpp
@@ -0,0 +1,484 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/extension/tensor/tensor.h>
+#include <executorch/runtime/backend/interface.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <vector>
+
+// Include AOTI common headers (from aoti_common library)
+#include <executorch/backends/aoti/aoti_model_container.h>
+#include <executorch/backends/aoti/common_shims.h>
+
+// Include our Metal-specific shim layer headers
+#include <executorch/backends/apple/metal/runtime/shims/memory.h>
+#include <executorch/backends/apple/metal/runtime/shims/tensor_attribute.h>
+#include <executorch/backends/apple/metal/runtime/shims/utils.h>
+#include <executorch/backends/apple/metal/runtime/shims/shim_mps.h>
+#include <executorch/backends/apple/metal/runtime/shims/et_metal.h>
+
+namespace executorch {
+namespace backends {
+namespace metal {
+
+using namespace std;
+using namespace aoti;
+
+using executorch::aten::ScalarType;
+using executorch::runtime::ArrayRef;
+using executorch::runtime::Backend;
+using executorch::runtime::BackendExecutionContext;
+using executorch::runtime::BackendInitContext;
+using executorch::runtime::CompileSpec;
+using executorch::runtime::DelegateHandle;
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::FreeableBuffer;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::NamedDataMap;
+using executorch::runtime::Result;
+using executorch::runtime::Span;
+using executorch::runtime::etensor::Tensor;
+
+class MetalBackend final : public ::executorch::runtime::BackendInterface {
+ public:
+  // Once in program
+  MetalBackend() {
+    ET_LOG(Info, "MetalBackend ctor");
+  }
+
+  bool is_available() const override {
+    return 1;
+  }
+
+  // Once per loaded binary blob
+  Result<DelegateHandle*> init(
+      BackendInitContext& context,
+      FreeableBuffer* processed, // This will be a empty buffer
+      ArrayRef<CompileSpec> compile_specs // This will be my empty list
+  ) const override {
+    ET_LOG(Info, "MetalBackend::init - Starting initialization");
+
+    std::string method_name;
+    for (const CompileSpec& spec : compile_specs) {
+      if (std::strcmp(spec.key, "method_name") == 0) {
+        method_name.assign(
+            static_cast<const char*>(spec.value.buffer),
+            spec.value.nbytes); // no nullptr guarantee, so pass size
+        break;
+      }
+    }
+
+    std::string so_blob_key =
+        method_name.empty() ? "so_blob" : method_name + "_so_blob";
+    ET_LOG(Info, "MetalBackend::init - so_blob_key: %s", so_blob_key.c_str());
+
+    const NamedDataMap* named_data_map = context.get_named_data_map();
+    ET_LOG(Info, "MetalBackend::init - Got named data map: %p", named_data_map);
+
+    ET_LOG(Info, "MetalBackend::init - Looking for blob key: %s", so_blob_key.c_str());
+
+    Result<FreeableBuffer> aoti_metal_buffer =
+        named_data_map->get_data(so_blob_key.c_str());
+    ET_LOG(Info, "MetalBackend::init - Got buffer result");
+
+    if (!aoti_metal_buffer.ok()) {
+      ET_LOG(
+          Error,
+          "MetalBackend::init - Failed to get buffer for key %s: 0x%x",
+          so_blob_key.c_str(),
+          aoti_metal_buffer.error());
+      return Error::InvalidArgument;
+    }
+
+    ET_LOG(Info, "MetalBackend::init - Buffer is OK, size: %zu", aoti_metal_buffer->size());
+
+    if (aoti_metal_buffer->data() == nullptr) {
+      ET_LOG(Error, "MetalBackend::init - Buffer data is null");
+      return Error::InvalidArgument;
+    }
+
+    ET_LOG(Info, "MetalBackend::init - Buffer data pointer: %p", aoti_metal_buffer->data());
+
+    // Generate dynamic temporary file path
+    filesystem::path temp_dir = filesystem::temp_directory_path();
+    filesystem::path so_path =
+        temp_dir / (so_blob_key + to_string(getpid()) + ".so");
+
+    // Create a temporary file
+    ET_LOG(Info, "MetalBackend::init - Creating temp file: %s", so_path.c_str());
+    std::ofstream outfile(so_path.c_str(), std::ios::binary);
+
+    if (!outfile.is_open()) {
+      ET_LOG(Error, "MetalBackend::init - Failed to create temp file");
+      return Error::AccessFailed;
+    }
+    ET_LOG(Info, "MetalBackend::init - Temp file created successfully");
+
+    // Write the ELF buffer to the temporary file
+    size_t buffer_size = aoti_metal_buffer->size();
+    ET_LOG(Info, "MetalBackend::init - About to write %zu bytes to file", buffer_size);
+
+    // Write the buffer directly, not using sizeof(void*) multiplication
+    outfile.write((char*)aoti_metal_buffer->data(), buffer_size);
+
+    if (outfile.bad()) {
+      ET_LOG(Error, "MetalBackend::init - File write failed");
+      return Error::AccessFailed;
+    }
+    ET_LOG(Info, "MetalBackend::init - Buffer written successfully");
+
+    // Finish writing the file to disk
+    outfile.close();
+    ET_LOG(Info, "MetalBackend::init - File closed successfully");
+
+    // Load the ELF using dlopen
+    void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
+    if (so_handle == nullptr) {
+      std::cout << dlerror() << std::endl;
+      return Error::AccessFailed;
+    }
+
+    processed->Free();
+
+    AOTInductorModelContainerCreateWithDevice =
+        reinterpret_cast<AOTInductorModelContainerCreateWithDeviceFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerCreateWithDevice"));
+    if (AOTInductorModelContainerCreateWithDevice == nullptr) {
+      perror("dlsym1");
+      return Error::AccessFailed;
+    }
+    AOTInductorModelContainerDelete =
+        reinterpret_cast<AOTInductorModelContainerDeleteFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerDelete"));
+    if (AOTInductorModelContainerDelete == nullptr) {
+      perror("dlsym2");
+      return Error::AccessFailed;
+    }
+    AOTInductorModelContainerGetNumInputs =
+        reinterpret_cast<AOTInductorModelContainerGetNumInputsFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetNumInputs"));
+    if (AOTInductorModelContainerGetNumInputs == nullptr) {
+      perror("dlsym3");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerGetNumConstants =
+        reinterpret_cast<AOTInductorModelContainerGetNumConstantsFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetNumConstants"));
+    if (AOTInductorModelContainerGetNumConstants == nullptr) {
+      perror("dlsym AOTInductorModelContainerGetNumConstants");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerGetInputName =
+        reinterpret_cast<AOTInductorModelContainerGetInputNameFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetInputName"));
+    if (AOTInductorModelContainerGetInputName == nullptr) {
+      perror("dlsym AOTInductorModelContainerGetInputName");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerGetNumOutputs =
+        reinterpret_cast<AOTInductorModelContainerGetNumOutputsFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetNumOutputs"));
+    if (AOTInductorModelContainerGetNumOutputs == nullptr) {
+      perror("dlsym4");
+      return Error::AccessFailed;
+    }
+    AOTInductorModelContainerRun =
+        reinterpret_cast<AOTInductorModelContainerRunFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerRun"));
+    if (AOTInductorModelContainerRun == nullptr) {
+      perror("dlsym5");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerHandle container_handle = nullptr;
+    ET_LOG(Info, "MetalBackend::init - About to create AOTI container with device='mps'");
+
+    AOTIRuntimeError err = AOTInductorModelContainerCreateWithDevice(
+        &container_handle, 1, "mps", nullptr);
+    ET_LOG(Info, "MetalBackend::init - AOTInductorModelContainerCreateWithDevice returned err=%d", err);
+
+    if (err != Error::Ok) {
+      ET_LOG(Error, "Failed to initialize AOTInductorModelContainer with error %d", err);
+      return err;
+    }
+    ET_LOG(Info, "Successfully initialized container_handle = %p", container_handle);
+
+    AOTIDelegateHandle* handle = new AOTIDelegateHandle();
+    handle->so_handle = so_handle;
+    handle->container_handle = container_handle;
+    return (DelegateHandle*)handle; // Return the handle post-processing
+  }
+
+  // Once per execution
+  Error execute(
+      BackendExecutionContext& context,
+      DelegateHandle* handle_,
+      Span<EValue*> args) const override {
+    ET_LOG(Debug, "MetalBackend execute");
+
+    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+
+    ET_LOG(Debug, "MetalBackend Handle generated");
+
+    size_t n_inputs;
+    AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs);
+
+    size_t n_outputs;
+    AOTInductorModelContainerGetNumOutputs(
+        handle->container_handle, &n_outputs);
+
+    ET_LOG(Debug, "MetalBackend n_outputs %zd generated", n_outputs);
+
+    if (n_inputs + n_outputs != args.size()) {
+      ET_LOG(
+          Error,
+          "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.",
+          n_inputs,
+          n_outputs,
+          args.size());
+      return Error::InvalidArgument;
+    }
+
+    ET_LOG(
+        Debug,
+        "number of user input %zd and output %zd generated from AOT Inductor matches ET runner's %zd.",
+        n_inputs,
+        n_outputs,
+        args.size());
+
+    // NOTE: ExecutorTorch tensors are always on CPU/host memory
+    // We need to create GPU copies for Metal kernel execution
+    std::vector<AOTITensorHandle> gpu_inputs(
+        n_inputs); // GPU copies for kernel execution
+    std::vector<AOTITensorHandle> gpu_outputs(
+        n_outputs); // GPU tensors for kernel output
+
+    ET_LOG(Debug, "MetalBackend input/output vectors generated");
+
+    // Process input tensors: ExecutorTorch provides CPU tensors, create GPU
+    // copies
+    for (int i = 0; i < n_inputs; i++) {
+      ET_LOG(Debug, "Processing input %d from args to inputs vector", i);
+      ET_LOG(
+          Debug, "is %d input a tensor input? %d", i, int(args[i]->isTensor()));
+
+      // Get tensor dimensions and properties from ExecutorTorch CPU tensor
+      auto cpu_tensor = &(args[i]->toTensor());
+      auto sizes = cpu_tensor->sizes();
+      auto scalar_type = cpu_tensor->scalar_type();
+      ET_LOG(Debug, "MetalBackend input %d scalar_type=%d", i, static_cast<int32_t>(scalar_type));
+
+      // Create GPU tensor with same shape
+      std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
+
+      AOTITensorHandle gpu_input_handle;
+      Error create_err = aoti_torch_empty_strided(
+          sizes_vec.size(),
+          sizes_vec.data(),
+          nullptr, // use default strides
+          static_cast<int32_t>(scalar_type),
+          2, // device_type = mps
+          0, // device_index = 0
+          &gpu_input_handle);
+
+      if (create_err != Error::Ok) {
+        ET_LOG(Error, "Failed to create GPU tensor for input %d", i);
+        return Error::Internal;
+      }
+
+      // Log the created GPU tensor scalar type
+      auto gpu_tensor = reinterpret_cast<executorch::runtime::etensor::Tensor*>(gpu_input_handle);
+      ET_LOG(Debug, "MetalBackend created GPU tensor %d scalar_type=%d", i, static_cast<int32_t>(gpu_tensor->scalar_type()));
+
+      gpu_inputs[i] = gpu_input_handle;
+
+      // Log the CPU tensor data before copying to GPU
+      void* cpu_data = cpu_tensor->mutable_data_ptr();
+      if (cpu_data && cpu_tensor->numel() > 0) {
+        float* cpu_float_data = (float*)cpu_data;
+        ET_LOG(Debug, "CPU input %d data before copy: [%.3f, %.3f, %.3f, ...] (numel=%zd)",
+               i, cpu_float_data[0], cpu_float_data[1], cpu_float_data[2], cpu_tensor->numel());
+      }
+
+      // Copy data from CPU to GPU
+      Error copy_err = aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0);
+      if (copy_err != Error::Ok) {
+        ET_LOG(Error, "Failed to copy input %d from CPU to GPU", i);
+        return Error::Internal;
+      }
+
+      // Log the GPU tensor scalar type after copy
+      auto gpu_tensor_after = reinterpret_cast<executorch::runtime::etensor::Tensor*>(gpu_inputs[i]);
+      ET_LOG(Debug, "MetalBackend GPU tensor %d scalar_type after copy=%d", i, static_cast<int32_t>(gpu_tensor_after->scalar_type()));
+
+      ET_LOG(Debug, "Successfully copied input %d from CPU to GPU", i);
+    }
+
+    ET_LOG(Debug, "MetalBackend GPU inputs generated");
+
+    // Process output tensors: create GPU counterparts for ExecutorTorch CPU
+    // tensors
+    for (int i = 0; i < n_outputs; i++) {
+      // Get output tensor dimensions from ExecutorTorch CPU tensor
+      auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
+      auto sizes = cpu_output_tensor->sizes();
+      auto scalar_type = cpu_output_tensor->scalar_type();
+      ET_LOG(Debug, "MetalBackend output %d scalar_type=%d", i, static_cast<int32_t>(scalar_type));
+
+      // Create GPU tensor with same shape for kernel output
+      std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
+
+      AOTITensorHandle gpu_output_handle;
+      Error create_err = aoti_torch_empty_strided(
+          sizes_vec.size(),
+          sizes_vec.data(),
+          nullptr, // use default strides
+          static_cast<int32_t>(scalar_type),
+          2, // device_type = mps
+          0, // device_index = 0
+          &gpu_output_handle);
+
+      if (create_err != Error::Ok) {
+        ET_LOG(Error, "Failed to create GPU tensor for output %d", i);
+        return Error::Internal;
+      }
+
+      gpu_outputs[i] = gpu_output_handle;
+      ET_LOG(Debug, "Created GPU output tensor %d", i);
+    }
+
+    ET_LOG(Debug, "MetalBackend output generated");
+
+    // Log tensor handles before passing to AOTI container
+    ET_LOG(Debug, "Passing to AOTInductorModelContainerRun:");
+    for (int i = 0; i < n_inputs; i++) {
+      void* gpu_input_data = gpu_inputs[i]->mutable_data_ptr();
+      ET_LOG(Debug, "  gpu_inputs[%d] = %p, data_ptr = %p",
+             i, gpu_inputs[i], gpu_input_data);
+    }
+    for (int i = 0; i < n_outputs; i++) {
+      void* gpu_output_data = gpu_outputs[i]->mutable_data_ptr();
+      ET_LOG(Debug, "  gpu_outputs[%d] = %p, data_ptr = %p",
+             i, gpu_outputs[i], gpu_output_data);
+    }
+
+    // Run AOTI container with GPU tensors
+    AOTIRuntimeError error = AOTInductorModelContainerRun(
+        handle->container_handle,
+        gpu_inputs.data(), // Use GPU input tensors
+        n_inputs,
+        gpu_outputs.data(), // Use GPU output tensors
+        n_outputs,
+        nullptr, // Pass the actual Metal stream!
+        nullptr); // proxy_executor_handle can remain nullptr
+
+    if (error != Error::Ok) {
+      ET_LOG(
+          Error,
+          "AOTInductorModelContainerRun failed with error code %d",
+          error);
+      return Error::Internal;
+    }
+
+    // Ensure all GPU work is completed before reading results
+    try {
+      synchronize_metal_stream();
+    } catch (const std::exception& e) {
+      ET_LOG(Error, "Failed to synchronize Metal stream after kernel execution: %s", e.what());
+      return Error::Internal;
+    } catch (...) {
+      ET_LOG(Error, "Failed to synchronize Metal stream after kernel execution: unknown exception");
+      return Error::Internal;
+    }
+
+    ET_LOG(Debug, "MetalBackend running done and synchronized");
+
+    // Copy GPU output results back to CPU output tensors
+    for (int i = 0; i < n_outputs; i++) {
+      auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
+      Error copy_err = aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0);
+      if (copy_err != Error::Ok) {
+        ET_LOG(Error, "Failed to copy GPU output %d back to CPU", i);
+        return Error::Internal;
+      }
+      ET_LOG(Debug, "Copied GPU output %d back to CPU", i);
+    }
+
+    // Clean up GPU tensors that we created (ExecutorTorch tensors are always
+    // CPU, so all GPU tensors are our copies)
+    for (int i = 0; i < n_inputs; i++) {
+      // All GPU input tensors were created by us, delete them
+      aoti_torch_delete_tensor_object(gpu_inputs[i]);
+    }
+
+    for (int i = 0; i < n_outputs; i++) {
+      // All GPU output tensors were created by us, delete them
+      aoti_torch_delete_tensor_object(gpu_outputs[i]);
+    }
+
+    ET_LOG(Debug, "MetalBackend execution completed successfully");
+
+    return Error::Ok;
+  }
+
+  void destroy(DelegateHandle* handle_) const override {
+    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+
+    // Delete the container BEFORE closing the shared library
+    if (handle->container_handle != nullptr) {
+      AOTIRuntimeError delete_result =
+          AOTInductorModelContainerDelete(handle->container_handle);
+      if (delete_result != Error::Ok) {
+        ET_LOG(
+            Error,
+            "AOTInductorModelContainerDelete failed with error code %d",
+            delete_result);
+      }
+    }
+
+    // Now close the shared library
+    if (handle->so_handle != nullptr) {
+      dlclose(handle->so_handle);
+    }
+
+    free(handle);
+    cleanup_memory();
+    executorch::backends::aoti::cleanup_tensor_metadata();
+    ET_LOG(Debug, "MetalBackend handle %p destroy", handle_);
+  }
+};
+
+} // namespace metal
+
+namespace {
+auto cls = metal::MetalBackend();
+executorch::runtime::Backend backend{"MetalBackend", &cls};
+static executorch::runtime::Error success_with_compiler =
+    register_backend(backend);
+} // namespace
+
+} // namespace backends
+} // namespace executorch
diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake
index 04e84622589..b2b09091014 100644
--- a/tools/cmake/preset/default.cmake
+++ b/tools/cmake/preset/default.cmake
@@ -152,6 +152,9 @@ define_overridable_option(
 define_overridable_option(
   EXECUTORCH_BUILD_CUDA "Build the CUDA backend" BOOL OFF
 )
+define_overridable_option(
+  EXECUTORCH_BUILD_METAL "Build the AOTI Metal backend" BOOL OFF
+)
 define_overridable_option(
   EXECUTORCH_BUILD_VGF "Build the Arm VGF backend" BOOL OFF
 )
@@ -389,6 +392,10 @@ check_required_options_on(
   IF_ON EXECUTORCH_BUILD_CUDA REQUIRES EXECUTORCH_BUILD_EXTENSION_TENSOR
 )
 
+check_required_options_on(
+  IF_ON EXECUTORCH_BUILD_METAL REQUIRES EXECUTORCH_BUILD_EXTENSION_TENSOR
+)
+
 if(NOT EXISTS ${EXECUTORCH_PAL_DEFAULT_FILE_PATH})
   message(
     FATAL_ERROR

From 9779d54e5da568e50aa5b21ebbc765be400aa8b1 Mon Sep 17 00:00:00 2001
From: Manuel Candales <mcandales@meta.com>
Date: Fri, 10 Oct 2025 17:11:42 -0400
Subject: [PATCH 2/5] Update

[ghstack-poisoned]
---
 backends/apple/metal/CMakeLists.txt           |  74 ++++++------
 .../apple/metal/runtime/metal_backend.cpp     | 112 ++++++++++++++----
 2 files changed, 126 insertions(+), 60 deletions(-)

diff --git a/backends/apple/metal/CMakeLists.txt b/backends/apple/metal/CMakeLists.txt
index c26d247d6be..3d1ec39801a 100644
--- a/backends/apple/metal/CMakeLists.txt
+++ b/backends/apple/metal/CMakeLists.txt
@@ -33,24 +33,29 @@ set(_aoti_metal_sources
     runtime/shims/shim_mps.mm
     runtime/shims/tensor_attribute.cpp
     runtime/shims/utils.cpp
-    )
+)
 
 add_library(metal_backend STATIC ${_aoti_metal_sources})
 target_include_directories(
   metal_backend
-  PUBLIC
-    $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
-    $<INSTALL_INTERFACE:include>
-    # PyTorch AOTI headers from ExecutorTorch's torch detection
-    ${TORCH_INCLUDE_DIRS}
+  PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}> $<INSTALL_INTERFACE:include>
+         # PyTorch AOTI headers from ExecutorTorch's torch detection
+         ${TORCH_INCLUDE_DIRS}
 )
 
 # Link Metal framework
 find_library(METAL_LIBRARY Metal REQUIRED)
 find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
 find_library(METALPERFORMANCESHADERS_LIBRARY MetalPerformanceShaders REQUIRED)
-find_library(METALPERFORMANCESHADERSGRAPH_LIBRARY MetalPerformanceShadersGraph REQUIRED)
-target_link_libraries(metal_backend PUBLIC ${METAL_LIBRARY} ${FOUNDATION_LIBRARY} ${METALPERFORMANCESHADERS_LIBRARY} ${METALPERFORMANCESHADERSGRAPH_LIBRARY})
+find_library(
+  METALPERFORMANCESHADERSGRAPH_LIBRARY MetalPerformanceShadersGraph REQUIRED
+)
+target_link_libraries(
+  metal_backend
+  PUBLIC ${METAL_LIBRARY} ${FOUNDATION_LIBRARY}
+         ${METALPERFORMANCESHADERS_LIBRARY}
+         ${METALPERFORMANCESHADERSGRAPH_LIBRARY}
+)
 
 target_compile_options(metal_backend PUBLIC -fexceptions -frtti -fPIC)
 
@@ -58,45 +63,44 @@ target_link_options(metal_backend PUBLIC -Wl,-export_dynamic)
 
 # Find PyTorch's OpenMP library specifically for libtorch-less AOTI
 get_torch_base_path(TORCH_BASE_PATH)
-find_library(TORCH_OMP_LIBRARY
-    NAMES omp libomp
-    PATHS "${TORCH_BASE_PATH}/lib"
-    NO_DEFAULT_PATH
+find_library(
+  TORCH_OMP_LIBRARY
+  NAMES omp libomp
+  PATHS "${TORCH_BASE_PATH}/lib"
+  NO_DEFAULT_PATH
 )
 
 if(TORCH_OMP_LIBRARY)
-    message(STATUS "Found PyTorch OpenMP library: ${TORCH_OMP_LIBRARY}")
-    # Get the directory containing the OpenMP library for rpath
-    get_filename_component(TORCH_OMP_LIB_DIR ${TORCH_OMP_LIBRARY} DIRECTORY)
-    message(STATUS "OpenMP library directory: ${TORCH_OMP_LIB_DIR}")
+  message(STATUS "Found PyTorch OpenMP library: ${TORCH_OMP_LIBRARY}")
+  # Get the directory containing the OpenMP library for rpath
+  get_filename_component(TORCH_OMP_LIB_DIR ${TORCH_OMP_LIBRARY} DIRECTORY)
+  message(STATUS "OpenMP library directory: ${TORCH_OMP_LIB_DIR}")
 else()
-    message(WARNING "PyTorch OpenMP library not found, may cause runtime linking issues")
+  message(
+    WARNING "PyTorch OpenMP library not found, may cause runtime linking issues"
+  )
 endif()
 
 # Link against appropriate backends and standard libraries
 target_link_libraries(
-  metal_backend
-  PUBLIC
-    aoti_common
-    extension_tensor
-    ${CMAKE_DL_LIBS}
-    ${TORCH_OMP_LIBRARY}
+  metal_backend PUBLIC aoti_common extension_tensor ${CMAKE_DL_LIBS}
+                       ${TORCH_OMP_LIBRARY}
 )
 
 # Set rpath for OpenMP library to avoid runtime linking issues
 if(TORCH_OMP_LIBRARY AND TORCH_OMP_LIB_DIR)
-    # Add the OpenMP library directory to the rpath
-    set_target_properties(metal_backend PROPERTIES
-        BUILD_RPATH "${TORCH_OMP_LIB_DIR}"
-        INSTALL_RPATH "${TORCH_OMP_LIB_DIR}"
-    )
-    # Also try common OpenMP library locations
-    target_link_options(metal_backend PUBLIC
-        -Wl,-rpath,${TORCH_OMP_LIB_DIR}
-        -Wl,-rpath,/usr/local/opt/libomp/lib
-        -Wl,-rpath,/opt/homebrew/opt/libomp/lib
-    )
-    message(STATUS "Added rpath for OpenMP library: ${TORCH_OMP_LIB_DIR}")
+  # Add the OpenMP library directory to the rpath
+  set_target_properties(
+    metal_backend PROPERTIES BUILD_RPATH "${TORCH_OMP_LIB_DIR}"
+                             INSTALL_RPATH "${TORCH_OMP_LIB_DIR}"
+  )
+  # Also try common OpenMP library locations
+  target_link_options(
+    metal_backend PUBLIC -Wl,-rpath,${TORCH_OMP_LIB_DIR}
+    -Wl,-rpath,/usr/local/opt/libomp/lib
+    -Wl,-rpath,/opt/homebrew/opt/libomp/lib
+  )
+  message(STATUS "Added rpath for OpenMP library: ${TORCH_OMP_LIB_DIR}")
 endif()
 
 executorch_target_link_options_shared_lib(metal_backend)
diff --git a/backends/apple/metal/runtime/metal_backend.cpp b/backends/apple/metal/runtime/metal_backend.cpp
index 4ce3ea7f7da..33699c51c45 100644
--- a/backends/apple/metal/runtime/metal_backend.cpp
+++ b/backends/apple/metal/runtime/metal_backend.cpp
@@ -30,11 +30,11 @@
 #include <executorch/backends/aoti/common_shims.h>
 
 // Include our Metal-specific shim layer headers
+#include <executorch/backends/apple/metal/runtime/shims/et_metal.h>
 #include <executorch/backends/apple/metal/runtime/shims/memory.h>
+#include <executorch/backends/apple/metal/runtime/shims/shim_mps.h>
 #include <executorch/backends/apple/metal/runtime/shims/tensor_attribute.h>
 #include <executorch/backends/apple/metal/runtime/shims/utils.h>
-#include <executorch/backends/apple/metal/runtime/shims/shim_mps.h>
-#include <executorch/backends/apple/metal/runtime/shims/et_metal.h>
 
 namespace executorch {
 namespace backends {
@@ -95,7 +95,10 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
     const NamedDataMap* named_data_map = context.get_named_data_map();
     ET_LOG(Info, "MetalBackend::init - Got named data map: %p", named_data_map);
 
-    ET_LOG(Info, "MetalBackend::init - Looking for blob key: %s", so_blob_key.c_str());
+    ET_LOG(
+        Info,
+        "MetalBackend::init - Looking for blob key: %s",
+        so_blob_key.c_str());
 
     Result<FreeableBuffer> aoti_metal_buffer =
         named_data_map->get_data(so_blob_key.c_str());
@@ -110,14 +113,20 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
       return Error::InvalidArgument;
     }
 
-    ET_LOG(Info, "MetalBackend::init - Buffer is OK, size: %zu", aoti_metal_buffer->size());
+    ET_LOG(
+        Info,
+        "MetalBackend::init - Buffer is OK, size: %zu",
+        aoti_metal_buffer->size());
 
     if (aoti_metal_buffer->data() == nullptr) {
       ET_LOG(Error, "MetalBackend::init - Buffer data is null");
       return Error::InvalidArgument;
     }
 
-    ET_LOG(Info, "MetalBackend::init - Buffer data pointer: %p", aoti_metal_buffer->data());
+    ET_LOG(
+        Info,
+        "MetalBackend::init - Buffer data pointer: %p",
+        aoti_metal_buffer->data());
 
     // Generate dynamic temporary file path
     filesystem::path temp_dir = filesystem::temp_directory_path();
@@ -125,7 +134,8 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
         temp_dir / (so_blob_key + to_string(getpid()) + ".so");
 
     // Create a temporary file
-    ET_LOG(Info, "MetalBackend::init - Creating temp file: %s", so_path.c_str());
+    ET_LOG(
+        Info, "MetalBackend::init - Creating temp file: %s", so_path.c_str());
     std::ofstream outfile(so_path.c_str(), std::ios::binary);
 
     if (!outfile.is_open()) {
@@ -136,7 +146,10 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
 
     // Write the ELF buffer to the temporary file
     size_t buffer_size = aoti_metal_buffer->size();
-    ET_LOG(Info, "MetalBackend::init - About to write %zu bytes to file", buffer_size);
+    ET_LOG(
+        Info,
+        "MetalBackend::init - About to write %zu bytes to file",
+        buffer_size);
 
     // Write the buffer directly, not using sizeof(void*) multiplication
     outfile.write((char*)aoti_metal_buffer->data(), buffer_size);
@@ -214,17 +227,28 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
     }
 
     AOTInductorModelContainerHandle container_handle = nullptr;
-    ET_LOG(Info, "MetalBackend::init - About to create AOTI container with device='mps'");
+    ET_LOG(
+        Info,
+        "MetalBackend::init - About to create AOTI container with device='mps'");
 
     AOTIRuntimeError err = AOTInductorModelContainerCreateWithDevice(
         &container_handle, 1, "mps", nullptr);
-    ET_LOG(Info, "MetalBackend::init - AOTInductorModelContainerCreateWithDevice returned err=%d", err);
+    ET_LOG(
+        Info,
+        "MetalBackend::init - AOTInductorModelContainerCreateWithDevice returned err=%d",
+        err);
 
     if (err != Error::Ok) {
-      ET_LOG(Error, "Failed to initialize AOTInductorModelContainer with error %d", err);
+      ET_LOG(
+          Error,
+          "Failed to initialize AOTInductorModelContainer with error %d",
+          err);
       return err;
     }
-    ET_LOG(Info, "Successfully initialized container_handle = %p", container_handle);
+    ET_LOG(
+        Info,
+        "Successfully initialized container_handle = %p",
+        container_handle);
 
     AOTIDelegateHandle* handle = new AOTIDelegateHandle();
     handle->so_handle = so_handle;
@@ -289,7 +313,11 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
       auto cpu_tensor = &(args[i]->toTensor());
       auto sizes = cpu_tensor->sizes();
       auto scalar_type = cpu_tensor->scalar_type();
-      ET_LOG(Debug, "MetalBackend input %d scalar_type=%d", i, static_cast<int32_t>(scalar_type));
+      ET_LOG(
+          Debug,
+          "MetalBackend input %d scalar_type=%d",
+          i,
+          static_cast<int32_t>(scalar_type));
 
       // Create GPU tensor with same shape
       std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
@@ -310,8 +338,13 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
       }
 
       // Log the created GPU tensor scalar type
-      auto gpu_tensor = reinterpret_cast<executorch::runtime::etensor::Tensor*>(gpu_input_handle);
-      ET_LOG(Debug, "MetalBackend created GPU tensor %d scalar_type=%d", i, static_cast<int32_t>(gpu_tensor->scalar_type()));
+      auto gpu_tensor = reinterpret_cast<executorch::runtime::etensor::Tensor*>(
+          gpu_input_handle);
+      ET_LOG(
+          Debug,
+          "MetalBackend created GPU tensor %d scalar_type=%d",
+          i,
+          static_cast<int32_t>(gpu_tensor->scalar_type()));
 
       gpu_inputs[i] = gpu_input_handle;
 
@@ -319,8 +352,14 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
       void* cpu_data = cpu_tensor->mutable_data_ptr();
       if (cpu_data && cpu_tensor->numel() > 0) {
         float* cpu_float_data = (float*)cpu_data;
-        ET_LOG(Debug, "CPU input %d data before copy: [%.3f, %.3f, %.3f, ...] (numel=%zd)",
-               i, cpu_float_data[0], cpu_float_data[1], cpu_float_data[2], cpu_tensor->numel());
+        ET_LOG(
+            Debug,
+            "CPU input %d data before copy: [%.3f, %.3f, %.3f, ...] (numel=%zd)",
+            i,
+            cpu_float_data[0],
+            cpu_float_data[1],
+            cpu_float_data[2],
+            cpu_tensor->numel());
       }
 
       // Copy data from CPU to GPU
@@ -331,8 +370,14 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
       }
 
       // Log the GPU tensor scalar type after copy
-      auto gpu_tensor_after = reinterpret_cast<executorch::runtime::etensor::Tensor*>(gpu_inputs[i]);
-      ET_LOG(Debug, "MetalBackend GPU tensor %d scalar_type after copy=%d", i, static_cast<int32_t>(gpu_tensor_after->scalar_type()));
+      auto gpu_tensor_after =
+          reinterpret_cast<executorch::runtime::etensor::Tensor*>(
+              gpu_inputs[i]);
+      ET_LOG(
+          Debug,
+          "MetalBackend GPU tensor %d scalar_type after copy=%d",
+          i,
+          static_cast<int32_t>(gpu_tensor_after->scalar_type()));
 
       ET_LOG(Debug, "Successfully copied input %d from CPU to GPU", i);
     }
@@ -346,7 +391,11 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
       auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
       auto sizes = cpu_output_tensor->sizes();
       auto scalar_type = cpu_output_tensor->scalar_type();
-      ET_LOG(Debug, "MetalBackend output %d scalar_type=%d", i, static_cast<int32_t>(scalar_type));
+      ET_LOG(
+          Debug,
+          "MetalBackend output %d scalar_type=%d",
+          i,
+          static_cast<int32_t>(scalar_type));
 
       // Create GPU tensor with same shape for kernel output
       std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
@@ -376,13 +425,21 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
     ET_LOG(Debug, "Passing to AOTInductorModelContainerRun:");
     for (int i = 0; i < n_inputs; i++) {
       void* gpu_input_data = gpu_inputs[i]->mutable_data_ptr();
-      ET_LOG(Debug, "  gpu_inputs[%d] = %p, data_ptr = %p",
-             i, gpu_inputs[i], gpu_input_data);
+      ET_LOG(
+          Debug,
+          "  gpu_inputs[%d] = %p, data_ptr = %p",
+          i,
+          gpu_inputs[i],
+          gpu_input_data);
     }
     for (int i = 0; i < n_outputs; i++) {
       void* gpu_output_data = gpu_outputs[i]->mutable_data_ptr();
-      ET_LOG(Debug, "  gpu_outputs[%d] = %p, data_ptr = %p",
-             i, gpu_outputs[i], gpu_output_data);
+      ET_LOG(
+          Debug,
+          "  gpu_outputs[%d] = %p, data_ptr = %p",
+          i,
+          gpu_outputs[i],
+          gpu_output_data);
     }
 
     // Run AOTI container with GPU tensors
@@ -407,10 +464,15 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
     try {
       synchronize_metal_stream();
     } catch (const std::exception& e) {
-      ET_LOG(Error, "Failed to synchronize Metal stream after kernel execution: %s", e.what());
+      ET_LOG(
+          Error,
+          "Failed to synchronize Metal stream after kernel execution: %s",
+          e.what());
       return Error::Internal;
     } catch (...) {
-      ET_LOG(Error, "Failed to synchronize Metal stream after kernel execution: unknown exception");
+      ET_LOG(
+          Error,
+          "Failed to synchronize Metal stream after kernel execution: unknown exception");
       return Error::Internal;
     }
 

From f9c8989ad0498111c268b8feb9f953aabb4dfe4a Mon Sep 17 00:00:00 2001
From: Manuel Candales <mcandales@meta.com>
Date: Mon, 13 Oct 2025 18:21:29 -0400
Subject: [PATCH 3/5] Update

[ghstack-poisoned]
---
 .../apple/metal/runtime/metal_backend.cpp     | 352 +++++++-----------
 examples/models/voxtral/CMakeLists.txt        |   5 +
 tools/cmake/executorch-config.cmake           |   1 +
 tools/cmake/preset/default.cmake              |   2 +-
 4 files changed, 145 insertions(+), 215 deletions(-)

diff --git a/backends/apple/metal/runtime/metal_backend.cpp b/backends/apple/metal/runtime/metal_backend.cpp
index 33699c51c45..50c89cd66e4 100644
--- a/backends/apple/metal/runtime/metal_backend.cpp
+++ b/backends/apple/metal/runtime/metal_backend.cpp
@@ -6,23 +6,17 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <executorch/extension/tensor/tensor.h>
+#include <dlfcn.h>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
-
-#include <dlfcn.h>
-#include <fcntl.h>
-#include <sys/stat.h>
+#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
 #include <unistd.h>
 #include <cstdio>
-#include <cstdlib>
-#include <cstring>
 
 #include <filesystem>
 #include <fstream>
-#include <iostream>
-#include <memory>
+#include <string>
 #include <vector>
 
 // Include AOTI common headers (from aoti_common library)
@@ -36,9 +30,14 @@
 #include <executorch/backends/apple/metal/runtime/shims/tensor_attribute.h>
 #include <executorch/backends/apple/metal/runtime/shims/utils.h>
 
-namespace executorch {
-namespace backends {
-namespace metal {
+namespace executorch::backends::metal {
+
+#define LOAD_SYMBOL(name, handle)                                \
+  do {                                                           \
+    name = reinterpret_cast<name##Func>(dlsym(handle, #name));   \
+    ET_CHECK_OR_RETURN_ERROR(                                    \
+        name != nullptr, AccessFailed, "Failed to load " #name); \
+  } while (0)
 
 using namespace std;
 using namespace aoti;
@@ -59,7 +58,36 @@ using executorch::runtime::Result;
 using executorch::runtime::Span;
 using executorch::runtime::etensor::Tensor;
 
-class MetalBackend final : public ::executorch::runtime::BackendInterface {
+class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::BackendInterface {
+ private:
+  Error register_shared_library_functions(void* so_handle) const {
+    ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loading symbols");
+
+    LOAD_SYMBOL(AOTInductorModelContainerCreateWithDevice, so_handle);
+    ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerCreateWithDevice");
+
+    LOAD_SYMBOL(AOTInductorModelContainerDelete, so_handle);
+    ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerDelete");
+
+    LOAD_SYMBOL(AOTInductorModelContainerGetNumInputs, so_handle);
+    ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetNumInputs");
+
+    LOAD_SYMBOL(AOTInductorModelContainerGetNumConstants, so_handle);
+    ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetNumConstants");
+
+    LOAD_SYMBOL(AOTInductorModelContainerGetInputName, so_handle);
+    ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetInputName");
+
+    LOAD_SYMBOL(AOTInductorModelContainerGetNumOutputs, so_handle);
+    ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetNumOutputs");
+
+    LOAD_SYMBOL(AOTInductorModelContainerRun, so_handle);
+    ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerRun");
+
+    ET_LOG(Info, "MetalBackend::register_shared_library_functions - All symbols loaded successfully");
+    return Error::Ok;
+  }
+
  public:
   // Once in program
   MetalBackend() {
@@ -95,38 +123,24 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
     const NamedDataMap* named_data_map = context.get_named_data_map();
     ET_LOG(Info, "MetalBackend::init - Got named data map: %p", named_data_map);
 
-    ET_LOG(
-        Info,
-        "MetalBackend::init - Looking for blob key: %s",
-        so_blob_key.c_str());
-
-    Result<FreeableBuffer> aoti_metal_buffer =
-        named_data_map->get_data(so_blob_key.c_str());
-    ET_LOG(Info, "MetalBackend::init - Got buffer result");
+    ET_LOG(Info, "MetalBackend::init - Looking for blob key: %s", so_blob_key.c_str());
 
-    if (!aoti_metal_buffer.ok()) {
-      ET_LOG(
-          Error,
-          "MetalBackend::init - Failed to get buffer for key %s: 0x%x",
-          so_blob_key.c_str(),
-          aoti_metal_buffer.error());
-      return Error::InvalidArgument;
-    }
+    auto aoti_metal_buffer = named_data_map->get_data(so_blob_key.c_str());
+    ET_CHECK_OR_RETURN_ERROR(
+        aoti_metal_buffer.ok(),
+        Internal,
+        "Failed to get data for key %s: 0x%x",
+        so_blob_key.c_str(),
+        static_cast<uint32_t>(aoti_metal_buffer.error()));
 
-    ET_LOG(
-        Info,
-        "MetalBackend::init - Buffer is OK, size: %zu",
-        aoti_metal_buffer->size());
+    ET_LOG(Info, "MetalBackend::init - Buffer is OK, size: %zu", aoti_metal_buffer->size());
 
     if (aoti_metal_buffer->data() == nullptr) {
       ET_LOG(Error, "MetalBackend::init - Buffer data is null");
       return Error::InvalidArgument;
     }
 
-    ET_LOG(
-        Info,
-        "MetalBackend::init - Buffer data pointer: %p",
-        aoti_metal_buffer->data());
+    ET_LOG(Info, "MetalBackend::init - Buffer data pointer: %p", aoti_metal_buffer->data());
 
     // Generate dynamic temporary file path
     filesystem::path temp_dir = filesystem::temp_directory_path();
@@ -134,31 +148,22 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
         temp_dir / (so_blob_key + to_string(getpid()) + ".so");
 
     // Create a temporary file
-    ET_LOG(
-        Info, "MetalBackend::init - Creating temp file: %s", so_path.c_str());
-    std::ofstream outfile(so_path.c_str(), std::ios::binary);
-
-    if (!outfile.is_open()) {
-      ET_LOG(Error, "MetalBackend::init - Failed to create temp file");
-      return Error::AccessFailed;
-    }
-    ET_LOG(Info, "MetalBackend::init - Temp file created successfully");
+    ET_LOG(Info, "MetalBackend::init - Creating temp file: %s", so_path.c_str());
+    ofstream outfile(so_path.c_str(), ios::binary);
 
     // Write the ELF buffer to the temporary file
-    size_t buffer_size = aoti_metal_buffer->size();
     ET_LOG(
         Info,
-        "MetalBackend::init - About to write %zu bytes to file",
-        buffer_size);
+        "Writing %zu bytes to %s",
+        aoti_metal_buffer->size(),
+        so_path.c_str());
 
-    // Write the buffer directly, not using sizeof(void*) multiplication
-    outfile.write((char*)aoti_metal_buffer->data(), buffer_size);
+    outfile.write(
+        static_cast<const char*>(aoti_metal_buffer->data()),
+        aoti_metal_buffer->size());
 
-    if (outfile.bad()) {
-      ET_LOG(Error, "MetalBackend::init - File write failed");
-      return Error::AccessFailed;
-    }
-    ET_LOG(Info, "MetalBackend::init - Buffer written successfully");
+    ET_CHECK_OR_RETURN_ERROR(
+        outfile, AccessFailed, "Failed to write to file %s", so_path.c_str());
 
     // Finish writing the file to disk
     outfile.close();
@@ -166,93 +171,31 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
 
     // Load the ELF using dlopen
     void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
-    if (so_handle == nullptr) {
-      std::cout << dlerror() << std::endl;
-      return Error::AccessFailed;
-    }
+    ET_CHECK_OR_RETURN_ERROR(
+        so_handle != nullptr,
+        AccessFailed,
+        "Failed to load shared library: %s",
+        dlerror());
 
     processed->Free();
 
-    AOTInductorModelContainerCreateWithDevice =
-        reinterpret_cast<AOTInductorModelContainerCreateWithDeviceFunc>(
-            dlsym(so_handle, "AOTInductorModelContainerCreateWithDevice"));
-    if (AOTInductorModelContainerCreateWithDevice == nullptr) {
-      perror("dlsym1");
-      return Error::AccessFailed;
-    }
-    AOTInductorModelContainerDelete =
-        reinterpret_cast<AOTInductorModelContainerDeleteFunc>(
-            dlsym(so_handle, "AOTInductorModelContainerDelete"));
-    if (AOTInductorModelContainerDelete == nullptr) {
-      perror("dlsym2");
-      return Error::AccessFailed;
-    }
-    AOTInductorModelContainerGetNumInputs =
-        reinterpret_cast<AOTInductorModelContainerGetNumInputsFunc>(
-            dlsym(so_handle, "AOTInductorModelContainerGetNumInputs"));
-    if (AOTInductorModelContainerGetNumInputs == nullptr) {
-      perror("dlsym3");
-      return Error::AccessFailed;
-    }
-
-    AOTInductorModelContainerGetNumConstants =
-        reinterpret_cast<AOTInductorModelContainerGetNumConstantsFunc>(
-            dlsym(so_handle, "AOTInductorModelContainerGetNumConstants"));
-    if (AOTInductorModelContainerGetNumConstants == nullptr) {
-      perror("dlsym AOTInductorModelContainerGetNumConstants");
-      return Error::AccessFailed;
-    }
-
-    AOTInductorModelContainerGetInputName =
-        reinterpret_cast<AOTInductorModelContainerGetInputNameFunc>(
-            dlsym(so_handle, "AOTInductorModelContainerGetInputName"));
-    if (AOTInductorModelContainerGetInputName == nullptr) {
-      perror("dlsym AOTInductorModelContainerGetInputName");
-      return Error::AccessFailed;
-    }
-
-    AOTInductorModelContainerGetNumOutputs =
-        reinterpret_cast<AOTInductorModelContainerGetNumOutputsFunc>(
-            dlsym(so_handle, "AOTInductorModelContainerGetNumOutputs"));
-    if (AOTInductorModelContainerGetNumOutputs == nullptr) {
-      perror("dlsym4");
-      return Error::AccessFailed;
-    }
-    AOTInductorModelContainerRun =
-        reinterpret_cast<AOTInductorModelContainerRunFunc>(
-            dlsym(so_handle, "AOTInductorModelContainerRun"));
-    if (AOTInductorModelContainerRun == nullptr) {
-      perror("dlsym5");
-      return Error::AccessFailed;
-    }
+    // Register all shared library functions
+    ET_CHECK_OK_OR_RETURN_ERROR(register_shared_library_functions(so_handle));
 
     AOTInductorModelContainerHandle container_handle = nullptr;
-    ET_LOG(
-        Info,
-        "MetalBackend::init - About to create AOTI container with device='mps'");
+    ET_LOG(Info, "MetalBackend::init - About to create AOTI container with device='mps'");
 
-    AOTIRuntimeError err = AOTInductorModelContainerCreateWithDevice(
-        &container_handle, 1, "mps", nullptr);
-    ET_LOG(
-        Info,
-        "MetalBackend::init - AOTInductorModelContainerCreateWithDevice returned err=%d",
-        err);
+    ET_CHECK_OK_OR_RETURN_ERROR(AOTInductorModelContainerCreateWithDevice(
+        &container_handle, 1, "mps", nullptr));
 
-    if (err != Error::Ok) {
-      ET_LOG(
-          Error,
-          "Failed to initialize AOTInductorModelContainer with error %d",
-          err);
-      return err;
-    }
-    ET_LOG(
-        Info,
-        "Successfully initialized container_handle = %p",
-        container_handle);
+    ET_LOG(Info, "container_handle = %p", container_handle);
 
     AOTIDelegateHandle* handle = new AOTIDelegateHandle();
     handle->so_handle = so_handle;
+    handle->so_path = so_path.string();
     handle->container_handle = container_handle;
+
+    ET_LOG(Info, "MetalBackend::init - Initialization completed successfully");
     return (DelegateHandle*)handle; // Return the handle post-processing
   }
 
@@ -265,6 +208,14 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
 
     AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
 
+    // Need to re-register all the symbols from the so_handle hosted by this
+    // MetalBackend instance. The reason is that these symbols are
+    // static/singleton across the whole process. When we share multiple methods
+    // (meaning multiple so_handle) in the same process, we need to re-register
+    // the symbols from the so_handle that is being used in this execution.
+    ET_CHECK_OK_OR_RETURN_ERROR(
+        register_shared_library_functions(handle->so_handle));
+
     ET_LOG(Debug, "MetalBackend Handle generated");
 
     size_t n_inputs;
@@ -276,15 +227,13 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
 
     ET_LOG(Debug, "MetalBackend n_outputs %zd generated", n_outputs);
 
-    if (n_inputs + n_outputs != args.size()) {
-      ET_LOG(
-          Error,
-          "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.",
-          n_inputs,
-          n_outputs,
-          args.size());
-      return Error::InvalidArgument;
-    }
+    ET_CHECK_OR_RETURN_ERROR(
+        n_inputs + n_outputs == args.size(),
+        InvalidArgument,
+        "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.",
+        n_inputs,
+        n_outputs,
+        args.size())
 
     ET_LOG(
         Debug,
@@ -313,11 +262,7 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
       auto cpu_tensor = &(args[i]->toTensor());
       auto sizes = cpu_tensor->sizes();
       auto scalar_type = cpu_tensor->scalar_type();
-      ET_LOG(
-          Debug,
-          "MetalBackend input %d scalar_type=%d",
-          i,
-          static_cast<int32_t>(scalar_type));
+      ET_LOG(Debug, "MetalBackend input %d scalar_type=%d", i, static_cast<int32_t>(scalar_type));
 
       // Create GPU tensor with same shape
       std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
@@ -338,13 +283,8 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
       }
 
       // Log the created GPU tensor scalar type
-      auto gpu_tensor = reinterpret_cast<executorch::runtime::etensor::Tensor*>(
-          gpu_input_handle);
-      ET_LOG(
-          Debug,
-          "MetalBackend created GPU tensor %d scalar_type=%d",
-          i,
-          static_cast<int32_t>(gpu_tensor->scalar_type()));
+      auto gpu_tensor = reinterpret_cast<executorch::runtime::etensor::Tensor*>(gpu_input_handle);
+      ET_LOG(Debug, "MetalBackend created GPU tensor %d scalar_type=%d", i, static_cast<int32_t>(gpu_tensor->scalar_type()));
 
       gpu_inputs[i] = gpu_input_handle;
 
@@ -352,14 +292,8 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
       void* cpu_data = cpu_tensor->mutable_data_ptr();
       if (cpu_data && cpu_tensor->numel() > 0) {
         float* cpu_float_data = (float*)cpu_data;
-        ET_LOG(
-            Debug,
-            "CPU input %d data before copy: [%.3f, %.3f, %.3f, ...] (numel=%zd)",
-            i,
-            cpu_float_data[0],
-            cpu_float_data[1],
-            cpu_float_data[2],
-            cpu_tensor->numel());
+        ET_LOG(Debug, "CPU input %d data before copy: [%.3f, %.3f, %.3f, ...] (numel=%zd)",
+               i, cpu_float_data[0], cpu_float_data[1], cpu_float_data[2], cpu_tensor->numel());
       }
 
       // Copy data from CPU to GPU
@@ -370,14 +304,8 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
       }
 
       // Log the GPU tensor scalar type after copy
-      auto gpu_tensor_after =
-          reinterpret_cast<executorch::runtime::etensor::Tensor*>(
-              gpu_inputs[i]);
-      ET_LOG(
-          Debug,
-          "MetalBackend GPU tensor %d scalar_type after copy=%d",
-          i,
-          static_cast<int32_t>(gpu_tensor_after->scalar_type()));
+      auto gpu_tensor_after = reinterpret_cast<executorch::runtime::etensor::Tensor*>(gpu_inputs[i]);
+      ET_LOG(Debug, "MetalBackend GPU tensor %d scalar_type after copy=%d", i, static_cast<int32_t>(gpu_tensor_after->scalar_type()));
 
       ET_LOG(Debug, "Successfully copied input %d from CPU to GPU", i);
     }
@@ -391,11 +319,7 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
       auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
       auto sizes = cpu_output_tensor->sizes();
       auto scalar_type = cpu_output_tensor->scalar_type();
-      ET_LOG(
-          Debug,
-          "MetalBackend output %d scalar_type=%d",
-          i,
-          static_cast<int32_t>(scalar_type));
+      ET_LOG(Debug, "MetalBackend output %d scalar_type=%d", i, static_cast<int32_t>(scalar_type));
 
       // Create GPU tensor with same shape for kernel output
       std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
@@ -425,21 +349,13 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
     ET_LOG(Debug, "Passing to AOTInductorModelContainerRun:");
     for (int i = 0; i < n_inputs; i++) {
       void* gpu_input_data = gpu_inputs[i]->mutable_data_ptr();
-      ET_LOG(
-          Debug,
-          "  gpu_inputs[%d] = %p, data_ptr = %p",
-          i,
-          gpu_inputs[i],
-          gpu_input_data);
+      ET_LOG(Debug, "  gpu_inputs[%d] = %p, data_ptr = %p",
+             i, gpu_inputs[i], gpu_input_data);
     }
     for (int i = 0; i < n_outputs; i++) {
       void* gpu_output_data = gpu_outputs[i]->mutable_data_ptr();
-      ET_LOG(
-          Debug,
-          "  gpu_outputs[%d] = %p, data_ptr = %p",
-          i,
-          gpu_outputs[i],
-          gpu_output_data);
+      ET_LOG(Debug, "  gpu_outputs[%d] = %p, data_ptr = %p",
+             i, gpu_outputs[i], gpu_output_data);
     }
 
     // Run AOTI container with GPU tensors
@@ -464,15 +380,10 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
     try {
       synchronize_metal_stream();
     } catch (const std::exception& e) {
-      ET_LOG(
-          Error,
-          "Failed to synchronize Metal stream after kernel execution: %s",
-          e.what());
+      ET_LOG(Error, "Failed to synchronize Metal stream after kernel execution: %s", e.what());
       return Error::Internal;
     } catch (...) {
-      ET_LOG(
-          Error,
-          "Failed to synchronize Metal stream after kernel execution: unknown exception");
+      ET_LOG(Error, "Failed to synchronize Metal stream after kernel execution: unknown exception");
       return Error::Internal;
     }
 
@@ -481,11 +392,15 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
     // Copy GPU output results back to CPU output tensors
     for (int i = 0; i < n_outputs; i++) {
       auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
-      Error copy_err = aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0);
-      if (copy_err != Error::Ok) {
-        ET_LOG(Error, "Failed to copy GPU output %d back to CPU", i);
-        return Error::Internal;
-      }
+      // For DYNAMIC_BOUND tensors we try to resize
+      ET_CHECK_OK_OR_RETURN_ERROR(
+          resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
+          "Error resizing tensor at output index %d",
+          i);
+      ET_CHECK_OK_OR_RETURN_ERROR(
+          aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
+          "Failed to copy GPU output %d back to CPU",
+          i);
       ET_LOG(Debug, "Copied GPU output %d back to CPU", i);
     }
 
@@ -507,40 +422,49 @@ class MetalBackend final : public ::executorch::runtime::BackendInterface {
   }
 
   void destroy(DelegateHandle* handle_) const override {
+    if (handle_ == nullptr) {
+      return;
+    }
     AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
 
-    // Delete the container BEFORE closing the shared library
-    if (handle->container_handle != nullptr) {
-      AOTIRuntimeError delete_result =
-          AOTInductorModelContainerDelete(handle->container_handle);
-      if (delete_result != Error::Ok) {
-        ET_LOG(
-            Error,
-            "AOTInductorModelContainerDelete failed with error code %d",
-            delete_result);
-      }
-    }
+    // NOTE: AOTInductorModelContainerDelete does not work correctly with
+    // multiple .so files. Deleting one container frees shared resources,
+    // which causes segmentation faults when attempting to delete other
+    // containers. As a workaround, we skip explicit container deletion
+    // and defer cleanup to the OS.
+    // TODO(gasoonjia): Find a proper solution for safe container deletion.
+    // AOTInductorModelContainerDelete(handle->container_handle);
 
     // Now close the shared library
     if (handle->so_handle != nullptr) {
       dlclose(handle->so_handle);
     }
 
-    free(handle);
+    // Remove the temporary shared library file
+    if (!handle->so_path.empty()) {
+      std::error_code remove_error;
+      std::filesystem::remove(handle->so_path, remove_error);
+      ET_CHECK_OR_LOG_ERROR(
+          !remove_error,
+          "Failed to remove temporary shared library %s: %s",
+          handle->so_path.c_str(),
+          remove_error.message().c_str());
+    }
+
+    delete handle;
     cleanup_memory();
     executorch::backends::aoti::cleanup_tensor_metadata();
     ET_LOG(Debug, "MetalBackend handle %p destroy", handle_);
   }
 };
 
-} // namespace metal
+} // namespace executorch::backends::metal
 
+namespace executorch::backends {
 namespace {
 auto cls = metal::MetalBackend();
 executorch::runtime::Backend backend{"MetalBackend", &cls};
 static executorch::runtime::Error success_with_compiler =
     register_backend(backend);
 } // namespace
-
-} // namespace backends
-} // namespace executorch
+} // namespace executorch::backends
diff --git a/examples/models/voxtral/CMakeLists.txt b/examples/models/voxtral/CMakeLists.txt
index 3995f5533e6..866d17160ba 100644
--- a/examples/models/voxtral/CMakeLists.txt
+++ b/examples/models/voxtral/CMakeLists.txt
@@ -93,6 +93,11 @@ if(EXECUTORCH_BUILD_CUDA)
   executorch_target_link_options_shared_lib(aoti_cuda)
 endif()
 
+if(EXECUTORCH_BUILD_METAL)
+  list(APPEND link_libraries metal_backend)
+  executorch_target_link_options_shared_lib(metal_backend)
+endif()
+
 # Add tokenizers
 list(APPEND link_libraries tokenizers::tokenizers)
 
diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake
index 3df8e947459..21bbbe26350 100644
--- a/tools/cmake/executorch-config.cmake
+++ b/tools/cmake/executorch-config.cmake
@@ -63,6 +63,7 @@ set(optional_lib_list
     coreml_inmemoryfs
     coremldelegate
     mpsdelegate
+    metal_backend
     neuron_backend
     qnn_executorch_backend
     portable_ops_lib
diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake
index b2b09091014..861e41e4a63 100644
--- a/tools/cmake/preset/default.cmake
+++ b/tools/cmake/preset/default.cmake
@@ -153,7 +153,7 @@ define_overridable_option(
   EXECUTORCH_BUILD_CUDA "Build the CUDA backend" BOOL OFF
 )
 define_overridable_option(
-  EXECUTORCH_BUILD_METAL "Build the AOTI Metal backend" BOOL OFF
+  EXECUTORCH_BUILD_METAL "Build the Metal backend" BOOL OFF
 )
 define_overridable_option(
   EXECUTORCH_BUILD_VGF "Build the Arm VGF backend" BOOL OFF

From b87d5dee3eb02e1e8f9187d6fef9570cbc1b2649 Mon Sep 17 00:00:00 2001
From: Manuel Candales <mcandales@meta.com>
Date: Mon, 13 Oct 2025 18:23:13 -0400
Subject: [PATCH 4/5] Update

[ghstack-poisoned]
---
 .../apple/metal/runtime/metal_backend.cpp     | 129 ++++++++++++++----
 1 file changed, 99 insertions(+), 30 deletions(-)

diff --git a/backends/apple/metal/runtime/metal_backend.cpp b/backends/apple/metal/runtime/metal_backend.cpp
index 50c89cd66e4..03673f2f1dd 100644
--- a/backends/apple/metal/runtime/metal_backend.cpp
+++ b/backends/apple/metal/runtime/metal_backend.cpp
@@ -58,33 +58,52 @@ using executorch::runtime::Result;
 using executorch::runtime::Span;
 using executorch::runtime::etensor::Tensor;
 
-class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::BackendInterface {
+class ET_EXPERIMENTAL MetalBackend final
+    : public ::executorch::runtime::BackendInterface {
  private:
   Error register_shared_library_functions(void* so_handle) const {
-    ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loading symbols");
+    ET_LOG(
+        Info,
+        "MetalBackend::register_shared_library_functions - Loading symbols");
 
     LOAD_SYMBOL(AOTInductorModelContainerCreateWithDevice, so_handle);
-    ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerCreateWithDevice");
+    ET_LOG(
+        Info,
+        "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerCreateWithDevice");
 
     LOAD_SYMBOL(AOTInductorModelContainerDelete, so_handle);
-    ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerDelete");
+    ET_LOG(
+        Info,
+        "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerDelete");
 
     LOAD_SYMBOL(AOTInductorModelContainerGetNumInputs, so_handle);
-    ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetNumInputs");
+    ET_LOG(
+        Info,
+        "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetNumInputs");
 
     LOAD_SYMBOL(AOTInductorModelContainerGetNumConstants, so_handle);
-    ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetNumConstants");
+    ET_LOG(
+        Info,
+        "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetNumConstants");
 
     LOAD_SYMBOL(AOTInductorModelContainerGetInputName, so_handle);
-    ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetInputName");
+    ET_LOG(
+        Info,
+        "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetInputName");
 
     LOAD_SYMBOL(AOTInductorModelContainerGetNumOutputs, so_handle);
-    ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetNumOutputs");
+    ET_LOG(
+        Info,
+        "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerGetNumOutputs");
 
     LOAD_SYMBOL(AOTInductorModelContainerRun, so_handle);
-    ET_LOG(Info, "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerRun");
+    ET_LOG(
+        Info,
+        "MetalBackend::register_shared_library_functions - Loaded AOTInductorModelContainerRun");
 
-    ET_LOG(Info, "MetalBackend::register_shared_library_functions - All symbols loaded successfully");
+    ET_LOG(
+        Info,
+        "MetalBackend::register_shared_library_functions - All symbols loaded successfully");
     return Error::Ok;
   }
 
@@ -123,7 +142,10 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend
     const NamedDataMap* named_data_map = context.get_named_data_map();
     ET_LOG(Info, "MetalBackend::init - Got named data map: %p", named_data_map);
 
-    ET_LOG(Info, "MetalBackend::init - Looking for blob key: %s", so_blob_key.c_str());
+    ET_LOG(
+        Info,
+        "MetalBackend::init - Looking for blob key: %s",
+        so_blob_key.c_str());
 
     auto aoti_metal_buffer = named_data_map->get_data(so_blob_key.c_str());
     ET_CHECK_OR_RETURN_ERROR(
@@ -133,14 +155,20 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend
         so_blob_key.c_str(),
         static_cast<uint32_t>(aoti_metal_buffer.error()));
 
-    ET_LOG(Info, "MetalBackend::init - Buffer is OK, size: %zu", aoti_metal_buffer->size());
+    ET_LOG(
+        Info,
+        "MetalBackend::init - Buffer is OK, size: %zu",
+        aoti_metal_buffer->size());
 
     if (aoti_metal_buffer->data() == nullptr) {
       ET_LOG(Error, "MetalBackend::init - Buffer data is null");
       return Error::InvalidArgument;
     }
 
-    ET_LOG(Info, "MetalBackend::init - Buffer data pointer: %p", aoti_metal_buffer->data());
+    ET_LOG(
+        Info,
+        "MetalBackend::init - Buffer data pointer: %p",
+        aoti_metal_buffer->data());
 
     // Generate dynamic temporary file path
     filesystem::path temp_dir = filesystem::temp_directory_path();
@@ -148,7 +176,8 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend
         temp_dir / (so_blob_key + to_string(getpid()) + ".so");
 
     // Create a temporary file
-    ET_LOG(Info, "MetalBackend::init - Creating temp file: %s", so_path.c_str());
+    ET_LOG(
+        Info, "MetalBackend::init - Creating temp file: %s", so_path.c_str());
     ofstream outfile(so_path.c_str(), ios::binary);
 
     // Write the ELF buffer to the temporary file
@@ -183,7 +212,9 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend
     ET_CHECK_OK_OR_RETURN_ERROR(register_shared_library_functions(so_handle));
 
     AOTInductorModelContainerHandle container_handle = nullptr;
-    ET_LOG(Info, "MetalBackend::init - About to create AOTI container with device='mps'");
+    ET_LOG(
+        Info,
+        "MetalBackend::init - About to create AOTI container with device='mps'");
 
     ET_CHECK_OK_OR_RETURN_ERROR(AOTInductorModelContainerCreateWithDevice(
         &container_handle, 1, "mps", nullptr));
@@ -262,7 +293,11 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend
       auto cpu_tensor = &(args[i]->toTensor());
       auto sizes = cpu_tensor->sizes();
       auto scalar_type = cpu_tensor->scalar_type();
-      ET_LOG(Debug, "MetalBackend input %d scalar_type=%d", i, static_cast<int32_t>(scalar_type));
+      ET_LOG(
+          Debug,
+          "MetalBackend input %d scalar_type=%d",
+          i,
+          static_cast<int32_t>(scalar_type));
 
       // Create GPU tensor with same shape
       std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
@@ -283,8 +318,13 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend
       }
 
       // Log the created GPU tensor scalar type
-      auto gpu_tensor = reinterpret_cast<executorch::runtime::etensor::Tensor*>(gpu_input_handle);
-      ET_LOG(Debug, "MetalBackend created GPU tensor %d scalar_type=%d", i, static_cast<int32_t>(gpu_tensor->scalar_type()));
+      auto gpu_tensor = reinterpret_cast<executorch::runtime::etensor::Tensor*>(
+          gpu_input_handle);
+      ET_LOG(
+          Debug,
+          "MetalBackend created GPU tensor %d scalar_type=%d",
+          i,
+          static_cast<int32_t>(gpu_tensor->scalar_type()));
 
       gpu_inputs[i] = gpu_input_handle;
 
@@ -292,8 +332,14 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend
       void* cpu_data = cpu_tensor->mutable_data_ptr();
       if (cpu_data && cpu_tensor->numel() > 0) {
         float* cpu_float_data = (float*)cpu_data;
-        ET_LOG(Debug, "CPU input %d data before copy: [%.3f, %.3f, %.3f, ...] (numel=%zd)",
-               i, cpu_float_data[0], cpu_float_data[1], cpu_float_data[2], cpu_tensor->numel());
+        ET_LOG(
+            Debug,
+            "CPU input %d data before copy: [%.3f, %.3f, %.3f, ...] (numel=%zd)",
+            i,
+            cpu_float_data[0],
+            cpu_float_data[1],
+            cpu_float_data[2],
+            cpu_tensor->numel());
       }
 
       // Copy data from CPU to GPU
@@ -304,8 +350,14 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend
       }
 
       // Log the GPU tensor scalar type after copy
-      auto gpu_tensor_after = reinterpret_cast<executorch::runtime::etensor::Tensor*>(gpu_inputs[i]);
-      ET_LOG(Debug, "MetalBackend GPU tensor %d scalar_type after copy=%d", i, static_cast<int32_t>(gpu_tensor_after->scalar_type()));
+      auto gpu_tensor_after =
+          reinterpret_cast<executorch::runtime::etensor::Tensor*>(
+              gpu_inputs[i]);
+      ET_LOG(
+          Debug,
+          "MetalBackend GPU tensor %d scalar_type after copy=%d",
+          i,
+          static_cast<int32_t>(gpu_tensor_after->scalar_type()));
 
       ET_LOG(Debug, "Successfully copied input %d from CPU to GPU", i);
     }
@@ -319,7 +371,11 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend
       auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
       auto sizes = cpu_output_tensor->sizes();
       auto scalar_type = cpu_output_tensor->scalar_type();
-      ET_LOG(Debug, "MetalBackend output %d scalar_type=%d", i, static_cast<int32_t>(scalar_type));
+      ET_LOG(
+          Debug,
+          "MetalBackend output %d scalar_type=%d",
+          i,
+          static_cast<int32_t>(scalar_type));
 
       // Create GPU tensor with same shape for kernel output
       std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
@@ -349,13 +405,21 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend
     ET_LOG(Debug, "Passing to AOTInductorModelContainerRun:");
     for (int i = 0; i < n_inputs; i++) {
       void* gpu_input_data = gpu_inputs[i]->mutable_data_ptr();
-      ET_LOG(Debug, "  gpu_inputs[%d] = %p, data_ptr = %p",
-             i, gpu_inputs[i], gpu_input_data);
+      ET_LOG(
+          Debug,
+          "  gpu_inputs[%d] = %p, data_ptr = %p",
+          i,
+          gpu_inputs[i],
+          gpu_input_data);
     }
     for (int i = 0; i < n_outputs; i++) {
       void* gpu_output_data = gpu_outputs[i]->mutable_data_ptr();
-      ET_LOG(Debug, "  gpu_outputs[%d] = %p, data_ptr = %p",
-             i, gpu_outputs[i], gpu_output_data);
+      ET_LOG(
+          Debug,
+          "  gpu_outputs[%d] = %p, data_ptr = %p",
+          i,
+          gpu_outputs[i],
+          gpu_output_data);
     }
 
     // Run AOTI container with GPU tensors
@@ -380,10 +444,15 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend
     try {
       synchronize_metal_stream();
     } catch (const std::exception& e) {
-      ET_LOG(Error, "Failed to synchronize Metal stream after kernel execution: %s", e.what());
+      ET_LOG(
+          Error,
+          "Failed to synchronize Metal stream after kernel execution: %s",
+          e.what());
       return Error::Internal;
     } catch (...) {
-      ET_LOG(Error, "Failed to synchronize Metal stream after kernel execution: unknown exception");
+      ET_LOG(
+          Error,
+          "Failed to synchronize Metal stream after kernel execution: unknown exception");
       return Error::Internal;
     }
 
@@ -432,7 +501,7 @@ class ET_EXPERIMENTAL MetalBackend final : public ::executorch::runtime::Backend
     // which causes segmentation faults when attempting to delete other
     // containers. As a workaround, we skip explicit container deletion
     // and defer cleanup to the OS.
-    // TODO(gasoonjia): Find a proper solution for safe container deletion.
+    // TODO: Find a proper solution for safe container deletion.
     // AOTInductorModelContainerDelete(handle->container_handle);
 
     // Now close the shared library

From 76e5a45c80fe97ee1f69c62d79fede7518864216 Mon Sep 17 00:00:00 2001
From: Manuel Candales <mcandales@meta.com>
Date: Wed, 15 Oct 2025 13:59:08 -0400
Subject: [PATCH 5/5] Update

[ghstack-poisoned]
---
 backends/apple/metal/CMakeLists.txt            | 9 +++++++++
 backends/apple/metal/runtime/metal_backend.cpp | 6 ++++++
 2 files changed, 15 insertions(+)

diff --git a/backends/apple/metal/CMakeLists.txt b/backends/apple/metal/CMakeLists.txt
index 3d1ec39801a..7bdf142041d 100644
--- a/backends/apple/metal/CMakeLists.txt
+++ b/backends/apple/metal/CMakeLists.txt
@@ -14,8 +14,17 @@
 # ~~~
 # It should also be cmake-lint clean.
 #
+cmake_minimum_required(VERSION 3.29)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
+if(NOT APPLE)
+  message(FATAL_ERROR "Metal backend requires macOS")
+endif()
+
 # Source root directory for executorch.
 if(NOT EXECUTORCH_ROOT)
   set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
diff --git a/backends/apple/metal/runtime/metal_backend.cpp b/backends/apple/metal/runtime/metal_backend.cpp
index 9daccc1cf7a..1ef365a9332 100644
--- a/backends/apple/metal/runtime/metal_backend.cpp
+++ b/backends/apple/metal/runtime/metal_backend.cpp
@@ -520,6 +520,12 @@ class ET_EXPERIMENTAL MetalBackend final
           "Failed to remove temporary shared library %s: %s",
           handle->so_path.c_str(),
           remove_error.message().c_str());
+      if (!remove_error) {
+        ET_LOG(
+            Info,
+            "Removed temporary shared library file: %s",
+            handle->so_path.c_str());
+      }
     }
 
     delete handle;