[xla:ffi] Add support for passing scratch allocator to FFI handlers

ezhulenev · copybara-github · commit e1e226677ce6 · 2024-03-14T18:09:25.000-07:00
PiperOrigin-RevId: 615959071
diff --git a/xla/ffi/api/c_api_internal.h b/xla/ffi/api/c_api_internal.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef XLA_FFI_API_C_API_INTERNAL_H_
 #define XLA_FFI_API_C_API_INTERNAL_H_
 
+#include <cstdint>
+
 #include "xla/ffi/api/c_api.h"
 
 // Internal XLA FFI API that gives access to XLA implementation details that
@@ -40,12 +42,23 @@ extern "C" {
 // caller.
 typedef XLA_FFI_Error* XLA_FFI_INTERNAL_Error_Forward(void* status);
 
-// Returns a pointer to main compute stream (pointer to `se::Stream`). In
+// Returns a pointer to the main compute stream (`se::Stream` pointer). In
 // contrast to public C API which returns a pointer to underlying platform
 // stream (i.e. cudaStream_t for CUDA backend), this API returns a pointer to
 // StreamExecutor stream which is unsafe to use across dynamic library boundary.
 typedef void* XLA_FFI_INTERNAL_Stream_Get(XLA_FFI_ExecutionContext* ctx);
 
+// Returns the device ordinal of the device associated with the execution
+// context.
+typedef int32_t XLA_FFI_INTERNAL_DeviceOrdinal_Get(
+    XLA_FFI_ExecutionContext* ctx);
+
+// Returns a pointer to the device memory allocator (an
+// `se::DeviceMemoryAllocator*`), which lets a custom call allocate memory from
+// the same allocator as XLA (i.e. to construct a scratch memory allocator).
+typedef void* XLA_FFI_INTERNAL_DeviceMemoryAllocator_Get(
+    XLA_FFI_ExecutionContext* ctx);
+
 // Returns a pointer to `xla::HloComputation` if FFI handler has a called
 // computation attached to it.
 typedef void* XLA_FFI_INTERNAL_CalledComputation_Get(
@@ -60,6 +73,9 @@ typedef void* XLA_FFI_INTERNAL_CalledComputation_Get(
 struct XLA_FFI_InternalApi {
   _XLA_FFI_INTERNAL_API_STRUCT_FIELD(XLA_FFI_INTERNAL_Error_Forward);
   _XLA_FFI_INTERNAL_API_STRUCT_FIELD(XLA_FFI_INTERNAL_Stream_Get);
+  _XLA_FFI_INTERNAL_API_STRUCT_FIELD(XLA_FFI_INTERNAL_DeviceOrdinal_Get);
+  _XLA_FFI_INTERNAL_API_STRUCT_FIELD(
+      XLA_FFI_INTERNAL_DeviceMemoryAllocator_Get);
   _XLA_FFI_INTERNAL_API_STRUCT_FIELD(XLA_FFI_INTERNAL_CalledComputation_Get);
 };
 
diff --git a/xla/ffi/ffi.h b/xla/ffi/ffi.h
@@ -37,6 +37,8 @@ limitations under the License.
 #include "xla/runtime/memref_view.h"
 #include "xla/status.h"
 #include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/scratch_allocator.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/types.h"  // IWYU pragma: keep
 #include "xla/xla_data.pb.h"
@@ -197,6 +199,24 @@ struct CtxDecoding<se::Stream> {
   }
 };
 
+template <size_t n>
+struct CtxDecoding<se::OwningScratchAllocator<n>> {
+  using Type = se::OwningScratchAllocator<n>;
+
+  static std::optional<Type> Decode(const XLA_FFI_Api* api,
+                                    XLA_FFI_ExecutionContext* ctx,
+                                    DiagnosticEngine&) {
+    int32_t device_ordinal =
+        api->internal_api->XLA_FFI_INTERNAL_DeviceOrdinal_Get(ctx);
+    void* device_allocator =
+        api->internal_api->XLA_FFI_INTERNAL_DeviceMemoryAllocator_Get(ctx);
+
+    return se::OwningScratchAllocator<n>(
+        device_ordinal,
+        reinterpret_cast<se::DeviceMemoryAllocator*>(device_allocator));
+  }
+};
+
 template <>
 struct CtxDecoding<CalledComputation> {
   using Type = const HloComputation*;
diff --git a/xla/ffi/ffi_api.cc b/xla/ffi/ffi_api.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "xla/ffi/ffi_api.h"
 
 #include <cstddef>
+#include <cstdint>
 #include <string>
 #include <string_view>
 #include <utility>
@@ -265,6 +266,16 @@ static void* XLA_FFI_INTERNAL_Stream_Get(XLA_FFI_ExecutionContext* ctx) {
   return ctx->run_options->stream();
 }
 
+static int32_t XLA_FFI_INTERNAL_DeviceOrdinal_Get(
+    XLA_FFI_ExecutionContext* ctx) {
+  return ctx->run_options->device_ordinal();
+}
+
+static void* XLA_FFI_INTERNAL_DeviceMemoryAllocator_Get(
+    XLA_FFI_ExecutionContext* ctx) {
+  return ctx->run_options->allocator();
+}
+
 static void* XLA_FFI_INTERNAL_CalledComputation_Get(
     XLA_FFI_ExecutionContext* ctx) {
   return const_cast<HloComputation*>(ctx->called_computation);
@@ -277,6 +288,8 @@ static void* XLA_FFI_INTERNAL_CalledComputation_Get(
 static XLA_FFI_InternalApi internal_api = {
     XLA_FFI_INTERNAL_Error_Forward,
     XLA_FFI_INTERNAL_Stream_Get,
+    XLA_FFI_INTERNAL_DeviceOrdinal_Get,
+    XLA_FFI_INTERNAL_DeviceMemoryAllocator_Get,
     XLA_FFI_INTERNAL_CalledComputation_Get,
 };
 
diff --git a/xla/service/gpu/custom_call_test.cc b/xla/service/gpu/custom_call_test.cc
@@ -21,13 +21,6 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "absl/algorithm/container.h"
-#include "absl/strings/str_format.h"
-#include "xla/shape.h"
-#include "tsl/platform/statusor.h"
-
 #if GOOGLE_CUDA
 #include "third_party/gpus/cuda/include/cuda.h"  // IWYU pragma: keep
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
@@ -38,8 +31,12 @@ limitations under the License.
 #define PLATFORM "ROCM"
 #endif
 
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/algorithm/container.h"
 #include "absl/status/status.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
 #include "xla/client/lib/constants.h"
 #include "xla/client/xla_builder.h"
 #include "xla/ffi/ffi.h"
@@ -49,12 +46,15 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/custom_call_status.h"
 #include "xla/service/custom_call_target_registry.h"
+#include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/stream_executor/gpu/gpu_types.h"
+#include "xla/stream_executor/scratch_allocator.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/test_helpers.h"
 #include "xla/tests/client_library_test_base.h"
 #include "tsl/lib/core/status_test_util.h"
+#include "tsl/platform/statusor.h"
 
 #if GOOGLE_CUDA
 #define gpuSuccess cudaSuccess
@@ -619,7 +619,8 @@ TEST_F(CustomCallTest, ExportedFfiWithStatusSucceeded) {
 //===----------------------------------------------------------------------===//
 
 static absl::Status MemcpyWithCalledComputation(
-    se::Stream* stream, ffi::BufferBase src, ffi::BufferBase dst,
+    se::Stream* stream, se::OwningScratchAllocator<> scratch_allocator,
+    ffi::BufferBase src, ffi::BufferBase dst,
     const HloComputation* called_computation) {
   if (called_computation == nullptr)
     return absl::InternalError("Called computation is not defined");
@@ -637,8 +638,9 @@ XLA_FFI_DEFINE_HANDLER(kMemcpyWithCalledComputation,
                        MemcpyWithCalledComputation,
                        ffi::Ffi::Bind()
                            .Ctx<se::Stream>()
-                           .Arg<ffi::BufferBase>()  // src
-                           .Arg<ffi::BufferBase>()  // dst
+                           .Ctx<se::OwningScratchAllocator<>>()  // scratch
+                           .Arg<ffi::BufferBase>()               // src
+                           .Arg<ffi::BufferBase>()               // dst
                            .Ctx<ffi::CalledComputation>());
 
 XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(),
diff --git a/xla/stream_executor/scratch_allocator.h b/xla/stream_executor/scratch_allocator.h
@@ -65,6 +65,9 @@ class OwningScratchAllocator : public ScratchAllocator {
   OwningScratchAllocator(int device_ordinal, DeviceMemoryAllocator* allocator)
       : device_ordinal_(device_ordinal), allocator_(allocator) {}
 
+  OwningScratchAllocator(OwningScratchAllocator&&) = default;
+  OwningScratchAllocator& operator=(OwningScratchAllocator&&) = default;
+
   int64_t GetMemoryLimitInBytes() override { return -1; }
 
   absl::StatusOr<DeviceMemory<uint8_t>> AllocateBytes(
@@ -80,9 +83,6 @@ class OwningScratchAllocator : public ScratchAllocator {
   int device_ordinal_;
   DeviceMemoryAllocator* allocator_;
   absl::InlinedVector<OwningDeviceMemory, N> buffers_;
-
-  OwningScratchAllocator(const OwningScratchAllocator&) = delete;
-  void operator=(const OwningScratchAllocator&) = delete;
 };
 
 }  // namespace stream_executor