diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
index 6659f4a2c58e8..812b72681343b 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
@@ -107,4 +107,39 @@ def GPU_SelectObjectAttr : GPU_Attr<"SelectObject", "select_object", [
   let genVerifyDecl = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// GPU LLVM offload attribute.
+//===----------------------------------------------------------------------===//
+def GPU_OffloadOpenMP : I32EnumAttrCase<"OpenMP", 1, "omp">;
+def GPU_OffloadCUDA : I32EnumAttrCase<"CUDA", 2, "cuda">;
+def GPU_OffloadHIP : I32EnumAttrCase<"HIP", 3, "hip">;
+def GPU_OffloadKindEnum : GPU_I32Enum<
+  "OffloadKind", "GPU offload kind", [
+    GPU_OffloadOpenMP,
+    GPU_OffloadCUDA,
+    GPU_OffloadHIP
+  ]>;
+
+def GPU_OffloadEmbeddingAttr : GPU_Attr<"OffloadEmbedding", "offload_embedding", [
+    OffloadingTranslationAttrTrait
+  ]> {
+  let description = [{
+    This GPU offloading handler uses LLVM offloading infrastructure to embed GPU
+    binaries in the IR. At program start, the LLVM offloading mechanism registers
+    kernels and variables with the runtime library: CUDA RT, HIP RT or
+    LibOMPTarget.
+    The offloading mechanism relies on the runtime library to dispatch the
+    correct kernel based on the registered symbols.
+    This offload mechanism requires to specify which runtime is being called,
+    this is done by the `kind` parameter.
+    Example:
+    ```mlir
+    gpu.binary @binary <#gpu.offload_embedding<omp>> [...]
+    gpu.binary @binary <#gpu.offload_embedding<cuda>> [...]
+    ```
+  }];
+  let parameters = (ins "gpu::OffloadKind":$kind);
+  let assemblyFormat = [{ `<` $kind `>` }];
+}
+
 #endif // GPU_COMPILATION_ATTRS
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
index 08019e77ae6af..1a55d08be9edc 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
@@ -19,6 +19,7 @@
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/OpDefinition.h"
+#include "mlir/Interfaces/DataLayoutInterfaces.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 7140e614412f9..c19820aafe168 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -17,6 +17,7 @@ include "mlir/IR/EnumAttr.td"
 include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
 include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/DataLayoutInterfaces.td"
 include "mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.td"
 
 def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>;
@@ -1946,7 +1947,9 @@ def NVVM_WgmmaMmaAsyncOp : NVVM_Op<"wgmma.mma_async",
 // NVVM target attribute.
 //===----------------------------------------------------------------------===//
 
-def NVVM_TargettAttr : NVVM_Attr<"NVVMTarget", "target"> {
+def NVVM_TargettAttr : NVVM_Attr<"NVVMTarget", "target", [
+    DeclareAttrInterfaceMethods<TargetInfoAttrInterface>
+  ]> {
   let description = [{
     GPU target attribute for controlling compilation of NVIDIA targets. All
     parameters decay into default values if not present.
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h b/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h
index c2a82ffc1c43c..fa1131a463e1a 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h
@@ -26,6 +26,7 @@
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/OpDefinition.h"
+#include "mlir/Interfaces/DataLayoutInterfaces.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
 ///// Ops /////
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 48b830ae34f29..a492709c29954 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -15,6 +15,7 @@
 
 include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
 include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
+include "mlir/Interfaces/DataLayoutInterfaces.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 
 //===----------------------------------------------------------------------===//
@@ -608,8 +609,9 @@ def ROCDL_CvtSrFp8F32Op :
 // ROCDL target attribute.
 //===----------------------------------------------------------------------===//
 
-def ROCDL_TargettAttr :
-    ROCDL_Attr<"ROCDLTarget", "target"> {
+def ROCDL_TargettAttr : ROCDL_Attr<"ROCDLTarget", "target", [
+    DeclareAttrInterfaceMethods<TargetInfoAttrInterface>
+  ]> {
   let description = [{
     ROCDL target attribute for controlling compilation of AMDGPU targets. All
     parameters decay into default values if not present.
diff --git a/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td b/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td
index a8def967fffcf..eac9521aadc11 100644
--- a/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td
+++ b/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td
@@ -188,6 +188,39 @@ def DataLayoutSpecInterface : AttrInterface<"DataLayoutSpecInterface"> {
   }];
 }
 
+def TargetInfoAttrInterface : AttrInterface<"TargetInfoAttrInterface"> {
+  let cppNamespace = "::mlir";
+
+  let description = [{
+    Attribute interface describing target information.
+
+    Target information attributes provide essential information on the
+    compilation target. This information includes the target triple identifier,
+    the target chip identifier, and a string representation of the target features.
+  }];
+
+  let methods = [
+    InterfaceMethod<
+      /*description=*/"Returns the target triple identifier.",
+      /*retTy=*/"::mlir::StringRef",
+      /*methodName=*/"getTargetTriple",
+      /*args=*/(ins)
+    >,
+    InterfaceMethod<
+      /*description=*/"Returns the target chip identifier.",
+      /*retTy=*/"::mlir::StringRef",
+      /*methodName=*/"getTargetChip",
+      /*args=*/(ins)
+    >,
+    InterfaceMethod<
+      /*description=*/"Returns the target features as a string.",
+      /*retTy=*/"std::string",
+      /*methodName=*/"getTargetFeatures",
+      /*args=*/(ins)
+    >
+  ];
+}
+
 //===----------------------------------------------------------------------===//
 // Operation interface
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Target/LLVM/Offload.h b/mlir/include/mlir/Target/LLVM/Offload.h
new file mode 100644
index 0000000000000..7b705667d477d
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVM/Offload.h
@@ -0,0 +1,61 @@
+//===- Offload.h - LLVM Target Offload --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares LLVM target offload utility classes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVM_OFFLOAD_H
+#define MLIR_TARGET_LLVM_OFFLOAD_H
+
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+class Constant;
+class GlobalVariable;
+class Module;
+} // namespace llvm
+
+namespace mlir {
+namespace LLVM {
+/// `OffloadHandler` is a utility class for creating LLVM offload entries. LLVM
+/// offload entries hold information on offload symbols; for example, for a GPU
+/// kernel, this includes its host address to identify the kernel and the kernel
+/// identifier in the binary. Arrays of offload entries can be used to register
+/// functions within the CUDA/HIP runtime. Libomptarget also uses these entries
+/// to register OMP target offload kernels and variables.
+class OffloadHandler {
+public:
+  using OffloadEntryArray =
+      std::pair<llvm::GlobalVariable *, llvm::GlobalVariable *>;
+  OffloadHandler(llvm::Module &module) : module(module) {}
+
+  /// Returns the begin symbol name used in the entry array.
+  static std::string getBeginSymbol(StringRef suffix);
+
+  /// Returns the end symbol name used in the entry array.
+  static std::string getEndSymbol(StringRef suffix);
+
+  /// Returns the entry array if it exists or a pair of null pointers.
+  OffloadEntryArray getEntryArray(StringRef suffix);
+
+  /// Emits an empty array of offloading entries.
+  OffloadEntryArray emitEmptyEntryArray(StringRef suffix);
+
+  /// Inserts an offloading entry into an existing entry array. This method
+  /// returns failure if the entry array hasn't been declared.
+  LogicalResult insertOffloadEntry(StringRef suffix, llvm::Constant *entry);
+
+protected:
+  llvm::Module &module;
+};
+} // namespace LLVM
+} // namespace mlir
+
+#endif // MLIR_TARGET_LLVM_OFFLOAD_H
diff --git a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt
index b00259677697a..00b78e30ee8b0 100644
--- a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt
+++ b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt
@@ -61,6 +61,7 @@ add_mlir_dialect_library(MLIRNVVMDialect
   LINK_LIBS PUBLIC
   MLIRIR
   MLIRLLVMDialect
+  MLIRDataLayoutInterfaces
   MLIRSideEffectInterfaces
   )
 
@@ -83,5 +84,6 @@ add_mlir_dialect_library(MLIRROCDLDialect
   LINK_LIBS PUBLIC
   MLIRIR
   MLIRLLVMDialect
+  MLIRDataLayoutInterfaces
   MLIRSideEffectInterfaces
   )
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index aa49c4dc31fbc..b73504ac4969a 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1106,6 +1106,14 @@ NVVMTargetAttr::verify(function_ref<InFlightDiagnostic()> emitError,
   return success();
 }
 
+StringRef NVVMTargetAttr::getTargetTriple() const { return getTriple(); }
+
+StringRef NVVMTargetAttr::getTargetChip() const { return getChip(); }
+
+std::string NVVMTargetAttr::getTargetFeatures() const {
+  return getFeatures().str();
+}
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/LLVMIR/NVVMOps.cpp.inc"
 
diff --git a/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp
index 26e46b31ddc01..8b10c48718a3f 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp
@@ -295,6 +295,14 @@ ROCDLTargetAttr::verify(function_ref<InFlightDiagnostic()> emitError,
   return success();
 }
 
+StringRef ROCDLTargetAttr::getTargetTriple() const { return getTriple(); }
+
+StringRef ROCDLTargetAttr::getTargetChip() const { return getChip(); }
+
+std::string ROCDLTargetAttr::getTargetFeatures() const {
+  return getFeatures().str();
+}
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/LLVMIR/ROCDLOps.cpp.inc"
 
diff --git a/mlir/lib/Target/LLVM/CMakeLists.txt b/mlir/lib/Target/LLVM/CMakeLists.txt
index cc2c3a00a02ea..241a6c64dd868 100644
--- a/mlir/lib/Target/LLVM/CMakeLists.txt
+++ b/mlir/lib/Target/LLVM/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_library(MLIRTargetLLVM
   ModuleToObject.cpp
+  Offload.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Target/LLVM
@@ -16,6 +17,7 @@ add_mlir_library(MLIRTargetLLVM
   Passes
   Support
   Target
+  FrontendOffloading
   LINK_LIBS PUBLIC
   MLIRExecutionEngineUtils
   MLIRTargetLLVMIRExport
diff --git a/mlir/lib/Target/LLVM/Offload.cpp b/mlir/lib/Target/LLVM/Offload.cpp
new file mode 100644
index 0000000000000..81ba12403bfb9
--- /dev/null
+++ b/mlir/lib/Target/LLVM/Offload.cpp
@@ -0,0 +1,111 @@
+//===- Offload.cpp - LLVM Target Offload ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines LLVM target offload utility classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Target/LLVM/Offload.h"
+#include "llvm/Frontend/Offloading/Utility.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+
+using namespace mlir;
+using namespace mlir::LLVM;
+
+std::string OffloadHandler::getBeginSymbol(StringRef suffix) {
+  return ("__begin_offload_" + suffix).str();
+}
+
+std::string OffloadHandler::getEndSymbol(StringRef suffix) {
+  return ("__end_offload_" + suffix).str();
+}
+
+namespace {
+/// Returns the type of the entry array.
+llvm::ArrayType *getEntryArrayType(llvm::Module &module, size_t numElems) {
+  return llvm::ArrayType::get(llvm::offloading::getEntryTy(module), numElems);
+}
+
+/// Creates the initializer of the entry array.
+llvm::Constant *getEntryArrayBegin(llvm::Module &module,
+                                   ArrayRef<llvm::Constant *> entries) {
+  // If there are no entries return a constant zero initializer.
+  llvm::ArrayType *arrayTy = getEntryArrayType(module, entries.size());
+  return entries.empty() ? llvm::ConstantAggregateZero::get(arrayTy)
+                         : llvm::ConstantArray::get(arrayTy, entries);
+}
+
+/// Computes the end position of the entry array.
+llvm::Constant *getEntryArrayEnd(llvm::Module &module,
+                                 llvm::GlobalVariable *begin, size_t numElems) {
+  llvm::Type *intTy = module.getDataLayout().getIntPtrType(module.getContext());
+  return llvm::ConstantExpr::getGetElementPtr(
+      llvm::offloading::getEntryTy(module), begin,
+      ArrayRef<llvm::Constant *>({llvm::ConstantInt::get(intTy, numElems)}),
+      true);
+}
+} // namespace
+
+OffloadHandler::OffloadEntryArray
+OffloadHandler::getEntryArray(StringRef suffix) {
+  llvm::GlobalVariable *beginGV =
+      module.getGlobalVariable(getBeginSymbol(suffix), true);
+  llvm::GlobalVariable *endGV =
+      module.getGlobalVariable(getEndSymbol(suffix), true);
+  return {beginGV, endGV};
+}
+
+OffloadHandler::OffloadEntryArray
+OffloadHandler::emitEmptyEntryArray(StringRef suffix) {
+  llvm::ArrayType *arrayTy = getEntryArrayType(module, 0);
+  auto *beginGV = new llvm::GlobalVariable(
+      module, arrayTy, /*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
+      getEntryArrayBegin(module, {}), getBeginSymbol(suffix));
+  auto *endGV = new llvm::GlobalVariable(
+      module, llvm::PointerType::get(module.getContext(), 0),
+      /*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
+      getEntryArrayEnd(module, beginGV, 0), getEndSymbol(suffix));
+  return {beginGV, endGV};
+}
+
+LogicalResult OffloadHandler::insertOffloadEntry(StringRef suffix,
+                                                 llvm::Constant *entry) {
+  // Get the begin and end symbols to the entry array.
+  std::string beginSymId = getBeginSymbol(suffix);
+  llvm::GlobalVariable *beginGV = module.getGlobalVariable(beginSymId, true);
+  llvm::GlobalVariable *endGV =
+      module.getGlobalVariable(getEndSymbol(suffix), true);
+  // Fail if the symbols are missing.
+  if (!beginGV || !endGV)
+    return failure();
+  // Create the entry initializer.
+  assert(beginGV->getInitializer() && "entry array initializer is missing.");
+  // Add existing entries into the new entry array.
+  SmallVector<llvm::Constant *> entries;
+  if (auto beginInit = dyn_cast_or_null<llvm::ConstantAggregate>(
+          beginGV->getInitializer())) {
+    for (unsigned i = 0; i < beginInit->getNumOperands(); ++i)
+      entries.push_back(beginInit->getOperand(i));
+  }
+  // Add the new entry.
+  entries.push_back(entry);
+  // Create a global holding the new updated set of entries.
+  auto *arrayTy = llvm::ArrayType::get(llvm::offloading::getEntryTy(module),
+                                       entries.size());
+  auto *entryArr = new llvm::GlobalVariable(
+      module, arrayTy, /*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
+      getEntryArrayBegin(module, entries), beginSymId, endGV);
+  // Replace the old entry array variable withe new one.
+  beginGV->replaceAllUsesWith(entryArr);
+  beginGV->eraseFromParent();
+  entryArr->setName(beginSymId);
+  // Update the end symbol.
+  endGV->setInitializer(getEntryArrayEnd(module, entryArr, entries.size()));
+  return success();
+}
diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/GPU/CMakeLists.txt
index 11816ff5c2c1f..b95b1e95a039b 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/CMakeLists.txt
@@ -1,14 +1,17 @@
 add_mlir_translation_library(MLIRGPUToLLVMIRTranslation
   GPUToLLVMIRTranslation.cpp
-  SelectObjectAttr.cpp
+  OffloadingTranslationAttrs.cpp
 
   LINK_COMPONENTS
   Core
+  FrontendOffloading
+  Object
 
   LINK_LIBS PUBLIC
   MLIRIR
   MLIRGPUDialect
   MLIRLLVMDialect
   MLIRSupport
+  MLIRTargetLLVM
   MLIRTargetLLVMIRExport
   )
diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp b/mlir/lib/Target/LLVMIR/Dialect/GPU/OffloadingTranslationAttrs.cpp
similarity index 54%
rename from mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
rename to mlir/lib/Target/LLVMIR/Dialect/GPU/OffloadingTranslationAttrs.cpp
index 0eb33287d608b..4448b72615e21 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/OffloadingTranslationAttrs.cpp
@@ -25,6 +25,9 @@
 
 using namespace mlir;
 
+//===----------------------------------------------------------------------===//
+// SelectObjectAttr
+//===----------------------------------------------------------------------===//
 namespace {
 // Implementation of the `OffloadingLLVMTranslationAttrInterface` model.
 class SelectObjectAttrImpl
@@ -54,13 +57,6 @@ std::string getBinaryIdentifier(StringRef binaryName) {
 }
 } // namespace
 
-void mlir::gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(
-    DialectRegistry &registry) {
-  registry.addExtension(+[](MLIRContext *ctx, gpu::GPUDialect *dialect) {
-    SelectObjectAttr::attachInterface<SelectObjectAttrImpl>(*ctx);
-  });
-}
-
 gpu::ObjectAttr
 SelectObjectAttrImpl::getSelectedObject(gpu::BinaryOp op) const {
   ArrayRef<Attribute> objects = op.getObjectsAttr().getValue();
@@ -136,6 +132,9 @@ class LaunchKernel {
   // Get the kernel launch callee.
   FunctionCallee getKernelLaunchFn();
 
+  // Get the kernel RT launch callee.
+  FunctionCallee getKernelRTLaunchFn();
+
   // Get the kernel launch callee.
   FunctionCallee getClusterKernelLaunchFn();
 
@@ -166,9 +165,15 @@ class LaunchKernel {
   // Create the void* kernel array for passing the arguments.
   Value *createKernelArgArray(mlir::gpu::LaunchFuncOp op);
 
+  // Returns a pair containing the function pointer to the kernel and the
+  // pointer to the kernel module.
+  mlir::FailureOr<std::pair<Value *, Value *>>
+  getKernelInfo(mlir::gpu::LaunchFuncOp op, mlir::gpu::ObjectAttr object);
+
   // Create the full kernel launch.
   mlir::LogicalResult createKernelLaunch(mlir::gpu::LaunchFuncOp op,
-                                         mlir::gpu::ObjectAttr object);
+                                         mlir::gpu::ObjectAttr object,
+                                         Value *kernelPtr = nullptr);
 
 private:
   Module &module;
@@ -244,6 +249,16 @@ llvm::FunctionCallee llvm::LaunchKernel::getClusterKernelLaunchFn() {
           false));
 }
 
+llvm::FunctionCallee llvm::LaunchKernel::getKernelRTLaunchFn() {
+  return module.getOrInsertFunction(
+      "mgpuLaunchKernelRT",
+      FunctionType::get(voidTy,
+                        ArrayRef<Type *>({ptrTy, intPtrTy, intPtrTy, intPtrTy,
+                                          intPtrTy, intPtrTy, intPtrTy, i32Ty,
+                                          ptrTy, ptrTy, ptrTy, i64Ty}),
+                        false));
+}
+
 llvm::FunctionCallee llvm::LaunchKernel::getModuleFunctionFn() {
   return module.getOrInsertFunction(
       "mgpuModuleGetFunction",
@@ -334,46 +349,14 @@ llvm::LaunchKernel::createKernelArgArray(mlir::gpu::LaunchFuncOp op) {
   return argArray;
 }
 
-// Emits LLVM IR to launch a kernel function:
+// Loads the kernel module pointer
 // %0 = call %binarygetter
 // %1 = call %moduleLoad(%0)
 // %2 = <see generateKernelNameConstant>
 // %3 = call %moduleGetFunction(%1, %2)
-// %4 = call %streamCreate()
-// %5 = <see generateParamsArray>
-// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
-// call %streamSynchronize(%4)
-// call %streamDestroy(%4)
-// call %moduleUnload(%1)
-mlir::LogicalResult
-llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
-                                       mlir::gpu::ObjectAttr object) {
-  auto llvmValue = [&](mlir::Value value) -> Value * {
-    Value *v = moduleTranslation.lookupValue(value);
-    assert(v && "Value has not been translated.");
-    return v;
-  };
-
-  // Get grid dimensions.
-  mlir::gpu::KernelDim3 grid = op.getGridSizeOperandValues();
-  Value *gx = llvmValue(grid.x), *gy = llvmValue(grid.y),
-        *gz = llvmValue(grid.z);
-
-  // Get block dimensions.
-  mlir::gpu::KernelDim3 block = op.getBlockSizeOperandValues();
-  Value *bx = llvmValue(block.x), *by = llvmValue(block.y),
-        *bz = llvmValue(block.z);
-
-  // Get dynamic shared memory size.
-  Value *dynamicMemorySize = nullptr;
-  if (mlir::Value dynSz = op.getDynamicSharedMemorySize())
-    dynamicMemorySize = llvmValue(dynSz);
-  else
-    dynamicMemorySize = ConstantInt::get(i32Ty, 0);
-
-  // Create the argument array.
-  Value *argArray = createKernelArgArray(op);
-
+mlir::FailureOr<std::pair<llvm::Value *, llvm::Value *>>
+llvm::LaunchKernel::getKernelInfo(mlir::gpu::LaunchFuncOp op,
+                                  mlir::gpu::ObjectAttr object) {
   // Default JIT optimization level.
   llvm::Constant *optV = llvm::ConstantInt::get(i32Ty, 0);
   // Check if there's an optimization level embedded in the object.
@@ -385,7 +368,6 @@ llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
       return op.emitError("the optimization level must be an integer");
     optV = llvm::ConstantInt::get(i32Ty, optLevel.getValue());
   }
-
   // Load the kernel module.
   StringRef moduleName = op.getKernelModuleName().getValue();
   std::string binaryIdentifier = getBinaryIdentifier(moduleName);
@@ -417,6 +399,56 @@ llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
       getModuleFunctionFn(),
       {moduleObject,
        getOrCreateFunctionName(moduleName, op.getKernelName().getValue())});
+  return std::pair<Value *, Value *>(moduleFunction, moduleObject);
+}
+
+// Emits LLVM IR to launch a kernel function:
+// %4 = call %streamCreate()
+// %5 = <see generateParamsArray>
+// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
+// call %streamSynchronize(%4)
+// call %streamDestroy(%4)
+// call %moduleUnload(%1)
+mlir::LogicalResult
+llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
+                                       mlir::gpu::ObjectAttr object,
+                                       Value *kernelPtr) {
+  auto llvmValue = [&](mlir::Value value) -> Value * {
+    Value *v = moduleTranslation.lookupValue(value);
+    assert(v && "Value has not been translated.");
+    return v;
+  };
+
+  // Get grid dimensions.
+  mlir::gpu::KernelDim3 grid = op.getGridSizeOperandValues();
+  Value *gx = llvmValue(grid.x), *gy = llvmValue(grid.y),
+        *gz = llvmValue(grid.z);
+
+  // Get block dimensions.
+  mlir::gpu::KernelDim3 block = op.getBlockSizeOperandValues();
+  Value *bx = llvmValue(block.x), *by = llvmValue(block.y),
+        *bz = llvmValue(block.z);
+
+  // Get dynamic shared memory size.
+  Value *dynamicMemorySize = nullptr;
+  if (mlir::Value dynSz = op.getDynamicSharedMemorySize())
+    dynamicMemorySize = llvmValue(dynSz);
+  else
+    dynamicMemorySize = ConstantInt::get(i32Ty, 0);
+
+  // Create the argument array.
+  Value *argArray = createKernelArgArray(op);
+
+  Value *moduleObject = nullptr, *moduleFunction = nullptr;
+
+  if (!kernelPtr) {
+    mlir::FailureOr<std::pair<Value *, Value *>> kernelInfo =
+        getKernelInfo(op, object);
+    if (failed(kernelInfo))
+      return failure();
+    moduleFunction = kernelInfo->first;
+    moduleObject = kernelInfo->second;
+  }
 
   // Get the stream to use for execution. If there's no async object then create
   // a stream to make a synchronous kernel launch.
@@ -436,19 +468,27 @@ llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
   Value *nullPtr = ConstantPointerNull::get(ptrTy);
 
   // Launch kernel with clusters if cluster size is specified.
-  if (op.hasClusterSize()) {
-    mlir::gpu::KernelDim3 cluster = op.getClusterSizeOperandValues();
-    Value *cx = llvmValue(cluster.x), *cy = llvmValue(cluster.y),
-          *cz = llvmValue(cluster.z);
-    builder.CreateCall(
-        getClusterKernelLaunchFn(),
-        ArrayRef<Value *>({moduleFunction, cx, cy, cz, gx, gy, gz, bx, by, bz,
-                           dynamicMemorySize, stream, argArray, nullPtr}));
+  if (moduleFunction) {
+    if (op.hasClusterSize()) {
+      mlir::gpu::KernelDim3 cluster = op.getClusterSizeOperandValues();
+      Value *cx = llvmValue(cluster.x), *cy = llvmValue(cluster.y),
+            *cz = llvmValue(cluster.z);
+      builder.CreateCall(
+          getClusterKernelLaunchFn(),
+          ArrayRef<Value *>({moduleFunction, cx, cy, cz, gx, gy, gz, bx, by, bz,
+                             dynamicMemorySize, stream, argArray, nullPtr}));
+    } else {
+      builder.CreateCall(getKernelLaunchFn(),
+                         ArrayRef<Value *>({moduleFunction, gx, gy, gz, bx, by,
+                                            bz, dynamicMemorySize, stream,
+                                            argArray, nullPtr, paramsCount}));
+    }
   } else {
-    builder.CreateCall(getKernelLaunchFn(),
-                       ArrayRef<Value *>({moduleFunction, gx, gy, gz, bx, by,
-                                          bz, dynamicMemorySize, stream,
-                                          argArray, nullPtr, paramsCount}));
+    assert(kernelPtr && "invalid kernel pointer");
+    builder.CreateCall(
+        getKernelRTLaunchFn(),
+        ArrayRef<Value *>({kernelPtr, gx, gy, gz, bx, by, bz, dynamicMemorySize,
+                           stream, argArray, nullPtr, paramsCount}));
   }
 
   // Sync & destroy the stream, for synchronous launches.
@@ -458,7 +498,285 @@ llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
   }
 
   // Unload the kernel module.
-  builder.CreateCall(getModuleUnloadFn(), {moduleObject});
+  if (moduleObject)
+    builder.CreateCall(getModuleUnloadFn(), {moduleObject});
+
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// OffloadEmbeddingAttr
+//===----------------------------------------------------------------------===//
+#include "mlir/Target/LLVM/Offload.h"
+#include "llvm/Frontend/Offloading/OffloadWrapper.h"
+#include "llvm/Frontend/Offloading/Utility.h"
+#include "llvm/Object/OffloadBinary.h"
+
+namespace {
+// Implementation of the `OffloadingLLVMTranslationAttrInterface` model.
+class OffloadEmbeddingAttrImpl
+    : public gpu::OffloadingLLVMTranslationAttrInterface::FallbackModel<
+          OffloadEmbeddingAttrImpl> {
+public:
+  // Translates a `gpu.binary`, embedding the binary into a host LLVM module as
+  // global binary string.
+  LogicalResult embedBinary(Attribute attribute, Operation *operation,
+                            llvm::IRBuilderBase &builder,
+                            LLVM::ModuleTranslation &moduleTranslation) const;
+
+  // Translates a `gpu.launch_func` to a sequence of LLVM instructions resulting
+  // in a kernel launch call.
+  LogicalResult launchKernel(Attribute attribute,
+                             Operation *launchFuncOperation,
+                             Operation *binaryOperation,
+                             llvm::IRBuilderBase &builder,
+                             LLVM::ModuleTranslation &moduleTranslation) const;
+};
+} // namespace
+
+namespace {
+llvm::object::ImageKind getImageKind(gpu::CompilationTarget format) {
+  switch (format) {
+  case gpu::CompilationTarget::Offload:
+    return llvm::object::IMG_Bitcode;
+  case gpu::CompilationTarget::Assembly:
+    return llvm::object::IMG_PTX;
+  case gpu::CompilationTarget::Binary:
+    return llvm::object::IMG_Object;
+  case gpu::CompilationTarget::Fatbin:
+    return llvm::object::IMG_Fatbinary;
+  }
+}
+
+llvm::object::OffloadKind getOffloadKind(gpu::OffloadKind offloadKind) {
+  switch (offloadKind) {
+  case gpu::OffloadKind::OpenMP:
+    return llvm::object::OFK_OpenMP;
+  case gpu::OffloadKind::CUDA:
+    return llvm::object::OFK_Cuda;
+  case gpu::OffloadKind::HIP:
+    return llvm::object::OFK_HIP;
+  }
+}
+
+using OffloadEntryArray = LLVM::OffloadHandler::OffloadEntryArray;
+
+/// Utility class for embedding binaries and launching kernels using the
+/// offloading attribute.
+class OffloadManager : public LLVM::OffloadHandler {
+public:
+  OffloadManager(gpu::BinaryOp binaryOp, llvm::Module &module,
+                 gpu::OffloadKind offloadKind)
+      : LLVM::OffloadHandler(module), binaryOp(binaryOp),
+        offloadKind(offloadKind) {}
+
+  /// Embed a GPU binary into a module.
+  LogicalResult embedBinary();
+
+  /// Generates the kernel launch call.
+  LogicalResult launchKernel(gpu::LaunchFuncOp launchFunc,
+                             llvm::IRBuilderBase &builder,
+                             LLVM::ModuleTranslation &moduleTranslation);
+
+protected:
+  /// Returns the name to be used for the offloading symbols.
+  StringRef getSymbolSuffix();
+
+  /// Emits the offloading entry for `launchFunc`.
+  LogicalResult emitOffloadingEntry(gpu::LaunchFuncOp launchFunc,
+                                    llvm::Constant *registeredSym);
+
+  /// Bundle OpenMP images together.
+  SmallVector<std::unique_ptr<llvm::MemoryBuffer>>
+  bundleOpenMP(ArrayRef<Attribute> objects);
+
+  /// Bundle gpu-objects together. TODO: support more than a single object.
+  FailureOr<SmallVector<std::unique_ptr<llvm::MemoryBuffer>>>
+  bundleGPU(ArrayRef<Attribute> objects);
+
+  /// Bundle objects depending on the `gpu::OffloadKind`.
+  FailureOr<SmallVector<std::unique_ptr<llvm::MemoryBuffer>>>
+  bundleImages(ArrayRef<Attribute> objects);
+
+  /// Emit registration code and embed the images.
+  LogicalResult wrapImages(llvm::Module &module, ArrayRef<ArrayRef<char>> imgs);
+
+  /// Convert a `ObjectAttr` to a OffloadingImage.
+  llvm::object::OffloadBinary::OffloadingImage
+  getOffloadingImage(gpu::ObjectAttr obj);
 
+  gpu::BinaryOp binaryOp;
+  gpu::OffloadKind offloadKind;
+};
+} // namespace
+
+llvm::object::OffloadBinary::OffloadingImage
+OffloadManager::getOffloadingImage(gpu::ObjectAttr obj) {
+  // Create the binary used by Libomptarget
+  auto targetAttr = cast<TargetInfoAttrInterface>(obj.getTarget());
+  llvm::object::OffloadBinary::OffloadingImage imageBinary{};
+  imageBinary.TheImageKind = getImageKind(obj.getFormat());
+  imageBinary.TheOffloadKind = getOffloadKind(offloadKind);
+  imageBinary.StringData["triple"] = targetAttr.getTargetTriple();
+  imageBinary.StringData["arch"] = targetAttr.getTargetChip();
+  imageBinary.Image =
+      llvm::MemoryBuffer::getMemBufferCopy(obj.getObject().getValue(), "");
+  return imageBinary;
+}
+
+SmallVector<std::unique_ptr<llvm::MemoryBuffer>>
+OffloadManager::bundleOpenMP(ArrayRef<Attribute> objects) {
+  // Bundle all the available objects in the binary.
+  SmallVector<std::unique_ptr<llvm::MemoryBuffer>> buffers;
+  for (Attribute attr : objects)
+    buffers.emplace_back(
+        llvm::MemoryBuffer::getMemBufferCopy(llvm::object::OffloadBinary::write(
+            getOffloadingImage(cast<gpu::ObjectAttr>(attr)))));
+  return buffers;
+}
+
+FailureOr<SmallVector<std::unique_ptr<llvm::MemoryBuffer>>>
+OffloadManager::bundleGPU(ArrayRef<Attribute> objects) {
+  if (objects.size() > 1)
+    return binaryOp.emitError("multiple objects are not yet supported");
+  SmallVector<std::unique_ptr<llvm::MemoryBuffer>> buffers;
+  assert(objects.size() == 1 && "there should be a single object");
+  auto object = cast<gpu::ObjectAttr>(objects[0]);
+  if (gpu::CompilationTarget frmt = object.getFormat();
+      frmt != gpu::CompilationTarget::Binary &&
+      frmt != gpu::CompilationTarget::Fatbin)
+    return binaryOp.emitError(
+        "the only supported objects are binaries and fat-binaries.");
+  buffers.emplace_back(
+      llvm::MemoryBuffer::getMemBuffer(object.getObject().getValue()));
+  return buffers;
+}
+
+FailureOr<SmallVector<std::unique_ptr<llvm::MemoryBuffer>>>
+OffloadManager::bundleImages(ArrayRef<Attribute> objects) {
+  switch (offloadKind) {
+  case gpu::OffloadKind::OpenMP:
+    return bundleOpenMP(objects);
+  case gpu::OffloadKind::CUDA:
+  case gpu::OffloadKind::HIP:
+    return bundleGPU(objects);
+  }
+}
+
+StringRef OffloadManager::getSymbolSuffix() { return binaryOp.getName(); }
+
+LogicalResult
+OffloadManager::emitOffloadingEntry(gpu::LaunchFuncOp launchFunc,
+                                    llvm::Constant *registeredSym) {
+  // Create the entry initializer.
+  std::pair<llvm::Constant *, llvm::GlobalVariable *> entry =
+      llvm::offloading::getOffloadingEntryInitializer(
+          module, registeredSym, launchFunc.getKernelName().getValue(), 0, 0,
+          0);
+  if (failed(insertOffloadEntry(getSymbolSuffix(), entry.first)))
+    return binaryOp.emitError("entry array symbols not found");
+  return success();
+}
+
+LogicalResult OffloadManager::wrapImages(llvm::Module &module,
+                                         ArrayRef<ArrayRef<char>> imgs) {
+  // This suffix is appended to all the symbols emitted by the `wrap*` methods.
+  std::string suffix = "." + getSymbolSuffix().str();
+  // Emit an empty entry array.
+  OffloadEntryArray entryArray = emitEmptyEntryArray(getSymbolSuffix());
+  switch (offloadKind) {
+  case gpu::OffloadKind::OpenMP:
+    if (auto error = llvm::offloading::wrapOpenMPBinaries(module, imgs,
+                                                          entryArray, suffix))
+      return binaryOp.emitError("failed wrapping the OpenMP binaries");
+    return success();
+  case gpu::OffloadKind::CUDA:
+    if (auto error = llvm::offloading::wrapCudaBinary(
+            module, imgs.front(), entryArray, suffix, false))
+      return binaryOp.emitError("failed wrapping the CUDA binaries");
+    return success();
+  case gpu::OffloadKind::HIP:
+    if (auto error = llvm::offloading::wrapHIPBinary(module, imgs.front(),
+                                                     entryArray, suffix, false))
+      return binaryOp.emitError("failed wrapping the HIP binaries");
+    return success();
+  }
+}
+
+LogicalResult OffloadManager::embedBinary() {
+  // Call all the methods in order, bundleImages -> wrapImages.
+  auto bundledImgs = bundleImages(binaryOp.getObjectsAttr().getValue());
+  if (failed(bundledImgs))
+    return failure();
+  SmallVector<ArrayRef<char>> imgs;
+  for (auto &img : bundledImgs.value())
+    imgs.push_back(ArrayRef<char>(img->getBufferStart(), img->getBufferSize()));
+  if (failed(wrapImages(module, imgs)))
+    return failure();
+  return success();
+}
+
+LogicalResult
+OffloadManager::launchKernel(gpu::LaunchFuncOp launchFunc,
+                             llvm::IRBuilderBase &builder,
+                             LLVM::ModuleTranslation &moduleTranslation) {
+  // OpenMMP kernels launches are handled by the `omp.target` op.
+  if (offloadKind == gpu::OffloadKind::OpenMP)
+    return binaryOp.emitError(
+        "it's invalid to call OpenMP kernels using gpu.launch_func");
+  llvm::Module *hostModule = moduleTranslation.getLLVMModule();
+  // Create or get the symbol to be registered.
+  std::string symbolId =
+      (binaryOp.getName() + "_K" + launchFunc.getKernelName().getValue()).str();
+  llvm::Constant *registeredSym = nullptr;
+  if (!(registeredSym = hostModule->getGlobalVariable(symbolId))) {
+    // Create the symbol used to register the kernel with the runtime.
+    registeredSym = new llvm::GlobalVariable(
+        *hostModule, builder.getInt8Ty(), /*isConstant=*/true,
+        llvm::GlobalValue::InternalLinkage, builder.getInt8(0), symbolId);
+    // Emit the offload entry.
+    if (failed(emitOffloadingEntry(launchFunc, registeredSym)))
+      return failure();
+  }
+  return llvm::LaunchKernel(*moduleTranslation.getLLVMModule(), builder,
+                            moduleTranslation)
+      .createKernelLaunch(launchFunc, nullptr, registeredSym);
+}
+
+LogicalResult OffloadEmbeddingAttrImpl::embedBinary(
+    Attribute attribute, Operation *operation, llvm::IRBuilderBase &builder,
+    LLVM::ModuleTranslation &moduleTranslation) const {
+  if (failed(OffloadManager(
+                 mlir::cast<gpu::BinaryOp>(operation),
+                 *moduleTranslation.getLLVMModule(),
+                 mlir::cast<gpu::OffloadEmbeddingAttr>(attribute).getKind())
+                 .embedBinary()))
+    return failure();
+  return success();
+}
+
+LogicalResult OffloadEmbeddingAttrImpl::launchKernel(
+    Attribute attribute, Operation *launchFuncOperation,
+    Operation *binaryOperation, llvm::IRBuilderBase &builder,
+    LLVM::ModuleTranslation &moduleTranslation) const {
+  if (failed(
+          OffloadManager(
+              mlir::cast<gpu::BinaryOp>(binaryOperation),
+              *moduleTranslation.getLLVMModule(),
+              mlir::cast<gpu::OffloadEmbeddingAttr>(attribute).getKind())
+              .launchKernel(mlir::cast<gpu::LaunchFuncOp>(launchFuncOperation),
+                            builder, moduleTranslation)))
+    return failure();
   return success();
 }
+
+//===----------------------------------------------------------------------===//
+// Interface registration
+//===----------------------------------------------------------------------===//
+void mlir::gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(
+    DialectRegistry &registry) {
+  registry.addExtension(+[](MLIRContext *ctx, gpu::GPUDialect *dialect) {
+    SelectObjectAttr::attachInterface<SelectObjectAttrImpl>(*ctx);
+    OffloadEmbeddingAttr::attachInterface<OffloadEmbeddingAttrImpl>(*ctx);
+  });
+}
diff --git a/mlir/test/Target/LLVMIR/gpu.mlir b/mlir/test/Target/LLVMIR/gpu.mlir
index 88672bd231df8..74dfa53558d71 100644
--- a/mlir/test/Target/LLVMIR/gpu.mlir
+++ b/mlir/test/Target/LLVMIR/gpu.mlir
@@ -101,3 +101,86 @@ module attributes {gpu.container_module} {
     llvm.return
   }
 }
+
+// -----
+
+// Test the `offload_embedding<cuda>` attribute.
+module attributes {gpu.container_module} {
+  // CHECK: @__begin_offload_kernel_module = internal constant [1 x %{{.*}}] [%{{.*}} { ptr @[[KERNEL_SYMBOL:.*]], ptr @[[ENTRY_NAME:.*]], i64 0, i32 0, i32 0 }]
+  // CHECK: @__end_offload_kernel_module = internal constant ptr getelementptr inbounds (%{{.*}}, ptr @__begin_offload_kernel_module, i64 1)
+  // CHECK: @[[FATBIN:.*]] = internal constant [4 x i8] c"BLOB", section ".nv_fatbin"
+  // CHECK: @[[FATBIN_HANDLE:.*]] = internal constant %{{.*}} { i32 1180844977, i32 1, ptr @[[FATBIN]]
+  // CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @[[REGISTRATION_CTOR:.*]], ptr null }]
+  // CHECK: @[[KERNEL_SYMBOL]] = internal constant i8 0
+  // CHECK-NEXT: @[[ENTRY_NAME]] = internal unnamed_addr constant [7 x i8] c"kernel\00"
+  gpu.binary @kernel_module <#gpu.offload_embedding<cuda>> [#gpu.object<#nvvm.target, bin = "BLOB">]
+  llvm.func @foo() {
+    // CHECK: [[ARGS:%.*]] = alloca %{{.*}}, align 8
+    // CHECK-NEXT: [[ARGS_ARRAY:%.*]] = alloca ptr, i64 2, align 8
+    // CHECK-NEXT: [[ARG0:%.*]] = getelementptr inbounds [[ARGS_TY]], ptr [[ARGS]], i32 0, i32 0
+    // CHECK-NEXT: store i32 32, ptr [[ARG0]], align 4
+    // CHECK-NEXT: %{{.*}} = getelementptr ptr, ptr [[ARGS_ARRAY]], i32 0
+    // CHECK-NEXT: store ptr [[ARG0]], ptr %{{.*}}, align 8
+    // CHECK-NEXT: [[ARG1:%.*]] = getelementptr inbounds [[ARGS_TY]], ptr [[ARGS]], i32 0, i32 1
+    // CHECK-NEXT: store i32 32, ptr [[ARG1]], align 4
+    // CHECK-NEXT: %{{.*}} = getelementptr ptr, ptr [[ARGS_ARRAY]], i32 1
+    // CHECK-NEXT: store ptr [[ARG1]], ptr %{{.*}}, align 8
+    // CHECK-NEXT: [[STREAM:%.*]] = call ptr @mgpuStreamCreate()
+    // CHECK-NEXT: call void @mgpuLaunchKernelRT(ptr @[[KERNEL_SYMBOL]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 256, ptr [[STREAM]], ptr [[ARGS_ARRAY]], ptr null, i64 2)
+    // CHECK-NEXT: call void @mgpuStreamSynchronize(ptr [[STREAM]])
+    // CHECK-NEXT: call void @mgpuStreamDestroy(ptr [[STREAM]])
+    %0 = llvm.mlir.constant(8 : index) : i64
+    %1 = llvm.mlir.constant(32 : i32) : i32
+    %2 = llvm.mlir.constant(256 : i32) : i32
+    gpu.launch_func @kernel_module::@kernel blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64 dynamic_shared_memory_size %2 args(%1 : i32, %1 : i32)
+    llvm.return
+  }
+  // CHECK: define internal void @[[REGISTRATION_CTOR]]
+  // CHECK: %{{.*}} = call ptr @__cudaRegisterFatBinary(ptr @[[FATBIN_HANDLE]])
+}
+
+// -----
+
+// Test the `offload_embedding<hip>` attribute.
+module attributes {gpu.container_module} {
+  // CHECK: @__begin_offload_kernel_module = internal constant [2 x %{{.*}}] [
+  // CHECK: %{{.*}} { ptr @[[KERNEL_1_SYMBOL:.*]], ptr @[[ENTRY_NAME_1:.*]], i64 0, i32 0, i32 0 },
+  // CHECK: %{{.*}} { ptr @[[KERNEL_2_SYMBOL:.*]], ptr @[[ENTRY_NAME_2:.*]], i64 0, i32 0, i32 0 }]
+  // CHECK: @__end_offload_kernel_module = internal constant ptr getelementptr inbounds (%{{.*}}, ptr @__begin_offload_kernel_module, i64 2)
+  // CHECK: @[[FATBIN:.*]] = internal constant [4 x i8] c"BLOB", section ".hip_fatbin"
+  // CHECK: @[[FATBIN_HANDLE:.*]] = internal constant %{{.*}} { i32 1212764230, i32 1, ptr @[[FATBIN]]
+  // CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @[[REGISTRATION_CTOR:.*]], ptr null }]
+  // CHECK: @[[KERNEL_1_SYMBOL]] = internal constant i8 0
+  // CHECK-NEXT: @[[ENTRY_NAME_1]] = internal unnamed_addr constant [9 x i8] c"kernel_1\00"
+  // CHECK: @[[KERNEL_2_SYMBOL]] = internal constant i8 0
+  // CHECK-NEXT: @[[ENTRY_NAME_2]] = internal unnamed_addr constant [9 x i8] c"kernel_2\00"
+  gpu.binary @kernel_module <#gpu.offload_embedding<hip>> [#gpu.object<#rocdl.target, bin = "BLOB">]
+  llvm.func @foo() {
+    %0 = llvm.mlir.constant(8 : index) : i64
+    %1 = llvm.mlir.constant(32 : i32) : i32
+    %2 = llvm.mlir.constant(256 : i32) : i32
+    gpu.launch_func @kernel_module::@kernel_1 blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64 dynamic_shared_memory_size %2 args(%1 : i32, %1 : i32)
+    gpu.launch_func @kernel_module::@kernel_2 blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64 dynamic_shared_memory_size %2 args(%1 : i32, %1 : i32)
+    llvm.return
+  }
+  // CHECK: define internal void @[[REGISTRATION_CTOR]]
+  // CHECK: %{{.*}} = call ptr @__hipRegisterFatBinary(ptr @[[FATBIN_HANDLE]])
+}
+
+// -----
+
+// Test the `offload_embedding<omp>` attribute.
+module attributes {gpu.container_module} {
+  // CHECK: @__begin_offload_kernel_module = internal constant [0 x %{{.*}}] zeroinitializer
+  // CHECK: @__end_offload_kernel_module = internal constant ptr @__begin_offload_kernel_module
+  // CHECK: @[[BINARY:.*]] = internal unnamed_addr constant [{{.*}} x i8] c"{{.*}}", section ".llvm.offloading", align 8
+  // CHECK: @[[BINARIES:.*]] = internal unnamed_addr constant [1 x %{{.*}}] [%{{.*}} { ptr getelementptr inbounds ([{{.*}} x i8], ptr @[[BINARY]], i64 0, i64 {{.*}}), ptr getelementptr inbounds ([{{.*}} x i8], ptr @[[BINARY]], i64 0, i64 {{.*}}), ptr @__begin_offload_kernel_module, ptr @__end_offload_kernel_module }]
+  // CHECK: @[[DESCRIPTOR:.*]] = internal constant %{{.*}} { i32 1, ptr @[[BINARIES]], ptr @__begin_offload_kernel_module, ptr @__end_offload_kernel_module }
+  // CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @[[REGISTRATION_CTOR:.*]], ptr null }]
+  // CHECK: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @[[REGISTRATION_DTOR:.*]], ptr null }]
+  gpu.binary @kernel_module <#gpu.offload_embedding<omp>> [#gpu.object<#rocdl.target, bin = "BLOB">]
+  // CHECK: define internal void @[[REGISTRATION_CTOR]]
+  // CHECK: call {{.*}} @__tgt_register_lib(ptr @[[DESCRIPTOR]])
+  // CHECK: define internal void @[[REGISTRATION_DTOR]]
+  // CHECK: call {{.*}} @__tgt_unregister_lib(ptr @[[DESCRIPTOR]])
+}
diff --git a/mlir/unittests/Target/LLVM/CMakeLists.txt b/mlir/unittests/Target/LLVM/CMakeLists.txt
index 6d612548a94c0..d04f38ddddfac 100644
--- a/mlir/unittests/Target/LLVM/CMakeLists.txt
+++ b/mlir/unittests/Target/LLVM/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_mlir_unittest(MLIRTargetLLVMTests
+  Offload.cpp
   SerializeNVVMTarget.cpp
   SerializeROCDLTarget.cpp
   SerializeToLLVMBitcode.cpp
diff --git a/mlir/unittests/Target/LLVM/Offload.cpp b/mlir/unittests/Target/LLVM/Offload.cpp
new file mode 100644
index 0000000000000..375edc2e9614d
--- /dev/null
+++ b/mlir/unittests/Target/LLVM/Offload.cpp
@@ -0,0 +1,49 @@
+//===- Offload.cpp ----------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Target/LLVM/Offload.h"
+#include "llvm/Frontend/Offloading/Utility.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+
+#include "gmock/gmock.h"
+
+using namespace llvm;
+
+TEST(MLIRTarget, OffloadAPI) {
+  using OffloadEntryArray = mlir::LLVM::OffloadHandler::OffloadEntryArray;
+  LLVMContext llvmContext;
+  Module llvmModule("offload", llvmContext);
+  mlir::LLVM::OffloadHandler handler(llvmModule);
+  StringRef suffix = ".mlir";
+  // Check there's no entry array with `.mlir` suffix.
+  OffloadEntryArray entryArray = handler.getEntryArray(suffix);
+  EXPECT_EQ(entryArray, OffloadEntryArray());
+  // Emit the entry array.
+  handler.emitEmptyEntryArray(suffix);
+  // Check there's an entry array with `.mlir` suffix.
+  entryArray = handler.getEntryArray(suffix);
+  ASSERT_NE(entryArray.first, nullptr);
+  ASSERT_NE(entryArray.second, nullptr);
+  // Check the array contains no entries.
+  auto *zeroInitializer = dyn_cast_or_null<ConstantAggregateZero>(
+      entryArray.first->getInitializer());
+  ASSERT_NE(zeroInitializer, nullptr);
+  // Insert an empty entries.
+  auto emptyEntry =
+      ConstantAggregateZero::get(offloading::getEntryTy(llvmModule));
+  ASSERT_TRUE(succeeded(handler.insertOffloadEntry(suffix, emptyEntry)));
+  // Check there's an entry in the entry array with `.mlir` suffix.
+  entryArray = handler.getEntryArray(suffix);
+  ASSERT_NE(entryArray.first, nullptr);
+  Constant *arrayInitializer = entryArray.first->getInitializer();
+  ASSERT_NE(arrayInitializer, nullptr);
+  auto *arrayTy = dyn_cast_or_null<ArrayType>(arrayInitializer->getType());
+  ASSERT_NE(arrayTy, nullptr);
+  EXPECT_EQ(arrayTy->getNumElements(), 1u);
+}