Skip to content

Complete rework of the UCT BTL #13335

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 21 additions & 9 deletions opal/mca/btl/uct/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# Copyright (c) 2017 IBM Corporation. All rights reserved.
# Copyright (c) 2017-2018 Los Alamos National Security, LLC. All rights
# reserved.
# Copyright (c) 2025 Google, LLC. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
Expand All @@ -24,22 +25,31 @@ AM_CPPFLAGS = $(btl_uct_CPPFLAGS)

amca_paramdir = $(AMCA_PARAM_SETS_DIR)

sources = \
headers = \
btl_uct.h \
btl_uct_rdma.h \
btl_uct_endpoint.h \
btl_uct_am.h \
btl_uct_frag.h \
btl_uct_types.h \
btl_uct_device_context.h \
btl_uct_discover.h \
btl_uct_modex.h \
btl_uct_include_list.h

sources = \
btl_uct_module.c \
btl_uct_component.c \
btl_uct_rdma.h \
btl_uct_rdma.c \
btl_uct_endpoint.h \
btl_uct_endpoint.c \
btl_uct_amo.c \
btl_uct_am.h \
btl_uct_am.c \
btl_uct_frag.h \
btl_uct_frag.c \
btl_uct_tl.c \
btl_uct_types.h \
btl_uct_device_context.h
btl_uct_discover.c \
btl_uct_modex.c \
btl_uct_include_list.c \
btl_uct_device_context.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
Expand All @@ -50,20 +60,22 @@ lib =
lib_sources =
component = mca_btl_uct.la
component_sources = $(sources)
component_headers = $(headers)
else
lib = libmca_btl_uct.la
lib_sources = $(sources)
lib_headers = ${headers}
component =
component_sources =
endif

mcacomponentdir = $(opallibdir)
mcacomponent_LTLIBRARIES = $(component)
mca_btl_uct_la_SOURCES = $(component_sources)
mca_btl_uct_la_SOURCES = $(component_sources) $(component_headers)
mca_btl_uct_la_LDFLAGS = -module -avoid-version $(btl_uct_LDFLAGS)
mca_btl_uct_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la $(btl_uct_LIBS)

noinst_LTLIBRARIES = $(lib)
libmca_btl_uct_la_SOURCES = $(lib_sources)
libmca_btl_uct_la_SOURCES = $(lib_sources) $(lib_headers)
libmca_btl_uct_la_LDFLAGS = -module -avoid-version $(btl_uct_LDFLAGS)
libmca_btl_uct_la_LIBADD = $(btl_uct_LIBS)
64 changes: 40 additions & 24 deletions opal/mca/btl/uct/btl_uct.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ struct mca_btl_uct_module_t {
/** base BTL interface */
mca_btl_base_module_t super;

/** module index in the component module array */
int module_index;

/** whether the module has been fully initialized or not */
bool initialized;

Expand All @@ -76,31 +79,15 @@ struct mca_btl_uct_module_t {
/** mutex to protect the module */
opal_recursive_mutex_t lock;

/** async context */
ucs_async_context_t *ucs_async;

/** transport for active messaging */
mca_btl_uct_tl_t *am_tl;

/** transport for RDMA/AMOs */
mca_btl_uct_tl_t *rdma_tl;

/** transport for forming connections (if needed) */
mca_btl_uct_tl_t *conn_tl;

/** array containing the am_tl and rdma_tl */
mca_btl_uct_tl_t *comm_tls[2];

#if UCT_API >= UCT_VERSION(1, 7)
uct_component_h uct_component;
#endif

/** registration cache */
mca_rcache_base_module_t *rcache;

/** name of the memory domain backing this module */
char *md_name;

/** am and rdma share endpoints */
bool shared_endpoints;

Expand All @@ -119,8 +106,9 @@ struct mca_btl_uct_module_t {
/** frags that were waiting on connections that are now ready to send */
opal_list_t pending_frags;

/** pending connection requests */
opal_fifo_t pending_connection_reqs;
/** allowed transports */
char *allowed_transports;
mca_btl_uct_include_list_t allowed_transport_list;
};
typedef struct mca_btl_uct_module_t mca_btl_uct_module_t;

Expand All @@ -133,6 +121,9 @@ struct mca_btl_uct_component_t {
/** base BTL component */
mca_btl_base_component_3_0_0_t super;

/** whether the component is initialized. controls cleanup. */
bool initialized;

/** number of TL modules */
int module_count;

Expand All @@ -141,10 +132,15 @@ struct mca_btl_uct_component_t {

/** allowed UCT memory domains */
char *memory_domains;
mca_btl_uct_include_list_t memory_domain_list;

/** allowed transports */
char *allowed_transports;

/** transports to consider for forming connections */
char *connection_domains;
mca_btl_uct_include_list_t connection_domain_list;

/** number of worker contexts to create */
int num_contexts_per_module;

Expand All @@ -158,6 +154,17 @@ struct mca_btl_uct_component_t {

/** connection retry timeout */
unsigned int connection_retry_timeout;

#if UCT_API >= UCT_VERSION(1, 7)
uct_component_h *uct_components;
unsigned num_uct_components;
#endif

/** list of memory domains (btl_uct_md_t) */
opal_list_t md_list;

/** connection transport (if needed). reference is owned by conn_md */
mca_btl_uct_tl_t *conn_tl;
};
typedef struct mca_btl_uct_component_t mca_btl_uct_component_t;

Expand Down Expand Up @@ -293,19 +300,24 @@ ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsign
struct mca_btl_base_endpoint_t *mca_btl_uct_get_ep(struct mca_btl_base_module_t *module,
opal_proc_t *proc);

int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md,
uct_tl_resource_desc_t *tl_descs, unsigned tl_count);
int mca_btl_uct_populate_tls(mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_descs, unsigned tl_count);
int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module,
mca_btl_uct_conn_req_t *req);

mca_btl_uct_module_t *mca_btl_uct_alloc_module(mca_btl_uct_md_t *md,
size_t registration_size);

int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl);
int mca_btl_uct_enable_tl_conn(mca_btl_uct_tl_t *tl);

/**
* @brief Checks if a tl is suitable for using for RDMA
*
* @param[in] tl btl/uct tl pointer
*/
static inline bool mca_btl_uct_tl_supports_rdma(mca_btl_uct_tl_t *tl)
{
return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags
return (tl->uct_iface_attr.cap.flags
& (UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY))
== (UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY);
}
Expand All @@ -315,7 +327,7 @@ static inline bool mca_btl_uct_tl_supports_rdma(mca_btl_uct_tl_t *tl)
*/
static inline bool mca_btl_uct_tl_support_am(mca_btl_uct_tl_t *tl)
{
return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags
return (tl->uct_iface_attr.cap.flags
& (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_AM_ZCOPY));
}

Expand All @@ -326,7 +338,7 @@ static inline bool mca_btl_uct_tl_support_am(mca_btl_uct_tl_t *tl)
*/
static inline bool mca_btl_uct_tl_supports_conn(mca_btl_uct_tl_t *tl)
{
return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags
return (tl->uct_iface_attr.cap.flags
& (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CONNECT_TO_IFACE))
== (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CONNECT_TO_IFACE);
}
Expand All @@ -338,7 +350,11 @@ static inline bool mca_btl_uct_tl_supports_conn(mca_btl_uct_tl_t *tl)
*/
static inline bool mca_btl_uct_tl_requires_connection_tl(mca_btl_uct_tl_t *tl)
{
return !(MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE);
if (NULL == tl) {
return false;
}

return !(tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE);
}

END_C_DECLS
Expand Down
24 changes: 12 additions & 12 deletions opal/mca/btl/uct/btl_uct_am.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
/*
* Copyright (c) 2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2025 Google, LLC. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand All @@ -26,7 +27,7 @@ mca_btl_base_descriptor_t *mca_btl_uct_alloc(mca_btl_base_module_t *btl,
mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl;
mca_btl_uct_base_frag_t *frag = NULL;

if (size <= (size_t) MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, 0).cap.am.max_short) {
if (size <= (size_t) uct_btl->am_tl->uct_iface_attr.cap.am.max_short) {
frag = mca_btl_uct_frag_alloc_short(uct_btl, endpoint);
} else if (size <= uct_btl->super.btl_eager_limit) {
frag = mca_btl_uct_frag_alloc_eager(uct_btl, endpoint);
Expand Down Expand Up @@ -55,7 +56,6 @@ static inline void _mca_btl_uct_send_pack(void *data, void *header, size_t heade
{
uint32_t iov_count = 1;
struct iovec iov;
size_t length;

if (header_size > 0) {
assert(NULL != header);
Expand Down Expand Up @@ -106,7 +106,7 @@ struct mca_btl_base_descriptor_t *mca_btl_uct_prepare_src(mca_btl_base_module_t
frag->uct_iov.length = total_size;
frag->base.order = order;
frag->base.des_flags = flags;
if (total_size > (size_t) MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, 0).cap.am.max_short) {
if (total_size > (size_t) uct_btl->am_tl->uct_iface_attr.cap.am.max_short) {
frag->segments[0].seg_len = reserve;
frag->segments[1].seg_len = *size;
frag->segments[1].seg_addr.pval = data_ptr;
Expand Down Expand Up @@ -182,7 +182,7 @@ int mca_btl_uct_send_frag(mca_btl_uct_module_t *uct_btl, mca_btl_uct_base_frag_t
mca_btl_uct_context_lock(context);
/* attempt to post the fragment */
if (NULL != frag->base.super.registration
&& (context->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_AM_ZCOPY)) {
&& (uct_btl->am_tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_AM_ZCOPY)) {
frag->comp.dev_context = context;
ucs_status = uct_ep_am_zcopy(ep_handle, MCA_BTL_UCT_FRAG, &frag->header,
sizeof(frag->header), &frag->uct_iov, 1, 0,
Expand All @@ -197,7 +197,7 @@ int mca_btl_uct_send_frag(mca_btl_uct_module_t *uct_btl, mca_btl_uct_base_frag_t
/* short message */
if (1 == frag->base.des_segment_count
&& (frag->uct_iov.length + 8)
< MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, 0).cap.am.max_short) {
< uct_btl->am_tl->uct_iface_attr.cap.am.max_short) {
ucs_status = uct_ep_am_short(ep_handle, MCA_BTL_UCT_FRAG, frag->header.value,
frag->uct_iov.buffer, frag->uct_iov.length);

Expand Down Expand Up @@ -233,7 +233,7 @@ int mca_btl_uct_send_frag(mca_btl_uct_module_t *uct_btl, mca_btl_uct_base_frag_t
}

OPAL_THREAD_LOCK(&uct_btl->lock);
mca_btl_uct_append_pending_frag(uct_btl, frag, context, true);
mca_btl_uct_append_pending_frag(uct_btl, frag, context, /*ready=*/true);
OPAL_THREAD_UNLOCK(&uct_btl->lock);

return OPAL_SUCCESS;
Expand All @@ -260,14 +260,14 @@ int mca_btl_uct_send(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
OPAL_THREAD_LOCK(&uct_btl->lock);
/* check one more time in case another thread is completing the connection now */
if (OPAL_SUCCESS != mca_btl_uct_endpoint_test_am(uct_btl, endpoint, context, &ep_handle)) {
mca_btl_uct_append_pending_frag(uct_btl, frag, context, false);
mca_btl_uct_append_pending_frag(uct_btl, frag, context, /*ready=*/false);
OPAL_THREAD_UNLOCK(&uct_btl->lock);
return OPAL_SUCCESS;
}
OPAL_THREAD_UNLOCK(&uct_btl->lock);
}

return mca_btl_uct_send_frag(uct_btl, frag, true);
return mca_btl_uct_send_frag(uct_btl, frag, /*append=*/true);
}

struct mca_btl_uct_sendi_pack_args_t {
Expand All @@ -291,9 +291,9 @@ static size_t mca_btl_uct_sendi_pack(void *data, void *arg)
return args->header_size + args->payload_size + 8;
}

static inline size_t mca_btl_uct_max_sendi(mca_btl_uct_module_t *uct_btl, int context_id)
static inline size_t mca_btl_uct_max_sendi(mca_btl_uct_module_t *uct_btl)
{
return MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, context_id).cap.am.max_bcopy;
return uct_btl->am_tl->uct_iface_attr.cap.am.max_bcopy;
}

int mca_btl_uct_sendi(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
Expand All @@ -313,7 +313,7 @@ int mca_btl_uct_sendi(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpo

rc = mca_btl_uct_endpoint_check_am(uct_btl, endpoint, context, &ep_handle);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc
|| msg_size > mca_btl_uct_max_sendi(uct_btl, context->context_id))) {
|| msg_size > mca_btl_uct_max_sendi(uct_btl))) {
if (descriptor) {
*descriptor = mca_btl_uct_alloc(btl, endpoint, order, total_size, flags);
}
Expand All @@ -327,7 +327,7 @@ int mca_btl_uct_sendi(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpo
if (0 == payload_size) {
ucs_status = uct_ep_am_short(ep_handle, MCA_BTL_UCT_FRAG, am_header.value, header,
header_size);
} else if (msg_size < (size_t) MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, context->context_id)
} else if (msg_size < (size_t) uct_btl->am_tl->uct_iface_attr
.cap.am.max_short) {
int8_t *data = alloca(total_size);
size_t packed_payload_size = payload_size;
Expand Down
Loading
Loading